# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Shared helpers for the data collectors: trading-calendar retrieval,
exchange symbol-list scraping (CN/US/IN/BR/funds), and symbol/date utilities."""

import re
import importlib
import time
import bisect
import pickle
import random
import requests
import functools
from pathlib import Path
from typing import Iterable, Tuple, List

import numpy as np
import pandas as pd
from lxml import etree
from loguru import logger
from yahooquery import Ticker
from tqdm import tqdm
from functools import partial
from concurrent.futures import ProcessPoolExecutor
from bs4 import BeautifulSoup

HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}"

CALENDAR_URL_BASE = "http://push2his.eastmoney.com/api/qt/stock/kline/get?secid={market}.{bench_code}&fields1=f1%2Cf2%2Cf3%2Cf4%2Cf5&fields2=f51%2Cf52%2Cf53%2Cf54%2Cf55%2Cf56%2Cf57%2Cf58&klt=101&fqt=0&beg=19900101&end=20991231"
SZSE_CALENDAR_URL = "http://www.szse.cn/api/report/exchange/onepersistenthour/monthList?month={month}&random={random}"

CALENDAR_BENCH_URL_MAP = {
    "CSI300": CALENDAR_URL_BASE.format(market=1, bench_code="000300"),
    "CSI500": CALENDAR_URL_BASE.format(market=1, bench_code="000905"),
    "CSI100": CALENDAR_URL_BASE.format(market=1, bench_code="000903"),
    # NOTE: Use the time series of SH600000 as the sequence of all stocks
    "ALL": CALENDAR_URL_BASE.format(market=1, bench_code="000905"),
    # NOTE: Use the time series of ^GSPC(SP500) as the sequence of all stocks
    "US_ALL": "^GSPC",
    "IN_ALL": "^NSEI",
    "BR_ALL": "^BVSP",
}

_BENCH_CALENDAR_LIST = None
_ALL_CALENDAR_LIST = None
_HS_SYMBOLS = None
_US_SYMBOLS = None
_IN_SYMBOLS = None
_BR_SYMBOLS = None
_EN_FUND_SYMBOLS = None
_CALENDAR_MAP = {}

# NOTE: Until 2020-10-20 20:00:00
MINIMUM_SYMBOLS_NUM = 3900


def get_calendar_list(bench_code="CSI300") -> List[pd.Timestamp]:
    """get SH/SZ history calendar list

    Parameters
    ----------
    bench_code: str
        value from ["CSI300", "CSI500", "ALL", "US_ALL"]

    Returns
    -------
        history calendar list
    """
    logger.info(f"get calendar list: {bench_code}......")

    def _get_calendar(url):
        # eastmoney kline API: each kline entry is "date,open,close,..."; keep the date part
        _value_list = requests.get(url).json()["data"]["klines"]
        return sorted(map(lambda x: pd.Timestamp(x.split(",")[0]), _value_list))

    calendar = _CALENDAR_MAP.get(bench_code, None)
    if calendar is None:
        if bench_code.startswith(("US_", "IN_", "BR_")):
            # Overseas markets: derive the calendar from the benchmark index history via yahooquery.
            # NOTE(fix): removed leftover debug print(...) calls; one of them issued a duplicate
            # full-history network request before the real fetch below.
            df = Ticker(CALENDAR_BENCH_URL_MAP[bench_code]).history(interval="1d", period="max")
            calendar = df.index.get_level_values(level="date").map(pd.Timestamp).unique().tolist()
        else:
            if bench_code.upper() == "ALL":

                @deco_retry
                def _get_calendar(month):
                    _cal = []
                    try:
                        # NOTE(fix): the random query param is a cache-buster; the original passed
                        # the function object `random.random` (missing "()") instead of a number.
                        resp = requests.get(SZSE_CALENDAR_URL.format(month=month, random=random.random())).json()
                        for _r in resp["data"]:
                            # jybz == 1 marks a trading day; jyrq is the date
                            if int(_r["jybz"]):
                                _cal.append(pd.Timestamp(_r["jyrq"]))
                    except Exception as e:
                        raise ValueError(f"{month}-->{e}") from e
                    return _cal

                month_range = pd.date_range(start="2000-01", end=pd.Timestamp.now() + pd.Timedelta(days=31), freq="M")
                calendar = []
                for _m in month_range:
                    cal = _get_calendar(_m.strftime("%Y-%m"))
                    if cal:
                        calendar += cal
                # drop future dates that SZSE may publish for the current month
                calendar = list(filter(lambda x: x <= pd.Timestamp.now(), calendar))
            else:
                calendar = _get_calendar(CALENDAR_BENCH_URL_MAP[bench_code])
        _CALENDAR_MAP[bench_code] = calendar
    logger.info(f"end of get calendar list: {bench_code}.")
    return calendar


def return_date_list(date_field_name: str, file_path: Path):
    """Read one CSV file and return its dates (column ``date_field_name``) sorted ascending."""
    date_list = pd.read_csv(file_path, sep=",", index_col=0)[date_field_name].to_list()
    return sorted(map(lambda x: pd.Timestamp(x), date_list))


def get_calendar_list_by_ratio(
    source_dir: [str, Path],
    date_field_name: str = "date",
    threshold: float = 0.5,
    minimum_count: int = 10,
    max_workers: int = 16,
) -> list:
    """get calendar list by selecting the date when few funds trade in this day

    Parameters
    ----------
    source_dir: str or Path
        The directory where the raw data collected from the Internet is saved
    date_field_name: str
        date field name, default is date
    threshold: float
        threshold to exclude some days when few funds trade in this day, default 0.5
    minimum_count: int
        minimum count of funds should trade in one day
    max_workers: int
        Concurrent number, default is 16

    Returns
    -------
        history calendar list
    """
    logger.info(f"get calendar list from {source_dir} by threshold = {threshold}......")

    source_dir = Path(source_dir).expanduser()
    file_list = list(source_dir.glob("*.csv"))

    _number_all_funds = len(file_list)

    logger.info(f"count how many funds trade in this day......")
    _dict_count_trade = dict()  # dict{date:count}
    _fun = partial(return_date_list, date_field_name)
    all_oldest_list = []
    with tqdm(total=_number_all_funds) as p_bar:
        with ProcessPoolExecutor(max_workers=max_workers) as executor:
            for date_list in executor.map(_fun, file_list):
                if date_list:
                    # date_list is sorted, so [0] is this fund's founding (oldest) date
                    all_oldest_list.append(date_list[0])
                for date in date_list:
                    _dict_count_trade[date] = _dict_count_trade.get(date, 0) + 1
                p_bar.update()

    logger.info(f"count how many funds have founded in this day......")
    _dict_count_founding = {date: _number_all_funds for date in _dict_count_trade.keys()}  # dict{date:count}
    with tqdm(total=_number_all_funds) as p_bar:
        for oldest_date in all_oldest_list:
            # a fund founded after `date` cannot trade on it; discount it from the denominator
            for date in _dict_count_founding.keys():
                if date < oldest_date:
                    _dict_count_founding[date] -= 1

    # keep a date only when enough of the then-existing funds actually traded on it
    calendar = [
        date
        for date in _dict_count_trade
        if _dict_count_trade[date] >= max(int(_dict_count_founding[date] * threshold), minimum_count)
    ]

    return calendar


def get_hs_stock_symbols() -> list:
    """get SH/SZ stock symbols

    Returns
    -------
        stock symbols
    """
    global _HS_SYMBOLS

    def _get_symbol():
        _res = set()
        # (page type, exchange suffix): ha=SH A-shares, sa=SZ A-shares, gem=ChiNext (SZ)
        for _k, _v in (("ha", "ss"), ("sa", "sz"), ("gem", "sz")):
            resp = requests.get(HS_SYMBOLS_URL.format(s_type=_k))
            _res |= set(
                map(
                    lambda x: "{}.{}".format(re.findall(r"\d+", x)[0], _v),
                    etree.HTML(resp.text).xpath("//div[@class='result']/ul//li/a/text()"),
                )
            )
            time.sleep(3)
        return _res

    if _HS_SYMBOLS is None:
        symbols = set()
        # The listing pages are paginated/flaky; it may take multiple passes to get the
        # complete symbol set. NOTE(review): there is no upper bound on this loop — if the
        # site never yields MINIMUM_SYMBOLS_NUM symbols it will spin forever; confirm whether
        # a retry cap is desired.
        while len(symbols) < MINIMUM_SYMBOLS_NUM:
            symbols |= _get_symbol()
            time.sleep(3)

        # merge with (and refresh) the on-disk cache so partial scrapes accumulate across runs
        symbol_cache_path = Path("~/.cache/hs_symbols_cache.pkl").expanduser().resolve()
        symbol_cache_path.parent.mkdir(parents=True, exist_ok=True)
        if symbol_cache_path.exists():
            with symbol_cache_path.open("rb") as fp:
                cache_symbols = pickle.load(fp)
                symbols |= cache_symbols
        with symbol_cache_path.open("wb") as fp:
            pickle.dump(symbols, fp)

        _HS_SYMBOLS = sorted(list(symbols))

    return _HS_SYMBOLS


def get_us_stock_symbols(qlib_data_path: [str, Path] = None) -> list:
    """get US stock symbols

    Returns
    -------
        stock symbols
    """
    global _US_SYMBOLS

    @deco_retry
    def _get_eastmoney():
        url = "http://4.push2.eastmoney.com/api/qt/clist/get?pn=1&pz=10000&fs=m:105,m:106,m:107&fields=f12"
        resp = requests.get(url)
        if resp.status_code != 200:
            raise ValueError("request error")
        try:
            # f12 is the ticker; "_" marks preferred shares, normalized to the "-P" convention
            _symbols = [_v["f12"].replace("_", "-P") for _v in resp.json()["data"]["diff"].values()]
        except Exception as e:
            logger.warning(f"request error: {e}")
            raise
        if len(_symbols) < 8000:
            # an implausibly small result means a truncated/failed response; retry
            raise ValueError("request error")
        return _symbols

    @deco_retry
    def _get_nasdaq():
        _res_symbols = []
        for _name in ["otherlisted", "nasdaqtraded"]:
            url = f"ftp://ftp.nasdaqtrader.com/SymbolDirectory/{_name}.txt"
            df = pd.read_csv(url, sep="|")
            df = df.rename(columns={"ACT Symbol": "Symbol"})
            _symbols = df["Symbol"].dropna()
            # normalize NASDAQ suffix conventions to the dash forms used elsewhere
            _symbols = _symbols.str.replace("$", "-P", regex=False)
            _symbols = _symbols.str.replace(".W", "-WT", regex=False)
            _symbols = _symbols.str.replace(".U", "-UN", regex=False)
            _symbols = _symbols.str.replace(".R", "-RI", regex=False)
            _symbols = _symbols.str.replace(".", "-", regex=False)
            _res_symbols += _symbols.unique().tolist()
        return _res_symbols

    @deco_retry
    def _get_nyse():
        url = "https://www.nyse.com/api/quotes/filter"
        _parms = {
            "instrumentType": "EQUITY",
            "pageNumber": 1,
            "sortColumn": "NORMALIZED_TICKER",
            "sortOrder": "ASC",
            "maxResultsPerPage": 10000,
            "filterToken": "",
        }
        resp = requests.post(url, json=_parms)
        if resp.status_code != 200:
            raise ValueError("request error")
        try:
            _symbols = [_v["symbolTicker"].replace("-", "-P") for _v in resp.json()]
        except Exception as e:
            # best-effort source: on a malformed payload fall back to an empty list
            logger.warning(f"request error: {e}")
            _symbols = []
        return _symbols

    if _US_SYMBOLS is None:
        _all_symbols = _get_eastmoney() + _get_nasdaq() + _get_nyse()
        if qlib_data_path is not None:
            # also include symbols already present in the local qlib instrument files
            for _index in ["nasdaq100", "sp500"]:
                ins_df = pd.read_csv(
                    Path(qlib_data_path).joinpath(f"instruments/{_index}.txt"),
                    sep="\t",
                    names=["symbol", "start_date", "end_date"],
                )
                _all_symbols += ins_df["symbol"].unique().tolist()

        def _format(s_):
            s_ = s_.replace(".", "-")
            s_ = s_.strip("$")
            s_ = s_.strip("*")
            return s_

        # drop over-long tickers and warrants ("WS" suffix), then normalize and dedupe
        _US_SYMBOLS = sorted(set(map(_format, filter(lambda x: len(x) < 8 and not x.endswith("WS"), _all_symbols))))

    return _US_SYMBOLS


def get_in_stock_symbols(qlib_data_path: [str, Path] = None) -> list:
    """get IN stock symbols

    Returns
    -------
        stock symbols
    """
    global _IN_SYMBOLS

    @deco_retry
    def _get_nifty():
        url = "https://www1.nseindia.com/content/equities/EQUITY_L.csv"
        df = pd.read_csv(url)
        df = df.rename(columns={"SYMBOL": "Symbol"})
        # Yahoo-style suffix for NSE-listed symbols
        df["Symbol"] = df["Symbol"] + ".NS"
        _symbols = df["Symbol"].dropna()
        _symbols = _symbols.unique().tolist()
        return _symbols

    if _IN_SYMBOLS is None:
        _all_symbols = _get_nifty()
        if qlib_data_path is not None:
            for _index in ["nifty"]:
                ins_df = pd.read_csv(
                    Path(qlib_data_path).joinpath(f"instruments/{_index}.txt"),
                    sep="\t",
                    names=["symbol", "start_date", "end_date"],
                )
                _all_symbols += ins_df["symbol"].unique().tolist()

        # NOTE(fix): a local _format helper was defined here but never applied to the
        # symbols; it was dead code and has been removed (output is unchanged).
        _IN_SYMBOLS = sorted(set(_all_symbols))

    return _IN_SYMBOLS


def get_br_stock_symbols(qlib_data_path: [str, Path] = None) -> list:
    """get Brazil(B3) stock symbols

    Returns
    -------
        B3 stock symbols
    """
    global _BR_SYMBOLS

    @deco_retry
    def _get_ibovespa():
        _symbols = []
        url = "https://www.fundamentus.com.br/detalhes.php?papel="

        # Request
        agent = {"User-Agent": "Mozilla/5.0"}
        page = requests.get(url, headers=agent)

        # BeautifulSoup
        soup = BeautifulSoup(page.content, "html.parser")
        tbody = soup.find("tbody")

        children = tbody.findChildren("a", recursive=True)
        for child in children:
            # extract the anchor's inner text (the ticker) from its string form
            _symbols.append(str(child).split('"')[-1].split(">")[1].split("<")[0])

        return _symbols

    if _BR_SYMBOLS is None:
        _all_symbols = _get_ibovespa()
        if qlib_data_path is not None:
            for _index in ["ibov"]:
                ins_df = pd.read_csv(
                    Path(qlib_data_path).joinpath(f"instruments/{_index}.txt"),
                    sep="\t",
                    names=["symbol", "start_date", "end_date"],
                )
                _all_symbols += ins_df["symbol"].unique().tolist()

        def _format(s_):
            s_ = s_.strip()
            s_ = s_.strip("$")
            s_ = s_.strip("*")
            # Yahoo-style suffix for B3-listed symbols
            s_ = s_ + ".SA"
            return s_

        _BR_SYMBOLS = sorted(set(map(_format, _all_symbols)))

    return _BR_SYMBOLS


def get_en_fund_symbols(qlib_data_path: [str, Path] = None) -> list:
    """get en fund symbols

    Returns
    -------
        fund symbols in China
    """
    global _EN_FUND_SYMBOLS

    @deco_retry
    def _get_eastmoney():
        url = "http://fund.eastmoney.com/js/fundcode_search.js"
        resp = requests.get(url)
        if resp.status_code != 200:
            raise ValueError("request error")
        try:
            _symbols = []
            # the JS payload is "var r = [[...],[...],...];" — parse each bracketed record
            for sub_data in re.findall(r"[\[](.*?)[\]]", resp.content.decode().split("= [")[-1].replace("];", "")):
                data = sub_data.replace('"', "").replace("'", "")
                # TODO: do we need other information, like fund_name from ['000001', 'HXCZHH', '华夏成长混合', '混合型', 'HUAXIACHENGZHANGHUNHE']
                _symbols.append(data.split(",")[0])
        except Exception as e:
            logger.warning(f"request error: {e}")
            raise
        if len(_symbols) < 8000:
            raise ValueError("request error")
        return _symbols

    if _EN_FUND_SYMBOLS is None:
        _all_symbols = _get_eastmoney()
        _EN_FUND_SYMBOLS = sorted(set(_all_symbols))

    return _EN_FUND_SYMBOLS


def symbol_suffix_to_prefix(symbol: str, capital: bool = True) -> str:
    """symbol suffix to prefix

    Parameters
    ----------
    symbol: str
        symbol, e.g. "600000.SS" -> "SH600000"
    capital : bool
        by default True

    Returns
    -------
    """
    code, exchange = symbol.split(".")
    if exchange.lower() in ["sh", "ss"]:
        res = f"sh{code}"
    else:
        res = f"{exchange}{code}"
    return res.upper() if capital else res.lower()


def symbol_prefix_to_sufix(symbol: str, capital: bool = True) -> str:
    """symbol prefix to sufix

    Parameters
    ----------
    symbol: str
        symbol, e.g. "SH600000" -> "600000.SH"
    capital : bool
        by default True

    Returns
    -------
    """
    # NOTE(review): assumes the exchange prefix/suffix is exactly 2 characters
    res = f"{symbol[:-2]}.{symbol[-2:]}"
    return res.upper() if capital else res.lower()


def deco_retry(retry: int = 5, retry_sleep: int = 3):
    """Retry decorator; usable bare (``@deco_retry``) or parameterized (``@deco_retry(retry=10)``).

    Parameters
    ----------
    retry: int
        maximum number of attempts (5 when used bare, i.e. when `retry` is the decorated callable)
    retry_sleep: int
        seconds to sleep between attempts
    """

    def deco_func(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            # when used bare, `retry` is the function itself, so fall back to the default count
            _retry = 5 if callable(retry) else retry
            _result = None
            for _i in range(1, _retry + 1):
                try:
                    _result = func(*args, **kwargs)
                    break
                except Exception as e:
                    logger.warning(f"{func.__name__}: {_i} :{e}")
                    if _i == _retry:
                        raise
                time.sleep(retry_sleep)
            return _result

        return wrapper

    return deco_func(retry) if callable(retry) else deco_func


def get_trading_date_by_shift(trading_list: list, trading_date: pd.Timestamp, shift: int = 1):
    """get trading date by shift

    Parameters
    ----------
    trading_list: list
        trading calendar list
    shift : int
        shift, default is 1
    trading_date : pd.Timestamp
        trading date

    Returns
    -------
    """
    trading_date = pd.Timestamp(trading_date)
    left_index = bisect.bisect_left(trading_list, trading_date)
    try:
        res = trading_list[left_index + shift]
    except IndexError:
        # shifted past either end of the calendar: fall back to the input date
        res = trading_date
    return res


def generate_minutes_calendar_from_daily(
    calendars: Iterable,
    freq: str = "1min",
    am_range: Tuple[str, str] = ("09:30:00", "11:29:00"),
    pm_range: Tuple[str, str] = ("13:00:00", "14:59:00"),
) -> pd.Index:
    """generate minutes calendar

    Parameters
    ----------
    calendars: Iterable
        daily calendar
    freq: str
        by default 1min
    am_range: Tuple[str, str]
        AM Time Range, by default China-Stock: ("09:30:00", "11:29:00")
    pm_range: Tuple[str, str]
        PM Time Range, by default China-Stock: ("13:00:00", "14:59:00")
    """
    daily_format: str = "%Y-%m-%d"
    res = []
    for _day in calendars:
        # one minute-level range per trading session (AM, PM) per day
        for _range in [am_range, pm_range]:
            res.append(
                pd.date_range(
                    f"{pd.Timestamp(_day).strftime(daily_format)} {_range[0]}",
                    f"{pd.Timestamp(_day).strftime(daily_format)} {_range[1]}",
                    freq=freq,
                )
            )

    return pd.Index(sorted(set(np.hstack(res))))


def get_instruments(
    qlib_dir: str,
    index_name: str,
    method: str = "parse_instruments",
    freq: str = "day",
    request_retry: int = 5,
    retry_sleep: int = 3,
    market_index: str = "cn_index",
):
    """

    Parameters
    ----------
    qlib_dir: str
        qlib data dir, default "Path(__file__).parent/qlib_data"
    index_name: str
        index name, value from ["csi100", "csi300"]
    method: str
        method, value from ["parse_instruments", "save_new_companies"]
    freq: str
        freq, value from ["day", "1min"]
    request_retry: int
        request retry, by default 5
    retry_sleep: int
        request sleep, by default 3
    market_index: str
        Where the files to obtain the index are located,
        for example data_collector.cn_index.collector

    Examples
    -------
        # parse instruments
        $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method parse_instruments

        # parse new companies
        $ python collector.py --index_name CSI300 --qlib_dir ~/.qlib/qlib_data/cn_data --method save_new_companies
    """
    # resolve e.g. data_collector.cn_index.collector.CSI300Index and invoke the requested method
    _cur_module = importlib.import_module("data_collector.{}.collector".format(market_index))
    obj = getattr(_cur_module, f"{index_name.upper()}Index")(
        qlib_dir=qlib_dir, index_name=index_name, freq=freq, request_retry=request_retry, retry_sleep=retry_sleep
    )
    getattr(obj, method)()


if __name__ == "__main__":
    assert len(get_hs_stock_symbols()) >= MINIMUM_SYMBOLS_NUM