|
|
import os, time
|
|
|
from tqdm import tqdm
|
|
|
import itertools
|
|
|
from pathlib import Path
|
|
|
|
|
|
from collections import namedtuple
|
|
|
|
|
|
import numpy as np
|
|
|
import pandas as pd
|
|
|
|
|
|
from tsl import *
|
|
|
from config import *
|
|
|
|
|
|
class DLMarket:
    """Dump market K-line data fetched through the ``tsl`` client to CSV files.

    All output is written below ``TINYSOFT_DATA_PATH``. The ``tsl`` context
    manager, ``DateRange`` and ``TINYSOFT_DATA_PATH`` are provided by the
    star imports at the top of this file.
    """

    # Date ranges used to split the daily K-line dump into shards;
    # ``do_k_daily`` writes one output folder per entry.
    k_daily_data_shards = [
        DateRange(20000101, 20091231),
        DateRange(20100101, 20191231),
        DateRange(20200101, 20220630),
    ]

    def __init__(self):
        """Create the dumper; no instance state is needed."""
        pass

    def do_k_daily(self):
        """Dump daily K-line data for every shard in ``k_daily_data_shards``."""
        for date_range in self.k_daily_data_shards:
            # NOTE(review): subscripting by field name assumes DateRange is
            # mapping-like. If it is a namedtuple this must become attribute
            # access -- confirm against its definition.
            start_date = date_range['start_date']
            end_date = date_range['end_date']
            self._dump_mkt_k_daily(start_date, end_date)

    def do_k_1min(self, start_date=20210701, end_date=20211231):
        """Dump 1-minute K-line data for the given date range.

        Args:
            start_date: first date of the range as a YYYYMMDD integer.
            end_date: last date of the range as a YYYYMMDD integer.

        The defaults reproduce the range that used to be hard-coded here.
        """
        self._dump_mkt_k_1min(start_date, end_date)

    def _dump_mkt_k_1min(self, start_date, end_date):
        """Dump 1-minute K-lines, one CSV per stock per trading day.

        For some reason, 1-minute K-lines cannot be requested for a date
        range, or the program is prone to an address boundary error, which
        should be an issue of the tinysoft API.

        Thus, we have to do the query day by day.

        It may, however, be possible to query a range of stocks for one day,
        which yields about 1M rows/day (depending on the number of stocks
        listed that day).

        Args:
            start_date: first date of the range as a YYYYMMDD integer.
            end_date: last date of the range as a YYYYMMDD integer.
        """
        print('准备构建1分钟K线数据', start_date, end_date)
        with tsl() as t:
            date_list = t.get_mkt_trading_days(start_date, end_date)

        with tqdm(date_list) as pbar:
            for date in pbar:
                pbar.set_description('正在获取当日1分钟K线数据:' + str(date))

                # Ensure the folder for this day's shard exists.
                dump_folder = Path('{}/行情数据/分钟K线/{}/'.format(
                    TINYSOFT_DATA_PATH,
                    date,
                ))
                dump_folder.mkdir(parents=True, exist_ok=True)

                # A fresh client is opened for every day. Everything that
                # touches the raw result ``r`` happens while the connection
                # is still open; only the disk writes happen afterwards.
                # NOTE(review): the original indentation was lost, so the
                # exact extent of this ``with`` block should be confirmed.
                with tsl() as t:
                    stock_list = t.get_stock_list(date)
                    r = t.get_mkt_stock_k_1min(date, stock_list)

                    pbar.set_description('当日数据已经获取,准备创建DataFrame,' + str(date))
                    df = pd.DataFrame(r.value())
                    # Only read the message when it will actually be printed.
                    message = r.message if df.empty else None

                pbar.set_description('DataFrame已经创建完成,分块后写入磁盘,' + str(date))

                if df.empty:
                    print('因数据缺失,跳过', date)
                    print(message)
                    time.sleep(0.1)
                    continue

                for stock_id, df_stock in df.groupby('StockID'):
                    # Non-inplace set_index: the group is a slice of ``df``,
                    # and mutating it in place can trigger pandas'
                    # SettingWithCopy warning.
                    indexed = df_stock.set_index(['StockID', 'time'])
                    indexed.to_csv(dump_folder / '{}.csv'.format(stock_id))
                    # Throttle slightly to avoid massive concurrent disk writing.
                    time.sleep(0.001)

                # Release the day's frame before fetching the next one.
                del df
                # Because the tsl client is reconnected continually, wait a
                # little longer after each day, especially when data was
                # missing.
                time.sleep(0.1)

    def _dump_mkt_k_daily(self, start_date, end_date):
        """Dump daily K-lines for one shard, one CSV per stock.

        Args:
            start_date: first date of the shard as a YYYYMMDD integer.
            end_date: last date of the shard as a YYYYMMDD integer.
        """
        dump_folder = Path('{}/行情数据/日K线/shards/{}/'.format(
            TINYSOFT_DATA_PATH,
            str(start_date) + '-' + str(end_date),
        ))
        dump_folder.mkdir(parents=True, exist_ok=True)

        with tsl() as t:
            stock_list = t.get_stock_list()
            print('正在获取日频行情数据:')

            with tqdm(stock_list) as pbar:
                for stock_id in pbar:
                    dump_path = dump_folder / '{}.csv'.format(stock_id)
                    pbar.set_description(str(dump_path))

                    # Restarting the tsl client for every stock avoids large
                    # amounts of data accumulating inside the client, which
                    # can cause a segmentation fault. Under more stable
                    # server conditions the repeated reconnecting may be
                    # unnecessary, so the existing tsl object is reused here.
                    df1, _ = t.get_mkt_stock_k_daily(start_date, end_date, stock_id)
                    df2, _ = t.get_mkt_stock_k_daily_ext(start_date, end_date, stock_id)

                    if len(df1) == 0 or len(df2) == 0:
                        print('因数据缺失,跳过', stock_id)
                        # Wait a little longer when data was missing.
                        time.sleep(0.05)
                        continue

                    # Place the basic and extended columns side by side.
                    pd.concat([df1, df2], axis=1).to_csv(dump_path)
                    # Short pause between stocks.
                    time.sleep(0.005)

    def _dump_mkt_calendar(self, start_date, end_date):
        """Dump the trading days in the given range to ``calendar.csv``.

        Args:
            start_date: first date of the range as a YYYYMMDD integer.
            end_date: last date of the range as a YYYYMMDD integer.
        """
        with tsl() as t:
            date_list = t.get_mkt_trading_days(start_date, end_date)
            calendar = pd.Series(date_list, dtype=np.int64, name='date')

        dump_path = Path('{}/行情数据/{}/calendar.csv'.format(
            TINYSOFT_DATA_PATH,
            str(start_date) + '-' + str(end_date),
        ))
        # Create the target folder first, as the other dump methods do;
        # otherwise to_csv fails when the folder does not exist yet.
        dump_path.parent.mkdir(parents=True, exist_ok=True)
        calendar.to_csv(dump_path, index=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':
    # Script entry point: dump the daily K-line shards.
    # The class defined in this file is DLMarket; the previous call used the
    # name `Market`, which this file does not define.
    DLMarket().do_k_daily()
|
|
|
|