import os, time
from tqdm import tqdm
import itertools
from pathlib import Path
from collections import namedtuple
import numpy as np
import pandas as pd
from tsl import *
from config import *
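
# `DateRange` is expected to come from the `tsl`/`config` star imports. As a
# hedged fallback (an assumption, not confirmed by the original module), define
# it as a namedtuple only when those imports did not already provide it.
if 'DateRange' not in globals():
    DateRange = namedtuple('DateRange', ['start_date', 'end_date'])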


class DLMarket:

    k_daily_data_shards = [
        DateRange(20000101, 20091231),
        DateRange(20100101, 20191231),
        DateRange(20200101, 20220630),
    ]

    def __init__(self):
        pass

    def do_k_daily(self):
        for date_range in self.k_daily_data_shards:
            # DateRange behaves like a (start_date, end_date) pair, so unpack
            # it positionally; string subscripts would fail on a namedtuple.
            start_date, end_date = date_range
            self._dump_mkt_k_daily(start_date, end_date)

    def do_k_1min(self):
        start_date, end_date = 20210701, 20211231
        self._dump_mkt_k_1min(start_date, end_date)
    def _dump_mkt_k_1min(self, start_date, end_date):
        """
        For some reason, 1-min K-line data cannot be fetched for a whole date
        range at once: the program is prone to an address boundary error,
        which appears to be an issue with the tinysoft API.
        Thus we have to run the query day by day.
        It may, however, be possible to query the full range of stocks for a
        single day, which is about 1M rows/day (depending on the number of
        stocks listed that day).
        """
        print('Preparing to build 1-min K-line data', start_date, end_date)
        with tsl() as t:
            date_list = t.get_mkt_trading_days(start_date, end_date)
        with tqdm(date_list) as pbar:
            for date in pbar:
                pbar.set_description('Fetching 1-min K-line data for ' + str(date))
                # dump folder: <TINYSOFT_DATA_PATH>/market data/minute K-line/<date>/
                dump_folder = '{}/行情数据/分钟K线/{}/'.format(
                    TINYSOFT_DATA_PATH,
                    date,
                )
                # ensure the folder for the shard exists
                Path(dump_folder).mkdir(parents=True, exist_ok=True)
                # open a fresh tsl client for every day; see the comment below
                with tsl() as t:
                    stock_list = t.get_stock_list(date)
                    r = t.get_mkt_stock_k_1min(date, stock_list)
                    pbar.set_description('Data fetched; building DataFrame for ' + str(date))
                    df = pd.DataFrame(r.value())
                    pbar.set_description('DataFrame built; writing shards to disk for ' + str(date))
                    if len(df.index) == 0:
                        print('Skipped due to missing data:', date)
                        print(r.message)
                        time.sleep(0.1)
                        continue
                    for stock_id, df_stock in df.groupby('StockID'):
                        df_stock.set_index(['StockID', 'time'], inplace=True)
                        df_stock.to_csv('{}/{}.csv'.format(dump_folder, stock_id))
                        # avoid massive concurrent disk writes
                        time.sleep(0.001)
                    del df
                # Since the tsl client is re-connected for every day, wait a
                # bit longer after each iteration, especially when the fetched
                # data turned out to be missing.
                time.sleep(0.1)
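
    # The docstring above motivates a day-by-day, reconnect-per-query pattern.
    # Below is a minimal sketch of that pattern as a reusable helper;
    # `_query_with_retry` is a hypothetical addition, and the assumption that
    # tinysoft failures surface as Python exceptions is ours, not documented
    # API behavior. Usage sketch:
    #   stock_list = DLMarket._query_with_retry(lambda t: t.get_stock_list(date))
    @staticmethod
    def _query_with_retry(query, max_retries=3, backoff=0.5):
        for attempt in range(max_retries):
            try:
                # open a fresh client per attempt, mirroring the loop above
                with tsl() as t:
                    return query(t)
            except Exception:
                if attempt == max_retries - 1:
                    raise
                # back off a little longer after each failed attempt
                time.sleep(backoff * (attempt + 1))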

    def _dump_mkt_k_daily(self, start_date, end_date):
        # dump folder: <TINYSOFT_DATA_PATH>/market data/daily K-line/shards/<range>/
        dump_folder = '{}/行情数据/日K线/shards/{}/'.format(
            TINYSOFT_DATA_PATH,
            str(start_date) + '-' + str(end_date),
        )
        Path(dump_folder).mkdir(parents=True, exist_ok=True)
        with tsl() as t:
            stock_list = t.get_stock_list()
            print('Fetching daily K-line data:')
            with tqdm(stock_list) as pbar:
                for stock_id in pbar:
                    dump_path = '{}/{}.csv'.format(dump_folder, stock_id)
                    pbar.set_description(dump_path)
                    # Restarting the tsl client for every query keeps large
                    # result sets from accumulating inside it, which causes
                    # segfaults. On a more stable server the reconnects may be
                    # unnecessary, so here we try reusing the outer tsl object
                    # directly.
                    # with tsl() as t:
                    df1, r1 = t.get_mkt_stock_k_daily(start_date, end_date, stock_id)
                    df2, r2 = t.get_mkt_stock_k_daily_ext(start_date, end_date, stock_id)
                    if len(df1) == 0 or len(df2) == 0:
                        print('Skipped due to missing data:', stock_id)
                        # When the tsl client is re-connected per query, each
                        # iteration needs a longer wait, especially after a
                        # fetch with missing data.
                        time.sleep(0.05)
                        continue
                    # the two result frames share an index, so join column-wise
                    df = pd.concat([df1, df2], axis=1)
                    df.to_csv(dump_path)
                    del df
                    time.sleep(0.005)
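
    # Resulting on-disk layout (sketch; the root depends on TINYSOFT_DATA_PATH,
    # and the Chinese folder names mean "market data" / "daily K-line" /
    # "minute K-line"):
    #   {TINYSOFT_DATA_PATH}/行情数据/日K线/shards/{start}-{end}/{stock_id}.csv
    #   {TINYSOFT_DATA_PATH}/行情数据/分钟K线/{date}/{stock_id}.csv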

    def _dump_mkt_calendar(self, start_date, end_date):

        def _dump_df_to_csv(series):
            dump_path = '{}/行情数据/{}/calendar.csv'.format(
                TINYSOFT_DATA_PATH,
                str(start_date) + '-' + str(end_date),
            )
            # make sure the target folder exists before writing
            Path(dump_path).parent.mkdir(parents=True, exist_ok=True)
            series.to_csv(dump_path, index=False)

        with tsl() as t:
            date_list = t.get_mkt_trading_days(start_date, end_date)
        s = pd.Series(date_list, dtype=np.int64, name='date')
        _dump_df_to_csv(s)


if __name__ == '__main__':
    # DLMarket().do_k_1min()
    # do_fin_report()
    DLMarket().do_k_daily()