from pprint import pprint
from tqdm import tqdm
import functools

import dolphindb as ddb

from DDBLoader import DDBLoader


def load_ddb_table(hft_tbl_name):
    """
    Syntactic sugar that loads a partitioned table before the decorated method runs.
    There do not seem to be many call sites that need this pre-loading step, so the
    simplification it brings is limited.
    """
    def decorator(func):
        @functools.wraps(func)
        def wrapper(self, *args, **kwargs):
            self.ddb_sess.run("""
                // Load the raw data used by the computation: minute-bar data
                tbl = loadTable("{hft_ddb_path}", "{hft_tbl_name}");
            """.format(
                hft_ddb_path = DDBLoader.ddb_path,
                hft_tbl_name = hft_tbl_name,
            ))
            print('Did load', hft_tbl_name)
            return func(self, *args, **kwargs)
        return wrapper
    return decorator


class DailyFactor(object):

    #ddb_hft_path = "dfs://hft_stock_ts"
    #ddb_hft_dbname = "db_hft_stock"

    ddb_daily_path = "dfs://daily_stock_ts"
    ddb_daily_dbname = "db_daily_stock"

    ddb_config = {
        'host' : '192.168.1.167',
        'username' : 'admin',
        'password' : '123456',
    }

    # The number of partitions here does not have to match the HFT table's.
    # When reading the HFT table, use `num_code_partition` from DDBLoader, not this field.
    num_code_partition = 50

    def __init__(self):
        self.ddb_sess = ddb.session(self.ddb_config['host'], 8848)
        self.ddb_sess.login(self.ddb_config['username'], self.ddb_config['password'])

    def create_ddb_database(self):
        """
        Because the daily-frequency data volume is small and `m_nDate` can serve as the
        sort key within each partition, the database only needs a [HASH, 50] partition on
        the stock code, so no calendar data is required.
        """
        self.ddb_sess.run("""
            {dbName} = database(
                "{dbPath}",
                HASH, [SYMBOL, {num_code_partition}],
                engine = 'TSDB'
            )
        """.format(
            dbName = self.ddb_daily_dbname,
            dbPath = self.ddb_daily_path,
            num_code_partition = self.num_code_partition
        ))
        print('Did create database')

    def load_ddb_database(self):
        self.ddb_sess.run("""
            {dbName} = database(
                directory = '{dbPath}',
                partitionType = HASH,
                partitionScheme = [SYMBOL, {num_code_partition}],
                engine = 'TSDB'
            )
        """.format(
            dbName = self.ddb_daily_dbname,
            dbPath = self.ddb_daily_path,
            num_code_partition = self.num_code_partition
        ))
        print('Did load database.')

    def append_factor_columns(self, factor_name_list, memory_tbl_name, partition_tbl_name):
        # Add one DOUBLE column per factor to the partitioned table.
        code = """
            addColumn({partition_tbl_name}, {col_name_list}, {col_type_list});
        """.format(
            partition_tbl_name = partition_tbl_name,
            col_name_list = '`' + '`'.join(factor_name_list),
            col_type_list = '[' + ','.join(['DOUBLE'] * len(factor_name_list)) + ']'
        )
        print('Will add columns via script:')
        print(code)
        self.ddb_sess.run(code)

        code = """
            {partition_tbl_name}.tableInsert({memory_tbl_name})
        """.format(
            partition_tbl_name = partition_tbl_name,
            memory_tbl_name = memory_tbl_name
        )
        print('Will append data via script:')
        print(code)
        self.ddb_sess.run(code)

    def append_to_partition_table(self, partition_tbl_name, memory_tbl_name):
        self.ddb_sess.run("""
            {partition_tbl_name}.tableInsert({memory_tbl_name})
        """.format(
            partition_tbl_name = partition_tbl_name,
            memory_tbl_name = memory_tbl_name
        ))

    def create_factor_partition_table(self, partition_tbl_name, memory_tbl_name):
        """
        Create the partitioned factor table (`partition_tbl_name`) in the daily database,
        using the in-memory table (`memory_tbl_name`) as the schema template.
        """
        # createPartitionedTable(
        #     dbHandle, table, tableName,
        #     [partitionColumns], [compressMethods],
        #     [sortColumns], [keepDuplicates=ALL], [sortKeyMappingFunction])
        code = """
            // Make sure the new partitioned table does not conflict with an existing one
            if (existsTable("{ddb_daily_path}", "{partition_tbl_name}")) {{
                dropTable({ddb_daily_dbname}, "{partition_tbl_name}");
            }}

            {partition_tbl_name} = createPartitionedTable(
                dbHandle = {ddb_daily_dbname},
                table = {memory_tbl_name},
                tableName = "{partition_tbl_name}",
                partitionColumns = 'code',
                compressMethods = {{'m_nDate' : 'delta'}},
                sortColumns = `code`m_nDate
            );
        """.format(
            ddb_daily_path = self.ddb_daily_path,
            ddb_daily_dbname = self.ddb_daily_dbname,
            partition_tbl_name = partition_tbl_name,
            memory_tbl_name = memory_tbl_name,
        )
        print('Will create partitioned factor table via script:')
        print(code)
        self.ddb_sess.run(code)

    @load_ddb_table("KLinePartitioned")
    def make_kurto_memory_table(self):
        memory_table_name = "kurto"

        # NOTE: `\\` below escapes DolphinDB's ratio operator `\` inside the Python string.
        code_tpl = """
            // First build the minute-bar return table.
            // `context by` keeps the computed result as a sequence instead of an aggregate.
            // `where partition()` loads the source partitions one at a time.
            ret_sql = select code, m_nDate, eachPre(\\, m_nClose)-1.0 as ret from tbl where partition(code, {partition_id}) context by m_nDate;

            // Compute the kurto factor; the first record of each day in `ret_sql` is null,
            // which does not appear to cause any problems.
            kurto_sql = select code, m_nDate, sqrt(239) * sum(pow(ret, 3)) / pow(sum(pow(ret, 2)), 1.5) as kurto from ret_sql group by code, m_nDate;
        """

        with tqdm(range(DDBLoader.num_code_partition)) as pbar:
        #with tqdm(range(1)) as pbar:
            for partition_id in pbar:
                self.ddb_sess.run(code_tpl.format(
                    partition_id = partition_id,
                ))

                # Because the source table has 50 partitions that are processed one by one,
                # first create an in-memory staging table; otherwise, once the first partition
                # had been inserted into the partitioned table, later batches containing only
                # a subset of the fields could no longer be inserted.
                if partition_id == 0:
                    self.ddb_sess.run("""
                        {memory_table_name} = table(kurto_sql)
                    """.format(
                        memory_table_name = memory_table_name
                    ))

                # The `table` statement above only sets up the table structure;
                # `tableInsert` then inserts the actual data.
                self.ddb_sess.run("""
                    {memory_table_name}.tableInsert(kurto_sql)
                """.format(
                    memory_table_name = memory_table_name
                ))

        print('Did finish all partitions for kurto.')
        pprint(self.ddb_sess.run(f"{memory_table_name}"))
        return memory_table_name


def main():
    factor = DailyFactor()
    factor.load_ddb_database()

    memory_table_name = factor.make_kurto_memory_table()
    factor.create_factor_partition_table('hft_daily_factor', memory_table_name)
    factor.append_to_partition_table('hft_daily_factor', memory_table_name)


if __name__ == '__main__':
    main()
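
# ---------------------------------------------------------------------------
# Sketch of the interface this script assumes from the external `DDBLoader`
# module (the real class lives in DDBLoader.py and is not shown here). The
# attribute values are illustrative, taken from the comments above, and are
# not a definitive copy of the actual implementation.
#
#   class DDBLoader:
#       # DFS path of the partitioned HFT (minute-bar) database that the
#       # `load_ddb_table` decorator reads from.
#       ddb_path = "dfs://hft_stock_ts"
#       # Number of HASH partitions on `code` in the HFT table; the loop in
#       # `make_kurto_memory_table` iterates over exactly this many partitions.
#       num_code_partition = 50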