- Add CTA1DLoaderParquet class for loading from Parquet files (parallel to CTA1DLoader)
- Add data_ops_research/ requirements for CTA Parquet dataset export
- Fix broken HFFACTOR_COLS import by defining inline
- Update exports in __init__.py files

New files:
- cta_1d/src/loader_parquet.py: Parquet-based loader
- cta_1d/config_parquet.yaml: Parquet-specific config
- data_ops_research/cta_1d/requirements.yaml: Data export requirements
- data_ops_research/cta_1d/README.md: Dataset documentation

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
parent 19f7c522e4
commit 49c9dae181
cta_1d/config_parquet.yaml
@@ -0,0 +1,41 @@
# CTA 1D Configuration for Parquet Loading
# Use this instead of config.yaml when using CTA1DLoaderParquet

data:
  dt_range: ['2020-01-01', '2023-12-31']

  # Parquet paths (populated by data_ops_research)
  feature_path: /data/parquet/dataset/cta_alpha158_1d
  hffactor_path: /data/parquet/dataset/cta_hffactor_1d
  dom_path: /data/parquet/dataset/cta_dom_1d
  label_path: /data/parquet/dataset/cta_labels_1d

  feature_sets:
    - alpha158
    - hffactor
  normalization: dual
  blend_weights: equal

segments:
  train: ['2020-01-01', '2022-06-30']
  valid: ['2022-07-01', '2022-12-31']
  test: ['2023-01-01', '2023-12-31']

model:
  type: xgb
  params:
    objective: reg:squarederror
    eval_metric: rmse
    eta: 0.05
    max_depth: 6
    subsample: 0.8
    colsample_bytree: 0.8
    seed: 42
  num_boost_round: 500
  early_stopping_rounds: 50

training:
  return_type: o2c_twap1min
  weight_factors:
    positive: 1.0
    negative: 2.0
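A minimal sketch of wiring this config into the loader, assuming PyYAML and that the keys above map one-to-one onto `CTA1DLoaderParquet` constructor arguments (the import path is illustrative):

```python
import yaml

# Illustrative import path; adjust to where alpha_lab installs the module
from cta_1d.src.loader_parquet import CTA1DLoaderParquet

with open('cta_1d/config_parquet.yaml') as f:
    cfg = yaml.safe_load(f)

data_cfg = cfg['data']
loader = CTA1DLoaderParquet(
    return_type=cfg['training']['return_type'],
    normalization=data_cfg['normalization'],
    feature_sets=data_cfg['feature_sets'],
    blend_weights=data_cfg['blend_weights'],
    weight_factors=cfg['training']['weight_factors'],
    feature_path=data_cfg['feature_path'],
    hffactor_path=data_cfg['hffactor_path'],
    dom_path=data_cfg['dom_path'],
    label_path=data_cfg['label_path'],
)
dataset = loader.load(dt_range=data_cfg['dt_range'])
dataset = dataset.with_segments({k: tuple(v) for k, v in cfg['segments'].items()})
```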
cta_1d/src/loader_parquet.py
@@ -0,0 +1,427 @@
"""CTA 1D loader using Parquet/Polars (parallel to CTA1DLoader)."""

from dataclasses import dataclass
from datetime import date, datetime
from typing import List, Optional

import numpy as np
import pandas as pd
import polars as pl

from qshare.data import pl_Dataset
from qshare.io.polars import load_from_pq

from .labels import get_blend_weights

# HFFACTOR columns (defined inline to avoid qshare.config.research dependency)
HFFACTOR_COLS = [
    'vol_1min',
    'skew_1min',
    'volp_1min',
    'volp_ratio_1min',
    'voln_ratio_1min',
    'trend_strength_1min',
    'pv_corr_1min',
    'flowin_ratio_1min',
]


@dataclass
class CTA1DLoaderParquet:
    """CTA 1D loader using Parquet files instead of DolphinDB.

    This is a parallel implementation to CTA1DLoader that reads from
    pre-exported Parquet tables instead of querying DolphinDB directly.

    Example:
        >>> loader = CTA1DLoaderParquet(
        ...     return_type='o2c_twap1min',
        ...     normalization='dual',
        ...     feature_sets=['alpha158', 'hffactor']
        ... )
        >>> dataset = loader.load(dt_range=['2020-01-01', '2023-12-31'])
        >>> dataset = dataset.with_segments({
        ...     'train': ('2020-01-01', '2022-12-31'),
        ...     'test': ('2023-01-01', '2023-12-31')
        ... })
        >>> X_train, y_train, w_train = dataset.split('train').to_numpy()
    """

    return_type: str = 'o2c_twap1min'
    normalization: str = 'dual'
    feature_sets: Optional[List[str]] = None
    weight_factors: Optional[dict] = None
    blend_weights: str | List[float] | None = None

    # Parquet paths (populated by data_ops_research)
    feature_path: str = '/data/parquet/dataset/cta_alpha158_1d'
    hffactor_path: str = '/data/parquet/dataset/cta_hffactor_1d'
    dom_path: str = '/data/parquet/dataset/cta_dom_1d'
    label_path: str = '/data/parquet/dataset/cta_labels_1d'

    label_cap_upper: float = 0.5
    label_cap_lower: float = -0.5

    def __post_init__(self):
        if self.feature_sets is None:
            self.feature_sets = ['alpha158', 'hffactor']
        if self.weight_factors is None:
            self.weight_factors = {'positive': 1.0, 'negative': 2.0}

    def load(
        self,
        dt_range: List[str],
        fit_range: Optional[List[str]] = None
    ) -> pl_Dataset:
        """Load and prepare the CTA 1-day training dataset from Parquet files.

        Args:
            dt_range: Date range [start_date, end_date] for the dataset.
            fit_range: Date range [start, end] for fitting normalization params.
                If None, uses the first 60% of dt_range.

        Returns:
            pl_Dataset with features, label, and weight columns.
        """
        start_date, end_date = dt_range

        if fit_range is None:
            # Default: use first 60% for fit
            all_dates = pd.date_range(start_date, end_date)
            split_idx = int(len(all_dates) * 0.6)
            fit_range = [
                all_dates[0].strftime('%Y-%m-%d'),
                all_dates[split_idx].strftime('%Y-%m-%d')
            ]

        # Load extended history for rolling normalization
        load_start = (pd.to_datetime(start_date) - pd.Timedelta(days=120)).strftime('%Y-%m-%d')

        # Load features
        df_features = self._load_features(load_start, end_date)

        # Load and normalize labels
        df_label = self._load_labels(load_start, end_date, fit_range)

        # Combine
        df = df_features.join(df_label, on=['datetime', 'instrument'], how='inner')

        # Filter to requested date range
        start_dt = datetime.strptime(start_date, '%Y-%m-%d').date()
        end_dt = datetime.strptime(end_date, '%Y-%m-%d').date()
        df = df.filter(
            (pl.col('datetime') >= start_dt) &
            (pl.col('datetime') <= end_dt)
        )

        # Calculate weights
        df = self._calculate_weights(df)

        # Clean data
        df = self._clean_data(df)

        # Get feature columns
        feature_cols = [c for c in df.columns
                        if any(c.startswith(prefix) for prefix in ['f_a158_', 'f_hf_'])]

        return pl_Dataset(
            data=df,
            features=feature_cols,
            label='label',
            weight='weight' if self.weight_factors else None
        )

    def _load_features(self, start_date: str, end_date: str) -> pl.DataFrame:
        """Load feature data from Parquet files."""
        feature_dfs = []

        if 'alpha158' in self.feature_sets:
            df_alpha = self._load_alpha158(start_date, end_date)
            feature_dfs.append(df_alpha)

        if 'hffactor' in self.feature_sets:
            df_hf = self._load_hffactor(start_date, end_date)
            feature_dfs.append(df_hf)

        if not feature_dfs:
            # Guard: avoid IndexError below when no known feature set was requested
            raise ValueError(f'No recognized feature sets in: {self.feature_sets}')

        # Join all feature sets
        result = feature_dfs[0]
        for df in feature_dfs[1:]:
            result = result.join(df, on=['datetime', 'instrument'], how='inner')

        return result

    def _load_alpha158(self, start_date: str, end_date: str) -> pl.DataFrame:
        """Load alpha158 features from Parquet."""
        # Load using qshare's parquet loader
        df = load_from_pq(
            self.feature_path,
            start_date=start_date,
            end_date=end_date
        )

        # Drop non-numeric columns if present
        if 'code_init' in df.columns:
            df = df.drop('code_init')

        # Rename columns with prefix
        rename_map = {
            c: f'f_a158_{c}'
            for c in df.columns
            if c not in ['datetime', 'instrument']
        }
        df = df.rename(rename_map)

        return df

    def _load_hffactor(self, start_date: str, end_date: str) -> pl.DataFrame:
        """Load hffactor features from Parquet (already pivoted)."""
        # Load using qshare's parquet loader
        df = load_from_pq(
            self.hffactor_path,
            start_date=start_date,
            end_date=end_date
        )

        # Rename columns with prefix
        rename_map = {
            c: f'f_hf_{c}'
            for c in df.columns
            if c not in ['datetime', 'instrument']
        }
        df = df.rename(rename_map)

        return df

    def _load_labels(
        self,
        start_date: str,
        end_date: str,
        fit_range: List[str]
    ) -> pl.DataFrame:
        """Load and normalize labels from Parquet files."""
        # Map return type to indicator name
        indicator_map = {
            'o2c_twap1min': 'twap_open1m@1_twap_close1m@1',
            'o2o_twap1min': 'twap_open1m@1_twap_open1m@2',
        }
        indicator = indicator_map.get(self.return_type, self.return_type)

        # Load dominant contract mapping from Parquet
        df_contract = load_from_pq(
            self.dom_path,
            start_date=start_date,
            end_date=end_date
        )

        # Load returns from Parquet
        df_return = load_from_pq(
            self.label_path,
            start_date=start_date,
            end_date=end_date
        )

        # Filter for the specific indicator
        df_return = df_return.filter(pl.col('indicator') == indicator)

        # Merge with dominant contract mapping
        df_return = df_return.join(
            df_contract,
            on=['datetime', 'instrument'],
            how='inner'
        )

        # Use code_init as the instrument (continuous contract)
        df_return = df_return.with_columns(
            (pl.col('code_init') + 'Ind').alias('instrument')
        )

        # Select and rename return column
        df_return = df_return.select([
            'datetime',
            'instrument',
            pl.col('value').alias('ret')
        ])

        # Apply normalization
        df_return = self._normalize_label(df_return, fit_range)

        return df_return

    def _normalize_label(self, pl_df: pl.DataFrame, fit_range: List[str]) -> pl.DataFrame:
        """Apply specified normalization to label."""
        fit_start, fit_end = fit_range

        # Convert fit_range strings to date objects for comparison
        fit_start_date = datetime.strptime(fit_start, '%Y-%m-%d').date()
        fit_end_date = datetime.strptime(fit_end, '%Y-%m-%d').date()

        if self.normalization == 'zscore':
            # Calculate mean/std on fit range
            fit_data = pl_df.filter(
                (pl.col('datetime') >= fit_start_date) &
                (pl.col('datetime') <= fit_end_date)
            )
            mean = fit_data['ret'].mean()
            std = fit_data['ret'].std()

            result = pl_df.with_columns(
                ((pl.col('ret') - mean) / std).clip(
                    self.label_cap_lower, self.label_cap_upper
                ).alias('label')
            ).select(['datetime', 'instrument', 'label'])
            return result

        elif self.normalization == 'cs_zscore':
            # Cross-sectional z-score per datetime
            return pl_df.with_columns(
                ((pl.col('ret') - pl.col('ret').mean().over('datetime')) /
                 pl.col('ret').std().over('datetime')).clip(
                    self.label_cap_lower, self.label_cap_upper
                ).alias('label')
            ).select(['datetime', 'instrument', 'label'])

        elif self.normalization == 'rolling_20':
            return self._apply_rolling_norm(pl_df, window=20, fit_range=fit_range)

        elif self.normalization == 'rolling_60':
            return self._apply_rolling_norm(pl_df, window=60, fit_range=fit_range)

        elif self.normalization == 'dual':
            # Create all normalization variants
            label_zscore = self._normalize_zscore(pl_df, fit_range)
            label_cszscore = self._normalize_cs_zscore(pl_df)
            label_roll20 = self._normalize_rolling(pl_df, window=20, fit_range=fit_range)
            label_roll60 = self._normalize_rolling(pl_df, window=60, fit_range=fit_range)

            # Get blend weights
            weights = get_blend_weights(self.blend_weights)

            # Join and blend
            pl_df = label_zscore.join(label_cszscore, on=['datetime', 'instrument'])
            pl_df = pl_df.join(label_roll20, on=['datetime', 'instrument'])
            pl_df = pl_df.join(label_roll60, on=['datetime', 'instrument'])

            return pl_df.with_columns(
                (weights[0] * pl.col('label_zscore') +
                 weights[1] * pl.col('label_cszscore') +
                 weights[2] * pl.col('label_roll20') +
                 weights[3] * pl.col('label_roll60')).clip(
                    self.label_cap_lower, self.label_cap_upper
                ).alias('label')
            ).select(['datetime', 'instrument', 'label'])

        else:
            raise ValueError(f"Unknown normalization: {self.normalization}")

    def _normalize_zscore(self, pl_df: pl.DataFrame, fit_range: List[str]) -> pl.DataFrame:
        """Create z-score normalized label."""
        fit_start, fit_end = fit_range

        fit_start_date = datetime.strptime(fit_start, '%Y-%m-%d').date()
        fit_end_date = datetime.strptime(fit_end, '%Y-%m-%d').date()

        fit_data = pl_df.filter(
            (pl.col('datetime') >= fit_start_date) &
            (pl.col('datetime') <= fit_end_date)
        )

        mean = fit_data['ret'].mean()
        std = fit_data['ret'].std()

        return pl_df.with_columns(
            ((pl.col('ret') - mean) / std).alias('label_zscore')
        ).select(['datetime', 'instrument', 'label_zscore'])

    def _normalize_cs_zscore(self, pl_df: pl.DataFrame) -> pl.DataFrame:
        """Create cross-sectional z-score normalized label."""
        return pl_df.with_columns(
            ((pl.col('ret') - pl.col('ret').mean().over('datetime')) /
             pl.col('ret').std().over('datetime')).alias('label_cszscore')
        ).select(['datetime', 'instrument', 'label_cszscore'])

    def _normalize_rolling(
        self,
        pl_df: pl.DataFrame,
        window: int,
        fit_range: List[str]
    ) -> pl.DataFrame:
        """Create rolling window normalized label."""
        # Convert to pandas for rolling calculation
        pd_df = pl_df.to_pandas().set_index(['datetime', 'instrument'])

        # Unstack to wide format
        df_wide = pd_df['ret'].unstack('instrument')

        # Calculate rolling mean and std
        rolling_mean = df_wide.rolling(window=window, min_periods=window // 2).mean()
        rolling_std = df_wide.rolling(window=window, min_periods=window // 2).std()

        # Normalize
        df_normalized = (df_wide - rolling_mean) / rolling_std

        # Restack
        rolling_label = df_normalized.stack().reset_index()
        rolling_label.columns = ['datetime', 'instrument', f'label_roll{window}']

        return pl.from_pandas(rolling_label)

    def _apply_rolling_norm(
        self,
        pl_df: pl.DataFrame,
        window: int,
        fit_range: List[str]
    ) -> pl.DataFrame:
        """Apply rolling normalization and cap."""
        result = self._normalize_rolling(pl_df, window, fit_range)
        return result.with_columns(
            pl.col(f'label_roll{window}').clip(
                self.label_cap_lower, self.label_cap_upper
            ).alias('label')
        ).select(['datetime', 'instrument', 'label'])

    def _calculate_weights(self, pl_df: pl.DataFrame) -> pl.DataFrame:
        """Calculate sample weights based on return magnitude."""
        # Base weights by return magnitude tiers
        pl_df = pl_df.with_columns(
            pl.when(pl.col('label').abs() > 1.5).then(pl.lit(2.5))
            .when(pl.col('label').abs() > 1.0).then(pl.lit(2.0))
            .when(pl.col('label').abs() > 0.5).then(pl.lit(1.5))
            .when(pl.col('label').abs() > 0.2).then(pl.lit(1.0))
            .otherwise(0.0).alias('weight')
        )

        # Apply negative return multiplier
        if self.weight_factors.get('negative'):
            pl_df = pl_df.with_columns(
                pl.when(pl.col('label') < -0.5)
                .then(pl.col('weight') * self.weight_factors['negative'])
                .otherwise(pl.col('weight'))
                .alias('weight')
            )

        # Apply positive return multiplier
        if self.weight_factors.get('positive'):
            pl_df = pl_df.with_columns(
                pl.when(pl.col('label') > 0.5)
                .then(pl.col('weight') * self.weight_factors['positive'])
                .otherwise(pl.col('weight'))
                .alias('weight')
            )

        return pl_df

    def _clean_data(self, pl_df: pl.DataFrame) -> pl.DataFrame:
        """Clean data: remove inf/nan values."""
        # Only float columns can hold inf; polars raises on is_infinite()
        # for integer dtypes, so restrict the replacement to float columns
        float_cols = [
            c for c in pl_df.columns
            if pl_df[c].dtype in [pl.Float32, pl.Float64]
        ]

        # Replace inf with null, then drop nulls
        pl_df = pl_df.with_columns([
            pl.when(pl.col(c).is_infinite()).then(None).otherwise(pl.col(c)).alias(c)
            for c in float_cols
        ])

        return pl_df.drop_nulls()
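To make the weighting tiers in `_calculate_weights` concrete, here is a toy run of the same expressions outside the class (a sketch; only polars is required, and the negative multiplier 2.0 is the class default):

```python
import polars as pl

labels = pl.DataFrame({'label': [-1.6, -0.6, -0.3, 0.1, 0.3, 0.7, 1.2]})

# Same tiering as _calculate_weights: larger |label| -> larger base weight
weighted = labels.with_columns(
    pl.when(pl.col('label').abs() > 1.5).then(pl.lit(2.5))
    .when(pl.col('label').abs() > 1.0).then(pl.lit(2.0))
    .when(pl.col('label').abs() > 0.5).then(pl.lit(1.5))
    .when(pl.col('label').abs() > 0.2).then(pl.lit(1.0))
    .otherwise(0.0).alias('weight')
).with_columns(
    # Negative-tail multiplier (weight_factors['negative'] == 2.0 by default)
    pl.when(pl.col('label') < -0.5)
    .then(pl.col('weight') * 2.0)
    .otherwise(pl.col('weight'))
    .alias('weight')
)
# label=-1.6 -> 2.5 * 2.0 = 5.0; label=0.3 -> 1.0; label=0.1 -> 0.0
print(weighted)
```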
data_ops_research/cta_1d/README.md
@@ -0,0 +1,44 @@
# CTA 1D Parquet Dataset

This directory contains requirements for CTA (Commodity Trading Advisor) futures
Parquet datasets used by alpha_lab.

## Tables

### cta_alpha158_1d
Alpha158 features for CTA futures.
- **Source**: `dfs://daily_stock_run.stg_1day_tinysoft_cta_alpha159_0_7_beta`
- **Output**: `/data/parquet/dataset/cta_alpha158_1d/`
- **Columns**: ~163 feature columns + code, m_nDate

### cta_hffactor_1d
High-frequency factor features (8 columns).
- **Source**: `dfs://daily_stock_run.stg_1day_tinysoft_cta_hffactor`
- **Output**: `/data/parquet/dataset/cta_hffactor_1d/`
- **Transformation**: Pivot from long to wide format (see the sketch below)
  - Input columns: code, m_nDate, factor_name, value
  - Output columns: code, m_nDate, vol_1min, skew_1min, ... (8 features)
- **Filter**: Only include factor_name in [vol_1min, skew_1min, volp_1min,
  volp_ratio_1min, voln_ratio_1min, trend_strength_1min, pv_corr_1min,
  flowin_ratio_1min]
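The pivot can be sketched in Polars on a toy frame as follows (illustrative; the actual data_ops_research export job may implement it differently):

```python
import polars as pl

HFFACTOR_COLS = ['vol_1min', 'skew_1min', 'volp_1min', 'volp_ratio_1min',
                 'voln_ratio_1min', 'trend_strength_1min', 'pv_corr_1min',
                 'flowin_ratio_1min']

# Toy long-format input: one row per (code, m_nDate, factor_name)
long_df = pl.DataFrame({
    'code': ['rb2305', 'rb2305', 'cu2302'],
    'm_nDate': ['2023-01-03', '2023-01-03', '2023-01-03'],
    'factor_name': ['vol_1min', 'skew_1min', 'vol_1min'],
    'value': [0.12, -0.30, 0.08],
})

wide_df = (
    long_df
    .filter(pl.col('factor_name').is_in(HFFACTOR_COLS))  # keep only the 8 factors
    .pivot(on='factor_name', index=['code', 'm_nDate'], values='value')  # polars >= 1.0 API
)
```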

### cta_dom_1d
Dominant contract mapping for continuous contracts.
- **Source**: `dfs://daily_stock_run.dwm_1day_cta_dom`
- **Output**: `/data/parquet/dataset/cta_dom_1d/`
- **Filter**: version = 'vp_csmax_roll2_cummax'
- **Aggregation**: GROUP BY m_nDate, code_init; SELECT first(code) as code (see the sketch below)
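The same filter-and-aggregate step, sketched in Polars on a toy frame (the real export runs inside data_ops_research and may use DolphinDB SQL instead):

```python
import polars as pl

# Toy input mirroring dwm_1day_cta_dom (the real table has more columns)
dom = pl.DataFrame({
    'm_nDate': ['2023-01-03'] * 3,
    'code_init': ['RB', 'RB', 'CU'],
    'code': ['rb2305', 'rb2305', 'cu2302'],
    'version': ['vp_csmax_roll2_cummax', 'other', 'vp_csmax_roll2_cummax'],
})

dom_1d = (
    dom
    .filter(pl.col('version') == 'vp_csmax_roll2_cummax')  # keep one roll version
    .group_by(['m_nDate', 'code_init'])
    .agg(pl.col('code').first().alias('code'))  # first(code) as code
)
```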

### cta_labels_1d
Return labels for different return types.
- **Source**: `dfs://daily_stock_run.stg_1day_tinysoft_cta_hfvalue`
- **Output**: `/data/parquet/dataset/cta_labels_1d/`
- **Filter**: indicator in [twap_open1m@1_twap_close1m@1, twap_open1m@1_twap_open1m@2]
- **Columns**: code, m_nDate, indicator, value

## Consumer

Used by: `alpha_lab/cta_1d/src/loader_parquet.py`

The alpha_lab project provides a parallel loader (`CTA1DLoaderParquet`) that reads
from these Parquet tables instead of DolphinDB.
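For reference, consumer-side usage as given in the loader's docstring (the import path is illustrative):

```python
# Illustrative import path; adjust to where alpha_lab installs the module
from cta_1d.src.loader_parquet import CTA1DLoaderParquet

loader = CTA1DLoaderParquet(
    return_type='o2c_twap1min',
    normalization='dual',
    feature_sets=['alpha158', 'hffactor'],
)
dataset = loader.load(dt_range=['2020-01-01', '2023-12-31'])
dataset = dataset.with_segments({
    'train': ('2020-01-01', '2022-12-31'),
    'test': ('2023-01-01', '2023-12-31'),
})
X_train, y_train, w_train = dataset.split('train').to_numpy()
```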
data_ops_research/cta_1d/requirements.yaml
@@ -0,0 +1,88 @@
# CTA 1D Parquet Dataset Requirements
# This file specifies the required Parquet tables for the alpha_lab CTA 1D task

# Table 1: Alpha158 Features
cta_alpha158_1d:
  source:
    database: dfs://daily_stock_run
    table: stg_1day_tinysoft_cta_alpha159_0_7_beta
    host: 192.168.1.146
    port: 8848
  target:
    path: cta_alpha158_1d/
    partition_freq: 1D
    col_datetime: m_nDate
    code_format: tscode
  description: Alpha158 features for CTA futures (~163 columns)
  priority: medium

# Table 2: HFFactor Features (requires pivot)
cta_hffactor_1d:
  source:
    database: dfs://daily_stock_run
    table: stg_1day_tinysoft_cta_hffactor
    host: 192.168.1.146
    port: 8848
    # Long format: code, m_nDate, factor_name, value
    # Pivot to wide format during export
    pivot:
      index: [code, m_nDate]
      columns: factor_name
      values: value
      filter:  # Only these 8 columns needed
        - vol_1min
        - skew_1min
        - volp_1min
        - volp_ratio_1min
        - voln_ratio_1min
        - trend_strength_1min
        - pv_corr_1min
        - flowin_ratio_1min
  target:
    path: cta_hffactor_1d/
    partition_freq: 1D
    col_datetime: m_nDate
    code_format: tscode
  description: High-frequency factor features (8 columns, pivoted from long format)
  priority: medium
  notes: Requires pivot transformation from long to wide format

# Table 3: Dominant Contract Mapping
cta_dom_1d:
  source:
    database: dfs://daily_stock_run
    table: dwm_1day_cta_dom
    host: 192.168.1.146
    port: 8848
    # Group and aggregate during export
    group_by: [m_nDate, code_init]
    filter: "version='vp_csmax_roll2_cummax'"
    agg: "first(code) as code"
  target:
    path: cta_dom_1d/
    partition_freq: 1D
    col_datetime: m_nDate
    code_format: tscode
  description: Dominant contract mapping for continuous contracts
  priority: medium
  notes: Requires group_by + aggregation, filter by version

# Table 4: Return Labels
cta_labels_1d:
  source:
    database: dfs://daily_stock_run
    table: stg_1day_tinysoft_cta_hfvalue
    host: 192.168.1.146
    port: 8848
    # Filter for specific indicators
    indicators:
      - twap_open1m@1_twap_close1m@1  # o2c_twap1min
      - twap_open1m@1_twap_open1m@2   # o2o_twap1min
  target:
    path: cta_labels_1d/
    partition_freq: 1D
    col_datetime: m_nDate
    code_format: tscode
  description: Return labels for different return types
  priority: medium
  notes: Filter indicator column for specific return types
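A minimal sketch of consuming this spec programmatically, assuming PyYAML and the field names defined above (the export tooling in data_ops_research is not shown here):

```python
import yaml

with open('data_ops_research/cta_1d/requirements.yaml') as f:
    tables = yaml.safe_load(f)  # dict keyed by table name

for name, spec in tables.items():
    src = spec['source']
    tgt = spec['target']
    print(f"{name}: {src['database']}.{src['table']} -> {tgt['path']} "
          f"(partition={tgt['partition_freq']}, col_datetime={tgt['col_datetime']})")
```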