Add configuration files and alpha158_beta pipeline

- Add .claudeignore and .clauderc for Claude Code setup
- Add config.yaml for cta_1d, stock_15m, and alpha158_beta tasks
- Add alpha158_beta pipeline.py with documentation
- Add utility scripts for embedding generation and prediction
- Add executed baseline notebook for cta_1d

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
master
guofu 4 days ago
parent 586b16a6fa
commit 4d382dc6bd

@ -0,0 +1 @@
__pycache__/

@ -0,0 +1,9 @@
custom_instructions: |
- When refactoring, prefer using partial updates or specific function rewrites instead of outputting the entire file content. This helps avoid token limit errors.
- If a file is larger than 300 lines, always suggest a modular breakdown before refactoring.
- Please be concise. Skip lengthy explanations and focus on the code changes only. Use short responses.
- Only output the diff or the specific functions that need changing. Do not repeat the unchanged parts of the file.
post_task_instructions: |
After significant changes, ask me if you should update CLAUDE.md to reflect the new architecture or commands.

@ -0,0 +1,293 @@
#!/usr/bin/env python3
"""
Compare generated embeddings with database embeddings (0_7 version).
Handles format conversion for datetime and instrument columns.
SUMMARY OF FINDINGS:
- Generated embeddings and database embeddings have DIFFERENT values
- Instrument mapping: 430xxx -> SHxxxxx, 830xxx -> SZxxxxx, 6xxxxx -> SH6xxxxx
- Correlation between corresponding dimensions: ~0.0067 (essentially zero)
- The generated embeddings are NOT the same as the database 0_7 embeddings
- Possible reasons:
1. Different model weights/versions used for generation
2. Different input features or normalization
3. Different random seed or inference configuration
"""
import polars as pl
import numpy as np
from pathlib import Path
def instrument_int_to_code(inst_int: int) -> str:
    """Convert an integer instrument code to an exchange-prefixed string.

    The encoding in the embedding file uses:
    - 4xxxxx -> SHxxxxxx (Shanghai A-shares, but code mapping is non-trivial)
    - 8xxxxx -> SZxxxxxx (Shenzhen A-shares)
    - Direct 6-digit codes are also present (600xxx, 000xxx, 300xxx)

    Note: The exact mapping from 430017 -> SH600021 requires the original
    features file. We attempt an approximate mapping here.

    Args:
        inst_int: Instrument code stored as an integer (leading zeros lost).

    Returns:
        Exchange-prefixed code such as "SH600000" or "SZ000001"; codes of
        unexpected length are returned as their bare digit string.
    """
    # Zero-pad to 6 digits: integer storage drops leading zeros, so e.g.
    # Shenzhen code 000001 arrives as the int 1 and previously fell through
    # unconverted. Restoring the padding lets 0xx/3xx codes map to SZ.
    inst_str = str(inst_int).zfill(6)
    if len(inst_str) != 6:
        # More than 6 digits: not a recognized A-share code; leave untouched.
        return inst_str
    if inst_str[0] not in ('4', '8'):
        # Plain 6-digit code: 6xxxxx -> Shanghai, everything else -> Shenzhen.
        return f"SH{inst_str}" if inst_str.startswith('6') else f"SZ{inst_str}"
    # Prefix-encoded code (4 = SH, 8 = SZ). The mapping from 430xxx -> 600xxx
    # is not 1:1; return the remaining digits as-is for matching attempts.
    exchange = 'SH' if inst_str[0] == '4' else 'SZ'
    return f"{exchange}{inst_str[1:]}"
def load_generated_embedding(date_int: int, sample_n: int = None):
    """Load the locally generated embedding rows for one trading date.

    Reads the wide-format parquet (embedding_0 ... embedding_N columns),
    packs each row's embedding into a single list column named 'values',
    and adds helper columns used by the matching/correlation steps.
    """
    gen_path = Path('/home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data/embedding_0_7_beta.parquet')
    query = pl.scan_parquet(gen_path).filter(pl.col('datetime') == date_int)
    if sample_n:
        query = query.head(sample_n)
    df = query.collect()
    # Order the embedding_* columns numerically (embedding_2 before embedding_10).
    emb_cols = sorted(
        (c for c in df.columns if c.startswith('embedding_')),
        key=lambda name: int(name.split('_')[1]),
    )
    # One Python list per row, in column order, for the 'values' list column.
    row_vectors = [list(row) for row in df.select(emb_cols).rows()]
    return df.with_columns([
        pl.Series('values', row_vectors),
        pl.col('datetime').cast(pl.UInt32).alias('datetime_uint32'),
        pl.col('instrument').alias('instrument_orig'),
        pl.col('instrument').cast(pl.String).alias('instrument_str'),
        pl.col('instrument').map_elements(instrument_int_to_code, return_dtype=pl.String).alias('instrument_code'),
    ])
def load_database_embedding(date_str: str):
    """Load the database (0_7 version) embedding for one date.

    Returns None when the partition file for the date does not exist.
    """
    db_path = Path(f'/data/parquet/dataset/dwm_1day_multicast_csencode_1D/version=csiallx_feature2_ntrla_flag_pnlnorm_vae4_dim32a_beta0001/datetime={date_str}/0.parquet')
    if not db_path.exists():
        return None
    # Keep an Int64 copy of datetime for joins against integer-dated frames.
    return pl.read_parquet(db_path).with_columns(
        pl.col('datetime').cast(pl.Int64).alias('datetime_int')
    )
def analyze_instrument_mapping(date_int: int):
    """Analyze the instrument mapping between generated and database embeddings.

    Prints row counts and samples for both sources, reports the instrument-set
    overlap after code conversion, and — when common instruments exist — joins
    the two frames and prints per-instrument embedding difference statistics.

    Args:
        date_int: Trading date as an integer, e.g. 20190102.
    """
    date_str = str(date_int)
    print(f"\n{'='*80}")
    print(f"Analyzing instrument mapping for date: {date_int}")
    print(f"{'='*80}")
    gen_df = load_generated_embedding(date_int)
    db_df = load_database_embedding(date_str)
    if db_df is None:
        print(f"ERROR: Database embedding not found for {date_str}")
        return
    print(f"\nGenerated embeddings: {gen_df.shape[0]} rows")
    print(f"Database embeddings: {db_df.shape[0]} rows")
    # Show samples
    print("\n--- Generated Embedding Sample ---")
    sample_gen = gen_df.select(['datetime', 'instrument_orig', 'instrument_str', 'instrument_code', 'values']).head(10)
    print(sample_gen)
    print("\n--- Database Embedding Sample ---")
    print(db_df.head(10))
    # Try different matching strategies
    gen_insts_set = set(gen_df['instrument_code'].to_list())
    db_insts_set = set(db_df['instrument'].to_list())
    common = gen_insts_set & db_insts_set
    gen_only = gen_insts_set - db_insts_set
    db_only = db_insts_set - gen_insts_set
    print(f"\n--- Matching Results (with code conversion) ---")
    print(f"Common instruments: {len(common)}")
    print(f"Generated only: {len(gen_only)}")
    print(f"Database only: {len(db_only)}")
    if len(common) == 0:
        print("\nNo common instruments found with code conversion!")
        print("\nTrying to find mapping patterns...")
        # Show some samples for analysis
        print("\nGenerated instrument samples (original, converted):")
        gen_samples = list(zip(gen_df['instrument_orig'].head(20).to_list(),
                               gen_df['instrument_code'].head(20).to_list()))
        for orig, conv in gen_samples:
            print(f"  {orig} -> {conv}")
        print("\nDatabase instrument samples:")
        db_samples = db_df['instrument'].head(20).to_list()
        for inst in db_samples:
            print(f"  {inst}")
        # Check if there's a position-based alignment possible:
        # sort both and compare by position.
        gen_sorted = sorted(gen_df['instrument_orig'].to_list())
        db_sorted = sorted([int(inst[2:]) for inst in db_df['instrument'].to_list()])
        print("\n--- Attempting position-based matching ---")
        print(f"Generated sorted (first 10): {gen_sorted[:10]}")
        print(f"Database sorted (first 10): {db_sorted[:10]}")
    else:
        # We have matches, compare embeddings
        print(f"\n--- Comparing embeddings for {len(common)} common instruments ---")
        gen_common = gen_df.filter(pl.col('instrument_code').is_in(list(common)))
        db_common = db_df.filter(pl.col('instrument').is_in(list(common)))
        # Join and compare; the right-hand 'values' column becomes 'values_db'.
        comparison = gen_common.join(
            db_common,
            left_on='instrument_code',
            right_on='instrument',
            how='inner',
            suffix='_db'
        )
        # Per-instrument difference metrics. named=True gives dict rows,
        # avoiding the per-row columns.index(...) lookups of the old loop.
        diffs = []
        for row in comparison.iter_rows(named=True):
            gen_emb = np.array(row['values'])
            db_emb = np.array(row['values_db'])
            diff = gen_emb - db_emb
            diff_norm = np.linalg.norm(diff)
            # Small epsilon guards against division by a zero-norm embedding.
            rel_diff = diff_norm / (np.linalg.norm(db_emb) + 1e-10)
            diffs.append({
                'instrument': row['instrument_code'],
                'l2_norm_diff': diff_norm,
                'relative_diff': rel_diff,
                'max_abs_diff': np.max(np.abs(diff)),
                'gen_emb_norm': np.linalg.norm(gen_emb),
                'db_emb_norm': np.linalg.norm(db_emb)
            })
        if diffs:
            diff_df = pl.DataFrame(diffs)
            print("\nDifference statistics:")
            print(diff_df.select(['l2_norm_diff', 'relative_diff', 'max_abs_diff']).describe())
            max_rel_diff = diff_df['relative_diff'].max()
            print(f"\nMax relative difference: {max_rel_diff:.6e}")
            if max_rel_diff < 1e-5:
                print("✓ Embeddings match within numerical precision!")
            elif max_rel_diff < 0.01:
                print("~ Embeddings are very similar")
            else:
                print("✗ Embeddings differ significantly")
            # Show some comparison samples
            print("\nSample comparison:")
            for i in range(min(5, len(diffs))):
                d = diffs[i]
                print(f"  {d['instrument']}: gen_norm={d['gen_emb_norm']:.4f}, "
                      f"db_norm={d['db_emb_norm']:.4f}, rel_diff={d['relative_diff']:.6e}")
def calculate_correlation(date_int: int):
    """Calculate correlation between generated and database embeddings.

    Computes per-dimension Pearson correlations across the common instruments
    plus an overall flattened correlation, then prints an interpretation.

    Args:
        date_int: Trading date as an integer, e.g. 20190102.
    """
    # NOTE: numpy is already imported at module level; the old function-local
    # `import numpy as np` shadowed it redundantly and has been removed.
    date_str = str(date_int)
    print(f"\n{'='*80}")
    print(f"Correlation Analysis for date: {date_int}")
    print(f"{'='*80}")
    gen_df = load_generated_embedding(date_int)
    db_df = load_database_embedding(date_str)
    if db_df is None:
        print(f"ERROR: Database embedding not found for {date_str}")
        return
    # Find common instruments
    gen_insts = set(gen_df['instrument_code'].to_list())
    db_insts = set(db_df['instrument'].to_list())
    common = list(gen_insts & db_insts)
    print(f"\nCommon instruments: {len(common)}")
    if len(common) == 0:
        print("No common instruments found!")
        return
    # Filter to common instruments and sort so rows align positionally.
    gen_common = gen_df.filter(pl.col('instrument_code').is_in(common)).sort('instrument_code')
    db_common = db_df.filter(pl.col('instrument').is_in(common)).sort('instrument')
    # Extract embedding matrices
    gen_embs = np.array(gen_common['values'].to_list())
    db_embs = np.array(db_common['values'].to_list())
    print(f"Generated embeddings shape: {gen_embs.shape}")
    print(f"Database embeddings shape: {db_embs.shape}")
    if gen_embs.shape[0] != db_embs.shape[0]:
        # Duplicated instruments on one side would break row alignment;
        # truncate so corrcoef still runs, and flag it loudly.
        print("WARNING: row count mismatch after filtering; truncating to the shorter side")
        n_rows = min(gen_embs.shape[0], db_embs.shape[0])
        gen_embs = gen_embs[:n_rows]
        db_embs = db_embs[:n_rows]
    # Derive the dimension count from the data instead of hard-coding 32.
    n_dims = min(gen_embs.shape[1], db_embs.shape[1])
    correlations = []
    for i in range(n_dims):
        corr = np.corrcoef(gen_embs[:, i], db_embs[:, i])[0, 1]
        correlations.append(corr)
    print(f"\nCorrelation statistics across {n_dims} dimensions:")
    print(f"  Mean: {np.mean(correlations):.4f}")
    print(f"  Median: {np.median(correlations):.4f}")
    print(f"  Min: {np.min(correlations):.4f}")
    print(f"  Max: {np.max(correlations):.4f}")
    # Overall correlation
    overall_corr = np.corrcoef(gen_embs.flatten(), db_embs.flatten())[0, 1]
    print(f"\nOverall correlation (all dims flattened): {overall_corr:.4f}")
    # Interpretation
    mean_corr = np.mean(correlations)
    if abs(mean_corr) < 0.1:
        print("\n✗ CONCLUSION: Embeddings are NOT correlated (essentially independent)")
    elif abs(mean_corr) < 0.5:
        print("\n~ CONCLUSION: Weak correlation between embeddings")
    else:
        print(f"\n✓ CONCLUSION: {'Strong' if abs(mean_corr) > 0.8 else 'Moderate'} correlation")
if __name__ == '__main__':
    # Run both analyses over a handful of sample dates; a failure on one
    # date is reported and does not stop the remaining dates.
    for sample_date in (20190102, 20200102, 20240102):
        try:
            analyze_instrument_mapping(sample_date)
            calculate_correlation(sample_date)
        except Exception as exc:
            print(f"\nError analyzing date {sample_date}: {exc}")
            import traceback
            traceback.print_exc()

File diff suppressed because one or more lines are too long

@ -0,0 +1,50 @@
# CTA 1-Day Return Prediction - Experiment Configuration
# Data Configuration
data:
dt_range: ['2020-01-01', '2023-12-31']
feature_sets:
- alpha158
- hffactor
normalization: dual
blend_weights: equal # Options: equal, zscore_heavy, rolling_heavy, cs_heavy, short_term, long_term
# Data Segments (train/valid/test split)
segments:
train: ['2020-01-01', '2022-06-30']
valid: ['2022-07-01', '2022-12-31']
test: ['2023-01-01', '2023-12-31']
# Model Configuration
model:
type: xgb
params:
objective: reg:squarederror
eval_metric: rmse
eta: 0.05
max_depth: 6
subsample: 0.8
colsample_bytree: 0.8
seed: 42
num_boost_round: 500
early_stopping_rounds: 50
# Training Configuration
training:
return_type: o2c_twap1min
weight_factors:
positive: 1.0
negative: 2.0
# Backtest Configuration
backtest:
num_trades: 4
signal_dist: normal
pos_weight: true
# Output Configuration
output:
base_dir: results/cta_1d
save_model: true
save_predictions: true
save_importance: true

@ -0,0 +1,34 @@
# Stock 15-Minute Return Prediction - Experiment Configuration
# Data Configuration
data:
dt_range: ['2020-01-01', '2023-12-31']
feature_path: /data/parquet/stock_1min_alpha158
kline_path: /data/parquet/stock_1min_kline
industry_path: /data/parquet/stock_industry # Optional
normalization_mode: dual # Options: industry, cs_zscore, dual
# Model Configuration
model:
type: xgb
params:
objective: reg:squarederror
eval_metric: rmse
eta: 0.05
max_depth: 6
subsample: 0.8
colsample_bytree: 0.8
seed: 42
num_boost_round: 500
early_stopping_rounds: 50
# Training Configuration
training:
positive_factor: 1.0 # Weight multiplier for positive returns
negative_factor: 2.0 # Weight multiplier for negative returns
# Output Configuration
output:
base_dir: results/stock_15m
save_model: true
save_predictions: true

@ -0,0 +1,123 @@
# Data Pipeline Bug Analysis
## Summary
The generated embeddings do not match the database 0_7 embeddings due to multiple bugs in the data pipeline migration from qlib to standalone Polars implementation.
---
## Bugs Fixed
### 1. Market Classification (`FlagMarketInjector`) ✓ FIXED
**Original (incorrect):**
```python
market_0 = (instrument >= 600000) # SH
market_1 = (instrument < 600000) # SZ
```
**Fixed:**
```python
inst_str = str(instrument).zfill(6)
market_0 = inst_str.startswith('6') # SH: 6xxxxx
market_1 = inst_str.startswith('0') | inst_str.startswith('3') # SZ: 0xxx, 3xxx
market_2 = inst_str.startswith('4') | inst_str.startswith('8') # NE: 4xxx, 8xxx
```
**Impact:** 167 instruments (4xxxxx, 8xxxxx — 新三板, i.e. the NEEQ "New Third Board") were misclassified.
---
### 2. ColumnRemover Missing `IsN` ✓ FIXED
**Original (incorrect):**
```python
columns_to_remove = ['TotalValue_diff', 'IsZt', 'IsDt']
```
**Fixed:**
```python
columns_to_remove = ['TotalValue_diff', 'IsN', 'IsZt', 'IsDt']
```
**Impact:** Extra column caused feature dimension mismatch.
---
### 3. RobustZScoreNorm Applied to Wrong Columns ✓ FIXED
**Original (incorrect):**
Applied normalization to ALL 341 features including market flags and indus_idx.
**Fixed:**
Only normalize `alpha158 + alpha158_ntrl + market_ext + market_ext_ntrl` (330 features), excluding:
- Market flags (Limit, Stopping, IsTp, IsXD, IsXR, IsDR, market_0, market_1, market_2, IsST)
- indus_idx
---
## Critical Remaining Issue: Data Schema Mismatch
### `Limit` and `Stopping` Column Types Changed
**Original qlib pipeline expected:**
- `Limit`: **Boolean** flag (True = limit up)
- `Stopping`: **Boolean** flag (True = suspended trading)
**Current Parquet data has:**
- `Limit`: **Float64** price change percentage (0.0 to 1301.3)
- `Stopping`: **Float64** price change percentage
**Evidence:**
```
Limit values sample: [8.86, 9.36, 31.0, 7.32, 2.28, 6.39, 5.38, 4.03, 3.86, 9.89]
Limit == 0: only 2 rows
Limit > 0: 3738 rows
```
This is a **fundamental data schema change**. The current Parquet files contain different data than what the original VAE model was trained on.
**Possible fixes:**
1. Convert `Limit` and `Stopping` to boolean flags using a threshold
2. Find the original data source that had boolean flags
3. Re-train the VAE model with the new data schema
---
## Correlation Results
After fixing bugs 1-3, the embedding correlation with database 0_7:
| Metric | Value |
|--------|-------|
| Mean correlation (32 dims) | 0.0068 |
| Median correlation | 0.0094 |
| Overall correlation | 0.2330 |
**Conclusion:** Embeddings remain essentially uncorrelated (≈0).
---
## Root Cause
The **Limit/Stopping data schema change** is the most likely root cause. The VAE model learned to encode features that included binary limit/stopping flags, but the standalone pipeline feeds it continuous price change percentages instead.
---
## Next Steps
1. **Verify original data schema:**
- Check if the original DolphinDB table had boolean `Limit` and `Stopping` columns
- Compare with the current Parquet schema
2. **Fix the data loading:**
- Either convert continuous values to binary flags
- Or use the correct boolean columns (`IsZt`, `IsDt`) for limit flags
3. **Verify feature order:**
- Ensure the qlib RobustZScoreNorm parameters are applied in the correct order
- Check that `[alpha158, alpha158_ntrl, market_ext, market_ext_ntrl]` matches the 330-parameter shape
4. **Re-run comparison:**
- Generate new embeddings with the corrected pipeline
- Compare correlation with database

@ -0,0 +1,85 @@
# Data Pipeline Bug Analysis - Final Status
## Summary
After fixing all identified bugs, the feature count now matches (341), but the embeddings remain uncorrelated with the database 0_7 version.
**Latest Version**: v5
- Feature count: 341 ✓ (matches VAE input dim)
- Mean correlation with DB: 0.0050 (essentially zero)
- Status: All identified bugs fixed, but embeddings still differ
---
## Bugs Fixed
### 1. Market Classification (`FlagMarketInjector`) ✓ FIXED
- **Bug**: Used `instrument >= 600000` which misclassified 新三板 (NEEQ "New Third Board") instruments
- **Fix**: Use string prefix matching with vocab_size=2 (not 3)
- **Impact**: 167 instruments corrected
### 2. ColumnRemover Missing `IsN` ✓ FIXED
- **Bug**: Only removed `IsZt, IsDt` but not `IsN`
- **Fix**: Added `IsN` to removal list
- **Impact**: Feature count alignment
### 3. RobustZScoreNorm Scope ✓ FIXED
- **Bug**: Applied normalization to all 341 features
- **Fix**: Only normalize 330 features (alpha158 + market_ext, both original + neutralized)
- **Impact**: Correct normalization scope
### 4. Wrong Data Sources for Market Flags ✓ FIXED
- **Bug**: Used `Limit, Stopping` (Float64) from kline_adjusted
- **Fix**: Load from correct sources:
- kline_adjusted: `IsZt, IsDt, IsN, IsXD, IsXR, IsDR` (Boolean)
- market_flag: `open_limit, close_limit, low_limit, high_stop` (Boolean, 4 cols)
- **Impact**: Correct boolean flag data
### 5. Feature Count Mismatch ✓ FIXED
- **Bug**: 344 features (3 extra)
- **Fix**: vocab_size=2 + 4 market_flag cols = 341 features
- **Impact**: VAE input dimension matches
---
## Correlation Results (v5)
| Metric | Value |
|--------|-------|
| Mean correlation (32 dims) | 0.0050 |
| Median correlation | 0.0079 |
| Min | -0.0420 |
| Max | 0.0372 |
| Overall (flattened) | 0.2225 |
**Conclusion**: Embeddings remain essentially uncorrelated with database.
---
## Possible Remaining Issues
1. **Different input data values**: The alpha158_0_7_beta Parquet files may contain different values than the original DolphinDB data used to train the VAE.
2. **Feature ordering mismatch**: The 330 RobustZScoreNorm parameters must be applied in the exact order:
- [0:158] = alpha158 original
- [158:316] = alpha158_ntrl
- [316:323] = market_ext original (7 cols)
- [323:330] = market_ext_ntrl (7 cols)
3. **Industry neutralization differences**: Our `IndusNtrlInjector` implementation may differ from qlib's.
4. **Missing transformations**: There may be additional preprocessing steps not captured in handler.yaml.
5. **VAE model mismatch**: The VAE model may have been trained with different data than what handler.yaml specifies.
---
## Recommended Next Steps
1. **Compare intermediate features**: Run both the qlib pipeline and our pipeline on the same input data and compare outputs at each step.
2. **Verify RobustZScoreNorm parameter order**: Check if our feature ordering matches the order used during VAE training.
3. **Compare predictions, not embeddings**: Instead of comparing VAE embeddings, compare the final d033 model predictions with the original 0_7 predictions.
4. **Check alpha158 data source**: Verify that `stg_1day_wind_alpha158_0_7_beta_1D` contains the same data as the original DolphinDB `stg_1day_wind_alpha158_0_7_beta` table.

@ -0,0 +1,146 @@
# First, let me create a script to train a VAE model on the 0_7_beta data
# This would need to be done separately as it's a prerequisite for the prediction script above
"""
Workflow configuration to train a VAE model on alpha158 0_7_beta data.
This creates a VAE-encoded version of the 0_7_beta factors that can be used
for prediction comparison with the original 0_7 model.
"""
experiment_name: vae_alpha158_0_7_beta
qlib_init:
provider_uri: "~/.qlib/data_ops/target"
region: cn
load_start: &load_start 2013-01-01
load_end: &load_end 2023-09-30
train_start: &train_start 2013-01-01
train_end: &train_end 2018-12-31
benchmark_name: &benchmark_name SH000985
market: &market csiallx
dataset_cache_path: &dataset_cache_path tasks/artifacts/csiallx_dataset_alpha158_0_7_beta_vae.pkl
# DolphinDB configuration
ddb_config: &ddb_config
host: 192.168.1.146
port: 8848
username: "admin"
password: "123456"
data_handler_config: &data_handler_config
start_time: *load_start
end_time: *load_end
fit_start_time: *train_start
fit_end_time: *train_end
instruments: *market
ddb_config: *ddb_config
handler_list:
# Alpha158 0_7_beta features
- class: DDBZWindDataHandler
module_path: qlib.contrib.data.ddb_handlers.ddb_wind_handler
kwargs:
col_set: "feature"
query_config:
- db_path: "dfs://daily_stock_run"
dtype: "float32"
field_list: "alpha158" # All alpha158 factors
table_name: "stg_1day_wind_alpha158_0_7_beta" # Use the beta version
# Additional handlers as needed
- class: DDBZWindDataHandler
module_path: qlib.contrib.data.ddb_handlers.ddb_wind_handler
kwargs:
col_set: "risk_factor"
query_config:
- db_path: "dfs://daily_stock_run"
dtype: "float32"
field_list: ["MarketValue as total_size"]
table_name: "stg_1day_wind_kline_adjusted"
- class: DDBZWindDataHandler
module_path: qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler
kwargs:
col_set: "indus_flag"
query_config:
- db_path: "dfs://daily_stock_run"
dtype: "bool"
field_list: "industry_code_cc.csv"
table_name: "stg_1day_gds_indus_flag_cc1"
- class: DDBZWindDataHandler
module_path: qlib.contrib.data.ddb_handlers.ddb_st_flag_handler
kwargs:
col_set: "st_flag"
query_config:
- db_path: "dfs://daily_stock_run"
dtype: "bool"
field_list: ["ST_Y", "ST_S", "ST_T", "ST_L", "ST_Z", "ST_X"]
table_name: "stg_1day_wind_st_flag"
infer_processors:
- class: FlagToOnehot
module_path: qlib.contrib.data.processor_flag
kwargs:
fields_group: indus_flag
onehot_group: indus_idx
- class: FactorNtrlInjector
module_path: qlib.contrib.data.processor_ntrl
kwargs:
fields_group: "feature"
factor_col: "risk_factor"
dummy_col: "indus_idx"
ntrl_type: "size_indus"
- class: RobustZScoreNorm
kwargs:
fields_group: ["feature"]
clip_outlier: true
- class: Fillna
kwargs:
fields_group: ["feature"]
task:
model:
class: VAEModel
module_path: qlib.contrib.model.task.task_vae_flat
kwargs:
model_config:
hidden_size: 32 # Same as the original model for consistency
nn_module:
class: VAE
module_path: qlib.contrib.model.module.module_vae
kwargs:
variational: true
optim_config:
seed: 1234567
bootstrap_config: 1.2
distort_config: 1e-3
beta: 1e-3 # KL divergence weight
n_epochs: 300
early_stop: 10
lr: 1e-3
optimizer: adamw
batch_size: 10000
n_jobs: 4
checkpoint:
save_path: tasks/artifacts/checkpoints/csiallx_alpha158_0_7_beta_vae32
dataset:
class: DatasetH
module_path: qlib.data.dataset
kwargs:
config_module: qlib.contrib.data.config
from_cache: *dataset_cache_path
require_setup: true
handler:
class: AggHandler
module_path: qlib.contrib.data.agg_handler
kwargs: *data_handler_config
segments:
train: [*train_start, *train_end]
test: [*load_start, *load_end]
record:
- class: SignalRecord
module_path: qlib.contrib.workflow.record_temp
kwargs:
model: <MODEL>
dataset: <DATASET>
col_set: "feature"

@ -0,0 +1,58 @@
# Analysis Report: Enhanced Prediction Comparison Visualization
## Issue Identified
The original `prediction_comparison.png` visualization lacked meaningful evaluation metrics such as:
- IC (Information Coefficient) time series
- RankIC (Rank Information Coefficient) time series
- Top-tier return cumulative difference
- Other requested financial metrics
Instead, it only showed basic scatter plots and prediction distributions.
## Solution Implemented
Updated the `compare_predictions.py` script with enhanced visualization functionality that includes:
### 1. IC Time Series Comparison
- Calculates daily IC for both 0_7 and 0_7_beta prediction sets
- Plots both series on the same chart for easy comparison
- Shows temporal trends in predictive power
### 2. RankIC Time Series Comparison
- Calculates daily RankIC (Spearman correlation) for both versions
- Displays time series comparison to show rank correlation trends
- Helps evaluate monotonic relationships over time
### 3. Cumulative Top-Tier Returns
- Identifies top 10% of stocks based on predictions each day
- Calculates cumulative returns for both prediction sets
- Shows performance divergence over time
### 4. Difference in Cumulative Returns
- Visualizes the spread between 0_7 and 0_7_beta cumulative returns
- Helps quantify the performance gap between the two approaches
- Provides insight into which version performs better over time
### 5. Additional Improvements
- Fixed date type mismatch issues that prevented proper joins
- Added graceful fallback to basic visualization when actual returns unavailable
- Maintained all original basic comparison plots for comprehensive analysis
## Files Updated
- `compare_predictions.py` - Enhanced visualization functionality
- `generate_mock_returns.py` - Script to create test returns data
- `test_enhanced_visualization.py` - Verification script
## Results
The enhanced visualization now provides:
- Meaningful financial metrics that directly address the comparison requirements
- Time series analysis of IC and RankIC metrics
- Cumulative performance comparison of top-tier selections
- Proper error handling for different data formats
- Comprehensive side-by-side comparison of both alpha versions
## Verification
Successfully tested the enhanced functionality with mock data, confirming that:
- All requested metrics are now visualized
- The plot contains 6 meaningful panels with financial insights
- The output file `prediction_comparison.png` includes all requested metrics
- Basic comparison functionality remains intact

@ -0,0 +1,345 @@
#!/usr/bin/env python
"""
Main pipeline orchestration script for Alpha158 0_7 vs 0_7_beta comparison.
This script orchestrates the full workflow:
1. Generate beta embeddings from alpha158_0_7_beta factors
2. Fetch original 0_7 predictions from DolphinDB
3. Generate predictions using beta embeddings
4. Generate actual returns from kline data
5. Compare predictions (IC, RankIC, correlation, etc.)
Usage:
python pipeline.py --start-date 2019-01-01 --end-date 2020-11-30 --skip-embeddings --skip-fetch
Arguments:
--start-date: Start date for data loading (default: 2019-01-01)
--end-date: End date for data loading (default: 2020-11-30)
--skip-embeddings: Skip embeddings generation (use existing)
--skip-fetch: Skip fetching original predictions (use existing)
--skip-returns: Skip returns generation (use existing)
--skip-comparison: Skip final comparison
"""
import os
import sys
import argparse
from datetime import datetime
from pathlib import Path
# Add scripts directory to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'scripts'))
def step_generate_embeddings(start_date: str, end_date: str, data_dir: str) -> bool:
    """Step 1: Generate beta embeddings.

    Prompts before overwriting an existing embeddings file; returns True on
    success or skip, False when generation raised.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("STEP 1: Generate Beta Embeddings")
    print(banner)
    embedding_file = os.path.join(data_dir, "embedding_0_7_beta.parquet")
    if os.path.exists(embedding_file):
        print(f"Embeddings file already exists: {embedding_file}")
        answer = input("Regenerate? (y/N): ").strip().lower()
        if answer != 'y':
            print("Skipping embeddings generation.")
            return True
    try:
        from generate_beta_embedding import generate_embeddings
        result = generate_embeddings(
            start_date=start_date,
            end_date=end_date,
            output_file=embedding_file,
            use_vae=True,
        )
        print(f"\nGenerated {len(result)} embeddings")
    except Exception as exc:
        print(f"Error generating embeddings: {exc}")
        import traceback
        traceback.print_exc()
        return False
    return True
def step_fetch_predictions(start_date: str, end_date: str, data_dir: str) -> bool:
    """Step 2: Fetch original predictions from DolphinDB.

    Prompts before refetching an existing file; returns True on success or
    skip, False when the fetch raised.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("STEP 2: Fetch Original Predictions from DolphinDB")
    print(banner)
    predictions_file = os.path.join(data_dir, "original_predictions_0_7.parquet")
    if os.path.exists(predictions_file):
        print(f"Predictions file already exists: {predictions_file}")
        answer = input("Refetch? (y/N): ").strip().lower()
        if answer != 'y':
            print("Skipping fetch.")
            return True
    try:
        from fetch_predictions import fetch_original_predictions
        result = fetch_original_predictions(
            start_date=start_date,
            end_date=end_date,
            output_file=predictions_file,
        )
        print(f"\nFetched {len(result)} predictions")
    except Exception as exc:
        print(f"Error fetching predictions: {exc}")
        import traceback
        traceback.print_exc()
        return False
    return True
def step_generate_beta_predictions(data_dir: str) -> bool:
    """Step 3: Generate predictions using beta embeddings.

    Requires the embeddings file from step 1; prompts before regenerating an
    existing predictions file. Returns True on success or skip, False on
    missing input or error.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("STEP 3: Generate Predictions with Beta Embeddings")
    print(banner)
    embedding_file = os.path.join(data_dir, "embedding_0_7_beta.parquet")
    predictions_file = os.path.join(data_dir, "predictions_beta_embedding.parquet")
    if not os.path.exists(embedding_file):
        print(f"Embeddings file not found: {embedding_file}")
        print("Run step 1 first.")
        return False
    if os.path.exists(predictions_file):
        print(f"Beta predictions file already exists: {predictions_file}")
        answer = input("Regenerate? (y/N): ").strip().lower()
        if answer != 'y':
            print("Skipping prediction generation.")
            return True
    try:
        from predict_with_embedding import generate_predictions
        result = generate_predictions(
            embedding_file=embedding_file,
            output_file=predictions_file,
            seq_len=40,
            batch_size=1000,
        )
        print(f"\nGenerated {len(result)} predictions")
    except Exception as exc:
        print(f"Error generating predictions: {exc}")
        import traceback
        traceback.print_exc()
        return False
    return True
def step_generate_returns(data_dir: str) -> bool:
    """Step 4: Generate actual returns from kline data.

    Uses the original-predictions file (when present) to bound the date
    range. Prompts before regenerating an existing returns file. Returns
    True on success or skip, False on failure.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("STEP 4: Generate Actual Returns")
    print(banner)
    predictions_file = os.path.join(data_dir, "original_predictions_0_7.parquet")
    returns_file = os.path.join(data_dir, "actual_returns.parquet")
    if os.path.exists(returns_file):
        print(f"Returns file already exists: {returns_file}")
        answer = input("Regenerate? (y/N): ").strip().lower()
        if answer != 'y':
            print("Skipping returns generation.")
            return True
    try:
        from generate_returns import generate_real_returns_from_kline
        # Only pass the prediction file when it actually exists.
        prediction_file = predictions_file if os.path.exists(predictions_file) else None
        result = generate_real_returns_from_kline(
            input_kline_path="/data/parquet/dataset/stg_1day_wind_kline_adjusted_1D/",
            prediction_file=prediction_file,
            output_file=returns_file,
            return_days=5,
        )
        if result is None:
            print("\nFailed to generate returns")
            return False
        print(f"\nGenerated {len(result)} returns")
        return True
    except Exception as exc:
        print(f"Error generating returns: {exc}")
        import traceback
        traceback.print_exc()
        return False
def step_compare_predictions(data_dir: str) -> bool:
    """Step 5: Compare 0_7 vs 0_7_beta predictions.

    Verifies both prediction files exist, then delegates to the comparison
    script's main(). Returns True on success, False on missing input or error.
    """
    banner = "=" * 70
    print("\n" + banner)
    print("STEP 5: Compare Predictions")
    print(banner)
    required_files = (
        os.path.join(data_dir, "original_predictions_0_7.parquet"),
        os.path.join(data_dir, "predictions_beta_embedding.parquet"),
    )
    missing = [f for f in required_files if not os.path.exists(f)]
    if missing:
        print(f"Required file not found: {missing[0]}")
        return False
    try:
        # Import and run comparison
        from compare_predictions import main as compare_main
        compare_main()
        return True
    except Exception as exc:
        print(f"Error comparing predictions: {exc}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Main pipeline orchestration.

    Runs the five pipeline stages in order (embeddings, fetch, beta
    predictions, returns, comparison), honouring the ``--skip-*`` flags,
    then prints a pass/fail summary. Exits with status 1 if any stage
    failed.
    """
    parser = argparse.ArgumentParser(
        description="Alpha158 0_7 vs 0_7_beta Comparison Pipeline"
    )
    parser.add_argument("--start-date", type=str, default="2019-01-01",
                        help="Start date (YYYY-MM-DD)")
    parser.add_argument("--end-date", type=str, default="2020-11-30",
                        help="End date (YYYY-MM-DD)")
    parser.add_argument("--skip-embeddings", action="store_true",
                        help="Skip embeddings generation")
    parser.add_argument("--skip-fetch", action="store_true",
                        help="Skip fetching original predictions")
    parser.add_argument("--skip-returns", action="store_true",
                        help="Skip returns generation")
    parser.add_argument("--skip-comparison", action="store_true",
                        help="Skip final comparison")
    parser.add_argument("--data-dir", type=str, default=None,
                        help="Data directory (default: ./data)")
    args = parser.parse_args()

    # Resolve the working directory next to this script unless overridden.
    here = os.path.dirname(os.path.abspath(__file__))
    data_dir = args.data_dir or os.path.join(here, "data")

    print("=" * 70)
    print("Alpha158 0_7 vs 0_7_beta Comparison Pipeline")
    print("=" * 70)
    print(f"Date range: {args.start_date} to {args.end_date}")
    print(f"Data directory: {data_dir}")
    os.makedirs(data_dir, exist_ok=True)

    # One boolean per stage, in execution order.
    status = {}

    # Step 1: embeddings (optional).
    if args.skip_embeddings:
        print("\nSkipping embeddings generation (as requested)")
        status['embeddings'] = True
    else:
        status['embeddings'] = step_generate_embeddings(
            args.start_date, args.end_date, data_dir
        )

    # Step 2: original predictions (optional).
    if args.skip_fetch:
        print("\nSkipping fetch (as requested)")
        status['fetch'] = True
    else:
        status['fetch'] = step_fetch_predictions(
            args.start_date, args.end_date, data_dir
        )

    # Step 3: beta predictions depend on the embeddings step.
    if status.get('embeddings', True):
        status['beta_predictions'] = step_generate_beta_predictions(data_dir)
    else:
        print("\nSkipping beta predictions (embeddings generation failed)")
        status['beta_predictions'] = False

    # Step 4: realized returns (optional).
    if args.skip_returns:
        print("\nSkipping returns generation (as requested)")
        status['returns'] = True
    else:
        status['returns'] = step_generate_returns(data_dir)

    # Step 5: final comparison, gated on steps 2 and 3 having passed.
    if args.skip_comparison:
        print("\nSkipping comparison (as requested)")
        status['comparison'] = True
    elif status.get('fetch', True) and status.get('beta_predictions', True):
        status['comparison'] = step_compare_predictions(data_dir)
    else:
        print("\nSkipping comparison (previous steps failed)")
        status['comparison'] = False

    # Summary table.
    print("\n" + "=" * 70)
    print("PIPELINE SUMMARY")
    print("=" * 70)
    for step, success in status.items():
        print(f" {step:20s}: {'✓ PASSED' if success else '✗ FAILED'}")
    print("=" * 70)
    if all(status.values()):
        print("Pipeline completed successfully!")
    else:
        print("Pipeline completed with errors.")
        sys.exit(1)


if __name__ == "__main__":
    main()

@ -0,0 +1,129 @@
#!/usr/bin/env python
"""
Compare generated embeddings with gold standard embeddings from DolphinDB.
"""
import polars as pl
import numpy as np
from pathlib import Path
DATA_DIR = Path(__file__).parent / "../data"
def compare_embeddings() -> None:
    """Compare generated embeddings against the gold-standard parquet.

    Loads both embedding files from ``DATA_DIR``, then, for every date in
    the gold standard, reports instrument overlap and per-row / per-dimension
    difference statistics over the 32 ``embedding_*`` columns, finishing
    with global summary statistics for both files. Output is printed only;
    nothing is returned or written.
    """
    # Load data
    gold_path = DATA_DIR / "embedding_0_7_beta_gold_standard.parquet"
    gen_path = DATA_DIR / "embedding_0_7_beta_sample.parquet"
    print("=" * 60)
    print("Loading embeddings")
    print("=" * 60)
    gold = pl.read_parquet(gold_path)
    gen = pl.read_parquet(gen_path)
    print(f"Gold standard: {gold.shape}")
    print(f"Generated: {gen.shape}")
    # Get embedding columns — the embedding dimensionality (32) is fixed here.
    emb_cols = [f"embedding_{i}" for i in range(32)]
    # Compare by date; only dates present in the GOLD file are visited.
    dates = sorted(gold["datetime"].unique().to_list())
    print("\n" + "=" * 60)
    print("Comparison by date")
    print("=" * 60)
    for dt in dates:
        gold_dt = gold.filter(pl.col("datetime") == dt)
        gen_dt = gen.filter(pl.col("datetime") == dt)
        print(f"\nDate: {dt}")
        print(f" Gold instruments: {gold_dt.height}, Generated instruments: {gen_dt.height}")
        print(f" Gold instrument sample: {gold_dt['instrument'].head(5).to_list()}")
        print(f" Gen instrument sample: {gen_dt['instrument'].head(5).to_list()}")
        # Check for common instruments
        gold_insts = set(gold_dt["instrument"].to_list())
        gen_insts = set(gen_dt["instrument"].to_list())
        common = gold_insts & gen_insts
        print(f" Common instruments: {len(common)}")
        if len(common) > 0:
            # Compare embeddings for common instruments.
            # Sorting both frames by instrument aligns them row-by-row.
            # NOTE(review): this assumes each instrument appears at most once
            # per date in each file — duplicates would break the alignment.
            gold_common = gold_dt.filter(pl.col("instrument").is_in(list(common))).sort("instrument")
            gen_common = gen_dt.filter(pl.col("instrument").is_in(list(common))).sort("instrument")
            # Calculate embedding differences (per aligned row)
            diffs = []
            for i in range(len(gold_common)):
                gold_emb = np.array([gold_common[col][i] for col in emb_cols])
                gen_emb = np.array([gen_common[col][i] for col in emb_cols])
                diff = gold_emb - gen_emb
                l2_norm = np.linalg.norm(diff)
                # 1e-8 guards against division by a zero-norm gold vector.
                rel_diff = l2_norm / (np.linalg.norm(gold_emb) + 1e-8)
                max_abs_diff = np.max(np.abs(diff))
                diffs.append({
                    "l2_norm": l2_norm,
                    "rel_diff": rel_diff,
                    "max_abs_diff": max_abs_diff,
                    "gold_norm": np.linalg.norm(gold_emb),
                    "gen_norm": np.linalg.norm(gen_emb)
                })
            diff_df = pl.DataFrame(diffs)
            print(f"\n Embedding comparison:")
            print(f" Mean L2 norm diff: {diff_df['l2_norm'].mean():.4f}")
            print(f" Mean rel diff: {diff_df['rel_diff'].mean():.4%}")
            print(f" Mean max abs diff: {diff_df['max_abs_diff'].mean():.4f}")
            print(f" Gold emb norm (mean): {diff_df['gold_norm'].mean():.4f}")
            print(f" Gen emb norm (mean): {diff_df['gen_norm'].mean():.4f}")
            # Correlation analysis: build (rows x 32) matrices for this date.
            gold_embs = np.array([[gold_common[col][i] for col in emb_cols] for i in range(len(gold_common))])
            gen_embs = np.array([[gen_common[col][i] for col in emb_cols] for i in range(len(gen_common))])
            # Pearson correlation of gold vs generated, one value per dimension.
            # NOTE(review): a constant dimension yields NaN from np.corrcoef,
            # which would propagate into the aggregate stats below.
            correlations = []
            for d in range(32):
                corr = np.corrcoef(gold_embs[:, d], gen_embs[:, d])[0, 1]
                correlations.append(corr)
            print(f"\n Correlation by dimension:")
            print(f" Mean: {np.mean(correlations):.4f}")
            print(f" Median: {np.median(correlations):.4f}")
            print(f" Min: {np.min(correlations):.4f}")
            print(f" Max: {np.max(correlations):.4f}")
            # Overall correlation treating all (row, dim) cells as one vector.
            overall_corr = np.corrcoef(gold_embs.flatten(), gen_embs.flatten())[0, 1]
            print(f" Overall (flattened): {overall_corr:.4f}")
    print("\n" + "=" * 60)
    print("Summary Statistics")
    print("=" * 60)
    # Gold standard stats over ALL dates (not just the common subset).
    gold_embs = gold.select(emb_cols).to_numpy()
    print("\nGold standard embeddings:")
    print(f" Mean: {np.mean(gold_embs):.6f}")
    print(f" Std: {np.std(gold_embs):.6f}")
    print(f" Min: {np.min(gold_embs):.6f}")
    print(f" Max: {np.max(gold_embs):.6f}")
    # Generated stats
    gen_embs = gen.select(emb_cols).to_numpy()
    print("\nGenerated embeddings:")
    print(f" Mean: {np.mean(gen_embs):.6f}")
    print(f" Std: {np.std(gen_embs):.6f}")
    print(f" Min: {np.min(gen_embs):.6f}")
    print(f" Max: {np.max(gen_embs):.6f}")


if __name__ == "__main__":
    compare_embeddings()

@ -0,0 +1,306 @@
#!/usr/bin/env python
"""
Compare 0_7 vs 0_7_beta predictions.
This script:
1. Loads original 0_7 predictions (from DDB)
2. Loads 0_7_beta predictions (from new embeddings)
3. Calculates correlation between predictions
4. Compares metrics (IC, RankIC, etc.) if actual returns available
"""
import os
import numpy as np
import polars as pl
import pandas as pd
from scipy.stats import spearmanr
from typing import Optional, Dict
# File paths
PRED_0_7_FILE = "../data/original_predictions_0_7.parquet"
PRED_0_7_BETA_FILE = "../data/predictions_beta_embedding.parquet"
ACTUAL_RETURNS_FILE = "../data/actual_returns.parquet"
def load_and_align_predictions():
    """Load both prediction files and align them by datetime and instrument.

    Reads the 0_7 and 0_7_beta prediction parquet files, normalises the
    key columns to Int64, renames the prediction columns apart, and inner
    joins on (datetime, instrument).

    Returns:
        pl.DataFrame with columns datetime, instrument, pred_0_7, pred_beta.
    """
    print("Loading predictions...")

    base = pl.read_parquet(PRED_0_7_FILE)
    print(f"0_7 predictions: {base.shape}")
    print(f" Date range: {base['datetime'].min()} to {base['datetime'].max()}")
    print(f" Unique instruments: {base['instrument'].n_unique()}")

    beta = pl.read_parquet(PRED_0_7_BETA_FILE)
    print(f"\n0_7_beta predictions: {beta.shape}")
    print(f" Date range: {beta['datetime'].min()} to {beta['datetime'].max()}")
    print(f" Unique instruments: {beta['instrument'].n_unique()}")

    # Cast join keys to a common integer type so the join matches exactly.
    key_casts = [
        pl.col('datetime').cast(pl.Int64),
        pl.col('instrument').cast(pl.Int64),
    ]
    base = base.with_columns(key_casts).rename({'prediction': 'pred_0_7'})
    beta = beta.with_columns(key_casts).rename({'prediction': 'pred_beta'})

    # Keep only rows present in both prediction sets.
    joined = base.join(beta, on=['datetime', 'instrument'], how='inner')
    print(f"\nJoined predictions: {joined.shape}")
    print(f" Overlapping dates: {joined['datetime'].n_unique()}")
    print(f" Overlapping instruments: {joined['instrument'].n_unique()}")
    return joined
def calculate_correlation(df: pl.DataFrame) -> Dict[str, float]:
    """Calculate correlation between 0_7 and 0_7_beta predictions.

    Computes pooled Pearson/Spearman correlations across all rows plus the
    mean and std of per-date Pearson correlations (dates with fewer than
    two rows are skipped).

    Args:
        df: Joined predictions with pred_0_7 / pred_beta / datetime columns.

    Returns:
        Dict with pearson_corr, spearman_corr, daily_corr_mean, daily_corr_std.
    """
    pdf = df.to_pandas()

    # Pooled correlations across every (date, instrument) row.
    pearson_corr = pdf['pred_0_7'].corr(pdf['pred_beta'])
    spearman_corr, _ = spearmanr(pdf['pred_0_7'], pdf['pred_beta'])

    # Cross-sectional correlation, one value per trading day.
    per_day = [
        grp['pred_0_7'].corr(grp['pred_beta'])
        for _, grp in pdf.groupby('datetime')
        if len(grp) >= 2
    ]
    return {
        'pearson_corr': pearson_corr,
        'spearman_corr': spearman_corr,
        'daily_corr_mean': np.mean(per_day),
        'daily_corr_std': np.std(per_day),
    }
def calculate_ic_metrics(df: pl.DataFrame, actual_returns: pl.DataFrame) -> Dict:
    """Calculate IC metrics for both prediction sets.

    Joins predictions with realized returns, then computes daily Pearson IC
    and Spearman RankIC for each model (days with fewer than 5 rows are
    skipped) and summarises them as mean/std/IR.

    Returns:
        {'0_7': {...}, '0_7_beta': {...}} or {} when no recognised return
        column is present in the joined data.
    """
    # Keep only rows that have a realized return.
    merged = df.join(actual_returns, on=['datetime', 'instrument'], how='inner')
    print(f"\nJoined with returns: {merged.shape}")
    pdf = merged.to_pandas()

    # Accept any of the known return-column spellings; first match wins.
    return_col = next(
        (c for c in ['v2v_5d', 'return', 'actual_return', 'ret'] if c in pdf.columns),
        None,
    )
    if return_col is None:
        print("No return column found!")
        return {}
    print(f"Using return column: {return_col}")

    rows_0_7, rows_beta = [], []
    for date, grp in pdf.groupby('datetime'):
        if len(grp) < 5:  # Need enough samples
            continue
        # IC (Pearson) and RankIC (Spearman) for each model on this day.
        rows_0_7.append({
            'date': date,
            'ic': grp['pred_0_7'].corr(grp[return_col]),
            'rankic': spearmanr(grp['pred_0_7'], grp[return_col])[0],
        })
        rows_beta.append({
            'date': date,
            'ic': grp['pred_beta'].corr(grp[return_col]),
            'rankic': spearmanr(grp['pred_beta'], grp[return_col])[0],
        })

    def summarize(daily: pd.DataFrame) -> Dict[str, float]:
        # Information ratio is mean/std; guard against zero dispersion.
        out = {}
        for kind in ('ic', 'rankic'):
            mean, std = daily[kind].mean(), daily[kind].std()
            out[f'{kind}_mean'] = mean
            out[f'{kind}_std'] = std
            out[f'{kind}_ir'] = mean / std if std > 0 else 0
        return out

    return {
        '0_7': summarize(pd.DataFrame(rows_0_7)),
        '0_7_beta': summarize(pd.DataFrame(rows_beta)),
    }
def calculate_top_tier_return(df: pl.DataFrame, actual_returns: pl.DataFrame, top_pct: float = 0.1) -> Dict:
    """Calculate top-tier returns for both predictions.

    For every date with at least 10 joined rows, averages the realized
    return of the top ``top_pct`` fraction of names ranked by each model's
    prediction, then reports the mean and std of those daily averages.

    Returns:
        Nested dict keyed by model, or {} when no recognised return column
        is present.
    """
    merged = df.join(actual_returns, on=['datetime', 'instrument'], how='inner')
    pdf = merged.to_pandas()

    # Accept any of the known return-column spellings; first match wins.
    return_col = next(
        (c for c in ['v2v_5d', 'return', 'actual_return', 'ret'] if c in pdf.columns),
        None,
    )
    if return_col is None:
        return {}

    daily_0_7, daily_beta = [], []
    for _, grp in pdf.groupby('datetime'):
        if len(grp) < 10:
            continue
        # At least one name is always selected, even for tiny cross-sections.
        top_n = max(1, int(len(grp) * top_pct))
        # Mean realized return of each model's highest-ranked names.
        daily_0_7.append(grp.nlargest(top_n, 'pred_0_7')[return_col].mean())
        daily_beta.append(grp.nlargest(top_n, 'pred_beta')[return_col].mean())

    return {
        '0_7': {
            'top_tier_return': np.mean(daily_0_7),
            'top_tier_std': np.std(daily_0_7),
        },
        '0_7_beta': {
            'top_tier_return': np.mean(daily_beta),
            'top_tier_std': np.std(daily_beta),
        },
    }
def main():
    """Main comparison function.

    Aligns the two prediction sets, reports their correlation and
    distribution statistics, and — when realized returns are available —
    IC/RankIC metrics and top-decile returns for each model.
    """
    print("=" * 70)
    print("COMPARISON: Alpha158 0_7 vs 0_7_beta Predictions")
    print("=" * 70)

    aligned = load_and_align_predictions()
    if len(aligned) == 0:
        print("\nERROR: No overlapping predictions found!")
        return

    print("\n" + "-" * 70)
    print("PREDICTION CORRELATION")
    print("-" * 70)
    corr = calculate_correlation(aligned)
    print(f"Overall Pearson correlation: {corr['pearson_corr']:.4f}")
    print(f"Overall Spearman correlation: {corr['spearman_corr']:.4f}")
    print(f"Daily correlation mean: {corr['daily_corr_mean']:.4f}")
    print(f"Daily correlation std: {corr['daily_corr_std']:.4f}")

    print("\n" + "-" * 70)
    print("PREDICTION STATISTICS")
    print("-" * 70)
    pdf = aligned.to_pandas()
    # Same four summary stats for each model, printed in the same layout.
    for header, col in (("0_7 predictions:", 'pred_0_7'),
                        ("\n0_7_beta predictions:", 'pred_beta')):
        print(header)
        print(f" Mean: {pdf[col].mean():.6f}")
        print(f" Std: {pdf[col].std():.6f}")
        print(f" Min: {pdf[col].min():.6f}")
        print(f" Max: {pdf[col].max():.6f}")

    if not os.path.exists(ACTUAL_RETURNS_FILE):
        print(f"\nActual returns file not found: {ACTUAL_RETURNS_FILE}")
        print("Skipping IC metrics calculation.")
    else:
        print("\n" + "-" * 70)
        print("IC METRICS (with actual returns)")
        print("-" * 70)
        actual_returns = pl.read_parquet(ACTUAL_RETURNS_FILE)
        print(f"Loaded actual returns: {actual_returns.shape}")
        ic_metrics = calculate_ic_metrics(aligned, actual_returns)
        if ic_metrics:
            print(f"\n{'Metric':<20} {'0_7':<12} {'0_7_beta':<12} {'Diff':<12}")
            print("-" * 56)
            for metric in ['ic_mean', 'ic_std', 'ic_ir', 'rankic_mean', 'rankic_std', 'rankic_ir']:
                v0 = ic_metrics['0_7'][metric]
                v1 = ic_metrics['0_7_beta'][metric]
                print(f"{metric:<20} {v0:>11.4f} {v1:>11.4f} {v1 - v0:>+11.4f}")

        print("\n" + "-" * 70)
        print("TOP-TIER RETURNS (top 10%)")
        print("-" * 70)
        top_tier = calculate_top_tier_return(aligned, actual_returns, top_pct=0.1)
        if top_tier:
            print(f"{'':<20} {'0_7':<12} {'0_7_beta':<12} {'Diff':<12}")
            print("-" * 56)
            t0 = top_tier['0_7']['top_tier_return']
            t1 = top_tier['0_7_beta']['top_tier_return']
            print(f"{'Top-tier return':<20} {t0:>11.4f} {t1:>11.4f} {t1 - t0:>+11.4f}")

    print("\n" + "=" * 70)
    print("Comparison complete!")
    print("=" * 70)


if __name__ == "__main__":
    main()

@ -0,0 +1,421 @@
#!/usr/bin/env python
"""
Dump Gold-Standard Data from Qlib Pipeline
This script exports processed feature data from the original Qlib pipeline
in multiple formats for debugging and comparison with the standalone Polars implementation.
Usage:
python dump_qlib_gold_standard.py --start-date 2020-01-02 --end-date 2020-01-10 --output-dir ../data/
"""
import argparse
import os
import sys
import pickle as pkl
from datetime import datetime, timedelta
from pathlib import Path
import pandas as pd
import polars as pl
import numpy as np
# Patch NumPy 2.0 compatibility: np.NaN was removed, use np.nan
if not hasattr(np, 'NaN'):
np.NaN = np.nan
def parse_args(argv=None):
    """Parse command-line options for the gold-standard dump.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``, in
            which case argparse falls back to ``sys.argv[1:]`` — so existing
            callers are unaffected, while tests can pass an explicit list.

    Returns:
        argparse.Namespace with ``start_date``, ``end_date``, ``output_dir``
        and ``qlib_dataset_path`` attributes.
    """
    parser = argparse.ArgumentParser(
        description="Dump gold-standard data from Qlib pipeline"
    )
    parser.add_argument(
        "--start-date",
        type=str,
        default="2020-01-02",
        help="Start date for data export (YYYY-MM-DD)",
    )
    parser.add_argument(
        "--end-date",
        type=str,
        default="2020-01-10",
        help="End date for data export (YYYY-MM-DD)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="../data/",
        help="Output directory for exported files",
    )
    parser.add_argument(
        "--qlib-dataset-path",
        type=str,
        default="/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/",
        help="Path to Qlib dataset module",
    )
    # Passing argv through (instead of always reading sys.argv) keeps the
    # parser testable without changing behavior for the CLI entry point.
    return parser.parse_args(argv)
def load_qlib_data(qlib_dataset_path, since_date):
    """
    Load processed data from Qlib pipeline.

    This function loads data using the original Qlib pipeline and handles
    the SepDataFrame return type by concatenating column groups, then
    applies the pickled processor list and trims to the requested start.

    Args:
        qlib_dataset_path: Path to the Qlib dataset module (must contain
            ``__init__.py``, ``handler.yaml`` and ``proc_list.proc``).
        since_date: Start date for loading data (YYYY-MM-DD). Twenty extra
            calendar days of history are loaded before it so diff-style
            processors have a warm-up window.

    Returns:
        pd.DataFrame: Processed DataFrame from Qlib pipeline with all
        column groups concatenated and columns prefixed ``group::name``.
    """
    import importlib.util
    import datetime as dt
    # Patch ruamel.yaml to provide safe_load compatibility:
    # newer ruamel versions removed the module-level safe_load used by Qlib.
    import ruamel.yaml as yaml
    # Create a YAML instance with safe loader for backward compatibility
    _yaml = yaml.YAML(typ='safe', pure=True)
    # Monkey-patch safe_load to use the new API. This must happen BEFORE the
    # dataset module is executed below, since its import may call safe_load.
    def patched_safe_load(stream):
        import io
        if isinstance(stream, str):
            stream = io.StringIO(stream)
        return _yaml.load(stream)
    yaml.safe_load = patched_safe_load
    # Load the dataset package directly from its path (it is not importable
    # by name from this script's environment).
    spec = importlib.util.spec_from_file_location(
        "qlib_dataset",
        os.path.join(qlib_dataset_path, "__init__.py")
    )
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    # Parse since_date
    since_date_dt = pd.to_datetime(since_date)
    # Load with extra history for Diff processor (20 calendar days).
    load_start = (since_date_dt - dt.timedelta(days=20)).strftime("%Y-%m-%d")
    print(f" Loading data with handler (load_start={load_start})...")
    # Use _load_from_yaml to get raw handler data (SepDataFrame)
    handler_data = module._load_from_yaml(
        os.path.join(qlib_dataset_path, "handler.yaml"),
        load_start
    )
    # Handle SepDataFrame - extract and concatenate column groups.
    # NOTE(review): which private attribute holds the dict depends on the
    # SepDataFrame version — both `_data` and `_df_dict` are probed.
    if hasattr(handler_data, '_data') or hasattr(handler_data, '_df_dict'):
        # It's a SepDataFrame from AggHandler
        df_dict = getattr(handler_data, '_data', None) or getattr(handler_data, '_df_dict', {})
        group_names = list(df_dict.keys())
        print(f" Handler returned SepDataFrame with groups: {group_names}")
        # Concatenate all column groups into a single DataFrame
        all_dfs = []
        for group in group_names:
            df = df_dict[group]
            if df is not None and len(df.columns) > 0:
                df_copy = df.copy()
                # Add group prefix to columns so names stay unique across groups
                df_copy.columns = [f"{group}::{col}" for col in df_copy.columns]
                all_dfs.append(df_copy)
                print(f" Group '{group}': {df_copy.shape}")
        # Concatenate all groups along axis 1
        raw_df = pd.concat(all_dfs, axis=1)
        print(f" Concatenated raw data shape: {raw_df.shape}")
    else:
        raw_df = handler_data
        print(f" Raw data shape: {raw_df.shape}")
    # Load processor list (pickled; trusted local artifact of the pipeline)
    proc_path = os.path.join(qlib_dataset_path, "proc_list.proc")
    print(f" Loading processor list from: {proc_path}")
    with open(proc_path, "rb") as f:
        proc_list = pkl.load(f)
    print(f" Processor list has {len(proc_list)} processors")
    for i, proc in enumerate(proc_list):
        print(f" {i+1}. {type(proc).__name__}")
    # Apply processors
    from qlib.contrib.data.utils import apply_proc_list
    print(f" Applying processor list (with_fit=False)...")
    # The processor list expects columns without the group prefix.
    # We need to strip the prefix before applying processors.
    # Create a mapping and restore original column names afterwards.
    # NOTE(review): if two groups contain the same bare column name, this
    # rename (and the inverse below) collides — assumed unique across groups.
    col_mapping = {}
    for col in raw_df.columns:
        if '::' in col:
            original = col.split('::', 1)[1]
            col_mapping[col] = original
    # Rename columns back to original names for processor application
    raw_df_renamed = raw_df.rename(columns=col_mapping)
    print(f" Renamed columns for processor compatibility. Shape: {raw_df_renamed.shape}")
    # Convert boolean columns to object to avoid NaN -> int conversion issues
    bool_cols = raw_df_renamed.select_dtypes(include=['bool']).columns
    print(f" Converting {len(bool_cols)} boolean columns to object dtype")
    for col in bool_cols:
        raw_df_renamed[col] = raw_df_renamed[col].astype(object)
    # Apply processors (inference mode: with_fit=False reuses fitted state)
    df = apply_proc_list(raw_df_renamed, proc_list=proc_list, with_fit=False)
    print(f" Applied processor list. Result shape: {df.shape}")
    # Add back group prefixes to columns
    new_col_mapping = {v: k for k, v in col_mapping.items()}
    df = df.rename(columns=new_col_mapping)
    print(f" Restored column group prefixes. Shape: {df.shape}")
    # Drop the warm-up history: keep rows from since_date onward.
    df = df.loc(axis=0)[slice(since_date_dt, None)]
    print(f" Filtered to since_date={since_date}. Final shape: {df.shape}")
    return df
def export_column_groups(df, output_dir, prefix="gold_standard"):
    """
    Export separate files for different column groups.

    Column groups:
    - feature: alpha158 + alpha158_ntrl
    - feature_ext: extended features (log_size_diff, etc.)
    - feature_flag: market flags (IsST, IsN, IsZt, IsDt, etc.)
    - indus_idx: industry index columns
    - feature_ntrl: all columns ending in ``_ntrl``

    Returns:
        Dict mapping group name -> written parquet path (empty groups skipped).
    """
    # Group membership is determined purely by column-name convention:
    # four prefix-based groups plus the *_ntrl suffix group.
    groups = [
        ("feature", [c for c in df.columns if c.startswith("feature::")]),
        ("feature_ext", [c for c in df.columns if c.startswith("feature_ext::")]),
        ("feature_flag", [c for c in df.columns if c.startswith("feature_flag::")]),
        ("indus_idx", [c for c in df.columns if c.startswith("indus_idx::")]),
        ("feature_ntrl", [c for c in df.columns if c.endswith("_ntrl")]),
    ]

    export_paths = {}
    for group, cols in groups:
        if not cols:
            continue  # nothing to write for this group
        path = os.path.join(output_dir, f"{prefix}_{group}.parquet")
        df[cols].to_parquet(path)
        export_paths[group] = path
        print(f" Exported {group} columns ({len(cols)}): {path}")
    return export_paths
def export_metadata(df, output_dir, prefix="gold_standard", proc_list_path=None):
    """
    Export metadata about the dataset to ``{prefix}_metadata.txt``.

    The report covers shape, date range, instruments, column groups,
    dtypes, NaN statistics, the full column list and (optionally) the
    pickled processor list.

    Args:
        df: DataFrame with a (datetime, instrument) MultiIndex.
        output_dir: Directory the report is written into.
        prefix: File-name prefix for the report.
        proc_list_path: Optional path to a pickled processor list to describe.

    Returns:
        Path of the written metadata file.
    """
    metadata_path = os.path.join(output_dir, f"{prefix}_metadata.txt")
    with open(metadata_path, "w") as f:
        w = f.write

        def section(title):
            # Section header: title line plus a 40-char rule.
            w(title + "\n")
            w("-" * 40 + "\n")

        w("=" * 80 + "\n")
        w("GOLD-STANDARD QLIB PIPELINE OUTPUT - METADATA\n")
        w("=" * 80 + "\n\n")
        w(f"Export Date: {datetime.now().isoformat()}\n\n")

        section("DATAFRAME SHAPE")
        w(f"Shape: {df.shape}\n")
        w(f"Rows: {len(df)}\n")
        w(f"Columns: {len(df.columns)}\n\n")

        section("DATE RANGE")
        dates = df.index.get_level_values("datetime").unique()
        w(f"Min Date: {dates.min()}\n")
        w(f"Max Date: {dates.max()}\n")
        w(f"Unique Dates: {len(dates)}\n\n")

        section("INSTRUMENTS")
        instruments = df.index.get_level_values("instrument").unique()
        w(f"Unique Instruments: {len(instruments)}\n")
        w(f"Sample Instruments: {list(instruments[:10])}\n\n")

        section("COLUMN GROUPS")
        # Same naming-convention buckets used by export_column_groups.
        for p in ("feature::", "feature_ext::", "feature_flag::", "indus_idx::"):
            w(f"{p} columns: {sum(1 for c in df.columns if c.startswith(p))}\n")
        ntrl_count = sum(1 for c in df.columns if c.endswith("_ntrl"))
        w(f"*_ntrl columns: {ntrl_count}\n\n")

        section("COLUMN DTYPES")
        for dtype, count in df.dtypes.value_counts().items():
            w(f"{dtype}: {count}\n")
        w("\n")

        section("NAN STATISTICS")
        nan_counts = df.isna().sum()
        cols_with_nan = nan_counts[nan_counts > 0]
        w(f"Columns with NaN: {len(cols_with_nan)}\n")
        w(f"Total NaN values: {df.isna().sum().sum()}\n\n")
        if len(cols_with_nan) > 0:
            w("NaN per column (top 20):\n")
            for col, cnt in cols_with_nan.nlargest(20).items():
                w(f" {col}: {cnt} ({100*cnt/len(df):.2f}%)\n")
            w("\n")

        section("ALL COLUMN NAMES")
        for i, col in enumerate(df.columns):
            w(f" {i+1}. {col}\n")
        w("\n")

        if proc_list_path and os.path.exists(proc_list_path):
            section("PROCESSOR LIST")
            w(f"Source: {proc_list_path}\n")
            try:
                with open(proc_list_path, "rb") as pf:
                    proc_list = pkl.load(pf)
                w(f"Number of processors: {len(proc_list)}\n\n")
                for i, proc in enumerate(proc_list):
                    w(f" {i+1}. {proc}\n")
            except Exception as e:
                # A corrupt/unreadable processor list is reported, not fatal.
                w(f"Could not load processor list: {e}\n")
            w("\n")

    print(f"Exported metadata: {metadata_path}")
    return metadata_path
def main():
    """Entry point: load processed Qlib data and export it in several formats.

    Loads the handler output for the requested window, writes the full
    DataFrame (parquet + pickle), per-group parquet files and a metadata
    report into the output directory. Exits with status 1 on load failure.
    """
    args = parse_args()

    start_date = pd.to_datetime(args.start_date)
    end_date = pd.to_datetime(args.end_date)

    # Create output directory if it doesn't exist.
    output_dir = Path(args.output_dir).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    banner = "=" * 80
    print(banner)
    print("DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE")
    print(banner)
    print(f"Date Range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Output Directory: {output_dir}")
    print(f"Qlib Dataset Path: {args.qlib_dataset_path}")
    print()

    # Load data from Qlib pipeline; abort the run on any failure.
    print("Step 1: Loading data from Qlib pipeline...")
    print(f" Loading since_date={start_date.strftime('%Y-%m-%d')}")
    try:
        df = load_qlib_data(args.qlib_dataset_path, start_date.strftime("%Y-%m-%d"))
        print(f" Loaded DataFrame with shape: {df.shape}")
    except Exception as e:
        print(f" ERROR: Failed to load data from Qlib pipeline: {e}")
        sys.exit(1)

    # Trim the loaded frame to exactly the requested window.
    print("\nStep 2: Filtering to requested date range...")
    df = df.loc(axis=0)[slice(start_date, end_date)]
    print(f" Filtered shape: {df.shape}")

    print("\nStep 3: Exporting full DataFrame...")
    prefix = f"gold_standard_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}"
    parquet_path = output_dir / f"{prefix}.parquet"
    df.to_parquet(parquet_path)
    print(f" Exported parquet: {parquet_path}")
    pkl_path = output_dir / f"{prefix}.pkl"
    df.to_pickle(pkl_path)
    print(f" Exported pickle: {pkl_path}")

    print("\nStep 4: Exporting column groups...")
    export_paths = export_column_groups(df, str(output_dir), prefix=prefix)

    print("\nStep 5: Exporting metadata...")
    export_metadata(df, str(output_dir), prefix=prefix,
                    proc_list_path=os.path.join(args.qlib_dataset_path, "proc_list.proc"))

    # Summary
    print("\n" + banner)
    print("EXPORT SUMMARY")
    print(banner)
    print(f"Date range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"Output directory: {output_dir}")
    print(f"Total rows: {len(df)}")
    print(f"Total columns: {len(df.columns)}")
    print(f"\nFiles exported:")
    print(f" - {prefix}.parquet (full DataFrame)")
    print(f" - {prefix}.pkl (pickle, preserves dtypes)")
    print(f" - {prefix}_metadata.txt (column info, statistics)")
    for group, path in export_paths.items():
        print(f" - {os.path.basename(path)} ({group} columns)")
    print("\nDone!")


if __name__ == "__main__":
    main()

@ -0,0 +1,270 @@
#!/usr/bin/env python
"""
Dump Gold-Standard Data from Qlib Pipeline (Simple Version)
This script exports the RAW feature data from the Qlib pipeline BEFORE
any processors are applied. This is useful for debugging and comparison.
NOTE: This script loads ALL data from DolphinDB and then filters to the
requested date range. For large date ranges, this may require significant memory.
Usage:
python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10
"""
import argparse
import os
import sys
import pickle as pkl
from datetime import datetime, timedelta
from pathlib import Path
import pandas as pd
import numpy as np
# Patch NumPy 2.0 compatibility: np.NaN was removed, use np.nan
if not hasattr(np, 'NaN'):
np.NaN = np.nan
def parse_args(argv=None):
    """Parse command-line options for the simple gold-standard dump.

    Args:
        argv: Optional list of argument strings. Defaults to ``None``, in
            which case argparse falls back to ``sys.argv[1:]`` — existing
            callers are unaffected, while tests can pass an explicit list.

    Returns:
        argparse.Namespace with ``start_date``, ``end_date``, ``output_dir``,
        ``qlib_dataset_path`` and ``instruments`` attributes. Note that
        ``instruments`` is the raw comma-separated string (or None); callers
        are responsible for splitting it.
    """
    parser = argparse.ArgumentParser(
        description="Dump gold-standard raw data from Qlib pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Export a few days for debugging (recommended)
  python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10

  # Export with custom output directory
  python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10 --output-dir /path/to/output
""",
    )
    parser.add_argument(
        "--start-date",
        type=str,
        default="2020-01-02",
        help="Start date for data export (YYYY-MM-DD)",
    )
    parser.add_argument(
        "--end-date",
        type=str,
        default="2020-01-10",
        help="End date for data export (YYYY-MM-DD)",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="../data/",
        help="Output directory for exported files",
    )
    parser.add_argument(
        "--qlib-dataset-path",
        type=str,
        default="/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/",
        help="Path to Qlib dataset module",
    )
    parser.add_argument(
        "--instruments",
        type=str,
        default=None,
        help="Comma-separated list of instrument codes to export (default: all)",
    )
    # Passing argv through (instead of always reading sys.argv) keeps the
    # parser testable without changing behavior for the CLI entry point.
    return parser.parse_args(argv)
def load_raw_data(qlib_dataset_path, since_date, instruments=None):
    """
    Load RAW data from Qlib pipeline (before processor list is applied).

    Returns a dict of DataFrames, one per column group, plus the handler's
    row index.

    Args:
        qlib_dataset_path: Path to Qlib dataset module (must contain
            ``__init__.py`` and ``handler.yaml``).
        since_date: Start date for loading (YYYY-MM-DD); 20 extra calendar
            days of history are loaded before it for diff-style features.
        instruments: Optional list of instrument codes to filter.

    Returns:
        Tuple of (dict mapping group name -> DataFrame (or None),
        the handler data's index).
    """
    import importlib.util
    # Patch ruamel.yaml: newer versions removed the module-level safe_load
    # that the Qlib dataset module relies on. Must happen BEFORE the module
    # is executed below.
    import ruamel.yaml as yaml
    # Create a YAML instance with safe loader for backward compatibility
    _yaml = yaml.YAML(typ='safe', pure=True)
    def patched_safe_load(stream):
        import io
        if isinstance(stream, str):
            stream = io.StringIO(stream)
        return _yaml.load(stream)
    yaml.safe_load = patched_safe_load
    # Load the dataset package directly from its path (not importable by name).
    spec = importlib.util.spec_from_file_location(
        "qlib_dataset",
        os.path.join(qlib_dataset_path, "__init__.py")
    )
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    # Parse since_date
    since_date_dt = pd.to_datetime(since_date)
    # Load with extra history for Diff processor (20 calendar days).
    load_start = (since_date_dt - timedelta(days=20)).strftime("%Y-%m-%d")
    print(f" Loading raw data from handler (load_start={load_start})...")
    if instruments:
        print(f" Filtering instruments: {instruments[:5]}... ({len(instruments)} total)")
    # Use _load_from_yaml to get raw handler data (SepDataFrame)
    handler_data = module._load_from_yaml(
        os.path.join(qlib_dataset_path, "handler.yaml"),
        load_start
    )
    # Handle SepDataFrame - extract column groups.
    # NOTE(review): which private attribute holds the dict depends on the
    # SepDataFrame version — both `_data` and `_df_dict` are probed.
    if hasattr(handler_data, '_data') or hasattr(handler_data, '_df_dict'):
        df_dict = getattr(handler_data, '_data', None) or getattr(handler_data, '_df_dict', {})
        group_names = list(df_dict.keys())
        print(f" Handler returned SepDataFrame with groups: {group_names}")
        # Filter instruments if specified (in place, group by group)
        if instruments:
            print(f" Filtering to specified instruments...")
            for group in group_names:
                if df_dict[group] is not None:
                    df = df_dict[group]
                    # Filter by instrument level; only MultiIndexed groups
                    # carry an 'instrument' level, others are left untouched.
                    if isinstance(df.index, pd.MultiIndex):
                        mask = df.index.get_level_values('instrument').isin(instruments)
                        df_dict[group] = df[mask]
                        print(f" Group '{group}': {df_dict[group].shape} (filtered)")
        for group in group_names:
            df = df_dict[group]
            if df is not None:
                print(f" Group '{group}': shape={df.shape}, columns={len(df.columns)}")
        return df_dict, handler_data.index
    else:
        # Plain DataFrame fallback: wrap it so callers always get a dict.
        # NOTE(review): the instruments filter is NOT applied on this path.
        print(f" Handler returned DataFrame: shape={handler_data.shape}")
        return {"default": handler_data}, handler_data.index
def export_data(df_dict, index, output_dir, start_date, end_date,
                proc_path="/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc"):
    """Export data to parquet and pickle files plus a metadata text file.

    Each non-empty column group is written twice: parquet for interop and
    pickle to preserve exact dtypes.

    Args:
        df_dict: Mapping of group name -> pandas DataFrame (entries may be None).
        index: Date index returned by load_raw_data(); used to window the rows.
        output_dir: Destination directory (created if missing).
        start_date, end_date: Inclusive export window (pd.to_datetime-parseable).
        proc_path: Path to the pickled processor list recorded (for reference)
            in the metadata file. New optional parameter; defaults to the path
            that was previously hard-coded, so existing callers are unaffected.

    Returns:
        List of paths (str) of every file written.
    """
    output_dir = Path(output_dir).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    # Window the index to the requested range.
    # NOTE(review): assumes `index` supports elementwise datetime comparison
    # (e.g. a DatetimeIndex) — confirm against load_raw_data()'s return value.
    mask = (index >= start_date) & (index <= end_date)
    filtered_index = index[mask]
    print(f"\nExporting data for date range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f" Filtered index has {len(filtered_index)} dates")

    prefix = f"gold_standard_raw_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}"
    exported_files = []

    # Export each group separately
    for group, df in df_dict.items():
        if df is None or len(df.columns) == 0:
            print(f" Skipping empty group '{group}'")
            continue
        # Filter rows to the export window.
        df_filtered = df.loc[df.index.isin(filtered_index)]
        print(f" Group '{group}': {df_filtered.shape}")

        # Export to parquet
        parquet_path = output_dir / f"{prefix}_{group}.parquet"
        df_filtered.to_parquet(parquet_path)
        exported_files.append(str(parquet_path))
        print(f" -> {parquet_path}")

        # Export to pickle (preserves dtypes)
        pkl_path = output_dir / f"{prefix}_{group}.pkl"
        df_filtered.to_pickle(pkl_path)
        exported_files.append(str(pkl_path))

    # Human-readable metadata describing the export.
    metadata_path = output_dir / f"{prefix}_metadata.txt"
    with open(metadata_path, "w") as f:
        f.write("=" * 80 + "\n")
        f.write("GOLD-STANDARD RAW DATA - METADATA\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Export Date: {datetime.now().isoformat()}\n")
        f.write(f"Date Range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}\n")
        f.write(f"Total Dates: {len(filtered_index)}\n\n")
        f.write("COLUMN GROUPS:\n")
        f.write("-" * 40 + "\n")
        for group, df in df_dict.items():
            if df is not None:
                f.write(f" {group}:\n")
                f.write(f" Shape: {df.shape}\n")
                f.write(f" Columns: {len(df.columns)}\n")
                f.write(f" Sample columns: {list(df.columns[:5])}...\n\n")
        f.write("\nPROCESSOR LIST (for reference):\n")
        f.write("-" * 40 + "\n")
        # proc_path was previously hard-coded here; now a parameter.
        if os.path.exists(proc_path):
            with open(proc_path, "rb") as pf:
                proc_list = pkl.load(pf)
            f.write(f"Number of processors: {len(proc_list)}\n\n")
            for i, proc in enumerate(proc_list):
                f.write(f" {i+1}. {type(proc).__module__}.{type(proc).__name__}\n")
        else:
            f.write(f"Processor list not found: {proc_path}\n")
    exported_files.append(str(metadata_path))
    return exported_files
def main():
    """Entry point: parse CLI args, load raw Qlib data, export it, report."""
    args = parse_args()
    rule = "=" * 80

    print(rule)
    print("DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE")
    print(rule)
    print(f"Date Range: {args.start_date} to {args.end_date}")
    print(f"Output Directory: {args.output_dir}")
    print(f"Qlib Dataset Path: {args.qlib_dataset_path}")
    print()

    # Step 1: pull the raw (pre-processor) data out of the Qlib pipeline.
    print("Step 1: Loading raw data from Qlib pipeline...")
    try:
        instruments = args.instruments.split(',') if args.instruments else None
        df_dict, index = load_raw_data(args.qlib_dataset_path, args.start_date, instruments=instruments)
    except Exception as e:
        print(f" ERROR: Failed to load data: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

    # Step 2: write the windowed data to disk.
    print("\nStep 2: Exporting data...")
    exported_files = export_data(df_dict, index, args.output_dir, args.start_date, args.end_date)

    # Final summary of what was written.
    print("\n" + rule)
    print("EXPORT SUMMARY")
    print(rule)
    print(f"Date range: {args.start_date} to {args.end_date}")
    print(f"Output directory: {Path(args.output_dir).resolve()}")
    print(f"\nFiles exported ({len(exported_files)}):")
    for exported in exported_files:
        print(f" - {exported}")
    print("\nDone!")
if __name__ == "__main__":
main()

@ -0,0 +1,210 @@
#!/usr/bin/env python
"""
Fetch embedding data from DolphinDB and save to parquet.
This script:
1. Connects to DolphinDB
2. Queries the dwm_1day_multicast_csencode table
3. Filters by version (default: 'csiallx_feature2_ntrla_flag_pnlnorm')
4. Filters by date range
5. Transforms columns (m_nDate -> datetime, code -> instrument)
6. Saves to local parquet file
"""
import os
import polars as pl
import pandas as pd
from datetime import datetime
from typing import Optional
# DolphinDB config (from CLAUDE.md)
# SECURITY NOTE(review): credentials are hard-coded in plaintext — consider
# moving them to environment variables or a secrets store.
DDB_CONFIG = {
    "host": "192.168.1.146",
    "port": 8848,
    "username": "admin",
    "password": "123456"
}
# Distributed database path and table holding the per-day cross-sectional encodings.
DB_PATH = "dfs://daily_stock_run_multicast"
TABLE_NAME = "dwm_1day_multicast_csencode"
# Default row filters: embedding version tag and inclusive date range.
DEFAULT_VERSION = "csix_alpha158b_ext2_zscore_vae4"
DEFAULT_START_DATE = "2019-01-01"
DEFAULT_END_DATE = "2025-12-31"
# Where fetch_embeddings() writes its parquet output by default.
OUTPUT_FILE = "../data/embeddings_from_ddb.parquet"
def fetch_embeddings(
    start_date: str = DEFAULT_START_DATE,
    end_date: str = DEFAULT_END_DATE,
    version: str = DEFAULT_VERSION,
    output_file: str = OUTPUT_FILE
) -> pl.DataFrame:
    """
    Fetch embedding data from DolphinDB, normalize column formats, and save
    the result to a local parquet file.

    Args:
        start_date: Start date filter (YYYY-MM-DD)
        end_date: End date filter (YYYY-MM-DD)
        version: Version string to filter by
        output_file: Output parquet file path

    Returns:
        Polars DataFrame with columns: [datetime, instrument, embedding_0, embedding_1, ...]

    Raises:
        Exception: re-raised on DolphinDB connection or query failure.
    """
    print("=" * 60)
    print("Fetching embedding data from DolphinDB")
    print("=" * 60)
    print(f"Database: {DB_PATH}")
    print(f"Table: {TABLE_NAME}")
    print(f"Version: {version}")
    print(f"Date range: {start_date} to {end_date}")

    # Connect to DolphinDB
    try:
        from qshare.io.ddb import get_ddb_sess
        sess = get_ddb_sess(host=DDB_CONFIG["host"], port=DDB_CONFIG["port"])
        print(f"Connected to DolphinDB at {DDB_CONFIG['host']}:{DDB_CONFIG['port']}")
    except Exception as e:
        print(f"Error connecting to DolphinDB: {e}")
        raise

    # DolphinDB date literals use dots: YYYY.MM.DD
    start_ddb = start_date.replace("-", ".")
    end_ddb = end_date.replace("-", ".")
    # Filter in the WHERE clause server-side; date() wraps the literals.
    # Single-line SQL avoids parsing issues.
    sql = f'select * from loadTable("{DB_PATH}", "{TABLE_NAME}") where version = "{version}" and m_nDate >= date({start_ddb}) and m_nDate <= date({end_ddb})'
    print(f"Executing SQL: {sql.strip()}")
    try:
        # Execute query and get pandas DataFrame
        df_pd = sess.run(sql)
        print(f"Fetched {len(df_pd)} rows from DolphinDB")
        print(f"Columns: {df_pd.columns.tolist()}")
        if len(df_pd) > 0:
            print(f"Sample:\n{df_pd.head()}")
    except Exception as e:
        print(f"Error executing query: {e}")
        raise
    finally:
        sess.close()

    # Convert to Polars
    df = pl.from_pandas(df_pd)
    print(f"Columns in result: {df.columns}")

    # --- datetime: rename m_nDate -> datetime and normalize to uint32 YYYYMMDD ---
    if 'm_nDate' in df.columns:
        df = df.rename({"m_nDate": "datetime"})
        if df["datetime"].dtype == pl.Datetime:
            df = df.with_columns([
                pl.col("datetime").dt.strftime("%Y%m%d").cast(pl.UInt32).alias("datetime")
            ])
        elif df["datetime"].dtype == pl.Date:
            df = df.with_columns([
                pl.col("datetime").dt.strftime("%Y%m%d").cast(pl.UInt32).alias("datetime")
            ])
        elif df["datetime"].dtype in [pl.Utf8, pl.String]:
            # BUGFIX: polars str.replace only replaces the FIRST match, so
            # "2019-01-01" became "201901-01" and the UInt32 cast failed.
            # replace_all strips every dash.
            df = df.with_columns([
                pl.col("datetime").str.replace_all("-", "").cast(pl.UInt32).alias("datetime")
            ])
        else:
            df = df.with_columns([pl.col("datetime").cast(pl.UInt32).alias("datetime")])

    # --- instrument: rename code -> instrument and strip exchange prefix ---
    if 'code' in df.columns:
        df = df.rename({"code": "instrument"})
        # Codes look like 'SH600085'; the prefix occurs once, so a single
        # replace per exchange is sufficient before the numeric cast.
        df = df.with_columns([
            pl.col("instrument")
            .str.replace("SH", "")
            .str.replace("SZ", "")
            .str.replace("BJ", "")
            .cast(pl.UInt32)
            .alias("instrument")
        ])

    # Drop version column if present (no longer needed)
    if 'version' in df.columns:
        df = df.drop('version')

    # Expand a list-typed 'values' column into embedding_0..embedding_{d-1}.
    if 'values' in df.columns and df['values'].dtype == pl.List:
        first_val = df['values'][0]
        if first_val is not None:
            emb_dim = len(first_val)
            print(f"Detected embedding dimension: {emb_dim}")
            embedding_cols = [f"embedding_{i}" for i in range(emb_dim)]
            # Single with_columns call instead of one per dimension.
            df = df.with_columns([
                pl.col('values').list.get(i).alias(col_name)
                for i, col_name in enumerate(embedding_cols)
            ])
            df = df.drop('values')
            # Reorder columns: datetime, instrument, embedding_0, embedding_1, ...
            df = df.select(['datetime', 'instrument'] + embedding_cols)
            print(f"Expanded embeddings into {emb_dim} columns")
        else:
            # Guard: previously a null first row silently skipped expansion;
            # make that explicit.
            print("Warning: 'values' column present but first row is null; skipping expansion")
    else:
        # Embeddings already flat (e.g. 'feature_0', 'emb_0', ...): keep the
        # key columns plus every remaining column, sorted for stable order.
        core_cols = ['datetime', 'instrument']
        embedding_cols = [c for c in df.columns if c not in core_cols + ['version']]
        df = df.select(core_cols + sorted(embedding_cols))

    print(f"\nTransformed data:")
    print(f" Shape: {df.shape}")
    print(f" Columns: {df.columns[:10]}..." if len(df.columns) > 10 else f" Columns: {df.columns}")
    print(f" Date range: {df['datetime'].min()} to {df['datetime'].max()}")
    print(f" Instrument count: {df['instrument'].n_unique()}")
    print(f" Sample:\n{df.head()}")

    # Save to parquet (guard against a bare filename with no directory part).
    os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
    df.write_parquet(output_file)
    print(f"\nSaved to: {output_file}")
    return df
if __name__ == "__main__":
    import argparse

    # CLI wrapper: every option defaults to the module-level constants above.
    parser = argparse.ArgumentParser(description="Fetch embedding data from DolphinDB")
    parser.add_argument("--start-date", type=str, default=DEFAULT_START_DATE,
                        help="Start date (YYYY-MM-DD)")
    parser.add_argument("--end-date", type=str, default=DEFAULT_END_DATE,
                        help="End date (YYYY-MM-DD)")
    parser.add_argument("--version", type=str, default=DEFAULT_VERSION,
                        help="Version string to filter by")
    parser.add_argument("--output", type=str, default=OUTPUT_FILE,
                        help="Output parquet file")
    args = parser.parse_args()
    # Fetch, transform, and persist; fetch_embeddings prints its own progress.
    df = fetch_embeddings(
        start_date=args.start_date,
        end_date=args.end_date,
        version=args.version,
        output_file=args.output
    )
    print("\nDone!")

@ -0,0 +1,211 @@
#!/usr/bin/env python
"""
Fetch original 0_7 predictions from DolphinDB and save to parquet.
This script:
1. Connects to DolphinDB
2. Queries the app_1day_multicast_longsignal_port table
3. Filters for version 'host140_exp20_d033'
4. Transforms columns (m_nDate -> datetime, code -> instrument)
5. Saves to local parquet file
"""
import os
import polars as pl
import pandas as pd
from datetime import datetime
from typing import Optional
# DolphinDB config (from CLAUDE.md)
# SECURITY NOTE(review): plaintext credentials — consider environment
# variables or a secrets store.
DDB_CONFIG = {
    "host": "192.168.1.146",
    "port": 8848,
    "username": "admin",
    "password": "123456"
}
# Full dfs:// path of the long-signal portfolio table queried below.
TABLE_PATH = "dfs://daily_stock_run_multicast/app_1day_multicast_longsignal_port"
# Version prefix used to select the original 0_7 prediction rows.
VERSION = "host140_exp20_d033"
# Default parquet destination for fetch_original_predictions().
OUTPUT_FILE = "../data/original_predictions_0_7.parquet"
def datetime_to_uint32(dt) -> int:
    """Normalize a date-like value to its YYYYMMDD integer form.

    Numbers pass through unchanged; anything exposing strftime() is
    formatted; everything else is coerced with int().
    """
    if isinstance(dt, (int, float)):
        return int(dt)
    strftime = getattr(dt, 'strftime', None)
    if strftime is not None:
        return int(strftime('%Y%m%d'))
    return int(dt)
def tscode_to_uint32(code) -> int:
    """Convert an instrument code to its uint32-style integer.

    Handles all formats seen in this pipeline:
      * int passthrough:  600085      -> 600085
      * suffix form:      '000001.SZ' -> 1
      * prefix form:      'SH600085'  -> 600085  (previously raised ValueError)

    Args:
        code: int or string instrument code.

    Returns:
        Integer code with exchange markers and leading zeros removed.
    """
    if isinstance(code, int):
        return code
    import re
    # Drop an exchange suffix ('.SZ'), then any leading exchange prefix
    # ('SH'/'SZ'/'BJ', matched generically as leading letters).
    code_str = str(code).split('.')[0]
    code_str = re.sub(r'^[A-Za-z]+', '', code_str)
    return int(code_str)
def fetch_original_predictions(
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    output_file: str = OUTPUT_FILE
) -> pl.DataFrame:
    """
    Fetch original 0_7 predictions from DolphinDB and save them to parquet.

    Args:
        start_date: Optional start date filter (YYYY-MM-DD)
        end_date: Optional end date filter (YYYY-MM-DD)
        output_file: Output parquet file path

    Returns:
        Polars DataFrame with columns: [datetime, instrument, prediction]

    Raises:
        Exception: re-raised on DolphinDB connection or query failure.
    """
    print("Fetching original 0_7 predictions from DolphinDB...")
    print(f"Table: {TABLE_PATH}")
    print(f"Version: {VERSION}")

    # Connect to DolphinDB
    try:
        from qshare.io.ddb import get_ddb_sess
        sess = get_ddb_sess(host=DDB_CONFIG["host"], port=DDB_CONFIG["port"])
        print(f"Connected to DolphinDB at {DDB_CONFIG['host']}:{DDB_CONFIG['port']}")
    except Exception as e:
        print(f"Error connecting to DolphinDB: {e}")
        raise

    # Split "dfs://db/table" into its db and table components for loadTable().
    db_path, table_name = TABLE_PATH.replace("dfs://", "").split("/", 1)
    sql = f"""
    select * from loadTable("dfs://{db_path}", "{table_name}")
    """
    # We filter in Python after loading since DolphinDB's SQL syntax
    # for partitioned tables can be tricky.
    print(f"Executing SQL: {sql.strip()}")
    try:
        df_full = sess.run(sql)
        print(f"Fetched {len(df_full)} total rows from DolphinDB")
        print(f"Columns: {df_full.columns.tolist()}")
        print(f"Sample:\n{df_full.head()}")
        print(f"Version values: {df_full['version'].unique()[:10] if 'version' in df_full.columns else 'N/A'}")

        # Version string carries extra parameters, so match by prefix.
        if 'version' in df_full.columns:
            df_pd = df_full[df_full['version'].str.startswith(VERSION)]
            print(f"Filtered to {len(df_pd)} rows for version '{VERSION}'")
            if len(df_pd) > 0:
                print(f"Matching versions: {df_pd['version'].unique()[:5]}")
        else:
            print("Warning: 'version' column not found, using all data")
            df_pd = df_full

        # Apply optional date filters (m_nDate is datetime64).
        if start_date and 'm_nDate' in df_pd.columns:
            start_dt = pd.to_datetime(start_date)
            df_pd = df_pd[df_pd['m_nDate'] >= start_dt]
        if end_date and 'm_nDate' in df_pd.columns:
            end_dt = pd.to_datetime(end_date)
            df_pd = df_pd[df_pd['m_nDate'] <= end_dt]
        print(f"After date filter: {len(df_pd)} rows")
    except Exception as e:
        print(f"Error executing query: {e}")
        raise
    finally:
        sess.close()

    # Convert to Polars
    df = pl.from_pandas(df_pd)
    print(f"Columns in result: {df.columns}")
    print(f"Sample data:\n{df.head()}")

    # --- datetime: rename m_nDate -> datetime, normalize to uint32 YYYYMMDD ---
    df = df.rename({"m_nDate": "datetime"})
    if df["datetime"].dtype == pl.Datetime:
        df = df.with_columns([
            pl.col("datetime").dt.strftime("%Y%m%d").cast(pl.UInt32).alias("datetime")
        ])
    elif df["datetime"].dtype == pl.Date:
        df = df.with_columns([
            pl.col("datetime").dt.strftime("%Y%m%d").cast(pl.UInt32).alias("datetime")
        ])
    elif df["datetime"].dtype in [pl.Utf8, pl.String]:
        # BUGFIX: polars str.replace only replaces the FIRST match; a date
        # string "2019-01-01" needs replace_all to strip both dashes before
        # the UInt32 cast.
        df = df.with_columns([
            pl.col("datetime").str.replace_all("-", "").cast(pl.UInt32).alias("datetime")
        ])
    else:
        # Already numeric, just cast
        df = df.with_columns([pl.col("datetime").cast(pl.UInt32).alias("datetime")])

    # --- instrument: codes look like "SH600085"; strip prefix, cast numeric ---
    df = df.rename({"code": "instrument"})
    df = df.with_columns([
        pl.col("instrument")
        .str.replace("SH", "")
        .str.replace("SZ", "")
        .str.replace("BJ", "")
        .cast(pl.UInt32)
        .alias("instrument")
    ])

    # The prediction column is named 'weight' in this table.
    if 'weight' in df.columns:
        df = df.rename({'weight': 'prediction'})
    else:
        # Fallback: first float column that is not a key column.
        for col in df.columns:
            if col not in ['datetime', 'instrument'] and df[col].dtype in [pl.Float32, pl.Float64]:
                df = df.rename({col: 'prediction'})
                break

    # Select only the columns we need
    df = df.select(["datetime", "instrument", "prediction"])
    print(f"\nTransformed data:")
    print(f" Shape: {df.shape}")
    print(f" Columns: {df.columns}")
    print(f" Date range: {df['datetime'].min()} to {df['datetime'].max()}")
    print(f" Sample:\n{df.head()}")

    # Save to parquet (guard against a bare filename with no directory part).
    os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
    df.write_parquet(output_file)
    print(f"\nSaved to: {output_file}")
    return df
if __name__ == "__main__":
    import argparse

    # CLI wrapper around fetch_original_predictions(); date filters are
    # optional (None = no filtering on that side).
    parser = argparse.ArgumentParser(description="Fetch original 0_7 predictions from DolphinDB")
    parser.add_argument("--start-date", type=str, default=None, help="Start date (YYYY-MM-DD)")
    parser.add_argument("--end-date", type=str, default=None, help="End date (YYYY-MM-DD)")
    parser.add_argument("--output", type=str, default=OUTPUT_FILE, help="Output parquet file")
    args = parser.parse_args()
    df = fetch_original_predictions(
        start_date=args.start_date,
        end_date=args.end_date,
        output_file=args.output
    )
    print("\nDone!")

@ -0,0 +1,292 @@
#!/usr/bin/env python
"""
Script to generate actual returns using real kline data without changing the original format.
This calculates real returns from kline VWAP prices using the original datetime and instrument format
and saves the result as 'v2v_5d' column.
"""
import pandas as pd
import numpy as np
import polars as pl
from datetime import datetime, timedelta
import os
def generate_real_returns_from_kline(input_kline_path="/data/parquet/dataset/stg_1day_wind_kline_adjusted_1D/",
                                     prediction_file=None,
                                     output_file="../data/actual_returns.parquet",
                                     return_days=5):
    """
    Generate real forward returns from kline data, preserving the original
    uint32 datetime (YYYYMMDD) and instrument formats.

    Args:
        input_kline_path: Path to the kline parquet dataset.
        prediction_file: Optional prediction parquet used to derive the date range.
        output_file: Destination parquet for [datetime, instrument, v2v_5d].
        return_days: Forward-return horizon in trading days (default 5).

    Returns:
        Polars DataFrame with columns [datetime, instrument, v2v_5d], or
        None when the kline data cannot be loaded or is empty in range.
    """
    print(f"Generating real returns from kline data...")

    # Prefer qshare's return calculation; fall back to a local implementation.
    try:
        from qshare.algo.polars.eval import calc_daily_return
        from qshare.algo.polars.spine import create_spine, align_to_calendar, merge_data_onto_spine
        print("Successfully imported qshare functions including spine operations")
        calc_daily_return_fn = calc_daily_return
    except ImportError as e:
        print(f"Could not import qshare functions: {e}")
        print("Falling back to manual return calculation without spine-filling")

        # BUGFIX: the fallback now matches the keyword call site below
        # (pldf=..., return_type=...). The old fallback's first parameter was
        # named `df` and it had no return_type, so it raised TypeError
        # whenever qshare was unavailable.
        def calc_daily_return_manual(pldf, price_col, window_len, col_name, bias=1, return_type='normal'):
            """Forward return from T+bias price to T+bias+window_len-1 price.

            Only return_type='normal' is supported; the argument exists for
            signature compatibility with qshare's calc_daily_return.
            """
            pldf = pldf.sort(['instrument', 'datetime'])
            pldf = pldf.with_columns([
                pl.col(price_col).shift(-bias).over('instrument').alias('price_base'),
                pl.col(price_col).shift(-(bias + window_len - 1)).over('instrument').alias('price_end')
            ])
            pldf = pldf.with_columns([
                ((pl.col('price_end') / pl.col('price_base')) - 1).alias(col_name)
            ])
            return pldf.drop(['price_base', 'price_end'])

        calc_daily_return_fn = calc_daily_return_manual

    # Date range: derive from the prediction file when available, else default.
    if prediction_file and os.path.exists(prediction_file):
        print(f"Using prediction file {prediction_file} to determine date range...")
        df_pred = pl.read_parquet(prediction_file)
        pred_min_date = df_pred['date'].min()
        pred_max_date = df_pred['date'].max()
        pred_min_date_int = int(pred_min_date.strftime('%Y%m%d'))
        pred_max_date_int = int(pred_max_date.strftime('%Y%m%d'))
        print(f"Prediction date range: {pred_min_date} to {pred_max_date}")
    else:
        print("No prediction file provided, using default date range...")
        # Default to a range that should have data: 2019-01-01 to 2020-11-30
        pred_min_date_int = 20190101
        pred_max_date_int = 20201130
        print(f"Default date range: {pred_min_date_int} to {pred_max_date_int}")

    print(f"Loading kline data from {input_kline_path} and filtering to date range...")
    # Lazy scan so the date predicate is pushed down before materializing.
    try:
        df_kline = (
            pl.scan_parquet(input_kline_path)
            .filter(
                pl.col('datetime').is_between(pred_min_date_int, pred_max_date_int)
            )
            .collect()
        )
        print(f"Kline data shape after filtering: {df_kline.shape}")
        print(f"Kline columns: {df_kline.columns}")
        print(f"Kline schema: {df_kline.schema}")
    except Exception as e:
        print(f"Error loading kline data: {e}")
        return None
    if df_kline.height == 0:
        print("No kline data found within the date range!")
        return None
    if 'datetime' not in df_kline.columns:
        raise ValueError("No datetime column found in kline data")
    if 'instrument' not in df_kline.columns:
        raise ValueError("No instrument column found in kline data")

    # Pick the price column: prefer VWAP, then any price-like name, then the
    # first numeric non-key column.
    price_col = 'vwap'
    if price_col not in df_kline.columns:
        print(f"Column '{price_col}' not found in kline data.")
        possible_price_cols = [
            col for col in df_kline.columns
            if any(term in col.lower() for term in ['price', 'vwap', 'close', 'adj', 'pct', 'open', 'high', 'low'])
        ]
        print(f"Possible price columns: {possible_price_cols}")
        if possible_price_cols:
            price_col = possible_price_cols[0]
            print(f"Using '{price_col}' as price column instead.")
        else:
            for col in df_kline.columns:
                if col not in ['datetime', 'instrument'] and df_kline[col].dtype in [pl.Float32, pl.Float64, pl.Int32, pl.Int64]:
                    price_col = col
                    break
            # BUGFIX: the original tested "'vwap' not in locals()", which is
            # always true (no variable named vwap ever exists), so it raised
            # even when a numeric fallback column HAD been found.
            if price_col not in df_kline.columns:
                raise ValueError("No suitable price column found in kline data")
    print(f"Using price column: {price_col}")

    # Ensure proper temporal ordering per instrument.
    df_kline = df_kline.sort(['instrument', 'datetime'])

    # Spine-filling: cross-join all dates x all instruments so every pair has
    # a row, then forward/backward-fill the price within each instrument.
    print("Applying spine-filling to ensure complete date/instrument coverage...")
    try:
        unique_dates = df_kline.select(pl.col('datetime').unique().sort()).get_column('datetime')
        unique_instruments = df_kline.select(pl.col('instrument').unique()).get_column('instrument')
        # NOTE(review): the spine casts keys to Int32 while the kline keys may
        # be UInt32 — confirm the join keys still match on your polars version.
        spine_dates = unique_dates.cast(pl.Int32).to_frame()
        spine_instruments = unique_instruments.cast(pl.Int32).to_frame()
        df_spine = spine_dates.join(spine_instruments, how='cross')
        df_filled = df_spine.join(
            df_kline,
            on=['datetime', 'instrument'],
            how='left'
        )
        print(f"Spine-filling completed. Shape before: {df_kline.shape}, after: {df_filled.shape}")
        df_filled = df_filled.sort(['instrument', 'datetime'])
        df_filled = df_filled.with_columns([
            pl.col(price_col).forward_fill().backward_fill().over('instrument').alias(f'{price_col}_filled')
        ])
        # Use the filled price only where the original was null.
        df_filled = df_filled.with_columns([
            pl.when(pl.col(price_col).is_null())
            .then(pl.col(f'{price_col}_filled'))
            .otherwise(pl.col(price_col))
            .alias(price_col)
        ]).drop(f'{price_col}_filled')
        # Keep only rows that still have a price after filling.
        df_kline = df_filled.filter(pl.col(price_col).is_not_null())
        print(f"After spine-filling and cleaning: {df_kline.shape}")
    except Exception as e:
        print(f"Error during spine-filling: {e}")
        print("Continuing with original data...")

    print(f"Calculating {return_days}-day returns from T+1's {price_col} to T+{return_days+1}'s {price_col}...")
    try:
        # Column is named 'v2v_5d' regardless of return_days for downstream
        # compatibility.
        df_returns = calc_daily_return_fn(
            pldf=df_kline,
            price_col=price_col,
            window_len=return_days,
            col_name='v2v_5d',
            bias=1,
            return_type='normal'
        )
    except Exception as e:
        print(f"Error calculating returns with qshare function: {e}")
        print("Attempting manual calculation...")
        df_returns = df_kline.sort(['instrument', 'datetime']).with_columns([
            pl.col(price_col).shift(-1).over('instrument').alias('price_base'),
            pl.col(price_col).shift(-(1 + return_days - 1)).over('instrument').alias('price_end')
        ]).with_columns([
            ((pl.col('price_end') / pl.col('price_base')) - 1).alias('v2v_5d')
        ]).drop(['price_base', 'price_end'])

    print(f"Calculated returns shape: {df_returns.shape}")
    print(f"Calculated returns schema: {df_returns.schema}")

    # Drop null / infinite returns (e.g. instruments near the window's end).
    df_returns = df_returns.filter(
        pl.col('v2v_5d').is_not_null() &
        pl.col('v2v_5d').is_finite()
    )
    print(f"Returns after filtering invalid values: {df_returns.shape}")

    # Keep the original uint32 datetime / instrument columns untouched.
    df_output = df_returns.select([
        'datetime',
        'instrument',
        'v2v_5d'
    ])
    print(f"Final output shape: {df_output.shape}")
    print(f"Output schema: {df_output.schema}")

    # Summary statistics for a quick sanity check.
    if 'v2v_5d' in df_output.columns and len(df_output) > 0:
        returns_data = df_output['v2v_5d'].drop_nulls()
        if len(returns_data) > 0:
            print(f"Return statistics:")
            print(f" Count: {len(returns_data)}")
            print(f" Mean: {returns_data.mean():.6f}")
            print(f" Std: {returns_data.std():.6f}")
            print(f" Min: {returns_data.min():.6f}")
            print(f" Max: {returns_data.max():.6f}")
            print(f" 25th percentile: {returns_data.quantile(0.25):.6f}")
            print(f" 75th percentile: {returns_data.quantile(0.75):.6f}")
        else:
            print(" No valid returns data after filtering")

    df_output.write_parquet(output_file)
    print(f"Real returns saved to {output_file} with original datetime and instrument formats")
    return df_output
if __name__ == "__main__":
# Generate real returns from kline data using the original format
# Define the prediction file to use as date range reference (optional)
pred_file = "../data/original_predictions_0_7.parquet"
# Check if the prediction file exists to use for date range
pred_file_path = pred_file if os.path.exists(pred_file) else None
if pred_file_path:
print(f"Using prediction file to determine date range: {pred_file_path}")
else:
print("Prediction file not found, using default date range")
print("Generating real returns from kline data...")
real_returns_df = generate_real_returns_from_kline(
input_kline_path="/data/parquet/dataset/stg_1day_wind_kline_adjusted_1D/",
prediction_file=pred_file_path,
output_file="../data/actual_returns.parquet",
return_days=5
)
if real_returns_df is not None:
print("Real return generation completed successfully!")
print("The output file contains:")
print("- Original datetime format (uint32 YYYYMMDD)")
print("- Original instrument format (uint32)")
print("- Returns in 'v2v_5d' column")
else:
print("Failed to generate real returns.")

@ -0,0 +1,433 @@
#!/usr/bin/env python
"""
Standalone script to generate predictions using the d033 model with locally generated alpha158_0_7_beta embeddings.
This script does NOT rely on qlib's data handlers. It:
1. Loads beta embeddings from local parquet file (generated by generate_beta_embedding.py)
2. Applies the necessary processing (normalization, neutralization)
3. Uses the d033 model to generate predictions
4. Saves predictions to parquet
"""
import os
import sys
import pickle as pkl
import io
import numpy as np
import polars as pl
import pandas as pd
import torch
import torch.nn as nn
from datetime import date, timedelta
from typing import Optional, List, Tuple, Dict
from pathlib import Path
# Constants
# Input: beta embeddings produced locally by generate_beta_embedding.py.
EMBEDDING_FILE = "../data/embedding_0_7_beta.parquet"
# Pickled d033 model used for inference (loaded via load_d033_model below).
MODEL_PATH = "/home/guofu/Workspaces/alpha/data_ops/tasks/app_longsignal/model/host140_exp20_d033/module.pt"
# Directory where prediction outputs are written.
OUTPUT_DIR = "../data"
# Industry flag path for neutralization (optional)
INDUSTRY_FLAG_PATH = "/data/parquet/dataset/stg_1day_gds_indus_flag_cc1_1D/"
# Risk-factor (kline) dataset path, also used for neutralization inputs.
RISK_FACTOR_PATH = "/data/parquet/dataset/stg_1day_wind_kline_adjusted_1D/"
def load_beta_embeddings(embedding_file: str, start_date: Optional[str] = None, end_date: Optional[str] = None) -> pl.DataFrame:
    """
    Read the locally generated beta embeddings from parquet, optionally
    windowed to [start_date, end_date].

    Args:
        embedding_file: Path to the embeddings parquet file
        start_date: Optional start date filter (YYYY-MM-DD)
        end_date: Optional end date filter (YYYY-MM-DD)

    Returns:
        Polars DataFrame with embeddings
    """
    print(f"Loading beta embeddings from {embedding_file}...")
    frame = pl.read_parquet(embedding_file)
    print(f"Loaded embeddings: {frame.shape}")

    # Dates are stored as uint32 YYYYMMDD, so bounds are plain int compares.
    if 'datetime' in frame.columns:
        bounds = []
        if start_date:
            bounds.append(pl.col('datetime') >= int(start_date.replace("-", "")))
        if end_date:
            bounds.append(pl.col('datetime') <= int(end_date.replace("-", "")))
        for condition in bounds:
            frame = frame.filter(condition)
        print(f"Filtered embeddings: {frame.shape}")
    return frame
def load_d033_model(model_path: str) -> nn.Module:
    """
    Load the d033 prediction model.
    Returns the underlying PyTorch model ready for inference on CPU.

    The file is a pickle of the full model object; tensors inside it may have
    been saved from a CUDA process, so torch.load is temporarily patched to
    force map_location='cpu' for any nested load during unpickling.
    """
    print(f"Loading d033 model from {model_path}...")
    # Patch torch.load to always use CPU
    original_torch_load = torch.load
    def cpu_torch_load(*args, **kwargs):
        # Force CPU regardless of any caller-supplied map_location.
        kwargs['map_location'] = 'cpu'
        return original_torch_load(*args, **kwargs)
    # Apply the patch
    torch.load = cpu_torch_load
    try:
        # SECURITY NOTE(review): pickle.load executes arbitrary code from the
        # file — only load model files from trusted locations.
        with open(model_path, "rb") as fin:
            model = pkl.load(fin)
        # The model is already an RNNPredict instance
        # Set to eval mode for inference
        model.eval()
        # Set signal function (required for prediction)
        if not hasattr(model, 'signal_func'):
            model.signal_func = {"type": "logistic"}
        print("Model loaded successfully (CPU)")
        return model
    except RuntimeError as e:
        # Plain pickle failed on CUDA-serialized tensors; retry via torch.load,
        # which understands torch's storage format and can remap to CPU.
        if "CUDA" in str(e):
            print("Model contains CUDA tensors, attempting CPU conversion...")
            with open(model_path, "rb") as fin:
                content = fin.read()
            model = torch.load(io.BytesIO(content), map_location='cpu', weights_only=False)
            model.eval()
            # Set signal function (required for prediction)
            if not hasattr(model, 'signal_func'):
                model.signal_func = {"type": "logistic"}
            print("Model loaded and converted to CPU")
            return model
        else:
            raise
    finally:
        # Restore original torch.load
        torch.load = original_torch_load
def apply_cs_zscore_norm(df: pl.DataFrame, embedding_cols: List[str]) -> pl.DataFrame:
    """
    Cross-sectionally z-score each embedding column within every datetime.

    Args:
        df: DataFrame containing the embedding columns and a 'datetime' column
        embedding_cols: Names of the embedding columns to normalize

    Returns:
        DataFrame with the same columns; each embedding value becomes
        (x - mean) / std over its datetime cross-section, or 0.0 where the
        cross-sectional std is not positive.
    """
    print("Applying cross-sectional z-score normalization...")

    def _zscore(name: str) -> pl.Expr:
        # Per-datetime (cross-sectional) moments via window expressions.
        mu = pl.col(name).mean().over('datetime')
        sigma = pl.col(name).std().over('datetime')
        # Guard against zero/null std to avoid division blow-ups.
        return (
            pl.when(sigma > 0)
            .then((pl.col(name) - mu) / sigma)
            .otherwise(0.0)
            .alias(name)
        )

    passthrough = [c for c in df.columns if c not in embedding_cols]
    return df.select(passthrough + [_zscore(name) for name in embedding_cols])
def apply_robust_zscore_norm(df: pl.DataFrame, embedding_cols: List[str]) -> pl.DataFrame:
    """
    Apply robust z-score normalization (median/MAD based) per datetime.

    Each embedding column is transformed to (x - median) / (1.4826 * MAD),
    computed cross-sectionally within each datetime, then clipped to
    [-10, 10] to bound outliers. Values where the MAD is not positive are
    set to 0.0.

    Args:
        df: DataFrame with embeddings and a 'datetime' column
        embedding_cols: List of embedding column names

    Returns:
        DataFrame with normalized and clipped embedding columns; all other
        columns are passed through unchanged.
    """
    print("Applying robust z-score normalization...")
    normalized_cols = []
    for col in embedding_cols:
        # Median and MAD (median absolute deviation) per datetime.
        median_expr = pl.col(col).median().over('datetime')
        mad_expr = (pl.col(col) - median_expr).abs().median().over('datetime')
        # 1.4826 makes the MAD consistent with the std of a normal
        # distribution. Clip is chained directly onto the normalized
        # expression.
        # BUG FIX: the previous version clipped `pl.col(col.name)` in a
        # second loop — `col` there was an Expr (whose `.name` is a
        # namespace object, not a string), and even with the right name,
        # `pl.col(...)` would have re-read the RAW column, discarding the
        # normalization entirely.
        norm_col = (
            pl.when(mad_expr > 0)
            .then((pl.col(col) - median_expr) / (1.4826 * mad_expr))
            .otherwise(0.0)
            .clip(-10, 10)
            .alias(col)
        )
        normalized_cols.append(norm_col)
    other_cols = [c for c in df.columns if c not in embedding_cols]
    return df.select(other_cols + normalized_cols)
def apply_fillna(df: pl.DataFrame, embedding_cols: List[str], fill_value: float = 0.0) -> pl.DataFrame:
    """Replace nulls in every embedding column with *fill_value*; other columns pass through."""
    passthrough = [c for c in df.columns if c not in embedding_cols]
    filled = [pl.col(name).fill_null(fill_value).alias(name) for name in embedding_cols]
    return df.select(passthrough + filled)
def prepare_features_for_model(df: pl.DataFrame, embedding_cols: List[str], seq_len: int = 40) -> Tuple[np.ndarray, pl.DataFrame]:
    """
    Prepare features for the model by creating sliding-window sequences.

    The d033 model expects 3D input: [batch_size, seq_len, d_feat]
    where seq_len is the lookback window (default 40 days). Instruments
    with fewer than seq_len rows produce no sequences.

    Args:
        df: DataFrame with normalized embeddings
        embedding_cols: List of embedding column names
        seq_len: Sequence length (lookback window)

    Returns:
        Tuple of (features_array [N, seq_len, d_feat], metadata DataFrame
        with 'datetime'/'instrument' for the last day of each sequence).

    Raises:
        ValueError: If no instrument has enough rows to form one sequence.
    """
    print(f"Preparing sequences with length {seq_len}...")
    # Sort so rows within each instrument are in chronological order.
    df = df.sort(['instrument', 'datetime'])
    features_list = []
    metadata_list = []
    # PERF FIX: partition once instead of filtering the full frame per
    # instrument — the old `unique()` + `filter()` loop scanned all rows
    # for every instrument (O(rows * instruments)).
    for inst_df in df.partition_by('instrument', maintain_order=True):
        inst_data = inst_df.select(embedding_cols).to_numpy().astype(np.float32)
        inst_meta = inst_df.select(['datetime', 'instrument']).to_numpy()
        # Sliding windows of seq_len days, each ending at day i.
        for i in range(seq_len - 1, len(inst_data)):
            features_list.append(inst_data[i - seq_len + 1:i + 1])  # [seq_len, d_feat]
            # Metadata for this prediction (the last day in the sequence).
            metadata_list.append(inst_meta[i])
    if not features_list:
        raise ValueError(f"Not enough data to create sequences of length {seq_len}")
    features_array = np.stack(features_list, axis=0)  # [N, seq_len, d_feat]
    metadata_array = np.array(metadata_list)
    # Create metadata DataFrame aligned row-for-row with features_array.
    metadata_df = pl.DataFrame({
        'datetime': metadata_array[:, 0],
        'instrument': metadata_array[:, 1]
    })
    print(f"Prepared features shape: {features_array.shape}")
    print(f"Metadata shape: {metadata_df.shape}")
    return features_array, metadata_df
def predict_with_model(model, features: np.ndarray, batch_size: int = 1000) -> np.ndarray:
    """
    Run batched CPU inference and return a flat prediction array.

    Args:
        model: Loaded d033 model; its forward pass returns a (hidden, pred) pair
        features: Feature array [N, seq_len, d_feat]
        batch_size: Number of samples per forward pass

    Returns:
        Predictions array [N]
    """
    print(f"Generating predictions for {features.shape[0]} samples...")
    cpu = torch.device('cpu')
    model = model.to(cpu)
    model.eval()
    chunks = []
    # Gradients are not needed for inference.
    with torch.no_grad():
        for start in range(0, len(features), batch_size):
            window = torch.tensor(
                features[start:start + batch_size],
                dtype=torch.float32,
                device=cpu,
            )
            # Forward pass; the model returns (hidden_state, prediction).
            _, out = model(window)
            chunks.append(out.cpu().numpy())
    predictions = np.concatenate(chunks, axis=0)
    print(f"Generated {len(predictions)} predictions")
    return predictions
def predict_with_embeddings(
    embeddings_df: pl.DataFrame,
    model,
    output_file: Optional[str] = None,
    seq_len: int = 40,
    batch_size: int = 1000
) -> pl.DataFrame:
    """
    Generate predictions using embeddings and the d033 model.

    Pipeline: cross-sectional z-score -> fillna(0) -> sliding-window
    sequences -> batched model inference -> parquet output.

    Args:
        embeddings_df: DataFrame with beta embeddings ('embedding_*' columns)
        model: Loaded d033 model
        output_file: Optional output file path (defaults to
            OUTPUT_DIR/predictions_beta_embedding.parquet)
        seq_len: Sequence length for model input
        batch_size: Batch size for inference

    Returns:
        DataFrame with 'datetime', 'instrument' and 'prediction' columns
    """
    print("Generating predictions...")
    # Embedding columns are identified by naming convention.
    embedding_cols = [col for col in embeddings_df.columns if col.startswith('embedding_')]
    print(f"Found {len(embedding_cols)} embedding columns")
    # Apply inference processors
    df_processed = apply_cs_zscore_norm(embeddings_df, embedding_cols)
    df_processed = apply_fillna(df_processed, embedding_cols, fill_value=0.0)
    # Prepare sequences for model
    features, metadata_df = prepare_features_for_model(df_processed, embedding_cols, seq_len=seq_len)
    # Generate predictions
    predictions = predict_with_model(model, features, batch_size=batch_size)
    # Create output DataFrame
    result_df = metadata_df.with_columns([
        pl.Series(name="prediction", values=predictions)
    ])
    # Save to parquet
    if output_file is None:
        output_file = os.path.join(OUTPUT_DIR, "predictions_beta_embedding.parquet")
    # BUG FIX: os.path.dirname() returns "" for a bare filename, and
    # os.makedirs("") raises FileNotFoundError — only create the parent
    # directory when there is one.
    out_dir = os.path.dirname(output_file)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    result_df.write_parquet(output_file)
    print(f"Predictions saved to {output_file}")
    return result_df
def generate_predictions(
    embedding_file: str = EMBEDDING_FILE,
    model_path: str = MODEL_PATH,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    output_file: Optional[str] = None,
    seq_len: int = 40,
    batch_size: int = 1000
) -> pl.DataFrame:
    """
    End-to-end driver: load embeddings and the d033 model, then predict.

    Args:
        embedding_file: Path to beta embeddings parquet file
        model_path: Path to d033 model
        start_date: Optional start date filter (YYYY-MM-DD)
        end_date: Optional end date filter (YYYY-MM-DD)
        output_file: Optional output file path
        seq_len: Sequence length for model input (lookback window)
        batch_size: Batch size for inference

    Returns:
        DataFrame with predictions
    """
    banner = "=" * 60
    print(banner)
    print("Generating Predictions with Alpha158 0_7 Beta Embeddings")
    print(banner)
    # Load inputs, then run the prediction pipeline.
    embeddings = load_beta_embeddings(embedding_file, start_date, end_date)
    d033_model = load_d033_model(model_path)
    return predict_with_embeddings(
        embeddings, d033_model, output_file,
        seq_len=seq_len, batch_size=batch_size
    )
if __name__ == "__main__":
    import argparse

    # CLI wrapper around generate_predictions().
    cli = argparse.ArgumentParser(description="Generate predictions with beta embeddings")
    cli.add_argument("--embeddings", type=str, default=EMBEDDING_FILE,
                     help="Path to beta embeddings parquet file")
    cli.add_argument("--model", type=str, default=MODEL_PATH,
                     help="Path to d033 model")
    cli.add_argument("--start-date", type=str, default=None,
                     help="Start date (YYYY-MM-DD)")
    cli.add_argument("--end-date", type=str, default=None,
                     help="End date (YYYY-MM-DD)")
    cli.add_argument("--output", type=str, default=None,
                     help="Output parquet file path")
    cli.add_argument("--seq-len", type=int, default=40,
                     help="Sequence length (lookback window)")
    cli.add_argument("--batch-size", type=int, default=1000,
                     help="Batch size for inference")
    opts = cli.parse_args()

    df = generate_predictions(
        embedding_file=opts.embeddings,
        model_path=opts.model,
        start_date=opts.start_date,
        end_date=opts.end_date,
        output_file=opts.output,
        seq_len=opts.seq_len,
        batch_size=opts.batch_size
    )
    print("\nDone!")
    print(f"Generated {len(df)} predictions")
    print(df.head())

@ -0,0 +1,186 @@
#!/usr/bin/env python
"""
Regenerate beta embeddings for a few days of sample data.
This script generates embeddings for a small date range to test the pipeline.
"""
import os
import sys
import pickle as pkl
import numpy as np
import polars as pl
import torch
import torch.nn as nn
from pathlib import Path
from datetime import datetime
from typing import List, Dict, Optional
# Add parent directory to path
sys.path.insert(0, str(Path(__file__).parent))
# Import from the main generate script
from generate_beta_embedding import (
load_all_data,
merge_data_sources,
apply_feature_pipeline,
prepare_vae_features,
load_vae_model,
encode_with_vae,
load_qlib_processor_params,
VAE_INPUT_DIM,
OUTPUT_DIR,
)
# Sample dates for testing (5 consecutive trading days)
SAMPLE_DATES = [
"2019-01-02",
"2019-01-03",
"2019-01-04",
"2019-01-07",
"2019-01-08",
]
VAE_MODEL_PATH = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/model/csiallx_feature2_ntrla_flag_pnlnorm_vae4_dim32a_beta0001/module.pt"
def generate_sample_embeddings(
    dates: List[str] = SAMPLE_DATES,
    output_file: str = "embedding_0_7_beta_sample.parquet",
    use_vae: bool = True
) -> pl.DataFrame:
    """
    Generate embeddings for a small sample of dates to test the pipeline.

    Args:
        dates: List of dates in YYYY-MM-DD format (order does not matter)
        output_file: Output parquet file path
        use_vae: Whether to encode with the VAE; if False, or if VAE loading
            or encoding fails, deterministic random embeddings (seed 42)
            are used instead

    Returns:
        DataFrame with 'datetime', 'instrument' and 'embedding_*' columns

    Raises:
        ValueError: If *dates* is empty.
    """
    if not dates:
        raise ValueError("`dates` must contain at least one date")
    # ROBUSTNESS FIX: use min/max instead of dates[0]/dates[-1] so an
    # unsorted date list (e.g. from the CLI) still yields the right range.
    start_date = min(dates)
    end_date = max(dates)

    def _random_embeddings(num_rows: int) -> np.ndarray:
        # Deterministic fallback so runs are reproducible without the VAE.
        np.random.seed(42)
        return np.random.randn(num_rows, 32).astype(np.float32)

    print("=" * 60)
    print("Generating Sample Beta Embeddings")
    print(f"Dates: {dates}")
    print(f"Use VAE: {use_vae}")
    print("=" * 60)
    # Load all data sources
    df_alpha, df_kline, df_flag, df_industry = load_all_data(start_date, end_date)
    print(f"\nLoaded data:")
    print(f"  Alpha158: {df_alpha.shape}")
    print(f"  Kline: {df_kline.shape}")
    print(f"  Flags: {df_flag.shape}")
    print(f"  Industry: {df_industry.shape}")
    # Filter to only the sample dates (stored as YYYYMMDD integers).
    date_ints = [int(d.replace("-", "")) for d in dates]
    df_alpha = df_alpha.filter(pl.col("datetime").is_in(date_ints))
    df_kline = df_kline.filter(pl.col("datetime").is_in(date_ints))
    df_flag = df_flag.filter(pl.col("datetime").is_in(date_ints))
    df_industry = df_industry.filter(pl.col("datetime").is_in(date_ints))
    print(f"\nAfter filtering to sample dates:")
    print(f"  Alpha158: {df_alpha.shape}")
    print(f"  Kline: {df_kline.shape}")
    print(f"  Flags: {df_flag.shape}")
    print(f"  Industry: {df_industry.shape}")
    # Merge data sources
    df = merge_data_sources(df_alpha, df_kline, df_flag, df_industry)
    print(f"\nMerged data shape: {df.shape}")
    # Save datetime and instrument before processing (the feature pipeline
    # may reorder/drop index columns).
    datetime_col = df["datetime"].clone()
    instrument_col = df["instrument"].clone()
    # Apply feature transformation pipeline
    df_processed, feature_cols, norm_feature_cols, market_flag_for_vae = apply_feature_pipeline(df)
    # Prepare features for VAE
    features = prepare_vae_features(
        df_processed, feature_cols,
        norm_feature_cols=norm_feature_cols,
        market_flag_for_vae=market_flag_for_vae
    )
    print(f"\nFeature matrix shape: {features.shape}")
    # Encode with VAE (falling back to deterministic random embeddings on
    # any failure, so the rest of the pipeline can still be exercised).
    if use_vae:
        try:
            model = load_vae_model(VAE_MODEL_PATH)
            embeddings = encode_with_vae(features, model)
            print(f"\nVAE encoding successful!")
        except Exception as e:
            print(f"\nVAE encoding failed: {e}")
            import traceback
            traceback.print_exc()
            print("\nFalling back to random embeddings...")
            embeddings = _random_embeddings(features.shape[0])
    else:
        print("\nUsing random embeddings (VAE disabled)...")
        embeddings = _random_embeddings(features.shape[0])
    # Create output DataFrame
    embedding_cols = [f"embedding_{i}" for i in range(embeddings.shape[1])]
    result_data = {
        "datetime": datetime_col.to_list(),
        "instrument": instrument_col.to_list(),
        **{col_name: embeddings[:, i].tolist() for i, col_name in enumerate(embedding_cols)}
    }
    df_result = pl.DataFrame(result_data)
    # Ensure output directory exists
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    # Save to parquet
    df_result.write_parquet(output_path)
    print(f"\nEmbeddings saved to: {output_path}")
    print(f"Output shape: {df_result.shape}")
    print(f"\nSample output:")
    print(df_result.head(10))
    # Print summary statistics
    print("\n" + "=" * 60)
    print("Summary Statistics")
    print("=" * 60)
    print(f"Total samples: {len(df_result)}")
    print(f"Embedding dimension: {embeddings.shape[1]}")
    print(f"Date range: {df_result['datetime'].min()} to {df_result['datetime'].max()}")
    print(f"Instruments: {df_result['instrument'].n_unique()}")
    print(f"Embedding mean: {np.mean(embeddings):.6f}")
    print(f"Embedding std: {np.std(embeddings):.6f}")
    print(f"Embedding min: {np.min(embeddings):.6f}")
    print(f"Embedding max: {np.max(embeddings):.6f}")
    return df_result
if __name__ == "__main__":
    import argparse

    # CLI wrapper around generate_sample_embeddings().
    cli = argparse.ArgumentParser(description="Generate sample beta embeddings")
    cli.add_argument("--dates", nargs="+", default=SAMPLE_DATES,
                     help="List of dates (YYYY-MM-DD)")
    cli.add_argument("--output", type=str, default="embedding_0_7_beta_sample.parquet",
                     help="Output parquet file")
    cli.add_argument("--no-vae", action="store_true",
                     help="Skip VAE encoding (use random embeddings)")
    ns = cli.parse_args()

    generate_sample_embeddings(
        dates=ns.dates,
        output_file=ns.output,
        use_vae=not ns.no_vae
    )
    print("\nDone!")

@ -0,0 +1,394 @@
[2715583:MainThread](2026-02-26 19:58:16,674) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[2715583:MainThread](2026-02-26 19:58:16,680) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[2715583:MainThread](2026-02-26 19:58:16,681) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
================================================================================
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
================================================================================
Date Range: 2020-01-02 to 2020-01-10
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
Step 1: Loading data from Qlib pipeline...
Loading since_date=2020-01-02
Will use `placehorder_value` from module: qlib.contrib.data.config
Will init handler object from config:
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-12-03 00:00:00')},
'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'handler': {'class': 'AggHandler',
'kwargs': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26)[2715583:MainThread](2026-02-26 19:58:16,707) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
[2715583:MainThread](2026-02-26 19:58:16,707) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
[2715583:MainThread](2026-02-26 19:58:17,067) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
[2715583:MainThread](2026-02-26 20:05:39,665) INFO - qlib.timer - [log.py:117] - Time cost: 442.946s | DDB query: Done
[2715583:MainThread](2026-02-26 20:05:40,469) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
,
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-12-03 00:00:00')},
'module_path': 'qlib.contrib.data.agg_handler'},
'load_end': datetime.date(2026, 2, 26),
'load_start': Timestamp('2019-12-03 00:00:00'),
'market': 'csiallx',
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
'region': 'cn'}}
Query config:
#alpha158: 1;
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88b0587d0>
[2715583:MainThread](2026-02-26 20:07:46,118) INFO - qlib.timer - [log.py:117] - Time cost: 115.964s | Instruments filter: Done
[2715583:MainThread](2026-02-26 20:07:53,273) INFO - qlib.timer - [log.py:117] - Time cost: 576.561s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x70e88b086d80>) Done
[2715583:MainThread](2026-02-26 20:07:53,274) INFO - qlib.timer - [log.py:117] - Time cost: 576.562s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x70e88b086d80>) Done
[2715583:MainThread](2026-02-26 20:07:53,276) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2715583:MainThread](2026-02-26 20:07:56,700) INFO - qlib.timer - [log.py:117] - Time cost: 3.423s | fetch_df_by_index Done
[2715583:MainThread](2026-02-26 20:07:58,185) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
KMID KLEN ... VSUMD30 VSUMD60
datetime instrument ...
2019-12-03 SH600000 0.004234 0.011008 ... -0.031454 -0.009671
SH600004 0.015467 0.031529 ... -0.004401 0.007701
SH600006 0.022573 0.033860 ... 0.060561 -0.000159
SH600007 0.012129 0.025470 ... 0.008489 -0.054056
SH600008 0.006173 0.009259 ... -0.088065 -0.080770
... ... ... ... ... ...
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
[6886779 rows x 158 columns]
[2715583:MainThread](2026-02-26 20:07:58,186) INFO - qlib.timer - [log.py:117] - Time cost: 4.911s | Fetching dataframe Done
[2715583:MainThread](2026-02-26 20:07:58,203) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
[2715583:MainThread](2026-02-26 20:08:15,182) INFO - qlib.timer - [log.py:117] - Time cost: 16.990s | DDB query: Done
[2715583:MainThread](2026-02-26 20:08:15,974) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2715583:MainThread](2026-02-26 20:08:16,548) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,con_rating_strength from
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
[2715583:MainThread](2026-02-26 20:08:27,838) INFO - qlib.timer - [log.py:117] - Time cost: 11.299s | DDB query: Done
[2715583:MainThread](2026-02-26 20:08:28,690) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00
[2715583:MainThread](2026-02-26 20:09:53,616) INFO - qlib.timer - [log.py:117] - Time cost: 81.815s | Instruments filter: Done
[2715583:MainThread](2026-02-26 20:09:54,168) INFO - qlib.timer - [log.py:117] - Time cost: 115.981s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x70ea4ba85f40>) Done
[2715583:MainThread](2026-02-26 20:09:54,169) INFO - qlib.timer - [log.py:117] - Time cost: 115.982s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x70ea4ba85f40>) Done
[2715583:MainThread](2026-02-26 20:09:54,170) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2715583:MainThread](2026-02-26 20:09:54,893) INFO - qlib.timer - [log.py:117] - Time cost: 0.723s | fetch_df_by_index Done
[2715583:MainThread](2026-02-26 20:09:54,901) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
turnover free_turnover log_size con_rating_strength
datetime instrument
2019-12-03 SH600000 0.0696 0.1275 17.322001 0.6618
SH600004 0.6009 1.2276 15.077468 0.8269
SH600006 0.5976 1.5087 13.716795 1.0000
SH600007 0.0961 0.4969 14.334991 0.7500
SH600008 0.0967 0.1793 14.432563 0.6591
... ... ... ... ...
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
SZ301662 12.5950 12.5950 12.681215 NaN
SZ301665 14.0077 14.0077 11.719415 NaN
SZ301678 6.6518 6.6518 12.799973 NaN
SZ302132 1.3868 3.0296 15.359885 NaN
[7601552 rows x 4 columns]
[2715583:MainThread](2026-02-26 20:09:54,902) INFO - qlib.timer - [log.py:117] - Time cost: 0.732s | Fetching dataframe Done
[2715583:MainThread](2026-02-26 20:09:54,917) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
[2715583:MainThread](2026-02-26 20:10:15,465) INFO - qlib.timer - [log.py:117] - Time cost: 20.556s | DDB query: Done
[2715583:MainThread](2026-02-26 20:10:16,265) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2715583:MainThread](2026-02-26 20:10:16,775) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
[2715583:MainThread](2026-02-26 20:10:36,740) INFO - qlib.timer - [log.py:117] - Time cost: 19.975s | DDB query: Done
[2715583:MainThread](2026-02-26 20:10:37,558) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2715583:MainThread](2026-02-26 20:12:04,978) INFO - qlib.timer - [log.py:117] - Time cost: 84.148s | Instruments filter: Done
[2715583:MainThread](2026-02-26 20:12:05,899) INFO - qlib.timer - [log.py:117] - Time cost: 130.996s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x70e88d06acc0>) Done
[2715583:MainThread](2026-02-26 20:12:05,900) INFO - qlib.timer - [log.py:117] - Time cost: 130.997s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x70e88d06acc0>) Done
[2715583:MainThread](2026-02-26 20:12:05,902) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2715583:MainThread](2026-02-26 20:12:06,745) INFO - qlib.timer - [log.py:117] - Time cost: 0.842s | fetch_df_by_index Done
[2715583:MainThread](2026-02-26 20:12:06,758) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
IsZt IsDt IsN ... open_stop close_stop high_stop
datetime instrument ...
2019-12-03 SH600000 False False False ... False False False
SH600004 False False False ... False False False
SH600006 False False False ... False False False
SH600007 False False False ... False False False
SH600008 False False False ... False False False
... ... ... ... ... ... ... ...
2026-02-26 SZ301658 False False False ... False False False
SZ301662 False False False ... False False False
SZ301665 False False False ... False False False
SZ301678 False False False ... False False False
SZ302132 False False False ... False False False
[6903684 rows x 12 columns]
[2715583:MainThread](2026-02-26 20:12:06,759) INFO - qlib.timer - [log.py:117] - Time cost: 0.857s | Fetching dataframe Done
[2715583:MainThread](2026-02-26 20:12:06,777) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
[2715583:MainThread](2026-02-26 20:12:08,840) INFO - qlib.timer - [log.py:117] - Time cost: 2.073s | DDB query: Done
[2715583:MainThread](2026-02-26 20:12:08,849) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2715583:MainThread](2026-02-26 20:13:26,572) INFO - qlib.timer - [log.py:117] - Time cost: 77.719s | Instruments filter: Done
[2715583:MainThread](2026-02-26 20:13:26,601) INFO - qlib.timer - [log.py:117] - Time cost: 79.839s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x70e67060acc0>) Done
[2715583:MainThread](2026-02-26 20:13:26,602) INFO - qlib.timer - [log.py:117] - Time cost: 79.840s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x70e67060acc0>) Done
[2715583:MainThread](2026-02-26 20:13:26,603) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2715583:MainThread](2026-02-26 20:13:26,612) INFO - qlib.timer - [log.py:117] - Time cost: 0.008s | fetch_df_by_index Done
[2715583:MainThread](2026-02-26 20:13:26,633) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
datetime instrument ...
2026-02-09 SH600000 False False ... False False
SH600004 False False ... False False
SH600006 False False ... False False
SH600007 False False ... False False
SH600008 False False ... False False
... ... ... ... ... ...
2026-02-26 SZ301658 False False ... False False
SZ301662 False False ... False False
SZ301665 False False ... False False
SZ301678 False False ... False False
SZ302132 False False ... False False
[41168 rows x 30 columns]
[2715583:MainThread](2026-02-26 20:13:26,634) INFO - qlib.timer - [log.py:117] - Time cost: 0.031s | Fetching dataframe Done
[2715583:MainThread](2026-02-26 20:13:26,652) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
[2715583:MainThread](2026-02-26 20:13:55,744) INFO - qlib.timer - [log.py:117] - Time cost: 29.102s | DDB query: Done
[2715583:MainThread](2026-02-26 20:13:56,520) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2715583:MainThread](2026-02-26 20:15:27,625) INFO - qlib.timer - [log.py:117] - Time cost: 90.586s | Instruments filter: Done
[2715583:MainThread](2026-02-26 20:15:28,257) INFO - qlib.timer - [log.py:117] - Time cost: 121.621s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x70e88c9710d0>) Done
[2715583:MainThread](2026-02-26 20:15:28,257) INFO - qlib.timer - [log.py:117] - Time cost: 121.622s | Init data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x70e88c9710d0>) Done
[2715583:MainThread](2026-02-26 20:15:28,258) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2715583:MainThread](2026-02-26 20:15:28,867) INFO - qlib.timer - [log.py:117] - Time cost: 0.608s | fetch_df_by_index Done
[2715583:MainThread](2026-02-26 20:15:28,875) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
ST_Y ST_S ST_T ST_L ST_Z ST_X
datetime instrument
2019-12-03 SH600000 False False False False False False
SH600004 False False False False False False
SH600006 False False False False False False
SH600007 False False False False False False
SH600008 False False False False False False
... ... ... ... ... ... ...
2026-02-26 SZ301658 False False False False False False
SZ301662 False False False False False False
SZ301665 False False False False False False
SZ301678 False False False False False False
SZ302132 False False False False False False
[6903687 rows x 6 columns]
[2715583:MainThread](2026-02-26 20:15:28,876) INFO - qlib.timer - [log.py:117] - Time cost: 0.617s | Fetching dataframe Done
/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
group_list = [_df.resample("M", level="datetime")\
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88bd2a000>
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88bd2a000>
Query config:
#concepts: 2;
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88c9cf6e0>
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88c9cf6e0>
Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70']
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e6706082f0>
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e65fdafd40>
[2715583:MainThread](2026-02-26 20:15:32,735) INFO - qlib.timer - [log.py:117] - Time cost: 3.858s | Concat index: Done
[2715583:MainThread](2026-02-26 20:15:32,737) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done
[2715583:MainThread](2026-02-26 20:15:36,349) INFO - qlib.timer - [log.py:117] - Time cost: 3.611s | Creating SepDataFrame: Done
[2715583:MainThread](2026-02-26 20:15:37,245) INFO - qlib.timer - [log.py:117] - Time cost: 1040.537s | Loading data (<qlib.contrib.data.agg_handler.AggHandler object at 0x70e88b12b3e0>) Done
[2715583:MainThread](2026-02-26 20:15:37,246) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2715583:MainThread](2026-02-26 20:15:37,248) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2715583:MainThread](2026-02-26 20:15:37,265) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2715583:MainThread](2026-02-26 20:15:37,266) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2715583:MainThread](2026-02-26 20:15:37,277) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2715583:MainThread](2026-02-26 20:15:37,277) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2715583:MainThread](2026-02-26 20:15:37,293) INFO - qlib.timer - [log.py:117] - Time cost: 0.047s | fit & process data Done
[2715583:MainThread](2026-02-26 20:15:37,294) INFO - qlib.timer - [log.py:117] - Time cost: 1040.587s | Init data (<qlib.contrib.data.agg_handler.AggHandler object at 0x70e88b12b3e0>) Done
[2715583:MainThread](2026-02-26 20:15:37,963) INFO - qlib.DataHandlerLP - [handler.py:487] - Will apply processor <qlib.contrib.data.processor.common.Diff object at 0x70e88bf4af30>
[2715583:MainThread](2026-02-26 20:15:40,135) INFO - qlib.timer - [log.py:117] - Time cost: 2.171s | Diff Done
[2715583:MainThread](2026-02-26 20:15:40,136) INFO - qlib.DataHandlerLP - [handler.py:487] - Will apply processor <qlib.contrib.data.processor.flag.FlagMarketInjector object at 0x70e88cd8fd40>
All processors are readonly
All processors are readonly
All processors are readonly
Did load data from config: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/handler.yaml
Did load norm from: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc
Will assign `feature_ext` with
turnover ... con_rating_strength_diff
datetime instrument ...
2026-02-09 SH600000 0.1837 ... 0.0
SH600004 0.6948 ... 0.0
SH600006 0.5542 ... 0.0
SH600007 0.2057 ... 0.0
SH600008 0.9809 ... 0.0
... ... ... ...
2026-02-26 SZ301658 6.0785 ... 0.0
SZ301662 12.5950 ... 0.0
SZ301665 14.0077 ... 0.0
SZ301678 6.6518 ... 0.0
SZ302132 1.3868 ... 0.0
[41085 rows x 8 columns]
---
ERROR: Failed to load data from Qlib pipeline: Cannot convert non-finite values (NA or inf) to integer

@ -0,0 +1,373 @@
[2730312:MainThread](2026-02-26 21:28:33,675) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[2730312:MainThread](2026-02-26 21:28:33,679) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[2730312:MainThread](2026-02-26 21:28:33,680) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
================================================================================
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
================================================================================
Date Range: 2020-01-02 to 2020-01-10
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
Step 1: Loading data from Qlib pipeline...
Loading since_date=2020-01-02
Loading raw data from handler.yaml...
Will use `placehorder_value` from module: qlib.contrib.data.config
Will init handler object from config:
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-12-13 00:00:00')},
'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'handler': {'class': 'AggHandler',
'kwargs': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': [2730312:MainThread](2026-02-26 21:28:33,704) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
[2730312:MainThread](2026-02-26 21:28:33,704) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
[2730312:MainThread](2026-02-26 21:28:34,011) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
[2730312:MainThread](2026-02-26 21:36:00,317) INFO - qlib.timer - [log.py:117] - Time cost: 446.602s | DDB query: Done
[2730312:MainThread](2026-02-26 21:36:01,106) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-12-13 00:00:00')},
'module_path': 'qlib.contrib.data.agg_handler'},
'load_end': datetime.date(2026, 2, 26),
'load_start': Timestamp('2019-12-13 00:00:00'),
'market': 'csiallx',
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
'region': 'cn'}}
Query config:
#alpha158: 1;
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761f37e75a60>
[2730312:MainThread](2026-02-26 21:38:13,636) INFO - qlib.timer - [log.py:117] - Time cost: 123.423s | Instruments filter: Done
[2730312:MainThread](2026-02-26 21:38:20,733) INFO - qlib.timer - [log.py:117] - Time cost: 587.024s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x761f366b3bc0>) Done
[2730312:MainThread](2026-02-26 21:38:20,734) INFO - qlib.timer - [log.py:117] - Time cost: 587.026s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x761f366b3bc0>) Done
[2730312:MainThread](2026-02-26 21:38:20,736) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2730312:MainThread](2026-02-26 21:38:24,302) INFO - qlib.timer - [log.py:117] - Time cost: 3.564s | fetch_df_by_index Done
[2730312:MainThread](2026-02-26 21:38:25,946) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
KMID KLEN ... VSUMD30 VSUMD60
datetime instrument ...
2019-12-13 SH600000 0.011686 0.015025 ... -0.011573 0.039735
SH600004 0.000000 0.009169 ... -0.146051 0.024757
SH600006 -0.004329 0.015152 ... 0.136883 0.024626
SH600007 0.005590 0.019005 ... -0.012912 0.017215
SH600008 0.012270 0.012270 ... 0.039878 -0.013888
... ... ... ... ... ...
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
[6858048 rows x 158 columns]
[2730312:MainThread](2026-02-26 21:38:25,947) INFO - qlib.timer - [log.py:117] - Time cost: 5.212s | Fetching dataframe Done
[2730312:MainThread](2026-02-26 21:38:25,965) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
[2730312:MainThread](2026-02-26 21:38:43,081) INFO - qlib.timer - [log.py:117] - Time cost: 17.127s | DDB query: Done
[2730312:MainThread](2026-02-26 21:38:43,874) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2730312:MainThread](2026-02-26 21:38:44,458) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,con_rating_strength from
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
[2730312:MainThread](2026-02-26 21:38:55,720) INFO - qlib.timer - [log.py:117] - Time cost: 11.271s | DDB query: Done
[2730312:MainThread](2026-02-26 21:38:56,586) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00
[2730312:MainThread](2026-02-26 21:40:21,007) INFO - qlib.timer - [log.py:117] - Time cost: 81.315s | Instruments filter: Done
[2730312:MainThread](2026-02-26 21:40:21,576) INFO - qlib.timer - [log.py:117] - Time cost: 115.627s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7620fb822930>) Done
[2730312:MainThread](2026-02-26 21:40:21,576) INFO - qlib.timer - [log.py:117] - Time cost: 115.628s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7620fb822930>) Done
[2730312:MainThread](2026-02-26 21:40:21,577) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2730312:MainThread](2026-02-26 21:40:22,309) INFO - qlib.timer - [log.py:117] - Time cost: 0.731s | fetch_df_by_index Done
[2730312:MainThread](2026-02-26 21:40:22,317) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
turnover free_turnover log_size con_rating_strength
datetime instrument
2019-12-13 SH600000 0.2118 0.3879 17.343685 0.7143
SH600004 0.7518 1.5357 15.099485 0.8214
SH600006 0.7827 1.9762 13.732129 1.0000
SH600007 0.1368 0.7071 14.409998 0.7500
SH600008 0.2152 0.3990 14.444757 0.7500
... ... ... ... ...
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
SZ301662 12.5950 12.5950 12.681215 NaN
SZ301665 14.0077 14.0077 11.719415 NaN
SZ301678 6.6518 6.6518 12.799973 NaN
SZ302132 1.3868 3.0296 15.359885 NaN
[7572626 rows x 4 columns]
[2730312:MainThread](2026-02-26 21:40:22,318) INFO - qlib.timer - [log.py:117] - Time cost: 0.741s | Fetching dataframe Done
[2730312:MainThread](2026-02-26 21:40:22,334) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
[2730312:MainThread](2026-02-26 21:40:43,075) INFO - qlib.timer - [log.py:117] - Time cost: 20.751s | DDB query: Done
[2730312:MainThread](2026-02-26 21:40:43,889) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2730312:MainThread](2026-02-26 21:40:44,394) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
[2730312:MainThread](2026-02-26 21:41:04,632) INFO - qlib.timer - [log.py:117] - Time cost: 20.246s | DDB query: Done
[2730312:MainThread](2026-02-26 21:41:05,434) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2730312:MainThread](2026-02-26 21:42:33,029) INFO - qlib.timer - [log.py:117] - Time cost: 84.294s | Instruments filter: Done
[2730312:MainThread](2026-02-26 21:42:34,049) INFO - qlib.timer - [log.py:117] - Time cost: 131.730s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x761f37e74470>) Done
[2730312:MainThread](2026-02-26 21:42:34,050) INFO - qlib.timer - [log.py:117] - Time cost: 131.731s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x761f37e74470>) Done
[2730312:MainThread](2026-02-26 21:42:34,051) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2730312:MainThread](2026-02-26 21:42:34,895) INFO - qlib.timer - [log.py:117] - Time cost: 0.843s | fetch_df_by_index Done
[2730312:MainThread](2026-02-26 21:42:34,907) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
IsZt IsDt IsN ... open_stop close_stop high_stop
datetime instrument ...
2019-12-13 SH600000 False False False ... False False False
SH600004 False False False ... False False False
SH600006 False False False ... False False False
SH600007 False False False ... False False False
SH600008 False False False ... False False False
... ... ... ... ... ... ... ...
2026-02-26 SZ301658 False False False ... False False False
SZ301662 False False False ... False False False
SZ301665 False False False ... False False False
SZ301678 False False False ... False False False
SZ302132 False False False ... False False False
[6874830 rows x 12 columns]
[2730312:MainThread](2026-02-26 21:42:34,908) INFO - qlib.timer - [log.py:117] - Time cost: 0.857s | Fetching dataframe Done
[2730312:MainThread](2026-02-26 21:42:34,927) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
[2730312:MainThread](2026-02-26 21:42:36,986) INFO - qlib.timer - [log.py:117] - Time cost: 2.069s | DDB query: Done
[2730312:MainThread](2026-02-26 21:42:36,996) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2730312:MainThread](2026-02-26 21:43:53,198) INFO - qlib.timer - [log.py:117] - Time cost: 76.199s | Instruments filter: Done
[2730312:MainThread](2026-02-26 21:43:53,230) INFO - qlib.timer - [log.py:117] - Time cost: 78.318s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x761f381e1c40>) Done
[2730312:MainThread](2026-02-26 21:43:53,231) INFO - qlib.timer - [log.py:117] - Time cost: 78.319s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x761f381e1c40>) Done
[2730312:MainThread](2026-02-26 21:43:53,231) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2730312:MainThread](2026-02-26 21:43:53,239) INFO - qlib.timer - [log.py:117] - Time cost: 0.007s | fetch_df_by_index Done
[2730312:MainThread](2026-02-26 21:43:53,257) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
datetime instrument ...
2026-02-09 SH600000 False False ... False False
SH600004 False False ... False False
SH600006 False False ... False False
SH600007 False False ... False False
SH600008 False False ... False False
... ... ... ... ... ...
2026-02-26 SZ301658 False False ... False False
SZ301662 False False ... False False
SZ301665 False False ... False False
SZ301678 False False ... False False
SZ302132 False False ... False False
[41168 rows x 30 columns]
[2730312:MainThread](2026-02-26 21:43:53,258) INFO - qlib.timer - [log.py:117] - Time cost: 0.027s | Fetching dataframe Done
[2730312:MainThread](2026-02-26 21:43:53,274) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
[2730312:MainThread](2026-02-26 21:44:44,876) INFO - qlib.timer - [log.py:117] - Time cost: 51.611s | DDB query: Done
[2730312:MainThread](2026-02-26 21:44:45,602) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2730312:MainThread](2026-02-26 21:46:07,184) INFO - qlib.timer - [log.py:117] - Time cost: 81.056s | Instruments filter: Done
[2730312:MainThread](2026-02-26 21:46:07,747) INFO - qlib.timer - [log.py:117] - Time cost: 134.487s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x761f387b3080>) Done
[2730312:MainThread](2026-02-26 21:46:07,748) INFO - qlib.timer - [log.py:117] - Time cost: 134.488s | Init data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x761f387b3080>) Done
[2730312:MainThread](2026-02-26 21:46:07,748) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2730312:MainThread](2026-02-26 21:46:08,349) INFO - qlib.timer - [log.py:117] - Time cost: 0.600s | fetch_df_by_index Done
[2730312:MainThread](2026-02-26 21:46:08,358) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
ST_Y ST_S ST_T ST_L ST_Z ST_X
datetime instrument
2019-12-13 SH600000 False False False False False False
SH600004 False False False False False False
SH600006 False False False False False False
SH600007 False False False False False False
SH600008 False False False False False False
... ... ... ... ... ... ...
2026-02-26 SZ301658 False False False False False False
SZ301662 False False False False False False
SZ301665 False False False False False False
SZ301678 False False False False False False
SZ302132 False False False False False False
[6874833 rows x 6 columns]
[2730312:MainThread](2026-02-26 21:46:08,359) INFO - qlib.timer - [log.py:117] - Time cost: 0.610s | Fetching dataframe Done
/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
group_list = [_df.resample("M", level="datetime")\
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x76203dfa91f0>
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x76203dfa91f0>
Query config:
#concepts: 2;
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761f346e7aa0>
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761f346e7aa0>
Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70']
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761cc3995760>
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761a968d1d00>
[2730312:MainThread](2026-02-26 21:46:11,623) INFO - qlib.timer - [log.py:117] - Time cost: 3.264s | Concat index: Done
[2730312:MainThread](2026-02-26 21:46:11,625) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done
[2730312:MainThread](2026-02-26 21:46:15,058) INFO - qlib.timer - [log.py:117] - Time cost: 3.433s | Creating SepDataFrame: Done
[2730312:MainThread](2026-02-26 21:46:15,928) INFO - qlib.timer - [log.py:117] - Time cost: 1062.224s | Loading data (<qlib.contrib.data.agg_handler.AggHandler object at 0x761f36612720>) Done
[2730312:MainThread](2026-02-26 21:46:15,929) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2730312:MainThread](2026-02-26 21:46:15,931) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2730312:MainThread](2026-02-26 21:46:15,935) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2730312:MainThread](2026-02-26 21:46:15,936) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2730312:MainThread](2026-02-26 21:46:15,939) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2730312:MainThread](2026-02-26 21:46:15,940) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2730312:MainThread](2026-02-26 21:46:15,943) INFO - qlib.timer - [log.py:117] - Time cost: 0.014s | fit & process data Done
[2730312:MainThread](2026-02-26 21:46:15,943) INFO - qlib.timer - [log.py:117] - Time cost: 1062.239s | Init data (<qlib.contrib.data.agg_handler.AggHandler object at 0x761f36612720>) Done
All processors are readonly
All processors are readonly
All processors are readonly
ERROR: Failed to load data from Qlib pipeline: 'SepDataFrame' object has no attribute 'shape'

@ -0,0 +1,373 @@
[2734404:MainThread](2026-02-26 22:10:11,609) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[2734404:MainThread](2026-02-26 22:10:11,613) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[2734404:MainThread](2026-02-26 22:10:11,613) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
================================================================================
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
================================================================================
Date Range: 2020-01-02 to 2020-01-10
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
Step 1: Loading data from Qlib pipeline...
Loading since_date=2020-01-02
Loading data with handler (load_start=2019-12-13)...
Will use `placehorder_value` from module: qlib.contrib.data.config
Will init handler object from config:
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-11-23 00:00:00')},
'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'handler': {'class': 'AggHandler',
'kwargs': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
[2734404:MainThread](2026-02-26 22:10:11,634) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
[2734404:MainThread](2026-02-26 22:10:11,634) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
[2734404:MainThread](2026-02-26 22:10:11,842) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
[2734404:MainThread](2026-02-26 22:17:41,432) INFO - qlib.timer - [log.py:117] - Time cost: 449.788s | DDB query: Done
[2734404:MainThread](2026-02-26 22:17:42,271) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-11-23 00:00:00')},
'module_path': 'qlib.contrib.data.agg_handler'},
'load_end': datetime.date(2026, 2, 26),
'load_start': Timestamp('2019-11-23 00:00:00'),
'market': 'csiallx',
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
'region': 'cn'}}
Query config:
#alpha158: 1;
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e04773e0>
[2734404:MainThread](2026-02-26 22:19:46,550) INFO - qlib.timer - [log.py:117] - Time cost: 115.118s | Instruments filter: Done
[2734404:MainThread](2026-02-26 22:19:53,556) INFO - qlib.timer - [log.py:117] - Time cost: 581.918s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x71c5e0475d60>) Done
[2734404:MainThread](2026-02-26 22:19:53,557) INFO - qlib.timer - [log.py:117] - Time cost: 581.920s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x71c5e0475d60>) Done
[2734404:MainThread](2026-02-26 22:19:53,560) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2734404:MainThread](2026-02-26 22:19:57,060) INFO - qlib.timer - [log.py:117] - Time cost: 3.499s | fetch_df_by_index Done
[2734404:MainThread](2026-02-26 22:19:58,834) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
KMID KLEN ... VSUMD30 VSUMD60
datetime instrument ...
2019-11-25 SH600000 0.003325 0.011638 ... -0.238055 -0.010125
SH600004 -0.013806 0.030012 ... -0.017610 0.039195
SH600006 0.009238 0.016166 ... -0.034782 -0.014306
SH600007 -0.014749 0.018879 ... -0.032427 0.034279
SH600008 0.009259 0.024691 ... -0.063490 0.003978
... ... ... ... ... ...
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
[6908346 rows x 158 columns]
[2734404:MainThread](2026-02-26 22:19:58,835) INFO - qlib.timer - [log.py:117] - Time cost: 5.276s | Fetching dataframe Done
[2734404:MainThread](2026-02-26 22:19:59,042) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
[2734404:MainThread](2026-02-26 22:20:16,326) INFO - qlib.timer - [log.py:117] - Time cost: 17.485s | DDB query: Done
[2734404:MainThread](2026-02-26 22:20:17,102) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2734404:MainThread](2026-02-26 22:20:17,676) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,con_rating_strength from
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
[2734404:MainThread](2026-02-26 22:20:29,343) INFO - qlib.timer - [log.py:117] - Time cost: 11.676s | DDB query: Done
[2734404:MainThread](2026-02-26 22:20:30,245) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00
[2734404:MainThread](2026-02-26 22:21:55,033) INFO - qlib.timer - [log.py:117] - Time cost: 81.592s | Instruments filter: Done
[2734404:MainThread](2026-02-26 22:21:55,586) INFO - qlib.timer - [log.py:117] - Time cost: 116.751s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x71c7a492c4d0>) Done
[2734404:MainThread](2026-02-26 22:21:55,587) INFO - qlib.timer - [log.py:117] - Time cost: 116.752s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x71c7a492c4d0>) Done
[2734404:MainThread](2026-02-26 22:21:55,588) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2734404:MainThread](2026-02-26 22:21:56,302) INFO - qlib.timer - [log.py:117] - Time cost: 0.713s | fetch_df_by_index Done
[2734404:MainThread](2026-02-26 22:21:56,309) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
turnover free_turnover log_size con_rating_strength
datetime instrument
2019-11-25 SH600000 0.0895 0.1639 17.339552 0.8214
SH600004 0.9386 1.9173 15.039255 0.8125
SH600006 0.2566 0.6479 13.680836 1.0000
SH600007 0.1647 0.8513 14.335590 0.7500
SH600008 0.1813 0.3362 14.435625 0.6875
... ... ... ... ...
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
SZ301662 12.5950 12.5950 12.681215 NaN
SZ301665 14.0077 14.0077 11.719415 NaN
SZ301678 6.6518 6.6518 12.799973 NaN
SZ302132 1.3868 3.0296 15.359885 NaN
[7623242 rows x 4 columns]
[2734404:MainThread](2026-02-26 22:21:56,310) INFO - qlib.timer - [log.py:117] - Time cost: 0.722s | Fetching dataframe Done
[2734404:MainThread](2026-02-26 22:21:56,327) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
[2734404:MainThread](2026-02-26 22:22:17,215) INFO - qlib.timer - [log.py:117] - Time cost: 20.899s | DDB query: Done
[2734404:MainThread](2026-02-26 22:22:17,952) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2734404:MainThread](2026-02-26 22:22:18,463) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
[2734404:MainThread](2026-02-26 22:22:38,963) INFO - qlib.timer - [log.py:117] - Time cost: 20.509s | DDB query: Done
[2734404:MainThread](2026-02-26 22:22:39,774) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2734404:MainThread](2026-02-26 22:24:07,744) INFO - qlib.timer - [log.py:117] - Time cost: 84.654s | Instruments filter: Done
[2734404:MainThread](2026-02-26 22:24:08,702) INFO - qlib.timer - [log.py:117] - Time cost: 132.391s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x71c5e0847cb0>) Done
[2734404:MainThread](2026-02-26 22:24:08,703) INFO - qlib.timer - [log.py:117] - Time cost: 132.392s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x71c5e0847cb0>) Done
[2734404:MainThread](2026-02-26 22:24:08,704) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2734404:MainThread](2026-02-26 22:24:09,549) INFO - qlib.timer - [log.py:117] - Time cost: 0.844s | fetch_df_by_index Done
[2734404:MainThread](2026-02-26 22:24:09,561) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
IsZt IsDt IsN ... open_stop close_stop high_stop
datetime instrument ...
2019-11-25 SH600000 False False False ... False False False
SH600004 False False False ... False False False
SH600006 False False False ... False False False
SH600007 False False False ... False False False
SH600008 False False False ... False False False
... ... ... ... ... ... ... ...
2026-02-26 SZ301658 False False False ... False False False
SZ301662 False False False ... False False False
SZ301665 False False False ... False False False
SZ301678 False False False ... False False False
SZ302132 False False False ... False False False
[6925320 rows x 12 columns]
[2734404:MainThread](2026-02-26 22:24:09,562) INFO - qlib.timer - [log.py:117] - Time cost: 0.858s | Fetching dataframe Done
[2734404:MainThread](2026-02-26 22:24:09,760) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
[2734404:MainThread](2026-02-26 22:24:11,809) INFO - qlib.timer - [log.py:117] - Time cost: 2.238s | DDB query: Done
[2734404:MainThread](2026-02-26 22:24:11,822) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2734404:MainThread](2026-02-26 22:25:28,259) INFO - qlib.timer - [log.py:117] - Time cost: 76.433s | Instruments filter: Done
[2734404:MainThread](2026-02-26 22:25:28,285) INFO - qlib.timer - [log.py:117] - Time cost: 78.720s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x71c139b2af60>) Done
[2734404:MainThread](2026-02-26 22:25:28,285) INFO - qlib.timer - [log.py:117] - Time cost: 78.720s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x71c139b2af60>) Done
[2734404:MainThread](2026-02-26 22:25:28,286) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2734404:MainThread](2026-02-26 22:25:28,290) INFO - qlib.timer - [log.py:117] - Time cost: 0.003s | fetch_df_by_index Done
[2734404:MainThread](2026-02-26 22:25:28,310) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
datetime instrument ...
2026-02-09 SH600000 False False ... False False
SH600004 False False ... False False
SH600006 False False ... False False
SH600007 False False ... False False
SH600008 False False ... False False
... ... ... ... ... ...
2026-02-26 SZ301658 False False ... False False
SZ301662 False False ... False False
SZ301665 False False ... False False
SZ301678 False False ... False False
SZ302132 False False ... False False
[41168 rows x 30 columns]
[2734404:MainThread](2026-02-26 22:25:28,311) INFO - qlib.timer - [log.py:117] - Time cost: 0.025s | Fetching dataframe Done
[2734404:MainThread](2026-02-26 22:25:28,470) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
[2734404:MainThread](2026-02-26 22:25:58,108) INFO - qlib.timer - [log.py:117] - Time cost: 29.791s | DDB query: Done
[2734404:MainThread](2026-02-26 22:25:58,818) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2734404:MainThread](2026-02-26 22:27:21,291) INFO - qlib.timer - [log.py:117] - Time cost: 81.957s | Instruments filter: Done
[2734404:MainThread](2026-02-26 22:27:21,828) INFO - qlib.timer - [log.py:117] - Time cost: 113.516s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x71c5e0981ca0>) Done
[2734404:MainThread](2026-02-26 22:27:21,829) INFO - qlib.timer - [log.py:117] - Time cost: 113.517s | Init data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x71c5e0981ca0>) Done
[2734404:MainThread](2026-02-26 22:27:21,830) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2734404:MainThread](2026-02-26 22:27:22,439) INFO - qlib.timer - [log.py:117] - Time cost: 0.608s | fetch_df_by_index Done
[2734404:MainThread](2026-02-26 22:27:22,448) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
ST_Y ST_S ST_T ST_L ST_Z ST_X
datetime instrument
2019-11-25 SH600000 False False False False False False
SH600004 False False False False False False
SH600006 False False False False False False
SH600007 False False False False False False
SH600008 False False False False False False
... ... ... ... ... ... ...
2026-02-26 SZ301658 False False False False False False
SZ301662 False False False False False False
SZ301665 False False False False False False
SZ301678 False False False False False False
SZ302132 False False False False False False
[6925323 rows x 6 columns]
[2734404:MainThread](2026-02-26 22:27:22,448) INFO - qlib.timer - [log.py:117] - Time cost: 0.618s | Fetching dataframe Done
/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
group_list = [_df.resample("M", level="datetime")\
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e4d5d940>
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e4d5d940>
Query config:
#concepts: 2;
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c13b9b8b60>
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c13b9b8b60>
Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70']
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c139b28aa0>
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e07e8f20>
[2734404:MainThread](2026-02-26 22:27:25,764) INFO - qlib.timer - [log.py:117] - Time cost: 3.315s | Concat index: Done
[2734404:MainThread](2026-02-26 22:27:25,766) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done
[2734404:MainThread](2026-02-26 22:27:29,485) INFO - qlib.timer - [log.py:117] - Time cost: 3.718s | Creating SepDataFrame: Done
[2734404:MainThread](2026-02-26 22:27:30,310) INFO - qlib.timer - [log.py:117] - Time cost: 1038.675s | Loading data (<qlib.contrib.data.agg_handler.AggHandler object at 0x71c5e17ec230>) Done
[2734404:MainThread](2026-02-26 22:27:30,311) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2734404:MainThread](2026-02-26 22:27:30,313) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2734404:MainThread](2026-02-26 22:27:30,318) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2734404:MainThread](2026-02-26 22:27:30,319) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2734404:MainThread](2026-02-26 22:27:30,322) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
[]
[2734404:MainThread](2026-02-26 22:27:30,323) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
[2734404:MainThread](2026-02-26 22:27:30,326) INFO - qlib.timer - [log.py:117] - Time cost: 0.015s | fit & process data Done
[2734404:MainThread](2026-02-26 22:27:30,327) INFO - qlib.timer - [log.py:117] - Time cost: 1038.692s | Init data (<qlib.contrib.data.agg_handler.AggHandler object at 0x71c5e17ec230>) Done
All processors are readonly
All processors are readonly
All processors are readonly
ERROR: Failed to load data from Qlib pipeline: 'SepDataFrame' object has no attribute 'shape'

@ -0,0 +1,321 @@
[2739486:MainThread](2026-02-26 22:59:30,849) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[2739486:MainThread](2026-02-26 22:59:30,854) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[2739486:MainThread](2026-02-26 22:59:30,855) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
================================================================================
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
================================================================================
Date Range: 2020-01-02 to 2020-01-10
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
Step 1: Loading data from Qlib pipeline...
Loading since_date=2020-01-02
Loading data with handler (load_start=2019-12-13)...
Will use `placehorder_value` from module: qlib.contrib.data.config
Will init handler object from config:
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-11-23 00:00:00')},
'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'handler': {'class': 'AggHandler',
'kwargs': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
[2739486:MainThread](2026-02-26 22:59:30,878) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
[2739486:MainThread](2026-02-26 22:59:30,878) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
[2739486:MainThread](2026-02-26 22:59:30,938) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
[2739486:MainThread](2026-02-26 23:07:16,353) INFO - qlib.timer - [log.py:117] - Time cost: 465.464s | DDB query: Done
[2739486:MainThread](2026-02-26 23:07:17,149) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-11-23 00:00:00')},
'module_path': 'qlib.contrib.data.agg_handler'},
'load_end': datetime.date(2026, 2, 26),
'load_start': Timestamp('2019-11-23 00:00:00'),
'market': 'csiallx',
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
'region': 'cn'}}
Query config:
#alpha158: 1;
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71847694be90>
[2739486:MainThread](2026-02-26 23:09:19,001) INFO - qlib.timer - [log.py:117] - Time cost: 112.707s | Instruments filter: Done
[2739486:MainThread](2026-02-26 23:09:26,016) INFO - qlib.timer - [log.py:117] - Time cost: 595.133s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x7184769a5fa0>) Done
[2739486:MainThread](2026-02-26 23:09:26,017) INFO - qlib.timer - [log.py:117] - Time cost: 595.135s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x7184769a5fa0>) Done
[2739486:MainThread](2026-02-26 23:09:26,019) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2739486:MainThread](2026-02-26 23:09:29,432) INFO - qlib.timer - [log.py:117] - Time cost: 3.412s | fetch_df_by_index Done
[2739486:MainThread](2026-02-26 23:09:31,228) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
KMID KLEN ... VSUMD30 VSUMD60
datetime instrument ...
2019-11-25 SH600000 0.003325 0.011638 ... -0.238055 -0.010125
SH600004 -0.013806 0.030012 ... -0.017610 0.039195
SH600006 0.009238 0.016166 ... -0.034782 -0.014306
SH600007 -0.014749 0.018879 ... -0.032427 0.034279
SH600008 0.009259 0.024691 ... -0.063490 0.003978
... ... ... ... ... ...
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
[6908346 rows x 158 columns]
[2739486:MainThread](2026-02-26 23:09:31,229) INFO - qlib.timer - [log.py:117] - Time cost: 5.211s | Fetching dataframe Done
[2739486:MainThread](2026-02-26 23:09:31,242) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
[2739486:MainThread](2026-02-26 23:09:54,142) INFO - qlib.timer - [log.py:117] - Time cost: 22.909s | DDB query: Done
[2739486:MainThread](2026-02-26 23:09:54,927) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2739486:MainThread](2026-02-26 23:09:55,507) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,con_rating_strength from
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
[2739486:MainThread](2026-02-26 23:10:10,691) INFO - qlib.timer - [log.py:117] - Time cost: 15.192s | DDB query: Done
[2739486:MainThread](2026-02-26 23:10:11,588) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2739486:MainThread](2026-02-26 23:11:37,528) INFO - qlib.timer - [log.py:117] - Time cost: 82.525s | Instruments filter: Done
[2739486:MainThread](2026-02-26 23:11:38,259) INFO - qlib.timer - [log.py:117] - Time cost: 127.029s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7185777e3e90>) Done
[2739486:MainThread](2026-02-26 23:11:38,260) INFO - qlib.timer - [log.py:117] - Time cost: 127.030s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7185777e3e90>) Done
[2739486:MainThread](2026-02-26 23:11:38,261) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2739486:MainThread](2026-02-26 23:11:39,000) INFO - qlib.timer - [log.py:117] - Time cost: 0.738s | fetch_df_by_index Done
[2739486:MainThread](2026-02-26 23:11:39,009) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
turnover free_turnover log_size con_rating_strength
datetime instrument
2019-11-25 SH600000 0.0895 0.1639 17.339552 0.8214
SH600004 0.9386 1.9173 15.039255 0.8125
SH600006 0.2566 0.6479 13.680836 1.0000
SH600007 0.1647 0.8513 14.335590 0.7500
SH600008 0.1813 0.3362 14.435625 0.6875
... ... ... ... ...
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
SZ301662 12.5950 12.5950 12.681215 1.0000
SZ301665 14.0077 14.0077 11.719415 1.0000
SZ301678 6.6518 6.6518 12.799973 0.7500
SZ302132 1.3868 3.0296 15.359885 0.8750
[7623255 rows x 4 columns]
[2739486:MainThread](2026-02-26 23:11:39,010) INFO - qlib.timer - [log.py:117] - Time cost: 0.749s | Fetching dataframe Done
[2739486:MainThread](2026-02-26 23:11:39,191) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
[2739486:MainThread](2026-02-26 23:12:05,839) INFO - qlib.timer - [log.py:117] - Time cost: 26.825s | DDB query: Done
[2739486:MainThread](2026-02-26 23:12:06,554) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2739486:MainThread](2026-02-26 23:12:07,075) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
[2739486:MainThread](2026-02-26 23:12:32,695) INFO - qlib.timer - [log.py:117] - Time cost: 25.629s | DDB query: Done
[2739486:MainThread](2026-02-26 23:12:33,566) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2739486:MainThread](2026-02-26 23:14:02,232) INFO - qlib.timer - [log.py:117] - Time cost: 85.158s | Instruments filter: Done
[2739486:MainThread](2026-02-26 23:14:03,155) INFO - qlib.timer - [log.py:117] - Time cost: 144.143s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x718478991880>) Done
[2739486:MainThread](2026-02-26 23:14:03,156) INFO - qlib.timer - [log.py:117] - Time cost: 144.144s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x718478991880>) Done
[2739486:MainThread](2026-02-26 23:14:03,156) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2739486:MainThread](2026-02-26 23:14:04,046) INFO - qlib.timer - [log.py:117] - Time cost: 0.889s | fetch_df_by_index Done
[2739486:MainThread](2026-02-26 23:14:04,060) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
IsZt IsDt IsN ... open_stop close_stop high_stop
datetime instrument ...
2019-11-25 SH600000 False False False ... False False False
SH600004 False False False ... False False False
SH600006 False False False ... False False False
SH600007 False False False ... False False False
SH600008 False False False ... False False False
... ... ... ... ... ... ... ...
2026-02-26 SZ301658 False False False ... False False False
SZ301662 False False False ... False False False
SZ301665 False False False ... False False False
SZ301678 False False False ... False False False
SZ302132 False False False ... False False False
[6925320 rows x 12 columns]
[2739486:MainThread](2026-02-26 23:14:04,061) INFO - qlib.timer - [log.py:117] - Time cost: 0.904s | Fetching dataframe Done
[2739486:MainThread](2026-02-26 23:14:04,079) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
[2739486:MainThread](2026-02-26 23:14:06,440) INFO - qlib.timer - [log.py:117] - Time cost: 2.370s | DDB query: Done
[2739486:MainThread](2026-02-26 23:14:06,448) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
[2739486:MainThread](2026-02-26 23:15:23,146) INFO - qlib.timer - [log.py:117] - Time cost: 76.695s | Instruments filter: Done
[2739486:MainThread](2026-02-26 23:15:23,184) INFO - qlib.timer - [log.py:117] - Time cost: 79.120s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x7184782fa0c0>) Done
[2739486:MainThread](2026-02-26 23:15:23,185) INFO - qlib.timer - [log.py:117] - Time cost: 79.121s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x7184782fa0c0>) Done
[2739486:MainThread](2026-02-26 23:15:23,186) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
[2739486:MainThread](2026-02-26 23:15:23,190) INFO - qlib.timer - [log.py:117] - Time cost: 0.003s | fetch_df_by_index Done
[2739486:MainThread](2026-02-26 23:15:23,210) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
datetime instrument ...
2026-02-09 SH600000 False False ... False False
SH600004 False False ... False False
SH600006 False False ... False False
SH600007 False False ... False False
SH600008 False False ... False False
... ... ... ... ... ...
2026-02-26 SZ301658 False False ... False False
SZ301662 False False ... False False
SZ301665 False False ... False False
SZ301678 False False ... False False
SZ302132 False False ... False False
[41168 rows x 30 columns]
[2739486:MainThread](2026-02-26 23:15:23,210) INFO - qlib.timer - [log.py:117] - Time cost: 0.025s | Fetching dataframe Done
[2739486:MainThread](2026-02-26 23:15:23,226) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
[2739486:MainThread](2026-02-26 23:15:53,388) INFO - qlib.timer - [log.py:117] - Time cost: 30.171s | DDB query: Done
[2739486:MainThread](2026-02-26 23:15:54,166) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00

@ -0,0 +1,104 @@
[2745445:MainThread](2026-02-26 23:18:06,410) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[2745445:MainThread](2026-02-26 23:18:06,414) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[2745445:MainThread](2026-02-26 23:18:06,415) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
================================================================================
DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE
================================================================================
Date Range: 2020-01-02 to 2020-01-10
Output Directory: ../data/
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
Step 1: Loading raw data from Qlib pipeline...
Loading raw data from handler (load_start=2019-12-13)...
Will use `placehorder_value` from module: qlib.contrib.data.config
Will init handler object from config:
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-11-23 00:00:00')},
'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'handler': {'class': 'AggHandler',
'kwargs': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{[2745445:MainThread](2026-02-26 23:18:06,436) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
[2745445:MainThread](2026-02-26 23:18:06,437) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
[2745445:MainThread](2026-02-26 23:18:06,492) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX

@ -0,0 +1,103 @@
[2746177:MainThread](2026-02-26 23:21:56,618) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
[2746177:MainThread](2026-02-26 23:21:56,622) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
[2746177:MainThread](2026-02-26 23:21:56,623) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
================================================================================
DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE
================================================================================
Date Range: 2020-01-02 to 2020-01-10
Output Directory: ../data/
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
Step 1: Loading raw data from Qlib pipeline...
Loading raw data from handler (load_start=2019-12-13)...
Filtering instruments: ['SH600000', 'SH600004', 'SH600006', 'SH600007', 'SH600008']... (5 total)
Will use `placehorder_value` from module: qlib.contrib.data.config
Will init handler object from config:
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'end_time': datetime.date(2026, 2, 26),
'handler_list': [{'class': 'DDBAlpha158Handler',
'kwargs': {'col_set': 'feature',
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': 'alpha158',
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
{'class': 'DDBMarketExtHandler',
'kwargs': {'col_set': 'feature_ext',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['Turnover '
'as '
'turnover',
'FreeTurnover '
'as '
'free_turnover',
'log(MarketValue) '
'as '
'log_size'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'float32',
'field_list': ['con_rating_strength'],
'table_name': 'stg_1day_gds_con_rating'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
{'class': 'DDBMarketFlagHandler',
'kwargs': {'col_set': 'feature_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['IsZt',
'IsDt',
'IsN',
'IsXD',
'IsXR',
'IsDR'],
'table_name': 'stg_1day_wind_kline_adjusted'},
{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['open_limit',
'close_limit',
'low_limit',
'open_stop',
'close_stop',
'high_stop'],
'table_name': 'stg_1day_wind_market_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
{'class': 'DDBIndusFlagHandler',
'kwargs': {'col_set': 'indus_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': 'industry_code_cc.csv',
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
{'class': 'DDBStFlagHandler',
'kwargs': {'col_set': 'st_flag',
'query_config': [{'db_path': 'dfs://daily_stock_run',
'dtype': 'bool',
'field_list': ['ST_Y',
'ST_S',
'ST_T',
'ST_L',
'ST_Z',
'ST_X'],
'table_name': 'stg_1day_wind_st_flag'}]},
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
'instruments': 'csiallx',
'start_time': Timestamp('2019-11-23 00:00:00')},
'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'},
'handler': {'class': 'AggHandler',
'kwargs': {'ddb_config': {'host': '192.168.1.146',
'password': '123456',
'port': 8848,
'username': 'admin'}[2746177:MainThread](2026-02-26 23:21:56,647) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
[2746177:MainThread](2026-02-26 23:21:56,648) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
[2746177:MainThread](2026-02-26 23:21:56,716) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
use mytt;
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX

@ -0,0 +1,187 @@
#!/usr/bin/env python
"""
Verify feature column order between standalone pipeline and qlib gold standard.
This script:
1. Loads a small sample using the qlib pipeline
2. Runs the same sample through the standalone generate_beta_embedding pipeline
3. Compares the column order and feature values
"""
import pickle as pkl
import ruamel.yaml as yaml
import pandas as pd
import polars as pl
import numpy as np
import sys
import os
# ruamel.yaml dropped the legacy top-level safe_load helper; recreate it on
# top of a safe, pure-Python loader so existing yaml.safe_load callers work.
_yaml = yaml.YAML(typ='safe', pure=True)


def patched_safe_load(stream):
    """Drop-in replacement for the legacy ``yaml.safe_load``.

    Accepts either a text string or a file-like object and parses it with
    the module-level safe ruamel loader.
    """
    import io
    source = io.StringIO(stream) if isinstance(stream, str) else stream
    return _yaml.load(source)


yaml.safe_load = patched_safe_load

# Make the sibling ``scripts`` directory importable (generate_beta_embedding).
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'scripts'))
def _load_proc_list(proc_path):
    """Load the pickled qlib processor list that defines the gold-standard transforms."""
    with open(proc_path, "rb") as f:
        return pkl.load(f)


def _load_raw_qlib_sample():
    """Initialize qlib from handler.yaml and return the raw concatenated frame.

    Returns a DataFrame whose columns are prefixed ``<group>::<col>`` so the
    originating handler group of every column is preserved.

    Raises:
        RuntimeError: if the handler data is not the expected SepDataFrame-like
            object (prevents a confusing NameError further down).
    """
    import qlib
    from qlib.config import REG_CN
    import datetime as dt
    from qlib.workflow.cli import sys_config
    from qlib.utils import fill_placeholder

    # Initial init so qlib submodules can be imported/configured; it is
    # re-initialized below with the settings taken from handler.yaml.
    qlib.init(provider_uri='/home/guofu/.qlib/data_ops/target', region=REG_CN)

    yaml_path = (
        "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/"
        "csiallx_feature2_ntrla_flag_pnlnorm/handler.yaml"
    )
    with open(yaml_path) as fin:
        config = yaml.safe_load(fin)
    sys_config(config, "qlib.contrib.data.config")
    qlib.init(**config.get("qlib_init"))

    # Start 20 calendar days before the sample window so rolling features
    # have warm-up history.
    load_start = pd.to_datetime("2020-01-02") - dt.timedelta(days=20)
    placeholder_value = {
        "<SINCE_DATE>": load_start,
        "<TODAY>": dt.date.today(),
    }
    config_filled = fill_placeholder(config, placeholder_value)
    handler = qlib.init_instance_by_config(config_filled["handler"])

    handler_data = handler._data
    if not hasattr(handler_data, '_data'):
        # Fail fast: the original code silently skipped this branch and then
        # crashed later with NameError on the unbound raw frame.
        raise RuntimeError(
            "Expected handler._data to expose a `_data` mapping of group "
            f"DataFrames, got {type(handler_data).__name__}"
        )

    df_dict = handler_data._data
    print(f"    Handler groups: {list(df_dict.keys())}")

    # Concatenate the per-group frames, prefixing columns with "<group>::".
    raw_dfs = []
    for group, df in df_dict.items():
        df_copy = df.copy()
        df_copy.columns = [f"{group}::{col}" for col in df_copy.columns]
        raw_dfs.append(df_copy)
        print(f"    {group}: {len(df_copy.columns)} columns")
    raw_df = pd.concat(raw_dfs, axis=1)
    print(f"    Raw concatenated shape: {raw_df.shape}")
    return raw_df


def _apply_gold_processors(raw_df, proc_list):
    """Apply the fitted processor list to the raw frame (gold standard).

    The processors were fitted on unprefixed column names, so group prefixes
    are stripped before application and restored afterwards.
    """
    from qlib.contrib.data.utils import apply_proc_list

    col_mapping = {c: c.split('::', 1)[1] for c in raw_df.columns if '::' in c}
    raw_df_stripped = raw_df.rename(columns=col_mapping)

    # Some processors choke on the bool dtype; widen to object first.
    bool_cols = raw_df_stripped.select_dtypes(include=['bool']).columns
    for col in bool_cols:
        raw_df_stripped[col] = raw_df_stripped[col].astype(object)

    df_gold = apply_proc_list(raw_df_stripped, proc_list=proc_list, with_fit=False)
    print(f"    Gold standard shape after processors: {df_gold.shape}")

    reverse_mapping = {v: k for k, v in col_mapping.items()}
    return df_gold.rename(columns=reverse_mapping)


def _group_gold_columns(gold_columns):
    """Partition gold-standard columns by their group prefix."""
    feature_cols = [c for c in gold_columns if c.startswith('feature::')]
    feature_ext_cols = [c for c in gold_columns if c.startswith('feature_ext::')]
    feature_flag_cols = [c for c in gold_columns if c.startswith('feature_flag::')]
    # NOTE(review): the handler config uses col_set 'indus_flag'; confirm the
    # processed frame really carries an 'indus_idx::' prefix — otherwise this
    # list is always empty and the comparison silently drops industry columns.
    indus_idx_cols = [c for c in gold_columns if c.startswith('indus_idx::')]
    return feature_cols, feature_ext_cols, feature_flag_cols, indus_idx_cols


def _compare_column_orders(gold_all, standalone_cols):
    """Print a side-by-side comparison of the two flat column orderings."""
    print("\nFirst 20 column comparison:")
    print(f"{'Idx':<5} {'Gold Standard':<40} {'Standalone':<40} {'Match':<6}")
    print("-" * 90)
    for i in range(min(20, len(gold_all), len(standalone_cols))):
        match = "" if gold_all[i] == standalone_cols[i] else ""
        print(f"{i:<5} {gold_all[i]:<40} {standalone_cols[i]:<40} {match:<6}")

    if gold_all == standalone_cols:
        print("\n✓ Column order MATCHES!")
        return

    print("\n✗ Column order DOES NOT MATCH!")
    print("\nFinding differences...")
    diff_count = 0
    for i in range(min(len(gold_all), len(standalone_cols))):
        if gold_all[i] != standalone_cols[i]:
            diff_count += 1
            if diff_count <= 20:
                print(f"  [{i}] Gold: {gold_all[i]} vs Standalone: {standalone_cols[i]}")
    print(f"Total differences: {diff_count}")


def main():
    """Verify feature-column order: standalone pipeline vs qlib gold standard.

    Steps:
      1. Load the pickled processor list used by the qlib dataset.
      2. Load a small raw sample through the qlib handler chain.
      3. Apply the processors to obtain the gold-standard feature frame.
      4. Run the standalone generate_beta_embedding pipeline on the same window.
      5. Compare column counts and ordering, reporting any differences.
    """
    print("=" * 70)
    print("VERIFY FEATURE ORDER: Standalone vs Qlib Gold Standard")
    print("=" * 70)

    # Step 1: the processor list defines the exact transforms (and hence the
    # column set/order) of the gold-standard dataset.
    print("\nStep 1: Loading processor list...")
    proc_path = (
        "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/"
        "csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc"
    )
    proc_list = _load_proc_list(proc_path)
    print(f"    Loaded {len(proc_list)} processors")

    # Step 2: load a small raw sample from the qlib pipeline.
    print("\nStep 2: Loading sample from qlib pipeline...")
    raw_df = _load_raw_qlib_sample()

    # Step 3: apply the fitted processors (qlib gold standard).
    print("\nStep 3: Applying processors (qlib gold standard)...")
    df_gold = _apply_gold_processors(raw_df, proc_list)

    gold_columns = list(df_gold.columns)
    feature_cols, feature_ext_cols, feature_flag_cols, indus_idx_cols = (
        _group_gold_columns(gold_columns)
    )
    print("\nGold standard column groups:")
    print(f"    feature::      {len(feature_cols)} cols")
    print(f"    feature_ext::  {len(feature_ext_cols)} cols")
    print(f"    feature_flag:: {len(feature_flag_cols)} cols")
    print(f"    indus_idx::    {len(indus_idx_cols)} cols")

    # Step 4: run the standalone pipeline on the same date range.
    print("\nStep 4: Running standalone pipeline...")
    from generate_beta_embedding import (
        load_all_data,
        merge_data_sources,
        apply_feature_pipeline,
    )

    df_alpha, df_kline, df_flag, df_industry = load_all_data("2020-01-02", "2020-01-10")
    df_standalone = merge_data_sources(df_alpha, df_kline, df_flag, df_industry)
    print(f"    Standalone loaded shape: {df_standalone.shape}")

    df_processed, feature_cols_standalone = apply_feature_pipeline(df_standalone)
    print(f"    Standalone processed shape: {df_processed.shape}")
    print(f"    Standalone feature columns: {len(feature_cols_standalone)}")

    # Step 5: compare counts and ordering.
    print("\n" + "=" * 70)
    print("COMPARISON SUMMARY")
    print("=" * 70)
    print(f"\nGold standard total columns: {len(gold_columns)}")
    print(f"    feature::      {len(feature_cols)}")
    print(f"    feature_ext::  {len(feature_ext_cols)}")
    print(f"    feature_flag:: {len(feature_flag_cols)}")
    print(f"    indus_idx::    {len(indus_idx_cols)}")
    print(f"\nStandalone feature columns: {len(feature_cols_standalone)}")

    # Flatten the gold-standard columns (prefixes stripped) into the same
    # group order the standalone pipeline is expected to emit.
    gold_all = [
        c.split('::', 1)[1]
        for c in feature_cols + feature_ext_cols + feature_flag_cols + indus_idx_cols
    ]
    print(f"\nGold standard (flat): {len(gold_all)} features")
    print(f"Standalone:           {len(feature_cols_standalone)} features")
    if len(gold_all) != len(feature_cols_standalone):
        print(
            f"\nWARNING: Feature count mismatch! "
            f"Difference: {len(gold_all) - len(feature_cols_standalone)}"
        )

    _compare_column_orders(gold_all, feature_cols_standalone)


if __name__ == "__main__":
    main()
Loading…
Cancel
Save