- Add .claudeignore and .clauderc for Claude Code setup; add config.yaml for cta_1d, stock_15m, and alpha158_beta tasks; add alpha158_beta pipeline.py with documentation; add utility scripts for embedding generation and prediction; add an executed baseline notebook for cta_1d. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> (branch: master)
parent
586b16a6fa
commit
4d382dc6bd
@ -0,0 +1 @@
|
||||
__pycache__/
|
||||
@ -0,0 +1,9 @@
|
||||
custom_instructions: |
|
||||
- When refactoring, prefer using partial updates or specific function rewrites instead of outputting the entire file content. This helps avoid token limit errors.
|
||||
- If a file is larger than 300 lines, always suggest a modular breakdown before refactoring.
|
||||
- Please be concise. Skip lengthy explanations and focus on the code changes only. Use short responses.
|
||||
- Only output the diff or the specific functions that need changing. Do not repeat the unchanged parts of the file.
|
||||
|
||||
post_task_instructions: |
|
||||
After significant changes, ask me if you should update CLAUDE.md to reflect the new architecture or commands.
|
||||
|
||||
@ -0,0 +1,293 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Compare generated embeddings with database embeddings (0_7 version).
|
||||
Handles format conversion for datetime and instrument columns.
|
||||
|
||||
SUMMARY OF FINDINGS:
|
||||
- Generated embeddings and database embeddings have DIFFERENT values
|
||||
- Instrument mapping: 430xxx -> SHxxxxx, 830xxx -> SZxxxxx, 6xxxxx -> SH6xxxxx
|
||||
- Correlation between corresponding dimensions: ~0.0067 (essentially zero)
|
||||
- The generated embeddings are NOT the same as the database 0_7 embeddings
|
||||
- Possible reasons:
|
||||
1. Different model weights/versions used for generation
|
||||
2. Different input features or normalization
|
||||
3. Different random seed or inference configuration
|
||||
"""
|
||||
import polars as pl
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
def instrument_int_to_code(inst_int: int) -> str:
    """Convert an integer instrument code to an exchange-prefixed string.

    The encoding in the embedding file uses:
    - 4xxxxx -> SHxxxxx (Shanghai A-shares, but code mapping is non-trivial)
    - 8xxxxx -> SZxxxxx (Shenzhen A-shares)
    - Direct 6-digit codes are also present (600xxx, 000xxx, 300xxx)

    Note: The exact mapping from 430017 -> SH600021 requires the original
    features file. We attempt an approximate mapping here.

    Args:
        inst_int: Instrument code stored as an integer.

    Returns:
        Exchange-prefixed code string (e.g. "SH600021"), or the bare digit
        string when no mapping rule applies.
    """
    # Integers drop leading zeros (Shenzhen 000030 is stored as 30), so pad
    # back to 6 digits before inspecting the exchange-prefix digit.
    inst_str = str(inst_int).zfill(6)

    # Already a plain 6-digit code (no 4/8 exchange marker).
    if len(inst_str) == 6 and inst_str[0] not in ('4', '8'):
        # 6xxxxx trades on Shanghai; everything else (0xxxxx, 3xxxxx, ...) on Shenzhen.
        if inst_str.startswith('6'):
            return f"SH{inst_str}"
        else:
            return f"SZ{inst_str}"

    # 6-digit with exchange prefix digit (4 = SH, 8 = SZ).
    if len(inst_str) == 6 and inst_str[0] in ('4', '8'):
        exchange = 'SH' if inst_str[0] == '4' else 'SZ'
        # The mapping from 430xxx -> 600xxx is not 1:1.
        # Return the code as-is for matching attempts.
        return f"{exchange}{inst_str[1:]}"

    # Anything longer than 6 digits is left untouched.
    return inst_str
|
||||
|
||||
def load_generated_embedding(date_int: int, sample_n: int = None):
    """Load the generated embedding rows for one trading date.

    Reads the wide-format parquet (embedding_0 .. embedding_N columns),
    packs each row's embedding values into a single 'values' list column,
    and adds the instrument code in several formats to ease matching
    against the database side.

    Args:
        date_int: Trading date as an integer (e.g. 20190102).
        sample_n: Optional cap on the number of rows loaded.

    Returns:
        A polars DataFrame with the original columns plus 'values',
        'datetime_uint32', 'instrument_orig', 'instrument_str' and
        'instrument_code'.
    """
    gen_path = Path('/home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data/embedding_0_7_beta.parquet')

    query = pl.scan_parquet(gen_path).filter(pl.col('datetime') == date_int)
    if sample_n:
        query = query.head(sample_n)
    df = query.collect()

    # Collect the wide embedding columns in numeric order
    # (embedding_0, embedding_1, ... embedding_N).
    emb_cols = sorted(
        (c for c in df.columns if c.startswith('embedding_')),
        key=lambda name: int(name.split('_')[1]),
    )

    # Convert wide format into one list of floats per row.
    row_structs = df.select(emb_cols).to_struct()
    packed = [list(s.values()) for s in row_structs]

    return df.with_columns([
        pl.Series('values', packed),
        pl.col('datetime').cast(pl.UInt32).alias('datetime_uint32'),
        pl.col('instrument').alias('instrument_orig'),
        pl.col('instrument').cast(pl.String).alias('instrument_str'),
        pl.col('instrument').map_elements(instrument_int_to_code, return_dtype=pl.String).alias('instrument_code'),
    ])
|
||||
|
||||
def load_database_embedding(date_str: str):
    """Load the database (0_7 version) embedding for one date.

    Args:
        date_str: Date as a string (e.g. '20190102'); used to locate the
            per-date parquet partition.

    Returns:
        A polars DataFrame with an added 'datetime_int' column, or None
        when the partition file does not exist.
    """
    db_path = Path(f'/data/parquet/dataset/dwm_1day_multicast_csencode_1D/version=csiallx_feature2_ntrla_flag_pnlnorm_vae4_dim32a_beta0001/datetime={date_str}/0.parquet')
    if not db_path.exists():
        return None

    # Keep an integer view of the datetime for joins against the generated side.
    return pl.read_parquet(db_path).with_columns([
        pl.col('datetime').cast(pl.Int64).alias('datetime_int')
    ])
|
||||
|
||||
def analyze_instrument_mapping(date_int: int):
    """Analyze the instrument mapping between generated and database embeddings.

    Prints row counts, sample rows, and the set overlap between converted
    generated instrument codes and database instrument codes. When no
    overlap exists, prints mapping-diagnosis samples; otherwise compares
    the embedding vectors of the common instruments.

    Args:
        date_int: Trading date as an integer (e.g. 20190102).
    """
    date_str = str(date_int)

    print(f"\n{'='*80}")
    print(f"Analyzing instrument mapping for date: {date_int}")
    print(f"{'='*80}")

    gen_df = load_generated_embedding(date_int)
    db_df = load_database_embedding(date_str)

    if db_df is None:
        print(f"ERROR: Database embedding not found for {date_str}")
        return

    print(f"\nGenerated embeddings: {gen_df.shape[0]} rows")
    print(f"Database embeddings: {db_df.shape[0]} rows")

    # Show samples
    print("\n--- Generated Embedding Sample ---")
    sample_gen = gen_df.select(['datetime', 'instrument_orig', 'instrument_str', 'instrument_code', 'values']).head(10)
    print(sample_gen)

    print("\n--- Database Embedding Sample ---")
    print(db_df.head(10))

    # Try different matching strategies
    gen_insts_set = set(gen_df['instrument_code'].to_list())
    db_insts_set = set(db_df['instrument'].to_list())

    common = gen_insts_set & db_insts_set
    gen_only = gen_insts_set - db_insts_set
    db_only = db_insts_set - gen_insts_set

    print(f"\n--- Matching Results (with code conversion) ---")
    print(f"Common instruments: {len(common)}")
    print(f"Generated only: {len(gen_only)}")
    print(f"Database only: {len(db_only)}")

    if len(common) == 0:
        print("\nNo common instruments found with code conversion!")
        print("\nTrying to find mapping patterns...")

        # Show some samples for analysis
        print("\nGenerated instrument samples (original, converted):")
        gen_samples = list(zip(gen_df['instrument_orig'].head(20).to_list(),
                               gen_df['instrument_code'].head(20).to_list()))
        for orig, conv in gen_samples:
            print(f"  {orig} -> {conv}")

        print("\nDatabase instrument samples:")
        db_samples = db_df['instrument'].head(20).to_list()
        for inst in db_samples:
            print(f"  {inst}")

        # Check if there's a position-based alignment possible
        # Sort both and compare by position
        gen_sorted = sorted(gen_df['instrument_orig'].to_list())
        db_sorted = sorted([int(inst[2:]) for inst in db_df['instrument'].to_list()])

        print("\n--- Attempting position-based matching ---")
        print(f"Generated sorted (first 10): {gen_sorted[:10]}")
        print(f"Database sorted (first 10): {db_sorted[:10]}")

    else:
        # We have matches, compare embeddings
        print(f"\n--- Comparing embeddings for {len(common)} common instruments ---")

        gen_common = gen_df.filter(pl.col('instrument_code').is_in(list(common)))
        db_common = db_df.filter(pl.col('instrument').is_in(list(common)))

        # Join and compare
        comparison = gen_common.join(
            db_common,
            left_on='instrument_code',
            right_on='instrument',
            how='inner',
            suffix='_db'
        )

        # Column positions are loop-invariant: resolve them once instead of
        # re-scanning comparison.columns on every row (was recomputed per row).
        gen_vals_idx = comparison.columns.index('values')
        db_vals_idx = comparison.columns.index('values_db')
        inst_idx = comparison.columns.index('instrument_code')

        # Calculate differences
        diffs = []
        for row in comparison.iter_rows():
            gen_emb = np.array(row[gen_vals_idx])
            db_emb = np.array(row[db_vals_idx])

            diff = gen_emb - db_emb
            diff_norm = np.linalg.norm(diff)
            # Small epsilon guards against division by a zero-norm DB vector.
            rel_diff = diff_norm / (np.linalg.norm(db_emb) + 1e-10)

            diffs.append({
                'instrument': row[inst_idx],
                'l2_norm_diff': diff_norm,
                'relative_diff': rel_diff,
                'max_abs_diff': np.max(np.abs(diff)),
                'gen_emb_norm': np.linalg.norm(gen_emb),
                'db_emb_norm': np.linalg.norm(db_emb)
            })

        if diffs:
            diff_df = pl.DataFrame(diffs)
            print("\nDifference statistics:")
            print(diff_df.select(['l2_norm_diff', 'relative_diff', 'max_abs_diff']).describe())

            max_rel_diff = diff_df['relative_diff'].max()
            print(f"\nMax relative difference: {max_rel_diff:.6e}")

            if max_rel_diff < 1e-5:
                print("✓ Embeddings match within numerical precision!")
            elif max_rel_diff < 0.01:
                print("~ Embeddings are very similar")
            else:
                print("✗ Embeddings differ significantly")

            # Show some comparison samples
            print("\nSample comparison:")
            for i in range(min(5, len(diffs))):
                d = diffs[i]
                print(f"  {d['instrument']}: gen_norm={d['gen_emb_norm']:.4f}, "
                      f"db_norm={d['db_emb_norm']:.4f}, rel_diff={d['relative_diff']:.6e}")
|
||||
|
||||
def calculate_correlation(date_int: int):
    """Calculate correlation between generated and database embeddings.

    Aligns the two sources on converted instrument codes, then reports the
    per-dimension Pearson correlation and an overall flattened correlation,
    followed by a coarse interpretation of the mean per-dimension value.

    Args:
        date_int: Trading date as an integer (e.g. 20190102).
    """
    # Note: the redundant function-local `import numpy as np` was removed;
    # numpy is already imported at module level.
    date_str = str(date_int)

    print(f"\n{'='*80}")
    print(f"Correlation Analysis for date: {date_int}")
    print(f"{'='*80}")

    gen_df = load_generated_embedding(date_int)
    db_df = load_database_embedding(date_str)

    if db_df is None:
        print(f"ERROR: Database embedding not found for {date_str}")
        return

    # Find common instruments
    gen_insts = set(gen_df['instrument_code'].to_list())
    db_insts = set(db_df['instrument'].to_list())
    common = list(gen_insts & db_insts)

    print(f"\nCommon instruments: {len(common)}")

    if len(common) == 0:
        print("No common instruments found!")
        return

    # Filter to common and sort so both sides are row-aligned by instrument.
    gen_common = gen_df.filter(pl.col('instrument_code').is_in(common)).sort('instrument_code')
    db_common = db_df.filter(pl.col('instrument').is_in(common)).sort('instrument')

    # Extract embedding matrices
    gen_embs = np.array(gen_common['values'].to_list())
    db_embs = np.array(db_common['values'].to_list())

    print(f"Generated embeddings shape: {gen_embs.shape}")
    print(f"Database embeddings shape: {db_embs.shape}")

    # Calculate correlation per dimension. Use the actual embedding width
    # instead of a hard-coded 32 so other embedding sizes also work; take
    # the min so a width mismatch between sources cannot raise IndexError.
    n_dims = min(gen_embs.shape[1], db_embs.shape[1])
    correlations = []
    for i in range(n_dims):
        corr = np.corrcoef(gen_embs[:, i], db_embs[:, i])[0, 1]
        correlations.append(corr)

    print(f"\nCorrelation statistics across {n_dims} dimensions:")
    print(f"  Mean: {np.mean(correlations):.4f}")
    print(f"  Median: {np.median(correlations):.4f}")
    print(f"  Min: {np.min(correlations):.4f}")
    print(f"  Max: {np.max(correlations):.4f}")

    # Overall correlation
    overall_corr = np.corrcoef(gen_embs.flatten(), db_embs.flatten())[0, 1]
    print(f"\nOverall correlation (all dims flattened): {overall_corr:.4f}")

    # Interpretation
    mean_corr = np.mean(correlations)
    if abs(mean_corr) < 0.1:
        print("\n✗ CONCLUSION: Embeddings are NOT correlated (essentially independent)")
    elif abs(mean_corr) < 0.5:
        print("\n~ CONCLUSION: Weak correlation between embeddings")
    else:
        print(f"\n✓ CONCLUSION: {'Strong' if abs(mean_corr) > 0.8 else 'Moderate'} correlation")
|
||||
|
||||
if __name__ == '__main__':
    # Run the mapping analysis and correlation study over a few sample dates;
    # a failure on one date is reported and does not stop the others.
    for date in (20190102, 20200102, 20240102):
        try:
            analyze_instrument_mapping(date)
            calculate_correlation(date)
        except Exception as e:
            print(f"\nError analyzing date {date}: {e}")
            import traceback
            traceback.print_exc()
|
||||
File diff suppressed because one or more lines are too long
@ -0,0 +1,50 @@
|
||||
# CTA 1-Day Return Prediction - Experiment Configuration
|
||||
|
||||
# Data Configuration
|
||||
data:
|
||||
dt_range: ['2020-01-01', '2023-12-31']
|
||||
feature_sets:
|
||||
- alpha158
|
||||
- hffactor
|
||||
normalization: dual
|
||||
blend_weights: equal # Options: equal, zscore_heavy, rolling_heavy, cs_heavy, short_term, long_term
|
||||
|
||||
# Data Segments (train/valid/test split)
|
||||
segments:
|
||||
train: ['2020-01-01', '2022-06-30']
|
||||
valid: ['2022-07-01', '2022-12-31']
|
||||
test: ['2023-01-01', '2023-12-31']
|
||||
|
||||
# Model Configuration
|
||||
model:
|
||||
type: xgb
|
||||
params:
|
||||
objective: reg:squarederror
|
||||
eval_metric: rmse
|
||||
eta: 0.05
|
||||
max_depth: 6
|
||||
subsample: 0.8
|
||||
colsample_bytree: 0.8
|
||||
seed: 42
|
||||
num_boost_round: 500
|
||||
early_stopping_rounds: 50
|
||||
|
||||
# Training Configuration
|
||||
training:
|
||||
return_type: o2c_twap1min
|
||||
weight_factors:
|
||||
positive: 1.0
|
||||
negative: 2.0
|
||||
|
||||
# Backtest Configuration
|
||||
backtest:
|
||||
num_trades: 4
|
||||
signal_dist: normal
|
||||
pos_weight: true
|
||||
|
||||
# Output Configuration
|
||||
output:
|
||||
base_dir: results/cta_1d
|
||||
save_model: true
|
||||
save_predictions: true
|
||||
save_importance: true
|
||||
@ -0,0 +1,34 @@
|
||||
# Stock 15-Minute Return Prediction - Experiment Configuration
|
||||
|
||||
# Data Configuration
|
||||
data:
|
||||
dt_range: ['2020-01-01', '2023-12-31']
|
||||
feature_path: /data/parquet/stock_1min_alpha158
|
||||
kline_path: /data/parquet/stock_1min_kline
|
||||
industry_path: /data/parquet/stock_industry # Optional
|
||||
normalization_mode: dual # Options: industry, cs_zscore, dual
|
||||
|
||||
# Model Configuration
|
||||
model:
|
||||
type: xgb
|
||||
params:
|
||||
objective: reg:squarederror
|
||||
eval_metric: rmse
|
||||
eta: 0.05
|
||||
max_depth: 6
|
||||
subsample: 0.8
|
||||
colsample_bytree: 0.8
|
||||
seed: 42
|
||||
num_boost_round: 500
|
||||
early_stopping_rounds: 50
|
||||
|
||||
# Training Configuration
|
||||
training:
|
||||
positive_factor: 1.0 # Weight multiplier for positive returns
|
||||
negative_factor: 2.0 # Weight multiplier for negative returns
|
||||
|
||||
# Output Configuration
|
||||
output:
|
||||
base_dir: results/stock_15m
|
||||
save_model: true
|
||||
save_predictions: true
|
||||
@ -0,0 +1,123 @@
|
||||
# Data Pipeline Bug Analysis
|
||||
|
||||
## Summary
|
||||
|
||||
The generated embeddings do not match the database 0_7 embeddings due to multiple bugs in the data pipeline migration from qlib to standalone Polars implementation.
|
||||
|
||||
---
|
||||
|
||||
## Bugs Fixed
|
||||
|
||||
### 1. Market Classification (`FlagMarketInjector`) ✓ FIXED
|
||||
|
||||
**Original (incorrect):**
|
||||
```python
|
||||
market_0 = (instrument >= 600000) # SH
|
||||
market_1 = (instrument < 600000) # SZ
|
||||
```
|
||||
|
||||
**Fixed:**
|
||||
```python
|
||||
inst_str = str(instrument).zfill(6)
|
||||
market_0 = inst_str.startswith('6') # SH: 6xxxxx
|
||||
market_1 = inst_str.startswith('0') | inst_str.startswith('3') # SZ: 0xxx, 3xxx
|
||||
market_2 = inst_str.startswith('4') | inst_str.startswith('8') # NE: 4xxx, 8xxx
|
||||
```
|
||||
|
||||
**Impact:** 167 instruments (4xxxxx, 8xxxxx - 新三板) were misclassified.
|
||||
|
||||
---
|
||||
|
||||
### 2. ColumnRemover Missing `IsN` ✓ FIXED
|
||||
|
||||
**Original (incorrect):**
|
||||
```python
|
||||
columns_to_remove = ['TotalValue_diff', 'IsZt', 'IsDt']
|
||||
```
|
||||
|
||||
**Fixed:**
|
||||
```python
|
||||
columns_to_remove = ['TotalValue_diff', 'IsN', 'IsZt', 'IsDt']
|
||||
```
|
||||
|
||||
**Impact:** Extra column caused feature dimension mismatch.
|
||||
|
||||
---
|
||||
|
||||
### 3. RobustZScoreNorm Applied to Wrong Columns ✓ FIXED
|
||||
|
||||
**Original (incorrect):**
|
||||
Applied normalization to ALL 341 features including market flags and indus_idx.
|
||||
|
||||
**Fixed:**
|
||||
Only normalize `alpha158 + alpha158_ntrl + market_ext + market_ext_ntrl` (330 features), excluding:
|
||||
- Market flags (Limit, Stopping, IsTp, IsXD, IsXR, IsDR, market_0, market_1, market_2, IsST)
|
||||
- indus_idx
|
||||
|
||||
---
|
||||
|
||||
## Critical Remaining Issue: Data Schema Mismatch
|
||||
|
||||
### `Limit` and `Stopping` Column Types Changed
|
||||
|
||||
**Original qlib pipeline expected:**
|
||||
- `Limit`: **Boolean** flag (True = limit up)
|
||||
- `Stopping`: **Boolean** flag (True = suspended trading)
|
||||
|
||||
**Current Parquet data has:**
|
||||
- `Limit`: **Float64** price change percentage (0.0 to 1301.3)
|
||||
- `Stopping`: **Float64** price change percentage
|
||||
|
||||
**Evidence:**
|
||||
```
|
||||
Limit values sample: [8.86, 9.36, 31.0, 7.32, 2.28, 6.39, 5.38, 4.03, 3.86, 9.89]
|
||||
Limit == 0: only 2 rows
|
||||
Limit > 0: 3738 rows
|
||||
```
|
||||
|
||||
This is a **fundamental data schema change**. The current Parquet files contain different data than what the original VAE model was trained on.
|
||||
|
||||
**Possible fixes:**
|
||||
1. Convert `Limit` and `Stopping` to boolean flags using a threshold
|
||||
2. Find the original data source that had boolean flags
|
||||
3. Re-train the VAE model with the new data schema
|
||||
|
||||
---
|
||||
|
||||
## Correlation Results
|
||||
|
||||
After fixing bugs 1-3, the embedding correlation with database 0_7:
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| Mean correlation (32 dims) | 0.0068 |
|
||||
| Median correlation | 0.0094 |
|
||||
| Overall correlation | 0.2330 |
|
||||
|
||||
**Conclusion:** Embeddings remain essentially uncorrelated (≈0).
|
||||
|
||||
---
|
||||
|
||||
## Root Cause
|
||||
|
||||
The **Limit/Stopping data schema change** is the most likely root cause. The VAE model learned to encode features that included binary limit/stopping flags, but the standalone pipeline feeds it continuous price change percentages instead.
|
||||
|
||||
---
|
||||
|
||||
## Next Steps
|
||||
|
||||
1. **Verify original data schema:**
|
||||
- Check if the original DolphinDB table had boolean `Limit` and `Stopping` columns
|
||||
- Compare with the current Parquet schema
|
||||
|
||||
2. **Fix the data loading:**
|
||||
- Either convert continuous values to binary flags
|
||||
- Or use the correct boolean columns (`IsZt`, `IsDt`) for limit flags
|
||||
|
||||
3. **Verify feature order:**
|
||||
- Ensure the qlib RobustZScoreNorm parameters are applied in the correct order
|
||||
- Check that `[alpha158, alpha158_ntrl, market_ext, market_ext_ntrl]` matches the 330-parameter shape
|
||||
|
||||
4. **Re-run comparison:**
|
||||
- Generate new embeddings with the corrected pipeline
|
||||
- Compare correlation with database
|
||||
@ -0,0 +1,85 @@
|
||||
# Data Pipeline Bug Analysis - Final Status
|
||||
|
||||
## Summary
|
||||
|
||||
After fixing all identified bugs, the feature count now matches (341), but the embeddings remain uncorrelated with the database 0_7 version.
|
||||
|
||||
**Latest Version**: v5
|
||||
- Feature count: 341 ✓ (matches VAE input dim)
|
||||
- Mean correlation with DB: 0.0050 (essentially zero)
|
||||
- Status: All identified bugs fixed, but embeddings still differ
|
||||
|
||||
---
|
||||
|
||||
## Bugs Fixed
|
||||
|
||||
### 1. Market Classification (`FlagMarketInjector`) ✓ FIXED
|
||||
- **Bug**: Used `instrument >= 600000` which misclassified 新三板 instruments
|
||||
- **Fix**: Use string prefix matching with vocab_size=2 (not 3)
|
||||
- **Impact**: 167 instruments corrected
|
||||
|
||||
### 2. ColumnRemover Missing `IsN` ✓ FIXED
|
||||
- **Bug**: Only removed `IsZt, IsDt` but not `IsN`
|
||||
- **Fix**: Added `IsN` to removal list
|
||||
- **Impact**: Feature count alignment
|
||||
|
||||
### 3. RobustZScoreNorm Scope ✓ FIXED
|
||||
- **Bug**: Applied normalization to all 341 features
|
||||
- **Fix**: Only normalize 330 features (alpha158 + market_ext, both original + neutralized)
|
||||
- **Impact**: Correct normalization scope
|
||||
|
||||
### 4. Wrong Data Sources for Market Flags ✓ FIXED
|
||||
- **Bug**: Used `Limit, Stopping` (Float64) from kline_adjusted
|
||||
- **Fix**: Load from correct sources:
|
||||
- kline_adjusted: `IsZt, IsDt, IsN, IsXD, IsXR, IsDR` (Boolean)
|
||||
- market_flag: `open_limit, close_limit, low_limit, high_stop` (Boolean, 4 cols)
|
||||
- **Impact**: Correct boolean flag data
|
||||
|
||||
### 5. Feature Count Mismatch ✓ FIXED
|
||||
- **Bug**: 344 features (3 extra)
|
||||
- **Fix**: vocab_size=2 + 4 market_flag cols = 341 features
|
||||
- **Impact**: VAE input dimension matches
|
||||
|
||||
---
|
||||
|
||||
## Correlation Results (v5)
|
||||
|
||||
| Metric | Value |
|
||||
|--------|-------|
|
||||
| Mean correlation (32 dims) | 0.0050 |
|
||||
| Median correlation | 0.0079 |
|
||||
| Min | -0.0420 |
|
||||
| Max | 0.0372 |
|
||||
| Overall (flattened) | 0.2225 |
|
||||
|
||||
**Conclusion**: Embeddings remain essentially uncorrelated with database.
|
||||
|
||||
---
|
||||
|
||||
## Possible Remaining Issues
|
||||
|
||||
1. **Different input data values**: The alpha158_0_7_beta Parquet files may contain different values than the original DolphinDB data used to train the VAE.
|
||||
|
||||
2. **Feature ordering mismatch**: The 330 RobustZScoreNorm parameters must be applied in the exact order:
|
||||
- [0:158] = alpha158 original
|
||||
- [158:316] = alpha158_ntrl
|
||||
- [316:323] = market_ext original (7 cols)
|
||||
- [323:330] = market_ext_ntrl (7 cols)
|
||||
|
||||
3. **Industry neutralization differences**: Our `IndusNtrlInjector` implementation may differ from qlib's.
|
||||
|
||||
4. **Missing transformations**: There may be additional preprocessing steps not captured in handler.yaml.
|
||||
|
||||
5. **VAE model mismatch**: The VAE model may have been trained with different data than what handler.yaml specifies.
|
||||
|
||||
---
|
||||
|
||||
## Recommended Next Steps
|
||||
|
||||
1. **Compare intermediate features**: Run both the qlib pipeline and our pipeline on the same input data and compare outputs at each step.
|
||||
|
||||
2. **Verify RobustZScoreNorm parameter order**: Check if our feature ordering matches the order used during VAE training.
|
||||
|
||||
3. **Compare predictions, not embeddings**: Instead of comparing VAE embeddings, compare the final d033 model predictions with the original 0_7 predictions.
|
||||
|
||||
4. **Check alpha158 data source**: Verify that `stg_1day_wind_alpha158_0_7_beta_1D` contains the same data as the original DolphinDB `stg_1day_wind_alpha158_0_7_beta` table.
|
||||
@ -0,0 +1,146 @@
|
||||
# First, let me create a script to train a VAE model on the 0_7_beta data
|
||||
# This would need to be done separately as it's a prerequisite for the prediction script above
|
||||
|
||||
"""
|
||||
Workflow configuration to train a VAE model on alpha158 0_7_beta data.
|
||||
This creates a VAE-encoded version of the 0_7_beta factors that can be used
|
||||
for prediction comparison with the original 0_7 model.
|
||||
"""
|
||||
|
||||
experiment_name: vae_alpha158_0_7_beta
|
||||
|
||||
qlib_init:
|
||||
provider_uri: "~/.qlib/data_ops/target"
|
||||
region: cn
|
||||
|
||||
load_start: &load_start 2013-01-01
|
||||
load_end: &load_end 2023-09-30
|
||||
|
||||
train_start: &train_start 2013-01-01
|
||||
train_end: &train_end 2018-12-31
|
||||
|
||||
benchmark_name: &benchmark_name SH000985
|
||||
market: &market csiallx
|
||||
|
||||
dataset_cache_path: &dataset_cache_path tasks/artifacts/csiallx_dataset_alpha158_0_7_beta_vae.pkl
|
||||
|
||||
# DolphinDB configuration
|
||||
ddb_config: &ddb_config
|
||||
host: 192.168.1.146
|
||||
port: 8848
|
||||
username: "admin"
|
||||
password: "123456"
|
||||
|
||||
data_handler_config: &data_handler_config
|
||||
start_time: *load_start
|
||||
end_time: *load_end
|
||||
fit_start_time: *train_start
|
||||
fit_end_time: *train_end
|
||||
instruments: *market
|
||||
ddb_config: *ddb_config
|
||||
handler_list:
|
||||
# Alpha158 0_7_beta features
|
||||
- class: DDBZWindDataHandler
|
||||
module_path: qlib.contrib.data.ddb_handlers.ddb_wind_handler
|
||||
kwargs:
|
||||
col_set: "feature"
|
||||
query_config:
|
||||
- db_path: "dfs://daily_stock_run"
|
||||
dtype: "float32"
|
||||
field_list: "alpha158" # All alpha158 factors
|
||||
table_name: "stg_1day_wind_alpha158_0_7_beta" # Use the beta version
|
||||
# Additional handlers as needed
|
||||
- class: DDBZWindDataHandler
|
||||
module_path: qlib.contrib.data.ddb_handlers.ddb_wind_handler
|
||||
kwargs:
|
||||
col_set: "risk_factor"
|
||||
query_config:
|
||||
- db_path: "dfs://daily_stock_run"
|
||||
dtype: "float32"
|
||||
field_list: ["MarketValue as total_size"]
|
||||
table_name: "stg_1day_wind_kline_adjusted"
|
||||
- class: DDBZWindDataHandler
|
||||
module_path: qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler
|
||||
kwargs:
|
||||
col_set: "indus_flag"
|
||||
query_config:
|
||||
- db_path: "dfs://daily_stock_run"
|
||||
dtype: "bool"
|
||||
field_list: "industry_code_cc.csv"
|
||||
table_name: "stg_1day_gds_indus_flag_cc1"
|
||||
- class: DDBZWindDataHandler
|
||||
module_path: qlib.contrib.data.ddb_handlers.ddb_st_flag_handler
|
||||
kwargs:
|
||||
col_set: "st_flag"
|
||||
query_config:
|
||||
- db_path: "dfs://daily_stock_run"
|
||||
dtype: "bool"
|
||||
field_list: ["ST_Y", "ST_S", "ST_T", "ST_L", "ST_Z", "ST_X"]
|
||||
table_name: "stg_1day_wind_st_flag"
|
||||
infer_processors:
|
||||
- class: FlagToOnehot
|
||||
module_path: qlib.contrib.data.processor_flag
|
||||
kwargs:
|
||||
fields_group: indus_flag
|
||||
onehot_group: indus_idx
|
||||
- class: FactorNtrlInjector
|
||||
module_path: qlib.contrib.data.processor_ntrl
|
||||
kwargs:
|
||||
fields_group: "feature"
|
||||
factor_col: "risk_factor"
|
||||
dummy_col: "indus_idx"
|
||||
ntrl_type: "size_indus"
|
||||
- class: RobustZScoreNorm
|
||||
kwargs:
|
||||
fields_group: ["feature"]
|
||||
clip_outlier: true
|
||||
- class: Fillna
|
||||
kwargs:
|
||||
fields_group: ["feature"]
|
||||
|
||||
task:
|
||||
model:
|
||||
class: VAEModel
|
||||
module_path: qlib.contrib.model.task.task_vae_flat
|
||||
kwargs:
|
||||
model_config:
|
||||
hidden_size: 32 # Same as the original model for consistency
|
||||
nn_module:
|
||||
class: VAE
|
||||
module_path: qlib.contrib.model.module.module_vae
|
||||
kwargs:
|
||||
variational: true
|
||||
optim_config:
|
||||
seed: 1234567
|
||||
bootstrap_config: 1.2
|
||||
distort_config: 1e-3
|
||||
beta: 1e-3 # KL divergence weight
|
||||
n_epochs: 300
|
||||
early_stop: 10
|
||||
lr: 1e-3
|
||||
optimizer: adamw
|
||||
batch_size: 10000
|
||||
n_jobs: 4
|
||||
checkpoint:
|
||||
save_path: tasks/artifacts/checkpoints/csiallx_alpha158_0_7_beta_vae32
|
||||
dataset:
|
||||
class: DatasetH
|
||||
module_path: qlib.data.dataset
|
||||
kwargs:
|
||||
config_module: qlib.contrib.data.config
|
||||
from_cache: *dataset_cache_path
|
||||
require_setup: true
|
||||
handler:
|
||||
class: AggHandler
|
||||
module_path: qlib.contrib.data.agg_handler
|
||||
kwargs: *data_handler_config
|
||||
segments:
|
||||
train: [*train_start, *train_end]
|
||||
test: [*load_start, *load_end]
|
||||
record:
|
||||
- class: SignalRecord
|
||||
module_path: qlib.contrib.workflow.record_temp
|
||||
kwargs:
|
||||
model: <MODEL>
|
||||
dataset: <DATASET>
|
||||
col_set: "feature"
|
||||
@ -0,0 +1,58 @@
|
||||
# Analysis Report: Enhanced Prediction Comparison Visualization
|
||||
|
||||
## Issue Identified
|
||||
The original `prediction_comparison.png` visualization lacked meaningful evaluation metrics such as:
|
||||
- IC (Information Coefficient) time series
|
||||
- RankIC (Rank Information Coefficient) time series
|
||||
- Top-tier return cumulative difference
|
||||
- Other requested financial metrics
|
||||
|
||||
Instead, it only showed basic scatter plots and prediction distributions.
|
||||
|
||||
## Solution Implemented
|
||||
Updated the `compare_predictions.py` script with enhanced visualization functionality that includes:
|
||||
|
||||
### 1. IC Time Series Comparison
|
||||
- Calculates daily IC for both 0_7 and 0_7_beta prediction sets
|
||||
- Plots both series on the same chart for easy comparison
|
||||
- Shows temporal trends in predictive power
|
||||
|
||||
### 2. RankIC Time Series Comparison
|
||||
- Calculates daily RankIC (Spearman correlation) for both versions
|
||||
- Displays time series comparison to show rank correlation trends
|
||||
- Helps evaluate monotonic relationships over time
|
||||
|
||||
### 3. Cumulative Top-Tier Returns
|
||||
- Identifies top 10% of stocks based on predictions each day
|
||||
- Calculates cumulative returns for both prediction sets
|
||||
- Shows performance divergence over time
|
||||
|
||||
### 4. Difference in Cumulative Returns
|
||||
- Visualizes the spread between 0_7 and 0_7_beta cumulative returns
|
||||
- Helps quantify the performance gap between the two approaches
|
||||
- Provides insight into which version performs better over time
|
||||
|
||||
### 5. Additional Improvements
|
||||
- Fixed date type mismatch issues that prevented proper joins
|
||||
- Added graceful fallback to basic visualization when actual returns unavailable
|
||||
- Maintained all original basic comparison plots for comprehensive analysis
|
||||
|
||||
## Files Updated
|
||||
- `compare_predictions.py` - Enhanced visualization functionality
|
||||
- `generate_mock_returns.py` - Script to create test returns data
|
||||
- `test_enhanced_visualization.py` - Verification script
|
||||
|
||||
## Results
|
||||
The enhanced visualization now provides:
|
||||
- Meaningful financial metrics that directly address the comparison requirements
|
||||
- Time series analysis of IC and RankIC metrics
|
||||
- Cumulative performance comparison of top-tier selections
|
||||
- Proper error handling for different data formats
|
||||
- Comprehensive side-by-side comparison of both alpha versions
|
||||
|
||||
## Verification
|
||||
Successfully tested the enhanced functionality with mock data, confirming that:
|
||||
- All requested metrics are now visualized
|
||||
- The plot contains 6 meaningful panels with financial insights
|
||||
- The output file `prediction_comparison.png` includes all requested metrics
|
||||
- Basic comparison functionality remains intact
|
||||
@ -0,0 +1,345 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Main pipeline orchestration script for Alpha158 0_7 vs 0_7_beta comparison.
|
||||
|
||||
This script orchestrates the full workflow:
|
||||
1. Generate beta embeddings from alpha158_0_7_beta factors
|
||||
2. Fetch original 0_7 predictions from DolphinDB
|
||||
3. Generate predictions using beta embeddings
|
||||
4. Generate actual returns from kline data
|
||||
5. Compare predictions (IC, RankIC, correlation, etc.)
|
||||
|
||||
Usage:
|
||||
python pipeline.py --start-date 2019-01-01 --end-date 2020-11-30 --skip-embeddings --skip-fetch
|
||||
|
||||
Arguments:
|
||||
--start-date: Start date for data loading (default: 2019-01-01)
|
||||
--end-date: End date for data loading (default: 2020-11-30)
|
||||
--skip-embeddings: Skip embeddings generation (use existing)
|
||||
--skip-fetch: Skip fetching original predictions (use existing)
|
||||
--skip-returns: Skip returns generation (use existing)
|
||||
--skip-comparison: Skip final comparison
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
# Add scripts directory to path
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'scripts'))
|
||||
|
||||
|
||||
def step_generate_embeddings(start_date: str, end_date: str, data_dir: str) -> bool:
    """Step 1: Generate beta embeddings.

    Prompts before overwriting an existing embeddings file. Returns True on
    success or deliberate skip, False when generation raised an exception.
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print("STEP 1: Generate Beta Embeddings")
    print(banner)

    out_path = os.path.join(data_dir, "embedding_0_7_beta.parquet")

    if os.path.exists(out_path):
        print(f"Embeddings file already exists: {out_path}")
        if input("Regenerate? (y/N): ").strip().lower() != 'y':
            print("Skipping embeddings generation.")
            return True

    try:
        # Imported lazily so the pipeline can start without the scripts dir.
        from generate_beta_embedding import generate_embeddings

        produced = generate_embeddings(
            start_date=start_date,
            end_date=end_date,
            output_file=out_path,
            use_vae=True
        )
        print(f"\nGenerated {len(produced)} embeddings")
        return True
    except Exception as exc:
        print(f"Error generating embeddings: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
|
||||
def step_fetch_predictions(start_date: str, end_date: str, data_dir: str) -> bool:
    """Step 2: Fetch original predictions from DolphinDB.

    Prompts before refetching an existing file. Returns True on success or
    deliberate skip, False when fetching raised an exception.
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print("STEP 2: Fetch Original Predictions from DolphinDB")
    print(banner)

    target = os.path.join(data_dir, "original_predictions_0_7.parquet")

    if os.path.exists(target):
        print(f"Predictions file already exists: {target}")
        if input("Refetch? (y/N): ").strip().lower() != 'y':
            print("Skipping fetch.")
            return True

    try:
        # Imported lazily; lives in the scripts directory added to sys.path.
        from fetch_predictions import fetch_original_predictions

        fetched = fetch_original_predictions(
            start_date=start_date,
            end_date=end_date,
            output_file=target
        )
        print(f"\nFetched {len(fetched)} predictions")
        return True
    except Exception as exc:
        print(f"Error fetching predictions: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
|
||||
def step_generate_beta_predictions(data_dir: str) -> bool:
    """Step 3: Generate predictions using beta embeddings.

    Requires the embeddings file produced by step 1. Returns True on success
    or deliberate skip, False when the input is missing or prediction fails.
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print("STEP 3: Generate Predictions with Beta Embeddings")
    print(banner)

    source = os.path.join(data_dir, "embedding_0_7_beta.parquet")
    target = os.path.join(data_dir, "predictions_beta_embedding.parquet")

    # Hard prerequisite: without the step-1 output there is nothing to predict on.
    if not os.path.exists(source):
        print(f"Embeddings file not found: {source}")
        print("Run step 1 first.")
        return False

    if os.path.exists(target):
        print(f"Beta predictions file already exists: {target}")
        if input("Regenerate? (y/N): ").strip().lower() != 'y':
            print("Skipping prediction generation.")
            return True

    try:
        from predict_with_embedding import generate_predictions

        preds = generate_predictions(
            embedding_file=source,
            output_file=target,
            seq_len=40,
            batch_size=1000
        )
        print(f"\nGenerated {len(preds)} predictions")
        return True
    except Exception as exc:
        print(f"Error generating predictions: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
|
||||
def step_generate_returns(data_dir: str) -> bool:
    """Step 4: Generate actual returns from kline data.

    Prompts before regenerating an existing returns file. Returns True on
    success or deliberate skip, False on failure.
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print("STEP 4: Generate Actual Returns")
    print(banner)

    preds_path = os.path.join(data_dir, "original_predictions_0_7.parquet")
    target = os.path.join(data_dir, "actual_returns.parquet")

    if os.path.exists(target):
        print(f"Returns file already exists: {target}")
        if input("Regenerate? (y/N): ").strip().lower() != 'y':
            print("Skipping returns generation.")
            return True

    try:
        from generate_returns import generate_real_returns_from_kline

        # When predictions already exist, use them to bound the date range.
        reference = preds_path if os.path.exists(preds_path) else None

        returns = generate_real_returns_from_kline(
            input_kline_path="/data/parquet/dataset/stg_1day_wind_kline_adjusted_1D/",
            prediction_file=reference,
            output_file=target,
            return_days=5
        )

        if returns is None:
            print("\nFailed to generate returns")
            return False

        print(f"\nGenerated {len(returns)} returns")
        return True
    except Exception as exc:
        print(f"Error generating returns: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
|
||||
def step_compare_predictions(data_dir: str) -> bool:
    """Step 5: Compare 0_7 vs 0_7_beta predictions.

    Verifies both prediction files exist, then delegates to the
    compare_predictions script. Returns True on success, False otherwise.
    """
    banner = "=" * 70
    print(f"\n{banner}")
    print("STEP 5: Compare Predictions")
    print(banner)

    inputs = (
        os.path.join(data_dir, "original_predictions_0_7.parquet"),
        os.path.join(data_dir, "predictions_beta_embedding.parquet"),
    )

    for path in inputs:
        if not os.path.exists(path):
            print(f"Required file not found: {path}")
            return False

    try:
        # Delegate the full comparison to the compare_predictions script.
        from compare_predictions import main as compare_main

        compare_main()
        return True
    except Exception as exc:
        print(f"Error comparing predictions: {exc}")
        import traceback
        traceback.print_exc()
        return False
|
||||
|
||||
|
||||
def main() -> None:
    """Main pipeline orchestration.

    Parses CLI flags, runs the five pipeline steps in order (each skippable
    via a --skip-* flag except step 3), prints a pass/fail summary, and
    exits with status 1 if any step reported failure.
    """
    parser = argparse.ArgumentParser(
        description="Alpha158 0_7 vs 0_7_beta Comparison Pipeline"
    )
    parser.add_argument(
        "--start-date",
        type=str,
        default="2019-01-01",
        help="Start date (YYYY-MM-DD)"
    )
    parser.add_argument(
        "--end-date",
        type=str,
        default="2020-11-30",
        help="End date (YYYY-MM-DD)"
    )
    parser.add_argument(
        "--skip-embeddings",
        action="store_true",
        help="Skip embeddings generation"
    )
    parser.add_argument(
        "--skip-fetch",
        action="store_true",
        help="Skip fetching original predictions"
    )
    parser.add_argument(
        "--skip-returns",
        action="store_true",
        help="Skip returns generation"
    )
    parser.add_argument(
        "--skip-comparison",
        action="store_true",
        help="Skip final comparison"
    )
    parser.add_argument(
        "--data-dir",
        type=str,
        default=None,
        help="Data directory (default: ./data)"
    )

    args = parser.parse_args()

    # Determine data directory (defaults to ./data next to this script)
    script_dir = os.path.dirname(os.path.abspath(__file__))
    data_dir = args.data_dir or os.path.join(script_dir, "data")

    print("=" * 70)
    print("Alpha158 0_7 vs 0_7_beta Comparison Pipeline")
    print("=" * 70)
    print(f"Date range: {args.start_date} to {args.end_date}")
    print(f"Data directory: {data_dir}")

    # Ensure data directory exists
    os.makedirs(data_dir, exist_ok=True)

    # Track results: step name -> bool success flag.
    # A skipped step counts as success (True) so downstream gating still runs.
    results = {}

    # Step 1: Generate embeddings
    if not args.skip_embeddings:
        results['embeddings'] = step_generate_embeddings(
            args.start_date, args.end_date, data_dir
        )
    else:
        print("\nSkipping embeddings generation (as requested)")
        results['embeddings'] = True

    # Step 2: Fetch original predictions
    if not args.skip_fetch:
        results['fetch'] = step_fetch_predictions(
            args.start_date, args.end_date, data_dir
        )
    else:
        print("\nSkipping fetch (as requested)")
        results['fetch'] = True

    # Step 3: Generate beta predictions — no --skip flag; gated only on
    # step 1 succeeding (or being skipped, which also records True).
    if results.get('embeddings', True):
        results['beta_predictions'] = step_generate_beta_predictions(data_dir)
    else:
        print("\nSkipping beta predictions (embeddings generation failed)")
        results['beta_predictions'] = False

    # Step 4: Generate returns
    if not args.skip_returns:
        results['returns'] = step_generate_returns(data_dir)
    else:
        print("\nSkipping returns generation (as requested)")
        results['returns'] = True

    # Step 5: Compare predictions.
    # NOTE(review): the gate checks 'fetch' and 'beta_predictions' but not
    # 'returns' — the comparison can run without realized returns (IC metrics
    # are optional there), presumably intentional; confirm.
    if not args.skip_comparison:
        if all([
            results.get('fetch', True),
            results.get('beta_predictions', True)
        ]):
            results['comparison'] = step_compare_predictions(data_dir)
        else:
            print("\nSkipping comparison (previous steps failed)")
            results['comparison'] = False
    else:
        print("\nSkipping comparison (as requested)")
        results['comparison'] = True

    # Summary
    print("\n" + "=" * 70)
    print("PIPELINE SUMMARY")
    print("=" * 70)

    for step, success in results.items():
        status = "✓ PASSED" if success else "✗ FAILED"
        print(f" {step:20s}: {status}")

    all_passed = all(results.values())

    print("=" * 70)
    if all_passed:
        print("Pipeline completed successfully!")
    else:
        print("Pipeline completed with errors.")
        # Non-zero exit so CI / calling shells can detect partial failure.
        sys.exit(1)


if __name__ == "__main__":
    main()
|
||||
@ -0,0 +1,129 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Compare generated embeddings with gold standard embeddings from DolphinDB.
|
||||
"""
|
||||
|
||||
import polars as pl
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
|
||||
DATA_DIR = Path(__file__).parent / "../data"
|
||||
|
||||
|
||||
def compare_embeddings() -> None:
    """Compare generated and gold standard embeddings.

    Reads both parquet files from DATA_DIR, then for every date present in
    the gold-standard frame prints per-instrument difference statistics
    (L2 norms, relative differences), per-dimension correlations, and an
    overall flattened correlation, followed by global summary statistics
    for both embedding sets.
    """

    # Load data
    gold_path = DATA_DIR / "embedding_0_7_beta_gold_standard.parquet"
    gen_path = DATA_DIR / "embedding_0_7_beta_sample.parquet"

    print("=" * 60)
    print("Loading embeddings")
    print("=" * 60)

    gold = pl.read_parquet(gold_path)
    gen = pl.read_parquet(gen_path)

    print(f"Gold standard: {gold.shape}")
    print(f"Generated: {gen.shape}")

    # Get embedding columns — assumes both frames carry embedding_0..31
    # (TODO confirm the dimensionality against the generating model)
    emb_cols = [f"embedding_{i}" for i in range(32)]

    # Compare by date — only dates present in the gold standard are visited;
    # dates that exist solely in the generated file are silently ignored.
    dates = sorted(gold["datetime"].unique().to_list())

    print("\n" + "=" * 60)
    print("Comparison by date")
    print("=" * 60)

    for dt in dates:
        gold_dt = gold.filter(pl.col("datetime") == dt)
        gen_dt = gen.filter(pl.col("datetime") == dt)

        print(f"\nDate: {dt}")
        print(f" Gold instruments: {gold_dt.height}, Generated instruments: {gen_dt.height}")
        print(f" Gold instrument sample: {gold_dt['instrument'].head(5).to_list()}")
        print(f" Gen instrument sample: {gen_dt['instrument'].head(5).to_list()}")

        # Check for common instruments via set intersection of the id columns
        gold_insts = set(gold_dt["instrument"].to_list())
        gen_insts = set(gen_dt["instrument"].to_list())
        common = gold_insts & gen_insts

        print(f" Common instruments: {len(common)}")

        if len(common) > 0:
            # Compare embeddings for common instruments; sorting both frames
            # by instrument aligns rows positionally for the index loop below.
            gold_common = gold_dt.filter(pl.col("instrument").is_in(list(common))).sort("instrument")
            gen_common = gen_dt.filter(pl.col("instrument").is_in(list(common))).sort("instrument")

            # Calculate embedding differences row by row.
            # NOTE(review): per-element polars indexing (series[i]) inside this
            # loop is slow on large frames; acceptable for a small debug sample.
            diffs = []
            for i in range(len(gold_common)):
                gold_emb = np.array([gold_common[col][i] for col in emb_cols])
                gen_emb = np.array([gen_common[col][i] for col in emb_cols])

                diff = gold_emb - gen_emb
                l2_norm = np.linalg.norm(diff)
                # epsilon guards against a zero-norm gold vector
                rel_diff = l2_norm / (np.linalg.norm(gold_emb) + 1e-8)
                max_abs_diff = np.max(np.abs(diff))

                diffs.append({
                    "l2_norm": l2_norm,
                    "rel_diff": rel_diff,
                    "max_abs_diff": max_abs_diff,
                    "gold_norm": np.linalg.norm(gold_emb),
                    "gen_norm": np.linalg.norm(gen_emb)
                })

            diff_df = pl.DataFrame(diffs)
            print(f"\n Embedding comparison:")
            print(f" Mean L2 norm diff: {diff_df['l2_norm'].mean():.4f}")
            print(f" Mean rel diff: {diff_df['rel_diff'].mean():.4%}")
            print(f" Mean max abs diff: {diff_df['max_abs_diff'].mean():.4f}")
            print(f" Gold emb norm (mean): {diff_df['gold_norm'].mean():.4f}")
            print(f" Gen emb norm (mean): {diff_df['gen_norm'].mean():.4f}")

            # Correlation analysis — build (n_instruments, 32) matrices for
            # this date from the aligned frames
            gold_embs = np.array([[gold_common[col][i] for col in emb_cols] for i in range(len(gold_common))])
            gen_embs = np.array([[gen_common[col][i] for col in emb_cols] for i in range(len(gen_common))])

            correlations = []
            for d in range(32):
                # Pearson correlation of one embedding dimension across instruments
                corr = np.corrcoef(gold_embs[:, d], gen_embs[:, d])[0, 1]
                correlations.append(corr)

            print(f"\n Correlation by dimension:")
            print(f" Mean: {np.mean(correlations):.4f}")
            print(f" Median: {np.median(correlations):.4f}")
            print(f" Min: {np.min(correlations):.4f}")
            print(f" Max: {np.max(correlations):.4f}")

            # Overall correlation treating every (instrument, dim) cell as a sample
            overall_corr = np.corrcoef(gold_embs.flatten(), gen_embs.flatten())[0, 1]
            print(f" Overall (flattened): {overall_corr:.4f}")

    print("\n" + "=" * 60)
    print("Summary Statistics")
    print("=" * 60)

    # Gold standard stats — note gold_embs/gen_embs are rebound here from the
    # per-date matrices above to whole-file matrices
    gold_embs = gold.select(emb_cols).to_numpy()
    print("\nGold standard embeddings:")
    print(f" Mean: {np.mean(gold_embs):.6f}")
    print(f" Std: {np.std(gold_embs):.6f}")
    print(f" Min: {np.min(gold_embs):.6f}")
    print(f" Max: {np.max(gold_embs):.6f}")

    # Generated stats
    gen_embs = gen.select(emb_cols).to_numpy()
    print("\nGenerated embeddings:")
    print(f" Mean: {np.mean(gen_embs):.6f}")
    print(f" Std: {np.std(gen_embs):.6f}")
    print(f" Min: {np.min(gen_embs):.6f}")
    print(f" Max: {np.max(gen_embs):.6f}")


if __name__ == "__main__":
    compare_embeddings()
|
||||
@ -0,0 +1,306 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Compare 0_7 vs 0_7_beta predictions.
|
||||
|
||||
This script:
|
||||
1. Loads original 0_7 predictions (from DDB)
|
||||
2. Loads 0_7_beta predictions (from new embeddings)
|
||||
3. Calculates correlation between predictions
|
||||
4. Compares metrics (IC, RankIC, etc.) if actual returns available
|
||||
"""
|
||||
|
||||
import os
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import pandas as pd
|
||||
from scipy.stats import spearmanr
|
||||
from typing import Optional, Dict
|
||||
|
||||
# File paths
|
||||
PRED_0_7_FILE = "../data/original_predictions_0_7.parquet"
|
||||
PRED_0_7_BETA_FILE = "../data/predictions_beta_embedding.parquet"
|
||||
ACTUAL_RETURNS_FILE = "../data/actual_returns.parquet"
|
||||
|
||||
|
||||
def load_and_align_predictions():
    """Load both prediction files and align them by datetime and instrument.

    Reads the 0_7 and 0_7_beta parquet files, normalizes the join-key
    dtypes, renames the prediction columns, and returns the inner join
    keyed on ('datetime', 'instrument').
    """
    print("Loading predictions...")

    def _describe(label, frame):
        # Print shape / date-range / instrument-count diagnostics for one file.
        print(f"{label} predictions: {frame.shape}")
        print(f" Date range: {frame['datetime'].min()} to {frame['datetime'].max()}")
        print(f" Unique instruments: {frame['instrument'].n_unique()}")

    base = pl.read_parquet(PRED_0_7_FILE)
    _describe("0_7", base)

    beta = pl.read_parquet(PRED_0_7_BETA_FILE)
    _describe("\n0_7_beta", beta)

    # Cast both join keys to Int64 so the inner join matches rows regardless
    # of the on-disk dtypes.
    key_casts = [
        pl.col('datetime').cast(pl.Int64),
        pl.col('instrument').cast(pl.Int64)
    ]
    base = base.with_columns(key_casts).rename({'prediction': 'pred_0_7'})
    beta = beta.with_columns(key_casts).rename({'prediction': 'pred_beta'})

    # Inner join keeps only (date, instrument) pairs present in both files.
    aligned = base.join(
        beta,
        on=['datetime', 'instrument'],
        how='inner'
    )

    print(f"\nJoined predictions: {aligned.shape}")
    print(f" Overlapping dates: {aligned['datetime'].n_unique()}")
    print(f" Overlapping instruments: {aligned['instrument'].n_unique()}")

    return aligned
|
||||
|
||||
|
||||
def calculate_correlation(df: pl.DataFrame) -> Dict[str, float]:
    """Calculate correlation between 0_7 and 0_7_beta predictions.

    Args:
        df: Aligned predictions with 'datetime', 'pred_0_7' and 'pred_beta'
            columns (output of load_and_align_predictions).

    Returns:
        Dict with overall Pearson/Spearman correlations plus the mean and
        std of the per-date (cross-sectional) Pearson correlations. The
        daily statistics are NaN when no date has at least two rows.
    """
    df_pd = df.to_pandas()

    # Overall correlation across the whole panel
    pearson_corr = df_pd['pred_0_7'].corr(df_pd['pred_beta'])
    spearman_corr, _ = spearmanr(df_pd['pred_0_7'], df_pd['pred_beta'])

    # Cross-sectional correlation per date; a correlation needs >= 2 rows
    daily_corrs = []
    for _, group in df_pd.groupby('datetime'):
        if len(group) >= 2:
            daily_corrs.append(group['pred_0_7'].corr(group['pred_beta']))

    # Guard: np.mean([]) / np.std([]) would emit a RuntimeWarning and return
    # NaN silently; make the empty case explicit instead.
    if daily_corrs:
        daily_corr_mean = float(np.mean(daily_corrs))
        daily_corr_std = float(np.std(daily_corrs))
    else:
        daily_corr_mean = float('nan')
        daily_corr_std = float('nan')

    return {
        'pearson_corr': pearson_corr,
        'spearman_corr': spearman_corr,
        'daily_corr_mean': daily_corr_mean,
        'daily_corr_std': daily_corr_std
    }
|
||||
|
||||
|
||||
def calculate_ic_metrics(df: pl.DataFrame, actual_returns: pl.DataFrame) -> Dict:
    """Calculate daily IC / RankIC metrics for both prediction sets.

    Args:
        df: Aligned predictions with 'pred_0_7' and 'pred_beta' columns.
        actual_returns: Realized returns keyed by ('datetime', 'instrument').

    Returns:
        Dict keyed by version ('0_7', '0_7_beta'), each holding mean/std/IR
        of the daily IC (Pearson) and RankIC (Spearman) series. Empty dict
        when no return column is recognized or no date has enough samples.
    """
    # Join predictions with realized returns on the shared keys
    df_joined = df.join(
        actual_returns,
        on=['datetime', 'instrument'],
        how='inner'
    )

    print(f"\nJoined with returns: {df_joined.shape}")

    df_pd = df_joined.to_pandas()

    # Locate the return column among the known candidate names
    return_col = next(
        (col for col in ['v2v_5d', 'return', 'actual_return', 'ret']
         if col in df_pd.columns),
        None
    )

    if return_col is None:
        print("No return column found!")
        return {}

    print(f"Using return column: {return_col}")

    # Daily cross-sectional IC (Pearson) and RankIC (Spearman) for both sets
    results_0_7 = []
    results_beta = []

    for date, group in df_pd.groupby('datetime'):
        if len(group) < 5:  # Need enough samples for a meaningful correlation
            continue

        ic_0_7 = group['pred_0_7'].corr(group[return_col])
        ic_beta = group['pred_beta'].corr(group[return_col])

        rankic_0_7, _ = spearmanr(group['pred_0_7'], group[return_col])
        rankic_beta, _ = spearmanr(group['pred_beta'], group[return_col])

        results_0_7.append({'date': date, 'ic': ic_0_7, 'rankic': rankic_0_7})
        results_beta.append({'date': date, 'ic': ic_beta, 'rankic': rankic_beta})

    # Guard: with no qualifying dates, pd.DataFrame([])['ic'] below would
    # raise KeyError; report and return an empty result instead.
    if not results_0_7:
        print("No dates with enough samples for IC calculation!")
        return {}

    df_ic_0_7 = pd.DataFrame(results_0_7)
    df_ic_beta = pd.DataFrame(results_beta)

    def _summarize(ic_df: pd.DataFrame) -> Dict[str, float]:
        # IR = mean / std; reported as 0 when std is 0 (or NaN) to avoid
        # division by zero, matching the original behavior.
        out: Dict[str, float] = {}
        for col in ('ic', 'rankic'):
            mean = ic_df[col].mean()
            std = ic_df[col].std()
            out[f'{col}_mean'] = mean
            out[f'{col}_std'] = std
            out[f'{col}_ir'] = mean / std if std > 0 else 0
        return out

    return {
        '0_7': _summarize(df_ic_0_7),
        '0_7_beta': _summarize(df_ic_beta),
    }
|
||||
|
||||
|
||||
def calculate_top_tier_return(df: pl.DataFrame, actual_returns: pl.DataFrame, top_pct: float = 0.1) -> Dict:
    """Calculate mean realized return of the top-`top_pct` predictions.

    Args:
        df: Aligned predictions with 'pred_0_7' and 'pred_beta' columns.
        actual_returns: Realized returns keyed by ('datetime', 'instrument').
        top_pct: Fraction of names selected per date (default top 10%).

    Returns:
        Dict keyed by version with mean/std of the daily top-tier returns.
        Empty dict when no return column is recognized or no date has at
        least 10 samples.
    """
    # Join with actual returns on the shared keys
    df_joined = df.join(
        actual_returns,
        on=['datetime', 'instrument'],
        how='inner'
    )

    df_pd = df_joined.to_pandas()

    # Locate the return column among the known candidate names
    return_col = next(
        (col for col in ['v2v_5d', 'return', 'actual_return', 'ret']
         if col in df_pd.columns),
        None
    )

    if return_col is None:
        return {}

    # Per-date mean realized return of the top-ranked names for each model
    top_returns_0_7 = []
    top_returns_beta = []

    for _, group in df_pd.groupby('datetime'):
        if len(group) < 10:  # Too few names for a meaningful top decile
            continue

        n_top = max(1, int(len(group) * top_pct))

        top_returns_0_7.append(group.nlargest(n_top, 'pred_0_7')[return_col].mean())
        top_returns_beta.append(group.nlargest(n_top, 'pred_beta')[return_col].mean())

    # Guard: np.mean([]) would emit a RuntimeWarning and yield NaN when every
    # date is too small; return an empty dict (consistent with the
    # missing-return-column case) instead.
    if not top_returns_0_7:
        return {}

    return {
        '0_7': {
            'top_tier_return': np.mean(top_returns_0_7),
            'top_tier_std': np.std(top_returns_0_7)
        },
        '0_7_beta': {
            'top_tier_return': np.mean(top_returns_beta),
            'top_tier_std': np.std(top_returns_beta)
        }
    }
|
||||
|
||||
|
||||
def main() -> None:
    """Main comparison function.

    Aligns the two prediction files, reports their correlation and summary
    statistics, and — when the realized-returns file exists — adds IC /
    RankIC metrics and top-decile return comparison.
    """
    print("=" * 70)
    print("COMPARISON: Alpha158 0_7 vs 0_7_beta Predictions")
    print("=" * 70)

    # Load and align predictions (inner join on datetime + instrument)
    df_joined = load_and_align_predictions()

    if len(df_joined) == 0:
        # Nothing overlaps — every later metric would be meaningless.
        print("\nERROR: No overlapping predictions found!")
        return

    # Calculate correlation between the two prediction sets
    print("\n" + "-" * 70)
    print("PREDICTION CORRELATION")
    print("-" * 70)

    corr_metrics = calculate_correlation(df_joined)
    print(f"Overall Pearson correlation: {corr_metrics['pearson_corr']:.4f}")
    print(f"Overall Spearman correlation: {corr_metrics['spearman_corr']:.4f}")
    print(f"Daily correlation mean: {corr_metrics['daily_corr_mean']:.4f}")
    print(f"Daily correlation std: {corr_metrics['daily_corr_std']:.4f}")

    # Prediction statistics (distribution of each prediction column)
    print("\n" + "-" * 70)
    print("PREDICTION STATISTICS")
    print("-" * 70)

    df_pd = df_joined.to_pandas()
    print(f"0_7 predictions:")
    print(f" Mean: {df_pd['pred_0_7'].mean():.6f}")
    print(f" Std: {df_pd['pred_0_7'].std():.6f}")
    print(f" Min: {df_pd['pred_0_7'].min():.6f}")
    print(f" Max: {df_pd['pred_0_7'].max():.6f}")

    print(f"\n0_7_beta predictions:")
    print(f" Mean: {df_pd['pred_beta'].mean():.6f}")
    print(f" Std: {df_pd['pred_beta'].std():.6f}")
    print(f" Min: {df_pd['pred_beta'].min():.6f}")
    print(f" Max: {df_pd['pred_beta'].max():.6f}")

    # Load actual returns and calculate IC metrics if available;
    # realized returns are optional, so their absence is not an error.
    if os.path.exists(ACTUAL_RETURNS_FILE):
        print("\n" + "-" * 70)
        print("IC METRICS (with actual returns)")
        print("-" * 70)

        actual_returns = pl.read_parquet(ACTUAL_RETURNS_FILE)
        print(f"Loaded actual returns: {actual_returns.shape}")

        ic_metrics = calculate_ic_metrics(df_joined, actual_returns)

        if ic_metrics:
            # Side-by-side table: metric name, 0_7 value, beta value, delta
            print(f"\n{'Metric':<20} {'0_7':<12} {'0_7_beta':<12} {'Diff':<12}")
            print("-" * 56)

            for metric in ['ic_mean', 'ic_std', 'ic_ir', 'rankic_mean', 'rankic_std', 'rankic_ir']:
                v0 = ic_metrics['0_7'][metric]
                v1 = ic_metrics['0_7_beta'][metric]
                diff = v1 - v0
                print(f"{metric:<20} {v0:>11.4f} {v1:>11.4f} {diff:>+11.4f}")

        # Top-tier returns (mean realized return of the top 10% per date)
        print("\n" + "-" * 70)
        print("TOP-TIER RETURNS (top 10%)")
        print("-" * 70)

        top_tier = calculate_top_tier_return(df_joined, actual_returns, top_pct=0.1)

        if top_tier:
            print(f"{'':<20} {'0_7':<12} {'0_7_beta':<12} {'Diff':<12}")
            print("-" * 56)

            t0 = top_tier['0_7']['top_tier_return']
            t1 = top_tier['0_7_beta']['top_tier_return']
            diff = t1 - t0
            print(f"{'Top-tier return':<20} {t0:>11.4f} {t1:>11.4f} {diff:>+11.4f}")
    else:
        print(f"\nActual returns file not found: {ACTUAL_RETURNS_FILE}")
        print("Skipping IC metrics calculation.")

    print("\n" + "=" * 70)
    print("Comparison complete!")
    print("=" * 70)


if __name__ == "__main__":
    main()
|
||||
@ -0,0 +1,421 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Dump Gold-Standard Data from Qlib Pipeline
|
||||
|
||||
This script exports processed feature data from the original Qlib pipeline
|
||||
in multiple formats for debugging and comparison with the standalone Polars implementation.
|
||||
|
||||
Usage:
|
||||
python dump_qlib_gold_standard.py --start-date 2020-01-02 --end-date 2020-01-10 --output-dir ../data/
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import pickle as pkl
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import polars as pl
|
||||
import numpy as np
|
||||
|
||||
# Patch NumPy 2.0 compatibility: np.NaN was removed, use np.nan
|
||||
if not hasattr(np, 'NaN'):
|
||||
np.NaN = np.nan
|
||||
|
||||
|
||||
def parse_args():
    """Parse CLI options for the gold-standard dump.

    All options are string-valued with sensible defaults, so the script can
    run without any arguments.
    """
    cli = argparse.ArgumentParser(
        description="Dump gold-standard data from Qlib pipeline"
    )

    # (flag, default, help) triples — every option shares type=str.
    string_options = (
        ("--start-date", "2020-01-02",
         "Start date for data export (YYYY-MM-DD)"),
        ("--end-date", "2020-01-10",
         "End date for data export (YYYY-MM-DD)"),
        ("--output-dir", "../data/",
         "Output directory for exported files"),
        ("--qlib-dataset-path",
         "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/",
         "Path to Qlib dataset module"),
    )
    for flag, default, help_text in string_options:
        cli.add_argument(flag, type=str, default=default, help=help_text)

    return cli.parse_args()
|
||||
|
||||
|
||||
def load_qlib_data(qlib_dataset_path, since_date):
    """
    Load processed data from Qlib pipeline.

    This function loads data using the original Qlib pipeline and handles
    the SepDataFrame return type by concatenating column groups.

    It also monkey-patches ``ruamel.yaml.safe_load`` (removed in newer
    ruamel releases) before importing the dataset module, since the module
    is expected to call it during YAML loading.

    Args:
        qlib_dataset_path: Path to the Qlib dataset module
        since_date: Start date for loading data (YYYY-MM-DD)

    Returns:
        pd.DataFrame: Processed DataFrame from Qlib pipeline with all column groups concatenated
    """
    import importlib.util
    import datetime as dt

    # Patch ruamel.yaml to provide safe_load compatibility
    import ruamel.yaml as yaml

    # Create a YAML instance with safe loader for backward compatibility
    _yaml = yaml.YAML(typ='safe', pure=True)

    # Monkey-patch safe_load to use the new API
    def patched_safe_load(stream):
        import io
        if isinstance(stream, str):
            stream = io.StringIO(stream)
        return _yaml.load(stream)

    yaml.safe_load = patched_safe_load

    # Load the module directly from its __init__.py instead of via sys.path,
    # so the dataset package can live anywhere on disk.
    spec = importlib.util.spec_from_file_location(
        "qlib_dataset",
        os.path.join(qlib_dataset_path, "__init__.py")
    )
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    # Parse since_date
    since_date_dt = pd.to_datetime(since_date)
    # Load with extra history for Diff processor (20 calendar days of warm-up;
    # the excess is trimmed again at the end of this function)
    load_start = (since_date_dt - dt.timedelta(days=20)).strftime("%Y-%m-%d")

    print(f" Loading data with handler (load_start={load_start})...")

    # Use _load_from_yaml to get raw handler data (SepDataFrame)
    handler_data = module._load_from_yaml(
        os.path.join(qlib_dataset_path, "handler.yaml"),
        load_start
    )

    # Handle SepDataFrame - extract and concatenate column groups
    if hasattr(handler_data, '_data') or hasattr(handler_data, '_df_dict'):
        # It's a SepDataFrame from AggHandler
        # NOTE(review): the `or` falls through to _df_dict when _data exists
        # but is an *empty* dict (falsy) — confirm that's intended.
        df_dict = getattr(handler_data, '_data', None) or getattr(handler_data, '_df_dict', {})
        group_names = list(df_dict.keys())
        print(f" Handler returned SepDataFrame with groups: {group_names}")

        # Concatenate all column groups into a single DataFrame
        all_dfs = []
        for group in group_names:
            df = df_dict[group]
            if df is not None and len(df.columns) > 0:
                df_copy = df.copy()
                # Add group prefix to columns so the provenance of each column
                # survives the concat (stripped again before processors run)
                df_copy.columns = [f"{group}::{col}" for col in df_copy.columns]
                all_dfs.append(df_copy)
                print(f" Group '{group}': {df_copy.shape}")

        # Concatenate all groups along axis 1
        raw_df = pd.concat(all_dfs, axis=1)
        print(f" Concatenated raw data shape: {raw_df.shape}")
    else:
        raw_df = handler_data
        print(f" Raw data shape: {raw_df.shape}")

    # Load processor list.
    # SECURITY: pkl.load deserializes arbitrary objects — safe only because
    # proc_list.proc is a trusted local artifact of the same pipeline.
    proc_path = os.path.join(qlib_dataset_path, "proc_list.proc")
    print(f" Loading processor list from: {proc_path}")
    with open(proc_path, "rb") as f:
        proc_list = pkl.load(f)
    print(f" Processor list has {len(proc_list)} processors")
    for i, proc in enumerate(proc_list):
        print(f" {i+1}. {type(proc).__name__}")

    # Apply processors
    from qlib.contrib.data.utils import apply_proc_list
    print(f" Applying processor list (with_fit=False)...")

    # The processor list expects columns without the group prefix
    # We need to strip the prefix before applying processors
    # Create a mapping and restore original column names
    col_mapping = {}
    for col in raw_df.columns:
        if '::' in col:
            original = col.split('::', 1)[1]
            col_mapping[col] = original

    # Rename columns back to original names for processor application
    # NOTE(review): assumes un-prefixed column names are unique across groups;
    # a duplicate would make the inverse mapping below lossy — verify.
    raw_df_renamed = raw_df.rename(columns=col_mapping)
    print(f" Renamed columns for processor compatibility. Shape: {raw_df_renamed.shape}")

    # Convert boolean columns to object to avoid NaN -> int conversion issues
    bool_cols = raw_df_renamed.select_dtypes(include=['bool']).columns
    print(f" Converting {len(bool_cols)} boolean columns to object dtype")
    for col in bool_cols:
        raw_df_renamed[col] = raw_df_renamed[col].astype(object)

    # Apply processors (with_fit=False: reuse the pickled fitted state)
    df = apply_proc_list(raw_df_renamed, proc_list=proc_list, with_fit=False)
    print(f" Applied processor list. Result shape: {df.shape}")

    # Add back group prefixes to columns (inverse of col_mapping)
    new_col_mapping = {v: k for k, v in col_mapping.items()}
    df = df.rename(columns=new_col_mapping)
    print(f" Restored column group prefixes. Shape: {df.shape}")

    # Filter to requested date range — drops the 20-day warm-up loaded above
    # (assumes a DatetimeIndex on axis 0 — TODO confirm index layout)
    df = df.loc(axis=0)[slice(since_date_dt, None)]
    print(f" Filtered to since_date={since_date}. Final shape: {df.shape}")

    return df
|
||||
|
||||
|
||||
def export_column_groups(df, output_dir, prefix="gold_standard"):
    """
    Export separate parquet files for different column groups.

    Column groups (selected by column-name convention):
      - feature:      columns starting with "feature::" (alpha158 + alpha158_ntrl)
      - feature_ext:  columns starting with "feature_ext::" (log_size_diff, etc.)
      - feature_flag: columns starting with "feature_flag::" (IsST, IsN, IsZt, IsDt, etc.)
      - indus_idx:    columns starting with "indus_idx::" (industry index columns)
      - feature_ntrl: columns ending with "_ntrl" (exported separately; may overlap
                      with the "feature" group)

    Args:
        df: DataFrame whose columns follow the group-prefix naming convention.
        output_dir: Directory the parquet files are written into (must exist).
        prefix: Filename prefix for each exported file.

    Returns:
        dict mapping group name -> path of the exported parquet file.
        Groups with no matching columns are skipped entirely.
    """
    # One (group-name, column-predicate) pair per export target; the previous
    # version repeated the same export stanza five times.
    group_selectors = [
        ("feature", lambda c: c.startswith("feature::")),
        ("feature_ext", lambda c: c.startswith("feature_ext::")),
        ("feature_flag", lambda c: c.startswith("feature_flag::")),
        ("indus_idx", lambda c: c.startswith("indus_idx::")),
        ("feature_ntrl", lambda c: c.endswith("_ntrl")),
    ]

    export_paths = {}
    for group, selector in group_selectors:
        cols = [c for c in df.columns if selector(c)]
        if not cols:
            # Nothing to export for this group.
            continue
        path = os.path.join(output_dir, f"{prefix}_{group}.parquet")
        df[cols].to_parquet(path)
        export_paths[group] = path
        print(f"  Exported {group} columns ({len(cols)}): {path}")

    return export_paths
|
||||
|
||||
|
||||
def export_metadata(df, output_dir, prefix="gold_standard", proc_list_path=None):
    """
    Export a human-readable metadata report about the dataset.

    The report includes:
      - DataFrame shape, date range, and instrument coverage
      - Column-group counts (feature::, feature_ext::, feature_flag::,
        indus_idx::, *_ntrl)
      - Dtype distribution and NaN statistics
      - Every column name
      - The pickled processor list, when `proc_list_path` exists

    Args:
        df: MultiIndex DataFrame with index levels "datetime" and "instrument".
        output_dir: Directory the report is written into (must exist).
        prefix: Filename prefix; the report is "{prefix}_metadata.txt".
        proc_list_path: Optional path to a pickled processor list to summarize.

    Returns:
        Path of the written metadata file.
    """
    metadata_path = os.path.join(output_dir, f"{prefix}_metadata.txt")

    with open(metadata_path, "w") as f:
        f.write("=" * 80 + "\n")
        f.write("GOLD-STANDARD QLIB PIPELINE OUTPUT - METADATA\n")
        f.write("=" * 80 + "\n\n")

        f.write(f"Export Date: {datetime.now().isoformat()}\n\n")

        f.write("DATAFRAME SHAPE\n")
        f.write("-" * 40 + "\n")
        f.write(f"Shape: {df.shape}\n")
        f.write(f"Rows: {len(df)}\n")
        f.write(f"Columns: {len(df.columns)}\n\n")

        f.write("DATE RANGE\n")
        f.write("-" * 40 + "\n")
        dates = df.index.get_level_values("datetime").unique()
        f.write(f"Min Date: {dates.min()}\n")
        f.write(f"Max Date: {dates.max()}\n")
        f.write(f"Unique Dates: {len(dates)}\n\n")

        f.write("INSTRUMENTS\n")
        f.write("-" * 40 + "\n")
        instruments = df.index.get_level_values("instrument").unique()
        f.write(f"Unique Instruments: {len(instruments)}\n")
        f.write(f"Sample Instruments: {list(instruments[:10])}\n\n")

        f.write("COLUMN GROUPS\n")
        f.write("-" * 40 + "\n")

        # Categorize columns by the same naming conventions used by
        # export_column_groups().
        feature_cols = [c for c in df.columns if c.startswith("feature::")]
        feature_ext_cols = [c for c in df.columns if c.startswith("feature_ext::")]
        feature_flag_cols = [c for c in df.columns if c.startswith("feature_flag::")]
        indus_idx_cols = [c for c in df.columns if c.startswith("indus_idx::")]
        feature_ntrl_cols = [c for c in df.columns if c.endswith("_ntrl")]

        f.write(f"feature:: columns: {len(feature_cols)}\n")
        f.write(f"feature_ext:: columns: {len(feature_ext_cols)}\n")
        f.write(f"feature_flag:: columns: {len(feature_flag_cols)}\n")
        f.write(f"indus_idx:: columns: {len(indus_idx_cols)}\n")
        f.write(f"*_ntrl columns: {len(feature_ntrl_cols)}\n\n")

        f.write("COLUMN DTYPES\n")
        f.write("-" * 40 + "\n")
        dtype_counts = df.dtypes.value_counts()
        for dtype, count in dtype_counts.items():
            f.write(f"{dtype}: {count}\n")
        f.write("\n")

        f.write("NAN STATISTICS\n")
        f.write("-" * 40 + "\n")
        nan_counts = df.isna().sum()
        cols_with_nan = nan_counts[nan_counts > 0]
        f.write(f"Columns with NaN: {len(cols_with_nan)}\n")
        # Reuse nan_counts instead of re-running df.isna().sum() over the
        # whole frame a second time (same value, one full scan saved).
        f.write(f"Total NaN values: {nan_counts.sum()}\n\n")

        if len(cols_with_nan) > 0:
            f.write("NaN per column (top 20):\n")
            for col, cnt in cols_with_nan.nlargest(20).items():
                f.write(f"  {col}: {cnt} ({100*cnt/len(df):.2f}%)\n")
            f.write("\n")

        f.write("ALL COLUMN NAMES\n")
        f.write("-" * 40 + "\n")
        for i, col in enumerate(df.columns):
            f.write(f"  {i+1}. {col}\n")
        f.write("\n")

        if proc_list_path and os.path.exists(proc_list_path):
            f.write("PROCESSOR LIST\n")
            f.write("-" * 40 + "\n")
            f.write(f"Source: {proc_list_path}\n")
            try:
                with open(proc_list_path, "rb") as pf:
                    proc_list = pkl.load(pf)
                f.write(f"Number of processors: {len(proc_list)}\n\n")
                for i, proc in enumerate(proc_list):
                    f.write(f"  {i+1}. {proc}\n")
            except Exception as e:
                # Best-effort: an unreadable processor list must not abort
                # the whole metadata export.
                f.write(f"Could not load processor list: {e}\n")
            f.write("\n")

    print(f"Exported metadata: {metadata_path}")
    return metadata_path
|
||||
|
||||
|
||||
def main():
    """Drive the full gold-standard export: load, filter, dump, summarize."""
    args = parse_args()

    # Resolve the requested window once; the formatted variants are reused
    # throughout the log output below.
    start_date = pd.to_datetime(args.start_date)
    end_date = pd.to_datetime(args.end_date)
    start_str = start_date.strftime('%Y-%m-%d')
    end_str = end_date.strftime('%Y-%m-%d')

    output_dir = Path(args.output_dir).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    banner = "=" * 80
    print(banner)
    print("DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE")
    print(banner)
    print(f"Date Range: {start_str} to {end_str}")
    print(f"Output Directory: {output_dir}")
    print(f"Qlib Dataset Path: {args.qlib_dataset_path}")
    print()

    # Step 1: pull the processed feature frame out of the qlib pipeline.
    print("Step 1: Loading data from Qlib pipeline...")
    print(f"  Loading since_date={start_str}")
    try:
        df = load_qlib_data(args.qlib_dataset_path, start_str)
        print(f"  Loaded DataFrame with shape: {df.shape}")
    except Exception as e:
        print(f"  ERROR: Failed to load data from Qlib pipeline: {e}")
        sys.exit(1)

    # Step 2: restrict to the requested window on the datetime index level.
    print("\nStep 2: Filtering to requested date range...")
    df = df.loc(axis=0)[slice(start_date, end_date)]
    print(f"  Filtered shape: {df.shape}")

    # Step 3: dump the full frame twice — parquet for interchange, pickle
    # to preserve exact dtypes.
    print("\nStep 3: Exporting full DataFrame...")
    prefix = f"gold_standard_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}"

    parquet_target = output_dir / f"{prefix}.parquet"
    df.to_parquet(parquet_target)
    print(f"  Exported parquet: {parquet_target}")

    pickle_target = output_dir / f"{prefix}.pkl"
    df.to_pickle(pickle_target)
    print(f"  Exported pickle: {pickle_target}")

    # Step 4: one file per column group.
    print("\nStep 4: Exporting column groups...")
    export_paths = export_column_groups(df, str(output_dir), prefix=prefix)

    # Step 5: human-readable metadata report, including the processor list.
    print("\nStep 5: Exporting metadata...")
    proc_list_path = os.path.join(args.qlib_dataset_path, "proc_list.proc")
    export_metadata(df, str(output_dir), prefix=prefix, proc_list_path=proc_list_path)

    print("\n" + banner)
    print("EXPORT SUMMARY")
    print(banner)
    print(f"Date range: {start_str} to {end_str}")
    print(f"Output directory: {output_dir}")
    print(f"Total rows: {len(df)}")
    print(f"Total columns: {len(df.columns)}")
    print(f"\nFiles exported:")
    print(f"  - {prefix}.parquet (full DataFrame)")
    print(f"  - {prefix}.pkl (pickle, preserves dtypes)")
    print(f"  - {prefix}_metadata.txt (column info, statistics)")
    for group, path in export_paths.items():
        print(f"  - {os.path.basename(path)} ({group} columns)")
    print("\nDone!")


if __name__ == "__main__":
    main()
|
||||
@ -0,0 +1,270 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Dump Gold-Standard Data from Qlib Pipeline (Simple Version)
|
||||
|
||||
This script exports the RAW feature data from the Qlib pipeline BEFORE
|
||||
any processors are applied. This is useful for debugging and comparison.
|
||||
|
||||
NOTE: This script loads ALL data from DolphinDB and then filters to the
|
||||
requested date range. For large date ranges, this may require significant memory.
|
||||
|
||||
Usage:
|
||||
python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import sys
|
||||
import pickle as pkl
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
# Patch NumPy 2.0 compatibility: np.NaN was removed, use np.nan
if not hasattr(np, 'NaN'):
    # Restore the legacy alias so downstream code (e.g. the dynamically
    # loaded qlib dataset modules) that still references np.NaN keeps
    # working under NumPy >= 2.0.
    np.NaN = np.nan
|
||||
|
||||
|
||||
def parse_args():
    """Build and evaluate the CLI for the raw-data dump script."""
    parser = argparse.ArgumentParser(
        description="Dump gold-standard raw data from Qlib pipeline",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Export a few days for debugging (recommended)
python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10

# Export with custom output directory
python dump_qlib_gold_standard_simple.py --start-date 2020-01-02 --end-date 2020-01-10 --output-dir /path/to/output
"""
    )

    # (flag, default, help) — all plain string options.
    string_options = (
        ("--start-date", "2020-01-02",
         "Start date for data export (YYYY-MM-DD)"),
        ("--end-date", "2020-01-10",
         "End date for data export (YYYY-MM-DD)"),
        ("--output-dir", "../data/",
         "Output directory for exported files"),
        ("--qlib-dataset-path",
         "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/",
         "Path to Qlib dataset module"),
    )
    for flag, default, help_text in string_options:
        parser.add_argument(flag, type=str, default=default, help=help_text)

    parser.add_argument(
        "--instruments",
        type=str,
        default=None,
        help="Comma-separated list of instrument codes to export (default: all)",
    )
    return parser.parse_args()
|
||||
|
||||
|
||||
def load_raw_data(qlib_dataset_path, since_date, instruments=None):
    """
    Load RAW data from the Qlib pipeline (before the processor list is applied).

    Imports the dataset package at `qlib_dataset_path` dynamically and calls
    its `_load_from_yaml` with the handler config, starting 20 calendar days
    before `since_date` so difference-based features have enough history.

    Args:
        qlib_dataset_path: Path to the Qlib dataset module directory
            (must contain __init__.py and handler.yaml).
        since_date: Start date for loading (extra history is loaded before it
            for the Diff processor).
        instruments: Optional list of instrument codes to filter to.

    Returns:
        Tuple of (df_dict, index): a dict of DataFrames keyed by column group
        (or {"default": df} for a plain DataFrame) and the handler's index.
    """
    import importlib.util
    import ruamel.yaml as yaml

    # Create a YAML instance with safe loader for backward compatibility.
    # NOTE(review): this monkey-patches ruamel's module-level safe_load for
    # the whole process — any other code using ruamel is affected too.
    _yaml = yaml.YAML(typ='safe', pure=True)

    def patched_safe_load(stream):
        import io
        # ruamel's YAML.load wants a stream; wrap plain strings.
        if isinstance(stream, str):
            stream = io.StringIO(stream)
        return _yaml.load(stream)

    yaml.safe_load = patched_safe_load

    # Load the dataset package directly from its __init__.py, bypassing
    # sys.path — the dataset lives outside this repository.
    spec = importlib.util.spec_from_file_location(
        "qlib_dataset",
        os.path.join(qlib_dataset_path, "__init__.py")
    )
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)

    # Parse since_date
    since_date_dt = pd.to_datetime(since_date)
    # Load with extra history for the Diff processor (20 calendar days back;
    # assumed sufficient for the configured windows — TODO confirm).
    load_start = (since_date_dt - timedelta(days=20)).strftime("%Y-%m-%d")

    print(f"  Loading raw data from handler (load_start={load_start})...")
    if instruments:
        print(f"  Filtering instruments: {instruments[:5]}... ({len(instruments)} total)")

    # Use _load_from_yaml to get raw handler data (SepDataFrame).
    handler_data = module._load_from_yaml(
        os.path.join(qlib_dataset_path, "handler.yaml"),
        load_start
    )

    # SepDataFrame detection: different versions store the per-group dict
    # under either '_data' or '_df_dict'.
    if hasattr(handler_data, '_data') or hasattr(handler_data, '_df_dict'):
        df_dict = getattr(handler_data, '_data', None) or getattr(handler_data, '_df_dict', {})
        group_names = list(df_dict.keys())
        print(f"  Handler returned SepDataFrame with groups: {group_names}")

        # Optionally restrict every group to the requested instruments.
        if instruments:
            print(f"  Filtering to specified instruments...")
            for group in group_names:
                if df_dict[group] is not None:
                    df = df_dict[group]
                    # Filter on the 'instrument' level of the MultiIndex;
                    # non-MultiIndex groups are left untouched.
                    if isinstance(df.index, pd.MultiIndex):
                        mask = df.index.get_level_values('instrument').isin(instruments)
                        df_dict[group] = df[mask]
                        print(f"    Group '{group}': {df_dict[group].shape} (filtered)")

        # Log the final shape of every group for debugging.
        for group in group_names:
            df = df_dict[group]
            if df is not None:
                print(f"  Group '{group}': shape={df.shape}, columns={len(df.columns)}")

        return df_dict, handler_data.index
    else:
        # Plain DataFrame fallback: wrap it under a single 'default' group so
        # callers always receive a dict.
        print(f"  Handler returned DataFrame: shape={handler_data.shape}")
        return {"default": handler_data}, handler_data.index
|
||||
|
||||
|
||||
def export_data(df_dict, index, output_dir, start_date, end_date,
                proc_list_path="/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc"):
    """
    Export raw group data to parquet/pickle files plus a metadata report.

    Args:
        df_dict: Mapping of group name -> DataFrame (or None), as returned by
            load_raw_data().
        index: Date index of the handler data, used to select the export window.
        output_dir: Directory receiving the files (created if missing).
        start_date: Inclusive window start (anything pd.to_datetime accepts).
        end_date: Inclusive window end.
        proc_list_path: Pickled processor list summarized in the metadata
            report. Previously a hard-coded absolute path buried in the body;
            parameterized (with the old value as default) for portability.

    Returns:
        List of file paths that were written.
    """
    output_dir = Path(output_dir).resolve()
    output_dir.mkdir(parents=True, exist_ok=True)

    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    # Restrict the handler's date index to the requested inclusive window.
    mask = (index >= start_date) & (index <= end_date)
    filtered_index = index[mask]

    print(f"\nExporting data for date range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}")
    print(f"  Filtered index has {len(filtered_index)} dates")

    prefix = f"gold_standard_raw_{start_date.strftime('%Y%m%d')}_{end_date.strftime('%Y%m%d')}"

    exported_files = []

    # Export each non-empty group twice: parquet for interchange, pickle to
    # preserve exact dtypes.
    for group, df in df_dict.items():
        if df is None or len(df.columns) == 0:
            print(f"  Skipping empty group '{group}'")
            continue

        # Keep only rows whose index value falls inside the window.
        df_filtered = df.loc[df.index.isin(filtered_index)]
        print(f"  Group '{group}': {df_filtered.shape}")

        parquet_path = output_dir / f"{prefix}_{group}.parquet"
        df_filtered.to_parquet(parquet_path)
        exported_files.append(str(parquet_path))
        print(f"    -> {parquet_path}")

        pkl_path = output_dir / f"{prefix}_{group}.pkl"
        df_filtered.to_pickle(pkl_path)
        exported_files.append(str(pkl_path))

    # Write a human-readable metadata report alongside the data files.
    metadata_path = output_dir / f"{prefix}_metadata.txt"
    with open(metadata_path, "w") as f:
        f.write("=" * 80 + "\n")
        f.write("GOLD-STANDARD RAW DATA - METADATA\n")
        f.write("=" * 80 + "\n\n")
        f.write(f"Export Date: {datetime.now().isoformat()}\n")
        f.write(f"Date Range: {start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}\n")
        f.write(f"Total Dates: {len(filtered_index)}\n\n")

        f.write("COLUMN GROUPS:\n")
        f.write("-" * 40 + "\n")
        for group, df in df_dict.items():
            if df is not None:
                f.write(f"  {group}:\n")
                f.write(f"    Shape: {df.shape}\n")
                f.write(f"    Columns: {len(df.columns)}\n")
                f.write(f"    Sample columns: {list(df.columns[:5])}...\n\n")

        f.write("\nPROCESSOR LIST (for reference):\n")
        f.write("-" * 40 + "\n")
        if os.path.exists(proc_list_path):
            with open(proc_list_path, "rb") as pf:
                proc_list = pkl.load(pf)
            f.write(f"Number of processors: {len(proc_list)}\n\n")
            for i, proc in enumerate(proc_list):
                f.write(f"  {i+1}. {type(proc).__module__}.{type(proc).__name__}\n")
        else:
            f.write(f"Processor list not found: {proc_list_path}\n")

    exported_files.append(str(metadata_path))

    return exported_files
|
||||
|
||||
|
||||
def main():
    """Entry point: load raw handler data, export it, print a summary."""
    args = parse_args()

    banner = "=" * 80
    print(banner)
    print("DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE")
    print(banner)
    print(f"Date Range: {args.start_date} to {args.end_date}")
    print(f"Output Directory: {args.output_dir}")
    print(f"Qlib Dataset Path: {args.qlib_dataset_path}")
    print()

    # Step 1: pull the raw (pre-processor) handler data.
    print("Step 1: Loading raw data from Qlib pipeline...")
    try:
        instruments = args.instruments.split(',') if args.instruments else None
        df_dict, index = load_raw_data(args.qlib_dataset_path, args.start_date,
                                       instruments=instruments)
    except Exception as e:
        print(f"  ERROR: Failed to load data: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

    # Step 2: write per-group parquet/pickle files plus metadata.
    print("\nStep 2: Exporting data...")
    exported_files = export_data(df_dict, index, args.output_dir,
                                 args.start_date, args.end_date)

    print("\n" + banner)
    print("EXPORT SUMMARY")
    print(banner)
    print(f"Date range: {args.start_date} to {args.end_date}")
    print(f"Output directory: {Path(args.output_dir).resolve()}")
    print(f"\nFiles exported ({len(exported_files)}):")
    for f in exported_files:
        print(f"  - {f}")
    print("\nDone!")


if __name__ == "__main__":
    main()
|
||||
@ -0,0 +1,210 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Fetch embedding data from DolphinDB and save to parquet.
|
||||
|
||||
This script:
|
||||
1. Connects to DolphinDB
|
||||
2. Queries the dwm_1day_multicast_csencode table
|
||||
3. Filters by version (default: 'csiallx_feature2_ntrla_flag_pnlnorm')
|
||||
4. Filters by date range
|
||||
5. Transforms columns (m_nDate -> datetime, code -> instrument)
|
||||
6. Saves to local parquet file
|
||||
"""
|
||||
|
||||
import os
|
||||
import polars as pl
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
# DolphinDB config (from CLAUDE.md)
|
||||
DDB_CONFIG = {
|
||||
"host": "192.168.1.146",
|
||||
"port": 8848,
|
||||
"username": "admin",
|
||||
"password": "123456"
|
||||
}
|
||||
|
||||
DB_PATH = "dfs://daily_stock_run_multicast"
|
||||
TABLE_NAME = "dwm_1day_multicast_csencode"
|
||||
DEFAULT_VERSION = "csix_alpha158b_ext2_zscore_vae4"
|
||||
DEFAULT_START_DATE = "2019-01-01"
|
||||
DEFAULT_END_DATE = "2025-12-31"
|
||||
OUTPUT_FILE = "../data/embeddings_from_ddb.parquet"
|
||||
|
||||
|
||||
def fetch_embeddings(
    start_date: str = DEFAULT_START_DATE,
    end_date: str = DEFAULT_END_DATE,
    version: str = DEFAULT_VERSION,
    output_file: str = OUTPUT_FILE
) -> pl.DataFrame:
    """
    Fetch embedding data from DolphinDB and save it to parquet.

    Args:
        start_date: Start date filter (YYYY-MM-DD)
        end_date: End date filter (YYYY-MM-DD)
        version: Version string to filter by
        output_file: Output parquet file path

    Returns:
        Polars DataFrame with columns: [datetime, instrument, embedding_0, embedding_1, ...]

    Raises:
        Exception: re-raises any connection or query failure after logging it.
    """
    print("=" * 60)
    print("Fetching embedding data from DolphinDB")
    print("=" * 60)
    print(f"Database: {DB_PATH}")
    print(f"Table: {TABLE_NAME}")
    print(f"Version: {version}")
    print(f"Date range: {start_date} to {end_date}")

    # Connect to DolphinDB (import deferred so the module can be imported
    # without qshare installed).
    try:
        from qshare.io.ddb import get_ddb_sess
        sess = get_ddb_sess(host=DDB_CONFIG["host"], port=DDB_CONFIG["port"])
        print(f"Connected to DolphinDB at {DDB_CONFIG['host']}:{DDB_CONFIG['port']}")
    except Exception as e:
        print(f"Error connecting to DolphinDB: {e}")
        raise

    # Convert date strings to DolphinDB date format (YYYY.MM.DD)
    start_ddb = start_date.replace("-", ".")
    end_ddb = end_date.replace("-", ".")

    # Build SQL query with filters in the WHERE clause.
    # Note: DolphinDB requires the date() function for date literals.
    # Single-line SQL avoids server-side parsing issues.
    sql = f'select * from loadTable("{DB_PATH}", "{TABLE_NAME}") where version = "{version}" and m_nDate >= date({start_ddb}) and m_nDate <= date({end_ddb})'

    print(f"Executing SQL: {sql.strip()}")

    try:
        # Execute query; the session returns a pandas DataFrame.
        df_pd = sess.run(sql)
        print(f"Fetched {len(df_pd)} rows from DolphinDB")
        print(f"Columns: {df_pd.columns.tolist()}")
        if len(df_pd) > 0:
            print(f"Sample:\n{df_pd.head()}")
    except Exception as e:
        print(f"Error executing query: {e}")
        raise
    finally:
        # Always release the DolphinDB session, even on failure.
        sess.close()

    # Convert to Polars for the column transformations below.
    df = pl.from_pandas(df_pd)
    print(f"Columns in result: {df.columns}")

    # Normalize m_nDate -> 'datetime' as uint32 YYYYMMDD, whatever dtype the
    # driver handed back.
    if 'm_nDate' in df.columns:
        df = df.rename({"m_nDate": "datetime"})

        if df["datetime"].dtype == pl.Datetime:
            df = df.with_columns([
                pl.col("datetime").dt.strftime("%Y%m%d").cast(pl.UInt32).alias("datetime")
            ])
        elif df["datetime"].dtype == pl.Date:
            df = df.with_columns([
                pl.col("datetime").dt.strftime("%Y%m%d").cast(pl.UInt32).alias("datetime")
            ])
        elif df["datetime"].dtype in [pl.Utf8, pl.String]:
            # BUG FIX: str.replace removes only the first match; a string date
            # like "2019-01-01" contains two dashes, so replace_all is needed
            # for the subsequent UInt32 cast to succeed.
            df = df.with_columns([
                pl.col("datetime").str.replace_all("-", "").cast(pl.UInt32).alias("datetime")
            ])
        else:
            # Already numeric; just cast.
            df = df.with_columns([pl.col("datetime").cast(pl.UInt32).alias("datetime")])

    # Normalize code -> 'instrument' as uint32.
    if 'code' in df.columns:
        df = df.rename({"code": "instrument"})

        # Strip the exchange prefix (e.g. 'SH600085' -> 600085). The prefix
        # occurs once at the start of the code, so one replace per exchange
        # is sufficient.
        df = df.with_columns([
            pl.col("instrument")
            .str.replace("SH", "")
            .str.replace("SZ", "")
            .str.replace("BJ", "")
            .cast(pl.UInt32)
            .alias("instrument")
        ])

    # Drop version column if present (no longer needed after filtering).
    if 'version' in df.columns:
        df = df.drop('version')

    # If the embeddings arrived as one list-typed 'values' column, explode it
    # into one scalar column per dimension.
    if 'values' in df.columns and df['values'].dtype == pl.List:
        # Infer the embedding dimension from the first row.
        # NOTE(review): assumes every row shares the same dimension — confirm
        # against the table's producer.
        first_val = df['values'][0]
        if first_val is not None:
            emb_dim = len(first_val)
            print(f"Detected embedding dimension: {emb_dim}")

            embedding_cols = []
            for i in range(emb_dim):
                col_name = f"embedding_{i}"
                embedding_cols.append(col_name)
                df = df.with_columns([
                    pl.col('values').list.get(i).alias(col_name)
                ])

            # Drop the original list column.
            df = df.drop('values')

            # Reorder columns: datetime, instrument, embedding_0, embedding_1, ...
            core_cols = ['datetime', 'instrument']
            final_cols = core_cols + embedding_cols
            df = df.select(final_cols)

            print(f"Expanded embeddings into {emb_dim} columns")
    else:
        # Embeddings already come as individual columns; keep the key columns
        # plus every remaining column, sorted by name for stable ordering.
        core_cols = ['datetime', 'instrument']
        embedding_cols = [c for c in df.columns if c not in core_cols + ['version']]

        final_cols = core_cols + sorted(embedding_cols)
        df = df.select(final_cols)

    print(f"\nTransformed data:")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {df.columns[:10]}..." if len(df.columns) > 10 else f"  Columns: {df.columns}")
    print(f"  Date range: {df['datetime'].min()} to {df['datetime'].max()}")
    print(f"  Instrument count: {df['instrument'].n_unique()}")
    print(f"  Sample:\n{df.head()}")

    # Persist to parquet, creating the target directory on demand.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    df.write_parquet(output_file)
    print(f"\nSaved to: {output_file}")

    return df
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Thin CLI wrapper around fetch_embeddings(); defaults mirror the
    # module-level constants.
    cli = argparse.ArgumentParser(description="Fetch embedding data from DolphinDB")
    for flag, default, help_text in (
        ("--start-date", DEFAULT_START_DATE, "Start date (YYYY-MM-DD)"),
        ("--end-date", DEFAULT_END_DATE, "End date (YYYY-MM-DD)"),
        ("--version", DEFAULT_VERSION, "Version string to filter by"),
        ("--output", OUTPUT_FILE, "Output parquet file"),
    ):
        cli.add_argument(flag, type=str, default=default, help=help_text)

    opts = cli.parse_args()

    fetch_embeddings(
        start_date=opts.start_date,
        end_date=opts.end_date,
        version=opts.version,
        output_file=opts.output,
    )

    print("\nDone!")
|
||||
@ -0,0 +1,211 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Fetch original 0_7 predictions from DolphinDB and save to parquet.
|
||||
|
||||
This script:
|
||||
1. Connects to DolphinDB
|
||||
2. Queries the app_1day_multicast_longsignal_port table
|
||||
3. Filters for version 'host140_exp20_d033'
|
||||
4. Transforms columns (m_nDate -> datetime, code -> instrument)
|
||||
5. Saves to local parquet file
|
||||
"""
|
||||
|
||||
import os
|
||||
import polars as pl
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
# DolphinDB config (from CLAUDE.md)
|
||||
DDB_CONFIG = {
|
||||
"host": "192.168.1.146",
|
||||
"port": 8848,
|
||||
"username": "admin",
|
||||
"password": "123456"
|
||||
}
|
||||
|
||||
TABLE_PATH = "dfs://daily_stock_run_multicast/app_1day_multicast_longsignal_port"
|
||||
VERSION = "host140_exp20_d033"
|
||||
OUTPUT_FILE = "../data/original_predictions_0_7.parquet"
|
||||
|
||||
|
||||
def datetime_to_uint32(dt) -> int:
    """Convert a date-like value to its YYYYMMDD integer representation."""
    # Plain numbers are assumed to already be in YYYYMMDD form.
    if isinstance(dt, (int, float)):
        return int(dt)
    # Anything date-like (datetime/date/Timestamp) is formatted explicitly;
    # everything else falls through to a plain int() conversion.
    return int(dt.strftime('%Y%m%d')) if hasattr(dt, 'strftime') else int(dt)
|
||||
|
||||
|
||||
def tscode_to_uint32(code) -> int:
    """
    Convert an instrument code to its numeric (uint32-sized) form.

    Handles:
      - plain ints, returned unchanged
      - suffix-style codes like '000001.SZ' (exchange suffix dropped)
      - prefix-style codes like 'SH600085' (exchange letters dropped) —
        the format used elsewhere in this module, which the previous
        implementation failed on with a ValueError
    Leading zeros are removed by the int() conversion.
    """
    if isinstance(code, int):
        return code
    # Drop a '.SZ'/'.SH'-style exchange suffix if present.
    code_str = str(code).split('.')[0]
    # Drop a leading exchange prefix (e.g. SH/SZ/BJ) if present, so
    # prefix-style codes convert too; pure-digit codes are unaffected.
    return int(code_str.lstrip('ABCDEFGHIJKLMNOPQRSTUVWXYZ'))
|
||||
|
||||
|
||||
def fetch_original_predictions(
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    output_file: str = OUTPUT_FILE
) -> pl.DataFrame:
    """
    Fetch original 0_7 predictions from DolphinDB and save them to parquet.

    Args:
        start_date: Optional start date filter (YYYY-MM-DD)
        end_date: Optional end date filter (YYYY-MM-DD)
        output_file: Output parquet file path

    Returns:
        Polars DataFrame with columns: [datetime, instrument, prediction]

    Raises:
        Exception: re-raises any connection or query failure after logging it.
    """
    print("Fetching original 0_7 predictions from DolphinDB...")
    print(f"Table: {TABLE_PATH}")
    print(f"Version: {VERSION}")

    # Connect to DolphinDB (import deferred: qshare only needed at runtime).
    try:
        from qshare.io.ddb import get_ddb_sess
        sess = get_ddb_sess(host=DDB_CONFIG["host"], port=DDB_CONFIG["port"])
        print(f"Connected to DolphinDB at {DDB_CONFIG['host']}:{DDB_CONFIG['port']}")
    except Exception as e:
        print(f"Error connecting to DolphinDB: {e}")
        raise

    # Split "dfs://db/table" into database path and table name.
    db_path, table_name = TABLE_PATH.replace("dfs://", "").split("/", 1)

    # Full-table scan; version/date filtering happens in Python below because
    # DolphinDB's SQL syntax for partitioned tables can be tricky.
    sql = f"""
    select * from loadTable("dfs://{db_path}", "{table_name}")
    """

    print(f"Executing SQL: {sql.strip()}")

    try:
        # Execute query and get a pandas DataFrame back.
        df_full = sess.run(sql)
        print(f"Fetched {len(df_full)} total rows from DolphinDB")
        print(f"Columns: {df_full.columns.tolist()}")
        print(f"Sample:\n{df_full.head()}")
        print(f"Version values: {df_full['version'].unique()[:10] if 'version' in df_full.columns else 'N/A'}")

        # Filter for version in Python.
        # The stored version string carries extra parameters, hence startswith.
        if 'version' in df_full.columns:
            df_pd = df_full[df_full['version'].str.startswith(VERSION)]
            print(f"Filtered to {len(df_pd)} rows for version '{VERSION}'")
            if len(df_pd) > 0:
                print(f"Matching versions: {df_pd['version'].unique()[:5]}")
        else:
            print("Warning: 'version' column not found, using all data")
            df_pd = df_full

        # Apply optional date filters; m_nDate compares directly against the
        # parsed timestamps.
        if start_date and 'm_nDate' in df_pd.columns:
            start_dt = pd.to_datetime(start_date)
            df_pd = df_pd[df_pd['m_nDate'] >= start_dt]
        if end_date and 'm_nDate' in df_pd.columns:
            end_dt = pd.to_datetime(end_date)
            df_pd = df_pd[df_pd['m_nDate'] <= end_dt]

        print(f"After date filter: {len(df_pd)} rows")
    except Exception as e:
        print(f"Error executing query: {e}")
        raise
    finally:
        # Always release the DolphinDB session.
        sess.close()

    # Convert to Polars for the column normalization below.
    df = pl.from_pandas(df_pd)
    print(f"Columns in result: {df.columns}")
    print(f"Sample data:\n{df.head()}")

    # Normalize m_nDate -> 'datetime' as uint32 YYYYMMDD.
    df = df.rename({"m_nDate": "datetime"})

    if df["datetime"].dtype == pl.Datetime:
        df = df.with_columns([
            pl.col("datetime").dt.strftime("%Y%m%d").cast(pl.UInt32).alias("datetime")
        ])
    elif df["datetime"].dtype == pl.Date:
        df = df.with_columns([
            pl.col("datetime").dt.strftime("%Y%m%d").cast(pl.UInt32).alias("datetime")
        ])
    elif df["datetime"].dtype in [pl.Utf8, pl.String]:
        # BUG FIX: str.replace removes only the first dash; "2019-01-01" has
        # two, so replace_all is required for the UInt32 cast to succeed.
        df = df.with_columns([
            pl.col("datetime").str.replace_all("-", "").cast(pl.UInt32).alias("datetime")
        ])
    else:
        # Already numeric, just cast
        df = df.with_columns([pl.col("datetime").cast(pl.UInt32).alias("datetime")])

    # Normalize code -> 'instrument'; codes look like "SH600085"/"SZ000001",
    # so stripping the exchange prefix leaves the numeric part.
    df = df.rename({"code": "instrument"})

    df = df.with_columns([
        pl.col("instrument")
        .str.replace("SH", "")
        .str.replace("SZ", "")
        .str.replace("BJ", "")
        .cast(pl.UInt32)
        .alias("instrument")
    ])

    # The signal lives in the 'weight' column in this table; expose it as
    # 'prediction' for consistency with the rest of the tooling.
    if 'weight' in df.columns:
        df = df.rename({'weight': 'prediction'})
    else:
        # Fallback: take the first float column that is not a key column.
        for col in df.columns:
            if col not in ['datetime', 'instrument'] and df[col].dtype in [pl.Float32, pl.Float64]:
                df = df.rename({col: 'prediction'})
                break

    # Keep only the normalized key + signal columns.
    df = df.select(["datetime", "instrument", "prediction"])

    print(f"\nTransformed data:")
    print(f"  Shape: {df.shape}")
    print(f"  Columns: {df.columns}")
    print(f"  Date range: {df['datetime'].min()} to {df['datetime'].max()}")
    print(f"  Sample:\n{df.head()}")

    # Persist to parquet, creating the directory if needed.
    os.makedirs(os.path.dirname(output_file), exist_ok=True)
    df.write_parquet(output_file)
    print(f"\nSaved to: {output_file}")

    return df
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Thin CLI wrapper around fetch_original_predictions().
    cli = argparse.ArgumentParser(description="Fetch original 0_7 predictions from DolphinDB")
    cli.add_argument("--start-date", type=str, default=None, help="Start date (YYYY-MM-DD)")
    cli.add_argument("--end-date", type=str, default=None, help="End date (YYYY-MM-DD)")
    cli.add_argument("--output", type=str, default=OUTPUT_FILE, help="Output parquet file")
    opts = cli.parse_args()

    fetch_original_predictions(
        start_date=opts.start_date,
        end_date=opts.end_date,
        output_file=opts.output,
    )

    print("\nDone!")
|
||||
File diff suppressed because it is too large
Load Diff
@ -0,0 +1,292 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Script to generate actual returns using real kline data without changing the original format.
|
||||
This calculates real returns from kline VWAP prices using the original datetime and instrument format
|
||||
and saves the result as 'v2v_5d' column.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
from datetime import datetime, timedelta
|
||||
import os
|
||||
|
||||
def generate_real_returns_from_kline(input_kline_path="/data/parquet/dataset/stg_1day_wind_kline_adjusted_1D/",
                                     prediction_file=None,
                                     output_file="../data/actual_returns.parquet",
                                     return_days=5):
    """
    Generate real returns based on kline data using original datetime and instrument format.

    Reads daily kline data, optionally restricts it to the date range covered by an
    existing prediction file, spine-fills missing (date, instrument) combinations, and
    computes forward `return_days`-day returns from T+1's price to T+return_days's
    price, saved as column 'v2v_5d'.

    Args:
        input_kline_path: Path to the kline data (parquet dataset directory)
        prediction_file: Optional prediction file to determine date range
        output_file: Output file for actual returns
        return_days: Number of days for return calculation (default 5)

    Returns:
        Polars DataFrame with columns [datetime, instrument, v2v_5d], or None when
        the kline data cannot be loaded or is empty for the requested range.
    """
    print(f"Generating real returns from kline data...")

    # Prefer the qshare implementation; the local fallback mirrors qshare's keyword
    # API (pldf=..., return_type=...) so the single call site below works on either
    # path. (The previous fallback had a different signature and always raised
    # TypeError when qshare was unavailable, making it dead code.)
    try:
        from qshare.algo.polars.eval import calc_daily_return
        from qshare.algo.polars.spine import create_spine, align_to_calendar, merge_data_onto_spine
        print("Successfully imported qshare functions including spine operations")
        calc_daily_return_fn = calc_daily_return
    except ImportError as e:
        print(f"Could not import qshare functions: {e}")
        print("Falling back to manual return calculation without spine-filling")

        def calc_daily_return_manual(pldf, price_col, window_len, col_name, bias=1, return_type='normal'):
            """Manual forward-return calculation matching qshare's keyword interface.

            Only simple ('normal') returns are computed; `return_type` is accepted
            purely for signature compatibility with qshare's calc_daily_return.
            """
            # Sort so the per-instrument shifts operate in time order
            pldf = pldf.sort(['instrument', 'datetime'])

            # Base price at T+bias and end price at T+bias+window_len-1, per instrument
            pldf = pldf.with_columns([
                pl.col(price_col).shift(-bias).over('instrument').alias('price_base'),
                pl.col(price_col).shift(-(bias + window_len - 1)).over('instrument').alias('price_end')
            ])

            # Simple return over the forward window
            pldf = pldf.with_columns([
                ((pl.col('price_end') / pl.col('price_base')) - 1).alias(col_name)
            ])

            # Clean up temporary columns
            return pldf.drop(['price_base', 'price_end'])

        calc_daily_return_fn = calc_daily_return_manual

    # Determine date range - either from prediction file or use default range
    if prediction_file and os.path.exists(prediction_file):
        print(f"Using prediction file {prediction_file} to determine date range...")
        df_pred = pl.read_parquet(prediction_file)

        pred_min_date = df_pred['date'].min()
        pred_max_date = df_pred['date'].max()

        pred_min_date_int = int(pred_min_date.strftime('%Y%m%d'))
        pred_max_date_int = int(pred_max_date.strftime('%Y%m%d'))

        print(f"Prediction date range: {pred_min_date} to {pred_max_date}")
    else:
        # Use a reasonable default range if no prediction file provided
        print("No prediction file provided, using default date range...")
        pred_min_date_int = 20190101
        pred_max_date_int = 20201130
        print(f"Default date range: {pred_min_date_int} to {pred_max_date_int}")

    print(f"Loading kline data from {input_kline_path} and filtering to date range...")

    # Lazy scan + filter keeps memory bounded; datetime is stored as integer YYYYMMDD
    try:
        df_kline = (
            pl.scan_parquet(input_kline_path)
            .filter(
                pl.col('datetime').is_between(pred_min_date_int, pred_max_date_int)
            )
            .collect()
        )

        print(f"Kline data shape after filtering: {df_kline.shape}")
        print(f"Kline columns: {df_kline.columns}")
        print(f"Kline schema: {df_kline.schema}")

    except Exception as e:
        print(f"Error loading kline data: {e}")
        return None

    if df_kline.height == 0:
        print("No kline data found within the date range!")
        return None

    # Verify that we have the required key columns
    if 'datetime' not in df_kline.columns:
        raise ValueError("No datetime column found in kline data")
    if 'instrument' not in df_kline.columns:
        raise ValueError("No instrument column found in kline data")

    # Use VWAP as the price column, falling back to any price-like or numeric column
    price_col = 'vwap'
    if price_col not in df_kline.columns:
        print(f"Column '{price_col}' not found in kline data.")
        possible_price_cols = [
            col for col in df_kline.columns
            if any(term in col.lower() for term in ['price', 'vwap', 'close', 'adj', 'pct', 'open', 'high', 'low'])
        ]
        print(f"Possible price columns: {possible_price_cols}")

        if possible_price_cols:
            price_col = possible_price_cols[0]  # Use first available price-like column
            print(f"Using '{price_col}' as price column instead.")
        else:
            # Last resort: first numeric non-key column. (The previous version
            # tested `'vwap' in locals()`, which could never detect the failure.)
            price_col = None
            for col in df_kline.columns:
                if col not in ['datetime', 'instrument'] and df_kline[col].dtype in [pl.Float32, pl.Float64, pl.Int32, pl.Int64]:
                    price_col = col
                    break
            if price_col is None:
                raise ValueError("No suitable price column found in kline data")

    print(f"Using price column: {price_col}")

    # Sort data by instrument and datetime to ensure proper temporal ordering
    df_kline = df_kline.sort(['instrument', 'datetime'])

    # Spine-fill so every (date, instrument) combination exists before shifting
    print("Applying spine-filling to ensure complete date/instrument coverage...")
    try:
        # Unique dates and instruments present in the dataset
        unique_dates = df_kline.select(pl.col('datetime').unique().sort()).get_column('datetime')
        unique_instruments = df_kline.select(pl.col('instrument').unique()).get_column('instrument')

        # NOTE(review): casting to Int32 assumes the key values fit in int32 and
        # that join dtypes line up with the original columns — confirm on real data.
        spine_dates = unique_dates.cast(pl.Int32).to_frame()
        spine_instruments = unique_instruments.cast(pl.Int32).to_frame()

        # Full spine: cartesian product of all dates and instruments
        df_spine = spine_dates.join(spine_instruments, how='cross')

        # Left join keeps every spine entry, introducing nulls for missing bars
        df_filled = df_spine.join(
            df_kline,
            on=['datetime', 'instrument'],
            how='left'
        )

        print(f"Spine-filling completed. Shape before: {df_kline.shape}, after: {df_filled.shape}")

        # Fill missing price data within each instrument (forward then backward)
        df_filled = df_filled.sort(['instrument', 'datetime'])
        df_filled = df_filled.with_columns([
            pl.col(price_col).forward_fill().backward_fill().over('instrument').alias(f'{price_col}_filled')
        ])

        # Use the filled price only where the original was null
        df_filled = df_filled.with_columns([
            pl.when(pl.col(price_col).is_null())
            .then(pl.col(f'{price_col}_filled'))
            .otherwise(pl.col(price_col))
            .alias(price_col)
        ]).drop(f'{price_col}_filled')

        # Keep only rows where we have price data after filling
        df_kline = df_filled.filter(pl.col(price_col).is_not_null())

        print(f"After spine-filling and cleaning: {df_kline.shape}")

    except Exception as e:
        # Best-effort: spine-filling is an enhancement, not a hard requirement
        print(f"Error during spine-filling: {e}")
        print("Continuing with original data...")

    print(f"Calculating {return_days}-day returns from T+1's {price_col} to T+{return_days+1}'s {price_col}...")

    # Calculate returns via the selected implementation; keep the inline manual
    # computation as a last-resort fallback for unexpected failures
    try:
        df_returns = calc_daily_return_fn(
            pldf=df_kline,
            price_col=price_col,
            window_len=return_days,  # specified return period
            col_name='v2v_5d',  # Output column name (as requested)
            bias=1,  # Use T+1 price as base for forward return
            return_type='normal'  # Regular return (not log return)
        )
    except Exception as e:
        print(f"Error calculating returns with qshare function: {e}")
        print("Attempting manual calculation...")

        df_returns = df_kline.sort(['instrument', 'datetime']).with_columns([
            pl.col(price_col).shift(-1).over('instrument').alias('price_base'),
            pl.col(price_col).shift(-(1 + return_days - 1)).over('instrument').alias('price_end')
        ]).with_columns([
            ((pl.col('price_end') / pl.col('price_base')) - 1).alias('v2v_5d')
        ]).drop(['price_base', 'price_end'])

    print(f"Calculated returns shape: {df_returns.shape}")
    print(f"Calculated returns schema: {df_returns.schema}")

    # Drop rows with null or infinite returns (e.g. zero or missing base prices)
    df_returns = df_returns.filter(
        pl.col('v2v_5d').is_not_null() &
        pl.col('v2v_5d').is_finite()
    )

    print(f"Returns after filtering invalid values: {df_returns.shape}")

    # Keep the original datetime and instrument columns without conversion
    df_output = df_returns.select([
        'datetime',  # Keep original uint32 datetime format
        'instrument',  # Keep original uint32 instrument format
        'v2v_5d'  # Use requested column name
    ])

    print(f"Final output shape: {df_output.shape}")
    print(f"Output schema: {df_output.schema}")

    # Display some statistics about the returns
    if 'v2v_5d' in df_output.columns and len(df_output) > 0:
        returns_data = df_output['v2v_5d'].drop_nulls()
        if len(returns_data) > 0:
            print(f"Return statistics:")
            print(f"  Count: {len(returns_data)}")
            print(f"  Mean: {returns_data.mean():.6f}")
            print(f"  Std: {returns_data.std():.6f}")
            print(f"  Min: {returns_data.min():.6f}")
            print(f"  Max: {returns_data.max():.6f}")
            print(f"  25th percentile: {returns_data.quantile(0.25):.6f}")
            print(f"  75th percentile: {returns_data.quantile(0.75):.6f}")
        else:
            print("  No valid returns data after filtering")

    # Save to parquet preserving original datetime and instrument formats
    df_output.write_parquet(output_file)
    print(f"Real returns saved to {output_file} with original datetime and instrument formats")

    return df_output
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: compute real 5-day returns, anchoring the date range to
    # the original predictions file when it exists on disk.
    candidate = "../data/original_predictions_0_7.parquet"
    pred_file_path = candidate if os.path.exists(candidate) else None

    if pred_file_path:
        print(f"Using prediction file to determine date range: {pred_file_path}")
    else:
        print("Prediction file not found, using default date range")

    print("Generating real returns from kline data...")
    result = generate_real_returns_from_kline(
        input_kline_path="/data/parquet/dataset/stg_1day_wind_kline_adjusted_1D/",
        prediction_file=pred_file_path,
        output_file="../data/actual_returns.parquet",
        return_days=5
    )

    if result is None:
        print("Failed to generate real returns.")
    else:
        print("Real return generation completed successfully!")
        print("The output file contains:")
        print("- Original datetime format (uint32 YYYYMMDD)")
        print("- Original instrument format (uint32)")
        print("- Returns in 'v2v_5d' column")
|
||||
@ -0,0 +1,433 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Standalone script to generate predictions using the d033 model with locally generated alpha158_0_7_beta embeddings.
|
||||
|
||||
This script does NOT rely on qlib's data handlers. It:
|
||||
1. Loads beta embeddings from local parquet file (generated by generate_beta_embedding.py)
|
||||
2. Applies the necessary processing (normalization, neutralization)
|
||||
3. Uses the d033 model to generate predictions
|
||||
4. Saves predictions to parquet
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import pickle as pkl
|
||||
import io
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import pandas as pd
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from datetime import date, timedelta
|
||||
from typing import Optional, List, Tuple, Dict
|
||||
from pathlib import Path
|
||||
|
||||
# Constants
# Locally generated alpha158 0_7 beta embeddings (output of generate_beta_embedding.py).
EMBEDDING_FILE = "../data/embedding_0_7_beta.parquet"
# Pickled d033 RNN prediction model, loaded via load_d033_model().
MODEL_PATH = "/home/guofu/Workspaces/alpha/data_ops/tasks/app_longsignal/model/host140_exp20_d033/module.pt"
# Default directory for prediction output parquet files.
OUTPUT_DIR = "../data"

# Industry flag path for neutralization (optional)
INDUSTRY_FLAG_PATH = "/data/parquet/dataset/stg_1day_gds_indus_flag_cc1_1D/"
# NOTE(review): despite the name, this points at the adjusted kline dataset,
# not a dedicated risk-factor dataset — confirm this is intended.
RISK_FACTOR_PATH = "/data/parquet/dataset/stg_1day_wind_kline_adjusted_1D/"
||||
|
||||
|
||||
def load_beta_embeddings(embedding_file: str, start_date: Optional[str] = None, end_date: Optional[str] = None) -> pl.DataFrame:
    """
    Load locally generated beta embeddings from a parquet file.

    Args:
        embedding_file: Path to the embeddings parquet file
        start_date: Optional inclusive start date filter (YYYY-MM-DD)
        end_date: Optional inclusive end date filter (YYYY-MM-DD)

    Returns:
        Polars DataFrame with embeddings, restricted to [start_date, end_date]
        when a 'datetime' column is present.
    """
    print(f"Loading beta embeddings from {embedding_file}...")
    df = pl.read_parquet(embedding_file)
    print(f"Loaded embeddings: {df.shape}")

    # Date filters only apply when the frame carries a datetime column
    # (stored as integer YYYYMMDD).
    if 'datetime' in df.columns:
        bounds = []
        if start_date:
            bounds.append(pl.col('datetime') >= int(start_date.replace("-", "")))
        if end_date:
            bounds.append(pl.col('datetime') <= int(end_date.replace("-", "")))
        for cond in bounds:
            df = df.filter(cond)

        print(f"Filtered embeddings: {df.shape}")

    return df
|
||||
|
||||
|
||||
def load_d033_model(model_path: str) -> nn.Module:
    """
    Load the d033 prediction model.

    The model file is a pickle that may contain tensors serialized on a GPU
    machine, so torch.load is temporarily monkey-patched to force CPU
    placement for any nested torch.load calls made during unpickling; the
    patch is always reverted in the finally block.

    NOTE: pickle.load executes arbitrary code from the file — only use with
    trusted model artifacts.

    Returns the underlying PyTorch model ready for inference on CPU.
    """
    print(f"Loading d033 model from {model_path}...")

    # Patch torch.load to always use CPU
    original_torch_load = torch.load

    def cpu_torch_load(*args, **kwargs):
        # Force CPU placement regardless of what the caller requested
        kwargs['map_location'] = 'cpu'
        return original_torch_load(*args, **kwargs)

    # Apply the patch (process-global — hence the finally-restore below)
    torch.load = cpu_torch_load

    try:
        with open(model_path, "rb") as fin:
            model = pkl.load(fin)

        # The model is already an RNNPredict instance
        # Set to eval mode for inference
        model.eval()

        # Set signal function (required for prediction)
        if not hasattr(model, 'signal_func'):
            model.signal_func = {"type": "logistic"}

        print("Model loaded successfully (CPU)")
        return model

    except RuntimeError as e:
        if "CUDA" in str(e):
            # Fallback path: retry through torch.load, which can remap CUDA
            # tensors to CPU where plain pickle cannot.
            print("Model contains CUDA tensors, attempting CPU conversion...")

            with open(model_path, "rb") as fin:
                content = fin.read()

            model = torch.load(io.BytesIO(content), map_location='cpu', weights_only=False)
            model.eval()

            # Set signal function (required for prediction)
            if not hasattr(model, 'signal_func'):
                model.signal_func = {"type": "logistic"}

            print("Model loaded and converted to CPU")
            return model
        else:
            raise
    finally:
        # Restore original torch.load
        torch.load = original_torch_load
|
||||
|
||||
|
||||
def apply_cs_zscore_norm(df: pl.DataFrame, embedding_cols: List[str]) -> pl.DataFrame:
    """
    Cross-sectionally z-score each embedding column within every trading day.

    For every datetime group, each column becomes (x - mean) / std; on days where
    the std is not positive the value is set to 0.0 to avoid division by zero.

    Args:
        df: DataFrame with embeddings
        embedding_cols: List of embedding column names

    Returns:
        DataFrame with normalized embeddings (non-embedding columns untouched).
    """
    print("Applying cross-sectional z-score normalization...")

    def _zscore(name: str) -> pl.Expr:
        # Per-day (cross-sectional) moments via window expressions
        mu = pl.col(name).mean().over('datetime')
        sigma = pl.col(name).std().over('datetime')
        return (
            pl.when(sigma > 0)
            .then((pl.col(name) - mu) / sigma)
            .otherwise(0.0)
            .alias(name)
        )

    passthrough = [c for c in df.columns if c not in embedding_cols]
    return df.select(passthrough + [_zscore(c) for c in embedding_cols])
|
||||
|
||||
|
||||
def apply_robust_zscore_norm(df: pl.DataFrame, embedding_cols: List[str]) -> pl.DataFrame:
    """
    Apply robust z-score normalization (median/MAD instead of mean/std).

    Per datetime group, each embedding column becomes
    (x - median) / (1.4826 * MAD), clipped to [-10, 10]; groups with zero MAD
    are set to 0.0 to avoid division by zero.

    Args:
        df: DataFrame with embeddings
        embedding_cols: List of embedding column names

    Returns:
        DataFrame with robustly normalized, clipped embeddings
        (non-embedding columns untouched).
    """
    print("Applying robust z-score normalization...")

    normalized_cols = []

    for col in embedding_cols:
        # Calculate median and MAD (median absolute deviation) per datetime
        median_expr = pl.col(col).median().over('datetime')
        mad_expr = (pl.col(col) - median_expr).abs().median().over('datetime')

        # Robust z-score with div-by-zero protection, then clip outliers at
        # [-10, 10]. The clip MUST be chained onto the normalized expression:
        # the previous version rebuilt `pl.col(col.name)` from the aliased
        # expression, which (a) is not a valid column name in current polars
        # and (b) would have clipped the ORIGINAL values, discarding the
        # normalization entirely.
        norm_col = (
            pl.when(mad_expr > 0)
            .then((pl.col(col) - median_expr) / (1.4826 * mad_expr))
            .otherwise(0.0)
            .clip(-10, 10)
            .alias(col)
        )

        normalized_cols.append(norm_col)

    other_cols = [c for c in df.columns if c not in embedding_cols]
    df_normalized = df.select(other_cols + normalized_cols)

    return df_normalized
|
||||
|
||||
|
||||
def apply_fillna(df: pl.DataFrame, embedding_cols: List[str], fill_value: float = 0.0) -> pl.DataFrame:
    """Replace nulls in every embedding column with `fill_value` (default 0.0)."""
    passthrough = [c for c in df.columns if c not in embedding_cols]
    filled = [pl.col(c).fill_null(fill_value).alias(c) for c in embedding_cols]
    return df.select(passthrough + filled)
|
||||
|
||||
|
||||
def prepare_features_for_model(df: pl.DataFrame, embedding_cols: List[str], seq_len: int = 40) -> Tuple[np.ndarray, pl.DataFrame]:
    """
    Prepare features for the model by creating sequences.

    The d033 model expects 3D input: [batch_size, seq_len, d_feat]
    where seq_len is the lookback window (default 40 days).

    Instruments with fewer than seq_len rows contribute no sequences; each
    emitted window is aligned to the (datetime, instrument) of its LAST day.

    Args:
        df: DataFrame with normalized embeddings
        embedding_cols: List of embedding column names
        seq_len: Sequence length (lookback window)

    Returns:
        Tuple of (features_array, aligned_df)

    Raises:
        ValueError: if no instrument has enough rows to form a single window.
    """
    print(f"Preparing sequences with length {seq_len}...")

    # Sort by instrument and datetime so each slice below is time-ordered
    df = df.sort(['instrument', 'datetime'])

    # Get unique instruments
    instruments = df['instrument'].unique().to_list()

    features_list = []
    metadata_list = []

    for inst in instruments:
        # Get data for this instrument
        inst_df = df.filter(pl.col('instrument') == inst)
        inst_data = inst_df.select(embedding_cols).to_numpy().astype(np.float32)
        inst_meta = inst_df.select(['datetime', 'instrument']).to_numpy()

        # Create sliding windows (one per day once seq_len history exists)
        for i in range(seq_len - 1, len(inst_data)):
            # Get sequence of seq_len days ending at day i
            seq = inst_data[i - seq_len + 1:i + 1]  # [seq_len, d_feat]
            features_list.append(seq)

            # Metadata for this prediction (the last day in sequence)
            metadata_list.append(inst_meta[i])

    if not features_list:
        raise ValueError(f"Not enough data to create sequences of length {seq_len}")

    features_array = np.stack(features_list, axis=0)  # [N, seq_len, d_feat]
    metadata_array = np.array(metadata_list)

    # Create metadata DataFrame; row k here corresponds to features_array[k]
    metadata_df = pl.DataFrame({
        'datetime': metadata_array[:, 0],
        'instrument': metadata_array[:, 1]
    })

    print(f"Prepared features shape: {features_array.shape}")
    print(f"Metadata shape: {metadata_df.shape}")

    return features_array, metadata_df
|
||||
|
||||
|
||||
def predict_with_model(model, features: np.ndarray, batch_size: int = 1000) -> np.ndarray:
    """
    Run batched CPU inference over prepared feature sequences.

    The model is expected to return a 2-tuple from its forward pass; the
    second element is taken as the prediction.

    Args:
        model: Loaded d033 model
        features: Feature array [N, seq_len, d_feat]
        batch_size: Batch size for inference

    Returns:
        Predictions array [N]
    """
    print(f"Generating predictions for {features.shape[0]} samples...")

    cpu = torch.device('cpu')
    model = model.to(cpu)
    model.eval()

    chunks = []
    with torch.no_grad():
        for start in range(0, len(features), batch_size):
            window = torch.tensor(features[start:start + batch_size], dtype=torch.float32, device=cpu)
            # Forward pass; the model yields (hidden/aux, prediction)
            _, out = model(window)
            chunks.append(out.cpu().numpy())

    predictions = np.concatenate(chunks, axis=0)
    print(f"Generated {len(predictions)} predictions")

    return predictions
|
||||
|
||||
|
||||
def predict_with_embeddings(
    embeddings_df: pl.DataFrame,
    model,
    output_file: Optional[str] = None,
    seq_len: int = 40,
    batch_size: int = 1000
) -> pl.DataFrame:
    """
    Run the full inference pipeline on a frame of beta embeddings.

    Steps: cross-sectional z-score normalization, null filling, sliding-window
    sequence construction, batched model inference, then persist the results
    to parquet.

    Args:
        embeddings_df: DataFrame with beta embeddings ('embedding_*' columns)
        model: Loaded d033 model
        output_file: Optional output file path (defaults under OUTPUT_DIR)
        seq_len: Sequence length for model input
        batch_size: Batch size for inference

    Returns:
        DataFrame with columns [datetime, instrument, prediction].
    """
    print("Generating predictions...")

    emb_cols = [c for c in embeddings_df.columns if c.startswith('embedding_')]
    print(f"Found {len(emb_cols)} embedding columns")

    # Inference-time preprocessing: normalize per day, then fill remaining nulls
    prepared = apply_fillna(apply_cs_zscore_norm(embeddings_df, emb_cols), emb_cols, fill_value=0.0)

    # Build [N, seq_len, d_feat] windows and the matching (datetime, instrument) rows
    features, meta = prepare_features_for_model(prepared, emb_cols, seq_len=seq_len)

    scores = predict_with_model(model, features, batch_size=batch_size)

    result_df = meta.with_columns([
        pl.Series(name="prediction", values=scores)
    ])

    # Persist alongside the other pipeline artifacts
    target = output_file if output_file is not None else os.path.join(OUTPUT_DIR, "predictions_beta_embedding.parquet")
    os.makedirs(os.path.dirname(target), exist_ok=True)
    result_df.write_parquet(target)
    print(f"Predictions saved to {target}")

    return result_df
|
||||
|
||||
|
||||
def generate_predictions(
    embedding_file: str = EMBEDDING_FILE,
    model_path: str = MODEL_PATH,
    start_date: Optional[str] = None,
    end_date: Optional[str] = None,
    output_file: Optional[str] = None,
    seq_len: int = 40,
    batch_size: int = 1000
) -> pl.DataFrame:
    """
    End-to-end driver: load beta embeddings, load the d033 model, run inference.

    Args:
        embedding_file: Path to beta embeddings parquet file
        model_path: Path to d033 model
        start_date: Optional start date filter (YYYY-MM-DD)
        end_date: Optional end date filter (YYYY-MM-DD)
        output_file: Optional output file path
        seq_len: Sequence length for model input (lookback window)
        batch_size: Batch size for inference

    Returns:
        DataFrame with predictions
    """
    banner = "=" * 60
    print(banner)
    print("Generating Predictions with Alpha158 0_7 Beta Embeddings")
    print(banner)

    embeddings = load_beta_embeddings(embedding_file, start_date, end_date)
    d033 = load_d033_model(model_path)

    return predict_with_embeddings(
        embeddings, d033, output_file,
        seq_len=seq_len, batch_size=batch_size
    )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # Command-line front end for generate_predictions().
    cli = argparse.ArgumentParser(description="Generate predictions with beta embeddings")
    cli.add_argument("--embeddings", type=str, default=EMBEDDING_FILE,
                     help="Path to beta embeddings parquet file")
    cli.add_argument("--model", type=str, default=MODEL_PATH,
                     help="Path to d033 model")
    cli.add_argument("--start-date", type=str, default=None,
                     help="Start date (YYYY-MM-DD)")
    cli.add_argument("--end-date", type=str, default=None,
                     help="End date (YYYY-MM-DD)")
    cli.add_argument("--output", type=str, default=None,
                     help="Output parquet file path")
    cli.add_argument("--seq-len", type=int, default=40,
                     help="Sequence length (lookback window)")
    cli.add_argument("--batch-size", type=int, default=1000,
                     help="Batch size for inference")
    opts = cli.parse_args()

    df = generate_predictions(
        embedding_file=opts.embeddings,
        model_path=opts.model,
        start_date=opts.start_date,
        end_date=opts.end_date,
        output_file=opts.output,
        seq_len=opts.seq_len,
        batch_size=opts.batch_size
    )

    print("\nDone!")
    print(f"Generated {len(df)} predictions")
    print(df.head())
|
||||
@ -0,0 +1,186 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Regenerate beta embeddings for a few days of sample data.
|
||||
|
||||
This script generates embeddings for a small date range to test the pipeline.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import pickle as pkl
|
||||
import numpy as np
|
||||
import polars as pl
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
from typing import List, Dict, Optional
|
||||
|
||||
# Add parent directory to path
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
# Import from the main generate script
|
||||
from generate_beta_embedding import (
|
||||
load_all_data,
|
||||
merge_data_sources,
|
||||
apply_feature_pipeline,
|
||||
prepare_vae_features,
|
||||
load_vae_model,
|
||||
encode_with_vae,
|
||||
load_qlib_processor_params,
|
||||
VAE_INPUT_DIM,
|
||||
OUTPUT_DIR,
|
||||
)
|
||||
|
||||
# Sample dates for testing (5 consecutive trading days)
|
||||
SAMPLE_DATES = [
|
||||
"2019-01-02",
|
||||
"2019-01-03",
|
||||
"2019-01-04",
|
||||
"2019-01-07",
|
||||
"2019-01-08",
|
||||
]
|
||||
|
||||
VAE_MODEL_PATH = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/model/csiallx_feature2_ntrla_flag_pnlnorm_vae4_dim32a_beta0001/module.pt"
|
||||
|
||||
|
||||
def generate_sample_embeddings(
    dates: List[str] = SAMPLE_DATES,
    output_file: str = "embedding_0_7_beta_sample.parquet",
    use_vae: bool = True
) -> pl.DataFrame:
    """
    Generate embeddings for a sample of dates.

    Loads all data sources over [dates[0], dates[-1]], filters to exactly the
    requested dates, runs the feature pipeline, and encodes with the VAE
    (falling back to seeded random 32-dim embeddings if the VAE is disabled
    or fails). The result is written to `output_file` and returned.

    Args:
        dates: List of dates in YYYY-MM-DD format
        output_file: Output parquet file path
        use_vae: Whether to use VAE for encoding (or random embeddings)

    Returns:
        DataFrame with datetime, instrument, and embedding_0..embedding_{d-1}.
    """
    start_date = dates[0]
    end_date = dates[-1]

    print("=" * 60)
    print("Generating Sample Beta Embeddings")
    print(f"Dates: {dates}")
    print(f"Use VAE: {use_vae}")
    print("=" * 60)

    # Load all data sources
    df_alpha, df_kline, df_flag, df_industry = load_all_data(start_date, end_date)

    print(f"\nLoaded data:")
    print(f" Alpha158: {df_alpha.shape}")
    print(f" Kline: {df_kline.shape}")
    print(f" Flags: {df_flag.shape}")
    print(f" Industry: {df_industry.shape}")

    # Filter to only the sample dates (datetime is stored as integer YYYYMMDD)
    date_ints = [int(d.replace("-", "")) for d in dates]
    df_alpha = df_alpha.filter(pl.col("datetime").is_in(date_ints))
    df_kline = df_kline.filter(pl.col("datetime").is_in(date_ints))
    df_flag = df_flag.filter(pl.col("datetime").is_in(date_ints))
    df_industry = df_industry.filter(pl.col("datetime").is_in(date_ints))

    print(f"\nAfter filtering to sample dates:")
    print(f" Alpha158: {df_alpha.shape}")
    print(f" Kline: {df_kline.shape}")
    print(f" Flags: {df_flag.shape}")
    print(f" Industry: {df_industry.shape}")

    # Merge data sources
    df = merge_data_sources(df_alpha, df_kline, df_flag, df_industry)
    print(f"\nMerged data shape: {df.shape}")

    # Save datetime and instrument before processing.
    # NOTE(review): this assumes apply_feature_pipeline preserves row count and
    # order, so these columns stay aligned with `embeddings` below — confirm.
    datetime_col = df["datetime"].clone()
    instrument_col = df["instrument"].clone()

    # Apply feature transformation pipeline
    df_processed, feature_cols, norm_feature_cols, market_flag_for_vae = apply_feature_pipeline(df)

    # Prepare features for VAE
    features = prepare_vae_features(
        df_processed, feature_cols,
        norm_feature_cols=norm_feature_cols,
        market_flag_for_vae=market_flag_for_vae
    )

    print(f"\nFeature matrix shape: {features.shape}")

    # Encode with VAE; any failure falls back to deterministic random embeddings
    if use_vae:
        try:
            model = load_vae_model(VAE_MODEL_PATH)
            embeddings = encode_with_vae(features, model)
            print(f"\nVAE encoding successful!")
        except Exception as e:
            print(f"\nVAE encoding failed: {e}")
            import traceback
            traceback.print_exc()
            print("\nFalling back to random embeddings...")
            # Fixed seed keeps the fallback reproducible across runs
            np.random.seed(42)
            embeddings = np.random.randn(features.shape[0], 32).astype(np.float32)
    else:
        print("\nUsing random embeddings (VAE disabled)...")
        np.random.seed(42)
        embeddings = np.random.randn(features.shape[0], 32).astype(np.float32)

    # Create output DataFrame: one "embedding_i" column per latent dimension
    embedding_cols = [f"embedding_{i}" for i in range(embeddings.shape[1])]

    result_data = {
        "datetime": datetime_col.to_list(),
        "instrument": instrument_col.to_list(),
        **{col_name: embeddings[:, i].tolist() for i, col_name in enumerate(embedding_cols)}
    }

    df_result = pl.DataFrame(result_data)

    # Ensure output directory exists
    output_path = Path(output_file)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    # Save to parquet
    df_result.write_parquet(output_path)
    print(f"\nEmbeddings saved to: {output_path}")
    print(f"Output shape: {df_result.shape}")
    print(f"\nSample output:")
    print(df_result.head(10))

    # Print summary statistics
    print("\n" + "=" * 60)
    print("Summary Statistics")
    print("=" * 60)
    print(f"Total samples: {len(df_result)}")
    print(f"Embedding dimension: {embeddings.shape[1]}")
    print(f"Date range: {df_result['datetime'].min()} to {df_result['datetime'].max()}")
    print(f"Instruments: {df_result['instrument'].n_unique()}")
    print(f"Embedding mean: {np.mean(embeddings):.6f}")
    print(f"Embedding std: {np.std(embeddings):.6f}")
    print(f"Embedding min: {np.min(embeddings):.6f}")
    print(f"Embedding max: {np.max(embeddings):.6f}")

    return df_result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import argparse

    # CLI entry point: collect options, then run the sample-embedding
    # generation pipeline defined above.
    cli = argparse.ArgumentParser(description="Generate sample beta embeddings")
    cli.add_argument("--dates", nargs="+", default=SAMPLE_DATES,
                     help="List of dates (YYYY-MM-DD)")
    cli.add_argument("--output", type=str, default="embedding_0_7_beta_sample.parquet",
                     help="Output parquet file")
    cli.add_argument("--no-vae", action="store_true",
                     help="Skip VAE encoding (use random embeddings)")
    parsed = cli.parse_args()

    # `--no-vae` is an opt-out flag, so the pipeline's use_vae switch is
    # its negation.
    generate_sample_embeddings(
        dates=parsed.dates,
        output_file=parsed.output,
        use_vae=not parsed.no_vae,
    )

    print("\nDone!")
|
||||
@ -0,0 +1,394 @@
|
||||
[2715583:MainThread](2026-02-26 19:58:16,674) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
|
||||
[2715583:MainThread](2026-02-26 19:58:16,680) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
|
||||
[2715583:MainThread](2026-02-26 19:58:16,681) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
|
||||
================================================================================
|
||||
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
|
||||
================================================================================
|
||||
Date Range: 2020-01-02 to 2020-01-10
|
||||
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
|
||||
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
|
||||
|
||||
Step 1: Loading data from Qlib pipeline...
|
||||
Loading since_date=2020-01-02
|
||||
Will use `placehorder_value` from module: qlib.contrib.data.config
|
||||
Will init handler object from config:
|
||||
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-12-03 00:00:00')},
|
||||
'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'handler': {'class': 'AggHandler',
|
||||
'kwargs': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26)[2715583:MainThread](2026-02-26 19:58:16,707) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
|
||||
[2715583:MainThread](2026-02-26 19:58:16,707) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
|
||||
[2715583:MainThread](2026-02-26 19:58:17,067) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
|
||||
[2715583:MainThread](2026-02-26 20:05:39,665) INFO - qlib.timer - [log.py:117] - Time cost: 442.946s | DDB query: Done
|
||||
[2715583:MainThread](2026-02-26 20:05:40,469) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
,
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-12-03 00:00:00')},
|
||||
'module_path': 'qlib.contrib.data.agg_handler'},
|
||||
'load_end': datetime.date(2026, 2, 26),
|
||||
'load_start': Timestamp('2019-12-03 00:00:00'),
|
||||
'market': 'csiallx',
|
||||
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
|
||||
'region': 'cn'}}
|
||||
Query config:
|
||||
#alpha158: 1;
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88b0587d0>
|
||||
[2715583:MainThread](2026-02-26 20:07:46,118) INFO - qlib.timer - [log.py:117] - Time cost: 115.964s | Instruments filter: Done
|
||||
[2715583:MainThread](2026-02-26 20:07:53,273) INFO - qlib.timer - [log.py:117] - Time cost: 576.561s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x70e88b086d80>) Done
|
||||
[2715583:MainThread](2026-02-26 20:07:53,274) INFO - qlib.timer - [log.py:117] - Time cost: 576.562s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x70e88b086d80>) Done
|
||||
[2715583:MainThread](2026-02-26 20:07:53,276) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2715583:MainThread](2026-02-26 20:07:56,700) INFO - qlib.timer - [log.py:117] - Time cost: 3.423s | fetch_df_by_index Done
|
||||
[2715583:MainThread](2026-02-26 20:07:58,185) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
KMID KLEN ... VSUMD30 VSUMD60
|
||||
datetime instrument ...
|
||||
2019-12-03 SH600000 0.004234 0.011008 ... -0.031454 -0.009671
|
||||
SH600004 0.015467 0.031529 ... -0.004401 0.007701
|
||||
SH600006 0.022573 0.033860 ... 0.060561 -0.000159
|
||||
SH600007 0.012129 0.025470 ... 0.008489 -0.054056
|
||||
SH600008 0.006173 0.009259 ... -0.088065 -0.080770
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
|
||||
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
|
||||
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
|
||||
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
|
||||
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
|
||||
|
||||
[6886779 rows x 158 columns]
|
||||
[2715583:MainThread](2026-02-26 20:07:58,186) INFO - qlib.timer - [log.py:117] - Time cost: 4.911s | Fetching dataframe Done
|
||||
[2715583:MainThread](2026-02-26 20:07:58,203) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
|
||||
[2715583:MainThread](2026-02-26 20:08:15,182) INFO - qlib.timer - [log.py:117] - Time cost: 16.990s | DDB query: Done
|
||||
[2715583:MainThread](2026-02-26 20:08:15,974) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2715583:MainThread](2026-02-26 20:08:16,548) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,con_rating_strength from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
|
||||
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
|
||||
[2715583:MainThread](2026-02-26 20:08:27,838) INFO - qlib.timer - [log.py:117] - Time cost: 11.299s | DDB query: Done
|
||||
[2715583:MainThread](2026-02-26 20:08:28,690) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00
|
||||
[2715583:MainThread](2026-02-26 20:09:53,616) INFO - qlib.timer - [log.py:117] - Time cost: 81.815s | Instruments filter: Done
|
||||
[2715583:MainThread](2026-02-26 20:09:54,168) INFO - qlib.timer - [log.py:117] - Time cost: 115.981s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x70ea4ba85f40>) Done
|
||||
[2715583:MainThread](2026-02-26 20:09:54,169) INFO - qlib.timer - [log.py:117] - Time cost: 115.982s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x70ea4ba85f40>) Done
|
||||
[2715583:MainThread](2026-02-26 20:09:54,170) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2715583:MainThread](2026-02-26 20:09:54,893) INFO - qlib.timer - [log.py:117] - Time cost: 0.723s | fetch_df_by_index Done
|
||||
[2715583:MainThread](2026-02-26 20:09:54,901) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
turnover free_turnover log_size con_rating_strength
|
||||
datetime instrument
|
||||
2019-12-03 SH600000 0.0696 0.1275 17.322001 0.6618
|
||||
SH600004 0.6009 1.2276 15.077468 0.8269
|
||||
SH600006 0.5976 1.5087 13.716795 1.0000
|
||||
SH600007 0.0961 0.4969 14.334991 0.7500
|
||||
SH600008 0.0967 0.1793 14.432563 0.6591
|
||||
... ... ... ... ...
|
||||
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
|
||||
SZ301662 12.5950 12.5950 12.681215 NaN
|
||||
SZ301665 14.0077 14.0077 11.719415 NaN
|
||||
SZ301678 6.6518 6.6518 12.799973 NaN
|
||||
SZ302132 1.3868 3.0296 15.359885 NaN
|
||||
|
||||
[7601552 rows x 4 columns]
|
||||
[2715583:MainThread](2026-02-26 20:09:54,902) INFO - qlib.timer - [log.py:117] - Time cost: 0.732s | Fetching dataframe Done
|
||||
[2715583:MainThread](2026-02-26 20:09:54,917) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
|
||||
[2715583:MainThread](2026-02-26 20:10:15,465) INFO - qlib.timer - [log.py:117] - Time cost: 20.556s | DDB query: Done
|
||||
[2715583:MainThread](2026-02-26 20:10:16,265) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2715583:MainThread](2026-02-26 20:10:16,775) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
|
||||
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
|
||||
[2715583:MainThread](2026-02-26 20:10:36,740) INFO - qlib.timer - [log.py:117] - Time cost: 19.975s | DDB query: Done
|
||||
[2715583:MainThread](2026-02-26 20:10:37,558) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2715583:MainThread](2026-02-26 20:12:04,978) INFO - qlib.timer - [log.py:117] - Time cost: 84.148s | Instruments filter: Done
|
||||
[2715583:MainThread](2026-02-26 20:12:05,899) INFO - qlib.timer - [log.py:117] - Time cost: 130.996s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x70e88d06acc0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:12:05,900) INFO - qlib.timer - [log.py:117] - Time cost: 130.997s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x70e88d06acc0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:12:05,902) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2715583:MainThread](2026-02-26 20:12:06,745) INFO - qlib.timer - [log.py:117] - Time cost: 0.842s | fetch_df_by_index Done
|
||||
[2715583:MainThread](2026-02-26 20:12:06,758) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
IsZt IsDt IsN ... open_stop close_stop high_stop
|
||||
datetime instrument ...
|
||||
2019-12-03 SH600000 False False False ... False False False
|
||||
SH600004 False False False ... False False False
|
||||
SH600006 False False False ... False False False
|
||||
SH600007 False False False ... False False False
|
||||
SH600008 False False False ... False False False
|
||||
... ... ... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False False ... False False False
|
||||
SZ301662 False False False ... False False False
|
||||
SZ301665 False False False ... False False False
|
||||
SZ301678 False False False ... False False False
|
||||
SZ302132 False False False ... False False False
|
||||
|
||||
[6903684 rows x 12 columns]
|
||||
[2715583:MainThread](2026-02-26 20:12:06,759) INFO - qlib.timer - [log.py:117] - Time cost: 0.857s | Fetching dataframe Done
|
||||
[2715583:MainThread](2026-02-26 20:12:06,777) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
|
||||
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
|
||||
[2715583:MainThread](2026-02-26 20:12:08,840) INFO - qlib.timer - [log.py:117] - Time cost: 2.073s | DDB query: Done
|
||||
[2715583:MainThread](2026-02-26 20:12:08,849) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2715583:MainThread](2026-02-26 20:13:26,572) INFO - qlib.timer - [log.py:117] - Time cost: 77.719s | Instruments filter: Done
|
||||
[2715583:MainThread](2026-02-26 20:13:26,601) INFO - qlib.timer - [log.py:117] - Time cost: 79.839s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x70e67060acc0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:13:26,602) INFO - qlib.timer - [log.py:117] - Time cost: 79.840s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x70e67060acc0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:13:26,603) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2715583:MainThread](2026-02-26 20:13:26,612) INFO - qlib.timer - [log.py:117] - Time cost: 0.008s | fetch_df_by_index Done
|
||||
[2715583:MainThread](2026-02-26 20:13:26,633) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
|
||||
datetime instrument ...
|
||||
2026-02-09 SH600000 False False ... False False
|
||||
SH600004 False False ... False False
|
||||
SH600006 False False ... False False
|
||||
SH600007 False False ... False False
|
||||
SH600008 False False ... False False
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False ... False False
|
||||
SZ301662 False False ... False False
|
||||
SZ301665 False False ... False False
|
||||
SZ301678 False False ... False False
|
||||
SZ302132 False False ... False False
|
||||
|
||||
[41168 rows x 30 columns]
|
||||
[2715583:MainThread](2026-02-26 20:13:26,634) INFO - qlib.timer - [log.py:117] - Time cost: 0.031s | Fetching dataframe Done
|
||||
[2715583:MainThread](2026-02-26 20:13:26,652) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
|
||||
where m_nDate>=2019.12.03 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
|
||||
[2715583:MainThread](2026-02-26 20:13:55,744) INFO - qlib.timer - [log.py:117] - Time cost: 29.102s | DDB query: Done
|
||||
[2715583:MainThread](2026-02-26 20:13:56,520) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2715583:MainThread](2026-02-26 20:15:27,625) INFO - qlib.timer - [log.py:117] - Time cost: 90.586s | Instruments filter: Done
|
||||
[2715583:MainThread](2026-02-26 20:15:28,257) INFO - qlib.timer - [log.py:117] - Time cost: 121.621s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x70e88c9710d0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:15:28,257) INFO - qlib.timer - [log.py:117] - Time cost: 121.622s | Init data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x70e88c9710d0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:15:28,258) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2715583:MainThread](2026-02-26 20:15:28,867) INFO - qlib.timer - [log.py:117] - Time cost: 0.608s | fetch_df_by_index Done
|
||||
[2715583:MainThread](2026-02-26 20:15:28,875) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-03 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
ST_Y ST_S ST_T ST_L ST_Z ST_X
|
||||
datetime instrument
|
||||
2019-12-03 SH600000 False False False False False False
|
||||
SH600004 False False False False False False
|
||||
SH600006 False False False False False False
|
||||
SH600007 False False False False False False
|
||||
SH600008 False False False False False False
|
||||
... ... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False False False False False
|
||||
SZ301662 False False False False False False
|
||||
SZ301665 False False False False False False
|
||||
SZ301678 False False False False False False
|
||||
SZ302132 False False False False False False
|
||||
|
||||
[6903687 rows x 6 columns]
|
||||
[2715583:MainThread](2026-02-26 20:15:28,876) INFO - qlib.timer - [log.py:117] - Time cost: 0.617s | Fetching dataframe Done
|
||||
/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
|
||||
group_list = [_df.resample("M", level="datetime")\
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88bd2a000>
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88bd2a000>
|
||||
Query config:
|
||||
#concepts: 2;
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88c9cf6e0>
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e88c9cf6e0>
|
||||
Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70']
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e6706082f0>
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x70e65fdafd40>
|
||||
[2715583:MainThread](2026-02-26 20:15:32,735) INFO - qlib.timer - [log.py:117] - Time cost: 3.858s | Concat index: Done
|
||||
[2715583:MainThread](2026-02-26 20:15:32,737) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done
|
||||
[2715583:MainThread](2026-02-26 20:15:36,349) INFO - qlib.timer - [log.py:117] - Time cost: 3.611s | Creating SepDataFrame: Done
|
||||
[2715583:MainThread](2026-02-26 20:15:37,245) INFO - qlib.timer - [log.py:117] - Time cost: 1040.537s | Loading data (<qlib.contrib.data.agg_handler.AggHandler object at 0x70e88b12b3e0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:15:37,246) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2715583:MainThread](2026-02-26 20:15:37,248) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2715583:MainThread](2026-02-26 20:15:37,265) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2715583:MainThread](2026-02-26 20:15:37,266) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2715583:MainThread](2026-02-26 20:15:37,277) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2715583:MainThread](2026-02-26 20:15:37,277) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2715583:MainThread](2026-02-26 20:15:37,293) INFO - qlib.timer - [log.py:117] - Time cost: 0.047s | fit & process data Done
|
||||
[2715583:MainThread](2026-02-26 20:15:37,294) INFO - qlib.timer - [log.py:117] - Time cost: 1040.587s | Init data (<qlib.contrib.data.agg_handler.AggHandler object at 0x70e88b12b3e0>) Done
|
||||
[2715583:MainThread](2026-02-26 20:15:37,963) INFO - qlib.DataHandlerLP - [handler.py:487] - Will apply processor <qlib.contrib.data.processor.common.Diff object at 0x70e88bf4af30>
|
||||
[2715583:MainThread](2026-02-26 20:15:40,135) INFO - qlib.timer - [log.py:117] - Time cost: 2.171s | Diff Done
|
||||
[2715583:MainThread](2026-02-26 20:15:40,136) INFO - qlib.DataHandlerLP - [handler.py:487] - Will apply processor <qlib.contrib.data.processor.flag.FlagMarketInjector object at 0x70e88cd8fd40>
|
||||
All processors are readonly
|
||||
All processors are readonly
|
||||
All processors are readonly
|
||||
Did load data from config: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/handler.yaml
|
||||
Did load norm from: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc
|
||||
Will assign `feature_ext` with
|
||||
turnover ... con_rating_strength_diff
|
||||
datetime instrument ...
|
||||
2026-02-09 SH600000 0.1837 ... 0.0
|
||||
SH600004 0.6948 ... 0.0
|
||||
SH600006 0.5542 ... 0.0
|
||||
SH600007 0.2057 ... 0.0
|
||||
SH600008 0.9809 ... 0.0
|
||||
... ... ... ...
|
||||
2026-02-26 SZ301658 6.0785 ... 0.0
|
||||
SZ301662 12.5950 ... 0.0
|
||||
SZ301665 14.0077 ... 0.0
|
||||
SZ301678 6.6518 ... 0.0
|
||||
SZ302132 1.3868 ... 0.0
|
||||
|
||||
[41085 rows x 8 columns]
|
||||
---
|
||||
ERROR: Failed to load data from Qlib pipeline: Cannot convert non-finite values (NA or inf) to integer
|
||||
@ -0,0 +1,373 @@
|
||||
[2730312:MainThread](2026-02-26 21:28:33,675) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
|
||||
[2730312:MainThread](2026-02-26 21:28:33,679) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
|
||||
[2730312:MainThread](2026-02-26 21:28:33,680) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
|
||||
================================================================================
|
||||
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
|
||||
================================================================================
|
||||
Date Range: 2020-01-02 to 2020-01-10
|
||||
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
|
||||
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
|
||||
|
||||
Step 1: Loading data from Qlib pipeline...
|
||||
Loading since_date=2020-01-02
|
||||
Loading raw data from handler.yaml...
|
||||
Will use `placehorder_value` from module: qlib.contrib.data.config
|
||||
Will init handler object from config:
|
||||
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-12-13 00:00:00')},
|
||||
'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'handler': {'class': 'AggHandler',
|
||||
'kwargs': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': [2730312:MainThread](2026-02-26 21:28:33,704) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
|
||||
[2730312:MainThread](2026-02-26 21:28:33,704) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
|
||||
[2730312:MainThread](2026-02-26 21:28:34,011) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
|
||||
[2730312:MainThread](2026-02-26 21:36:00,317) INFO - qlib.timer - [log.py:117] - Time cost: 446.602s | DDB query: Done
|
||||
[2730312:MainThread](2026-02-26 21:36:01,106) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-12-13 00:00:00')},
|
||||
'module_path': 'qlib.contrib.data.agg_handler'},
|
||||
'load_end': datetime.date(2026, 2, 26),
|
||||
'load_start': Timestamp('2019-12-13 00:00:00'),
|
||||
'market': 'csiallx',
|
||||
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
|
||||
'region': 'cn'}}
|
||||
Query config:
|
||||
#alpha158: 1;
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761f37e75a60>
|
||||
[2730312:MainThread](2026-02-26 21:38:13,636) INFO - qlib.timer - [log.py:117] - Time cost: 123.423s | Instruments filter: Done
|
||||
[2730312:MainThread](2026-02-26 21:38:20,733) INFO - qlib.timer - [log.py:117] - Time cost: 587.024s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x761f366b3bc0>) Done
|
||||
[2730312:MainThread](2026-02-26 21:38:20,734) INFO - qlib.timer - [log.py:117] - Time cost: 587.026s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x761f366b3bc0>) Done
|
||||
[2730312:MainThread](2026-02-26 21:38:20,736) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2730312:MainThread](2026-02-26 21:38:24,302) INFO - qlib.timer - [log.py:117] - Time cost: 3.564s | fetch_df_by_index Done
|
||||
[2730312:MainThread](2026-02-26 21:38:25,946) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
KMID KLEN ... VSUMD30 VSUMD60
|
||||
datetime instrument ...
|
||||
2019-12-13 SH600000 0.011686 0.015025 ... -0.011573 0.039735
|
||||
SH600004 0.000000 0.009169 ... -0.146051 0.024757
|
||||
SH600006 -0.004329 0.015152 ... 0.136883 0.024626
|
||||
SH600007 0.005590 0.019005 ... -0.012912 0.017215
|
||||
SH600008 0.012270 0.012270 ... 0.039878 -0.013888
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
|
||||
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
|
||||
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
|
||||
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
|
||||
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
|
||||
|
||||
[6858048 rows x 158 columns]
|
||||
[2730312:MainThread](2026-02-26 21:38:25,947) INFO - qlib.timer - [log.py:117] - Time cost: 5.212s | Fetching dataframe Done
|
||||
[2730312:MainThread](2026-02-26 21:38:25,965) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
|
||||
[2730312:MainThread](2026-02-26 21:38:43,081) INFO - qlib.timer - [log.py:117] - Time cost: 17.127s | DDB query: Done
|
||||
[2730312:MainThread](2026-02-26 21:38:43,874) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2730312:MainThread](2026-02-26 21:38:44,458) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,con_rating_strength from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
|
||||
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
|
||||
[2730312:MainThread](2026-02-26 21:38:55,720) INFO - qlib.timer - [log.py:117] - Time cost: 11.271s | DDB query: Done
|
||||
[2730312:MainThread](2026-02-26 21:38:56,586) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00
|
||||
[2730312:MainThread](2026-02-26 21:40:21,007) INFO - qlib.timer - [log.py:117] - Time cost: 81.315s | Instruments filter: Done
|
||||
[2730312:MainThread](2026-02-26 21:40:21,576) INFO - qlib.timer - [log.py:117] - Time cost: 115.627s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7620fb822930>) Done
|
||||
[2730312:MainThread](2026-02-26 21:40:21,576) INFO - qlib.timer - [log.py:117] - Time cost: 115.628s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7620fb822930>) Done
|
||||
[2730312:MainThread](2026-02-26 21:40:21,577) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2730312:MainThread](2026-02-26 21:40:22,309) INFO - qlib.timer - [log.py:117] - Time cost: 0.731s | fetch_df_by_index Done
|
||||
[2730312:MainThread](2026-02-26 21:40:22,317) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
turnover free_turnover log_size con_rating_strength
|
||||
datetime instrument
|
||||
2019-12-13 SH600000 0.2118 0.3879 17.343685 0.7143
|
||||
SH600004 0.7518 1.5357 15.099485 0.8214
|
||||
SH600006 0.7827 1.9762 13.732129 1.0000
|
||||
SH600007 0.1368 0.7071 14.409998 0.7500
|
||||
SH600008 0.2152 0.3990 14.444757 0.7500
|
||||
... ... ... ... ...
|
||||
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
|
||||
SZ301662 12.5950 12.5950 12.681215 NaN
|
||||
SZ301665 14.0077 14.0077 11.719415 NaN
|
||||
SZ301678 6.6518 6.6518 12.799973 NaN
|
||||
SZ302132 1.3868 3.0296 15.359885 NaN
|
||||
|
||||
[7572626 rows x 4 columns]
|
||||
[2730312:MainThread](2026-02-26 21:40:22,318) INFO - qlib.timer - [log.py:117] - Time cost: 0.741s | Fetching dataframe Done
|
||||
[2730312:MainThread](2026-02-26 21:40:22,334) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
|
||||
[2730312:MainThread](2026-02-26 21:40:43,075) INFO - qlib.timer - [log.py:117] - Time cost: 20.751s | DDB query: Done
|
||||
[2730312:MainThread](2026-02-26 21:40:43,889) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2730312:MainThread](2026-02-26 21:40:44,394) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
|
||||
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
|
||||
[2730312:MainThread](2026-02-26 21:41:04,632) INFO - qlib.timer - [log.py:117] - Time cost: 20.246s | DDB query: Done
|
||||
[2730312:MainThread](2026-02-26 21:41:05,434) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2730312:MainThread](2026-02-26 21:42:33,029) INFO - qlib.timer - [log.py:117] - Time cost: 84.294s | Instruments filter: Done
|
||||
[2730312:MainThread](2026-02-26 21:42:34,049) INFO - qlib.timer - [log.py:117] - Time cost: 131.730s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x761f37e74470>) Done
|
||||
[2730312:MainThread](2026-02-26 21:42:34,050) INFO - qlib.timer - [log.py:117] - Time cost: 131.731s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x761f37e74470>) Done
|
||||
[2730312:MainThread](2026-02-26 21:42:34,051) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2730312:MainThread](2026-02-26 21:42:34,895) INFO - qlib.timer - [log.py:117] - Time cost: 0.843s | fetch_df_by_index Done
|
||||
[2730312:MainThread](2026-02-26 21:42:34,907) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
IsZt IsDt IsN ... open_stop close_stop high_stop
|
||||
datetime instrument ...
|
||||
2019-12-13 SH600000 False False False ... False False False
|
||||
SH600004 False False False ... False False False
|
||||
SH600006 False False False ... False False False
|
||||
SH600007 False False False ... False False False
|
||||
SH600008 False False False ... False False False
|
||||
... ... ... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False False ... False False False
|
||||
SZ301662 False False False ... False False False
|
||||
SZ301665 False False False ... False False False
|
||||
SZ301678 False False False ... False False False
|
||||
SZ302132 False False False ... False False False
|
||||
|
||||
[6874830 rows x 12 columns]
|
||||
[2730312:MainThread](2026-02-26 21:42:34,908) INFO - qlib.timer - [log.py:117] - Time cost: 0.857s | Fetching dataframe Done
|
||||
[2730312:MainThread](2026-02-26 21:42:34,927) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
|
||||
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
|
||||
[2730312:MainThread](2026-02-26 21:42:36,986) INFO - qlib.timer - [log.py:117] - Time cost: 2.069s | DDB query: Done
|
||||
[2730312:MainThread](2026-02-26 21:42:36,996) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2730312:MainThread](2026-02-26 21:43:53,198) INFO - qlib.timer - [log.py:117] - Time cost: 76.199s | Instruments filter: Done
|
||||
[2730312:MainThread](2026-02-26 21:43:53,230) INFO - qlib.timer - [log.py:117] - Time cost: 78.318s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x761f381e1c40>) Done
|
||||
[2730312:MainThread](2026-02-26 21:43:53,231) INFO - qlib.timer - [log.py:117] - Time cost: 78.319s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x761f381e1c40>) Done
|
||||
[2730312:MainThread](2026-02-26 21:43:53,231) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2730312:MainThread](2026-02-26 21:43:53,239) INFO - qlib.timer - [log.py:117] - Time cost: 0.007s | fetch_df_by_index Done
|
||||
[2730312:MainThread](2026-02-26 21:43:53,257) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
|
||||
datetime instrument ...
|
||||
2026-02-09 SH600000 False False ... False False
|
||||
SH600004 False False ... False False
|
||||
SH600006 False False ... False False
|
||||
SH600007 False False ... False False
|
||||
SH600008 False False ... False False
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False ... False False
|
||||
SZ301662 False False ... False False
|
||||
SZ301665 False False ... False False
|
||||
SZ301678 False False ... False False
|
||||
SZ302132 False False ... False False
|
||||
|
||||
[41168 rows x 30 columns]
|
||||
[2730312:MainThread](2026-02-26 21:43:53,258) INFO - qlib.timer - [log.py:117] - Time cost: 0.027s | Fetching dataframe Done
|
||||
[2730312:MainThread](2026-02-26 21:43:53,274) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
|
||||
where m_nDate>=2019.12.13 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
|
||||
[2730312:MainThread](2026-02-26 21:44:44,876) INFO - qlib.timer - [log.py:117] - Time cost: 51.611s | DDB query: Done
|
||||
[2730312:MainThread](2026-02-26 21:44:45,602) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2730312:MainThread](2026-02-26 21:46:07,184) INFO - qlib.timer - [log.py:117] - Time cost: 81.056s | Instruments filter: Done
|
||||
[2730312:MainThread](2026-02-26 21:46:07,747) INFO - qlib.timer - [log.py:117] - Time cost: 134.487s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x761f387b3080>) Done
|
||||
[2730312:MainThread](2026-02-26 21:46:07,748) INFO - qlib.timer - [log.py:117] - Time cost: 134.488s | Init data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x761f387b3080>) Done
|
||||
[2730312:MainThread](2026-02-26 21:46:07,748) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2730312:MainThread](2026-02-26 21:46:08,349) INFO - qlib.timer - [log.py:117] - Time cost: 0.600s | fetch_df_by_index Done
|
||||
[2730312:MainThread](2026-02-26 21:46:08,358) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-12-13 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
ST_Y ST_S ST_T ST_L ST_Z ST_X
|
||||
datetime instrument
|
||||
2019-12-13 SH600000 False False False False False False
|
||||
SH600004 False False False False False False
|
||||
SH600006 False False False False False False
|
||||
SH600007 False False False False False False
|
||||
SH600008 False False False False False False
|
||||
... ... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False False False False False
|
||||
SZ301662 False False False False False False
|
||||
SZ301665 False False False False False False
|
||||
SZ301678 False False False False False False
|
||||
SZ302132 False False False False False False
|
||||
|
||||
[6874833 rows x 6 columns]
|
||||
[2730312:MainThread](2026-02-26 21:46:08,359) INFO - qlib.timer - [log.py:117] - Time cost: 0.610s | Fetching dataframe Done
|
||||
/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
|
||||
group_list = [_df.resample("M", level="datetime")\
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x76203dfa91f0>
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x76203dfa91f0>
|
||||
Query config:
|
||||
#concepts: 2;
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761f346e7aa0>
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761f346e7aa0>
|
||||
Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70']
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761cc3995760>
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x761a968d1d00>
|
||||
[2730312:MainThread](2026-02-26 21:46:11,623) INFO - qlib.timer - [log.py:117] - Time cost: 3.264s | Concat index: Done
|
||||
[2730312:MainThread](2026-02-26 21:46:11,625) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done
|
||||
[2730312:MainThread](2026-02-26 21:46:15,058) INFO - qlib.timer - [log.py:117] - Time cost: 3.433s | Creating SepDataFrame: Done
|
||||
[2730312:MainThread](2026-02-26 21:46:15,928) INFO - qlib.timer - [log.py:117] - Time cost: 1062.224s | Loading data (<qlib.contrib.data.agg_handler.AggHandler object at 0x761f36612720>) Done
|
||||
[2730312:MainThread](2026-02-26 21:46:15,929) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2730312:MainThread](2026-02-26 21:46:15,931) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2730312:MainThread](2026-02-26 21:46:15,935) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2730312:MainThread](2026-02-26 21:46:15,936) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2730312:MainThread](2026-02-26 21:46:15,939) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2730312:MainThread](2026-02-26 21:46:15,940) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2730312:MainThread](2026-02-26 21:46:15,943) INFO - qlib.timer - [log.py:117] - Time cost: 0.014s | fit & process data Done
|
||||
[2730312:MainThread](2026-02-26 21:46:15,943) INFO - qlib.timer - [log.py:117] - Time cost: 1062.239s | Init data (<qlib.contrib.data.agg_handler.AggHandler object at 0x761f36612720>) Done
|
||||
All processors are readonly
|
||||
All processors are readonly
|
||||
All processors are readonly
|
||||
ERROR: Failed to load data from Qlib pipeline: 'SepDataFrame' object has no attribute 'shape'
|
||||
@ -0,0 +1,373 @@
|
||||
[2734404:MainThread](2026-02-26 22:10:11,609) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
|
||||
[2734404:MainThread](2026-02-26 22:10:11,613) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
|
||||
[2734404:MainThread](2026-02-26 22:10:11,613) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
|
||||
================================================================================
|
||||
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
|
||||
================================================================================
|
||||
Date Range: 2020-01-02 to 2020-01-10
|
||||
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
|
||||
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
|
||||
|
||||
Step 1: Loading data from Qlib pipeline...
|
||||
Loading since_date=2020-01-02
|
||||
Loading data with handler (load_start=2019-12-13)...
|
||||
Will use `placehorder_value` from module: qlib.contrib.data.config
|
||||
Will init handler object from config:
|
||||
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-11-23 00:00:00')},
|
||||
'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'handler': {'class': 'AggHandler',
|
||||
'kwargs': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
[2734404:MainThread](2026-02-26 22:10:11,634) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
|
||||
[2734404:MainThread](2026-02-26 22:10:11,634) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
|
||||
[2734404:MainThread](2026-02-26 22:10:11,842) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
|
||||
[2734404:MainThread](2026-02-26 22:17:41,432) INFO - qlib.timer - [log.py:117] - Time cost: 449.788s | DDB query: Done
|
||||
[2734404:MainThread](2026-02-26 22:17:42,271) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-11-23 00:00:00')},
|
||||
'module_path': 'qlib.contrib.data.agg_handler'},
|
||||
'load_end': datetime.date(2026, 2, 26),
|
||||
'load_start': Timestamp('2019-11-23 00:00:00'),
|
||||
'market': 'csiallx',
|
||||
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
|
||||
'region': 'cn'}}
|
||||
Query config:
|
||||
#alpha158: 1;
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e04773e0>
|
||||
[2734404:MainThread](2026-02-26 22:19:46,550) INFO - qlib.timer - [log.py:117] - Time cost: 115.118s | Instruments filter: Done
|
||||
[2734404:MainThread](2026-02-26 22:19:53,556) INFO - qlib.timer - [log.py:117] - Time cost: 581.918s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x71c5e0475d60>) Done
|
||||
[2734404:MainThread](2026-02-26 22:19:53,557) INFO - qlib.timer - [log.py:117] - Time cost: 581.920s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x71c5e0475d60>) Done
|
||||
[2734404:MainThread](2026-02-26 22:19:53,560) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2734404:MainThread](2026-02-26 22:19:57,060) INFO - qlib.timer - [log.py:117] - Time cost: 3.499s | fetch_df_by_index Done
|
||||
[2734404:MainThread](2026-02-26 22:19:58,834) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
KMID KLEN ... VSUMD30 VSUMD60
|
||||
datetime instrument ...
|
||||
2019-11-25 SH600000 0.003325 0.011638 ... -0.238055 -0.010125
|
||||
SH600004 -0.013806 0.030012 ... -0.017610 0.039195
|
||||
SH600006 0.009238 0.016166 ... -0.034782 -0.014306
|
||||
SH600007 -0.014749 0.018879 ... -0.032427 0.034279
|
||||
SH600008 0.009259 0.024691 ... -0.063490 0.003978
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
|
||||
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
|
||||
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
|
||||
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
|
||||
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
|
||||
|
||||
[6908346 rows x 158 columns]
|
||||
[2734404:MainThread](2026-02-26 22:19:58,835) INFO - qlib.timer - [log.py:117] - Time cost: 5.276s | Fetching dataframe Done
|
||||
[2734404:MainThread](2026-02-26 22:19:59,042) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
|
||||
[2734404:MainThread](2026-02-26 22:20:16,326) INFO - qlib.timer - [log.py:117] - Time cost: 17.485s | DDB query: Done
|
||||
[2734404:MainThread](2026-02-26 22:20:17,102) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2734404:MainThread](2026-02-26 22:20:17,676) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,con_rating_strength from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
|
||||
[2734404:MainThread](2026-02-26 22:20:29,343) INFO - qlib.timer - [log.py:117] - Time cost: 11.676s | DDB query: Done
|
||||
[2734404:MainThread](2026-02-26 22:20:30,245) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-25 00:00:00
|
||||
[2734404:MainThread](2026-02-26 22:21:55,033) INFO - qlib.timer - [log.py:117] - Time cost: 81.592s | Instruments filter: Done
|
||||
[2734404:MainThread](2026-02-26 22:21:55,586) INFO - qlib.timer - [log.py:117] - Time cost: 116.751s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x71c7a492c4d0>) Done
|
||||
[2734404:MainThread](2026-02-26 22:21:55,587) INFO - qlib.timer - [log.py:117] - Time cost: 116.752s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x71c7a492c4d0>) Done
|
||||
[2734404:MainThread](2026-02-26 22:21:55,588) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2734404:MainThread](2026-02-26 22:21:56,302) INFO - qlib.timer - [log.py:117] - Time cost: 0.713s | fetch_df_by_index Done
|
||||
[2734404:MainThread](2026-02-26 22:21:56,309) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
turnover free_turnover log_size con_rating_strength
|
||||
datetime instrument
|
||||
2019-11-25 SH600000 0.0895 0.1639 17.339552 0.8214
|
||||
SH600004 0.9386 1.9173 15.039255 0.8125
|
||||
SH600006 0.2566 0.6479 13.680836 1.0000
|
||||
SH600007 0.1647 0.8513 14.335590 0.7500
|
||||
SH600008 0.1813 0.3362 14.435625 0.6875
|
||||
... ... ... ... ...
|
||||
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
|
||||
SZ301662 12.5950 12.5950 12.681215 NaN
|
||||
SZ301665 14.0077 14.0077 11.719415 NaN
|
||||
SZ301678 6.6518 6.6518 12.799973 NaN
|
||||
SZ302132 1.3868 3.0296 15.359885 NaN
|
||||
|
||||
[7623242 rows x 4 columns]
|
||||
[2734404:MainThread](2026-02-26 22:21:56,310) INFO - qlib.timer - [log.py:117] - Time cost: 0.722s | Fetching dataframe Done
|
||||
[2734404:MainThread](2026-02-26 22:21:56,327) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
|
||||
[2734404:MainThread](2026-02-26 22:22:17,215) INFO - qlib.timer - [log.py:117] - Time cost: 20.899s | DDB query: Done
|
||||
[2734404:MainThread](2026-02-26 22:22:17,952) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2734404:MainThread](2026-02-26 22:22:18,463) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
|
||||
[2734404:MainThread](2026-02-26 22:22:38,963) INFO - qlib.timer - [log.py:117] - Time cost: 20.509s | DDB query: Done
|
||||
[2734404:MainThread](2026-02-26 22:22:39,774) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2734404:MainThread](2026-02-26 22:24:07,744) INFO - qlib.timer - [log.py:117] - Time cost: 84.654s | Instruments filter: Done
|
||||
[2734404:MainThread](2026-02-26 22:24:08,702) INFO - qlib.timer - [log.py:117] - Time cost: 132.391s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x71c5e0847cb0>) Done
|
||||
[2734404:MainThread](2026-02-26 22:24:08,703) INFO - qlib.timer - [log.py:117] - Time cost: 132.392s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x71c5e0847cb0>) Done
|
||||
[2734404:MainThread](2026-02-26 22:24:08,704) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2734404:MainThread](2026-02-26 22:24:09,549) INFO - qlib.timer - [log.py:117] - Time cost: 0.844s | fetch_df_by_index Done
|
||||
[2734404:MainThread](2026-02-26 22:24:09,561) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
IsZt IsDt IsN ... open_stop close_stop high_stop
|
||||
datetime instrument ...
|
||||
2019-11-25 SH600000 False False False ... False False False
|
||||
SH600004 False False False ... False False False
|
||||
SH600006 False False False ... False False False
|
||||
SH600007 False False False ... False False False
|
||||
SH600008 False False False ... False False False
|
||||
... ... ... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False False ... False False False
|
||||
SZ301662 False False False ... False False False
|
||||
SZ301665 False False False ... False False False
|
||||
SZ301678 False False False ... False False False
|
||||
SZ302132 False False False ... False False False
|
||||
|
||||
[6925320 rows x 12 columns]
|
||||
[2734404:MainThread](2026-02-26 22:24:09,562) INFO - qlib.timer - [log.py:117] - Time cost: 0.858s | Fetching dataframe Done
|
||||
[2734404:MainThread](2026-02-26 22:24:09,760) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
|
||||
[2734404:MainThread](2026-02-26 22:24:11,809) INFO - qlib.timer - [log.py:117] - Time cost: 2.238s | DDB query: Done
|
||||
[2734404:MainThread](2026-02-26 22:24:11,822) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2734404:MainThread](2026-02-26 22:25:28,259) INFO - qlib.timer - [log.py:117] - Time cost: 76.433s | Instruments filter: Done
|
||||
[2734404:MainThread](2026-02-26 22:25:28,285) INFO - qlib.timer - [log.py:117] - Time cost: 78.720s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x71c139b2af60>) Done
|
||||
[2734404:MainThread](2026-02-26 22:25:28,285) INFO - qlib.timer - [log.py:117] - Time cost: 78.720s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x71c139b2af60>) Done
|
||||
[2734404:MainThread](2026-02-26 22:25:28,286) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2734404:MainThread](2026-02-26 22:25:28,290) INFO - qlib.timer - [log.py:117] - Time cost: 0.003s | fetch_df_by_index Done
|
||||
[2734404:MainThread](2026-02-26 22:25:28,310) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
|
||||
datetime instrument ...
|
||||
2026-02-09 SH600000 False False ... False False
|
||||
SH600004 False False ... False False
|
||||
SH600006 False False ... False False
|
||||
SH600007 False False ... False False
|
||||
SH600008 False False ... False False
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False ... False False
|
||||
SZ301662 False False ... False False
|
||||
SZ301665 False False ... False False
|
||||
SZ301678 False False ... False False
|
||||
SZ302132 False False ... False False
|
||||
|
||||
[41168 rows x 30 columns]
|
||||
[2734404:MainThread](2026-02-26 22:25:28,311) INFO - qlib.timer - [log.py:117] - Time cost: 0.025s | Fetching dataframe Done
|
||||
[2734404:MainThread](2026-02-26 22:25:28,470) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
|
||||
[2734404:MainThread](2026-02-26 22:25:58,108) INFO - qlib.timer - [log.py:117] - Time cost: 29.791s | DDB query: Done
|
||||
[2734404:MainThread](2026-02-26 22:25:58,818) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2734404:MainThread](2026-02-26 22:27:21,291) INFO - qlib.timer - [log.py:117] - Time cost: 81.957s | Instruments filter: Done
|
||||
[2734404:MainThread](2026-02-26 22:27:21,828) INFO - qlib.timer - [log.py:117] - Time cost: 113.516s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x71c5e0981ca0>) Done
|
||||
[2734404:MainThread](2026-02-26 22:27:21,829) INFO - qlib.timer - [log.py:117] - Time cost: 113.517s | Init data (<qlib.contrib.data.ddb_handlers.ddb_st_flag_handler.DDBStFlagHandler object at 0x71c5e0981ca0>) Done
|
||||
[2734404:MainThread](2026-02-26 22:27:21,830) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2734404:MainThread](2026-02-26 22:27:22,439) INFO - qlib.timer - [log.py:117] - Time cost: 0.608s | fetch_df_by_index Done
|
||||
[2734404:MainThread](2026-02-26 22:27:22,448) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
ST_Y ST_S ST_T ST_L ST_Z ST_X
|
||||
datetime instrument
|
||||
2019-11-25 SH600000 False False False False False False
|
||||
SH600004 False False False False False False
|
||||
SH600006 False False False False False False
|
||||
SH600007 False False False False False False
|
||||
SH600008 False False False False False False
|
||||
... ... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False False False False False
|
||||
SZ301662 False False False False False False
|
||||
SZ301665 False False False False False False
|
||||
SZ301678 False False False False False False
|
||||
SZ302132 False False False False False False
|
||||
|
||||
[6925323 rows x 6 columns]
|
||||
[2734404:MainThread](2026-02-26 22:27:22,448) INFO - qlib.timer - [log.py:117] - Time cost: 0.618s | Fetching dataframe Done
|
||||
/home/guofu/.venv/alpha2/lib/python3.12/site-packages/qlib/contrib/utils/paral.py:22: FutureWarning: 'M' is deprecated and will be removed in a future version, please use 'ME' instead.
|
||||
group_list = [_df.resample("M", level="datetime")\
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e4d5d940>
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e4d5d940>
|
||||
Query config:
|
||||
#concepts: 2;
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c13b9b8b60>
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c13b9b8b60>
|
||||
Field list: ['gds_CC10', 'gds_CC11', 'gds_CC12', 'gds_CC20', 'gds_CC21', 'gds_CC22', 'gds_CC23', 'gds_CC24', 'gds_CC25', 'gds_CC26', 'gds_CC27', 'gds_CC28', 'gds_CC30', 'gds_CC31', 'gds_CC32', 'gds_CC33', 'gds_CC34', 'gds_CC35', 'gds_CC36', 'gds_CC37', 'gds_CC40', 'gds_CC41', 'gds_CC42', 'gds_CC43', 'gds_CC50', 'gds_CC60', 'gds_CC61', 'gds_CC62', 'gds_CC63', 'gds_CC70']
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c139b28aa0>
|
||||
Will use bool for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71c5e07e8f20>
|
||||
[2734404:MainThread](2026-02-26 22:27:25,764) INFO - qlib.timer - [log.py:117] - Time cost: 3.315s | Concat index: Done
|
||||
[2734404:MainThread](2026-02-26 22:27:25,766) INFO - qlib.timer - [log.py:117] - Time cost: 0.001s | Sort index: Done
|
||||
[2734404:MainThread](2026-02-26 22:27:29,485) INFO - qlib.timer - [log.py:117] - Time cost: 3.718s | Creating SepDataFrame: Done
|
||||
[2734404:MainThread](2026-02-26 22:27:30,310) INFO - qlib.timer - [log.py:117] - Time cost: 1038.675s | Loading data (<qlib.contrib.data.agg_handler.AggHandler object at 0x71c5e17ec230>) Done
|
||||
[2734404:MainThread](2026-02-26 22:27:30,311) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2734404:MainThread](2026-02-26 22:27:30,313) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2734404:MainThread](2026-02-26 22:27:30,318) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2734404:MainThread](2026-02-26 22:27:30,319) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2734404:MainThread](2026-02-26 22:27:30,322) INFO - qlib.AggHandler - [handler.py:468] - Read-only True
|
||||
[]
|
||||
[2734404:MainThread](2026-02-26 22:27:30,323) INFO - qlib.AggHandler - [handler.py:476] - Will copy all groups of data-frame.
|
||||
[2734404:MainThread](2026-02-26 22:27:30,326) INFO - qlib.timer - [log.py:117] - Time cost: 0.015s | fit & process data Done
|
||||
[2734404:MainThread](2026-02-26 22:27:30,327) INFO - qlib.timer - [log.py:117] - Time cost: 1038.692s | Init data (<qlib.contrib.data.agg_handler.AggHandler object at 0x71c5e17ec230>) Done
|
||||
All processors are readonly
|
||||
All processors are readonly
|
||||
All processors are readonly
|
||||
ERROR: Failed to load data from Qlib pipeline: 'SepDataFrame' object has no attribute 'shape'
|
||||
@ -0,0 +1,321 @@
|
||||
[2739486:MainThread](2026-02-26 22:59:30,849) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
|
||||
[2739486:MainThread](2026-02-26 22:59:30,854) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
|
||||
[2739486:MainThread](2026-02-26 22:59:30,855) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
|
||||
================================================================================
|
||||
DUMP GOLD-STANDARD DATA FROM QLIB PIPELINE
|
||||
================================================================================
|
||||
Date Range: 2020-01-02 to 2020-01-10
|
||||
Output Directory: /home/guofu/Workspaces/alpha_lab/stock_1d/d033/alpha158_beta/data
|
||||
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
|
||||
|
||||
Step 1: Loading data from Qlib pipeline...
|
||||
Loading since_date=2020-01-02
|
||||
Loading data with handler (load_start=2019-12-13)...
|
||||
Will use `placehorder_value` from module: qlib.contrib.data.config
|
||||
Will init handler object from config:
|
||||
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-11-23 00:00:00')},
|
||||
'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'handler': {'class': 'AggHandler',
|
||||
'kwargs': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
[2739486:MainThread](2026-02-26 22:59:30,878) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
|
||||
[2739486:MainThread](2026-02-26 22:59:30,878) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
|
||||
[2739486:MainThread](2026-02-26 22:59:30,938) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
|
||||
[2739486:MainThread](2026-02-26 23:07:16,353) INFO - qlib.timer - [log.py:117] - Time cost: 465.464s | DDB query: Done
|
||||
[2739486:MainThread](2026-02-26 23:07:17,149) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-11-23 00:00:00')},
|
||||
'module_path': 'qlib.contrib.data.agg_handler'},
|
||||
'load_end': datetime.date(2026, 2, 26),
|
||||
'load_start': Timestamp('2019-11-23 00:00:00'),
|
||||
'market': 'csiallx',
|
||||
'qlib_init': {'provider_uri': '/home/guofu/.qlib/data_ops/target',
|
||||
'region': 'cn'}}
|
||||
Query config:
|
||||
#alpha158: 1;
|
||||
Will use float32 for <qlib.contrib.data.ddb_data_loader.DDBDataLoader object at 0x71847694be90>
|
||||
[2739486:MainThread](2026-02-26 23:09:19,001) INFO - qlib.timer - [log.py:117] - Time cost: 112.707s | Instruments filter: Done
|
||||
[2739486:MainThread](2026-02-26 23:09:26,016) INFO - qlib.timer - [log.py:117] - Time cost: 595.133s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x7184769a5fa0>) Done
|
||||
[2739486:MainThread](2026-02-26 23:09:26,017) INFO - qlib.timer - [log.py:117] - Time cost: 595.135s | Init data (<qlib.contrib.data.ddb_handlers.ddb_alpha158_handler.DDBAlpha158Handler object at 0x7184769a5fa0>) Done
|
||||
[2739486:MainThread](2026-02-26 23:09:26,019) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2739486:MainThread](2026-02-26 23:09:29,432) INFO - qlib.timer - [log.py:117] - Time cost: 3.412s | fetch_df_by_index Done
|
||||
[2739486:MainThread](2026-02-26 23:09:31,228) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
KMID KLEN ... VSUMD30 VSUMD60
|
||||
datetime instrument ...
|
||||
2019-11-25 SH600000 0.003325 0.011638 ... -0.238055 -0.010125
|
||||
SH600004 -0.013806 0.030012 ... -0.017610 0.039195
|
||||
SH600006 0.009238 0.016166 ... -0.034782 -0.014306
|
||||
SH600007 -0.014749 0.018879 ... -0.032427 0.034279
|
||||
SH600008 0.009259 0.024691 ... -0.063490 0.003978
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 -0.017231 0.025231 ... -0.018706 0.003708
|
||||
SZ301662 0.060584 0.087834 ... -0.014658 -0.014613
|
||||
SZ301665 -0.012899 0.040541 ... 0.083229 0.055994
|
||||
SZ301678 0.018182 0.027879 ... -0.054124 0.014202
|
||||
SZ302132 0.001754 0.016416 ... -0.049558 -0.038667
|
||||
|
||||
[6908346 rows x 158 columns]
|
||||
[2739486:MainThread](2026-02-26 23:09:31,229) INFO - qlib.timer - [log.py:117] - Time cost: 5.211s | Fetching dataframe Done
|
||||
[2739486:MainThread](2026-02-26 23:09:31,242) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,Turnover as turnover,FreeTurnover as free_turnover,log(MarketValue) as log_size from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ3
|
||||
[2739486:MainThread](2026-02-26 23:09:54,142) INFO - qlib.timer - [log.py:117] - Time cost: 22.909s | DDB query: Done
|
||||
[2739486:MainThread](2026-02-26 23:09:54,927) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2739486:MainThread](2026-02-26 23:09:55,507) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,con_rating_strength from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_con_rating")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002833','SH
|
||||
[2739486:MainThread](2026-02-26 23:10:10,691) INFO - qlib.timer - [log.py:117] - Time cost: 15.192s | DDB query: Done
|
||||
[2739486:MainThread](2026-02-26 23:10:11,588) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2739486:MainThread](2026-02-26 23:11:37,528) INFO - qlib.timer - [log.py:117] - Time cost: 82.525s | Instruments filter: Done
|
||||
[2739486:MainThread](2026-02-26 23:11:38,259) INFO - qlib.timer - [log.py:117] - Time cost: 127.029s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7185777e3e90>) Done
|
||||
[2739486:MainThread](2026-02-26 23:11:38,260) INFO - qlib.timer - [log.py:117] - Time cost: 127.030s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_ext_handler.DDBMarketExtHandler object at 0x7185777e3e90>) Done
|
||||
[2739486:MainThread](2026-02-26 23:11:38,261) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2739486:MainThread](2026-02-26 23:11:39,000) INFO - qlib.timer - [log.py:117] - Time cost: 0.738s | fetch_df_by_index Done
|
||||
[2739486:MainThread](2026-02-26 23:11:39,009) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
turnover free_turnover log_size con_rating_strength
|
||||
datetime instrument
|
||||
2019-11-25 SH600000 0.0895 0.1639 17.339552 0.8214
|
||||
SH600004 0.9386 1.9173 15.039255 0.8125
|
||||
SH600006 0.2566 0.6479 13.680836 1.0000
|
||||
SH600007 0.1647 0.8513 14.335590 0.7500
|
||||
SH600008 0.1813 0.3362 14.435625 0.6875
|
||||
... ... ... ... ...
|
||||
2026-02-26 SZ301658 6.0785 6.0785 11.788368 NaN
|
||||
SZ301662 12.5950 12.5950 12.681215 1.0000
|
||||
SZ301665 14.0077 14.0077 11.719415 1.0000
|
||||
SZ301678 6.6518 6.6518 12.799973 0.7500
|
||||
SZ302132 1.3868 3.0296 15.359885 0.8750
|
||||
|
||||
[7623255 rows x 4 columns]
|
||||
[2739486:MainThread](2026-02-26 23:11:39,010) INFO - qlib.timer - [log.py:117] - Time cost: 0.749s | Fetching dataframe Done
|
||||
[2739486:MainThread](2026-02-26 23:11:39,191) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,IsZt,IsDt,IsN,IsXD,IsXR,IsDR from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_kline_adjusted")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657',
|
||||
[2739486:MainThread](2026-02-26 23:12:05,839) INFO - qlib.timer - [log.py:117] - Time cost: 26.825s | DDB query: Done
|
||||
[2739486:MainThread](2026-02-26 23:12:06,554) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2739486:MainThread](2026-02-26 23:12:07,075) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,open_limit,close_limit,low_limit,open_stop,close_stop,high_stop from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_market_flag")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','
|
||||
[2739486:MainThread](2026-02-26 23:12:32,695) INFO - qlib.timer - [log.py:117] - Time cost: 25.629s | DDB query: Done
|
||||
[2739486:MainThread](2026-02-26 23:12:33,566) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2739486:MainThread](2026-02-26 23:14:02,232) INFO - qlib.timer - [log.py:117] - Time cost: 85.158s | Instruments filter: Done
|
||||
[2739486:MainThread](2026-02-26 23:14:03,155) INFO - qlib.timer - [log.py:117] - Time cost: 144.143s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x718478991880>) Done
|
||||
[2739486:MainThread](2026-02-26 23:14:03,156) INFO - qlib.timer - [log.py:117] - Time cost: 144.144s | Init data (<qlib.contrib.data.ddb_handlers.ddb_market_flag_handler.DDBMarketFlagHandler object at 0x718478991880>) Done
|
||||
[2739486:MainThread](2026-02-26 23:14:03,156) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2739486:MainThread](2026-02-26 23:14:04,046) INFO - qlib.timer - [log.py:117] - Time cost: 0.889s | fetch_df_by_index Done
|
||||
[2739486:MainThread](2026-02-26 23:14:04,060) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
IsZt IsDt IsN ... open_stop close_stop high_stop
|
||||
datetime instrument ...
|
||||
2019-11-25 SH600000 False False False ... False False False
|
||||
SH600004 False False False ... False False False
|
||||
SH600006 False False False ... False False False
|
||||
SH600007 False False False ... False False False
|
||||
SH600008 False False False ... False False False
|
||||
... ... ... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False False ... False False False
|
||||
SZ301662 False False False ... False False False
|
||||
SZ301665 False False False ... False False False
|
||||
SZ301678 False False False ... False False False
|
||||
SZ302132 False False False ... False False False
|
||||
|
||||
[6925320 rows x 12 columns]
|
||||
[2739486:MainThread](2026-02-26 23:14:04,061) INFO - qlib.timer - [log.py:117] - Time cost: 0.904s | Fetching dataframe Done
|
||||
[2739486:MainThread](2026-02-26 23:14:04,079) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,gds_CC10,gds_CC11,gds_CC12,gds_CC20,gds_CC21,gds_CC22,gds_CC23,gds_CC24,gds_CC25,gds_CC26,gds_CC27,gds_CC28,gds_CC30,gds_CC31,gds_CC32,gds_CC33,gds_CC34,gds_CC35,gds_CC36,gds_CC37,gds_CC40,gds_CC41,gds_CC42,gds_CC43,gds_CC50,gds_CC60,gds_CC61,gds_CC62,gds_CC63,gds_CC70 from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_gds_indus_flag_cc1")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','S
|
||||
[2739486:MainThread](2026-02-26 23:14:06,440) INFO - qlib.timer - [log.py:117] - Time cost: 2.370s | DDB query: Done
|
||||
[2739486:MainThread](2026-02-26 23:14:06,448) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
[2739486:MainThread](2026-02-26 23:15:23,146) INFO - qlib.timer - [log.py:117] - Time cost: 76.695s | Instruments filter: Done
|
||||
[2739486:MainThread](2026-02-26 23:15:23,184) INFO - qlib.timer - [log.py:117] - Time cost: 79.120s | Loading data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x7184782fa0c0>) Done
|
||||
[2739486:MainThread](2026-02-26 23:15:23,185) INFO - qlib.timer - [log.py:117] - Time cost: 79.121s | Init data (<qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler.DDBIndusFlagHandler object at 0x7184782fa0c0>) Done
|
||||
[2739486:MainThread](2026-02-26 23:15:23,186) INFO - qlib.timer - [log.py:117] - Time cost: 0.000s | fetch_df_by_cols Done
|
||||
[2739486:MainThread](2026-02-26 23:15:23,190) INFO - qlib.timer - [log.py:117] - Time cost: 0.003s | fetch_df_by_index Done
|
||||
[2739486:MainThread](2026-02-26 23:15:23,210) INFO - qlib.DataLoaderDH - [agg_handler.py:215] - Did fetch @slice(Timestamp('2019-11-23 00:00:00'), datetime.date(2026, 2, 26), None)
|
||||
gds_CC10 gds_CC11 ... gds_CC63 gds_CC70
|
||||
datetime instrument ...
|
||||
2026-02-09 SH600000 False False ... False False
|
||||
SH600004 False False ... False False
|
||||
SH600006 False False ... False False
|
||||
SH600007 False False ... False False
|
||||
SH600008 False False ... False False
|
||||
... ... ... ... ... ...
|
||||
2026-02-26 SZ301658 False False ... False False
|
||||
SZ301662 False False ... False False
|
||||
SZ301665 False False ... False False
|
||||
SZ301678 False False ... False False
|
||||
SZ302132 False False ... False False
|
||||
|
||||
[41168 rows x 30 columns]
|
||||
[2739486:MainThread](2026-02-26 23:15:23,210) INFO - qlib.timer - [log.py:117] - Time cost: 0.025s | Fetching dataframe Done
|
||||
[2739486:MainThread](2026-02-26 23:15:23,226) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,ST_Y,ST_S,ST_T,ST_L,ST_Z,ST_X from
|
||||
loadTable("dfs://daily_stock_run", "stg_1day_wind_st_flag")
|
||||
where m_nDate>=2019.11.23 and m_nDate<=2026.02.26 and code in ('SH600373','SZ300557','SZ000416','SZ002156','SH600500','SZ002123','SZ000610','SH601699','SH603336','SZ000663','SH600713','SZ300623','SZ002840','SH601881','SZ000632','SH600030','SZ002101','SH600633','SH603797','SZ300563','SZ002281','SZ000972','SH600077','SZ300657','SZ002
|
||||
[2739486:MainThread](2026-02-26 23:15:53,388) INFO - qlib.timer - [log.py:117] - Time cost: 30.171s | DDB query: Done
|
||||
[2739486:MainThread](2026-02-26 23:15:54,166) INFO - qlib.DDBDataLoader - [__init__.py:219] - The last time point: 2026-02-26 00:00:00
|
||||
@ -0,0 +1,104 @@
|
||||
[2745445:MainThread](2026-02-26 23:18:06,410) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
|
||||
[2745445:MainThread](2026-02-26 23:18:06,414) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
|
||||
[2745445:MainThread](2026-02-26 23:18:06,415) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
|
||||
================================================================================
|
||||
DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE
|
||||
================================================================================
|
||||
Date Range: 2020-01-02 to 2020-01-10
|
||||
Output Directory: ../data/
|
||||
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
|
||||
|
||||
Step 1: Loading raw data from Qlib pipeline...
|
||||
Loading raw data from handler (load_start=2019-12-13)...
|
||||
Will use `placehorder_value` from module: qlib.contrib.data.config
|
||||
Will init handler object from config:
|
||||
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-11-23 00:00:00')},
|
||||
'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'handler': {'class': 'AggHandler',
|
||||
'kwargs': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{[2745445:MainThread](2026-02-26 23:18:06,436) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
|
||||
[2745445:MainThread](2026-02-26 23:18:06,437) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
|
||||
[2745445:MainThread](2026-02-26 23:18:06,492) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
|
||||
@ -0,0 +1,103 @@
|
||||
[2746177:MainThread](2026-02-26 23:21:56,618) INFO - qlib.Initialization - [config.py:413] - default_conf: client.
|
||||
[2746177:MainThread](2026-02-26 23:21:56,622) INFO - qlib.Initialization - [__init__.py:74] - qlib successfully initialized based on client settings.
|
||||
[2746177:MainThread](2026-02-26 23:21:56,623) INFO - qlib.Initialization - [__init__.py:76] - data_path={'__DEFAULT_FREQ': PosixPath('/data/qlib/default/data_ops/target')}
|
||||
================================================================================
|
||||
DUMP GOLD-STANDARD RAW DATA FROM QLIB PIPELINE
|
||||
================================================================================
|
||||
Date Range: 2020-01-02 to 2020-01-10
|
||||
Output Directory: ../data/
|
||||
Qlib Dataset Path: /home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/
|
||||
|
||||
Step 1: Loading raw data from Qlib pipeline...
|
||||
Loading raw data from handler (load_start=2019-12-13)...
|
||||
Filtering instruments: ['SH600000', 'SH600004', 'SH600006', 'SH600007', 'SH600008']... (5 total)
|
||||
Will use `placehorder_value` from module: qlib.contrib.data.config
|
||||
Will init handler object from config:
|
||||
{'data_handler_config': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'end_time': datetime.date(2026, 2, 26),
|
||||
'handler_list': [{'class': 'DDBAlpha158Handler',
|
||||
'kwargs': {'col_set': 'feature',
|
||||
'query_config': [{'alpha158_config': 'alpha158_expr.csv',
|
||||
'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': 'alpha158',
|
||||
'table_name': 'stg_1day_wind_alpha158_0_7'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_alpha158_handler'},
|
||||
{'class': 'DDBMarketExtHandler',
|
||||
'kwargs': {'col_set': 'feature_ext',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['Turnover '
|
||||
'as '
|
||||
'turnover',
|
||||
'FreeTurnover '
|
||||
'as '
|
||||
'free_turnover',
|
||||
'log(MarketValue) '
|
||||
'as '
|
||||
'log_size'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'float32',
|
||||
'field_list': ['con_rating_strength'],
|
||||
'table_name': 'stg_1day_gds_con_rating'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_ext_handler'},
|
||||
{'class': 'DDBMarketFlagHandler',
|
||||
'kwargs': {'col_set': 'feature_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['IsZt',
|
||||
'IsDt',
|
||||
'IsN',
|
||||
'IsXD',
|
||||
'IsXR',
|
||||
'IsDR'],
|
||||
'table_name': 'stg_1day_wind_kline_adjusted'},
|
||||
{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['open_limit',
|
||||
'close_limit',
|
||||
'low_limit',
|
||||
'open_stop',
|
||||
'close_stop',
|
||||
'high_stop'],
|
||||
'table_name': 'stg_1day_wind_market_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_market_flag_handler'},
|
||||
{'class': 'DDBIndusFlagHandler',
|
||||
'kwargs': {'col_set': 'indus_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': 'industry_code_cc.csv',
|
||||
'table_name': 'stg_1day_gds_indus_flag_cc1'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_indus_flag_handler'},
|
||||
{'class': 'DDBStFlagHandler',
|
||||
'kwargs': {'col_set': 'st_flag',
|
||||
'query_config': [{'db_path': 'dfs://daily_stock_run',
|
||||
'dtype': 'bool',
|
||||
'field_list': ['ST_Y',
|
||||
'ST_S',
|
||||
'ST_T',
|
||||
'ST_L',
|
||||
'ST_Z',
|
||||
'ST_X'],
|
||||
'table_name': 'stg_1day_wind_st_flag'}]},
|
||||
'module_path': 'qlib.contrib.data.ddb_handlers.ddb_st_flag_handler'}],
|
||||
'instruments': 'csiallx',
|
||||
'start_time': Timestamp('2019-11-23 00:00:00')},
|
||||
'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'},
|
||||
'handler': {'class': 'AggHandler',
|
||||
'kwargs': {'ddb_config': {'host': '192.168.1.146',
|
||||
'password': '123456',
|
||||
'port': 8848,
|
||||
'username': 'admin'}[2746177:MainThread](2026-02-26 23:21:56,647) INFO - qlib.AggHandler - [agg_handler.py:79] - Will use AggHandler
|
||||
[2746177:MainThread](2026-02-26 23:21:56,648) WARNING - qlib.DataLoaderDH - [agg_handler.py:191] - instruments[csiallx] is ignored
|
||||
[2746177:MainThread](2026-02-26 23:21:56,716) INFO - qlib.DDBDataLoader - [__init__.py:196] - Will use sql template on 192.168.1.146:
|
||||
|
||||
use mytt;
|
||||
select code,m_nDate,KMID,KLEN,KMID2,KUP,KUP2,KLOW,KLOW2,KSFT,KSFT2,OPEN0,HIGH0,LOW0,VWAP0,ROC5,ROC10,ROC20,ROC30,ROC60,MA5,MA10,MA20,MA30,MA60,STD5,STD10,STD20,STD30,STD60,BETA5,BETA10,BETA20,BETA30,BETA60,RSQR5,RSQR10,RSQR20,RSQR30,RSQR60,RESI5,RESI10,RESI20,RESI30,RESI60,MAX5,MAX10,MAX20,MAX30,MAX60,MIN5,MIN10,MIN20,MIN30,MIN60,QTLU5,QTLU10,QTLU20,QTLU30,QTLU60,QTLD5,QTLD10,QTLD20,QTLD30,QTLD60,RANK5,RANK10,RANK20,RANK30,RANK60,RSV5,RSV10,RSV20,RSV30,RSV60,IMAX5,IMAX
|
||||
@ -0,0 +1,187 @@
|
||||
#!/usr/bin/env python
|
||||
"""
|
||||
Verify feature column order between standalone pipeline and qlib gold standard.
|
||||
|
||||
This script:
|
||||
1. Loads a small sample using the qlib pipeline
|
||||
2. Runs the same sample through the standalone generate_beta_embedding pipeline
|
||||
3. Compares the column order and feature values
|
||||
"""
|
||||
|
||||
import pickle as pkl
|
||||
import ruamel.yaml as yaml
|
||||
import pandas as pd
|
||||
import polars as pl
|
||||
import numpy as np
|
||||
import sys
|
||||
import os
|
||||
|
||||
# ruamel.yaml (imported above as `yaml`) has no safe_load(); downstream qlib
# code calls yaml.safe_load, so install a drop-in replacement backed by
# ruamel's pure-python safe loader.
_safe_yaml_loader = yaml.YAML(typ='safe', pure=True)

def patched_safe_load(stream):
    """Drop-in replacement for ``yaml.safe_load`` backed by ruamel.yaml.

    Accepts either a file-like object or a plain YAML string; strings are
    wrapped in a StringIO so ruamel always receives a stream.
    """
    import io
    source = io.StringIO(stream) if isinstance(stream, str) else stream
    return _safe_yaml_loader.load(source)

yaml.safe_load = patched_safe_load


# Make the sibling ``scripts`` directory importable — it provides
# generate_beta_embedding, which main() imports lazily.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'scripts'))
|
||||
|
||||
def main():
    """Verify feature-column order between the standalone pipeline and the
    qlib gold standard.

    Steps (each printed as it runs):
      1. Load the pickled processor list used by the qlib dataset.
      2. Load a small raw sample through the qlib handler config (handler.yaml).
      3. Apply the processors to produce the gold-standard feature frame.
      4. Run the standalone generate_beta_embedding pipeline on the same dates.
      5. Compare feature counts and column order, printing any differences.

    Side effects: prints a report to stdout; initializes qlib twice (once with
    a hard-coded provider_uri, once from the YAML's qlib_init section); reads
    hard-coded absolute paths on the author's machine.
    """
    print("=" * 70)
    print("VERIFY FEATURE ORDER: Standalone vs Qlib Gold Standard")
    print("=" * 70)

    # Step 1: Load processor list
    # The .proc file is a pickled list of fitted qlib processors; it is applied
    # later with with_fit=False (inference mode).
    print("\nStep 1: Loading processor list...")
    proc_path = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/proc_list.proc"
    with open(proc_path, "rb") as f:
        proc_list = pkl.load(f)
    print(f" Loaded {len(proc_list)} processors")

    # Step 2: Load small sample from qlib pipeline
    print("\nStep 2: Loading sample from qlib pipeline...")

    import qlib
    from qlib.config import REG_CN
    # First init with a fixed provider_uri; re-initialized below from the YAML's
    # qlib_init. NOTE(review): the second qlib.init appears to supersede this
    # one — confirm the first call is still needed.
    qlib.init(provider_uri='/home/guofu/.qlib/data_ops/target', region=REG_CN)

    from qlib.workflow.cli import sys_config
    from qlib.utils import fill_placeholder
    import datetime as dt

    yaml_path = "/home/guofu/Workspaces/alpha/data_ops/tasks/dwm_feature_vae/dataset/csiallx_feature2_ntrla_flag_pnlnorm/handler.yaml"
    with open(yaml_path) as fin:
        # yaml.safe_load is the ruamel-backed patch installed at module level.
        config = yaml.safe_load(fin)

    sys_config(config, "qlib.contrib.data.config")
    qlib.init(**config.get("qlib_init"))

    # 20-day lookback before the sample start so rolling features have warmup.
    load_start = pd.to_datetime("2020-01-02") - dt.timedelta(days=20)
    # Spelling "placehorder" intentionally matches the qlib-side variable name.
    placehorder_value = {
        "<SINCE_DATE>": load_start,
        "<TODAY>": dt.date.today()
    }

    config_filled = fill_placeholder(config, placehorder_value)
    handler = qlib.init_instance_by_config(config_filled["handler"])
    # Private attribute access: raw loaded data before fetch-time processing.
    handler_data = handler._data

    # Get data from SepDataFrame
    # NOTE(review): assumes handler_data is a SepDataFrame whose ``_data`` is a
    # dict of {group_name: DataFrame}. If the attribute is absent, df_dict is
    # never bound and the code below raises NameError — confirm intended.
    if hasattr(handler_data, '_data'):
        df_dict = handler_data._data
        print(f" Handler groups: {list(df_dict.keys())}")

    # Concatenate groups into one wide frame, prefixing each column with its
    # group name ("group::col") so provenance survives the concat.
    raw_dfs = []
    for group, df in df_dict.items():
        df_copy = df.copy()
        df_copy.columns = [f"{group}::{col}" for col in df_copy.columns]
        raw_dfs.append(df_copy)
        print(f" {group}: {len(df_copy.columns)} columns")

    raw_df = pd.concat(raw_dfs, axis=1)
    print(f" Raw concatenated shape: {raw_df.shape}")

    # Step 3: Apply processors to get gold standard features
    print("\nStep 3: Applying processors (qlib gold standard)...")
    from qlib.contrib.data.utils import apply_proc_list

    # Strip group prefixes for processor application — the processors were
    # fitted on un-prefixed column names.
    col_mapping = {col: col.split('::', 1)[1] for col in raw_df.columns if '::' in col}
    raw_df_stripped = raw_df.rename(columns=col_mapping)

    # Convert bool to object for processor compatibility
    # (presumably so NaN-introducing processors don't choke on bool dtype —
    # TODO confirm against the processor implementations).
    bool_cols = raw_df_stripped.select_dtypes(include=['bool']).columns
    for col in bool_cols:
        raw_df_stripped[col] = raw_df_stripped[col].astype(object)

    # with_fit=False: apply the already-fitted processors, do not refit.
    df_gold = apply_proc_list(raw_df_stripped, proc_list=proc_list, with_fit=False)
    print(f" Gold standard shape after processors: {df_gold.shape}")

    # Restore group prefixes
    reverse_mapping = {v: k for k, v in col_mapping.items()}
    df_gold = df_gold.rename(columns=reverse_mapping)

    # Get gold standard column order, bucketed by group prefix.
    gold_columns = list(df_gold.columns)
    print(f"\nGold standard column groups:")

    feature_cols = [c for c in gold_columns if c.startswith('feature::')]
    feature_ext_cols = [c for c in gold_columns if c.startswith('feature_ext::')]
    feature_flag_cols = [c for c in gold_columns if c.startswith('feature_flag::')]
    # NOTE(review): handler config uses col_set 'indus_flag', but here the
    # prefix searched is 'indus_idx::' — confirm which name the processors emit.
    indus_idx_cols = [c for c in gold_columns if c.startswith('indus_idx::')]

    print(f" feature:: {len(feature_cols)} cols")
    print(f" feature_ext:: {len(feature_ext_cols)} cols")
    print(f" feature_flag:: {len(feature_flag_cols)} cols")
    print(f" indus_idx:: {len(indus_idx_cols)} cols")

    # Step 4: Now run standalone pipeline on same data
    print("\nStep 4: Running standalone pipeline...")

    # Load parquet data for same date range
    # (generate_beta_embedding resolves via the sys.path insertion at module
    # import time; it reads pre-dumped parquet rather than querying DolphinDB).
    from generate_beta_embedding import load_all_data, merge_data_sources, apply_feature_pipeline

    df_alpha, df_kline, df_flag, df_industry = load_all_data("2020-01-02", "2020-01-10")
    df_standalone = merge_data_sources(df_alpha, df_kline, df_flag, df_industry)

    print(f" Standalone loaded shape: {df_standalone.shape}")

    # Apply feature pipeline
    df_processed, feature_cols_standalone = apply_feature_pipeline(df_standalone)
    print(f" Standalone processed shape: {df_processed.shape}")
    print(f" Standalone feature columns: {len(feature_cols_standalone)}")

    # Step 5: Compare column counts
    print("\n" + "=" * 70)
    print("COMPARISON SUMMARY")
    print("=" * 70)

    print(f"\nGold standard total columns: {len(gold_columns)}")
    print(f" feature:: {len(feature_cols)}")
    print(f" feature_ext:: {len(feature_ext_cols)}")
    print(f" feature_flag:: {len(feature_flag_cols)}")
    print(f" indus_idx:: {len(indus_idx_cols)}")

    print(f"\nStandalone feature columns: {len(feature_cols_standalone)}")

    # The gold standard columns (without prefix) should match standalone:
    # flatten the four groups, in group order, with prefixes stripped.
    gold_feature_cols = [c.split('::', 1)[1] for c in feature_cols]
    gold_feature_ext_cols = [c.split('::', 1)[1] for c in feature_ext_cols]
    gold_feature_flag_cols = [c.split('::', 1)[1] for c in feature_flag_cols]
    gold_indus_idx_cols = [c.split('::', 1)[1] for c in indus_idx_cols]

    gold_all = gold_feature_cols + gold_feature_ext_cols + gold_feature_flag_cols + gold_indus_idx_cols

    print(f"\nGold standard (flat): {len(gold_all)} features")
    print(f"Standalone: {len(feature_cols_standalone)} features")

    if len(gold_all) != len(feature_cols_standalone):
        print(f"\nWARNING: Feature count mismatch! Difference: {len(gold_all) - len(feature_cols_standalone)}")

    # Check column order: side-by-side table of the first 20 positions.
    print("\nFirst 20 column comparison:")
    print(f"{'Idx':<5} {'Gold Standard':<40} {'Standalone':<40} {'Match':<6}")
    print("-" * 90)
    for i in range(min(20, len(gold_all), len(feature_cols_standalone))):
        match = "✓" if gold_all[i] == feature_cols_standalone[i] else "✗"
        print(f"{i:<5} {gold_all[i]:<40} {feature_cols_standalone[i]:<40} {match:<6}")

    # Check if orders match exactly; if not, list up to the first 20 mismatched
    # positions and report the total count.
    if gold_all == feature_cols_standalone:
        print("\n✓ Column order MATCHES!")
    else:
        print("\n✗ Column order DOES NOT MATCH!")
        print("\nFinding differences...")
        diff_count = 0
        for i in range(min(len(gold_all), len(feature_cols_standalone))):
            if gold_all[i] != feature_cols_standalone[i]:
                diff_count += 1
                if diff_count <= 20:
                    print(f" [{i}] Gold: {gold_all[i]} vs Standalone: {feature_cols_standalone[i]}")
        print(f"Total differences: {diff_count}")
|
||||
# Script entry point: run the full verification when executed directly.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in new issue