You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
439 lines
13 KiB
439 lines
13 KiB
|
3 weeks ago
|
{
|
||
|
|
"cells": [
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"# CTA 1D Blend Comparison\n",
|
||
|
|
"\n",
|
||
|
|
"Compare model performance across different label blending configurations.\n",
|
||
|
|
"\n",
|
||
|
|
"**Purpose**: Identify the optimal normalization blend for the CTA 1-day prediction task."
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"import pandas as pd\n",
|
||
|
|
"import numpy as np\n",
|
||
|
|
"import matplotlib.pyplot as plt\n",
|
||
|
|
"import seaborn as sns\n",
|
||
|
|
"\n",
|
||
|
|
"from qshare.data.pandas.cta_1d import load_dataset\n",
|
||
|
|
"from qshare.algo.learning.cta_trainer import CTAXGBTrainer\n",
|
||
|
|
"from qshare.eval.cta.backtest import CTABacktester\n",
|
||
|
|
"\n",
|
||
|
|
"import sys\n",
|
||
|
|
"sys.path.insert(0, '../')\n",
|
||
|
|
"from common.plotting import setup_plot_style, plot_ic_series\n",
|
||
|
|
"from src.labels import BLEND_CONFIGS, get_blend_weights\n",
|
||
|
|
"\n",
|
||
|
|
"setup_plot_style()"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 1. Configuration\n",
|
||
|
|
"\n",
|
||
|
|
"Define base configuration shared across all blend experiments."
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"BASE_CONFIG = {\n",
|
||
|
|
" # Date ranges\n",
|
||
|
|
" 'dt_range': ['2020-01-01', '2024-12-31'],\n",
|
||
|
|
" 'train_range': ['2020-01-01', '2022-12-31'],\n",
|
||
|
|
" 'test_range': ['2023-01-01', '2024-12-31'],\n",
|
||
|
|
" 'fit_range': ['2020-01-01', '2021-06-30'],\n",
|
||
|
|
" \n",
|
||
|
|
" # Data\n",
|
||
|
|
" 'feature_sets': ['alpha158', 'hffactor'],\n",
|
||
|
|
" 'return_type': 'o2c_twap1min',\n",
|
||
|
|
" 'normalization': 'dual',\n",
|
||
|
|
" 'weight_factors': {'positive': 1.0, 'negative': 2.0},\n",
|
||
|
|
" \n",
|
||
|
|
" # Model (fixed for fair comparison)\n",
|
||
|
|
" 'xgb_params': {\n",
|
||
|
|
" 'booster': 'gblinear',\n",
|
||
|
|
" 'eta': 0.5,\n",
|
||
|
|
" 'lambda_reg': 0.1,\n",
|
||
|
|
" 'num_round': 20,\n",
|
||
|
|
" },\n",
|
||
|
|
" \n",
|
||
|
|
" # Backtest\n",
|
||
|
|
" 'backtest_params': {\n",
|
||
|
|
" 'num_trades': 4,\n",
|
||
|
|
" 'signal_dist': 'normal',\n",
|
||
|
|
" 'pos_weight': True,\n",
|
||
|
|
" },\n",
|
||
|
|
"}\n",
|
||
|
|
"\n",
|
||
|
|
"print(\"Blend configurations to compare:\")\n",
|
||
|
|
"for name, weights in BLEND_CONFIGS.items():\n",
|
||
|
|
" print(f\" {name}: {weights}\")"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 2. Run Experiments\n",
|
||
|
|
"\n",
|
||
|
|
"Train and evaluate a model for each blend configuration."
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"def run_single_experiment(blend_name, blend_weights):\n",
|
||
|
|
" \"\"\"Run experiment with specific blend configuration.\"\"\"\n",
|
||
|
|
" print(f\"\\n{'='*60}\")\n",
|
||
|
|
" print(f\"Running: {blend_name}\")\n",
|
||
|
|
" print(f\"Weights: {blend_weights}\")\n",
|
||
|
|
" print(f\"{'='*60}\")\n",
|
||
|
|
" \n",
|
||
|
|
" # Load data\n",
|
||
|
|
" df_full = load_dataset(\n",
|
||
|
|
" dt_range=BASE_CONFIG['dt_range'],\n",
|
||
|
|
" return_type=BASE_CONFIG['return_type'],\n",
|
||
|
|
" normalization=BASE_CONFIG['normalization'],\n",
|
||
|
|
" feature_sets=BASE_CONFIG['feature_sets'],\n",
|
||
|
|
" fit_range=BASE_CONFIG['fit_range'],\n",
|
||
|
|
" weight_factors=BASE_CONFIG['weight_factors'],\n",
|
||
|
|
" blend_weights=blend_weights,\n",
|
||
|
|
" )\n",
|
||
|
|
" \n",
|
||
|
|
" # Split\n",
|
||
|
|
" df_train = df_full.loc[BASE_CONFIG['train_range'][0]:BASE_CONFIG['train_range'][1]]\n",
|
||
|
|
" df_test = df_full.loc[BASE_CONFIG['test_range'][0]:BASE_CONFIG['test_range'][1]]\n",
|
||
|
|
" \n",
|
||
|
|
" # Features\n",
|
||
|
|
" feature_cols = [c for c in df_train.columns\n",
|
||
|
|
" if c.startswith(('alpha158_', 'hf_', 'f_'))]\n",
|
||
|
|
" \n",
|
||
|
|
" # Train\n",
|
||
|
|
" trainer = CTAXGBTrainer(**BASE_CONFIG['xgb_params'])\n",
|
||
|
|
" trainer.fit(\n",
|
||
|
|
" df_train,\n",
|
||
|
|
" feature_cols=feature_cols,\n",
|
||
|
|
" target_col='label',\n",
|
||
|
|
" weight_col='weight'\n",
|
||
|
|
" )\n",
|
||
|
|
" \n",
|
||
|
|
" # Predict\n",
|
||
|
|
" df_signal = trainer.predict(df_test)\n",
|
||
|
|
" \n",
|
||
|
|
" # Backtest\n",
|
||
|
|
" returns = df_test['return'] if 'return' in df_test.columns else df_test['label']\n",
|
||
|
|
" backtester = CTABacktester(**BASE_CONFIG['backtest_params'])\n",
|
||
|
|
" results = backtester.run(returns, df_signal)\n",
|
||
|
|
" \n",
|
||
|
|
" # Metrics\n",
|
||
|
|
" summary = backtester.summary()\n",
|
||
|
|
" ic_by_date = results.groupby(results.index.get_level_values(0))['ic'].mean()\n",
|
||
|
|
" \n",
|
||
|
|
" return {\n",
|
||
|
|
" 'name': blend_name,\n",
|
||
|
|
" 'weights': blend_weights,\n",
|
||
|
|
" 'summary': summary,\n",
|
||
|
|
" 'ic_by_date': ic_by_date,\n",
|
||
|
|
" 'results': results,\n",
|
||
|
|
" 'importance': trainer.get_feature_importance(),\n",
|
||
|
|
" }\n",
|
||
|
|
"\n",
|
||
|
|
"# Run all experiments\n",
|
||
|
|
"all_results = []\n",
|
||
|
|
"for name, weights in BLEND_CONFIGS.items():\n",
|
||
|
|
" result = run_single_experiment(name, weights)\n",
|
||
|
|
" all_results.append(result)\n",
|
||
|
|
" \n",
|
||
|
|
"print(\"\\n\\nAll experiments complete!\")"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 3. Results Summary"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"# Create comparison table\n",
|
||
|
|
"comparison_data = []\n",
|
||
|
|
"for r in all_results:\n",
|
||
|
|
" ic_mean = r['ic_by_date'].mean()\n",
|
||
|
|
" ic_std = r['ic_by_date'].std()\n",
|
||
|
|
" comparison_data.append({\n",
|
||
|
|
" 'Blend': r['name'],\n",
|
||
|
|
" 'Weights': str(r['weights']),\n",
|
||
|
|
" 'IC Mean': ic_mean,\n",
|
||
|
|
" 'IC Std': ic_std,\n",
|
||
|
|
" 'IR': ic_mean / ic_std if ic_std > 0 else 0,\n",
|
||
|
|
" 'Return': r['summary'].get('return', np.nan),\n",
|
||
|
|
" 'Sharpe': r['summary'].get('sharpe', np.nan),\n",
|
||
|
|
" 'Turnover': r['summary'].get('turnover', np.nan),\n",
|
||
|
|
" })\n",
|
||
|
|
"\n",
|
||
|
|
"df_comparison = pd.DataFrame(comparison_data)\n",
|
||
|
|
"\n",
|
||
|
|
"# Sort by IC Mean\n",
|
||
|
|
"df_comparison = df_comparison.sort_values('IC Mean', ascending=False)\n",
|
||
|
|
"\n",
|
||
|
|
"print(\"Comparison Summary (sorted by IC Mean):\")\n",
|
||
|
|
"print(df_comparison.to_string(index=False))"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"# Visual comparison\n",
|
||
|
|
"fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
|
||
|
|
"\n",
|
||
|
|
"# IC Mean\n",
|
||
|
|
"axes[0, 0].barh(df_comparison['Blend'], df_comparison['IC Mean'])\n",
|
||
|
|
"axes[0, 0].set_title('IC Mean')\n",
|
||
|
|
"axes[0, 0].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n",
|
||
|
|
"\n",
|
||
|
|
"# Information Ratio\n",
|
||
|
|
"axes[0, 1].barh(df_comparison['Blend'], df_comparison['IR'])\n",
|
||
|
|
"axes[0, 1].set_title('Information Ratio')\n",
|
||
|
|
"axes[0, 1].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n",
|
||
|
|
"\n",
|
||
|
|
"# Return\n",
|
||
|
|
"axes[1, 0].barh(df_comparison['Blend'], df_comparison['Return'])\n",
|
||
|
|
"axes[1, 0].set_title('Return')\n",
|
||
|
|
"axes[1, 0].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n",
|
||
|
|
"\n",
|
||
|
|
"# Sharpe\n",
|
||
|
|
"axes[1, 1].barh(df_comparison['Blend'], df_comparison['Sharpe'])\n",
|
||
|
|
"axes[1, 1].set_title('Sharpe Ratio')\n",
|
||
|
|
"axes[1, 1].axvline(x=0, color='red', linestyle='--', alpha=0.5)\n",
|
||
|
|
"\n",
|
||
|
|
"plt.tight_layout()\n",
|
||
|
|
"plt.show()"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 4. IC Time Series Comparison"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"# Plot IC series for all configurations\n",
|
||
|
|
"fig, ax = plt.subplots(figsize=(16, 6))\n",
|
||
|
|
"\n",
|
||
|
|
"for r in all_results:\n",
|
||
|
|
" ic_rolling = r['ic_by_date'].rolling(20, min_periods=5).mean()\n",
|
||
|
|
" ax.plot(ic_rolling.index, ic_rolling.values, label=r['name'], alpha=0.8)\n",
|
||
|
|
"\n",
|
||
|
|
"ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)\n",
|
||
|
|
"ax.set_title('Rolling IC Comparison (20-day MA)')\n",
|
||
|
|
"ax.set_xlabel('Date')\n",
|
||
|
|
"ax.set_ylabel('Information Coefficient')\n",
|
||
|
|
"ax.legend(loc='upper right')\n",
|
||
|
|
"plt.tight_layout()\n",
|
||
|
|
"plt.show()"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 5. Feature Importance Comparison"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"# Get top features from each blend\n",
|
||
|
|
"n_top = 10\n",
|
||
|
|
"top_features_by_blend = {}\n",
|
||
|
|
"\n",
|
||
|
|
"for r in all_results:\n",
|
||
|
|
" top_features_by_blend[r['name']] = set(r['importance'].head(n_top).index)\n",
|
||
|
|
"\n",
|
||
|
|
"# Find common features across all blends\n",
|
||
|
|
"common_features = set.intersection(*top_features_by_blend.values())\n",
|
||
|
|
"print(f\"\\nCommon top-{n_top} features across all blends:\")\n",
|
||
|
|
"for f in sorted(common_features):\n",
|
||
|
|
" print(f\" - {f}\")\n",
|
||
|
|
"\n",
|
||
|
|
"# Find unique features per blend\n",
|
||
|
|
"print(\"\\nUnique top features by blend:\")\n",
|
||
|
|
"for name, features in top_features_by_blend.items():\n",
|
||
|
|
" unique = features - set().union(*(other for key, other in top_features_by_blend.items() if key != name))\n",
|
||
|
|
" if unique:\n",
|
||
|
|
" print(f\"\\n {name}:\")\n",
|
||
|
|
" for f in sorted(unique):\n",
|
||
|
|
" print(f\" - {f}\")"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"# Heatmap of top feature importance across blends\n",
|
||
|
|
"all_top_features = set.union(*top_features_by_blend.values())\n",
|
||
|
|
"\n",
|
||
|
|
"importance_matrix = []\n",
|
||
|
|
"for r in all_results:\n",
|
||
|
|
" row = []\n",
|
||
|
|
" for f in sorted(all_top_features):\n",
|
||
|
|
" if f in r['importance'].index:\n",
|
||
|
|
" row.append(r['importance'].loc[f, 'importance'])\n",
|
||
|
|
" else:\n",
|
||
|
|
" row.append(0)\n",
|
||
|
|
" importance_matrix.append(row)\n",
|
||
|
|
"\n",
|
||
|
|
"df_importance = pd.DataFrame(\n",
|
||
|
|
" importance_matrix,\n",
|
||
|
|
" index=[r['name'] for r in all_results],\n",
|
||
|
|
" columns=sorted(all_top_features)\n",
|
||
|
|
")\n",
|
||
|
|
"\n",
|
||
|
|
"fig, ax = plt.subplots(figsize=(14, 6))\n",
|
||
|
|
"sns.heatmap(df_importance, cmap='YlOrRd', ax=ax, cbar_kws={'label': 'Importance'})\n",
|
||
|
|
"ax.set_title('Feature Importance Comparison Across Blends')\n",
|
||
|
|
"ax.set_xlabel('Features')\n",
|
||
|
|
"ax.set_ylabel('Blend Configuration')\n",
|
||
|
|
"plt.xticks(rotation=45, ha='right')\n",
|
||
|
|
"plt.tight_layout()\n",
|
||
|
|
"plt.show()"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 6. Custom Weight Exploration\n",
|
||
|
|
"\n",
|
||
|
|
"Test custom blend weights."
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"# Define custom weights to test\n",
|
||
|
|
"CUSTOM_WEIGHTS = [\n",
|
||
|
|
" [0.0, 0.0, 0.5, 0.5], # Only rolling\n",
|
||
|
|
" [0.3, 0.3, 0.2, 0.2], # Fit-time heavy\n",
|
||
|
|
" [0.1, 0.4, 0.25, 0.25], # CS heavy + balanced rolling\n",
|
||
|
|
"]\n",
|
||
|
|
"\n",
|
||
|
|
"custom_results = []\n",
|
||
|
|
"for i, weights in enumerate(CUSTOM_WEIGHTS):\n",
|
||
|
|
" result = run_single_experiment(f'custom_{i+1}', weights)\n",
|
||
|
|
" custom_results.append(result)\n",
|
||
|
|
"\n",
|
||
|
|
"print(\"\\n\\nCustom weights experiments complete!\")"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"# Compare custom with standard\n",
|
||
|
|
"all_comparison_data = comparison_data.copy()\n",
|
||
|
|
"\n",
|
||
|
|
"for r in custom_results:\n",
|
||
|
|
" ic_mean = r['ic_by_date'].mean()\n",
|
||
|
|
" ic_std = r['ic_by_date'].std()\n",
|
||
|
|
" all_comparison_data.append({\n",
|
||
|
|
" 'Blend': r['name'],\n",
|
||
|
|
" 'Weights': str(r['weights']),\n",
|
||
|
|
" 'IC Mean': ic_mean,\n",
|
||
|
|
" 'IC Std': ic_std,\n",
|
||
|
|
" 'IR': ic_mean / ic_std if ic_std > 0 else 0,\n",
|
||
|
|
" 'Return': r['summary'].get('return', np.nan),\n",
|
||
|
|
" 'Sharpe': r['summary'].get('sharpe', np.nan),\n",
|
||
|
|
" 'Turnover': r['summary'].get('turnover', np.nan),\n",
|
||
|
|
" })\n",
|
||
|
|
"\n",
|
||
|
|
"df_all = pd.DataFrame(all_comparison_data)\n",
|
||
|
|
"df_all = df_all.sort_values('IC Mean', ascending=False)\n",
|
||
|
|
"\n",
|
||
|
|
"print(\"All Results (standard + custom):\")\n",
|
||
|
|
"print(df_all.to_string(index=False))"
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "markdown",
|
||
|
|
"metadata": {},
|
||
|
|
"source": [
|
||
|
|
"## 7. Conclusion\n",
|
||
|
|
"\n",
|
||
|
|
"Summarize findings and recommend the best blend configuration."
|
||
|
|
]
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"cell_type": "code",
|
||
|
|
"execution_count": null,
|
||
|
|
"metadata": {},
|
||
|
|
"outputs": [],
|
||
|
|
"source": [
|
||
|
|
"# Best configuration\n",
|
||
|
|
"best = df_all.iloc[0]\n",
|
||
|
|
"print(\"Recommended Blend Configuration:\")\n",
|
||
|
|
"print(f\" Name: {best['Blend']}\")\n",
|
||
|
|
"print(f\" Weights: {best['Weights']}\")\n",
|
||
|
|
"print(f\"\\nPerformance:\")\n",
|
||
|
|
"print(f\" IC Mean: {best['IC Mean']:.4f}\")\n",
|
||
|
|
"print(f\" IC Std: {best['IC Std']:.4f}\")\n",
|
||
|
|
"print(f\" IR: {best['IR']:.4f}\")\n",
|
||
|
|
"print(f\" Return: {best['Return']:.4f}\")\n",
|
||
|
|
"print(f\" Sharpe: {best['Sharpe']:.4f}\")"
|
||
|
|
]
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"metadata": {
|
||
|
|
"kernelspec": {
|
||
|
|
"display_name": "Python 3",
|
||
|
|
"language": "python",
|
||
|
|
"name": "python3"
|
||
|
|
},
|
||
|
|
"language_info": {
|
||
|
|
"name": "python",
|
||
|
|
"version": "3.8.0"
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"nbformat": 4,
|
||
|
|
"nbformat_minor": 4
|
||
|
|
}
|