You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

319 lines
8.9 KiB

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CTA 1D Label Analysis\n",
"\n",
"Explore label distributions and compare different normalization blending strategies.\n",
"\n",
"**Purpose**: Understand how different normalization methods affect label distributions and identify optimal blending."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"\n",
"from qshare.data.pandas.cta_1d.dataset import load_features, load_label\n",
"from qshare.data.pandas.cta_1d.label import normalize_label_dual, normalize_label\n",
"from qshare.io.ddb.cta import load_cta_returns\n",
"\n",
"import sys\n",
"sys.path.insert(0, '../')\n",
"from common.plotting import setup_plot_style\n",
"from src.labels import BLEND_CONFIGS, get_blend_weights, describe_blend_config\n",
"\n",
"setup_plot_style()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Configuration"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Notebook-wide settings:\n",
"#   dt_range    - start/end dates passed to the data loaders\n",
"#   fit_range   - window handed to the fit-time zscore normalization\n",
"#   return_type - name of the return column used as the raw label\n",
"CONFIG = dict(\n",
"    dt_range=['2020-01-01', '2024-12-31'],\n",
"    fit_range=['2020-01-01', '2021-12-31'],  # For zscore normalization\n",
"    return_type='o2c_twap1min',\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Load Raw Returns\n",
"\n",
"Load the raw return series before any normalization."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fetch the raw return table for the configured analysis window\n",
"print(\"Loading raw returns...\")\n",
"date_window = {'since_date': CONFIG['dt_range'][0], 'end_date': CONFIG['dt_range'][1]}\n",
"df_returns = load_cta_returns(**date_window)\n",
"\n",
"# Keep an independent copy of the configured return column\n",
"return_col = CONFIG['return_type']\n",
"raw_returns = df_returns[return_col].copy()\n",
"\n",
"# Summary statistics of the un-normalized label\n",
"print(f\"\\nRaw {return_col} returns:\")\n",
"summary_stats = raw_returns.describe()\n",
"print(summary_stats)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Two-panel view of the raw label: value distribution and per-date average\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
"ax_hist, ax_ts = axes\n",
"\n",
"# Left panel: histogram with a reference line at zero\n",
"raw_returns.hist(bins=100, ax=ax_hist, edgecolor='black')\n",
"ax_hist.set_title(f'Raw {return_col} Distribution')\n",
"ax_hist.axvline(x=0, color='red', linestyle='--')\n",
"\n",
"# Right panel: mean return for each value of index level 0\n",
"daily_mean = raw_returns.groupby(level=0).mean()\n",
"ax_ts.plot(daily_mean.index, daily_mean.values)\n",
"ax_ts.set_title('Daily Mean Return')\n",
"ax_ts.axhline(y=0, color='red', linestyle='--')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Compare Normalization Methods\n",
"\n",
"Apply each normalization method individually and compare."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load dominant contract mapping for proper label construction\n",
"from qshare.io.ddb.cta import load_cta_dominant_contracts\n",
"\n",
"print(\"Loading dominant contract mapping...\")\n",
"df_dominant = load_cta_dominant_contracts(\n",
"    since_date=CONFIG['dt_range'][0],\n",
"    end_date=CONFIG['dt_range'][1],\n",
")\n",
"\n",
"# Left join keeps every row of the dominant mapping; rows without a matching\n",
"# return get NaN in the return column.\n",
"df_merged = df_dominant.join(raw_returns, how='left')\n",
"label_raw = df_merged[return_col]\n",
"\n",
"# Calculate different normalization methods\n",
"print(\"\\nApplying normalization methods...\")\n",
"\n",
"norm_results = {}\n",
"\n",
"# zscore (fit-time)\n",
"norm_results['zscore'] = normalize_label(\n",
"    label_raw,\n",
"    method='zscore',\n",
"    fit_range=CONFIG['fit_range']\n",
")\n",
"\n",
"# cs_zscore (cross-sectional): standardize within each index level-0 group.\n",
"# transform() always returns a result indexed like its input, whereas\n",
"# groupby(...).apply(...) prepends the group key as an extra index level on\n",
"# pandas >= 2.0, which breaks alignment with the other label series when they\n",
"# are blended or correlated later on.\n",
"by_date = label_raw.groupby(level=0)\n",
"norm_results['cs_zscore'] = (\n",
"    (label_raw - by_date.transform('mean')) / (by_date.transform('std') + 1e-8)\n",
")\n",
"\n",
"# rolling_20 / rolling_60: same method, two window lengths\n",
"for window_len in (20, 60):\n",
"    norm_results[f'rolling_{window_len}'] = normalize_label(\n",
"        label_raw,\n",
"        method='rolling',\n",
"        window=window_len\n",
"    )\n",
"\n",
"print(\"Done!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# One histogram panel per normalization method\n",
"fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
"axes = axes.flatten()\n",
"\n",
"for idx, method in enumerate(norm_results):\n",
"    values = norm_results[method]\n",
"    panel = axes[idx]\n",
"    values.dropna().hist(bins=100, ax=panel, edgecolor='black', alpha=0.7)\n",
"    # Title carries the summary statistics of the full (NaN-skipping) series\n",
"    mean_val, std_val = values.mean(), values.std()\n",
"    panel.set_title(f'{method}\\nmean={mean_val:.3f}, std={std_val:.3f}')\n",
"    panel.axvline(x=0, color='red', linestyle='--')\n",
"    panel.set_xlim(-5, 5)  # Focus on main distribution\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Compare Blend Configurations\n",
"\n",
"Compare different blending strategies."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Build one blended label per named configuration\n",
"blend_results = {}\n",
"\n",
"for name in BLEND_CONFIGS:\n",
"    weights = get_blend_weights(name)\n",
"    print(f\"\\nProcessing {name}: {weights}\")\n",
"\n",
"    # Weighted sum of the four normalized labels, accumulated in the fixed\n",
"    # order zscore, cs_zscore, rolling_20, rolling_60\n",
"    blended = weights[0] * norm_results['zscore']\n",
"    blended = blended + weights[1] * norm_results['cs_zscore']\n",
"    blended = blended + weights[2] * norm_results['rolling_20']\n",
"    blended = blended + weights[3] * norm_results['rolling_60']\n",
"\n",
"    blend_results[name] = blended\n",
"    blend_mean, blend_std = blended.mean(), blended.std()\n",
"    print(f\" Mean: {blend_mean:.4f}, Std: {blend_std:.4f}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Visualize all blend distributions\n",
"# Size the grid from the number of configurations: a fixed 2x3 grid raises\n",
"# IndexError beyond six blends and leaves empty frames below five.\n",
"n_cols = 3\n",
"n_blends = len(blend_results)\n",
"n_rows = max(1, -(-n_blends // n_cols))  # ceiling division, at least one row\n",
"fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)\n",
"axes = axes.flatten()\n",
"\n",
"for ax, (name, series) in zip(axes, blend_results.items()):\n",
"    series.dropna().hist(bins=100, ax=ax, edgecolor='black', alpha=0.7)\n",
"    weights = get_blend_weights(name)\n",
"    ax.set_title(f'{name}\\nweights={weights}\\nmean={series.mean():.3f}, std={series.std():.3f}')\n",
"    ax.axvline(x=0, color='red', linestyle='--')\n",
"    ax.set_xlim(-5, 5)  # Focus on main distribution\n",
"\n",
"# Hide every unused panel, not only the last one\n",
"for unused_ax in axes[n_blends:]:\n",
"    unused_ax.axis('off')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Correlation Analysis\n",
"\n",
"Check correlations between different normalization methods."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Normalized labels side by side with the raw return, aligned on the index\n",
"comparison_df = pd.DataFrame(norm_results).assign(raw=df_merged[return_col])\n",
"\n",
"# Correlation matrix across all columns\n",
"corr = comparison_df.corr()\n",
"\n",
"# Annotated heatmap on a diverging colormap pinned to [-1, 1], centred at 0\n",
"heatmap_kwargs = dict(annot=True, cmap='RdBu_r', center=0, vmin=-1, vmax=1)\n",
"fig, ax = plt.subplots(figsize=(8, 6))\n",
"sns.heatmap(corr, ax=ax, **heatmap_kwargs)\n",
"ax.set_title('Correlation: Normalization Methods')\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Rolling correlation analysis\n",
"window = 60\n",
"\n",
"# Align the two label series on their shared index and sort so that each\n",
"# instrument's rows are in chronological order.\n",
"pair = pd.concat(\n",
"    {'zscore': norm_results['zscore'], 'cs_zscore': norm_results['cs_zscore']},\n",
"    axis=1,\n",
").sort_index()\n",
"\n",
"# Roll within each instrument. A flat .rolling() over the stacked Series\n",
"# slides across rows that belong to different instruments and dates, so its\n",
"# window is a row count rather than a time window.\n",
"# NOTE(review): assumes index level 0 is the date and the remaining level(s)\n",
"# identify the instrument -- confirm against the loaders' index layout.\n",
"instrument_levels = list(range(1, pair.index.nlevels))\n",
"if len(instrument_levels) == 1:\n",
"    instrument_levels = instrument_levels[0]\n",
"rolling_corr = pd.concat([\n",
"    grp['zscore'].rolling(window).corr(grp['cs_zscore'])\n",
"    for _, grp in pair.groupby(level=instrument_levels)\n",
"])\n",
"\n",
"# Average across instruments for each date. Plotting the grouped result\n",
"# against its own index keeps x and y aligned even when the source rows are\n",
"# not sorted by date.\n",
"daily_corr = rolling_corr.groupby(level=0).mean()\n",
"\n",
"fig, ax = plt.subplots(figsize=(14, 4))\n",
"ax.plot(daily_corr.index, daily_corr.values)\n",
"ax.set_title(f'Rolling Correlation: zscore vs cs_zscore ({window}d window)')\n",
"ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.5)\n",
"ax.set_ylim(-1, 1)\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}