{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CTA 1D Label Analysis\n",
    "\n",
    "Explore label distributions and compare different normalization blending strategies.\n",
    "\n",
    "**Purpose**: Understand how different normalization methods affect label distributions and identify optimal blending."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "from qshare.data.pandas.cta_1d.dataset import load_features, load_label\n",
    "from qshare.data.pandas.cta_1d.label import normalize_label_dual, normalize_label\n",
    "from qshare.io.ddb.cta import load_cta_dominant_contracts, load_cta_returns\n",
    "\n",
    "sys.path.insert(0, '../')\n",
    "from common.plotting import setup_plot_style\n",
    "from src.labels import BLEND_CONFIGS, get_blend_weights, describe_blend_config\n",
    "\n",
    "setup_plot_style()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 1. Configuration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "CONFIG = {\n",
    " 'dt_range': ['2020-01-01', '2024-12-31'],\n",
    " 'fit_range': ['2020-01-01', '2021-12-31'], # For zscore normalization\n",
    " 'return_type': 'o2c_twap1min',\n",
    "}"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2. Load Raw Returns\n",
    "\n",
    "Load the raw return series before any normalization."
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load returns for the configured date range\n",
    "print(\"Loading raw returns...\")\n",
    "df_returns = load_cta_returns(\n",
    "    since_date=CONFIG['dt_range'][0],\n",
    "    end_date=CONFIG['dt_range'][1],\n",
    ")\n",
    "\n",
    "# Work on a copy of the configured return column so df_returns stays untouched\n",
    "return_col = CONFIG['return_type']\n",
    "raw_returns = df_returns[return_col].copy()\n",
    "\n",
    "print(f\"\\nRaw {return_col} returns:\")\n",
    "print(raw_returns.describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Plot raw return distribution: histogram (left) and daily mean (right)\n",
    "fig, (ax_hist, ax_mean) = plt.subplots(1, 2, figsize=(14, 4))\n",
    "\n",
    "# Histogram of every observation, with a reference line at zero\n",
    "raw_returns.hist(bins=100, ax=ax_hist, edgecolor='black')\n",
    "ax_hist.set_title(f'Raw {return_col} Distribution')\n",
    "ax_hist.set_xlabel(return_col)\n",
    "ax_hist.set_ylabel('Count')\n",
    "ax_hist.axvline(x=0, color='red', linestyle='--')\n",
    "\n",
    "# Time series: mean of the returns within each value of index level 0\n",
    "daily_mean = raw_returns.groupby(level=0).mean()\n",
    "ax_mean.plot(daily_mean.index, daily_mean.values)\n",
    "ax_mean.set_title('Daily Mean Return')\n",
    "ax_mean.set_xlabel('Date')\n",
    "ax_mean.set_ylabel(f'Mean {return_col}')\n",
    "ax_mean.axhline(y=0, color='red', linestyle='--')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 3. Compare Normalization Methods\n",
    "\n",
    "Apply each normalization method individually and compare."
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load dominant contract mapping for proper label construction\n",
    "print(\"Loading dominant contract mapping...\")\n",
    "df_dominant = load_cta_dominant_contracts(\n",
    "    since_date=CONFIG['dt_range'][0],\n",
    "    end_date=CONFIG['dt_range'][1],\n",
    ")\n",
    "\n",
    "# Merge returns with dominant mapping (left join keeps every dominant-mapping row)\n",
    "df_merged = df_dominant.join(raw_returns, how='left')\n",
    "\n",
    "# Calculate different normalization methods\n",
    "print(\"\\nApplying normalization methods...\")\n",
    "\n",
    "norm_results = {}\n",
    "\n",
    "# zscore (fit-time)\n",
    "norm_results['zscore'] = normalize_label(\n",
    "    df_merged[return_col],\n",
    "    method='zscore',\n",
    "    fit_range=CONFIG['fit_range']\n",
    ")\n",
    "\n",
    "# cs_zscore (cross-sectional): standardize within each index-level-0 group.\n",
    "# transform() returns a result on the original index for every pandas version;\n",
    "# groupby(...).apply(...) prepends the group key to the index on pandas >= 2.0,\n",
    "# which would stop this series aligning with the other normalized labels.\n",
    "returns_by_group = df_merged.groupby(level=0)[return_col]\n",
    "norm_results['cs_zscore'] = (\n",
    "    (df_merged[return_col] - returns_by_group.transform('mean'))\n",
    "    / (returns_by_group.transform('std') + 1e-8)  # epsilon avoids division by zero\n",
    ")\n",
    "\n",
    "# rolling_20\n",
    "norm_results['rolling_20'] = normalize_label(\n",
    "    df_merged[return_col],\n",
    "    method='rolling',\n",
    "    window=20\n",
    ")\n",
    "\n",
    "# rolling_60\n",
    "norm_results['rolling_60'] = normalize_label(\n",
    "    df_merged[return_col],\n",
    "    method='rolling',\n",
    "    window=60\n",
    ")\n",
    "\n",
    "print(\"Done!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Compare distributions: one histogram panel per normalization method\n",
    "fig, axes = plt.subplots(2, 2, figsize=(14, 10))\n",
    "axes = axes.flatten()\n",
    "\n",
    "for ax, (method, series) in zip(axes, norm_results.items()):\n",
    "    series.dropna().hist(bins=100, ax=ax, edgecolor='black', alpha=0.7)\n",
    "    ax.set_title(f'{method}\\nmean={series.mean():.3f}, std={series.std():.3f}')\n",
    "    ax.axvline(x=0, color='red', linestyle='--')\n",
    "    ax.set_xlim(-5, 5)  # Focus on main distribution\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source":
[
    "## 4. Compare Blend Configurations\n",
    "\n",
    "Compare different blending strategies."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Apply each blend configuration\n",
    "blend_results = {}\n",
    "\n",
    "for name in BLEND_CONFIGS.keys():\n",
    "    weights = get_blend_weights(name)\n",
    "    print(f\"\\nProcessing {name}: {weights}\")\n",
    "\n",
    "    # Calculate blended label: weights are positional, in the order\n",
    "    # zscore, cs_zscore, rolling_20, rolling_60\n",
    "    blended = (\n",
    "        weights[0] * norm_results['zscore'] +\n",
    "        weights[1] * norm_results['cs_zscore'] +\n",
    "        weights[2] * norm_results['rolling_20'] +\n",
    "        weights[3] * norm_results['rolling_60']\n",
    "    )\n",
    "\n",
    "    blend_results[name] = blended\n",
    "    print(f\" Mean: {blended.mean():.4f}, Std: {blended.std():.4f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize all blend distributions\n",
    "# Size the grid from the number of blends so any number of configurations\n",
    "# fits (a fixed 2x3 grid overflows as soon as a seventh blend is added).\n",
    "n_blends = len(blend_results)\n",
    "n_cols = 3\n",
    "n_rows = max(1, (n_blends + n_cols - 1) // n_cols)  # ceiling division\n",
    "fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5 * n_rows), squeeze=False)\n",
    "axes = axes.flatten()\n",
    "\n",
    "for ax, (name, series) in zip(axes, blend_results.items()):\n",
    "    series.dropna().hist(bins=100, ax=ax, edgecolor='black', alpha=0.7)\n",
    "    weights = get_blend_weights(name)\n",
    "    ax.set_title(f'{name}\\nweights={weights}\\nmean={series.mean():.3f}, std={series.std():.3f}')\n",
    "    ax.axvline(x=0, color='red', linestyle='--')\n",
    "    ax.set_xlim(-5, 5)\n",
    "\n",
    "# Hide every subplot that did not receive a blend, not just the last one\n",
    "for ax in axes[n_blends:]:\n",
    "    ax.axis('off')\n",
    "\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 5. Correlation Analysis\n",
    "\n",
    "Check correlations between different normalization methods."
] },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create comparison DataFrame: every normalized label plus the raw returns\n",
    "comparison_df = pd.DataFrame(norm_results).assign(raw=df_merged[return_col])\n",
    "\n",
    "# Calculate correlation matrix\n",
    "corr = comparison_df.corr()\n",
    "\n",
    "# Plot heatmap on a fixed [-1, 1] colour scale centred at zero\n",
    "fig, ax = plt.subplots(figsize=(8, 6))\n",
    "sns.heatmap(corr, annot=True, cmap='RdBu_r', center=0,\n",
    "            vmin=-1, vmax=1, ax=ax)\n",
    "ax.set_title('Correlation: Normalization Methods')\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Rolling correlation analysis\n",
    "window = 60\n",
    "\n",
    "# Pair the two labels on their shared index and sort so that the rows of\n",
    "# each group below are in index order.\n",
    "label_pair = pd.DataFrame({\n",
    "    'zscore': norm_results['zscore'],\n",
    "    'cs_zscore': norm_results['cs_zscore'],\n",
    "}).sort_index()\n",
    "\n",
    "# Calculate rolling correlation between zscore and cs_zscore separately for\n",
    "# each value of the last index level (presumably the instrument - TODO confirm).\n",
    "# Rolling over the stacked series directly would let one window span rows of\n",
    "# different instruments instead of `window` consecutive observations of one.\n",
    "rolling_corr = pd.concat([\n",
    "    grp['zscore'].rolling(window).corr(grp['cs_zscore'])\n",
    "    for _, grp in label_pair.groupby(level=-1)\n",
    "])\n",
    "\n",
    "# Average per index-level-0 value; x and y are taken from the same object so\n",
    "# they cannot fall out of order relative to each other.\n",
    "daily_corr = rolling_corr.groupby(level=0).mean()\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(14, 4))\n",
    "ax.plot(daily_corr.index, daily_corr.values)\n",
    "ax.set_title(f'Rolling Correlation: zscore vs cs_zscore ({window}d window)')\n",
    "ax.set_xlabel('Date')\n",
    "ax.set_ylabel('Correlation')\n",
    "ax.axhline(y=0.5, color='red', linestyle='--', alpha=0.5)\n",
    "ax.set_ylim(-1, 1)\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.8.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}