{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# CTA 1D Data Check\n", "\n", "Load and validate CTA futures data.\n", "\n", "**Purpose**: Verify data availability, check basic statistics, and understand data structure before modeling." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "import matplotlib.pyplot as plt\n", "\n", "# Project loaders: load_dataset is used below for the full dataset, the\n", "# load_cta_* functions for the individual feature / return tables.\n", "from qshare.data.pandas.cta_1d import load_dataset\n", "from qshare.io.ddb.cta import load_cta_alpha158, load_cta_hffactors, load_cta_returns\n", "\n", "import sys\n", "# Put the parent directory first on sys.path so that `common.plotting` (imported\n", "# just below) can be found. '../' is a relative entry, so this relies on the\n", "# kernel's working directory being this notebook's folder.\n", "sys.path.insert(0, '../')\n", "from common.plotting import setup_plot_style\n", "\n", "# Apply the style helper from common.plotting before any figure is drawn\n", "# (presumably sets matplotlib defaults - see common.plotting to confirm).\n", "setup_plot_style()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 1. Configuration\n", "\n", "Modify these parameters as needed for your data check." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Parameters read by every loading cell below.\n", "CONFIG = {\n", " # Two-element list: [0] is passed as since_date, [1] as end_date;\n", " # the whole list is passed to load_dataset as dt_range.\n", " 'dt_range': ['2020-01-01', '2024-12-31'],\n", " # Passed to load_dataset(feature_sets=...).\n", " 'feature_sets': ['alpha158', 'hffactor'],\n", " # Looked up as a column of the returns frame and passed to load_dataset(return_type=...).\n", " 'return_type': 'o2c_twap1min', # or 'o2o_twap1min'\n", " # Passed to load_dataset(normalization=...).\n", " 'normalization': 'dual',\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## 2. Load Features Separately\n", "\n", "Check each feature set independently." 
] },
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Pull the alpha158 feature frame for the configured date window\n",
  "print(\"Loading alpha158 features...\")\n",
  "alpha158_window = {\n",
  "    'since_date': CONFIG['dt_range'][0],\n",
  "    'end_date': CONFIG['dt_range'][1],\n",
  "}\n",
  "df_alpha158 = load_cta_alpha158(**alpha158_window)\n",
  "\n",
  "# Shape first, then a preview of the first 10 column names\n",
  "print(f\"alpha158 shape: {df_alpha158.shape}\")\n",
  "print()\n",
  "alpha158_preview = list(df_alpha158.columns[:10])\n",
  "print(f\"Columns: {alpha158_preview}...\")\n",
  "print()\n",
  "\n",
  "# Index level 0 supplies the date range, index level 1 the instrument count\n",
  "alpha158_dates = df_alpha158.index.get_level_values(0)\n",
  "print(f\"Date range: {alpha158_dates.min()} to {alpha158_dates.max()}\")\n",
  "alpha158_instruments = df_alpha158.index.get_level_values(1)\n",
  "print(f\"Instruments: {alpha158_instruments.nunique()}\")"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Pull the HF factor frame for the same date window\n",
  "print(\"Loading hffactor features...\")\n",
  "hf_window = {\n",
  "    'since_date': CONFIG['dt_range'][0],\n",
  "    'end_date': CONFIG['dt_range'][1],\n",
  "}\n",
  "df_hf = load_cta_hffactors(**hf_window)\n",
  "\n",
  "# Shape first, then a preview of the first 10 column names\n",
  "print(f\"hffactor shape: {df_hf.shape}\")\n",
  "print()\n",
  "hf_preview = list(df_hf.columns[:10])\n",
  "print(f\"Columns: {hf_preview}...\")"
 ]
},
{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "## 3. Load Returns (Labels)\n",
  "\n",
  "Check return indicators that will be used as prediction targets."
] },
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Pull the return-indicator frame for the configured date window\n",
  "print(\"Loading return indicators...\")\n",
  "returns_window = {\n",
  "    'since_date': CONFIG['dt_range'][0],\n",
  "    'end_date': CONFIG['dt_range'][1],\n",
  "}\n",
  "df_returns = load_cta_returns(**returns_window)\n",
  "\n",
  "print(f\"Returns shape: {df_returns.shape}\")\n",
  "print()\n",
  "\n",
  "# One line per column of the returns frame\n",
  "print(\"Available return types:\")\n",
  "for return_name in df_returns.columns:\n",
  "    print(f\" - {return_name}\")"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Inspect the return column selected in CONFIG\n",
  "return_col = CONFIG['return_type']\n",
  "\n",
  "if return_col not in df_returns.columns:\n",
  "    # Configured column is absent: report it and draw nothing\n",
  "    print(f\"Warning: {return_col} not found in returns data\")\n",
  "else:\n",
  "    print(f\"\\n{return_col} statistics:\")\n",
  "    selected_returns = df_returns[return_col]\n",
  "    print(selected_returns.describe())\n",
  "\n",
  "    # Histogram of the selected return with a dashed reference line at zero\n",
  "    fig, ax = plt.subplots(figsize=(10, 4))\n",
  "    selected_returns.hist(bins=100, ax=ax, edgecolor='black')\n",
  "    ax.set_title(f'{return_col} Distribution')\n",
  "    ax.axvline(x=0, color='red', linestyle='--')\n",
  "    plt.show()"
 ]
},
{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "## 4. Load Full Dataset\n",
  "\n",
  "Load the complete training dataset with features and labels."
] },
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Load the combined frame using the CONFIG settings\n",
  "print(\"Loading full dataset...\")\n",
  "df_full = load_dataset(\n",
  "    dt_range=CONFIG['dt_range'],\n",
  "    return_type=CONFIG['return_type'],\n",
  "    normalization=CONFIG['normalization'],\n",
  "    feature_sets=CONFIG['feature_sets'],\n",
  ")\n",
  "\n",
  "print(f\"\\nFull dataset shape: {df_full.shape}\")\n",
  "print()\n",
  "print(f\"Columns: {len(df_full.columns)} total\")\n",
  "\n",
  "# Feature columns are identified purely by name prefix.\n",
  "# NOTE(review): assumes string column names prefixed 'alpha158_' / 'hf_' - confirm against load_dataset.\n",
  "feature_cols = [c for c in df_full.columns if c.startswith(('alpha158_', 'hf_'))]\n",
  "print(f\" - Features: {len(feature_cols)}\")\n",
  "\n",
  "# The three names below are printed as literals; this cell does not check that they exist.\n",
  "print(\" - Label: 'label'\")\n",
  "print(\" - Weight: 'weight'\")\n",
  "print(\" - Return: 'return'\")"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Check for missing values: per-column null counts, keeping only columns with at least one\n",
  "missing = df_full.isnull().sum()\n",
  "missing_cols = missing[missing > 0]\n",
  "\n",
  "if len(missing_cols) > 0:\n",
  "    # Only the first 10 affected columns are listed\n",
  "    print(\"\\nColumns with missing values:\")\n",
  "    print(missing_cols.head(10))\n",
  "else:\n",
  "    print(\"\\nNo missing values found!\")"
 ]
},
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Label statistics\n",
  "print(\"\\nLabel statistics:\")\n",
  "print(df_full['label'].describe())\n",
  "\n",
  "# Two panels side by side: histogram on the left, per-date mean on the right\n",
  "fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
  "\n",
  "# Distribution, with a dashed reference line at zero\n",
  "df_full['label'].hist(bins=100, ax=axes[0], edgecolor='black')\n",
  "axes[0].set_title('Label Distribution')\n",
  "axes[0].axvline(x=0, color='red', linestyle='--')\n",
  "\n",
  "# Time series of mean label by date (grouping on index level 0)\n",
  "label_by_date = df_full.groupby(level=0)['label'].mean()\n",
  "axes[1].plot(label_by_date.index, label_by_date.values)\n",
  "axes[1].set_title('Mean Label by Date')\n",
  "axes[1].axhline(y=0, color='red', linestyle='--')\n",
  "\n",
  "plt.tight_layout()\n",
  "plt.show()"
 ]
},
{
 "cell_type": "markdown",
 "metadata": {},
 "source": [
  "## 5. Summary\n",
  "\n",
  "Check data availability by instrument and date."
] },
{
 "cell_type": "code",
 "execution_count": null,
 "metadata": {},
 "outputs": [],
 "source": [
  "# Data availability heatmap\n",
  "# NOTE(review): `.date` assumes index level 0 is a DatetimeIndex - confirm.\n",
  "date_keys = df_full.index.get_level_values(0).date\n",
  "instrument_keys = df_full.index.get_level_values(1)\n",
  "\n",
  "# Row count per (date, instrument), pivoted to dates x instruments; absent pairs become 0\n",
  "pair_counts = df_full.groupby([date_keys, instrument_keys]).size().unstack(fill_value=0)\n",
  "# Collapse counts to a 0/1 availability flag\n",
  "available = (pair_counts > 0).astype(int)\n",
  "\n",
  "print(f\"Data availability: {available.sum().sum()} instrument-date pairs\")\n",
  "print(f\"Instruments: {len(available.columns)}\")\n",
  "print(f\"Dates: {len(available.index)}\")\n",
  "\n",
  "# Plot coverage: instruments on the y-axis, dates on the x-axis.\n",
  "# vmin/vmax pin the colour scale to [0, 1]. With autoscaling, a matrix that is\n",
  "# all 1s has vmin == vmax and every cell is mapped to the bottom of the colormap,\n",
  "# so complete coverage would be drawn red (i.e. as 'Missing').\n",
  "fig, ax = plt.subplots(figsize=(14, 6))\n",
  "im = ax.imshow(\n",
  "    available.T.values,\n",
  "    aspect='auto',\n",
  "    cmap='RdYlGn',\n",
  "    interpolation='nearest',\n",
  "    vmin=0,\n",
  "    vmax=1,\n",
  ")\n",
  "ax.set_title('Data Availability (Green=Available, Red=Missing)')\n",
  "ax.set_xlabel('Time')\n",
  "ax.set_ylabel('Instrument')\n",
  "plt.colorbar(im, ax=ax)\n",
  "plt.tight_layout()\n",
  "plt.show()"
 ]
}
],
"metadata": {
 "kernelspec": {
  "display_name": "Python 3",
  "language": "python",
  "name": "python3"
 },
 "language_info": {
  "name": "python",
  "version": "3.8.0"
 }
},
"nbformat": 4,
"nbformat_minor": 4
}