You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

278 lines
7.7 KiB

{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# CTA 1D Data Check\n",
"\n",
"Load and validate CTA futures data.\n",
"\n",
"**Purpose**: Verify data availability, check basic statistics, and understand data structure before modeling."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from qshare.data.pandas.cta_1d import load_dataset\n",
"from qshare.io.ddb.cta import load_cta_alpha158, load_cta_hffactors, load_cta_returns\n",
"\n",
"import sys\n",
"sys.path.insert(0, '../')\n",
"from common.plotting import setup_plot_style\n",
"\n",
"setup_plot_style()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 1. Configuration\n",
"\n",
"Modify these parameters as needed for your data check."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"CONFIG = {\n",
" 'dt_range': ['2020-01-01', '2024-12-31'],\n",
" 'feature_sets': ['alpha158', 'hffactor'],\n",
" 'return_type': 'o2c_twap1min', # or 'o2o_twap1min'\n",
" 'normalization': 'dual',\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 2. Load Features Separately\n",
"\n",
"Check each feature set independently."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load alpha158 features\n",
"print(\"Loading alpha158 features...\")\n",
"df_alpha158 = load_cta_alpha158(\n",
" since_date=CONFIG['dt_range'][0],\n",
" end_date=CONFIG['dt_range'][1],\n",
")\n",
"print(f\"alpha158 shape: {df_alpha158.shape}\")\n",
"print(f\"\")\n",
"print(f\"Columns: {list(df_alpha158.columns[:10])}...\") # First 10 columns\n",
"print(f\"\")\n",
"print(f\"Date range: {df_alpha158.index.get_level_values(0).min()} to {df_alpha158.index.get_level_values(0).max()}\")\n",
"print(f\"Instruments: {df_alpha158.index.get_level_values(1).nunique()}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load HF factors\n",
"print(\"Loading hffactor features...\")\n",
"df_hf = load_cta_hffactors(\n",
" since_date=CONFIG['dt_range'][0],\n",
" end_date=CONFIG['dt_range'][1],\n",
")\n",
"print(f\"hffactor shape: {df_hf.shape}\")\n",
"print(f\"\")\n",
"print(f\"Columns: {list(df_hf.columns[:10])}...\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 3. Load Returns (Labels)\n",
"\n",
"Check return indicators that will be used as prediction targets."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load return indicators\n",
"print(\"Loading return indicators...\")\n",
"df_returns = load_cta_returns(\n",
" since_date=CONFIG['dt_range'][0],\n",
" end_date=CONFIG['dt_range'][1],\n",
")\n",
"print(f\"Returns shape: {df_returns.shape}\")\n",
"print(f\"\")\n",
"print(f\"Available return types:\")\n",
"for col in df_returns.columns:\n",
" print(f\" - {col}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check specific return type\n",
"return_col = CONFIG['return_type']\n",
"if return_col in df_returns.columns:\n",
" print(f\"\\n{return_col} statistics:\")\n",
" print(df_returns[return_col].describe())\n",
" \n",
" # Plot distribution\n",
" fig, ax = plt.subplots(figsize=(10, 4))\n",
" df_returns[return_col].hist(bins=100, ax=ax, edgecolor='black')\n",
" ax.set_title(f'{return_col} Distribution')\n",
" ax.axvline(x=0, color='red', linestyle='--')\n",
" plt.show()\n",
"else:\n",
" print(f\"Warning: {return_col} not found in returns data\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 4. Load Full Dataset\n",
"\n",
"Load the complete training dataset with features and labels."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load full dataset\n",
"print(\"Loading full dataset...\")\n",
"df_full = load_dataset(\n",
" dt_range=CONFIG['dt_range'],\n",
" return_type=CONFIG['return_type'],\n",
" normalization=CONFIG['normalization'],\n",
" feature_sets=CONFIG['feature_sets'],\n",
")\n",
"\n",
"print(f\"\\nFull dataset shape: {df_full.shape}\")\n",
"print(f\"\")\n",
"print(f\"Columns: {len(df_full.columns)} total\")\n",
"print(f\" - Features: {len([c for c in df_full.columns if c.startswith(('alpha158_', 'hf_'))])}\")\n",
"print(f\" - Label: 'label'\")\n",
"print(f\" - Weight: 'weight'\")\n",
"print(f\" - Return: 'return'\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check for missing values\n",
"missing = df_full.isnull().sum()\n",
"missing_cols = missing[missing > 0]\n",
"\n",
"if len(missing_cols) > 0:\n",
" print(f\"\\nColumns with missing values:\")\n",
" print(missing_cols.head(10))\n",
"else:\n",
" print(\"\\nNo missing values found!\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"source": [
"# Label statistics\n",
"print(\"\\nLabel statistics:\")\n",
"print(df_full['label'].describe())\n",
"\n",
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
"\n",
"# Distribution\n",
"df_full['label'].hist(bins=100, ax=axes[0], edgecolor='black')\n",
"axes[0].set_title('Label Distribution')\n",
"axes[0].axvline(x=0, color='red', linestyle='--')\n",
"\n",
"# Time series of mean label by date\n",
"label_by_date = df_full.groupby(level=0)['label'].mean()\n",
"axes[1].plot(label_by_date.index, label_by_date.values)\n",
"axes[1].set_title('Mean Label by Date')\n",
"axes[1].axhline(y=0, color='red', linestyle='--')\n",
"\n",
"plt.tight_layout()\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 5. Summary\n",
"\n",
"Check data availability by instrument and date."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Data availability heatmap\n",
"available = df_full.groupby([df_full.index.get_level_values(0).date, df_full.index.get_level_values(1)]).size().unstack(fill_value=0)\n",
"available = (available > 0).astype(int)\n",
"\n",
"print(f\"Data availability: {available.sum().sum()} instrument-date pairs\")\n",
"print(f\"Instruments: {len(available.columns)}\")\n",
"print(f\"Dates: {len(available.index)}\")\n",
"\n",
"# Plot coverage\n",
"fig, ax = plt.subplots(figsize=(14, 6))\n",
"im = ax.imshow(available.T.values, aspect='auto', cmap='RdYlGn', interpolation='nearest')\n",
"ax.set_title('Data Availability (Green=Available, Red=Missing)')\n",
"ax.set_xlabel('Time')\n",
"ax.set_ylabel('Instrument')\n",
"plt.colorbar(im, ax=ax)\n",
"plt.tight_layout()\n",
"plt.show()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.8.0"
}
},
"nbformat": 4,
"nbformat_minor": 4
}