You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
278 lines
7.7 KiB
278 lines
7.7 KiB
{
|
|
"cells": [
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"# CTA 1D Data Check\n",
|
|
"\n",
|
|
"Load and validate CTA futures data.\n",
|
|
"\n",
|
|
"**Purpose**: Verify data availability, check basic statistics, and understand data structure before modeling."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"import pandas as pd\n",
|
|
"import numpy as np\n",
|
|
"import matplotlib.pyplot as plt\n",
|
|
"\n",
|
|
"from qshare.data.pandas.cta_1d import load_dataset\n",
|
|
"from qshare.io.ddb.cta import load_cta_alpha158, load_cta_hffactors, load_cta_returns\n",
|
|
"\n",
|
|
"import sys\n",
|
|
"sys.path.insert(0, '../')\n",
|
|
"from common.plotting import setup_plot_style\n",
|
|
"\n",
|
|
"setup_plot_style()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 1. Configuration\n",
|
|
"\n",
|
|
"Modify these parameters as needed for your data check."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"CONFIG = {\n",
|
|
" 'dt_range': ['2020-01-01', '2024-12-31'],\n",
|
|
" 'feature_sets': ['alpha158', 'hffactor'],\n",
|
|
" 'return_type': 'o2c_twap1min', # or 'o2o_twap1min'\n",
|
|
" 'normalization': 'dual',\n",
|
|
"}"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 2. Load Features Separately\n",
|
|
"\n",
|
|
"Check each feature set independently."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Load alpha158 features\n",
|
|
"print(\"Loading alpha158 features...\")\n",
|
|
"df_alpha158 = load_cta_alpha158(\n",
|
|
" since_date=CONFIG['dt_range'][0],\n",
|
|
" end_date=CONFIG['dt_range'][1],\n",
|
|
")\n",
|
|
"print(f\"alpha158 shape: {df_alpha158.shape}\")\n",
|
|
"print(f\"\")\n",
|
|
"print(f\"Columns: {list(df_alpha158.columns[:10])}...\") # First 10 columns\n",
|
|
"print(f\"\")\n",
|
|
"print(f\"Date range: {df_alpha158.index.get_level_values(0).min()} to {df_alpha158.index.get_level_values(0).max()}\")\n",
|
|
"print(f\"Instruments: {df_alpha158.index.get_level_values(1).nunique()}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Load HF factors\n",
|
|
"print(\"Loading hffactor features...\")\n",
|
|
"df_hf = load_cta_hffactors(\n",
|
|
" since_date=CONFIG['dt_range'][0],\n",
|
|
" end_date=CONFIG['dt_range'][1],\n",
|
|
")\n",
|
|
"print(f\"hffactor shape: {df_hf.shape}\")\n",
|
|
"print(f\"\")\n",
|
|
"print(f\"Columns: {list(df_hf.columns[:10])}...\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 3. Load Returns (Labels)\n",
|
|
"\n",
|
|
"Check return indicators that will be used as prediction targets."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Load return indicators\n",
|
|
"print(\"Loading return indicators...\")\n",
|
|
"df_returns = load_cta_returns(\n",
|
|
" since_date=CONFIG['dt_range'][0],\n",
|
|
" end_date=CONFIG['dt_range'][1],\n",
|
|
")\n",
|
|
"print(f\"Returns shape: {df_returns.shape}\")\n",
|
|
"print(f\"\")\n",
|
|
"print(f\"Available return types:\")\n",
|
|
"for col in df_returns.columns:\n",
|
|
" print(f\" - {col}\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Check specific return type\n",
|
|
"return_col = CONFIG['return_type']\n",
|
|
"if return_col in df_returns.columns:\n",
|
|
" print(f\"\\n{return_col} statistics:\")\n",
|
|
" print(df_returns[return_col].describe())\n",
|
|
" \n",
|
|
" # Plot distribution\n",
|
|
" fig, ax = plt.subplots(figsize=(10, 4))\n",
|
|
" df_returns[return_col].hist(bins=100, ax=ax, edgecolor='black')\n",
|
|
" ax.set_title(f'{return_col} Distribution')\n",
|
|
" ax.axvline(x=0, color='red', linestyle='--')\n",
|
|
" plt.show()\n",
|
|
"else:\n",
|
|
" print(f\"Warning: {return_col} not found in returns data\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 4. Load Full Dataset\n",
|
|
"\n",
|
|
"Load the complete training dataset with features and labels."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Load full dataset\n",
|
|
"print(\"Loading full dataset...\")\n",
|
|
"df_full = load_dataset(\n",
|
|
" dt_range=CONFIG['dt_range'],\n",
|
|
" return_type=CONFIG['return_type'],\n",
|
|
" normalization=CONFIG['normalization'],\n",
|
|
" feature_sets=CONFIG['feature_sets'],\n",
|
|
")\n",
|
|
"\n",
|
|
"print(f\"\\nFull dataset shape: {df_full.shape}\")\n",
|
|
"print(f\"\")\n",
|
|
"print(f\"Columns: {len(df_full.columns)} total\")\n",
|
|
"print(f\" - Features: {len([c for c in df_full.columns if c.startswith(('alpha158_', 'hf_'))])}\")\n",
|
|
"print(f\" - Label: 'label'\")\n",
|
|
"print(f\" - Weight: 'weight'\")\n",
|
|
"print(f\" - Return: 'return'\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Check for missing values\n",
|
|
"missing = df_full.isnull().sum()\n",
|
|
"missing_cols = missing[missing > 0]\n",
|
|
"\n",
|
|
"if len(missing_cols) > 0:\n",
|
|
" print(f\"\\nColumns with missing values:\")\n",
|
|
" print(missing_cols.head(10))\n",
|
|
"else:\n",
|
|
" print(\"\\nNo missing values found!\")"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Label statistics\n",
|
|
"print(\"\\nLabel statistics:\")\n",
|
|
"print(df_full['label'].describe())\n",
|
|
"\n",
|
|
"fig, axes = plt.subplots(1, 2, figsize=(14, 4))\n",
|
|
"\n",
|
|
"# Distribution\n",
|
|
"df_full['label'].hist(bins=100, ax=axes[0], edgecolor='black')\n",
|
|
"axes[0].set_title('Label Distribution')\n",
|
|
"axes[0].axvline(x=0, color='red', linestyle='--')\n",
|
|
"\n",
|
|
"# Time series of mean label by date\n",
|
|
"label_by_date = df_full.groupby(level=0)['label'].mean()\n",
|
|
"axes[1].plot(label_by_date.index, label_by_date.values)\n",
|
|
"axes[1].set_title('Mean Label by Date')\n",
|
|
"axes[1].axhline(y=0, color='red', linestyle='--')\n",
|
|
"\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "markdown",
|
|
"metadata": {},
|
|
"source": [
|
|
"## 5. Data Availability Summary\n",
|
|
"\n",
|
|
"Check data availability by instrument and date."
|
|
]
|
|
},
|
|
{
|
|
"cell_type": "code",
|
|
"execution_count": null,
|
|
"metadata": {},
|
|
"outputs": [],
|
|
"source": [
|
|
"# Data availability heatmap\n",
|
|
"available = df_full.groupby([df_full.index.get_level_values(0).date, df_full.index.get_level_values(1)]).size().unstack(fill_value=0)\n",
|
|
"available = (available > 0).astype(int)\n",
|
|
"\n",
|
|
"print(f\"Data availability: {available.sum().sum()} instrument-date pairs\")\n",
|
|
"print(f\"Instruments: {len(available.columns)}\")\n",
|
|
"print(f\"Dates: {len(available.index)}\")\n",
|
|
"\n",
|
|
"# Plot coverage\n",
|
|
"fig, ax = plt.subplots(figsize=(14, 6))\n",
|
|
"im = ax.imshow(available.T.values, aspect='auto', cmap='RdYlGn', interpolation='nearest')\n",
|
|
"ax.set_title('Data Availability (Green=Available, Red=Missing)')\n",
|
|
"ax.set_xlabel('Time')\n",
|
|
"ax.set_ylabel('Instrument')\n",
|
|
"plt.colorbar(im, ax=ax)\n",
|
|
"plt.tight_layout()\n",
|
|
"plt.show()"
|
|
]
|
|
}
|
|
],
|
|
"metadata": {
|
|
"kernelspec": {
|
|
"display_name": "Python 3",
|
|
"language": "python",
|
|
"name": "python3"
|
|
},
|
|
"language_info": {
|
|
"name": "python",
|
|
"version": "3.8.0"
|
|
}
|
|
},
|
|
"nbformat": 4,
|
|
"nbformat_minor": 4
|
|
}
|