#!/usr/bin/env python
"""
Compare 0_7 vs 0_7_beta predictions.

This script:
1. Loads original 0_7 predictions (from DDB)
2. Loads 0_7_beta predictions (from new embeddings)
3. Calculates correlation between predictions
4. Compares metrics (IC, RankIC, etc.) if actual returns available
"""
import os
import numpy as np
import polars as pl
import pandas as pd
from scipy.stats import spearmanr
from typing import Optional, Dict

# File paths
PRED_0_7_FILE = "../data/original_predictions_0_7.parquet"
PRED_0_7_BETA_FILE = "../data/predictions_beta_embedding.parquet"
ACTUAL_RETURNS_FILE = "../data/actual_returns.parquet"

# Candidate return-column names, checked in priority order.
RETURN_COL_CANDIDATES = ('v2v_5d', 'return', 'actual_return', 'ret')


def _find_return_column(df_pd: pd.DataFrame) -> Optional[str]:
    """Return the first known return-column name present in *df_pd*, else None.

    Shared by calculate_ic_metrics and calculate_top_tier_return so the
    candidate list lives in one place.
    """
    for col in RETURN_COL_CANDIDATES:
        if col in df_pd.columns:
            return col
    return None


def _safe_ir(series: pd.Series) -> float:
    """Information ratio (mean / std) of *series*, or 0 when std is not positive.

    Guards against division by zero for constant (or single-element) series.
    """
    std = series.std()
    return series.mean() / std if std > 0 else 0


def load_and_align_predictions() -> pl.DataFrame:
    """Load both prediction files and align them by datetime and instrument.

    Returns:
        A polars DataFrame with columns ``datetime``, ``instrument``,
        ``pred_0_7`` and ``pred_beta`` — the inner join of the two files.
    """
    print("Loading predictions...")

    # Load 0_7 predictions
    df_0_7 = pl.read_parquet(PRED_0_7_FILE)
    print(f"0_7 predictions: {df_0_7.shape}")
    print(f"  Date range: {df_0_7['datetime'].min()} to {df_0_7['datetime'].max()}")
    print(f"  Unique instruments: {df_0_7['instrument'].n_unique()}")

    # Load 0_7_beta predictions
    df_beta = pl.read_parquet(PRED_0_7_BETA_FILE)
    print(f"\n0_7_beta predictions: {df_beta.shape}")
    print(f"  Date range: {df_beta['datetime'].min()} to {df_beta['datetime'].max()}")
    print(f"  Unique instruments: {df_beta['instrument'].n_unique()}")

    # Ensure compatible types so the join keys match exactly.
    df_0_7 = df_0_7.with_columns([
        pl.col('datetime').cast(pl.Int64),
        pl.col('instrument').cast(pl.Int64)
    ])
    df_beta = df_beta.with_columns([
        pl.col('datetime').cast(pl.Int64),
        pl.col('instrument').cast(pl.Int64)
    ])

    # Rename prediction columns so both survive the join unambiguously.
    df_0_7 = df_0_7.rename({'prediction': 'pred_0_7'})
    df_beta = df_beta.rename({'prediction': 'pred_beta'})

    # Join on datetime and instrument (inner: only overlapping rows kept).
    df_joined = df_0_7.join(
        df_beta,
        on=['datetime', 'instrument'],
        how='inner'
    )
    print(f"\nJoined predictions: {df_joined.shape}")
    print(f"  Overlapping dates: {df_joined['datetime'].n_unique()}")
    print(f"  Overlapping instruments: {df_joined['instrument'].n_unique()}")

    return df_joined


def calculate_correlation(df: pl.DataFrame) -> Dict[str, float]:
    """Calculate correlation between 0_7 and 0_7_beta predictions.

    Args:
        df: Joined predictions with ``pred_0_7`` and ``pred_beta`` columns.

    Returns:
        Dict with overall Pearson/Spearman correlation and the mean/std of
        per-date Pearson correlations.
    """
    df_pd = df.to_pandas()

    # Overall correlation
    pearson_corr = df_pd['pred_0_7'].corr(df_pd['pred_beta'])
    spearman_corr, _ = spearmanr(df_pd['pred_0_7'], df_pd['pred_beta'])

    # Correlation by date (need >= 2 rows for a defined correlation)
    daily_corrs = [
        group['pred_0_7'].corr(group['pred_beta'])
        for _, group in df_pd.groupby('datetime')
        if len(group) >= 2
    ]

    # Guard: np.mean/np.std on an empty list warns and yields nan silently.
    if daily_corrs:
        daily_corr_mean = float(np.mean(daily_corrs))
        daily_corr_std = float(np.std(daily_corrs))
    else:
        daily_corr_mean = float('nan')
        daily_corr_std = float('nan')

    return {
        'pearson_corr': pearson_corr,
        'spearman_corr': spearman_corr,
        'daily_corr_mean': daily_corr_mean,
        'daily_corr_std': daily_corr_std
    }


def calculate_ic_metrics(df: pl.DataFrame, actual_returns: pl.DataFrame) -> Dict:
    """Calculate IC metrics for both prediction sets.

    Args:
        df: Joined predictions (``pred_0_7`` / ``pred_beta``).
        actual_returns: Realized returns keyed by datetime and instrument.

    Returns:
        Nested dict ``{'0_7': {...}, '0_7_beta': {...}}`` of IC/RankIC
        mean, std and IR; ``{}`` if no return column or no usable dates.
    """
    # Join with actual returns
    df_joined = df.join(
        actual_returns,
        on=['datetime', 'instrument'],
        how='inner'
    )
    print(f"\nJoined with returns: {df_joined.shape}")

    df_pd = df_joined.to_pandas()

    return_col = _find_return_column(df_pd)
    if return_col is None:
        print("No return column found!")
        return {}
    print(f"Using return column: {return_col}")

    # Calculate daily IC for both predictions
    results_0_7 = []
    results_beta = []
    for date, group in df_pd.groupby('datetime'):
        if len(group) < 5:  # Need enough samples
            continue
        # IC (Pearson)
        ic_0_7 = group['pred_0_7'].corr(group[return_col])
        ic_beta = group['pred_beta'].corr(group[return_col])
        # RankIC (Spearman)
        rankic_0_7, _ = spearmanr(group['pred_0_7'], group[return_col])
        rankic_beta, _ = spearmanr(group['pred_beta'], group[return_col])
        results_0_7.append({'date': date, 'ic': ic_0_7, 'rankic': rankic_0_7})
        results_beta.append({'date': date, 'ic': ic_beta, 'rankic': rankic_beta})

    # Guard: with no qualifying dates, pd.DataFrame([]) has no 'ic' column
    # and the aggregation below would raise KeyError.
    if not results_0_7:
        print("No dates with enough samples for IC calculation!")
        return {}

    df_ic_0_7 = pd.DataFrame(results_0_7)
    df_ic_beta = pd.DataFrame(results_beta)

    return {
        '0_7': {
            'ic_mean': df_ic_0_7['ic'].mean(),
            'ic_std': df_ic_0_7['ic'].std(),
            'ic_ir': _safe_ir(df_ic_0_7['ic']),
            'rankic_mean': df_ic_0_7['rankic'].mean(),
            'rankic_std': df_ic_0_7['rankic'].std(),
            'rankic_ir': _safe_ir(df_ic_0_7['rankic']),
        },
        '0_7_beta': {
            'ic_mean': df_ic_beta['ic'].mean(),
            'ic_std': df_ic_beta['ic'].std(),
            'ic_ir': _safe_ir(df_ic_beta['ic']),
            'rankic_mean': df_ic_beta['rankic'].mean(),
            'rankic_std': df_ic_beta['rankic'].std(),
            'rankic_ir': _safe_ir(df_ic_beta['rankic']),
        }
    }


def calculate_top_tier_return(df: pl.DataFrame,
                              actual_returns: pl.DataFrame,
                              top_pct: float = 0.1) -> Dict:
    """Calculate top-tier returns for both predictions.

    Args:
        df: Joined predictions (``pred_0_7`` / ``pred_beta``).
        actual_returns: Realized returns keyed by datetime and instrument.
        top_pct: Fraction of each date's names taken as the "top tier".

    Returns:
        Dict of mean/std of daily top-tier returns per model; ``{}`` if no
        return column or no usable dates.
    """
    # Join with actual returns
    df_joined = df.join(
        actual_returns,
        on=['datetime', 'instrument'],
        how='inner'
    )
    df_pd = df_joined.to_pandas()

    return_col = _find_return_column(df_pd)
    if return_col is None:
        return {}

    # Calculate top-tier returns
    top_returns_0_7 = []
    top_returns_beta = []
    for date, group in df_pd.groupby('datetime'):
        if len(group) < 10:
            continue
        n_top = max(1, int(len(group) * top_pct))
        # Top predictions from 0_7
        top_0_7 = group.nlargest(n_top, 'pred_0_7')
        top_returns_0_7.append(top_0_7[return_col].mean())
        # Top predictions from beta
        top_beta = group.nlargest(n_top, 'pred_beta')
        top_returns_beta.append(top_beta[return_col].mean())

    # Guard: np.mean([]) would warn and yield nan; callers check truthiness.
    if not top_returns_0_7:
        return {}

    return {
        '0_7': {
            'top_tier_return': np.mean(top_returns_0_7),
            'top_tier_std': np.std(top_returns_0_7)
        },
        '0_7_beta': {
            'top_tier_return': np.mean(top_returns_beta),
            'top_tier_std': np.std(top_returns_beta)
        }
    }


def main():
    """Main comparison function."""
    print("=" * 70)
    print("COMPARISON: Alpha158 0_7 vs 0_7_beta Predictions")
    print("=" * 70)

    # Load and align predictions
    df_joined = load_and_align_predictions()
    if len(df_joined) == 0:
        print("\nERROR: No overlapping predictions found!")
        return

    # Calculate correlation
    print("\n" + "-" * 70)
    print("PREDICTION CORRELATION")
    print("-" * 70)
    corr_metrics = calculate_correlation(df_joined)
    print(f"Overall Pearson correlation:  {corr_metrics['pearson_corr']:.4f}")
    print(f"Overall Spearman correlation: {corr_metrics['spearman_corr']:.4f}")
    print(f"Daily correlation mean:       {corr_metrics['daily_corr_mean']:.4f}")
    print(f"Daily correlation std:        {corr_metrics['daily_corr_std']:.4f}")

    # Prediction statistics
    print("\n" + "-" * 70)
    print("PREDICTION STATISTICS")
    print("-" * 70)
    df_pd = df_joined.to_pandas()
    print(f"0_7 predictions:")
    print(f"  Mean: {df_pd['pred_0_7'].mean():.6f}")
    print(f"  Std:  {df_pd['pred_0_7'].std():.6f}")
    print(f"  Min:  {df_pd['pred_0_7'].min():.6f}")
    print(f"  Max:  {df_pd['pred_0_7'].max():.6f}")
    print(f"\n0_7_beta predictions:")
    print(f"  Mean: {df_pd['pred_beta'].mean():.6f}")
    print(f"  Std:  {df_pd['pred_beta'].std():.6f}")
    print(f"  Min:  {df_pd['pred_beta'].min():.6f}")
    print(f"  Max:  {df_pd['pred_beta'].max():.6f}")

    # Load actual returns and calculate IC metrics if available
    if os.path.exists(ACTUAL_RETURNS_FILE):
        print("\n" + "-" * 70)
        print("IC METRICS (with actual returns)")
        print("-" * 70)
        actual_returns = pl.read_parquet(ACTUAL_RETURNS_FILE)
        print(f"Loaded actual returns: {actual_returns.shape}")

        ic_metrics = calculate_ic_metrics(df_joined, actual_returns)
        if ic_metrics:
            print(f"\n{'Metric':<20} {'0_7':<12} {'0_7_beta':<12} {'Diff':<12}")
            print("-" * 56)
            for metric in ['ic_mean', 'ic_std', 'ic_ir',
                           'rankic_mean', 'rankic_std', 'rankic_ir']:
                v0 = ic_metrics['0_7'][metric]
                v1 = ic_metrics['0_7_beta'][metric]
                diff = v1 - v0
                print(f"{metric:<20} {v0:>11.4f} {v1:>11.4f} {diff:>+11.4f}")

        # Top-tier returns
        print("\n" + "-" * 70)
        print("TOP-TIER RETURNS (top 10%)")
        print("-" * 70)
        top_tier = calculate_top_tier_return(df_joined, actual_returns, top_pct=0.1)
        if top_tier:
            print(f"{'':<20} {'0_7':<12} {'0_7_beta':<12} {'Diff':<12}")
            print("-" * 56)
            t0 = top_tier['0_7']['top_tier_return']
            t1 = top_tier['0_7_beta']['top_tier_return']
            diff = t1 - t0
            print(f"{'Top-tier return':<20} {t0:>11.4f} {t1:>11.4f} {diff:>+11.4f}")
    else:
        print(f"\nActual returns file not found: {ACTUAL_RETURNS_FILE}")
        print("Skipping IC metrics calculation.")

    print("\n" + "=" * 70)
    print("Comparison complete!")
    print("=" * 70)


if __name__ == "__main__":
    main()