trading/quant_alpha_factor_demo.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "a476d03b",
   "metadata": {},
   "source": [
    "# =============================================================================\n",
    "# Quantitative Trading — Alpha Factor Research Demo\n",
    "# 量化交易 — Alpha 因子研究演示\n",
    "\n",
    "# =============================================================================\n",
    "#\n",
    "# Alpha 因子 (Alpha Factor) 是量化选股的核心工具。\n",
    "# 它是一个数学公式，从市场数据中提取信号，预测哪些股票未来会跑赢大盘。\n",
    "#\n",
    "# Alpha factors are the core tool of quantitative stock selection.\n",
    "# Each factor is a mathematical formula that extracts a signal from market data\n",
    "# to predict which stocks will outperform in the future.\n",
    "#\n",
    "# 研究流程 / Research Workflow:\n",
    "#   §0  合成股票池       Synthetic Universe     (50只股票 × 3年日线数据)\n",
    "#   §1  因子构建         Factor Construction    (5个经典因子)\n",
    "#   §2  因子预处理       Factor Preprocessing   (去极值、截面标准化、市值中性化)\n",
    "#   §3  IC 分析          IC Analysis            (信息系数 / 因子预测能力量化)\n",
    "#   §4  分层回测         Quantile Analysis      (五分位收益分层验证)\n",
    "#   §5  因子合成         Factor Combination     (等权 & IC加权合成因子)\n",
    "#   §6  多空组合         Long-Short Portfolio   (Top/Bottom 20% 多空策略)\n",
    "#   §7  因子衰减         Factor Decay           (IC 随持有期延长的衰减曲线)\n",
    "#   §8  可视化           Visualization          (9面板汇总图)\n",
    "#\n",
    "# 前置条件 / Prerequisites:\n",
    "#   pip install numpy pandas matplotlib scipy\n",
    "#\n",
    "# 运行方式 / Run:\n",
    "#   python quant_alpha_factor_demo.py\n",
    "\n",
    "# ============================================================================="
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "00e42fb4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import matplotlib\n",
    "matplotlib.use('Agg')   # 非交互模式，避免 GUI 阻塞 / non-interactive, prevents GUI block\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.gridspec as gridspec\n",
    "from matplotlib.ticker import FuncFormatter\n",
    "from scipy import stats\n",
    "import warnings\n",
    "# NOTE(review): this suppresses every warning category for the rest of the session,\n",
    "# including pandas/NumPy deprecation and runtime warnings.\n",
    "warnings.filterwarnings('ignore')\n",
    "\n",
    "# Chinese font configuration: sans-serif fallback list so the CJK labels in the\n",
    "# figures can be rendered by whichever of these fonts is installed.\n",
    "plt.rcParams['font.sans-serif'] = ['WenQuanYi Zen Hei', 'Arial Unicode MS', 'SimHei', 'DejaVu Sans']\n",
    "# Use the ASCII hyphen for negative tick labels instead of the Unicode minus sign.\n",
    "plt.rcParams['axes.unicode_minus'] = False\n",
    "\n",
    "# Seed NumPy's global RNG; every np.random call below draws from this one stream,\n",
    "# so the order of those calls determines the generated data.\n",
    "np.random.seed(42)\n",
    "print(\"=\" * 70)\n",
    "print(\"  量化交易 Alpha 因子研究演示\")\n",
    "print(\"  Quantitative Trading: Alpha Factor Research Demo\")\n",
    "print(\"=\" * 70)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "986c9755",
   "metadata": {},
   "source": [
    "# =============================================================================\n",
    "# §0  合成股票池  Synthetic Stock Universe\n",
    "\n",
    "# -----------------------------------------------------------------------------\n",
    "# 真实因子研究需要几百到几千只股票的截面数据 (cross-sectional data)。\n",
    "# 我们用\"隐藏因子生成模型\" (Hidden Factor Generative Model) 合成数据，\n",
    "# 确保我们构建的因子确实具有预测能力：\n",
    "#\n",
    "# Real factor research requires cross-sectional data of hundreds of stocks.\n",
    "# We use a Hidden Factor Generative Model to synthesize data, ensuring the\n",
    "# factors we build actually have genuine predictive power.\n",
    "#\n",
    "# 每只股票的日收益 = 市场分量 + 质量Alpha + 动量Alpha + 特质噪音\n",
    "# stock_daily_return = market_component + quality_alpha + momentum_alpha + noise\n",
    "#\n",
    "# 其中:\n",
    "#   market_component = β_i × r_market               (系统性风险 / systematic risk)\n",
    "#   quality_alpha    = λ_q × quality_score_{i,t}    (质量因子加载, 季度持续)\n",
    "#   momentum_alpha   = λ_m × momentum_score_{i,t}   (动量因子加载, 月度持续)\n",
    "#   noise            ~ N(0, σ_idio)                  (特质风险 / idiosyncratic risk)\n",
    "#\n",
    "# 关键: quality_score 和 momentum_score 是\"隐藏\"的，我们无法直接观察。\n",
    "#       但当我们从价格数据中计算因子时，会自然捕捉到这些隐藏信号。\n",
    "# Key: these hidden scores are unobservable, but our computed factors will\n",
    "#      naturally capture them — this is exactly what alpha factors do!\n",
    "\n",
    "# ============================================================================="
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e1332377",
   "metadata": {},
   "outputs": [],
   "source": [
    "N_STOCKS   = 50          # 股票数量 / number of stocks\n",
    "N_DAYS     = 756         # 交易日 ≈ 3 年 / trading days ≈ 3 years\n",
    "N_QUARTERS = N_DAYS // 63 + 2   # 季度数 / number of quarters\n",
    "N_MONTHS   = N_DAYS // 21 + 2   # 月度数 / number of months\n",
    "\n",
    "# 交易日期序列 / Business-day date index\n",
    "dates = pd.bdate_range(start=\"2021-01-04\", periods=N_DAYS)\n",
    "\n",
    "# 股票代码 (模拟A股格式) / Stock symbols (simulated A-share format)\n",
    "symbols = [f\"{str(i).zfill(6)}.{'SH' if i % 2 == 0 else 'SZ'}\" for i in range(1, N_STOCKS + 1)]\n",
    "\n",
    "# ── 0-A  Hidden Quality Factor (Quarterly Persistence) ──\n",
    "#\n",
    "# Quarterly AR(1): quality_q = 0.90 × quality_{q-1} + 0.436 × shock.\n",
    "# 0.436 ≈ sqrt(1 - 0.90^2) keeps the stationary variance at 1; the implied\n",
    "# annual persistence is 0.90^4 ≈ 0.66.\n",
    "quality_quarterly = np.zeros((N_STOCKS, N_QUARTERS))\n",
    "quality_quarterly[:, 0] = np.random.randn(N_STOCKS)\n",
    "for q in range(1, N_QUARTERS):\n",
    "    quarterly_shock = np.random.randn(N_STOCKS)   # one draw of N_STOCKS normals per quarter\n",
    "    quality_quarterly[:, q] = 0.90 * quality_quarterly[:, q - 1] + 0.436 * quarterly_shock\n",
    "\n",
    "# Quarterly → daily: day d reads the score of bucket d // 63, capped at the last quarter.\n",
    "quarter_of_day = np.minimum(np.arange(N_DAYS) // 63, N_QUARTERS - 1)\n",
    "quality_daily = quality_quarterly[:, quarter_of_day].T.copy()   # (N_DAYS, N_STOCKS)\n",
    "\n",
    "# ── 0-B  Hidden Momentum Factor (Monthly Persistence) ──\n",
    "#\n",
    "# Monthly AR(1): momentum_m = 0.80 × momentum_{m-1} + 0.60 × shock.\n",
    "# Coefficient 0.80 → 3-month persistence ≈ 0.80^3 ≈ 0.51 and annual\n",
    "# persistence ≈ 0.80^12 ≈ 0.069.\n",
    "momentum_monthly = np.zeros((N_STOCKS, N_MONTHS))\n",
    "momentum_monthly[:, 0] = np.random.randn(N_STOCKS)\n",
    "for m in range(1, N_MONTHS):\n",
    "    # Shock scale 0.60 = sqrt(1 - 0.80^2) keeps the stationary variance at 1.\n",
    "    momentum_monthly[:, m] = (0.80 * momentum_monthly[:, m - 1]\n",
    "                               + 0.60 * np.random.randn(N_STOCKS))\n",
    "\n",
    "# Monthly → daily: day d reads the score of bucket d // 21, capped at the last month.\n",
    "momentum_daily = np.zeros((N_DAYS, N_STOCKS))\n",
    "for d in range(N_DAYS):\n",
    "    m_idx = min(d // 21, N_MONTHS - 1)\n",
    "    momentum_daily[d] = momentum_monthly[:, m_idx]\n",
    "\n",
    "# ── 0-C  Daily Return Generation ──\n",
    "#\n",
    "# Market daily return: mean 0.0003 (≈ 8% annualised), std 0.0126 (≈ 20% annualised).\n",
    "mkt_returns = np.random.normal(0.0003, 0.0126, N_DAYS)  # 0.20/√252 ≈ 0.0126\n",
    "\n",
    "# Each stock's market beta β, drawn uniformly from [0.5, 1.5].\n",
    "stock_betas = np.random.uniform(0.5, 1.5, N_STOCKS)\n",
    "\n",
    "# All four components are (N_DAYS, N_STOCKS): one row per trading day, one column per stock.\n",
    "market_component   = mkt_returns[:, np.newaxis] * stock_betas[np.newaxis, :]  # beta_i × market return\n",
    "quality_component  = 0.00080 * quality_daily    # quality alpha: 0.0008 × 252 ≈ 20% annualised per unit score\n",
    "momentum_component = 0.00050 * momentum_daily   # momentum alpha: 0.0005 × 252 ≈ 12% annualised per unit score\n",
    "idio_noise         = np.random.normal(0, 0.0075, (N_DAYS, N_STOCKS))          # idiosyncratic noise\n",
    "\n",
    "# Daily log returns: the sum of the four components above.\n",
    "log_returns = market_component + quality_component + momentum_component + idio_noise\n",
    "\n",
    "# ── 0-D  Price Paths ──\n",
    "# P_t = P_0 × exp( Σ log_return_s, s=1..t )\n",
    "initial_prices = np.random.uniform(5.0, 100.0, N_STOCKS)  # starting price per stock, uniform in [5, 100]\n",
    "prices_arr = initial_prices * np.exp(np.cumsum(log_returns, axis=0))\n",
    "\n",
    "prices_df    = pd.DataFrame(prices_arr,  index=dates, columns=symbols)\n",
    "log_ret_df   = pd.DataFrame(log_returns, index=dates, columns=symbols)\n",
    "simple_ret_df = prices_df.pct_change()   # simple return r_t = P_t/P_{t-1} - 1 (first row is NaN)\n",
    "\n",
    "# ── 0-E  Volume Generation ──\n",
    "# In real markets volume correlates with |return| (more trading on big moves).\n",
    "#\n",
    "# log(volume) = log(base) + 3×|log_return| + 0.3×N(0,1) noise\n",
    "base_volumes = np.random.uniform(1e6, 5e7, N_STOCKS)   # baseline daily volume per stock (shares)\n",
    "vol_multiplier = np.exp(3.0 * np.abs(log_returns) + 0.3 * np.random.randn(N_DAYS, N_STOCKS))\n",
    "volume_df = pd.DataFrame(base_volumes * vol_multiplier, index=dates, columns=symbols)\n",
    "\n",
    "# ── 0-F  Simulated Market Capitalization ──\n",
    "# Market cap = price × float shares; float shares are constant per stock.\n",
    "float_shares  = np.random.uniform(1e8, 1e10, N_STOCKS)  # float shares per stock\n",
    "mktcap_df     = prices_df * float_shares                  # market cap (yuan)\n",
    "log_mktcap_df = np.log(mktcap_df)                         # log market cap, the size regressor in §2\n",
    "\n",
    "market_series = pd.Series(mkt_returns, index=dates)       # market return as a date-indexed Series\n",
    "\n",
    "# Summary of the generated universe.\n",
    "print(f\"\\n[§0] 股票池生成完成 / Universe Generated\")\n",
    "print(f\"     股票数量 (Stocks):    {N_STOCKS} 只\")\n",
    "print(f\"     交易日数 (Days):      {N_DAYS} 天  {dates[0].date()} ~ {dates[-1].date()}\")\n",
    "print(f\"     价格区间 (Price):     {prices_df.min().min():.2f} ~ {prices_df.max().max():.2f} 元\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "04c44408",
   "metadata": {},
   "source": [
    "# =============================================================================\n",
    "# §1  因子构建  Factor Construction\n",
    "\n",
    "# -----------------------------------------------------------------------------\n",
    "# 我们构建 5 个来自学术文献的经典因子，涵盖不同的 Alpha 来源：\n",
    "# We build 5 classic factors from academic literature, covering different alpha sources:\n",
    "#\n",
    "# ┌──────┬────────────────────────────┬──────────────────────────────────────────┐\n",
    "# │ 因子 │ 名称                       │ 公式 & 来源                              │\n",
    "# ├──────┼────────────────────────────┼──────────────────────────────────────────┤\n",
    "# │ MOM  │ 动量 Momentum              │ r(t-252, t-21)  Jegadeesh & Titman 1993  │\n",
    "# │ REV  │ 短期反转 Reversal          │ -r(t-21, t)     Jegadeesh 1990           │\n",
    "# │ LVOL │ 低波动 Low Volatility      │ -std(r, 60D)    Baker et al. 2011        │\n",
    "# │ BAB  │ 低贝塔 Betting vs Beta     │ -β(60D)         Frazzini & Pedersen 2014 │\n",
    "# │ ILLIQ│ 非流动性 Amihud Illiquidity│ mean(|r|/V,20D) Amihud 2002             │\n",
    "# └──────┴────────────────────────────┴──────────────────────────────────────────┘\n",
    "#\n",
    "# 重要约定: 所有因子都以\"因子值越高 → 预期未来收益越高\"为正方向。\n",
    "# Convention: higher factor value → expected higher future return (unified sign).\n",
    "\n",
    "# ============================================================================="
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1d852ac0",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n[§1] 构建因子 / Constructing factors...\")\n",
    "\n",
    "# ── 1-A  动量因子 (MOM)  Momentum Factor ──────────────────────────────────────\n",
    "#\n",
    "# 来源 / Source: Jegadeesh & Titman (1993)\n",
    "# \"Returns to Buying Winners and Selling Losers\"\n",
    "#\n",
    "# 动量效应 (Momentum Effect): 过去 12 个月（跳过最近 1 个月）涨幅最大的股票，\n",
    "#   未来 3-12 个月通常继续跑赢市场。这是学术上记录最多的股票市场异象之一。\n",
    "#\n",
    "# Momentum effect: stocks with highest 12M-1M returns tend to outperform\n",
    "#   over the next 3-12 months. One of the most documented stock market anomalies.\n",
    "#\n",
    "# 为什么要跳过最近 1 个月？/ Why skip the last month?\n",
    "#   最近 1 个月的收益存在短期反转效应（买卖价差、做市商库存调整等微观结构原因），\n",
    "#   会污染中期动量信号，所以必须跳过。\n",
    "#   The last month exhibits short-term reversal due to bid-ask spread and\n",
    "#   market-maker inventory rebalancing, contaminating medium-term momentum.\n",
    "#\n",
    "# 公式 / Formula:\n",
    "#   MOM_t = (P_{t-21} / P_{t-252}) - 1   (从12M前 到 1M前 的累积收益)\n",
    "#   MOM_t = (P_{t-21} / P_{t-252}) - 1   (cumulative return from 12M to 1M ago)\n",
    "\n",
    "MOM_SHORT = 21    # skip window: the most recent month is left out of the signal\n",
    "MOM_LONG  = 252   # lookback window: 12 months\n",
    "\n",
    "# Price one month ago relative to the price twelve months ago, minus one.\n",
    "price_1m_ago  = prices_df.shift(MOM_SHORT)\n",
    "price_12m_ago = prices_df.shift(MOM_LONG)\n",
    "factor_mom = price_1m_ago.div(price_12m_ago).sub(1.0)\n",
    "print(f\"     ✓ MOM  动量因子: 有效值 {factor_mom.notna().mean().mean():.1%}\")\n",
    "\n",
    "# ── 1-B  短期反转因子 (REV)  Short-Term Reversal Factor ───────────────────────\n",
    "#\n",
    "# 来源 / Source: Jegadeesh (1990)\n",
    "# \"Evidence of Predictable Behavior of Security Returns\"\n",
    "#\n",
    "# 反转效应 (Reversal Effect): 上个月大涨的股票，下个月往往小幅回调；\n",
    "#   大跌的股票往往小幅反弹。这是对短期\"过度反应\"的修正。\n",
    "#\n",
    "# Reversal effect: last month's winners tend to reverse; losers tend to bounce.\n",
    "#   This is correction of short-term overreaction.\n",
    "#\n",
    "# 注意: 取负号是因为月涨 → 因子值高 → 预期下跌（反转），我们统一正方向。\n",
    "# Note: negative sign because high past return → expected to reverse → lower future return,\n",
    "#       we flip to maintain convention: higher factor → higher expected return.\n",
    "#\n",
    "# 公式 / Formula:\n",
    "#   REV_t = -(P_t / P_{t-21} - 1)    (负1月收益 / negative 1-month return)\n",
    "\n",
    "# Trailing one-month simple return, sign-flipped so that recent losers score highest.\n",
    "trailing_1m_return = prices_df.div(prices_df.shift(MOM_SHORT)).sub(1.0)\n",
    "factor_rev = -trailing_1m_return\n",
    "print(f\"     ✓ REV  反转因子: 有效值 {factor_rev.notna().mean().mean():.1%}\")\n",
    "\n",
    "# ── 1-C  低波动因子 (LVOL)  Low Volatility Factor ────────────────────────────\n",
    "#\n",
    "# 来源 / Source: Ang et al.(2006); Baker, Bradley & Wurgler (2011)\n",
    "# \"Benchmarks as Limits to Arbitrage: Understanding the Low-Volatility Anomaly\"\n",
    "#\n",
    "# 低波动异象 (Low Volatility Anomaly): 经典金融理论 CAPM 预测高风险 = 高收益。\n",
    "#   但实证研究发现，低波动率的股票反而有更高的原始收益和风险调整后收益！\n",
    "#\n",
    "# Low-vol anomaly: contrary to CAPM, low-volatility stocks earn HIGHER\n",
    "#   risk-adjusted and even raw returns than high-volatility stocks.\n",
    "#\n",
    "# 可能的解释 / Possible explanations:\n",
    "#   ① 机构投资者受基准约束，被迫持有高贝塔股票 (benchmark constraints)\n",
    "#   ② 投资者对\"彩票式\"高波动股票有偏好，导致其被高估 (lottery preference)\n",
    "#   ③ 杠杆限制阻止套利者纠正定价偏差 (leverage constraints)\n",
    "#\n",
    "# 公式 / Formula:\n",
    "#   LVOL_t = -σ(r_{t-60:t})    (负60日已实现波动率 / negative 60-day realized vol)\n",
    "\n",
    "VOL_WINDOW = 60   # 60 trading days, about 3 months\n",
    "# Rolling standard deviation of daily simple returns, negated so calmer stocks score higher.\n",
    "realized_vol = simple_ret_df.rolling(window=VOL_WINDOW).std()\n",
    "factor_lvol = -realized_vol\n",
    "print(f\"     ✓ LVOL 低波动因子: 有效值 {factor_lvol.notna().mean().mean():.1%}\")\n",
    "\n",
    "# ── 1-D  低贝塔因子 (BAB)  Betting Against Beta ──────────────────────────────\n",
    "#\n",
    "# 来源 / Source: Frazzini & Pedersen (2014) \"Betting Against Beta\"\n",
    "#\n",
    "# BAB 异象: 做多低贝塔股票、做空高贝塔股票，可以获得持续的超额收益。\n",
    "#   与低波动因子类似，但专注于对\"市场系统性风险\"的暴露，而非总波动率。\n",
    "#\n",
    "# BAB anomaly: long low-beta, short high-beta → persistent excess return.\n",
    "#   Similar to low-vol but focuses on market systematic risk exposure.\n",
    "#\n",
    "# 贝塔计算 / Beta computation:\n",
    "#   β_i = Cov(r_i, r_market) / Var(r_market)    (60日滚动窗口 / 60-day rolling)\n",
    "#\n",
    "# 向量化技巧: 利用 pandas rolling 方法计算所有股票的滚动协方差和市场方差\n",
    "# Vectorized trick: use pandas rolling to compute rolling covariance and variance\n",
    "\n",
    "BETA_WINDOW = 60   # rolling window (trading days) for the beta estimate\n",
    "\n",
    "\n",
    "def _rolling_cov_with_market(stock_returns: pd.Series) -> pd.Series:\n",
    "    \"\"\"Rolling covariance between one stock's return column and the market series.\"\"\"\n",
    "    return stock_returns.rolling(BETA_WINDOW).cov(market_series)\n",
    "\n",
    "\n",
    "# beta = rolling cov(stock, market) / rolling var(market), computed column by column.\n",
    "rolling_var_mkt = market_series.rolling(BETA_WINDOW).var()\n",
    "rolling_cov     = log_ret_df.apply(_rolling_cov_with_market)\n",
    "rolling_beta_df = rolling_cov.div(rolling_var_mkt, axis=0)\n",
    "# Sign flip: low-beta stocks receive high factor values.\n",
    "factor_bab = -rolling_beta_df\n",
    "print(f\"     ✓ BAB  低贝塔因子: 有效值 {factor_bab.notna().mean().mean():.1%}\")\n",
    "\n",
    "# ── 1-E  Amihud 非流动性因子 (ILLIQ)  Amihud Illiquidity Factor ───────────────\n",
    "#\n",
    "# 来源 / Source: Amihud (2002) \"Illiquidity and Stock Returns\"\n",
    "#\n",
    "# Amihud 非流动性指标 (Amihud Illiquidity Measure):\n",
    "#   ILLIQ_{i,t} = (1/D) × Σ_{d=t-D+1}^{t} |r_{i,d}| / Volume_{i,d}\n",
    "#\n",
    "# 含义 (Interpretation):\n",
    "#   \"每单位成交量能引起多大的价格变动？\"\n",
    "#   \"How much price movement per unit of trading volume?\"\n",
    "#\n",
    "#   值越高 → 流动性越差 (higher → less liquid)\n",
    "#   流动性差的股票风险更高，投资者要求额外的\"流动性溢价\"(liquidity premium)\n",
    "#   Illiquid stocks carry higher risk, investors demand extra liquidity premium\n",
    "#\n",
    "# 注: 本因子中，高 ILLIQ（流动性差）→ 预期高收益（溢价补偿）→ 符合正方向约定\n",
    "# Note: high ILLIQ (illiquid) → expected high return (premium) → matches positive convention\n",
    "\n",
    "ILLIQ_WINDOW = 20   # averaging window in trading days\n",
    "\n",
    "# Daily price impact = |daily simple return| per unit of volume; the factor is its\n",
    "# 20-day rolling mean, rescaled by 1e8.\n",
    "# NOTE(review): Amihud (2002) divides by dollar volume; volume_df here is share volume.\n",
    "price_impact = simple_ret_df.abs().div(volume_df)\n",
    "factor_illiq = price_impact.rolling(window=ILLIQ_WINDOW).mean().mul(1e8)\n",
    "print(f\"     ✓ ILLIQ 非流动性因子: 有效值 {factor_illiq.notna().mean().mean():.1%}\")\n",
    "\n",
    "# ── 1-F  Consolidate ──\n",
    "# Raw factor frames keyed by short name, kept in this insertion order.\n",
    "FACTORS = dict(\n",
    "    MOM=factor_mom,       # momentum\n",
    "    REV=factor_rev,       # short-term reversal\n",
    "    LVOL=factor_lvol,     # low volatility\n",
    "    BAB=factor_bab,       # betting against beta (negated rolling beta)\n",
    "    ILLIQ=factor_illiq,   # Amihud illiquidity\n",
    ")\n",
    "print(f\"\\n     共构建 {len(FACTORS)} 个因子: {list(FACTORS.keys())}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ab363340",
   "metadata": {},
   "source": [
    "# =============================================================================\n",
    "# §2  因子预处理  Factor Preprocessing\n",
    "\n",
    "# -----------------------------------------------------------------------------\n",
    "# 原始因子值不能直接用于分析，需要经过三步标准化预处理流程：\n",
    "# Raw factor values must go through a 3-step preprocessing pipeline:\n",
    "#\n",
    "# Step 1 ── 截面去极值 (Cross-Sectional Winsorization)\n",
    "#   在每个日期截面，将超出 [μ - 3σ, μ + 3σ] 范围的值截断。\n",
    "#   At each date, clip values outside [mean - 3σ, mean + 3σ].\n",
    "#   为什么？极端异常值会主导相关系数计算，掩盖真实的因子信号。\n",
    "#   Why? Outliers dominate correlation calculations and mask the true signal.\n",
    "#\n",
    "# Step 2 ── 截面 Z-score 标准化 (Cross-Sectional Z-score)\n",
    "#   z_{i,t} = (x_{i,t} - μ_t) / σ_t    (在每个时间截面 t 上计算)\n",
    "#   z_{i,t} = (x_{i,t} - μ_t) / σ_t    (computed cross-sectionally at each date t)\n",
    "#   为什么？不同因子量纲不同（动量是%，波动率也是%，ILLIQ 是极小数），\n",
    "#          标准化后可以直接比较和合成。\n",
    "#   Why? Factors have different scales; Z-score enables fair comparison and combination.\n",
    "#\n",
    "# Step 3 ── 市值中性化 (Market Cap Neutralization)\n",
    "#   在每个截面，对 log(市值) 做 OLS 回归，用残差替换原因子值：\n",
    "#   At each date, regress on log(mktcap) via OLS; use residuals as factor:\n",
    "#   factor_neutral_i = factor_i - (α + β × log_mktcap_i)\n",
    "#   为什么？小市值效应 (Size Effect) 会污染其他因子。例如小市值股票往往同时具有\n",
    "#          高动量、高波动率、低流动性，如果不剔除市值效应，因子实际上只是\n",
    "#          在选小市值股票，而不是真正的动量/波动/流动性信号。\n",
    "#   Why? Size effect contaminates other factors — small caps often have high momentum,\n",
    "#        high vol, and low liquidity simultaneously. Without neutralization,\n",
    "#        factors simply pick small caps rather than true alpha signals.\n",
    "\n",
    "# ============================================================================="
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0fb3a541",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n[§2] 因子预处理 / Factor Preprocessing...\")\n",
    "\n",
    "\n",
    "def winsorize_cross_section(factor_df: pd.DataFrame, n_std: float = 3.0) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    截面去极值 / Cross-sectional winsorization.\n",
    "\n",
    "    在每个日期（行），将超出 n_std 个标准差的值截断到边界。\n",
    "    At each date (row), clip values that are more than n_std std devs from the mean.\n",
    "\n",
    "    向量化实现，逐行 clip / Vectorized via per-row clip applied with apply.\n",
    "    \"\"\"\n",
    "    mu  = factor_df.mean(axis=1)    # 每日截面均值 / daily cross-sectional mean\n",
    "    sig = factor_df.std(axis=1)     # 每日截面标准差 / daily cross-sectional std\n",
    "    lower = (mu - n_std * sig).values[:, np.newaxis]   # 广播形状 / broadcast shape\n",
    "    upper = (mu + n_std * sig).values[:, np.newaxis]\n",
    "    clipped = np.clip(factor_df.values, lower, upper)\n",
    "    return pd.DataFrame(clipped, index=factor_df.index, columns=factor_df.columns)\n",
    "\n",
    "\n",
    "def zscore_cross_section(factor_df: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    截面 Z-score 标准化 / Cross-sectional Z-score normalization.\n",
    "\n",
    "    使每个时间截面的因子均值=0, 标准差=1。\n",
    "    Makes each time cross-section have mean=0 and std=1.\n",
    "    \"\"\"\n",
    "    mu  = factor_df.mean(axis=1)    # 每日截面均值 (N_DAYS,)\n",
    "    sig = factor_df.std(axis=1)     # 每日截面标准差 (N_DAYS,)\n",
    "    # sub/div 沿行方向广播 / broadcast along rows\n",
    "    return factor_df.sub(mu, axis=0).div(sig.replace(0, np.nan), axis=0)\n",
    "\n",
    "\n",
    "def neutralize_mktcap(factor_df: pd.DataFrame, log_mktcap: pd.DataFrame) -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    市值中性化 / Market cap neutralization via cross-sectional OLS.\n",
    "\n",
    "    在每个截面，用 OLS 将因子对 log(市值) 回归，取残差作为中性化因子。\n",
    "    At each cross-section, regress factor on log(mktcap) via OLS; keep residuals.\n",
    "\n",
    "    residual_i = factor_i - (intercept + slope × log_mktcap_i)\n",
    "    \"\"\"\n",
    "    # 对齐列顺序 / Align columns\n",
    "    y_arr = factor_df.values.copy()        # (N_DAYS, N_STOCKS)\n",
    "    x_arr = log_mktcap.reindex(columns=factor_df.columns).values  # (N_DAYS, N_STOCKS)\n",
    "\n",
    "    for t in range(len(factor_df)):\n",
    "        y = y_arr[t]\n",
    "        x = x_arr[t]\n",
    "        mask = ~(np.isnan(y) | np.isnan(x))\n",
    "        if mask.sum() < 10:\n",
    "            continue\n",
    "        y_c, x_c = y[mask], x[mask]\n",
    "        # 手动 OLS / manual OLS (faster than scipy on small arrays)\n",
    "        xm, ym = x_c.mean(), y_c.mean()\n",
    "        denom  = np.dot(x_c - xm, x_c - xm)\n",
    "        if denom < 1e-12:\n",
    "            continue\n",
    "        slope     = np.dot(x_c - xm, y_c - ym) / denom\n",
    "        intercept = ym - slope * xm\n",
    "        y_arr[t][mask] = y_c - (intercept + slope * x_c)   # 残差 / residuals\n",
    "\n",
    "    return pd.DataFrame(y_arr, index=factor_df.index, columns=factor_df.columns)\n",
    "\n",
    "\n",
    "# Run every raw factor through winsorize → z-score → size-neutralize, in that order.\n",
    "FACTORS_PROCESSED = dict.fromkeys(FACTORS)\n",
    "for name in FACTORS:\n",
    "    factor = FACTORS[name]\n",
    "    step1 = winsorize_cross_section(factor)              # 1. clip outliers\n",
    "    step2 = zscore_cross_section(step1)                  # 2. standardize per date\n",
    "    step3 = neutralize_mktcap(step2, log_mktcap_df)      # 3. remove the size exposure\n",
    "    FACTORS_PROCESSED[name] = step3\n",
    "    print(f\"     ✓ {name:5s}: 去极值 → Z-score → 市值中性化 完成\")\n",
    "\n",
    "print(\"     预处理完成 / Preprocessing complete.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5826aca6",
   "metadata": {},
   "source": [
    "# =============================================================================\n",
    "# §3  IC 分析  Information Coefficient Analysis\n",
    "\n",
    "# -----------------------------------------------------------------------------\n",
    "# IC (Information Coefficient 信息系数) 是衡量因子预测能力的黄金标准。\n",
    "# IC is the gold standard for measuring a factor's predictive power.\n",
    "#\n",
    "# 定义 / Definition:\n",
    "#   IC_t = Spearman_Rank_Correlation( factor_{t}, forward_return_{t+H} )\n",
    "#\n",
    "# 其中:\n",
    "#   factor_{t}          = 第 t 日截面因子值（50只股票，每只一个数）\n",
    "#   forward_return_{t+H} = 从第 t 日起持有 H 日的未来收益（未来数据！）\n",
    "#   H = 21 交易日 ≈ 1 个月（最常用的持有期）\n",
    "#\n",
    "# 用 Spearman 秩相关而非 Pearson 线性相关的原因:\n",
    "#   ① 对极端值鲁棒 (robust to outliers)\n",
    "#   ② 衡量单调关系（不需要线性）(measures monotonic, not necessarily linear, relationship)\n",
    "#\n",
    "# 为什么用 Spearman rank correlation instead of Pearson?\n",
    "#   ① Robust to outliers\n",
    "#   ② Measures monotonic relationship (no linearity assumption needed)\n",
    "#\n",
    "# IC 评价标准 / IC benchmark:\n",
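    "#   |IC均值|  > 0.04  → 因子有效 / factor is effective   (本 notebook 评级表所用阈值 / threshold used by the rating table below)\n",
    "#   |IC均值|  > 0.08  → 因子强 / factor is strong        (本 notebook 评级表所用阈值 / threshold used by the rating table below)\n",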
    "#   ICIR      > 0.50  → 因子稳定 / factor is stable\n",
    "#   IC>0 比率 > 55%   → 方向一致性好 / directionally consistent\n",
    "#\n",
    "# ICIR (IC Information Ratio):\n",
    "#   ICIR = IC均值 / IC标准差 = IC_mean / IC_std\n",
    "#   类似夏普比率，衡量\"每单位波动贡献多少稳定的预测能力\"\n",
    "#   Similar to Sharpe ratio — measures how much stable predictive power per unit of volatility\n",
    "\n",
    "# ============================================================================="
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2201ba1a",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n[§3] IC 分析 / IC Analysis...\")\n",
    "\n",
    "FORWARD_PERIOD = 21    # 持有期 H = 21 日 ≈ 1 个月 / holding period ≈ 1 month\n",
    "\n",
    "\n",
    "def compute_ic_series(\n",
    "    factor_df: pd.DataFrame,\n",
    "    forward_ret_df: pd.DataFrame,\n",
    "    holding_period: int = 21,\n",
    ") -> pd.Series:\n",
    "    \"\"\"\n",
    "    计算因子的 IC 时间序列。\n",
    "    Compute the IC time series for a factor.\n",
    "\n",
    "    在每个日期 t，计算截面 Spearman 相关系数：\n",
    "    At each date t, compute cross-sectional Spearman rank correlation:\n",
    "      IC_t = Spearmanr( factor[t, all_stocks], forward_return[t+H, all_stocks] )\n",
    "\n",
    "    Parameters:\n",
    "        factor_df      : 预处理后的因子值 (date × stock)\n",
    "        forward_ret_df : 收益率 DataFrame (date × stock)\n",
    "        holding_period : 持有期（交易日）/ holding period in days\n",
    "    Returns:\n",
    "        ic_series : IC 值时间序列 / IC time series indexed by date\n",
    "    \"\"\"\n",
    "    # shift(-H): 把 t+H 的收益值移到第 t 行，使得 fwd_returns.loc[t] = return from t to t+H\n",
    "    # shift(-H) aligns so that fwd_returns.loc[t] = the return earned from t to t+H\n",
    "    fwd_returns = forward_ret_df.shift(-holding_period)\n",
    "\n",
    "    ic_values, ic_dates = [], []\n",
    "    for date in factor_df.index[:-holding_period]:  # 最后 H 行无未来收益 / no future return\n",
    "        f_row = factor_df.loc[date].dropna()\n",
    "        r_row = fwd_returns.loc[date, f_row.index].dropna()\n",
    "        common = f_row.index.intersection(r_row.index)\n",
    "        if len(common) < 10:   # 至少 10 只股票 / need at least 10 stocks\n",
    "            continue\n",
    "        ic, _ = stats.spearmanr(f_row[common].values, r_row[common].values)\n",
    "        ic_values.append(ic)\n",
    "        ic_dates.append(date)\n",
    "\n",
    "    return pd.Series(ic_values, index=ic_dates, name=\"IC\")\n",
    "\n",
    "\n",
    "# IC time series for every preprocessed factor.\n",
    "ic_series_dict = {}\n",
    "for name, factor in FACTORS_PROCESSED.items():\n",
    "    ic_series_dict[name] = compute_ic_series(\n",
    "        factor_df=factor,\n",
    "        forward_ret_df=simple_ret_df,\n",
    "        holding_period=FORWARD_PERIOD,\n",
    "    )\n",
    "\n",
    "# ── IC Summary Statistics Table ──\n",
    "table_rule = \"     \" + \"─\" * 64\n",
    "print(f\"\\n     {'因子':8s}  {'IC均值':>8s}  {'IC标准差':>9s}  {'ICIR':>8s}  {'IC>0比率':>9s}  评级\")\n",
    "print(table_rule)\n",
    "\n",
    "ic_stats = {}\n",
    "for name, ic_s in ic_series_dict.items():\n",
    "    ic_mean, ic_std = ic_s.mean(), ic_s.std()\n",
    "    # ICIR = mean / std; reported as 0.0 when the std is not strictly positive (zero or NaN).\n",
    "    icir = 0.0 if not ic_std > 0 else ic_mean / ic_std\n",
    "    positive_rate = (ic_s > 0).mean()\n",
    "    ic_stats[name] = dict(zip(\n",
    "        (\"IC Mean\", \"IC Std\", \"ICIR\", \"Positive Rate\"),\n",
    "        (ic_mean, ic_std, icir, positive_rate),\n",
    "    ))\n",
    "    # Rating by |mean IC|: above 0.08 strong, above 0.04 effective, otherwise weak.\n",
    "    abs_ic = abs(ic_mean)\n",
    "    rating = (\n",
    "        \"★★ 强 Strong\" if abs_ic > 0.08\n",
    "        else \"★  有效 Effective\" if abs_ic > 0.04\n",
    "        else \"○  弱 Weak\"\n",
    "    )\n",
    "    print(f\"     {name:8s}  {ic_mean:+8.4f}  {ic_std:9.4f}  {icir:+8.4f}  {positive_rate:9.1%}  {rating}\")\n",
    "\n",
    "print(table_rule)\n",
    "print(f\"     评级标准: |IC均值|>0.08 ★★强  |IC均值|>0.04 ★有效  ICIR>0.5 稳定\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0b099a1a",
   "metadata": {},
   "source": [
    "# =============================================================================\n",
    "# §4  分层回测  Quantile Return Analysis\n",
    "\n",
    "# -----------------------------------------------------------------------------\n",
    "# 分层回测 (Quantile Analysis / Bucket Test) 是验证因子有效性的经典直观方法。\n",
    "# Quantile analysis is the classic, intuitive way to validate a factor.\n",
    "#\n",
    "# 操作步骤 / Procedure:\n",
    "# ① 在每个调仓日 T，按因子值将全部股票从小到大排序\n",
    "# ② 等分为 Q 组（常用 Q=5 五分位）\n",
    "# ③ 每组各构建等权组合 (equal-weight portfolio)，持有到下次调仓日 T+21\n",
    "# ④ 记录每组收益，重复直到回测结束\n",
    "#\n",
    "# ① On each rebalance date T, rank all stocks by factor value\n",
    "# ② Split into Q equal-sized buckets (usually Q=5 quintiles)\n",
    "# ③ Each bucket forms an equal-weight portfolio, held until next rebalance T+21\n",
    "# ④ Record each bucket's return, repeat until end of backtest\n",
    "#\n",
    "# 期望结果 / Expected result:\n",
    "#   Q5 累积收益 > Q4 > Q3 > Q2 > Q1\n",
    "#   即因子值越高，未来收益越高 —— 这是因子有效的最直观证明\n",
    "#   Higher factor value → higher future return = most intuitive proof of efficacy\n",
    "\n",
    "# ============================================================================="
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a7c15a4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n[§4] 分层回测 / Quantile Return Analysis...\")\n",
    "\n",
    "N_QUANTILES  = 5    # number of buckets (quintiles)\n",
    "REBAL_PERIOD = 21   # rebalance every 21 trading days (about monthly)\n",
    "\n",
    "# Demo factor: the one with the largest |ICIR| (sign ignored); the first one wins ties.\n",
    "best_factor_name, _best_stats = max(ic_stats.items(), key=lambda item: abs(item[1][\"ICIR\"]))\n",
    "best_factor = FACTORS_PROCESSED[best_factor_name]\n",
    "print(f\"     最强因子 (Best Factor): {best_factor_name}  ICIR={ic_stats[best_factor_name]['ICIR']:+.3f}\")\n",
    "\n",
    "\n",
    "def compute_quantile_returns(\n",
    "    factor_df: pd.DataFrame,\n",
    "    price_df: pd.DataFrame,\n",
    "    n_quantiles: int = 5,\n",
    "    rebal_period: int = 21,\n",
    ") -> pd.DataFrame:\n",
    "    \"\"\"\n",
    "    分层回测：每月调仓，计算各五分位等权组合的期间收益。\n",
    "    Quintile backtest: monthly rebalancing, equal-weight portfolio returns per bucket.\n",
    "\n",
    "    Returns:\n",
    "        DataFrame, columns=[Q1..Q5], index=rebalance dates,\n",
    "        values=equal-weight return for that holding period\n",
    "    \"\"\"\n",
    "    # 调仓日序列 / Rebalance date sequence\n",
    "    rebal_dates = price_df.index[::rebal_period]\n",
    "\n",
    "    bucket_returns = {f\"Q{q}\": [] for q in range(1, n_quantiles + 1)}\n",
    "    period_dates   = []\n",
    "\n",
    "    for i in range(len(rebal_dates) - 1):\n",
    "        t0 = rebal_dates[i]       # 建仓日 / entry date\n",
    "        t1 = rebal_dates[i + 1]   # 平仓日 / exit date\n",
    "\n",
    "        f_today = factor_df.loc[t0].dropna()\n",
    "        if len(f_today) < n_quantiles * 3:   # 股票太少则跳过 / skip if too few stocks\n",
    "            continue\n",
    "\n",
    "        # pd.qcut 按因子值等分为 Q 组 / split into Q equal-sized bins\n",
    "        labels = pd.qcut(\n",
    "            f_today.rank(method='first'),   # 先按秩排名（避免重复值问题）\n",
    "            n_quantiles,\n",
    "            labels=[f\"Q{q}\" for q in range(1, n_quantiles + 1)]\n",
    "        )\n",
    "\n",
    "        # 每组的持有期收益（等权平均）/ equal-weight return per group\n",
    "        p0       = price_df.loc[t0]\n",
    "        p1       = price_df.loc[t1]\n",
    "        hold_ret = p1 / p0 - 1.0\n",
    "\n",
    "        for q in range(1, n_quantiles + 1):\n",
    "            stocks_in_q = labels[labels == f\"Q{q}\"].index\n",
    "            bucket_returns[f\"Q{q}\"].append(hold_ret[stocks_in_q].mean())\n",
    "\n",
    "        period_dates.append(t0)\n",
    "\n",
    "    return pd.DataFrame(bucket_returns, index=period_dates)\n",
    "\n",
    "\n",
    "quantile_rets   = compute_quantile_returns(best_factor, prices_df, N_QUANTILES, REBAL_PERIOD)\n",
    "quantile_cumret = (1 + quantile_rets).cumprod()   # 累积净值 / cumulative NAV\n",
    "\n",
    "# 多空价差 (Long-Short Spread): Q5（多头）- Q1（空头）\n",
    "ls_spread      = quantile_rets[\"Q5\"] - quantile_rets[\"Q1\"]\n",
    "ls_cumret      = (1 + ls_spread).cumprod()\n",
    "\n",
    "periods_per_year = 252.0 / REBAL_PERIOD\n",
    "\n",
    "print(f\"\\n     {'分组':8s}  {'年化收益':>10s}  {'累积收益':>10s}\")\n",
    "print(f\"     \" + \"─\" * 40)\n",
    "for q in range(1, N_QUANTILES + 1):\n",
    "    ret_s   = quantile_rets[f\"Q{q}\"]\n",
    "    ann_ret = (1 + ret_s.mean()) ** periods_per_year - 1\n",
    "    cum_ret = quantile_cumret[f\"Q{q}\"].iloc[-1] - 1\n",
    "    print(f\"     Q{q}       {ann_ret:+10.2%}  {cum_ret:+10.2%}\")\n",
    "ann_ls = (1 + ls_spread.mean()) ** periods_per_year - 1\n",
    "print(f\"     L-S(Q5-Q1) {ann_ls:+9.2%}  {ls_cumret.iloc[-1]-1:+10.2%}\")\n",
    "print(f\"     \" + \"─\" * 40)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "cd68d8d8",
   "metadata": {},
   "source": [
    "# =============================================================================\n",
    "# §5  因子合成  Factor Combination\n",
    "\n",
    "# -----------------------------------------------------------------------------\n",
    "# 单个因子的预测能力有限（IC 通常只有 0.05~0.10），因子合成可以：\n",
    "# Individual factors have limited predictive power (IC ≈ 0.05~0.10).\n",
    "# Factor combination can:\n",
    "#\n",
    "#   ① 提升综合 ICIR（多个信号互补，减少因子特定噪音）\n",
    "#      Improve composite ICIR (signals complement each other, reduce factor noise)\n",
    "#   ② 覆盖更多 Alpha 来源（动量+质量+流动性的综合）\n",
    "#      Cover more alpha sources (momentum + quality + liquidity combined)\n",
    "#   ③ 降低单因子的\"失效期\"风险（某个风格因子可能在某段时间失效）\n",
    "#      Reduce regime risk (individual factors can fail in certain market regimes)\n",
    "#\n",
    "# 方法1: 等权合成 (Equal-Weight Composite)\n",
    "#   composite_EQ = (z_1 + z_2 + … + z_n) / n\n",
    "#   优点: 简单、稳健，不依赖历史 IC 估计  缺点: 不区分因子强弱\n",
    "#   Pro: simple, robust  Con: treats all factors equally regardless of efficacy\n",
    "#\n",
    "# 方法2: IC 加权合成 (IC-Weighted Composite)\n",
    "#   w_i = max(IC_mean_i, 0) / Σ max(IC_mean_j, 0)    (只给正 IC 因子分配权重)\n",
    "#   composite_ICW = Σ w_i × z_i\n",
    "#   优点: 给更强的因子更高权重  缺点: 历史 IC 可能不稳定（过拟合风险）\n",
    "#   Pro: higher weight to stronger factors  Con: historical IC may be unstable (overfitting)\n",
    "\n",
    "# ============================================================================="
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8a132503",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n[§5] 因子合成 / Factor Combination...\")\n",
    "\n",
    "# 只合成正 IC 的因子（负 IC 因子方向混乱，不宜纳入）\n",
    "# Only combine factors with positive IC (negative IC factors are directionally inconsistent)\n",
    "positive_ic_factors = {\n",
    "    name: f for name, f in FACTORS_PROCESSED.items()\n",
    "    if ic_stats[name][\"IC Mean\"] > 0\n",
    "}\n",
    "print(f\"     IC均值为正的因子: {list(positive_ic_factors.keys())}\")\n",
    "\n",
    "# ── 等权合成 / Equal-Weight Composite ─────────────────────────────────────────\n",
    "composite_eq = sum(\n",
    "    f.reindex(columns=symbols).fillna(0)\n",
    "    for f in positive_ic_factors.values()\n",
    ") / len(positive_ic_factors)\n",
    "\n",
    "# ── IC 加权合成 / IC-Weighted Composite ───────────────────────────────────────\n",
    "ic_weights_raw = {n: max(ic_stats[n][\"IC Mean\"], 0) for n in positive_ic_factors}\n",
    "total_w        = sum(ic_weights_raw.values())\n",
    "ic_weights     = {n: w / total_w for n, w in ic_weights_raw.items()}\n",
    "\n",
    "composite_icw = sum(\n",
    "    ic_weights[n] * f.reindex(columns=symbols).fillna(0)\n",
    "    for n, f in positive_ic_factors.items()\n",
    ")\n",
    "\n",
    "# 计算合成因子的 IC / Evaluate composite factor IC\n",
    "ic_eq  = compute_ic_series(composite_eq,  simple_ret_df, FORWARD_PERIOD)\n",
    "ic_icw = compute_ic_series(composite_icw, simple_ret_df, FORWARD_PERIOD)\n",
    "\n",
    "print(f\"\\n     IC 权重分配 / IC Weights:\")\n",
    "for name, w in ic_weights.items():\n",
    "    print(f\"       {name}: {w:.3f}\")\n",
    "\n",
    "print(f\"\\n     {'合成方法':22s}  {'IC均值':>8s}  {'ICIR':>8s}\")\n",
    "print(f\"     \" + \"─\" * 45)\n",
    "print(f\"     {'等权 Equal-Weight':22s}  {ic_eq.mean():+8.4f}  {ic_eq.mean()/ic_eq.std():+8.4f}\")\n",
    "print(f\"     {'IC加权 IC-Weighted':22s}  {ic_icw.mean():+8.4f}  {ic_icw.mean()/ic_icw.std():+8.4f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "eaa4a346",
   "metadata": {},
   "source": [
    "# =============================================================================\n",
    "# §6  多空组合  Long-Short Portfolio\n",
    "\n",
    "# -----------------------------------------------------------------------------\n",
    "# 多空组合 (Long-Short Portfolio) 是因子策略的实际收益实现形式。\n",
    "# A long-short portfolio is how a factor strategy generates actual P&L.\n",
    "#\n",
    "# 构建逻辑 / Construction logic:\n",
    "#   做多 (Long)  = 买入因子分最高的 Top 20% 股票（预期跑赢，做多享受上涨）\n",
    "#   做空 (Short) = 卖空因子分最低的 Bottom 20% 股票（预期跑输，做空赚下跌）\n",
    "#   多空价差     = 多头组合收益 - 空头组合收益（无论市场涨跌都能赚钱）\n",
    "#\n",
    "#   Long  = Buy top 20% stocks by factor score (expected to outperform)\n",
    "#   Short = Sell short bottom 20% stocks (expected to underperform)\n",
    "#   L-S   = Long return − Short return (market-neutral, profits in any market)\n",
    "#\n",
    "# 注意 A 股限制 / A-share restriction:\n",
    "#   A 股融券做空受到严格限制，本演示仅作为量化研究的理论演示。\n",
    "#   在港股、美股等市场中，做空非常普遍。\n",
    "#   Short selling in A-shares is heavily restricted. This demo is theoretical.\n",
    "#   Short selling is common and practical in HK/US markets.\n",
    "\n",
    "# ============================================================================="
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48635af1",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n[§6] 多空组合 / Long-Short Portfolio...\")\n",
    "\n",
    "TOP_PCTILE = 0.20   # 前后 20% / top & bottom 20%\n",
    "\n",
    "\n",
    "def compute_long_short_portfolio(\n",
    "    factor_df: pd.DataFrame,\n",
    "    price_df: pd.DataFrame,\n",
    "    top_pct: float = 0.20,\n",
    "    rebal_period: int = 21,\n",
    ") -> dict:\n",
    "    \"\"\"\n",
    "    构建多空组合，计算每个持仓期的多头、空头、多空价差收益。\n",
    "    Build a long-short portfolio; compute per-period long, short, L-S returns.\n",
    "\n",
    "    Returns: dict with 'long', 'short', 'long_short' keys, each a pd.Series.\n",
    "    \"\"\"\n",
    "    rebal_dates = price_df.index[::rebal_period]\n",
    "    long_rets, short_rets, ls_rets, ret_dates = [], [], [], []\n",
    "\n",
    "    for i in range(len(rebal_dates) - 1):\n",
    "        t0, t1 = rebal_dates[i], rebal_dates[i + 1]\n",
    "        f_today = factor_df.loc[t0].dropna()\n",
    "        if len(f_today) < 10:\n",
    "            continue\n",
    "        n_top = max(1, int(len(f_today) * top_pct))\n",
    "\n",
    "        long_stocks  = f_today.nlargest(n_top).index    # Top 20% → 买入\n",
    "        short_stocks = f_today.nsmallest(n_top).index   # Bottom 20% → 卖空\n",
    "\n",
    "        p0  = price_df.loc[t0]\n",
    "        p1  = price_df.loc[t1]\n",
    "        ret = p1 / p0 - 1.0\n",
    "\n",
    "        lr = ret[long_stocks].mean()\n",
    "        sr = ret[short_stocks].mean()\n",
    "        long_rets.append(lr)\n",
    "        short_rets.append(sr)\n",
    "        ls_rets.append(lr - sr)   # 多空价差 / long-short spread\n",
    "        ret_dates.append(t0)\n",
    "\n",
    "    return {\n",
    "        \"long\"       : pd.Series(long_rets,  index=ret_dates, name=\"Long\"),\n",
    "        \"short\"      : pd.Series(short_rets, index=ret_dates, name=\"Short\"),\n",
    "        \"long_short\" : pd.Series(ls_rets,    index=ret_dates, name=\"L-S\"),\n",
    "    }\n",
    "\n",
    "\n",
    "def compute_max_drawdown(returns: pd.Series) -> float:\n",
    "    \"\"\"最大回撤 / Maximum Drawdown\"\"\"\n",
    "    cum  = (1 + returns).cumprod()\n",
    "    peak = cum.cummax()\n",
    "    return ((cum - peak) / peak).min()\n",
    "\n",
    "\n",
    "def compute_sharpe(returns: pd.Series, ann_factor: float) -> float:\n",
    "    \"\"\"夏普比率 / Annualized Sharpe Ratio\"\"\"\n",
    "    return returns.mean() / returns.std() * np.sqrt(ann_factor) if returns.std() > 0 else 0.0\n",
    "\n",
    "\n",
    "ls_result = compute_long_short_portfolio(composite_icw, prices_df, TOP_PCTILE, REBAL_PERIOD)\n",
    "ls_equity  = {k: (1 + v).cumprod() for k, v in ls_result.items()}\n",
    "\n",
    "ann_factor = periods_per_year\n",
    "print(f\"\\n     {'组合':14s}  {'年化收益':>10s}  {'夏普比率':>10s}  {'最大回撤':>10s}\")\n",
    "print(f\"     \" + \"─\" * 54)\n",
    "for name, rets in ls_result.items():\n",
    "    ann_ret = (1 + rets.mean()) ** ann_factor - 1\n",
    "    sharpe  = compute_sharpe(rets, ann_factor)\n",
    "    mdd     = compute_max_drawdown(rets)\n",
    "    print(f\"     {name:14s}  {ann_ret:+10.2%}  {sharpe:+10.3f}  {mdd:+10.2%}\")\n",
    "print(f\"     \" + \"─\" * 54)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "04431d61",
   "metadata": {},
   "source": [
    "# =============================================================================\n",
    "# §7  因子衰减分析  Factor Decay Analysis\n",
    "\n",
    "# -----------------------------------------------------------------------------\n",
    "# 因子衰减 (Factor Decay) 描述了因子预测能力随持有期增加而减弱的规律：\n",
    "# Factor decay describes how predictive power weakens as holding period grows:\n",
    "#\n",
    "#   短持有期 H=1D:   IC 最高（信号最新鲜，预测力最强）\n",
    "#   H 增大:          IC 逐渐下降（信号被市场消化，噪音积累）\n",
    "#   H=63D（1季度）: IC 接近 0（信号已基本失效）\n",
    "#\n",
    "# 实践意义 / Practical implications:\n",
    "#   ┌──────────────────┬──────────────────────────────────────────────┐\n",
    "#   │ 衰减类型         │ 适合持仓频率                                   │\n",
    "#   ├──────────────────┼──────────────────────────────────────────────┤\n",
    "#   │ 快速衰减 (Fast)  │ 日频或周频换仓（换手率高，成本高！）          │\n",
    "#   │ 慢速衰减 (Slow)  │ 月频或季频换仓（成本效率更高，适合实盘）     │\n",
    "#   └──────────────────┴──────────────────────────────────────────────┘\n",
    "\n",
    "# ============================================================================="
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a2292608",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n[§7] 因子衰减分析 / Factor Decay Analysis...\")\n",
    "\n",
    "DECAY_HORIZONS = [1, 5, 10, 21, 42, 63]    # 持有期（交易日）/ holding periods (days)\n",
    "\n",
    "decay_results = {}\n",
    "for fname in [best_factor_name, \"REV\"]:     # 对比 \"慢衰减\" vs \"快衰减\" 因子\n",
    "    horizon_ics = []\n",
    "    for h in DECAY_HORIZONS:\n",
    "        ic_h = compute_ic_series(FACTORS_PROCESSED[fname], simple_ret_df, h)\n",
    "        horizon_ics.append(ic_h.mean())\n",
    "    decay_results[fname] = horizon_ics\n",
    "    ic_str = \"  \".join([f\"H={h}D: {v:+.4f}\" for h, v in zip(DECAY_HORIZONS, horizon_ics)])\n",
    "    print(f\"     {fname}: {ic_str}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9ecc538c",
   "metadata": {},
   "source": [
    "# =============================================================================\n",
    "# §8  可视化  Visualization\n",
    "\n",
    "# ============================================================================="
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "34a1f89e",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"\\n[§8] 生成可视化图表 / Generating visualization...\")\n",
    "\n",
    "pct_fmt = FuncFormatter(lambda x, _: f\"{x:.0%}\")\n",
    "\n",
    "fig = plt.figure(figsize=(20, 24))\n",
    "fig.suptitle(\n",
    "    \"Alpha 因子研究演示  |  Alpha Factor Research Demo\\n\"\n",
    "    \"50只合成股票 × 3年日线数据  |  50 Synthetic Stocks × 3-Year Daily Data\",\n",
    "    fontsize=14, fontweight='bold', y=0.99\n",
    ")\n",
    "gs = gridspec.GridSpec(4, 3, figure=fig, hspace=0.50, wspace=0.35)\n",
    "\n",
    "# ── Panel 1: 股票池价格（归一化）/ Universe Prices (Normalized) ──────────────\n",
    "ax1 = fig.add_subplot(gs[0, 0])\n",
    "norm_prices = prices_df / prices_df.iloc[0]   # 归一化 / normalize to 1\n",
    "for sym in symbols[:12]:\n",
    "    ax1.plot(dates, norm_prices[sym], alpha=0.35, linewidth=0.7, color='#1f77b4')\n",
    "eq_index = norm_prices.mean(axis=1)           # 等权指数 / equal-weight index\n",
    "ax1.plot(dates, eq_index, color='black', linewidth=2, label='等权指数 EW Index', zorder=5)\n",
    "ax1.axhline(1, color='gray', linewidth=0.6, linestyle='--')\n",
    "ax1.set_title(\"股票池价格（归一化）\\nUniverse Prices (Normalized)\", fontsize=10)\n",
    "ax1.set_ylabel(\"归一化价格\")\n",
    "ax1.legend(fontsize=8)\n",
    "ax1.tick_params(axis='x', rotation=25, labelsize=8)\n",
    "ax1.xaxis.set_major_locator(plt.MaxNLocator(4))\n",
    "\n",
    "# ── Panel 2: IC 时间序列（5因子）/ IC Time Series ─────────────────────────────\n",
    "ax2 = fig.add_subplot(gs[0, 1:])\n",
    "ic_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']\n",
    "for (name, ic_s), color in zip(ic_series_dict.items(), ic_colors):\n",
    "    rolling_ic = ic_s.rolling(20, min_periods=5).mean()   # 20日滚动均值平滑\n",
    "    ax2.plot(ic_s.index, rolling_ic, label=name, color=color, linewidth=1.4)\n",
    "ax2.axhline(0,    color='black', linewidth=0.8, linestyle='--')\n",
    "ax2.axhline(0.05, color='green', linewidth=0.8, linestyle=':', alpha=0.7)\n",
    "ax2.axhline(-0.05, color='red',  linewidth=0.8, linestyle=':', alpha=0.7)\n",
    "ax2.set_title(\"因子 IC 时间序列（20日滚动均值）\\nFactor IC Time Series (20D Rolling Mean)\", fontsize=10)\n",
    "ax2.set_ylabel(\"IC\")\n",
    "ax2.legend(loc='upper right', ncol=5, fontsize=8)\n",
    "ax2.tick_params(axis='x', rotation=25, labelsize=8)\n",
    "ax2.xaxis.set_major_locator(plt.MaxNLocator(5))\n",
    "\n",
    "# ── Panel 3: IC 均值 & 误差棒 / IC Mean Bar Chart ─────────────────────────────\n",
    "ax3 = fig.add_subplot(gs[1, 0])\n",
    "names_list = list(ic_stats.keys())\n",
    "ic_means   = [ic_stats[n][\"IC Mean\"] for n in names_list]\n",
    "ic_stds    = [ic_stats[n][\"IC Std\"]  for n in names_list]\n",
    "bar_colors3 = ['#2ca02c' if v > 0 else '#d62728' for v in ic_means]\n",
    "ax3.bar(names_list, ic_means, yerr=ic_stds, capsize=5, color=bar_colors3,\n",
    "        alpha=0.8, error_kw={'linewidth': 1.5, 'ecolor': 'black'})\n",
    "ax3.axhline(0,    color='black', linewidth=0.8)\n",
    "ax3.axhline(0.05, color='green', linewidth=1.2, linestyle='--', alpha=0.7)\n",
    "ax3.axhline(-0.05, color='red', linewidth=1.2, linestyle='--', alpha=0.7)\n",
    "ax3.set_title(\"因子 IC 均值（误差棒=±1σ）\\nIC Mean ± 1σ\", fontsize=10)\n",
    "ax3.set_ylabel(\"IC 均值\")\n",
    "ax3.tick_params(axis='x', labelsize=9)\n",
    "\n",
    "# ── Panel 4: ICIR 柱状图 / ICIR Bar Chart ──────────────────────────────────────\n",
    "ax4 = fig.add_subplot(gs[1, 1])\n",
    "icirs      = [ic_stats[n][\"ICIR\"] for n in names_list]\n",
    "bar_colors4 = ['#2ca02c' if v > 0 else '#d62728' for v in icirs]\n",
    "ax4.bar(names_list, icirs, color=bar_colors4, alpha=0.8)\n",
    "ax4.axhline(0,   color='black', linewidth=0.8)\n",
    "ax4.axhline(0.5, color='green', linewidth=1.2, linestyle='--', alpha=0.7, label='ICIR=0.5')\n",
    "ax4.axhline(-0.5, color='red', linewidth=1.2, linestyle='--', alpha=0.7)\n",
    "ax4.set_title(\"因子 ICIR\\nFactor ICIR (Mean IC / Std IC)\", fontsize=10)\n",
    "ax4.set_ylabel(\"ICIR\")\n",
    "ax4.legend(fontsize=8)\n",
    "ax4.tick_params(axis='x', labelsize=9)\n",
    "\n",
    "# ── Panel 5: 因子相关矩阵 / Factor Correlation Matrix ─────────────────────────\n",
    "ax5 = fig.add_subplot(gs[1, 2])\n",
    "# 用各因子的截面均值时序来衡量因子间相关性 / Use cross-sectional mean timeseries\n",
    "factor_ts   = pd.DataFrame({n: f.mean(axis=1) for n, f in FACTORS_PROCESSED.items()})\n",
    "corr_matrix = factor_ts.corr()\n",
    "im5 = ax5.imshow(corr_matrix.values, cmap='RdYlGn', vmin=-1, vmax=1, aspect='auto')\n",
    "ax5.set_xticks(range(len(names_list)))\n",
    "ax5.set_yticks(range(len(names_list)))\n",
    "ax5.set_xticklabels(names_list, fontsize=9)\n",
    "ax5.set_yticklabels(names_list, fontsize=9)\n",
    "for i in range(len(names_list)):\n",
    "    for j in range(len(names_list)):\n",
    "        ax5.text(j, i, f\"{corr_matrix.values[i, j]:.2f}\",\n",
    "                 ha='center', va='center', fontsize=8,\n",
    "                 color='black' if abs(corr_matrix.values[i, j]) < 0.6 else 'white')\n",
    "plt.colorbar(im5, ax=ax5)\n",
    "ax5.set_title(\"因子相关矩阵\\nFactor Correlation Matrix\", fontsize=10)\n",
    "\n",
    "# ── Panel 6: 分层回测累积净值 / Quantile Cumulative Returns ───────────────────\n",
    "ax6 = fig.add_subplot(gs[2, 0])\n",
    "q_colors = plt.cm.RdYlGn(np.linspace(0.05, 0.95, N_QUANTILES))\n",
    "for q in range(1, N_QUANTILES + 1):\n",
    "    ax6.plot(quantile_cumret.index, quantile_cumret[f\"Q{q}\"],\n",
    "             label=f\"Q{q}\", color=q_colors[q - 1], linewidth=1.5)\n",
    "ax6.plot(ls_cumret.index, ls_cumret, label='L-S', color='black', linewidth=2, linestyle='--')\n",
    "ax6.axhline(1, color='gray', linewidth=0.6, linestyle=':')\n",
    "ax6.set_title(f\"分层回测（{best_factor_name}）\\nQuantile Returns ({best_factor_name})\", fontsize=10)\n",
    "ax6.set_ylabel(\"累积净值\")\n",
    "ax6.legend(ncol=3, fontsize=8)\n",
    "ax6.yaxis.set_major_formatter(FuncFormatter(lambda x, _: f\"{x:.1f}x\"))\n",
    "ax6.tick_params(axis='x', rotation=25, labelsize=8)\n",
    "ax6.xaxis.set_major_locator(plt.MaxNLocator(4))\n",
    "\n",
    "# ── Panel 7: 分位年化收益柱状图 / Quintile Annualized Return Bar ───────────────\n",
    "ax7 = fig.add_subplot(gs[2, 1])\n",
    "ann_q_rets = [(1 + quantile_rets[f\"Q{q}\"].mean()) ** periods_per_year - 1\n",
    "              for q in range(1, N_QUANTILES + 1)]\n",
    "bar_colors7 = plt.cm.RdYlGn(np.linspace(0.05, 0.95, N_QUANTILES))\n",
    "bars7 = ax7.bar([f\"Q{q}\" for q in range(1, N_QUANTILES + 1)], ann_q_rets, color=bar_colors7)\n",
    "ax7.axhline(0, color='black', linewidth=0.8)\n",
    "ax7.set_title(\"各分位组年化收益\\nAnnualized Return per Quintile\", fontsize=10)\n",
    "ax7.set_ylabel(\"年化收益 / Ann. Return\")\n",
    "ax7.yaxis.set_major_formatter(pct_fmt)\n",
    "ax7.tick_params(axis='x', labelsize=9)\n",
    "for bar, val in zip(bars7, ann_q_rets):\n",
    "    ax7.text(bar.get_x() + bar.get_width() / 2, val,\n",
    "             f\"{val:.1%}\", ha='center',\n",
    "             va='bottom' if val >= 0 else 'top', fontsize=8)\n",
    "\n",
    "# ── Panel 8: 多空组合净值曲线 / Long-Short Equity Curve ───────────────────────\n",
    "ax8 = fig.add_subplot(gs[2, 2])\n",
    "ax8.plot(ls_equity['long_short'].index, ls_equity['long_short'].values,\n",
    "         color='purple', linewidth=2, label='多空 L-S')\n",
    "ax8.plot(ls_equity['long'].index, ls_equity['long'].values,\n",
    "         color='#2ca02c', linewidth=1.5, linestyle='--', label='多头 Long', alpha=0.8)\n",
    "ax8.plot(ls_equity['short'].index, ls_equity['short'].values,\n",
    "         color='#d62728', linewidth=1.5, linestyle='--', label='空头 Short', alpha=0.8)\n",
    "ax8.axhline(1, color='gray', linewidth=0.6, linestyle=':')\n",
    "ax8.set_title(\"多空组合净值曲线\\nLong-Short Portfolio NAV\", fontsize=10)\n",
    "ax8.set_ylabel(\"净值 / NAV\")\n",
    "ax8.legend(fontsize=8)\n",
    "ax8.yaxis.set_major_formatter(FuncFormatter(lambda x, _: f\"{x:.1f}x\"))\n",
    "ax8.tick_params(axis='x', rotation=25, labelsize=8)\n",
    "ax8.xaxis.set_major_locator(plt.MaxNLocator(4))\n",
    "\n",
    "# ── Panel 9: 因子衰减曲线 / Factor Decay Curve ────────────────────────────────\n",
    "ax9 = fig.add_subplot(gs[3, :])\n",
    "decay_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']\n",
    "horizon_labels = [f\"{h}D\" for h in DECAY_HORIZONS]\n",
    "for (fname, decay_ics), color in zip(decay_results.items(), decay_colors):\n",
    "    ax9.plot(range(len(DECAY_HORIZONS)), decay_ics,\n",
    "             'o-', label=fname, color=color, linewidth=2, markersize=7)\n",
    "    for i, (h, v) in enumerate(zip(DECAY_HORIZONS, decay_ics)):\n",
    "        ax9.annotate(f\"{v:+.3f}\", (i, v), textcoords=\"offset points\",\n",
    "                     xytext=(0, 8), ha='center', fontsize=8, color=color)\n",
    "ax9.axhline(0,    color='black', linewidth=0.8, linestyle='--')\n",
    "ax9.axhline(0.05, color='green', linewidth=0.8, linestyle=':', alpha=0.7, label='IC=0.05 参考线')\n",
    "ax9.set_title(\n",
    "    \"因子衰减曲线  Factor Decay Curve\\n\"\n",
    "    \"IC 均值随持有期延长的衰减  |  How Mean IC Decays with Longer Holding Period\",\n",
    "    fontsize=10\n",
    ")\n",
    "ax9.set_xlabel(\"持有期 / Holding Period\")\n",
    "ax9.set_ylabel(\"IC 均值 / Mean IC\")\n",
    "ax9.set_xticks(range(len(DECAY_HORIZONS)))\n",
    "ax9.set_xticklabels(horizon_labels)\n",
    "ax9.legend(fontsize=9, ncol=4)\n",
    "ax9.grid(True, alpha=0.3)\n",
    "ax9.set_xlim(-0.3, len(DECAY_HORIZONS) - 0.7)\n",
    "\n",
    "plt.savefig(\"alpha_factor_demo.png\", dpi=150, bbox_inches='tight')\n",
    "print(f\"     图表已保存: alpha_factor_demo.png / Chart saved: alpha_factor_demo.png\")\n",
    "\n",
    "print(\"\\n\" + \"=\" * 70)\n",
    "print(\"  演示完成！ Demo Complete!\")\n",
    "print(\"  输出: alpha_factor_demo.png\")\n",
    "print(\"=\" * 70)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (trading)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "name": "python",
   "version": "3.11.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}