trading/quant_alpha_factor_demo.ipynb

1191 lines
63 KiB
Plaintext
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

{
"cells": [
{
"cell_type": "markdown",
"id": "a476d03b",
"metadata": {},
"source": [
"# =============================================================================\n",
"# Quantitative Trading — Alpha Factor Research Demo\n",
"# 量化交易 — Alpha 因子研究演示\n",
"\n",
"# =============================================================================\n",
"#\n",
"# Alpha 因子 (Alpha Factor) 是量化选股的核心工具。\n",
"# 它是一个数学公式,从市场数据中提取信号,预测哪些股票未来会跑赢大盘。\n",
"#\n",
"# Alpha factors are the core tool of quantitative stock selection.\n",
"# Each factor is a mathematical formula that extracts a signal from market data\n",
"# to predict which stocks will outperform in the future.\n",
"#\n",
"# 研究流程 / Research Workflow:\n",
"# §0 合成股票池 Synthetic Universe (50只股票 × 3年日线数据)\n",
"# §1 因子构建 Factor Construction (5个经典因子)\n",
"# §2 因子预处理 Factor Preprocessing (去极值、截面标准化、市值中性化)\n",
"# §3 IC 分析 IC Analysis (信息系数 / 因子预测能力量化)\n",
"# §4 分层回测 Quantile Analysis (五分位收益分层验证)\n",
"# §5 因子合成 Factor Combination (等权 & IC加权合成因子)\n",
"# §6 多空组合 Long-Short Portfolio (Top/Bottom 20% 多空策略)\n",
"# §7 因子衰减 Factor Decay (IC 随持有期延长的衰减曲线)\n",
"# §8 可视化 Visualization (9面板汇总图)\n",
"#\n",
"# 前置条件 / Prerequisites:\n",
"# pip install numpy pandas matplotlib scipy\n",
"#\n",
"# 运行方式 / Run:\n",
"# python quant_alpha_factor_demo.py\n",
"\n",
"# ============================================================================="
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "00e42fb4",
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib\n",
"matplotlib.use('Agg') # 非交互模式,避免 GUI 阻塞 / non-interactive, prevents GUI block\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib.gridspec as gridspec\n",
"from matplotlib.ticker import FuncFormatter\n",
"from scipy import stats\n",
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"# 中文字体配置 / Chinese font configuration\n",
"plt.rcParams['font.sans-serif'] = ['WenQuanYi Zen Hei', 'Arial Unicode MS', 'SimHei', 'DejaVu Sans']\n",
"plt.rcParams['axes.unicode_minus'] = False\n",
"\n",
"np.random.seed(42)\n",
"print(\"=\" * 70)\n",
"print(\" 量化交易 Alpha 因子研究演示\")\n",
"print(\" Quantitative Trading: Alpha Factor Research Demo\")\n",
"print(\"=\" * 70)"
]
},
{
"cell_type": "markdown",
"id": "986c9755",
"metadata": {},
"source": [
"# =============================================================================\n",
"# §0 合成股票池 Synthetic Stock Universe\n",
"\n",
"# -----------------------------------------------------------------------------\n",
"# 真实因子研究需要几百到几千只股票的截面数据 (cross-sectional data)。\n",
"# 我们用\"隐藏因子生成模型\" (Hidden Factor Generative Model) 合成数据,\n",
"# 确保我们构建的因子确实具有预测能力:\n",
"#\n",
"# Real factor research requires cross-sectional data of hundreds of stocks.\n",
"# We use a Hidden Factor Generative Model to synthesize data, ensuring the\n",
"# factors we build actually have genuine predictive power.\n",
"#\n",
"# 每只股票的日收益 = 市场分量 + 质量Alpha + 动量Alpha + 特质噪音\n",
"# stock_daily_return = market_component + quality_alpha + momentum_alpha + noise\n",
"#\n",
"# 其中:\n",
"# market_component = β_i × r_market (系统性风险 / systematic risk)\n",
"# quality_alpha = λ_q × quality_score_{i,t} (质量因子加载, 季度持续)\n",
"# momentum_alpha = λ_m × momentum_score_{i,t} (动量因子加载, 月度持续)\n",
"# noise ~ N(0, σ_idio) (特质风险 / idiosyncratic risk)\n",
"#\n",
"# 关键: quality_score 和 momentum_score 是\"隐藏\"的,我们无法直接观察。\n",
"# 但当我们从价格数据中计算因子时,会自然捕捉到这些隐藏信号。\n",
"# Key: these hidden scores are unobservable, but our computed factors will\n",
"# naturally capture them — this is exactly what alpha factors do!\n",
"\n",
"# ============================================================================="
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e1332377",
"metadata": {},
"outputs": [],
"source": [
"N_STOCKS = 50 # 股票数量 / number of stocks\n",
"N_DAYS = 756 # 交易日 ≈ 3 年 / trading days ≈ 3 years\n",
"N_QUARTERS = N_DAYS // 63 + 2 # 季度数 / number of quarters\n",
"N_MONTHS = N_DAYS // 21 + 2 # 月度数 / number of months\n",
"\n",
"# 交易日期序列 / Business-day date index\n",
"dates = pd.bdate_range(start=\"2021-01-04\", periods=N_DAYS)\n",
"\n",
"# 股票代码 (模拟A股格式) / Stock symbols (simulated A-share format)\n",
"symbols = [f\"{str(i).zfill(6)}.{'SH' if i % 2 == 0 else 'SZ'}\" for i in range(1, N_STOCKS + 1)]\n",
"\n",
"# ── 0-A 隐藏质量因子(季度持续)/ Hidden Quality Factor (Quarterly Persistence) ──\n",
"#\n",
"# 质量因子 AR(1): quality_t = 0.90 × quality_{t-1} + shock\n",
"# 季度自回归系数 0.90 → 年度持续性 ≈ 0.90^4 = 0.66(持续性更强)\n",
"# Quarterly AR(1) = 0.90 → annual persistence ≈ 0.90^4 = 0.66 (more persistent)\n",
"quality_quarterly = np.zeros((N_STOCKS, N_QUARTERS))\n",
"quality_quarterly[:, 0] = np.random.randn(N_STOCKS)\n",
"for q in range(1, N_QUARTERS):\n",
" # sqrt(1 - 0.90^2) ≈ 0.436 保持方差为 1 / keeps variance = 1\n",
" quality_quarterly[:, q] = (0.90 * quality_quarterly[:, q - 1]\n",
" + 0.436 * np.random.randn(N_STOCKS))\n",
"\n",
"# 季度 → 每日映射 / Map quarterly to daily\n",
"quality_daily = np.zeros((N_DAYS, N_STOCKS))\n",
"for d in range(N_DAYS):\n",
" q_idx = min(d // 63, N_QUARTERS - 1)\n",
" quality_daily[d] = quality_quarterly[:, q_idx]\n",
"\n",
"# ── 0-B 隐藏动量因子(月度持续)/ Hidden Momentum Factor (Monthly Persistence) ──\n",
"#\n",
"# 动量因子 AR(1): momentum_t = 0.80 × momentum_{t-1} + 0.60 × shock\n",
"# 月度自回归系数 0.80,意味着 3 个月后的持续性 ≈ 0.80^3 = 0.51\n",
"# Monthly AR(1) coefficient 0.80 → 3-month persistence ≈ 0.80^3 = 0.51\n",
"momentum_monthly = np.zeros((N_STOCKS, N_MONTHS))\n",
"momentum_monthly[:, 0] = np.random.randn(N_STOCKS)\n",
"for m in range(1, N_MONTHS):\n",
" # 月度 AR(1) = 0.80,年度持续性 ≈ 0.80^12 = 0.069(合理的动量衰减)\n",
" # Monthly AR(1) = 0.80 → annual persistence ≈ 0.80^12 = 0.069\n",
" momentum_monthly[:, m] = (0.80 * momentum_monthly[:, m - 1]\n",
" + 0.60 * np.random.randn(N_STOCKS))\n",
"\n",
"momentum_daily = np.zeros((N_DAYS, N_STOCKS))\n",
"for d in range(N_DAYS):\n",
" m_idx = min(d // 21, N_MONTHS - 1)\n",
" momentum_daily[d] = momentum_monthly[:, m_idx]\n",
"\n",
"# ── 0-C 日收益率生成 / Daily Return Generation ────────────────────────────────\n",
"#\n",
"# 日市场收益 (Market Daily Return): 年化μ=8%, σ=20%\n",
"mkt_returns = np.random.normal(0.0003, 0.0126, N_DAYS) # 0.20/√252 ≈ 0.0126\n",
"\n",
"# 每只股票的市场贝塔 β: 均匀分布在 [0.5, 1.5]\n",
"# Each stock's market beta β: uniformly distributed in [0.5, 1.5]\n",
"stock_betas = np.random.uniform(0.5, 1.5, N_STOCKS)\n",
"\n",
"# (N_DAYS, N_STOCKS): 每列是一只股票,每行是一个交易日\n",
"market_component = mkt_returns[:, np.newaxis] * stock_betas[np.newaxis, :] # 市场分量\n",
"quality_component = 0.00080 * quality_daily # 质量贡献:约 ±20% 年化 alpha\n",
"momentum_component = 0.00050 * momentum_daily # 动量贡献:约 ±12% 年化 alpha\n",
"idio_noise = np.random.normal(0, 0.0075, (N_DAYS, N_STOCKS)) # 特质噪音\n",
"\n",
"# 日对数收益率 / Daily log returns\n",
"log_returns = market_component + quality_component + momentum_component + idio_noise\n",
"\n",
"# ── 0-D 价格路径 / Price Paths ────────────────────────────────────────────────\n",
"# P_t = P_0 × exp( Σ log_return_s, s=1..t ) (价格路径 / price path)\n",
"initial_prices = np.random.uniform(5.0, 100.0, N_STOCKS) # A股初始价格\n",
"prices_arr = initial_prices * np.exp(np.cumsum(log_returns, axis=0))\n",
"\n",
"prices_df = pd.DataFrame(prices_arr, index=dates, columns=symbols)\n",
"log_ret_df = pd.DataFrame(log_returns, index=dates, columns=symbols)\n",
"simple_ret_df = prices_df.pct_change() # 简单收益率: r_t = P_t/P_{t-1} - 1\n",
"\n",
"# ── 0-E 成交量生成 / Volume Generation ──────────────────────────────────────\n",
"# 真实市场中,成交量与绝对收益率正相关(大涨大跌时换手更活跃)。\n",
"# In real markets, volume correlates with |return| (more trading on big moves).\n",
"#\n",
"# log(volume) = log(base) + 3×|log_return| + noise\n",
"base_volumes = np.random.uniform(1e6, 5e7, N_STOCKS) # 基础日均成交量(股)\n",
"vol_multiplier = np.exp(3.0 * np.abs(log_returns) + 0.3 * np.random.randn(N_DAYS, N_STOCKS))\n",
"volume_df = pd.DataFrame(base_volumes * vol_multiplier, index=dates, columns=symbols)\n",
"\n",
"# ── 0-F 模拟市值 / Simulated Market Capitalization ───────────────────────────\n",
"# 市值 (Market Cap) = 价格 × 流通股数 / Market Cap = Price × Float Shares\n",
"float_shares = np.random.uniform(1e8, 1e10, N_STOCKS) # 流通股数(股)\n",
"mktcap_df = prices_df * float_shares # 市值(元)\n",
"log_mktcap_df = np.log(mktcap_df) # 对数市值(因子分析常用)\n",
"\n",
"market_series = pd.Series(mkt_returns, index=dates) # 市场收益率序列\n",
"\n",
"print(f\"\\n[§0] 股票池生成完成 / Universe Generated\")\n",
"print(f\" 股票数量 (Stocks): {N_STOCKS} 只\")\n",
"print(f\" 交易日数 (Days): {N_DAYS} 天 {dates[0].date()} ~ {dates[-1].date()}\")\n",
"print(f\" 价格区间 (Price): {prices_df.min().min():.2f} ~ {prices_df.max().max():.2f} 元\")"
]
},
{
"cell_type": "markdown",
"id": "04c44408",
"metadata": {},
"source": [
"# =============================================================================\n",
"# §1 因子构建 Factor Construction\n",
"\n",
"# -----------------------------------------------------------------------------\n",
"# 我们构建 5 个来自学术文献的经典因子,涵盖不同的 Alpha 来源:\n",
"# We build 5 classic factors from academic literature, covering different alpha sources:\n",
"#\n",
"# ┌──────┬────────────────────────────┬──────────────────────────────────────────┐\n",
"# │ 因子 │ 名称 │ 公式 & 来源 │\n",
"# ├──────┼────────────────────────────┼──────────────────────────────────────────┤\n",
"# │ MOM │ 动量 Momentum │ r(t-252, t-21) Jegadeesh & Titman 1993 │\n",
"# │ REV │ 短期反转 Reversal │ -r(t-21, t) Jegadeesh 1990 │\n",
"# │ LVOL │ 低波动 Low Volatility │ -std(r, 60D) Baker et al. 2011 │\n",
"# │ BAB │ 低贝塔 Betting vs Beta │ -β(60D) Frazzini & Pedersen 2014 │\n",
"# │ ILLIQ│ 非流动性 Amihud Illiquidity│ mean(|r|/V,20D) Amihud 2002 │\n",
"# └──────┴────────────────────────────┴──────────────────────────────────────────┘\n",
"#\n",
"# 重要约定: 所有因子都以\"因子值越高 → 预期未来收益越高\"为正方向。\n",
"# Convention: higher factor value → expected higher future return (unified sign).\n",
"\n",
"# ============================================================================="
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1d852ac0",
"metadata": {},
"outputs": [],
"source": [
"print(\"\\n[§1] 构建因子 / Constructing factors...\")\n",
"\n",
"# ── 1-A 动量因子 (MOM) Momentum Factor ──────────────────────────────────────\n",
"#\n",
"# 来源 / Source: Jegadeesh & Titman (1993)\n",
"# \"Returns to Buying Winners and Selling Losers\"\n",
"#\n",
"# 动量效应 (Momentum Effect): 过去 12 个月(跳过最近 1 个月)涨幅最大的股票,\n",
"# 未来 3-12 个月通常继续跑赢市场。这是学术上记录最多的股票市场异象之一。\n",
"#\n",
"# Momentum effect: stocks with highest 12M-1M returns tend to outperform\n",
"# over the next 3-12 months. One of the most documented stock market anomalies.\n",
"#\n",
"# 为什么要跳过最近 1 个月?/ Why skip the last month?\n",
"# 最近 1 个月的收益存在短期反转效应(买卖价差、做市商库存调整等微观结构原因),\n",
"# 会污染中期动量信号,所以必须跳过。\n",
"# The last month exhibits short-term reversal due to bid-ask spread and\n",
"# market-maker inventory rebalancing, contaminating medium-term momentum.\n",
"#\n",
"# 公式 / Formula:\n",
"# MOM_t = (P_{t-21} / P_{t-252}) - 1 (从12M前 到 1M前 的累积收益)\n",
"# MOM_t = (P_{t-21} / P_{t-252}) - 1 (cumulative return from 12M to 1M ago)\n",
"\n",
"MOM_LONG = 252 # 长回看窗口: 12个月 / lookback long: 12 months\n",
"MOM_SHORT = 21 # 跳过窗口: 1个月 / skip window: 1 month\n",
"\n",
"factor_mom = prices_df.shift(MOM_SHORT) / prices_df.shift(MOM_LONG) - 1.0\n",
"print(f\" ✓ MOM 动量因子: 有效值 {factor_mom.notna().mean().mean():.1%}\")\n",
"\n",
"# ── 1-B 短期反转因子 (REV) Short-Term Reversal Factor ───────────────────────\n",
"#\n",
"# 来源 / Source: Jegadeesh (1990)\n",
"# \"Evidence of Predictable Behavior of Security Returns\"\n",
"#\n",
"# 反转效应 (Reversal Effect): 上个月大涨的股票,下个月往往小幅回调;\n",
"# 大跌的股票往往小幅反弹。这是对短期\"过度反应\"的修正。\n",
"#\n",
"# Reversal effect: last month's winners tend to reverse; losers tend to bounce.\n",
"# This is correction of short-term overreaction.\n",
"#\n",
"# 注意: 取负号是因为月涨 → 因子值高 → 预期下跌(反转),我们统一正方向。\n",
"# Note: negative sign because high past return → expected to reverse → lower future return,\n",
"# we flip to maintain convention: higher factor → higher expected return.\n",
"#\n",
"# 公式 / Formula:\n",
"# REV_t = -(P_t / P_{t-21} - 1) (负1月收益 / negative 1-month return)\n",
"\n",
"factor_rev = -(prices_df / prices_df.shift(MOM_SHORT) - 1.0)\n",
"print(f\" ✓ REV 反转因子: 有效值 {factor_rev.notna().mean().mean():.1%}\")\n",
"\n",
"# ── 1-C 低波动因子 (LVOL) Low Volatility Factor ────────────────────────────\n",
"#\n",
"# 来源 / Source: Ang et al.(2006); Baker, Bradley & Wurgler (2011)\n",
"# \"Benchmarks as Limits to Arbitrage: Understanding the Low-Volatility Anomaly\"\n",
"#\n",
"# 低波动异象 (Low Volatility Anomaly): 经典金融理论 CAPM 预测高风险 = 高收益。\n",
"# 但实证研究发现,低波动率的股票反而有更高的原始收益和风险调整后收益!\n",
"#\n",
"# Low-vol anomaly: contrary to CAPM, low-volatility stocks earn HIGHER\n",
"# risk-adjusted and even raw returns than high-volatility stocks.\n",
"#\n",
"# 可能的解释 / Possible explanations:\n",
"# ① 机构投资者受基准约束,被迫持有高贝塔股票 (benchmark constraints)\n",
"# ② 投资者对\"彩票式\"高波动股票有偏好,导致其被高估 (lottery preference)\n",
"# ③ 杠杆限制阻止套利者纠正定价偏差 (leverage constraints)\n",
"#\n",
"# 公式 / Formula:\n",
"# LVOL_t = -σ(r_{t-60:t}) (负60日已实现波动率 / negative 60-day realized vol)\n",
"\n",
"VOL_WINDOW = 60 # 60 个交易日 ≈ 3 个月 / 60 trading days ≈ 3 months\n",
"factor_lvol = -(simple_ret_df.rolling(VOL_WINDOW).std())\n",
"print(f\" ✓ LVOL 低波动因子: 有效值 {factor_lvol.notna().mean().mean():.1%}\")\n",
"\n",
"# ── 1-D 低贝塔因子 (BAB) Betting Against Beta ──────────────────────────────\n",
"#\n",
"# 来源 / Source: Frazzini & Pedersen (2014) \"Betting Against Beta\"\n",
"#\n",
"# BAB 异象: 做多低贝塔股票、做空高贝塔股票,可以获得持续的超额收益。\n",
"# 与低波动因子类似,但专注于对\"市场系统性风险\"的暴露,而非总波动率。\n",
"#\n",
"# BAB anomaly: long low-beta, short high-beta → persistent excess return.\n",
"# Similar to low-vol but focuses on market systematic risk exposure.\n",
"#\n",
"# 贝塔计算 / Beta computation:\n",
"# β_i = Cov(r_i, r_market) / Var(r_market) (60日滚动窗口 / 60-day rolling)\n",
"#\n",
"# 向量化技巧: 利用 pandas rolling 方法计算所有股票的滚动协方差和市场方差\n",
"# Vectorized trick: use pandas rolling to compute rolling covariance and variance\n",
"\n",
"BETA_WINDOW = 60 # 60日滚动贝塔 / 60-day rolling beta\n",
"\n",
"# pandas rolling().cov(other) 计算每个时间点的滚动协方差\n",
"# pandas rolling().cov(other) computes rolling covariance at each time point\n",
"rolling_var_mkt = market_series.rolling(BETA_WINDOW).var() # 市场方差 / market variance\n",
"rolling_cov = log_ret_df.apply(\n",
" lambda col: col.rolling(BETA_WINDOW).cov(market_series) # 逐列协方差 / per-stock cov\n",
")\n",
"rolling_beta_df = rolling_cov.div(rolling_var_mkt, axis=0) # β = cov / var\n",
"factor_bab = -rolling_beta_df # 取负:低贝塔 → 高因子值 / flip: low-beta → high score\n",
"print(f\" ✓ BAB 低贝塔因子: 有效值 {factor_bab.notna().mean().mean():.1%}\")\n",
"\n",
"# ── 1-E Amihud 非流动性因子 (ILLIQ) Amihud Illiquidity Factor ───────────────\n",
"#\n",
"# 来源 / Source: Amihud (2002) \"Illiquidity and Stock Returns\"\n",
"#\n",
"# Amihud 非流动性指标 (Amihud Illiquidity Measure):\n",
"# ILLIQ_{i,t} = (1/D) × Σ_{d=t-D+1}^{t} |r_{i,d}| / Volume_{i,d}\n",
"#\n",
"# 含义 (Interpretation):\n",
"# \"每单位成交量能引起多大的价格变动?\"\n",
"# \"How much price movement per unit of trading volume?\"\n",
"#\n",
"# 值越高 → 流动性越差 (higher → less liquid)\n",
"# 流动性差的股票风险更高,投资者要求额外的\"流动性溢价\"(liquidity premium)\n",
"# Illiquid stocks carry higher risk, investors demand extra liquidity premium\n",
"#\n",
"# 注: 本因子中,高 ILLIQ(流动性差)→ 预期高收益(溢价补偿)→ 符合正方向约定\n",
"# Note: high ILLIQ (illiquid) → expected high return (premium) → matches positive convention\n",
"\n",
"ILLIQ_WINDOW = 20 # 20日均值 / 20-day average\n",
"\n",
"# |日收益率| / 日成交量,再取 20 日滚动均值\n",
"# |daily return| / daily volume, then 20-day rolling mean\n",
"price_impact = simple_ret_df.abs() / volume_df # 每日价格冲击 / daily price impact\n",
"factor_illiq = price_impact.rolling(ILLIQ_WINDOW).mean() * 1e8 # 缩放 / scaling\n",
"print(f\" ✓ ILLIQ 非流动性因子: 有效值 {factor_illiq.notna().mean().mean():.1%}\")\n",
"\n",
"# ── 1-F 汇总 / Consolidate ───────────────────────────────────────────────────\n",
"FACTORS = {\n",
" \"MOM\" : factor_mom, # 动量\n",
" \"REV\" : factor_rev, # 短期反转\n",
" \"LVOL\" : factor_lvol, # 低波动\n",
" \"BAB\" : factor_bab, # 低贝塔 / 贝塔\n",
" \"ILLIQ\" : factor_illiq, # Amihud 非流动性\n",
"}\n",
"print(f\"\\n 共构建 {len(FACTORS)} 个因子: {list(FACTORS.keys())}\")"
]
},
{
"cell_type": "markdown",
"id": "ab363340",
"metadata": {},
"source": [
"# =============================================================================\n",
"# §2 因子预处理 Factor Preprocessing\n",
"\n",
"# -----------------------------------------------------------------------------\n",
"# 原始因子值不能直接用于分析,需要经过三步标准化预处理流程:\n",
"# Raw factor values must go through a 3-step preprocessing pipeline:\n",
"#\n",
"# Step 1 ── 截面去极值 (Cross-Sectional Winsorization)\n",
"# 在每个日期截面,将超出 [μ - 3σ, μ + 3σ] 范围的值截断。\n",
"# At each date, clip values outside [mean - 3σ, mean + 3σ].\n",
"# 为什么?极端异常值会主导相关系数计算,掩盖真实的因子信号。\n",
"# Why? Outliers dominate correlation calculations and mask the true signal.\n",
"#\n",
"# Step 2 ── 截面 Z-score 标准化 (Cross-Sectional Z-score)\n",
"# z_{i,t} = (x_{i,t} - μ_t) / σ_t (在每个时间截面 t 上计算)\n",
"# z_{i,t} = (x_{i,t} - μ_t) / σ_t (computed cross-sectionally at each date t)\n",
"# 为什么?不同因子量纲不同(动量是%,波动率也是%,ILLIQ 是极小数),\n",
"# 标准化后可以直接比较和合成。\n",
"# Why? Factors have different scales; Z-score enables fair comparison and combination.\n",
"#\n",
"# Step 3 ── 市值中性化 (Market Cap Neutralization)\n",
"# 在每个截面,对 log(市值) 做 OLS 回归,用残差替换原因子值:\n",
"# At each date, regress on log(mktcap) via OLS; use residuals as factor:\n",
"# factor_neutral_i = factor_i - (α + β × log_mktcap_i)\n",
"# 为什么?小市值效应 (Size Effect) 会污染其他因子。例如小市值股票往往同时具有\n",
"# 高动量、高波动率、低流动性,如果不剔除市值效应,因子实际上只是\n",
"# 在选小市值股票,而不是真正的动量/波动/流动性信号。\n",
"# Why? Size effect contaminates other factors — small caps often have high momentum,\n",
"# high vol, and low liquidity simultaneously. Without neutralization,\n",
"# factors simply pick small caps rather than true alpha signals.\n",
"\n",
"# ============================================================================="
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0fb3a541",
"metadata": {},
"outputs": [],
"source": [
"print(\"\\n[§2] 因子预处理 / Factor Preprocessing...\")\n",
"\n",
"\n",
"def winsorize_cross_section(factor_df: pd.DataFrame, n_std: float = 3.0) -> pd.DataFrame:\n",
"    \"\"\"\n",
"    Clip every date's cross-section to mean +/- n_std standard deviations.\n",
"\n",
"    For each row (date) the cross-sectional mean and standard deviation are\n",
"    taken with pandas' default NaN skipping, and values outside the band are\n",
"    pulled back to the nearest bound. The clipping is a single broadcast\n",
"    np.clip call over the whole array, not a per-row apply.\n",
"\n",
"    Parameters:\n",
"        factor_df : raw factor values (date x stock)\n",
"        n_std     : half-width of the allowed band, in standard deviations\n",
"    Returns:\n",
"        DataFrame with the same index and columns as factor_df\n",
"    \"\"\"\n",
"    row_mean = factor_df.mean(axis=1)\n",
"    half_width = n_std * factor_df.std(axis=1)\n",
"    # Reshape the per-date bounds to (n_dates, 1) so they broadcast across stocks.\n",
"    floor = (row_mean - half_width).values.reshape(-1, 1)\n",
"    ceiling = (row_mean + half_width).values.reshape(-1, 1)\n",
"    return pd.DataFrame(\n",
"        np.clip(factor_df.values, floor, ceiling),\n",
"        index=factor_df.index,\n",
"        columns=factor_df.columns,\n",
"    )\n",
"\n",
"\n",
"def zscore_cross_section(factor_df: pd.DataFrame) -> pd.DataFrame:\n",
"    \"\"\"\n",
"    Standardize each date's cross-section to zero mean and unit std.\n",
"\n",
"    A row whose standard deviation is exactly 0 has its divisor replaced by\n",
"    NaN, so that whole row comes back as NaN instead of +/-inf.\n",
"    \"\"\"\n",
"    row_mean = factor_df.mean(axis=1)\n",
"    row_std = factor_df.std(axis=1).replace(0, np.nan)\n",
"    # axis=0 aligns the per-date statistics with the row index.\n",
"    centered = factor_df.sub(row_mean, axis=0)\n",
"    return centered.div(row_std, axis=0)\n",
"\n",
"\n",
"def neutralize_mktcap(factor_df: pd.DataFrame, log_mktcap: pd.DataFrame) -> pd.DataFrame:\n",
"    \"\"\"\n",
"    Remove the size exposure of a factor, one cross-section at a time.\n",
"\n",
"    For every date the factor is regressed on log market cap with a\n",
"    closed-form univariate OLS, and the fitted part is subtracted:\n",
"        residual_i = factor_i - (intercept + slope * log_mktcap_i)\n",
"\n",
"    A date is left untouched when fewer than 10 stocks have both values\n",
"    present, or when log market cap has (numerically) no variation.\n",
"    The input frame is not modified; a copy of its values is edited.\n",
"    \"\"\"\n",
"    resid = factor_df.values.copy()                                # (dates, stocks)\n",
"    size = log_mktcap.reindex(columns=factor_df.columns).values   # same column order\n",
"\n",
"    for t, row_y in enumerate(resid):\n",
"        row_x = size[t]\n",
"        usable = ~np.isnan(row_y) & ~np.isnan(row_x)\n",
"        if usable.sum() < 10:\n",
"            continue\n",
"        xs, ys = row_x[usable], row_y[usable]\n",
"        x_bar, y_bar = xs.mean(), ys.mean()\n",
"        sxx = np.dot(xs - x_bar, xs - x_bar)\n",
"        if sxx < 1e-12:\n",
"            continue\n",
"        beta = np.dot(xs - x_bar, ys - y_bar) / sxx\n",
"        alpha = y_bar - beta * x_bar\n",
"        # row_y is a view into resid, so this writes the residuals in place.\n",
"        row_y[usable] = ys - (alpha + beta * xs)\n",
"\n",
"    return pd.DataFrame(resid, index=factor_df.index, columns=factor_df.columns)\n",
"\n",
"\n",
"# 对所有因子应用预处理流水线 / Apply preprocessing pipeline to all factors\n",
"FACTORS_PROCESSED = {}\n",
"for name, factor in FACTORS.items():\n",
" step1 = winsorize_cross_section(factor) # ① 去极值\n",
" step2 = zscore_cross_section(step1) # ② Z-score\n",
" step3 = neutralize_mktcap(step2, log_mktcap_df) # ③ 市值中性化\n",
" FACTORS_PROCESSED[name] = step3\n",
" print(f\" ✓ {name:5s}: 去极值 → Z-score → 市值中性化 完成\")\n",
"\n",
"print(\" 预处理完成 / Preprocessing complete.\")"
]
},
{
"cell_type": "markdown",
"id": "5826aca6",
"metadata": {},
"source": [
"# =============================================================================\n",
"# §3 IC 分析 Information Coefficient Analysis\n",
"\n",
"# -----------------------------------------------------------------------------\n",
"# IC (Information Coefficient 信息系数) 是衡量因子预测能力的黄金标准。\n",
"# IC is the gold standard for measuring a factor's predictive power.\n",
"#\n",
"# 定义 / Definition:\n",
"# IC_t = Spearman_Rank_Correlation( factor_{t}, forward_return_{t+H} )\n",
"#\n",
"# 其中:\n",
"# factor_{t} = 第 t 日截面因子值(50只股票,每只一个数)\n",
"# forward_return_{t+H} = 从第 t 日起持有 H 日的未来收益(未来数据!)\n",
"# H = 21 交易日 ≈ 1 个月(最常用的持有期)\n",
"#\n",
"# 用 Spearman 秩相关而非 Pearson 线性相关的原因:\n",
"# ① 对极端值鲁棒 (robust to outliers)\n",
"# ② 衡量单调关系(不需要线性)(measures monotonic, not necessarily linear, relationship)\n",
"#\n",
"# Why Spearman rank correlation instead of Pearson?\n",
"# ① Robust to outliers\n",
"# ② Measures monotonic relationship (no linearity assumption needed)\n",
"#\n",
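"# IC 评价标准 / IC benchmark (经验法则;下方代码的评级采用略宽松的 0.04 / 0.08 阈值 / rule of thumb; the rating code below uses slightly looser 0.04 / 0.08 cutoffs):\n",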
"# |IC均值| > 0.05 → 因子有效 / factor is effective\n",
"# |IC均值| > 0.10 → 因子强 / factor is strong\n",
"# ICIR > 0.50 → 因子稳定 / factor is stable\n",
"# IC>0 比率 > 55% → 方向一致性好 / directionally consistent\n",
"#\n",
"# ICIR (IC Information Ratio):\n",
"# ICIR = IC均值 / IC标准差 = IC_mean / IC_std\n",
"# 类似夏普比率,衡量\"每单位波动贡献多少稳定的预测能力\"\n",
"# Similar to Sharpe ratio — measures how much stable predictive power per unit of volatility\n",
"\n",
"# ============================================================================="
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2201ba1a",
"metadata": {},
"outputs": [],
"source": [
"print(\"\\n[§3] IC 分析 / IC Analysis...\")\n",
"\n",
"FORWARD_PERIOD = 21 # 持有期 H = 21 日 ≈ 1 个月 / holding period ≈ 1 month\n",
"\n",
"\n",
"def compute_ic_series(\n",
"    factor_df: pd.DataFrame,\n",
"    forward_ret_df: pd.DataFrame,\n",
"    holding_period: int = 21,\n",
") -> pd.Series:\n",
"    \"\"\"\n",
"    Compute the information-coefficient (IC) time series of a factor.\n",
"\n",
"    For each date t the IC is the cross-sectional Spearman rank correlation\n",
"    between the factor values at t and the return earned by holding each\n",
"    stock from t to t+H:\n",
"        IC_t = spearmanr( factor[t, :], holding_return[t -> t+H, :] )\n",
"\n",
"    Parameters:\n",
"        factor_df      : preprocessed factor values (date x stock)\n",
"        forward_ret_df : DAILY simple returns (date x stock); must contain\n",
"                         the dates and stocks of factor_df\n",
"        holding_period : holding period H in trading days, must be >= 1\n",
"    Returns:\n",
"        ic_series : IC values indexed by date, named \"IC\". Dates on which\n",
"                    fewer than 10 stocks have both a factor value and a\n",
"                    forward return are omitted.\n",
"    Raises:\n",
"        ValueError : if holding_period is smaller than 1\n",
"    \"\"\"\n",
"    if holding_period < 1:\n",
"        raise ValueError(\"holding_period must be >= 1\")\n",
"\n",
"    # Compound the daily returns of days t+1 .. t+H and align the result to row t.\n",
"    # Shifting the daily series alone would pair the factor with the one-day\n",
"    # return on day t+H rather than with the H-day holding return.\n",
"    # Summing log(1 + r) and converting back is the same as the product of (1 + r).\n",
"    log_growth = np.log1p(forward_ret_df)\n",
"    fwd_returns = np.expm1(\n",
"        log_growth.rolling(holding_period).sum().shift(-holding_period)\n",
"    )\n",
"\n",
"    ic_values, ic_dates = [], []\n",
"    # The last H dates have no complete forward window, so they are skipped.\n",
"    for date in factor_df.index[:-holding_period]:\n",
"        f_row = factor_df.loc[date].dropna()\n",
"        r_row = fwd_returns.loc[date, f_row.index].dropna()\n",
"        if len(r_row) < 10:  # need at least 10 stocks with both values\n",
"            continue\n",
"        # r_row's index is a subset of f_row's, so it is the common universe.\n",
"        ic, _ = stats.spearmanr(f_row[r_row.index].values, r_row.values)\n",
"        ic_values.append(ic)\n",
"        ic_dates.append(date)\n",
"\n",
"    return pd.Series(ic_values, index=ic_dates, name=\"IC\")\n",
"\n",
"\n",
"# 逐因子计算 IC 序列 / Compute IC series for each factor\n",
"ic_series_dict = {}\n",
"for name, factor in FACTORS_PROCESSED.items():\n",
" ic_series_dict[name] = compute_ic_series(factor, simple_ret_df, FORWARD_PERIOD)\n",
"\n",
"# ── IC 汇总统计表 / IC Summary Statistics Table ───────────────────────────────\n",
"print(f\"\\n {'因子':8s} {'IC均值':>8s} {'IC标准差':>9s} {'ICIR':>8s} {'IC>0比率':>9s} 评级\")\n",
"print(f\" \" + \"─\" * 64)\n",
"\n",
"ic_stats = {}\n",
"for name, ic_s in ic_series_dict.items():\n",
" ic_mean = ic_s.mean()\n",
" ic_std = ic_s.std()\n",
" icir = ic_mean / ic_std if ic_std > 0 else 0.0\n",
" positive_rate = (ic_s > 0).mean()\n",
" ic_stats[name] = {\n",
" \"IC Mean\" : ic_mean,\n",
" \"IC Std\" : ic_std,\n",
" \"ICIR\" : icir,\n",
" \"Positive Rate\": positive_rate,\n",
" }\n",
" # 评级: ★★=强, ★=有效, ○=弱 / rating\n",
" if abs(ic_mean) > 0.08:\n",
" rating = \"★★ 强 Strong\"\n",
" elif abs(ic_mean) > 0.04:\n",
" rating = \"★ 有效 Effective\"\n",
" else:\n",
" rating = \"○ 弱 Weak\"\n",
" print(f\" {name:8s} {ic_mean:+8.4f} {ic_std:9.4f} {icir:+8.4f} {positive_rate:9.1%} {rating}\")\n",
"\n",
"print(f\" \" + \"─\" * 64)\n",
"print(f\" 评级标准: |IC均值|>0.08 ★★强 |IC均值|>0.04 ★有效 ICIR>0.5 稳定\")"
]
},
{
"cell_type": "markdown",
"id": "0b099a1a",
"metadata": {},
"source": [
"# =============================================================================\n",
"# §4 分层回测 Quantile Return Analysis\n",
"\n",
"# -----------------------------------------------------------------------------\n",
"# 分层回测 (Quantile Analysis / Bucket Test) 是验证因子有效性的经典直观方法。\n",
"# Quantile analysis is the classic, intuitive way to validate a factor.\n",
"#\n",
"# 操作步骤 / Procedure:\n",
"# ① 在每个调仓日 T,按因子值将全部股票从小到大排序\n",
"# ② 等分为 Q 组(常用 Q=5 五分位)\n",
"# ③ 每组各构建等权组合 (equal-weight portfolio),持有到下次调仓日 T+21\n",
"# ④ 记录每组收益,重复直到回测结束\n",
"#\n",
"# ① On each rebalance date T, rank all stocks by factor value\n",
"# ② Split into Q equal-sized buckets (usually Q=5 quintiles)\n",
"# ③ Each bucket forms an equal-weight portfolio, held until next rebalance T+21\n",
"# ④ Record each bucket's return, repeat until end of backtest\n",
"#\n",
"# 期望结果 / Expected result:\n",
"# Q5 累积收益 > Q4 > Q3 > Q2 > Q1\n",
"# 即因子值越高,未来收益越高 —— 这是因子有效的最直观证明\n",
"# Higher factor value → higher future return = most intuitive proof of efficacy\n",
"\n",
"# ============================================================================="
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a7c15a4c",
"metadata": {},
"outputs": [],
"source": [
"print(\"\\n[§4] 分层回测 / Quantile Return Analysis...\")\n",
"\n",
"N_QUANTILES = 5 # 五分位 / quintiles\n",
"REBAL_PERIOD = 21 # 月度调仓 / monthly rebalancing\n",
"\n",
"# 选用 |ICIR| 最高的因子做分层演示 / Use the factor with the highest |ICIR| for the demo\n",
"best_factor_name = max(ic_stats, key=lambda n: abs(ic_stats[n][\"ICIR\"]))\n",
"best_factor = FACTORS_PROCESSED[best_factor_name]\n",
"print(f\" 最强因子 (Best Factor): {best_factor_name} ICIR={ic_stats[best_factor_name]['ICIR']:+.3f}\")\n",
"\n",
"\n",
"def compute_quantile_returns(\n",
"    factor_df: pd.DataFrame,\n",
"    price_df: pd.DataFrame,\n",
"    n_quantiles: int = 5,\n",
"    rebal_period: int = 21,\n",
") -> pd.DataFrame:\n",
"    \"\"\"\n",
"    Quantile backtest: equal-weight bucket returns between rebalance dates.\n",
"\n",
"    Every rebal_period-th row of price_df is a rebalance date. On each one the\n",
"    stocks with a factor value are ranked and cut into n_quantiles equal-sized\n",
"    buckets (Q1 = lowest scores); each bucket's return is the plain average of\n",
"    its members' price returns up to the next rebalance date.\n",
"\n",
"    Returns:\n",
"        DataFrame, columns=[Q1..Qn], index=entry (rebalance) dates,\n",
"        values=equal-weight return over that holding period. Dates with\n",
"        fewer than 3 * n_quantiles scored stocks are skipped.\n",
"    \"\"\"\n",
"    bucket_names = [f\"Q{q}\" for q in range(1, n_quantiles + 1)]\n",
"    rebal_dates = price_df.index[::rebal_period]\n",
"    min_stocks = n_quantiles * 3\n",
"\n",
"    returns_by_bucket = {name: [] for name in bucket_names}\n",
"    entry_dates = []\n",
"\n",
"    # Walk consecutive (entry, exit) pairs of rebalance dates.\n",
"    for entry, exit_ in zip(rebal_dates[:-1], rebal_dates[1:]):\n",
"        scores = factor_df.loc[entry].dropna()\n",
"        if len(scores) < min_stocks:\n",
"            continue\n",
"\n",
"        # Rank first so that tied scores cannot produce duplicate bin edges.\n",
"        buckets = pd.qcut(scores.rank(method='first'), n_quantiles, labels=bucket_names)\n",
"\n",
"        period_ret = price_df.loc[exit_] / price_df.loc[entry] - 1.0\n",
"        for name in bucket_names:\n",
"            members = buckets[buckets == name].index\n",
"            returns_by_bucket[name].append(period_ret[members].mean())\n",
"\n",
"        entry_dates.append(entry)\n",
"\n",
"    return pd.DataFrame(returns_by_bucket, index=entry_dates)\n",
"\n",
"\n",
"quantile_rets = compute_quantile_returns(best_factor, prices_df, N_QUANTILES, REBAL_PERIOD)\n",
"quantile_cumret = (1 + quantile_rets).cumprod() # 累积净值 / cumulative NAV\n",
"\n",
"# 多空价差 (Long-Short Spread): Q5(多头)- Q1(空头)\n",
"ls_spread = quantile_rets[\"Q5\"] - quantile_rets[\"Q1\"]\n",
"ls_cumret = (1 + ls_spread).cumprod()\n",
"\n",
"periods_per_year = 252.0 / REBAL_PERIOD\n",
"\n",
"print(f\"\\n {'分组':8s} {'年化收益':>10s} {'累积收益':>10s}\")\n",
"print(f\" \" + \"─\" * 40)\n",
"for q in range(1, N_QUANTILES + 1):\n",
" ret_s = quantile_rets[f\"Q{q}\"]\n",
" ann_ret = (1 + ret_s.mean()) ** periods_per_year - 1\n",
" cum_ret = quantile_cumret[f\"Q{q}\"].iloc[-1] - 1\n",
" print(f\" Q{q} {ann_ret:+10.2%} {cum_ret:+10.2%}\")\n",
"ann_ls = (1 + ls_spread.mean()) ** periods_per_year - 1\n",
"print(f\" L-S(Q5-Q1) {ann_ls:+9.2%} {ls_cumret.iloc[-1]-1:+10.2%}\")\n",
"print(f\" \" + \"─\" * 40)"
]
},
{
"cell_type": "markdown",
"id": "cd68d8d8",
"metadata": {},
"source": [
"# =============================================================================\n",
"# §5 因子合成 Factor Combination\n",
"\n",
"# -----------------------------------------------------------------------------\n",
"# 单个因子的预测能力有限(IC 通常只有 0.05~0.10),因子合成可以:\n",
"# Individual factors have limited predictive power (IC ≈ 0.05~0.10).\n",
"# Factor combination can:\n",
"#\n",
"# ① 提升综合 ICIR(多个信号互补,减少因子特定噪音)\n",
"# Improve composite ICIR (signals complement each other, reduce factor noise)\n",
"# ② 覆盖更多 Alpha 来源(动量+质量+流动性的综合)\n",
"# Cover more alpha sources (momentum + quality + liquidity combined)\n",
"# ③ 降低单因子的\"失效期\"风险(某个风格因子可能在某段时间失效)\n",
"# Reduce regime risk (individual factors can fail in certain market regimes)\n",
"#\n",
"# 方法1: 等权合成 (Equal-Weight Composite)\n",
"# composite_EQ = (z_1 + z_2 + … + z_n) / n\n",
"# 优点: 简单、稳健,不依赖历史 IC 估计 缺点: 不区分因子强弱\n",
"# Pro: simple, robust Con: treats all factors equally regardless of efficacy\n",
"#\n",
"# 方法2: IC 加权合成 (IC-Weighted Composite)\n",
"# w_i = max(IC_mean_i, 0) / Σ max(IC_mean_j, 0) (只给正 IC 因子分配权重)\n",
"# composite_ICW = Σ w_i × z_i\n",
"# 优点: 给更强的因子更高权重 缺点: 历史 IC 可能不稳定(过拟合风险)\n",
"# Pro: higher weight to stronger factors Con: historical IC may be unstable (overfitting)\n",
"\n",
"# ============================================================================="
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8a132503",
"metadata": {},
"outputs": [],
"source": [
"print(\"\\n[§5] 因子合成 / Factor Combination...\")\n",
"\n",
"# 只合成正 IC 的因子(负 IC 因子方向混乱,不宜纳入)\n",
"# Only combine factors with positive IC (negative IC factors are directionally inconsistent)\n",
"positive_ic_factors = {\n",
" name: f for name, f in FACTORS_PROCESSED.items()\n",
" if ic_stats[name][\"IC Mean\"] > 0\n",
"}\n",
"print(f\" IC均值为正的因子: {list(positive_ic_factors.keys())}\")\n",
"\n",
"# ── 等权合成 / Equal-Weight Composite ─────────────────────────────────────────\n",
"composite_eq = sum(\n",
" f.reindex(columns=symbols).fillna(0)\n",
" for f in positive_ic_factors.values()\n",
") / len(positive_ic_factors)\n",
"\n",
"# ── IC 加权合成 / IC-Weighted Composite ───────────────────────────────────────\n",
"ic_weights_raw = {n: max(ic_stats[n][\"IC Mean\"], 0) for n in positive_ic_factors}\n",
"total_w = sum(ic_weights_raw.values())\n",
"ic_weights = {n: w / total_w for n, w in ic_weights_raw.items()}\n",
"\n",
"composite_icw = sum(\n",
" ic_weights[n] * f.reindex(columns=symbols).fillna(0)\n",
" for n, f in positive_ic_factors.items()\n",
")\n",
"\n",
"# 计算合成因子的 IC / Evaluate composite factor IC\n",
"ic_eq = compute_ic_series(composite_eq, simple_ret_df, FORWARD_PERIOD)\n",
"ic_icw = compute_ic_series(composite_icw, simple_ret_df, FORWARD_PERIOD)\n",
"\n",
"print(f\"\\n IC 权重分配 / IC Weights:\")\n",
"for name, w in ic_weights.items():\n",
" print(f\" {name}: {w:.3f}\")\n",
"\n",
"print(f\"\\n {'合成方法':22s} {'IC均值':>8s} {'ICIR':>8s}\")\n",
"print(f\" \" + \"─\" * 45)\n",
"print(f\" {'等权 Equal-Weight':22s} {ic_eq.mean():+8.4f} {ic_eq.mean()/ic_eq.std():+8.4f}\")\n",
"print(f\" {'IC加权 IC-Weighted':22s} {ic_icw.mean():+8.4f} {ic_icw.mean()/ic_icw.std():+8.4f}\")"
]
},
{
"cell_type": "markdown",
"id": "eaa4a346",
"metadata": {},
"source": [
"# =============================================================================\n",
"# §6 多空组合 Long-Short Portfolio\n",
"\n",
"# -----------------------------------------------------------------------------\n",
"# 多空组合 (Long-Short Portfolio) 是因子策略的实际收益实现形式。\n",
"# A long-short portfolio is how a factor strategy generates actual P&L.\n",
"#\n",
"# 构建逻辑 / Construction logic:\n",
"# 做多 (Long) = 买入因子分最高的 Top 20% 股票(预期跑赢,做多享受上涨)\n",
"# 做空 (Short) = 卖空因子分最低的 Bottom 20% 股票(预期跑输,做空赚下跌)\n",
"# 多空价差 = 多头组合收益 - 空头组合收益(无论市场涨跌都能赚钱)\n",
"#\n",
"# Long = Buy top 20% stocks by factor score (expected to outperform)\n",
"# Short = Sell short bottom 20% stocks (expected to underperform)\n",
"# L-S = Long return Short return (market-neutral, profits in any market)\n",
"#\n",
"# 注意 A 股限制 / A-share restriction:\n",
"# A 股融券做空受到严格限制,本演示仅作为量化研究的理论演示。\n",
"# 在港股、美股等市场中,做空非常普遍。\n",
"# Short selling in A-shares is heavily restricted. This demo is theoretical.\n",
"# Short selling is common and practical in HK/US markets.\n",
"\n",
"# ============================================================================="
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "48635af1",
"metadata": {},
"outputs": [],
"source": [
"print(\"\\n[§6] 多空组合 / Long-Short Portfolio...\")\n",
"\n",
"TOP_PCTILE = 0.20 # 前后 20% / top & bottom 20%\n",
"\n",
"\n",
"def compute_long_short_portfolio(\n",
" factor_df: pd.DataFrame,\n",
" price_df: pd.DataFrame,\n",
" top_pct: float = 0.20,\n",
" rebal_period: int = 21,\n",
") -> dict:\n",
" \"\"\"\n",
" 构建多空组合,计算每个持仓期的多头、空头、多空价差收益。\n",
" Build a long-short portfolio; compute per-period long, short, L-S returns.\n",
"\n",
" Returns: dict with 'long', 'short', 'long_short' keys, each a pd.Series.\n",
" \"\"\"\n",
" rebal_dates = price_df.index[::rebal_period]\n",
" long_rets, short_rets, ls_rets, ret_dates = [], [], [], []\n",
"\n",
" for i in range(len(rebal_dates) - 1):\n",
" t0, t1 = rebal_dates[i], rebal_dates[i + 1]\n",
" f_today = factor_df.loc[t0].dropna()\n",
" if len(f_today) < 10:\n",
" continue\n",
" n_top = max(1, int(len(f_today) * top_pct))\n",
"\n",
" long_stocks = f_today.nlargest(n_top).index # Top 20% → 买入\n",
" short_stocks = f_today.nsmallest(n_top).index # Bottom 20% → 卖空\n",
"\n",
" p0 = price_df.loc[t0]\n",
" p1 = price_df.loc[t1]\n",
" ret = p1 / p0 - 1.0\n",
"\n",
" lr = ret[long_stocks].mean()\n",
" sr = ret[short_stocks].mean()\n",
" long_rets.append(lr)\n",
" short_rets.append(sr)\n",
" ls_rets.append(lr - sr) # 多空价差 / long-short spread\n",
" ret_dates.append(t0)\n",
"\n",
" return {\n",
" \"long\" : pd.Series(long_rets, index=ret_dates, name=\"Long\"),\n",
" \"short\" : pd.Series(short_rets, index=ret_dates, name=\"Short\"),\n",
" \"long_short\" : pd.Series(ls_rets, index=ret_dates, name=\"L-S\"),\n",
" }\n",
"\n",
"\n",
"def compute_max_drawdown(returns: pd.Series) -> float:\n",
" \"\"\"最大回撤 / Maximum Drawdown\"\"\"\n",
" cum = (1 + returns).cumprod()\n",
" peak = cum.cummax()\n",
" return ((cum - peak) / peak).min()\n",
"\n",
"\n",
"def compute_sharpe(returns: pd.Series, ann_factor: float) -> float:\n",
" \"\"\"夏普比率 / Annualized Sharpe Ratio\"\"\"\n",
" return returns.mean() / returns.std() * np.sqrt(ann_factor) if returns.std() > 0 else 0.0\n",
"\n",
"\n",
"ls_result = compute_long_short_portfolio(composite_icw, prices_df, TOP_PCTILE, REBAL_PERIOD)\n",
"ls_equity = {k: (1 + v).cumprod() for k, v in ls_result.items()}\n",
"\n",
"ann_factor = periods_per_year\n",
"print(f\"\\n {'组合':14s} {'年化收益':>10s} {'夏普比率':>10s} {'最大回撤':>10s}\")\n",
"print(f\" \" + \"─\" * 54)\n",
"for name, rets in ls_result.items():\n",
" ann_ret = (1 + rets.mean()) ** ann_factor - 1\n",
" sharpe = compute_sharpe(rets, ann_factor)\n",
" mdd = compute_max_drawdown(rets)\n",
" print(f\" {name:14s} {ann_ret:+10.2%} {sharpe:+10.3f} {mdd:+10.2%}\")\n",
"print(f\" \" + \"─\" * 54)"
]
},
{
"cell_type": "markdown",
"id": "04431d61",
"metadata": {},
"source": [
"# =============================================================================\n",
"# §7 因子衰减分析 Factor Decay Analysis\n",
"\n",
"# -----------------------------------------------------------------------------\n",
"# 因子衰减 (Factor Decay) 描述了因子预测能力随持有期增加而减弱的规律:\n",
"# Factor decay describes how predictive power weakens as holding period grows:\n",
"#\n",
"# 短持有期 H=1D: IC 最高(信号最新鲜,预测力最强)\n",
"# H 增大: IC 逐渐下降(信号被市场消化,噪音积累)\n",
"# H=63D1季度: IC 接近 0信号已基本失效\n",
"#\n",
"# 实践意义 / Practical implications:\n",
"# ┌──────────────────┬──────────────────────────────────────────────┐\n",
"# │ 衰减类型 │ 适合持仓频率 │\n",
"# ├──────────────────┼──────────────────────────────────────────────┤\n",
"# │ 快速衰减 (Fast) │ 日频或周频换仓(换手率高,成本高!) │\n",
"# │ 慢速衰减 (Slow) │ 月频或季频换仓(成本效率更高,适合实盘) │\n",
"# └──────────────────┴──────────────────────────────────────────────┘\n",
"\n",
"# ============================================================================="
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a2292608",
"metadata": {},
"outputs": [],
"source": [
"print(\"\\n[§7] 因子衰减分析 / Factor Decay Analysis...\")\n",
"\n",
"DECAY_HORIZONS = [1, 5, 10, 21, 42, 63] # 持有期(交易日)/ holding periods (days)\n",
"\n",
"decay_results = {}\n",
"for fname in [best_factor_name, \"REV\"]: # 对比 \"慢衰减\" vs \"快衰减\" 因子\n",
" horizon_ics = []\n",
" for h in DECAY_HORIZONS:\n",
" ic_h = compute_ic_series(FACTORS_PROCESSED[fname], simple_ret_df, h)\n",
" horizon_ics.append(ic_h.mean())\n",
" decay_results[fname] = horizon_ics\n",
" ic_str = \" \".join([f\"H={h}D: {v:+.4f}\" for h, v in zip(DECAY_HORIZONS, horizon_ics)])\n",
" print(f\" {fname}: {ic_str}\")"
]
},
{
"cell_type": "markdown",
"id": "9ecc538c",
"metadata": {},
"source": [
"# =============================================================================\n",
"# §8 可视化 Visualization\n",
"\n",
"# ============================================================================="
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "34a1f89e",
"metadata": {},
"outputs": [],
"source": [
"print(\"\\n[§8] 生成可视化图表 / Generating visualization...\")\n",
"\n",
"pct_fmt = FuncFormatter(lambda x, _: f\"{x:.0%}\")\n",
"\n",
"fig = plt.figure(figsize=(20, 24))\n",
"fig.suptitle(\n",
" \"Alpha 因子研究演示 | Alpha Factor Research Demo\\n\"\n",
" \"50只合成股票 × 3年日线数据 | 50 Synthetic Stocks × 3-Year Daily Data\",\n",
" fontsize=14, fontweight='bold', y=0.99\n",
")\n",
"gs = gridspec.GridSpec(4, 3, figure=fig, hspace=0.50, wspace=0.35)\n",
"\n",
"# ── Panel 1: 股票池价格(归一化)/ Universe Prices (Normalized) ──────────────\n",
"ax1 = fig.add_subplot(gs[0, 0])\n",
"norm_prices = prices_df / prices_df.iloc[0] # 归一化 / normalize to 1\n",
"for sym in symbols[:12]:\n",
" ax1.plot(dates, norm_prices[sym], alpha=0.35, linewidth=0.7, color='#1f77b4')\n",
"eq_index = norm_prices.mean(axis=1) # 等权指数 / equal-weight index\n",
"ax1.plot(dates, eq_index, color='black', linewidth=2, label='等权指数 EW Index', zorder=5)\n",
"ax1.axhline(1, color='gray', linewidth=0.6, linestyle='--')\n",
"ax1.set_title(\"股票池价格(归一化)\\nUniverse Prices (Normalized)\", fontsize=10)\n",
"ax1.set_ylabel(\"归一化价格\")\n",
"ax1.legend(fontsize=8)\n",
"ax1.tick_params(axis='x', rotation=25, labelsize=8)\n",
"ax1.xaxis.set_major_locator(plt.MaxNLocator(4))\n",
"\n",
"# ── Panel 2: IC 时间序列5因子/ IC Time Series ─────────────────────────────\n",
"ax2 = fig.add_subplot(gs[0, 1:])\n",
"ic_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd']\n",
"for (name, ic_s), color in zip(ic_series_dict.items(), ic_colors):\n",
" rolling_ic = ic_s.rolling(20, min_periods=5).mean() # 20日滚动均值平滑\n",
" ax2.plot(ic_s.index, rolling_ic, label=name, color=color, linewidth=1.4)\n",
"ax2.axhline(0, color='black', linewidth=0.8, linestyle='--')\n",
"ax2.axhline(0.05, color='green', linewidth=0.8, linestyle=':', alpha=0.7)\n",
"ax2.axhline(-0.05, color='red', linewidth=0.8, linestyle=':', alpha=0.7)\n",
"ax2.set_title(\"因子 IC 时间序列20日滚动均值\\nFactor IC Time Series (20D Rolling Mean)\", fontsize=10)\n",
"ax2.set_ylabel(\"IC\")\n",
"ax2.legend(loc='upper right', ncol=5, fontsize=8)\n",
"ax2.tick_params(axis='x', rotation=25, labelsize=8)\n",
"ax2.xaxis.set_major_locator(plt.MaxNLocator(5))\n",
"\n",
"# ── Panel 3: IC 均值 & 误差棒 / IC Mean Bar Chart ─────────────────────────────\n",
"ax3 = fig.add_subplot(gs[1, 0])\n",
"names_list = list(ic_stats.keys())\n",
"ic_means = [ic_stats[n][\"IC Mean\"] for n in names_list]\n",
"ic_stds = [ic_stats[n][\"IC Std\"] for n in names_list]\n",
"bar_colors3 = ['#2ca02c' if v > 0 else '#d62728' for v in ic_means]\n",
"ax3.bar(names_list, ic_means, yerr=ic_stds, capsize=5, color=bar_colors3,\n",
" alpha=0.8, error_kw={'linewidth': 1.5, 'ecolor': 'black'})\n",
"ax3.axhline(0, color='black', linewidth=0.8)\n",
"ax3.axhline(0.05, color='green', linewidth=1.2, linestyle='--', alpha=0.7)\n",
"ax3.axhline(-0.05, color='red', linewidth=1.2, linestyle='--', alpha=0.7)\n",
"ax3.set_title(\"因子 IC 均值(误差棒=±1σ\\nIC Mean ± 1σ\", fontsize=10)\n",
"ax3.set_ylabel(\"IC 均值\")\n",
"ax3.tick_params(axis='x', labelsize=9)\n",
"\n",
"# ── Panel 4: ICIR 柱状图 / ICIR Bar Chart ──────────────────────────────────────\n",
"ax4 = fig.add_subplot(gs[1, 1])\n",
"icirs = [ic_stats[n][\"ICIR\"] for n in names_list]\n",
"bar_colors4 = ['#2ca02c' if v > 0 else '#d62728' for v in icirs]\n",
"ax4.bar(names_list, icirs, color=bar_colors4, alpha=0.8)\n",
"ax4.axhline(0, color='black', linewidth=0.8)\n",
"ax4.axhline(0.5, color='green', linewidth=1.2, linestyle='--', alpha=0.7, label='ICIR=0.5')\n",
"ax4.axhline(-0.5, color='red', linewidth=1.2, linestyle='--', alpha=0.7)\n",
"ax4.set_title(\"因子 ICIR\\nFactor ICIR (Mean IC / Std IC)\", fontsize=10)\n",
"ax4.set_ylabel(\"ICIR\")\n",
"ax4.legend(fontsize=8)\n",
"ax4.tick_params(axis='x', labelsize=9)\n",
"\n",
"# ── Panel 5: 因子相关矩阵 / Factor Correlation Matrix ─────────────────────────\n",
"ax5 = fig.add_subplot(gs[1, 2])\n",
"# 用各因子的截面均值时序来衡量因子间相关性 / Use cross-sectional mean timeseries\n",
"factor_ts = pd.DataFrame({n: f.mean(axis=1) for n, f in FACTORS_PROCESSED.items()})\n",
"corr_matrix = factor_ts.corr()\n",
"im5 = ax5.imshow(corr_matrix.values, cmap='RdYlGn', vmin=-1, vmax=1, aspect='auto')\n",
"ax5.set_xticks(range(len(names_list)))\n",
"ax5.set_yticks(range(len(names_list)))\n",
"ax5.set_xticklabels(names_list, fontsize=9)\n",
"ax5.set_yticklabels(names_list, fontsize=9)\n",
"for i in range(len(names_list)):\n",
" for j in range(len(names_list)):\n",
" ax5.text(j, i, f\"{corr_matrix.values[i, j]:.2f}\",\n",
" ha='center', va='center', fontsize=8,\n",
" color='black' if abs(corr_matrix.values[i, j]) < 0.6 else 'white')\n",
"plt.colorbar(im5, ax=ax5)\n",
"ax5.set_title(\"因子相关矩阵\\nFactor Correlation Matrix\", fontsize=10)\n",
"\n",
"# ── Panel 6: 分层回测累积净值 / Quantile Cumulative Returns ───────────────────\n",
"ax6 = fig.add_subplot(gs[2, 0])\n",
"q_colors = plt.cm.RdYlGn(np.linspace(0.05, 0.95, N_QUANTILES))\n",
"for q in range(1, N_QUANTILES + 1):\n",
" ax6.plot(quantile_cumret.index, quantile_cumret[f\"Q{q}\"],\n",
" label=f\"Q{q}\", color=q_colors[q - 1], linewidth=1.5)\n",
"ax6.plot(ls_cumret.index, ls_cumret, label='L-S', color='black', linewidth=2, linestyle='--')\n",
"ax6.axhline(1, color='gray', linewidth=0.6, linestyle=':')\n",
"ax6.set_title(f\"分层回测({best_factor_name}\\nQuantile Returns ({best_factor_name})\", fontsize=10)\n",
"ax6.set_ylabel(\"累积净值\")\n",
"ax6.legend(ncol=3, fontsize=8)\n",
"ax6.yaxis.set_major_formatter(FuncFormatter(lambda x, _: f\"{x:.1f}x\"))\n",
"ax6.tick_params(axis='x', rotation=25, labelsize=8)\n",
"ax6.xaxis.set_major_locator(plt.MaxNLocator(4))\n",
"\n",
"# ── Panel 7: 分位年化收益柱状图 / Quintile Annualized Return Bar ───────────────\n",
"ax7 = fig.add_subplot(gs[2, 1])\n",
"ann_q_rets = [(1 + quantile_rets[f\"Q{q}\"].mean()) ** periods_per_year - 1\n",
" for q in range(1, N_QUANTILES + 1)]\n",
"bar_colors7 = plt.cm.RdYlGn(np.linspace(0.05, 0.95, N_QUANTILES))\n",
"bars7 = ax7.bar([f\"Q{q}\" for q in range(1, N_QUANTILES + 1)], ann_q_rets, color=bar_colors7)\n",
"ax7.axhline(0, color='black', linewidth=0.8)\n",
"ax7.set_title(\"各分位组年化收益\\nAnnualized Return per Quintile\", fontsize=10)\n",
"ax7.set_ylabel(\"年化收益 / Ann. Return\")\n",
"ax7.yaxis.set_major_formatter(pct_fmt)\n",
"ax7.tick_params(axis='x', labelsize=9)\n",
"for bar, val in zip(bars7, ann_q_rets):\n",
" ax7.text(bar.get_x() + bar.get_width() / 2, val,\n",
" f\"{val:.1%}\", ha='center',\n",
" va='bottom' if val >= 0 else 'top', fontsize=8)\n",
"\n",
"# ── Panel 8: 多空组合净值曲线 / Long-Short Equity Curve ───────────────────────\n",
"ax8 = fig.add_subplot(gs[2, 2])\n",
"ax8.plot(ls_equity['long_short'].index, ls_equity['long_short'].values,\n",
" color='purple', linewidth=2, label='多空 L-S')\n",
"ax8.plot(ls_equity['long'].index, ls_equity['long'].values,\n",
" color='#2ca02c', linewidth=1.5, linestyle='--', label='多头 Long', alpha=0.8)\n",
"ax8.plot(ls_equity['short'].index, ls_equity['short'].values,\n",
" color='#d62728', linewidth=1.5, linestyle='--', label='空头 Short', alpha=0.8)\n",
"ax8.axhline(1, color='gray', linewidth=0.6, linestyle=':')\n",
"ax8.set_title(\"多空组合净值曲线\\nLong-Short Portfolio NAV\", fontsize=10)\n",
"ax8.set_ylabel(\"净值 / NAV\")\n",
"ax8.legend(fontsize=8)\n",
"ax8.yaxis.set_major_formatter(FuncFormatter(lambda x, _: f\"{x:.1f}x\"))\n",
"ax8.tick_params(axis='x', rotation=25, labelsize=8)\n",
"ax8.xaxis.set_major_locator(plt.MaxNLocator(4))\n",
"\n",
"# ── Panel 9: 因子衰减曲线 / Factor Decay Curve ────────────────────────────────\n",
"ax9 = fig.add_subplot(gs[3, :])\n",
"decay_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']\n",
"horizon_labels = [f\"{h}D\" for h in DECAY_HORIZONS]\n",
"for (fname, decay_ics), color in zip(decay_results.items(), decay_colors):\n",
" ax9.plot(range(len(DECAY_HORIZONS)), decay_ics,\n",
" 'o-', label=fname, color=color, linewidth=2, markersize=7)\n",
" for i, (h, v) in enumerate(zip(DECAY_HORIZONS, decay_ics)):\n",
" ax9.annotate(f\"{v:+.3f}\", (i, v), textcoords=\"offset points\",\n",
" xytext=(0, 8), ha='center', fontsize=8, color=color)\n",
"ax9.axhline(0, color='black', linewidth=0.8, linestyle='--')\n",
"ax9.axhline(0.05, color='green', linewidth=0.8, linestyle=':', alpha=0.7, label='IC=0.05 参考线')\n",
"ax9.set_title(\n",
" \"因子衰减曲线 Factor Decay Curve\\n\"\n",
" \"IC 均值随持有期延长的衰减 | How Mean IC Decays with Longer Holding Period\",\n",
" fontsize=10\n",
")\n",
"ax9.set_xlabel(\"持有期 / Holding Period\")\n",
"ax9.set_ylabel(\"IC 均值 / Mean IC\")\n",
"ax9.set_xticks(range(len(DECAY_HORIZONS)))\n",
"ax9.set_xticklabels(horizon_labels)\n",
"ax9.legend(fontsize=9, ncol=4)\n",
"ax9.grid(True, alpha=0.3)\n",
"ax9.set_xlim(-0.3, len(DECAY_HORIZONS) - 0.7)\n",
"\n",
"plt.savefig(\"alpha_factor_demo.png\", dpi=150, bbox_inches='tight')\n",
"print(f\" 图表已保存: alpha_factor_demo.png / Chart saved: alpha_factor_demo.png\")\n",
"\n",
"print(\"\\n\" + \"=\" * 70)\n",
"print(\" 演示完成! Demo Complete!\")\n",
"print(\" 输出: alpha_factor_demo.png\")\n",
"print(\"=\" * 70)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (trading)",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python",
"version": "3.11.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}