diff --git a/hello.py b/quant_data_pipeline_demo.py similarity index 80% rename from hello.py rename to quant_data_pipeline_demo.py index bd82afc..1404b14 100644 --- a/hello.py +++ b/quant_data_pipeline_demo.py @@ -1,3 +1,15 @@ +# ============================================================================= +# Quantitative Trading — Data Pipeline Demo +# ============================================================================= +# Topics covered: +# 1. Price Adjustment for Corporate Actions (splits & dividends) +# 2. Simple Returns vs Log Returns +# 3. Multi-stock Panel & Missing Value Handling +# 4. Outlier Detection & Treatment (Z-score, MAD, Winsorize) +# 5. Trading Calendar & Cross-market Alignment +# 6. End-to-End DataPipeline Class +# ============================================================================= + import numpy as np import pandas as pd import matplotlib.pyplot as plt @@ -14,6 +26,20 @@ plt.rcParams['figure.dpi'] = 100 np.random.seed(42) print("环境准备完成!") +# ============================================================================= +# TOPIC 1: Price Adjustment for Corporate Actions (价格复权) +# ----------------------------------------------------------------------------- +# Raw stock prices have artificial jumps caused by stock splits and cash +# dividends. These must be adjusted before computing returns or signals, +# otherwise your algorithm will see fake crashes/spikes. 
+# +# Key concepts: +# - Unadjusted price (未复权): raw market price, has visible jumps +# - Forward-adjusted (前复权): history scaled to today's price level +# - Backward-adjusted (后复权): history kept real, recent prices scaled up +# - Adjustment factor (复权因子): multiplier that accounts for all events +# ============================================================================= + def generate_raw_stock_data(n_days=1000, seed=42): """生成包含拆股和分红事件的原始股票数据""" np.random.seed(seed) @@ -145,6 +171,23 @@ print(f"未复权收益率: {stock_data['unadj_close'].iloc[split_idx] / stock_d print(f"前复权收益率: {stock_data['fwd_adj_close'].iloc[split_idx] / stock_data['fwd_adj_close'].iloc[split_idx-1] - 1:.4f}") print(f"真实收益率: {stock_data['close'].iloc[split_idx] / stock_data['close'].iloc[split_idx-1] - 1:.4f}") + +# ============================================================================= +# TOPIC 2: Simple Returns vs Log Returns (简单收益率 vs 对数收益率) +# ----------------------------------------------------------------------------- +# Two ways to measure how much a price changed: +# - Simple return: r = (P_t / P_{t-1}) - 1 +# - Log return: r = ln(P_t / P_{t-1}) +# +# Why log returns are preferred in quant research: +# - Time-additive: sum daily log returns to get total log return +# - Better statistical properties (closer to normal distribution) +# - Numerically stable for long compounding periods +# +# At daily frequency the difference is tiny (~1bp), but it grows at +# monthly/annual frequency — never mix the two in the same calculation. 
+# ============================================================================= + # 使用前复权价格计算收益率 prices = stock_data['fwd_adj_close'] @@ -211,6 +254,22 @@ plt.tight_layout() plt.show() +# ============================================================================= +# TOPIC 3: Multi-stock Panel & Missing Value Handling (多标的面板与缺失值处理) +# ----------------------------------------------------------------------------- +# Real market data panels have two main sources of missing values: +# - Suspensions (停牌): a stock halts trading for days/weeks +# - Data feed gaps: vendor errors, holidays, late reporting +# +# Strategies (choice depends on context): +# - ffill (forward fill): use last known price — standard for suspensions +# - Linear interpolation: smooth gaps — OK for short random gaps +# - ffill with limit: ffill but cap at N days — conservative choice +# - Drop: remove rows/columns — loses cross-sectional data +# +# Rule of thumb: if missing% > 10%, consider excluding the stock entirely. +# ============================================================================= + def generate_multi_stock_panel(n_stocks=5, n_days=500, seed=42): """生成多标的日线数据面板,包含停牌和缺失值""" np.random.seed(seed) @@ -363,6 +422,28 @@ ax.set_title('收益率相关性矩阵', fontsize=14) plt.tight_layout() plt.show() + +# ============================================================================= +# TOPIC 4: Outlier Detection & Treatment (异常值检测与处理) +# ----------------------------------------------------------------------------- +# Financial return series contain extreme values from real events (crashes, +# halts) and from data errors. How you handle them affects every downstream signal. 
+# +# Detection methods: +# - Z-score: |z| = |(x - mean) / std| > 3 — simple, but mean/std are +# themselves pulled by outliers (not robust) +# - MAD: uses median instead of mean — robust to extreme values, +# preferred for financial data +# +# Treatment methods: +# - Remove: delete the outlier row — reduces sample size +# - Winsorize: clip to [p1, p99] boundary — keeps all data, limits impact; +# standard practice before factor construction +# +# Q-Q plot: visualizes how far returns deviate from a normal distribution. +# Fat tails (leptokurtosis) are a universal property of financial returns. +# ============================================================================= + def detect_outliers_zscore(series, threshold=3.0): """Z-score (标准分数) 方法检测异常值 @@ -491,6 +572,28 @@ print("- MAD: 更稳健,推荐用于金融数据") print("- Winsorize: 保留数据量,适合构建因子时使用") print("- 实际中常用: Winsorize(1%, 99%) + MAD检测") + +# ============================================================================= +# TOPIC 5: Trading Calendar & Cross-market Alignment (交易日历与跨市场对齐) +# ----------------------------------------------------------------------------- +# Different markets trade on different days (national holidays, weekends). +# When combining data from multiple markets, you must decide how to align +# the time axis — the wrong choice distorts correlation calculations. +# +# Two alignment strategies: +# - Intersection (取交集): keep only dates on which both markets are open. +# Best for: correlation, regression, signal comparison. +# Drawback: loses data on single-market trading days. +# +# - Forward fill (向前填充): fill non-trading days with last known price. +# Best for: portfolio construction, continuous NAV calculation. +# Drawback: introduces artificial zero-return days — inflates Sharpe, +# understates volatility. Must be accounted for in analysis. +# +# Note: this demo uses simplified hardcoded holiday lists. Production systems +# should use exchange_calendars or pandas_market_calendars libraries. 
+# ============================================================================= + def create_trading_calendars(): """创建不同市场的交易日历示例 @@ -599,6 +702,22 @@ print("- 计算相关性、回归: 使用交集日期,确保数据同步") print("- 构建组合/计算市值: 使用ffill,避免数据断裂") print("- 重要: ffill会引入假的'零收益日',计算波动率时需注意") + +# ============================================================================= +# TOPIC 6: End-to-End DataPipeline Class (完整数据清洗管道) +# ----------------------------------------------------------------------------- +# Combines all prior topics into a reusable, sequential pipeline: +# Step 1 — Filter stocks with too many missing values +# Step 2 — Fill remaining missing values (ffill + bfill) +# Step 3 — Compute return series +# Step 4 — Detect outliers (MAD) and apply Winsorize +# Step 5 — Generate a data quality report per stock +# +# This pattern (class-based pipeline with a .run() method) mirrors how +# production quant systems structure data ingestion — each step is auditable, +# configurable, and produces diagnostic output. +# ============================================================================= + class DataPipeline: """简单的数据清洗管道