feat: Add comprehensive quantitative trading data pipeline demo
- Implemented a data pipeline for quantitative trading covering:
  - Price adjustment for corporate actions (splits & dividends)
  - Calculation of simple and log returns
  - Handling of multi-stock panels and missing values
  - Outlier detection and treatment methods (Z-score, MAD, Winsorize)
  - Trading calendar creation and cross-market alignment
  - End-to-end DataPipeline class for data cleaning and analysis
- Included visualizations for price adjustments, return comparisons, and missing value handling
- Added detailed comments and documentation in Chinese for clarity
This commit is contained in:
parent
81852b83b9
commit
9023c8eb76
|
|
@ -1,3 +1,15 @@
|
|||
# =============================================================================
|
||||
# Quantitative Trading — Data Pipeline Demo
|
||||
# =============================================================================
|
||||
# Topics covered:
|
||||
# 1. Price Adjustment for Corporate Actions (splits & dividends)
|
||||
# 2. Simple Returns vs Log Returns
|
||||
# 3. Multi-stock Panel & Missing Value Handling
|
||||
# 4. Outlier Detection & Treatment (Z-score, MAD, Winsorize)
|
||||
# 5. Trading Calendar & Cross-market Alignment
|
||||
# 6. End-to-End DataPipeline Class
|
||||
# =============================================================================
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import matplotlib.pyplot as plt
|
||||
|
|
@ -14,6 +26,20 @@ plt.rcParams['figure.dpi'] = 100
|
|||
np.random.seed(42)
|
||||
print("环境准备完成!")
|
||||
|
||||
# =============================================================================
|
||||
# TOPIC 1: Price Adjustment for Corporate Actions (价格复权)
|
||||
# -----------------------------------------------------------------------------
|
||||
# Raw stock prices have artificial jumps caused by stock splits and cash
|
||||
# dividends. These must be adjusted before computing returns or signals,
|
||||
# otherwise your algorithm will see fake crashes/spikes.
|
||||
#
|
||||
# Key concepts:
|
||||
# - Unadjusted price (未复权): raw market price, has visible jumps
|
||||
# - Forward-adjusted (前复权): history scaled to today's price level
|
||||
# - Backward-adjusted (后复权): history kept real, recent prices scaled up
|
||||
# - Adjustment factor (复权因子): multiplier that accounts for all events
|
||||
# =============================================================================
|
||||
|
||||
def generate_raw_stock_data(n_days=1000, seed=42):
|
||||
"""生成包含拆股和分红事件的原始股票数据"""
|
||||
np.random.seed(seed)
|
||||
|
|
@ -145,6 +171,23 @@ print(f"未复权收益率: {stock_data['unadj_close'].iloc[split_idx] / stock_d
|
|||
print(f"前复权收益率: {stock_data['fwd_adj_close'].iloc[split_idx] / stock_data['fwd_adj_close'].iloc[split_idx-1] - 1:.4f}")
|
||||
print(f"真实收益率: {stock_data['close'].iloc[split_idx] / stock_data['close'].iloc[split_idx-1] - 1:.4f}")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# TOPIC 2: Simple Returns vs Log Returns (简单收益率 vs 对数收益率)
|
||||
# -----------------------------------------------------------------------------
|
||||
# Two ways to measure how much a price changed:
|
||||
# - Simple return: r = (P_t / P_{t-1}) - 1
|
||||
# - Log return: r = ln(P_t / P_{t-1})
|
||||
#
|
||||
# Why log returns are preferred in quant research:
|
||||
# - Time-additive: sum daily log returns to get total log return
|
||||
# - Better statistical properties (closer to normal distribution)
|
||||
# - Numerically stable for long compounding periods
|
||||
#
|
||||
# At daily frequency the difference is tiny (≈ r²/2, roughly 1bp for a ~1.5% move), but it grows at
|
||||
# monthly/annual frequency — never mix the two in the same calculation.
|
||||
# =============================================================================
|
||||
|
||||
# 使用前复权价格计算收益率
|
||||
prices = stock_data['fwd_adj_close']
|
||||
|
||||
|
|
@ -211,6 +254,22 @@ plt.tight_layout()
|
|||
plt.show()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# TOPIC 3: Multi-stock Panel & Missing Value Handling (多标的面板与缺失值处理)
|
||||
# -----------------------------------------------------------------------------
|
||||
# Real market data panels have two main sources of missing values:
|
||||
# - Suspensions (停牌): a stock halts trading for days/weeks
|
||||
# - Data feed gaps: vendor errors, holidays, late reporting
|
||||
#
|
||||
# Strategies (choice depends on context):
|
||||
# - ffill (forward fill): use last known price — standard for suspensions
|
||||
# - Linear interpolation: smooth gaps — OK for short random gaps
|
||||
# - ffill with limit: ffill but cap at N days — conservative choice
|
||||
# - Drop: remove rows/columns — loses cross-sectional data
|
||||
#
|
||||
# Rule of thumb: if missing% > 10%, consider excluding the stock entirely.
|
||||
# =============================================================================
|
||||
|
||||
def generate_multi_stock_panel(n_stocks=5, n_days=500, seed=42):
|
||||
"""生成多标的日线数据面板,包含停牌和缺失值"""
|
||||
np.random.seed(seed)
|
||||
|
|
@ -363,6 +422,28 @@ ax.set_title('收益率相关性矩阵', fontsize=14)
|
|||
plt.tight_layout()
|
||||
plt.show()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# TOPIC 4: Outlier Detection & Treatment (异常值检测与处理)
|
||||
# -----------------------------------------------------------------------------
|
||||
# Financial return series contain extreme values from real events (crashes,
|
||||
# halts, data errors). How you handle them affects every downstream signal.
|
||||
#
|
||||
# Detection methods:
|
||||
# - Z-score: |z| = |(x - mean) / std| > 3 — simple, but mean/std are
|
||||
# themselves pulled by outliers (not robust)
|
||||
# - MAD: median absolute deviation — built on the median rather than mean/std, so it is robust to extreme values,
|
||||
# preferred for financial data
|
||||
#
|
||||
# Treatment methods:
|
||||
# - Remove: delete the outlier row — reduces sample size
|
||||
# - Winsorize: clip values to the 1st/99th percentile bounds [p1, p99] — keeps all data, limits impact
|
||||
# standard practice before factor construction
|
||||
#
|
||||
# Q-Q plot: visualizes how far returns deviate from a normal distribution.
|
||||
# Fat tails (leptokurtosis) are a universal property of financial returns.
|
||||
# =============================================================================
|
||||
|
||||
def detect_outliers_zscore(series, threshold=3.0):
|
||||
"""Z-score (标准分数) 方法检测异常值
|
||||
|
||||
|
|
@ -491,6 +572,28 @@ print("- MAD: 更稳健,推荐用于金融数据")
|
|||
print("- Winsorize: 保留数据量,适合构建因子时使用")
|
||||
print("- 实际中常用: Winsorize(1%, 99%) + MAD检测")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# TOPIC 5: Trading Calendar & Cross-market Alignment (交易日历与跨市场对齐)
|
||||
# -----------------------------------------------------------------------------
|
||||
# Different markets trade on different days (national holidays, weekends).
|
||||
# When combining data from multiple markets, you must decide how to align
|
||||
# the time axis — the wrong choice distorts correlation calculations.
|
||||
#
|
||||
# Two alignment strategies:
|
||||
# - Intersection (取交集): keep only dates both markets are open.
|
||||
# Best for: correlation, regression, signal comparison.
|
||||
# Drawback: loses data on single-market trading days.
|
||||
#
|
||||
# - Forward fill (向前填充): fill non-trading days with last known price.
|
||||
# Best for: portfolio construction, continuous NAV calculation.
|
||||
# Drawback: introduces artificial zero-return days — distorts Sharpe,
|
||||
# understates volatility. Must be accounted for in analysis.
|
||||
#
|
||||
# Note: this demo uses simplified hardcoded holiday lists. Production systems
|
||||
# should use exchange_calendars or pandas_market_calendars libraries.
|
||||
# =============================================================================
|
||||
|
||||
def create_trading_calendars():
|
||||
"""创建不同市场的交易日历示例
|
||||
|
||||
|
|
@ -599,6 +702,22 @@ print("- 计算相关性、回归: 使用交集日期,确保数据同步")
|
|||
print("- 构建组合/计算市值: 使用ffill,避免数据断裂")
|
||||
print("- 重要: ffill会引入假的'零收益日',计算波动率时需注意")
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# TOPIC 6: End-to-End DataPipeline Class (完整数据清洗管道)
|
||||
# -----------------------------------------------------------------------------
|
||||
# Combines all prior topics into a reusable, sequential pipeline:
|
||||
# Step 1 — Filter stocks with too many missing values
|
||||
# Step 2 — Fill remaining missing values (ffill + bfill)
|
||||
# Step 3 — Compute return series
|
||||
# Step 4 — Detect outliers (MAD) and apply Winsorize
|
||||
# Step 5 — Generate a data quality report per stock
|
||||
#
|
||||
# This pattern (class-based pipeline with a .run() method) mirrors how
|
||||
# production quant systems structure data ingestion — each step is auditable,
|
||||
# configurable, and produces diagnostic output.
|
||||
# =============================================================================
|
||||
|
||||
class DataPipeline:
|
||||
"""简单的数据清洗管道
|
||||
|
||||
Loading…
Reference in New Issue