trading/demo_baostock_data.py

350 lines
14 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# =============================================================================
# Real Data Acquisition Demo — Baostock
# 真实数据获取演示 — Baostock 版
# =============================================================================
#
# Baostock (http://baostock.com) 是一个完全免费、无需注册的 A 股数据源。
# 与 AKShare 的区别:不需要 Token、不走东方财富 API (避免反爬封 IP),
# 但数据更新慢(通常滞后 1-2 天)、不支持 ETF 日线数据。
#
# Prerequisites:
# pip install baostock pandas numpy
#
# Baostock 的特点:
# • 本地 SDK 直连服务器,无代理问题
# • 股票代码格式: "sh.600519" / "sz.000001"
# • 数据全是字符串,需要手动转换类型
# • 登录/登出模型: bs.login() → 操作 → bs.logout()
# • adjustflag: 1=后复权 2=前复权 3=不复权
# • frequency: d=日 w=周 m=月 5=5分钟 15=15分钟 30=30分钟 60=60分钟
# =============================================================================
from __future__ import annotations
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from datetime import datetime
import os
import time
# ── 检查 baostock ──
try:
import baostock as bs
except ImportError:
print("请先安装: pip install baostock")
raise
# ── Start-up banner ──
print("\n".join([
    "=" * 68,
    " 真实数据获取演示 — Baostock 版",
    " Real Data Acquisition Demo with Baostock",
    "=" * 68,
]))
# ══════════════════════════════════════════════════════════════════════
# §0 Baostock login
# ══════════════════════════════════════════════════════════════════════
#
# Baostock uses a login/logout session model: every session starts with
# login() and ends with logout(). Free users have a daily call quota
# (usually enough for personal use) and no token is required.
print("\n[§0] 登录 Baostock")
lg = bs.login()
# error_code is compared as the string '0'; anything else aborts the script.
if lg.error_code != '0':
    print(f" ✗ 登录失败: {lg.error_code} {lg.error_msg}")
    raise RuntimeError("Baostock 登录失败")
print(" ✓ 登录成功")
# ══════════════════════════════════════════════════════════════════════
# §1 获取 5 只不同行业的代表股
# ══════════════════════════════════════════════════════════════════════
#
# Baostock 不支持 ETF 日线数据 (stock_basic 里有 ETF 但 query_history 返回空)。
# 这里选取 5 只不同行业的代表股 + 1 只债券 ETF 的替代(国债指数),
# 后续学习 ETF 轮动策略时,用宽基指数或行业指数替代 ETF 即可。
print("\n[§1] 获取 5 只不同行业的代表股")

# Query window shared by every download in this script ("YYYY-MM-DD").
START_DATE = "2019-01-01"
END_DATE = "2025-06-01"

# Baostock security code -> display label of the form "<name> (<sector>)".
# The text before "(" is reused later as the short column label.
STOCKS = {
    "sh.600519": "贵州茅台 (消费)",
    "sz.000858": "五粮液 (白酒)",
    "sh.600036": "招商银行 (金融)",
    "sh.600276": "恒瑞医药 (医药)",
    "sz.300750": "宁德时代 (新能源)",
}
def fetch_k_data(code: str, start: str, end: str, freq: str = "d") -> pd.DataFrame:
    """Fetch forward-adjusted K-line bars for one security from Baostock.

    Baostock hands every field back as a string, so the price/volume
    columns are converted to numbers here (unparseable values become NaN)
    and ``date`` is parsed to datetimes. ``tradestatus`` and ``isST`` are
    left as the raw strings.

    Args:
        code: Baostock security code, e.g. ``"sh.600519"``.
        start: First date of the query window, ``"YYYY-MM-DD"``.
        end: Last date of the query window, ``"YYYY-MM-DD"``.
        freq: Baostock frequency flag, passed through unchanged.
            NOTE(review): the field list below is the one this script uses
            for daily bars; other frequencies may not accept every field --
            confirm against the Baostock documentation before relying on it.

    Returns:
        A DataFrame indexed by ascending ``date``. An empty DataFrame is
        returned when the query reports an error (the message is printed)
        or yields no rows.
    """
    # Keep the field list free of whitespace. ``rs.fields`` becomes the
    # column labels below; if the request is echoed back verbatim, a stray
    # space would produce labels such as " pctChg" that later lookups by
    # name would silently miss.
    field_list = (
        "date,open,high,low,close,preclose,volume,amount,"
        "turn,tradestatus,pctChg,isST"
    )
    rs = bs.query_history_k_data_plus(
        code,
        field_list,
        start_date=start,
        end_date=end,
        frequency=freq,
        adjustflag="2",  # "2" = forward-adjusted prices
    )
    if rs.error_code != '0':
        print(f" {code} 查询失败: {rs.error_msg}")
        return pd.DataFrame()

    # Drain the result cursor; stop early if the result object starts
    # reporting an error while iterating.
    rows = []
    while rs.error_code == '0' and rs.next():
        rows.append(rs.get_row_data())
    if not rows:
        return pd.DataFrame()

    # Strip the labels defensively so column lookups never depend on how
    # the field names were spaced in the request or the response.
    df = pd.DataFrame(rows, columns=[str(name).strip() for name in rs.fields])

    # ── Type conversion: every Baostock value arrives as a string ──
    df['date'] = pd.to_datetime(df['date'])
    numeric_cols = ['open', 'high', 'low', 'close', 'preclose',
                    'volume', 'amount', 'turn', 'pctChg']
    for col in numeric_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df.set_index('date').sort_index()
# Download every configured stock; codes that return no rows are simply
# left out of stock_data.
stock_data = {}
for code, name in STOCKS.items():
    print(f" 获取 {code} ({name})...", end=" ")
    df = fetch_k_data(code, START_DATE, END_DATE)
    if len(df) == 0:
        print("无数据")
    else:
        stock_data[code] = df
        first_day = df.index[0].date()
        last_day = df.index[-1].date()
        print(f"{len(df)} 行, {first_day} ~ {last_day}")
    time.sleep(0.3)  # short pause between requests, to be polite to the server
fetched, requested = len(stock_data), len(STOCKS)
print(f"\n 成功获取: {fetched} / {requested} 只股票")
# ══════════════════════════════════════════════════════════════════════
# §2 获取宽基指数 & 行业指数 (替代 ETF)
# ══════════════════════════════════════════════════════════════════════
#
# Baostock 支持指数日线数据。指数代码格式: sh.000300 / sz.399006
# 在 Baostock 的 stock_basic 中 type=2 的是指数。
#
# 下面选了:
# • 3 个宽基指数 (沪深300, 中证500, 创业板指)
# • 4 个行业/主题指数 (替代行业 ETF)
# • 1 个国债指数 (替代国债 ETF, 用于避险)
# =============================================================================
print("\n[§2] 获取指数数据")

# Broad-market indices.
_BROAD_INDICES = {
    "sh.000300": "沪深300",
    "sh.000905": "中证500",
    "sz.399006": "创业板指",
}
# Sector / thematic indices (stand-ins for sector ETFs).
_SECTOR_INDICES = {
    "sh.000948": "全指半导体",
    "sz.399967": "中证军工",
    "sh.000941": "全指医药",
    "sz.399396": "国证食品饮料",
}
# Defensive asset (stand-in for a government-bond ETF).
_HEDGE_INDICES = {
    "sh.000012": "国债指数",
}

# Baostock index code -> display name, in the order broad / sector / hedge.
INDICES = {**_BROAD_INDICES, **_SECTOR_INDICES, **_HEDGE_INDICES}
# Download every configured index; codes that return no rows are left out
# of index_data.
index_data = {}
for code, name in INDICES.items():
    print(f" 获取 {code} ({name})...", end=" ")
    df = fetch_k_data(code, START_DATE, END_DATE)
    if len(df) == 0:
        print("无数据")
    else:
        index_data[code] = df
        first_day = df.index[0].date()
        last_day = df.index[-1].date()
        print(f"{len(df)} 行, {first_day} ~ {last_day}")
    time.sleep(0.3)  # short pause between requests
fetched, requested = len(index_data), len(INDICES)
print(f"\n 成功获取: {fetched} / {requested} 个指数")
# ══════════════════════════════════════════════════════════════════════
# §3 构建价格面板 & 数据清洗
# ══════════════════════════════════════════════════════════════════════
print("\n[§3] 构建价格面板")
# ── 3-A: stock price panel ──
if stock_data:
    # One column of closing prices per stock, aligned on the date index.
    closes_by_code = {code: df['close'] for code, df in stock_data.items()}
    stock_panel = pd.DataFrame(closes_by_code)
    # Relabel each column "<code> (<name>)", where <name> is the part of
    # the STOCKS label that precedes the "(sector)" suffix.
    stock_panel.columns = [
        f"{code} ({STOCKS[code].split('(')[0].strip()})"
        for code in stock_panel.columns
    ]
    print(f"\n 个股价格面板: {stock_panel.shape}")
    panel_start = stock_panel.index[0].date()
    panel_end = stock_panel.index[-1].date()
    print(f" 日期: {panel_start} ~ {panel_end}")

    # Missing values: per-column NaN count and its share of all rows.
    missing = stock_panel.isna().sum()
    if missing.sum() > 0:
        # NOTE(review): the label mentions "first 5 rows", but the loop
        # reports every column that has gaps -- confirm the wording.
        print(" 缺失值 (前5行):")
        total_days = len(stock_panel)
        for col, cnt in missing[missing > 0].items():
            print(f" {col}: {cnt} 天 ({cnt/total_days:.2%})")

    # Cleaning: forward-fill gaps, then drop rows that still hold a NaN
    # (dates before the latest-starting column has data).
    clean_stock = stock_panel.ffill().dropna(how='any')
    print(f" 清洗前: {len(stock_panel)} 天, 清洗后: {len(clean_stock)}")
# ── 3-B: index price panel ──
if index_data:
    # One column of closing values per index, aligned on the date index.
    closes_by_code = {code: df['close'] for code, df in index_data.items()}
    index_panel = pd.DataFrame(closes_by_code)
    # Replace the Baostock codes with the display names from INDICES.
    index_panel.columns = [INDICES[code] for code in index_panel.columns]
    print(f"\n 指数价格面板: {index_panel.shape}")
    panel_start = index_panel.index[0].date()
    panel_end = index_panel.index[-1].date()
    print(f" 日期: {panel_start} ~ {panel_end}")

    # Report the NaN count of every index that has gaps.
    missing_idx = index_panel.isna().sum()
    if missing_idx.sum() > 0:
        print(" 各指数缺失天数:")
        for col, cnt in missing_idx[missing_idx > 0].items():
            print(f" {col}: {cnt}")

    # Forward-fill gaps, then drop rows that still contain a NaN.
    clean_index = index_panel.ffill().dropna(how='any')
    print(f" 清洗前: {len(index_panel)} 天, 清洗后: {len(clean_index)}")
# ══════════════════════════════════════════════════════════════════════
# §4 快速分析:让数据"说话"
# ══════════════════════════════════════════════════════════════════════
print("\n[§4] 快速分析")
if stock_data:
    # ── Stocks: returns + correlation ──
    stock_ret = clean_stock.pct_change().dropna()
    if stock_ret.empty:
        # Annualising divides by the number of return rows, so an empty
        # sample (fewer than two clean price rows) must be skipped rather
        # than crash with ZeroDivisionError.
        print("\n 个股绩效: 有效收益率样本不足, 跳过")
    else:
        # Geometric annualised return and annualised volatility, using 252
        # periods per year; Sharpe ratio uses a flat 2% risk-free rate.
        ann_ret = (1 + stock_ret).prod() ** (252.0 / len(stock_ret)) - 1
        ann_vol = stock_ret.std() * np.sqrt(252)
        sharpe = (ann_ret - 0.02) / ann_vol

        # Derive the period label from the data actually analysed instead
        # of hard-coding it, so it stays right when the date window changes.
        first_year = stock_ret.index[0].year
        last_year = stock_ret.index[-1].year
        print(f"\n 个股绩效 ({first_year}-{last_year}):")
        print(f" {'名称':<20s} {'年化收益':>8s} {'年化波动':>8s} {'夏普':>6s}")
        print(" " + "-" * 42)
        for col in stock_ret.columns:
            print(f" {col:<20s} {ann_ret[col]:+7.1%} {ann_vol[col]:+7.1%} {sharpe[col]:+6.2f}")

        # Correlation matrix over (at most) the first five columns.
        corr = stock_ret.iloc[:, :5].corr()
        print("\n 个股相关性:")
        print(corr.round(3).to_string())
if index_data:
    # ── Indices: normalised performance ──
    # Daily returns; not read again in this section.
    index_ret = clean_index.pct_change().dropna()
    print("\n 指数归一化走势 (基期=100):")
    # Rebase every index so that its first clean row equals 100.
    base_row = clean_index.iloc[0]
    norm = clean_index / base_row * 100
    # Rank the indices by their final rebased level, best first.
    last_vals = norm.iloc[-1].sort_values(ascending=False)
    for col, val in last_vals.items():
        print(f" {col:<12s}: {val:7.1f}")
# ══════════════════════════════════════════════════════════════════════
# §5 展示部分行情数据 (快速检查)
# ══════════════════════════════════════════════════════════════════════
print("\n[§5] 行情数据样本")
if stock_data:
    # Kweichow Moutai: show the six most relevant columns.
    # Use .get() so a failed download of this one code does not raise
    # KeyError when other stocks were fetched successfully (same lookup
    # style as the CSI 300 sample below).
    moutai = stock_data.get('sh.600519')
    if moutai is not None and len(moutai) > 0:
        cols = ['open', 'high', 'low', 'close', 'volume', 'pctChg']
        # Only print the columns that are actually present in the frame.
        available = [c for c in cols if c in moutai.columns]
        print("\n 贵州茅台 (sh.600519) 行情样本:")
        print(moutai[available].head(10).to_string())
if index_data:
    # CSI 300: last five rows.
    hs300 = index_data.get('sh.000300')
    if hs300 is not None and len(hs300) > 0:
        print("\n 沪深300 (sh.000300) 最新 5 日:")
        cols = ['close', 'volume', 'pctChg']
        available = [c for c in cols if c in hs300.columns]
        print(hs300[available].tail(5).to_string())
# ══════════════════════════════════════════════════════════════════════
# §6 数据存储
# ══════════════════════════════════════════════════════════════════════
print("\n[§6] 数据存储")
# Output folder: <directory of this script>/data/baostock, created on demand.
script_dir = os.path.dirname(__file__)
DATA_DIR = os.path.join(script_dir, "data", "baostock")
os.makedirs(DATA_DIR, exist_ok=True)

# Each cleaned panel is written only if its download step produced data;
# the cleaned panels exist only in that case.
if stock_data:
    path = os.path.join(DATA_DIR, "stock_prices.csv")
    clean_stock.to_csv(path)
    print(f" ✓ 个股价格面板 → {path}")

if index_data:
    path = os.path.join(DATA_DIR, "index_prices.csv")
    clean_index.to_csv(path)
    print(f" ✓ 指数价格面板 → {path}")
# ══════════════════════════════════════════════════════════════════════
# §7 登出
# ══════════════════════════════════════════════════════════════════════
# Close the Baostock session opened at the start of the script.
# NOTE(review): this line is only reached when every step above succeeds;
# an earlier exception leaves the session without an explicit logout.
bs.logout()
print("\n ✓ Baostock 登出")

closing_rule = "=" * 68
print(f"\n{closing_rule}")
print(" ✓ Baostock 数据获取 Demo 完成")
print(closing_rule)

# Closing summary: strengths, weaknesses, best uses and next steps.
SUMMARY_TEXT = """
Baostock 特点总结:
优点:
• 完全免费,无需注册
• 本地 SDK 直连,不受代理/VPN 影响
• 不会被东方财富反爬封 IP (因为根本不用东方财富)
• 支持股票 + 指数日线/分钟线
• 自带前复权/后复权/不复权选择
缺点:
• 不支持 ETF 日线数据 (stock_basic 有 ETF 但查不出行情)
• 数据更新慢 (通常 T+1 或 T+2)
• 所有字段都是字符串,每次都要手动转类型
• 没有财务数据 (PE/PB/ROE 等)
• 接口较少,没有北向资金、融资融券等
最佳用途:
• AKShare 被封 IP 时的备用数据源
• 个股回测的数据来源
• 指数数据 (覆盖好)
• 学习和概念验证 (快速、免费、零配置)
后续建议:
• 用这里的指数数据替代 ETF — 行业指数 ≈ 行业 ETF
• 等 AKShare IP 解封后切回 AKShare (数据更全)
• 长期稳定使用建议升级到 Tushare Pro
"""
print(SUMMARY_TEXT)