feat: add demo script for data acquisition using Baostock
- Implemented a complete demo for fetching A-share stock and index data using Baostock. - Included login/logout mechanism, data fetching for selected stocks and indices, and data cleaning processes. - Added functionality to store cleaned data into CSV files. - Provided detailed comments and print statements for user guidance and understanding.
This commit is contained in:
parent
7822e6d363
commit
69e688deab
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,349 @@
|
|||
# =============================================================================
|
||||
# Real Data Acquisition Demo — Baostock
|
||||
# 真实数据获取演示 — Baostock 版
|
||||
# =============================================================================
|
||||
#
|
||||
# Baostock (http://baostock.com) 是一个完全免费、无需注册的 A 股数据源。
|
||||
# 与 AKShare 的区别:不需要 Token、不走东方财富 API(避免反爬封 IP)、
|
||||
# 但数据更新慢(通常滞后 1-2 天)、不支持 ETF 日线数据。
|
||||
#
|
||||
# Prerequisites:
|
||||
# pip install baostock pandas numpy
|
||||
#
|
||||
# Baostock 的特点:
|
||||
# • 本地 SDK 直连服务器,无代理问题
|
||||
# • 股票代码格式: "sh.600519" / "sz.000001"
|
||||
# • 数据全是字符串,需要手动转换类型
|
||||
# • 登录/登出模型: bs.login() → 操作 → bs.logout()
|
||||
# • adjustflag: 1=后复权 2=前复权 3=不复权
|
||||
# • frequency: d=日 w=周 m=月 5=5分钟 15=15分钟 30=30分钟 60=60分钟
|
||||
# =============================================================================
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import warnings
|
||||
warnings.filterwarnings("ignore")
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
import os
|
||||
import time
|
||||
|
||||
# ── 检查 baostock ──
|
||||
try:
|
||||
import baostock as bs
|
||||
except ImportError:
|
||||
print("请先安装: pip install baostock")
|
||||
raise
|
||||
|
||||
# Startup banner: the bilingual title framed by two 68-character rules,
# emitted with a single print call.
_BANNER_RULE = "=" * 68
print("\n".join([
    _BANNER_RULE,
    " 真实数据获取演示 — Baostock 版",
    " Real Data Acquisition Demo with Baostock",
    _BANNER_RULE,
]))
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# §0 Baostock 的登录机制
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
#
|
||||
# Baostock 使用登录/登出模型。每次 session 需要 login() 开始,logout() 结束。
|
||||
# 免费用户每天有调用次数限制(通常够个人使用),不需要 Token。
|
||||
|
||||
print("\n[§0] 登录 Baostock")

# Open the Baostock session. The result object carries error_code and
# error_msg; the string '0' is the only code treated as success here.
lg = bs.login()
if lg.error_code != '0':
    # Show the server-side diagnostics before aborting the script.
    print(f" ✗ 登录失败: {lg.error_code} {lg.error_msg}")
    raise RuntimeError("Baostock 登录失败")
print(" ✓ 登录成功")
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# §1 获取 5 只不同行业的代表股
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
#
|
||||
# Baostock 不支持 ETF 日线数据(stock_basic 里有 ETF 但 query_history 返回空)。
|
||||
# 这里选取 5 只不同行业的代表股 + 1 只债券 ETF 的替代(国债指数),
|
||||
# 后续学习 ETF 轮动策略时,用宽基指数或行业指数替代 ETF 即可。
|
||||
|
||||
print("\n[§1] 获取 5 只不同行业的代表股")

# Baostock security code -> display label in the form "name (sector)".
# Insertion order is the order in which the stocks are fetched and reported.
STOCKS = dict([
    ("sh.600519", "贵州茅台 (消费)"),
    ("sz.000858", "五粮液 (白酒)"),
    ("sh.600036", "招商银行 (金融)"),
    ("sh.600276", "恒瑞医药 (医药)"),
    ("sz.300750", "宁德时代 (新能源)"),
])

# Query window handed to fetch_k_data as its start/end dates (YYYY-MM-DD).
START_DATE, END_DATE = "2019-01-01", "2025-06-01"
|
||||
|
||||
|
||||
def fetch_k_data(code: str, start: str, end: str, freq: str = "d",
                 adjustflag: str = "2") -> pd.DataFrame:
    """Fetch K-line bars for one security from Baostock as a typed DataFrame.

    Baostock returns every field as a string, so the price/volume columns
    are converted to numeric dtypes here; values that cannot be parsed
    (e.g. empty strings) become NaN. ``tradestatus`` and ``isST`` are left
    as the strings Baostock returned.

    Args:
        code: Baostock security code such as "sh.600519" or "sz.399006".
        start: First date of the query window, "YYYY-MM-DD".
        end: Last date of the query window, "YYYY-MM-DD".
        freq: Bar frequency passed through to Baostock ("d" by default).
            NOTE(review): the requested field list is the daily-bar set;
            other frequencies may not accept all of these fields — confirm
            against the Baostock API docs before using a non-daily value.
        adjustflag: Price adjustment mode passed through to Baostock:
            "1" back-adjusted, "2" forward-adjusted (default), "3" raw.

    Returns:
        A DataFrame indexed by ``date`` in ascending order, or an empty
        DataFrame when the query fails or yields no rows. Failures are
        reported on stdout rather than raised.
    """
    # Field names must be comma-separated WITHOUT surrounding spaces;
    # padded names would not match the column lookups below.
    fields = ",".join((
        "date", "open", "high", "low", "close", "preclose",
        "volume", "amount", "turn", "tradestatus", "pctChg", "isST",
    ))

    rs = bs.query_history_k_data_plus(
        code,
        fields,
        start_date=start,
        end_date=end,
        frequency=freq,
        adjustflag=adjustflag,
    )
    if rs.error_code != '0':
        print(f" {code} 查询失败: {rs.error_msg}")
        return pd.DataFrame()

    # Drain the result cursor. Re-check error_code on every step so that a
    # failure while paging is reported instead of silently truncating data.
    rows = []
    while rs.error_code == '0' and rs.next():
        rows.append(rs.get_row_data())
    if rs.error_code != '0':
        print(f" {code} 查询失败: {rs.error_msg}")
        return pd.DataFrame()

    if not rows:
        return pd.DataFrame()

    df = pd.DataFrame(rows, columns=rs.fields)

    # ── Type conversion: everything arrives as str ──
    df['date'] = pd.to_datetime(df['date'])
    numeric_cols = [
        col
        for col in ('open', 'high', 'low', 'close', 'preclose',
                    'volume', 'amount', 'turn', 'pctChg')
        if col in df.columns
    ]
    df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric, errors='coerce')

    return df.set_index('date').sort_index()
|
||||
|
||||
|
||||
# Fetch every stock in STOCKS; only non-empty results are kept, keyed by code.
stock_data = {}
for code, name in STOCKS.items():
    # flush=True: the label has no trailing newline, so without an explicit
    # flush it would only show up after the blocking request has finished.
    print(f" 获取 {code} ({name})...", end=" ", flush=True)
    df = fetch_k_data(code, START_DATE, END_DATE)
    if df.empty:
        print("无数据")
    else:
        stock_data[code] = df
        print(f"{len(df)} 行, {df.index[0].date()} ~ {df.index[-1].date()}")
    time.sleep(0.3)  # polite pause between consecutive requests

print(f"\n 成功获取: {len(stock_data)} / {len(STOCKS)} 只股票")
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# §2 获取宽基指数 & 行业指数(替代 ETF)
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
#
|
||||
# Baostock 支持指数日线数据。指数代码格式: sh.000300 / sz.399006
|
||||
# 在 Baostock 的 stock_basic 中 type=2 的是指数。
|
||||
#
|
||||
# 下面选了:
|
||||
# • 3 个宽基指数 (沪深300, 中证500, 创业板指)
|
||||
# • 4 个行业/主题指数 (替代行业 ETF)
|
||||
# • 1 个国债指数 (替代国债 ETF, 用于避险)
|
||||
# =============================================================================
|
||||
|
||||
print("\n[§2] 获取指数数据")

# Baostock index code -> display name, grouped by role. Insertion order is
# the fetch order and the column order of the index panel.
# NOTE(review): the code ↔ name pairs are reproduced as given; verify them
# against the index providers' listings before relying on the labels.
_BROAD_INDICES = [
    ("sh.000300", "沪深300"),
    ("sh.000905", "中证500"),
    ("sz.399006", "创业板指"),
]
# Sector / theme indices used as stand-ins for sector ETFs.
_SECTOR_INDICES = [
    ("sh.000948", "全指半导体"),
    ("sz.399967", "中证军工"),
    ("sh.000941", "全指医药"),
    ("sz.399396", "国证食品饮料"),
]
# Defensive leg used as a stand-in for a government-bond ETF.
_HEDGE_INDICES = [
    ("sh.000012", "国债指数"),
]
INDICES = dict(_BROAD_INDICES + _SECTOR_INDICES + _HEDGE_INDICES)
|
||||
|
||||
# Fetch every index in INDICES; only non-empty results are kept, keyed by code.
index_data = {}
for code, name in INDICES.items():
    # flush=True: the label has no trailing newline, so without an explicit
    # flush it would only show up after the blocking request has finished.
    print(f" 获取 {code} ({name})...", end=" ", flush=True)
    df = fetch_k_data(code, START_DATE, END_DATE)
    if df.empty:
        print("无数据")
    else:
        index_data[code] = df
        print(f"{len(df)} 行, {df.index[0].date()} ~ {df.index[-1].date()}")
    time.sleep(0.3)  # polite pause between consecutive requests

print(f"\n 成功获取: {len(index_data)} / {len(INDICES)} 个指数")
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# §3 构建价格面板 & 数据清洗
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
|
||||
print("\n[§3] 构建价格面板")

# ── 3-A: stock close-price panel ──
if stock_data:
    # One close-price column per fetched stock, aligned on the date index.
    stock_panel = pd.DataFrame(
        {code: df['close'] for code, df in stock_data.items()}
    )
    # Relabel columns as "code (name)": keep the part of the STOCKS label
    # before the first "(" and drop the sector suffix.
    stock_panel.columns = [
        f"{code} ({STOCKS[code].split('(')[0].strip()})"
        for code in stock_panel.columns
    ]

    print(f"\n 个股价格面板: {stock_panel.shape}")
    print(f" 日期: {stock_panel.index[0].date()} ~ {stock_panel.index[-1].date()}")

    # Report how many dates each column is missing. The heading names what
    # is actually listed (per-stock missing-day counts), matching the
    # wording used for the index panel.
    missing = stock_panel.isna().sum()
    if missing.sum() > 0:
        print(" 各股票缺失天数:")
        for col, cnt in missing.items():
            if cnt > 0:
                print(f" {col}: {cnt} 天 ({cnt/len(stock_panel):.2%})")

    # Cleaning: carry the last known close forward over gaps, then drop the
    # rows that still contain NaN (dates before some column's first value).
    clean_stock = stock_panel.ffill().dropna(how='any')
    print(f" 清洗前: {len(stock_panel)} 天, 清洗后: {len(clean_stock)} 天")
|
||||
|
||||
# ── 3-B: 指数价格面板 ──
|
||||
if index_data:
    # One close-price column per fetched index, relabelled from Baostock
    # code to the display name held in INDICES.
    index_panel = pd.DataFrame(
        {code: df['close'] for code, df in index_data.items()}
    )
    index_panel.columns = [INDICES[code] for code in index_panel.columns]

    first_day = index_panel.index[0].date()
    last_day = index_panel.index[-1].date()
    print(f"\n 指数价格面板: {index_panel.shape}")
    print(f" 日期: {first_day} ~ {last_day}")

    # List only the indices that have at least one missing date.
    missing_idx = index_panel.isna().sum()
    if missing_idx.sum() > 0:
        print(" 各指数缺失天数:")
        for col, cnt in missing_idx.items():
            if cnt <= 0:
                continue
            print(f" {col}: {cnt} 天")

    # Forward-fill gaps, then drop rows that are still incomplete.
    clean_index = index_panel.ffill().dropna(how='any')
    rows_before, rows_after = len(index_panel), len(clean_index)
    print(f" 清洗前: {rows_before} 天, 清洗后: {rows_after} 天")
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# §4 快速分析:让数据"说话"
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
|
||||
print("\n[§4] 快速分析")

if stock_data:
    # ── Stocks: returns + correlation ──
    # Simple daily returns of the cleaned close panel.
    stock_ret = clean_stock.pct_change().dropna()

    if stock_ret.empty:
        # Fewer than two usable rows: annualising would divide by zero.
        print("\n 个股绩效: 有效数据不足,跳过")
    else:
        # Annualise on a 252-trading-day year: geometric for the return,
        # sqrt-of-time for the volatility.
        ann_ret = (1 + stock_ret).prod() ** (252.0 / len(stock_ret)) - 1
        ann_vol = stock_ret.std() * np.sqrt(252)
        # Sharpe ratio against a 2% annual risk-free rate.
        sharpe = (ann_ret - 0.02) / ann_vol

        # Title reflects the span actually covered by the return series
        # rather than a hard-coded year range.
        first_year = stock_ret.index[0].year
        last_year = stock_ret.index[-1].year
        print(f"\n 个股绩效 ({first_year}-{last_year}):")
        print(f" {'名称':<20s} {'年化收益':>8s} {'年化波动':>8s} {'夏普':>6s}")
        print(" " + "-" * 42)
        for col in stock_ret.columns:
            print(f" {col:<20s} {ann_ret[col]:+7.1%} {ann_vol[col]:+7.1%} {sharpe[col]:+6.2f}")

        # Correlation matrix over the first five columns.
        corr = stock_ret.iloc[:, :5].corr()
        print("\n 个股相关性:")
        print(corr.round(3).to_string())
|
||||
|
||||
if index_data:
    # ── Indices: normalised performance ──
    # Daily returns of the cleaned index panel (computed for later use;
    # not referenced again in this section).
    index_ret = clean_index.pct_change().dropna()

    print("\n 指数归一化走势 (基期=100):")
    # Rebase every index so that its first cleaned close equals 100.
    base_row = clean_index.iloc[0]
    norm = clean_index / base_row * 100
    # Final rebased level of each index, best performer first.
    final_levels = norm.iloc[-1]
    last_vals = final_levels.sort_values(ascending=False)
    for col, val in last_vals.items():
        print(f" {col:<12s}: {val:7.1f}")
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# §5 展示部分行情数据 (快速检查)
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
|
||||
print("\n[§5] 行情数据样本")

if stock_data:
    # Moutai sample, restricted to the six most informative columns.
    # .get() instead of indexing: stock_data only holds the codes whose
    # fetch succeeded, so this code may be absent while others are present.
    moutai = stock_data.get('sh.600519')
    if moutai is not None and len(moutai) > 0:
        cols = ['open', 'high', 'low', 'close', 'volume', 'pctChg']
        available = [c for c in cols if c in moutai.columns]
        print("\n 贵州茅台 (sh.600519) 行情样本:")
        print(moutai[available].head(10).to_string())

if index_data:
    # CSI 300 sample: the five most recent rows.
    hs300 = index_data.get('sh.000300')
    if hs300 is not None and len(hs300) > 0:
        print("\n 沪深300 (sh.000300) 最新 5 日:")
        cols = ['close', 'volume', 'pctChg']
        available = [c for c in cols if c in hs300.columns]
        print(hs300[available].tail(5).to_string())
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# §6 数据存储
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
|
||||
print("\n[§6] 数据存储")

# Output directory: <directory of this script>/data/baostock, created on demand.
DATA_DIR = os.path.join(os.path.dirname(__file__), "data", "baostock")
os.makedirs(DATA_DIR, exist_ok=True)


def _save_panel(panel, filename, label):
    """Write *panel* to DATA_DIR/filename as CSV, report it, return the path."""
    target = os.path.join(DATA_DIR, filename)
    panel.to_csv(target)
    print(f" ✓ {label} → {target}")
    return target


# Each cleaned panel exists only if the matching fetch produced data, so
# the same guards used when building the panels are applied here.
if stock_data:
    path = _save_panel(clean_stock, "stock_prices.csv", "个股价格面板")

if index_data:
    path = _save_panel(clean_index, "index_prices.csv", "指数价格面板")
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
# §7 登出
|
||||
# ══════════════════════════════════════════════════════════════════════
|
||||
|
||||
# Close the Baostock session opened at the start of the script.
bs.logout()
print("\n ✓ Baostock 登出")

print("\n" + "=" * 68)
print(" ✓ Baostock 数据获取 Demo 完成")
print("=" * 68)

# Closing summary for the reader. Two statements were corrected to match
# Baostock's documented API: index data is daily-only (no minute bars), and
# Baostock does expose valuation fields and quarterly financial indicators.
print("""
Baostock 特点总结:
优点:
• 完全免费,无需注册
• 本地 SDK 直连,不受代理/VPN 影响
• 不会被东方财富反爬封 IP (因为根本不用东方财富)
• 支持股票日线/分钟线 + 指数日线 (指数无分钟线)
• 自带前复权/后复权/不复权选择

缺点:
• 不支持 ETF 日线数据 (stock_basic 有 ETF 但查不出行情)
• 数据更新慢 (通常 T+1 或 T+2)
• 所有字段都是字符串,每次都要手动转类型
• 财务数据较基础 (季频财务指标 + peTTM/pbMRQ 等估值字段)
• 接口较少,没有北向资金、融资融券等

最佳用途:
• AKShare 被封 IP 时的备用数据源
• 个股回测的数据来源
• 指数数据 (覆盖好)
• 学习和概念验证 (快速、免费、零配置)

后续建议:
• 用这里的指数数据替代 ETF — 行业指数 ≈ 行业 ETF
• 等 AKShare IP 解封后切回 AKShare (数据更全)
• 长期稳定使用建议升级到 Tushare Pro
""")
|
||||
|
|
@ -4,4 +4,5 @@ matplotlib
|
|||
scipy
|
||||
scikit-learn
|
||||
akshare
|
||||
tushare
|
||||
tushare
|
||||
baostock
|
||||
Loading…
Reference in New Issue