# =============================================================================
# Real Data Acquisition Demo — Baostock
# =============================================================================
#
# Baostock (http://baostock.com) is a completely free A-share data source that
# needs no registration. Compared with AKShare: no token is required and it
# does not go through the Eastmoney API (so no anti-scraping IP bans), but the
# data is updated slowly (usually 1-2 days behind) and ETF daily bars are not
# supported.
#
# Prerequisites:
#   pip install baostock pandas numpy
#
# Baostock characteristics:
#   • The local SDK connects straight to the server, so no proxy problems
#   • Security code format: "sh.600519" / "sz.000001"
#   • Every value is a string and must be converted by hand
#   • Login/logout model: bs.login() → work → bs.logout()
#   • adjustflag: 1 = backward-adjusted (hfq), 2 = forward-adjusted (qfq),
#     3 = unadjusted
#   • frequency: d = daily, w = weekly, m = monthly,
#     5 / 15 / 30 / 60 = minute bars
# =============================================================================

from __future__ import annotations

import os
import time
import warnings
from datetime import datetime

# Silence warnings before the heavy third-party imports, as the original did.
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

# ── Make sure baostock is installed ──
try:
    import baostock as bs
except ImportError:
    print("请先安装: pip install baostock")
    raise


# ── Universe and date range ──────────────────────────────────────────────────
#
# Baostock does not serve ETF daily bars (ETFs are listed in stock_basic but
# query_history returns nothing for them). The demo therefore uses five
# representative stocks from different sectors, and broad/sector indices plus
# a treasury-bond index as stand-ins for ETFs in later rotation strategies.

STOCKS = {
    "sh.600519": "贵州茅台 (消费)",
    "sz.000858": "五粮液 (白酒)",
    "sh.600036": "招商银行 (金融)",
    "sh.600276": "恒瑞医药 (医药)",
    "sz.300750": "宁德时代 (新能源)",
}

# Index code format: sh.000300 / sz.399006 (type=2 in Baostock's stock_basic).
INDICES = {
    # Broad market
    "sh.000300": "沪深300",
    "sh.000905": "中证500",
    "sz.399006": "创业板指",
    # Sector / theme (stand-ins for sector ETFs)
    "sh.000948": "全指半导体",
    "sz.399967": "中证军工",
    "sh.000941": "全指医药",
    "sz.399396": "国证食品饮料",
    # Defensive (stand-in for a treasury-bond ETF)
    "sh.000012": "国债指数",
}

START_DATE = "2019-01-01"
END_DATE = "2025-06-01"

# Comma-separated with NO spaces: the original list had spaces after some
# commas, which could come back as space-prefixed column names.
_K_FIELDS = (
    "date,open,high,low,close,preclose,volume,amount,"
    "turn,tradestatus,pctChg,isST"
)

# Columns converted from string to numeric after download.
_NUMERIC_COLS = [
    'open', 'high', 'low', 'close', 'preclose',
    'volume', 'amount', 'turn', 'pctChg',
]


def fetch_k_data(code: str, start: str, end: str, freq: str = "d") -> pd.DataFrame:
    """Download forward-adjusted K-line bars for one security from Baostock.

    Baostock returns every value as a string, so the price/volume columns are
    converted with ``pd.to_numeric(errors='coerce')`` and ``date`` is parsed
    to datetime and used as the (sorted) index.

    Args:
        code: Baostock security code, e.g. ``"sh.600519"``.
        start: First date to request, ``YYYY-MM-DD``.
        end: Last date to request, ``YYYY-MM-DD``.
        freq: Value passed as Baostock's ``frequency`` argument
            (default ``"d"``, daily).

    Returns:
        A DataFrame indexed by date. An empty DataFrame is returned when the
        query reports an error or yields no rows; the error is printed, not
        raised, so a batch download can continue with the next code.

    Requires an active ``bs.login()`` session.
    """
    rs = bs.query_history_k_data_plus(
        code,
        _K_FIELDS,
        start_date=start,
        end_date=end,
        frequency=freq,
        adjustflag="2",  # forward-adjusted prices
    )
    if rs.error_code != '0':
        print(f" {code} 查询失败: {rs.error_msg}")
        return pd.DataFrame()

    # Same loop shape as the official examples: stop as soon as the result
    # set reports an error instead of blindly calling next().
    rows = []
    while rs.error_code == '0' and rs.next():
        rows.append(rs.get_row_data())
    if rs.error_code != '0':
        # Rows collected so far are kept, but the truncation is made visible.
        print(f" {code} 查询失败: {rs.error_msg}")

    if not rows:
        return pd.DataFrame()

    # Strip field names defensively so column lookups below always match.
    df = pd.DataFrame(rows, columns=[field.strip() for field in rs.fields])

    # ── Type conversion: everything arrives as a string ──
    df['date'] = pd.to_datetime(df['date'])
    for col in _NUMERIC_COLS:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors='coerce')

    return df.set_index('date').sort_index()


def _fetch_universe(universe: dict[str, str], pause: float = 0.3) -> dict[str, pd.DataFrame]:
    """Download every code in *universe*, printing one progress line each.

    Codes that return no data are reported and left out of the result.
    *pause* seconds are slept after each request as a courtesy to the server.
    """
    frames = {}
    for code, name in universe.items():
        print(f" 获取 {code} ({name})...", end=" ")
        df = fetch_k_data(code, START_DATE, END_DATE)
        if len(df) > 0:
            frames[code] = df
            print(f"{len(df)} 行, {df.index[0].date()} ~ {df.index[-1].date()}")
        else:
            print("无数据")
        time.sleep(pause)
    return frames


print("=" * 68)
print(" 真实数据获取演示 — Baostock 版")
print(" Real Data Acquisition Demo with Baostock")
print("=" * 68)

# ══════════════════════════════════════════════════════════════════════
# §0 Baostock login
# ══════════════════════════════════════════════════════════════════════
#
# Baostock uses a login/logout model: every session starts with login() and
# ends with logout(). Free users have a daily call quota (normally enough for
# personal use) and need no token.

print("\n[§0] 登录 Baostock")
lg = bs.login()
if lg.error_code == '0':
    print(" ✓ 登录成功")
else:
    print(f" ✗ 登录失败: {lg.error_code} {lg.error_msg}")
    raise RuntimeError("Baostock 登录失败")

# Everything between login and logout runs under try/finally so the session
# is always closed, even when a download or an analysis step raises.
try:
    # ══════════════════════════════════════════════════════════════════
    # §1 Five representative stocks from different sectors
    # ══════════════════════════════════════════════════════════════════
    print("\n[§1] 获取 5 只不同行业的代表股")
    stock_data = _fetch_universe(STOCKS)
    print(f"\n 成功获取: {len(stock_data)} / {len(STOCKS)} 只股票")

    # ══════════════════════════════════════════════════════════════════
    # §2 Broad-market and sector indices (stand-ins for ETFs)
    # ══════════════════════════════════════════════════════════════════
    #
    # Selected: 3 broad indices (CSI 300, CSI 500, ChiNext), 4 sector/theme
    # indices, and 1 treasury-bond index used as the defensive asset.
    print("\n[§2] 获取指数数据")
    index_data = _fetch_universe(INDICES)
    print(f"\n 成功获取: {len(index_data)} / {len(INDICES)} 个指数")

    # ══════════════════════════════════════════════════════════════════
    # §3 Build price panels and clean the data
    # ══════════════════════════════════════════════════════════════════
    print("\n[§3] 构建价格面板")

    # ── 3-A: stock close-price panel ──
    if stock_data:
        stock_panel = pd.DataFrame({
            code: df['close'] for code, df in stock_data.items()
        })
        # Column label = code plus the company name without the sector tag.
        stock_panel.columns = [
            f"{code} ({STOCKS[code].split('(')[0].strip()})"
            for code in stock_panel.columns
        ]

        print(f"\n 个股价格面板: {stock_panel.shape}")
        print(f" 日期: {stock_panel.index[0].date()} ~ {stock_panel.index[-1].date()}")

        # Missing values per column
        missing = stock_panel.isna().sum()
        if missing.sum() > 0:
            print(" 缺失值 (前5行):")
            for col, cnt in missing.items():
                if cnt > 0:
                    print(f" {col}: {cnt} 天 ({cnt/len(stock_panel):.2%})")

        # Cleaning: forward-fill gaps, then drop rows that are still
        # incomplete (dates before the latest-listed stock started trading).
        clean_stock = stock_panel.ffill().dropna(how='any')
        print(f" 清洗前: {len(stock_panel)} 天, 清洗后: {len(clean_stock)} 天")

    # ── 3-B: index close-price panel ──
    if index_data:
        index_panel = pd.DataFrame({
            code: df['close'] for code, df in index_data.items()
        })
        index_panel.columns = [INDICES[code] for code in index_panel.columns]

        print(f"\n 指数价格面板: {index_panel.shape}")
        print(f" 日期: {index_panel.index[0].date()} ~ {index_panel.index[-1].date()}")

        missing_idx = index_panel.isna().sum()
        if missing_idx.sum() > 0:
            print(" 各指数缺失天数:")
            for col, cnt in missing_idx.items():
                if cnt > 0:
                    print(f" {col}: {cnt} 天")

        clean_index = index_panel.ffill().dropna(how='any')
        print(f" 清洗前: {len(index_panel)} 天, 清洗后: {len(clean_index)} 天")

    # ══════════════════════════════════════════════════════════════════
    # §4 Quick analysis: let the data "speak"
    # ══════════════════════════════════════════════════════════════════
    print("\n[§4] 快速分析")

    if stock_data:
        # ── Stocks: returns and correlation ──
        stock_ret = clean_stock.pct_change().dropna()
        if stock_ret.empty:
            # Annualising divides by len(stock_ret); with no usable rows that
            # would be a ZeroDivisionError, so skip the statistics instead.
            print(" 个股有效数据不足, 跳过绩效统计")
        else:
            ann_ret = (1 + stock_ret).prod() ** (252.0 / len(stock_ret)) - 1
            ann_vol = stock_ret.std() * np.sqrt(252)
            sharpe = (ann_ret - 0.02) / ann_vol  # 2% risk-free rate

            print("\n 个股绩效 (2019-2025):")
            print(f" {'名称':<20s} {'年化收益':>8s} {'年化波动':>8s} {'夏普':>6s}")
            print(" " + "-" * 42)
            for col in stock_ret.columns:
                print(f" {col:<20s} {ann_ret[col]:+7.1%} {ann_vol[col]:+7.1%} {sharpe[col]:+6.2f}")

            # Correlation matrix of the first five columns (the whole
            # stock universe of this demo).
            corr = stock_ret.iloc[:, :5].corr()
            print("\n 个股相关性:")
            print(corr.round(3).to_string())

    if index_data:
        # ── Indices: normalised performance ──
        index_ret = clean_index.pct_change().dropna()  # kept for later use
        if clean_index.empty:
            # No common trading day survived cleaning; iloc[0] would raise.
            print(" 指数有效数据不足, 跳过归一化走势")
        else:
            print("\n 指数归一化走势 (基期=100):")
            norm = clean_index / clean_index.iloc[0] * 100
            last_vals = norm.iloc[-1].sort_values(ascending=False)
            for col, val in last_vals.items():
                print(f" {col:<12s}: {val:7.1f}")

    # ══════════════════════════════════════════════════════════════════
    # §5 Show a sample of the raw bars (quick sanity check)
    # ══════════════════════════════════════════════════════════════════
    print("\n[§5] 行情数据样本")

    if stock_data:
        # Kweichow Moutai, six key columns. `.get` because this particular
        # download may have failed while others succeeded.
        moutai = stock_data.get('sh.600519')
        if moutai is not None and len(moutai) > 0:
            cols = ['open', 'high', 'low', 'close', 'volume', 'pctChg']
            available = [c for c in cols if c in moutai.columns]
            print("\n 贵州茅台 (sh.600519) 行情样本:")
            print(moutai[available].head(10).to_string())

    if index_data:
        # CSI 300
        hs300 = index_data.get('sh.000300')
        if hs300 is not None and len(hs300) > 0:
            print("\n 沪深300 (sh.000300) 最新 5 日:")
            cols = ['close', 'volume', 'pctChg']
            available = [c for c in cols if c in hs300.columns]
            print(hs300[available].tail(5).to_string())

    # ══════════════════════════════════════════════════════════════════
    # §6 Persist the cleaned panels
    # ══════════════════════════════════════════════════════════════════
    print("\n[§6] 数据存储")

    # Stored next to this script under data/baostock/.
    DATA_DIR = os.path.join(os.path.dirname(__file__), "data", "baostock")
    os.makedirs(DATA_DIR, exist_ok=True)

    if stock_data:
        path = os.path.join(DATA_DIR, "stock_prices.csv")
        clean_stock.to_csv(path)
        print(f" ✓ 个股价格面板 → {path}")

    if index_data:
        path = os.path.join(DATA_DIR, "index_prices.csv")
        clean_index.to_csv(path)
        print(f" ✓ 指数价格面板 → {path}")
finally:
    # ══════════════════════════════════════════════════════════════════
    # §7 Logout — always executed, so the session never leaks
    # ══════════════════════════════════════════════════════════════════
    bs.logout()
    print("\n ✓ Baostock 登出")

print("\n" + "=" * 68)
print(" ✓ Baostock 数据获取 Demo 完成")
print("=" * 68)
print("""
 Baostock 特点总结:

 优点:
 • 完全免费,无需注册
 • 本地 SDK 直连,不受代理/VPN 影响
 • 不会被东方财富反爬封 IP (因为根本不用东方财富)
 • 支持股票 + 指数日线/分钟线
 • 自带前复权/后复权/不复权选择

 缺点:
 • 不支持 ETF 日线数据 (stock_basic 有 ETF 但查不出行情)
 • 数据更新慢 (通常 T+1 或 T+2)
 • 所有字段都是字符串,每次都要手动转类型
 • 没有财务数据 (PE/PB/ROE 等)
 • 接口较少,没有北向资金、融资融券等

 最佳用途:
 • AKShare 被封 IP 时的备用数据源
 • 个股回测的数据来源
 • 指数数据 (覆盖好)
 • 学习和概念验证 (快速、免费、零配置)

 后续建议:
 • 用这里的指数数据替代 ETF — 行业指数 ≈ 行业 ETF
 • 等 AKShare IP 解封后切回 AKShare (数据更全)
 • 长期稳定使用建议升级到 Tushare Pro
""")