

选股(stock selection)是一种主动性投资策略,先按照某种规则或算法分析单只股票的前景,然后构建一个投资组合,长期持有。一般情况下要求组合的股票具有低相关性,这样才能对冲系统性风险,否则在大盘走弱的时候投资组合也会面临巨大的下跌风险。


关于如何选股,学术界提出过很多不同的模型,最经典的莫过于马科维茨投资组合理论。这里我们使用MM趋势模型(Mark Minervini’s Trend Template),这是国外一位传奇投资大师提出的技术面选股方法,核心思想是通过技术指标来度量股票动能,从中筛选最有潜力的股票,买入并持有。该模型包含以下8个筛选条件:


  1. 股票价格高于150日均线和200日均线
  2. 150日均线高于200日均线
  3. 200日均线上升至少1个月
  4. 50日均线高于150日均线和200日均线
  5. 股票价格高于50日均线
  6. 股票价格比52周低点高30%
  7. 股票价格在52周高点的25%以内
  8. 相对强弱指数(RS)大于等于70,这里的相对强弱指的是股票与大盘对比,RS = 股票1年收益率 / 基准指数1年收益率 × 100

  1. 从哪里获取大量股票的历史数据?
  2. 当股票数量很多时,如何提高计算性能?


  1. import os

  2. import datetime as dt

  3. import time

  4. from typing import Any, Dict, Optional, List

  5. import requests

  6. import pickle

  7. import numpy as np

  8. import pandas as pd

  9. import matplotlib.pyplot as plt

  10. import seaborn as sns

  11. import talib

  12. import multiprocessing as mp

  13. from requests.exceptions import ConnectionError, Timeout

  14. %matplotlib inline

  15. plt.style.use("fivethirtyeight")

1. 从蜂鸟数据获取历史数据


  1. ## 撰写自定义函数,通过API获取数据

  def fetch_trochil(url: str,
                    params: Dict[str, str],
                    attempt: int = 3,
                    timeout: int = 3) -> Dict[str, Any]:
      """Call an API endpoint with retries and return the ``data`` payload.

      Args:
          url: Endpoint URL.
          params: Query-string parameters sent with the GET request.
          attempt: Maximum number of tries before giving up.
          timeout: Timeout in seconds passed to ``requests.get``.

      Returns:
          The value stored under the ``"data"`` key of the JSON response.

      Raises:
          requests.HTTPError: If ``raise_for_status`` rejects the response.
          ValueError: If the ``data`` field of the response is empty.
          ConnectionError: If every attempt ended in a connection error or a
              timeout (the class imported from ``requests.exceptions``).
      """
      last_error = None
      for tries in range(1, attempt + 1):
          try:
              resp = requests.get(url, params=params, timeout=timeout)
              resp.raise_for_status()
              data = resp.json()["data"]
          except (ConnectionError, Timeout) as exc:
              # Transient network failure: report it and retry with a
              # linearly growing pause, but do not sleep after the last try.
              print(exc)
              last_error = exc
              if tries < attempt:
                  time.sleep(tries * 0.5)
              continue
          if not data:
              raise ValueError("empty dataset")
          return data
      # Previously the function fell through and returned None here, which
      # made callers fail later with an unrelated error.
      raise ConnectionError(
          f"request to {url} failed after {attempt} attempts"
      ) from last_error

  def fetch_cnstocks(apikey: str) -> pd.DataFrame:
      """Download the A-share instrument list and return it as a DataFrame.

      Args:
          apikey: API key sent as the ``apikey`` query parameter.

      Returns:
          A DataFrame built from the records returned by ``fetch_trochil``.
      """
      records = fetch_trochil(
          "https://api.trochil.cn/v1/cnstock/markets",
          {"apikey": apikey},
      )
      return pd.DataFrame.from_records(records)

  def fetch_daily_ohlc(symbol: str,
                       date_from: dt.datetime,
                       date_to: dt.datetime,
                       apikey: str) -> pd.DataFrame:
      """Download daily candlestick history for one A-share symbol.

      Args:
          symbol: Instrument symbol to request.
          date_from: First day of the requested range.
          date_to: Last day of the requested range.
          apikey: API key sent as the ``apikey`` query parameter.

      Returns:
          A DataFrame built from the records returned by ``fetch_trochil``.
      """
      day_format = "%Y-%m-%d"
      query = dict(
          symbol=symbol,
          start_date=date_from.strftime(day_format),
          end_date=date_to.strftime(day_format),
          freq="daily",
          apikey=apikey,
      )
      rows = fetch_trochil("https://api.trochil.cn/v1/cnstock/history", query)
      return pd.DataFrame.from_records(rows)

  def fetch_index_ohlc(symbol: str,
                       date_from: dt.datetime,
                       date_to: dt.datetime,
                       apikey: str) -> pd.DataFrame:
      """Download daily history for a stock index.

      Args:
          symbol: Index symbol to request.
          date_from: First day of the requested range.
          date_to: Last day of the requested range.
          apikey: API key sent as the ``apikey`` query parameter.

      Returns:
          A DataFrame built from the records returned by ``fetch_trochil``.
      """
      day_format = "%Y-%m-%d"
      query = dict(
          symbol=symbol,
          start_date=date_from.strftime(day_format),
          end_date=date_to.strftime(day_format),
          apikey=apikey,
      )
      rows = fetch_trochil("https://api.trochil.cn/v1/index/daily", query)
      return pd.DataFrame.from_records(rows)

1.1 产品列表


  1. apikey = os.getenv("TROCHIL_API") # use your apikey

  2. cnstocks = fetch_cnstocks(apikey)

  3. cnstocks


  1. # 筛选前缀为'SH'的股票

  2. cnstocks_shsz = cnstocks.query("symbol.str.startswith('SH')")

  3. cnstocks_shsz

1.2 个股历史数据


  1. %%time

  # Download daily bars from the start of 2019 up to today.
  # Symbols with fewer than `min_klines` bars are dropped at download time.
  date_from = dt.datetime(2019, 1, 1)
  date_to = dt.datetime.today()
  symbols = cnstocks_shsz.symbol.to_list()
  min_klines = 260

  # Symbols are fetched one at a time (the author notes that the API has no
  # per-minute request limit). Frames are collected in a list first and are
  # merged and cleaned once the download has finished.
  ohlc_list = []
  failed_symbols = {}  # symbol -> exception, so failures are not lost silently
  for symbol in symbols:
      try:
          ohlc = fetch_daily_ohlc(symbol, date_from, date_to, apikey)
          if len(ohlc) >= min_klines:
              ohlc_list.append(ohlc.set_index("datetime"))
      except Exception as e:
          # Best effort: one bad symbol must not abort the whole download,
          # but remember what went wrong instead of discarding it.
          failed_symbols[symbol] = e

  if failed_symbols:
      print(f"{len(failed_symbols)} of {len(symbols)} symbols failed to download")

  19. CPU times: user 21.7 s, sys: 349 ms, total: 22 s

  20. Wall time: 49.3 s


  1. ohlc_joined = pd.concat(ohlc_list)

  2. ohlc_joined.info()

  3. <class 'pandas.core.frame.DataFrame'>

  4. Index: 532756 entries, 2019-01-02 to 2020-07-29

  5. Data columns (total 6 columns):

  6. # Column Non-Null Count Dtype

  7. --- ------ -------------- -----

  8. 0 open 532756 non-null float64

  9. 1 high 532756 non-null float64

  10. 2 low 532756 non-null float64

  11. 3 close 532756 non-null float64

  12. 4 volume 532756 non-null float64

  13. 5 symbol 532756 non-null object

  14. dtypes: float64(5), object(1)

  15. memory usage: 28.5+ MB


  1. ohlc_joined.isnull().sum()

  2. open 0

  3. high 0

  4. low 0

  5. close 0

  6. volume 0

  7. symbol 0

  8. dtype: int64


ohlc_joined.to_csv("cnstock_daily_ohlc.csv", index=True)

1.3 上证指数


  1. benchmark = fetch_index_ohlc("shci", date_from, date_to, apikey)

  2. benchmark.tail()

  1. # 计算1年累计收益率,1年以252个交易日计算

  2. benchmark_ann_ret = benchmark.close.pct_change(252).iloc[-1]

  3. benchmark_ann_ret

  4. 0.12150312157460808

2. 选股

  def screen(close: pd.Series, benchmark_ann_ret: float) -> pd.Series:
      """Evaluate a single stock against the eight trend-template conditions.

      Args:
          close (pd.Series): Closing prices of one stock, indexed by time;
              the last element is treated as the latest close.
          benchmark_ann_ret (float): One-year return of the benchmark index,
              used as the denominator of the relative-strength figure.

      Returns:
          pd.Series: The relative strength (rounded to 2 decimals), the latest
          close, the latest 50/150/200-period EMAs, the 52-week high and low,
          and ``meet_criterion`` telling whether all conditions hold.
      """
      # Latest values of the 50/150/200-period exponential moving averages.
      ema_200_line = talib.EMA(close, 200)
      ema_50 = talib.EMA(close, 50).iloc[-1]
      ema_150 = talib.EMA(close, 150).iloc[-1]
      ema_200 = ema_200_line.iloc[-1]
      # 20-period smoothing of the 200-period EMA, the reference used to
      # decide whether the 200-period EMA is rising.
      ema_200_smooth = talib.EMA(ema_200_line, 20).iloc[-1]

      # 52-week (52 * 5 bars) high and low of the close.
      year_window = close.rolling(52 * 5)
      high_52week = year_window.max().iloc[-1]
      low_52week = year_window.min().iloc[-1]

      # Latest close.
      cl = close.iloc[-1]

      # Relative strength: the stock's 252-bar return as a percentage of the
      # benchmark's one-year return.
      # NOTE(review): a negative benchmark_ann_ret flips the sign of the
      # ratio, so a falling stock can score a high RS - confirm intended.
      rs = close.pct_change(252).iloc[-1] / benchmark_ann_ret * 100

      checks = [
          # 1: close above the 150- and 200-period averages
          bool(cl > ema_150 and cl > ema_200),
          # 2: 150-period average above the 200-period average
          bool(ema_150 > ema_200),
          # 3: 200-period average above its own 20-period smoothing
          bool(ema_200 > ema_200_smooth),
          # 4: 50-period average above the 150- and 200-period averages
          bool(ema_50 > ema_150 and ema_50 > ema_200),
          # 5: close above the 50-period average
          bool(cl > ema_50),
          # 6: close at least 30% above the 52-week low
          bool(cl >= low_52week * 1.3),
          # 7: close within 25% of the 52-week high
          bool(cl >= high_52week * 0.75 and cl <= high_52week * 1.25),
          # 8: relative strength of at least 70
          bool(rs >= 70),
      ]
      meet_criterion = all(checks)

      out = {
          "rs": round(rs, 2),
          "close": cl,
          "ema_50": ema_50,
          "ema_150": ema_150,
          "ema_200": ema_200,
          "high_52week": high_52week,
          "low_52week": low_52week,
          "meet_criterion": meet_criterion,
      }
      return pd.Series(out)

2.1 同步


  # Only the stocks with enough history are screened.
  symbols_to_screen = list(ohlc_joined.symbol.unique())

  # Reshape from long format to wide format: one column of closing prices
  # per symbol, with gaps forward-filled. `.ffill()` replaces the deprecated
  # `fillna(method="ffill")` spelling and produces the same result.
  ohlc_joined_wide = ohlc_joined.pivot(columns="symbol", values="close").ffill()
  ohlc_joined_wide.head()

  1. %%time

  2. results = ohlc_joined_wide.apply(screen, benchmark_ann_ret=benchmark_ann_ret)

  3. results = results.T

  4. CPU times: user 2.97 s, sys: 6.47 ms, total: 2.98 s

  5. Wall time: 2.97 s



results.query("meet_criterion == True").sort_values("rs", ascending=False)



2.2 多进程


  1. %%time

  # Worker function executed in each process.
  def screen_stocks(df: pd.DataFrame, benchmark_ann_ret: float) -> pd.DataFrame:
      """Apply ``screen`` to every column of ``df`` and transpose the result.

      Args:
          df: Frame whose columns are passed to ``screen`` one by one.
          benchmark_ann_ret: Benchmark one-year return forwarded to ``screen``.

      Returns:
          The per-column screening results, transposed.
      """
      per_column = df.apply(screen, benchmark_ann_ret=benchmark_ann_ret)
      return per_column.transpose()

  # Start with four processes: split the wide frame column-wise into four parts.
  df_chunks = np.array_split(ohlc_joined_wide, 4, axis=1)

  # A multiprocessing.Pool manages the worker processes. Results are
  # collected inside the `with` block, before the pool is shut down.
  with mp.Pool(processes=4) as p:
      future_results = [
          p.apply_async(
              screen_stocks,
              kwds=dict(df=chunk, benchmark_ann_ret=benchmark_ann_ret),
          )
          for chunk in df_chunks
      ]
      results = pd.concat([job.get() for job in future_results])

  13. CPU times: user 934 ms, sys: 204 ms, total: 1.14 s

  14. Wall time: 1.06 s


results.query("meet_criterion == True").sort_values("rs", ascending=False)


  # Measure how the wall-clock time of the screening changes with the number
  # of worker processes, from 1 up to the machine's CPU count.
  max_processors = mp.cpu_count()
  time_used = {}
  for processors in range(1, max_processors + 1):
      # One column-wise chunk per worker process.
      df_chunks = np.array_split(ohlc_joined_wide, processors, axis=1)
      t0 = time.time()
      with mp.Pool(processors) as p:
          future_results = [
              p.apply_async(
                  screen_stocks,
                  kwds=dict(df=chunk, benchmark_ann_ret=benchmark_ann_ret),
              )
              for chunk in df_chunks
          ]
          results = pd.concat([job.get() for job in future_results])
      # NOTE(review): the original indentation was lost; the timer is assumed
      # to stop after the pool has been closed - confirm.
      elapsed = time.time() - t0
      time_used[processors] = elapsed

  # Plot elapsed seconds against the number of processes.
  fig, ax = plt.subplots(figsize=(12, 7))
  process_counts = list(time_used.keys())
  seconds = list(time_used.values())
  ax = sns.pointplot(x=process_counts, y=seconds)
  ax.set_xlabel("CPU cores")
  ax.set_ylabel("Time used(seconds)")
  ax.set_title("Computation time vs CPU Cores", loc="left")


3. 总结


  1. 从蜂鸟数据获取沪市A股(代码前缀为SH)的历史数据。
  2. 自定义函数实现MM模型的选股逻辑。
  3. 多进程计算,大幅减少筛选的时间。




