1.tushare介绍
tushare的数据内容涵盖股票、基金、期货、债券、外汇和行业大数据,同时还包括数字货币行情等区块链数据,为各类金融投资和研究人员提供适用的数据和工具。我也曾用tushare的数据完成大作业、毕业论文等的数据收集工作,在日常工作中也常用tushare进行股票等资产的筛选。
2.tushare官网链接:https://tushare.pro
3.数据获取代码演示
通过api可以非常方便地获取交易日列表、指数成分股和指数日度收益率数据,并在此数据基础上进行指数增强、趋势择时等量化策略的设计。输入参数后,接口返回的数据是DataFrame格式,整理起来很便捷;下载后还可以保存到本地,之后读取更加快速。
import pandas as pd
import numpy as np
from datetime import datetime
import os
from sklearn.model_selection import cross_val_score
import statsmodels.api as sm
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
import tushare as ts
# Configure matplotlib so that Chinese text and the minus sign display correctly.
# Base typography first: font family, font size and the math-text font set.
plt.rcParams["font.family"] = "STIXGeneral"
plt.rcParams["font.size"] = 10
plt.rcParams["mathtext.fontset"] = "cm"
# Then the CJK-related settings, applied in the same order as before.
plt.rcParams.update({
    "font.family": "Heiti TC",  # replaces the "STIXGeneral" family assigned above
    "font.sans-serif": ["SimHei"],
    "axes.unicode_minus": False,
})
# data_download: the visible portion pulls the trading calendar, the CSI 300
# constituent list and CSI 300 daily closes from tushare, then derives a
# period-over-period index return sampled on the first trading day of each month.
# NOTE(review): the indentation of the function body appears to have been lost in
# this copy; the lines below are presumably the body of data_download and must be
# re-indented before the file can run. No return statement is visible here, so
# the function may continue beyond this excerpt.
def data_download():
print('————————————sample choose————————————')
# Stock pool
# Sample window used for both the trading calendar and the index price request.
start_time = "2010-01-01"
end_time = "2022-12-31"  # the last date returned by the tushare API was 2021-12-31
# Log in to the tushare API and the choice API
# (only tushare calls are visible in this excerpt).
# The token/account strings below are placeholders to be replaced by the user.
ts.set_token('你的token')
pro = ts.pro_api('账号')
print('————————————pre_setting————————————')
# Pick out the last trading day of each month in the sample window
# (the code below builds both the first and the last trading day per month).
# Request only the cal_date column for rows with is_open='1'
# (presumably "exchange open" days — confirm against the tushare trade_cal docs).
list_trade = pro.trade_cal(exchange='', is_open='1',
start_date=start_time,
end_date=end_time,
fields='cal_date')
# Order the calendar chronologically and renumber the rows from 0.
list_trade.sort_values(by="cal_date" , inplace=True, ascending=True)
list_trade.reset_index(drop=True,inplace=True)
df_time = list_trade
df_time = df_time.rename(columns={'cal_date':'Date'})
# Convert to datetime so year/month/day components can be extracted.
df_time['Date'] = df_time['Date'].astype('datetime64[ns]')
df_time['Year'] = df_time['Date'].map(lambda x: x.year)
df_time['Month'] = df_time['Date'].map(lambda x: x.month)
df_time['Day'] = df_time['Date'].map(lambda x: x.day)
# String copy of the date column.
df_time['Date_0'] = df_time['Date'].astype('str')
# Ascending sort + drop_duplicates (which keeps the first occurrence by default)
# leaves the earliest calendar row of every (Year, Month) pair.
df_first = df_time.sort_values(by=['Date'], ascending=True)
df_first = df_first.drop_duplicates(subset=['Year', 'Month']).sort_values(by=['Date'], ascending=True)
first_time = list(df_first['Date'].astype('str'))
# Descending sort + drop_duplicates leaves the latest calendar row of every
# (Year, Month) pair; the result is then put back into chronological order.
df_last = df_time.sort_values(by=['Date'], ascending=False)
df_last = df_last.drop_duplicates(subset=['Year', 'Month']).sort_values(by=['Date'], ascending=True)
# NOTE(review): last_time is not used in the visible part of the function.
last_time = list(df_last['Date'].astype('str'))
# Sample selection
# List of CSI 300 index constituent codes
# The constituent snapshot date is hard-coded and independent of start_time/end_time.
hs300 = pro.index_weight(index_code='000300.SH', start_date='20230201',end_date='20230201')  # CSI 300 constituents
# Persist the constituent table to the current working directory.
hs300.to_csv('hs300_data.csv')
hs300_code = list(hs300['con_code'])
# Get the CSI 300 index return to use as a benchmark
hs_return = pro.index_daily(**{"ts_code": "000300.SH","trade_date": "","start_date": start_time,"end_date": end_time,"limit": "","offset": ""}, fields=["ts_code","trade_date","close"])
# Order the index closes chronologically and renumber the rows from 0.
hs_return.sort_values(by="trade_date" , inplace=True, ascending=True)
hs_return.reset_index(drop=True,inplace=True)
# Build a string date column in the same way first_time was built, so the
# isin() comparison below matches string against string.
hs_return['Date'] = hs_return['trade_date'].astype('datetime64[ns]').astype('str')
# Keep only rows dated on a month's first trading day, skipping the first month
# of the sample (first_time[1:]). NOTE(review): the filter uses first_time even
# though the earlier comment talks about the last trading day — confirm intent.
hs_return = hs_return[hs_return['Date'].isin(first_time[1:])][['close', 'Date']]
# -diff(-1) is (next retained close - current close); dividing by the current
# close gives the forward return to the next retained row. The final row has no
# successor and is therefore NaN.
# NOTE(review): assigning a column to the filtered frame from the previous line
# may raise pandas' SettingWithCopyWarning; consider adding .copy() there.
hs_return['HS_INDEX'] = (-hs_return['close'].diff(-1)) / (hs_return['close'])