tushare

最新推荐文章于 2024-07-25 00:08:05 发布

柳木木_kylin

最新推荐文章于 2024-07-25 00:08:05 发布

阅读量82

点赞数

文章标签： python 机器学习 数据分析

本文链接：https://blog.csdn.net/u010136741/article/details/130726820

版权

import os.path

import tushare as ts
from functools import wraps
import _pickle as cPickle
import pandas as pd
import time

# Configure the Tushare client. The literal 'token' is what this file passes;
# presumably a placeholder to be replaced with a real API token - TODO confirm.
ts.set_token('token')
pro = ts.pro_api()
# Trade-calendar query for 2018-09-01..2018-10-01 filtered with is_open='0'.
# NOTE(review): `df` is reassigned by the second trade_cal call below without
# being read in between, so this request looks like a leftover experiment.
df = pro.trade_cal(exchange='', start_date='20180901', end_date='20181001',
                   fields='exchange,cal_date,is_open,pretrade_date', is_open='0')
# print(df)


# fac_list = ['Price1Y', 'total_profit_to_cost_ratio', 'VOL120']
# Factor columns used as regressors by the linear regression at the end of the script.
fac_list = ['Price1Y', ]
# Date range of the trade-calendar query below (YYYYMMDD strings).
start = '20130101'
end = '20181231'
# Date list: calendar rows filtered with is_open='1' (presumably the open
# trading days - verify against the Tushare docs). The 'cal_date' column of
# this frame is what the download helpers iterate over.
df = pro.trade_cal(exchange='', start_date=start, end_date=end,
                   fields='exchange,cal_date,is_open,pretrade_date', is_open='1')


# date_list = df['cal_date'].to_list()[::-1]
# print(df)


def cache_df(fname, do=True):
    """Build a decorator that caches a function's result in a pickle file.

    On each call of the decorated function:

    * if ``fname`` is an existing file, the pickled object is loaded and
      returned and the wrapped function is NOT called (its arguments are
      ignored, so the cache is not keyed on them);
    * otherwise the wrapped function is called and, when ``fname`` is
      non-empty and ``do`` is true, its result is pickled to ``fname``.

    Args:
        fname: Path of the pickle cache file.
        do: When false, results are never written to ``fname`` (an already
            existing file is still read).

    Returns:
        A decorator preserving the wrapped function's metadata.
    """
    def __inner(func):
        @wraps(func)
        def wrap(*args, **kwargs):
            if os.path.isfile(fname):
                # load data from file
                # NOTE: unpickling executes arbitrary code; only point
                # `fname` at cache files this script wrote itself.
                print("from file")
                with open(fname, "rb") as cache_file:
                    return cPickle.load(cache_file)

            # get data from func
            print("from func")
            data = func(*args, **kwargs)
            # Re-check existence so a file that appeared while `func` was
            # running is left untouched, as before.
            if fname and do and not os.path.isfile(fname):
                # save data
                print("to file")
                with open(fname, 'wb') as cache_file:
                    cPickle.dump(data, cache_file)
            return data

        return wrap

    return __inner


# Price1Y	当前股价除以过去一年股价均值再减1	当日收盘价 / mean(过去一年(250天)的收盘价) -1
# total_profit_to_cost_ratio	成本费用利润率	成本费用利润率=利润总额/(营业成本+财务费用+销售费用+管理费用)，以上科目使用的都是TTM的数值
# VOL120	120日平均换手率	120日换手率的均值,单位为%

def get_5d_change_pct(subdf):
    """Add the Price1Y factor and the forward 5-day return label to one group.

    Rows are ordered by ``trade_date`` first. With ``t`` denoting a row
    position in that order, the added columns are:

    * ``Price1Y``: ``close[t] / mean(close[t-249 .. t]) - 1``
    * ``5d_close``: ``close[t+4]``
    * ``5d_change_pct``: ``(close[t+5] - open[t+1]) / open[t+1]``

    The first 249 rows (incomplete 250-row window) and the last 5 rows
    (label reaches past the end of the data) are dropped from the result.
    The input frame itself is not modified.
    """
    ordered = subdf.sort_values("trade_date")
    close_px = ordered['close']
    open_px = ordered['open']

    # Factor: today's close relative to its trailing 250-row average.
    trailing_mean = close_px.rolling(250).mean()
    ordered['Price1Y'] = (close_px / trailing_mean) - 1

    # Label: return from the open of a row to the close four rows later,
    # then pulled back one row so it lines up with the preceding row.
    exit_close = close_px.shift(-4)
    ordered['5d_close'] = exit_close
    open_to_exit = (exit_close - open_px) / open_px
    ordered['5d_change_pct'] = open_to_exit.shift(-1)

    warmup_rows = 249
    unlabeled_tail = 5
    return ordered.iloc[warmup_rows:-unlabeled_tail]


@cache_df('./20130101-20181231-close.pkl')
def get_his_close(df):
    """Download the daily bars for every date in ``df['cal_date']``.

    One ``pro.daily`` request is issued per date, with a 0.3 s pause after
    each request. The per-date frames are concatenated and the result is
    sorted by index. Through the decorator, the result is cached in
    './20130101-20181231-close.pkl'.
    """
    frames = []
    for trade_day in df['cal_date'].values:
        print(trade_day)
        frames.append(pro.daily(trade_date=trade_day))
        # Pause between consecutive API requests.
        time.sleep(0.3)

    combined = pd.concat(frames)
    return combined.sort_index()


# @cache_df('./20130101-20131231-close.pkl')
def get_his_close_all(df):
    """Collect the ``bak_daily`` snapshot for every date in ``df['cal_date']``.

    Each date is cached in its own pickle file, ``./data/<date>.pkl``. A
    date whose file already exists is loaded from disk without any API
    request; otherwise the snapshot is downloaded with ``pro.bak_daily``,
    written to the cache file, and followed by a 15 s pause.

    Args:
        df: Frame with a ``cal_date`` column listing the dates to fetch.

    Returns:
        All per-date frames concatenated and sorted by index.
    """
    all_daily_k = []
    for date in df['cal_date'].values:
        print(date)
        file_path = rf"./data/{date}.pkl"
        if os.path.isfile(file_path):
            # NOTE: unpickling executes arbitrary code; only load cache
            # files this script wrote itself.
            with open(file_path, "rb") as cache_file:
                daily_k = cPickle.load(cache_file)
        else:
            # daily_k = pro.daily(trade_date=date)
            daily_k = pro.bak_daily(trade_date=date,
                                    fields='ts_code,trade_date,name,pct_change,close,change,open,high,low,pre_close,vol_ratio,turn_over,swing,vol,amount,selling,buying,total_share,float_share,pe,industry,area,float_mv,total_mv,avg_price,strength,activity,avg_turnover,attack,interval_3,interval_6')

            # Create the cache directory on demand so a fresh checkout does
            # not lose the downloaded frame to a missing './data' folder.
            os.makedirs(os.path.dirname(file_path), exist_ok=True)
            with open(file_path, "wb") as cache_file:
                cPickle.dump(daily_k, cache_file)
            # Pause after each download before issuing the next request.
            time.sleep(15)
        all_daily_k.append(daily_k)

    return pd.concat(all_daily_k).sort_index()


# Replace the trade calendar held in `df` with the downloaded daily bars.
df = get_his_close(df)
print(df.columns)

# Compute the factor and the forward-return label separately for each stock.
xx = df.groupby('ts_code').apply(get_5d_change_pct)
print(xx)

# Discard the index produced by the group-wise apply.
xx = xx.reset_index(drop=True)
print(xx.shape)
# xx = xx.loc[xx['5d_change_pct']>0]
# print(xx.shape)
# print(xx.columns)
# Index by trade date and replace any remaining NaN with 0.
xx = xx.set_index('trade_date').fillna(0)

from sklearn.linear_model import LinearRegression

from sklearn.metrics import r2_score  # , median_absolute_error

# 2.2.1 Solve
# Ordinary least squares of the forward 5-day return on the columns in fac_list.
# The `normalize` argument was deprecated in scikit-learn 1.0 and removed in
# 1.2, so passing it raises TypeError on current releases. For plain OLS it
# did not change the reported fit, so it is simply dropped here. If factors
# with different units should get comparable weights, scale the columns
# explicitly (e.g. with sklearn.preprocessing.StandardScaler) before fitting.
lr = LinearRegression()
# Train (fit)
lr.fit(xx.loc[:, fac_list], xx.loc[:, '5d_change_pct'])  # X: factor columns of each row; y: that row's '5d_change_pct' label
print("LinearRegression 线性回归的回归常数(w0):", lr.intercept_)
print("LinearRegression 线性回归的回归系数(wi)（即各因子的权重）:", lr.coef_)