akshare

柳木木_kylin
已于 2023-05-26 16:36:54 修改
阅读量620
点赞数
文章标签： python 开发语言
于 2023-05-17 16:36:24 首次发布
本文链接：https://blog.csdn.net/u010136741/article/details/130729330
版权
该代码示例展示了如何使用Python的akshare库获取股票历史数据和财务报表信息，并进行数据缓存，计算股票的Price1Y、total_profit_to_cost_ratio等因子，以及处理5日收益率变化。同时，代码还涉及到了数据清洗、合并和预处理，包括去除极端值和标准化处理。
摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >
新增获取profit，新增 total_profit_to_cost_ratio 因子
import os.path

import akshare as ak
from functools import wraps
import _pickle as cPickle
import pandas as pd
import time

# pip3 install akshare --upgrade


# print(df)


# fac_list = ['Price1Y', 'total_profit_to_cost_ratio', 'VOL120']
# Factor columns used as the regressors when the model is fitted further down.
fac_list = ['Price1Y', 'total_profit_to_cost_ratio', 'VOL120']
# Date range (YYYYMMDD strings) passed to ak.stock_zh_a_hist as start_date / end_date.
start = '20130101'
end = '20181231'


# 股票列表
# stock_zh_a_spot_em_df = ak.stock_zh_a_spot_em()
# print(stock_zh_a_spot_em_df)
# raise Exception(1111)

# for stk in stock_zh_a_spot_em_df['代码'].values:
#     print(stk)
#     daily_k = ak.stock_zh_a_hist(symbol=stk, period="daily", start_date=start, end_date=end, adjust="")
# # 日期列表
# df = pro.trade_cal(exchange='', start_date=start, end_date=end,
#                    fields='exchange,cal_date,is_open,pretrade_date', is_open='1')


# date_list = df['cal_date'].to_list()[::-1]
# print(df)


def cache_df(fname, do=True):
    """Build a decorator that caches a function's return value in a pickle file.

    On each call of the decorated function:

    * if ``fname`` names an existing file, the pickled object is loaded and
      returned without calling the function;
    * otherwise the function is called and, when ``do`` is true, its result
      is pickled to ``fname``.

    The cache is keyed on the file name only; call arguments are ignored
    when deciding whether to reuse the file.

    Args:
        fname: Path of the cache file. A falsy value (``None`` or ``""``)
            disables caching entirely.
        do: When false, a freshly computed result is not written to disk.

    Returns:
        A decorator preserving the wrapped function's metadata.
    """

    def __inner(func):
        @wraps(func)
        def wrap(*args, **kwargs):
            # Guard on ``fname`` first: os.path.isfile(None) raises TypeError.
            if fname and os.path.isfile(fname):
                print("from file")
                # NOTE: unpickling executes arbitrary code; only point this at
                # cache files produced by this script.
                with open(fname, "rb") as cache_file:
                    return cPickle.load(cache_file)

            print("from func")
            data = func(*args, **kwargs)
            # Re-check the file so a cache created while ``func`` ran is kept.
            if fname and do and not os.path.isfile(fname):
                print("to file")
                with open(fname, 'wb') as cache_file:
                    cPickle.dump(data, cache_file)
            return data

        return wrap

    return __inner


# Price1Y	当前股价除以过去一年股价均值再减1	当日收盘价 / mean(过去一年(250天)的收盘价) -1
# total_profit_to_cost_ratio	成本费用利润率	成本费用利润率=利润总额/(营业成本+财务费用+销售费用+管理费用)，以上科目使用的都是TTM的数值
# VOL120	120日平均换手率	120日换手率的均值,单位为%

@cache_df('./ak-20120101-20221231-profit.pkl')
def get_profit_table():
    """Download one table per quarter-end date and stack them.

    Calls ``ak.stock_lrb_em`` for every quarter-end date from 2012-03-31
    through 2022-12-31, printing each table and its columns as it arrives.
    (Presumably these are income-statement tables; confirm against akshare.)

    Returns:
        The concatenation of all downloaded frames, sorted by index label.
        The result is cached on disk by the ``cache_df`` decorator.
    """
    report_dates = [
        f'{year}{md}'
        for year in range(2012, 2023)
        for md in ['0331', '0630', '0930', '1231']
    ]

    frames = []
    for date in report_dates:
        quarter_df = ak.stock_lrb_em(date=date)
        print(quarter_df)
        print(quarter_df.columns)
        frames.append(quarter_df)
        # Short pause between consecutive requests.
        time.sleep(0.1)
    return pd.concat(frames).sort_index()


# Load the stacked quarterly tables (from the cache file when it exists).
profit_df = get_profit_table()
# Factor = total profit / total operating expenses, taken from the single
# reported quarter of each row.
# NOTE(review): the factor description in this file asks for TTM values and a
# denominator of operating cost + financial + selling + administrative
# expenses; this uses the '营业总支出-营业总支出' column instead — confirm that
# approximation is intended.
profit_df['total_profit_to_cost_ratio'] = profit_df['利润总额'].div(profit_df['营业总支出-营业总支出'])
# Drop the raw statement columns once the factor has been derived.
profit_df = profit_df.drop(
    columns=[
        '序号', '股票简称', '净利润', '净利润同比', '营业总收入', '营业总收入同比',
        '营业总支出-营业支出', '营业总支出-销售费用', '营业总支出-管理费用',
        '营业总支出-财务费用', '营业总支出-营业总支出', '营业利润', '利润总额',
    ]
)

# raise Exception(11)

import tushare as ts


def get_5d_change_pct(code, subdf):
    """Build the factor columns and forward return for one stock.

    Args:
        code: Stock code used to select this stock's rows from the
            module-level ``profit_df`` (matched on '股票代码').
        subdf: Daily bars of that stock; the columns '日期', '收盘', '开盘'
            and '换手率' are read.

    Returns:
        The date-sorted bars with these columns added, joined to the latest
        ``profit_df`` row whose '公告日期' is on or before each bar's '日期':

        * ``Price1Y``: close / 250-row rolling mean of close, minus 1.
        * ``VOL120``: 120-row rolling mean of '换手率'.
        * ``5d_close``: the close four rows ahead.
        * ``5d_change_pct``: for row t, (close[t+5] - open[t+1]) / open[t+1].

        The first 249 rows (incomplete 250-row window) and the last 5 rows
        (incomplete forward return) are sliced off.
    """
    reports = profit_df[profit_df['股票代码'] == code].sort_values("公告日期")
    bars = subdf.sort_values("日期")

    close_px = bars['收盘']
    open_px = bars['开盘']
    bars['Price1Y'] = close_px / close_px.rolling(250).mean() - 1
    bars['VOL120'] = bars['换手率'].rolling(120).mean()
    bars['5d_close'] = close_px.shift(-4)
    # The extra shift(-1) moves the entry to the NEXT row's open.
    bars['5d_change_pct'] = ((bars['5d_close'] - open_px) / open_px).shift(-1)

    # merge_asof needs both keys as comparable, sorted datetimes.
    bars['日期'] = pd.to_datetime(bars['日期'])
    reports['公告日期'] = pd.to_datetime(reports['公告日期'])
    # Default direction is backward, so only already-announced reports are used.
    joined = pd.merge_asof(bars, reports, left_on='日期', right_on='公告日期')
    return joined.iloc[249:-5]


@cache_df('./ak-20130101-20181231-after-close.pkl')
def get_his_close_all():
    """Download daily bars for every code in the current spot listing.

    The code list comes from ``ak.stock_zh_a_spot_em()``; each code is then
    queried through ``ak.stock_zh_a_hist`` with ``adjust="hfq"`` over the
    module-level ``start``..``end`` range. Codes returning no rows are skipped.

    Returns:
        All non-empty per-code frames concatenated and sorted by index label,
        each tagged with a '代码' column. Cached on disk by ``cache_df``.
    """

    def _fetch_daily(symbol):
        # Return the tagged bars of one code, or None when nothing came back.
        bars = ak.stock_zh_a_hist(symbol=symbol, period="daily", start_date=start, end_date=end, adjust="hfq")
        if bars.empty:
            return None
        bars['代码'] = symbol
        return bars

    spot = ak.stock_zh_a_spot_em()
    print(spot)

    frames = []
    for symbol in spot['代码'].values:
        bars = _fetch_daily(symbol)
        if bars is not None:
            frames.append(bars)
        # Short pause between consecutive requests.
        time.sleep(0.1)

    return pd.concat(frames).sort_index()


# Daily bars for all codes; served from the pickle cache when the file exists.
df = get_his_close_all()


@cache_df('./ak-20130101-20181231-hfq-merge.pkl')
def merge_data(df):
    """Add float-share / float-cap columns and build per-stock factor frames.

    Two columns are written into ``df`` in place:

    * '流通股本' = '成交量' / '换手率'
    * '流通市值' = '流通股本' * '收盘'

    NOTE(review): rows where '换手率' is 0 yield inf/NaN here, and '收盘' is
    whatever price series the caller loaded (the script requests adjusted
    prices) — confirm both are acceptable for a size proxy.

    Args:
        df: Daily bars for all stocks with a '代码' column.

    Returns:
        The per-code outputs of ``get_5d_change_pct`` concatenated and sorted
        by index label. Cached on disk by ``cache_df``; the cache does not
        depend on ``df``.
    """
    float_shares = df['成交量'] / df['换手率']
    df['流通股本'] = float_shares
    df['流通市值'] = float_shares * df['收盘']
    print(df.columns)

    per_stock = [
        get_5d_change_pct(code, group)
        for code, group in df.groupby('代码')
    ]
    return pd.concat(per_stock).sort_index()


# Build (or load from cache) the merged factor table.
xx = merge_data(df)

print(xx)
print(xx.shape)
# Keep only Friday rows (pandas dayofweek: Monday=0, so 4 is Friday) that do
# not fall in April.
# NOTE(review): April is presumably excluded because of report-season
# announcements — confirm the intent.
xx = xx[(xx['日期'].dt.month != 4) & (xx['日期'].dt.dayofweek == 4)]
print(xx.shape)
# xx.set_index('日期', inplace=True)
# xx.fillna(0, inplace=True)
import numpy as np


# Outlier clipping (winsorization) by median absolute deviation.
def mad(factor):
    """Clip values to median ± 3 * 1.4826 * MAD.

    The median and the median absolute deviation are computed with NaNs
    ignored, so missing values no longer disable the clipping; NaN entries
    are passed through unchanged.

    Args:
        factor: 1-D array-like of numbers (NumPy array, pandas Series, list).

    Returns:
        A float ``numpy.ndarray`` of the same length with values above the
        upper bound or below the lower bound replaced by that bound.
    """
    values = np.asarray(factor, dtype=float)
    center = np.nanmedian(values)
    spread = np.nanmedian(np.abs(values - center))
    # 1.4826 scales the MAD to be comparable with a standard deviation
    # under normality; 3 is the number of such units tolerated.
    upper = center + (3 * 1.4826 * spread)
    lower = center - (3 * 1.4826 * spread)
    clipped = np.where(values > upper, upper, values)
    return np.where(clipped < lower, lower, clipped)


# Standardization (z-score).
def stand(factor):
    """Return ``factor`` centered on its mean and divided by its ``std()``.

    The mean is printed as a side effect. ``std()`` is whatever the input
    type provides, so the result follows that type's default degrees of
    freedom.
    """
    center = factor.mean()
    print(center)
    centered = factor - center
    return centered / factor.std()


def one(factor):
    """Min-max scale ``factor`` so its minimum maps to 0 and its maximum to 1."""
    lowest = factor.min()
    span = factor.max() - lowest
    return (factor - lowest) / span


# for name in fac_list:
#     xx[name] = mad(xx[name])
#     xx[name] = stand(xx[name])
# xx['5d_change_pct'] = stand(xx['5d_change_pct'])
# Regression target: a copy of the forward-return column.
xx['yy'] = xx['5d_change_pct']
# Drop every row that has a missing value in any column, in place.
xx.dropna(axis=0, how='any', inplace=True)


# xx = xx.sort_values('5d_change_pct', ascending=False)


def resort_some(subdf):
    """Select a sub-sample of one cross-section in two passes.

    First keep the top third of rows by '5d_change_pct' (highest first),
    then, among those, keep the third with the smallest '流通市值'. Each
    third is ``int(len / 3)`` rows, so small inputs can come back empty.

    Args:
        subdf: DataFrame with '5d_change_pct' and '流通市值' columns.

    Returns:
        The selected rows, ordered by ascending '流通市值', with a fresh
        0-based index.
    """

    def _leading_third(frame):
        # Renumber the rows, then keep the first floor(len / 3) of them.
        frame = frame.reset_index(drop=True)
        return frame.iloc[:int(len(frame) / 3)]

    best_returns = _leading_third(subdf.sort_values("5d_change_pct", ascending=False))
    return _leading_third(best_returns.sort_values("流通市值"))


# Run resort_some separately on each date's cross-section and stack the results.
# NOTE(review): '日期' is both the grouping key and a column of xx; how
# groupby.apply treats it (and the shape of the resulting index) depends on
# the pandas version — confirm before relying on the index.
xx = xx.groupby('日期').apply(resort_some)
# xx = xx[xx["yy"] > 0]
# xx = xx.sort_values('yy', ascending=False)

# xx['yy'] = one(xx['yy'])
# xx.reset_index(drop=True, inplace=True)
# xx = xx.iloc[:int(3017468 / 5)]
# xx = xx.iloc[:int(len(xx) / 5)]

# aa = xx.iloc[:int(3017468 / 10)]
# bb = xx.iloc[-1 * int(3017468 / 10):]
# xx = pd.concat([aa, bb]).sort_index()

# print(profit_df.columns)
# print(profit_df.head())
# print(xx.columns)
# print(xx.head())
# # aa = pd.merge_asof(xx, profit_df,left_on=['代码','日期'],right_on=['股票代码','公告日期'],)
from sklearn.linear_model import LinearRegression, BayesianRidge, SGDRegressor
from sklearn.preprocessing import StandardScaler

# 2.2.1 Fit the factor model.
# The factors are standardized before the regression so that differences in
# units do not act as implicit weights. The ``normalize=True`` argument that
# used to do this inside BayesianRidge was removed in scikit-learn 1.2 (it
# raises TypeError there), so the scaling is done explicitly instead.
scaler = StandardScaler()
features_std = scaler.fit_transform(xx.loc[:, fac_list])
lr = BayesianRidge()
# Alternative estimator tried previously:
# lr = SGDRegressor(eta0=0.005, max_iter=10000)

## Train (fit)
print(xx.shape)
print(xx.loc[:, fac_list])
print(xx.loc[:, 'yy'])
# X: the factor values on each sampled date; y: the forward-return label 'yy'.
lr.fit(features_std, xx.loc[:, 'yy'])

# ``lr`` now lives in standardized space. Map its parameters back to the raw
# factor units so the printed weights stay comparable with earlier runs:
#   y = b + sum(c_i * (x_i - mean_i) / scale_i)
#     = (b - sum(c_i / scale_i * mean_i)) + sum((c_i / scale_i) * x_i)
raw_coef = lr.coef_ / scaler.scale_
raw_intercept = lr.intercept_ - raw_coef @ scaler.mean_
print("LinearRegression 线性回归的回归常数(w0):", raw_intercept)
print("LinearRegression 线性回归的回归系数(wi)（即各因子的权重）:", raw_coef)

# 0.02444971 -0.00079505  0.00437738
# 0.00624086 -0.00072835  0.00789871
# [ 0.03942262,-0.00043044,0.00404489]
# 0.03120765,-0.00099382,0.00376255
# 0.03611993,-0.00041392 ,0.00421672
# 0.02854157 -0.00110044  0.00389443
# 0.02390334 -0.0013525   0.00037847
# 0.0243589  -0.00093434  0.00317006
# 0.05349193 -0.0010121   0.00295832
# 0.0555618  -0.00054349  0.00305459
# 0.04933447,-0.00108215,0.00272365

# 0.03524537 -0.00151333  0.00396425
# 0.05266273 - 0.0013247 0.00322452
# 0.05278181 -0.06279844  0.0010868
# -0.11372556,-0.30733683, 0.00318273
# 0.05408951,-0.0411778,-0.00294783
# 0.03985932 -0.02601888 -0.00251486
# 0.05553055 -0.0308877   0.00124696
# 0.05505635 -0.02279492  0.00222791
# 0.05266273 - 0.0013247 0.00322452
# 0.04446464,-0.00046872,0.00444217
# 0.02809314,-0.00115244,0.00345516

# 0.02945875,-0.00096102,0.00347765
# -2.12801429e-03,-6.13296055e-05,3.23480933e-04
# 0.02944592, -0.00096051, 0.00347614
# 0.02825109,-0.0015179,0.00335322
# -9.25829047e-04  5.07501655e-05  6.20132305e-04
# 0.01647885,-0.00066779 ,0.00232331
# 0.02819501 ,-0.0055081  , 0.01493451
# -0.00078882,-0.00017121 ,0.00109758
# -2.85955997e-03 -7.84713361e-05  4.75946331e-04s
# -1.08461012e-03,-6.55195958e-05,5.41337829e-05
# 0.03191417,-0.00063483,0.00445406
from itertools import combinations, combinations_with_replacement

# Grid of candidate values (presumably trial factor weights — confirm).
example = [-1, -0.6, -0.2, 0.2, 0.6, 1]

# Every unordered triple drawn from the grid, repeats allowed.
conditions = list(combinations_with_replacement(example, 3))
for answer in conditions:
    print(answer)

print(len(conditions))