# Change note: added retrieval of the profit statements and the new total_profit_to_cost_ratio factor.
import os.path
import akshare as ak
from functools import wraps
import _pickle as cPickle
import pandas as pd
import time
# pip3 install akshare --upgrade
# print(df)
# fac_list = ['Price1Y', 'total_profit_to_cost_ratio', 'VOL120']
# Names of the factor columns fed to the regression as explanatory variables.
fac_list = ['Price1Y', 'total_profit_to_cost_ratio', 'VOL120']
# Date range (YYYYMMDD strings) passed to the daily price-history download.
start = '20130101'
end = '20181231'
# 股票列表
# stock_zh_a_spot_em_df = ak.stock_zh_a_spot_em()
# print(stock_zh_a_spot_em_df)
# raise Exception(1111)
# for stk in stock_zh_a_spot_em_df['代码'].values:
# print(stk)
# daily_k = ak.stock_zh_a_hist(symbol=stk, period="daily", start_date=start, end_date=end, adjust="")
# # 日期列表
# df = pro.trade_cal(exchange='', start_date=start, end_date=end,
# fields='exchange,cal_date,is_open,pretrade_date', is_open='1')
# date_list = df['cal_date'].to_list()[::-1]
# print(df)
def cache_df(fname, do=True):
    """Build a decorator that caches a function's result in a pickle file.

    On every call of the decorated function:

    * if ``fname`` exists as a file, the pickled object is loaded and
      returned; the wrapped function is NOT called and its arguments are
      ignored (the cache is keyed by file name only);
    * otherwise the wrapped function is called and, when ``fname`` is truthy
      and ``do`` is true, the result is pickled to ``fname``.

    Args:
        fname: Path of the pickle file used as the cache.
        do: Whether a freshly computed result is written to ``fname``. An
            existing cache file is still read when ``do`` is false.

    Returns:
        A decorator that preserves the wrapped function's metadata.
    """
    def __inner(func):
        @wraps(func)
        def wrap(*args, **kwargs):
            if os.path.isfile(fname):
                print("from file")
                # NOTE: unpickling can execute arbitrary code; only point
                # ``fname`` at cache files this script wrote itself.
                # ``with`` closes the handle, which the bare open() did not.
                with open(fname, "rb") as cache_file:
                    return cPickle.load(cache_file)

            print("from func")
            data = func(*args, **kwargs)
            # Re-check the path: the wrapped call may have created the file.
            if fname and do and not os.path.isfile(fname):
                print("to file")
                with open(fname, 'wb') as cache_file:
                    cPickle.dump(data, cache_file)
            return data
        return wrap
    return __inner
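# Price1Y: current price relative to its one-year average, i.e. close / mean(close over the past year (250 trading days)) - 1
# total_profit_to_cost_ratio: profit-to-cost ratio = total profit / (operating cost + financial expenses + selling expenses + administrative expenses); all items use TTM values
# VOL120: 120-day average turnover rate, i.e. the mean of the daily turnover rate over 120 days, in %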
@cache_df('./ak-20120101-20221231-profit.pkl')
def get_profit_table():
    """Download and stack the quarterly ``ak.stock_lrb_em`` tables for 2012-2022.

    One request is made per quarter-end date (0331, 0630, 0930, 1231) of each
    year from 2012 through 2022, with a 0.1 s pause after each request. Every
    downloaded table and its column index are printed as they arrive.

    Returns:
        All downloaded tables concatenated and sorted by index. The result is
        cached on disk by the ``cache_df`` decorator.
    """
    report_dates = [
        f'{year}{month_day}'
        for year in range(2012, 2023)
        for month_day in ('0331', '0630', '0930', '1231')
    ]

    frames = []
    for date in report_dates:
        quarter_df = ak.stock_lrb_em(date=date)
        print(quarter_df)
        print(quarter_df.columns)
        frames.append(quarter_df)
        # Brief pause between consecutive requests.
        time.sleep(0.1)
    return pd.concat(frames).sort_index()
# Load the stacked quarterly profit statements (from the pickle cache when present).
profit_df = get_profit_table()
# print(profit_df[['营业总支出-营业支出','营业总支出-销售费用', '营业总支出-管理费用', '营业总支出-财务费用', '营业总支出-营业总支出',]])
# Factor value: total profit divided by the total-operating-expense column.
# NOTE(review): the factor description earlier in this file defines the
# denominator as operating cost + financial + selling + administrative
# expenses, whereas this uses the single '营业总支出-营业总支出' column — confirm
# that is intended. A zero denominator yields inf/NaN here.
profit_df['total_profit_to_cost_ratio'] = profit_df['利润总额'] / profit_df['营业总支出-营业总支出']
# Disabled alternative that sums the four expense items. As written it lacks
# parentheses around the sum, so it would divide by the first item only.
# profit_df['total_profit_to_cost_ratio'] = profit_df['利润总额'] / profit_df['营业总支出-营业支出'] + profit_df['营业总支出-销售费用'] + \
# profit_df['营业总支出-管理费用'] + profit_df['营业总支出-财务费用']
# Drop the listed raw statement columns now that the factor has been computed.
profit_df = profit_df.drop(['序号', '股票简称', '净利润', '净利润同比', '营业总收入', '营业总收入同比', '营业总支出-营业支出',
'营业总支出-销售费用', '营业总支出-管理费用', '营业总支出-财务费用', '营业总支出-营业总支出', '营业利润', '利润总额', ], axis=1)
# raise Exception(11)
import tushare as ts
def get_5d_change_pct(code, subdf):
    """Compute factors and the forward return for one stock, joined with profit data.

    Columns added to the price history (sorted by '日期'):

    * ``Price1Y``: close / 250-row rolling mean of close, minus 1.
    * ``VOL120``: 120-row rolling mean of '换手率'.
    * ``5d_close``: the close 4 rows ahead.
    * ``5d_change_pct``: for row t, (close[t+5] - open[t+1]) / open[t+1].

    Each price row is then matched with the latest row of the module-level
    ``profit_df`` for ``code`` whose '公告日期' is on or before the row's '日期'
    (backward ``merge_asof``).

    Args:
        code: Stock code, compared against ``profit_df['股票代码']``.
        subdf: Daily price rows of that stock with columns '日期', '开盘',
            '收盘' and '换手率'. The caller's frame is not modified.

    Returns:
        The merged frame without its first 249 rows (incomplete 250-row
        window) and last 5 rows (no forward return available).
    """
    # Convert to datetime BEFORE sorting so the order is chronological whatever
    # the incoming dtype is, and drop null announcement dates: merge_asof needs
    # sorted, non-null keys on both sides.
    announcements = profit_df[profit_df['股票代码'] == code].copy()
    announcements['公告日期'] = pd.to_datetime(announcements['公告日期'])
    announcements = announcements.dropna(subset=['公告日期']).sort_values('公告日期')

    subdf = subdf.copy()
    subdf['日期'] = pd.to_datetime(subdf['日期'])
    subdf = subdf.sort_values('日期')

    close = subdf['收盘']
    subdf['Price1Y'] = (close / close.rolling(250).mean()) - 1
    subdf['VOL120'] = subdf['换手率'].rolling(120).mean()

    # shift(-4) on the close combined with shift(-1) on the ratio gives:
    # enter at the next row's open, exit at the close five rows ahead.
    subdf['5d_close'] = close.shift(-4)
    subdf['5d_change_pct'] = ((subdf['5d_close'] - subdf['开盘']) / subdf['开盘']).shift(-1)

    merged = pd.merge_asof(subdf, announcements, left_on='日期', right_on='公告日期')
    return merged.iloc[249:-5]
@cache_df('./ak-20130101-20181231-after-close.pkl')
def get_his_close_all():
    """Download the daily history of every code listed by ``ak.stock_zh_a_spot_em``.

    For each value of the '代码' column, ``ak.stock_zh_a_hist`` is called with
    ``period="daily"``, ``adjust="hfq"`` and the module-level ``start`` /
    ``end`` dates. Non-empty results are tagged with a '代码' column and
    collected; there is a 0.1 s pause after each request.

    Note: the cache file name hard-codes the date range, so it has to be kept
    in sync with ``start`` / ``end`` by hand.

    Returns:
        All collected histories concatenated and sorted by index.
    """
    spot_df = ak.stock_zh_a_spot_em()
    print(spot_df)

    histories = []
    for symbol in spot_df['代码'].values:
        history = ak.stock_zh_a_hist(
            symbol=symbol,
            period="daily",
            start_date=start,
            end_date=end,
            adjust="hfq",
        )
        if not history.empty:
            history['代码'] = symbol
            histories.append(history)
        # Brief pause between consecutive requests.
        time.sleep(0.1)
    return pd.concat(histories).sort_index()
# Daily price history for all listed codes (read from the pickle cache when it exists).
df = get_his_close_all()
@cache_df('./ak-20130101-20181231-hfq-merge.pkl')
def merge_data(df):
    """Add size columns to ``df`` and build the per-stock factor table.

    Two columns are written onto ``df`` itself (the caller's frame is
    modified): '流通股本' = '成交量' / '换手率' and '流通市值' = '流通股本' * '收盘'.
    The frame is then split by '代码' and each group is passed through
    ``get_5d_change_pct``.

    Args:
        df: Daily price rows containing at least '代码', '成交量', '换手率' and
            '收盘', plus the columns ``get_5d_change_pct`` needs.

    Returns:
        The per-stock results concatenated and sorted by index. Because of the
        ``cache_df`` decorator a cached result is returned without looking at
        ``df`` when the cache file already exists.
    """
    # presumably volume / turnover rate is proportional to the free-float
    # share count — TODO confirm the units of both columns
    float_shares = df['成交量'] / df['换手率']
    df['流通股本'] = float_shares
    df['流通市值'] = df['流通股本'] * df['收盘']
    print(df.columns)

    per_stock = [
        get_5d_change_pct(stock_code, stock_rows)
        for stock_code, stock_rows in df.groupby('代码')
    ]
    return pd.concat(per_stock).sort_index()
# Per-stock factor table (read from the pickle cache when it exists).
xx = merge_data(df)
print(xx)
# xx.reset_index(inplace=True, drop=True)
print(xx.shape)
# Discard every row dated in April.
# NOTE(review): presumably to avoid the annual-report season — confirm the reason.
xx = xx[xx['日期'].dt.month != 4]
# Keep Fridays only (pandas dayofweek: Monday=0, so 4 is Friday).
xx = xx[xx['日期'].dt.dayofweek == 4]
# raise Exception(111)
print(xx.shape)
# xx.set_index('日期', inplace=True)
# xx.fillna(0, inplace=True)
import numpy as np
# Outlier clipping (median absolute deviation)
def mad(factor):
    """Clip values to median +/- 3 * 1.4826 * MAD, ignoring NaN.

    The median and the median absolute deviation are computed over the
    non-NaN entries. With a plain median a single NaN turns both bounds into
    NaN and the clipping silently does nothing. NaN entries stay NaN.

    Args:
        factor: One-dimensional numeric data (array, list or pandas Series).

    Returns:
        A float ``numpy.ndarray`` of the same length with values outside the
        band replaced by the nearest bound.
    """
    values = np.asarray(factor, dtype=float)
    center = np.nanmedian(values)
    spread = np.nanmedian(np.abs(values - center))
    half_width = 3 * 1.4826 * spread
    return np.clip(values, center - half_width, center + half_width)
# Standardisation (z-score)
def stand(factor):
    """Return ``(factor - mean) / std`` and print the mean.

    ``factor`` must provide ``mean()`` and ``std()`` (e.g. a pandas Series or
    a NumPy array); whichever ``std`` definition that object implements is
    the one used.
    """
    center = factor.mean()
    print(center)
    scale = factor.std()
    shifted = factor - center
    return shifted / scale
def one(factor):
    """Min-max scale ``factor`` so its minimum maps to 0 and its maximum to 1."""
    lowest = factor.min()
    value_range = factor.max() - lowest
    return (factor - lowest) / value_range
# for name in fac_list:
# xx[name] = mad(xx[name])
# xx[name] = stand(xx[name])
# xx['5d_change_pct'] = stand(xx['5d_change_pct'])
# Regression target: a copy of the forward-return column.
# NOTE(review): xx is the result of boolean filtering, so depending on the
# pandas version this assignment may emit SettingWithCopyWarning — verify.
xx['yy'] = xx['5d_change_pct']
# Drop every row that has a missing value in ANY column, in place.
xx.dropna(axis=0, how='any', inplace=True)
# xx = xx.sort_values('5d_change_pct', ascending=False)
def resort_some(subdf):
    """Select a subset of one group in two ranking passes.

    1. Sort by '5d_change_pct' descending and keep the first third of rows.
    2. Sort those by '流通市值' ascending and keep the first third again.

    Thirds are rounded down, so groups with fewer than nine rows return an
    empty frame. The input frame is left untouched.

    Returns:
        The selected rows with a fresh 0..k-1 index.
    """
    by_return = subdf.sort_values("5d_change_pct", ascending=False).reset_index(drop=True)
    best_third = by_return.head(len(by_return) // 3)

    by_cap = best_third.sort_values("流通市值").reset_index(drop=True)
    return by_cap.head(len(by_cap) // 3)
# Apply the two-pass selection in resort_some separately to each trading date.
# NOTE(review): how groupby(...).apply treats the grouping column and the
# shape of the resulting index differ between pandas versions — verify
# against the installed version.
xx = xx.groupby('日期').apply(resort_some)
# xx = xx[xx["yy"] > 0]
# xx = xx.sort_values('yy', ascending=False)
# xx['yy'] = one(xx['yy'])
# xx.reset_index(drop=True, inplace=True)
# xx = xx.iloc[:int(3017468 / 5)]
# xx = xx.iloc[:int(len(xx) / 5)]
# aa = xx.iloc[:int(3017468 / 10)]
# bb = xx.iloc[-1 * int(3017468 / 10):]
# xx = pd.concat([aa, bb]).sort_index()
# print(profit_df.columns)
# print(profit_df.head())
# print(xx.columns)
# print(xx.head())
# # aa = pd.merge_asof(xx, profit_df,left_on=['代码','日期'],right_on=['股票代码','公告日期'],)
from sklearn.linear_model import LinearRegression, BayesianRidge, SGDRegressor
#
# 2.2.1 求解
# Fit a Bayesian ridge regression of the target on the factor columns.
#
# The original call was BayesianRidge(normalize=True). The `normalize`
# parameter was deprecated in scikit-learn 1.0 and removed in 1.2, where
# passing it raises TypeError. The steps below perform the same preprocessing
# by hand (centre each factor, divide by its column L2 norm) so factors with
# different units do not carry an implicit weight, then map the fitted
# coefficients back to raw factor units so printed values stay comparable
# with earlier runs.
features = xx.loc[:, fac_list]
target = xx.loc[:, 'yy']
print(xx.shape)
print(features)
print(target)

feature_mean = features.mean()
centered = features - feature_mean
feature_norm = np.sqrt((centered ** 2).sum())
# A constant column has zero norm; leave it unscaled instead of dividing by 0.
feature_norm = feature_norm.where(feature_norm != 0, 1.0)

lr = BayesianRidge()
lr.fit(centered / feature_norm, target)

# Express the model in raw factor units so lr.predict() accepts unscaled
# factor values.
# NOTE(review): lr.sigma_ stays in the scaled space, so
# predict(..., return_std=True) on raw inputs is not meaningful.
lr.coef_ = lr.coef_ / feature_norm.to_numpy()
lr.intercept_ = target.mean() - np.dot(feature_mean.to_numpy(), lr.coef_)

print("LinearRegression 线性回归的回归常数(w0):", lr.intercept_)
print("LinearRegression 线性回归的回归系数(wi)(即各因子的权重):", lr.coef_)
# 0.02444971 -0.00079505 0.00437738
# 0.00624086 -0.00072835 0.00789871
# [ 0.03942262,-0.00043044,0.00404489]
# 0.03120765,-0.00099382,0.00376255
# 0.03611993,-0.00041392 ,0.00421672
# 0.02854157 -0.00110044 0.00389443
# 0.02390334 -0.0013525 0.00037847
# 0.0243589 -0.00093434 0.00317006
# 0.05349193 -0.0010121 0.00295832
# 0.0555618 -0.00054349 0.00305459
# 0.04933447,-0.00108215,0.00272365
# 0.03524537 -0.00151333 0.00396425
# 0.05266273 - 0.0013247 0.00322452
# 0.05278181 -0.06279844 0.0010868
# -0.11372556,-0.30733683, 0.00318273
# 0.05408951,-0.0411778,-0.00294783
# 0.03985932 -0.02601888 -0.00251486
# 0.05553055 -0.0308877 0.00124696
# 0.05505635 -0.02279492 0.00222791
# 0.05266273 - 0.0013247 0.00322452
# 0.04446464,-0.00046872,0.00444217
# 0.02809314,-0.00115244,0.00345516
# 0.02945875,-0.00096102,0.00347765
# -2.12801429e-03,-6.13296055e-05,3.23480933e-04
# 0.02944592, -0.00096051, 0.00347614
# 0.02825109,-0.0015179,0.00335322
# -9.25829047e-04 5.07501655e-05 6.20132305e-04
# 0.01647885,-0.00066779 ,0.00232331
# 0.02819501 ,-0.0055081 , 0.01493451
# -0.00078882,-0.00017121 ,0.00109758
# -2.85955997e-03 -7.84713361e-05 4.75946331e-04s
# -1.08461012e-03,-6.55195958e-05,5.41337829e-05
# 0.03191417,-0.00063483,0.00445406
from itertools import combinations, combinations_with_replacement
# Candidate weight values for a single factor.
example = [-1, -0.6, -0.2, 0.2, 0.6, 1]
# Every unordered choice of three weights, repetition allowed, in the order
# itertools generates them.
conditions = list(combinations_with_replacement(example, 3))
for answer in conditions:
    print(answer)
print(len(conditions))