用sklearn分析基金数据<1>
python爬虫获取基金数据<2>
数据预处理:数据清洗、生成样本数据<3>
用sklearn训练样本数据<4>
用模型进行预测及改进<5>
前面是把12月数据按80%和20%分成训练集和测试集,测试结果还不错,现在来做实际应用,以11月数据作为训练集,12月数据作为测试集。具体代码如下:
import pandas as pd
import numpy as np

# Load the training-period and test-period half-month sample files.
# NOTE(review): the variable names suggested Nov/Dec, but the paths point at
# 2017-12 and tail_2017-09 — confirm these are the intended periods.
train_raw = pd.read_csv('./input/halfm/2017-12_data_halfmon.csv')
train_raw.index = train_raw.iloc[:, 0]
# Map class label 0 to -1 so the classifiers see a {-1, 1} target.
train_raw['calsstype'] = train_raw['calsstype'].replace([0], [-1])
data_train = train_raw

test_raw = pd.read_csv('./input/halfm/tail_2017-09_data_halfmon.csv')
test_raw.index = test_raw.iloc[:, 0]
test_raw['calsstype'] = test_raw['calsstype'].replace([0], [-1])
data_test = test_raw

# Column 6 is the class label; columns 1-5 are the five features.
Ytrain = data_train.iloc[:, 6]
Xtrain = data_train.iloc[:, 1:6]
Ytest = data_test.iloc[:, 6]
Xtest = data_test.iloc[:, 1:6]
from sklearn.ensemble import RandomForestClassifier
# FIX: sklearn.grid_search was deprecated in 0.18 and removed in 0.20;
# GridSearchCV now lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Random-forest baseline; oob_score gives a free out-of-bag accuracy estimate.
rf0 = RandomForestClassifier(oob_score=True, random_state=10)
rf0.fit(Xtrain, Ytrain)
y_testpred = rf0.predict(Xtest)
# Keep the test index (fund codes) so predictions can be joined back later.
newindex = Xtest.index
y_testpred = pd.DataFrame(y_testpred, index=newindex)
# SVC with a small grid over C and the RBF gamma, 4-fold cross-validation.
from sklearn.svm import SVC

svc_param_grid = {"C": [0.1, 1, 10], "gamma": [1, 0.1, 0.01]}
grid = GridSearchCV(SVC(), param_grid=svc_param_grid, cv=4)
grid.fit(Xtrain, Ytrain)
y_pred_svc = pd.DataFrame(grid.predict(Xtest), index=newindex)
# AdaBoost over shallow, regularised decision trees (SAMME, 200 rounds).
from sklearn.ensemble import AdaBoostClassifier

base_tree = DecisionTreeClassifier(max_depth=2, min_samples_split=20,
                                   min_samples_leaf=5)
bdt = AdaBoostClassifier(base_tree,
                         algorithm="SAMME",
                         n_estimators=200,
                         learning_rate=0.8)
bdt.fit(Xtrain, Ytrain)
y_pred_ada = pd.DataFrame(bdt.predict(Xtest), index=newindex)
# Gradient boosting with library defaults; fixed seed for reproducibility.
from sklearn.ensemble import GradientBoostingClassifier

gbm0 = GradientBoostingClassifier(random_state=10)
gbm0.fit(Xtrain, Ytrain)
y_pred_gbdt = pd.DataFrame(gbm0.predict(Xtest), index=newindex)
# Join the four models' predictions onto the test set, one column per model.
result = pd.concat([data_test, y_testpred, y_pred_svc, y_pred_ada, y_pred_gbdt],
                   axis=1)
# FIX: pandas dropped xlwt-based .xls writing in 1.2; write .xlsx instead
# (openpyxl engine). Keeps the same directory and base name.
outputfile = './testoutput/output201712.xlsx'
result.to_excel(outputfile)
测试结果非常不好:四种算法全军覆没,结果都返回负值,一个正例都没有预测出来;又多测了几个月的样本,效果依然很差。此时考虑平均收益率也不行,因为结果显示一个基金都不要买。
总之泛化能力太弱基本为0,背后原因一是对训练数据过拟合,特别是决策树很容易过拟合,二是没有找到正确的特征,当然也很有可能没有这样的特征,基本无解。
从训练集的大好结果到惨淡的测试结果,有一点不甘心,又做了两种改进版本,
主要还是在原来的思路做一些扩展。两种方法,
一,既然样本存在前面所述的先天缺陷,上个月数据不能用于训练建模,只能用上上个月数据来训练,上月数据拿来做测试。
那我缩短周期,一月分成两份,上半月和下半月,则就可以拿上月的上半月来训练,下半月来做测试了。
二,特征过少,增加特征数量,原来的五个特征都是以一个月为周期来计算的,那我再增加以2个月和3个月为周期的相同指标,
特征数由5个增加至15个,这样做也有业务逻辑,一般来说选择基金不能只看当月,还应该多看看前几个月业绩,综合考虑。
第一种改进方式生成样本数据代码如下:
import pandas as pd
import numpy as np

# Load every daily record; the first column is the trade date.
Allrecords = pd.read_csv('checkedalldata.csv')
Allrecords.index = Allrecords.iloc[:, 0]
Allrecords.index = pd.to_datetime(Allrecords.index, format='%Y-%m-%d')
# Keep only the date / price / fund-code columns.
records = Allrecords.iloc[:, 0:4]

# Fund master list; restrict to the four fund types of interest.
Allfund = pd.read_table('Leixingall.txt', encoding='utf-8', sep=',')
Allfund.index = Allfund.iloc[:, 1]
wanted_types = ('混合型', '股票指数', '股票型', '债券型')
Allfund = Allfund[Allfund.fund_type.isin(wanted_types)]
codes = Allfund.iloc[:, 1]

# (feature month, following/label month) pairs to generate samples for.
datelist = [['2017-12', '2018-01'],
            ['2017-11', '2017-12'],
            ['2017-10', '2017-11'],
            ['2017-09', '2017-10'],
            ['2017-08', '2017-09'],
            ['2017-07', '2017-08'],
            ['2017-06', '2017-07'],
            ['2017-05', '2017-06'],
            ['2017-04', '2017-05'],
            ['2017-03', '2017-04'],
            ['2017-02', '2017-03'],
            ['2017-01', '2017-02']]
def preparedata(currdt, nextdt, filename):
    """Build half-month training samples for every fund code.

    For each fund, month ``currdt`` is split into a first and a second half;
    five return features are computed per half (cumulative return, max / min /
    mean daily return, percentage of up days), and the matching half of
    ``nextdt`` supplies the label (1 when its cumulative return >= 3%, else -1).
    Rows are appended to ``'head_' + filename`` / ``'tail_' + filename``.

    Relies on module globals ``codes`` (fund codes) and ``records``
    (date-indexed price records; the return formula uses shift(-1) as the
    prior day, so rows appear to be stored newest-first — TODO confirm).
    """
    for currcode in codes:
        try:
            currrecords = records[records.trade_code == currcode]
            Curprice = currrecords.cur_price
            # Day-over-day return in percent.
            ret = (Curprice - Curprice.shift(-1)) / Curprice.shift(-1) * 100
            ret.name = 'Ret'
            retTM = ret[currdt]
            CurpriceTM = Curprice[currdt]
            counts = int(retTM.count())
            if counts < 2:
                print('count<2', currcode)
                continue
            count_head = round(counts / 2)
            count_tail = counts - count_head

            # Five features for each half of the current month.
            feats = {}
            for tag, ret_part, price_part, n in (
                    ('head', retTM.head(count_head), CurpriceTM.head(count_head), count_head),
                    ('tail', retTM.tail(count_tail), CurpriceTM.tail(count_tail), count_tail)):
                # FIX: positional access via Series[int] on a datetime index is
                # deprecated/ambiguous — use .iloc explicitly.
                avgret = (price_part.iloc[0] - price_part.iloc[n - 1]) / price_part.iloc[n - 1] * 100
                upret = int((ret_part > 0).sum()) / n * 100
                feats[tag] = (avgret, ret_part.max(), ret_part.min(),
                              ret_part.mean(), upret)

            # Label: cumulative return of the matching half of the next month.
            Nextprice = currrecords[nextdt].cur_price
            ncounts = int(Nextprice.count())
            n_head = round(ncounts / 2)
            n_tail = ncounts - n_head
            for tag, price_part, n in (('head', Nextprice.head(n_head), n_head),
                                       ('tail', Nextprice.tail(n_tail), n_tail)):
                nextavgret = (price_part.iloc[0] - price_part.iloc[n - 1]) / price_part.iloc[n - 1] * 100
                calsstype = 1 if nextavgret >= 3 else -1
                avgret, maxret, minret, meanret, upret = feats[tag]
                items = (str(currcode) + ',' + str(avgret) + ',' + str(maxret) + ',' +
                         str(minret) + ',' + str(meanret) + ',' + str(upret) + ',' +
                         str(calsstype) + ',' + str(nextavgret) + '\r\n')
                with open(tag + '_' + filename, 'ab') as files:
                    files.write(items.encode('utf-8'))
        except Exception as err:
            # FIX: was a bare `except:` that also swallowed KeyboardInterrupt
            # and hid the failure reason. Funds with missing months/prices are
            # still skipped, but the error is now reported.
            print(currcode, err)
# Generate one head_/tail_ sample-file pair per month in datelist.
for currdt, nextdt in datelist:
    outname = str(currdt) + '_data_halfmon.csv'
    preparedata(currdt, nextdt, outname)
此时样本数据由原来的一月一份变为一月两份,前半月和后半月,测试用代码和原来一样,改一下训练样本、测试样本的名称就可以了。测试结果比原来有所提升,不再是全部返回负值,正值能预测准确10%左右,平均收益率2%左右,低于预期但也明显高于随机,多测几份数据会发现结果不稳定,其实只要分析下样本数据就能发现样本数据也是很有随机性的,每月差别很大。
然后是第二种改进方式,以12月为例生成样本数据代码如下:
import pandas as pd
import numpy as np

# Load every daily record; the first column is the trade date.
Allrecords = pd.read_csv('checkedalldata.csv')
Allrecords.index = Allrecords.iloc[:, 0]
Allrecords.index = pd.to_datetime(Allrecords.index, format='%Y-%m-%d')
# Keep only the date / price / fund-code columns.
records = Allrecords.iloc[:, 0:4]

# Fund master list; restrict to the four fund types of interest.
Allfund = pd.read_table('Leixingall.txt', encoding='utf-8', sep=',')
Allfund.index = Allfund.iloc[:, 1]
wanted_types = ('混合型', '股票指数', '股票型', '债券型')
Allfund = Allfund[Allfund.fund_type.isin(wanted_types)]
codes = Allfund.iloc[:, 1]

outputfile = 'type_gpx.csv'
# [feature month, label month, previous month, month before that].
datelist = [['2017-12', '2018-01', '2017-11', '2017-10']]
def preparedata(currdt, nextdt, filename, pre1mdt, pre2mdt):
    """Build one sample row per fund with features over 1/2/3-month windows.

    For each fund, the same five features (cumulative return, max / min / mean
    daily return, percentage of up days) are computed over three windows:
    ``currdt`` alone, ``currdt:pre1mdt`` (two months) and ``currdt:pre2mdt``
    (three months) — 15 features total. The label comes from ``nextdt``
    (1 when its cumulative return >= 5%, else -1). Rows are appended to
    ``filename``.

    Relies on module globals ``codes`` and ``records`` (date-indexed price
    records; the return formula uses shift(-1) as the prior day, so rows
    appear to be stored newest-first — TODO confirm).
    """
    for currcode in codes:
        try:
            currrecords = records[records.trade_code == currcode]
            Curprice = currrecords.cur_price
            # Day-over-day return in percent.
            ret = (Curprice - Curprice.shift(-1)) / Curprice.shift(-1) * 100
            ret.name = 'Ret'
            if int(ret[currdt].count()) < 2:
                print('count<2', currcode)
                continue

            # BUG FIX: the two-month window was sliced with pre2mdt, making it
            # identical to the three-month window and leaving pre1mdt unused.
            windows = ((ret[currdt], Curprice[currdt]),
                       (ret[currdt:pre1mdt], Curprice[currdt:pre1mdt]),
                       (ret[currdt:pre2mdt], Curprice[currdt:pre2mdt]))
            feats = []
            for ret_part, price_part in windows:
                n = int(ret_part.count())
                # FIX: positional access via Series[int] on a datetime index is
                # deprecated/ambiguous — use .iloc explicitly.
                avgret = (price_part.iloc[0] - price_part.iloc[n - 1]) / price_part.iloc[n - 1] * 100
                upret = int((ret_part > 0).sum()) / n * 100
                feats += [avgret, ret_part.max(), ret_part.min(),
                          ret_part.mean(), upret]

            # Label: cumulative return over the following month.
            Nextprice = currrecords[nextdt].cur_price
            ncounts = int(Nextprice.count())
            nextavgret = (Nextprice.iloc[0] - Nextprice.iloc[ncounts - 1]) / Nextprice.iloc[ncounts - 1] * 100
            calsstype = 1 if nextavgret >= 5 else -1

            fields = ([str(currcode)] + [str(v) for v in feats] +
                      [str(calsstype), str(nextavgret)])
            with open(filename, 'ab') as files:
                files.write((','.join(fields) + '\r\n').encode('utf-8'))
        except Exception as err:
            # FIX: was a bare `except:` that also swallowed KeyboardInterrupt
            # and hid the failure reason. Funds with missing data in any
            # window are still skipped, but the error is now reported.
            print(currcode, err)
# Generate one 15-feature sample file per datelist entry.
for currdt, nextdt, pre1mdt, pre2mdt in datelist:
    outname = str(currdt) + '_data_3m.csv'
    preparedata(currdt, nextdt, outname, pre1mdt, pre2mdt)
测试结果比第一种方式结果要好一些,正例的正确率和平均收益率都要稍高一些,说明特征选择还是很重要的,但多测几份样本数据也不稳定,还是会随样本波动而变化。
总而言之,虽然有所改进但表现不够稳定,而且收益率也不高并有损失风险,离实际应该还是有距离的。