用sklearn分析基金数据<1>
python爬虫获取基金数据<2>
数据预处理:数据清洗、生成样本数据<3>
用sklearn训练样本数据<4>
用模型进行预测及改进<5>
前面是把12月数据按80%和20%分成训练集和测试集,测试结果还不错,现在来做实际应用,以11月数据作为训练集,12月数据作为测试集。具体代码如下:
import pandas as pd
import numpy as np

# Load the training-period and test-period half-month sample files.
# NOTE(review): the variable names suggested Nov/Dec, but the paths point at
# 2017-12 and tail_2017-09 — confirm these are the intended periods.
train_raw = pd.read_csv('./input/halfm/2017-12_data_halfmon.csv')
train_raw.index = train_raw.iloc[:, 0]
# Map class label 0 to -1 so the classifiers see a {-1, 1} target.
train_raw['calsstype'] = train_raw['calsstype'].replace([0], [-1])
data_train = train_raw

test_raw = pd.read_csv('./input/halfm/tail_2017-09_data_halfmon.csv')
test_raw.index = test_raw.iloc[:, 0]
test_raw['calsstype'] = test_raw['calsstype'].replace([0], [-1])
data_test = test_raw

# Column 6 is the class label; columns 1-5 are the five features.
Ytrain = data_train.iloc[:, 6]
Xtrain = data_train.iloc[:, 1:6]
Ytest = data_test.iloc[:, 6]
Xtest = data_test.iloc[:, 1:6]
from sklearn.ensemble import RandomForestClassifier
# FIX: sklearn.grid_search was deprecated in 0.18 and removed in 0.20;
# GridSearchCV now lives in sklearn.model_selection.
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# Random-forest baseline; oob_score gives a free out-of-bag accuracy estimate.
rf0 = RandomForestClassifier(oob_score=True, random_state=10)
rf0.fit(Xtrain, Ytrain)
y_testpred = rf0.predict(Xtest)
# Keep the test index (fund codes) so predictions can be joined back later.
newindex = Xtest.index
y_testpred = pd.DataFrame(y_testpred, index=newindex)
# SVC with a small grid over C and the RBF gamma, 4-fold cross-validation.
from sklearn.svm import SVC

svc_param_grid = {"C": [0.1, 1, 10], "gamma": [1, 0.1, 0.01]}
grid = GridSearchCV(SVC(), param_grid=svc_param_grid, cv=4)
grid.fit(Xtrain, Ytrain)
y_pred_svc = pd.DataFrame(grid.predict(Xtest), index=newindex)
# AdaBoost over shallow, regularised decision trees (SAMME, 200 rounds).
from sklearn.ensemble import AdaBoostClassifier

base_tree = DecisionTreeClassifier(max_depth=2, min_samples_split=20,
                                   min_samples_leaf=5)
bdt = AdaBoostClassifier(base_tree,
                         algorithm="SAMME",
                         n_estimators=200,
                         learning_rate=0.8)
bdt.fit(Xtrain, Ytrain)
y_pred_ada = pd.DataFrame(bdt.predict(Xtest), index=newindex)
# Gradient boosting with library defaults; fixed seed for reproducibility.
from sklearn.ensemble import GradientBoostingClassifier

gbm0 = GradientBoostingClassifier(random_state=10)
gbm0.fit(Xtrain, Ytrain)
y_pred_gbdt = pd.DataFrame(gbm0.predict(Xtest), index=newindex)
# Join the four models' predictions onto the test set, one column per model.
result = pd.concat([data_test, y_testpred, y_pred_svc, y_pred_ada, y_pred_gbdt],
                   axis=1)
# FIX: pandas dropped xlwt-based .xls writing in 1.2; write .xlsx instead
# (openpyxl engine). Keeps the same directory and base name.
outputfile = './testoutput/output201712.xlsx'
result.to_excel(outputfile)
测试结果非常不好:四种算法全军覆没,结果都返回负值,一个正例都没有预测出来;又多测了几个月的样本,效果依然很差。此时考虑平均收益率也不行,因为结果显示一个基金都不要买。
总之泛化能力太弱基本为0,背后原因一是对训练数据过拟合,特别是决策树很容易过拟合,二是没有找到正确的特征,当然也很有可能没有这样的特征,基本无解。
从训练集的大好结果到惨淡的测试结果,有一点不甘心,又做了两种改进版本,
主要还是在原来的思路做一些扩展。两种方法,
一,既然样本存在前面所述的先天缺陷,上个月数据不能用于训练建模,只能用上上个月数据来训练,上月数据拿来做测试。
那我缩短周期,一月分成两份,上半月和下半月,则就可以拿上月的上半月来训练,下半月来做测试了。
二,特征过少,增加特征数量,原来的五个特征都是以一个月为周期来计算的,那我再增加以2个月和3个月为周期的相同指标,
特征数由5个增加至15个,这样做也有业务逻辑,一般来说选择基金不能只看当月,还应该多看看前几个月业绩,综合考虑。
第一种改进方式生成样本数据代码如下:
import pandas as pd
import numpy as np

# Load every daily record; the first column is the trade date.
Allrecords = pd.read_csv('checkedalldata.csv')
Allrecords.index = Allrecords.iloc[:, 0]
Allrecords.index = pd.to_datetime(Allrecords.index, format='%Y-%m-%d')
# Keep only the date / price / fund-code columns.
records = Allrecords.iloc[:, 0:4]

# Fund master list; restrict to the four fund types of interest.
Allfund = pd.read_table('Leixingall.txt', encoding='utf-8', sep=',')
Allfund.index = Allfund.iloc[:, 1]
wanted_types = ('混合型', '股票指数', '股票型', '债券型')
Allfund = Allfund[Allfund.fund_type.isin(wanted_types)]
codes = Allfund.iloc[:, 1]

# (feature month, following/label month) pairs to generate samples for.
datelist = [['2017-12', '2018-01'],
            ['2017-11', '2017-12'],
            ['2017-10', '2017-11'],
            ['2017-09', '2017-10'],
            ['2017-08', '2017-09'],
            ['2017-07', '2017-08'],
            ['2017-06', '2017-07'],
            ['2017-05', '2017-06'],
            ['2017-04', '2017-05'],
            ['2017-03', '2017-04'],
            ['2017-02', '2017-03'],
            ['2017-01', '2017-02']]
def preparedata(currdt, nextdt, filename):
    """Build half-month training samples for every fund code.

    For each fund, month ``currdt`` is split into a first and a second half;
    five return features are computed per half (cumulative return, max / min /
    mean daily return, percentage of up days), and the matching half of
    ``nextdt`` supplies the label (1 when its cumulative return >= 3%, else -1).
    Rows are appended to ``'head_' + filename`` / ``'tail_' + filename``.

    Relies on module globals ``codes`` (fund codes) and ``records``
    (date-indexed price records; the return formula uses shift(-1) as the
    prior day, so rows appear to be stored newest-first — TODO confirm).
    """
    for currcode in codes:
        try:
            currrecords = records[records.trade_code == currcode]
            Curprice = currrecords.cur_price
            # Day-over-day return in percent.
            ret = (Curprice - Curprice.shift(-1)) / Curprice.shift(-1) * 100
            ret.name = 'Ret'
            retTM = ret[currdt]
            CurpriceTM = Curprice[currdt]
            counts = int(retTM.count())
            if counts < 2:
                print('count<2', currcode)
                continue
            count_head = round(counts / 2)
            count_tail = counts - count_head

            # Five features for each half of the current month.
            feats = {}
            for tag, ret_part, price_part, n in (
                    ('head', retTM.head(count_head), CurpriceTM.head(count_head), count_head),
                    ('tail', retTM.tail(count_tail), CurpriceTM.tail(count_tail), count_tail)):
                # FIX: positional access via Series[int] on a datetime index is
                # deprecated/ambiguous — use .iloc explicitly.
                avgret = (price_part.iloc[0] - price_part.iloc[n - 1]) / price_part.iloc[n - 1] * 100
                upret = int((ret_part > 0).sum()) / n * 100
                feats[tag] = (avgret, ret_part.max(), ret_part.min(),
                              ret_part.mean(), upret)

            # Label: cumulative return of the matching half of the next month.
            Nextprice = currrecords[nextdt].cur_price
            ncounts = int(Nextprice.count())
            n_head = round(ncounts / 2)
            n_tail = ncounts - n_head
            for tag, price_part, n in (('head', Nextprice.head(n_head), n_head),
                                       ('tail', Nextprice.tail(n_tail), n_tail)):
                nextavgret = (price_part.iloc[0] - price_part.iloc[n - 1]) / price_part.iloc[n - 1] * 100
                calsstype = 1 if nextavgret >= 3 else -1
                avgret, maxret, minret, meanret, upret = feats[tag]
                items = (str(currcode) + ',' + str(avgret) + ',' + str(maxret) + ',' +
                         str(minret) + ',' + str(meanret) + ',' + str(upret) + ',' +
                         str(calsstype) + ',' + str(nextavgret) + '\r\n')
                with open(tag + '_' + filename, 'ab') as files:
                    files.write(items.encode('utf-8'))
        except Exception as err:
            # FIX: was a bare `except:` that also swallowed KeyboardInterrupt
            # and hid the failure reason. Funds with missing months/prices are
            # still skipped, but the error is now reported.
            print(currcode, err)
# Generate one head_/tail_ sample-file pair per month in datelist.
for currdt, nextdt in datelist:
    outname = str(currdt) + '_data_halfmon.csv'
    preparedata(currdt, nextdt, outname)
此时样本数据由原来的一月一份变为一月两份,前半月和后半月,测试用代码和原来一样,改一下训练样本、测试样本的名称就可以了。测试结果比原来有所提升,不再是全部返回负值,正值能预测准确10%左右,平均收益率2%左右,低于预期但也明显高于随机,多测几份数据会发现结果不稳定,其实只要分析下样本数据就能发现样本数据也是很有随机性的,每月差别很大。
然后是第二种改进方式,以12月为例生成样本数据代码如下:
import pandas as pd
import numpy as np

# Load every daily record; the first column is the trade date.
Allrecords = pd.read_csv('checkedalldata.csv')
Allrecords.index = Allrecords.iloc[:, 0]
Allrecords.index = pd.to_datetime(Allrecords.index, format='%Y-%m-%d')
# Keep only the date / price / fund-code columns.
records = Allrecords.iloc[:, 0:4]

# Fund master list; restrict to the four fund types of interest.
Allfund = pd.read_table('Leixingall.txt', encoding='utf-8', sep=',')
Allfund.index = Allfund.iloc[:, 1]
wanted_types = ('混合型', '股票指数', '股票型', '债券型')
Allfund = Allfund[Allfund.fund_type.isin(wanted_types)]
codes = Allfund.iloc[:, 1]

outputfile = 'type_gpx.csv'
# [feature month, label month, previous month, month before that].
datelist = [['2017-12', '2018-01', '2017-11', '2017-10']]
def preparedata(currdt, nextdt, filename, pre1mdt, pre2mdt):
    """Build one sample row per fund with features over 1/2/3-month windows.

    For each fund, the same five features (cumulative return, max / min / mean
    daily return, percentage of up days) are computed over three windows:
    ``currdt`` alone, ``currdt:pre1mdt`` (two months) and ``currdt:pre2mdt``
    (three months) — 15 features total. The label comes from ``nextdt``
    (1 when its cumulative return >= 5%, else -1). Rows are appended to
    ``filename``.

    Relies on module globals ``codes`` and ``records`` (date-indexed price
    records; the return formula uses shift(-1) as the prior day, so rows
    appear to be stored newest-first — TODO confirm).
    """
    for currcode in codes:
        try:
            currrecords = records[records.trade_code == currcode]
            Curprice = currrecords.cur_price
            # Day-over-day return in percent.
            ret = (Curprice - Curprice.shift(-1)) / Curprice.shift(-1) * 100
            ret.name = 'Ret'
            if int(ret[currdt].count()) < 2:
                print('count<2', currcode)
                continue

            # BUG FIX: the two-month window was sliced with pre2mdt, making it
            # identical to the three-month window and leaving pre1mdt unused.
            windows = ((ret[currdt], Curprice[currdt]),
                       (ret[currdt:pre1mdt], Curprice[currdt:pre1mdt]),
                       (ret[currdt:pre2mdt], Curprice[currdt:pre2mdt]))
            feats = []
            for ret_part, price_part in windows:
                n = int(ret_part.count())
                # FIX: positional access via Series[int] on a datetime index is
                # deprecated/ambiguous — use .iloc explicitly.
                avgret = (price_part.iloc[0] - price_part.iloc[n - 1]) / price_part.iloc[n - 1] * 100
                upret = int((ret_part > 0).sum()) / n * 100
                feats += [avgret, ret_part.max(), ret_part.min(),
                          ret_part.mean(), upret]

            # Label: cumulative return over the following month.
            Nextprice = currrecords[nextdt].cur_price
            ncounts = int(Nextprice.count())
            nextavgret = (Nextprice.iloc[0] - Nextprice.iloc[ncounts - 1]) / Nextprice.iloc[ncounts - 1] * 100
            calsstype = 1 if nextavgret >= 5 else -1

            fields = ([str(currcode)] + [str(v) for v in feats] +
                      [str(calsstype), str(nextavgret)])
            with open(filename, 'ab') as files:
                files.write((','.join(fields) + '\r\n').encode('utf-8'))
        except Exception as err:
            # FIX: was a bare `except:` that also swallowed KeyboardInterrupt
            # and hid the failure reason. Funds with missing data in any
            # window are still skipped, but the error is now reported.
            print(currcode, err)
# Generate one 15-feature sample file per datelist entry.
for currdt, nextdt, pre1mdt, pre2mdt in datelist:
    outname = str(currdt) + '_data_3m.csv'
    preparedata(currdt, nextdt, outname, pre1mdt, pre2mdt)
测试结果比第一种方式结果要好一些,正例的正确率和平均收益率都要稍高一些,说明特征选择还是很重要的,但多测几份样本数据也不稳定,还是会随样本波动而变化。
总而言之,虽然有所改进但表现不够稳定,而且收益率也不高并有损失风险,离实际应该还是有距离的。