天池新人实战赛o2o优惠券使用预测五(第一个预测的程序)

扑腾了好久,总算写出了第一个能预测的程序。
思路很简单,使用了下列5个特征:

#用户相关特征:
#FUser1 线下领取优惠券后消费次数
#FUser2 线下消费总次数
#商户相关特征:
#FMer1 线下总领取优惠券次数
#FMer2 线下总领取优惠券后消费次数
#FMer3 线下总消费次数

预测模型采用随机森林。
下面是提取User特征的代码:

# ---------------------------------------------------------------------------
# User features -> data/FUser.csv
#   FUser1: number of rows the user has in the positive-sample file
#   FUser2: number of offline records of the user that carry a purchase Date
# ---------------------------------------------------------------------------
OffTrain = pd.read_csv('data/ccf_offline_stage1_train.csv')

# Collect every user id that appears in the offline training data.
FUser = OffTrain[['User_id']]
print("FUser.shape=",FUser.shape)
# Deduplicate into a fresh frame; the original mutated a slice of OffTrain
# in place, which triggers pandas' SettingWithCopy warning.
FUser = FUser.drop_duplicates().reset_index(drop=True)
OffTrainUser = FUser.shape[0]  # 539438 distinct users in the author's run
print("OffTrainUser=",OffTrainUser)

# Positive samples. Forward slash keeps the path portable and avoids the
# invalid "\O" escape sequence of the original literal.
OffTrainP = pd.read_csv('data/OffTrainP.csv')
OffTrainPNumber = OffTrainP.shape[0]  # 75382 positive samples in the author's run
print("OffTrainPNumber=",OffTrainPNumber)
# Positive samples per distinct user (about 13.97% in the author's run).
OffTrainPperUser = OffTrainPNumber/OffTrainUser
print("OffTrainPperUser=",OffTrainPperUser)

# FUser1: how many times each user id occurs among the positive samples.
t = OffTrainP.groupby('User_id').size().reset_index(name='FUser1')
FUser = pd.merge(FUser,t,on=['User_id'],how='left')
print(FUser.head(5))
# Users without any positive sample come out of the left merge as NaN -> 0.
FUser = FUser.fillna(0)
print(FUser.head(5))

# FUser2: how many records of each user have a purchase date.
# With default settings pd.read_csv turns the literal text "null" into NaN,
# so comparing against "null" alone keeps every row. Check both forms so the
# filter works however the file was parsed.
HasPurchase = OffTrain['Date'].notna() & (OffTrain['Date'] != "null")
t = OffTrain[HasPurchase].groupby('User_id').size().reset_index(name='FUser2')
FUser = pd.merge(FUser,t,on=['User_id'],how='left')
FUser = FUser.fillna(0)
print(FUser.head(5))
print(FUser.FUser2.describe())
FUser.to_csv('data/FUser.csv',index=False,header=True)

提取商户相关特征:

# ---------------------------------------------------------------------------
# Merchant features -> data/FMer.csv
#   FMer1: number of offline records of the merchant that carry a coupon
#   FMer2: number of those coupon records that also carry a purchase Date
#   FMer3: number of offline records of the merchant that carry a purchase Date
# ---------------------------------------------------------------------------
# Read the offline training data.
OffTrain = pd.read_csv('data/ccf_offline_stage1_train.csv')

# One row per distinct merchant id. Built as a fresh frame rather than by
# mutating a slice of OffTrain in place (SettingWithCopy warning).
FMer = OffTrain[['Merchant_id']].drop_duplicates().reset_index(drop=True)

# With default settings pd.read_csv turns the literal text "null" into NaN,
# so comparing against "null" alone keeps every row. Check both forms so the
# filters work however the file was parsed.
HasCoupon = OffTrain['Coupon_id'].notna() & (OffTrain['Coupon_id'] != "null")
HasPurchase = OffTrain['Date'].notna() & (OffTrain['Date'] != "null")

# FMer1: records with a coupon, counted per merchant.
t = OffTrain[HasCoupon].groupby('Merchant_id').size().reset_index(name='FMer1')
FMer = pd.merge(FMer,t,on=['Merchant_id'],how='left')
# Merchants with no matching record come out of the left merge as NaN -> 0.
FMer = FMer.fillna(0)
print(FMer.head())

# FMer2: records with a coupon and a purchase date, counted per merchant.
print(OffTrain[HasCoupon].shape)
t = OffTrain[HasCoupon & HasPurchase]
print(t.shape)
t = t.groupby('Merchant_id').size().reset_index(name='FMer2')
FMer = pd.merge(FMer,t,on=['Merchant_id'],how='left')
FMer = FMer.fillna(0)
print(FMer.head())

# FMer3: records with a purchase date, counted per merchant.
t = OffTrain[HasPurchase]
print(t.shape)
t = t.groupby('Merchant_id').size().reset_index(name='FMer3')
FMer = pd.merge(FMer,t,on=['Merchant_id'],how='left')
FMer = FMer.fillna(0)
print(FMer.head())
FMer.to_csv('data/FMer.csv',index=False,header=True)

使用随机森林做预测模型

# ---------------------------------------------------------------------------
# Train a random forest on the five user/merchant features and write the
# predictions for the test set to a submission file.
# ---------------------------------------------------------------------------
# Feature columns, in the order they are fed to the model.
FeatureColumns = ['FUser1','FUser2','FMer1','FMer2','FMer3']

# Read the feature files.
FMer = pd.read_csv('data/FMer.csv')
FUser = pd.read_csv('data/FUser.csv')
# Read the negative / positive sample files.
OffTrainN = pd.read_csv('data/OffTrainN.csv')
OffTrainP = pd.read_csv('data/OffTrainP.csv')
# Flag tells positives (1) from negatives (0); it is the training target.
OffTrainN['Flag'] = 0
OffTrainP['Flag'] = 1

# Attach the user and merchant features to the negative samples.
print(OffTrainN.shape)
OffTrainN = pd.merge(OffTrainN,FUser,on=['User_id'],how='left')
print(OffTrainN.shape)
print(OffTrainN.shape)
OffTrainN = pd.merge(OffTrainN,FMer,on=['Merchant_id'],how='left')
print(OffTrainN.shape)

# Attach the user and merchant features to the positive samples.
print(OffTrainP.shape)
OffTrainP = pd.merge(OffTrainP,FUser,on=['User_id'],how='left')
print(OffTrainP.shape)
print(OffTrainP.shape)
OffTrainP = pd.merge(OffTrainP,FMer,on=['Merchant_id'],how='left')
print(OffTrainP.shape)

# Target arrays, positives first.
OffTrainFlagP = OffTrainP['Flag'].values
print(OffTrainFlagP)
print(OffTrainFlagP.shape)
OffTrainFlagN = OffTrainN['Flag'].values
print(OffTrainFlagN)
print(OffTrainFlagN.shape)
OffTrainFlag = np.append(OffTrainFlagP,OffTrainFlagN)
print(OffTrainFlag)
print(OffTrainFlag.shape[0])

# Feature matrices, stacked in the same order as the targets. Ids that are
# missing from the feature files leave NaN after the left merge; fill them
# with 0 exactly as is done for the test set below.
OffTrainFeatureP = OffTrainP[FeatureColumns].fillna(0).values
print(OffTrainFeatureP)
print(OffTrainFeatureP.shape)
OffTrainFeatureN = OffTrainN[FeatureColumns].fillna(0).values
print(OffTrainFeatureN)
print(OffTrainFeatureN.shape)
OffTrainFeature = np.append(OffTrainFeatureP,OffTrainFeatureN,axis=0)
print(OffTrainFeature)
print(OffTrainFeature.shape)

rf=RandomForestRegressor()  # default hyper-parameters
rf.fit(OffTrainFeature,OffTrainFlag)  # train the model
# NOTE: this scores the model on the very rows it was trained on, so the
# number printed below is a training-set figure, not a validation score.
temp = rf.predict(OffTrainFeature)
start = time.time()
# A row counts as an error when the prediction is more than 0.5 away from its
# flag. One vectorised expression replaces the per-row Python loop.
err = float(np.mean(np.abs(temp - OffTrainFlag) > 0.5))
end = time.time()
print(end-start)
print(1-err)

# Read the test set and attach the same features.
Test = pd.read_csv('data/ccf_offline_stage1_test_revised.csv')
print(Test.shape)
Test = pd.merge(Test,FUser,on=['User_id'],how='left')
print(Test.shape)
Test = pd.merge(Test,FMer,on=['Merchant_id'],how='left')
Test['Flag'] = 0.0  # placeholder, overwritten with the predictions below
print(Test.shape)
print(Test.head())
# Users / merchants that are absent from the feature files get all-zero features.
Test = Test.fillna(0)
TestFeature = Test[FeatureColumns].values
print(TestFeature.shape)
print(TestFeature)
start = time.time()
temp = rf.predict(TestFeature)
end = time.time()
print(end-start)
Test['Flag'] = temp
# Forward slash keeps the path portable and avoids the invalid "\s" escape
# sequence of the original literal.
Test.to_csv('data/sample_submission20171225.csv',columns=['User_id','Coupon_id','Date_received','Flag'],index=False,header=False)

最终得到训练集的正确率:0.9584821538771193
天池上提交结果的正确率:0.57166541。好歹比随机猜测的结果要好,说明整个方法流程是可行的;接下来要做的,就是提取更加丰富的特征,并使用更加合理的方法。

阅读更多
想对作者说点什么? 我来说一句

没有更多推荐了,返回首页