# 天池新人实战赛o2o优惠券使用预测五（第一个预测的程序）

#用户相关特征：
#FUser1 线下领取优惠券后消费次数
#FUser2 线下消费总次数
#商户相关特征：
#FMer1 线下总领取优惠券次数
#FMer2 线下总领取优惠券后消费次数
#FMer3 线下总消费次数

# ---------------------------------------------------------------------------
# User features (written to data/FUser.csv)
#   FUser1 - per-user row count in the positive-sample table OffTrainP
#            (the file header describes it as "offline purchases made after
#            receiving a coupon").
#   FUser2 - per-user count of offline rows that carry a purchase date.
# ---------------------------------------------------------------------------
OffTrain = pd.read_csv('data/ccf_offline_stage1_train.csv')

# Collect every user ID that appears in the offline log.
FUser = OffTrain[['User_id']]
print("FUser.shape=",FUser.shape)
# Build a fresh de-duplicated frame rather than calling
# drop_duplicates(inplace=True) on a column selection, which makes pandas emit
# SettingWithCopyWarning.
FUser = FUser.drop_duplicates().reset_index(drop=True)
OffTrainUser = FUser.shape[0]  # original author recorded 539438 distinct users
print("OffTrainUser=",OffTrainUser)

# Positive samples.
# NOTE(review): OffTrainP is not created anywhere in the visible part of this
# file; it must already be defined when execution reaches this line - confirm.
OffTrainPNumber = OffTrainP.shape[0]  # original author recorded 75382 positives
print("OffTrainPNumber=",OffTrainPNumber)
# Average number of positive samples per distinct user (author noted ~13.97%).
OffTrainPperUser = OffTrainPNumber/OffTrainUser
print("OffTrainPperUser=",OffTrainPperUser)

# FUser1: how many times each user ID occurs among the positive samples.
# groupby().size() gives the same counts as "add a column of ones and sum it"
# without assigning into a slice.
t = OffTrainP.groupby('User_id').size().reset_index(name='FUser1')
FUser = pd.merge(FUser,t,on=['User_id'],how='left')
# Users with no positive sample get NaN from the left join; treat that as 0.
FUser = FUser.fillna(0)

# FUser2: rows with a purchase date, per user.
# read_csv converts the literal text "null" to NaN by default, so comparing
# against "null" alone would keep every row. Accept either representation of
# "missing" so the filter works however the CSV was parsed.
date_col = OffTrain['Date']
has_purchase = date_col.notna() & (date_col != "null")
t = OffTrain[has_purchase].groupby('User_id').size().reset_index(name='FUser2')
FUser = pd.merge(FUser,t,on=['User_id'],how='left')
FUser = FUser.fillna(0)
print(FUser.FUser2.describe())
FUser.to_csv('data/FUser.csv',index=False,header=True)

# ---------------------------------------------------------------------------
# Merchant features (written to data/FMer.csv), all derived from OffTrain,
# the offline training log loaded earlier in this file.
#   FMer1 - rows per merchant where a coupon was received
#   FMer2 - rows per merchant where a coupon was received AND a purchase
#           date is present
#   FMer3 - rows per merchant where a purchase date is present
# ---------------------------------------------------------------------------
# Distinct merchant IDs, re-indexed from 0. A new frame is built instead of
# de-duplicating a column selection in place (avoids SettingWithCopyWarning).
FMer = OffTrain[['Merchant_id']].drop_duplicates().reset_index(drop=True)

# read_csv converts the literal text "null" to NaN by default, so a plain
# != "null" comparison would keep every row. Treat both NaN and the string
# "null" as "missing" so the filters work however the CSV was parsed.
coupon_col = OffTrain['Coupon_id']
got_coupon = coupon_col.notna() & (coupon_col != "null")
date_col = OffTrain['Date']
has_purchase = date_col.notna() & (date_col != "null")

# FMer1: coupons received per merchant. Merchants with none get NaN from the
# left join, which is then replaced by 0.
t = OffTrain[got_coupon].groupby('Merchant_id').size().reset_index(name='FMer1')
FMer = pd.merge(FMer,t,on=['Merchant_id'],how='left')
FMer = FMer.fillna(0)

# FMer2: coupons received and followed by a purchase, per merchant.
t = OffTrain[got_coupon]
print(t.shape)
t = OffTrain[got_coupon & has_purchase]
print(t.shape)
t = t.groupby('Merchant_id').size().reset_index(name='FMer2')
FMer = pd.merge(FMer,t,on=['Merchant_id'],how='left')
FMer = FMer.fillna(0)

# FMer3: all purchases per merchant, with or without a coupon.
t = OffTrain[has_purchase]
print(t.shape)
t = t.groupby('Merchant_id').size().reset_index(name='FMer3')
FMer = pd.merge(FMer,t,on=['Merchant_id'],how='left')
FMer = FMer.fillna(0)
FMer.to_csv('data/FMer.csv',index=False,header=True)

# ---------------------------------------------------------------------------
# Assemble the training matrix: label the positive / negative sample tables,
# attach the user and merchant features, and stack everything into numpy
# arrays (positives first, negatives second).
# NOTE(review): OffTrainP and OffTrainN are not loaded in the visible part of
# this file; they must already exist when this section runs - confirm.
# ---------------------------------------------------------------------------
feature_cols = ['FUser1','FUser2','FMer1','FMer2','FMer3']

# Flag distinguishes the two tables: 1 = positive sample, 0 = negative sample.
OffTrainP['Flag'] = 1
OffTrainN['Flag'] = 0

# Negative samples: left-join user features, then merchant features. The shape
# is printed before and after each join (the row count can be compared by eye).
print(OffTrainN.shape)
OffTrainN = OffTrainN.merge(FUser, how='left', on=['User_id'])
print(OffTrainN.shape)
print(OffTrainN.shape)
OffTrainN = OffTrainN.merge(FMer, how='left', on=['Merchant_id'])
print(OffTrainN.shape)

# Positive samples: same two joins.
print(OffTrainP.shape)
OffTrainP = OffTrainP.merge(FUser, how='left', on=['User_id'])
print(OffTrainP.shape)
print(OffTrainP.shape)
OffTrainP = OffTrainP.merge(FMer, how='left', on=['Merchant_id'])
print(OffTrainP.shape)

# Label vectors for each table.
OffTrainFlagP = OffTrainP['Flag'].values
print(OffTrainFlagP)
print(OffTrainFlagP.shape)
OffTrainFlagN = OffTrainN['Flag'].values
print(OffTrainFlagN)
print(OffTrainFlagN.shape)

# Combined label vector, positives followed by negatives.
OffTrainFlag = np.concatenate((OffTrainFlagP, OffTrainFlagN))
print(OffTrainFlag)
print(OffTrainFlag.shape[0])

# Feature matrices for each table, restricted to the five engineered columns.
OffTrainFeatureP = OffTrainP[feature_cols].values
print(OffTrainFeatureP)
print(OffTrainFeatureP.shape)
OffTrainFeatureN = OffTrainN[feature_cols].values
print(OffTrainFeatureN)
print(OffTrainFeatureN.shape)

# Combined feature matrix, stacked row-wise in the same order as the labels.
OffTrainFeature = np.concatenate((OffTrainFeatureP, OffTrainFeatureN), axis=0)
print(OffTrainFeature)
print(OffTrainFeature.shape)
# ---------------------------------------------------------------------------
# Fit a random-forest regressor on the training matrix and report how well it
# reproduces the training labels.
# ---------------------------------------------------------------------------
rf=RandomForestRegressor()  # library default hyper-parameters
rf.fit(OffTrainFeature,OffTrainFlag)
temp = rf.predict(OffTrainFeature)

# Training error: a sample counts as wrong when its prediction is more than
# 0.5 away from its 0/1 label. The original per-sample loop had lost its
# indentation (the file did not parse); the vectorised form below applies the
# same strict ">" test, |pred - label| > 0.5, to every sample at once.
start = time.time()
n_samples = OffTrainFeature.shape[0]
err = np.count_nonzero(np.abs(temp - OffTrainFlag) > 0.5) / n_samples
end = time.time()
print(end-start)  # seconds spent computing the error rate
print(1-err)      # training accuracy
# ---------------------------------------------------------------------------
# Score the test set and write the submission file.
# NOTE(review): Test is not loaded in the visible part of this file; it must
# already exist when this section runs - confirm.
# ---------------------------------------------------------------------------
print(Test.shape)
# Attach the same user and merchant features that were used for training.
Test = pd.merge(Test,FUser,on=['User_id'],how='left')
print(Test.shape)
Test = pd.merge(Test,FMer,on=['Merchant_id'],how='left')
Test['Flag'] = 0.0  # placeholder column, overwritten with predictions below
print(Test.shape)
# IDs with no matching feature row get NaN from the left joins; use 0 for them.
Test = Test.fillna(0)
TestFeature = Test[['FUser1','FUser2','FMer1','FMer2','FMer3']].values
print(TestFeature.shape)
print(TestFeature)

# Predict and report the prediction time in seconds.
start = time.time()
temp = rf.predict(TestFeature)
end = time.time()
print(end-start)
Test['Flag'] = temp

# Use a forward slash like every other path in this file. The original
# 'data\sample_...' contained the invalid escape sequence "\s" and, outside
# Windows, would create a file literally named "data\sample_..." in the
# working directory instead of writing into data/.
Test.to_csv('data/sample_submission20171225.csv',columns=['User_id','Coupon_id','Date_received','Flag'],index=False,header=False)