入门机器学习开源baseline(精品成单预测1)

在此记录了精品旅行服务成单预测的比赛baseline,感谢评论区的开源,这个比赛将作为我机器学习的入门赛。

# coding: utf-8
# In[1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import os

# In[2]:
##load the raw competition tables (training set first, then test set)###
action_train=pd.read_csv('./data/trainingset/action_train.csv')#user action log
#There are 9 action types: 1 is waking up the app; 2~4 are browsing products, in no particular order; 5~9 are sequential, from filling in the form through submitting the order to the final payment.
orderFuture_train=pd.read_csv('./data/trainingset/orderFuture_train.csv')#data to be predicted (its orderType column is renamed to 'label' further down)
orderHistory_train=pd.read_csv('./data/trainingset/orderHistory_train.csv')#user order history
userComment_train=pd.read_csv('./data/trainingset/userComment_train.csv')#user comments
userProfile_train=pd.read_csv('./data/trainingset/userProfile_train.csv')#user profile information

action_test=pd.read_csv('./data/test/action_test.csv')
orderFuture_test=pd.read_csv('./data/test/orderFuture_test.csv')
orderHistory_test=pd.read_csv('./data/test/orderHistory_test.csv')
userComment_test=pd.read_csv('./data/test/userComment_test.csv')
userProfile_test=pd.read_csv('./data/test/userProfile_test.csv')

import time
def time_conv(x):
    """Format Unix timestamp ``x`` (seconds) as 'YYYY-MM-DD HH:MM:SS' in the local time zone."""
    return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(x))
# Turn the raw Unix-second timestamp columns into pandas datetimes, in place.
# Each value goes through time_conv first, so the result is in local time.
orderHistory_train['orderTime'] = pd.to_datetime(
    orderHistory_train['orderTime'].map(time_conv), format="%Y-%m-%d %H:%M:%S"
)
orderHistory_test['orderTime'] = pd.to_datetime(
    orderHistory_test['orderTime'].map(time_conv), format="%Y-%m-%d %H:%M:%S"
)
action_train['actionTime'] = pd.to_datetime(
    action_train['actionTime'].map(time_conv), format="%Y-%m-%d %H:%M:%S"
)
action_test['actionTime'] = pd.to_datetime(
    action_test['actionTime'].map(time_conv), format="%Y-%m-%d %H:%M:%S"
)

# From here on the training target column is called 'label'.
orderFuture_train.rename(columns={'orderType': 'label'}, inplace=True)


# In[4]:


####feature#####
##user过去是否订购过orderType
def orderHistory_feat(df):
    """Summarise each user's order history by order type.

    Per the original author's note, ``orderType`` is 0 for an ordinary
    service and 1 for a premium service, so the sum is the number of premium
    orders and the rate is the share of premium orders.

    Args:
        df: order-history frame with at least ``userid`` and ``orderType``
            columns. It is not modified.

    Returns:
        DataFrame with one row per ``userid`` (sorted by userid) and the
        columns ``userid``, ``orderHistory_feat_sum`` (sum of ``orderType``)
        and ``orderHistory_feat_rate`` (sum divided by the number of
        non-null ``orderType`` values for that user).
    """
    # One aggregation pass yields both the sum and the non-null count.
    stats = df.groupby('userid')['orderType'].agg(['sum', 'count']).reset_index()
    stats['orderHistory_feat_rate'] = stats['sum'] / stats['count']
    stats = stats.rename(columns={'sum': 'orderHistory_feat_sum'})
    # The raw count is only an intermediate; it is not part of the feature set.
    return stats[['userid', 'orderHistory_feat_sum', 'orderHistory_feat_rate']]
def actions_orderType(df):
    """Return, for every user, the share of their logged actions per action type.

    Per the original author's note there are 9 action types: 1 is waking up
    the app, 2~4 are browsing products (unordered), and 5~9 are sequential
    steps from filling in the form to payment.

    Args:
        df: action log with at least ``userid`` and ``actionType`` columns.
            It is not modified.

    Returns:
        DataFrame with one row per ``userid`` (sorted by userid), a
        ``userid`` column, and one ``actionType_<value>`` column for each
        action type that occurs in ``df``. Each of those columns holds
        (number of the user's actions of that type) / (number of the user's
        actions).
    """
    # One indicator column per action type actually present in the data.
    one_hot = pd.get_dummies(df['actionType'], prefix='actionType')
    # Per-user count of each action type, indexed by userid.
    per_type = one_hot.groupby(df['userid']).sum()
    # Per-user total number of logged actions, indexed by userid.
    totals = df.groupby('userid').size()
    # Divide row-wise with explicit alignment on userid, over whatever columns
    # exist, so missing or non-contiguous action types are handled correctly.
    rates = per_type.div(totals, axis=0)
    return rates.reset_index()
# action_train.head()


# In[5]:


def gen_train_feat():
    """Build the training table: orderFuture_train left-joined with the per-user features.

    Returns:
        DataFrame keyed by ``userid`` holding the columns of
        ``orderFuture_train`` followed by the order-history features and the
        action-type features. Users without a match keep NaN in those columns.
    """
    history_feats = orderHistory_feat(orderHistory_train)
    action_feats = actions_orderType(action_train)
    merged = orderFuture_train.merge(history_feats, on='userid', how='left')
    merged = merged.merge(action_feats, on='userid', how='left')
    # Further feature tables can be joined onto ``merged`` here.
    return merged
def gen_test_feat():
    """Build the test table: orderFuture_test left-joined with the per-user features.

    Returns:
        DataFrame keyed by ``userid`` holding the columns of
        ``orderFuture_test`` followed by the order-history features and the
        action-type features. Users without a match keep NaN in those columns.
    """
    history_feats = orderHistory_feat(orderHistory_test)
    action_feats = actions_orderType(action_test)
    merged = orderFuture_test.merge(history_feats, on='userid', how='left')
    return merged.merge(action_feats, on='userid', how='left')


# In[6]:


# Assemble the feature tables for the training and test users.
train_data = gen_train_feat()
test_data = gen_test_feat()
from sklearn.model_selection import train_test_split

# Detach the target column so train_data holds features only, then hold out
# 20% of the rows for validation (fixed seed for a repeatable split).
train_label = train_data.pop('label')
x_train, x_val, y_train, y_val = train_test_split(
    train_data,
    train_label,
    test_size=0.2,
    random_state=100,
)


import xgboost as xgb
print('start running ....')
dtrain = xgb.DMatrix(x_train, label=y_train)
dval = xgb.DMatrix(x_val, label=y_val)

# Booster parameters for xgb.train.
# 'n_estimators' was dropped: it belongs to the scikit-learn wrapper; with
# xgb.train the number of boosting rounds is num_round below.
# NOTE(review): 'learning_rate' and 'eta' are aliases for the same setting and
# are given different values here; which one takes effect is not obvious from
# this script. Both are kept to avoid silently changing the model -- decide on
# one value and delete the other.
# NOTE(review): 'silent' is deprecated in newer xgboost releases in favour of
# 'verbosity' -- confirm against the installed version.
param = {
    'learning_rate': 0.1,
    'max_depth': 3,
    'min_child_weight': 5,
    'gamma': 0,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'eta': 0.05,
    'silent': 1,
    'objective': 'binary:logistic',
    'scale_pos_weight': 1,
}

num_round = 200
plst = list(param.items())
plst += [('eval_metric', 'auc')]
# xgb.train uses the LAST entry of the evaluation list for early stopping, so
# the validation set must come last; otherwise stopping tracks training AUC.
evallist = [(dtrain, 'train'), (dval, 'eval')]
bst = xgb.train(plst, dtrain, num_round, evallist, early_stopping_rounds=100)
dtest = xgb.DMatrix(test_data)
# NOTE(review): whether predict() uses only the trees up to the best iteration
# after early stopping depends on the xgboost version -- confirm and restrict
# the iteration range explicitly if needed.
y = bst.predict(dtest)


# In[7]:


# Store the model predictions as an 'orderType' column on the test users
# (this adds the column to orderFuture_test in place) and write the table
# to CSV without the index column.
orderFuture_test['orderType']=y
orderFuture_test.to_csv('baseline_2018_01_03.csv',index=False)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值