O2O优惠券预测复盘(一)

天池o2o优惠券比赛

初级版本:使用线性模型(SGDClassifier 实现的逻辑回归),后期会持续优化

导入相关库


import os,sys,pickle
import numpy as np
import pandas as pd
from datetime import date
from sklearn.model_selection import KFold,train_test_split,StratifiedKFold,cross_val_score,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss,roc_auc_score,auc,roc_curve
from sklearn.preprocessing import MinMaxScaler
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_format='retina'")

导入数据


# Load the offline training set, the online training set and the offline
# test set. keep_default_na=False stops pandas from turning the literal
# string 'null' into NaN, which the later `== 'null'` comparisons rely on.
# Paths are assembled with os.path.join so they resolve on any OS; the
# former r"data\..." literals only worked on Windows.
dfoff=pd.read_csv(os.path.join("data","ccf_offline_stage1_train.csv"),keep_default_na=False)
dfon=pd.read_csv(os.path.join("data","ccf_online_stage1_train.csv"),keep_default_na=False)
dftest=pd.read_csv(os.path.join("data","ccf_offline_stage1_test_revised.csv"),keep_default_na=False)
dfoff.head(5)

简单统计

# Count the four combinations of "received a coupon" x "made a purchase"
# in the offline training data.
got_coupon=dfoff['Date_received']!='null'
bought=dfoff['Date']!='null'
print('有优惠券,购买:%d'%dfoff[got_coupon&bought].shape[0])
print('有优惠券,未购买:%d'%dfoff[got_coupon&~bought].shape[0])
print('无优惠劵,购买:%d'%dfoff[~got_coupon&bought].shape[0])
print('无优惠劵,未购买:%d'%dfoff[~got_coupon&~bought].shape[0])

特征提取

一、打折率

# Print every distinct raw Discount_rate value found in the offline training data.
print('Discount_rate类型:\n',dfoff['Discount_rate'].unique())


'''  
处理打折率   
1.打折类型 getDiscountType()
2.折扣率 convertRate()
3.满多少 getDiscountMan()
4.减多少 getDiscountJian()
'''

def getDiscountType(row):
    """Classify a raw ``Discount_rate`` string.

    Returns the string 'null' when the value is 'null', 1 when the value
    contains a ':' and 0 for anything else.
    """
    if row == 'null':
        return 'null'
    return 1 if ':' in row else 0

def convertRate(row):
    """Convert a raw ``Discount_rate`` string into a float rate.

    'null' maps to 1.0, 'a:b' maps to ``1 - b/a``, and any other value is
    parsed directly with ``float``.
    """
    if row == 'null':
        return 1.0
    if ':' not in row:
        return float(row)
    threshold, reduction = row.split(':')[:2]
    return 1.0 - float(reduction) / float(threshold)

def getDiscountMan(row):
    """Return the integer before the ':' of an 'a:b' value, or 0 when there is no ':'."""
    if ':' not in row:
        return 0
    return int(row.split(':')[0])
    
def getDiscountJian(row):
    """Return the integer after the ':' of an 'a:b' value, or 0 when there is no ':'."""
    if ':' not in row:
        return 0
    return int(row.split(':')[1])
    
def processData(df):
    """Add the four discount-derived columns to ``df`` in place and return it.

    Each new column is produced by applying one of the discount helpers to
    the raw ``Discount_rate`` column. The distinct ``discount_rate`` values
    are printed as a quick sanity check.
    """
    extractors = (
        ('discount_type', getDiscountType),
        ('discount_rate', convertRate),
        ('discount_man', getDiscountMan),
        ('discount_jian', getDiscountJian),
    )
    for column, extractor in extractors:
        df[column] = df['Discount_rate'].apply(extractor)

    print (df['discount_rate'].unique())
    return df


# Add the discount-derived columns to both the offline training set and the
# test set (processData also prints the distinct discount_rate values).
dfoff=processData(dfoff)
dftest=processData(dftest)



# Preview the first rows with the new columns.
dfoff.head(5)

二、距离

print('Distance 类型:',dfoff['Distance'].unique())

# Convert the string Distance column to int for both frames, mapping the
# 'null' placeholder to -1, and print the resulting distinct values.
for frame in (dfoff, dftest):
    frame['distance']=frame['Distance'].replace('null',-1).astype(int)
    print (frame['distance'].unique())

dfoff.head(5)

三、领券日期

# Sorted distinct coupon-received dates and purchase dates, excluding the
# 'null' placeholder; the first and last entries give the covered range.
date_received=sorted(d for d in dfoff['Date_received'].unique() if d!='null')
date_buy=sorted(d for d in dfoff['Date'].unique() if d!='null')

print('优惠券收到日期从',date_received[0],'到',date_received[-1])
print('消费日期从',date_buy[0],'到',date_buy[-1])





'''  
日期处理
weekday : {null, 1, 2, 3, 4, 5, 6, 7}
weekday_type : {1, 0}(周六和周日为1,其他为0)
Weekday_1 : {1, 0, 0, 0, 0, 0, 0}
Weekday_2 : {0, 1, 0, 0, 0, 0, 0}
Weekday_3 : {0, 0, 1, 0, 0, 0, 0}
Weekday_4 : {0, 0, 0, 1, 0, 0, 0}
Weekday_5 : {0, 0, 0, 0, 1, 0, 0}
Weekday_6 : {0, 0, 0, 0, 0, 1, 0}
Weekday_7 : {0, 0, 0, 0, 0, 0, 1}
'''
def getWeekday(row):
    """Map a 'YYYYMMDD' string to its weekday number, 1 (Monday) to 7 (Sunday).

    The placeholder 'null' is returned unchanged.
    """
    if row == 'null':
        return row
    year, month, day = int(row[:4]), int(row[4:6]), int(row[6:8])
    # isoweekday() is weekday() + 1, i.e. Monday == 1 ... Sunday == 7.
    return date(year, month, day).isoweekday()

def addWeekdayFeatures(df, cols):
    """Add ``weekday``, ``weekday_type`` and one indicator column per weekday.

    Parameters:
        df: frame with a ``Date_received`` column; modified in place.
        cols: seven column names, for weekday 1 (Monday) through 7 (Sunday).

    Returns:
        The same ``df`` with the new columns added.

    Each indicator is built with an explicit comparison rather than
    ``pd.get_dummies`` followed by a column rename. The rename raised a
    length-mismatch error whenever some weekday was absent from the data,
    while this form always yields all seven columns. Rows whose weekday is
    'null' get 0 in every indicator.
    """
    df['weekday']=df['Date_received'].astype(str).apply(getWeekday)
    # Saturday (6) and Sunday (7) are flagged as weekend.
    df['weekday_type']=df['weekday'].apply(lambda x: 1 if x in [6,7] else 0)
    for day, col in enumerate(cols, start=1):
        df[col]=(df['weekday']==day).astype(int)
    return df


weekdaycols=['weekday_'+str(i) for i in range(1,8)]

print(weekdaycols)

dfoff=addWeekdayFeatures(dfoff, weekdaycols)
dftest=addWeekdayFeatures(dftest, weekdaycols)



dfoff.head(5)

所有特征

discount_rate
discount_type
discount_man
discount_jian
distance
weekday
weekday_type
weekday_1
weekday_2
weekday_3
weekday_4
weekday_5
weekday_6
weekday_7

标签标注三种情况:
Date_received == ‘null’:表示没有领到优惠券,无需考虑,y = -1
(Date_received != ‘null’) & (Date != ‘null’) & (Date - Date_received <= 15):表示领取优惠券且在15天内使用,即正样本,y = 1
(Date_received != ‘null’) & ((Date == ‘null’) | (Date - Date_received > 15)):表示领取优惠券但未在15天内使用,即负样本,y = 0

def label(row):
    """Return the training label for one record.

    -1: no coupon was received (``Date_received`` is 'null').
     1: a coupon was received and the purchase date is at most 15 days after
        the received date.
     0: a coupon was received but there was no purchase, or the purchase came
        more than 15 days later.
    """
    if row['Date_received'] == 'null':
        return -1
    if row['Date'] == 'null':
        return 0
    used_on = pd.to_datetime(row['Date'], format='%Y%m%d')
    received_on = pd.to_datetime(row['Date_received'], format='%Y%m%d')
    within_window = (used_on - received_on) <= pd.Timedelta(15, 'D')
    return 1 if within_window else 0
# Compute the label row by row (axis=1 passes each row to label()).
dfoff['label']=dfoff.apply(label,axis=1)




# Show how many rows fall into each label value.
print (dfoff['label'].value_counts())



dfoff.head(5)

建立线性模型 SGDClassifier

使用上面提取的14个特征。

训练集:20160101-20160515;验证集:20160516-20160615。

用线性模型 SGDClassifier



# Split into training and validation sets by coupon-received date.
# Rows labelled -1 (no coupon received) are dropped first. The dates are
# 'YYYYMMDD' strings, so plain string comparison orders them chronologically.
df=dfoff.loc[dfoff['label']!=-1].copy()
received=df['Date_received']
train=df.loc[received<'20160516'].copy()
valid=df.loc[received.between('20160516','20160615')].copy()
print('Train Set: \n',train['label'].value_counts())
print('Valid set: \n',valid['label'].value_counts())




# Feature list: seven hand-built columns plus the weekday indicator columns.
base_feature=[
    'discount_rate',
    'discount_type',
    'discount_man',
    'discount_jian',
    'distance',
    'weekday',
    'weekday_type',
]
original_feature=base_feature+weekdaycols
print ('共有特征:',len(original_feature),'个')
print(original_feature)




#建立模型
def check_model(data, predictors, random_state=None):
    """Grid-search an elastic-net, logistic-loss SGD classifier.

    Parameters:
        data: frame holding the columns named in ``predictors`` and a
            ``label`` column used as the target.
        predictors: list of feature column names.
        random_state: optional seed forwarded to both the classifier and the
            cross-validation splitter. The default ``None`` keeps the
            previous unseeded behaviour.

    Returns:
        The fitted ``GridSearchCV`` object. Its pipeline standardises the
        features (step 'ss') and then fits the SGD classifier (step 'en').
    """
    import re
    import sklearn

    # scikit-learn renamed the logistic loss from 'log' to 'log_loss' in 1.1
    # and removed the old spelling in 1.3. Choose the name the installed
    # version accepts so the notebook runs on both old and new releases.
    version = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
    is_modern = version is not None and (int(version.group(1)), int(version.group(2))) >= (1, 1)
    logistic_loss = 'log_loss' if is_modern else 'log'

    classifier = SGDClassifier(
        loss=logistic_loss,
        penalty='elasticnet',
        fit_intercept=True,
        max_iter=100,
        shuffle=True,
        n_jobs=1,
        class_weight=None,
        random_state=random_state)

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', classifier),
    ])

    # Grid over the regularisation strength and the L1/L2 mix of the
    # 'en' pipeline step.
    parameters = {
        'en__alpha': [0.001, 0.01, 0.1],
        'en__l1_ratio': [0.001, 0.01, 0.1],
    }

    folder = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)

    # No `scoring` is passed, so GridSearchCV uses its default criterion.
    grid_search = GridSearchCV(
        model,
        parameters,
        cv=folder,
        n_jobs=-1,
        verbose=1)

    # fit() returns the fitted search object itself.
    return grid_search.fit(data[predictors], data['label'])




# Train the model: run the grid search on the training split using the
# full feature list.
predictors=original_feature
model = check_model(train,predictors)

验证


# Score the validation split. Column 1 of predict_proba is stored as the
# predicted probability on a copy of the validation frame.
y_valid_pred=model.predict_proba(valid[predictors])
valid1=valid.assign(pred_prob=y_valid_pred[:,1])
valid1.head(5)

计算AUC



# Average per-coupon AUC: compute one ROC AUC per Coupon_id group and
# average them. Groups that do not contain both label values are skipped.
vg = valid1.groupby(['Coupon_id'])
aucs = []
for _, coupon_rows in vg:
    if len(coupon_rows['label'].unique()) == 2:
        fpr, tpr, thresholds = roc_curve(
            coupon_rows['label'], coupon_rows['pred_prob'], pos_label=1)
        aucs.append(auc(fpr, tpr))
print(np.average(aucs))

测试


# Predict on the test set and write the submission file.
y_test_pred=model.predict_proba(dftest[predictors])
# Keep only the three identifying columns, then append column 1 of
# predict_proba as 'Probability'.
dftest1=dftest[['User_id','Coupon_id','Date_received']].copy()
dftest1['Probability']=y_test_pred[:,1]
# Written without the index and without a header row.
dftest1.to_csv('submit2.csv',index=False,header=False)
dftest1.head(5)

保存模型&导入模型



# Persist the fitted model when no pickle file exists yet. When the file
# already exists, load it and rebind `model` to the stored object.
# NOTE(review): in the load branch, whatever `model` held before is replaced
# by the object on disk - confirm that overriding a freshly trained model
# with a possibly older pickle is intended.
# NOTE(review): pickle.load can execute arbitrary code contained in the
# file; only load pickles from a trusted source.
if not os.path.isfile('2_model.pkl'):
    with open('2_model.pkl','wb') as f:
        pickle.dump(model,f)
else:
    with open('2_model.pkl','rb') as f:
        model=pickle.load(f)
  • 0
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值