O2O优惠券预测复盘（一）

最新推荐文章于 2022-12-28 16:37:09 发布

jassy_shan

最新推荐文章于 2022-12-28 16:37:09 发布

阅读量799

点赞数

分类专栏：数据挖掘与算法竞赛　文章标签：数据挖掘比赛

本文链接：https://blog.csdn.net/weixin_38966454/article/details/90057156

版权

数据挖掘与算法竞赛专栏收录该内容

5 篇文章 0 订阅

订阅专栏

天池o2o优惠券比赛

初级版本：使用线性分类模型（基于 SGDClassifier 的逻辑回归），后期会持续优化

导入相关库


import os,sys,pickle
import numpy as np
import pandas as pd
from datetime import date
from sklearn.model_selection import KFold,train_test_split,StratifiedKFold,cross_val_score,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier,LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss,roc_auc_score,auc,roc_curve
from sklearn.preprocessing import MinMaxScaler
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_format='retina'")

导入数据


# Build the CSV paths with os.path.join so they resolve on every OS; the
# backslash-separated literals only worked on Windows.
# keep_default_na=False stops pandas from turning the 'null' placeholder into
# NaN, so the later comparisons against the string 'null' keep working.
dfoff = pd.read_csv(os.path.join('data', 'ccf_offline_stage1_train.csv'), keep_default_na=False)
dfon = pd.read_csv(os.path.join('data', 'ccf_online_stage1_train.csv'), keep_default_na=False)
dftest = pd.read_csv(os.path.join('data', 'ccf_offline_stage1_test_revised.csv'), keep_default_na=False)
dfoff.head(5)

简单统计

# Count offline rows for each combination of "received a coupon" and
# "made a purchase"; the string 'null' marks a missing date.
got_coupon = dfoff['Date_received'] != 'null'
purchased = dfoff['Date'] != 'null'
print('有优惠券，购买：%d' % len(dfoff[got_coupon & purchased]))
print('有优惠券，未购买：%d' % len(dfoff[got_coupon & ~purchased]))
print('无优惠劵，购买：%d' % len(dfoff[~got_coupon & purchased]))
print('无优惠劵，未购买：%d' % len(dfoff[~got_coupon & ~purchased]))

特征提取

一、打折率

# List the distinct raw Discount_rate strings to see which formats must be parsed.
print('Discount_rate类型：\n',dfoff['Discount_rate'].unique())


'''  
处理打折率   
1.打折类型 getDiscountType()
2.折扣率 convertRate()
3.满多少 getDiscountMan()
4.减多少 getDiscountJian()
'''

def getDiscountType(row):
    """Classify a raw ``Discount_rate`` string.

    Returns the string 'null' for the 'null' placeholder, 1 when the value
    contains a colon (the "x:y" form), and 0 for anything else.
    """
    if row == 'null':
        return 'null'
    return 1 if ':' in row else 0

def convertRate(row):
    """Turn a raw ``Discount_rate`` string into a float rate.

    'null' maps to 1.0, an "x:y" value maps to ``1 - y / x``, and any other
    value is parsed directly as a float.
    """
    if row == 'null':
        return 1.0
    if ':' not in row:
        return float(row)
    # Only the first two colon-separated fields are used.
    threshold, reduction = row.split(':')[:2]
    return 1.0 - float(reduction) / float(threshold)

def getDiscountMan(row):
    """Return the integer before the colon of an "x:y" value, or 0 when there is no colon."""
    if ':' not in row:
        return 0
    return int(row.split(':')[0])
    
def getDiscountJian(row):
    """Return the integer after the first colon of an "x:y" value, or 0 when there is no colon."""
    if ':' not in row:
        return 0
    return int(row.split(':')[1])
    
def processData(df):
    """Add the discount feature columns derived from ``Discount_rate``.

    Writes ``discount_type``, ``discount_rate``, ``discount_man`` and
    ``discount_jian`` onto ``df`` itself, prints the distinct
    ``discount_rate`` values, and returns the same ``df``.
    """
    raw = df['Discount_rate']
    derived = (
        ('discount_type', getDiscountType),
        ('discount_rate', convertRate),
        ('discount_man', getDiscountMan),
        ('discount_jian', getDiscountJian),
    )
    for column, extractor in derived:
        df[column] = raw.apply(extractor)

    print (df['discount_rate'].unique())
    return df


# Derive the discount features for both the offline training data and the test data.
dfoff=processData(dfoff)
dftest=processData(dftest)



# Preview the first five rows with the new columns.
dfoff.head(5)

二、距离

print('Distance 类型：',dfoff['Distance'].unique())

# Convert the string-valued Distance column to integers, mapping the 'null'
# placeholder to -1, and show the resulting distinct values for each frame.
for frame in (dfoff, dftest):
    frame['distance'] = frame['Distance'].replace('null', -1).astype(int)
    print (frame['distance'].unique())

dfoff.head(5)

三、领券日期

# Distinct coupon-receipt dates, without the 'null' placeholder, sorted as strings.
date_received = sorted(d for d in dfoff['Date_received'].unique() if d != 'null')

# Distinct purchase dates, filtered and sorted the same way.
date_buy = sorted(d for d in dfoff['Date'].unique() if d != 'null')

print('优惠券收到日期从',date_received[0],'到',date_received[-1])
print('消费日期从',date_buy[0],'到',date_buy[-1])





'''  
日期处理
weekday : {null, 1, 2, 3, 4, 5, 6, 7}
weekday_type : {1, 0}（周六和周日为1，其他为0）
Weekday_1 : {1, 0, 0, 0, 0, 0, 0}
Weekday_2 : {0, 1, 0, 0, 0, 0, 0}
Weekday_3 : {0, 0, 1, 0, 0, 0, 0}
Weekday_4 : {0, 0, 0, 1, 0, 0, 0}
Weekday_5 : {0, 0, 0, 0, 1, 0, 0}
Weekday_6 : {0, 0, 0, 0, 0, 1, 0}
Weekday_7 : {0, 0, 0, 0, 0, 0, 1}
'''
def getWeekday(row):
    """Map a ``YYYYMMDD`` string to its weekday number (Monday=1 ... Sunday=7).

    The 'null' placeholder is returned unchanged.
    """
    if row == 'null':
        return row
    year, month, day = int(row[0:4]), int(row[4:6]), int(row[6:8])
    # isoweekday() is weekday() + 1, i.e. Monday=1 through Sunday=7.
    return date(year, month, day).isoweekday()

# Weekday of the coupon-receipt date (1-7, or 'null') and a weekend flag
# (1 for Saturday/Sunday, otherwise 0) for both frames.
for frame in (dfoff, dftest):
    frame['weekday'] = frame['Date_received'].astype(str).apply(getWeekday)
    frame['weekday_type'] = frame['weekday'].apply(lambda day: int(day in (6, 7)))

weekdaycols = ['weekday_%d' % i for i in range(1, 8)]

print(weekdaycols)

# One-hot encode the weekday; 'null' becomes NaN first so it gets no dummy column.
# Renaming the columns to weekdaycols assumes all seven weekdays occur in each frame.
for frame in (dfoff, dftest):
    tmpdf = pd.get_dummies(frame['weekday'].replace('null', np.nan))
    tmpdf.columns = weekdaycols
    frame[weekdaycols] = tmpdf

dfoff.head(5)

所有特征：

discount_rate
discount_type
discount_man
discount_jian
distance
weekday
weekday_type
weekday_1
weekday_2
weekday_3
weekday_4
weekday_5
weekday_6
weekday_7

标签标注三种情况：
Date_received == 'null'：表示没有领到优惠券，无需考虑，y = -1
(Date_received != 'null') & (Date != 'null') & (Date - Date_received <= 15)：表示领取优惠券且在15天内使用，即正样本，y = 1
(Date_received != 'null') & ((Date == 'null') | (Date - Date_received > 15))：表示领取优惠券但未在15天内使用，即负样本，y = 0

def label(row):
    """Compute the training label for one row.

    Returns -1 when no coupon was received (``Date_received`` is 'null'),
    1 when a purchase date exists and is at most 15 days after the receipt
    date, and 0 otherwise.
    """
    received = row['Date_received']
    if received == 'null':
        return -1

    used = row['Date']
    if used == 'null':
        return 0

    elapsed = pd.to_datetime(used, format='%Y%m%d') - pd.to_datetime(received, format='%Y%m%d')
    return 1 if elapsed <= pd.Timedelta(15, 'D') else 0
# Label every offline row by applying label() row by row.
dfoff['label']=dfoff.apply(label,axis=1)




# Show how many rows fall into each label value.
print (dfoff['label'].value_counts())



dfoff.head(5)

建立线性模型 SGDClassifier

使用上面提取的14个特征。

训练集：20160101-20160515；验证集：20160516-20160615。

用线性模型 SGDClassifier



# Split into training and validation sets by coupon-receipt date.
# Rows labelled -1 (no coupon received) are dropped first.
df = dfoff.loc[dfoff['label'] != -1].copy()
train = df.loc[df['Date_received'] < '20160516'].copy()
valid = df.loc[df['Date_received'].between('20160516', '20160615')].copy()
print('Train Set: \n',train['label'].value_counts())
print('Valid set: \n',valid['label'].value_counts())


# Feature columns passed to the model.
original_feature = [
    'discount_rate', 'discount_type', 'discount_man', 'discount_jian',
    'distance', 'weekday', 'weekday_type',
    *weekdaycols,
]
print ('共有特征：',len(original_feature),'个')
print(original_feature)




#建立模型
def check_model(data, predictors):
    """Grid-search a standardised elastic-net SGD logistic classifier.

    Args:
        data: DataFrame containing the ``predictors`` columns and a
            ``label`` column.
        predictors: list of feature column names used as model input.

    Returns:
        The fitted ``GridSearchCV`` object wrapping a
        ``StandardScaler`` -> ``SGDClassifier`` pipeline.
    """
    import re
    import sklearn

    # scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and
    # 1.3 removed the old spelling, so choose the name the installed version
    # accepts. An unparsable version string falls back to the original 'log'.
    version = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
    uses_new_name = version is not None and tuple(map(int, version.groups())) >= (1, 1)
    logistic_loss = 'log_loss' if uses_new_name else 'log'

    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', SGDClassifier(
            loss=logistic_loss,
            penalty='elasticnet',
            fit_intercept=True,
            max_iter=100,
            shuffle=True,
            n_jobs=1,
            class_weight=None)),
    ])

    # Hyper-parameter grid; the 'en__' prefix targets the SGDClassifier step.
    parameters = {
        'en__alpha': [0.001, 0.01, 0.1],
        'en__l1_ratio': [0.001, 0.01, 0.1],
    }

    # Shuffled, stratified 3-fold split; no random_state is set, so folds
    # differ between runs.
    folder = StratifiedKFold(n_splits=3, shuffle=True)

    grid_search = GridSearchCV(
        model,
        parameters,
        cv=folder,
        n_jobs=-1,
        verbose=1)
    grid_search = grid_search.fit(data[predictors], data['label'])

    return grid_search




# Train the model: grid-search on the training split using all extracted features.
predictors=original_feature
model = check_model(train,predictors)

验证


# Score the validation rows and keep column 1 of predict_proba
# (the probability of label 1) next to the original columns.
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.assign(pred_prob=y_valid_pred[:, 1])
valid1.head(5)

计算AUC



# Average the AUC computed separately for each Coupon_id; coupons whose
# validation rows do not contain both label values are skipped.
vg = valid1.groupby(['Coupon_id'])
aucs = []
for _coupon_id, tmpdf in vg:
    has_both_classes = len(tmpdf['label'].unique()) == 2
    if has_both_classes:
        fpr, tpr, thresholds = roc_curve(tmpdf['label'], tmpdf['pred_prob'], pos_label=1)
        aucs.append(auc(fpr, tpr))
print(np.average(aucs))

测试


# Predict on the test set and write the submission file
# (User_id, Coupon_id, Date_received, Probability) without index or header.
y_test_pred = model.predict_proba(dftest[predictors])
dftest1 = dftest[['User_id', 'Coupon_id', 'Date_received']].assign(Probability=y_test_pred[:, 1])
dftest1.to_csv('submit2.csv', index=False, header=False)
dftest1.head(5)

保存模型&导入模型



# Reuse a previously saved model when '2_model.pkl' exists; otherwise save the
# model that was just trained.
# NOTE: pickle.load can execute arbitrary code, so only load files you created yourself.
if os.path.isfile('2_model.pkl'):
    with open('2_model.pkl','rb') as f:
        model=pickle.load(f)
else:
    with open('2_model.pkl','wb') as f:
        pickle.dump(model,f)

jassy_shan

关注

0
点赞
踩
8

收藏

觉得还不错? 一键收藏
0
评论
O2O优惠券预测复盘（一）

天池o2o优惠券比赛初级版本：使用线性回归模型，后期会持续优化导入相关库import os,sys,pickleimport numpy as npimport pandas as pdfrom datetime import datefrom sklearn.model_selection import KFold,train_test_split,StratifiedKFold...
复制链接

扫一扫