05_机器学习赛事_优惠券使用预测

本文主要介绍如何运用机器学习技术进行优惠券使用预测:首先导入必要的函数库,然后读取并加载数据集,接着对数据进行预处理并构造特征,最后训练 SGD 分类模型、评估 AUC 并生成提交文件。文中同时展示了各步骤处理后数据的基本结构(前 5 行预览)。
摘要由CSDN通过智能技术生成

在这里插入图片描述

1. 函数库导入

# import libraries necessary for this project
import os, sys, pickle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import seaborn as sns
import datetime as dt

from datetime import date

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve

# display for this notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

2. 读取文件数据

# Load the three competition tables in order: offline training data,
# offline test data, and online training data.
dfoff, dftest, dfon = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/ccf_offline_stage1_train.csv',
        './data/ccf_offline_stage1_test_revised.csv',
        './data/ccf_online_stage1_train.csv',
    )
)

dfoff.head()
   User_id  Merchant_id  Coupon_id Discount_rate  Distance  Date_received        Date
0  1439408         2632        NaN           NaN       0.0            NaN  20160217.0
1  1439408         4663    11002.0        150:20       1.0     20160528.0         NaN
2  1439408         2632     8591.0          20:1       0.0     20160217.0         NaN
3  1439408         2632     1078.0          20:1       0.0     20160319.0         NaN
4  1439408         2632     8591.0          20:1       0.0     20160613.0         NaN

3. 数据处理

# 1. 将满xx减yy类型(`xx:yy`)的券变成折扣率 : `1 - yy/xx`,同时建立折扣券相关的特征 `discount_rate, discount_man, discount_jian, discount_type`
# 2. 将距离 `str` 转为 `int`
# convert Discount_rate and Distance
def getDiscountType(row):
    """Classify a raw ``Discount_rate`` cell by coupon kind.

    Parameters
    ----------
    row : str, float or None
        One value of the ``Discount_rate`` column.

    Returns
    -------
    float or int
        ``np.nan`` when the value is missing, ``1`` when it has the
        ``xx:yy`` (spend-xx-save-yy) form, ``0`` otherwise.
    """
    if pd.isnull(row):
        return np.nan
    # Go through str() like the sibling converters do, so that a non-null
    # numeric value (e.g. 0.95 parsed as float) does not raise TypeError.
    return 1 if ':' in str(row) else 0

    
def convertRate(row):
    """Convert a raw discount value into a discount rate.

    A missing value yields ``1.0``; an ``xx:yy`` coupon yields
    ``1 - yy/xx``; anything else is returned as ``float(row)``.
    """
    if pd.isnull(row):
        return 1.0
    if ':' not in str(row):
        return float(row)
    parts = row.split(':')
    reduction = float(parts[1])
    threshold = float(parts[0])
    return 1.0 - reduction / threshold

def getDiscountMan(row):
    """Return the ``xx`` part of an ``xx:yy`` coupon as int, or 0 otherwise."""
    if ':' not in str(row):
        return 0
    return int(row.split(':')[0])

def getDiscountJian(row):
    """Return the ``yy`` part of an ``xx:yy`` coupon as int, or 0 otherwise."""
    if ':' not in str(row):
        return 0
    return int(row.split(':')[1])
    
def processData(df):
    """Add the derived discount and distance columns to ``df`` and return it.

    The frame is modified in place. From ``Discount_rate`` it derives
    ``discount_rate``, ``discount_man``, ``discount_jian`` and
    ``discount_type``; from ``Distance`` it derives an integer ``distance``
    column in which missing values become ``-1``.
    """
    discount_converters = (
        ('discount_rate', convertRate),
        ('discount_man', getDiscountMan),
        ('discount_jian', getDiscountJian),
        ('discount_type', getDiscountType),
    )
    # Convert the raw discount column once per derived feature.
    for column, converter in discount_converters:
        df[column] = df['Discount_rate'].apply(converter)

    # Convert distance: missing -> -1, then cast to int.
    df['distance'] = df['Distance'].fillna(-1).astype(int)
    return df

# Derive the discount/distance features on the training and test frames.
dfoff, dftest = (processData(frame) for frame in (dfoff, dftest))

dfoff.head()
dftest.head()
   User_id  Merchant_id  Coupon_id Discount_rate  Distance  Date_received  discount_rate  discount_man  discount_jian  discount_type  distance
0  4129537          450       9983          30:5       1.0       20160712       0.833333            30              5              1         1
1  6949378         1300       3429          30:5       NaN       20160706       0.833333            30              5              1        -1
2  2166529         7113       6928        200:20       5.0       20160727       0.900000           200             20              1         5
3  2166529         7113       1808        100:10       5.0       20160727       0.900000           100             10              1         5
4  6172162         7605       6500          30:1       2.0       20160708       0.966667            30              1              1         2
# Sorted distinct coupon-receipt dates; missing values are dropped.
date_received = dfoff['Date_received'].unique()
date_received = sorted(date_received[pd.notnull(date_received)])

# Sorted purchase dates, one entry per row that has a purchase date.
# NOTE(review): duplicates are kept here; apply .unique() first if only the
# distinct purchase days are wanted.
date_buy = sorted(dfoff[dfoff['Date'].notnull()]['Date'])

dfoff.head()
User_idMerchant_idCoupon_idDiscount_rateDistanceDate_receivedDatediscount_ratediscount_mandiscount_jiandiscount_typedistance
014394082632NaNNaN0.0NaN20160217.01.00000000NaN0
11439408466311002.0150:201.020160528.0NaN0.866667150201.01
2143940826328591.020:10.020160217.0NaN0.9500002011.00
3143940826321078.020:10.020160319.0NaN0.9500002011.00
4143940826328591.020:10.020160613.0NaN0.9500002011.00
# size() counts every row of a group (NaN included), whereas count() skips NaN.
# Number of coupons received on each day. size() with reset_index(name=...)
# replaces the deprecated dict-renaming form of SeriesGroupBy.agg.
couponbydate = (
    dfoff[dfoff['Date_received'].notnull()]
    .groupby('Date_received')
    .size()
    .reset_index(name='count')
)

# Number of coupons received on each day that were later used in a purchase.
buybydate = (
    dfoff[(dfoff['Date'].notnull()) & (dfoff['Date_received'].notnull())]
    [['Date_received', 'Date']]
    .groupby(['Date_received'], as_index=False)
    .count()
)
buybydate.columns = ['Date_received','count']

dfoff.head()
User_idMerchant_idCoupon_idDiscount_rateDistanceDate_receivedDatediscount_ratediscount_mandiscount_jiandiscount_typedistance
014394082632NaNNaN0.0NaN20160217.01.00000000NaN0
11439408466311002.0150:201.020160528.0NaN0.866667150201.01
2143940826328591.020:10.020160217.0NaN0.9500002011.00
3143940826321078.020:10.020160319.0NaN0.9500002011.00
4143940826328591.020:10.020160613.0NaN0.9500002011.00
# Per-user number of coupon redemptions (rows with both a receipt date and a
# purchase date).
# NOTE(review): these statistics are computed over all of dfoff, including
# the purchase dates that later define the label - confirm this leakage is
# acceptable for validation.
temp_user_coupon =  dfoff[(dfoff['Date'].notnull()) & (dfoff['Date_received'].notnull())]
user_coupon = temp_user_coupon.groupby(['User_id']).size().reset_index(name='user_coupon')

# Assign the filled column back instead of calling fillna(inplace=True) on an
# attribute-accessed column: the chained in-place form does not update the
# frame under pandas Copy-on-Write.
dfoff = pd.merge(dfoff, user_coupon, how='left', on='User_id')
dfoff['user_coupon'] = dfoff['user_coupon'].fillna(0)
dftest = pd.merge(dftest, user_coupon, how='left', on='User_id')
dftest['user_coupon'] = dftest['user_coupon'].fillna(0)

# Span in days between each user's first and last coupon redemption.
# Named aggregation replaces the deprecated dict-renaming form of agg.
date_coupon = (
    temp_user_coupon.groupby('User_id')['Date']
    .agg(cmax='max', cmin='min')
    .reset_index()
)
# Dates are stored as floats such as 20160217.0: drop the fraction, then parse.
date_coupon['cmax'] = pd.to_datetime(date_coupon['cmax'].astype('int').astype('str'), format='%Y%m%d')
date_coupon['cmin'] = pd.to_datetime(date_coupon['cmin'].astype('int').astype('str'), format='%Y%m%d')
date_coupon['cdate_interval'] = (date_coupon['cmax'] - date_coupon['cmin']).dt.days


dfoff = pd.merge(dfoff, date_coupon, how='left', on='User_id')
dftest = pd.merge(dftest, date_coupon, how='left', on='User_id')
# Users with no coupon redemption get -1 as the interval.
dfoff['cdate_interval'] = dfoff['cdate_interval'].fillna(-1)
dftest['cdate_interval'] = dftest['cdate_interval'].fillna(-1)

dfoff.head()
dftest.head()
User_idMerchant_idCoupon_idDiscount_rateDistanceDate_receiveddiscount_ratediscount_mandiscount_jiandiscount_typedistanceuser_couponcmaxcmincdate_interval
04129537450998330:51.0201607120.833333305110.0NaTNaT-1.0
169493781300342930:5NaN201607060.8333333051-11.02016-05-082016-05-080.0
2216652971136928200:205.0201607270.90000020020150.0NaTNaT-1.0
3216652971131808100:105.0201607270.90000010010150.0NaTNaT-1.0
461721627605650030:12.0201607080.966667301120.0NaTNaT-1.0
# Per-user number of ordinary purchases (a purchase date but no coupon
# receipt date).
temp_user_nocoupon =dfoff[(dfoff['Date'].notnull()) & (dfoff['Date_received'].isnull())]
user_nocoupon = temp_user_nocoupon.groupby(['User_id']).size().reset_index(name='user_nocoupon')

# Assign the filled column back instead of calling fillna(inplace=True) on an
# attribute-accessed column: the chained in-place form does not update the
# frame under pandas Copy-on-Write.
dfoff = pd.merge(dfoff, user_nocoupon, how='left', on='User_id')
dfoff['user_nocoupon'] = dfoff['user_nocoupon'].fillna(0)
dftest = pd.merge(dftest, user_nocoupon, how='left', on='User_id')
dftest['user_nocoupon'] = dftest['user_nocoupon'].fillna(0)


# Span in days between each user's first and last ordinary purchase.
# Named aggregation replaces the deprecated dict-renaming form of agg.
date1 = (
    temp_user_nocoupon.groupby('User_id')['Date']
    .agg(max='max', min='min')
    .reset_index()
)
# Dates are stored as floats such as 20160217.0: drop the fraction, then parse.
date1['max'] = pd.to_datetime(date1['max'].astype('int').astype('str'), format='%Y%m%d')
date1['min'] = pd.to_datetime(date1['min'].astype('int').astype('str'), format='%Y%m%d')
date1['date_interval'] = (date1['max'] - date1['min']).dt.days


dfoff = pd.merge(dfoff, date1, how='left', on='User_id')
dftest = pd.merge(dftest, date1, how='left', on='User_id')
# Users with no ordinary purchase get -1 as the interval.
dfoff['date_interval'] = dfoff['date_interval'].fillna(-1)
dftest['date_interval'] = dftest['date_interval'].fillna(-1)


dfoff.head()
dftest.head()

User_idMerchant_idCoupon_idDiscount_rateDistanceDate_receiveddiscount_ratediscount_mandiscount_jiandiscount_typedistanceuser_couponcmaxcmincdate_intervaluser_nocouponmaxmindate_interval
04129537450998330:51.0201607120.833333305110.0NaTNaT-1.00.0NaTNaT-1.0
169493781300342930:5NaN201607060.8333333051-11.02016-05-082016-05-080.01.02016-05-052016-05-050.0
2216652971136928200:205.0201607270.90000020020150.0NaTNaT-1.00.0NaTNaT-1.0
3216652971131808100:105.0201607270.90000010010150.0NaTNaT-1.00.0NaTNaT-1.0
461721627605650030:12.0201607080.966667301120.0NaTNaT-1.08.02016-05-112016-01-05127.0
def getWeekday(row):
    """Return the weekday (1 = Monday ... 7 = Sunday) of a YYYYMMDD value.

    Parameters
    ----------
    row : str, int, float or None
        A date written as ``YYYYMMDD``; a trailing ``.0`` (from a float
        column converted to string) is tolerated because only the first
        eight characters are read.

    Returns
    -------
    int or float
        The weekday number, or ``np.nan`` when the value is missing or is
        the string ``'nan'``.
    """
    # Accept real missing values too, not only the 'nan' string produced by
    # astype(str) on a float column.
    if pd.isnull(row):
        return np.nan
    text = str(row)
    if text == 'nan':
        return np.nan
    return date(int(text[0:4]), int(text[4:6]), int(text[6:8])).isoweekday()

# Weekday (1-7) on which each coupon was received; the column is stringified
# first because getWeekday slices the YYYYMMDD text.
dfoff['weekday'] = dfoff['Date_received'].astype(str).map(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).map(getWeekday)

dfoff.head()
dftest.head()

User_idMerchant_idCoupon_idDiscount_rateDistanceDate_receiveddiscount_ratediscount_mandiscount_jiandiscount_typedistanceuser_couponcmaxcmincdate_intervaluser_nocouponmaxmindate_intervalweekday
04129537450998330:51.0201607120.833333305110.0NaTNaT-1.00.0NaTNaT-1.02
169493781300342930:5NaN201607060.8333333051-11.02016-05-082016-05-080.01.02016-05-052016-05-050.03
2216652971136928200:205.0201607270.90000020020150.0NaTNaT-1.00.0NaTNaT-1.03
3216652971131808100:105.0201607270.90000010010150.0NaTNaT-1.00.0NaTNaT-1.03
461721627605650030:12.0201607080.966667301120.0NaTNaT-1.08.02016-05-112016-01-05127.05
# weekday_type: 1 for Saturday/Sunday (weekday 6 or 7), 0 otherwise
# (missing weekdays also give 0).
dfoff['weekday_type'] = dfoff['weekday'].isin([6, 7]).astype('int64')
dftest['weekday_type'] = dftest['weekday'].isin([6, 7]).astype('int64')

# One-hot encode weekday into weekday_1 .. weekday_7.
# Each column is built by explicit comparison so that all seven columns are
# always created and correctly named, even if a frame happens to contain
# fewer than seven distinct weekdays (renaming get_dummies output would then
# fail or mislabel columns). Rows with a missing weekday are False everywhere.
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
for day_number, day_col in enumerate(weekdaycols, start=1):
    dfoff[day_col] = dfoff['weekday'] == day_number
    dftest[day_col] = dftest['weekday'] == day_number

dfoff.head()
dftest.head()

User_idMerchant_idCoupon_idDiscount_rateDistanceDate_receiveddiscount_ratediscount_mandiscount_jiandiscount_type...date_intervalweekdayweekday_typeweekday_1weekday_2weekday_3weekday_4weekday_5weekday_6weekday_7
04129537450998330:51.0201607120.8333333051...-1.020FalseTrueFalseFalseFalseFalseFalse
169493781300342930:5NaN201607060.8333333051...0.030FalseFalseTrueFalseFalseFalseFalse
2216652971136928200:205.0201607270.900000200201...-1.030FalseFalseTrueFalseFalseFalseFalse
3216652971131808100:105.0201607270.900000100101...-1.030FalseFalseTrueFalseFalseFalseFalse
461721627605650030:12.0201607080.9666673011...127.050FalseFalseFalseFalseTrueFalseFalse

5 rows × 28 columns

def _parse_yyyymmdd(value):
    """Parse a YYYYMMDD value into a pandas Timestamp.

    Numeric inputs (e.g. the float 20160217.0) are reduced to their integer
    digits first, so parsing does not depend on how ``pd.to_datetime``
    treats floats when an explicit format is given.
    """
    if isinstance(value, (int, float, np.integer, np.floating)):
        value = str(int(value))
    return pd.to_datetime(value, format='%Y%m%d')


def label(row):
    """Compute the training label for one record.

    Parameters
    ----------
    row : mapping
        Must provide ``'Date_received'`` and ``'Date'`` (YYYYMMDD values,
        possibly missing).

    Returns
    -------
    int
        ``-1`` when no coupon was received, ``1`` when the purchase date is
        at most 15 days after the receipt date, ``0`` otherwise (including
        when there is no purchase date).
    """
    if pd.isnull(row['Date_received']):
        return -1
    if pd.isnull(row['Date']):
        return 0
    elapsed = _parse_yyyymmdd(row['Date']) - _parse_yyyymmdd(row['Date_received'])
    return 1 if elapsed <= pd.Timedelta(15, 'D') else 0

# Label every training record; label() is called once per row.
dfoff['label'] = dfoff.apply(label, axis='columns')

dfoff.head()
dftest.head()
User_idMerchant_idCoupon_idDiscount_rateDistanceDate_receiveddiscount_ratediscount_mandiscount_jiandiscount_type...date_intervalweekdayweekday_typeweekday_1weekday_2weekday_3weekday_4weekday_5weekday_6weekday_7
04129537450998330:51.0201607120.8333333051...-1.020FalseTrueFalseFalseFalseFalseFalse
169493781300342930:5NaN201607060.8333333051...0.030FalseFalseTrueFalseFalseFalseFalse
2216652971136928200:205.0201607270.900000200201...-1.030FalseFalseTrueFalseFalseFalseFalse
3216652971131808100:105.0201607270.900000100101...-1.030FalseFalseTrueFalseFalseFalseFalse
461721627605650030:12.0201607080.9666673011...127.050FalseFalseFalseFalseTrueFalseFalse

5 rows × 28 columns

dfoff.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 30 columns):
 #   Column          Dtype         
---  ------          -----         
 0   User_id         int64         
 1   Merchant_id     int64         
 2   Coupon_id       float64       
 3   Discount_rate   object        
 4   Distance        float64       
 5   Date_received   float64       
 6   Date            float64       
 7   discount_rate   float64       
 8   discount_man    int64         
 9   discount_jian   int64         
 10  discount_type   float64       
 11  distance        int32         
 12  user_coupon     float64       
 13  cmax            datetime64[ns]
 14  cmin            datetime64[ns]
 15  cdate_interval  float64       
 16  user_nocoupon   float64       
 17  max             datetime64[ns]
 18  min             datetime64[ns]
 19  date_interval   float64       
 20  weekday         float64       
 21  weekday_type    int64         
 22  weekday_1       bool          
 23  weekday_2       bool          
 24  weekday_3       bool          
 25  weekday_4       bool          
 26  weekday_5       bool          
 27  weekday_6       bool          
 28  weekday_7       bool          
 29  label           int64         
dtypes: bool(7), datetime64[ns](4), float64(11), int32(1), int64(6), object(1)
memory usage: 313.0+ MB
# Data split: keep only records with a received coupon (label -1 marks rows
# without one).
df = dfoff.loc[dfoff['label'].ne(-1)].copy()

df.to_csv('./data/df.csv', index=False)

# Coupons received before 2016-05-16 form the training set; those received
# from 2016-05-16 to 2016-06-15 (both inclusive) form the validation set.
train = df[df['Date_received'].lt(20160516)].copy()
valid = df[df['Date_received'].between(20160516, 20160615)].copy()

# Feature list used by the model.
df = pd.read_csv('./data/df.csv')

print(df.columns)
original_feature = [
    'discount_rate', 'discount_man', 'discount_jian', 'discount_type',
    'distance', 'user_coupon', 'weekday', 'weekday_type',
    'user_nocoupon', 'date_interval',
    *weekdaycols,
]
Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'discount_rate', 'discount_man',
       'discount_jian', 'discount_type', 'distance', 'user_coupon', 'cmax',
       'cmin', 'cdate_interval', 'user_nocoupon', 'max', 'min',
       'date_interval', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7',
       'label'],
      dtype='object')
print("----train-----")
# Linear classifier with log loss and an elastic-net penalty, fitted by SGD.
sgd_params = dict(
    loss='log_loss',
    penalty='elasticnet',
    fit_intercept=True,
    max_iter=100,
    shuffle=True,
    alpha=0.01,
    l1_ratio=0.01,
    n_jobs=1,
    class_weight=None,
)
SGD = SGDClassifier(**sgd_params)
SGD.fit(train[original_feature], train['label'])
----train-----
SGDClassifier(alpha=0.01, l1_ratio=0.01, loss='log_loss', max_iter=100,
              n_jobs=1, penalty='elasticnet')
# #### Prediction and evaluation

# Predicted class probabilities on the training and validation sets.
SGD_pre_train = SGD.predict_proba(train[original_feature])
SGD_pre_test = SGD.predict_proba(valid[original_feature])

# AUC computed from the second probability column.
SGD_auc_train = roc_auc_score(train['label'], SGD_pre_train[:, 1])
SGD_auc_valid = roc_auc_score(valid['label'], SGD_pre_test[:, 1])

print('SGD_auc_train: ', SGD_auc_train)
print('SGD_auc_valid: ', SGD_auc_valid)
SGD_auc_train:  0.9154812464203588
SGD_auc_valid:  0.8621531088485809
# Persist the fitted model to disk with pickle.
print("---save model---")
with open('./data/SGD.pkl', 'wb') as model_file:
    pickle.dump(SGD, model_file)
---save model---
# Score the test set and write the submission file (no index, no header):
# User_id, Coupon_id, Date_received, predicted probability.
y_test_pred = SGD.predict_proba(dftest[original_feature])
dftest1 = dftest[['User_id','Coupon_id','Date_received']].assign(label=y_test_pred[:, 1])
dftest1.to_csv('./o2osubmit.csv', index=False, header=False)
dftest1.head()
   User_id  Coupon_id  Date_received     label
0  4129537       9983       20160712  0.055341
1  6949378       3429       20160706  0.124270
2  2166529       6928       20160727  0.008998
3  2166529       1808       20160727  0.016814
4  6172162       6500       20160708  0.106921

  • 22
    点赞
  • 14
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

腾飞开源

你的鼓励将是我创作的最大动力!

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值