05_机器学习赛事_优惠券使用预测

置顶腾飞开源

已于 2024-07-02 10:29:24 修改

阅读量1k

点赞数 22

分类专栏： Machine Learning 文章标签：机器学习人工智能笔记算法

于 2024-05-02 22:25:15 首次发布

本文链接：https://blog.csdn.net/luorongxi123/article/details/138401700

版权

Machine Learning 专栏收录该内容

11 篇文章 0 订阅

订阅专栏

本文主要介绍了如何运用机器学习技术进行优惠券使用预测。首先，通过导入必要的函数库，接着读取和加载数据集，然后对数据进行预处理，最后展示数据的基本结构。内容包括5行28列的数据处理过程。

摘要由CSDN通过智能技术生成

在这里插入图片描述

1. 函数库导入

# import libraries necessary for this project
import os, sys, pickle

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib.dates as mdates

import seaborn as sns
import datetime as dt

from datetime import date

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve

# display for this notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

2. 读取文件数据

# Load the offline training set, the offline test set and the online training
# set from ./data. dfon is loaded here but is not referenced again in the
# visible part of this notebook.
dfoff = pd.read_csv('./data/ccf_offline_stage1_train.csv')
dftest = pd.read_csv('./data/ccf_offline_stage1_test_revised.csv')
dfon = pd.read_csv('./data/ccf_online_stage1_train.csv')

# Preview the first rows of the offline training data.
dfoff.head()

	User_id	Merchant_id	Coupon_id	Discount_rate	Distance	Date_received	Date
0	1439408	2632	NaN	NaN	0.0	NaN	20160217.0
1	1439408	4663	11002.0	150:20	1.0	20160528.0	NaN
2	1439408	2632	8591.0	20:1	0.0	20160217.0	NaN
3	1439408	2632	1078.0	20:1	0.0	20160319.0	NaN
4	1439408	2632	8591.0	20:1	0.0	20160613.0	NaN

3. 数据处理

# 1. 将满xx减yy类型(`xx:yy`)的券变成折扣率 : `1 - yy/xx`，同时建立折扣券相关的特征 `discount_rate, discount_man, discount_jian, discount_type`
# 2. 将距离 `str` 转为 `int`
# convert Discount_rate and Distance
def getDiscountType(row):
    """Classify a raw ``Discount_rate`` cell.

    Args:
        row: A single cell value (not a DataFrame row despite the name).

    Returns:
        ``np.nan`` when the value is missing, ``1`` when its text contains
        ``':'`` (the "xx:yy" threshold form), and ``0`` otherwise.
    """
    if pd.isnull(row):
        return np.nan
    # Go through str() like the sibling helpers do, so a numeric cell
    # (e.g. 0.95 parsed as float) yields 0 instead of raising TypeError.
    return 1 if ':' in str(row) else 0

    
def convertRate(row):
    """Convert a raw discount cell into a numeric rate.

    Args:
        row: A single ``Discount_rate`` cell value.

    Returns:
        ``1.0`` for a missing value; ``1 - yy/xx`` for an "xx:yy" string;
        otherwise ``float(row)``.
    """
    if pd.isnull(row):
        return 1.0
    if ':' not in str(row):
        return float(row)
    parts = row.split(':')
    reduction = float(parts[1])
    threshold = float(parts[0])
    return 1.0 - reduction / threshold

def getDiscountMan(row):
    """Return the threshold part (xx) of an "xx:yy" discount as an int.

    Any value whose text has no ``':'`` (including missing values) gives 0.
    """
    if ':' not in str(row):
        return 0
    return int(row.split(':')[0])

def getDiscountJian(row):
    """Return the reduction part (yy) of an "xx:yy" discount as an int.

    Any value whose text has no ``':'`` (including missing values) gives 0.
    """
    if ':' not in str(row):
        return 0
    return int(row.split(':')[1])
    
def processData(df):
    """Add the derived discount and distance columns to ``df``.

    The frame is modified in place and also returned. New columns:
    ``discount_rate``, ``discount_man``, ``discount_jian``,
    ``discount_type`` (all derived from ``Discount_rate``) and ``distance``
    (``Distance`` with missing values replaced by -1, cast to int).
    """
    raw_discount = df['Discount_rate']
    derived = (
        ('discount_rate', convertRate),
        ('discount_man', getDiscountMan),
        ('discount_jian', getDiscountJian),
        ('discount_type', getDiscountType),
    )
    for column, parser in derived:
        df[column] = raw_discount.apply(parser)

    # Missing distances become the sentinel -1 so the column can be int.
    df['distance'] = df['Distance'].fillna(-1).astype(int)
    return df

# Add the derived discount/distance columns to both frames. processData
# mutates its argument and returns it, so the rebinding keeps the same data.
dfoff = processData(dfoff)
dftest = processData(dftest)

# Preview both frames.
dfoff.head()
dftest.head()

	User_id	Merchant_id	Coupon_id	Discount_rate	Distance	Date_received	discount_rate	discount_man	discount_jian	discount_type	distance
0	4129537	450	9983	30:5	1.0	20160712	0.833333	30	5	1	1
1	6949378	1300	3429	30:5	NaN	20160706	0.833333	30	5	1	-1
2	2166529	7113	6928	200:20	5.0	20160727	0.900000	200	20	1	5
3	2166529	7113	1808	100:10	5.0	20160727	0.900000	100	10	1	5
4	6172162	7605	6500	30:1	2.0	20160708	0.966667	30	1	1	2

# Sorted distinct non-null coupon-received dates.
date_received = dfoff['Date_received'].unique()
date_received = sorted(date_received[pd.notnull(date_received)])

# Sorted distinct non-null purchase dates ...
date_buy = dfoff['Date'].unique()
date_buy = sorted(date_buy[pd.notnull(date_buy)])
# NOTE(review): ... immediately overwritten here by the sorted list of ALL
# non-null purchase dates (duplicates kept). Confirm which one is intended.
date_buy = sorted(dfoff[dfoff['Date'].notnull()]['Date'])

dfoff.head()

	User_id	Merchant_id	Coupon_id	Discount_rate	Distance	Date_received	Date	discount_rate	discount_man	discount_jian	discount_type	distance
0	1439408	2632	NaN	NaN	0.0	NaN	20160217.0	1.000000	0	0	NaN	0
1	1439408	4663	11002.0	150:20	1.0	20160528.0	NaN	0.866667	150	20	1.0	1
2	1439408	2632	8591.0	20:1	0.0	20160217.0	NaN	0.950000	20	1	1.0	0
3	1439408	2632	1078.0	20:1	0.0	20160319.0	NaN	0.950000	20	1	1.0	0
4	1439408	2632	8591.0	20:1	0.0	20160613.0	NaN	0.950000	20	1	1.0	0

# size() vs count(): size() counts every row in the group, including rows
# whose value is NaN; count() only counts non-null values.
# Number of coupons received per day.
# Uses .size().reset_index(name=...) rather than passing a dict to
# SeriesGroupBy.agg, which pandas has deprecated.
couponbydate = (
    dfoff[dfoff['Date_received'].notnull()]
    .groupby('Date_received')
    .size()
    .reset_index(name='count')
)
couponbydate.columns = ['Date_received', 'count']

# Number of coupons consumed, grouped by the day they were received
# (rows where both Date and Date_received are present).
buybydate = dfoff[(dfoff['Date'].notnull()) & (dfoff['Date_received'].notnull())][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
buybydate.columns = ['Date_received', 'count']

dfoff.head()
#couponbydate.head()

	User_id	Merchant_id	Coupon_id	Discount_rate	Distance	Date_received	Date	discount_rate	discount_man	discount_jian	discount_type	distance
0	1439408	2632	NaN	NaN	0.0	NaN	20160217.0	1.000000	0	0	NaN	0
1	1439408	4663	11002.0	150:20	1.0	20160528.0	NaN	0.866667	150	20	1.0	1
2	1439408	2632	8591.0	20:1	0.0	20160217.0	NaN	0.950000	20	1	1.0	0
3	1439408	2632	1078.0	20:1	0.0	20160319.0	NaN	0.950000	20	1	1.0	0
4	1439408	2632	8591.0	20:1	0.0	20160613.0	NaN	0.950000	20	1	1.0	0

# Per-user number of coupon purchases: rows where both a purchase date and a
# coupon-received date are present.
temp_user_coupon = dfoff[(dfoff['Date'].notnull()) & (dfoff['Date_received'].notnull())]
user_coupon = temp_user_coupon.groupby(['User_id']).size().reset_index(name='user_coupon')

# Users with no such rows get 0. Assign the filled column back instead of
# calling fillna(inplace=True) on an attribute-accessed column, which is a
# chained in-place update that pandas warns about and may not propagate.
dfoff = pd.merge(dfoff, user_coupon, how='left', on='User_id')
dfoff['user_coupon'] = dfoff['user_coupon'].fillna(0)
dftest = pd.merge(dftest, user_coupon, how='left', on='User_id')
dftest['user_coupon'] = dftest['user_coupon'].fillna(0)

# Span in days between each user's first and last coupon purchase.
# Named aggregation replaces the deprecated dict-to-SeriesGroupBy.agg form.
date_coupon = temp_user_coupon.groupby('User_id', as_index=False).agg(
    cmax=('Date', 'max'),
    cmin=('Date', 'min'),
)
# Dates are stored as floats such as 20160217.0: cast to int, then to text,
# and parse the whole column at once.
date_coupon['cmax'] = pd.to_datetime(date_coupon['cmax'].astype('int').astype('str'), format='%Y%m%d')
date_coupon['cmin'] = pd.to_datetime(date_coupon['cmin'].astype('int').astype('str'), format='%Y%m%d')
date_coupon['cdate_interval'] = (date_coupon['cmax'] - date_coupon['cmin']).dt.days

# Users without coupon purchases get the sentinel -1 (cmax/cmin stay NaT).
dfoff = pd.merge(dfoff, date_coupon, how='left', on='User_id')
dftest = pd.merge(dftest, date_coupon, how='left', on='User_id')
dfoff['cdate_interval'] = dfoff['cdate_interval'].fillna(-1)
dftest['cdate_interval'] = dftest['cdate_interval'].fillna(-1)

dfoff.head()
dftest.head()

	User_id	Merchant_id	Coupon_id	Discount_rate	Distance	Date_received	discount_rate	discount_man	discount_jian	discount_type	distance	user_coupon	cmax	cmin	cdate_interval
0	4129537	450	9983	30:5	1.0	20160712	0.833333	30	5	1	1	0.0	NaT	NaT	-1.0
1	6949378	1300	3429	30:5	NaN	20160706	0.833333	30	5	1	-1	1.0	2016-05-08	2016-05-08	0.0
2	2166529	7113	6928	200:20	5.0	20160727	0.900000	200	20	1	5	0.0	NaT	NaT	-1.0
3	2166529	7113	1808	100:10	5.0	20160727	0.900000	100	10	1	5	0.0	NaT	NaT	-1.0
4	6172162	7605	6500	30:1	2.0	20160708	0.966667	30	1	1	2	0.0	NaT	NaT	-1.0

# Per-user number of ordinary purchases: rows with a purchase date but no
# coupon-received date.
temp_user_nocoupon = dfoff[(dfoff['Date'].notnull()) & (dfoff['Date_received'].isnull())]
user_nocoupon = temp_user_nocoupon.groupby(['User_id']).size().reset_index(name='user_nocoupon')

# Users with no such rows get 0. Assign the filled column back instead of
# calling fillna(inplace=True) on an attribute-accessed column, which is a
# chained in-place update that pandas warns about and may not propagate.
dfoff = pd.merge(dfoff, user_nocoupon, how='left', on='User_id')
dfoff['user_nocoupon'] = dfoff['user_nocoupon'].fillna(0)
dftest = pd.merge(dftest, user_nocoupon, how='left', on='User_id')
dftest['user_nocoupon'] = dftest['user_nocoupon'].fillna(0)

# Span in days between each user's first and last ordinary purchase.
# Named aggregation replaces the deprecated dict-to-SeriesGroupBy.agg form;
# the output columns keep their original names 'max' and 'min'.
date1 = temp_user_nocoupon.groupby('User_id', as_index=False).agg(
    max=('Date', 'max'),
    min=('Date', 'min'),
)
# Dates are stored as floats such as 20160217.0: cast to int, then to text,
# and parse the whole column at once.
date1['max'] = pd.to_datetime(date1['max'].astype('int').astype('str'), format='%Y%m%d')
date1['min'] = pd.to_datetime(date1['min'].astype('int').astype('str'), format='%Y%m%d')
date1['date_interval'] = (date1['max'] - date1['min']).dt.days

# Users without ordinary purchases get the sentinel -1 (max/min stay NaT).
dfoff = pd.merge(dfoff, date1, how='left', on='User_id')
dftest = pd.merge(dftest, date1, how='left', on='User_id')
dfoff['date_interval'] = dfoff['date_interval'].fillna(-1)
dftest['date_interval'] = dftest['date_interval'].fillna(-1)

dfoff.head()
dftest.head()

	User_id	Merchant_id	Coupon_id	Discount_rate	Distance	Date_received	discount_rate	discount_man	discount_jian	discount_type	distance	user_coupon	cmax	cmin	cdate_interval	user_nocoupon	max	min	date_interval
0	4129537	450	9983	30:5	1.0	20160712	0.833333	30	5	1	1	0.0	NaT	NaT	-1.0	0.0	NaT	NaT	-1.0
1	6949378	1300	3429	30:5	NaN	20160706	0.833333	30	5	1	-1	1.0	2016-05-08	2016-05-08	0.0	1.0	2016-05-05	2016-05-05	0.0
2	2166529	7113	6928	200:20	5.0	20160727	0.900000	200	20	1	5	0.0	NaT	NaT	-1.0	0.0	NaT	NaT	-1.0
3	2166529	7113	1808	100:10	5.0	20160727	0.900000	100	10	1	5	0.0	NaT	NaT	-1.0	0.0	NaT	NaT	-1.0
4	6172162	7605	6500	30:1	2.0	20160708	0.966667	30	1	1	2	0.0	NaT	NaT	-1.0	8.0	2016-05-11	2016-01-05	127.0

def getWeekday(row):
    """Return the weekday number (Monday=1 ... Sunday=7) of a date string.

    Args:
        row: A string whose first eight characters are ``YYYYMMDD``, or the
            literal string ``'nan'`` for a missing date.

    Returns:
        ``np.nan`` for ``'nan'``, otherwise an int from 1 to 7.
    """
    if row == 'nan':
        return np.nan
    year, month, day = int(row[:4]), int(row[4:6]), int(row[6:8])
    # isoweekday() is weekday() + 1, i.e. Monday == 1.
    return date(year, month, day).isoweekday()

# Weekday (1-7) on which the coupon was received. The column is cast to str
# first because getWeekday slices the text and recognises missing values by
# the literal 'nan'.
dfoff['weekday'] = dfoff['Date_received'].astype('str').apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype('str').apply(getWeekday)

dfoff.head()
dftest.head()

	User_id	Merchant_id	Coupon_id	Discount_rate	Distance	Date_received	discount_rate	discount_man	discount_jian	discount_type	distance	user_coupon	cmax	cmin	cdate_interval	user_nocoupon	max	min	date_interval	weekday
0	4129537	450	9983	30:5	1.0	20160712	0.833333	30	5	1	1	0.0	NaT	NaT	-1.0	0.0	NaT	NaT	-1.0	2
1	6949378	1300	3429	30:5	NaN	20160706	0.833333	30	5	1	-1	1.0	2016-05-08	2016-05-08	0.0	1.0	2016-05-05	2016-05-05	0.0	3
2	2166529	7113	6928	200:20	5.0	20160727	0.900000	200	20	1	5	0.0	NaT	NaT	-1.0	0.0	NaT	NaT	-1.0	3
3	2166529	7113	1808	100:10	5.0	20160727	0.900000	100	10	1	5	0.0	NaT	NaT	-1.0	0.0	NaT	NaT	-1.0	3
4	6172162	7605	6500	30:1	2.0	20160708	0.966667	30	1	1	2	0.0	NaT	NaT	-1.0	8.0	2016-05-11	2016-01-05	127.0	5

# weekday_type: 1 for Saturday and Sunday (6, 7), 0 otherwise
# (a missing weekday also gives 0).
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x: 1 if x in [6, 7] else 0)
dftest['weekday_type'] = dftest['weekday'].apply(lambda x: 1 if x in [6, 7] else 0)

# One-hot encode weekday into weekday_1 ... weekday_7.
# Each column is built by direct comparison, so all seven columns always
# exist and line up with the right day. Renaming the output of get_dummies
# to seven fixed names fails with a length mismatch whenever a frame happens
# to contain fewer than seven distinct weekdays. Rows with a missing weekday
# are False in every column.
weekdaycols = ['weekday_' + str(i) for i in range(1, 8)]
for day_number, day_col in enumerate(weekdaycols, start=1):
    dfoff[day_col] = dfoff['weekday'] == day_number
    dftest[day_col] = dftest['weekday'] == day_number

dfoff.head()
dftest.head()

	User_id	Merchant_id	Coupon_id	Discount_rate	Distance	Date_received	discount_rate	discount_man	discount_jian	discount_type	...	date_interval	weekday	weekday_1	weekday_2	weekday_3	weekday_4	weekday_5	weekday_6	weekday_7
0	4129537	450	9983	30:5	1.0	20160712	0.833333	30	5	1	...	-1.0	2	False	True	False	False	False	False	False
1	6949378	1300	3429	30:5	NaN	20160706	0.833333	30	5	1	...	0.0	3	False	False	True	False	False	False	False
2	2166529	7113	6928	200:20	5.0	20160727	0.900000	200	20	1	...	-1.0	3	False	False	True	False	False	False	False
3	2166529	7113	1808	100:10	5.0	20160727	0.900000	100	10	1	...	-1.0	3	False	False	True	False	False	False	False
4	6172162	7605	6500	30:1	2.0	20160708	0.966667	30	1	1	...	127.0	5	False	False	False	False	True	False	False

5 rows × 28 columns

def label(row):
    """Compute the training label for one record.

    Args:
        row: A mapping-like record with ``'Date_received'`` and ``'Date'``
            values in ``YYYYMMDD`` form (or missing).

    Returns:
        ``-1`` when no coupon was received; ``1`` when a purchase date exists
        and is at most 15 days after the received date (a purchase date
        earlier than the received date also satisfies this check); ``0``
        otherwise.
    """
    received = row['Date_received']
    if pd.isnull(received):
        return -1

    used = row['Date']
    if pd.isnull(used):
        return 0

    gap = pd.to_datetime(used, format='%Y%m%d') - pd.to_datetime(received, format='%Y%m%d')
    return 1 if gap <= pd.Timedelta(15, 'D') else 0

# Label every training record row by row (-1 / 0 / 1, see label()).
# dftest gets no label column here.
dfoff['label'] = dfoff.apply(label, axis = 1)

dfoff.head()
dftest.head()

	User_id	Merchant_id	Coupon_id	Discount_rate	Distance	Date_received	discount_rate	discount_man	discount_jian	discount_type	...	date_interval	weekday	weekday_1	weekday_2	weekday_3	weekday_4	weekday_5	weekday_6	weekday_7
0	4129537	450	9983	30:5	1.0	20160712	0.833333	30	5	1	...	-1.0	2	False	True	False	False	False	False	False
1	6949378	1300	3429	30:5	NaN	20160706	0.833333	30	5	1	...	0.0	3	False	False	True	False	False	False	False
2	2166529	7113	6928	200:20	5.0	20160727	0.900000	200	20	1	...	-1.0	3	False	False	True	False	False	False	False
3	2166529	7113	1808	100:10	5.0	20160727	0.900000	100	10	1	...	-1.0	3	False	False	True	False	False	False	False
4	6172162	7605	6500	30:1	2.0	20160708	0.966667	30	1	1	...	127.0	5	False	False	False	False	True	False	False

5 rows × 28 columns

# Print the column names, dtypes and memory usage of the training frame.
dfoff.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 30 columns):
 #   Column          Dtype         
---  ------          -----         
 0   User_id         int64         
 1   Merchant_id     int64         
 2   Coupon_id       float64       
 3   Discount_rate   object        
 4   Distance        float64       
 5   Date_received   float64       
 6   Date            float64       
 7   discount_rate   float64       
 8   discount_man    int64         
 9   discount_jian   int64         
 10  discount_type   float64       
 11  distance        int32         
 12  user_coupon     float64       
 13  cmax            datetime64[ns]
 14  cmin            datetime64[ns]
 15  cdate_interval  float64       
 16  user_nocoupon   float64       
 17  max             datetime64[ns]
 18  min             datetime64[ns]
 19  date_interval   float64       
 20  weekday         float64       
 21  weekday_type    int64         
 22  weekday_1       bool          
 23  weekday_2       bool          
 24  weekday_3       bool          
 25  weekday_4       bool          
 26  weekday_5       bool          
 27  weekday_6       bool          
 28  weekday_7       bool          
 29  label           int64         
dtypes: bool(7), datetime64[ns](4), float64(11), int32(1), int64(6), object(1)
memory usage: 313.0+ MB

# data split
# Keep only labelled records; label == -1 marks rows with no coupon received.
df = dfoff[dfoff['label'] != -1].copy()

# Persist the labelled frame for later reuse.
df.to_csv('./data/df.csv', index=False)

# Time-based split on the coupon-received date (numeric YYYYMMDD):
# train is everything before 2016-05-16, valid is 2016-05-16 .. 2016-06-15.
train = df[(df['Date_received'] < 20160516)].copy()
valid = df[(df['Date_received'] >= 20160516) & (df['Date_received'] <= 20160615)].copy()

# feature
# NOTE: df is re-read from disk here, but train/valid above were built from
# the in-memory frame; below, the re-read df is only used to print columns.
df = pd.read_csv('./data/df.csv')

print(df.columns)
# Model inputs. cdate_interval and the cmax/cmin/max/min date columns are
# computed earlier but are not included in this list.
original_feature = ['discount_rate', 'discount_man',
       'discount_jian', 'discount_type', 'distance', 'user_coupon', 'weekday',
       'weekday_type','user_nocoupon', 'date_interval'] + weekdaycols

Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'discount_rate', 'discount_man',
       'discount_jian', 'discount_type', 'distance', 'user_coupon', 'cmax',
       'cmin', 'cdate_interval', 'user_nocoupon', 'max', 'min',
       'date_interval', 'weekday', 'weekday_type', 'weekday_1', 'weekday_2',
       'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6', 'weekday_7',
       'label'],
      dtype='object')

print("----train-----")
# Linear classifier trained with SGD on the logistic loss with an
# elastic-net penalty; the log loss is what makes predict_proba usable below.
SGD = SGDClassifier(
    loss='log_loss',
    penalty='elasticnet',
    fit_intercept=True,
    max_iter=100,
    shuffle=True,
    alpha = 0.01,
    l1_ratio = 0.01,
    n_jobs=1,
    class_weight=None
)
# Fit on the training split only; valid is held out for evaluation.
SGD.fit(train[original_feature], train['label'])

----train-----

SGDClassifier(alpha=0.01, l1_ratio=0.01, loss='log_loss', max_iter=100,
              n_jobs=1, penalty='elasticnet')

# #### Prediction and evaluation

# Column 1 of predict_proba is the probability of the positive class (label 1).
SGD_pre_train=SGD.predict_proba(train[original_feature])
SGD_auc_train = roc_auc_score(train['label'],SGD_pre_train[:,1])
# NOTE: despite its name, SGD_pre_test holds predictions for the validation
# split, not for dftest.
SGD_pre_test=SGD.predict_proba(valid[original_feature])
SGD_auc_valid = roc_auc_score(valid['label'],SGD_pre_test[:,1])


print('SGD_auc_train: ', SGD_auc_train)
print('SGD_auc_valid: ', SGD_auc_valid)

SGD_auc_train:  0.9154812464203588
SGD_auc_valid:  0.8621531088485809

# Save the fitted model to ./data/SGD.pkl with pickle.
print("---save model---")
with open('./data/SGD.pkl', 'wb') as f:
    pickle.dump(SGD, f)

---save model---

# test prediction for submission
# Score the test set with the same feature list used for training.
y_test_pred = SGD.predict_proba(dftest[original_feature])
# The submission holds the three identifying columns plus the predicted
# probability of label 1; it is written without header or index.
dftest1 = dftest[['User_id','Coupon_id','Date_received']].copy()
dftest1['label'] = y_test_pred[:,1]
dftest1.to_csv('./o2osubmit.csv', index=False, header=False)
dftest1.head()