天池 O2O 优惠券使用预测思路解读

最新推荐文章于 2024-03-26 09:31:36 发布
炼丹师666
最新推荐文章于 2024-03-26 09:31:36 发布
阅读量695
点赞数
分类专栏：算法 Python
本文链接：https://blog.csdn.net/wj1298250240/article/details/103163867
版权
Python 同时被 2 个专栏收录
123 篇文章 1 订阅
订阅专栏
算法
101 篇文章 5 订阅
订阅专栏
原文参考：
https://blog.csdn.net/red_stone1/article/details/83859845
所用数据：https://pan.baidu.com/s/18g-PZcdSWwzxtjpuSmjG-Q
以下代码已由本人实际运行验证，可以走通。
本文使用 SGDClassifier（随机梯度下降分类器）进行预测。

# import libraries necessary for this project
import os, sys, pickle

import numpy as np
import pandas as pd
from datetime import date
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler

# display for this notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

# Load the offline training set, the online training set and the offline test set.
# Raw string literals keep every backslash of the Windows path literal: in a normal
# string, sequences such as "\d" or "\c" are invalid escapes (a SyntaxWarning on
# recent Python versions), and a path segment starting with "t" or "n" would
# silently turn into a tab or newline.
dfoff = pd.read_csv(r'C:\data\O2O_tianchi\ccf_offline_stage1_train.csv')
dfon = pd.read_csv(r'C:\data\O2O_tianchi\ccf_online_stage1_train.csv')
dftest = pd.read_csv(r'C:\data\O2O_tianchi\ccf_offline_stage1_test_revised.csv')
# dfoff.head(5)
# dfoff.info()
# dfoff.shape

# dfoff['Date_received']=dfoff['Date_received'].astype('str')
# Replace missing values with the sentinel string 'null' and store both columns
# as strings, so later cells can compare them against 'null' directly.
for column in ('Date_received', 'Discount_rate'):
    dfoff[column] = dfoff[column].fillna('null').astype('str')

# Print the column dtypes and memory usage after the conversion.
dfoff.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 7 columns):
User_id          int64
Merchant_id      int64
Coupon_id        float64
Discount_rate    object
Distance         float64
Date_received    object
Date             float64
dtypes: float64(3), int64(2), object(2)
memory usage: 93.7+ MB
# dfoff['Date']=dfoff['Date'].astype('str')
# Same treatment for the purchase date: missing -> 'null', then cast to string.
dfoff['Date'] = dfoff['Date'].fillna('null').astype('str')

# Boolean masks: a row "has a coupon" when Date_received is set and
# "has a purchase" when Date is set.
has_coupon = dfoff['Date_received'] != 'null'
has_purchase = dfoff['Date'] != 'null'

# Row counts for the four coupon/purchase combinations.
print('有优惠卷，购买商品：%d' % (has_coupon & has_purchase).sum())
print('有优惠卷，未购商品：%d' % (has_coupon & ~has_purchase).sum())
print('无优惠卷，购买商品：%d' % (~has_coupon & has_purchase).sum())
print('无优惠卷，未购商品：%d' % (~has_coupon & ~has_purchase).sum())

# print('有优惠卷，购买商品：%d' % dfoff[(dfoff['Date_received'].isnull()) & (dfoff['Date'].isnull())].shape[0])
# print('有优惠卷，未购商品：%d' % dfoff[(dfoff['Date_received'] != 0) & (dfoff['Date'] == 0)].shape[0])
# print('无优惠卷，购买商品：%d' % dfoff[(dfoff['Date_received'] == 0) & (dfoff['Date'] != 0)].shape[0])
# print('无优惠卷，未购商品：%d' % dfoff[(dfoff['Date_received'] == 0) & (dfoff['Date'] == 0)].shape[0])

有优惠卷，购买商品：75382
有优惠卷，未购商品：977900
无优惠卷，购买商品：701602
无优惠卷，未购商品：0
#查看打折率（Discount_rate）字段类型
# print('Discount_rate 类型：\n',dfoff['Discount_rate'].unique())
# Convert Discount_rate and Distance
def getDiscountType(row):
    """Classify a raw ``Discount_rate`` string by its format.

    Returns the string ``'null'`` unchanged when ``row`` is ``'null'``,
    ``1`` when ``row`` contains a colon, and ``0`` otherwise.
    """
    if row == 'null':
        return 'null'
    # A colon marks the two-part "x:y" form of the discount.
    return int(':' in row)
#计算折扣率
def convertRate(row):
    """Convert a raw ``Discount_rate`` string into a float rate.

    * ``'null'``  -> ``1.0``
    * ``'x:y'``   -> ``1.0 - y / x``
    * otherwise   -> ``float(row)``
    """
    if row == 'null':
        return 1.0
    if ':' not in row:
        return float(row)
    # Only the first two colon-separated fields are used.
    threshold, reduction = row.split(':')[:2]
    return 1.0 - float(reduction) / float(threshold)
#计算满多少  
def getDiscountMan(row):
    """Return the integer before the first colon of an ``'x:y'`` string.

    Strings without a colon (including ``'null'``) yield ``0``.
    """
    head, colon, _ = row.partition(':')
    return int(head) if colon else 0
#减多少
def getDiscountJian(row):
    """Return the integer after the first colon of an ``'x:y'`` string.

    Strings without a colon (including ``'null'``) yield ``0``.
    """
    if ':' not in row:
        return 0
    return int(row.split(':')[1])
   
def processData(df):
    """Add the discount-derived feature columns to ``df`` and return it.

    Four columns are computed from ``df['Discount_rate']``, one per converter:
    ``discount_type``, ``discount_rate``, ``discount_man`` and
    ``discount_jian``. The frame is modified in place; the same object is
    returned. The distinct ``discount_rate`` values are printed as a sanity
    check.
    """
    derived_columns = (
        ('discount_type', getDiscountType),
        ('discount_rate', convertRate),
        ('discount_man', getDiscountMan),
        ('discount_jian', getDiscountJian),
    )
    raw_discount = df['Discount_rate']
    for column, converter in derived_columns:
        df[column] = raw_discount.apply(converter)

    print(df['discount_rate'].unique())

    return df

# dfoff.head()
# Add the discount-derived feature columns to both the training and the test frame.
# Each call also prints the distinct discount_rate values of that frame.
dfoff = processData(dfoff)
dftest = processData(dftest)
# dfoff.head()
[1.         0.86666667 0.95       0.9        0.83333333 0.8
 0.5        0.85       0.75       0.66666667 0.93333333 0.7
 0.6        0.96666667 0.98       0.99       0.975      0.33333333
 0.2        0.4       ]
[0.83333333 0.9        0.96666667 0.8        0.95       0.75
 0.98       0.5        0.86666667 0.6        0.66666667 0.7
 0.85       0.33333333 0.94       0.93333333 0.975      0.99      ]
#距离
# print('Distance 类型：',dfoff['Distance'].unique())
# print(dfoff['distance'].unique())
# convert distance
# Derive a lower-case 'distance' feature in which a missing distance is encoded as -1.
dfoff['distance'] = dfoff['Distance'].fillna(-1)

# Show the distinct values after the fill.
print(dfoff['distance'].unique())

dftest['distance'] = dftest['Distance'].fillna(-1)
# dftest['distance'] = dftest['Distance'].fillna(-1)
# dftest['distance'] = dftest['Distance'].astype('str')

# print(dftest['distance'].unique())
[ 0.  1. -1.  2. 10.  4.  7.  9.  3.  5.  6.  8.]
# dfoff.head()
# dftest.head()
# dfoff['Date_received'] == 'null'
# dfoff['Date_received'][0:4]
# dfoff['Date_received'][4:6]
# date(int(dfoff['Date_received'][0:4]), int(dfoff['Date_received'][4:6]), int(dfoff['Date_received'][6:8])).weekday() + 1
# # date(int(dfoff['Date_received'].astype(str)[0:4]), int(dfoff['Date_received'].astype(str)[4:6]), dfoff['Date_received'].astype(str)[6:8])).weekday()
# # date(2018, 12, 24).weekday() + 1
# date(2018, 12, 24)
# int(dfoff['Date_received'].iloc[1][0:4])
# int(dfoff['Date_received'].iloc[1][4:6])
# int(dfoff['Date_received'].iloc[1][6:8])
# date(int(dfoff['Date_received'].iloc[1][0:4]), int(dfoff['Date_received'].iloc[1][4:6]), int(dfoff['Date_received'].iloc[1][6:8])).weekday()+1
# Names of the seven one-hot weekday columns: weekday_1 ... weekday_7.
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
# print(weekdaycols)
#3.领券日期（Date_received）
def getWeekday(row):
    """Map a date string to its weekday number (Monday=1 ... Sunday=7).

    ``row`` is read as ``YYYYMMDD`` from its first eight characters; anything
    after them is ignored. The sentinel string ``'null'`` is returned
    unchanged.
    """
    if row == 'null':
        return row
    year, month, day = int(row[:4]), int(row[4:6]), int(row[6:8])
    return date(year, month, day).isoweekday()
#apply，应用函数， 逐行处理
# One column name per weekday number, 1 through 7.
weekdaycols = ['weekday_{}'.format(day) for day in range(1, 8)]

for frame in (dfoff, dftest):
    # Weekday number of the coupon-received date; 'null' stays 'null'.
    frame['weekday'] = frame['Date_received'].astype(str).apply(getWeekday)

    # weekday_type: 1 for Saturday/Sunday (6 or 7), 0 for everything else.
    frame['weekday_type'] = frame['weekday'].apply(lambda day: int(day in (6, 7)))

    # One-hot encode the weekday. 'null' becomes NaN first so it gets no dummy
    # column; the rename requires get_dummies to yield exactly seven columns.
    onehot = pd.get_dummies(frame['weekday'].replace('null', np.nan))
    onehot.columns = weekdaycols
    frame[weekdaycols] = onehot

# dfoff.head()
# dftest.head()
#标注标签 Label
def label(row):
    """Assign the training label for one offline record.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) exposing the fields
            ``'Date_received'`` and ``'Date'``; the string ``'null'`` marks a
            missing value.

    Returns:
        ``-1`` when no coupon was received, ``1`` when a purchase date exists
        and lies no more than 15 days after the received date, ``0``
        otherwise.
    """
    if row['Date_received'] == 'null':
        return -1
    if row['Date'] != 'null':
        # Parse only the leading YYYYMMDD digits: the columns were cast from
        # floats, so values can look like '20160217.0', and strict '%Y%m%d'
        # parsing rejects the trailing '.0'.
        used_on = pd.to_datetime(str(row['Date'])[:8], format='%Y%m%d')
        received_on = pd.to_datetime(str(row['Date_received'])[:8], format='%Y%m%d')
        if used_on - received_on <= pd.Timedelta(days=15):
            return 1
    return 0

# Compute the label record by record (axis=1 passes each row to label()).
dfoff['label'] = dfoff.apply(label, axis=1)

# dfoff.head()
dfoff.iloc[:1,6]
0    20160217.0
Name: Date, dtype: object
#看一下正负样本究竟有多少
# print(dfoff['label'].value_counts())

# data split
# Keep only records where a coupon was received (label != -1).
df = dfoff[dfoff['label'] != -1].copy()

# Compare on the leading YYYYMMDD digits only. Date_received was cast from a
# float column, so its strings may end in '.0'; as plain strings,
# '20160615.0' <= '20160615' is False, which would silently drop the last
# day of the validation window.
received_day = df['Date_received'].astype(str).str[:8]

# Time-based split: coupons received before 2016-05-16 train the model,
# those received from 2016-05-16 through 2016-06-15 validate it.
train = df[received_day < '20160516'].copy()
valid = df[(received_day >= '20160516') & (received_day <= '20160615')].copy()
# print('Train Set: \n', train['label'].value_counts())
# print('Valid Set: \n', valid['label'].value_counts())
#2.构建模型
def check_model(data, predictors, random_state=None):
    """Grid-search an elastic-net logistic SGD classifier on ``data``.

    Args:
        data: DataFrame containing the ``predictors`` columns and a
            ``'label'`` column.
        predictors: List of feature column names used as model input.
        random_state: Optional seed forwarded to the classifier and to the
            fold splitter. The default ``None`` keeps the previous,
            non-reproducible behaviour.

    Returns:
        The fitted ``GridSearchCV`` object. Its estimator is a pipeline of
        ``StandardScaler`` followed by ``SGDClassifier``.
    """
    import re

    import sklearn

    # scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and
    # 1.3 removed the old name; pick the spelling the installed version accepts.
    version = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
    legacy_name = bool(version) and tuple(map(int, version.groups())) < (1, 1)
    logistic_loss = 'log' if legacy_name else 'log_loss'

    classifier = SGDClassifier(
        loss=logistic_loss,  # logistic regression
        penalty='elasticnet',  # mix of L1 and L2 regularisation
        fit_intercept=True,
        max_iter=100,
        shuffle=True,  # reshuffle the training data after each epoch
        n_jobs=-1,
        class_weight=None,  # all classes weighted equally
        random_state=random_state)

    # The pipeline bundles scaling and classification, so the scaler fitted on
    # the training data is reused unchanged when predicting on new data.
    model = Pipeline(steps=[
        ('ss', StandardScaler()),  # transformer
        ('en', classifier),  # estimator
    ])

    # Candidate values for the grid search; the 'en__' prefix addresses the
    # classifier step of the pipeline.
    parameters = {
        'en__alpha': [0.001, 0.01, 0.1],
        'en__l1_ratio': [0.001, 0.01, 0.1],
    }

    # Stratified folds keep the class proportions of the full data set in
    # every train/test fold.
    folder = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)

    # Exhaustive search over the parameter grid, using all processors.
    grid_search = GridSearchCV(
        model,
        parameters,
        cv=folder,
        n_jobs=-1,
        verbose=1)
    grid_search.fit(data[predictors], data['label'])

    return grid_search

train.columns
Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'discount_type', 'discount_rate',
       'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type',
       'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
       'weekday_6', 'weekday_7', 'label'],
      dtype='object')

#3.训练
# Feature columns fed to the model, followed by grid-search training on the training split.
predictors = ['discount_rate','discount_type','discount_man','discount_jian','distance','weekday','weekday_type','weekday_1','weekday_2','weekday_3','weekday_4','weekday_5','weekday_6','weekday_7']
model = check_model(train, predictors)
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   49.3s finished
# train_orignal=train_orignal.iloc[:,[3,4]]
# train_orignal_01=train_orignal.iloc[:,[3,4]]
# train_orignal_01=train_orignal.iloc[:,3:8]
# train_orignal_01.head()
#4.验证
# valid predict
#valid是验证集
# Score the validation split; the second probability column is stored on a
# copy of the validation frame as 'pred_prob'.
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.assign(pred_prob=y_valid_pred[:, 1])

# Average AUC over coupons: one ROC curve per Coupon_id group. Groups that do
# not contain exactly two distinct labels are skipped.
vg = valid1.groupby(['Coupon_id'])
aucs = []
for _, coupon_rows in vg:
    if coupon_rows['label'].unique().size != 2:
        continue
    fpr, tpr, thresholds = roc_curve(
        coupon_rows['label'], coupon_rows['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))