天池 O2O 优惠券使用预测思路解读

101 篇文章 5 订阅

原文参考:
https://blog.csdn.net/red_stone1/article/details/83859845
所用数据:https://pan.baidu.com/s/18g-PZcdSWwzxtjpuSmjG-Q
代码已由本人实践跑通。
使用 SGDClassifier(随机梯度下降分类器)进行预测。


# import libraries necessary for this project
import os, sys, pickle
​
import numpy as np
import pandas as pd
from datetime import date
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler
​
# display for this notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
​
# Load the offline training set, the online training set and the offline test set.
# Raw strings keep the Windows backslashes literal: "\d", "\O" and "\c" are not
# valid escape sequences, and newer Python versions warn about them in normal
# string literals. The resulting paths are unchanged.
dfoff = pd.read_csv(r'C:\data\O2O_tianchi\ccf_offline_stage1_train.csv')
dfon = pd.read_csv(r'C:\data\O2O_tianchi\ccf_online_stage1_train.csv')
dftest = pd.read_csv(r'C:\data\O2O_tianchi\ccf_offline_stage1_test_revised.csv')

# Replace missing values with the sentinel string 'null', then store both
# columns as text so later code can compare against 'null' directly.
dfoff['Date_received'] = dfoff['Date_received'].fillna('null')
dfoff['Discount_rate'] = dfoff['Discount_rate'].fillna('null')
dfoff['Date_received'] = dfoff['Date_received'].astype('str')
dfoff['Discount_rate'] = dfoff['Discount_rate'].astype('str')

# Print the column dtypes and memory usage of the cleaned frame.
dfoff.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 7 columns):
User_id          int64
Merchant_id      int64
Coupon_id        float64
Discount_rate    object
Distance         float64
Date_received    object
Date             float64
dtypes: float64(3), int64(2), object(2)
memory usage: 93.7+ MB
# dfoff['Date']=dfoff['Date'].astype('str')
# Mark a missing purchase date with the sentinel string 'null' and keep the
# column as text, mirroring the treatment of Date_received.
dfoff['Date'] = dfoff['Date'].fillna('null').astype('str')

# Count the rows in each (coupon received?) x (purchase made?) combination.
got_coupon = dfoff['Date_received'] != 'null'
purchased = dfoff['Date'] != 'null'
print('有优惠卷,购买商品:%d' % dfoff[got_coupon & purchased].shape[0])
print('有优惠卷,未购商品:%d' % dfoff[got_coupon & ~purchased].shape[0])
print('无优惠卷,购买商品:%d' % dfoff[~got_coupon & purchased].shape[0])
print('无优惠卷,未购商品:%d' % dfoff[~got_coupon & ~purchased].shape[0])
# print('有优惠卷,未购商品:%d' % dfoff[(dfoff['Date_received'] != 0) & (dfoff['Date'] == 0)].shape[0])
# print('无优惠卷,购买商品:%d' % dfoff[(dfoff['Date_received'] == 0) & (dfoff['Date'] != 0)].shape[0])
# print('无优惠卷,未购商品:%d' % dfoff[(dfoff['Date_received'] == 0) & (dfoff['Date'] == 0)].shape[0])
​
有优惠卷,购买商品:75382
有优惠卷,未购商品:977900
无优惠卷,购买商品:701602
无优惠卷,未购商品:0
#查看打折率(Discount_rate)字段类型
# print('Discount_rate 类型:\n',dfoff['Discount_rate'].unique())
# Convert Discount_rate and Distance
def getDiscountType(row):
    """Classify a raw ``Discount_rate`` string.

    Returns the string ``'null'`` when ``row`` is ``'null'``, ``1`` when
    ``row`` contains a colon (the ``'x:y'`` form), and ``0`` otherwise.
    """
    if row == 'null':
        return 'null'
    # A colon distinguishes the 'x:y' form from a plain rate.
    return 1 if ':' in row else 0
#计算折扣率
def convertRate(row):
    """Convert a raw ``Discount_rate`` string to a float rate.

    ``'null'`` maps to ``1.0``; an ``'x:y'`` value maps to ``1 - y / x``;
    anything else is parsed directly with ``float``.
    """
    if row == 'null':
        return 1.0
    if ':' not in row:
        return float(row)
    parts = row.split(':')
    threshold, reduction = parts[0], parts[1]
    return 1.0 - float(reduction)/float(threshold)
#计算满多少  
def getDiscountMan(row):
    """Return the integer before the colon of an ``'x:y'`` discount string.

    Values without a colon (including ``'null'`` and plain rates) yield ``0``.
    """
    if ':' not in row:
        return 0
    return int(row.split(':')[0])
#减多少
def getDiscountJian(row):
    """Return the integer after the colon of an ``'x:y'`` discount string.

    Values without a colon (including ``'null'`` and plain rates) yield ``0``.
    """
    if ':' not in row:
        return 0
    return int(row.split(':')[1])
   
def processData(df):
    """Add the discount-derived feature columns to ``df`` in place.

    Each new column is produced by applying one converter to the raw
    ``Discount_rate`` column. The distinct ``discount_rate`` values are
    printed as a sanity check, and the same ``df`` object is returned.
    """
    raw_discount = df['Discount_rate']
    # (new column, converter) pairs, applied in this order.
    derived_columns = (
        ('discount_type', getDiscountType),
        ('discount_rate', convertRate),
        ('discount_man', getDiscountMan),
        ('discount_jian', getDiscountJian),
    )
    for column_name, converter in derived_columns:
        df[column_name] = raw_discount.apply(converter)

    print(df['discount_rate'].unique())

    return df
​
# dfoff.head()
# Add the discount-derived feature columns to the training and test frames;
# each call also prints the distinct discount_rate values it produced.
dfoff = processData(dfoff)
dftest = processData(dftest)
# dfoff.head()
[1.         0.86666667 0.95       0.9        0.83333333 0.8
 0.5        0.85       0.75       0.66666667 0.93333333 0.7
 0.6        0.96666667 0.98       0.99       0.975      0.33333333
 0.2        0.4       ]
[0.83333333 0.9        0.96666667 0.8        0.95       0.75
 0.98       0.5        0.86666667 0.6        0.66666667 0.7
 0.85       0.33333333 0.94       0.93333333 0.975      0.99      ]
#距离
# print('Distance 类型:',dfoff['Distance'].unique())
# print(dfoff['distance'].unique())
# convert distance
# Encode a missing Distance as -1 so the derived 'distance' column has no NaN.
# The same call is used for both frames; the earlier duplicate assignment to
# dfoff['distance'] was immediately overwritten and has been dropped.
dfoff['distance'] = dfoff['Distance'].fillna(-1)
dftest['distance'] = dftest['Distance'].fillna(-1)
# dftest['distance'] = dftest['Distance'].fillna(-1)
# dftest['distance'] = dftest['Distance'].astype('str')# print(dftest['distance'].unique())
[ 0.  1. -1.  2. 10.  4.  7.  9.  3.  5.  6.  8.]
# dfoff.head()
# dftest.head()
# dfoff['Date_received'] == 'null'
# dfoff['Date_received'][0:4]
# dfoff['Date_received'][4:6]
# date(int(dfoff['Date_received'][0:4]), int(dfoff['Date_received'][4:6]), int(dfoff['Date_received'][6:8])).weekday() + 1
# # date(int(dfoff['Date_received'].astype(str)[0:4]), int(dfoff['Date_received'].astype(str)[4:6]), dfoff['Date_received'].astype(str)[6:8])).weekday()
# # date(2018, 12, 24).weekday() + 1
# date(2018, 12, 24)
# int(dfoff['Date_received'].iloc[1][0:4])
# int(dfoff['Date_received'].iloc[1][4:6])
# int(dfoff['Date_received'].iloc[1][6:8])
# date(int(dfoff['Date_received'].iloc[1][0:4]), int(dfoff['Date_received'].iloc[1][4:6]), int(dfoff['Date_received'].iloc[1][6:8])).weekday()+1
# Names of the seven one-hot weekday columns: weekday_1 ... weekday_7.
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
# print(weekdaycols)
#3.领劵日期(Date_received)
def getWeekday(row):
    """Return the ISO weekday (1 = Monday ... 7 = Sunday) of a date string.

    ``row`` is read as ``YYYYMMDD`` from its first eight characters; any
    trailing characters are ignored. The sentinel ``'null'`` is returned
    unchanged.
    """
    if row == 'null':
        return row
    year, month, day = (int(row[start:stop]) for start, stop in ((0, 4), (4, 6), (6, 8)))
    # isoweekday() is weekday() + 1, i.e. Monday maps to 1.
    return date(year, month, day).isoweekday()
#apply,应用函数, 逐行处理
# Derive the weekday features from the coupon-received date for both frames.
weekdaycols = ['weekday_' + str(i) for i in range(1, 8)]
for frame in (dfoff, dftest):
    # Day of week as 1..7, or the sentinel 'null' when getWeekday receives 'null'.
    frame['weekday'] = frame['Date_received'].astype(str).apply(getWeekday)
    # weekday_type: 1 for days 6 and 7 (Saturday/Sunday), 0 otherwise.
    frame['weekday_type'] = frame['weekday'].apply(lambda x: 1 if x in [6, 7] else 0)
    # One-hot encode the weekday. Each column is built by comparing against its
    # own day number, so the names cannot be misaligned (or the assignment fail)
    # when some weekday never occurs in a frame; 'null' rows get all zeros.
    for day_number, column in enumerate(weekdaycols, start=1):
        frame[column] = (frame['weekday'] == day_number).astype('uint8')
​
# dfoff.head()
# dftest.head()
#标注标签 Label
def _parse_yyyymmdd(value):
    """Return the date encoded in the first eight characters of ``value``, or None if it is missing (NaN/None)."""
    if pd.isna(value):
        return None
    text = str(value)
    return date(int(text[0:4]), int(text[4:6]), int(text[6:8]))


def label(row):
    """Compute the training label for one record.

    Args:
        row: mapping-like record with ``'Date_received'`` and ``'Date'``
            entries holding ``YYYYMMDD`` values or the sentinel ``'null'``.

    Returns:
        ``-1`` when no coupon was received (``Date_received == 'null'``),
        ``1`` when a purchase date exists and lies at most 15 days after the
        received date, and ``0`` otherwise.

    Raises:
        ValueError: if a non-missing value does not start with eight digits
            forming a valid date.
    """
    if row['Date_received'] == 'null':
        return -1
    if row['Date'] != 'null':
        # Only the eight-digit prefix is parsed, the same way getWeekday reads
        # dates. Values that went through float -> str carry a '.0' suffix
        # (e.g. '20160217.0'), which strict '%Y%m%d' parsing does not accept.
        received = _parse_yyyymmdd(row['Date_received'])
        used = _parse_yyyymmdd(row['Date'])
        if received is not None and used is not None and (used - received).days <= 15:
            return 1
    return 0
​
# Apply label() to every row (axis=1) to build the target column.
dfoff['label'] = dfoff.apply(label, axis=1)# dfoff.head()
# Notebook inspection: show the column at position 6 for the first row.
dfoff.iloc[:1,6]
0    20160217.0
Name: Date, dtype: object
#看一下正负样本究竟有多少
# print(dfoff['label'].value_counts())# data split
# Keep only rows where a coupon was received (label -1 marks "no coupon").
df = dfoff[dfoff['label'] != -1].copy()
# Compare on the eight-character YYYYMMDD prefix only. Date_received is text,
# and if it carries a float suffix (e.g. '20160615.0' — TODO confirm against
# the loaded data) a plain string comparison would rank it above '20160615'
# and silently drop the last validation day.
received_day = df['Date_received'].str[:8]
train = df[received_day < '20160516'].copy()
valid = df[(received_day >= '20160516') & (received_day <= '20160615')].copy()
# print('Train Set: \n', train['label'].value_counts())
# print('Valid Set: \n', valid['label'].value_counts())
#2.构建模型
def check_model(data, predictors):
    """Grid-search a scaled SGD classifier and return the fitted search.

    Args:
        data: table supporting ``data[predictors]`` and ``data['label']``
            (presumably a pandas DataFrame — verify against the caller).
        predictors: list of feature column names used as model input.

    Returns:
        The ``GridSearchCV`` object after fitting on ``data``.
    """
    # Logistic-regression loss with an elastic-net (L1 + L2) penalty.
    # NOTE(review): newer scikit-learn releases renamed this loss to
    # 'log_loss' and eventually stopped accepting 'log' — confirm the
    # installed version before upgrading.
    classifier = SGDClassifier(
        loss='log',
        penalty='elasticnet',
        fit_intercept=True,  # fit an intercept term
        max_iter=100,
        shuffle=True,  # reshuffle the training data after each epoch
        n_jobs=-1,  # use all processors
        class_weight=None)  # every class keeps weight one

    # The pipeline bundles scaling and the classifier so the same steps are
    # applied consistently when fitting and when predicting on new data.
    model = Pipeline(steps=[
        ('ss', StandardScaler()),  # transformer
        ('en', classifier)  # estimator
    ])

    parameters = {
        'en__alpha': [0.001, 0.01, 0.1],
        'en__l1_ratio': [0.001, 0.01, 0.1]
    }

    # Stratified folds keep the class proportions of the full data set in
    # every train/test split.
    folder = StratifiedKFold(n_splits=3, shuffle=True)

    # Exhaustive search over the parameter grid above.
    grid_search = GridSearchCV(
        model,
        parameters,
        cv=folder,
        n_jobs=-1,  # -1 means using all processors
        verbose=1)
    grid_search = grid_search.fit(data[predictors],
                                  data['label'])

    return grid_search
​
# Notebook inspection: list the columns of the training frame.
train.columns
Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
       'Date_received', 'Date', 'discount_type', 'discount_rate',
       'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type',
       'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
       'weekday_6', 'weekday_7', 'label'],
      dtype='object')
#3.训练
# Feature columns fed to the model, in the order they are passed to it.
predictors = [
    'discount_rate',
    'discount_type',
    'discount_man',
    'discount_jian',
    'distance',
    'weekday',
    'weekday_type',
    'weekday_1',
    'weekday_2',
    'weekday_3',
    'weekday_4',
    'weekday_5',
    'weekday_6',
    'weekday_7',
]
# Run the grid search on the training split.
model = check_model(train, predictors)
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   49.3s finished
# train_orignal=train_orignal.iloc[:,[3,4]]
# train_orignal_01=train_orignal.iloc[:,[3,4]]
# train_orignal_01=train_orignal.iloc[:,3:8]
# train_orignal_01.head()
#4.验证
# valid predict
#valid是验证集
# Score the validation split and keep the second predict_proba column as the
# predicted probability for each row.
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]

# Average AUC: compute one AUC per coupon and average them.
vg = valid1.groupby(['Coupon_id'])
aucs = []
for _, coupon_rows in vg:
    # Skip coupons whose labels do not contain exactly two distinct values.
    if coupon_rows['label'].nunique(dropna=False) == 2:
        fpr, tpr, thresholds = roc_curve(coupon_rows['label'], coupon_rows['pred_prob'], pos_label=1)
        aucs.append(auc(fpr, tpr))
print(np.average(aucs))

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值