原文参考:
https://blog.csdn.net/red_stone1/article/details/83859845
所用数据:https://pan.baidu.com/s/18g-PZcdSWwzxtjpuSmjG-Q
代码自己实践走通
使用 SGDClassifier(随机梯度下降分类器)进行预测
# import libraries necessary for this project
import os, sys, pickle
import numpy as np
import pandas as pd
from datetime import date
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler
# display for this notebook
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Load the offline training log, the online training log and the offline
# test set. Raw string literals keep the Windows backslashes literal:
# '\d', '\O' and '\c' are invalid escape sequences in a normal string.
dfoff = pd.read_csv(r'C:\data\O2O_tianchi\ccf_offline_stage1_train.csv')
dfon = pd.read_csv(r'C:\data\O2O_tianchi\ccf_online_stage1_train.csv')
dftest = pd.read_csv(r'C:\data\O2O_tianchi\ccf_offline_stage1_test_revised.csv')

# Replace missing entries with the marker string 'null' (the feature
# functions below compare against it) and store both columns as text.
dfoff['Date_received'] = dfoff['Date_received'].fillna('null').astype('str')
dfoff['Discount_rate'] = dfoff['Discount_rate'].fillna('null').astype('str')
dfoff.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 7 columns):
User_id int64
Merchant_id int64
Coupon_id float64
Discount_rate object
Distance float64
Date_received object
Date float64
dtypes: float64(3), int64(2), object(2)
memory usage: 93.7+ MB
# dfoff['Date']=dfoff['Date'].astype('str')
# Missing purchase dates become the marker string 'null'; the column is
# then stored as text so it can be compared against that marker.
dfoff['Date'] = dfoff['Date'].fillna('null').astype('str')

# Count the four combinations of "received a coupon" x "made a purchase".
has_coupon = dfoff['Date_received'] != 'null'
has_purchase = dfoff['Date'] != 'null'
print('有优惠卷,购买商品:%d' % dfoff[has_coupon & has_purchase].shape[0])
print('有优惠卷,未购商品:%d' % dfoff[has_coupon & ~has_purchase].shape[0])
print('无优惠卷,购买商品:%d' % dfoff[~has_coupon & has_purchase].shape[0])
print('无优惠卷,未购商品:%d' % dfoff[~has_coupon & ~has_purchase].shape[0])
# print('有优惠卷,购买商品:%d' % dfoff[(dfoff['Date_received'].isnull()) & (dfoff['Date'].isnull())].shape[0])
# print('有优惠卷,未购商品:%d' % dfoff[(dfoff['Date_received'] != 0) & (dfoff['Date'] == 0)].shape[0])
# print('无优惠卷,购买商品:%d' % dfoff[(dfoff['Date_received'] == 0) & (dfoff['Date'] != 0)].shape[0])
# print('无优惠卷,未购商品:%d' % dfoff[(dfoff['Date_received'] == 0) & (dfoff['Date'] == 0)].shape[0])
有优惠卷,购买商品:75382
有优惠卷,未购商品:977900
无优惠卷,购买商品:701602
无优惠卷,未购商品:0
#查看打折率(Discount_rate)字段类型
# print('Discount_rate 类型:\n',dfoff['Discount_rate'].unique())
# Convert Discount_rate and Distance
def getDiscountType(row):
    """Classify one raw ``Discount_rate`` value.

    Args:
        row: A single cell of the ``Discount_rate`` column. Normally a
            string; ``None``/NaN and other non-string values are accepted.

    Returns:
        ``'null'`` when the value is missing (``None``, NaN or the marker
        string ``'null'``), ``1`` when the value contains a colon (the
        ``'x:y'`` form), and ``0`` for any other value.
    """
    # NaN is the only float that is unequal to itself.
    if row is None or (isinstance(row, float) and row != row):
        return 'null'
    text = str(row)
    if text == 'null':
        return 'null'
    # A colon marks the 'x:y' form of the discount.
    return 1 if ':' in text else 0
#计算折扣率
def convertRate(row):
    """Convert one raw ``Discount_rate`` value into a discount ratio.

    Args:
        row: A single cell of the ``Discount_rate`` column. Normally a
            string; ``None``/NaN and plain numbers are accepted too.

    Returns:
        ``1.0`` for a missing value (``None``, NaN or ``'null'``);
        ``1 - y / x`` for a value of the form ``'x:y'``; otherwise the
        value itself converted to ``float``.

    Raises:
        ValueError: If the value is neither missing, ``'x:y'`` nor numeric.
        ZeroDivisionError: If ``x`` in ``'x:y'`` is zero.
    """
    # NaN is the only float that is unequal to itself.
    if row is None or (isinstance(row, float) and row != row):
        return 1.0
    text = str(row)
    if text == 'null':
        return 1.0
    if ':' in text:
        parts = text.split(':')
        return 1.0 - float(parts[1]) / float(parts[0])
    return float(text)
#计算满多少
def getDiscountMan(row):
    """Return the part before the colon of an ``'x:y'`` discount as an int.

    Args:
        row: A single cell of the ``Discount_rate`` column.

    Returns:
        ``int(x)`` for a string of the form ``'x:y'``; ``0`` for every
        other value, including ``'null'``, plain rates such as ``'0.95'``
        and non-string values such as ``None`` or NaN.
    """
    # Non-string cells (None, NaN, numbers) cannot carry the 'x:y' form.
    if not isinstance(row, str) or ':' not in row:
        return 0
    return int(row.split(':')[0])
#减多少
def getDiscountJian(row):
    """Return the part after the colon of an ``'x:y'`` discount as an int.

    Args:
        row: A single cell of the ``Discount_rate`` column.

    Returns:
        ``int(y)`` for a string of the form ``'x:y'``; ``0`` for every
        other value, including ``'null'``, plain rates such as ``'0.95'``
        and non-string values such as ``None`` or NaN.
    """
    # Non-string cells (None, NaN, numbers) cannot carry the 'x:y' form.
    if not isinstance(row, str) or ':' not in row:
        return 0
    return int(row.split(':')[1])
def processData(df):
    """Add the discount-derived feature columns to ``df``.

    The columns ``discount_type``, ``discount_rate``, ``discount_man`` and
    ``discount_jian`` are computed from ``Discount_rate`` and written into
    ``df`` in place. The distinct ``discount_rate`` values are printed.

    Args:
        df: DataFrame with a ``Discount_rate`` column.

    Returns:
        The same DataFrame object, with the four columns added.
    """
    # Normalise a local copy of the column so that a frame whose
    # Discount_rate was not cleaned beforehand (missing entries, or values
    # read as numbers) goes through the same string parsers. The original
    # Discount_rate column is left untouched.
    raw = df['Discount_rate'].fillna('null').astype(str)
    df['discount_type'] = raw.apply(getDiscountType)
    df['discount_rate'] = raw.apply(convertRate)
    df['discount_man'] = raw.apply(getDiscountMan)
    df['discount_jian'] = raw.apply(getDiscountJian)
    print(df['discount_rate'].unique())
    return df
# dfoff.head()
# Derive the discount features for the training frame first, then the test frame.
dfoff, dftest = [processData(frame) for frame in (dfoff, dftest)]
# dfoff.head()
[1. 0.86666667 0.95 0.9 0.83333333 0.8
0.5 0.85 0.75 0.66666667 0.93333333 0.7
0.6 0.96666667 0.98 0.99 0.975 0.33333333
0.2 0.4 ]
[0.83333333 0.9 0.96666667 0.8 0.95 0.75
0.98 0.5 0.86666667 0.6 0.66666667 0.7
0.85 0.33333333 0.94 0.93333333 0.975 0.99 ]
#距离
# print('Distance 类型:',dfoff['Distance'].unique())
# print(dfoff['distance'].unique())
# convert distance
# Encode a missing Distance as -1, using the same call for both frames.
# (The earlier replace(np.nan, -1) on dfoff was immediately overwritten.)
dfoff['distance'] = dfoff['Distance'].fillna(-1)
print(dfoff['distance'].unique())
dftest['distance'] = dftest['Distance'].fillna(-1)
# dftest['distance'] = dftest['Distance'].fillna(-1)
# dftest['distance'] = dftest['Distance'].astype('str')
# print(dftest['distance'].unique())
[ 0. 1. -1. 2. 10. 4. 7. 9. 3. 5. 6. 8.]
# dfoff.head()
# dftest.head()
# dfoff['Date_received'] == 'null'
# dfoff['Date_received'][0:4]
# dfoff['Date_received'][4:6]
# date(int(dfoff['Date_received'][0:4]), int(dfoff['Date_received'][4:6]), int(dfoff['Date_received'][6:8])).weekday() + 1
# # date(int(dfoff['Date_received'].astype(str)[0:4]), int(dfoff['Date_received'].astype(str)[4:6]), dfoff['Date_received'].astype(str)[6:8])).weekday()
# # date(2018, 12, 24).weekday() + 1
# date(2018, 12, 24)
# int(dfoff['Date_received'].iloc[1][0:4])
# int(dfoff['Date_received'].iloc[1][4:6])
# int(dfoff['Date_received'].iloc[1][6:8])
# date(int(dfoff['Date_received'].iloc[1][0:4]), int(dfoff['Date_received'].iloc[1][4:6]), int(dfoff['Date_received'].iloc[1][6:8])).weekday()+1
# Names of the seven one-hot weekday columns: weekday_1 ... weekday_7.
weekdaycols = ['weekday_%d' % day for day in range(1, 8)]
# print(weekdaycols)
#3.领劵日期(Date_received)
def getWeekday(row):
    """Return the day of the week for a ``yyyymmdd`` value.

    Args:
        row: A date written as ``yyyymmdd``; only the first eight characters
            of its text form are read, so ``'20160217.0'`` is accepted.
            ``None``, NaN, ``'nan'`` and ``'null'`` count as missing.

    Returns:
        ``'null'`` for a missing value, otherwise an int from 1 (Monday)
        to 7 (Sunday).

    Raises:
        ValueError: If the first eight characters are not a valid date.
    """
    # NaN is the only float that is unequal to itself.
    if row is None or (isinstance(row, float) and row != row):
        return 'null'
    text = str(row)
    if text in ('null', 'nan'):
        return 'null'
    # date.weekday() counts from 0 (Monday); shift to 1..7.
    return date(int(text[0:4]), int(text[4:6]), int(text[6:8])).weekday() + 1
#apply,应用函数, 逐行处理
# Day of the week on which the coupon was received, computed cell by cell.
dfoff['weekday'] = dfoff['Date_received'].astype(str).apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)

# weekday_type: 1 when the weekday is 6 or 7 (Saturday/Sunday), else 0.
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x: 1 if x in [6, 7] else 0)
dftest['weekday_type'] = dftest['weekday'].apply(lambda x: 1 if x in [6, 7] else 0)

# One-hot encode the weekday. Each column comes from an explicit comparison
# against its day number, so all seven columns always exist and are always
# labelled correctly, even when some weekday never occurs in a frame
# (renaming get_dummies output positionally breaks in that case).
# Non-numeric weekdays such as 'null' become NaN and yield all-zero rows.
weekdaycols = ['weekday_' + str(i) for i in range(1, 8)]
for frame in (dfoff, dftest):
    weekday_number = pd.to_numeric(frame['weekday'], errors='coerce')
    for day, column in enumerate(weekdaycols, start=1):
        frame[column] = (weekday_number == day).astype('uint8')
# dfoff.head()
# dftest.head()
#标注标签 Label
def label(row):
    """Compute the training label for one record.

    Args:
        row: Mapping-like record (for example a DataFrame row) with the
            keys ``'Date_received'`` and ``'Date'``, each a ``yyyymmdd``
            value or a missing marker (``'null'``, ``'nan'``, ``None``, NaN).

    Returns:
        ``-1`` when ``Date_received`` is missing; ``1`` when ``Date`` is
        present and at most 15 days after ``Date_received``; ``0`` otherwise.

    Raises:
        ValueError: If a present value does not start with a valid date.
    """
    def _parse_day(value):
        """Return a ``date`` for a yyyymmdd-like value, or None if missing."""
        if value is None or (isinstance(value, float) and value != value):
            return None
        text = str(value)
        if text in ('null', 'nan'):
            return None
        # Only the first eight characters are read, so text such as
        # '20160217.0' (a date that went through a float column) parses.
        return date(int(text[0:4]), int(text[4:6]), int(text[6:8]))

    received = _parse_day(row['Date_received'])
    if received is None:
        return -1
    used = _parse_day(row['Date'])
    if used is not None and (used - received).days <= 15:
        return 1
    return 0
# Label every record row by row, then peek at the first 'Date' value
# (column position 6) to check how the dates are stored.
row_labels = dfoff.apply(label, axis=1)
dfoff['label'] = row_labels
dfoff['Date'].iloc[:1]
0 20160217.0
Name: Date, dtype: object
#看一下正负样本究竟有多少
# print(dfoff['label'].value_counts())
# data split
# Keep only the labelled records (label -1 marks records without a coupon).
df = dfoff[dfoff['label'] != -1].copy()
# Compare on the eight-character yyyymmdd prefix. Date_received is text and
# may carry a trailing '.0' (e.g. '20160615.0'); compared as a whole string
# that value sorts after '20160615', which would drop the last day of the
# validation window.
received_day = df['Date_received'].astype(str).str[:8]
train = df[received_day < '20160516'].copy()
valid = df[(received_day >= '20160516') & (received_day <= '20160615')].copy()
# print('Train Set: \n', train['label'].value_counts())
# print('Valid Set: \n', valid['label'].value_counts())
#2.构建模型
def check_model(data, predictors):
    """Grid-search a scaled, elastic-net logistic SGD classifier.

    A pipeline of ``StandardScaler`` followed by ``SGDClassifier`` is tuned
    over ``alpha`` and ``l1_ratio`` with 3-fold stratified cross-validation,
    using GridSearchCV's default scoring.

    Args:
        data: DataFrame containing the ``predictors`` columns and a
            ``'label'`` column used as the target.
        predictors: List of feature column names.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    import re

    import sklearn

    # The logistic loss was renamed from 'log' to 'log_loss' in
    # scikit-learn 1.1 and the old spelling was removed in 1.3, so pick the
    # name the installed version understands. An unparseable version string
    # is assumed to be a recent release.
    version = re.match(r'(\d+)\.(\d+)', sklearn.__version__)
    uses_new_name = version is None or tuple(int(part) for part in version.groups()) >= (1, 1)
    logistic_loss = 'log_loss' if uses_new_name else 'log'

    classifier = SGDClassifier(
        loss=logistic_loss,    # logistic regression; required for predict_proba
        penalty='elasticnet',  # mix of L1 and L2 regularisation
        fit_intercept=True,
        max_iter=100,
        shuffle=True,          # reshuffle the training data after each epoch
        n_jobs=-1,
        class_weight=None)     # all classes weighted equally

    # The pipeline applies the same scaling to any data later passed to
    # predict / predict_proba.
    model = Pipeline(steps=[
        ('ss', StandardScaler()),  # transformer
        ('en', classifier),        # estimator
    ])

    parameters = {
        'en__alpha': [0.001, 0.01, 0.1],
        'en__l1_ratio': [0.001, 0.01, 0.1],
    }

    # Stratified folds keep the class proportions of the full data set.
    folder = StratifiedKFold(n_splits=3, shuffle=True)

    grid_search = GridSearchCV(
        model,
        parameters,
        cv=folder,
        n_jobs=-1,  # use all processors
        verbose=1)
    grid_search = grid_search.fit(data[predictors], data['label'])
    return grid_search
# Show the column names of the training frame (displayed as cell output in a notebook).
train.columns
Index(['User_id', 'Merchant_id', 'Coupon_id', 'Discount_rate', 'Distance',
'Date_received', 'Date', 'discount_type', 'discount_rate',
'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type',
'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5',
'weekday_6', 'weekday_7', 'label'],
dtype='object')
#3.训练
# Feature columns fed to the classifier: discount features, distance and
# the weekday features (raw, weekend flag and one-hot columns).
predictors = [
    'discount_rate',
    'discount_type',
    'discount_man',
    'discount_jian',
    'distance',
    'weekday',
    'weekday_type',
    'weekday_1',
    'weekday_2',
    'weekday_3',
    'weekday_4',
    'weekday_5',
    'weekday_6',
    'weekday_7',
]
# Run the grid search on the training split.
model = check_model(train, predictors)
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done 27 out of 27 | elapsed: 49.3s finished
# train_orignal=train_orignal.iloc[:,[3,4]]
# train_orignal_01=train_orignal.iloc[:,[3,4]]
# train_orignal_01=train_orignal.iloc[:,3:8]
# train_orignal_01.head()
#4.验证
# valid predict
#valid是验证集
# Score the validation split with the fitted model.
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
# Keep the second probability column as the prediction
# (presumably the probability of label 1 -- confirm against model.classes_).
valid1['pred_prob'] = y_valid_pred[:, 1]

# Average AUC: compute one AUC per coupon and average them. Coupons whose
# validation rows do not contain exactly two distinct labels are skipped.
vg = valid1.groupby(['Coupon_id'])
aucs = []
for _, coupon_rows in vg:
    if coupon_rows['label'].nunique(dropna=False) != 2:
        continue
    fpr, tpr, thresholds = roc_curve(coupon_rows['label'], coupon_rows['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))