# Import packages and load the data (导包并加载数据)
import os, sys, pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
from sklearn.linear_model import SGDClassifier, LogisticRegression
import seaborn as sns
# Configure matplotlib to render Chinese labels (SimHei font) and keep
# the minus sign displayable when a non-ASCII font is active.
plt.rcParams['font.sans-serif'] = [u'SimHei']
plt.rcParams['axes.unicode_minus'] = False
# O2O coupon competition data: offline train, offline test, online train.
dfoff = pd.read_csv('data/ccf_offline_stage1_train.csv')
dftest = pd.read_csv('data/ccf_offline_stage1_test_revised.csv')
dfon = pd.read_csv('data/ccf_online_stage1_train.csv')
print('data read end.')
# Quick look at the data's basic characteristics (简单的观察数据特征)
# Quick inspection of the three raw tables: shapes, summary statistics,
# and the first rows of the offline training set.
print("dfoff的shape是",dfoff.shape)
print("dftest的shape是",dftest.shape)
print("dfon的shape是",dfon.shape)
print(dfoff.describe())
print(dftest.describe())
print(dfon.describe())
dfoff.head()  # NOTE: only renders in a notebook; a plain script discards this value
def convertRate(row):
    """Turn a raw Discount_rate value into a discount rate in (0, 1].

    An 'x:y' value ("spend x, save y") becomes 1 - y/x; a plain number
    is returned as a float; a missing value means no coupon, rate 1.0.
    """
    if pd.isnull(row):
        return 1.0
    if ':' not in str(row):
        return float(row)
    threshold, cut = row.split(':')
    return 1.0 - float(cut) / float(threshold)
# Derive extra features from Discount_rate: the x and y of a
# "spend x, save y" coupon, plus a discount-type indicator.
def getDiscountMan(row):
    """Return the spend threshold x of an 'x:y' coupon, else 0."""
    if ':' not in str(row):
        return 0
    return int(row.split(':')[0])
def getDiscountJian(row):
    """Return the saved amount y of an 'x:y' coupon, else 0."""
    if ':' not in str(row):
        return 0
    return int(row.split(':')[1])
def getDiscountType(row):
    """Classify a raw Discount_rate value.

    Returns np.nan for a missing value, 1 when the coupon is of the
    'spend x, save y' form (contains ':'), and 0 otherwise.
    """
    if pd.isnull(row):
        return np.nan
    # Coerce to str before the ':' membership test, consistent with the
    # other getDiscount* helpers; the original `':' in row` raised
    # TypeError for a non-string, non-null value (e.g. a bare float).
    elif ':' in str(row):
        return 1
    else:
        return 0
def processData(df):
    """Add discount_* and distance feature columns to df in place; return df."""
    # Discount_rate -> numeric rate, spend/save amounts, and coupon-type flag.
    derived = (('discount_rate', convertRate),
               ('discount_man', getDiscountMan),
               ('discount_jian', getDiscountJian),
               ('discount_type', getDiscountType))
    for column, extractor in derived:
        df[column] = df['Discount_rate'].apply(extractor)
    print("处理完后discount_rate的唯一值为:",df['discount_rate'].unique())
    # Distance: fill missing with the -1 sentinel, then cast to int.
    df['distance'] = df['Distance'].fillna(-1).astype(int)
    return df
dfoff = processData(dfoff)
dftest = processData(dftest)
print("tool is ok.")
# Coupon-receipt dates: unique non-null values, sorted.
date_received = dfoff['Date_received'].unique()
date_received = sorted(date_received[pd.notnull(date_received)])
# Purchase dates: all non-null values, sorted (duplicates kept).
# NOTE: the original also computed a sorted-unique version here, but that
# result was immediately overwritten, so the dead statements are removed.
date_buy = sorted(dfoff[dfoff['Date'].notnull()]['Date'])
# Coupons received per day: rows with a Date_received, counted per day.
couponbydate = dfoff[dfoff['Date_received'].notnull()][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
couponbydate.columns = ['Date_received','count']
# Coupons redeemed per day: rows with both a purchase Date and a
# Date_received, counted per receipt day.
buybydate = dfoff[(dfoff['Date'].notnull()) & (dfoff['Date_received'].notnull())][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
buybydate.columns = ['Date_received','count']
def getWeekday(row):
    """Map a 'YYYYMMDD...' string to ISO weekday 1..7; the literal 'nan' -> np.nan."""
    if row == 'nan':
        return np.nan
    year, month, day = int(row[0:4]), int(row[4:6]), int(row[6:8])
    # date.weekday() is 0-based at Monday; shift to 1=Monday .. 7=Sunday.
    return date(year, month, day).weekday() + 1
# Weekday of the coupon-receipt date (1=Monday .. 7=Sunday, NaN if none).
dfoff['weekday'] = dfoff['Date_received'].astype(str).apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)
# weekday_type: 1 for Saturday/Sunday (weekday 6 or 7), 0 otherwise.
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x : 1 if x in [6,7] else 0 )
dftest['weekday_type'] = dftest['weekday'].apply(lambda x : 1 if x in [6,7] else 0 )
# One-hot encode weekday into weekday_1 .. weekday_7.
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
# NOTE(review): weekday holds floats/NaN, so replacing the string 'nan'
# looks like a no-op here — confirm; get_dummies skips NaN rows anyway.
tmpdf = pd.get_dummies(dfoff['weekday'].replace('nan', np.nan)) # one-hot encoding
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf
tmpdf = pd.get_dummies(dftest['weekday'].replace('nan', np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf
def label(row):
    """Label a record: -1 no coupon received; 1 redeemed within 15 days; else 0."""
    if pd.isnull(row['Date_received']):
        return -1  # no coupon: the sample cannot be scored for this task
    if pd.notnull(row['Date']):
        received = pd.to_datetime(row['Date_received'], format='%Y%m%d')
        used = pd.to_datetime(row['Date'], format='%Y%m%d')
        if used - received <= pd.Timedelta(15, 'D'):
            return 1
    return 0
# Attach the supervised label to every offline-train row.
dfoff['label'] = dfoff.apply(label, axis = 1)
print("end")
# Correlation heat map of the engineered features.
# NOTE(review): corr() without numeric_only assumes an older pandas;
# on pandas >= 2.0 the non-numeric columns would raise — confirm version.
corr = dfoff.corr()
print(corr)
plt.subplots(figsize=(16, 16))
sns.heatmap(corr, vmax=.8, square=True, annot=True)
# Split train/validation by coupon-receipt date:
# train = received before 2016-05-16; valid = 2016-05-16 .. 2016-06-15.
print("-----data split------")
df = dfoff[dfoff['label'] != -1].copy()  # drop rows without a coupon (label -1)
train = df[(df['Date_received'] < 20160516)].copy()
valid = df[(df['Date_received'] >= 20160516) & (df['Date_received'] <= 20160615)].copy()
print("end")
# # Earlier experiment (kept for reference): linear model trained with SGD.
# # NOTE: it references original_feature, which is only defined in the bagging section below.
# model = SGDClassifier(#lambda:
# loss='log',
# penalty='elasticnet',
# fit_intercept=True,
# max_iter=100,
# shuffle=True,
# alpha = 0.01,
# l1_ratio = 0.01,
# n_jobs=-1,
# class_weight=None
# )
# model.fit(train[original_feature], train['label'])
# # #### 预测以及结果评价
# print(model.score(valid[original_feature], valid['label']))
# print("---save model---")
# with open('1_model.pkl', 'wb') as f:
# pickle.dump(model, f)
# with open('1_model.pkl', 'rb') as f:
# model = pickle.load(f)
# # 保存要提交的csv文件
# y_test_pred = model.predict_proba(dftest[original_feature])
# dftest1 = dftest[['User_id','Coupon_id','Date_received']].copy()
# dftest1['label'] = y_test_pred[:,1]
# dftest1.to_csv('submit1.csv', index=False, header=False)
# dftest1.head()
# Bagging ensemble: 500 decision trees, each fit on 100 examples drawn
# with replacement from the training set.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
# Feature list shared by all models below.
original_feature = ['discount_rate','discount_type','discount_man', 'discount_jian','distance', 'weekday', 'weekday_type'] + weekdaycols
print("----train-----")
model = BaggingClassifier(
    DecisionTreeClassifier(),n_estimators=500,max_samples=100,bootstrap=True,n_jobs=-1
)
model.fit(train[original_feature], train['label'])
# Evaluate on the hold-out validation split (mean accuracy).
print(model.score(valid[original_feature], valid['label']))
print("---save model---")
# Persist the fitted model, then reload it (round-trip sanity check).
with open('1_model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('1_model.pkl', 'rb') as f:
    model = pickle.load(f)
# test prediction for submission: probability of the positive class.
y_test_pred = model.predict_proba(dftest[original_feature])
dftest1 = dftest[['User_id','Coupon_id','Date_received']].copy()
dftest1['label'] = y_test_pred[:,1]
dftest1.to_csv('submit2.csv', index=False, header=False)
dftest1.head()
# Boosting approach: gradient-boosted decision trees.
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(
    max_depth=2,
    n_estimators=100, # too small underfits; too large overfits
    learning_rate=0.1)
model.fit(train[original_feature], train['label'])
# Tune n_estimators with a grid search; per the original author the
# leaderboard score barely moved but overfitting improved noticeably.
from sklearn.model_selection import GridSearchCV
param_test1 = {'n_estimators':range(20,81,10)}
gsearch1 = GridSearchCV(
    estimator = GradientBoostingClassifier(
        learning_rate=0.1, min_samples_split=300,
        min_samples_leaf=20,
        max_depth=8,
        max_features='sqrt',
        subsample=0.8,
        random_state=10),
    param_grid = param_test1, scoring='roc_auc',cv=5,n_jobs=-1)
gsearch1.fit(train[original_feature], train['label'])
# gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
# NOTE(review): grid_scores_ was removed from sklearn; use cv_results_ instead.
# Scores the refit best estimator using the grid's scoring metric (roc_auc).
print(gsearch1.score(valid[original_feature], valid['label']))
print("---save model---")
# Persist the whole fitted search object, then reload it.
with open('1_model.pkl', 'wb') as f:
    pickle.dump(gsearch1, f)
with open('1_model.pkl', 'rb') as f:
    model = pickle.load(f)
# test prediction for submission: probability of the positive class.
y_test_pred = gsearch1.predict_proba(dftest[original_feature])
dftest1 = dftest[['User_id','Coupon_id','Date_received']].copy()
dftest1['label'] = y_test_pred[:,1]
dftest1.to_csv('submit6.csv', index=False, header=False)
dftest1.head()