# Import packages and load the data (导包并加载数据)
import os, sys, pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
from sklearn.linear_model import SGDClassifier, LogisticRegression
import seaborn as sns
# Configure matplotlib to render Chinese labels (SimHei font) and keep
# the minus sign displayable when a non-ASCII font is active.
plt.rcParams['font.sans-serif'] = [u'SimHei']
plt.rcParams['axes.unicode_minus'] = False
# O2O coupon competition data: offline train, offline test, online train.
dfoff = pd.read_csv('data/ccf_offline_stage1_train.csv')
dftest = pd.read_csv('data/ccf_offline_stage1_test_revised.csv')
dfon = pd.read_csv('data/ccf_online_stage1_train.csv')
print('data read end.')
# Quick look at the data's basic characteristics (简单的观察数据特征)
# Quick inspection of the three raw tables: shapes, summary statistics,
# and the first rows of the offline training set.
print("dfoff的shape是",dfoff.shape)
print("dftest的shape是",dftest.shape)
print("dfon的shape是",dfon.shape)
print(dfoff.describe())
print(dftest.describe())
print(dfon.describe())
dfoff.head()  # NOTE: only renders in a notebook; a plain script discards this value
def convertRate(row):
    """Turn a raw Discount_rate value into a discount rate in (0, 1].

    An 'x:y' value ("spend x, save y") becomes 1 - y/x; a plain number
    is returned as a float; a missing value means no coupon, rate 1.0.
    """
    if pd.isnull(row):
        return 1.0
    if ':' not in str(row):
        return float(row)
    threshold, cut = row.split(':')
    return 1.0 - float(cut) / float(threshold)
# Derive extra features from Discount_rate: the x and y of a
# "spend x, save y" coupon, plus a discount-type indicator.
def getDiscountMan(row):
    """Return the spend threshold x of an 'x:y' coupon, else 0."""
    if ':' not in str(row):
        return 0
    return int(row.split(':')[0])
def getDiscountJian(row):
    """Return the saved amount y of an 'x:y' coupon, else 0."""
    if ':' not in str(row):
        return 0
    return int(row.split(':')[1])
def getDiscountType(row):
    """Classify a raw Discount_rate value.

    Returns np.nan for a missing value, 1 when the coupon is of the
    'spend x, save y' form (contains ':'), and 0 otherwise.
    """
    if pd.isnull(row):
        return np.nan
    # Coerce to str before the ':' membership test, consistent with the
    # other getDiscount* helpers; the original `':' in row` raised
    # TypeError for a non-string, non-null value (e.g. a bare float).
    elif ':' in str(row):
        return 1
    else:
        return 0
def processData(df):
    """Add discount_* and distance feature columns to df in place; return df."""
    # Discount_rate -> numeric rate, spend/save amounts, and coupon-type flag.
    derived = (('discount_rate', convertRate),
               ('discount_man', getDiscountMan),
               ('discount_jian', getDiscountJian),
               ('discount_type', getDiscountType))
    for column, extractor in derived:
        df[column] = df['Discount_rate'].apply(extractor)
    print("处理完后discount_rate的唯一值为:",df['discount_rate'].unique())
    # Distance: fill missing with the -1 sentinel, then cast to int.
    df['distance'] = df['Distance'].fillna(-1).astype(int)
    return df
dfoff = processData(dfoff)
dftest = processData(dftest)
print("tool is ok.")
# Coupon-receipt dates: unique non-null values, sorted.
date_received = dfoff['Date_received'].unique()
date_received = sorted(date_received[pd.notnull(date_received)])
# Purchase dates: all non-null values, sorted (duplicates kept).
# NOTE: the original also computed a sorted-unique version here, but that
# result was immediately overwritten, so the dead statements are removed.
date_buy = sorted(dfoff[dfoff['Date'].notnull()]['Date'])
# Coupons received per day: rows with a Date_received, counted per day.
couponbydate = dfoff[dfoff['Date_received'].notnull()][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
couponbydate.columns = ['Date_received','count']
# Coupons redeemed per day: rows with both a purchase Date and a
# Date_received, counted per receipt day.
buybydate = dfoff[(dfoff['Date'].notnull()) & (dfoff['Date_received'].notnull())][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
buybydate.columns = ['Date_received','count']
def getWeekday(row):
    """Map a 'YYYYMMDD...' string to ISO weekday 1..7; the literal 'nan' -> np.nan."""
    if row == 'nan':
        return np.nan
    year, month, day = int(row[0:4]), int(row[4:6]), int(row[6:8])
    # date.weekday() is 0-based at Monday; shift to 1=Monday .. 7=Sunday.
    return date(year, month, day).weekday() + 1
# Weekday of the coupon-receipt date (1=Monday .. 7=Sunday, NaN if none).
dfoff['weekday'] = dfoff['Date_received'].astype(str).apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)
# weekday_type: 1 for Saturday/Sunday (weekday 6 or 7), 0 otherwise.
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x : 1 if x in [6,7] else 0 )
dftest['weekday_type'] = dftest['weekday'].apply(lambda x : 1 if x in [6,7] else 0 )
# One-hot encode weekday into weekday_1 .. weekday_7.
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
# NOTE(review): weekday holds floats/NaN, so replacing the string 'nan'
# looks like a no-op here — confirm; get_dummies skips NaN rows anyway.
tmpdf = pd.get_dummies(dfoff['weekday'].replace('nan', np.nan)) # one-hot encoding
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf
tmpdf = pd.get_dummies(dftest['weekday'].replace('nan', np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf
def label(row):
    """Label a record: -1 no coupon received; 1 redeemed within 15 days; else 0."""
    if pd.isnull(row['Date_received']):
        return -1  # no coupon: the sample cannot be scored for this task
    if pd.notnull(row['Date']):
        received = pd.to_datetime(row['Date_received'], format='%Y%m%d')
        used = pd.to_datetime(row['Date'], format='%Y%m%d')
        if used - received <= pd.Timedelta(15, 'D'):
            return 1
    return 0
# Attach the supervised label to every offline-train row.
dfoff['label'] = dfoff.apply(label, axis = 1)
print("end")
# Correlation heat map of the engineered features.
# NOTE(review): corr() without numeric_only assumes an older pandas;
# on pandas >= 2.0 the non-numeric columns would raise — confirm version.
corr = dfoff.corr()
print(corr)
plt.subplots(figsize=(16, 16))
sns.heatmap(corr, vmax=.8, square=True, annot=True)
# Split train/validation by coupon-receipt date:
# train = received before 2016-05-16; valid = 2016-05-16 .. 2016-06-15.
print("-----data split------")
df = dfoff[dfoff['label'] != -1].copy()  # drop rows without a coupon (label -1)
train = df[(df['Date_received'] < 20160516)].copy()
valid = df[(df['Date_received'] >= 20160516) & (df['Date_received'] <= 20160615)].copy()
print("end")
# # Earlier experiment (kept for reference): linear model trained with SGD.
# # NOTE: it references original_feature, which is only defined in the bagging section below.
# model = SGDClassifier(#lambda:
# loss='log',
# penalty='elasticnet',
# fit_intercept=True,
# max_iter=100,
# shuffle=True,
# alpha = 0.01,
# l1_ratio = 0.01,
# n_jobs=-1,
# class_weight=None
# )
# model.fit(train[original_feature], train['label'])
# # #### 预测以及结果评价
# print(model.score(valid[original_feature], valid['label']))
# print("---save model---")
# with open('1_model.pkl', 'wb') as f:
# pickle.dump(model, f)
# with open('1_model.pkl', 'rb') as f:
# model = pickle.load(f)
# # 保存要提交的csv文件
# y_test_pred = model.predict_proba(dftest[original_feature])
# dftest1 = dftest[['User_id','Coupon_id','Date_received']].copy()
# dftest1['label'] = y_test_pred[:,1]
# dftest1.to_csv('submit1.csv', index=False, header=False)
# dftest1.head()
# Bagging ensemble: 500 decision trees, each fit on 100 examples drawn
# with replacement from the training set.
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
# Feature list shared by all models below.
original_feature = ['discount_rate','discount_type','discount_man', 'discount_jian','distance', 'weekday', 'weekday_type'] + weekdaycols
print("----train-----")
model = BaggingClassifier(
    DecisionTreeClassifier(),n_estimators=500,max_samples=100,bootstrap=True,n_jobs=-1
)
model.fit(train[original_feature], train['label'])
# Evaluate on the hold-out validation split (mean accuracy).
print(model.score(valid[original_feature], valid['label']))
print("---save model---")
# Persist the fitted model, then reload it (round-trip sanity check).
with open('1_model.pkl', 'wb') as f:
    pickle.dump(model, f)
with open('1_model.pkl', 'rb') as f:
    model = pickle.load(f)
# test prediction for submission: probability of the positive class.
y_test_pred = model.predict_proba(dftest[original_feature])
dftest1 = dftest[['User_id','Coupon_id','Date_received']].copy()
dftest1['label'] = y_test_pred[:,1]
dftest1.to_csv('submit2.csv', index=False, header=False)
dftest1.head()
# Boosting approach: gradient-boosted decision trees.
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(
    max_depth=2,
    n_estimators=100, # too small underfits; too large overfits
    learning_rate=0.1)
model.fit(train[original_feature], train['label'])
# Tune n_estimators with a grid search; per the original author the
# leaderboard score barely moved but overfitting improved noticeably.
from sklearn.model_selection import GridSearchCV
param_test1 = {'n_estimators':range(20,81,10)}
gsearch1 = GridSearchCV(
    estimator = GradientBoostingClassifier(
        learning_rate=0.1, min_samples_split=300,
        min_samples_leaf=20,
        max_depth=8,
        max_features='sqrt',
        subsample=0.8,
        random_state=10),
    param_grid = param_test1, scoring='roc_auc',cv=5,n_jobs=-1)
gsearch1.fit(train[original_feature], train['label'])
# gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_
# NOTE(review): grid_scores_ was removed from sklearn; use cv_results_ instead.
# Scores the refit best estimator using the grid's scoring metric (roc_auc).
print(gsearch1.score(valid[original_feature], valid['label']))
print("---save model---")
# Persist the whole fitted search object, then reload it.
with open('1_model.pkl', 'wb') as f:
    pickle.dump(gsearch1, f)
with open('1_model.pkl', 'rb') as f:
    model = pickle.load(f)
# test prediction for submission: probability of the positive class.
y_test_pred = gsearch1.predict_proba(dftest[original_feature])
dftest1 = dftest[['User_id','Coupon_id','Date_received']].copy()
dftest1['label'] = y_test_pred[:,1]
dftest1.to_csv('submit6.csv', index=False, header=False)
dftest1.head()