用机器学习对CTR预估建模(一)

版权声明:本文为博主原创文章,遵循 CC 4.0 by-sa 版权协议,转载请附上原文出处链接和本声明。
本文链接:https://blog.csdn.net/JR_lu/article/details/54836968

题目网址:https://www.kaggle.com/c/avazu-ctr-prediction

数据集介绍:

train - Training set. 10 days of click-through data, ordered chronologically. Non-clicks and clicks
are subsampled according to different strategies.
Train.csv 解压后有5.6G,样本个数非常大。一般200m的csv数据(20~30维)用pandas读取成数据帧(dataframe)格式,大概会占用内存1G左右,所以这么大的数据集单机内存一般吃不消。

test - Test set. 1 day of ads for testing your model predictions.
Test.csv解压后有673m,不是很大。

sampleSubmission.csv - Sample submission file in the correct format, corresponds to the All-0.5 Benchmark.

对特征进行筛选和down sampling来降低数据集

# -*- coding: utf-8 -*-
"""
Created on Wed Feb 01 12:51:31 2017

@author: JR.Lu
"""
import pandas as pd
import numpy as np

# Read only the first 10M rows: the full train.csv (~5.6 GB) does not fit
# comfortably in a single machine's memory.
train_df = pd.read_csv('train.csv', nrows=10000000)
test_df = pd.read_csv('test.csv')

# Down-sampling: clicks (label 1) are the minority class (~17% of rows),
# so keep every positive and an equal-sized random sample of negatives
# to obtain a balanced training set.
clicks = train_df[train_df.click == 1]        # ~3.45M positives in the sample
non_clicks = train_df[train_df.click == 0]    # ~16.5M negatives (~82.7%)
# NOTE(fix): sample negatives at random instead of taking the first
# len(clicks) rows -- the file is ordered chronologically, so a head
# slice would bias the negatives toward the earliest days.
non_clicks_sampled = non_clicks.sample(n=len(clicks), random_state=0)
data_downsampled = pd.concat([clicks, non_clicks_sampled])

# Feature selection: columns were screened by how strongly each value
# separates the label, e.g. train_df.groupby('device_model')['click'].mean()
# NOTE(fix): 'banner_pos' was listed twice in the original column lists.
columns_select_test = ['id', 'device_type', 'C1', 'C15', 'C16',
                       'banner_pos', 'site_category']
columns_select = ['click', 'device_type', 'C1', 'C15', 'C16',
                  'banner_pos', 'site_category']
data_downsampled_1 = data_downsampled[columns_select]
test_small = test_df[columns_select_test]

# Shuffle so positives and negatives are interleaved before writing.
sampler = np.random.permutation(len(data_downsampled_1))
data_downsampled_1 = data_downsampled_1.take(sampler)
# index=False: don't leak the (now shuffled) index as an extra CSV column.
data_downsampled_1.to_csv('train_small.csv', index=False)
test_small.to_csv('test_small.csv', index=False)

其次是用简单的特征来测试模型,用网格搜索的方式来进行参数优选

# -*- coding: utf-8 -*-
"""
Created on Wed Feb 01 20:36:46 2017

@author: JR.Lu
"""
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier,RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.learning_curve import learning_curve
from sklearn import metrics
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp



def logloss(act, pred):
    """Binomial log-loss, the competition's evaluation metric.

    Parameters
    ----------
    act : array-like of 0/1 true labels.
    pred : array-like of predicted click probabilities in [0, 1].

    Returns
    -------
    float -- average negative log-likelihood over all samples.
    """
    # NOTE(fix): the original used sp.maximum/sp.minimum/sp.log/sp.subtract;
    # those top-level scipy aliases of NumPy functions were deprecated and
    # removed from modern scipy. Use numpy directly.
    epsilon = 1e-15
    act = np.asarray(act)
    # Clip predictions away from exactly 0/1 so log() stays finite.
    pred = np.clip(np.asarray(pred, dtype=float), epsilon, 1 - epsilon)
    ll = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred))
    return -ll / len(act)

# Metric reporting
def print_metrics(true_values, predicted_values):
    """Print the evaluation metrics used to compare the models.

    NOTE(review): callers pass hard 0/1 predictions, so logloss and AUC are
    computed on labels rather than probabilities; predict_proba would give
    more meaningful values for both -- confirm before trusting the numbers.
    """
    # NOTE(fix): converted Python 2 print statements to print() calls, and
    # removed a stray unary '+' before the confusion matrix.
    print("logloss: ", logloss(true_values, predicted_values))
    print("Accuracy: ", metrics.accuracy_score(true_values, predicted_values))
    print("AUC: ", metrics.roc_auc_score(true_values, predicted_values))
    print("Confusion Matrix: ", metrics.confusion_matrix(true_values, predicted_values))
    print(metrics.classification_report(true_values, predicted_values))


def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None,
                        n_jobs=1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Plot training and cross-validation score curves for *estimator*.

    Given a model, a title and (X, y), draws the learning curve and
    returns the matplotlib.pyplot module so the caller can show/save it.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, tr_scores, cv_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)

    tr_mean = np.mean(tr_scores, axis=1)
    tr_std = np.std(tr_scores, axis=1)
    cv_mean = np.mean(cv_scores, axis=1)
    cv_std = np.std(cv_scores, axis=1)

    plt.grid()
    # Shade a +/- one standard deviation band around each mean curve.
    for mean, std, color in ((tr_mean, tr_std, "r"), (cv_mean, cv_std, "g")):
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=color)
    plt.plot(sizes, tr_mean, 'o-', color="r", label="Training score")
    plt.plot(sizes, cv_mean, 'o-', color="g", label="Cross-validation score")

    plt.legend(loc="best")
    return plt


# Load the down-sampled training data and the reduced test set.
train_df = pd.read_csv('train_small.csv', nrows=100000)
test_df = pd.read_csv('test_small.csv')

# NOTE(fix): the original listed 'banner_pos' twice, duplicating the column.
feature_columns = ['device_type', 'C1', 'C15', 'C16', 'banner_pos',
                   'site_category']
train_x = train_df[feature_columns]
test_x = test_df[feature_columns]
# Encode train+test together so both share the same dummy columns.
x = pd.concat([train_x, test_x])

# One-hot encoding in a single call.
# NOTE(fix): the original called get_dummies per column without a prefix,
# so identical raw values in different columns (e.g. 320 appearing in both
# C15 and C16) collided into duplicate dummy-column names; get_dummies with
# columns= prefixes each dummy with its source column name.
x_dummies = pd.get_dummies(x, columns=feature_columns)

X_train = x_dummies[0:len(train_x)]
Y_train = train_df['click']

# Hold out a third of the labelled data for evaluation.
x_train, x_test, y_train, y_test = train_test_split(X_train, Y_train,
                                                    test_size=0.33)

# ---- Modeling ------------------------------------------------------------
# Hyper-parameters below were chosen by earlier GridSearchCV runs
# (commented-out search code condensed into these notes):
#   LR:   penalty='l1', solver='liblinear', C=1
#         (LR has few tunable knobs; GridSearchCV param grids take lists,
#          so only C was searched over [0.1, 1, 2] with cv=3)
#   GBDT: best of {learning_rate: [0.1, 0.5], n_estimators: [100..400],
#         max_depth: [3, 4]} -> n_estimators=200, learning_rate=0.1, max_depth=3
#   RF:   best of {n_estimators: [100, 200, 300], max_depth: [2, 3, 4]}
#         -> n_estimators=200, max_depth=4

# Learning curves for each tuned model.
title = 'LRlearning{penalty=l1,solver=liblinear,cv=3}'
plot_learning_curve(LogisticRegression(penalty='l1', solver='liblinear', C=1),
                    title=title, cv=10, X=x_train, y=y_train)

title = 'GDBTlearning{n_estimators: 200, learning_rate: 0.1, max_depth: 3}'
plot_learning_curve(
    estimator=GradientBoostingClassifier(n_estimators=200, learning_rate=0.1,
                                         max_depth=3),
    title=title, cv=2, X=x_train, y=y_train)
# GBDT is marginally better than LR.

title = 'RFlearning{n_estimators: 200,  max_depth: 4}'
plot_learning_curve(
    estimator=RandomForestClassifier(n_estimators=200, max_depth=4),
    title=title, cv=2, X=x_train, y=y_train)

# ---- Fit and evaluate on the held-out split ------------------------------
lr_model = LogisticRegression(penalty='l1', solver='liblinear', C=1)
gbdt_model = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1,
                                        max_depth=3)
rf_model = RandomForestClassifier(n_estimators=200, max_depth=4)

for model in (lr_model, gbdt_model, rf_model):
    model.fit(x_train, y_train)

lr_predict = lr_model.predict(x_test)
gbdt_predict = gbdt_model.predict(x_test)
rf_predict = rf_model.predict(x_test)

# NOTE(fix): converted Python 2 print statements to print() calls.
print("LRmodel 性能如下:-------")
print_metrics(y_test, lr_predict)

print("GBDTmodel 性能如下:-------")
print_metrics(y_test, gbdt_predict)

print("RFmodel 性能如下:-------")
print_metrics(y_test, rf_predict)

结果大概如下:

LRmodel 性能如下:-------
logloss:  14.8549419892
Accuracy:  0.569909090909
AUC:  0.570339428461
Confusion Matrix:  [[11141  5293]
 [ 8900  7666]]
             precision    recall  f1-score   support

          0       0.56      0.68      0.61     16434
          1       0.59      0.46      0.52     16566

avg / total       0.57      0.57      0.56     33000
GBDTmodel 性能如下:-------
logloss:  14.7952832304
Accuracy:  0.571636363636
AUC:  0.572068547036
Confusion Matrix:  [[11177  5257]
 [ 8879  7687]]
             precision    recall  f1-score   support

          0       0.56      0.68      0.61     16434
          1       0.59      0.46      0.52     16566

avg / total       0.58      0.57      0.57     33000
RFmodel 性能如下:-------
logloss:  15.4713065032
Accuracy:  0.552060606061
AUC:  0.553565705536
Confusion Matrix:  [[15281  1153]
 [13629  2937]]
             precision    recall  f1-score   support

          0       0.53      0.93      0.67     16434
          1       0.72      0.18      0.28     16566

avg / total       0.62      0.55      0.48     33000

插个图看看结果:
这里写图片描述

这里写图片描述

这里写图片描述

展开阅读全文

没有更多推荐了,返回首页