CTR点击率预估

最新推荐文章于 2022-12-12 12:35:20 发布

NewBee.Mu

最新推荐文章于 2022-12-12 12:35:20 发布

阅读量247

点赞数

分类专栏：机器学习、推荐系统、大数据

本文链接：https://blog.csdn.net/NewBeeMu/article/details/103516655

版权

同时被 3 个专栏收录：大数据

18 篇文章 0 订阅

订阅专栏

机器学习

12 篇文章 0 订阅

订阅专栏

推荐系统

10 篇文章 1 订阅

订阅专栏

# coding=utf-8
# @Time    : 2019/12/12 0:34
# @Author  : Z
# @Email   : S
# @File    : 10.1CTR.py

# Step 1: load the training sample and reduce it to the columns used for modelling.
import os
import pandas as pd

# The training CSV is looked up relative to the current working directory.
data_path = os.path.join(".", "train_small.csv")
ctr_data1 = pd.read_csv(data_path)

# Print the (rows, columns) of the raw table.
# The original author's run printed (99999, 24).
print(ctr_data1.shape)

# Print the column labels. The original author's run printed:
# Index(['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
#        'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
#        'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
#        'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21'],
#       dtype='object')
print(ctr_data1.columns)
print("=" * 100)

# Columns removed before training; everything else is kept as-is.
_unused_columns = [
    'id',
    'site_id',
    'app_id',
    'device_id',
    'device_ip',
    'site_domain',
    'site_category',
    'app_domain',
    'app_category',
    'device_model',
]
training_Set = ctr_data1.drop(columns=_unused_columns)

# Plain NumPy view of the remaining columns, consumed by the later steps.
ctr_data = training_Set.values
# Step 2: separate the label from the features and hold out a test split.
from sklearn.model_selection import train_test_split

# Column 0 of ctr_data is the label (`click`, once `id` has been dropped);
# every remaining column is a feature.
y = ctr_data[:, 0]
X = ctr_data[:, 1:]

# Feature matrix shape; the original author's run printed (99999, 13).
print(X.shape)
# Label vector shape; the original author's run printed (99999,).
print(y.shape)

# Hold out 22% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.22,
    random_state=33,
)

# Training split shapes; the original author's run printed (77999, 13) and (77999,).
print(X_train.shape)
print(y_train.shape)
# Step 3: fit a logistic-regression classifier with default settings.
from sklearn.linear_model import LogisticRegression

# `fit` returns the estimator itself, so construction and training can be chained.
lr = LogisticRegression().fit(X_train, y_train)

# Step 4: predict hard class labels for the held-out rows.
# The original author's run printed [0 0 0 ... 0 0 0].
# NOTE(review): per the author's sample confusion matrix every test row was
# predicted as 0 — worth revisiting feature scaling / class imbalance.
y_pred = lr.predict(X_test)
print(y_pred)
print("=" * 100)

# Step 5: mean accuracy on the training split, then on the held-out split.
# The original author's run printed 0.8239721022064386 and 0.8290909090909091.
_train_accuracy = lr.score(X_train, y_train)
print(_train_accuracy)
_test_accuracy = lr.score(X_test, y_test)
print(_test_accuracy)
# Evaluation on the held-out split: confusion matrix and per-class report.
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Confusion matrix: entry [i, j] counts rows whose true label is i and whose
# predicted label is j, with labels in sorted order (0, then 1). Taking
# click == 1 as the positive class, the layout is therefore
#   [[TN  FP]
#    [FN  TP]]
# The original author's run printed:
#   [[18240     0]
#    [ 3760     0]]
_confusion = confusion_matrix(y_test, y_pred)
print(_confusion)
print("=" * 100)

# Classification report columns:
#   precision - correctly predicted rows of a class / rows predicted as that class
#   recall    - correctly predicted rows of a class / rows truly in that class
#   f1-score  - harmonic mean of the two: F1 = 2 * P * R / (P + R)
#   support   - number of true rows of each class
# The original author's run printed:
#               precision    recall  f1-score   support
#
#            0       0.83      1.00      0.91     18240
#            1       0.00      0.00      0.00      3760
#
#     accuracy                           0.83     22000
#    macro avg       0.41      0.50      0.45     22000
# weighted avg       0.69      0.83      0.75     22000
_report = classification_report(y_test, y_pred)
print(_report)
print("=" * 100)
# Step 6: persist the fitted model to disk.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23, so importing it fails on current installations. Prefer the standalone
# `joblib` package (installed as a scikit-learn dependency) and fall back to
# the vendored copy only on old environments where it is the one available.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(lr, filename="Ctr_Predict.pkl")
# Step 8: score the test set and write the predictions in submission format.
import numpy as np
import pandas as pd

ctr_data2 = pd.read_csv("test.csv")

# Remove the same unused columns as for training, plus the label if present.
# errors='ignore' lets this work on a test file that ships without a `click`
# column instead of failing with KeyError.
ctr_data3 = ctr_data2.drop(
    ['click', 'site_id', 'app_id', 'device_id', 'device_ip', 'site_domain',
     'site_category', 'app_domain', 'app_category', 'device_model'],
    axis=1,
    errors='ignore',
)
print(ctr_data3)
print("=" * 100)

# Take the ids straight from their own column. Going through the whole-frame
# `.values` array forces a single common dtype across all columns, which can
# turn large integer ids into floats and lose digits.
ids = ctr_data3['id'].values

# Everything except `id` is a model feature.
y_pred_test = lr.predict(ctr_data3.drop('id', axis=1).values)
print(y_pred_test)

# Build the two output columns separately so each keeps its own dtype
# (concatenating them into one array would coerce both to a shared type).
df = pd.DataFrame({'id': ids, 'click': y_pred_test})
df.to_csv("submit.csv", header=['id', 'click'], index=False)