# coding=utf-8
# @Time : 2019/12/12 0:34
# @Author : Z
# @Email : S
# @File : 10.1CTR.py
# 1. Load the training data.
import os

import pandas as pd

data_path = os.path.join(".", "train_small.csv")
ctr_data1 = pd.read_csv(data_path)

# Report the table dimensions as (rows, columns).
# A previous run printed (99999, 24).
print(ctr_data1.shape)

# Show the column index. A previous run listed:
#   id, click, hour, C1, banner_pos, site_id, site_domain, site_category,
#   app_id, app_domain, app_category, device_id, device_ip, device_model,
#   device_type, device_conn_type, C14, C15, C16, C17, C18, C19, C20, C21
# (the index itself has dtype 'object').
print(ctr_data1.columns)
print("=" * 100)

# Remove the record id and the listed site/app/device columns; per the
# column listing above this leaves 'click' as the first remaining column.
dropped_columns = [
    'id', 'site_id', 'app_id', 'device_id', 'device_ip', 'site_domain',
    'site_category', 'app_domain', 'app_category', 'device_model',
]
training_Set = ctr_data1.drop(columns=dropped_columns)

# Work on the underlying array from here on.
ctr_data = training_Set.values
# 2. Prepare the data: separate features from labels, then hold out a
#    test split.
from sklearn.model_selection import train_test_split

# Column 0 of the reduced array is the label; every other column is a feature.
X = ctr_data[:, 1:]
y = ctr_data[:, 0]

# Feature matrix shape; a previous run printed (99999, 13).
print(X.shape)
# Label vector shape; a previous run printed (99999,).
print(y.shape)

# Hold out 22% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.22,
    random_state=33,
)

# Training split shapes; a previous run printed (77999, 13) and (77999,).
print(X_train.shape)
print(y_train.shape)
# 3. Choose the learning algorithm: logistic regression.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

lr = LogisticRegression()
# Fit on the training split.
lr.fit(X_train, y_train)

# 4. Predict labels for the held-out split.
y_pred = lr.predict(X_test)
# A previous run printed [0 0 0 ... 0 0 0].
print(y_pred)
print("=" * 100)

# 5. Validate the model.
# Mean accuracy on the training and test splits; a previous run printed
# 0.8239721022064386 and 0.8290909090909091 respectively.
train_accuracy = lr.score(X_train, y_train)
test_accuracy = lr.score(X_test, y_test)
print(train_accuracy)
print(test_accuracy)

# Confusion matrix. For labels (0, 1) the layout is
#   [[true 0 / predicted 0, true 0 / predicted 1],
#    [true 1 / predicted 0, true 1 / predicted 1]]
# The original notes named these cells [[TP FN], [FP TN]] by treating
# label 0 as the "positive" class. With click == 1 as the positive class
# the same cells read [[TN FP], [FN TP]].
# A previous run printed
#   [[18240     0]
#    [ 3760     0]]
# i.e. the model never predicted label 1.
print(confusion_matrix(y_test, y_pred))
print("=" * 100)

# Per-class report:
#   precision: correctly predicted cases / all cases predicted as the class
#   recall:    correctly predicted cases / all cases truly in the class
#   f1-score:  harmonic mean of precision and recall, F1 = 2*P*R / (P + R)
#   support:   number of occurrences of each label
# A previous run printed
#                 precision    recall  f1-score   support
#
#            0         0.83      1.00      0.91     18240
#            1         0.00      0.00      0.00      3760
#
#     accuracy                             0.83     22000
#    macro avg         0.41      0.50      0.45     22000
# weighted avg         0.69      0.83      0.75     22000
print(classification_report(y_test, y_pred))
print("=" * 100)
# 6. Persist the trained model to disk.
# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23, so import the standalone `joblib` package (a scikit-learn
# dependency) first and only fall back to the old path on legacy installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
joblib.dump(lr, filename="Ctr_Predict.pkl")
# 8. Predict on the test file and write the submission CSV.
import numpy as np
import pandas as pd

# Read the id column as text. Going through `DataFrame.values` (as the
# previous version did) forces every column into one common dtype, which
# turns large integer ids into floats and corrupts them in the output.
ctr_data2 = pd.read_csv("test.csv", dtype={"id": str})
ids = ctr_data2["id"]

# 'click' is the label and may be absent from an unlabeled test file, so it
# is dropped only if present. The remaining columns are the same ones removed
# from the training data and must exist.
ctr_data3 = ctr_data2.drop(columns=["click"], errors="ignore").drop(
    columns=[
        'site_id', 'app_id', 'device_id', 'device_ip', 'site_domain',
        'site_category', 'app_domain', 'app_category', 'device_model',
    ]
)
print(ctr_data3)
print("=" * 100)

# Every remaining column except the id is a feature.
# NOTE(review): assumes test.csv lists the feature columns in the same order
# as the training file - confirm against the data.
y_pred_test = lr.predict(ctr_data3.drop(columns=["id"]).values)
print(y_pred_test)

# Build the submission frame column by column so ids and predictions each
# keep their own dtype (no float coercion of either column).
df = pd.DataFrame({"id": ids, "click": y_pred_test})
df.to_csv("submit.csv", index=False)
# CTR (click-through rate) prediction
# Latest recommended article published on 2022-12-12 12:35:20