'''
TencentAdCompetition:
计算广告是互联网最重要的商业模式之一,广告投放效果通常通过曝光、点击和转化各环节来衡量,大多数广告系统受广告效果
数据回流的限制只能通过曝光或点击作为投放效果的衡量标准开展优化。
腾讯社交广告(`http://ads.tencent.com`)发挥特有的用户识别和转化跟踪数据能力,帮助广告主跟踪广告投放后的转化效果,
基于广告转化数据训练转化率预估模型(pCVR,Predicted Conversion Rate),在广告排序中引入pCVR因子优化广告投放效果,提升ROI。
本题目以移动App广告为研究对象,预测App广告点击后被激活的概率:pCVR=P(conversion=1 | Ad,User,Context),即给定广告、
用户和上下文情况下广告被点击后发生激活的概率。
评估方式为Logarithmic loss (logloss)
提交结果为“instanceID,prob”
所有数据均来自腾讯比赛
'''
import numpy as np
import pandas as pd
import scipy as sp
from sklearn.preprocessing import Binarizer #切段
from sklearn.preprocessing import MinMaxScaler #scaling
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import os
import warnings
'''
文件读取
'''
def read_csv_file(f, logging=False):
    """Load a CSV file into a DataFrame, optionally printing a summary.

    Args:
        f: Path (or anything ``pandas.read_csv`` accepts) of the CSV to load.
        logging: When true, print the first five rows, the column names,
            the ``describe()`` statistics and the ``info()`` report of the
            loaded frame. The parameter name is kept for backward
            compatibility with keyword callers.

    Returns:
        The ``pandas.DataFrame`` read from ``f``.
    """
    print("==========读取数据==========")
    data = pd.read_csv(f)
    if logging:
        print(data.head(5))
        print(f, "包含以下列......")
        print(data.columns.values)
        print(data.describe())
        # DataFrame.info() writes its report itself and returns None, so
        # wrapping it in print() would emit a stray "None" line.
        data.info()
    return data
# First-level app category, e.g. for 210 the first level is 2 and the
# second level is 10.
def categories_process_first_class(cate):
    """Return the first-level category encoded in ``cate``.

    The first-level category is the leading digit of the code. The previous
    implementation fell through and returned ``None`` for some valid codes
    (for example a single non-zero digit such as ``2``); every code now
    yields its leading digit, and ``0`` still maps to ``0``.

    Args:
        cate: Category code (int or digit string), e.g. ``210``.

    Returns:
        The leading digit of the code as an int.
    """
    return int(str(cate)[0])
# Second-level app category (the digits after the leading one).
def categories_process_second_class(cate):
    """Return the second-level category encoded in ``cate``.

    Codes whose text form has fewer than three characters carry no
    second-level part and map to 0; otherwise everything after the first
    character is parsed as an int.
    """
    digits = str(cate)
    return int(digits[1:]) if len(digits) >= 3 else 0
# Age handling: bucket the raw age into bands.
def age_process(age):
    """Map a raw age onto a band index.

    0 stays 0; other ages map to 1 (<15), 2 (<30), 3 (<40), 4 (<60)
    or 5 (60 and above).
    """
    years = int(age)
    if years == 0:
        return 0
    # Upper bounds are exclusive; the first bound the age falls under wins.
    for band, upper_bound in enumerate((15, 30, 40, 60), start=1):
        if years < upper_bound:
            return band
    return 5
# Province handling: e.g. in 1806, 18 is the province and 06 is the city.
def process_province(hometown):
    """Return the province part of a location code.

    The last two digits of the code are the city and the digits before them
    are the province, so the province is the code divided by 100. Slicing
    the first two characters of the text form (the previous approach)
    mis-read codes with a one-digit province, e.g. 806 gave 80 instead of 8.

    Args:
        hometown: Location code as an int or digit string (``0`` = unknown).

    Returns:
        The province number as an int (0 when the code has no province part).
    """
    return int(hometown) // 100
# City handling: the city is the last two digits of the location code.
def process_city(hometown):
    """Return the city part of a location code.

    For a code such as 1806 the city is 06, i.e. the code modulo 100.
    The previous text-slicing version raised ``ValueError`` on
    two-character codes because it tried to parse an empty string.

    Args:
        hometown: Location code as an int or digit string (``0`` = unknown).

    Returns:
        The city number as an int (0 for the unknown code ``0``).
    """
    return int(hometown) % 100
# Which day the click happened on.
def get_time_day(t):
    """Return the first two characters of ``str(t)`` parsed as an int.

    NOTE(review): presumably the timestamp is encoded day-first (DDHHMM);
    confirm against the data description.
    """
    return int(str(t)[:2])
# Split the day into four segments.
def get_time_hour(t):
    """Bucket the hour field of a timestamp into one of four day segments.

    Characters 2-3 of ``str(t)`` are parsed as the hour, which maps to
    0 (<6), 1 (<12), 2 (<18) or 3 (everything else).
    """
    hour = int(str(t)[2:4])
    # Upper bounds are exclusive; the first bound the hour falls under wins.
    for segment, upper_bound in enumerate((6, 12, 18)):
        if hour < upper_bound:
            return segment
    return 3
# Evaluation: compute the logarithmic loss.
def logloss(act, pred):
    """Return the mean logarithmic loss of ``pred`` against ``act``.

    Uses NumPy directly instead of the NumPy aliases in the ``scipy``
    namespace (``sp.log``, ``sp.maximum``, ...), which SciPy has deprecated.
    Both inputs are flattened, so plain lists and ``(n, 1)`` column vectors
    work as well as 1-D arrays.

    Args:
        act: Actual labels (0 or 1), array-like.
        pred: Predicted probabilities, array-like.

    Returns:
        The mean log loss as a float.

    Raises:
        ValueError: If ``act`` is empty.
    """
    epsilon = 1e-15
    act = np.asarray(act, dtype=float).ravel()
    if act.size == 0:
        raise ValueError("logloss requires at least one sample")
    # Clip away exact 0/1 so log() never sees 0.
    pred = np.clip(np.asarray(pred, dtype=float).ravel(), epsilon, 1 - epsilon)
    ll = np.sum(act * np.log(pred) + (1 - act) * np.log(1 - pred))
    return ll * -1.0 / act.size
'''
特征工程+随机森林建模
'''
# Load every competition table in a fixed order; logging=True makes
# read_csv_file print a summary of each one as it is read.
user, train_data, test_data, ad, app_categories = [
    read_csv_file(csv_path, logging=True)
    for csv_path in (
        'H:/TencentAdCompetition/user.csv',
        'H:/TencentAdCompetition/train.csv',
        'H:/TencentAdCompetition/test.csv',
        'H:/TencentAdCompetition/ad.csv',
        'H:/TencentAdCompetition/app_categories.csv',
    )
]
# print(user.columns)
# print(user['age'].value_counts()) #查看不同年龄段分布
# user.age.value_counts().plot(kind = 'bar')
# plt.show()
# The figure (not included here) shows the age distribution; a similar
# approach can be used to bucket the other features.
# Derived user features. Each entry is (new column, source column,
# per-value transform); the tuple order is the order in which the columns
# are added to the frame.
_user_feature_specs = (
    ('age_process', 'age', age_process),
    ('hometown_province', 'hometown', process_province),
    ('hometown_city', 'hometown', process_city),
    ('residence_province', 'residence', process_province),
    ('residence_city', 'residence', process_city),
)
for _new_col, _src_col, _transform in _user_feature_specs:
    user[_new_col] = user[_src_col].apply(_transform)
# Data merging.
# The click-time features are derived the same way for train and test.
for _clicks in (train_data, test_data):
    _clicks['clickTime_day'] = _clicks['clickTime'].apply(get_time_day)
    _clicks['clickTime_hour'] = _clicks['clickTime'].apply(get_time_hour)

# Join the click log with the user, ad and app-category tables, one key at
# a time (merge's default join type is used at every step).
train_user = train_data.merge(user, on='userID')
train_user_ad = train_user.merge(ad, on='creativeID')
train_user_ad_app = train_user_ad.merge(app_categories, on='appID')
# print(train_user_ad_app.columns.values)
# Feature part: the columns fed to the model, in matrix column order.
# NOTE(review): 'camgaignID' is spelled as in the original code; presumably
# it matches the header of the ad table -- verify before "fixing" it.
_feature_columns = [
    'creativeID', 'userID', 'positionID',
    'connectionType', 'telecomsOperator', 'clickTime_day', 'clickTime_hour',
    'age', 'gender', 'education', 'marriageStatus', 'haveBaby', 'hometown',
    'residence', 'age_process', 'hometown_province', 'hometown_city',
    'residence_province', 'residence_city', 'adID', 'camgaignID', 'advertiserID',
    'appID', 'appPlatform', 'appCategory',
]
_feature_frame = train_user_ad_app[_feature_columns]
x_user_ad_app = np.array(_feature_frame.values, dtype='int32')
# Selecting with a one-element list keeps the labels two-dimensional
# (one column); the fitting code flattens them later.
y_user_ad_app = train_user_ad_app[['label']].values
#!!!查看特征的重要度
# feat_labels = np.array(['creativeID','userID','positionID'
# ,'connectionType','telecomsOperator','clickTime_day','clickTime_hour'
# ,'age','gender','education','marriageStatus','haveBaby','hometown'
# ,'residence','age_process','hometown_province','hometown_city'
# ,'residence_province','residence_city','adID','camgaignID','advertiserID'
# ,'appID','appPlatform','appCategory'] )
# forest = RandomForestClassifier(n_estimators=100,random_state=0,n_jobs=-1)
# forest.fit(x_user_ad_app,y_user_ad_app.reshape(y_user_ad_app.shape[0],)) #转换label向量
# importances = forest.feature_importances_ #查看特征重要性
#
# indices = np.argsort(importances)[::-1]
#
# print(importances,indices)
# The feature importances are shown below (figure not included here).
# Hyper-parameter tuning:
#随机森林自动调参,时间略长
# from sklearn.model_selection import GridSearchCV #网格自动调参
# param_grid = {
# 'n_estimators':[10,100,500,1000],
# 'max_features':[0.6,0.7,0.8,0.9]
# }
# rf = RandomForestClassifier()
# rfc = GridSearchCV(rf,param_grid,scoring='neg_log_loss',cv=3,n_jobs=2)
# rfc.fit(x_user_ad_app,y_user_ad_app.reshape(y_user_ad_app.shape[0],))
# print(rfc.best_score_)
# print(rfc.best_params_)
# XGBoost hyper-parameter search.
from sklearn.model_selection import GridSearchCV  # exhaustive grid search
# Not referenced anywhere in the visible script; kept in case other code
# relies on the name.
rng = np.random.RandomState(4315)
# Silences every warning for the rest of the process.
warnings.filterwarnings("ignore")
# 5 * 7 * 3 * 2 * 2 * 2 = 840 candidate settings, each fitted once per CV
# fold -- expect a long run time.
param_grid = {
    'max_depth': [3, 4, 5, 7, 9],
    'n_estimators': [10, 50, 100, 400, 800, 1000, 1200],
    'learning_rate': [0.1, 0.2, 0.3],
    'gamma': [0, 0.2],
    'subsample': [0.8, 1],
    'colsample_bylevel': [0.8, 1],
}
xgb_model = XGBClassifier()
# Rank candidates by log loss, the competition metric, instead of the
# estimator's default score; this matches the random-forest search, which
# also uses scoring='neg_log_loss' with cv=3. Pinning cv keeps the fold
# count independent of the installed scikit-learn's default.
rgs = GridSearchCV(xgb_model, param_grid, scoring='neg_log_loss', cv=3, n_jobs=4)
# Labels are stored as an (n, 1) column; fit() expects a flat vector.
rgs.fit(x_user_ad_app, y_user_ad_app.ravel())
print(rgs.best_score_)
print(rgs.best_params_)
# PS: this runs very slowly on my machine.
# '''
# 正负样本比,约为1:40
# '''
# positive_num = train_user_ad_app[train_user_ad_app['label']==1].values.shape[0]
# negetive_num = train_user_ad_app[train_user_ad_app['label']==0].values.shape[0]
# print(positive_num/negetive_num)