创意视角下的数字广告CTR预估挑战赛baseline_catboost版本_score0.51
1. 赛事任务
广告的CTR预估需要强大的数据作为支撑,本次大赛提供了讯飞AI营销云海量的现网流量和创意数据作为训练样本,参赛选手需基于提供的样本构建模型,预测测试集的点击率,点击率的准确性将直接影响评价结果。
评估指标为 gAUC。
2. 数据处理
# Build the cached training artifacts: the label plus the five categorical
# ID columns go to a parquet file, the numeric feature columns to a .npy file.
#
# pandas/numpy/os are imported here because this preparation step runs before
# the main import section that appears further down the file.
import os

import numpy as np
import pandas as pd

train_df = pd.read_csv(
    "./dataset/train_data.txt",
    low_memory=True,
    header=None,
    usecols=['label', 'pkgname', 'ver', 'slotid', 'mediaid', 'material'],
    names=['label', 'pkgname', 'ver', 'slotid', 'mediaid', 'material'],
    nrows=None,
    dtype={
        'label': "category",
        'pkgname': "category",
        'ver': "category",
        'slotid': "category",
        'mediaid': "category",
        'material': "category",
    },
)
# Drop rows whose first field is the literal 'adx_slot_id'
# (presumably the header line of the raw file -- confirm against the data).
train_df = train_df[train_df['label'] != 'adx_slot_id']

# Columns 6..245 are loaded as float32 features. "adx_slot_id" is passed as
# the comment marker, so text from that marker to the end of a line is ignored
# by the numeric loader.
train_data = np.loadtxt(
    "./dataset/train_data.txt",
    dtype=np.float32,
    comments="adx_slot_id",
    delimiter=",",
    usecols=list(range(6, 246)),
)

# Make sure the cache directory exists before writing into it.
os.makedirs('./temp_data', exist_ok=True)
train_df.to_parquet('./temp_data/train_df.parquet')
np.save('./temp_data/train_data.npy', train_data)
# Build the cached test artifacts: the five categorical ID columns go to a
# parquet file, the numeric feature columns to a .npy file.
test_df = pd.read_csv(
    "./dataset/test_data.txt",
    low_memory=True,
    header=None,
    usecols=['pkgname', 'ver', 'slotid', 'mediaid', 'material'],
    names=['pkgname', 'ver', 'slotid', 'mediaid', 'material'],
    nrows=None,
    dtype={
        'pkgname': "category",
        'ver': "category",
        'slotid': "category",
        'mediaid': "category",
        'material': "category",
    },
)
# NOTE(review): unlike the training frame, no header-like row is filtered out
# of test_df here -- confirm test_data.txt has no header line, otherwise
# test_df and test_data will have different row counts.

# Columns 5..244 are loaded as float32 features (the test file has no label
# column, hence the offset of one compared with the training file).
test_data = np.loadtxt(
    "./dataset/test_data.txt",
    dtype=np.float32,
    comments="adx_slot_id",
    delimiter=",",
    usecols=list(range(5, 245)),
)

# Write to ./temp_data: this is where the training artifacts are cached and
# where the training step reads the test artifacts back from.
test_df.to_parquet('./temp_data/test_df.parquet')
np.save('./temp_data/test_data.npy', test_data)
3. 模型训练与评估
import pandas as pd
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from pylab import *
from sklearn.preprocessing import OneHotEncoder
## 模型预测的
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
# from xgboost.sklearn import XGBClassifier
import lightgbm as lgb
#import xgboost as xgb
from catboost import CatBoostClassifier
## 参数搜索和评价的
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import accuracy_score,f1_score,roc_auc_score,recall_score,precision_score
# Load the cached artifacts and assemble the modelling frames.
train_df = pd.read_parquet("./temp_data/train_df.parquet")
test_df = pd.read_parquet("./temp_data/test_df.parquet")
train_data = np.load("./temp_data/train_data.npy")
test_data = np.load("./temp_data/test_data.npy")
submission = pd.read_csv('./dataset/提交示例.csv')

train_data = pd.DataFrame(data=train_data)
test_data = pd.DataFrame(data=test_data)

# Rows were filtered out of train_df before it was cached, so its stored
# index can contain gaps, while the feature frames carry a fresh 0..N-1
# index. Reset to positional indices so the index-based merges below pair
# each row with its own features instead of a neighbouring row's.
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

# Strip the b'...' wrapper from the ID strings (literal, non-regex match).
for col in ["pkgname", "ver", "slotid", "mediaid", "material"]:
    train_df[col] = (
        train_df[col]
        .str.replace("b'", "", regex=False)
        .str.replace("'", "", regex=False)
    )
    test_df[col] = (
        test_df[col]
        .str.replace("b'", "", regex=False)
        .str.replace("'", "", regex=False)
    )

# Go through str first so a leftover non-numeric category cannot break the
# cast, and assign the single column so its dtype is actually replaced.
train_df['label'] = train_df['label'].astype(str).astype('int')

train = pd.merge(train_df, train_data, left_index=True, right_index=True)
test = pd.merge(test_df, test_data, left_index=True, right_index=True)

labelled = train['label'].notnull()
X_train = train[labelled].drop(['label'], axis=1)
Y_train = train[labelled]['label']
test_data = test

# Columns handed to CatBoost as categorical features.
cols = ['pkgname', 'ver', 'slotid', 'mediaid', 'material']
# CatBoost classifier configuration: log-loss objective, AUC as the metric
# reported on the evaluation set, CPU training, fixed seed for repeatability.
catboost_params = {
    "loss_function": "Logloss",
    "eval_metric": "AUC",
    "task_type": "CPU",
    "learning_rate": 0.05,
    "iterations": 500,
    "random_seed": 2022,
    "od_type": "Iter",
    "depth": 10,
}
model = CatBoostClassifier(**catboost_params)
# 5-fold stratified cross-validation followed by prediction on the test set.
#
# For every fold the model is fitted on the training split, the AUC on the
# held-out split is printed, and its share is added to the running mean.
# The classifier fitted in the last fold is then used to score the test set.
import os

mean_score = 0
n_folds = 5
sk = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2022)

# The index arrays get their own names so they do not overwrite the
# `train` / `test` DataFrames built earlier.
for fold_, (trn_idx, val_idx) in enumerate(sk.split(X_train, Y_train)):
    print("fold n°{}".format(fold_))
    print('trn_idx:', trn_idx)
    print('val_idx:', val_idx)

    x_train, y_train = X_train.iloc[trn_idx], Y_train.iloc[trn_idx]
    x_valid, y_valid = X_train.iloc[val_idx], Y_train.iloc[val_idx]

    clf = model.fit(
        x_train,
        y_train,
        eval_set=(x_valid, y_valid),
        verbose=500,
        cat_features=cols,
    )

    # Probability of the positive class on the validation split.
    yy_pred_valid = clf.predict_proba(x_valid)[:, 1]

    # Validation AUC, computed once and reused for the running mean.
    fold_auc = roc_auc_score(y_valid, yy_pred_valid)
    print('cat验证的auc:{}'.format(fold_auc))
    mean_score += fold_auc / n_folds

print('mean_score:{}'.format(mean_score))

# Use the positive-class probability here: `predict` returns hard class
# labels, which would make the 0.1 threshold below a no-op.
y_pred_valid = clf.predict_proba(test_data)[:, 1]

# NOTE(review): the task metric is gAUC, a ranking metric -- confirm whether
# the submission should contain the raw probabilities instead of 0/1 labels.
os.makedirs('./results', exist_ok=True)
pd.DataFrame({'predict': (y_pred_valid > 0.1).astype(int)}).to_csv('./results/sub_baseline0714.csv', index=None)
print('over,请检查submit文件,及时提交文件!')
测试集得分0.51左右。