# 科大讯飞2021丨广告点击率预估挑战赛 Top1方案(附完整代码)

2 篇文章 1 订阅

### 上代码

# =============================================================================
# # 导入工具包
# =============================================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import seaborn as sns
from sklearn.model_selection import KFold
import lightgbm as lgb
from sklearn.metrics import f1_score, roc_auc_score
import warnings
warnings.filterwarnings('ignore')

os.chdir('C:/Users/yyz/Desktop/比赛/广告点击率/data/广告点击率预估挑战赛_数据集/')

# =============================================================================
# # 读取数据并合并
# =============================================================================
df_tr_te = pd.concat([df_tr,df_te],axis=0,ignore_index = True)
# 区分训练集和测试
df_tr_te['isClick'] = df_tr_te['isClick'].fillna(-1)
# 读取待提交的数据

# 对日期进行分列
df_tr_te['hour'] = df_tr_te['date'].apply(lambda x: int(x.split(' ')[-1].split(':')[0]))
df_tr_te['day']= df_tr_te['date'].apply(lambda x: int(x.split(' ')[0].split('-')[1]))

# 对user_id计数小于等于3的归为1类
te = df_tr_te['user_id'].value_counts().reset_index()
lis_thr = te[te['user_id']<=3]['index'].unique().tolist()
df_tr_te['thr'] = np.where(df_tr_te['user_id'].isin(lis_thr),0,1)

# =============================================================================
# 特征工程
# =============================================================================

# 历史点击率
def _his_click_rate(df, f1, window_size = 2):
fea_name = '{}_his_{}_clickrate'.format(f1,window_size )
df[fea_name] = 0
for i in tqdm(range(3,8)):
df_t = df.loc[((df['day'] >= i-window_size) & (df['day'] < i))]
inds = df['day'] == i
df.loc[inds,fea_name] = df.loc[inds,f1].map(df_t.groupby(f1)['isClick'].mean())
return df

df_tr_te = _his_click_rate(df = df_tr_te, f1 = 'user_id', window_size = 5)
# 在baseline的基础上又增加了一个
df_tr_te['user_id_webpage_id'] = [str(i)+ str(j) for i,j in zip(df_tr_te['user_id'],df_tr_te['webpage_id'])]
df_tr_te = _his_click_rate(df = df_tr_te, f1 = 'user_id_webpage_id', window_size = 5)

# 窗口特征
df_tr_te['user_product_day_5mean'] = df_tr_te.groupby(['user_id','product','day'])['isClick'].transform(lambda x: x.rolling(3).mean().shift(1))

# 缺失值数据填充并替换
df_tr_te['gender'] = df_tr_te['gender'].fillna('NAN').map({'Female':1,'Male':0,'NAN':-1})
# 星期数据替换, 主要将周五 周六 周天归为一类
df_tr_te['xingqi'] = df_tr_te['day'].replace([2,3,4,5,6,7],[2,2,1,0,0,0])

# 单变量count特征
for c in ['user_id','product','hour','campaign_id','webpage_id','user_group_id','age_level',
'gender','day','product_category_id','user_depth']:
df_tr_te[c + '_cnt'] = df_tr_te.groupby(c)['id'].transform('count')

# 双变量的count特征
import itertools
lis_i =  ['user_id','product','hour','campaign_id','webpage_id','user_group_id','age_level',
'gender','day','product_category_id','user_depth']
lis_i_re = list(itertools.permutations(lis_i, 2))
for c in lis_i_re:
df_tr_te[c[0] + c[1] + '_cnt'] = df_tr_te.groupby(list(c))['id'].transform('count')

# 处理时间(根据数据条数猜测是2021年数据)
df_tr_te['date'] =  ['2021-' + i for i in df_tr_te['date']]
df_tr_te['date'] = pd.to_datetime(df_tr_te['date'])
# 计算按用户, 天, 小时的时间差
df_tr_te['user_time_hour'] = df_tr_te.groupby(['user_id','day','hour'])['date'].transform(lambda x: (x.max()-x.min()).total_seconds())
# 计算按用户, 天的时间差
df_tr_te['user_time_day'] = df_tr_te.groupby(['user_id','day'])['date'].transform(lambda x: (x.max()-x.min()).total_seconds())
# 一阶差分
df_tr_te['user_time_del'] = df_tr_te.groupby(['user_id'])['date'].transform(lambda x: (x.diff(periods=-1)))
df_tr_te['user_time_del'] = df_tr_te['user_time_del'].apply(lambda x: x.total_seconds())

# count计数
df_tr_te['user_id_webpage_id_product'] = df_tr_te.groupby(['user_id','product','webpage_id'])['id'].transform('count')
# 产品按用户, 天权重
df_tr_te['user_id_day_range'] = df_tr_te.groupby(['user_id','day'])['product'].transform(lambda x : len(x) / np.array(range(1,len(x)+1)))
# 产品按用户权重
df_tr_te['user_id_range'] = df_tr_te.groupby(['user_id'])['product'].transform(lambda x : len(x) / np.array(range(1,len(x)+1)))
# 网页按用户, 产品权重
df_tr_te['user_id_product_webpage_range'] = df_tr_te.groupby(['user_id','product'])['webpage_id'].transform(lambda x : len(x) / np.array(range(1,len(x)+1)))
# 网页按用户, 活动权重
df_tr_te['user_id_campaign_id_webpage_range'] = df_tr_te.groupby(['user_id','campaign_id'])['webpage_id'].transform(lambda x : len(x) / np.array(range(1,len(x)+1)))

# 不同组合的时间均值
lis_i_1 =  ['user_id','product','campaign_id','webpage_id','product_category_id',
'user_group_id','age_level','gender','user_depth','var_1']
for c in lis_i_1:
df_tr_te[str(c) + '_user_time_hour_mean'] = df_tr_te.groupby(c)['user_time_hour'].transform('mean')
df_tr_te[str(c) + '_user_time_day_mean'] = df_tr_te.groupby(c)['user_time_hour'].transform('mean')
df_tr_te[str(c) + '_user_time_hour_sum'] = df_tr_te.groupby(c)['user_time_hour'].transform('sum')
df_tr_te[str(c) + '_user_time_day_sum'] = df_tr_te.groupby(c)['user_time_hour'].transform('sum')

# 性别, 年龄, 产品的平均用时
df_tr_te['yong_time_gender_age_level_product_category_id_ave'] = df_tr_te.groupby(['gender','age_level','product_category_id'])['user_time_hour'].transform('mean')

# 暴力增加2个特征的组合平均用时
lis_i_1 =  ['user_id','product','campaign_id','webpage_id','product_category_id','user_group_id','age_level','gender','user_depth','var_1']
lis_i_re_1 = list(itertools.permutations(lis_i_1, 2))
for c in lis_i_re_1:
df_tr_te[c[0] + c[1] + '_user_time_hour_mean'] = df_tr_te.groupby(list(c))['user_time_hour'].transform('mean')

# nunique特征
for i in ['product','campaign_id','webpage_id','product_category_id']:
df_tr_te['day_'+str(i)+'_nunique'] = df_tr_te.groupby(['user_id','day'])[i].transform('nunique')
df_tr_te['day_'+str(i)+'_nunique_p%'] = df_tr_te['user_idday_cnt'] / df_tr_te['day_'+str(i)+'_nunique']

df_tr_te['day_web_nunique'] = df_tr_te.groupby(['user_id','day','hour'])['webpage_id'].transform('nunique')

# =============================================================================
# 建模
# =============================================================================

# cate_features  = ['user_id','product','hour','campaign_id','webpage_id','user_group_id','age_level']

features = [i for i in df_tr_te.columns if i not in ['id','isClick','date','user_id_webpage_id']]

test= df_tr_te[df_tr_te['isClick']==-1]
train= df_tr_te[df_tr_te['isClick']!=-1]

x_train = train[features]
x_test = test[features]
y_train = train['isClick']

def cv_model(clf, train_x, train_y, test_x, clf_name='lgb'):
folds = 5
seed = 2021
kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

train = np.zeros(train_x.shape[0])
test = np.zeros(test_x.shape[0])

cv_scores = []

for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
print('************************************ {} ************************************'.format(str(i+1)))
trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

train_matrix = clf.Dataset(trn_x, label=trn_y)
valid_matrix = clf.Dataset(val_x, label=val_y)

params = {
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'auc',
'min_child_weight': 5,
'num_leaves': 2**6,
'lambda_l2': 10,
'feature_fraction': 0.9,
'bagging_fraction': 0.9,
'bagging_freq': 4,
'learning_rate': 0.01,
'seed': 2021,
'n_jobs':-1,
'silent': True,
'verbose': -1,
}

model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix],
#categorical_feature = categorical_feature,
verbose_eval=500,early_stopping_rounds=200)
val_pred = model.predict(val_x, num_iteration=model.best_iteration)
test_pred = model.predict(test_x, num_iteration=model.best_iteration)

train[valid_index] = val_pred
test += test_pred / kf.n_splits
cv_scores.append(roc_auc_score(val_y, val_pred))

print(cv_scores)

print("%s_scotrainre_list:" % clf_name, cv_scores)
print("%s_score_mean:" % clf_name, np.mean(cv_scores))
print("%s_score_std:" % clf_name, np.std(cv_scores))
return train, test

lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test)

## 预测结果
df_sub['isClick'] = lgb_test
df_sub.to_csv('C:/Users/yyz/Desktop/比赛/广告点击率/baseline55_5zhe_re.csv', index=False)


#### 解题思路

1. 模型是在大佬提供的baseline基础上完善, 使用单模lightgbm
2. 在特征构造上主要有以下几个方面:
a. 常用的count和nunique特征, 至于按几个分类变量分组, 需要多尝试;
b. 时间特征: 因为数据涉及时间, 所有构造了很多时间差, 平均用时, 总用时特征, 不同分类组合的平均用时, 总用时特征;
c. 权重特征: 次赛题业务涉及广告, 广告出现的次数和它此次被点击的概率应该成反比, 所有构造了很多权重特征;
d. 历史点击率特征
e. 其他特征: 星期归类, 频次较少的样本归为一类等
3. 调参: 主要对learning_rate, num_leaves, min_child_weight进行调整

### 后期优化

• 使用不同的模型, 比如catboost, xgb等模型, 进行模型融合
• 特征较多, 计算用时较长, 对特征进行筛选
• 代码比较繁琐, 需要模块化

• 9
点赞
• 28
收藏
觉得还不错? 一键收藏
• 打赏
• 13
评论
04-12
12-19 1099
07-07 368
05-30 306
01-15 3万+
05-18 1万+
07-13 2万+
10-08
11-29 5672
10-08 2956
07-15 6152
02-18 499

### “相关推荐”对你有帮助么？

• 非常没帮助
• 没帮助
• 一般
• 有帮助
• 非常有帮助

python技巧(数据分析及可视化)

¥1 ¥2 ¥4 ¥6 ¥10 ¥20

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。