I recently took part in a CTR competition. I had originally planned to solve it with a neural network, but in the later stages tree models turned out to work surprisingly well, so I studied some feature-engineering code for handling CTR problems with tree models.
Exposure and cross features (count, nunique)
import numpy as np
import pandas as pd
from gensim.models import Word2Vec


# statistical features: count, nunique
def cnt_stat(df, group_cols, target_col=None, use_cnt=True, use_nunique=True):
    if isinstance(group_cols, list):
        col_name = '_'.join(group_cols)
    else:
        col_name = 'global_' + group_cols
    if use_cnt:
        if isinstance(group_cols, list):
            # exposure count of each group_cols combination
            df[f'{col_name}_count'] = df.groupby(group_cols)[target_col].transform('count')
        else:
            # global frequency of each value of the single column
            df[f'{col_name}_count'] = df[group_cols].map(df[group_cols].value_counts())
    if (target_col is not None) and use_nunique:
        col_name = col_name + f'_{target_col}'
        # number of distinct target values per group (cross feature)
        df[f'{col_name}_nunique'] = df.groupby(group_cols)[target_col].transform('nunique')
    return df
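A minimal usage sketch, assuming the `user_id`, `city`, and `video_id` columns of the competition log (the same names used by the functions below):

# global exposure count of each user
df = cnt_stat(df, 'user_id', use_nunique=False)                # adds global_user_id_count
# pair exposure count plus distinct videos per (user, city)
df = cnt_stat(df, ['user_id', 'city'], target_col='video_id')  # adds user_id_city_count, user_id_city_video_id_nunique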
Historical CTR features, i.e., each entity's average click (interaction) rate over all previous days
# historical CTR features
def make_ctr_feature(df):
    feature_list = ['user_id', 'video_id', 'device_name', 'city']
    task_feature = ['is_watch', 'watch_label', 'is_share', 'is_collect', 'is_comment']
    # global means over the first 13 days, used as a cold-start fallback
    mean_rate_dict = {feat_2: df[df['period'] < 14][feat_2].mean() for feat_2 in task_feature}
    for feat_1 in feature_list:
        parts = []
        # mean rate per (feature value, period), using only days strictly before `period`
        for period in range(1, 15):
            count = df[df['period'] < period].groupby(feat_1)[task_feature].mean().reset_index()
            count['period'] = period
            parts.append(count)
        res = pd.concat(parts, ignore_index=True)
        # rename to <feature>_<label>_mean_rate
        res.rename(columns={feat_2: feat_1 + '_' + feat_2 + '_mean_rate' for feat_2 in task_feature}, inplace=True)
        df = pd.merge(df, res, how='left', on=[feat_1, 'period'], sort=False)  # merge creates a new df
        for feat_2 in task_feature:
            # entities with no history fall back to the global mean rate
            df[feat_1 + '_' + feat_2 + '_mean_rate'].fillna(mean_rate_dict[feat_2], inplace=True)
        print(feat_1, 'over')
    return df
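The strict `< period` filter is what keeps this leakage-free: the rate attached to day t is computed only from days 1..t-1. A toy check, with data invented purely for illustration:

# one user, three days, is_watch = 1, 0, 1
toy = pd.DataFrame({'user_id': [1, 1, 1], 'period': [1, 2, 3],
                    'video_id': [10, 11, 12], 'device_name': ['a'] * 3, 'city': ['b'] * 3,
                    'is_watch': [1, 0, 1], 'watch_label': [1, 0, 1],
                    'is_share': [0, 0, 0], 'is_collect': [0, 0, 0], 'is_comment': [0, 0, 0]})
toy = make_ctr_feature(toy)
# day 1 has no history (filled with the global mean), day 2 sees 1.0, day 3 sees 0.5
print(toy[['period', 'user_id_is_watch_mean_rate']])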
Mapping variable-length (multi-valued) fields to embeddings
def emb_varlen_feature(df, feat, emb_size=4, window=20, min_count=5, epochs=5, workers=10):
    # each row of `feat` is a comma-separated tag string; treat it as one sentence
    sentences = df[feat].values.tolist()
    for i in range(len(sentences)):
        tags = sentences[i].split(',') if isinstance(sentences[i], str) else []
        sentences[i] = [str(x) for x in tags]
    # note: with workers > 1, results are not fully reproducible even with a fixed seed
    model = Word2Vec(sentences, vector_size=emb_size, window=window, min_count=min_count,
                     sg=0, hs=0, seed=1, epochs=epochs, workers=workers)
    # row embedding = mean of the tag vectors that survived min_count
    emb_matrix = []
    for seq in sentences:
        vec = [model.wv[w] for w in seq if w in model.wv]
        if len(vec) > 0:
            emb_matrix.append(np.mean(vec, axis=0))
        else:
            emb_matrix.append([0] * emb_size)
    emb_matrix = np.array(emb_matrix)
    for i in range(emb_size):
        df['{}_emb_{}'.format(feat, i)] = emb_matrix[:, i]
    return df
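A hedged usage example; `video_tags` is a hypothetical comma-separated multi-value column, not necessarily the competition's actual field name:

# hypothetical: df['video_tags'] holds strings like "sports,news,music"
df = emb_varlen_feature(df, 'video_tags', emb_size=4)
# adds video_tags_emb_0 .. video_tags_emb_3 as dense columns for the tree model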
Mapping exposure sequences to embeddings
def emb_pair_feature(df, f1, f2, emb_size=4, window=20, min_count=5, epochs=5, workers=1):
    print('====================================== {} {} ======================================'.format(f1, f2))
    # collect, per f1, the sequence of f2 values it was exposed to
    # (could be restricted to positive rows, e.g. df[df['is_watch'] == 1])
    list_col = '{}_{}_list'.format(f1, f2)
    tmp = df.groupby(f1)[f2].agg(list).reset_index().rename(columns={f2: list_col})
    sentences = tmp[list_col].values.tolist()
    del tmp[list_col]
    for i in range(len(sentences)):
        sentences[i] = [str(x) for x in sentences[i]]
    model = Word2Vec(sentences, vector_size=emb_size, window=window, min_count=min_count,
                     sg=0, hs=0, seed=1, epochs=epochs, workers=workers)
    # f1-level embedding = mean of the vectors of the f2 values in its sequence
    emb_matrix = []
    for seq in sentences:
        vec = [model.wv[w] for w in seq if w in model.wv]
        if len(vec) > 0:
            emb_matrix.append(np.mean(vec, axis=0))
        else:
            emb_matrix.append([0] * emb_size)
    emb_matrix = np.array(emb_matrix)
    for i in range(emb_size):
        tmp['{}_{}_emb_{}'.format(f1, f2, i)] = emb_matrix[:, i]
    return tmp
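Typical usage, under the same `user_id`/`video_id` column assumptions: the returned frame is keyed by `f1`, so it merges back onto the log one-to-many:

# embed each user by the sequence of videos they were exposed to
user_video_emb = emb_pair_feature(df, 'user_id', 'video_id', emb_size=4)
df = df.merge(user_video_emb, on='user_id', how='left')  # adds user_id_video_id_emb_0..3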
Compare the current feed's embedding against the mean embedding of historically interacted feeds, to obtain a divergence feature (the code below measures it with cosine similarity rather than a raw difference).
def gen_hist_emb_diff(df, f1, f2):
    # e.g. f1 = 'user_id', f2 = 'video_id'
    emb_cols = [c for c in df.columns.to_list() if f'{f1}_{f2}_emb' in c]
    hist_cols = [f'hist_mean_{c}' for c in emb_cols]
    tm_cols = ['is_share', 'watch_label']
    user_action = df[[f1, f2, 'period'] + tm_cols + emb_cols]
    train_df = []
    for t in range(1, 15):
        prev_tmp = user_action[user_action['period'] < t]
        now_tmp = user_action[user_action['period'] == t]
        for d in tm_cols:
            # mean f1 embedding over the rows that interacted (d != 0) with each f2 before day t
            grp_tmp = prev_tmp[prev_tmp[d] != 0].groupby(f2)[emb_cols].agg('mean').reset_index()
            grp_tmp.columns = [f2] + hist_cols  # f2, hist_mean_<emb_cols>
            now_tmp = now_tmp.merge(grp_tmp, on=f2, how='left')
            now_emb = now_tmp[emb_cols].values
            hist_emb = now_tmp[hist_cols].values
            ### cosine similarity between the current embedding and the historical mean
            sim = np.sum(np.multiply(now_emb, hist_emb), axis=-1) / (
                np.linalg.norm(now_emb, axis=-1) * np.linalg.norm(hist_emb, axis=-1) + 1e-12)
            now_tmp[f'{d}_{f1}_emb_corr'] = sim
            for col in hist_cols:
                del now_tmp[col]
            del now_tmp[d]
        now_tmp = now_tmp.drop(emb_cols, axis=1)
        train_df.append(now_tmp)
    train_ = pd.concat(train_df, axis=0, ignore_index=True)
    df = df.merge(train_, on=[f1, f2, 'period'], how='left')
    for d in tm_cols:
        # rows with no history get the mean similarity as a neutral fill
        corr_mean = df[f'{d}_{f1}_emb_corr'].dropna().mean()
        print(corr_mean)
        df[f'{d}_{f1}_emb_corr'].fillna(value=corr_mean, inplace=True)
    del train_
    return df
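After the merge shown in the previous step, the divergence features can be generated directly; `gen_hist_emb_diff` expects the `{f1}_{f2}_emb_*` columns produced by `emb_pair_feature` to already be on `df`, and it adds one cosine-similarity column per label in `tm_cols`:

# assumes user_id_video_id_emb_* columns were merged on above
df = gen_hist_emb_diff(df, 'user_id', 'video_id')
# adds is_share_user_id_emb_corr and watch_label_user_id_emb_corr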
All this feature generation runs into memory pressure, so here is a dtype-downcasting helper for compressing the DataFrame.
def reduce(df, verbose=True):
    int_list = ['int8', 'int16', 'int32', 'int64']
    float_list = ['float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns.tolist():
        col_type = df[col].dtypes
        if col_type in int_list:
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
        elif col_type in float_list:
            c_min = df[col].min()
            c_max = df[col].max()
            # note: float16 saves memory but loses precision; drop this branch if that matters
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
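The helper is meant to be called after each heavy feature step. A quick sanity check on random data, invented purely for illustration:

toy = pd.DataFrame({'a': np.random.randint(0, 100, 10 ** 6),   # fits in int8
                    'b': np.random.rand(10 ** 6)})             # fits in float16
toy = reduce(toy)
print(toy.dtypes)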