CTR Tree Models: Feature Engineering Code

I recently took part in a CTR competition. I originally planned to solve it with a neural network, but later found that tree models worked surprisingly well, so I studied some feature engineering code for tackling CTR problems with tree models.

Exposure and cross features (count, nunique)

import numpy as np
import pandas as pd

# Statistical features: count, nunique
def cnt_stat(df, group_cols, target_col=None, use_cnt=True, use_nunique=True):
    if isinstance(group_cols, list):
        col_name = '_'.join(group_cols)
    else:
        col_name = 'global_' + group_cols

    if use_cnt:
        if isinstance(group_cols, list):
            # group size: count the first group key so this also works when target_col is None
            df[f'{col_name}_count'] = df.groupby(group_cols)[group_cols[0]].transform('count')
        else:
            df[f'{col_name}_count'] = df[group_cols].map(df[group_cols].value_counts())

    if (target_col is not None) and use_nunique:
        col_name = col_name + f'_{target_col}'
        df[f'{col_name}_nunique'] = df.groupby(group_cols)[target_col].transform('nunique')
    return df
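
A minimal usage sketch (the toy frame and values below are made up; user_id and video_id follow the column names used later in this post):

df = pd.DataFrame({
    'user_id':  [1, 1, 2, 2, 2],
    'video_id': [10, 11, 10, 10, 12],
})
# exposures per user, plus how many distinct videos each user saw
df = cnt_stat(df, ['user_id'], target_col='video_id')
# global exposure count of each video
df = cnt_stat(df, 'video_id', use_nunique=False)
print(df)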

Historical CTR features, i.e. a user's average click-through rate over previous days

# Historical CTR features
def make_ctr_feature(df):
    feature_list = ['user_id', 'video_id', 'device_name', 'city']
    task_feature = ['is_watch', 'watch_label', 'is_share', 'is_collect', 'is_comment']

    # global means, used to fill values for ids with no history
    mean_rate_dict = {feat_2: df[df['period'] < 14][feat_2].mean() for feat_2 in task_feature}

    for feat_1 in feature_list:
        res = []
        # mean rate of each target per (feature value, period) pair,
        # using only earlier days to avoid label leakage
        for period in range(1, 15):
            count = df[df['period'] < period].groupby(feat_1)[task_feature].mean().reset_index()
            count['period'] = period
            res.append(count)
        res = pd.concat(res, ignore_index=True)

        # rename the mean-rate columns
        res.rename(columns={feat_2: feat_1 + '_' + feat_2 + '_mean_rate' for feat_2 in task_feature}, inplace=True)
        df = pd.merge(df, res, how='left', on=[feat_1, 'period'], sort=False)  # merge returns a new df

        for feat_2 in task_feature:
            col = feat_1 + '_' + feat_2 + '_mean_rate'
            df[col] = df[col].fillna(mean_rate_dict[feat_2])

        print(feat_1, 'done')

    return df
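
Note the strict inequality df['period'] < period: day t's feature is computed only from days before t, so a row's own label never leaks into its feature. A hedged usage sketch, assuming df holds one row per impression with a 1-14 day index in period and the targets listed in task_feature:

df = make_ctr_feature(df)
print([c for c in df.columns if c.endswith('_mean_rate')])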

Mapping variable-length fields to embeddings

from gensim.models import Word2Vec

def emb_varlen_feature(df, feat, emb_size=4, window=20, min_count=5, epochs=5, workers=10):
    sentences = df[feat].values.tolist()

    # split the comma-separated field into string tokens; missing values become empty lists
    for i in range(len(sentences)):
        tags = sentences[i].split(',') if isinstance(sentences[i], str) else []
        sentences[i] = [str(x) for x in tags]

    model = Word2Vec(sentences, vector_size=emb_size, window=window, min_count=min_count,
                     sg=0, hs=0, seed=1, epochs=epochs, workers=workers)
    emb_matrix = []

    # mean-pool each row's token vectors; rows with no in-vocabulary token get a zero vector
    for seq in sentences:
        vec = []
        for w in seq:
            if w in model.wv:
                vec.append(model.wv[w])
        if len(vec) > 0:
            emb_matrix.append(np.mean(vec, axis=0))
        else:
            emb_matrix.append([0] * emb_size)

    emb_matrix = np.array(emb_matrix)

    for i in range(emb_size):
        df['{}_emb_{}'.format(feat, i)] = emb_matrix[:, i]
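
A minimal sketch of calling it on a hypothetical comma-separated tag column ('video_tags' is made up; min_count=1 only so the toy tokens are not filtered out):

df = pd.DataFrame({'video_tags': ['1,3,7', '3,7', np.nan, '2']})
emb_varlen_feature(df, 'video_tags', emb_size=4, min_count=1)
print(df.filter(like='video_tags_emb_'))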

Mapping exposure sequences to embeddings

def emb_pair_feature(df, f1, f2, emb_size=4, window=20, min_count=5, epochs=5, workers=1):
    print('====================================== {} {} ======================================'.format(f1, f2))
    # collect each f1 value's sequence of f2 values
    # (optionally restrict to positive rows, e.g. df[df['is_watch'] == 1])
    list_col = '{}_{}_list'.format(f1, f2)
    tmp = df.groupby(f1)[f2].agg(list).reset_index()
    tmp.columns = [f1, list_col]
    sentences = tmp[list_col].values.tolist()
    del tmp[list_col]

    for i in range(len(sentences)):
        sentences[i] = [str(x) for x in sentences[i]]

    model = Word2Vec(sentences, vector_size=emb_size, window=window, min_count=min_count,
                     sg=0, hs=0, seed=1, epochs=epochs, workers=workers)
    emb_matrix = []

    # mean-pool each f1 value's sequence; all-out-of-vocabulary sequences get a zero vector
    for seq in sentences:
        vec = []
        for w in seq:
            if w in model.wv:
                vec.append(model.wv[w])
        if len(vec) > 0:
            emb_matrix.append(np.mean(vec, axis=0))
        else:
            emb_matrix.append([0] * emb_size)

    emb_matrix = np.array(emb_matrix)

    for i in range(emb_size):
        tmp['{}_{}_emb_{}'.format(f1, f2, i)] = emb_matrix[:, i]

    return tmp
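
The function returns one embedding row per f1 value, so merge it back onto df before building downstream features, e.g.:

# user-side embedding of the video sequence each user was exposed to
user_video_emb = emb_pair_feature(df, 'user_id', 'video_id')
df = df.merge(user_video_emb, on='user_id', how='left')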

Compare the current embedding with the mean of historically interacted embeddings, capturing how much the current item differs from history (the code below measures this with cosine similarity rather than a raw difference)

def gen_hist_emb_diff(df, f1, f2):
    # e.g. f1 = 'user_id', f2 = 'video_id'
    emb_cols = [c for c in df.columns.to_list() if f'{f1}_{f2}_emb' in c]
    hist_cols = [f'hist_mean_{c}' for c in emb_cols]

    tm_cols = ['is_share', 'watch_label']
    user_action = df[[f1, f2, 'period'] + tm_cols + emb_cols]

    train_df = []
    for t in range(1, 15):
        prev_tmp = user_action[user_action['period'] < t]
        now_tmp = user_action[user_action['period'] == t]

        for d in tm_cols:
            # mean f1 embedding over rows where f2 received a non-zero action before day t
            grp_tmp = prev_tmp[prev_tmp[d] != 0].groupby(f2)[emb_cols].agg('mean').reset_index()
            grp_tmp.columns = [f2] + hist_cols  # f2, hist_mean_f1_emb_cols
            now_tmp = now_tmp.merge(grp_tmp, on=f2, how='left')
            now_emb = now_tmp[emb_cols].values
            hist_emb = now_tmp[hist_cols].values

            # cosine similarity; the small epsilon guards against zero-norm vectors
            sim = np.sum(np.multiply(now_emb, hist_emb), axis=-1) / (
                        np.linalg.norm(now_emb, axis=-1) * np.linalg.norm(hist_emb, axis=-1) + 1e-12)
            now_tmp[f'{d}_{f1}_emb_corr'] = sim

            for col in hist_cols:
                del now_tmp[col]
            del now_tmp[d]

        now_tmp = now_tmp.drop(emb_cols, axis=1)
        train_df.append(now_tmp)

    train_ = pd.concat(train_df, axis=0, ignore_index=True)

    df = df.merge(train_, on=[f1, f2, 'period'], how='left')
    for d in tm_cols:
        # fill rows with no history using the mean similarity
        corr_mean = df[f'{d}_{f1}_emb_corr'].dropna().values.mean()
        print(corr_mean)
        df[f'{d}_{f1}_emb_corr'] = df[f'{d}_{f1}_emb_corr'].fillna(corr_mean)
    del train_

    return df
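
A hedged usage sketch (df must already contain period, is_share, watch_label, and the user_id_video_id_emb_* columns produced by emb_pair_feature above):

df = gen_hist_emb_diff(df, 'user_id', 'video_id')
print(df.filter(like='_emb_corr').describe())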

Feature generation like this quickly runs into memory problems; below is a downcasting function to compress the data.

def reduce(df, verbose=True):
    int_list = ['int8', 'int16', 'int32', 'int64']
    float_list = ['float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024 ** 2
    for col in df.columns.tolist():
        col_type = str(df[col].dtypes)
        if col_type in int_list:
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                df[col] = df[col].astype(np.int32)
        elif col_type in float_list:
            c_min = df[col].min()
            c_max = df[col].max()
            # note: the float16 branch only checks range, not precision loss
            if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage().sum() / 1024 ** 2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
            end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df
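
Typically it is called once after all features are built, e.g.:

# downcast after feature generation; watch float16 columns for precision loss
df = reduce(df)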