多路召回
多路召回:采用不同的策略、特征或简单模型,分别召回一部分候选集,然后把候选集混在一起供后续排序模型使用
导包
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict
import os, math, warnings, math, pickle
from tqdm import tqdm
import faiss
import collections
import random
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from datetime import datetime
from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from sklearn.preprocessing import LabelEncoder
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from deepmatch.models import *
from deepmatch.utils import sampledsoftmaxloss
# Silence library warnings (pandas/sklearn emit many deprecation notices here)
warnings.filterwarnings('ignore')
data_path = './'  # directory holding the raw csv files
save_path = './'  # directory where intermediate pickles are written
# Flag for recall evaluation; if False, recall runs directly on the full data
metric_recall = False
读取数据
# debug模式: 从训练集中划出一部分数据来调试代码
def get_all_click_sample(data_path, sample_nums=10000):
    """
    Sample a subset of users from the training click log for debugging.
    data_path: directory containing the raw data
    sample_nums: number of users to sample (sampling by user keeps memory bounded)
    Returns the deduplicated clicks of the sampled users.
    """
    clicks = pd.read_csv(data_path + 'train_click_log.csv')
    user_ids = clicks.user_id.unique()
    # Pick users without replacement so each sampled user is distinct
    chosen_users = np.random.choice(user_ids, size=sample_nums, replace=False)
    clicks = clicks[clicks['user_id'].isin(chosen_users)]
    # Drop exact repeat clicks (same user, article, and timestamp)
    clicks = clicks.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))
    return clicks
# 读取点击数据,这里分成线上和线下,如果是为了获取线上提交结果应该将测试集中的点击数据合并到总的数据中
# 如果是为了线下验证模型的有效性或者特征的有效性,可以只使用训练集
def get_all_click_df(data_path='./', offline=True):
    """
    Load the click log.
    data_path: directory containing the raw data
    offline: True  -> only the training click log (offline validation)
             False -> train + testA click logs concatenated (online submission)
    Returns a DataFrame deduplicated on (user_id, click_article_id, click_timestamp).
    """
    if offline:
        # Fix: the two branches were swapped — offline validation must use only
        # the training set, per the comment above and the call site that passes
        # offline=False to obtain the full (train + test) data.
        all_click = pd.read_csv(data_path + 'train_click_log.csv')
    else:
        train_click = pd.read_csv(data_path + 'train_click_log.csv')
        test_click = pd.read_csv(data_path + 'testA_click_log.csv')
        # DataFrame.append was removed in pandas 2.0; concat is the replacement
        all_click = pd.concat([train_click, test_click])
    all_click = all_click.drop_duplicates((['user_id', 'click_article_id', 'click_timestamp']))
    return all_click
# 读取文章的基本属性
def get_item_info_df(data_path):
    """
    Load article metadata and rename article_id to click_article_id so the
    frame joins directly with the click logs.
    """
    articles = pd.read_csv(data_path + 'articles.csv')
    return articles.rename(columns={'article_id': 'click_article_id'})
# 读取文章的Embedding数据
def get_item_emb_dict(data_path):
    """
    Load article embeddings, L2-normalise each row, and return
    {article_id: vector}; the mapping is also pickled to
    save_path + 'item_content_emb.pkl' (save_path is a module-level global).
    """
    item_emb_df = pd.read_csv(data_path + 'articles_emb.csv')
    item_emb_cols = [x for x in item_emb_df.columns if 'emb' in x]
    item_emb_np = np.ascontiguousarray(item_emb_df[item_emb_cols])
    # Normalise so that inner product equals cosine similarity
    item_emb_np = item_emb_np / np.linalg.norm(item_emb_np, axis=1, keepdims=True)
    item_emb_dict = dict(zip(item_emb_df['article_id'], item_emb_np))
    # Fix: pickle.dump(obj, open(...)) leaked the file handle; a context
    # manager guarantees the file is flushed and closed.
    with open(save_path + 'item_content_emb.pkl', 'wb') as f:
        pickle.dump(item_emb_dict, f)
    return item_emb_dict
# Min-max scale a Series/column into [0, 1]
max_min_scaler = lambda x : (x-np.min(x))/(np.max(x)-np.min(x))
# Full data set (intended: train + test clicks; see get_all_click_df)
all_click_df = get_all_click_df(offline=False)
# Normalise click timestamps to [0, 1]; used as weights in the
# association-rule (itemcf/usercf) similarity computations later
all_click_df['click_timestamp'] = all_click_df[['click_timestamp']].apply(max_min_scaler)
# Article metadata and content embeddings, loaded once at module level
item_info_df = get_item_info_df(data_path)
item_emb_dict = get_item_emb_dict(data_path)
工具函数
# 根据点击时间获取用户的点击文章序列 {user1: [(item1, time1), (item2, time2)..]...}
def get_user_item_time(click_df):
    """
    Return {user_id: [(article_id, click_timestamp), ...]} with each user's
    clicks ordered by ascending timestamp.
    """
    click_df = click_df.sort_values('click_timestamp')

    def make_item_time_pair(df):
        return list(zip(df['click_article_id'], df['click_timestamp']))

    # Fix: select the two columns with a list — tuple-style multi-column
    # indexing on a GroupBy was deprecated and removed in pandas 2.x.
    user_item_time_df = (click_df.groupby('user_id')[['click_article_id', 'click_timestamp']]
                         .apply(lambda x: make_item_time_pair(x))
                         .reset_index()
                         .rename(columns={0: 'item_time_list'}))
    user_item_time_dict = dict(zip(user_item_time_df['user_id'], user_item_time_df['item_time_list']))
    return user_item_time_dict
# 根据时间获取商品被点击的用户序列 {item1: [(user1, time1), (user2, time2)...]...}
# 这里的时间是用户点击当前商品的时间,好像没有直接的关系。
def get_item_user_time_dict(click_df):
    """
    Return {article_id: [(user_id, click_timestamp), ...]} where the timestamp
    is the moment that user clicked this article, ordered by ascending time.
    """
    def make_user_time_pair(df):
        return list(zip(df['user_id'], df['click_timestamp']))

    click_df = click_df.sort_values('click_timestamp')
    # Fix: select the two columns with a list — tuple-style multi-column
    # indexing on a GroupBy was deprecated and removed in pandas 2.x.
    item_user_time_df = (click_df.groupby('click_article_id')[['user_id', 'click_timestamp']]
                         .apply(lambda x: make_user_time_pair(x))
                         .reset_index()
                         .rename(columns={0: 'user_time_list'}))
    item_user_time_dict = dict(zip(item_user_time_df['click_article_id'], item_user_time_df['user_time_list']))
    return item_user_time_dict
# 获取当前数据的历史点击和最后一次点击
def get_hist_and_last_click(all_click):
    """
    Split the click log into each user's historical clicks and their final
    click. Users with a single click keep it in BOTH parts — a deliberate
    leak so such users remain visible during training.
    Returns (click_hist_df, click_last_df).
    """
    ordered = all_click.sort_values(by=['user_id', 'click_timestamp'])
    click_last_df = ordered.groupby('user_id').tail(1)

    def drop_last(user_df):
        # Keep the lone click for single-click users; otherwise drop the final one
        return user_df if len(user_df) == 1 else user_df[:-1]

    click_hist_df = ordered.groupby('user_id').apply(drop_last).reset_index(drop=True)
    return click_hist_df, click_last_df
# 获取文章id对应的基本属性,保存成字典的形式,方便后面召回阶段,冷启动阶段直接使用
def get_item_info_dict(item_info_df):
    """
    Build per-article lookup dicts keyed by click_article_id:
    (category_id, words_count, min-max-normalised created_at_ts).
    NOTE: normalises created_at_ts in place on the input frame.
    """
    scale = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
    item_info_df['created_at_ts'] = item_info_df[['created_at_ts']].apply(scale)
    ids = item_info_df['click_article_id']
    item_type_dict = dict(zip(ids, item_info_df['category_id']))
    item_words_dict = dict(zip(ids, item_info_df['words_count']))
    item_created_time_dict = dict(zip(ids, item_info_df['created_at_ts']))
    return item_type_dict, item_words_dict, item_created_time_dict
def get_user_hist_item_info_dict(all_click):
    """
    Aggregate per-user historical click statistics. Expects all_click already
    joined with article metadata (category_id, words_count, created_at_ts).
    Returns four dicts keyed by user_id:
      - set of clicked category ids
      - set of clicked article ids
      - mean words_count of clicked articles
      - min-max-normalised creation time of the user's last-clicked article
    """
    grouped = all_click.groupby('user_id')

    # Set of article categories each user has clicked
    cat_sets = grouped['category_id'].agg(set).reset_index()
    user_hist_item_typs_dict = dict(zip(cat_sets['user_id'], cat_sets['category_id']))

    # Set of article ids each user has clicked
    id_sets = grouped['click_article_id'].agg(set).reset_index()
    user_hist_item_ids_dict = dict(zip(id_sets['user_id'], id_sets['click_article_id']))

    # Mean word count of each user's clicked articles
    mean_words = grouped['words_count'].agg('mean').reset_index()
    user_hist_item_words_dict = dict(zip(mean_words['user_id'], mean_words['words_count']))

    # Creation time of each user's most recent click, min-max scaled across users
    by_time = all_click.sort_values('click_timestamp')
    last_created = by_time.groupby('user_id')['created_at_ts'].apply(lambda s: s.iloc[-1]).reset_index()
    scale = lambda x: (x - np.min(x)) / (np.max(x) - np.min(x))
    last_created['created_at_ts'] = last_created[['created_at_ts']].apply(scale)
    user_last_item_created_time_dict = dict(zip(last_created['user_id'],
                                                last_created['created_at_ts']))

    return user_hist_item_typs_dict, user_hist_item_ids_dict, user_hist_item_words_dict, user_last_item_created_time_dict
# 获取近期点击最多的文章
def get_item_topk_click(click_df, k):
    """Return the ids of the k most-clicked articles (most-clicked first)."""
    return click_df['click_article_id'].value_counts().head(k).index
定义多路召回字典
# 获取文章的属性信息,保存成字典的形式方便查询
item_type_dict, item_words_dict, item_created_time_dict = get_item_info_dict(item_info_df)
# 定义一个多路召回的字典,将各路召回的结果都保存在这个字典当中
user_multi_recall_dict = {'itemcf_sim_itemcf_recall': {},
'embedding_sim_item_recall': {},
'youtubednn_recall': {},
'youtubednn_usercf_recall': {},
'cold_start_recall': {}}
# 提取最后一次点击作为召回评估,如果不需要做召回评估直接使用全量的训练集进行召回(线下验证模型)
# 如果不是召回评估,直接使用全量数据进行召回,不用将最后一次提取出来
trn_hist_click_df, trn_last_click_df = get_hist_and_last_click(all_click_df)
召回效果评估
# 依次评估召回的前10, 20, 30, 40, 50个文章中的击中率
def metrics_recall(user_recall_items_dict, trn_last_click_df, topk=50):
    """
    Print the hit rate of each user's last click within the top-k recalled
    items, for k = 10, 20, ..., topk.
    user_recall_items_dict: {user_id: [(article_id, score), ...]} best-first
    trn_last_click_df: DataFrame with each user's last click
    topk: largest cutoff to evaluate. Default fixed from 5 to 50: with 5,
          range(10, topk + 1, 10) was empty and the function silently
          evaluated nothing, contradicting the stated 10..50 evaluation.
    """
    last_click_item_dict = dict(zip(trn_last_click_df['user_id'], trn_last_click_df['click_article_id']))
    user_num = len(user_recall_items_dict)
    for k in range(10, topk + 1, 10):
        hit_num = 0
        for user, item_list in user_recall_items_dict.items():
            # Top-k recalled article ids for this user (use the loop variable
            # instead of re-indexing the dict)
            tmp_recall_items = {item for item, _ in item_list[:k]}
            if last_click_item_dict[user] in tmp_recall_items:
                hit_num += 1
        hit_rate = round(hit_num * 1.0 / user_num, 5)
        print(' topk: ', k, ' : ', 'hit_num: ', hit_num, 'hit_rate: ', hit_rate, 'user_num : ', user_num)