Brief introduction
Collaborative filtering
This notebook presents a naive ensemble of user-based CF and item-based CF. In this competition, users can be viewed as sessions and items as aids. We therefore build two different similarity matrices: the first computes similarities between pairs of sessions, the other between pairs of aids.
The final model is the result of a weighted sum between the session-based CF and the aid-based CF.
Given the huge number of sessions and items, we restrict the computation to sessions that started within the N_DAYS_BEFORE days (two weeks here) preceding the test set. So, even though the training set spans four weeks, for simplicity we treat only this last window as available.
Some preparation
import numpy as np
import pandas as pd
import scipy.sparse as sps
import gc
from pathlib import Path
from tqdm import tqdm
from datetime import datetime, timedelta
#import similaripy as sim
import simil as sim  # used below for cosine, dot_product and normalization.tfidf (similaripy-style API)
# Initial Configuration
NUM_SESSIONS = 12_899_779 + 1_671_803 # train sessions + test sessions
NUM_ITEMS = 1_855_603
shape = (NUM_SESSIONS, NUM_ITEMS)
ID_TO_ACTION = {
    0: 'click',
    1: 'cart',
    2: 'order'
}
N_DAYS_BEFORE = 14
Data loading
train = pd.read_parquet('../input/otto-full-optimized-memory-footprint/train.parquet')
test = pd.read_parquet('../input/otto-full-optimized-memory-footprint/test.parquet')
# Extract sessions
num_sessions = 12_899_779
sessions = np.zeros(num_sessions, dtype=np.int32)
timestamps = np.zeros(num_sessions)
previous_session = -1
i = 0
# Keep each session's first timestamp (rows are assumed to be grouped by session and sorted by ts)
for s, t in tqdm(zip(train.session, train.ts)):
    if s == previous_session:
        continue
    sessions[i] = s
    timestamps[i] = t
    previous_session = s
    i += 1
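As an aside, the same first-timestamp extraction can be written with a vectorized groupby instead of a Python loop. A minimal sketch (the names sessions_alt and timestamps_alt are illustrative, not part of the original pipeline):
# Vectorized equivalent: first (minimum) timestamp of each session
first_events = train.groupby('session', sort=False).ts.min().reset_index()
sessions_alt = first_events.session.to_numpy(dtype=np.int32)
timestamps_alt = first_events.ts.to_numpy()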
# Consider the N_DAYS_BEFORE days before the first day of the test set
min_ts = test.ts.min()
dt_obj = datetime.fromtimestamp(min_ts)
print(dt_obj)
previous_week_day = dt_obj - timedelta(days = N_DAYS_BEFORE)
# Keep sessions that started within the last N_DAYS_BEFORE days
last_week_sessions = sessions[timestamps >= int(previous_week_day.timestamp())]
f"Fraction of sessions: {last_week_sessions.shape[0]/sessions.shape[0]}"
Restricting the training set to the sessions extracted above
lw_train = train[train.session.isin(last_week_sessions)]
Computing the sparse matrices
To speed up and optimize the computation, we exploit sparse matrices, which give a memory-efficient representation of the session-aid interactions. In particular, we use Compressed Sparse Row (CSR) matrices.
We split the creation by event_type, so three different CSR matrices are created: click_csr, cart_csr and order_csr.
def create_csr(df, shape, name, folder):
    # Build a sessions x aids matrix with one entry per interaction and save it to disk
    csr = sps.csr_matrix(
        (np.ones(df.shape[0]), (df.session.values, df.aid.values)),
        shape=shape
    )
    path = Path(folder)
    path.mkdir(parents=True, exist_ok=True)
    path = path.joinpath(f"{name}_csr.npz")
    sps.save_npz(path, csr)

def create_csr_matrices(df, shape, folder):
    # One CSR per event type: click, cart, order
    for i, action in ID_TO_ACTION.items():
        tmp_df = df[df.type == i]
        create_csr(tmp_df, shape, action, folder)
create_csr_matrices(lw_train, shape, folder='lw_train')
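Note that sps.csr_matrix sums duplicate (row, column) entries, so a session that interacted with the same aid several times ends up with a count rather than a binary flag in the saved matrices (the URM construction below then resets these values to ones before applying the per-event weights). A small toy example with made-up ids illustrates this behaviour:
# Toy interactions: session 0 clicks aid 2 twice, session 1 clicks aids 0 and 2 once each
toy_sessions = np.array([0, 0, 1, 1])
toy_aids = np.array([2, 2, 0, 2])
toy_csr = sps.csr_matrix((np.ones(4), (toy_sessions, toy_aids)), shape=(2, 3))
print(toy_csr.toarray())
# [[0. 0. 2.]   <- the duplicate (0, 2) entries are summed
#  [1. 0. 1.]]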
Creating the URM
A common structure in classical recommender system models is the so-called User Rating Matrix (URM), which encodes the ratings that users assign to a set of items. The URM is the basic building block of our computation; in our case it is the combination of a train URM and a test URM.
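As a toy illustration with made-up numbers, using the same per-event weights applied in the cells below (clicks=1, carts=6, orders=3), a URM row is just the weighted combination of a session's interactions:
# Toy per-event matrices for 2 sessions x 3 items
toy_clicks = sps.csr_matrix(np.array([[1., 0., 1.], [0., 1., 0.]], dtype=np.float32))
toy_carts = sps.csr_matrix(np.array([[0., 0., 1.], [0., 0., 0.]], dtype=np.float32))
toy_orders = sps.csr_matrix(np.array([[0., 0., 1.], [0., 0., 0.]], dtype=np.float32))
# Weighted sum of the event matrices
toy_urm = 1.0 * toy_clicks + 6.0 * toy_carts + 3.0 * toy_orders
print(toy_urm.toarray())
# [[ 1.  0. 10.]
#  [ 0.  1.  0.]]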
# Last week train
train_csr_clicks = sps.load_npz('/kaggle/working/lw_train/click_csr.npz')
train_csr_carts = sps.load_npz('/kaggle/working/lw_train/cart_csr.npz')
train_csr_orders = sps.load_npz('/kaggle/working/lw_train/order_csr.npz')
# Test
test_csr_clicks = sps.load_npz('../input/csr-generator/test/click_csr.npz')
test_csr_carts = sps.load_npz('../input/csr-generator/test/cart_csr.npz')
test_csr_orders = sps.load_npz('../input/csr-generator/test/order_csr.npz')
# TRAIN: re-weight interactions per event type (clicks=1, carts=6, orders=3)
train_csr_clicks.data = np.ones(train_csr_clicks.data.shape[0], dtype=np.float32)
train_csr_carts.data = np.ones(train_csr_carts.data.shape[0], dtype=np.float32) * 6
train_csr_orders.data = np.ones(train_csr_orders.data.shape[0], dtype=np.float32) * 3
train_urm = train_csr_clicks + train_csr_carts + train_csr_orders
del train_csr_clicks, train_csr_carts, train_csr_orders
# TEST: same per-event re-weighting
test_csr_clicks.data = np.ones(test_csr_clicks.data.shape[0], dtype=np.float32)
test_csr_carts.data = np.ones(test_csr_carts.data.shape[0], dtype=np.float32) * 6
test_csr_orders.data = np.ones(test_csr_orders.data.shape[0], dtype=np.float32) * 3
test_urm = test_csr_clicks + test_csr_carts + test_csr_orders
del test_csr_clicks #, test_csr_carts, test_csr_orders
gc.collect()
urm = train_urm + test_urm
Another common strategy is to normalize the URM according to some function. Different normalization techniques can be deployed: l1, l2, max, tf-idf, bm25.
We choose the TF-IDF normalization.
urm = sim.normalization.tfidf(urm)
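sim.normalization.tfidf re-weights the URM so that aids appearing in many sessions count less. As a rough, library-independent sketch of the same idea (sklearn's TfidfTransformer is used here purely for illustration; its smoothing and row normalization may differ from similaripy's exact formula):
from sklearn.feature_extraction.text import TfidfTransformer
# Toy interaction matrix: rows = sessions, cols = aids
toy_counts = sps.csr_matrix(np.array([[1., 0., 10.], [0., 1., 0.], [1., 0., 1.]]))
# TF-IDF: aids appearing in many sessions (dense columns) are down-weighted
print(TfidfTransformer().fit_transform(toy_counts).toarray().round(3))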
Models
Both models act in the same way:
take the URM as input;
compute a similarity matrix according to some function;
extract the final scores from the URM and the similarity matrix.
So, given a URM with shape (NUM_SESSIONS, NUM_ITEMS), the two models are computed as follows.
User-based CF:
compute the similarity matrix: similarity_method(URM) --> SIM with shape (NUM_SESSIONS, NUM_SESSIONS)
extract the scores: scores = SIM · URM, where · is the dot product and scores has shape (NUM_SESSIONS, NUM_ITEMS)
Item-based CF:
compute the similarity matrix: similarity_method(URM.T) --> SIM with shape (NUM_ITEMS, NUM_ITEMS)
extract the scores: scores = URM · SIM, where · is the dot product and scores has shape (NUM_SESSIONS, NUM_ITEMS)
Several similarity_method functions can be chosen; in our case we use the cosine similarity to extract the similarity matrix. A minimal toy sketch of this recipe follows, before the actual implementation.
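Here is that sketch: user-based CF scoring on a toy URM with plain scipy/sklearn (cosine similarity via l2-normalized rows). The toy URM and variable names are illustrative only; item-based CF is obtained analogously by starting from URM.T.
from sklearn.preprocessing import normalize
# Toy URM: 3 sessions x 4 items
urm_toy = sps.csr_matrix(np.array([[1., 0., 1., 0.],
                                   [1., 0., 0., 1.],
                                   [0., 1., 0., 1.]]))
# Step 1: SIM = cosine(URM), shape (sessions, sessions)
urm_l2 = normalize(urm_toy, norm='l2', axis=1)
sim_toy = urm_l2 @ urm_l2.T
sim_toy.setdiag(0)  # ignore self-similarity
# Step 2: scores = SIM . URM, shape (sessions, items)
scores_toy = sim_toy @ urm_toy
print(scores_toy.toarray().round(3))
In the real functions below, similaripy's cosine additionally keeps only the topK neighbours per row, and dot_product restricts the computation to the test sessions.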
def user_based_cf(urm, test_urm, topK=30, cutoff=20):
    # Session-session cosine similarity, keeping the topK neighbours per session
    model = sim.cosine(urm, k=topK)
    model.setdiag(0)  # drop self-similarity
    model = model.transpose().tocsr()
    test_sessions = np.unique(test_urm.nonzero()[0])
    # scores = SIM . URM, computed only for the test sessions
    scores = sim.dot_product(model, test_urm, k=cutoff, target_rows=test_sessions) # filter_cols=test_urm)
    return scores.tocsr()

def item_based_cf(urm, test_urm, topK=100, cutoff=20):
    # Aid-aid cosine similarity, keeping the topK neighbours per aid
    model = sim.cosine(urm.T, k=topK)
    model.setdiag(0)  # drop self-similarity
    model = model.transpose().tocsr()
    test_sessions = np.unique(test_urm.nonzero()[0])
    # scores = URM . SIM, computed only for the test sessions
    scores = sim.dot_product(test_urm, model, k=cutoff, target_rows=test_sessions) # filter_cols=test_urm)
    return scores.tocsr()
user_cf_scores = user_based_cf(urm, test_urm, topK=50, cutoff=20)
item_cf_scores = item_based_cf(urm, test_urm, topK=50, cutoff=20)
Ensemble
Naive fusion: a weighted sum of the two score matrices with fixed weights. The weights are hyperparameters and should be tuned properly.
test_scores = 0.7 * user_cf_scores + 0.3 * item_cf_scores
test_scores
Computing the predictions
For each session_id in the test set, we extract the recommended items, sorted by their score within the session.
test_sessions = np.unique(test_urm.nonzero()[0])
result_click = []
# Keep only the rows of the test sessions
test_scores = test_scores[test_sessions]
# Iterate over the CSR rows: indices[left:right] are the candidate aids, data[left:right] their scores
for left, right in tqdm(zip(test_scores.indptr[:-1], test_scores.indptr[1:])):
    session_recs = test_scores.indices[left:right]
    session_recs_scores = test_scores.data[left:right]
    # Sort aids by descending score
    sorted_recs = session_recs[np.argsort(session_recs_scores)[::-1]]
    result_click.append(sorted_recs)
Getting the most recent carts/orders
For each session in the test set, consider the most recent carts/orders.
# For each test session, the deduplicated list of carted/ordered aids (chronological order preserved), capped at the last 20
last_carts_test = (test[test.type == 1].sort_values(['session', 'ts']).groupby('session')
                   .apply(lambda x: list(x['aid']))
                   .apply(lambda l: list(dict.fromkeys(l)))
                   .apply(lambda l: l[-20:]))
last_order_test = (test[test.type == 2].sort_values(['session', 'ts']).groupby('session')
                   .apply(lambda x: list(x['aid']))
                   .apply(lambda l: list(dict.fromkeys(l)))
                   .apply(lambda l: l[-20:]))
# Utility function: if a session already has 20 carted/ordered aids, keep them as they are; otherwise pad up to 20 with the highest-scoring recommended items
def get_cart_order_sub(last_aids_test, test_sessions, result_click, op, k=20):
    session_type = np.zeros(test_sessions.shape[0], dtype=object)
    session_labels = np.zeros(test_sessions.shape[0], dtype=object)
    for i in tqdm(range(test_sessions.shape[0])):
        session = test_sessions[i]
        # Start from the aids already carted/ordered in this session, if any
        if session in last_aids_test:
            pred_aids = last_aids_test[session]
        else:
            pred_aids = []
        session_type[i] = f"{session}_{op}"
        # Pad up to k aids with the highest-scoring recommendations
        if len(pred_aids) < k:
            for aid in result_click[i]:
                if aid not in pred_aids:
                    pred_aids.append(aid)
                if len(pred_aids) == k:
                    break
        session_labels[i] = " ".join([str(x) for x in pred_aids])
    return pd.DataFrame({'session_type': session_type, 'labels': session_labels})
Extracting the prediction DataFrames
subs = []
op_names = ["clicks", "carts", "orders"]
for op in op_names:
    print(f"Computing df for {op}")
    if op == 'clicks':
        # Join the recommended aids into the space-separated submission format
        sub = pd.DataFrame({
            'session_type': test_sessions,
            'labels': [" ".join(str(aid) for aid in recs) for recs in result_click]
        })
        sub.session_type = sub.session_type.astype(str) + f"_{op}"
    if op == 'carts':
        sub = get_cart_order_sub(last_carts_test, test_sessions, result_click, op, k=20)
    if op == 'orders':
        sub = get_cart_order_sub(last_order_test, test_sessions, result_click, op, k=20)
    subs.append(sub)
print("Writing output")
sub = pd.concat(subs).reset_index(drop=True)
sub.to_csv('submission.csv', index=False)