Imports
#GPU大大加速了这个过程。
#我在开发时关闭GPU,提交时打开GPU
# FYI:
# This pip command takes a lot with GPU enabled (~15 min)
# It works though. And GPU accelerates the process *a lot*.
# I am developing with GPU turned off and submitting with GPU turned on
!pip install --upgrade implicit
import os; os.environ['OPENBLAS_NUM_THREADS']='1'
import numpy as np
import pandas as pd
import implicit
from scipy.sparse import coo_matrix
from implicit.evaluation import mean_average_precision_at_k
Load dataframes
%%time
# Paths to the H&M Personalized Fashion Recommendations competition files.
base_path = '../input/h-and-m-personalized-fashion-recommendations/'
csv_train = f'{base_path}transactions_train.csv'
csv_sub = f'{base_path}sample_submission.csv'
csv_users = f'{base_path}customers.csv'
csv_items = f'{base_path}articles.csv'
# article_id is read as str to preserve leading zeros (required by the submission format);
# t_dat is parsed to datetime so we can slice by date later.
df = pd.read_csv(csv_train, dtype={'article_id': str}, parse_dates=['t_dat'])
df_sub = pd.read_csv(csv_sub)
dfu = pd.read_csv(csv_users)
dfi = pd.read_csv(csv_items, dtype={'article_id': str})
# Trying with less data: keep only the most recent ~4 weeks of transactions.
# https://www.kaggle.com/tomooinubushi/folk-of-time-is-our-best-friend/notebook
df = df[df['t_dat'] > '2020-08-21']
df.shape
对于验证,这意味着3周的训练和1周的验证
对于提交,这意味着4周的训练
# For validation this means 3 weeks of training and 1 week for validation
# For submission, it means 4 weeks of training
df['t_dat'].max()  # inspect the most recent transaction date in the filtered data
Assign autoincrementing ids starting from 0 to both users and items
为用户和项目分配从0开始的自增id
# Assign dense, zero-based integer ids to every customer and article.
ALL_USERS = dfu['customer_id'].unique().tolist()
ALL_ITEMS = dfi['article_id'].unique().tolist()
# index -> original id (needed to translate model output back for the submission)
user_ids = {idx: user for idx, user in enumerate(ALL_USERS)}
item_ids = {idx: item for idx, item in enumerate(ALL_ITEMS)}
# original id -> index (inverse lookup used to encode the transactions)
user_map = {user: idx for idx, user in enumerate(ALL_USERS)}
item_map = {item: idx for idx, item in enumerate(ALL_ITEMS)}
df['user_id'] = df['customer_id'].map(user_map)
df['item_id'] = df['article_id'].map(item_map)
del dfu, dfi
Create coo_matrix (user x item) and csr matrix (user x item)
It is common to use scipy sparse matrices in recommender systems, because the main core of the problem is typically modeled as a matrix with users and items, with the values representing whether the user purchased (or liked) an items. Since each user purchases only a small fraction of the catalog of products, this matrix is full of zero (aka: it’s sparse).
In a very recent release they did an API breaking change, so be aware of that: https://github.com/benfred/implicit/releases In this notebook we are using the latest version, so everything is aligned with (user x item)
We are using (user x item) matrices, both for training and for evaluating/recommender.
In the previous versions the training procedure required a COO item x user
For evaluation and prediction, on the other hand, CSR matrices with users x items format should be provided.
About COO matrices
COO matrices are a kind of sparse matrix. They store their values as tuples of (row, column, value) (the coordinates)
You can read more about them here:
https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)
https://scipy-lectures.org/advanced/scipy_sparse/coo_matrix.html
From https://het.as.utexas.edu/HET/Software/Scipy/generated/scipy.sparse.coo_matrix.html
创建coo_matrix(用户x项)和csr矩阵(用户x项)
在推荐系统中通常使用scipy稀疏矩阵,因为问题的主要核心通常被建模为包含用户和商品的矩阵,其中的值表示用户是否购买(或喜欢)商品。由于每个用户只购买产品目录的一小部分,所以这个矩阵充满了零(又名:它是稀疏的)。
在最近发布的版本中,他们做了一个API突破性的更改,所以要注意:https://github.com/benfred/implicit/releases在这个笔记本中,我们使用的是最新版本,所以所有内容都与(user x item)对齐
我们正在使用(用户x项)矩阵,用于训练和评估/推荐。
在以前的版本中,培训程序需要COO项目x用户
另一方面,为了评估和预测,应提供用户x项格式的CSR矩阵。
关于COO矩阵
COO矩阵是一种稀疏矩阵。它们将值存储为(行,列,值)的元组(坐标)
你可以在这里了解更多:
https://en.wikipedia.org/wiki/Sparse_matrix#Coordinate_list_(COO)
https://scipy-lectures.org/advanced/scipy_sparse/coo_matrix.html
从https://het.as.utexas.edu/HET/Software/Scipy/generated/scipy.sparse.coo_matrix.html
>>> row = np.array([0,3,1,0]) # user_ids
>>> col = np.array([0,3,1,2]) # item_ids
>>> data = np.array([4,5,7,9]) # values, one per (row, col) pair; in our case all ones, one per transaction
>>> coo_matrix((data,(row,col)), shape=(4,4)).todense()
matrix([[4, 0, 9, 0],
[0, 7, 0, 0],
[0, 0, 0, 0],
[0, 0, 0, 5]])
About CSR matrices
https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)
关于CSR矩阵
# Build the (users x items) interaction matrix: a 1.0 for every purchase event.
row = df['user_id'].values
col = df['item_id'].values
data = np.ones(df.shape[0])
# Shape spans ALL users/items (not just those in the filtered df) so indices
# stay aligned with the full submission.
coo_train = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
coo_train
Check that model works ok with data
%%time
# Smoke test: fit a tiny ALS model to verify the matrix format is accepted.
model = implicit.als.AlternatingLeastSquares(factors=10, iterations=2)
model.fit(coo_train)
Validation
Functions required for validation
def to_user_item_coo(df):
    """Turn a dataframe with transactions into a COO sparse (users x items) matrix.

    The matrix spans all known users/items (module-level ALL_USERS / ALL_ITEMS),
    with a 1.0 entry for every transaction row in ``df``.
    (Fixed docstring: the previous one said "items x users", but the matrix is
    built with users as rows and items as columns.)
    """
    row = df['user_id'].values
    col = df['item_id'].values
    data = np.ones(df.shape[0])
    coo = coo_matrix((data, (row, col)), shape=(len(ALL_USERS), len(ALL_ITEMS)))
    return coo
def split_data(df, validation_days=7):
    """Split transactions into train/validation sets by date.

    The last <<validation_days>> days (relative to the most recent transaction)
    become validation; everything earlier is training.

    Bug fix: ``pd.Timedelta(validation_days)`` interprets a bare integer as
    *nanoseconds*, which made the cutoff essentially equal to the max date.
    We need days.
    """
    validation_cut = df['t_dat'].max() - pd.Timedelta(days=validation_days)
    df_train = df[df['t_dat'] < validation_cut]
    df_val = df[df['t_dat'] >= validation_cut]
    return df_train, df_val
def get_val_matrices(df, validation_days=7):
    """Split df into train/validation windows and build the sparse matrices.

    Returns a dict with:
        coo_train: training data, COO sparse, (users x items)
        csr_train: training data, CSR sparse, (users x items)
        csr_val:   validation data, CSR sparse, (users x items)
    """
    df_train, df_val = split_data(df, validation_days=validation_days)
    train_coo = to_user_item_coo(df_train)
    matrices = {}
    matrices['coo_train'] = train_coo
    matrices['csr_train'] = train_coo.tocsr()
    matrices['csr_val'] = to_user_item_coo(df_val).tocsr()
    return matrices
def validate(matrices, factors=200, iterations=20, regularization=0.01, show_progress=True):
    """Fit an ALS model (<<factors>> embedding dims, <<iterations>> sweeps)
    on the training matrix and report MAP@12 against the validation matrix.
    """
    coo_train = matrices['coo_train']
    csr_train = matrices['csr_train']
    csr_val = matrices['csr_val']
    model = implicit.als.AlternatingLeastSquares(
        factors=factors,
        iterations=iterations,
        regularization=regularization,
        random_state=42,
    )
    model.fit(coo_train, show_progress=show_progress)
    # NOTE: implicit's MAP@K cannot account for repeated purchases of the same
    # item, which do occur in this dataset.
    # TODO: change MAP@12 to a library that allows repeated items in prediction
    map12 = mean_average_precision_at_k(
        model, csr_train, csr_val, K=12,
        show_progress=show_progress, num_threads=4,
    )
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> MAP@12: {map12:6.5f}")
    return map12
# Build the train/validation matrices once; reused across the whole grid search.
matrices = get_val_matrices(df)
%%time
best_map12 = 0
for factors in [40, 50, 60, 100, 200, 500, 1000]:
for iterations in [3, 12, 14, 15, 20]:
for regularization in [0.01]:
map12 = validate(matrices, factors, iterations, regularization, show_progress=False)
if map12 > best_map12:
best_map12 = map12
best_params = {'factors': factors, 'iterations': iterations, 'regularization': regularization}
print(f"Best MAP@12 found. Updating: {best_params}")
del matrices
Training over the full dataset
# Rebuild the matrices over the FULL filtered dataset (training + validation weeks).
coo_train = to_user_item_coo(df)
csr_train = coo_train.tocsr()
def train(coo_train, factors=200, iterations=15, regularization=0.01, show_progress=True):
    """Fit an ALS model on the full (users x items) training matrix and return it."""
    als = implicit.als.AlternatingLeastSquares(
        factors=factors,
        iterations=iterations,
        regularization=regularization,
        random_state=42,
    )
    als.fit(coo_train, show_progress=show_progress)
    return als
best_params  # display the best hyper-parameters found during validation
model = train(coo_train, **best_params)
Submission
Submission function
def submit(model, csr_train, submission_name="submissions.csv"):
    """Generate top-12 recommendations for every customer, write them to a
    submission CSV, and return the predictions dataframe."""
    batch_size = 2000
    rows = []
    all_user_idx = np.arange(len(ALL_USERS))
    # Recommend in batches to keep memory bounded.
    for start in range(0, len(all_user_idx), batch_size):
        batch = all_user_idx[start : start + batch_size]
        ids, scores = model.recommend(batch, csr_train[batch], N=12, filter_already_liked_items=False)
        for pos, uidx in enumerate(batch):
            # Map internal indices back to the original customer/article ids.
            articles = [item_ids[iidx] for iidx in ids[pos]]
            rows.append((user_ids[uidx], ' '.join(articles)))
    df_preds = pd.DataFrame(rows, columns=['customer_id', 'prediction'])
    df_preds.to_csv(submission_name, index=False)
    display(df_preds.head())
    print(df_preds.shape)
    return df_preds
%%time
# Score every customer and write submissions.csv.
df_preds = submit(model, csr_train);