#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
# File : new_main.py
# Time :2022/4/14 10:49
# Author :huangtaogan
# Email :842960911@qq.com
# Description: train and evaluate a NeuMF recommender (with optional item side
#   features) on artwork thumbs-up data; a KNN model supplies test candidates.
"""
import os
import time
import argparse
import random
from tqdm import tqdm
import pandas as pd
import numpy as np
from surprise import KNNBasic, Dataset, Reader
from sklearn.preprocessing import KBinsDiscretizer
import torch
import torch.nn as nn
import torch.optim as optim
# import torch.utils.data as data
from torch.utils.data import DataLoader
from tensorboardX import SummaryWriter
import model
import config
import util
import data_utils
import evaluate
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=42,
                        help="random seed")
    parser.add_argument("--lr", type=float, default=0.000001,
                        help="learning rate")
    parser.add_argument("--dropout", type=float, default=0.2,
                        help="dropout rate")
    parser.add_argument("--batch_size", type=int, default=256,
                        help="batch size for training")
    parser.add_argument("--epochs", type=int, default=30,
                        help="number of training epochs")
    parser.add_argument("--top_k", type=int, default=10,
                        help="compute metrics@top_k")
    parser.add_argument("--factor_num", type=int, default=16,
                        help="number of predictive factors in the model")
    parser.add_argument("--layers", nargs='+', type=int, default=[64, 32, 16, 8],
                        help="MLP layer sizes. The first layer operates on the "
                             "concatenation of user and item embeddings, so "
                             "layers[0]/2 is the embedding size.")
    parser.add_argument("--num_ng", type=int, default=4,
                        help="number of negative samples per positive for the training set")
    parser.add_argument("--max_num_ng", type=int, default=2000,
                        help="cap on the number of random negative samples per user")
    parser.add_argument("--num_ng_test", type=int, default=100,
                        help="number of negative samples per user for the test set")
    parser.add_argument("--test_candidate_type", type=int, default=0,
                        help="how test negatives are drawn: 0 for random, 1 for KNN")
    # note: passing any value on the command line makes these truthy strings
    parser.add_argument("--out", default=True,
                        help="save the model checkpoint or not")
    parser.add_argument("--item_side_feature", default=True,
                        help="use item side features or not")
    args = parser.parse_args()
    # load and preprocess the raw data
    print("loading origin data ...")
    # sheet names are kept verbatim: 用户点赞 = user thumbs-up, 艺术品信息 = artwork info,
    # 种子标签 = seed tags (categories), 用户评论 = user comments
    art_data = pd.read_excel(config.DATA_PATH + '数据.xlsx', sheet_name="用户点赞", names=["user", "item", "label", "timestamp"])
    item_info = pd.read_excel(config.DATA_PATH + '数据.xlsx', sheet_name='艺术品信息', usecols=[0, 4, 8], names=['item', 'artist', 'cates'])
    cates = pd.read_excel(config.DATA_PATH + '数据.xlsx', sheet_name='种子标签', names=['cate'])
    comments = pd.read_excel(config.DATA_PATH + '数据.xlsx', sheet_name='用户评论', usecols=[1, 2], names=['item', 'user'])
# art_data = pd.merge(art_data, item_info, how="left", on='item')
# all users & items & artists
users = list(art_data.user.drop_duplicates())
items = list(art_data.item.drop_duplicates())
artists = list(item_info.artist.drop_duplicates())
cates = list(cates.cate.drop_duplicates())
    # re-index user, item, artist and cate ids to contiguous integers
    user_to_idx = {user: idx for idx, user in enumerate(users)}
    item_to_idx = {item: idx for idx, item in enumerate(items)}
    artist_to_idx = {artist: idx for idx, artist in enumerate(artists)}
    cate_to_idx = {cate: idx for idx, cate in enumerate(cates)}
    idx_to_user = {idx: user for idx, user in enumerate(users)}
    idx_to_item = {idx: item for idx, item in enumerate(items)}
    idx_to_artist = {idx: artist for idx, artist in enumerate(artists)}
    idx_to_cate = {idx: cate for idx, cate in enumerate(cates)}
art_data["user"] = art_data.user.map(user_to_idx)
art_data["item"] = art_data.item.map(item_to_idx)
comments["user"] = comments.user.map(user_to_idx)
comments["item"] = comments.item.map(item_to_idx)
item_info["item"] = item_info.item.map(item_to_idx)
item_info["artist"] = item_info.artist.map(artist_to_idx)
    # 'cates' holds a comma-separated tag string per item; map it to a list of cate ids
    item_info['cates'] = item_info.cates.map(lambda x: [cate_to_idx[cate] for cate in x.split(',') if cate != ''])
reset_id_users = [user_to_idx[user] for user in users]
reset_id_items = [item_to_idx[item] for item in items]
reset_id_cates = [cate_to_idx[cate] for cate in cates]
reset_id_artists = [artist_to_idx[artist] for artist in artists]
    # count features, discretized into 10 ordinal bins for the embedding lookups in the model
    enc = KBinsDiscretizer(n_bins=10, encode="ordinal")
user_thumbs = art_data[art_data['label'] == 1].groupby("user").agg({'label': "count"}).reset_index()
user_thumbs.columns = ['user', 'user_thumbs']
user_thumbs['user_thumbs'] = enc.fit_transform(user_thumbs[['user_thumbs']]).astype(np.int64)
item_thumbs = art_data[art_data['label'] == 1].groupby("item").agg({'label': "count"}).reset_index()
item_thumbs.columns = ['item', 'item_thumbs']
item_thumbs['item_thumbs'] = enc.fit_transform(item_thumbs[['item_thumbs']]).astype(np.int64)
user_comments = comments.groupby("user").agg({'item': "count"}).reset_index()
user_comments.columns = ['user', 'user_comments']
user_comments['user_comments'] = enc.fit_transform(user_comments[['user_comments']]).astype(np.int64)
item_comments = comments.groupby("item").agg({'user': "count"}).reset_index()
item_comments.columns = ['item', 'item_comments']
item_comments['item_comments'] = enc.fit_transform(item_comments[['item_comments']]).astype(np.int64)
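    # Illustration (hypothetical numbers): with the default quantile strategy,
    # a user with 500 thumbs-ups might land in bin 9 while a user with 2 lands
    # in bin 0; only the bin index, never the raw count, reaches the model.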
    # data split: hold out each user's most recent interaction (leave-one-out)
    last_thumbs_up_art_data = art_data.groupby("user").tail(1)
    history_data = art_data[~art_data.index.isin(last_thumbs_up_art_data.index)]
    # KNN recommendations: cache each user's interacted items from the history split
    user_seen_items = history_data.groupby("user").agg({"item": list}).to_dict()["item"]
# KNN data
df_KNN_train_data = history_data[history_data.label == 1][["user", "item", "label"]]
reader = Reader()
KNN_train_data = Dataset.load_from_df(df_KNN_train_data, reader=reader)
trainset = KNN_train_data.build_full_trainset()
algo = KNNBasic()
algo.fit(trainset)
print("cache user KNN recommendations ...")
    def get_user_recom(algo, user, user_seen_items, top_k_user):
        """Collect unseen items liked by the user's top_k_user nearest neighbours."""
        try:
            user_inner_id = algo.trainset.to_inner_uid(user)
        except ValueError:
            # user has no positive interactions in the trainset
            return []
        user_neighbors = algo.get_neighbors(user_inner_id, k=top_k_user)
        recom_items = []
        for u in user_neighbors:
            raw_uid = algo.trainset.to_raw_uid(u)
            recom_items += user_seen_items.get(raw_uid, [])
        recom_items = list(set(recom_items))
        cur_user_seen_items = set(user_seen_items.get(user, []))
        return [item for item in recom_items if item not in cur_user_seen_items]
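    # Illustrative call (ids are hypothetical): get_user_recom(algo, 0,
    # user_seen_items, top_k_user=5) returns the items liked by user 0's five
    # nearest neighbours that user 0 has not interacted with yet.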
user_recom_items = dict()
for user in tqdm(reset_id_users):
user_recom_items[user] = get_user_recom(algo, user, user_seen_items, 5)
    # train data: keep each user's observed positives/negatives and top up with
    # randomly sampled unseen items as extra negatives
    tini_train_datas = []
    for user in list(history_data["user"].drop_duplicates()):
        pos_tini_train = history_data[(history_data.user == user) & (history_data.label == 1)][["user", "item", "label"]]
        ng_tini_train = history_data[(history_data.user == user) & (history_data.label == 0)][["user", "item", "label"]]
        pos_items = list(pos_tini_train.item)
        ng_items = list(ng_tini_train.item)
        num_ng = max(0, min(len(pos_items) * args.num_ng - len(ng_items), args.max_num_ng))
        candidate_items = list(set(reset_id_items) - set(pos_items + ng_items))
        random_ng_items = random.sample(candidate_items, min(num_ng, len(candidate_items)))
        # sampled negatives must be labelled 0, not 1
        random_ng_tini_train = pd.DataFrame({"user": user, "item": random_ng_items, "label": 0})
        tini_train_datas.append(pd.concat([pos_tini_train, ng_tini_train, random_ng_tini_train]))
train_sample = pd.concat(tini_train_datas)
train_sample = pd.merge(train_sample, item_info, how='left', on='item')
train_sample = pd.merge(train_sample, user_thumbs, how='left', on='user')
train_sample = pd.merge(train_sample, user_comments, how='left', on='user')
train_sample = pd.merge(train_sample, item_thumbs, how='left', on='item')
train_sample = pd.merge(train_sample, item_comments, how='left', on='item')
train_sample.fillna(0, inplace=True)
    # test data: one held-out positive per user plus num_ng_test negatives,
    # drawn either at random or from the KNN candidate generator; users whose
    # last interaction is not a positive are skipped so every test batch holds
    # exactly num_ng_test + 1 rows
    tini_test_datas = []
    test_users = last_thumbs_up_art_data[last_thumbs_up_art_data.label == 1]
    for user in list(test_users["user"].drop_duplicates()):
        pos_tini_test = test_users[test_users.user == user][["user", "item", "label"]]
        pos_items = set(pos_tini_test.item)
        # exclude both seen items and the held-out positive from the negatives pool
        candidate_items = list(set(reset_id_items) - set(user_seen_items.get(user, [])) - pos_items)
        if args.test_candidate_type == 0:
            ng_items = random.sample(candidate_items, args.num_ng_test)
        else:
            KNN_recom_items = [i for i in user_recom_items.get(user, []) if i not in pos_items][:args.num_ng_test]
            num_fill_item = args.num_ng_test - len(KNN_recom_items)
            fill_items = random.sample([i for i in candidate_items if i not in KNN_recom_items], num_fill_item)
            ng_items = KNN_recom_items + fill_items
        random_ng_tini_test = pd.DataFrame({"user": user, "item": ng_items, "label": 0})
        tini_test_datas.append(pd.concat([pos_tini_test, random_ng_tini_test]))
test_sample = pd.concat(tini_test_datas)
test_sample = pd.merge(test_sample, item_info, how='left', on='item')
test_sample = pd.merge(test_sample, user_thumbs, how='left', on='user')
test_sample = pd.merge(test_sample, user_comments, how='left', on='user')
test_sample = pd.merge(test_sample, item_thumbs, how='left', on='item')
test_sample = pd.merge(test_sample, item_comments, how='left', on='item')
test_sample.fillna(0, inplace=True)
    # set device and TensorBoard writer
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    writer = SummaryWriter()
    # seed for reproducibility
    util.seed_everything(args.seed)
    # vocabulary sizes; +1 leaves room for an out-of-vocabulary / padding index
    num_users = len(users) + 1
    num_items = len(items) + 1
    num_artists = len(artists) + 1
    # construct the train and test datasets; column order must match the
    # Rating_Datset signature: user, item, label(rating), artist, cates,
    # user_thumbs, user_comments, item_thumbs, item_comments
    data_columns = list(train_sample.columns)
    train_dataset = data_utils.Rating_Datset(*[list(train_sample[col]) for col in data_columns])
    test_dataset = data_utils.Rating_Datset(*[list(test_sample[col]) for col in data_columns])
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=4)
    # each test batch holds exactly one user: 1 positive + num_ng_test negatives
    test_loader = DataLoader(test_dataset, batch_size=args.num_ng_test+1, shuffle=False)
# set model and loss, optimizer
    # note: the variable `model` shadows the imported `model` module from here on
    model = model.NeuMF(args, num_users, num_items, num_artists)
model = model.to(device)
loss_function = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=args.lr)
# train, evaluation
    best_hr, best_ndcg, best_epoch = 0, 0, 0
for epoch in range(1, args.epochs+1):
        model.train()  # enable dropout and other train-mode layers (if any)
start_time = time.time()
for user, item, artist, cates, user_thumbs, user_comments, item_thumbs, item_comments, label in tqdm(train_loader):
user = user.to(device)
item = item.to(device)
artist = artist.to(device)
            cates = cates.reshape([-1, 34]).to(device)  # 34 = number of seed tags (multi-hot)
user_thumbs = user_thumbs.to(device)
user_comments = user_comments.to(device)
item_thumbs = item_thumbs.to(device)
item_comments = item_comments.to(device)
label = label.to(device)
optimizer.zero_grad()
prediction = model(user, item, artist, cates, user_thumbs, user_comments, item_thumbs, item_comments)
loss = loss_function(prediction, label)
loss.backward()
optimizer.step()
            writer.add_scalar('loss/Train_loss', loss.item(), epoch)  # note: logged per batch at the same epoch step
model.eval()
        HR, NDCG, LOSS = evaluate.metrics(model, test_loader, args.top_k, loss_function, device)
        writer.add_scalar('Performance/HR@{}'.format(args.top_k), HR, epoch)
        writer.add_scalar('Performance/NDCG@{}'.format(args.top_k), NDCG, epoch)
        writer.add_scalar('loss/test_loss', LOSS, epoch)
        elapsed_time = time.time() - start_time
        print("The time elapse of epoch {:03d}".format(epoch) + " is: " +
              time.strftime("%H: %M: %S", time.gmtime(elapsed_time)))
        print("LOSS: {:.3f}\tHR: {:.3f}\tNDCG: {:.3f}".format(LOSS, HR, NDCG))
if HR > best_hr:
best_hr, best_ndcg, best_epoch = HR, NDCG, epoch
            if args.out:
                os.makedirs(config.MODEL_PATH, exist_ok=True)
                torch.save(model,
                           '{}{}.pth'.format(config.MODEL_PATH, config.MODEL))
time.sleep(2)
writer.close()
print("End. Best epoch {:03d}: HR = {:.3f}, NDCG = {:.3f}".format(
best_epoch, best_hr, best_ndcg))
# File : model.py
import torch
import torch.nn as nn
class Generalized_Matrix_Factorization(nn.Module):
def __init__(self, args, num_users, num_items):
super(Generalized_Matrix_Factorization, self).__init__()
self.num_users = num_users
self.num_items = num_items
self.factor_num = args.factor_num
self.embedding_user = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.factor_num)
self.embedding_item = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.factor_num)
self.affine_output = nn.Linear(in_features=self.factor_num, out_features=1)
self.logistic = nn.Sigmoid()
def forward(self, user_indices, item_indices):
user_embedding = self.embedding_user(user_indices)
item_embedding = self.embedding_item(item_indices)
element_product = torch.mul(user_embedding, item_embedding)
logits = self.affine_output(element_product)
rating = self.logistic(logits)
return rating
def init_weight(self):
pass
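# Illustrative GMF usage (toy sizes; the values are hypothetical):
#   gmf = Generalized_Matrix_Factorization(args, num_users=10, num_items=20)
#   gmf(torch.tensor([0]), torch.tensor([1]))  # -> probability-like score, shape [1, 1]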
class Multi_Layer_Perceptron(nn.Module):
    def __init__(self, args, num_users, num_items):
        super(Multi_Layer_Perceptron, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.factor_num = args.factor_num
        self.layers = args.layers
        # embedding size is layers[0] // 2 so that the concatenated user+item
        # vector matches the first Linear layer's in_features
        self.embedding_user = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.layers[0] // 2)
        self.embedding_item = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.layers[0] // 2)
        self.fc_layers = nn.ModuleList()
        for in_size, out_size in zip(self.layers[:-1], self.layers[1:]):
            self.fc_layers.append(nn.Linear(in_size, out_size))
        self.affine_output = nn.Linear(in_features=self.layers[-1], out_features=1)
        self.logistic = nn.Sigmoid()

    def forward(self, user_indices, item_indices):
        user_embedding = self.embedding_user(user_indices)
        item_embedding = self.embedding_item(item_indices)
        vector = torch.cat([user_embedding, item_embedding], dim=-1)  # the concat latent vector
        for layer in self.fc_layers:
            vector = layer(vector)
            vector = nn.ReLU()(vector)
            # vector = nn.BatchNorm1d()(vector)
            # vector = nn.Dropout(p=0.5)(vector)
        logits = self.affine_output(vector)
        rating = self.logistic(logits)
        return rating

    def init_weight(self):
        pass
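# Shape sketch for the MLP tower (assuming the default layers [64, 32, 16, 8]):
#   user/item ids [B] -> embeddings [B, 32] each -> concat [B, 64]
#   -> Linear+ReLU chain [B, 32] -> [B, 16] -> [B, 8] -> affine [B, 1] -> sigmoid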
class NeuMF(nn.Module):
    def __init__(self, args, num_users, num_items, num_artists):
        super(NeuMF, self).__init__()
        self.num_users = num_users
        self.num_items = num_items
        self.num_artists = num_artists
        self.factor_num_mf = args.factor_num
        self.layers = args.layers
        self.dropout = args.dropout
        self.item_side_feature = args.item_side_feature
        if self.item_side_feature:
            # the MLP input concatenates four id/cate embeddings of equal size
            # plus four 1-dim count embeddings, so each gets (layers[0]-4)/4 dims
            self.factor_num_mlp = (args.layers[0] - 4) // 4
        else:
            # only user and item embeddings are concatenated
            self.factor_num_mlp = args.layers[0] // 2
        self.embedding_user_mlp = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.factor_num_mlp)
        self.embedding_item_mlp = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.factor_num_mlp)
        if self.item_side_feature:
            self.embedding_artist_mlp = nn.Embedding(num_embeddings=self.num_artists, embedding_dim=self.factor_num_mlp)
            # 34 = number of seed tags; projects the multi-hot cate vector
            self.embedding_item_cates = nn.Linear(in_features=34, out_features=self.factor_num_mlp)
            self.embedding_item_thumbs = nn.Embedding(num_embeddings=10, embedding_dim=1)
            self.embedding_item_comments = nn.Embedding(num_embeddings=10, embedding_dim=1)
            self.embedding_user_thumbs = nn.Embedding(num_embeddings=10, embedding_dim=1)
            self.embedding_user_comments = nn.Embedding(num_embeddings=10, embedding_dim=1)
        self.embedding_user_mf = nn.Embedding(num_embeddings=self.num_users, embedding_dim=self.factor_num_mf)
        self.embedding_item_mf = nn.Embedding(num_embeddings=self.num_items, embedding_dim=self.factor_num_mf)
        self.fc_layers = nn.ModuleList()
        for in_size, out_size in zip(args.layers[:-1], args.layers[1:]):
            self.fc_layers.append(torch.nn.Linear(in_size, out_size))
            self.fc_layers.append(nn.ReLU())
            self.fc_layers.append(nn.Dropout(p=self.dropout))  # args.dropout was previously stored but unused
        self.affine_output = nn.Linear(in_features=args.layers[-1] + self.factor_num_mf, out_features=1)
        self.logistic = nn.Sigmoid()
        self.init_weight()
def init_weight(self):
nn.init.normal_(self.embedding_user_mlp.weight, std=0.01)
nn.init.normal_(self.embedding_item_mlp.weight, std=0.01)
nn.init.normal_(self.embedding_user_mf.weight, std=0.01)
nn.init.normal_(self.embedding_item_mf.weight, std=0.01)
if self.item_side_feature:
nn.init.normal_(self.embedding_artist_mlp.weight, std=0.01)
nn.init.normal_(self.embedding_item_cates.weight, std=0.01)
nn.init.normal_(self.embedding_user_thumbs.weight, std=0.01)
nn.init.normal_(self.embedding_user_comments.weight, std=0.01)
nn.init.normal_(self.embedding_item_thumbs.weight, std=0.01)
nn.init.normal_(self.embedding_item_comments.weight, std=0.01)
for m in self.fc_layers:
if isinstance(m, nn.Linear):
nn.init.xavier_uniform_(m.weight)
nn.init.xavier_uniform_(self.affine_output.weight)
for m in self.modules():
if isinstance(m, nn.Linear) and m.bias is not None:
m.bias.data.zero_()
def forward(self, user_indices, item_indices, artist_indices=None, item_cates=None, user_thumbs=None,
user_comments=None, item_thumbs=None, item_comments=None):
user_embedding_mlp = self.embedding_user_mlp(user_indices)
item_embedding_mlp = self.embedding_item_mlp(item_indices)
if self.item_side_feature:
artist_embedding_mlp = self.embedding_artist_mlp(artist_indices)
item_cates_embedding_mlp = self.embedding_item_cates(item_cates)
item_thumbs_embedding_mlp = self.embedding_item_thumbs(item_thumbs)
item_comments_embedding_mlp = self.embedding_item_comments(item_comments)
user_thumbs_embedding_mlp = self.embedding_user_thumbs(user_thumbs)
user_comments_embedding_mlp = self.embedding_user_comments(user_comments)
user_embedding_mf = self.embedding_user_mf(user_indices)
item_embedding_mf = self.embedding_item_mf(item_indices)
if self.item_side_feature:
mlp_vector = torch.cat([user_embedding_mlp, item_embedding_mlp, artist_embedding_mlp,
item_cates_embedding_mlp, item_thumbs_embedding_mlp, item_comments_embedding_mlp,
user_thumbs_embedding_mlp, user_comments_embedding_mlp],
dim=-1) # the concat latent vector
else:
mlp_vector = torch.cat([user_embedding_mlp, item_embedding_mlp], dim=-1) # the concat latent vector
        mf_vector = torch.mul(user_embedding_mf, item_embedding_mf)
        for layer in self.fc_layers:
            mlp_vector = layer(mlp_vector)
vector = torch.cat([mlp_vector, mf_vector], dim=-1)
logits = self.affine_output(vector)
rating = self.logistic(logits)
return rating.squeeze()
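
# A minimal smoke test for NeuMF, runnable as `python model.py`. The sizes and
# ids below are toy placeholders, not taken from the real dataset.
if __name__ == "__main__":
    from types import SimpleNamespace
    toy_args = SimpleNamespace(factor_num=16, layers=[64, 32, 16, 8],
                               dropout=0.2, item_side_feature=True)
    net = NeuMF(toy_args, num_users=10, num_items=20, num_artists=5)
    user = torch.tensor([3])
    item = torch.tensor([7])
    artist = torch.tensor([2])
    cates = torch.zeros(1, 34)   # multi-hot cate vector
    bucket = torch.tensor([0])   # discretized count feature, bin index in 0..9
    score = net(user, item, artist, cates, bucket, bucket, bucket, bucket)
    print(score)  # a single sigmoid score in (0, 1)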
# File : evaluate.py
import numpy as np
import torch
def hit(gt_items, pred_items):
    """Return 1 if any ground-truth item appears in the top-k recommendations."""
    if isinstance(gt_items, int):
        gt_items = [gt_items]
    for i in gt_items:
        if i in pred_items:
            return 1
    return 0


def ndcg(gt_items, pred_items):
    """Sum 1/log2(rank+1) over the ground-truth items found in the top-k list."""
    ndcg = 0
    if isinstance(gt_items, int):
        gt_items = [gt_items]
    for i in gt_items:
        if i in pred_items:
            index = pred_items.index(i) + 1
            ndcg += 1 / np.log2(index + 1)
    return ndcg
def metrics(model, test_loader, top_k, loss_fun, device):
    HR, NDCG, LOSS = [], [], []
    with torch.no_grad():  # evaluation only; no gradients needed
        for user, item, artist, cates, user_thumbs, user_comments, item_thumbs, item_comments, label in test_loader:
            user = user.to(device)
            item = item.to(device)
            artist = artist.to(device)
            cates = cates.reshape([-1, 34]).to(device)
            user_thumbs = user_thumbs.to(device)
            user_comments = user_comments.to(device)
            item_thumbs = item_thumbs.to(device)
            item_comments = item_comments.to(device)
            label = label.to(device)  # must live on the same device as predictions
            predictions = model(user, item, artist, cates, user_thumbs, user_comments, item_thumbs, item_comments)
            loss = loss_fun(predictions, label).detach().cpu().item()
            _, indices = torch.topk(predictions, top_k)
            recommends = torch.take(item, indices).cpu().numpy().tolist()
            gt_indices = torch.nonzero(label == 1).flatten()
            gt_items = torch.take(item, gt_indices).cpu().numpy().tolist()
            LOSS.append(loss)
            HR.append(hit(gt_items, recommends))
            NDCG.append(ndcg(gt_items, recommends))
    return np.mean(HR), np.mean(NDCG), np.mean(LOSS)
if __name__ == "__main__":
print(ndcg([1, 3], [3, 1, 2]))
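    # Additional quick check (illustrative values): the positive set {1, 3}
    # against the ranked list [3, 1, 2].
    print(hit([1, 3], [3, 1, 2]))  # -> 1 (item 3 is ranked first)
    # note: ndcg above evaluates to 1/log2(2) + 1/log2(3) ≈ 1.631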
# File : data_utils.py
import random
import numpy as np
import pandas as pd
import torch
import config
class NCF_Data(object):
    """
    Construct train/test DataLoaders for NCF from a (user_id, item_id, rating,
    timestamp) ratings frame. Kept from the original NCF template; new_main.py
    builds its Rating_Datset instances directly instead.
    """
    def __init__(self, args, ratings):
        random.seed(args.seed)  # seed before any negative sampling below
        self.ratings = ratings
        self.num_ng = args.num_ng
        self.num_ng_test = args.num_ng_test
        self.batch_size = args.batch_size
        self.preprocess_ratings = self._reindex(self.ratings)
        self.user_pool = set(self.ratings['user_id'].unique())
        self.item_pool = set(self.ratings['item_id'].unique())
        self.train_ratings, self.test_ratings = self._leave_one_out(self.preprocess_ratings)
        self.negatives = self._negative_sampling(self.preprocess_ratings)
def _reindex(self, ratings):
"""
Process dataset to reindex userID and itemID, also set rating as binary feedback
"""
user_list = list(ratings['user_id'].drop_duplicates())
user2id = {w: i for i, w in enumerate(user_list)}
item_list = list(ratings['item_id'].drop_duplicates())
item2id = {w: i for i, w in enumerate(item_list)}
ratings['user_id'] = ratings['user_id'].apply(lambda x: user2id[x])
ratings['item_id'] = ratings['item_id'].apply(lambda x: item2id[x])
ratings['rating'] = ratings['rating'].apply(lambda x: float(x > 0))
return ratings
def _leave_one_out(self, ratings):
"""
leave-one-out evaluation protocol in paper https://www.comp.nus.edu.sg/~xiangnan/papers/ncf.pdf
"""
ratings['rank_latest'] = ratings.groupby(['user_id'])['timestamp'].rank(method='first', ascending=False)
test = ratings.loc[ratings['rank_latest'] == 1]
train = ratings.loc[ratings['rank_latest'] > 1]
        assert train['user_id'].nunique() == test['user_id'].nunique(), 'train and test user counts do not match'
return train[['user_id', 'item_id', 'rating']], test[['user_id', 'item_id', 'rating']]
def _negative_sampling(self, ratings):
interact_status = (
ratings.groupby('user_id')['item_id']
.apply(set)
.reset_index()
.rename(columns={'item_id': 'interacted_items'}))
interact_status['negative_items'] = interact_status['interacted_items'].apply(lambda x: self.item_pool - x)
        # random.sample needs a sequence, not a set, on Python 3.11+
        interact_status['negative_samples'] = interact_status['negative_items'].apply(lambda x: random.sample(list(x), self.num_ng_test))
return interact_status[['user_id', 'negative_items', 'negative_samples']]
def get_train_instance(self):
users, items, ratings = [], [], []
train_ratings = pd.merge(self.train_ratings, self.negatives[['user_id', 'negative_items']], on='user_id')
        train_ratings['negatives'] = train_ratings['negative_items'].apply(lambda x: random.sample(list(x), self.num_ng))
for row in train_ratings.itertuples():
users.append(int(row.user_id))
items.append(int(row.item_id))
ratings.append(float(row.rating))
for i in range(self.num_ng):
users.append(int(row.user_id))
items.append(int(row.negatives[i]))
ratings.append(float(0)) # negative samples get 0 rating
dataset = Rating_Datset(
user_list=users,
item_list=items,
rating_list=ratings)
return torch.utils.data.DataLoader(dataset, batch_size=self.batch_size, shuffle=True, num_workers=4)
def get_test_instance(self):
users, items, ratings = [], [], []
test_ratings = pd.merge(self.test_ratings, self.negatives[['user_id', 'negative_samples']], on='user_id')
for row in test_ratings.itertuples():
users.append(int(row.user_id))
items.append(int(row.item_id))
ratings.append(float(row.rating))
for i in getattr(row, 'negative_samples'):
users.append(int(row.user_id))
items.append(int(i))
ratings.append(float(0))
dataset = Rating_Datset(
user_list=users,
item_list=items,
rating_list=ratings)
return torch.utils.data.DataLoader(dataset, batch_size=self.num_ng_test+1, shuffle=False, num_workers=4)
class Rating_Datset(torch.utils.data.Dataset):
    # side-feature lists default to None so NCF_Data, which only supplies
    # (user, item, rating), can still construct the dataset
    def __init__(self, user_list, item_list, rating_list, artist_list=None, cates_list=None,
                 user_thumbs_list=None, user_comments_list=None, item_thumbs_list=None,
                 item_comments_list=None):
        super(Rating_Datset, self).__init__()
        self.user_list = user_list
        self.item_list = item_list
        self.artist_list = artist_list
        self.cates_list = cates_list
        self.user_thumbs_list = user_thumbs_list
        self.user_comments_list = user_comments_list
        self.item_thumbs_list = item_thumbs_list
        self.item_comments_list = item_comments_list
        self.rating_list = rating_list
    def __len__(self):
        return len(self.user_list)

    def __getitem__(self, idx):
        user = self.user_list[idx]
        item = self.item_list[idx]
        rating = self.rating_list[idx]
        if self.artist_list is None:
            # id/rating-only mode (used by NCF_Data)
            return (
                torch.tensor(user, dtype=torch.long),
                torch.tensor(item, dtype=torch.long),
                torch.tensor(rating, dtype=torch.float)
            )
        artist = self.artist_list[idx]
        # items missing from item_info end up as 0 after fillna; treat as "no cates"
        cates = self.cates_list[idx] if isinstance(self.cates_list[idx], list) else []
        user_thumbs = self.user_thumbs_list[idx]
        user_comments = self.user_comments_list[idx]
        item_thumbs = self.item_thumbs_list[idx]  # was mistakenly read from item_comments_list
        item_comments = self.item_comments_list[idx]
        return (
            torch.tensor(user, dtype=torch.long),
            torch.tensor(item, dtype=torch.long),
            torch.tensor(artist, dtype=torch.long),
            torch.zeros(1, 34).index_fill(1, torch.tensor(cates, dtype=torch.long), 1),
            torch.tensor(user_thumbs, dtype=torch.long),
            torch.tensor(user_comments, dtype=torch.long),
            torch.tensor(item_thumbs, dtype=torch.long),
            torch.tensor(item_comments, dtype=torch.long),
            torch.tensor(rating, dtype=torch.float)
        )
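
# Minimal smoke test for Rating_Datset, runnable as `python data_utils.py`.
# All values below are toy placeholders, not real data.
if __name__ == "__main__":
    ds = Rating_Datset(
        user_list=[0], item_list=[1], rating_list=[1.0],
        artist_list=[2], cates_list=[[0, 3]],
        user_thumbs_list=[4], user_comments_list=[2],
        item_thumbs_list=[7], item_comments_list=[1])
    sample = ds[0]
    print([t.shape for t in sample])  # cate vector is [1, 34]; the rest are scalars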