本文研读Collaborative Memory Network for Recommendation Systems论文复现代码,源代码来源于https://codeload.github.com/IamAdiSri/cmn4recosys/zip/master
文章目录
1. 论文思想
本文采用记忆神经网络(Memory Network)的框架,结合协同过滤(CF)和attention机制来选择记忆单元,预测用户对item的评分。
2. 论文复现
2.1 获取训练数据
源码所使用的数据中用户数目为16980 ,item数目为5551 ,最长的用户历史序列为311
self._data['train_data']中train data的每一个训练样本为 [user id, item id]
self._data['test_data']是一个字典,其中每一项数据的格式为{userid: (pos_id, [neg_id1, neg_id2, …])},即每一个测试样本包含1个正样例和100个负样例
下面的类成员函数get_data()会以生成器的形式逐批返回batch_size大小的训练数据,每个batch中的每一条数据格式为[user_idx, item_idx, neg_item_idx];当neighborhood为True时,还会分别为item_idx和neg_item_idx生成它们的邻居。每个正样本(user, item)会生成neg_count条训练数据。
class Dataset(object):
def __init__(self, filename):
"""
Wraps dataset and produces batches for the model to consume
:param filename: path to training data for npz file
"""
self._data = np.load(filename, allow_pickle=True)
self.train_data = self._data['train_data'][:, :2]
self.test_data = self._data['test_data'].tolist()
self._train_index = np.arange(len(self.train_data), dtype=np.uint)
self._n_users, self._n_items = self.train_data.max(axis=0) + 1
# Neighborhoods 生成每个item和用户的邻居
self.user_items = defaultdict(set)
self.item_users = defaultdict(set)
for u, i in self.train_data:
self.user_items[u].add(i)
self.item_users[i].add(u)
# Get a list version so we do not need to perform type casting将字典中值类型为集合的转换成
#成列表
self.item_users_list = {
k: list(v) for k, v in self.item_users.items()}
self._max_user_neighbors = max([len(x) for x in self.item_users.values()])
self.user_items = dict(self.user_items)
self.item_users = dict(self.item_users)
@property
def train_size(self):
"""
:return: number of examples in training set
:rtype: int
"""
return len(self.train_data)
@property
def user_count(self):
"""
Number of users in dataset
"""
return self._n_users
@property
def item_count(self):
"""
Number of items in dataset
"""
return self._n_items
def _sample_item(self):
"""
Draw an item uniformly
"""
return np.random.randint(0, self.item_count)
def _sample_negative_item(self, user_id):
"""
Uniformly sample a negative item
"""
if user_id > self.user_count:
raise ValueError("Trying to sample user id: {} > user count: {}".format(
user_id, self.user_count))
n = self._sample_item()
positive_items = self.user_items[user_id]
if len(positive_items) >= self.item_count:
raise ValueError("The User has rated more items than possible %s / %s" % (
len(positive_items), self.item_count))
#进行负采样,用户未购买过的记录
while n in positive_items or n not in self.item_users:
n = self._sample_item()
return n
def _generate_data(self, neg_count):
idx = 0
self._examples = np.zeros((self.train_size*neg_count, 3),
dtype=np.uint32)
self._examples[:, :] = 0
for user_idx, item_idx in self.train_data:
#每个用户生成四条训练数据
for _ in range(neg_count):
neg_item_idx = self._sample_negative_item(user_idx)
self._examples[idx, :] = [user_idx, item_idx, neg_item_idx]
idx += 1
def get_data(self, batch_size: int, neighborhood: bool, neg_count: int):
"""
Batch data together as (user, item, negative item), pos_neighborhood,
length of neighborhood, negative_neighborhood, length of negative neighborhood
if neighborhood is False returns only user, item, negative_item so we
can reuse this for non-neighborhood-based methods.
:param batch_size: size of the batch
:param neighborhood: return the neighborhood information or not
:param neg_count: number of negative samples to uniformly draw per a pos
example
:return: generator
"""
# Allocate inputs
batch = np.zeros((batch_size, 3), dtype=np.uint32)
pos_neighbor = np.zeros((batch_size, self._max_user_neighbors), dtype=np.int32)
pos_length = np.zeros(batch_size, dtype=np.int32)
neg_neighbor = np.zeros((batch_size, self._max_user_neighbors), dtype=np.int32)
neg_length = np.zeros(batch_size, dtype=np.int32)
# Shuffle index 打乱数据
np.random.shuffle(self._train_index)
idx = 0
for