Item2Vec: Neural Item Embedding for Collaborative Filtering (2016)
Abstract
This work borrows the idea of word2vec to learn embeddings of items in a low-dimensional latent space, which are then used for recommendation in the style of item-based CF. A sequence of items is treated as equivalent to a sequence of words: the training samples are changed from sentences to item sequences, co-occurring items form positive samples, and negative samples are drawn according to the items' probability distribution. Experiments show the method outperforms SVD.
Model
Skip-gram maximizes the objective

\frac{1}{K} \sum_{i=1}^{K} \sum_{-c \leq j \leq c,\, j \neq 0} \log p(w_{i+j} \mid w_i)
Subsampling
To address the imbalance between popular and unpopular items, subsampling is introduced: each word w in an input sequence is discarded with probability
p(\text{discard} \mid w) = 1 - \sqrt{\frac{\rho}{f(w)}}
where f(w) is the frequency of w and \rho is a preset threshold; the paper uses 10^{-5} and 10^{-3} for its two datasets.
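As a quick illustration, here is a minimal sketch of the discard rule (the frequencies and threshold below are made-up values, not the paper's data):

import math
import random

def keep_word(freq, rho=1e-5):
    """Return True if a word with relative frequency `freq` survives subsampling."""
    p_discard = max(0.0, 1.0 - math.sqrt(rho / freq))
    return random.random() >= p_discard

print(keep_word(0.05))   # very frequent item: kept with probability ~1.4%
print(keep_word(1e-6))   # rare item: p_discard clips to 0, always kept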
Negative sampling
To speed up training, negative sampling is used:
p(w_j \mid w_i) = \sigma(u_i^T v_j) \prod_{k=1}^{N} \sigma(-u_i^T v_k)
where \sigma(x) = \frac{1}{1+\exp(-x)} and N is the number of negative samples.
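To make the formula concrete, here is a minimal numpy sketch that scores one positive pair against N negatives (the random vectors are illustrative stand-ins for trained embeddings):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def neg_sampling_prob(u_i, v_j, v_negs):
    """p(w_j | w_i): one positive term times N negative terms."""
    pos = sigmoid(u_i @ v_j)
    neg = np.prod([sigmoid(-u_i @ v_k) for v_k in v_negs])
    return pos * neg

rng = np.random.default_rng(0)
d, N = 8, 5
u_i, v_j = rng.normal(size=d), rng.normal(size=d)
v_negs = rng.normal(size=(N, d))  # N sampled negative items
print(neg_sampling_prob(u_i, v_j, v_negs))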
In item2vec, each sequence is treated as a set: any pair of items that co-occur in it counts as a positive sample. The objective is replaced with
\frac{1}{K} \sum_{i=1}^{K} \sum_{j \neq i}^{K} \log p(w_j \mid w_i)
There are two concrete ways to implement this (a sketch of option (2) follows the list):
(1) make the window large enough that all items in the same sequence count as positive samples;
(2) shuffle the original sequence before each training pass.
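A minimal sketch of option (2), reshuffling each sequence before every pass so that window-based pair generation effectively treats it as a set (corpus and num_epochs are placeholder names):

import random

def shuffled_epochs(corpus, num_epochs):
    """Yield each sequence in a fresh random order, once per epoch."""
    for _ in range(num_epochs):
        for sentence in corpus:
            shuffled = sentence[:]   # copy so the corpus itself stays intact
            random.shuffle(shuffled)
            yield shuffled

for sent in shuffled_epochs([['a', 'b', 'c']], num_epochs=2):
    print(sent)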
Finally, the trained target representation u_i is used as the representation of item i.
Code: https://github.com/zhengjingwei/recommender-system
Discussion
In practice, the embeddings produced by item2vec cluster well.
When recommending by directly ranking embedding similarity and taking the TopN, the results tend to be overly similar items (e.g., different models of the same product).
Code implementation
The following is adapted from TensorFlow's word2vec_basic example.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import tensorflow as tf
import numpy as np
import math
import collections
import random
from six.moves import xrange

local_file = '../data/trans_all_incode.csv'  # one order per line
log_dir = './log'
Reading the data
def read_data(filename):
    """Read one '|'-separated item sequence per line."""
    corpus = []
    words = []
    max_len = 0
    with open(filename, 'r') as f:
        for line in f:
            sentence = line.strip().split('|')
            corpus.append(sentence)
            max_len = max(max_len, len(sentence))
            words += sentence
    print('corpus size: %s sentences, %s words' % (len(corpus), len(words)))
    print('sentence max len: %s' % max_len)
    return corpus, words
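For reference, read_data expects one order per line with item ids joined by '|'; a made-up two-line example of what trans_all_incode.csv might look like:

1001|2043|877
2043|15|1001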
def generate_batch_from_sentence(sentence, num_skips, skip_window):
    """Generate (input, label) pairs from one sentence.

    Note: skip_window is unused here; the window is the whole sentence,
    which is exactly item2vec's set treatment (option (1) above).
    """
    batch_inputs = []
    batch_labels = []
    for i in range(len(sentence)):
        window = list(range(len(sentence)))  # every position in the sentence except i
        window.remove(i)
        sample_index = random.sample(window, min(num_skips, len(window)))
        input_id = word2id.get(sentence[i])
        for index in sample_index:
            label_id = word2id.get(sentence[index])
            batch_inputs.append(input_id)
            batch_labels.append(label_id)
    batch_inputs = np.array(batch_inputs, dtype=np.int32)
    batch_labels = np.array(batch_labels, dtype=np.int32)
    batch_labels = np.reshape(batch_labels, [len(batch_labels), 1])
    return batch_inputs, batch_labels
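A quick sanity check (run standalone; the word2id here is a toy mapping, the real one is built in the next step):

word2id = {'a': 0, 'b': 1, 'c': 2}
inputs, labels = generate_batch_from_sentence(['a', 'b', 'c'], num_skips=2, skip_window=1)
print(inputs.shape, labels.shape)  # (6,) (6, 1): every ordered pair in the sentence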
Build the item-to-id and id-to-item mappings
corpus, words = read_data(local_file)
vocabulary = collections.Counter(words)
count = vocabulary.most_common()  # all (item, frequency) pairs, most frequent first
word2id = dict()
for word, _ in count:
    word2id[word] = len(word2id)
id2word = dict(zip(word2id.values(), word2id.keys()))
Model setup
embedding_size = 100  # embedding dimension
skip_window = 50      # one-sided window size (unused; see generate_batch_from_sentence)
num_skips = 4         # how many context items to sample per input item
num_sampled = 50      # number of negative samples
valid_size = 16       # number of items used for validation
valid_window = 500    # draw validation items from the 500 most frequent items
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
vocabulary_size = len(vocabulary)
batch_size = None     # variable batch size: one sentence per step
graph = tf.Graph()
with graph.as_default():
    # Input data.
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Look up the embeddings of the inputs.
    with tf.name_scope('embeddings'):
        embeddings = tf.Variable(
            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))  # uniform init
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)

    # Initialize the model's internal weight matrix.
    with tf.name_scope('weights'):
        nce_weights = tf.Variable(
            tf.truncated_normal([vocabulary_size, embedding_size],
                                stddev=1.0 / math.sqrt(embedding_size)))
    with tf.name_scope('biases'):
        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # NCE loss.
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(tf.nn.nce_loss(
            weights=nce_weights,
            biases=nce_biases,
            labels=train_labels,
            inputs=embed,
            num_sampled=num_sampled,
            num_classes=vocabulary_size))
    tf.summary.scalar('loss', loss)

    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(0.1).minimize(loss)

    # Cosine similarity between the validation items and all items.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
    normalized_embedding = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embedding, valid_dataset)
    similarity = tf.matmul(valid_embeddings, normalized_embedding, transpose_b=True)

    merged = tf.summary.merge_all()

    # Variable initializer.
    init = tf.global_variables_initializer()
    saver = tf.train.Saver()
Training
num_steps = 1000001
with tf.Session(graph=graph) as sess:
    train_loss_records = collections.deque(maxlen=10)  # keep the 10 most recent losses
    train_loss_k10 = 0
    train_sents_num = 0
    # Write summaries for TensorBoard.
    writer = tf.summary.FileWriter(log_dir, sess.graph)
    # Initialize all variables.
    init.run()
    for step in xrange(num_steps):
        sentence = corpus[step % len(corpus)]  # train one sentence at a time
        batch_inputs, batch_labels = generate_batch_from_sentence(
            sentence, num_skips, skip_window)
        if len(batch_inputs) == 0:  # single-item orders yield no pairs; skip them
            continue
        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}
        _, summary, loss_val = sess.run([optimizer, merged, loss],
                                        feed_dict=feed_dict)
        train_sents_num += 1
        # Write the summary for every step.
        writer.add_summary(summary, step)
        train_loss_records.append(loss_val)
        train_loss_k10 = np.mean(train_loss_records)
        if train_sents_num % 1000 == 0:
            print('{a} sentences processed, loss: {b}'
                  .format(a=train_sents_num, b=train_loss_k10))
        if train_sents_num % 5000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = id2word[valid_examples[i]]
                top_k = 8
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    close_word = id2word[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    final_embeddings = normalized_embedding.eval()
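With final_embeddings in hand, TopN retrieval for a single item is a dot product over the L2-normalized rows. A minimal sketch (this is the naive similarity lookup that, per the Discussion above, tends to surface near-duplicate items; '1001' is a hypothetical item id):

def top_n_similar(item, n=10):
    """Return the n items most similar to `item` by cosine similarity."""
    scores = final_embeddings @ final_embeddings[word2id[item]]  # rows are unit-norm
    best = (-scores).argsort()[1:n + 1]  # rank 0 is the item itself
    return [id2word[i] for i in best]

print(top_n_similar('1001'))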