item2vec code

# -*- coding: utf-8 -*-
"""
@author:taoshouzheng
@time:2019/8/20 10:46
@email:tsz1216@sina.com
"""

import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import Counter


class SGNS(nn.Module):

	def __init__(self, vocab_size, projection_dim):

		super(SGNS, self).__init__()

		self.embedding_v = nn.Embedding(vocab_size, projection_dim)		# center (input) embedding
		self.embedding_u = nn.Embedding(vocab_size, projection_dim)		# context (output) embedding
		self.log_sigmoid = nn.LogSigmoid()

		init_range = (2.0 / (vocab_size + projection_dim)) ** 0.5		# Xavier-style init range
		self.embedding_v.weight.data.uniform_(-init_range, init_range)
		self.embedding_u.weight.data.uniform_(-init_range, init_range)

	def forward(self, center_words, target_words, negative_words):
		center_embeds = self.embedding_v(center_words)		# B x 1 x D
		target_embeds = self.embedding_u(target_words)		# B x 1 x D
		neg_embeds = -self.embedding_u(negative_words)		# B x K x D, negated for the negative term
		positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)		# B x 1
		negative_score = torch.sum(neg_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2), 1).view(center_words.size(0), -1)		# B x K ---> B x 1
		loss = self.log_sigmoid(positive_score) + self.log_sigmoid(negative_score)

		return -torch.mean(loss)

	# return the embedding vectors for the given item indices
	def prediction(self, inputs):
		embeds = self.embedding_v(inputs)
		return embeds
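
# For reference, forward() above implements a variant of the skip-gram negative-
# sampling (SGNS) objective. With center embedding v_c (from embedding_v), context
# embedding u_o and negatives u_1..u_K (from embedding_u), the standard per-pair loss is
#     L = -log sigmoid(u_o . v_c) - sum_{k=1..K} log sigmoid(-u_k . v_c)
# Negating neg_embeds supplies the -u_k . v_c term; note the code sums the K negative
# dot products before the log-sigmoid (i.e. log sigmoid(sum_k -u_k . v_c)), a common
# simplification in tutorial implementations rather than the exact sum of K log-sigmoids.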


# yield mini-batches of training pairs, plus a final smaller remainder batch
def get_batch_sample(bat_size, tra_data):
	random.shuffle(tra_data)		# shuffle the training data in place
	s_index = 0
	e_index = bat_size

	while e_index < len(tra_data):
		bat = tra_data[s_index: e_index]
		s_index = e_index
		e_index = e_index + bat_size
		yield bat

	if e_index >= len(tra_data):
		bat = tra_data[s_index:]
		yield bat
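
# A toy illustration of the batching behavior (hypothetical input; the initial
# shuffle makes the actual order vary):
#     list(get_batch_sample(2, [[1, 2], [3, 4], [5, 6]]))
#     -> [[[1, 2], [3, 4]], [[5, 6]]]		(one full batch, then the remainder)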


# generate positive (center, context) training pairs
def get_positive_sample(samp_lists):
	"""
	:param samp_lists: 2-D list; each sublist is one sequence of item indices
	:return: two flat lists: center words and their context words
	"""

	positive_samples = []

	for sublist in samp_lists:

		sublist_length = len(sublist)

		# pair every item with every other item in the same sequence;
		# enumerate keeps positions distinct even when items repeat
		for ite_index, ite in enumerate(sublist):

			for j in range(sublist_length):

				if ite_index != j:
					positive_samples.append([ite, sublist[j]])

	target_words = []
	context_words = []

	for word_pair in positive_samples:
		target_words.append(word_pair[0])
		context_words.append(word_pair[1])

	return target_words, context_words		# two flat lists
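
# A quick worked example of the pairing logic on a hypothetical toy sequence:
#     get_positive_sample([[10, 20, 30]])
#     -> target_words  = [10, 10, 20, 20, 30, 30]
#        context_words = [20, 30, 10, 30, 10, 20]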


# draw k negative samples (as indices) for each positive pair
def get_negative_sample(centers, targets, un_table, item2idx, k):

	batch_size = len(targets)

	negative_samples = []

	for i in range(batch_size):

		neg_sample = []
		center_index = centers[i][0]		# center item index
		target_index = targets[i][0]		# context item index

		while len(neg_sample) < k:

			neg = random.choice(un_table)		# neg is a raw item id
			neg_index = item2idx[neg]		# map it into the model's index space before comparing
			if neg_index == target_index or neg_index == center_index:
				continue
			neg_sample.append(neg_index)
		negative_samples.append(neg_sample)

	# returns a 2-D list of shape batch_size x k
	return negative_samples


if __name__ == '__main__':

	movie_lists = []

	with open(r'E:\Experiment\Algorithms\Item2vec-pytorch\1.csv', 'r', encoding='utf8') as f:
		contents = f.readlines()
		for content in contents:
			content = content.strip().split(',')
			if content[0] == '':
				continue
			movie_list = [int(m) for m in content]
			if len(movie_list) > 1:
				movie_lists.append(movie_list)		# 2-D list: one item sequence per CSV line

	fla = lambda k: [i for sublist in k for i in sublist]		# flatten a 2-D list
	item_counter = Counter(fla(movie_lists))		# frequency of each item
	item = [w for w, c in item_counter.items()]		# list of unique items

	# build item <-> index mappings
	item2index = {}
	for vo in item:
		if item2index.get(vo) is None:
			item2index[vo] = len(item2index)

	index2item = {v: k for k, v in item2index.items()}

	# re-encode each sequence with contiguous item indices
	new_movie_lists = []
	for m in movie_lists:
		m = [item2index[n] for n in m]
		new_movie_lists.append(m)

	cent_words, cont_words = get_positive_sample(new_movie_lists)  # flat lists of center / context indices

	# build the unigram table for negative sampling; it stores raw item ids,
	# which get_negative_sample maps back to indices
	uni_table = []
	f = sum([item_counter[it] ** 0.75 for it in item])
	z = 0.0001
	for it in item:
		uni_table.extend([it] * int(((item_counter[it] ** 0.75) / f) / z))
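
	# The table above approximates word2vec's negative-sampling distribution: item i
	# is drawn with probability proportional to its count raised to the 3/4 power,
	#     P(i) = count(i)**0.75 / sum_j(count(j)**0.75)
	# and occupies roughly P(i) / z slots in uni_table (z = 0.0001), so random.choice
	# over the table samples from approximately this distribution.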

	train_data = [[cent_words[i], cont_words[i]] for i in range(len(cent_words))]		# list of [center, context] pairs

	item2vec = SGNS(len(item), 10)		# vocab size = number of unique items, embedding dim = 10
	print(item2vec)
	optimizer = optim.Adam(item2vec.parameters(), lr=0.001)

	for epoch in range(1):

		for i, batch in enumerate(get_batch_sample(100, train_data)):

			target = [[p[0]] for p in batch]
			context = [[q[1]] for q in batch]
			negative = get_negative_sample(centers=target, targets=context, un_table=uni_table, item2idx=item2index, k=10)

			target = torch.LongTensor(target)
			context = torch.LongTensor(context)
			negative = torch.LongTensor(negative)

			optimizer.zero_grad()

			loss = item2vec(target, context, negative)

			loss.backward()
			optimizer.step()

			print('Epoch : %d, Batch : %d, loss : %.04f' % (epoch + 1, i + 1, loss.item()))

	item2vec.eval()

	# sanity-check the learned embedding for index 1
	print(item2vec.prediction(torch.LongTensor([1])))
	print(item2vec.prediction(torch.LongTensor([1])).data.numpy().tolist())
	print(item2vec.prediction(torch.LongTensor([1])).data.size())

	item_embeddings = []

	# serialize every item's embedding as "item_id;v1,v2,...,v10"
	for itm, index in item2index.items():
		vec = torch.flatten(item2vec.prediction(torch.LongTensor([index]))).data.numpy().tolist()
		item_embedding_str = str(itm) + ';' + ','.join([str(a) for a in vec])
		item_embeddings.append(item_embedding_str)

	with open(r'E:\Experiment\Algorithms\Item2vec-pytorch\em.txt', 'w', encoding='utf8') as g:
		for st in item_embeddings:
			g.write(st + '\n')

	print('Done!')
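
Once em.txt has been written, the embeddings can be loaded back for similarity queries. Below is a minimal sketch (hypothetical helpers, not part of the original script), assuming the "item_id;v1,v2,...,v10" line format produced above:

import numpy as np

def load_embeddings(path):
	# parse lines of the form "item_id;v1,v2,...,v10"
	vectors = {}
	with open(path, 'r', encoding='utf8') as fin:
		for line in fin:
			item_id, vec_str = line.strip().split(';')
			vectors[int(item_id)] = np.array([float(x) for x in vec_str.split(',')])
	return vectors

def cosine(a, b):
	# cosine similarity between two embedding vectors
	return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# vecs = load_embeddings(r'E:\Experiment\Algorithms\Item2vec-pytorch\em.txt')
# print(cosine(vecs[1], vecs[2]))		# similarity between items 1 and 2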

The input data looks like this: (screenshot omitted)

The training output looks like this: (screenshot omitted)
