item2vec code

# -*- coding: utf-8 -*-
"""
@author:taoshouzheng
@time:2019/8/20 10:46
@email:tsz1216@sina.com
"""

import torch
import torch.nn as nn
import torch.optim as optim
import random
from collections import Counter


class SGNS(nn.Module):

	def __init__(self, vocab_size, projection_dim):

		super(SGNS, self).__init__()

		self.embedding_v = nn.Embedding(vocab_size, projection_dim)		# center (input) embedding
		self.embedding_u = nn.Embedding(vocab_size, projection_dim)		# context (output) embedding
		self.log_sigmoid = nn.LogSigmoid()

		init_range = (2.0 / (vocab_size + projection_dim)) ** 0.5		# Xavier-style init range
		self.embedding_v.weight.data.uniform_(-init_range, init_range)
		self.embedding_u.weight.data.uniform_(-init_range, init_range)

	def forward(self, center_words, target_words, negative_words):
		center_embeds = self.embedding_v(center_words)		# B x 1 x D
		target_embeds = self.embedding_u(target_words)		# B x 1 x D
		neg_embeds = -self.embedding_u(negative_words)		# B x K x D, negated for the negative term
		positive_score = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)		# B x 1
		negative_score = torch.sum(neg_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2), 1).view(center_words.size(0), -1)		# B x K ---> B x 1
		loss = self.log_sigmoid(positive_score) + self.log_sigmoid(negative_score)

		return -torch.mean(loss)

	# return the embedding vectors for the given item indices
	def prediction(self, inputs):
		embeds = self.embedding_v(inputs)
		return embeds
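
# For reference, forward() above implements a variant of the skip-gram negative-
# sampling (SGNS) objective. With center embedding v_c (from embedding_v), context
# embedding u_o and negatives u_1..u_K (from embedding_u), the standard per-pair loss is
#     L = -log sigmoid(u_o . v_c) - sum_{k=1..K} log sigmoid(-u_k . v_c)
# Negating neg_embeds supplies the -u_k . v_c term; note the code sums the K negative
# dot products before the log-sigmoid (i.e. log sigmoid(sum_k -u_k . v_c)), a common
# simplification in tutorial implementations rather than the exact sum of K log-sigmoids.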


# yield mini-batches of training pairs, plus a final smaller remainder batch
def get_batch_sample(bat_size, tra_data):
	random.shuffle(tra_data)		# shuffle the training data in place
	s_index = 0
	e_index = bat_size

	while e_index < len(tra_data):
		bat = tra_data[s_index: e_index]
		s_index = e_index
		e_index = e_index + bat_size
		yield bat

	if e_index >= len(tra_data):
		bat = tra_data[s_index:]
		yield bat
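
# A toy illustration of the batching behavior (hypothetical input; the initial
# shuffle makes the actual order vary):
#     list(get_batch_sample(2, [[1, 2], [3, 4], [5, 6]]))
#     -> [[[1, 2], [3, 4]], [[5, 6]]]		(one full batch, then the remainder)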


# generate positive (center, context) training pairs
def get_positive_sample(samp_lists):
	"""
	:param samp_lists: 2-D list; each sublist is one sequence of item indices
	:return: two flat lists: center words and their context words
	"""

	positive_samples = []

	for sublist in samp_lists:

		sublist_length = len(sublist)

		# pair every item with every other item in the same sequence;
		# enumerate keeps positions distinct even when items repeat
		for ite_index, ite in enumerate(sublist):

			for j in range(sublist_length):

				if ite_index != j:
					positive_samples.append([ite, sublist[j]])

	target_words = []
	context_words = []

	for word_pair in positive_samples:
		target_words.append(word_pair[0])
		context_words.append(word_pair[1])

	return target_words, context_words		# two flat lists
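
# A quick worked example of the pairing logic on a hypothetical toy sequence:
#     get_positive_sample([[10, 20, 30]])
#     -> target_words  = [10, 10, 20, 20, 30, 30]
#        context_words = [20, 30, 10, 30, 10, 20]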


# draw k negative samples (as indices) for each positive pair
def get_negative_sample(centers, targets, un_table, item2idx, k):

	batch_size = len(targets)

	negative_samples = []

	for i in range(batch_size):

		neg_sample = []
		center_index = centers[i][0]		# center item index
		target_index = targets[i][0]		# context item index

		while len(neg_sample) < k:

			neg = random.choice(un_table)		# neg is a raw item id
			neg_index = item2idx[neg]		# map it into the model's index space before comparing
			if neg_index == target_index or neg_index == center_index:
				continue
			neg_sample.append(neg_index)
		negative_samples.append(neg_sample)

	# returns a 2-D list of shape batch_size x k
	return negative_samples


if __name__ == '__main__':

	movie_lists = []

	with open(r'E:\Experiment\Algorithms\Item2vec-pytorch\1.csv', 'r', encoding='utf8') as f:
		contents = f.readlines()
		for content in contents:
			content = content.strip().split(',')
			if content[0] == '':
				continue
			movie_list = [int(m) for m in content]
			if len(movie_list) > 1:
				movie_lists.append(movie_list)		# 2-D list: one item sequence per CSV line

	fla = lambda k: [i for sublist in k for i in sublist]		# flatten a 2-D list
	item_counter = Counter(fla(movie_lists))		# frequency of each item
	item = [w for w, c in item_counter.items()]		# list of unique items

	# build item <-> index mappings
	item2index = {}
	for vo in item:
		if item2index.get(vo) is None:
			item2index[vo] = len(item2index)

	index2item = {v: k for k, v in item2index.items()}

	# re-encode each sequence with contiguous item indices
	new_movie_lists = []
	for m in movie_lists:
		m = [item2index[n] for n in m]
		new_movie_lists.append(m)

	cent_words, cont_words = get_positive_sample(new_movie_lists)  # flat lists of center / context indices

	# build the unigram table for negative sampling; it stores raw item ids,
	# which get_negative_sample maps back to indices
	uni_table = []
	f = sum([item_counter[it] ** 0.75 for it in item])
	z = 0.0001
	for it in item:
		uni_table.extend([it] * int(((item_counter[it] ** 0.75) / f) / z))
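
	# The table above approximates word2vec's negative-sampling distribution: item i
	# is drawn with probability proportional to its count raised to the 3/4 power,
	#     P(i) = count(i)**0.75 / sum_j(count(j)**0.75)
	# and occupies roughly P(i) / z slots in uni_table (z = 0.0001), so random.choice
	# over the table samples from approximately this distribution.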

	train_data = [[cent_words[i], cont_words[i]] for i in range(len(cent_words))]		# list of [center, context] pairs

	item2vec = SGNS(len(item), 10)		# vocab size = number of unique items, embedding dim = 10
	print(item2vec)
	optimizer = optim.Adam(item2vec.parameters(), lr=0.001)

	for epoch in range(1):

		for i, batch in enumerate(get_batch_sample(100, train_data)):

			target = [[p[0]] for p in batch]
			context = [[q[1]] for q in batch]
			negative = get_negative_sample(centers=target, targets=context, un_table=uni_table, item2idx=item2index, k=10)

			target = torch.LongTensor(target)
			context = torch.LongTensor(context)
			negative = torch.LongTensor(negative)

			optimizer.zero_grad()

			loss = item2vec(target, context, negative)

			loss.backward()
			optimizer.step()

			print('Epoch : %d, Batch : %d, loss : %.04f' % (epoch + 1, i + 1, loss.item()))

	item2vec.eval()

	# sanity-check the learned embedding for index 1
	print(item2vec.prediction(torch.LongTensor([1])))
	print(item2vec.prediction(torch.LongTensor([1])).data.numpy().tolist())
	print(item2vec.prediction(torch.LongTensor([1])).data.size())

	item_embeddings = []

	# serialize every item's embedding as "item_id;v1,v2,...,v10"
	for itm, index in item2index.items():
		vec = torch.flatten(item2vec.prediction(torch.LongTensor([index]))).data.numpy().tolist()
		item_embedding_str = str(itm) + ';' + ','.join([str(a) for a in vec])
		item_embeddings.append(item_embedding_str)

	with open(r'E:\Experiment\Algorithms\Item2vec-pytorch\em.txt', 'w', encoding='utf8') as g:
		for st in item_embeddings:
			g.write(st + '\n')

	print('Done!')
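
Once em.txt has been written, the embeddings can be loaded back for similarity queries. Below is a minimal sketch (hypothetical helpers, not part of the original script), assuming the "item_id;v1,v2,...,v10" line format produced above:

import numpy as np

def load_embeddings(path):
	# parse lines of the form "item_id;v1,v2,...,v10"
	vectors = {}
	with open(path, 'r', encoding='utf8') as fin:
		for line in fin:
			item_id, vec_str = line.strip().split(';')
			vectors[int(item_id)] = np.array([float(x) for x in vec_str.split(',')])
	return vectors

def cosine(a, b):
	# cosine similarity between two embedding vectors
	return float(np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)))

# vecs = load_embeddings(r'E:\Experiment\Algorithms\Item2vec-pytorch\em.txt')
# print(cosine(vecs[1], vecs[2]))		# similarity between items 1 and 2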

The input data looks like this: (screenshot omitted)

The training output looks like this: (screenshot omitted)
