import numpy as np
import torch
from torch import nn, optim
import random
from collections import Counter
import matplotlib.pyplot as plt
# Training corpus: a tiny toy text for demonstrating skip-gram word2vec.
text = "I like dog i like cat i like animal dog cat animal apple cat dog like dog fish milk like dog \
cat eyes like i like apple apple i hate apple i movie book music like cat dog hate cat dog like"
# Preprocessing knobs: drop words occurring at most `frequency` times, and
# optionally subsample very frequent words (disabled by default).
frequency = 0
delete_common_words = False
def preprocess(text, frequency=0):
    """Lower-case *text*, tokenize on whitespace, and drop low-frequency words.

    Args:
        text: raw input string.
        frequency: keep only words whose total count is strictly greater
            than this threshold (default 0 keeps every word).

    Returns:
        List of lower-cased tokens in original order, rare words removed.
    """
    words = text.lower().split()
    word_counts = Counter(words)
    return [word for word in words if word_counts[word] > frequency]
words = preprocess(text, frequency)

# Build the vocabulary and the two id <-> word lookup tables.
vocabulary = set(words)
int_to_vocabulary = dict(enumerate(vocabulary))
vocabulary_to_int = {word: idx for idx, word in int_to_vocabulary.items()}
print(vocabulary_to_int)

# Encode the corpus as integer word ids.
int_words = [vocabulary_to_int[w] for w in words]

# Per-id relative frequency over the whole corpus.
int_word_counts = Counter(int_words)
total_count = len(int_words)
word_frequency = {wid: cnt / total_count for wid, cnt in int_word_counts.items()}
# Optional subsampling of very frequent words (Mikolov et al. heuristic):
# each occurrence of word w is kept with probability sqrt(t / f(w)),
# which equals the original's random() < 1 - prob_drop[w] test.
if delete_common_words:
    t = 1e-5
    keep_prob = {wid: np.sqrt(t / word_frequency[wid]) for wid in int_word_counts}
    train_words = [wid for wid in int_words if random.random() < keep_prob[wid]]
else:
    train_words = int_words

# Noise distribution for negative sampling: unigram frequency ** 0.75, normalized.
word_frequency = np.array(list(word_frequency.values()))
unigram_pow = word_frequency ** 0.75
noise_dist = torch.from_numpy(unigram_pow / np.sum(unigram_pow))
# Maximum context window radius.
window_size = 5

def get_target(words, idx, window_size):
    """Return the context words around position *idx* (deduplicated).

    The actual radius is drawn uniformly from 1..window_size, so words
    nearer the center are sampled more often (standard word2vec trick).
    """
    radius = np.random.randint(1, window_size + 1)
    left = max(idx - radius, 0)
    # slices exclude the center word itself
    context = words[left:idx] + words[idx + 1:idx + radius + 1]
    return list(set(context))
# Mini-batch settings.
batch_size = 5
emb_dim = 2  # embedding dimensionality

def get_batch(words, batch_size, window_size):
    """Yield (centers, contexts) training pairs, one mini-batch at a time.

    Each center word is repeated once per context word produced by
    get_target, so the two yielded lists always have equal length.
    """
    n_batches = len(words) // batch_size
    words = words[:n_batches * batch_size]  # drop the ragged tail
    for start in range(0, len(words), batch_size):
        chunk = words[start:start + batch_size]
        centers, contexts = [], []
        for pos, center in enumerate(chunk):
            targets = get_target(chunk, pos, window_size)
            centers.extend([center] * len(targets))
            contexts.extend(targets)
        yield centers, contexts
# Model definition.
class SkipGramNegSampling(nn.Module):
    """Skip-gram word2vec model trained with negative sampling.

    Args:
        vocab_size: number of words in the vocabulary.
        emb_dim: dimensionality of the word vectors.
        noise_dist: 1-D tensor of per-word weights used to draw negative
            samples (unigram distribution raised to the 0.75 power).
    """

    def __init__(self, vocab_size, emb_dim, noise_dist):
        super(SkipGramNegSampling, self).__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.noise_dist = noise_dist
        # Two embedding tables: `v` for center words, `u` for context words.
        self.embedding_v = nn.Embedding(vocab_size, emb_dim)
        self.embedding_u = nn.Embedding(vocab_size, emb_dim)
        # Initialize weights uniformly in [-1, 1].
        self.embedding_v.weight.data.uniform_(-1, 1)
        self.embedding_u.weight.data.uniform_(-1, 1)

    def forward(self, center_words, target_words, n_sample):
        """Return the mean negative-sampling loss for a batch.

        Args:
            center_words: LongTensor of shape (B,) with center word ids.
            target_words: LongTensor of shape (B,) with context word ids.
            n_sample: number of negative samples drawn per positive pair.

        Returns:
            0-dim tensor: the batch-averaged negative log-likelihood.
        """
        center_embeds = self.embedding_v(center_words)  # (B, D)
        target_embeds = self.embedding_u(target_words)  # (B, D)
        batch_size, embed_size = center_embeds.shape

        # Draw n_sample negatives per pair from the noise distribution.
        negative_words = torch.multinomial(
            self.noise_dist, batch_size * n_sample, replacement=True)
        negative_embeds = self.embedding_u(negative_words).view(
            batch_size, n_sample, self.emb_dim)

        center_embeds = center_embeds.view(batch_size, embed_size, 1)
        target_embeds = target_embeds.view(batch_size, 1, embed_size)

        # Positive term: log sigmoid(u_o . v_c), shape (B,).
        # BUGFIX: the original used .squeeze(), which also collapsed the
        # batch dimension when B == 1 and made .sum(1) below crash.
        positive_score = torch.bmm(target_embeds, center_embeds).sigmoid().log()
        positive_score = positive_score.view(batch_size)
        # Negative term: sum_k log sigmoid(-u_k . v_c), shape (B,).
        negative_score = torch.bmm(negative_embeds.neg(), center_embeds).sigmoid().log()
        negative_score = negative_score.view(batch_size, n_sample).sum(1)

        # Negative log-likelihood averaged over the batch.
        return -torch.mean(positive_score + negative_score)
# Build the model (2-dimensional embeddings so they can be plotted directly).
model = SkipGramNegSampling(len(vocabulary_to_int), 2, noise_dist=noise_dist)
optimizer = optim.Adam(model.parameters(), lr=0.003)
# Training hyperparameters.
steps = 0            # global step counter across all epochs
Epoches = 1000       # number of passes over the corpus
n_sample = 3         # negative samples per positive pair
print_every = 1000   # log the loss every this many steps
# center_words, target_words = get_batch(train_words, batch_size, window_size)
# print(get_batch(train_words, batch_size, window_size))
# Training loop: one forward/backward/update per mini-batch.
for epoch in range(Epoches):
    for centers, contexts in get_batch(train_words, batch_size, window_size):
        steps += 1
        center_t = torch.LongTensor(centers)
        context_t = torch.LongTensor(contexts)
        loss = model(center_t, context_t, n_sample)
        if steps % print_every == 0:
            print("loss: ", loss)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
# Visualize the learned 2-D center-word embeddings, one labelled point per word.
# (The original script ran this identical plotting loop twice; once suffices.
# The state_dict lookup is loop-invariant, so it is hoisted out of the loop.)
vectors = model.state_dict()["embedding_v.weight"]
for i, w in int_to_vocabulary.items():
    x, y = float(vectors[i][0]), float(vectors[i][1])
    plt.scatter(x, y)
    plt.annotate(w, xy=(x, y), xytext=(5, 2), textcoords='offset points',
                 ha='right', va='bottom')
plt.show()
# word2vec introductory example
# (blog-footer text "word2vec入门代码 / 最新推荐文章于 2024-04-21 17:16:13 发布"
# was scrape residue, not code — kept here as a comment so the file parses)