word2vec starter code

import numpy as np
import torch
from torch import nn, optim
import random
from collections import Counter
import matplotlib.pyplot as plt

# Training data
text = "I like dog i like cat i like animal dog cat animal apple cat dog like dog fish milk like dog \
cat eyes like i like apple apple i hate apple i movie book music like cat dog hate cat dog like"

# Text preprocessing
frequency = 0
delete_common_words = False
def preprocess(text, frequency):
    text = text.lower()
    words = text.split()

    """去除低频词"""
    word_counts = Counter(words)
    new_words = [word for word in words if word_counts[word] > frequency]
    return new_words

words = preprocess(text, frequency)

# Build the vocabulary and the word<->ID mappings
vocabulary = set(words)
vocabulary_to_int = {w: c for c, w in enumerate(vocabulary)}
int_to_vocabulary = {c: w for c, w in enumerate(vocabulary)}
print(vocabulary_to_int)

# Convert the text into the corresponding integer word IDs
int_words = [vocabulary_to_int[w] for w in words]
# print(int_words)

# Compute each word's frequency (fraction of the corpus)
int_word_counts = Counter(int_words)
total_count = len(int_words)
word_frequency = {w: c/total_count for w, c in int_word_counts.items()}
# print(word_frequency)

# Subsample very frequent words (optional, word2vec-style)
if delete_common_words:
    t = 1e-5
    prob_drop = {w: 1-np.sqrt(t/word_frequency[w]) for w in int_word_counts}
    train_words = [w for w in int_words if random.random()<(1-prob_drop[w])] #random.random()方法返回一个随机数,其在0至1的范围之内,这种随机删除的方式好像不太对
else:
    train_words = int_words
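
# Quick check (added for illustration): with delete_common_words enabled, a noticeable
# fraction of the most frequent tokens is dropped; with the default False, train_words
# is simply int_words.
print(len(int_words), len(train_words))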

# print(train_words)

# Noise (unigram) distribution for negative sampling
# Order the frequencies by word ID so that index i of noise_dist corresponds to word i
word_frequency = np.array([word_frequency[i] for i in range(len(vocabulary))])
# print(word_frequency)
noise_dist = torch.from_numpy(word_frequency ** 0.75 / np.sum(word_frequency ** 0.75))
# print(noise_dist)
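
# Optional sanity check (not part of the original code): draw a few negative samples
# from noise_dist with torch.multinomial, the same call the model uses later, and map
# them back to words. Frequent tokens such as "like" should show up often.
sampled_ids = torch.multinomial(noise_dist, 10, replacement=True)
print([int_to_vocabulary[int(i)] for i in sampled_ids])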


# Get the context ("target") words around a center position
window_size = 5
def get_target(words, idx, window_size):
    # Sample an effective window size between 1 and window_size (word2vec's dynamic window)
    target_window = np.random.randint(1, window_size+1)
    start_point = idx - target_window if (idx - target_window) > 0 else 0
    end_point = idx + target_window
    # Slices are end-exclusive, so idx+1:end_point+1 includes the word at end_point;
    # set() removes duplicate context IDs
    targets = set(words[start_point:idx] + words[idx+1:end_point+1])
    return list(targets)
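
# Quick illustration (added, not in the original post): for a toy index into the
# training IDs, get_target returns the deduplicated context IDs inside a randomly
# sized window around that position.
print(get_target(train_words[:10], 5, window_size))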

# Batch the data
batch_size = 5
emb_dim = 2  # embedding dimension
def get_batch(words, batch_size, window_size):
    n_batches = len(words)//batch_size  # floor division
    words = words[:n_batches*batch_size]  # drop the tail that does not fill a full batch
    for idx in range(0, len(words), batch_size):  # step through the data batch_size words at a time
        batch_x, batch_y = [],[]
        batch = words[idx:idx+batch_size]
        for i in range(len(batch)):
            x = batch[i]
            y = get_target(batch, i, window_size)
            batch_x.extend([x]*len(y))
            batch_y.extend(y)
        yield batch_x, batch_y
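
# Quick peek at the generator output (illustrative, not in the original post): each
# batch yields a flat list of center-word IDs paired element-wise with a flat list
# of context-word IDs, so the two lists always have the same length.
example_x, example_y = next(get_batch(train_words, batch_size, window_size))
print(len(example_x), len(example_y))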


# Define the model
class SkipGramNegSampling(nn.Module):
    def __init__(self, vocab_size, emb_dim, noise_dist):
        """vocab_size: number of words in the vocabulary; emb_dim: embedding dimension."""
        super(SkipGramNegSampling, self).__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.noise_dist = noise_dist
        # Input (center-word) and output (context-word) embedding layers:
        # nn.Embedding(num_embeddings, embedding_dim)
        self.embedding_v = nn.Embedding(vocab_size, emb_dim)
        self.embedding_u = nn.Embedding(vocab_size, emb_dim)

        # Initialize the embedding weights uniformly in [-1, 1]
        self.embedding_v.weight.data.uniform_(-1, 1)
        self.embedding_u.weight.data.uniform_(-1, 1)

    def forward(self, center_words, target_words, n_sample):

        center_embeds = self.embedding_v(center_words)  # B x D
        target_embeds = self.embedding_u(target_words)  # B x D

        size, _ = center_embeds.shape

        noise_dist = self.noise_dist
        # Sample n_sample negative words per positive pair from the noise distribution
        negative_words = torch.multinomial(noise_dist, size * n_sample, replacement=True)
        negative_embeds = self.embedding_u(negative_words).view(size, n_sample, self.emb_dim)


        # Compute the negative-sampling loss
        batch_size, embed_size = center_embeds.shape
        center_embeds = center_embeds.view(batch_size, embed_size, 1)  # B x D x 1
        target_embeds = target_embeds.view(batch_size, 1, embed_size)  # B x 1 x D

        # Positive (observed context) term: log sigmoid(u_o . v_c)
        positive_score = torch.bmm(target_embeds, center_embeds).sigmoid().log()
        positive_score = positive_score.squeeze()

        # Negative-sample term: sum over the k sampled words of log sigmoid(-u_k . v_c)
        negative_score = torch.bmm(negative_embeds.neg(), center_embeds).sigmoid().log()
        negative_score = negative_score.squeeze().sum(1)

        loss = positive_score + negative_score
        # The negative log-likelihood of the context words given the center word
        return -torch.mean(loss)


model = SkipGramNegSampling(len(vocabulary_to_int), emb_dim, noise_dist=noise_dist)
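
# One-off smoke test (added for illustration, not in the original post): run a single
# batch through the untrained model to confirm the forward pass returns a scalar loss.
# The 3 negative samples here are an arbitrary small number for the check.
smoke_x, smoke_y = next(get_batch(train_words, batch_size, window_size))
print(model(torch.LongTensor(smoke_x), torch.LongTensor(smoke_y), 3))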

optimizer = optim.Adam(model.parameters(), lr=0.003)

steps = 0
epochs = 1000
n_sample = 3
print_every = 1000
# center_words, target_words = get_batch(train_words, batch_size, window_size)
# print(get_batch(train_words, batch_size, window_size))
for e in range(epochs):
    for center_words, target_words in get_batch(train_words, batch_size, window_size):
        steps += 1
        center_words, target_words = torch.LongTensor(center_words), torch.LongTensor(target_words)

        loss = model(center_words, target_words, n_sample)
        if steps % print_every == 0:
            print("loss: ", loss)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
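
# Optional check of the learned vectors (not in the original post): with only two
# dimensions and a tiny corpus the result is noisy, but cosine similarity between
# the input embeddings gives a rough idea of which words ended up close together.
# "cat" is just an example query word from the toy corpus.
embeddings = model.embedding_v.weight.data
query = vocabulary_to_int["cat"]
sims = torch.cosine_similarity(embeddings[query].unsqueeze(0), embeddings, dim=1)
top = sims.argsort(descending=True)[:4]
print([int_to_vocabulary[int(i)] for i in top])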

# Visualize the learned word vectors
vectors = model.state_dict()["embedding_v.weight"]
for i, w in int_to_vocabulary.items():
    x, y = float(vectors[i][0]), float(vectors[i][1])
    plt.scatter(x, y)
    plt.annotate(w, xy=(x, y), xytext=(5, 2), textcoords='offset points', ha='right', va='bottom')
plt.show()
