PyTorch Implementation of Word2Vec (Part 2)

  • Required packages
import collections
import math
import random
import sys
import time
import os
import torch.utils.data as Data
import torch
import torch.nn as nn
  • 1. Read and preprocess the dataset
assert 'ptb.train.txt' in os.listdir("../data/ptb")

with open('../data/ptb/ptb.train.txt','r') as f:
    lines = f.readlines()
    raw_dataset = [sentence.split() for sentence in lines]
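As a rough sanity check (the exact number depends on the file version), we can print how many sentences were read:
# print('# sentences:', len(raw_dataset))  # the PTB training set contains roughly 42k sentences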
  • 1.1 Build the token index
counter = collections.Counter([token for sentence in raw_dataset for token in sentence])
# print(counter.items())
counter = dict(filter(lambda x:x[1] >= 5,counter.items()))  # keep only tokens that appear at least 5 times
# [token for sentence in raw_dataset for token in sentence] is equivalent to the loop below
# a = []
# for sentence in raw_dataset:
#     for token in sentence:
#         a.append(token)
# map each token to an integer index
idx_to_token = [token for token,_ in counter.items()]
# print(idx_to_token) # ['pierre', '<unk>', 'N', 'years', 'old', 'will', 'join', 'the', 'board', 'as', 'a', 'nonexecutive', 'director', 'nov.',
token_to_idx = {token:idx for idx,token in enumerate(idx_to_token)}
# print(token_to_idx) # {'pierre': 0, '<unk>': 1, 'N': 2, 'years': 3, 'old': 4, 'will': 5, 'join': 6, 'the': 7, 'board': 8, 'as': 9,
dataset = [[token_to_idx[token] for token in sentence if token in token_to_idx] for sentence in raw_dataset]
num_tokens = sum([len(sentence) for sentence in dataset])
# print(num_tokens) # before subsampling: 887100

# [[token_to_idx[token] for token in sentence if token in token_to_idx] for sentence in raw_dataset]
# is equivalent to the loop below
# print(len(dataset))
# print(dataset)
# b = []
# for sentence in raw_dataset:
#     c = []
#     for token in sentence:
#         if token in token_to_idx:
#             c.append(token_to_idx[token])
#     b.append(c)
# print(len(b))
# print(b)
  • 1.2 Subsampling
# 1.2 Subsampling
def discard(idx):
    # drop the token with index idx with probability max(0, 1 - sqrt(t / f(w))),
    # where t = 1e-4 and f(w) = counter[w] / num_tokens is the token's relative frequency
    return random.uniform(0,1) < 1 - math.sqrt(1e-4 / counter[idx_to_token[idx]] * num_tokens)

subsampled_dataset = [[token for token in sentence if not discard(token)] for sentence in dataset]
num_tokens_2 = sum([len(sentence) for sentence in subsampled_dataset])
print('after subsampling:', num_tokens_2) # after subsampling: 375930

# compare how often a token appears in the dataset before and after subsampling
def compare_counts(token):
    return '# %s: before=%d, after=%d' % (token, sum(
        [st.count(token_to_idx[token]) for st in dataset]), sum(
        [st.count(token_to_idx[token]) for st in subsampled_dataset]))

# print(compare_counts('the')) # the: before=50770, after=2089
# print(compare_counts('join')) # join: before=45, after=45
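To make the subsampling rule concrete, here is a small illustrative sketch (discard_prob is a helper defined only for this illustration, not part of the model) that evaluates the discard probability for a frequent and a rare word:
def discard_prob(token, t=1e-4):
    # P(discard w) = max(0, 1 - sqrt(t / f(w))), with f(w) = counter[token] / num_tokens
    f = counter[token] / num_tokens
    return max(0.0, 1 - math.sqrt(t / f))
# print(discard_prob('the'))   # a very frequent word is discarded with high probability (close to 1)
# print(discard_prob('join'))  # a rare word gets discard probability 0 and is always kept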
  • 1.3 Extract center words and context words
# 1.3 Extract center words and context words
def get_centers_and_contexts(dataset, max_window_size):
    centers, contexts = [], []
    for st in dataset:
        if len(st) < 2:  # a sentence needs at least 2 tokens to form a "center word - context word" pair
            continue
        centers += st
        for center_i in range(len(st)):
            window_size = random.randint(1, max_window_size)
            indices = list(range(max(0, center_i - window_size),
                                 min(len(st), center_i + 1 + window_size)))
            indices.remove(center_i)  # exclude the center word from its context words
            contexts.append([st[idx] for idx in indices])
    return centers, contexts

# tiny_dataset = [list(range(7)), list(range(7, 10))]
# print('dataset', tiny_dataset)
# for center, context in zip(*get_centers_and_contexts(tiny_dataset, 2)):
#     print('center', center, 'has contexts', context)
#
# # dataset [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9]]
# # center 0 has contexts [1]
# # center 1 has contexts [0, 2]
# # center 2 has contexts [0, 1, 3, 4]
# # center 3 has contexts [1, 2, 4, 5]
# # center 4 has contexts [3, 5]
# # center 5 has contexts [3, 4, 6]
# # center 6 has contexts [4, 5]
# set the maximum context window size to 5 and extract all center words and their context words from the dataset
all_centers, all_contexts = get_centers_and_contexts(subsampled_dataset, 5)
  • 2. Negative sampling
def get_negatives(all_contexts, sampling_weights, K):
    all_negatives, neg_candidates, i = [], [], 0
    population = list(range(len(sampling_weights)))
    for contexts in all_contexts:
        negatives = []
        while len(negatives) < len(contexts) * K:
            if i == len(neg_candidates):
                # randomly draw k word indices as noise-word candidates according to sampling_weights;
                # for efficiency, k is set fairly large and the candidates are consumed gradually
                i, neg_candidates = 0, random.choices(
                    population, sampling_weights, k=int(1e5))
            neg, i = neg_candidates[i], i + 1
            # a noise word must not be a context word
            if neg not in set(contexts):
                negatives.append(neg)
        all_negatives.append(negatives)
    return all_negatives

sampling_weights = [counter[w]**0.75 for w in idx_to_token]  # sampling weight proportional to count^0.75, as in the word2vec paper
all_negatives = get_negatives(all_contexts, sampling_weights, 5)
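A small illustration with toy counts (purely for intuition, not PTB statistics) of why the counts are raised to the power 0.75: the transformation flattens the frequency distribution, so very frequent words are sampled somewhat less often as noise and rare words somewhat more often:
# toy_counts = [1000, 100, 10]
# raw_freq = [c / sum(toy_counts) for c in toy_counts]
# pow_freq = [c**0.75 / sum(x**0.75 for x in toy_counts) for c in toy_counts]
# print(raw_freq)  # roughly [0.90, 0.09, 0.01]
# print(pow_freq)  # roughly [0.83, 0.15, 0.03] -- noticeably closer to uniform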
  • 3. Read the data
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, centers, contexts, negatives):
        assert len(centers) == len(contexts) == len(negatives)
        self.centers = centers
        self.contexts = contexts
        self.negatives = negatives

    def __getitem__(self, index):
        return (self.centers[index], self.contexts[index], self.negatives[index])

    def __len__(self):
        return len(self.centers)
        
# minibatch collate function
def batchify(data):
    '''
    Used as the collate_fn of DataLoader: the input is a list of length batch_size,
    and each element is the (center, context, negative) tuple returned by the Dataset's __getitem__.
    '''
    max_len = max(len(c) + len(n) for _, c, n in data)
    centers, contexts_negatives, masks, labels = [], [], [], []
    for center, context, negative in data:
        cur_len = len(context) + len(negative)
        centers += [center]
        contexts_negatives += [context + negative + [0] * (max_len - cur_len)]
        masks += [[1] * cur_len + [0] * (max_len - cur_len)]
        labels += [[1] * len(context) + [0] * (max_len - len(context))]
    return (torch.tensor(centers).view(-1, 1), torch.tensor(contexts_negatives),
            torch.tensor(masks), torch.tensor(labels))
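Before wiring batchify into a DataLoader, a tiny hand-made batch (hypothetical values, just to show the padding) makes the output format clear:
# toy_batch = [
#     (1, [2, 3], [4, 5, 6]),   # center 1, two context words, three negatives
#     (7, [8], [9, 10]),        # center 7, one context word, two negatives
# ]
# print(batchify(toy_batch))
# # centers:             [[1], [7]]
# # contexts_negatives:  [[2, 3, 4, 5, 6], [8, 9, 10, 0, 0]]  (padded with 0 up to max_len = 5)
# # masks:               [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0]]   (0 marks padding)
# # labels:              [[1, 1, 0, 0, 0], [1, 0, 0, 0, 0]]   (1 marks context words)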
# the batchify function defined above specifies how the DataLoader reads a minibatch; print the shape of each variable in the first batch
batch_size = 512
num_workers = 0 if sys.platform.startswith('win32') else 4  # multiprocess data loading can be problematic on Windows

dataset = MyDataset(all_centers,
                    all_contexts,
                    all_negatives)
data_iter = Data.DataLoader(dataset, batch_size, shuffle=True,
                            collate_fn=batchify,
                            num_workers=num_workers)
for batch in data_iter:
    for name, data in zip(['centers', 'contexts_negatives', 'masks',
                           'labels'], batch):
        print(name, 'shape:', data.shape)
    break
# # centers shape: torch.Size([512, 1])
# # contexts_negatives shape: torch.Size([512, 60])
# # masks shape: torch.Size([512, 60])
# # labels shape: torch.Size([512, 60])
  • 4. The skip-gram model
# the embedding matrix has num_embeddings rows (vocabulary size) and embedding_dim columns (the dimension of each word vector)
embed = nn.Embedding(num_embeddings=20, embedding_dim=4) # [num_embeddings, embedding_dim]
# print(embed.weight)
# x = torch.tensor([[1,2,3],[4,5,6]], dtype=torch.long) # [2,3]
# print(embed(x)) # [2, 3, 4]

# forward computation of the skip-gram model
def skip_gram(center, contexts_and_negatives, embed_v, embed_u):
    v = embed_v(center)
    u = embed_u(contexts_and_negatives)
    pred = torch.bmm(v, u.permute(0, 2, 1))  # u.permute(0, 2, 1) swaps the last two dimensions of u
    return pred
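A quick shape check (toy sizes and hypothetical indices): center has shape (batch, 1) and contexts_and_negatives has shape (batch, max_len), so skip_gram returns (batch, 1, max_len), i.e. one inner product per (center, context-or-noise) pair:
# toy_v = nn.Embedding(20, 4)
# toy_u = nn.Embedding(20, 4)
# toy_center = torch.tensor([[1], [2]])                # (2, 1)
# toy_ctx_neg = torch.tensor([[3, 4, 5], [6, 7, 8]])   # (2, 3)
# print(skip_gram(toy_center, toy_ctx_neg, toy_v, toy_u).shape)  # torch.Size([2, 1, 3])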

  • 5. Train the model
  • 5.1 Define the model's loss function (binary cross-entropy with a mask)
class SigmoidBinaryCrossEntropyLoss(nn.Module):
    def __init__(self):
        super(SigmoidBinaryCrossEntropyLoss, self).__init__()
    def forward(self, inputs, targets, mask=None):
        '''
        :param inputs: Tensor of shape (batch_size, len)
        :param targets: Tensor of the same shape as inputs
        :param mask: optional Tensor of the same shape; 1 keeps a position, 0 ignores it
        '''
        inputs, targets = inputs.float(), targets.float()
        mask = mask.float() if mask is not None else torch.ones_like(inputs)
        res = nn.functional.binary_cross_entropy_with_logits(
            input=inputs, target=targets, reduction='none', weight=mask
        )
        return res.mean(dim=1)

loss = SigmoidBinaryCrossEntropyLoss()
# pred = torch.tensor([[1.5,0.3,-1,2],[1.1,-0.6,2.2,0.4]])
# # in the label variable, 1 marks a context word and 0 marks a noise word
# label = torch.tensor([[1,0,0,0],[1,1,0,0]])
# mask = torch.tensor([[1,1,1,1],[1,1,1,0]]) # mask variable
# res = loss(pred,label,mask) * mask.shape[1] / mask.float().sum(dim=1)
# print(res) # tensor([0.8740, 1.2100])
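The commented-out result above can also be reproduced by hand (a sketch; sigmd is a helper defined only for this check), which shows how the mask rescales the per-row average:
# def sigmd(x):
#     return -math.log(1 / (1 + math.exp(-x)))
# print('%.4f' % ((sigmd(1.5) + sigmd(-0.3) + sigmd(1) + sigmd(-2)) / 4))   # row 1: all 4 positions kept
# print('%.4f' % ((sigmd(1.1) + sigmd(-0.6) + sigmd(-2.2)) / 3))            # row 2: last position masked out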
  • 5.2 Initialize the model parameters
embed_size = 100
net = nn.Sequential(
    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size),  # center-word vectors (v), used as net[0]
    nn.Embedding(num_embeddings=len(idx_to_token), embedding_dim=embed_size)   # context-word vectors (u), used as net[1]
)
  • 5.3 Define the training function
def train(net, lr, num_epochs):
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("train on", device)
    net = net.to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=lr)
    for epoch in range(num_epochs):
        start, l_sum, n = time.time(), 0.0, 0
        for batch in data_iter:
            center, context_negative, mask, label = [d.to(device) for d in batch]

            pred = skip_gram(center, context_negative, net[0], net[1])

            # use the mask so that padded positions do not affect the loss
            l = (loss(pred.view(label.shape), label, mask) *
                 mask.shape[1] / mask.float().sum(dim=1)).mean()  # average loss over the batch
            optimizer.zero_grad()
            l.backward()
            optimizer.step()
            l_sum += l.cpu().item()
            n += 1
        # report once per epoch
        print('epoch %d, loss %.2f, time %.2fs'
              % (epoch + 1, l_sum / n, time.time() - start))
  • Training
train(net, 0.01, 10)
  • 6. Apply the word embedding model
def get_similar_tokens(query_token, k, embed):
    W = embed.weight.data
    x = W[token_to_idx[query_token]]
    # 1e-9 is added for numerical stability
    cos = torch.matmul(W, x) / (torch.sum(W * W, dim=1) * torch.sum(x * x) + 1e-9).sqrt()
    _, topk = torch.topk(cos, k=k+1)
    topk = topk.cpu().numpy()
    for i in topk[1:]:  # skip the query word itself
        print('cosine sim=%.3f: %s' % (cos[i], (idx_to_token[i])))
get_similar_tokens('chip', 3, net[0])