Static Word Vectors: Training Word Vectors with an FFNN

Introduction

This post covers a few fun topics:

  1. Feedforward neural networks (FFNN)
  2. Bag of words
  3. Using an FFNN to obtain word vectors

Feedforward Neural Network

What exactly is a feedforward neural network? You can look up the textbook definition (the Baidu Baike entry works), but simply put it is two linear layers with an activation function in between. A minimal structure looks like this:

import torch.nn as nn
import torch.nn.functional as F


class FFNN(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super().__init__()
        self.linear1 = nn.Linear(in_dim, hidden_dim)
        self.active_func = F.relu
        self.linear2 = nn.Linear(hidden_dim, out_dim)

    def forward(self, x):
        # linear -> ReLU -> linear
        return self.linear2(self.active_func(self.linear1(x)))

The famous Transformer also uses an FFNN as its position-wise feed-forward sublayer, so every one of these simple building blocks deserves to be taken seriously.
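For reference, the Transformer's position-wise feed-forward sublayer computes FFN(x) = max(0, xW1 + b1)W2 + b2 at every position independently. A minimal sketch follows; the class name is mine, and the sizes 512 and 2048 are the base configuration from the original paper:

import torch
import torch.nn as nn
import torch.nn.functional as F


class PositionwiseFFN(nn.Module):
    # Position-wise feed-forward sublayer: two linear layers with a ReLU in between,
    # applied to every position of the sequence independently.
    def __init__(self, d_model=512, d_ff=2048):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        # x: (batch, seq_len, d_model) -> output has the same shape
        return self.linear2(F.relu(self.linear1(x)))


x = torch.randn(2, 10, 512)        # a toy batch: 2 sequences of length 10
print(PositionwiseFFN()(x).shape)  # torch.Size([2, 10, 512])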

Bag of Words

So what is a bag of words? Roughly speaking, you pull context_size words out of the text and treat them as an unordered collection; because word order is thrown away, it is called a "bag" of words. Unigram, bigram, trigram, and general n-gram features are all typically used in this bag-of-words fashion.

Note that the famous word2vec also belongs to this family (its CBOW variant is literally the "continuous bag of words"). Keep this in mind.
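To make the "no order" part concrete, here is a tiny sketch (the two sentences are made up): a bag-of-words representation only records which words occur and how often, so reordering the words changes nothing.

from collections import Counter

# Two hypothetical sentences containing the same words in a different order.
s1 = "the cat sat on the mat".split()
s2 = "on the mat the cat sat".split()

# A bag of words keeps counts only, not positions, so both bags are equal.
print(Counter(s1) == Counter(s2))  # True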

Using an FFNN to Obtain Word Vectors

This part is not hard to follow: it is just another route to obtaining word vectors. I use two setups here. The first takes the two preceding words and predicts the current word, i.e., it uses past words to predict a future word. If you let your imagination run a little, that is essentially the causal language modeling a Transformer decoder does 😂.

1. Predicting the current word from past words
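Before the full script, a tiny sketch (with a made-up token list) of the (context, target) pairs this setup produces for context_size=2; it mirrors the window logic inside NGramDataset below.

# Hypothetical tokenized sentence with <bos>/<eos> already attached.
sentence = ["<bos>", "stocks", "rose", "on", "friday", "<eos>"]
context_size = 2

pairs = []
for i in range(context_size, len(sentence)):
    context = sentence[i - context_size:i]  # the two preceding words
    target = sentence[i]                    # the current word to predict
    pairs.append((context, target))

print(pairs[0])  # (['<bos>', 'stocks'], 'rose')
print(pairs[1])  # (['stocks', 'rose'], 'on')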

# Defined in Section 5.3.1.2

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from tqdm.auto import tqdm

from utils import BOS_TOKEN, EOS_TOKEN
from utils import load_reuters, save_pretrained, get_loader, init_weights


def cal_similar(w):
    # Rank the whole vocabulary by dot product with w's embedding and return the top-3 tokens
    v = model.embeddings.weight[vocab[w]]
    values, indices = torch.mm(model.embeddings.weight, v.view(-1, 1)).topk(dim=0, k=3)
    similar_tokens = vocab.convert_ids_to_tokens(indices.view(-1).tolist())
    return similar_tokens


def demos():
    # Print the nearest neighbours of a few probe words
    tokens = ['china', 'august', 'good', 'paris']
    for token in tokens:
        s = cal_similar(token)
        print(f'{token}: {s}')


class NGramDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            # Insert begin-of-sentence and end-of-sentence tokens
            sentence = [self.bos] + sentence + [self.eos]
            if len(sentence) < context_size:
                continue
            for i in range(context_size, len(sentence)):
                # Note: only the preceding words are used here
                # Model input: the context_size preceding words
                context = sentence[i-context_size:i]
                # Model output: the current word
                target = sentence[i]
                self.data.append((context, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        # Build batched inputs/outputs from the individual samples
        inputs = torch.tensor([ex[0] for ex in examples], dtype=torch.long)
        targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
        return (inputs, targets)


class FeedForwardNNLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(FeedForwardNNLM, self).__init__()
        # Word embedding layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Linear transformation: embedding layer -> hidden layer
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        # Linear transformation: hidden layer -> output layer
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        # ReLU activation
        self.activate = F.relu
        init_weights(self)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((inputs.shape[0], -1))
        hidden = self.activate(self.linear1(embeds))
        output = self.linear2(hidden)
        # Convert the output logits into a log probability distribution
        # so that the (negative) log-likelihood can be computed,
        # using PyTorch's log_softmax
        log_probs = F.log_softmax(output, dim=1)
        return log_probs


embedding_dim = 64
context_size = 2
hidden_dim = 128
batch_size = 1024
num_epoch = 10

# Load the text data and build the FFNNLM training set (n-grams)
corpus, vocab = load_reuters()
dataset = NGramDataset(corpus, vocab, context_size)
data_loader = get_loader(dataset, batch_size)

# Negative log-likelihood loss
nll_loss = nn.NLLLoss()
# Build the FFNNLM and move it to the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = FeedForwardNNLM(len(vocab), embedding_dim, context_size, hidden_dim)
model.to(device)
# Adam optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
total_losses = []
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")
    total_losses.append(total_loss)

demos()
# Save the word vectors (model.embeddings)
save_pretrained(vocab, model.embeddings.weight.data, "ffnnlm.vec")

2. Predicting the current word from past and future words

Doesn't this look just like CBOW?
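Here is the same made-up sentence as in the earlier sketch, but now the window takes context_size words from each side of the target, which is the CBOW-style setup used by the script below.

# Hypothetical tokenized sentence with <bos>/<eos> already attached.
sentence = ["<bos>", "stocks", "rose", "on", "friday", "<eos>"]
context_size = 2

pairs = []
for i in range(context_size, len(sentence) - context_size):
    left = sentence[i - context_size:i]           # two words before the target
    right = sentence[i + 1:i + context_size + 1]  # two words after the target
    pairs.append((left + right, sentence[i]))

print(pairs[0])  # (['<bos>', 'stocks', 'on', 'friday'], 'rose')
print(pairs[1])  # (['stocks', 'rose', 'friday', '<eos>'], 'on')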


# Defined in Section 5.3.1.2

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from tqdm.auto import tqdm

from utils import BOS_TOKEN, EOS_TOKEN
from utils import load_reuters, save_pretrained, get_loader, init_weights
from torch.optim.lr_scheduler import ExponentialLR


def cal_similar(w):
    # Rank the whole vocabulary by dot product with w's embedding and return the top-3 tokens
    v = model.embeddings.weight[vocab[w]]
    values, indices = torch.mm(model.embeddings.weight, v.view(-1, 1)).topk(dim=0, k=3)
    similar_tokens = vocab.convert_ids_to_tokens(indices.view(-1).tolist())
    return similar_tokens


def demos():
    # Print the nearest neighbours of a few probe words
    tokens = ['china', 'august', 'good', 'paris']
    for token in tokens:
        s = cal_similar(token)
        print(f'{token}: {s}')


class NGramDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2):
        self.data = []
        self.bos = vocab[BOS_TOKEN]
        self.eos = vocab[EOS_TOKEN]
        for sentence in tqdm(corpus, desc="Dataset Construction"):
            # Insert begin-of-sentence and end-of-sentence tokens
            sentence = [self.bos] + sentence + [self.eos]
            if len(sentence) < context_size:
                continue
            for i in range(context_size, len(sentence) - context_size):
                # This is the key difference:
                # Model input: context_size words on each side of the target
                left_context = sentence[i-context_size:i]
                right_context = sentence[i+1: i+context_size + 1]
                context = [*left_context, *right_context]
                # Model output: the current word
                target = sentence[i]
                self.data.append((context, target))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, i):
        return self.data[i]

    def collate_fn(self, examples):
        # Build batched inputs/outputs from the individual samples
        inputs = torch.tensor([ex[0] for ex in examples], dtype=torch.long)
        targets = torch.tensor([ex[1] for ex in examples], dtype=torch.long)
        return (inputs, targets)


class FeedForwardNNLM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(FeedForwardNNLM, self).__init__()
        # Word embedding layer
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Linear transformation: embedding layer -> hidden layer
        # (the context is twice as wide now, hence the factor of 2)
        self.linear1 = nn.Linear(context_size * embedding_dim * 2, hidden_dim)
        # Linear transformation: hidden layer -> output layer
        self.linear2 = nn.Linear(hidden_dim, vocab_size)
        # ReLU activation
        self.activate = F.relu
        # init_weights(self)
        self.dp = nn.Dropout(0.1)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((inputs.shape[0], -1))
        hidden = self.activate(self.linear1(embeds))
        output = self.linear2(hidden)
        # Convert the output logits into a log probability distribution
        # so that the (negative) log-likelihood can be computed,
        # using PyTorch's log_softmax
        # output = self.dp(output)
        log_probs = F.log_softmax(output, dim=1)
        return log_probs


embedding_dim = 64
context_size = 2
hidden_dim = 128
batch_size = 10240
num_epoch = 10

# Load the text data and build the FFNNLM training set (n-grams)
corpus, vocab = load_reuters()
dataset = NGramDataset(corpus, vocab, context_size)
data_loader = get_loader(dataset, batch_size)

# Negative log-likelihood loss
nll_loss = nn.NLLLoss()
# Build the FFNNLM and move it to the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = FeedForwardNNLM(len(vocab), embedding_dim, context_size, hidden_dim)
model.to(device)
# Adam optimizer with an exponentially decaying learning rate
optimizer = optim.Adam(model.parameters(), lr=0.01)
scheduler = ExponentialLR(optimizer, gamma=0.9)

model.train()
total_losses = []
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        inputs, targets = [x.to(device) for x in batch]
        optimizer.zero_grad()
        log_probs = model(inputs)
        loss = nll_loss(log_probs, targets)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Loss: {total_loss:.2f}, LR: {scheduler.get_last_lr()[0]}")
    scheduler.step()
    demos()
    total_losses.append(total_loss)

# Save the word vectors (model.embeddings)
save_pretrained(vocab, model.embeddings.weight.data, "ffnnlm.vec")

Summary

The two versions differ in only the following points:

  1. NGramDataset builds the context window differently: the first keeps only the context_size preceding words, while the second keeps context_size words on each side of the target.
  2. The in_features of linear1 changes to match: context_size * embedding_dim versus context_size * embedding_dim * 2.

Everything else is identical; see the small sketch below, and feel free to run both scripts yourself.
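A tiny sanity-check sketch (reusing the made-up sentence from the earlier examples) of those two differences:

embedding_dim, context_size = 64, 2
sentence = ["<bos>", "stocks", "rose", "on", "friday", "<eos>"]  # hypothetical tokens
i = 2  # position of the target word "rose"

# Difference 1: how the context window is built
past_only = sentence[i - context_size:i]
symmetric = sentence[i - context_size:i] + sentence[i + 1:i + context_size + 1]

# Difference 2: the matching in_features of linear1
print(len(past_only) * embedding_dim)  # 128 = context_size * embedding_dim
print(len(symmetric) * embedding_dim)  # 256 = context_size * embedding_dim * 2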
