【NLP】Neural Network Language Model (NNLM)


Paper

《A Neural Probabilistic Language Model》2003

Neural Network Language Model
[Figure: NNLM architecture (Bengio et al., 2003)]

y = b + Wx + U tanh(d + Hx)

where the hyperbolic tangent tanh is applied element by element, W is optionally zero (no direct connections), and x is the word features layer activation vector, the concatenation of the input word features from the matrix C:

x = (C(w_{t-1}), C(w_{t-2}), ..., C(w_{t-n+1}))
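
To make the dimensions concrete, here is a minimal sketch of that forward pass with plain tensor operations; the toy sizes V, m, h and the variable n_context are illustrative placeholders, not values from the paper. The two PyTorch implementations below wrap exactly these operations in nn.Module classes.

import torch

V, m, h, n_context = 7, 2, 2, 2   # toy sizes: vocabulary, embedding, hidden, number of context words (n-1)
C = torch.randn(V, m)             # word feature matrix C (the embedding table)
H = torch.randn(h, n_context * m) # input-to-hidden weights
d = torch.zeros(h)                # hidden bias
U = torch.randn(V, h)             # hidden-to-output weights
W = torch.randn(V, n_context * m) # optional direct input-to-output weights (zero means "no direct connections")
b = torch.zeros(V)                # output bias

context = torch.tensor([3, 5])            # indices of the n-1 context words
x = C[context].reshape(-1)                # x = (C(w_{t-1}), C(w_{t-2})): concatenation, shape (n_context*m,)
y = b + W @ x + U @ torch.tanh(d + H @ x) # y = b + Wx + U tanh(d + Hx), one score per vocabulary word
p = torch.softmax(y, dim=0)               # P(w_t | context) via softmax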

Code

【Reference: graykode/nlp-tutorial: Natural Language Processing Tutorial for Deep Learning Researchers】

【Reference: Neural Network Language Model PyTorch implementation - bilibili】
【Reference: PyTorch implementation of NNLM - mathor】

# %%
# code by Tae Hwan Jung @graykode
import torch
import torch.nn as nn
import torch.optim as optim

def make_batch():
    input_batch = []
    target_batch = []

    for sen in sentences:
        word = sen.split() # space tokenizer
        input = [word_dict[n] for n in word[:-1]] # words 1..(n-1) as input
        target = word_dict[word[-1]] # word n as target; this setup is usually called a 'causal language model'

        input_batch.append(input)
        target_batch.append(target)

    return input_batch, target_batch

# Model
class NNLM(nn.Module):
    def __init__(self):
        super(NNLM, self).__init__()
        self.C = nn.Embedding(n_class, m)
        self.H = nn.Linear(n_step * m, n_hidden, bias=False)
        self.d = nn.Parameter(torch.ones(n_hidden))
        self.U = nn.Linear(n_hidden, n_class, bias=False)
        self.W = nn.Linear(n_step * m, n_class, bias=False)
        self.b = nn.Parameter(torch.ones(n_class))

    def forward(self, X):
        # From the paper: y = b + Wx + U tanh(d + Hx)
        '''
            X: [batch_size, n_step]
        '''
        X = self.C(X) # X : [batch_size, n_step, m]  # [batch_size,seq_len,embedding_size]
        X = X.view(-1, n_step * m) # [batch_size, n_step * m] # [batch_size,seq_len*embedding_size]
        tanh = torch.tanh(self.d + self.H(X)) # [batch_size, n_hidden]
        output = self.b + self.W(X) + self.U(tanh) # [batch_size, n_class]
        return output

if __name__ == '__main__':
    n_step = 2 # number of context words, n-1 in the paper (acts as seq_len)
    n_hidden = 2 # hidden layer size, h in the paper
    m = 2 # embedding size, m in the paper

    sentences = ["i like dog", "i love coffee", "i hate milk"]
    # batch_size = 3
    word_list = " ".join(sentences).split() # 先把列表变成字符串再以空格为分隔符切分字符串,返回List[str]
    word_list = list(set(word_list)) # 去重
    word_dict = {w: i for i, w in enumerate(word_list)} # word2idx
    number_dict = {i: w for i, w in enumerate(word_list)} # idx2word
    n_class = len(word_dict)  # number of Vocabulary :n_class=7

    model = NNLM()

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    # input_batch: [batch_size, n_step], target_batch: [batch_size]
    input_batch, target_batch = make_batch() # build the training pairs
    # convert to tensors
    input_batch = torch.LongTensor(input_batch)
    target_batch = torch.LongTensor(target_batch)

    # Training
    for epoch in range(5000):
        optimizer.zero_grad()
        output = model(input_batch) # [batch_size,n_class]

        # output : [batch_size, n_class], target_batch : [batch_size]
        loss = criterion(output, target_batch)
        if (epoch + 1) % 1000 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =', '{:.6f}'.format(loss))

        loss.backward()
        optimizer.step()

    # Predict
    # predict = model(input_batch).data.max(1, keepdim=True)[1] # original one-liner
    predict = model(input_batch) # [batch_size, n_class]
    predict_index = predict.max(dim=1,        # max over each row
                                keepdim=True) # keep the dimension: [batch_size, 1]
    predict_data = predict_index[1] # indices of the max values: [batch_size, 1]

    # Test
    # print([sen.split()[:2] for sen in sentences], '->', [number_dict[n.item()] for n in predict.squeeze()]) # original one-liner
    sent = [sen.split()[:2] for sen in sentences] # first two words of each sentence, e.g. "i like dog" -> ['i', 'like']
    predict_text = [
        number_dict[n.item()]           # look the index up in idx2word
        for n in predict_data.squeeze() # [batch_size]
    ]
    print(sent, '->', predict_text)
Epoch: 1000 cost = 0.040966
Epoch: 2000 cost = 0.008105
Epoch: 3000 cost = 0.002991
Epoch: 4000 cost = 0.001298
Epoch: 5000 cost = 0.000630
[['i', 'like'], ['i', 'love'], ['i', 'hate']] -> ['dog', 'coffee', 'milk']
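
After training converges, the same model can be queried for the most likely next word of any two-word context. Below is a minimal sketch, assuming the model, word_dict and number_dict from the script above are still in scope; predict_next is my own helper name, not part of the tutorial code.

def predict_next(context_words):
    idx = torch.LongTensor([[word_dict[w] for w in context_words]])  # [1, n_step]
    with torch.no_grad():
        scores = model(idx)                   # [1, n_class] unnormalized scores
        probs = torch.softmax(scores, dim=1)  # P(w_n | context)
    top_prob, top_idx = probs.max(dim=1)
    return number_dict[top_idx.item()], top_prob.item()

print(predict_next(["i", "like"]))  # expected to print ('dog', ...) for the toy corpus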

Code 2

【Reference: ML_Study/NNLModel.py at master · pi408637535/ML_Study】

# -*- coding: utf-8 -*-
# @Time    : 2020/4/16 15:59
# @Author  : piguanghua
# @FileName: NNLModel.py
# @Software: PyCharm

# This model trains word embeddings with an NNLM.
# Implementation follows <A Neural Probabilistic Language Model>, pdf: http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf

from matplotlib import pyplot as plt
import numpy as np
import random
import torch as t
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

from collections import Counter

# hyperparameters
USE_CUDA = t.cuda.is_available()

# fix the random seeds so the results are reproducible
random.seed(53113)
np.random.seed(53113)
t.manual_seed(53113)
if USE_CUDA:
    t.cuda.manual_seed(53113)

NUM_EPOCHS = 200
BATCH_SIZE = 1  # the batch size
LEARNING_RATE = 0.2  # the initial learning rate
EMBEDDING_SIZE = 300
N_GRAM = 2
HIDDEN_UNIT = 128
UNK = "<unk>"

test_sentence = """When forty winters shall besiege thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's proud livery so gazed on now,
Will be a totter'd weed of small worth held:
Then being asked, where all thy beauty lies,
Where all the treasure of thy lusty days;
To say, within thine own deep sunken eyes,
Were an all-eating shame, and thriftless praise.
How much more praise deserv'd thy beauty's use,
If thou couldst answer 'This fair child of mine
Shall sum my count, and make my old excuse,'
Proving his beauty by succession thine!
This were to be new made when thou art old,
And see thy blood warm when thou feel'st it cold.""".split()

tokens = test_sentence
trigram = [((test_sentence[i], test_sentence[i + 1]), test_sentence[i + 2]) for i in range(len(tokens) - N_GRAM)]
words = dict(Counter(tokens).most_common()) # word -> frequency, e.g. {'a': 5, 'b': 2, 'r': 2}


def cmp(a, b):
    return (a > b) - (a < b)


words = sorted(words.keys(), key=words.get, reverse=False)  # vocabulary sorted by frequency (ascending)
words.append(UNK)  # add the UNK token as a single entry ('words += UNK' would add its characters one by one)
word2id = {k: i for i, k in enumerate(words)}
id2word = {i: k for i, k in enumerate(words)}

H = N_GRAM * EMBEDDING_SIZE  # size of the concatenated context vector (not used below; the model defines its own H)
U = HIDDEN_UNIT              # (not used below; the model defines its own U)


class MyDataset(Dataset):

    def __init__(self, word2id, id2word, tokens):
        self.word2id = word2id
        self.id2word = id2word
        self.tokens = tokens
        # self.word_encoder = [word2idx[token] for token in tokens]

    def __len__(self):
        return len(self.tokens) - N_GRAM

    def __getitem__(self, index):
        ((word_0, word_1), word_2) = trigram[index]
        word_0 = self.word2id[word_0]
        word_1 = self.word2id[word_1]
        word_2 = self.word2id[word_2]

        return word_0, word_1, word_2


class NNLM(nn.Module):
    def __init__(self, vocab, dim):
        super(NNLM, self).__init__()
        self.embed = nn.Embedding(vocab, dim)
        self.H = nn.Parameter(t.randn(EMBEDDING_SIZE * N_GRAM, HIDDEN_UNIT))
        self.d = nn.Parameter(t.randn(HIDDEN_UNIT))
        self.U = nn.Parameter(t.randn(HIDDEN_UNIT, vocab))
        self.b = nn.Parameter(t.randn(vocab))
        self.W = nn.Parameter(t.randn(EMBEDDING_SIZE * N_GRAM, vocab))

    '''
        word_0, word_1: [batch] indices of the two context words
    '''

    def forward(self, word_0, word_1):
        batch = word_0.shape[0]
        word_0 = self.embed(word_0)                 # [batch, EMBEDDING_SIZE]
        word_1 = self.embed(word_1)                 # [batch, EMBEDDING_SIZE]
        words = t.cat((word_0, word_1), dim=1)      # concatenated context x
        words = words.view(batch, -1)               # [batch, N_GRAM * EMBEDDING_SIZE]
        tanh = t.tanh(t.mm(words, self.H) + self.d) # [batch, HIDDEN_UNIT]
        y = t.mm(tanh, self.U) + self.b + t.mm(words, self.W)  # y = b + Wx + U tanh(d + Hx): [batch, vocab]
        # Return the unnormalized scores: nn.CrossEntropyLoss applies log_softmax itself,
        # and argmax over these scores picks the most likely next word in evaluate().
        return y


def evaluate(model, word_0, word_1):
    model.eval()
    word_0 = word_0.long()
    word_1 = word_1.long()

    scores = model(word_0, word_1)  # [batch, vocab] unnormalized scores
    predict = t.argmax(scores, 1)   # most likely next word for each context

    word_0 = word_0.cpu().detach().numpy()
    word_1 = word_1.cpu().detach().numpy()
    predict = predict.cpu().detach().numpy()
    word_sequence = [((id2word[word_0[i]], id2word[word_1[i]]), id2word[predict[i]]) for i in range(len(word_0))]
    print(word_sequence)
    model.train()


def train(model, dataloader, optimizer, criterion):
    model.train()
    for e in range(NUM_EPOCHS):
        for i, (word_0, word_1, word_2) in enumerate(dataloader):

            word_0 = word_0.long()
            word_1 = word_1.long()
            word_2 = word_2.long()
            if USE_CUDA:
                word_0 = word_0.cuda()
                word_1 = word_1.cuda()
                word_2 = word_2.cuda()

            optimizer.zero_grad()

            scores = model(word_0, word_1)
            loss = criterion(scores, word_2)  # CrossEntropyLoss applies log_softmax internally
            loss.backward()
            optimizer.step()

            if i % 50 == 0:
                print("epoch: {}, iter: {}, loss: {}".format(e, i, loss.item()))
                evaluate(model, word_0, word_1)

        # embedding_weights = model.input_embeddings()
        # np.save("embedding-{}".format(EMBEDDING_SIZE), embedding_weights)
        t.save(model.state_dict(), "embedding-{}.pth".format(EMBEDDING_SIZE))


if __name__ == '__main__':
    word2idx, idx2word = word2id, id2word
    dim = EMBEDDING_SIZE
    hidden = HIDDEN_UNIT

    model = NNLM(len(word2id.keys()), dim)

    for name, parameters in model.named_parameters():
        print(name, ':', parameters.size())

    model.to(t.device("cuda" if USE_CUDA else 'cpu'))

    lr = 1e-4  # note: the LEARNING_RATE constant defined above is not used here
    optimizer = t.optim.SGD(model.parameters(), lr=lr)

    dataloader = DataLoader(dataset=MyDataset(word2id, id2word, tokens), batch_size=BATCH_SIZE)
    criterion = nn.CrossEntropyLoss()
    train(model, dataloader, optimizer, criterion)

epoch: 197, iter: 0, loss: 24.425506591796875
[(('When', 'forty'), 'blood')]
epoch: 197, iter: 50, loss: 48.05567932128906
[(('within', 'thine'), 'now,')]
epoch: 197, iter: 100, loss: 1.5497195136049413e-06
[(('made', 'when'), 'thou')]
epoch: 198, iter: 0, loss: 24.238327026367188
[(('When', 'forty'), 'blood')]
epoch: 198, iter: 50, loss: 47.84667205810547
[(('within', 'thine'), 'now,')]
epoch: 198, iter: 100, loss: 1.5497195136049413e-06
[(('made', 'when'), 'thou')]
epoch: 199, iter: 0, loss: 24.051368713378906
[(('When', 'forty'), 'blood')]
epoch: 199, iter: 50, loss: 47.63882064819336
[(('within', 'thine'), 'now,')]
epoch: 199, iter: 100, loss: 1.5497195136049413e-06
[(('made', 'when'), 'thou')]

Process finished with exit code 0
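
Since the stated purpose of this script is to train word embeddings, one way to sanity-check the result is to pull the weight matrix out of the nn.Embedding layer and look at nearest neighbours by cosine similarity. A minimal sketch, assuming the trained model and the word2id/id2word dictionaries from the script above are in scope; nearest_words is my own helper name.

import torch.nn.functional as F

def nearest_words(model, word, k=5):
    # embedding table learned by the NNLM: [vocab, EMBEDDING_SIZE]
    emb = model.embed.weight.detach()
    query = emb[word2id[word]].unsqueeze(0)        # [1, EMBEDDING_SIZE]
    sims = F.cosine_similarity(query, emb, dim=1)  # cosine similarity to every word: [vocab]
    top = sims.topk(k + 1).indices.tolist()        # k+1 because the word itself ranks first
    return [id2word[i] for i in top if id2word[i] != word][:k]

print(nearest_words(model, "beauty"))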
