Neural Network Language Model (NNLM), Implemented in Keras
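
The script below trains a small next-word prediction model on the first 50 lines of Alice's Adventures in Wonderland (Project Gutenberg text #11). Each line is tokenized into word ids; every prefix of a line becomes a training input and the word that follows it becomes the target. An Embedding + LSTM network outputs a softmax distribution over the vocabulary, and after each training pass the model generates text starting from "alice's", first by sampling during training and finally by greedy (argmax) decoding.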

# Classic (pre-TF2) Keras imports; see the version note at the end of the post.
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.utils import np_utils
from keras.utils.data_utils import get_file
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

import numpy as np
np.random.seed(13)  # fix the seed so the sampled text is reproducible

# Download Alice's Adventures in Wonderland and keep only the first 50 lines.
path = get_file('alice.txt', origin='http://www.gutenberg.org/cache/epub/11/pg11.txt')
doc = open(path, encoding='utf-8').readlines()[0:50]

# Turn each line into a sequence of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(doc)
doc = tokenizer.texts_to_sequences(doc)
# Drop lines with fewer than two words: they yield no (prefix, next-word) pair.
doc = [l for l in doc if len(l) > 1]
# Total number of (prefix, next-word) training pairs in the corpus.
words_size = sum([len(words) - 1 for words in doc])

maxlen = max([len(x) - 1 for x in doc])     # longest input prefix
vocab_size = len(tokenizer.word_index) + 1  # +1 because id 0 is reserved for padding


def generate_data(X, maxlen, V):
    """Yield one (inputs, targets) batch per sentence: every prefix of the
    sentence is an input and the word that follows it is the target."""
    for sentence in X:
        inputs = []
        targets = []
        for i in range(1, len(sentence)):
            inputs.append(sentence[0:i])  # prefix sentence[0..i-1]
            targets.append(sentence[i])   # the next word to predict
        y = np_utils.to_categorical(targets, V)                          # one-hot targets
        inputs_sequence = sequence.pad_sequences(inputs, maxlen=maxlen)  # left-pad with 0
        yield (inputs_sequence, y)
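
# To make the generator concrete: a tokenized sentence [5, 2, 9] (hypothetical
# word ids, not taken from the actual corpus) yields a single batch of two
# examples, each left-padded to maxlen:
#   input [0, ..., 0, 5]    -> target one-hot(2)
#   input [0, ..., 0, 5, 2] -> target one-hot(9)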

def sample(p):
    """Draw one word id at random from the predicted distribution p."""
    # Cast to float64 before renormalizing: float32 rounding can make the
    # probabilities sum to slightly more than 1, which multinomial rejects.
    p = np.asarray(p).astype('float64')
    p /= p.sum()
    return np.where(np.random.multinomial(1, p, 1) == 1)[1][0]
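
# A common extension, not part of the original script: temperature-scaled
# sampling. Dividing the log-probabilities by a temperature before
# renormalizing makes the output more conservative (t < 1) or more
# adventurous (t > 1); sample() above corresponds to t = 1.
def sample_with_temperature(p, temperature=1.0):
    p = np.asarray(p).astype('float64')
    p = np.exp(np.log(p + 1e-12) / temperature)
    p /= p.sum()
    return np.argmax(np.random.multinomial(1, p, 1))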


# Architecture: word ids -> 128-d embeddings -> LSTM (final state only) -> softmax.
model = Sequential()
model.add(Embedding(vocab_size, 128, input_length=maxlen))  # (batch, maxlen, 128)
model.add(LSTM(128, return_sequences=False))                # (batch, 128)
model.add(Dense(vocab_size, activation='softmax'))          # (batch, vocab_size)
model.compile(loss='categorical_crossentropy', optimizer='adadelta')
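
# Reverse lookup table, added here for efficiency: the original scanned
# tokenizer.word_index on every generated word; a dict makes the id-to-word
# lookup O(1). Id 0 (padding) has no entry, so lookups for it return None.
index_word = {i: w for w, i in tokenizer.word_index.items()}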

for i in range(30):
    # One pass over the corpus, one sentence-sized batch at a time.
    for x, y in generate_data(doc, maxlen, vocab_size):
        model.train_on_batch(x, y)

    # After each pass, sample a continuation of "alice's" to monitor progress.
    in_words = "alice's"
    for _ in range(maxlen):
        in_sequence = sequence.pad_sequences(tokenizer.texts_to_sequences([in_words]), maxlen=maxlen)
        wordid = sample(model.predict(in_sequence, verbose=0)[0])
        word = index_word.get(wordid)  # skip id 0 (padding), which has no word
        if word:
            in_words += ' ' + word

    print(i, in_words)

in_words = "alice's"
for _ in range(maxlen):
    in_sequence = sequence.pad_sequences(tokenizer.texts_to_sequences([in_words]), maxlen=maxlen)
    wordid = model.predict_classes(in_sequence, verbose=0)[0]
    for k, v in tokenizer.word_index.items():
        if v == wordid:
            in_words += ' ' + k
            break

print(in_words)
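
A version note: the imports above target classic (pre-TF2) Keras. On recent TensorFlow releases, np_utils has been removed (use keras.utils.to_categorical directly), pad_sequences has moved to keras.utils, Tokenizer is deprecated in favor of the TextVectorization layer, and Sequential.predict_classes is gone, which is why the greedy step above uses np.argmax instead.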
