数据:使用丘吉尔的人物传记作为我的学习语料。
框架:keras
import numpy
from keras.models import Sequential
from keras.layers import Dense
# BUG FIX: the original line read "from keras.layers import Droupout" (typo).
# The layer class is named Dropout, and the model-building code below calls
# Dropout(0.2), so the misspelled import would fail immediately.
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
# Read the corpus (a Churchill biography) and lower-case it so the character
# vocabulary stays small.
# BUG FIX: the original assigned the file contents to `raw_test` but then
# called `raw_text.lower()`, raising a NameError. A `with` block also
# guarantees the file handle is closed.
with open("../input/Winston_Churchil.txt") as corpus_file:
    raw_text = corpus_file.read()
raw_text = raw_text.lower()
# Character-level modelling: the vocabulary is just the distinct characters
# of the text, so simple index <-> char dictionaries double as a one-hot
# encoding table.
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))
# Inspect the vocabulary (bare expression -- only displays in a notebook).
chars
Out[69]:
['\n',
' ',
'!',
'#',
'$',
'%',
'(',
')',
'*',
',',
'-',
'.',
'/',
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9',
':',
';',
'?',
'@',
'[',
']',
'_',
'a',
'b',
'c',
'd',
'e',
'f',
'g',
'h',
'i',
'j',
'k',
'l',
'm',
'n',
'o',
'p',
'q',
'r',
's',
't',
'u',
'v',
'w',
'x',
'y',
'z',
'‘',
'’',
'“',
'”',
'\ufeff']
len(chars)
Out[70]:
61
len(raw_text)
Out[71]:
276830
此处文本预测,就是,给了前置的字母以后,预测下一个字母:
比如Winsto,给出n.
Britai 给出 n
构造训练测试集
把raw text变成的可以用来训练x,y:
x是前置字母们,y是最后一个字母
# Build (window, next-char) training pairs: each sample x[i] is the integer
# encoding of a 100-character window of the corpus, and y[i] is the index of
# the single character that immediately follows that window.
seq_length = 100
x = []
y = []
for start in range(len(raw_text) - seq_length):
    window = raw_text[start:start + seq_length]
    x.append([char_to_int[ch] for ch in window])
    y.append(char_to_int[raw_text[start + seq_length]])
# Peek at the first few encoded samples.
print(x[:3])
print(y[:3])
[[60, 45, 47, 44, 39, 34, 32, 49, 1, 36, 50, 49, 34, 43, 31, 34, 47, 36, 57, 48, 1, 47, 34, 30, 41, 1, 48, 44, 41, 33, 38, 34, 47, 48, 1, 44, 35, 1, 35, 44, 47, 49, 50, 43, 34, 9, 1, 31, 54, 1, 47, 38, 32, 37, 30, 47, 33, 1, 37, 30, 47, 33, 38, 43, 36, 1, 33, 30, 51, 38, 48, 0, 0, 49, 37, 38, 48, 1, 34, 31, 44, 44, 40, 1, 38, 48, 1, 35, 44, 47, 1, 49, 37, 34, 1, 50, 48, 34, 1, 44], [45, 47, 44, 39, 34, 32, 49, 1, 36, 50, 49, 34, 43, 31, 34, 47, 36, 57, 48, 1, 47, 34, 30, 41, 1, 48, 44, 41, 33, 38, 34, 47, 48, 1, 44, 35, 1, 35, 44, 47, 49, 50, 43, 34, 9, 1, 31, 54, 1, 47, 38, 32, 37, 30, 47, 33, 1, 37, 30, 47, 33, 38, 43, 36, 1, 33, 30, 51, 38, 48, 0, 0, 49, 37, 38, 48, 1, 34, 31, 44, 44, 40, 1, 38, 48, 1, 35, 44, 47, 1, 49, 37, 34, 1, 50, 48, 34, 1, 44, 35], [47, 44, 39, 34, 32, 49, 1, 36, 50, 49, 34, 43, 31, 34, 47, 36, 57, 48, 1, 47, 34, 30, 41, 1, 48, 44, 41, 33, 38, 34, 47, 48, 1, 44, 35, 1, 35, 44, 47, 49, 50, 43, 34, 9, 1, 31, 54, 1, 47, 38, 32, 37, 30, 47, 33, 1, 37, 30, 47, 33, 38, 43, 36, 1, 33, 30, 51, 38, 48, 0, 0, 49, 37, 38, 48, 1, 34, 31, 44, 44, 40, 1, 38, 48, 1, 35, 44, 47, 1, 49, 37, 34, 1, 50, 48, 34, 1, 44, 35, 1]]
[35, 1, 30]
可得,楼上表达方式,类似一个词袋,或者说Index.
接下来要做的:
1.将已有的Input数字表达式(index),把它变成LSTM需要的数组格式:【样本数,时间步伐,特征】
2.第二,对于output,我们在word2vec学过,用one-hot做Output的预测可以给我们更好的效果,相对于直接预测一个准确的y数值。
# Convert the raw index lists into the tensors the network trains on.
n_patterns = len(x)
n_vocab = len(chars)
# LSTM input layout is [samples, time steps, features]; here each time step
# carries a single scalar feature, scaled into [0, 1) by the vocabulary size.
x = numpy.reshape(x, (n_patterns, seq_length, 1)) / float(n_vocab)
# One-hot encode the targets so a softmax output can be trained with
# categorical cross-entropy.
y = np_utils.to_categorical(y)
# Sanity-check one sample and its one-hot label.
print(x[11])
print(y[11])
#结果
[[ 0.80327869]
[ 0.55737705]
[ 0.70491803]
[ 0.50819672]
[ 0.55737705]
[ 0.7704918 ]
[ 0.59016393]
[ 0.93442623]
[ 0.78688525]
[ 0.01639344]
[ 0.7704918 ]
[ 0.55737705]
[ 0.49180328]
[ 0.67213115]
[ 0.01639344]
[ 0.78688525]
[ 0.72131148]
[ 0.67213115]
[ 0.54098361]
[ 0.62295082]
[ 0.55737705]
[ 0.7704918 ]
[ 0.78688525]
[ 0.01639344]
[ 0.72131148]
[ 0.57377049]
[ 0.01639344]
[ 0.57377049]
[ 0.72131148]
[ 0.7704918 ]
[ 0.80327869]
[ 0.81967213]
[ 0.70491803]
[ 0.55737705]
[ 0.14754098]
[ 0.01639344]
[ 0.50819672]
[ 0.8852459 ]
[ 0.01639344]
[ 0.7704918 ]
[ 0.62295082]
[ 0.52459016]
[ 0.60655738]
[ 0.49180328]
[ 0.7704918 ]
[ 0.54098361]
[ 0.01639344]
[ 0.60655738]
[ 0.49180328]
[ 0.7704918 ]
[ 0.54098361]
[ 0.62295082]
[ 0.70491803]
[ 0.59016393]
[ 0.01639344]
[ 0.54098361]
[ 0.49180328]
[ 0.83606557]
[ 0.62295082]
[ 0.78688525]
[ 0. ]
[ 0. ]
[ 0.80327869]
[ 0.60655738]
[ 0.62295082]
[ 0.78688525]
[ 0.01639344]
[ 0.55737705]
[ 0.50819672]
[ 0.72131148]
[ 0.72131148]
[ 0.6557377 ]
[ 0.01639344]
[ 0.62295082]
[ 0.78688525]
[ 0.01639344]
[ 0.57377049]
[ 0.72131148]
[ 0.7704918 ]
[ 0.01639344]
[ 0.80327869]
[ 0.60655738]
[ 0.55737705]
[ 0.01639344]
[ 0.81967213]
[ 0.78688525]
[ 0.55737705]
[ 0.01639344]
[ 0.72131148]
[ 0.57377049]
[ 0.01639344]
[ 0.49180328]
[ 0.70491803]
[ 0.8852459 ]
[ 0.72131148]
[ 0.70491803]
[ 0.55737705]
[ 0.01639344]
[ 0.49180328]
[ 0.70491803]]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
1. 0. 0. 0. 0. 0.]
模型构造
# Single-layer character-level LSTM: 128 units -> dropout -> softmax over the
# vocabulary. Input shape is (time steps, features) = (seq_length, 1).
model = Sequential()
model.add(LSTM(128, input_shape=(x.shape[1], x.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam")
# BUG FIX: `fit` has no `nv_epoch` keyword, so the original raised a
# TypeError. The correct keyword is `epochs` (Keras 2.x; `nb_epoch` in 1.x).
model.fit(x, y, epochs=10, batch_size=32)
#预测
def predict_next(input_array):
    """Return the model's softmax distribution over the next character.

    `input_array` is a length-`seq_length` sequence of character indices; it
    is reshaped to (1, seq_length, 1) and normalised exactly the same way the
    training inputs were before being fed to the model.
    """
    sample = numpy.reshape(input_array, (1, seq_length, 1)) / float(n_vocab)
    return model.predict(sample)
def string_to_index(raw_input):
    """Encode the last `seq_length` characters of `raw_input` as vocab indices."""
    tail = raw_input[len(raw_input) - seq_length:]
    return [char_to_int[ch] for ch in tail]
def y_to_char(y):
    """Map a predicted probability vector back to its most likely character."""
    return int_to_char[y.argmax()]
def generate_article(init, rounds=500):
    """Seed the model with `init` and grow it by `rounds` predicted characters.

    The seed is lower-cased to match the training corpus; each iteration
    encodes the current tail of the text, predicts the next character, and
    appends it.
    """
    text = init.lower()
    for _ in range(rounds):
        text += y_to_char(predict_next(string_to_index(text)))
    return text
# Seed text for generation; generate_article lower-cases it before encoding,
# so mixed case is fine as long as every character exists in the vocabulary.
init = 'Professor Michael S. Hart is the originator of the Project'
article = generate_article(init)
print(article)
此次对Input用的是词袋模型,我们知道词袋模型,无法顾及到上下单词/字符之间的关系,因此选用word2vec来做input,可以提升效果。