数据:使用丘吉尔的人物传记作为我的学习语料。
框架:keras
import numpy
from keras.models import Sequential
from keras.layers import Dense
# BUG FIX: the original line read "from keras.layers import Droupout" (typo).
# The layer class is named Dropout, and the model-building code below calls
# Dropout(0.2), so the misspelled import would fail immediately.
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
# Read the corpus (a Churchill biography) and lower-case it so the character
# vocabulary stays small.
# BUG FIX: the original assigned the file contents to `raw_test` but then
# called `raw_text.lower()`, raising a NameError. A `with` block also
# guarantees the file handle is closed.
with open("../input/Winston_Churchil.txt") as corpus_file:
    raw_text = corpus_file.read()
raw_text = raw_text.lower()
# Character-level modelling: the vocabulary is just the distinct characters
# of the text, so simple index <-> char dictionaries double as a one-hot
# encoding table.
chars = sorted(list(set(raw_text)))
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))
# Inspect the vocabulary (bare expression -- only displays in a notebook).
chars
Out[69]:
['\n',
' ',
'!',
'#',
'$',
'%',
'(',
')',
'*',
',',
'-',
'.',
'/',
'0',
'1',
'2',
'3',
'4',
'5',
'6',
'7',
'8',
'9',
':',
';',
'?',
'@',
'[',
']',
'_',
'a',
'b',
'c',
'd',
'e',
'f',
'g',
'h',
'i',
'j',
'k',
'l',
'm',
'n',
'o',
'p',
'q',
'r',
's',
't',
'u',
'v',
'w',
'x',
'y',
'z',
'‘',
'’',
'“',
'”',
'\ufeff']
len(chars)
Out[70]:
61
len(raw_text)
Out[71]:
276830
此处文本预测,就是,给了前置的字母以后,预测下一个字母:
比如Winsto,给出n.
Britai 给出 n
构造训练测试集
把raw text变成的可以用来训练x,y:
x是前置字母们,y是最后一个字母
# Build (window, next-char) training pairs: each sample x[i] is the integer
# encoding of a 100-character window of the corpus, and y[i] is the index of
# the single character that immediately follows that window.
seq_length = 100
x = []
y = []
for start in range(len(raw_text) - seq_length):
    window = raw_text[start:start + seq_length]
    x.append([char_to_int[ch] for ch in window])
    y.append(char_to_int[raw_text[start + seq_length]])
# Peek at the first few encoded samples.
print(x[:3])
print(y[:3])
[[60, 45, 47, 44, 39, 34, 32, 49, 1, 36, 50, 49, 34, 43, 31, 34, 47, 36, 57, 48, 1, 47, 34, 30, 41, 1, 48, 44, 41, 33, 38, 34, 47, 48, 1, 44, 35, 1, 35, 44, 47, 49, 50, 43, 34, 9, 1, 31, 54, 1, 47, 38, 32, 37, 30, 47, 33, 1, 37, 30, 47, 33, 38, 43, 36, 1, 33, 30, 51, 38, 48, 0, 0, 49, 37, 38, 48, 1, 34, 31, 44, 44, 40, 1, 38, 48, 1, 35, 44, 47, 1, 49, 37, 34, 1, 50, 48, 34, 1, 44], [45, 47, 44, 39, 34, 32, 49, 1, 36, 50, 49, 34, 43, 31, 34, 47, 36, 57, 48, 1, 47, 34, 30, 41, 1, 48, 44, 41, 33, 38, 34, 47, 48, 1, 44, 35, 1, 35, 44, 47, 49, 50, 43, 34, 9, 1, 31, 54, 1, 47, 38, 32, 37, 30, 47, 33, 1, 37, 30, 47, 33, 38, 43, 36, 1, 33, 30, 51, 38, 48, 0, 0, 49, 37, 38, 48, 1, 34, 31, 44, 44, 40, 1, 38, 48, 1, 35, 44, 47, 1, 49, 37, 34, 1, 50, 48, 34, 1, 44, 35], [47, 44, 39, 34, 32, 49, 1, 36, 50, 49, 34, 43, 31, 34, 47, 36, 57, 48, 1, 47, 34, 30, 41, 1, 48, 44, 41, 33, 38, 34, 47, 48, 1, 44, 35, 1, 35, 44, 47, 49, 50, 43, 34, 9, 1, 31, 54, 1, 47, 38, 32, 37, 30, 47, 33, 1, 37, 30, 47, 33, 38, 43, 36, 1, 33, 30, 51, 38, 48, 0, 0, 49, 37, 38, 48, 1, 34, 31, 44, 44, 40, 1, 38, 48, 1, 35, 44, 47, 1, 49, 37, 34, 1, 50, 48, 34, 1, 44, 35, 1]]
[35, 1, 30]
可得,楼上表达方式,类似一个词袋,或者说Index.
接下来要做的:
1.将已有的Input数字表达式(index),把它变成LSTM需要的数组格式:【样本数,时间步伐,特征】
2.第二,对于output,我们在word2vec学过,用one-hot做Output的预测可以给我们更好的效果,相对于直接预测一个准确的y数值。
# Convert the raw index lists into the tensors the network trains on.
n_patterns = len(x)
n_vocab = len(chars)
# LSTM input layout is [samples, time steps, features]; here each time step
# carries a single scalar feature, scaled into [0, 1) by the vocabulary size.
x = numpy.reshape(x, (n_patterns, seq_length, 1)) / float(n_vocab)
# One-hot encode the targets so a softmax output can be trained with
# categorical cross-entropy.
y = np_utils.to_categorical(y)
# Sanity-check one sample and its one-hot label.
print(x[11])
print(y[11])
#结果
[[ 0.80327869]
[ 0.55737705]
[ 0.70491803]
[ 0.50819672]
[ 0.55737705]
[ 0.7704918 ]
[ 0.59016393]
[ 0.93442623]
[ 0.78688525]
[ 0.01639344]
[ 0.7704918 ]
[ 0.55737705]
[ 0.49180328]
[ 0.67213115]
[ 0.01639344]
[ 0.78688525]
[ 0.72131148]
[ 0.67213115]
[ 0.54098361]
[ 0.62295082]
[ 0.55737705]
[ 0.7704918 ]
[ 0.78688525]
[ 0.01639344]
[ 0.72131148]
[ 0.57377049]
[ 0.01639344]
[ 0.57377049]
[ 0.72131148]
[ 0.7704918 ]
[ 0.80327869]
[ 0.81967213]
[ 0.70491803]
[ 0.55737705]
[ 0.14754098]
[ 0.01639344]
[ 0.50819672]
[ 0.8852459 ]
[ 0.01639344]
[ 0.7704918 ]
[ 0.62295082]
[ 0.52459016]
[ 0.60655738]
[ 0.49180328]
[ 0.7704918 ]
[ 0.54098361]
[ 0.01639344]
[ 0.60655738]
[ 0.49180328]
[ 0.7704918 ]
[ 0.54098361]
[ 0.62295082]
[ 0.70491803]
[ 0.59016393]
[ 0.01639344]
[ 0.54098361]
[ 0.49180328]
[ 0.83606557]
[ 0.62295082]
[ 0.78688525]
[ 0. ]
[ 0. ]
[ 0.80327869]
[ 0.60655738]
[ 0.62295082]
[ 0.78688525]
[ 0.01639344]
[ 0.55737705]
[ 0.50819672]
[ 0.72131148]
[ 0.72131148]
[ 0.6557377 ]
[ 0.01639344]
[ 0.62295082]
[ 0.78688525]
[ 0.01639344]
[ 0.57377049]
[ 0.72131148]
[ 0.7704918 ]
[ 0.01639344]
[ 0.80327869]
[ 0.60655738]
[ 0.55737705]
[ 0.01639344]
[ 0.81967213]
[ 0.78688525]
[ 0.55737705]
[ 0.01639344]
[ 0.72131148]
[ 0.57377049]
[ 0.01639344]
[ 0.49180328]
[ 0.70491803]
[ 0.8852459 ]
[ 0.72131148]
[ 0.70491803]
[ 0.55737705]
[ 0.01639344]
[ 0.49180328]
[ 0.70491803]]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
1. 0. 0. 0. 0. 0.]
模型构造
# Single-layer character-level LSTM: 128 units -> dropout -> softmax over the
# vocabulary. Input shape is (time steps, features) = (seq_length, 1).
model = Sequential()
model.add(LSTM(128, input_shape=(x.shape[1], x.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation="softmax"))
model.compile(loss="categorical_crossentropy", optimizer="adam")
# BUG FIX: `fit` has no `nv_epoch` keyword, so the original raised a
# TypeError. The correct keyword is `epochs` (Keras 2.x; `nb_epoch` in 1.x).
model.fit(x, y, epochs=10, batch_size=32)
#预测
def predict_next(input_array):
    """Return the model's softmax distribution over the next character.

    `input_array` is a length-`seq_length` sequence of character indices; it
    is reshaped to (1, seq_length, 1) and normalised exactly the same way the
    training inputs were before being fed to the model.
    """
    sample = numpy.reshape(input_array, (1, seq_length, 1)) / float(n_vocab)
    return model.predict(sample)
def string_to_index(raw_input):
    """Encode the last `seq_length` characters of `raw_input` as vocab indices."""
    tail = raw_input[len(raw_input) - seq_length:]
    return [char_to_int[ch] for ch in tail]
def y_to_char(y):
    """Map a predicted probability vector back to its most likely character."""
    return int_to_char[y.argmax()]
def generate_article(init, rounds=500):
    """Seed the model with `init` and grow it by `rounds` predicted characters.

    The seed is lower-cased to match the training corpus; each iteration
    encodes the current tail of the text, predicts the next character, and
    appends it.
    """
    text = init.lower()
    for _ in range(rounds):
        text += y_to_char(predict_next(string_to_index(text)))
    return text
# Seed text for generation; generate_article lower-cases it before encoding,
# so mixed case is fine as long as every character exists in the vocabulary.
init = 'Professor Michael S. Hart is the originator of the Project'
article = generate_article(init)
print(article)
此次对Input用的是词袋模型,我们知道词袋模型,无法顾及到上下单词/字符之间的关系,因此选用word2vec来做input,可以提升效果。