# GitHub: https://github.com/yjfiejd/text_generation_chars/blob/master/rnn_text_generation_chars.py (code & English text data)
# -*- coding:utf8 -*-
# @TIME : 2018/5/2 下午3:47
# @Author : Allen
# @File : rnn_text_generation.py
# Learn with an LSTM: given the preceding characters, which character comes next?
#【1】导入常用的包
import numpy as np
import os
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
# NOTE: hard-coded working directory; the corpus file below is opened
# relative to it.
os.chdir('/Users/a1/Desktop/算法实战/kaggel_06/Char_rnn/')

# [2] Read the text. Everything here works on single characters: e.g. given
# "Winsto", the next character to predict is "n", completing "Winston".
# The context manager closes the file handle as soon as the text is read
# (a bare open(...).read() leaves the handle open until garbage collection).
with open('Project_Gutenberg_Complete_Works_of_Winston_Churchill的副本.txt') as corpus_file:
    raw_text = corpus_file.read()
raw_text = raw_text.lower()  # convert the whole text to lower case

# Vocabulary: set() collects the distinct characters, sorted() returns them
# as an ordered list.
chars = sorted(set(raw_text))
print(len(raw_text))
print(len(chars))
print(chars)

# Lookup tables in both directions between a character and its position in
# the sorted vocabulary.
char_to_int = {c: i for i, c in enumerate(chars)}
int_to_char = {i: c for i, c in enumerate(chars)}
# [3] Build the training samples: each x is a window of seq_length
# consecutive characters and its y is the character right after the window.
seq_length = 100
x = []
y = []
for i in range(0, len(raw_text) - seq_length):
    # Input window: characters [i, i + seq_length).
    given = raw_text[i:i + seq_length]
    # Target: the single character that immediately follows the window.
    predict = raw_text[i + seq_length]
    # Encode the window and the target as vocabulary indices.
    x.append([char_to_int[char] for char in given])
    y.append(char_to_int[predict])
print(x[:2])  # first two encoded windows, seq_length indices each
print(y[:2])  # the two matching target indices
#输出结果如下:
# 1044648
# 53
# ['\n', ' ', '!', '"', '#', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
# [[46, 34, 31, 1, 42, 44, 41, 36, 31, 29, 46, 1, 33, 47, 46, 31, 40, 28, 31, 44, 33, 1, 31, 28, 41, 41, 37, 1, 41, 32, 1, 46, 34, 31, 1, 29, 41, 39, 42, 38, 31, 46, 31, 1, 42, 33, 1, 31, 30, 35, 46, 35, 41, 40, 1, 41, 32, 1, 46, 34, 31, 1, 49, 41, 44, 37, 45, 1, 41, 32, 0, 49, 35, 40, 45, 46, 41, 40, 1, 29, 34, 47, 44, 29, 34, 35, 38, 38, 9, 1, 28, 51, 1, 49, 35, 40, 45, 46, 41, 40], [34, 31, 1, 42, 44, 41, 36, 31, 29, 46, 1, 33, 47, 46, 31, 40, 28, 31, 44, 33, 1, 31, 28, 41, 41, 37, 1, 41, 32, 1, 46, 34, 31, 1, 29, 41, 39, 42, 38, 31, 46, 31, 1, 42, 33, 1, 31, 30, 35, 46, 35, 41, 40, 1, 41, 32, 1, 46, 34, 31, 1, 49, 41, 44, 37, 45, 1, 41, 32, 0, 49, 35, 40, 45, 46, 41, 40, 1, 29, 34, 47, 44, 29, 34, 35, 38, 38, 9, 1, 28, 51, 1, 49, 35, 40, 45, 46, 41, 40, 1]]
# [1, 29]
#这个类似词袋 index
# [4] Convert the index lists into the array layout the LSTM layer is given
# below: [samples, time steps, features]. The targets become one-hot vectors.
n_patterns = len(x)   # number of training windows (rows)
n_vocab = len(chars)  # number of distinct characters

# Reshape to (n_patterns, seq_length, 1) -- one feature per time step -- and
# apply a crude normalisation: dividing each index by the vocabulary size
# maps it into [0, 1).
# Reference: https://docs.scipy.org/doc/numpy/reference/generated/numpy.reshape.html
x = np.reshape(x, (n_patterns, seq_length, 1)) / float(n_vocab)

# One-hot encode the target indices.
y = np_utils.to_categorical(y)

print(x[3])       # window number 3: seq_length scaled indices
print(len(x[3]))  # seq_length entries
print(y[3])       # one-hot target for window number 3
print(len(y[3]))  # width of the one-hot vector (53 in the recorded run below)
#输出如下:
# [[0.01886792]
# [0.79245283]
# [0.83018868]
# [0.77358491]
# [0.67924528]
# [0.58490566]
# [0.54716981]
# [0.86792453]
# [0.01886792]
# [0.62264151]
# [0.88679245]
# [0.86792453]
# [0.58490566]
# [0.75471698]
# [0.52830189]
# [0.58490566]
# [0.83018868]
# [0.62264151]
# [0.01886792]
# [0.58490566]
# [0.52830189]
# [0.77358491]
# [0.77358491]
# [0.69811321]
# [0.01886792]
# [0.77358491]
# [0.60377358]
# [0.01886792]
# [0.86792453]
# [0.64150943]
# [0.58490566]
# [0.01886792]
# [0.54716981]
# [0.77358491]
# [0.73584906]
# [0.79245283]
# [0.71698113]
# [0.58490566]
# [0.86792453]
# [0.58490566]
# [0.01886792]
# [0.79245283]
# [0.62264151]
# [0.01886792]
# [0.58490566]
# [0.56603774]
# [0.66037736]
# [0.86792453]
# [0.66037736]
# [0.77358491]
# [0.75471698]
# [0.01886792]
# [0.77358491]
# [0.60377358]
# [0.01886792]
# [0.86792453]
# [0.64150943]
# [0.58490566]
# [0.01886792]
# [0.9245283 ]
# [0.77358491]
# [0.83018868]
# [0.69811321]
# [0.8490566 ]
# [0.01886792]
# [0.77358491]
# [0.60377358]
# [0. ]
# [0.9245283 ]
# [0.66037736]
# [0.75471698]
# [0.8490566 ]
# [0.86792453]
# [0.77358491]
# [0.75471698]
# [0.01886792]
# [0.54716981]
# [0.64150943]
# [0.88679245]
# [0.83018868]
# [0.54716981]
# [0.64150943]
# [0.66037736]
# [0.71698113]
# [0.71698113]
# [0.16981132]
# [0.01886792]
# [0.52830189]
# [0.96226415]
# [0.01886792]
# [0.9245283 ]
# [0.66037736]
# [0.75471698]
# [0.8490566 ]
# [0.86792453]
# [0.77358491]
# [0.75471698]
# [0.01886792]
# [0.54716981]
# [0.64150943]]
# 100
# [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
# 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
# 0. 0. 0. 0. 0.]
# 53
# [5] Model: one 128-unit LSTM layer fed with (time steps, features) windows,
# dropout of 0.2, then a softmax layer with one output per one-hot class.
model = Sequential()
model.add(LSTM(128, input_shape=(x.shape[1], x.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')

# [6] Train. `epochs` is the Keras 2 name for the number of passes over the
# data; the Keras 1 spelling `nb_epoch` is deprecated and rejected by later
# releases.
model.fit(x, y, epochs=10, batch_size=1000)
#输出结果:
# 1000 / 1044548[..............................] - ETA: 3:45: 53 - loss: 3.9726
# 2000 / 1044548[..............................] - ETA: 2:34: 03 - loss: 3.9644
# 3000 / 1044548[..............................] - ETA: 2:08: 07 - loss: 3.9578
# 4000 / 1044548[..............................] - ETA: 1:55: 24 - loss: 3.9505
# 5000 / 1044548[..............................] - ETA: 1:45: 25 - loss: 3.9414
# 6000 / 1044548[..............................] - ETA: 1:50: 52 - loss: 3.9316
# 7000 / 1044548[..............................] - ETA: 1:44: 39 - loss: 3.9204
# 8000 / 1044548[..............................] - ETA: 1:40: 36 - loss: 3.9056
# 9000 / 1044548[..............................] - ETA: 1:35: 44 - loss: 3.8842
# 10000 / 1044548[..............................] - ETA: 1:32: 24 - loss: 3.8536
# 11000 / 1044548[..............................] - ETA: 1:35: 12 - loss: 3.8133
# 12000 / 1044548[..............................] - ETA: 1:33: 55 - loss: 3.7732
# 13000 / 1044548[..............................] - ETA: 1:31: 54 - loss: 3.7351
# 14000 / 1044548[..............................] - ETA: 1:29: 28 - loss: 3.7053
# 15000 / 1044548[..............................] - ETA: 1:27: 36 - loss: 3.6780
# 16000 / 1044548[..............................] - ETA: 1:30: 23 - loss: 3.6490
# 17000 / 1044548[..............................] - ETA: 1:29: 20 - loss: 3.6211
# 18000 / 1044548[..............................] - ETA: 1:28: 10 - loss: 3.5986
# 19000 / 1044548[..............................] - ETA: 1:26: 21 - loss: 3.5790
# 20000 / 1044548[..............................] - ETA: 1:26: 14 - loss: 3.5587
# 21000 / 1044548[..............................] - ETA: 1:29: 05 - loss: 3.5396
# 22000 / 1044548[..............................] - ETA: 1:28: 21 - loss: 3.5225
# 23000 / 1044548[..............................] - ETA: 1:27: 41 - loss: 3.5067
# 24000 / 1044548[..............................] - ETA: 1:26: 03 - loss: 3.4917
# 25000 / 1044548[..............................] - ETA: 1:25: 22 - loss: 3.4761
# 26000 / 1044548[..............................] - ETA: 1:26: 38 - loss: 3.4615
# 27000 / 1044548[..............................] - ETA: 1:26: 02 - loss: 3.4492
# 28000 / 1044548[..............................] - ETA: 1:25: 17 - loss: 3.4358
# 29000 / 1044548[..............................] - ETA: 1:24: 10 - loss: 3.4230
# 30000 / 1044548[..............................] - ETA: 1:23: 26 - loss: 3.4129
# 31000 / 1044548[..............................] - ETA: 1:24: 38 - loss: 3.4014
# 32000 / 1044548[..............................] - ETA: 1:24: 10 - loss: 3.3925
# 33000 / 1044548[..............................] - ETA: 1:23: 39 - loss: 3.3826
#【7】训练效果
def predict_next(input_array):
    """Run the model on one encoded window and return its raw prediction.

    ``input_array`` must hold exactly ``seq_length`` vocabulary indices. It
    is reshaped to a single-sample batch of shape (1, seq_length, 1) and
    divided by ``n_vocab``, the same scaling applied to the training data.
    The result of ``model.predict`` is returned unchanged.
    """
    batch = np.reshape(input_array, (1, seq_length, 1)) / float(n_vocab)
    return model.predict(batch)
def string_to_index(raw_input, length=None, mapping=None, pad_char=' '):
    """Encode the tail of ``raw_input`` as a fixed-length list of indices.

    The last ``length`` characters of ``raw_input`` are looked up in
    ``mapping``. When ``raw_input`` is shorter than ``length`` the window is
    left-padded with ``pad_char`` so the result always has exactly ``length``
    entries, which is what ``predict_next`` needs for its reshape.

    Args:
        raw_input: Text whose trailing characters form the window.
        length: Window size; defaults to the module-level ``seq_length``.
        mapping: Character-to-index dict; defaults to the module-level
            ``char_to_int``.
        pad_char: Single character used for left-padding short input; it
            must be a key of ``mapping``.

    Returns:
        A list of ``length`` integer indices.

    Raises:
        KeyError: If a character of the window (or ``pad_char``, when
            padding is needed) is not a key of ``mapping``.
    """
    if length is None:
        length = seq_length
    if mapping is None:
        mapping = char_to_int
    if length <= 0:
        # raw_input[-0:] would be the whole string, so handle this explicitly.
        return []
    # Slice the *input* (not the training corpus) and pad on the left so the
    # most recent characters stay at the end of the window.
    window = raw_input[-length:].rjust(length, pad_char)
    return [mapping[c] for c in window]
def y_to_char(y):
    """Return the character whose index has the largest value in ``y``.

    ``y.argmax()`` gives the position of the maximum over the flattened
    array; that position is looked up in ``int_to_char``.
    """
    return int_to_char[y.argmax()]
def generate_article(init, rounds=500):
    """Extend a seed text one predicted character at a time.

    The seed is lower-cased, then for ``rounds`` iterations the current text
    is encoded with ``string_to_index``, passed to ``predict_next``, and the
    character chosen by ``y_to_char`` is appended.

    Returns the lower-cased seed followed by the ``rounds`` generated
    characters.
    """
    text = init.lower()
    for _ in range(rounds):
        window = string_to_index(text)
        prediction = predict_next(window)
        text += y_to_char(prediction)
    return text
# Seed text for generation; generate_article lower-cases it before use.
# NOTE(review): the seed is 58 characters, shorter than seq_length (100),
# while predict_next reshapes its input to (1, seq_length, 1) -- confirm that
# string_to_index supplies a full seq_length window for short seeds.
init = 'Professor Michael S. Hart is the originator of the Project'
article = generate_article(init)
print(article)
# In the end the machine got too hot, so no generation results were recorded.