RNN Text Generation: Character-Level (chars)

GitHub: https://github.com/yjfiejd/text_generation_chars/blob/master/rnn_text_generation_chars.py (code & English text data)

# -*- coding:utf8 -*-
# @TIME : 2018/5/2 下午3:47
# @Author : Allen
# @File : rnn_text_generation.py

#Learning LSTM: given the preceding characters, which character comes next?

#【1】Import the usual packages
import numpy as np
import os
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils

os.chdir('/Users/a1/Desktop/算法实战/kaggel_06/Char_rnn/')

#【2】Read the text. Everything here operates on characters: e.g. given "Winsto", the next character should be predicted as "n", completing "Winston"
raw_text = open('Project_Gutenberg_Complete_Works_of_Winston_Churchill的副本.txt').read()
raw_text = raw_text.lower() #lowercase the whole English text
#Build the character vocabulary (26 letters plus punctuation) for One-Hot encoding
chars = sorted(list(set(raw_text))) #set() keeps the unique characters -> list() turns them into a list -> sorted() orders them
print(len(raw_text))
print(len(chars))
print(chars)
char_to_int = dict((c, i) for i, c in enumerate(chars))
int_to_char = dict((i, c) for i, c in enumerate(chars))
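
#Quick sanity check (an illustrative addition, not in the original script):
#encode a string with char_to_int and decode it back with int_to_char.
sample = 'winston'
encoded = [char_to_int[c] for c in sample]
decoded = ''.join(int_to_char[i] for i in encoded)
print(encoded) # [49, 35, 40, 45, 46, 41, 40] given this corpus's vocabulary (see the chars printout below)
print(decoded) # 'winston'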

#【3】Build the training samples: x holds the preceding characters, y the character that follows
seq_length = 100
x = []
y = []
for i in range(0, len(raw_text) - seq_length):
    #take the window of characters [i, i+seq_length), i.e. [0, 100) on the first pass
    given = raw_text[i:i+seq_length]
    #take the character right after the window, index [i+seq_length] ([100] on the first pass)
    predict = raw_text[i+seq_length]
    #map each of the 100 characters in the window to its index in chars (53 distinct characters here)
    x.append([char_to_int[char] for char in given])
    #map the target character to its index in chars as well
    y.append(char_to_int[predict])

print(x[:2]) #first two samples, each a list of 100 character indices
print(y[:2]) #the corresponding two target indices

#Output:
# 1044648
# 53
# ['\n', ' ', '!', '"', '#', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', '[', ']', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
# [[46, 34, 31, 1, 42, 44, 41, 36, 31, 29, 46, 1, 33, 47, 46, 31, 40, 28, 31, 44, 33, 1, 31, 28, 41, 41, 37, 1, 41, 32, 1, 46, 34, 31, 1, 29, 41, 39, 42, 38, 31, 46, 31, 1, 42, 33, 1, 31, 30, 35, 46, 35, 41, 40, 1, 41, 32, 1, 46, 34, 31, 1, 49, 41, 44, 37, 45, 1, 41, 32, 0, 49, 35, 40, 45, 46, 41, 40, 1, 29, 34, 47, 44, 29, 34, 35, 38, 38, 9, 1, 28, 51, 1, 49, 35, 40, 45, 46, 41, 40], [34, 31, 1, 42, 44, 41, 36, 31, 29, 46, 1, 33, 47, 46, 31, 40, 28, 31, 44, 33, 1, 31, 28, 41, 41, 37, 1, 41, 32, 1, 46, 34, 31, 1, 29, 41, 39, 42, 38, 31, 46, 31, 1, 42, 33, 1, 31, 30, 35, 46, 35, 41, 40, 1, 41, 32, 1, 46, 34, 31, 1, 49, 41, 44, 37, 45, 1, 41, 32, 0, 49, 35, 40, 45, 46, 41, 40, 1, 29, 34, 47, 44, 29, 34, 35, 38, 38, 9, 1, 28, 51, 1, 49, 35, 40, 45, 46, 41, 40, 1]]
# [1, 29]
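
#To make the sliding window above concrete, here is the same construction on
#a toy string (an illustrative sketch, not part of the original script):
toy = 'winston'
toy_seq = 3
for i in range(0, len(toy) - toy_seq):
    print(toy[i:i+toy_seq], '->', toy[i+toy_seq])
#prints: win -> s, ins -> t, nst -> o, sto -> n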

#These integer indices work like a bag-of-words style vocabulary lookup
#【4】Turn the indices into the array format LSTM needs: [samples, time steps, features]. For the output, as we learned with word2vec, One-Hot targets give better prediction results
#number of samples in x
n_patterns = len(x) #this becomes the number of rows below
n_vocab = len(chars)

#Reshape x into the form LSTM expects: from a flat list of lists to a 3-D array
#Reference: https://docs.scipy.org/doc/numpy/reference/generated/numpy.reshape.html
x = np.reshape(x, (n_patterns, seq_length, 1)) #n_patterns rows, seq_length time steps, one feature per step
#Rough normalization: scale every value into the (0, 1) range
x = x/float(n_vocab)
#One-Hot encode the output
y = np_utils.to_categorical(y)

print(x[3]) #sample 3: the indices of its 100 characters, now normalized into (0, 1)
print(len(x[3])) #100 entries
print(y[3]) #the One-Hot target: a 1 at the index of the character that follows the window
print(len(y[3])) #a 53-dimensional vector with a 1 only at the correct character

#Output:
# [[0.01886792]
#  [0.79245283]
#  [0.83018868]
#  [0.77358491]
#  ...           (94 values omitted)
#  [0.54716981]
#  [0.64150943]]
# 100
# [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
#  0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.
#  0. 0. 0. 0. 0.]
# 53
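
#Shape sanity check (an added sketch; it assumes every character occurs at
#least once as a target, so to_categorical produces n_vocab columns):
assert x.shape == (n_patterns, seq_length, 1) # [samples, time steps, features]
assert y.shape == (n_patterns, n_vocab)       # one 53-dim One-Hot row per sample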



#【5】Build the LSTM model
model = Sequential()
model.add(LSTM(128, input_shape=(x.shape[1], x.shape[2])))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam')
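
#ModelCheckpoint is imported above but never used. A minimal sketch of how it
#could save the best weights during training (the filename pattern is an
#assumption, not from the original post):
# checkpoint = ModelCheckpoint('weights-{epoch:02d}-{loss:.4f}.hdf5',
#                              monitor='loss', save_best_only=True, mode='min')
# ...then pass callbacks=[checkpoint] to model.fit below.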

#【6】Train the model
model.fit(x, y, epochs=10, batch_size=1000)

#Output (training log, truncated):
# 1000/1044548 [..............................] - ETA: 3:45:53 - loss: 3.9726
# 2000/1044548 [..............................] - ETA: 2:34:03 - loss: 3.9644
# 3000/1044548 [..............................] - ETA: 2:08:07 - loss: 3.9578
# ...
# 32000/1044548 [..............................] - ETA: 1:24:10 - loss: 3.3925
# 33000/1044548 [..............................] - ETA: 1:23:39 - loss: 3.3826

#【7】Generate text with the trained model
def predict_next(input_array):
    # Reshape a list of seq_length indices to (1, seq_length, 1), scale it the
    # same way as the training data, and return the softmax distribution.
    x = np.reshape(input_array, (1, seq_length, 1))
    x = x / float(n_vocab)
    y = model.predict(x)
    return y

def string_to_index(raw_input):
    # Map the last seq_length characters of the input string to their indices.
    res = []
    for c in raw_input[len(raw_input) - seq_length:]:
        res.append(char_to_int[c])
    return res

def y_to_char(y):
    # Pick the character with the highest predicted probability.
    largest_index = y.argmax()
    c = int_to_char[largest_index]
    return c

def generate_article(init, rounds=500):
    in_string = init.lower()
    for i in range(rounds):
        # predict the next character, append it, and slide the window forward
        n = y_to_char(predict_next(string_to_index(in_string)))
        in_string += n
    return in_string

init = 'Professor Michael S. Hart is the originator of the Project'
article = generate_article(init)
print(article)
# In the end the machine ran too hot, so the run never finished
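
#Note (an added observation, not from the original post): predict_next reshapes
#its input to (1, seq_length, 1), so the seed passed to generate_article must
#be at least seq_length = 100 characters long, while the init above is shorter.
#A simple hedged workaround is to left-pad with spaces (' ' is in the vocabulary):
# article = generate_article(init.rjust(seq_length))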

