读入数据
# Load the cleaned poem corpus with pandas (one poem per row,
# each row formatted as "title:body").
import pandas as pd

poems_text = pd.read_table('./poems_clean.txt', header=None, names=["text"])
# Peek at the first rows to confirm the "title:body" layout.
poems_text.head()
结果如下图:
import string  # string helpers
import numpy as np

# Strip the title and all whitespace from every "title:body" line and
# prefix the body with 'bbb' — three sentinel characters that give the
# 3-gram model context for predicting the first real character.
poems_new = []
for entry in poems_text['text']:
    _title, body = entry.split(':')
    poems_new.append(list('bbb' + body.replace(' ', '')))
数据整理
# Slide a length-4 window over every poem: three context characters
# (X1..X3, the first window starting on the 'bbb' sentinel) plus the
# character to predict (Y).
XY = []
for chars in poems_new:
    XY.extend(
        [chars[j], chars[j + 1], chars[j + 2], chars[j + 3]]
        for j in range(len(chars) - 3)
    )
# Show how one raw poem maps onto sliding-window training rows.
print("原始诗句:")
print(poems_text['text'][3864])
print("\n")
print("训练数据:")
print(["X1", "X2", "X3", "Y"])
for row in XY[132763:132773]:
    print(row)
结果如下图:
数据整理:文字编码
由于文字是非结构化数据,不能在计算机中直接分析,因此对文字进行编码处理
from keras.preprocessing.text import Tokenizer

# Map every distinct character to an integer id. The Tokenizer reserves
# index 0 (real tokens start at 1), while the classifier treats 0 as the
# first class — so the output layer needs len(word_index) + 1 classes
# for the ids and class indices to line up.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(poems_new)
vocab_size = len(tokenizer.word_index) + 1

print(tokenizer.word_index)
结果如下图:
# Encode every [X1, X2, X3, Y] character row into integer ids, then
# split context columns from the target column.
XY_digit = np.array(tokenizer.texts_to_sequences(XY))
X_digit, Y_digit = XY_digit[:, :3], XY_digit[:, 3]

# Side-by-side view: characters, their ids, and the target id.
for j in range(132763, 132773):
    print("{:<35}".format(str(XY[j])), "\t", "{:<30}".format(str(list(X_digit[j]))), "\t", Y_digit[j])
# Embedding + linear (softmax) model: three character ids in, a
# probability distribution over the whole vocabulary out.
# NOTE: the original imported Input and Embedding twice (two separate
# `from keras.layers import ...` lines); deduplicated into one import.
from keras.models import Sequential, load_model, Model
from keras.layers import Input, Dense, Activation, Embedding, Flatten

hidden_size = 256  # embedding dimension

inp = Input(shape=(3,))                       # three context-character ids
x = Embedding(vocab_size, hidden_size)(inp)   # (3,) -> (3, hidden_size)
x = Flatten()(x)                              # -> (3 * hidden_size,)
x = Dense(vocab_size)(x)                      # logits over the vocabulary
pred = Activation('softmax')(x)
model = Model(inp, pred)  # assemble the functional model
model.summary()

print(vocab_size)   # number of distinct character classes
print(hidden_size)  # embedding width; Embedding alone costs vocab_size * hidden_size params
from sklearn.model_selection import train_test_split
# Hold out 20% of the windows for validation; fixed seed for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X_digit,Y_digit,test_size=0.2, random_state=0)
from keras.optimizers import Adam
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam(lr=0.001))
# sparse_categorical_crossentropy expects INTEGER class labels (as in Y_digit);
# one-hot-encoded targets would need categorical_crossentropy instead.
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), batch_size=10000, epochs=10)
# Sanity check: feed the model three characters and show its top guess.
sample_text = ['床', '前', '明']
print(sample_text)
sample_index = tokenizer.texts_to_sequences(sample_text)  # chars -> [[id], [id], [id]]
print(sample_index)
probs = model.predict(np.array(sample_index).reshape(1, 3))[0]
best = probs.argmax()
print(tokenizer.index_word[best], probs[best])
# Fill in the '*' slots of a template poem: given characters are kept
# as-is, and each '*' is predicted from the previous three characters.
poem_incomplete = 'bbb我****超****可****爱****'
poem_index = []
poem_text = ''

for ch in poem_incomplete:
    if ch == '*':
        # Predict the next character from the ids of the last three.
        context = poem_index[-3:]
        probs = model.predict(np.expand_dims(context, axis=0))[0]
        idx = probs.argmax()
        ch = tokenizer.index_word[idx]
    else:
        # A given character: just look up its id.
        idx = tokenizer.word_index[ch]
    poem_index.append(idx)
    poem_text += ch

# Drop the 'bbb' sentinel, then print the four five-character lines.
poem_text = poem_text[3:]
for start in range(0, 20, 5):
    print(poem_text[start:start + 5])
从草稿库里发现了这篇文章???不知道啥时候写的了,好像还写完了【挠头】现发布了再说吧