# Data preprocessing for a poem corpus.
import collections
import numpy as np
start_token = 'B'  # marker prepended to every poem (begin)
end_token = 'E'  # marker appended to every poem (end)
# Corpus (per the original author): 34646 poems, 1721655 characters (6110 unique).
def process_poems(file_name, start_token='B', end_token='E'):
    """Read a poem corpus file and turn it into integer vectors.

    Each line of the file is expected to look like ``title:content``
    (fullwidth colon). Poems that contain special symbols, or whose content
    is shorter than 5 or longer than 79 characters, are discarded. Every
    kept poem is wrapped as ``start_token + content + end_token``.

    Args:
        file_name: path to the UTF-8 corpus file.
        start_token: begin marker (default matches the module constant 'B').
        end_token: end marker (default matches the module constant 'E').

    Returns:
        poems_vector: list of lists of ints — one index sequence per poem
            (ragged: poems have different lengths).
        word_int_map: dict mapping each character to its index; more
            frequent characters get smaller indices.
        words: tuple of characters ordered by descending frequency, with a
            trailing space appended as the padding token.
    """
    poems = []
    with open(file_name, "r", encoding='utf-8') as f:
        for line in f:  # iterate lazily instead of f.readlines()
            try:
                # Lines without exactly one fullwidth colon raise
                # ValueError on unpacking and are skipped.
                title, content = line.strip().split(':')
            except ValueError:
                continue
            content = content.replace(' ', '')  # drop all spaces
            # Discard poems containing special symbols, or the begin/end
            # markers themselves (they would corrupt the wrapping below).
            if any(mark in content
                   for mark in ('_', '(', '(', '《', '[', start_token, end_token)):
                continue
            # Discard abnormally short (<5 chars) or long (>79 chars) poems.
            if len(content) < 5 or len(content) > 79:
                continue
            poems.append(start_token + content + end_token)
    # Flatten every poem into one sequence of characters.
    all_words = [word for poem in poems for word in poem]
    # Count character frequencies and sort by count, descending.
    counter = collections.Counter(all_words)
    count_pairs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    words, _ = zip(*count_pairs)
    # Append a space as the padding token (it was stripped from all poems
    # above, so it cannot collide with a real character).
    words = words + (' ',)
    # Rank map: frequent characters get small ids; padding gets the last id.
    word_int_map = dict(zip(words, range(len(words))))
    # Bug fix: unknown characters used to fall back to len(words), which is
    # out of range of `words`; fall back to the padding index instead.
    pad_index = len(words) - 1
    poems_vector = [[word_int_map.get(word, pad_index) for word in poem]
                    for poem in poems]
    return poems_vector, word_int_map, words
def generate_batch(batch_size, poems_vec, word_to_int):
    # Split the poem vectors into fixed-size batches. With batch_size=64
    # and 34646 poems, integer division gives 34646 // 64 = 541 chunks;
    # leftover poems beyond the last full chunk are dropped.
    n_chunk = len(poems_vec) // batch_size
    x_batches = []
    y_batches = []
    for i in range(n_chunk):
        start_index = i * batch_size
        end_index = start_index + batch_size
        # One chunk: a ragged 2-D list holding exactly batch_size poems.
        batches = poems_vec[start_index:end_index]
        # Width of this chunk = length of its longest poem; every row is
        # padded out to this width.
        length = max(map(len, batches))
        # x_data: (batch_size, length) int32 matrix pre-filled with the
        # index of the padding token ' '.
        x_data = np.full((batch_size, length), word_to_int[' '], np.int32)
        # NOTE(review): the function is truncated here in this view — the
        # code that copies each poem into x_data, builds y_data, and
        # returns x_batches/y_batches is not visible.
# (Scraped-page residue, kept as a comment so the file parses:)
# "LSTM — implementing a poem-writing bot"; latest recommended article published 2024-09-19 19:05:43.