# Data preprocessing for a poem corpus.
import collections
import numpy as np
start_token = 'B'  # marker prepended to every poem (begin)
end_token = 'E'  # marker appended to every poem (end)
# Corpus (per the original author): 34646 poems, 1721655 characters (6110 unique).
def process_poems(file_name, start_token='B', end_token='E'):
    """Read a poem corpus file and turn it into integer vectors.

    Each line of the file is expected to look like ``title:content``
    (fullwidth colon). Poems that contain special symbols, or whose content
    is shorter than 5 or longer than 79 characters, are discarded. Every
    kept poem is wrapped as ``start_token + content + end_token``.

    Args:
        file_name: path to the UTF-8 corpus file.
        start_token: begin marker (default matches the module constant 'B').
        end_token: end marker (default matches the module constant 'E').

    Returns:
        poems_vector: list of lists of ints — one index sequence per poem
            (ragged: poems have different lengths).
        word_int_map: dict mapping each character to its index; more
            frequent characters get smaller indices.
        words: tuple of characters ordered by descending frequency, with a
            trailing space appended as the padding token.
    """
    poems = []
    with open(file_name, "r", encoding='utf-8') as f:
        for line in f:  # iterate lazily instead of f.readlines()
            try:
                # Lines without exactly one fullwidth colon raise
                # ValueError on unpacking and are skipped.
                title, content = line.strip().split(':')
            except ValueError:
                continue
            content = content.replace(' ', '')  # drop all spaces
            # Discard poems containing special symbols, or the begin/end
            # markers themselves (they would corrupt the wrapping below).
            if any(mark in content
                   for mark in ('_', '(', '(', '《', '[', start_token, end_token)):
                continue
            # Discard abnormally short (<5 chars) or long (>79 chars) poems.
            if len(content) < 5 or len(content) > 79:
                continue
            poems.append(start_token + content + end_token)
    # Flatten every poem into one sequence of characters.
    all_words = [word for poem in poems for word in poem]
    # Count character frequencies and sort by count, descending.
    counter = collections.Counter(all_words)
    count_pairs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    words, _ = zip(*count_pairs)
    # Append a space as the padding token (it was stripped from all poems
    # above, so it cannot collide with a real character).
    words = words + (' ',)
    # Rank map: frequent characters get small ids; padding gets the last id.
    word_int_map = dict(zip(words, range(len(words))))
    # Bug fix: unknown characters used to fall back to len(words), which is
    # out of range of `words`; fall back to the padding index instead.
    pad_index = len(words) - 1
    poems_vector = [[word_int_map.get(word, pad_index) for word in poem]
                    for poem in poems]
    return poems_vector, word_int_map, words
def generate_batch(batch_size, poems_vec, word_to_int):
    # Split the poem vectors into fixed-size batches. With batch_size=64
    # and 34646 poems, integer division gives 34646 // 64 = 541 chunks;
    # leftover poems beyond the last full chunk are dropped.
    n_chunk = len(poems_vec) // batch_size
    x_batches = []
    y_batches = []
    for i in range(n_chunk):
        start_index = i * batch_size
        end_index = start_index + batch_size
        # One chunk: a ragged 2-D list holding exactly batch_size poems.
        batches = poems_vec[start_index:end_index]
        # Width of this chunk = length of its longest poem; every row is
        # padded out to this width.
        length = max(map(len, batches))
        # x_data: (batch_size, length) int32 matrix pre-filled with the
        # index of the padding token ' '.
        x_data = np.full((batch_size, length), word_to_int[' '], np.int32)
        # NOTE(review): the function is truncated here in this view — the
        # code that copies each poem into x_data, builds y_data, and
        # returns x_batches/y_batches is not visible.
# (Scraped-page residue, kept as a comment so the file parses:)
# "LSTM — implementing a poem-writing bot"; latest recommended article published 2024-09-19 19:05:43.