邱锡鹏, Neural Networks and Deep Learning, code exercise chap6_RNN: generating Tang poems with a recurrent neural network
https://github.com/nndl/exercise/
chap6_RNN
Environment: PyTorch 1.11, with a GPU for acceleration.
Approach: preprocess the dataset, represent every character (word) by an integer id, feed the id sequences into an RNN for training, and then use the trained model to generate Tang poems. A toy sketch of the id mapping follows.
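Before diving into the code, here is a minimal standalone sketch of that character-to-id idea. The toy string and the names (`toy_poem`, `char_to_id`) are purely illustrative and are not taken from the exercise code:

```python
# Toy illustration (not part of the exercise code): map each character of a
# poem to an integer id, with more frequent characters getting smaller ids.
import collections

toy_poem = "春眠不觉晓,处处闻啼鸟。"                  # hypothetical input line
counter = collections.Counter(toy_poem)               # character -> frequency
chars = [c for c, _ in sorted(counter.items(), key=lambda x: -x[1])]
char_to_id = {c: i for i, c in enumerate(chars)}      # frequency-ranked ids
ids = [char_to_id[c] for c in toy_poem]               # the poem as an id sequence
print(char_to_id)
print(ids)
```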
1. First, let's look at the main script.

```python
'''
Description:
Author: WangXiaoyo
version: tf15
Date: 2023-05-29 15:40:03
LastEditors: WangXiaoyo
LastEditTime: 2023-05-29 21:36:08
'''
import numpy as np
import collections
import torch
from torch.autograd import Variable
import torch.optim as optim
import rnn_lstm
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")  # a GPU really speeds this up!
start_token = 'G'
end_token = 'E'
batch_size = 64
def process_poems1(file_name):
    """
    :param file_name:
    :return: poems_vector has two dimensions: the first indexes the poem, the second the word index
             e.g. [[1,2,3,4,5,6,7,8,9,10],[9,6,3,8,5,2,7,4,1]]
    """
    poems = []
    with open(file_name, "r", encoding='utf-8', ) as f:
        for line in f.readlines():
            try:
                title, content = line.strip().split(':')  # split the line into title & content
                # content = content.replace(' ', '').replace(',','').replace('。','')
                content = content.replace(' ', '')  # strip spaces
                # skip malformed "poems" that contain any of the characters below
                if '_' in content or '(' in content or '（' in content or '《' in content or '[' in content or \
                        start_token in content or end_token in content:
                    continue
                if len(content) < 5 or len(content) > 80:
                    continue
                # wrap with start/end tokens: ['G寒随穷律变,春逐鸟声开。初风飘带柳,晚雪间花梅。碧林青旧竹,绿沼翠新苔。芝田初雁去,绮树巧莺来。E']
                content = start_token + content + end_token
                poems.append(content)
            except ValueError as e:
                # print("error")
                pass
    # sort the poems by length (ascending by default)
    poems = sorted(poems, key=lambda line: len(line))
    # print(poems)
    # count how often each character occurs
    all_words = []
    for poem in poems:
        all_words += [word for word in poem]
    # the loop above simply splits every poem into single characters:
    # ['G', '寒', '随', '穷', '律', '变', ',', '春', '逐', '鸟', '声', '开', '。', '初', '风', '飘', '带', '柳', ',', '晚', '雪', '间', '花', '梅', '。', '碧', '林', '青', '旧', '竹', ',', '绿', '沼', '翠', '新', '苔', '。', '芝', '田', '初', '雁', '去', ',', '绮', '树', '巧', '莺', '来', '。', 'E']
    counter = collections.Counter(all_words)  # count characters and their frequencies
    count_pairs = sorted(counter.items(), key=lambda x: -x[1])  # sort by frequency, descending
    # [(',', 4), ('。', 4), ('初', 2), ('G', 1), ('寒', 1), ('随', 1), ('穷', 1), ('律', 1), ('变', 1), ('春', 1), ('逐', 1), ('鸟', 1), ('声', 1), ('开', 1), ('风', 1), ('飘', 1), ('带', 1), ('柳', 1), ('晚', 1), ('雪', 1), ('间', 1), ('花', 1), ('梅', 1), ('碧', 1), ('林', 1), ('青', 1), ('旧', 1), ('竹', 1), ('绿', 1), ('沼', 1), ('翠', 1), ('新', 1), ('苔', 1), ('芝', 1), ('田', 1), ('雁', 1), ('去', 1), ('绮', 1), ('树', 1), ('巧', 1), ('莺', 1), ('来', 1), ('E', 1)]
    words, _ = zip(*count_pairs)  # unzip into two tuples: the characters and their counts
    # (',', '。', '初', 'G', '寒', '随', '穷', '律', '变', '春', '逐', '鸟', '声', '开', '风', '飘', '带', '柳', '晚', '雪', '间', '花', '梅', '碧', '林', '青', '旧', '竹', '绿', '沼', '翠', '新', '苔', '芝', '田', '雁', '去', '绮', '树', '巧', '莺', '来', 'E')
    # (4, 4, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1)
    words = words[:len(words)] + (' ',)  # append a blank ' ' token to the end of the vocabulary
    # (',', ... '来', 'E', ' ')
    word_int_map = dict(zip(words, range(len(words))))  # dictionary mapping each character to a unique id
    # {',': 0, '。': 1, '初': 2, 'G': 3, '寒': 4, '随': 5, '穷': 6, '律': 7, '变': 8, '春': 9, '逐': 10, '鸟': 11, '声': 12, '开': 13, '风': 14, '飘': 15, '带': 16, '柳': 17, '晚': 18, '雪': 19, '间': 20, '花': 21, '梅': 22, '碧': 23, '林': 24, '青': 25, '旧': 26, '竹': 27, '绿': 28, '沼': 29, '翠': 30, '新': 31, '苔': 32, '芝': 33, '田': 34, '雁': 35, '去': 36, '绮': 37, '树': 38, '巧': 39, '莺': 40, '来': 41, 'E': 42, ' ': 43}
    # ids are assigned by frequency: common characters get small ids, rare ones larger ids
    # (loosely reminiscent of Huffman coding)
    poems_vector = [list(map(word_int_map.get, poem)) for poem in poems]  # represent each poem as a list of ids
    # [[3, 4, 5, 6, 7, 8, 0, 9, 10, 11, 12, 13, 1, 2, 14, 15, 16, 17, 0, 18, 19, 20, 21, 22, 1, 23, 24, 25, 26, 27, 0, 28, 29, 30, 31, 32, 1, 33, 34, 2, 35, 36, 0, 37, 38, 39, 40, 41, 1, 42]]
    return poems_vector, word_int_map, words
def generate_batch(batch_size, poems_vec, word_to_int):
    n_chunk = len(poems_vec) // batch_size  # number of batches
    x_batches = []
    y_batches = []
    for i in range(n_chunk):
        start_index = i * batch_size
        end_index = start_index + batch_size
        x_data = poems_vec[start_index:end_index]
        y_data = []
        for row in x_data:
            y = row[1:]
            y.append(row[-1])
            y_data.append(y)
        """
        Example: the first element of x is 6 and the first element of y is 2,
        i.e. given 6 the model should predict the next 'character' 2.
        y is x shifted left by one position, with the last element repeated
        (so the last two targets are identical: 9,9 and 5,5 below).
        That is exactly what the code above builds.
        x_data             y_data
        [6,2,4,6,9]        [2,4,6,9,9]
        [1,4,2,8,5]        [4,2,8,5,5]
        """
        x_batches.append(x_data)
        y_batches.append(y_data)
    return x_batches, y_batches
def run_training():
    # preprocess the dataset
    poems_vector, word_to_int, vocabularies = process_poems1('./poems.txt')
    # generate batches
    print("finish loading data")
    BATCH_SIZE = 100

    torch.manual_seed(5)  # fix the random seed so each run gives the same result
    # build the word embedding: vocab_length words (the +1 is presumably because a ' ' token
    # was added during preprocessing -- my guess), each with a 100-dimensional embedding
    word_embedding = rnn_lstm.word_embedding(vocab_length=len(word_to_int) + 1, embedding_dim=100)
    # initialize the model and fix all its hyperparameters
    rnn_model = rnn_lstm.RNN_model(batch_sz=BATCH_SIZE, vocab_len=len(word_to_int) + 1, word_embedding=word_embedding,
                                   embedding_dim=100, lstm_hidden_dim=128)
    rnn_model.to(device)  # move the model to the GPU
    # optimizer = optim.Adam(rnn_model.parameters(), lr=0.001)
    # optimizer: RMSprop with learning rate 0.01
    optimizer = optim.RMSprop(rnn_model.parameters(), lr=0.01)

    loss_fun = torch.nn.NLLLoss()  # negative log-likelihood loss; takes log-probabilities and labels
    # loss_fun = torch.nn.CrossEntropyLoss()  # ==> equivalent to softmax + log + nllloss
    # rnn_model.load_state_dict(torch.load('./poem_generator_rnn'))  # if you have already trained your model you can load it by this line.

    # train for 100 epochs
    for epoch in range(100):
        batches_inputs, batches_outputs = generate_batch(BATCH_SIZE, poems_vector, word_to_int)  # generate batches
        n_chunk = len(batches_inputs)  # number of batches = dataset size // batch_size
        for batch in range(n_chunk):
            batch_x = batches_inputs[batch]
            batch_y = batches_outputs[batch]  # (batch, time_step)
            loss = 0
            for index in range(BATCH_SIZE):
                x = np.array(batch_x[index], dtype=np.int64)
                y = np.array(batch_y[index], dtype=np.int64)
                x = Variable(torch.from_numpy(np.expand_dims(x, axis=1)))  # wrap as a tensor of shape (seq_len, 1)
                y = Variable(torch.from_numpy(y))
                x, y = x.to(device), y.to(device)
                pre = rnn_model(x)  # forward pass: predicted log-probabilities
                loss += loss_fun(pre, y)  # compare with the true labels
                if index == 0:
                    _, pre = torch.max(pre, dim=1)  # take the most probable character at each position
                    print('prediction', pre.data.tolist())  # predicted ids
                    print('b_y ', y.data.tolist())  # target ids, i.e. the real poem, also as ids
                    print('*' * 30)
            loss = loss / BATCH_SIZE  # average the loss over the batch
            print("epoch ", epoch, 'batch number', batch, "loss is: ", loss.data.tolist())
            optimizer.zero_grad()  # reset the gradients
            loss.backward()  # backpropagation
            # rescale all gradients by a clip coefficient (< 1) whenever their total norm exceeds 1,
            # to mitigate exploding gradients
            torch.nn.utils.clip_grad_norm_(rnn_model.parameters(), 1)
            optimizer.step()  # one gradient-descent parameter update
        if epoch % 5 == 0:
            torch.save(rnn_model.state_dict(), './poem_generator_rnn')  # save the model every five epochs
            print("finish save model")
def to_word(predict, vocabs):
    # convert a prediction back into a character (only needed when printing poems;
    # training works entirely with numeric ids, so it is never used there)
    sample = np.argmax(predict)
    # print(sample)
    if sample >= len(vocabs):
        sample = len(vocabs) - 1
    # print(vocabs[sample])
    return vocabs[sample]


def pretty_print_poem(poem):  # make the printed poem look tidier
    shige = []
    for w in poem:
        if w == start_token or w == end_token:
            break
        shige.append(w)
    # print(shige)
    # poem_sentences = poem.split('。')
    print("".join(shige))
    # for s in poem_sentences:
    #     if s != '' and len(s) > 10:
    #         print(s + '。')
# use the trained model to generate poems, which is also how we check the training result
def gen_poem(begin_word):
    # poems_vector, word_int_map, vocabularies = process_poems2('./tangshi.txt')  # use the other dataset to train the network
    poems_vector, word_int_map, vocabularies = process_poems1('./poems.txt')
    word_embedding = rnn_lstm.word_embedding(vocab_length=len(word_int_map) + 1, embedding_dim=100)
    rnn_model = rnn_lstm.RNN_model(batch_sz=64, vocab_len=len(word_int_map) + 1, word_embedding=word_embedding,
                                   embedding_dim=100, lstm_hidden_dim=128)
    rnn_model.load_state_dict(torch.load('./poem_generator_rnn'))  # load the previously saved model parameters
    # the first character of the poem is given
    rnn_model.to(device)  # GPU again
    poem = begin_word
    word = begin_word
    while word != end_token:
        input = np.array([word_int_map[w] for w in poem], dtype=np.int64)
        input = Variable(torch.from_numpy(input)).to(device)  # wrap as a tensor
        output = rnn_model(input, is_test=True)  # run the model; the output is still numeric (log-probabilities over ids)
        word = to_word(output.data.tolist()[-1], vocabularies)  # convert it back to a character
        poem += word  # the poem is generated one character at a time
        # cap the poem at roughly 50 characters
        if len(poem) > 50:
            return poem
    return poem


run_training()  # comment this line out if you are not training; training the network takes a long time.

pretty_print_poem(gen_poem("日"))
pretty_print_poem(gen_poem("红"))
pretty_print_poem(gen_poem("山"))
pretty_print_poem(gen_poem("夜"))
pretty_print_poem(gen_poem("湖"))
pretty_print_poem(gen_poem("君"))
pretty_print_poem(gen_poem("星"))
```
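The training loop above pairs the model's LogSoftmax output with NLLLoss, and the commented-out line claims this combination is equivalent to CrossEntropyLoss (softmax + log + nllloss). A minimal sketch to check that claim; the tensors below are made up purely for illustration:

```python
# Quick check (illustrative only): LogSoftmax + NLLLoss matches CrossEntropyLoss on raw scores.
import torch
import torch.nn as nn

logits = torch.randn(4, 10)           # (batch, vocab_size), arbitrary scores
targets = torch.tensor([1, 3, 5, 7])  # target class ids

nll = nn.NLLLoss()(nn.LogSoftmax(dim=1)(logits), targets)
ce = nn.CrossEntropyLoss()(logits, targets)
print(torch.allclose(nll, ce))        # True
```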
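Likewise, the comment on the gradient-clipping call says the gradients are rescaled by a coefficient smaller than 1 whenever their norm is too large. A tiny standalone demonstration of that behaviour; the parameter and gradient values are invented for the example:

```python
# Illustrative only: clip_grad_norm_ rescales gradients in place when their total norm exceeds max_norm.
import torch

p = torch.nn.Parameter(torch.zeros(2))
p.grad = torch.tensor([3.0, 4.0])                   # gradient norm = 5
torch.nn.utils.clip_grad_norm_([p], max_norm=1.0)   # scales by roughly 1/5
print(p.grad)                                        # approximately tensor([0.6000, 0.8000])
```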
2. Now let's look at the model (the rnn_lstm module).

```python
'''
Description:
Author: WangXing
version: tf15
Date: 2023-05-29 15:42:03
LastEditors: WangXing
LastEditTime: 2023-05-29 21:26:08
'''
import torch.nn as nn
import torch
from torch.autograd import Variable
import torch.nn.functional as F
import numpy as np
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# initialize the weights of the fully connected (Linear) layers
# (my understanding; I did not study this part very closely...)
def weights_init(m):
    classname = m.__class__.__name__  # obtain the class name
    if classname.find('Linear') != -1:
        weight_shape = list(m.weight.data.size())
        fan_in = weight_shape[1]
        fan_out = weight_shape[0]
        w_bound = np.sqrt(6. / (fan_in + fan_out))  # Xavier/Glorot-style uniform bound
        m.weight.data.uniform_(-w_bound, w_bound)
        m.bias.data.fill_(0)
        print("initial linear weight ")
class word_embedding(nn.Module):
    def __init__(self, vocab_length, embedding_dim):
        super(word_embedding, self).__init__()
        w_embeding_random_intial = np.random.uniform(-1, 1, size=(vocab_length, embedding_dim))  # random uniform initial values for the embedding matrix
        self.word_embedding = nn.Embedding(vocab_length, embedding_dim)
        self.word_embedding.weight.data.copy_(torch.from_numpy(w_embeding_random_intial))  # copy the initial values into the embedding weights (pre-trained vectors could be loaded the same way)

    def forward(self, input_sentence):
        """
        :param input_sentence: a tensor containing several word indices.
        :return: a tensor containing the corresponding word embeddings
        """
        sen_embed = self.word_embedding(input_sentence)  # look up the embedding of every id in input_sentence
        return sen_embed
class RNN_model(nn.Module):
    def __init__(self, batch_sz, vocab_len, word_embedding, embedding_dim, lstm_hidden_dim):
        super(RNN_model, self).__init__()
        self.word_embedding_lookup = word_embedding
        self.batch_size = batch_sz
        self.vocab_length = vocab_len
        self.word_embedding_dim = embedding_dim
        self.lstm_dim = lstm_hidden_dim
        # the LSTM itself: two layers, with input_size and hidden_size fixed as above
        self.rnn_lstm = nn.LSTM(input_size=self.word_embedding_dim, hidden_size=self.lstm_dim, num_layers=2, batch_first=True)
        #########################################
        # here you need to define the "self.rnn_lstm": the input size is "embedding_dim" and the output size is "lstm_hidden_dim";
        # the lstm should have two layers, and the input and output tensors are provided as (batch, seq, feature)
        # ???
        # for name, param in self.rnn.named_parameters():
        #     nn.init.uniform_(param, -0.1, 0.1)
        ##########################################
        # fully connected layer: input size is the LSTM output size (lstm_hidden_dim), output size is vocab_len
        # (note: the third positional argument of nn.Linear is the boolean bias flag, not a size,
        # so word_embedding_dim does not belong there)
        self.fc = nn.Linear(lstm_hidden_dim, vocab_len)
        self.apply(weights_init)  # call the weight-initialization function above; this initializes the fc layer
        self.softmax = nn.LogSoftmax(dim=1)  # the activation function: log-softmax over the vocabulary dimension
        self.tanh = nn.Tanh()
    def forward(self, sentence, is_test=False):
        batch_input = self.word_embedding_lookup(sentence).view(1, -1, self.word_embedding_dim)  # look up embeddings and reshape to (1, seq_len, embedding_dim)
        # print(batch_input.size())  # print the size of the input
        ################################################
        # here you need to put the "batch_input" into the self.rnn_lstm which is defined before.
        # the hidden output should be named "output".
        # ???
        # initialize h and c to zeros and run the LSTM
        output, _ = self.rnn_lstm(batch_input, (torch.zeros(2, 1, self.lstm_dim).to(device), torch.zeros(2, 1, self.lstm_dim).to(device)))
        ################################################
        out = output.contiguous().view(-1, self.lstm_dim)  # contiguous() lays the tensor out in memory so view() can flatten it to (seq_len, lstm_dim)
        out = F.relu(self.fc(out))
        # out = self.tanh(self.fc(out))
        out = self.softmax(out)
        # out = self.softmax(out, dim=1)
        # out = nn.softmax(dim=1)
        if is_test:
            prediction = out[-1, :].view(1, -1)  # at test time, keep only the log-probabilities of the last position (the next character)
            output = prediction
        else:
            output = out  # during training, return the log-probabilities for every position
        return output
```
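To make the tensor shapes in forward concrete, here is a standalone sketch of my own (it mirrors the model's dimensions of 100-dimensional embeddings, 128 hidden units and two layers; seq_len is arbitrary): the whole poem is fed as a single batch element, and the initial hidden and cell states have shape (num_layers, batch, hidden_size).

```python
# Shape walkthrough (illustrative, not part of the model file).
import torch
import torch.nn as nn

embedding_dim, hidden_dim, seq_len = 100, 128, 50
lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=2, batch_first=True)

x = torch.randn(1, seq_len, embedding_dim)      # same as .view(1, -1, embedding_dim) in forward
h0 = torch.zeros(2, 1, hidden_dim)              # (num_layers, batch, hidden_size)
c0 = torch.zeros(2, 1, hidden_dim)
output, _ = lstm(x, (h0, c0))
print(output.shape)                              # torch.Size([1, 50, 128])
out = output.contiguous().view(-1, hidden_dim)   # flattened to (seq_len, hidden_dim) before the fc layer
print(out.shape)                                 # torch.Size([50, 128])
```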
Well, the assignment is finally more or less done... My understanding of the code still has plenty of room to grow; thanks to the experts around me.
That is my reading of the code. If you spot any mistakes, please do point them out!