循环神经网络(RNN)
问题描述:
利用循环神经网络,实现唐诗生成任务
数据集:
唐诗
题目要求:
补全程序,主要是前面的3个空和生成诗歌的一段代码;PyTorch 版本需要补全对应的 rnn.py 文件中的两处代码。生成诗歌的开头词汇是:“日、红、山、夜、湖、海、月”。
import numpy as np
import collections
import torch
from torch.autograd import Variable
import torch.optim as optim
import rnn as rnn_lstm
# Sentinel character prepended to every poem to mark where it begins.
# Poems whose text already contains this character are discarded by the loader.
start_token = 'G'
# Sentinel character appended to every poem to mark where it ends.
# Poems whose text already contains this character are discarded by the loader.
end_token = 'E'
# Mini-batch size.
# NOTE(review): not referenced in the visible part of the file; presumably
# consumed by the batching/training code - confirm.
batch_size = 64
def process_poems1(file_name):
    """Load a ``title:content`` poem file and encode each poem as word indices.

    Every line of the file is expected to hold exactly one ``:`` separating
    the title from the poem text. Spaces are removed from the text, and a
    poem is dropped when it contains one of the rejected marker characters
    (including the start/end tokens themselves) or when its length is
    outside the 5..80 character range. Each kept poem is wrapped as
    ``start_token + content + end_token``.

    :param file_name: path of the UTF-8 encoded poem file.
    :return: a ``(poems_vector, word_int_map, words)`` tuple where

        * ``poems_vector`` is a list of poems, shortest first, each poem being
          the list of integer ids of its characters,
          e.g. ``[[1, 2, 3, 4, 5], [9, 6, 3, 8, 5, 2]]``;
        * ``word_int_map`` maps every character to its integer id;
        * ``words`` is the tuple of characters ordered by descending
          frequency, followed by a trailing ``' '`` entry.

        When no poem survives the filtering the result is
        ``([], {' ': 0}, (' ',))`` instead of an unpacking error.
    """
    # NOTE(review): the original condition tested '(' twice; one of the two
    # was presumably meant to be the full-width '(' - confirm against the data.
    rejected_markers = ('_', '(', '《', '[', start_token, end_token)

    poems = []
    with open(file_name, "r", encoding='utf-8') as f:
        for line in f:
            # Only the two-way unpacking can raise ValueError (zero or several
            # ':' in the line), so the try block is limited to that statement.
            try:
                _title, content = line.strip().split(':')
            except ValueError:
                print("error")
                continue
            content = content.replace(' ', '')
            if any(marker in content for marker in rejected_markers):
                continue
            if len(content) < 5 or len(content) > 80:
                continue
            poems.append(start_token + content + end_token)

    # Order poems by character count (stable, so file order breaks ties).
    poems.sort(key=len)

    # Count every character; most_common() lists them by descending count and
    # keeps first-seen order among equal counts, like the original stable sort.
    counter = collections.Counter(char for poem in poems for char in poem)
    # Building the tuple directly (rather than unpacking zip(*pairs)) keeps
    # this working when there are no poems at all.
    words = tuple(word for word, _count in counter.most_common()) + (' ',)
    word_int_map = {word: index for index, word in enumerate(words)}
    poems_vector = [[word_int_map[char] for char in poem] for poem in poems]
    return poems_vector, word_int_map, words
def process_poems2(file_name):
"""
:param file_name:
:return: poems_vector have tow dimmention ,first is the poem, the second is the word_index
e.g. [[1,2,3,4,5,6,7,8,9,10],[9,6,3,8,5,2,7,4,1]]
"""
poems = []
with open(file_name, "r", encoding='utf-8', ) as f:
# content = ''
for line in f.readlines():
try:
line = line.strip()
if line:
content = line.replace(' '' ', '').replace(