1 Project structure
The project is organized as follows:
└── Project
    ├── Data
    │   └── jaychou_lyrics.txt.zip
    ├── Test
    │   ├── __init__.py
    │   └── function.py
    └── main.py
2 Implementation
2.1 Data loading
The dataset is a corpus of Jay Chou lyrics. The loading function takes two parameters:
1) the selection range over the dataset, where None (the default) selects everything;
2) the path to the dataset.
The dataset has been uploaded to: https://gitee.com/inkiinki/data20201205/blob/master/Data20201205/jaychou_lyrics.txt.zip
For the sampling schemes (random and consecutive), see: https://blog.csdn.net/weixin_44575152/article/details/112753980
def load_jaychou_lyrics(tr_range=None, path="../Data/jaychou_lyrics.txt.zip"):
    """
    :param tr_range: selection range over the dataset
    :param path: path to the dataset
    """
    with zipfile.ZipFile(path) as zin:
        with zin.open('jaychou_lyrics.txt') as f:
            ori_data = f.read().decode("utf-8")
    ori_data = ori_data.replace("\n", " ").replace("\r", " ")
    # Restrict the raw data to the requested range
    if tr_range is None:
        tr_range = (0, len(ori_data))
    ori_data = ori_data[tr_range[0]: tr_range[1]]
    # List of distinct characters
    idx2char_list = list(set(ori_data))
    # Character-to-index dictionary
    char2idx_dict = dict([(char, i) for i, char in enumerate(idx2char_list)])
    # Vocabulary size, i.e. the number of distinct characters
    dict_size = len(char2idx_dict)
    # The corpus as a list of character indices
    char2idx_list = [char2idx_dict[char] for char in ori_data]
    return idx2char_list, char2idx_dict, dict_size, char2idx_list
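A minimal usage sketch (this assumes the zip file sits under ../Data/ as in the project tree above):
idx2char_list, char2idx_dict, dict_size, char2idx_list = load_jaychou_lyrics(tr_range=(0, 10000))
print(dict_size)                                    # number of distinct characters
print(char2idx_list[:5])                            # the first five characters, encoded as indices
print("".join(idx2char_list[i] for i in char2idx_list[:5]))  # decode them back to text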
2.2 One-hot vectors
To feed characters into the network as vectors, a simple approach is the one-hot vector:
1) suppose the number of distinct characters in the vocabulary is $N$, i.e. vocab_size;
2) each character corresponds to an integer index in $[0, N-1]$;
3) if a character's index is the integer $i$, create an $N$-dimensional vector of all zeros and set position $i$ to $1$; the remaining entries stay zero. This vector is the character's one-hot vector:
def to_one_hot(X, n_class, device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    """
    :param X: data, shape --> (batch_size, num_steps)
    :param n_class: number of distinct characters
    :param device
    """
    return [one_hot(X[:, i], n_class, device=device) for i in range(X.shape[1])]


def one_hot(x, n_class, dtype=torch.float32, device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    """
    :param x: x.shape --> (d), where d is the vector dimension
    :param n_class: number of distinct characters
    :param dtype
    :param device
    :return: ret.shape --> (d, n_class)
    """
    x = x.long()
    res = torch.zeros(x.shape[0], n_class, dtype=dtype, device=device)
    # scatter_(dim, index, src): write src into the target tensor (res here) along
    # dimension dim at the positions given by index
    res.scatter_(1, x.view(-1, 1), 1)
    return res
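A quick check of one_hot on a toy input (the values are purely illustrative):
x = torch.tensor([0, 2])
print(one_hot(x, n_class=4, device=torch.device("cpu")))
# tensor([[1., 0., 0., 0.],
#         [0., 0., 1., 0.]])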
2.3 Gradient clipping
Recurrent neural networks are prone to vanishing and exploding gradients, so the gradients need to be clipped. Suppose the elements of the gradients of all model parameters are concatenated into a single vector $\boldsymbol{g}$, and let the clipping threshold be $\theta$. The clipped gradient is

$$\min\left(\frac{\theta}{\|\boldsymbol{g}\|},\ 1\right)\boldsymbol{g},$$

whose $L_2$ norm never exceeds $\theta$:
def grad_clipping(params, theta, device):
    # Global L2 norm over all parameter gradients
    norm = torch.tensor([0.], device=device)
    for param in params:
        norm += (param.grad.data ** 2).sum()
    norm = norm.sqrt().item()
    # Rescale so the global norm does not exceed theta
    if norm > theta:
        for param in params:
            param.grad.data *= (theta / norm)
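A quick sanity check, with a toy gradient chosen purely for illustration:
p = torch.nn.Parameter(torch.ones(4))
p.grad = torch.full((4,), 0.5)                      # global norm: sqrt(4 * 0.25) = 1.0
grad_clipping([p], theta=0.1, device=torch.device("cpu"))
print(p.grad.norm())                                # tensor(0.1000): the norm is capped at theta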
2.4 Model construction
class RNNModel(nn.Module):

    def __init__(self, rnn_layer, vocab_size):
        super(RNNModel, self).__init__()
        self.rnn = rnn_layer
        # Doubled for bidirectional layers, whose forward and backward states are concatenated
        self.hidden_size = self.rnn.hidden_size * (2 if self.rnn.bidirectional else 1)
        self.vocab_size = vocab_size
        self.dense = nn.Linear(self.hidden_size, self.vocab_size)
        self.state = None

    def forward(self, X, state):
        # X: (batch_size, num_steps) --> list of num_steps tensors of shape (batch_size, vocab_size)
        X = to_one_hot(X, self.vocab_size)
        Y, self.state = self.rnn(torch.stack(X), state)
        # Flatten to (num_steps * batch_size, hidden_size) before the output layer
        Y = self.dense(Y.view(-1, Y.shape[-1]))
        return Y, self.state
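A shape check with toy sizes (all values hypothetical; get_rnn_layer is defined in Section 3.2 below):
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vocab_size, batch_size, num_steps = 10, 2, 5
model = RNNModel(get_rnn_layer(input_size=vocab_size, hidden_size=16), vocab_size).to(device)
X = torch.zeros(batch_size, num_steps, device=device)  # dummy index batch
Y, state = model(X, None)
print(Y.shape)  # torch.Size([10, 10]), i.e. (num_steps * batch_size, vocab_size)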
2.5 Model training
2.5.1 Testing with a randomly initialized network
def train(prefix, num_chars, model, idx2char, char2idx,
          device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    """Generate num_chars characters following the given prefix."""
    state = None
    output = [char2idx[prefix[0]]]
    for t in range(num_chars + len(prefix) - 1):
        # The most recent character is the input at the current time step
        X = torch.tensor([output[-1]], device=device).view(1, 1)
        if state is not None:
            if isinstance(state, tuple):  # LSTM, state: (h, c)
                state = (state[0].to(device), state[1].to(device))
            else:
                state = state.to(device)
        (Y, state) = model(X, state)  # the forward pass needs no explicit model parameters
        if t < len(prefix) - 1:
            output.append(char2idx[prefix[t + 1]])
        else:
            output.append(int(Y.argmax(dim=1).item()))
    return ''.join([idx2char[i] for i in output])


def test():
    idx2char_list, char2idx_dict, dict_size, _ = load_jaychou_lyrics(tr_range=(0, 10000))
    hidden_size = 256
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    rnn_layer = get_rnn_layer(input_size=dict_size, hidden_size=hidden_size)
    model = RNNModel(rnn_layer, dict_size).to(device)
    print(train("分开", 10, model, idx2char_list, char2idx_dict, device=device))


if __name__ == '__main__':
    test()
After a single random initialization of the network, an example prediction looks like this:
分开乌羞直羞直极能极能物
2.5.2 Model training and testing
def train_predict(model, data_idx, idx2char, char2idx, num_epoch, num_step,
                  lr, clipping_theta, batch_size, pred_period, pred_len, prefixes,
                  device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.to(device)
    state = None
    for epoch in range(num_epoch):
        # n starts at a tiny value to avoid division by zero
        l_sum, n, start = 0.0, 1e-5, time.time()
        data_iter = load_jaychou_lyrics_iter_consecutive(data_idx, batch_size, num_step, device)  # consecutive sampling
        for X, Y in data_iter:
            if state is not None:
                # Detach the hidden state from the computation graph so that gradients
                # only propagate through the current mini-batch
                # (otherwise back-propagation would grow ever more expensive)
                if isinstance(state, tuple):  # LSTM, state: (h, c)
                    state = (state[0].detach(), state[1].detach())
                else:
                    state = state.detach()
            (output, state) = model(X, state)  # output shape: (num_steps * batch_size, vocab_size)
            # Y has shape (batch_size, num_steps); transpose and flatten it into a vector of
            # length batch_size * num_steps so that it lines up with the rows of output
            y = torch.transpose(Y, 0, 1).contiguous().view(-1)
            l = loss(output, y.long())
            optimizer.zero_grad()
            l.backward()
            # Gradient clipping
            grad_clipping(model.parameters(), clipping_theta, device)
            optimizer.step()
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', train(prefix, pred_len, model, idx2char, char2idx))


def test1():
    num_epoch, batch_size, lr, clipping_theta, tr_range = 250, 32, 1e-3, 1e-2, (0, 10000)
    pred_period, pred_len, prefixes = 50, 50, ["分开", "不分开"]
    idx2char_list, char2idx_dict, dict_size, char2idx_list = load_jaychou_lyrics(tr_range=tr_range)
    hidden_size, num_step = 256, 25
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    rnn_layer = get_rnn_layer(input_size=dict_size, hidden_size=hidden_size)
    model = RNNModel(rnn_layer, dict_size).to(device)
    train_predict(model, char2idx_list, idx2char_list, char2idx_dict,
                  num_epoch, num_step, lr, clipping_theta, batch_size,
                  pred_period, pred_len, prefixes, device=device)


if __name__ == '__main__':
    test1()
The output is as follows (perplexity is the exponential of the average cross-entropy loss, so values close to 1 are better):
epoch 50, perplexity 3.530170, time 0.58 sec
- 分开 我不能再想 我不能再想 我不 我不 我不能再想 我不能再想 我不 我不 我不能再想 我不能再想 我
- 不分开 我有你这样 我不 这样 我不 我不 我不 我不 我不能再想 我不 我不 我不 我不 我不能再想 我
epoch 100, perplexity 1.103285, time 0.57 sec
- 分开 我不多难熬 没有你在我有多难熬多烦恼 没有你烦 我有多烦恼 没有你烦我有多烦恼多难熬 穿过
- 不分开 我有你这节奏 后 从不能活力 一颗风颗三颗四颗 连成线背著背默默许下心愿 看远方的星是否听的见 手
epoch 150, perplexity 1.039727, time 0.59 sec
- 分开 我不 这爱的 爸一你 手对一阵莫名感动 我想带你 回我的外婆家 一起看着日落 一直到我们都睡着 我
- 不分开不能不想 你的黑色幽默我想通 说穿了其实我的愿望就怎么小 就怎么每天祈祷我的心跳你知道 杵在伊斯坦
epoch 200, perplexity 1.024952, time 0.60 sec
- 分开 我不 爱情走的太快就像龙卷风 不能承受我已无处可躲 我不要再想 我不要再想 我不 我不 我不要再
- 不分开不能不能承受我已无处可躲 我不要再想 我不要再想 我不 我不 我不要再想你 爱情来的太快就像龙卷风
epoch 250, perplexity 1.018972, time 0.58 sec
- 分开 我不 这可的我爱如果说散 想一定人演云多 对我用铅笔写一个人 什么都一轻人慢慢温习 我爱还是
- 不分开不能不能承受我已无处可躲 我不要再想 我不要再想 我不 我不 我不要再想你 不知不觉 你已经离开我
3 Related functions
3.1 __init__.py
"""
@author: Inki
@email: inki.yinji@qq.com
@create: 2021 0602
@last modify: 2021 0602
"""
import math
import numpy as np
import time
import torch
import torch.nn.functional as F
import zipfile
from torch import nn, optim
from .function import (
    load_jaychou_lyrics, load_jaychou_lyrics_iter_consecutive,
    load_jaychou_lyrics_iter_random,
    grad_clipping, get_rnn_layer,
    RNNModel)

__all__ = [
    "math",
    "np",
    "time",
    "torch",
    "F",
    "zipfile",
    "nn",
    "optim",
    "load_jaychou_lyrics",
    "load_jaychou_lyrics_iter_consecutive",
    "load_jaychou_lyrics_iter_random",
    "grad_clipping",
    "get_rnn_layer",
    "RNNModel",
]
3.2 function.py
# coding: utf-8
"""
@author: Inki
@email: inki.yinji@qq.com
@create: 2021 0602
@last modify: 2021 0602
"""
from Test import *
def load_jaychou_lyrics(tr_range=None, path="../Data/jaychou_lyrics.txt.zip"):
    """
    :param tr_range: selection range over the dataset
    :param path: path to the dataset
    """
    with zipfile.ZipFile(path) as zin:
        with zin.open('jaychou_lyrics.txt') as f:
            ori_data = f.read().decode("utf-8")
    ori_data = ori_data.replace("\n", " ").replace("\r", " ")
    # Restrict the raw data to the requested range
    if tr_range is None:
        tr_range = (0, len(ori_data))
    ori_data = ori_data[tr_range[0]: tr_range[1]]
    # List of distinct characters
    idx2char_list = list(set(ori_data))
    # Character-to-index dictionary
    char2idx_dict = dict([(char, i) for i, char in enumerate(idx2char_list)])
    # Vocabulary size, i.e. the number of distinct characters
    dict_size = len(char2idx_dict)
    # The corpus as a list of character indices
    char2idx_list = [char2idx_dict[char] for char in ori_data]
    return idx2char_list, char2idx_dict, dict_size, char2idx_list
def load_jaychou_lyrics_iter_random(data_idx, batch_size=2, num_step=5,
                                    device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    """
    :param data_idx: index sequence of the data
    :param batch_size: batch size
    :param num_step: number of time steps per sample
    :param device: device
    """
    # Subtract 1 because the label sequence y is the input sequence x shifted right by one
    num_data = (len(data_idx) - 1) // num_step
    num_epoch = num_data // batch_size  # number of mini-batches per pass
    idx = np.random.permutation(num_data)

    def _data(pos):
        return data_idx[pos: pos + num_step]

    for i in range(num_epoch):
        j = i * batch_size
        batch_idx = idx[j: j + batch_size]
        X = [_data(k * num_step) for k in batch_idx]
        Y = [_data(k * num_step + 1) for k in batch_idx]
        yield (torch.tensor(X, dtype=torch.float32, device=device),
               torch.tensor(Y, dtype=torch.float32, device=device))
def load_jaychou_lyrics_iter_consecutive(data_idx, batch_size=2, num_step=5,
                                         device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    """
    :param data_idx: index sequence of the data
    :param batch_size: batch size
    :param num_step: number of time steps per sample
    :param device: device
    """
    data_idx = torch.tensor(data_idx, dtype=torch.float32, device=device)
    num_data = len(data_idx)
    num_batch = num_data // batch_size
    # Reshape the corpus into batch_size parallel streams
    idx = data_idx[0: batch_size * num_batch].view(batch_size, num_batch)
    num_epoch = (num_batch - 1) // num_step  # number of mini-batches per pass
    for i in range(num_epoch):
        j = i * num_step
        X = idx[:, j: j + num_step]
        Y = idx[:, j + 1: j + num_step + 1]
        yield X, Y
def to_one_hot(X, n_class, device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    """
    :param X: data, shape --> (batch_size, num_steps)
    :param n_class: number of distinct characters
    :param device
    """
    return [one_hot(X[:, i], n_class, device=device) for i in range(X.shape[1])]


def one_hot(x, n_class, dtype=torch.float32, device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    """
    :param x: x.shape --> (d), where d is the vector dimension
    :param n_class: number of distinct characters
    :param dtype
    :param device
    :return: ret.shape --> (d, n_class)
    """
    x = x.long()
    res = torch.zeros(x.shape[0], n_class, dtype=dtype, device=device)
    # scatter_(dim, index, src): write src into the target tensor (res here) along
    # dimension dim at the positions given by index
    res.scatter_(1, x.view(-1, 1), 1)
    return res
def grad_clipping(params, theta, device):
    # Global L2 norm over all parameter gradients
    norm = torch.tensor([0.], device=device)
    for param in params:
        norm += (param.grad.data ** 2).sum()
    norm = norm.sqrt().item()
    # Rescale so the global norm does not exceed theta
    if norm > theta:
        for param in params:
            param.grad.data *= (theta / norm)
def get_rnn_layer(input_size, hidden_size=256):
    """
    :param input_size: number of distinct characters
    :param hidden_size: number of hidden units
    """
    rnn_layer = nn.RNN(input_size=input_size, hidden_size=hidden_size)
    return rnn_layer
class RNNModel(nn.Module):

    def __init__(self, rnn_layer, vocab_size):
        super(RNNModel, self).__init__()
        self.rnn = rnn_layer
        # Doubled for bidirectional layers, whose forward and backward states are concatenated
        self.hidden_size = self.rnn.hidden_size * (2 if self.rnn.bidirectional else 1)
        self.vocab_size = vocab_size
        self.dense = nn.Linear(self.hidden_size, self.vocab_size)
        self.state = None

    def forward(self, X, state):
        # X: (batch_size, num_steps) --> list of num_steps tensors of shape (batch_size, vocab_size)
        X = to_one_hot(X, self.vocab_size)
        Y, self.state = self.rnn(torch.stack(X), state)
        # Flatten to (num_steps * batch_size, hidden_size) before the output layer
        Y = self.dense(Y.view(-1, Y.shape[-1]))
        return Y, self.state
if __name__ == '__main__':
    for (a, b) in load_jaychou_lyrics_iter_random(list(range(30))):
        print(a, "\n", b)
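The __main__ block above sketches random sampling on a toy index sequence; a matching sketch for consecutive sampling (toy values, purely illustrative) is:
for (a, b) in load_jaychou_lyrics_iter_consecutive(list(range(30)), batch_size=2, num_step=6,
                                                   device=torch.device("cpu")):
    print(a, "\n", b)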
3.3 main.py
"""
@author: Inki
@email: inki.yinji@qq.com
@create: 2021 0602
@last modify: 2021 0602
"""
from Test import *
def train(prefix, num_chars, model, idx2char, char2idx,
          device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    """Generate num_chars characters following the given prefix."""
    state = None
    output = [char2idx[prefix[0]]]
    for t in range(num_chars + len(prefix) - 1):
        # The most recent character is the input at the current time step
        X = torch.tensor([output[-1]], device=device).view(1, 1)
        if state is not None:
            if isinstance(state, tuple):  # LSTM, state: (h, c)
                state = (state[0].to(device), state[1].to(device))
            else:
                state = state.to(device)
        (Y, state) = model(X, state)  # the forward pass needs no explicit model parameters
        if t < len(prefix) - 1:
            output.append(char2idx[prefix[t + 1]])
        else:
            output.append(int(Y.argmax(dim=1).item()))
    return ''.join([idx2char[i] for i in output])


def test():
    idx2char_list, char2idx_dict, dict_size, _ = load_jaychou_lyrics(tr_range=(0, 10000))
    hidden_size = 256
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    rnn_layer = get_rnn_layer(input_size=dict_size, hidden_size=hidden_size)
    model = RNNModel(rnn_layer, dict_size).to(device)
    print(train("分开", 10, model, idx2char_list, char2idx_dict, device=device))
def train_predict(model, data_idx, idx2char, char2idx, num_epoch, num_step,
                  lr, clipping_theta, batch_size, pred_period, pred_len, prefixes,
                  device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    model.to(device)
    state = None
    for epoch in range(num_epoch):
        # n starts at a tiny value to avoid division by zero
        l_sum, n, start = 0.0, 1e-5, time.time()
        data_iter = load_jaychou_lyrics_iter_consecutive(data_idx, batch_size, num_step, device)  # consecutive sampling
        for X, Y in data_iter:
            if state is not None:
                # Detach the hidden state from the computation graph so that gradients
                # only propagate through the current mini-batch
                # (otherwise back-propagation would grow ever more expensive)
                if isinstance(state, tuple):  # LSTM, state: (h, c)
                    state = (state[0].detach(), state[1].detach())
                else:
                    state = state.detach()
            (output, state) = model(X, state)  # output shape: (num_steps * batch_size, vocab_size)
            # Y has shape (batch_size, num_steps); transpose and flatten it into a vector of
            # length batch_size * num_steps so that it lines up with the rows of output
            y = torch.transpose(Y, 0, 1).contiguous().view(-1)
            l = loss(output, y.long())
            optimizer.zero_grad()
            l.backward()
            # Gradient clipping
            grad_clipping(model.parameters(), clipping_theta, device)
            optimizer.step()
            l_sum += l.item() * y.shape[0]
            n += y.shape[0]
        if (epoch + 1) % pred_period == 0:
            print('epoch %d, perplexity %f, time %.2f sec' % (
                epoch + 1, math.exp(l_sum / n), time.time() - start))
            for prefix in prefixes:
                print(' -', train(prefix, pred_len, model, idx2char, char2idx))


def test1():
    num_epoch, batch_size, lr, clipping_theta, tr_range = 250, 32, 1e-3, 1e-2, (0, 10000)
    pred_period, pred_len, prefixes = 50, 50, ["分开", "不分开"]
    idx2char_list, char2idx_dict, dict_size, char2idx_list = load_jaychou_lyrics(tr_range=tr_range)
    hidden_size, num_step = 256, 25
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    rnn_layer = get_rnn_layer(input_size=dict_size, hidden_size=hidden_size)
    model = RNNModel(rnn_layer, dict_size).to(device)
    train_predict(model, char2idx_list, idx2char_list, char2idx_dict,
                  num_epoch, num_step, lr, clipping_theta, batch_size,
                  pred_period, pred_len, prefixes, device=device)
if __name__ == '__main__':
    test()