NLP FROM SCRATCH:GENERATING NAMES WITH A CHARACTER-LEVEL RNN
这是“NLP From Scratch”系列的第二个教程。第一个教程“NLP From Scratch: Classifying Names with a Character-Level RNN”是把names进行分类到对应的language category,本教程将会反过来,根据language category生成相应的names。如下:
> python sample.py Russian RUS
Rovakov
Uantov
Shavakov
> python sample.py German GER
Gerren
Ereng
Rosher
我们还是手动写代码创建RNN网络结构,与上一个教程最大不同是,上一个是读取name的所有letter之后预测category是哪个,本教程我们将输入category和第一个时刻的letter,然后输出指定category和first letter下的最大概率的name。
1. Preparing the Data
数据下载地址:https://download.pytorch.org/tutorial/data.zip
上一教程NLP From Scratch: Classifying Names with a Character-Level RNN
对数据预处理过程进行了详细介绍。
大概过程:
- 文件
data/names/[Language].txt
中一行表示一个名字,我们将该文件的所有行split into an array。
- 由Unicode编码转换为ASCII编码。
- 最终生成类似{language: [names …]}形式的字典。
from __future__ import unicode_literals, print_function, division
from io import open
import glob
import os
import unicodedata
import string
# Character vocabulary: the ASCII letters followed by a few punctuation
# characters (space, period, comma, semicolon, apostrophe, hyphen).
all_letters = "".join((string.ascii_letters, " .,;'-"))
# One slot beyond the vocabulary is reserved for the EOS marker.
n_letters = 1 + len(all_letters)
def findFiles(path):
    """Return the list of paths that match the glob pattern ``path``."""
    matches = glob.glob(path)
    return matches
# Turn a Unicode string into plain ASCII; see https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    """Return ``s`` reduced to the characters contained in ``all_letters``.

    The string is NFD-normalized first, so accented characters are split into
    a base character plus combining marks. Combining marks (Unicode category
    ``Mn``) are dropped, as is every character that is not in ``all_letters``.
    """
    kept = []
    for ch in unicodedata.normalize('NFD', s):
        if unicodedata.category(ch) == 'Mn':
            # Combining mark left over from the decomposition.
            continue
        if ch not in all_letters:
            continue
        kept.append(ch)
    return ''.join(kept)
# Read a file and split it into lines.
def readLines(filename):
    """Return the lines of ``filename`` converted to ASCII.

    The file is read as UTF-8, surrounding whitespace is stripped from the
    whole content, the content is split on newlines, and every line is passed
    through ``unicodeToAscii``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one ASCII string per line of the file.
    """
    # The context manager closes the file handle even if reading fails.
    with open(filename, encoding='utf-8') as handle:
        content = handle.read()
    return [unicodeToAscii(line) for line in content.strip().split('\n')]
# Build the category_lines dictionary: every category maps to a list of
# lines, i.e. {language: [names, ...]}. all_categories keeps the category
# names in the order in which their files were found.
category_lines = {}
all_categories = []

for filename in findFiles('./.data/data/names/*.txt'):
    # The category is the file name without its directory and extension.
    category = os.path.splitext(os.path.basename(filename))[0]
    all_categories.append(category)
    lines = readLines(filename)
    category_lines[category] = lines

n_categories = len(all_categories)

# Fail early with a download hint when no data file was found.
if not n_categories:
    raise RuntimeError(
        'Data not found. Make sure that you downloaded data '
        'from https://download.pytorch.org/tutorial/data.zip and extract it to '
        'the current directory.'
    )

print('# categories:', n_categories, all_categories)
print(unicodeToAscii("O'Néàl"))
# categories: 18 ['Arabic', 'Chinese', 'Czech', 'Dutch', 'English', 'French', 'German', 'Greek', 'Irish', 'Italian', 'Japanese', 'Korean', 'Polish', 'Portuguese', 'Russian', 'Scottish', 'Spanish', 'Vietnamese']
O'Neal
2. Creating the Network
import torch
import torch.nn as nn
class RNN(nn.Module):
    """Character-level recurrent network conditioned on a category vector.

    At every step the category, the current input and the previous hidden
    state are concatenated and fed through two linear layers (``i2h`` and
    ``i2o``). Their results are concatenated again and passed through a third
    linear layer (``o2o``), then dropout, then a log-softmax.

    The width of the category vector comes from the module-level
    ``n_categories``, not from a constructor argument.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the layers.

        Args:
            input_size: Width of the per-step input vector.
            hidden_size: Width of the hidden state.
            output_size: Width of the per-step output vector.
        """
        super(RNN, self).__init__()
        self.hidden_size = hidden_size

        # Both input-side layers consume the same concatenated vector.
        combined_size = n_categories + input_size + hidden_size
        self.i2h = nn.Linear(combined_size, hidden_size)
        self.i2o = nn.Linear(combined_size, output_size)
        self.o2o = nn.Linear(hidden_size + output_size, output_size)
        self.dropout = nn.Dropout(0.1)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, category, input, hidden):
        """Run one time step.

        Args:
            category: Category tensor, concatenated along dim 1.
            input: Input tensor for the current step.
            hidden: Hidden state from the previous step.

        Returns:
            A tuple ``(output, hidden)``: the log-softmax output of this step
            and the new hidden state.
        """
        # Concatenate the category, the current letter and the hidden state.
        combined = torch.cat((category, input, hidden), 1)
        # Linear layer 1: combined vector -> hidden_size.
        next_hidden = self.i2h(combined)
        # Linear layer 2: combined vector -> output_size.
        raw_output = self.i2o(combined)
        # Concatenate the results of layer 1 and layer 2.
        merged = torch.cat((next_hidden, raw_output), 1)
        # Linear layer 3: hidden_size + output_size -> output_size,
        # followed by dropout and log-softmax.
        log_probs = self.softmax(self.dropout(self.o2o(merged)))
        return log_probs, next_hidden

    def initHidden(self):
        """Return an all-zero hidden state of shape ``(1, hidden_size)``."""
        return torch.zeros(1, self.hidden_size)
3. Training
3.1 Preparing for Training
首先,利用helper functions 获取随机pairs=(category, line).
import random
# Pick a random item from a list.
def randomChoice(l):
    """Return the element of ``l`` at a random index in ``[0, len(l) - 1]``."""
    last_index = len(l) - 1
    picked = random.randint(0, last_index)
    return l[picked]
# Get a random category and a random line from that category.
def randomTrainingPair():
    """Return a random ``(category, line)`` pair.

    The category is drawn from ``all_categories`` first; the line is then
    drawn from that category's entry in ``category_lines``.
    """
    language = randomChoice(all_categories)
    name = randomChoice(category_lines[language])
    return language, name
对于每个时刻,网络结构的输入(category, current_letter, hidden state),输出(next letter, next hidden state)。
对于每个时刻,我们从the current letter预测 next letter。对于每个name,pairs的生成是基于name中相邻的letter组成。比如:“ABCD”,我们将创建(‘A’, ‘B’), (‘B’, ‘C’), (‘C’, ‘D’), (‘D’, ‘EOS’)
The category tensor 是a one-hot tensor,大小是<1×n_categories>。
# One-hot vector for a category.
def categoryTensor(category):
    """Return a ``1 x n_categories`` tensor that is 1 at ``category``'s index.

    The index is the position of ``category`` in ``all_categories``; every
    other entry is 0.
    """
    position = all_categories.index(category)
    one_hot = torch.zeros(1, n_categories)
    one_hot[0, position] = 1
    return one_hot
# Convert a name into a tensor: one one-hot row per letter, stacked into a
# matrix that represents the whole word.
def inputTensor(line):
    """Return a ``len(line) x 1 x n_letters`` one-hot tensor for ``line``.

    Row ``i`` is 1 at the index of ``line[i]`` in ``all_letters`` and 0
    everywhere else.

    Args:
        line: The name to encode.

    Returns:
        A float tensor of shape ``(len(line), 1, n_letters)``.

    Raises:
        ValueError: If ``line`` contains a character that is not in
            ``all_letters``.
    """
    tensor = torch.zeros(len(line), 1, n_letters)
    for position, letter in enumerate(line):
        letter_index = all_letters.find(letter)
        # str.find returns -1 for an unknown character; used as an index that
        # would silently set the last column, which is the EOS slot.
        if letter_index < 0:
            raise ValueError('letter %r is not in all_letters' % (letter,))
        tensor[position][0][letter_index] = 1
    return tensor
# Targets run from the second letter of the name up to and including EOS.
def targetTensor(line):
    """Return the target letter indexes for ``line`` as a ``LongTensor``.

    The targets are the ``all_letters`` indexes of ``line[1:]`` followed by
    the EOS index ``n_letters - 1``.

    Args:
        line: The name whose next-letter targets are wanted.

    Returns:
        A 1-D ``LongTensor`` with ``len(line)`` entries (a single EOS entry
        for an empty ``line``).

    Raises:
        ValueError: If a letter after the first one is not in ``all_letters``.
    """
    letter_indexes = []
    for letter in line[1:]:
        letter_index = all_letters.find(letter)
        # str.find returns -1 for an unknown character, which is not a valid
        # class index for the loss.
        if letter_index < 0:
            raise ValueError('letter %r is not in all_letters' % (letter,))
        letter_indexes.append(letter_index)
    letter_indexes.append(n_letters - 1)  # EOS
    return torch.LongTensor(letter_indexes)
随机生成pairs,然后将它们转化为(category, input, target)形式的tensor。
def randomTrainingExample():
    """Return ``(category_tensor, input_line_tensor, target_line_tensor)``.

    A random ``(category, line)`` pair is drawn and converted: the category to
    its one-hot tensor, the line to its one-hot letter matrix, and the line to
    its target indexes (second letter through EOS).
    """
    category, line = randomTrainingPair()
    return categoryTensor(category), inputTensor(line), targetTensor(line)
3.2. Training the Network
上个教程的分类任务只利用最后一个时刻的输出;与之不同,本篇教程将在每个时刻都进行预测并计算损失值。
# Negative log-likelihood loss; the network's last layer is a LogSoftmax.
criterion = nn.NLLLoss()
# Step size applied to the gradients when the parameters are updated.
learning_rate = 0.0005
# Inputs and outputs are both letter vectors; the hidden state has 128 units.
rnn = RNN(input_size=n_letters, hidden_size=128, output_size=n_letters)
def train(category_tensor, input_line_tensor, target_line_tensor):
    """Run one training step on a single name and update ``rnn`` in place.

    The loss is summed over every time step, back-propagated once, and the
    parameters are moved against their gradients by ``learning_rate``.

    Args:
        category_tensor: Category tensor passed to ``rnn`` at every step.
        input_line_tensor: Input tensor; its first dimension is the number of
            time steps.
        target_line_tensor: Target indexes, one per time step. The tensor is
            not modified.

    Returns:
        A tuple ``(output, loss)``: the network output of the last time step
        and the summed loss as a Python number.

    Raises:
        ValueError: If ``input_line_tensor`` has no time steps.
    """
    if input_line_tensor.size(0) == 0:
        # Without a single step there is neither an output nor a loss tensor.
        raise ValueError('input_line_tensor must contain at least one letter')

    # Out-of-place unsqueeze, so the caller's tensor keeps its shape and the
    # same tensor can be passed to train() again.
    targets = target_line_tensor.unsqueeze(-1)
    hidden = rnn.initHidden()

    # Clear the gradients; otherwise backward() accumulates onto the existing
    # ones.
    rnn.zero_grad()

    loss = 0
    for step, letter_tensor in enumerate(input_line_tensor):
        output, hidden = rnn(category_tensor, letter_tensor, hidden)
        # Accumulate the loss of every time step.
        loss += criterion(output, targets[step])

    # Back-propagate the summed loss.
    loss.backward()

    # Update step; no_grad keeps it out of the autograd graph.
    with torch.no_grad():
        for p in rnn.parameters():
            if p.grad is not None:
                p.add_(p.grad, alpha=-learning_rate)

    return output, loss.item()
import time
import math
def timeSince(since):
    """Format the time elapsed since ``since`` as ``'<minutes>m <seconds>s'``.

    Args:
        since: A timestamp as returned by ``time.time()``.
    """
    elapsed = time.time() - since
    minutes = math.floor(elapsed / 60)
    seconds = elapsed - minutes * 60
    return '%dm %ds' % (minutes, seconds)
每隔 print_every 次迭代打印一次已用时间和当前loss;每隔 plot_every 次迭代记录一次平均loss,用于后面画图。
n_iters = 100000
print_every = 5000
plot_every = 500
all_losses = []
total_loss = 0  # reset every plot_every iterations
start = time.time()

# Feed the model one training example at a time.
# The loop variable is not called `iter`, which would shadow the builtin for
# all code that runs afterwards.
for iteration in range(1, n_iters + 1):
    output, loss = train(*randomTrainingExample())
    total_loss += loss

    # Progress report: elapsed time, iteration, percent done, latest loss.
    if iteration % print_every == 0:
        print("%s (%d %d%%) %.4f" % (timeSince(start), iteration, iteration / n_iters*100, loss))

    # Record the average loss of the last plot_every iterations for plotting.
    if iteration % plot_every == 0:
        all_losses.append(total_loss / plot_every)
        total_loss = 0
0m 14s (5000 5%) 29.4672
0m 28s (10000 10%) 20.1486
0m 42s (15000 15%) 16.7669
0m 56s (20000 20%) 12.3206
1m 10s (25000 25%) 14.6552
1m 25s (30000 30%) 18.2652
1m 39s (35000 35%) 13.1117
1m 53s (40000 40%) 24.3611
2m 8s (45000 45%) 31.1215
2m 22s (50000 50%) 17.2064
2m 37s (55000 55%) 4.0709
2m 51s (60000 60%) 15.5258
3m 6s (65000 65%) 21.8035
3m 21s (70000 70%) 14.3376
3m 36s (75000 75%) 37.4147
3m 51s (80000 80%) 18.6844
4m 5s (85000 85%) 13.6755
4m 19s (90000 90%) 8.6191
4m 34s (95000 95%) 13.0634
4m 50s (100000 100%) 6.4869
4. Plotting the Losses
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
# Open a new figure and draw the averaged losses collected in all_losses.
plt.figure()
plt.plot(all_losses)
[<matplotlib.lines.Line2D at 0x29134ccdcc0>]
5. Sampling the Network
- 创建input category tensor、starting letter tensor、empty hidden state tensor
- 用the starting letter创建 a string output_name
- output的最大长度
- feed the current letter to the network
- Get the next letter from highest output, and next hidden state
- if the letter is EOS, stop here
- If a regular letter, add to output_name and continue
- 生成最终的name
# Upper bound on the number of network steps taken while sampling one name.
max_length = 20


# Sample a name from a category, given its starting letter.
def sample(category, start_letter='A'):
    """Generate one name for ``category`` that begins with ``start_letter``.

    The network is fed the current letter, the highest-scoring next letter is
    appended to the name and becomes the next input. This repeats until the
    EOS index is predicted or ``max_length`` steps have been taken. The raw
    output of every step is printed.

    Args:
        category: A category name from ``all_categories``.
        start_letter: The first letter of the generated name.

    Returns:
        The generated name, including ``start_letter``.
    """
    eos_index = n_letters - 1
    with torch.no_grad():  # no need to track history in sampling
        category_tensor = categoryTensor(category)
        letter_tensor = inputTensor(start_letter)
        hidden = rnn.initHidden()
        output_name = start_letter

        for _ in range(max_length):
            output, hidden = rnn(category_tensor, letter_tensor[0], hidden)
            print('output:', output)
            # Index of the highest-scoring entry of this step's output.
            best_index = output.topk(1)[1][0][0].item()
            if best_index == eos_index:  # the network predicted EOS
                break
            letter = all_letters[best_index]
            output_name += letter
            letter_tensor = inputTensor(letter)

        return output_name
def samples(category, start_letters='ABC'):
    """Print one sampled name per character of ``start_letters``.

    Args:
        category: A category name passed on to ``sample``.
        start_letters: Each character is used as the starting letter of one
            generated name.
    """
    for first_letter in start_letters:
        generated = sample(category, first_letter)
        print(generated)
# Generate one Russian name starting with 'R'.
samples(category='Russian', start_letters='R')
# samples('German', 'GER')
# samples('Spanish', 'SPA')
output: tensor([[-1.4413, -5.4598, -4.8666, -6.1329, -2.4513, -7.1161, -5.9060, -3.0190,
-1.8063, -7.2397, -5.7096, -4.5078, -6.0241, -5.3823, -1.3100, -6.2489,
-7.4404, -4.1200, -4.9572, -5.2206, -2.4143, -6.6713, -6.0780, -7.6809,
-4.3445, -5.8323, -8.3920, -7.9315, -8.1703, -7.8993, -8.3804, -8.3994,
-8.1776, -7.7506, -8.4526, -8.3975, -7.8748, -8.1534, -8.0270, -8.0865,
-8.3704, -8.3390, -8.4436, -8.2002, -8.3183, -8.3038, -8.2838, -8.3120,
-8.4860, -8.3879, -8.1261, -8.2742, -7.6758, -8.3950, -8.3926, -8.3634,
-5.3357, -8.2320, -7.3196]])
output: tensor([[-4.2243, -3.3834, -3.6097, -3.6600, -4.8209, -3.9725, -3.8417, -3.3276,
-4.0886, -4.4422, -2.8407, -3.4183, -2.6477, -2.5793, -3.6651, -3.4486,
-6.5613, -3.5908, -2.2531, -2.9721, -2.3586, -2.0722, -3.6778, -6.6563,
-4.1223, -3.9812, -7.7794, -7.2141, -7.6238, -7.1017, -7.8011, -7.6014,
-7.6500, -7.1003, -7.9286, -8.0263, -7.2799, -7.7786, -7.3070, -7.5131,
-7.6667, -7.6237, -7.9490, -7.5318, -7.5460, -7.7267, -7.7099, -7.8292,
-7.9670, -7.6456, -7.5693, -7.6834, -5.9614, -7.9111, -7.9610, -7.8803,
-6.9437, -7.4477, -5.2634]])
output: tensor([[-1.2362, -4.9570, -4.7936, -4.7268, -1.6312, -5.0020, -4.9987, -2.8746,
-2.2392, -5.7015, -3.6326, -4.4013, -5.0075, -5.1254, -2.1361, -4.7653,
-7.3361, -4.2661, -3.3871, -4.0629, -3.9821, -5.1486, -6.3289, -7.4465,
-3.9958, -4.9392, -8.9968, -8.5194, -8.8172, -8.3143, -9.1219, -9.1497,
-8.6827, -8.5474, -9.1323, -9.1637, -8.6329, -8.8342, -8.8349, -8.6719,
-8.8547, -8.9560, -9.1864, -8.7278, -8.9542, -8.8242, -9.1077, -8.9672,
-8.9083, -8.9585, -8.8132, -8.9318, -7.2933, -9.2192, -9.0394, -9.1762,
-7.7964, -8.7389, -4.7701]])
output: tensor([[-5.3886, -4.7167, -3.7475, -4.1300, -5.0341, -4.1315, -4.5360, -3.5546,
-3.6082, -5.3660, -1.4379, -2.1742, -3.6383, -1.9338, -4.4879, -4.8465,
-7.4296, -2.6857, -2.8674, -3.3065, -4.6224, -2.8724, -4.0867, -7.2982,
-3.2847, -4.8584, -8.9813, -8.5608, -8.9272, -8.6181, -9.1277, -9.1586,
-9.0637, -8.6138, -9.3083, -9.2960, -8.7978, -9.2561, -9.2114, -8.9565,
-9.1472, -9.2058, -9.3532, -9.0190, -9.3273, -9.0435, -9.3787, -9.1509,
-9.0924, -9.1141, -9.2940, -9.4039, -7.0684, -9.3861, -9.3719, -9.5770,
-8.4011, -8.8125, -4.1207]])
output: tensor([[ -2.4359, -5.4711, -5.2472, -4.9270, -2.6689, -4.9558, -5.7110,
-2.7672, -1.4588, -6.8242, -3.3992, -3.9154, -5.8384, -4.8454,
-1.5711, -5.3059, -7.7949, -4.7413, -3.1547, -4.2674, -4.2048,
-4.6826, -6.4816, -7.8292, -2.7387, -5.6351, -10.0187, -9.5470,
-9.8422, -9.4509, -10.2621, -10.4727, -9.9208, -9.5784, -10.4204,
-10.3154, -9.6771, -10.1955, -10.1793, -9.7159, -10.1339, -10.2025,
-10.2646, -9.7793, -10.4529, -10.0000, -10.5198, -10.1766, -10.0998,
-10.1162, -10.0015, -10.2873, -7.4078, -10.5454, -10.1746, -10.4858,
-9.4067, -9.8696, -2.5178]])
output: tensor([[ -5.4580, -5.3080, -4.0530, -4.3628, -3.8828, -3.8970, -5.2831,
-6.1293, -6.0859, -5.6576, -2.9308, -3.1711, -3.9806, -1.7532,
-4.8708, -5.1059, -7.7976, -3.6083, -1.7038, -3.6821, -6.8194,
-2.6986, -5.5729, -7.1876, -3.7523, -5.5891, -9.8540, -9.8926,
-10.1004, -9.7783, -10.2589, -10.3617, -10.1926, -9.7958, -10.3698,
-10.6007, -9.9273, -10.3753, -10.3624, -9.9205, -10.1051, -10.2533,
-10.4298, -10.0148, -10.4460, -10.2790, -10.7557, -10.2795, -10.0166,
-10.2439, -10.2326, -10.6405, -7.0911, -10.4892, -10.3478, -10.6048,
-10.5618, -10.0589, -1.3129]])
Rovaki
多次运行samples('Russian', 'R')
函数,可能会产生不一样的name,如:Rovaki、Rokvio…。留下一个疑问:模型已经训练好,参数也都已经学习好了,为什么结果还会不一样呢?(提示:留意网络中的 nn.Dropout 层在采样时是否仍然生效。)