个人主页zicesun.com
RNN
根据人的名字判断人物的性别是一个很有趣的工作. 预测英文姓名的性别,对first name进行预测,有很多种方法,比如朴素贝叶斯法,SVM,神经网络等.预测的时候能够利用的特征也有多种:名字的最后一个字母,两个字母(2-gram),最后一个字母是否是元音或者辅音.
我们将介绍如何利用RNN预测英文姓名的性别. 点击dataset即可下载数据集
数据处理
本项目采用的数据集是一个英文数据集.女性与男性的数据比约为5001:2943,在一般情况下,这样的数据比例较为合适. - female 5001条 - male 2943条
在处理数据的时候,对数据进行规范化处理:
- 所有字符小写化;
- 剔除所有的非字母数据;
- 字母数值化,每个字母对应其在字母表中的顺序;
- 模型的特征是姓名的单个字母;
- 训练数据和测试数据按照9:1的比例进行划分.
RNN模型
模型有三层,分别是RNN层, 全连接层,softmax层.RNN层采用LSTM网络结构.
class Classifier(nn.Module):
    """Character-level gender classifier: embedding -> dropout -> LSTM -> linear -> log-softmax.

    Args:
        input_size: vocabulary size (number of distinct character indices).
        hidden_size: LSTM hidden-state dimension.
        embedding_size: character embedding dimension.
    """

    def __init__(self, input_size, hidden_size, embedding_size):
        super(Classifier, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.embeddings = nn.Embedding(self.input_size, self.embedding_size)
        # Fix: `nn.init.xavier_normal` is deprecated; use the in-place `xavier_normal_`.
        nn.init.xavier_normal_(self.embeddings.weight.data)
        self.drop = nn.Dropout(p=0.1)
        self.rnn = nn.LSTM(input_size=self.embedding_size, hidden_size=self.hidden_size,
                           batch_first=True)
        self.out = nn.Linear(self.hidden_size, 2)
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, length):
        """Classify a padded batch of character-index sequences.

        Args:
            input: (batch, max_len) long tensor, rows sorted by descending length.
            length: per-row true sequence lengths (same order as `input`).

        Returns:
            Log-probabilities of shape (num_layers, batch, 2); with the default
            single-layer LSTM this is (1, batch, 2).
        """
        input = self.embeddings(input)
        input = self.drop(input)
        # Pack so the LSTM never processes the zero-padded positions.
        input_packed = nn.utils.rnn.pack_padded_sequence(input, length, batch_first=True)
        # Only the final hidden state is needed for classification.
        _, (ht, _) = self.rnn(input_packed, None)
        out = self.out(ht)
        out = self.log_softmax(out)
        return out
复制代码
RNN网络参数batch_first=True时,需要注意输入序列.用RNN(包括LSTM\GRU等)做NLP任务时,对于同一个batch内的短句子一般需要padding补齐,这些padding的字符一般不应该拿去算output、hidden state、loss.
在pytorch中,在处理NLP任务的时候,首先要讲数据的序列进行padding,让一个batch的数据长度一样.然后根据序列原有的长度进行逆序排序.在我们这个任务里,需要处理names序列,以及labels.这时候有两种方案:
- names序列按顺序排列之后,labels也按照names排列顺序进行调整.
- names序列按照长短顺序排列之后,进行处理,然后将结果序列调整为原来的顺序. 在这里,采用第一种方法.
# Sort the batch by descending length (required by pack_padded_sequence)
# and keep the labels aligned with the reordered names.
length = np.array([len(name) for name in names_list])
sort_idx = np.argsort(-length)
max_len = max(length)
name_tensors = torch.zeros(len(names), max_len).to(torch.long)
for row, idx in enumerate(sort_idx):
    seq = names_list[idx]
    # Left-align each sequence; remaining positions stay 0 (the padding index).
    name_tensors[row, :len(seq)] = torch.tensor(seq, dtype=torch.long)
names_lengths = torch.from_numpy(length[sort_idx]).to(torch.long)
labels = labels[sort_idx].view(1, -1).squeeze(0)
复制代码
训练
# Build model, optimizer and loss. len(chars) is the vocabulary size (padding + a-z).
classifier = Classifier(len(chars), 128, 128)
optimizer = optim.RMSprop(classifier.parameters(), lr=0.001)
loss_func = nn.NLLLoss()  # model already emits log-probabilities
total_loss = 0
total_step = 0
# NOTE(review): this loop never terminates on its own; interrupt it manually
# or replace `while True` with a fixed number of epochs.
while True:
    for data in train_loader:
        total_step += 1
        names = data[0]
        labels = data[1]
        name_tensors, labels, names_lengths = name_to_tensor(names, labels)
        out = classifier(name_tensors, names_lengths).view(-1, 2)
        loss = loss_func(out, labels)
        total_loss += loss.item()
        # Bug fix: clear gradients before backprop; without zero_grad() the
        # gradients of all previous batches accumulate into each update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if total_step % 50 == 0:
            print('%dth step, avg_loss: %0.4f' % (total_step, total_loss / total_step))
            # Quick accuracy check on a single test batch.
            # Switch to eval mode so dropout is disabled during evaluation.
            classifier.eval()
            with torch.no_grad():
                for data in test_loader:
                    names = data[0]
                    labels = data[1]
                    name_tensors, labels, names_lengths = name_to_tensor(names, labels)
                    out = classifier(name_tensors, names_lengths).view(-1, 2)
                    result = torch.argmax(out, dim=-1)
                    result = (result == labels).to(torch.float)
                    print(torch.mean(result))
                    break  # only evaluate one batch per check
            classifier.train()
复制代码
完整代码和实验结果
代码
# coding=utf-8
import random
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch.nn as nn
import numpy as np
import torch
import torch.optim as optim
################################load dataset ###################
################################################################
def load_data(path, label):
    """Read one name per line from `path` and return a list of (name, label) pairs."""
    with open(path) as f:
        return [(line.strip('\n'), label) for line in f]
# Load both gender files: female -> label 0, male -> label 1.
female_names = load_data('../datasets/names_gender/eng/female.txt', 0)
male_names = load_data('../datasets/names_gender/eng/male.txt', 1)
names = female_names + male_names
random.shuffle(names)
# 90/10 train/test split.
train_dataset = names[: int(len(names) * 0.9)]
test_dataset = names[int(len(names) * 0.9):]
# Index 0 is reserved for padding; indices 1..26 map to 'a'..'z'.
chars = [0] + [chr(code) for code in range(97, 123)]
class NameDataset(Dataset):
    """Thin Dataset wrapper over a list of (name, label) pairs.

    Numericalization is deferred to the collate step (name_to_tensor), so
    items are returned as raw (name, label) tuples.
    """

    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        name, label = self.data[index]
        return name, label
# Wrap the raw splits and expose them through DataLoaders:
# small shuffled batches for training, one big batch (500) for quick evaluation.
train_dataset, test_dataset = NameDataset(train_dataset), NameDataset(test_dataset)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=500, shuffle=True)
def name_to_tensor(names, labels, charset=None):
    """Vectorize a batch of names for the packed-LSTM classifier.

    Names are lowercased, mapped to indices in `charset` (characters not in
    the alphabet map to the padding index 0), zero-padded to the batch maximum
    length, and sorted by descending length as pack_padded_sequence requires.
    Labels are reordered so they stay aligned with the sorted names.

    Args:
        names: iterable of name strings.
        labels: 1-D tensor of labels, index-aligned with `names`.
        charset: optional alphabet list; defaults to the module-level `chars`
            ([0, 'a', ..., 'z'], index 0 reserved for padding).

    Returns:
        (name_tensors, labels, names_lengths), all sorted by descending
        name length; name_tensors is a (batch, max_len) long tensor.
    """
    if charset is None:
        charset = chars
    names_list = []
    for name in names:
        char_list = []
        for ch in name.lower():
            try:
                char_list.append(charset.index(ch))
            except ValueError:
                # Fix: was a bare `except:`; only an out-of-alphabet character
                # should fall back to the padding index.
                char_list.append(0)
        names_list.append(char_list)
    length = np.array([len(name) for name in names_list])
    sort_idx = np.argsort(-length)
    max_len = max(length)
    name_tensors = torch.zeros(len(names), max_len).to(torch.long)
    for i, idx in enumerate(sort_idx):
        for j, e in enumerate(names_list[idx]):
            name_tensors[i][j] = e
    names_lengths = torch.from_numpy(length[sort_idx]).to(torch.long)
    labels = labels[sort_idx].view(1, -1).squeeze(0)
    return name_tensors, labels, names_lengths
######################### model ###################
# #################################################################
class Classifier(nn.Module):
    """Character-level gender classifier: embedding -> dropout -> LSTM -> linear -> log-softmax.

    Args:
        input_size: vocabulary size (number of distinct character indices).
        hidden_size: LSTM hidden-state dimension.
        embedding_size: character embedding dimension.
    """

    def __init__(self, input_size, hidden_size, embedding_size):
        super(Classifier, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.embedding_size = embedding_size
        self.embeddings = nn.Embedding(self.input_size, self.embedding_size)
        # Fix: `nn.init.xavier_normal` is deprecated; use the in-place `xavier_normal_`.
        nn.init.xavier_normal_(self.embeddings.weight.data)
        self.drop = nn.Dropout(p=0.1)
        self.rnn = nn.LSTM(input_size=self.embedding_size, hidden_size=self.hidden_size,
                           batch_first=True)
        self.out = nn.Linear(self.hidden_size, 2)
        self.log_softmax = nn.LogSoftmax(dim=-1)

    def forward(self, input, length):
        """Classify a padded batch of character-index sequences.

        Args:
            input: (batch, max_len) long tensor, rows sorted by descending length.
            length: per-row true sequence lengths (same order as `input`).

        Returns:
            Log-probabilities of shape (num_layers, batch, 2); with the default
            single-layer LSTM this is (1, batch, 2).
        """
        input = self.embeddings(input)
        input = self.drop(input)
        # Pack so the LSTM never processes the zero-padded positions.
        input_packed = nn.utils.rnn.pack_padded_sequence(input, length, batch_first=True)
        # Only the final hidden state is needed for classification.
        _, (ht, _) = self.rnn(input_packed, None)
        out = self.out(ht)
        out = self.log_softmax(out)
        return out
# Build model, optimizer and loss. len(chars) is the vocabulary size (padding + a-z).
classifier = Classifier(len(chars), 128, 128)
optimizer = optim.RMSprop(classifier.parameters(), lr=0.001)
loss_func = nn.NLLLoss()  # model already emits log-probabilities
total_loss = 0
total_step = 0
# NOTE(review): this loop never terminates on its own; interrupt it manually
# or replace `while True` with a fixed number of epochs.
while True:
    for data in train_loader:
        total_step += 1
        names = data[0]
        labels = data[1]
        name_tensors, labels, names_lengths = name_to_tensor(names, labels)
        out = classifier(name_tensors, names_lengths).view(-1, 2)
        loss = loss_func(out, labels)
        total_loss += loss.item()
        # Bug fix: clear gradients before backprop; without zero_grad() the
        # gradients of all previous batches accumulate into each update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if total_step % 50 == 0:
            print('%dth step, avg_loss: %0.4f' % (total_step, total_loss / total_step))
            # Quick accuracy check on a single test batch.
            # Switch to eval mode so dropout is disabled during evaluation.
            classifier.eval()
            with torch.no_grad():
                for data in test_loader:
                    names = data[0]
                    labels = data[1]
                    name_tensors, labels, names_lengths = name_to_tensor(names, labels)
                    out = classifier(name_tensors, names_lengths).view(-1, 2)
                    result = torch.argmax(out, dim=-1)
                    result = (result == labels).to(torch.float)
                    print(torch.mean(result))
                    break  # only evaluate one batch per check
            classifier.train()
复制代码
实验结果
50th step, avg_loss: 0.4905
100th step, avg_loss: 0.4662
tensor(0.8200)
150th step, avg_loss: 0.4535
200th step, avg_loss: 0.4467
tensor(0.8020)
250th step, avg_loss: 0.4396
300th step, avg_loss: 0.4320
tensor(0.8260)
350th step, avg_loss: 0.4270
400th step, avg_loss: 0.4217
tensor(0.8300)
450th step, avg_loss: 0.4178
500th step, avg_loss: 0.4117
550th step, avg_loss: 0.4073
tensor(0.8140)
600th step, avg_loss: 0.4027
650th step, avg_loss: 0.3994
tensor(0.8260)
700th step, avg_loss: 0.3971
750th step, avg_loss: 0.3934
tensor(0.8100)
800th step, avg_loss: 0.3908
850th step, avg_loss: 0.3884
tensor(0.8160)
900th step, avg_loss: 0.3861
950th step, avg_loss: 0.3832
1000th step, avg_loss: 0.3822
tensor(0.8140)
1050th step, avg_loss: 0.3801
1100th step, avg_loss: 0.3789
tensor(0.8060)
1150th step, avg_loss: 0.3769
1200th step, avg_loss: 0.3757
tensor(0.8420)
复制代码
在训练的时候,验证的结果正确率最高能够达到86%.这个模型还比较粗糙,仅仅用名字的单个字母作为特征,模型还不能够完全分析出字母之间的联系.采用2-gram的方式,结果可能要好一点.这个待以后再去实现.