# A Bi-GRU model (can be run unidirectional or bidirectional)
# for solving a text-classification problem.
# Input: texts and labels (texts are ASCII-encoded).
# Output: predicted class for each text.
import csv
import time
import datetime
import numpy as np
import torch
from matplotlib import pyplot as plt
from torch.utils.data import DataLoader
# Parameters
HIDDEN_SIZE = 256 # hidden-layer (and embedding) dimension of the GRU
BATCH_SIZE = 256 # mini-batch size
N_LAYER = 2 # number of stacked GRU layers
N_EPOCHS = 10 # number of training epochs
N_CHARS = 256 # vocabulary size: one embedding slot per possible ASCII byte value
USE_GPU = True # whether to move tensors/model to the GPU
label_num = 2 # number of target classes (binary classification)
train_file = 'data/train_datas.csv'
test_file = 'data/test_datas.csv'
def make_tensor(tensor):
    """Move *tensor* to the GPU when USE_GPU is set and CUDA is available.

    The original moved the tensor whenever USE_GPU was True, which raises
    a RuntimeError on machines without CUDA; checking availability lets the
    script fall back to the CPU gracefully.

    :param tensor: any torch.Tensor
    :return: the same tensor, on cuda:0 if enabled and available
    """
    if USE_GPU and torch.cuda.is_available():
        tensor = tensor.to(torch.device("cuda:0"))
    return tensor
class CharDataset:  # dataset wrapper around the CSV files
    """CSV-backed dataset yielding (text, int_label) pairs.

    Each CSV row is expected to be ``text,label`` where ``label`` is an
    integer written as text (e.g. "0" / "1").
    """

    def __init__(self, is_train_set=True):
        """Load every row of the train or test CSV into memory.

        :param is_train_set: choose train_file when True, else test_file
        """
        filename = train_file if is_train_set else test_file
        with open(filename, 'rt', encoding="utf-8", errors='ignore', newline='') as f:
            reader = csv.reader(f)  # parse the CSV file
            rows = list(reader)
        self.chars = [row[0] for row in rows]  # text column
        # Convert labels to int once here; the original re-assigned
        # self.labels[index] inside __getitem__, mutating dataset state on
        # every access (a hidden side effect in a read-only accessor).
        self.labels = [int(row[1]) for row in rows]  # label column
        self.len = len(self.chars)  # number of samples

    def __getitem__(self, index):
        # Pure read access: no mutation of internal state.
        return self.chars[index], self.labels[index]

    def __len__(self):
        return self.len
def char2list(char):
    """Encode a string as a list of code points (ASCII values).

    :param char: input string
    :return: (code-point list, its length)
    """
    codes = list(map(ord, char))
    return codes, len(codes)
def chars2tensors(chars, labels):
    """Vectorize a batch of strings for the GRU classifier.

    Steps:
      1. Encode every string as a list of ASCII code points.
      2. Zero-pad all sequences to the length of the longest one
         (allocate a zero matrix, then paste each sequence in).
      3. Sort the batch by sequence length, descending — required by
         pack_padded_sequence in the model's forward pass.

    :param chars: iterable of strings
    :param labels: iterable of integer class labels (0/1)
    :return: (padded code-point matrix, descending length vector, label
             vector), each passed through make_tensor (GPU if enabled)
    """
    # 1. Encode each string; keep (codes, length) pairs together.
    encoded = [char2list(text) for text in chars]
    lengths = torch.LongTensor([n for _, n in encoded])
    labels = torch.LongTensor(labels)
    # 2. Paste every sequence into a zero matrix of shape (batch, max_len).
    seq_tensor = torch.zeros(len(encoded), int(lengths.max())).long()
    for row, (codes, n) in enumerate(encoded):
        seq_tensor[row, :n] = torch.LongTensor(codes)
    # 3. Descending-length order, applied consistently to all three tensors.
    lengths, order = lengths.sort(dim=0, descending=True)
    seq_tensor = seq_tensor[order]
    labels = labels[order]
    return make_tensor(seq_tensor), make_tensor(lengths), make_tensor(labels)
def trainModel():
    """Run one training epoch over train_loader; return the summed batch loss.

    Uses the module-level classifier, criterion and optimizer set up in the
    __main__ block.
    """
    total_loss = 0
    for chars, labels in train_loader:
        # Vectorize the raw strings, then do one optimization step.
        seqs, lengths, targets = chars2tensors(chars, labels)
        logits = classifier(seqs, lengths)
        loss = criterion(logits, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss
def testModel():
    """Evaluate the classifier on test_loader; print and return accuracy."""
    correct = 0
    total = len(test_set)
    with torch.no_grad():
        for chars, labels in test_loader:
            seqs, lengths, targets = chars2tensors(chars, labels)
            logits = classifier(seqs, lengths)
            # argmax over class scores, kept 2-D to match targets' view.
            predictions = logits.max(dim=1, keepdim=True)[1]
            correct += predictions.eq(targets.view_as(predictions)).sum().item()
    percent = '%.2f' % (100 * correct / total)
    print(f'Test set: Accuracy {correct}/{total} {percent}%')
    return correct / total
class RNNClassifier(torch.nn.Module):
    """GRU-based sequence classifier (optionally bidirectional).

    Expects a batch of zero-padded code-point sequences whose rows are
    sorted by true length in descending order; the final GRU hidden state
    feeds a single linear layer producing class scores.
    """

    def __init__(self, input_size, hidden_size, output_size, n_layers=1, bidirectional=False):
        """
        :param input_size: vocabulary size (number of distinct code points)
        :param hidden_size: embedding and GRU hidden dimension
        :param output_size: number of target classes
        :param n_layers: number of stacked GRU layers
        :param bidirectional: run the GRU in both directions when True
        """
        super(RNNClassifier, self).__init__()
        self.hidden_size = hidden_size  # used by the GRU, along with n_layers below
        self.n_layers = n_layers
        self.n_directions = 2 if bidirectional else 1
        # input shape (seq_len, batch) -> output (seq_len, batch, hidden_size)
        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.gru = torch.nn.GRU(hidden_size, hidden_size, n_layers, bidirectional=bidirectional)
        # A bidirectional GRU yields two final hidden states, so the linear
        # layer's input dimension is doubled.
        self.fc = torch.nn.Linear(hidden_size * self.n_directions, output_size)

    def forward(self, input, seq_lengths):
        """
        :param input: LongTensor (batch, seq_len), zero-padded, rows sorted
                      by true length in descending order
        :param seq_lengths: true length of each row (tensor, any device)
        :return: unnormalized class scores of shape (batch, output_size)
        """
        input = input.t()  # (batch, seq) -> (seq, batch) for embedding/GRU
        batch_size = input.size(1)
        hidden = self._init_hidden(batch_size)
        embedding = self.embedding(input)
        # pack_padded_sequence requires the lengths on the CPU; calling
        # .cpu() unconditionally is a no-op for CPU tensors and removes the
        # original dependency on the USE_GPU global.
        # Packing drops the padded zeros so they never enter the GRU,
        # which also speeds up the computation.
        gru_input = torch.nn.utils.rnn.pack_padded_sequence(embedding, seq_lengths.cpu())
        output, hidden = self.gru(gru_input, hidden)  # bidirectional -> two final hiddens
        if self.n_directions == 2:
            # Concatenate the last hidden state of both directions.
            hidden_cat = torch.cat([hidden[-1], hidden[-2]], dim=1)
        else:
            hidden_cat = hidden[-1]
        fc_output = self.fc(hidden_cat)
        return fc_output

    def _init_hidden(self, batch_size):
        # Allocate the initial hidden state directly on the same device as
        # the model parameters; the original allocated on the CPU and relied
        # on the external make_tensor() helper to move it to the GPU.
        device = self.embedding.weight.device
        return torch.zeros(self.n_layers * self.n_directions, batch_size,
                           self.hidden_size, device=device)
if __name__ == '__main__':
    # Build datasets and loaders; training batches are shuffled, test is not.
    train_set = CharDataset(is_train_set=True)
    train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True)
    test_set = CharDataset(is_train_set=False)
    test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, shuffle=False)
    print("Train for %d epochs..." % N_EPOCHS)
    start = time.time()
    classifier = RNNClassifier(N_CHARS, HIDDEN_SIZE, label_num, N_LAYER)
    if USE_GPU:
        device = torch.device('cuda:0')
        classifier.to(device)
    criterion = torch.nn.CrossEntropyLoss() # loss function
    optimizer = torch.optim.Adam(classifier.parameters(), lr=0.001) # parameter updates
    acc_list = []
    for epoch in range(1, N_EPOCHS + 1):
        # Train one epoch, then evaluate accuracy on the test set.
        print('%d / %d:' % (epoch, N_EPOCHS))
        trainModel()
        acc = testModel()
        acc_list.append(acc)
    end = time.time()
    # Elapsed wall-clock time, truncated to whole seconds.
    print(datetime.timedelta(seconds=(end - start) // 1))
    # Save the trained weights.
    torch.save(classifier.state_dict(), './model/rnn_classifier.pth')
    # Save a human-readable description of the model architecture.
    with open('./model/rnn_classifier.txt', 'w') as f:
        f.write(str(classifier))
    # Plot per-epoch test accuracy.
    epoch = np.arange(1, len(acc_list) + 1, 1)
    acc_list = np.array(acc_list)
    plt.plot(epoch, acc_list)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.grid()
    plt.show()
# def read_file(file_path):
# with open(file_path, 'rt') as f:
# reader = csv.reader(f) # 读取文件
# rows = list(reader)
# chars = [row[0] for row in rows] # 取出字符
# labels = [row[1] for row in rows] # 取出标签
# return chars, labels