Reference links
https://blog.csdn.net/ifhuke/article/details/127625901
Tencent word vectors
https://www.cnblogs.com/yanqiang/p/13536619.html
Tencent's public word-embedding dataset
https://ai.tencent.com/ailab/nlp/en/embedding.html
The data used in this example has also been saved to Baidu Netdisk.
Other references
https://blog.csdn.net/Kaiyuan_sjtu/article/details/120446703
Common methods for Chinese word segmentation
https://www.jianshu.com/p/6c085bf1086f
Note
You need to create a saved_dict folder under text_classify_data first, otherwise training will fail when it tries to save the model.
Key concepts
1. Dense vectors vs. sparse vectors
A dense vector stores its values as a plain array of doubles, while a sparse vector consists of two parallel arrays, indices and values. For example, the vector (1.0, 0.0, 1.0, 3.0) is written in dense format as [1.0, 0.0, 1.0, 3.0] and in sparse format as (4, [0, 2, 3], [1.0, 1.0, 3.0]): the leading 4 is the vector's length (its number of elements), [0, 2, 3] is the indices array, and [1.0, 1.0, 3.0] is the values array, meaning position 0 holds 1.0, position 2 holds 1.0, position 3 holds 3.0, and every other position is 0. A minimal sketch of the conversion is shown below.
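The sketch below reconstructs the (length, indices, values) sparse triple from the dense list of the example above (plain Python, no library assumed):
# Minimal sketch: derive the sparse form of a dense vector
dense = [1.0, 0.0, 1.0, 3.0]                            # dense format: every position stored
indices = [i for i, v in enumerate(dense) if v != 0.0]  # positions of the non-zero entries
values = [v for v in dense if v != 0.0]                 # the non-zero entries themselves
print((len(dense), indices, values))                    # (4, [0, 2, 3], [1.0, 1.0, 3.0])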
2. Turning text into vectors
https://blog.csdn.net/weixin_44766179/article/details/103218288
One-hot encodings of text are sparse and high-dimensional, so they need dimensionality reduction; word embedding is exactly that reduction into dense vectors, embedding text into a numeric space.
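As a small illustration of embedding lookup (the sizes here are made up; the notebook later loads a pretrained 4762 x 300 matrix), an nn.Embedding layer turns sparse integer ids into dense vectors:
import torch
import torch.nn as nn

emb = nn.Embedding(num_embeddings=10, embedding_dim=300)  # toy vocabulary of 10 tokens
ids = torch.LongTensor([1, 2, 3])                         # three token ids
print(emb(ids).shape)                                     # torch.Size([3, 300]): one dense vector per id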
3. Random seeds
https://blog.csdn.net/weixin_51390582/article/details/124246873
In other words, whenever we see "random" in code it is not truly random (it is pseudo-random), and a seed takes effect only once.
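A short demonstration of "takes effect once" (the value shown is what NumPy produces for seed 1):
import numpy as np

np.random.seed(1)
print(np.random.rand())  # 0.417022004702574 -- identical on every run
print(np.random.rand())  # a different number: the seed is consumed, not re-applied

np.random.seed(1)
print(np.random.rand())  # 0.417022004702574 again after re-seeding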
import torch
import numpy as np

# Folder that holds the data
dataset = 'text_classify_data'
# Sogou News: embedding_SougouNews.npz, Tencent: embedding_Tencent.npz
embedding = 'embedding_SougouNews.npz'

# Fix the random seeds so every run gives the same result and the model can be reproduced
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # make cuDNN deterministic so results are identical across runs
import numpy as np

class Config(object):
    """Configuration parameters"""
    def __init__(self, dataset, embedding):
        '''
        :param dataset: folder that holds the data
        :param embedding: file name of the word embeddings to use
        '''
        self.model_name = 'TextRNN'
        self.train_path = dataset + '/data/train.txt'   # training set
        self.dev_path = dataset + '/data/dev.txt'       # validation set
        self.test_path = dataset + '/data/test.txt'     # test set
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt', encoding='utf-8').readlines()]  # list of class labels
        self.vocab_path = dataset + '/data/vocab.pkl'   # vocabulary
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'  # where the trained model is saved
        self.embedding_pretrained = torch.tensor(
            np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32'))  # pretrained word vectors
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # device
        self.dropout = 0.5                              # dropout rate
        self.require_improvement = 1000                 # stop early if no improvement after 1000 batches
        self.num_classes = len(self.class_list)         # number of classes
        self.num_epochs = 10                            # number of epochs
        self.batch_size = 128                           # mini-batch size
        self.pad_size = 32                              # every sentence is padded/truncated to this length
        self.learning_rate = 1e-3                       # learning rate
        self.embed = self.embedding_pretrained.size(1)  # embedding dimension, taken from the pretrained vectors
        self.hidden_size = 128                          # LSTM hidden size
        self.num_layers = 2                             # number of LSTM layers

# Instantiate the hyperparameter configuration
config = Config(dataset, embedding)
np.load(dataset + '/data/' + embedding)["embeddings"]
array([[ 0.29827962,  0.41063769,  0.89462984, ...,  0.6416691 ,
         0.88055139,  0.16834516],
       [ 0.00102   , -0.133386  , -0.190171  , ..., -0.14429501,
        -0.52121401,  0.206875  ],
       [-0.024858  ,  0.130821  , -0.401039  , ...,  0.34848201,
        -0.50993001, -0.183386  ],
       ...,
       [-0.20301799,  0.144519  , -0.003503  , ..., -0.29272199,
        -0.155543  ,  0.066212  ],
       [ 0.50523703,  0.6514817 ,  0.40988785, ...,  0.63914118,
         0.27362602,  0.79338627],
       [ 0.28897455,  0.88642565,  0.62531905, ...,  0.67309214,
         0.78327786,  0.13400399]])
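A quick sanity check on the loaded matrix (assuming the .npz file is in place); judging from the model summary further down, the shape should be (4762, 300), one 300-dimensional vector per vocabulary entry:
print(np.load(dataset + '/data/' + embedding)["embeddings"].shape)  # expected: (4762, 300)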
import pickle as pkl
from tqdm import tqdm

UNK, PAD = '<UNK>', '<PAD>'  # unknown-character and padding symbols

def get_data(config):
    tokenizer = lambda x: [y for y in x]  # character-level tokenizer as a lambda; try print(tokenizer(x)) to see it in action
    # The vocabulary assigns an id to every character; these ids index rows of the pretrained embedding matrix
    vocab = pkl.load(open(config.vocab_path, 'rb'))
    print(f"Vocab size: {len(vocab)}")
    train = load_dataset(config.train_path, config.pad_size, tokenizer, vocab)
    dev = load_dataset(config.dev_path, config.pad_size, tokenizer, vocab)
    test = load_dataset(config.test_path, config.pad_size, tokenizer, vocab)
    return vocab, train, dev, test

def load_dataset(path, pad_size, tokenizer, vocab):
    '''
    Tokenize the text file at the given path and return it as a list of pairs
    :param path: file path
    :param pad_size: length every sequence is padded or truncated to
    :param tokenizer: word-level or character-level tokenizer
    :param vocab: vocabulary mapping characters to ids
    :return: list of (character ids, label) pairs
    '''
    contents = []
    with open(path, 'r', encoding='UTF-8') as f:
        # tqdm shows a progress bar
        for line in tqdm(f):
            lin = line.strip()
            if not lin:
                continue
            content, label = lin.split('\t')
            # words_line stores the id of each character
            words_line = []
            # tokenize into individual characters
            token = tokenizer(content)
            # sequence length
            seq_len = len(token)
            if pad_size:
                # pad if shorter than pad_size, otherwise truncate
                if len(token) < pad_size:
                    # pad with the PAD token itself (not its id), so the loop below
                    # maps it to the PAD id instead of falling through to UNK
                    token.extend([PAD] * (pad_size - len(token)))
                else:
                    token = token[:pad_size]
                    seq_len = pad_size
            # map each character to its id
            for word in token:
                words_line.append(vocab.get(word, vocab.get(UNK)))
            contents.append((words_line, int(label)))
    return contents
vocab, train_data, dev_data, test_data = get_data(config)
# A sample of the vocab mapping (character -> id):
# {' ': 0,
# '0': 1,
# '1': 2,
# '2': 3,
# ':': 4,
# '大': 5,
# '国': 6,
# '图': 7}
Vocab size: 4762
180000it [00:03, 59228.36it/s]
10000it [00:00, 67163.88it/s]
10000it [00:00, 66702.25it/s]
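To see what load_dataset produced, you can inspect one processed sample (an illustrative check, not in the original): each entry is a list of pad_size character ids plus an integer label.
words, label = train_data[0]
print(len(words), label)  # prints 32 and the sample's label id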
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, data, config):
        self.device = config.device
        # stack all texts into one tensor
        self.x = torch.LongTensor([x[0] for x in data]).to(self.device)
        # stack all labels into one tensor
        self.y = torch.LongTensor([x[1] for x in data]).to(self.device)
    def __getitem__(self, index):
        # fetch one text and its matching label
        text = self.x[index]
        label = self.y[index]
        return text, label
    def __len__(self):
        return len(self.x)

dataloaders = {
    'train': DataLoader(TextDataset(train_data, config), 128, shuffle=True),
    'dev': DataLoader(TextDataset(dev_data, config), 128, shuffle=True),
    'test': DataLoader(TextDataset(test_data, config), 128, shuffle=True)
}
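Peeking at one batch (again just an illustrative check) confirms the shapes: inputs are [batch_size, pad_size] id tensors and labels are [batch_size]:
xb, yb = next(iter(dataloaders['train']))
print(xb.shape, yb.shape)  # torch.Size([128, 32]) torch.Size([128])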
import torch
import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, config):
        super(RNNModel, self).__init__()
        # Load the pretrained word vectors; freeze=False lets them be fine-tuned during training
        self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        # bidirectional=True gives a bidirectional LSTM (Long Short-Term Memory, a recurrent neural network)
        self.lstm = nn.LSTM(config.embed, config.hidden_size, config.num_layers,
                            bidirectional=True, batch_first=True, dropout=config.dropout)
        # The bidirectional LSTM concatenates both directions, so the classifier input is hidden_size * 2
        self.fc = nn.Linear(config.hidden_size * 2, config.num_classes)
    def forward(self, x):
        out = self.embedding(x)  # [batch_size, seq_len, embedding] = [128, 32, 300]
        # The LSTM input is [batch_size, max_length, embedding_size]; it returns output, (h_n, c_n),
        # where output holds every time step; the last step is output[:, -1, :]
        out, _ = self.lstm(out)
        out = self.fc(out[:, -1, :])  # hidden state at the sentence's final time step
        return out
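A hypothetical shape check with a dummy batch of token ids (zeros are valid vocabulary indices) shows that the model emits one logit per class:
model_check = RNNModel(config)
dummy = torch.zeros(4, config.pad_size, dtype=torch.long)  # a fake batch of 4 sequences
print(model_check(dummy).shape)                            # torch.Size([4, 10]): [batch, num_classes]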
import torch.nn as nn

# Weight initialization, Xavier by default
def init_network(model, method='xavier', exclude='embedding'):
    for name, w in model.named_parameters():
        # Skip the embedding layer: it already carries pretrained weights
        if exclude not in name:
            if 'weight' in name:
                if method == 'xavier':
                    nn.init.xavier_normal_(w)
                elif method == 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:
                nn.init.constant_(w, 0)
            else:
                pass
import torch.nn as nn
import time
import torch
import copy
import pandas as pd
import datetime
from sklearn import metrics
import numpy as np
def train_best(config, model, dataloaders, log_step=100):
    '''
    Train the model
    :param config: hyperparameters
    :param model: the model
    :param dataloaders: processed data, containing train, dev and test
    :param log_step: how many batches between log lines, default 100
    :return: training history as a DataFrame
    '''
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    loss_function = torch.nn.CrossEntropyLoss()
    best_acc = 0
    # best model so far
    best_model = copy.deepcopy(model.state_dict())
    total_step = 0                # counts batches across epochs
    dev_best_loss = float('inf')
    last_improve = 0              # batch count at the last dev-loss improvement
    flag = False                  # set when there has been no improvement for a long time
    # history of the logged metrics
    dfhistory = pd.DataFrame(columns=["epoch", "train_loss", "train_acc", "dev_loss", "dev_acc"])
    device = config.device
    print("Start Training...\n")
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("==========" * 8 + "%s\n" % nowtime)
    for i in range(config.num_epochs):
        # 1. Training loop ----------------------------------------------------------------
        # batch counter within this epoch
        step = 0
        print('Epoch [{}/{}]\n'.format(i + 1, config.num_epochs))
        for inputs, labels in dataloaders['train']:
            # training mode, so parameters can be updated
            model.train()
            inputs = inputs.to(device)
            labels = labels.to(device)
            # zero the gradients so they do not accumulate
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()
            total_step += 1
            step += 1
            if step % log_step == 0:
                true = labels.data.cpu()
                # torch.max returns (values, indices); we only need the indices in [1]
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_loss = loss.item()
                train_acc = metrics.accuracy_score(true, predic)
                # 2. Validation on the dev set --------------------------------------------
                dev_acc, dev_loss = dev_eval(model, dataloaders['dev'], loss_function)
                dfhistory.loc[i] = (i, train_loss, train_acc, dev_loss, dev_acc)
                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
                    # torch.save(model, 'save_path')
                    torch.save(model.state_dict(), config.save_path)
                    last_improve = total_step
                print("[step = {} batch] train_loss = {:.3f}, train_acc = {:.2%}, dev_loss = {:.3f}, dev_acc = {:.2%}".
                      format(step, train_loss, train_acc, dev_loss, dev_acc))
            if total_step - last_improve > config.require_improvement:
                # The dev loss has not improved for require_improvement batches; stop training
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break
    # 3. Final evaluation on the test set --------------------------------------------------
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss = dev_eval(model, dataloaders['test'], loss_function)
    print('================' * 8)
    print('test_loss: {:.3f} test_acc: {:.2%}'.format(test_loss, test_acc))
    return dfhistory
def dev_eval(model, data, loss_function):
    '''
    Compute accuracy and loss on the dev or test set
    :param model: the model
    :param data: dev- or test-set dataloader
    :param loss_function: loss function
    :return: accuracy and average loss
    '''
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data:
            outputs = model(texts)
            loss = loss_function(outputs, labels)
            loss_total += loss.item()
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)
    acc = metrics.accuracy_score(labels_all, predict_all)
    return acc, loss_total / len(data)
# train
# config.n_vocab = len(vocab)
# model = RNNModel(config).to(config.device)
# # writer = SummaryWriter(log_dir=config.log_path + '/' + time.strftime('%m-%d_%H.%M', time.localtime()))
# init_network(model)
# print(model.parameters)
# train_best(config, model, dataloaders)
# state_dict = torch.load('./text_classify_data/saved_dict/TextRNN.ckpt')
# model.load_state_dict(state_dict)
See the following link for how to load and use a saved model:
https://blog.csdn.net/qq_35925375/article/details/121029494
# The .ckpt file was written with torch.save(model.state_dict(), ...), so torch.load
# returns a state_dict (an OrderedDict of parameters), not a model object
model2 = torch.load("./text_classify_data/saved_dict/TextRNN.ckpt", map_location=torch.device('cpu'))
cc = RNNModel(config)
cc.load_state_dict(model2)
cc.eval()
RNNModel(
  (embedding): Embedding(4762, 300)
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=256, out_features=10, bias=True)
)
Apply the loaded model
def dev_eval2(model, data, loss_function):
    '''
    Same as dev_eval, but also prints each batch so predictions can be inspected
    :param model: the model
    :param data: dev- or test-set dataloader
    :param loss_function: loss function
    :return: accuracy and average loss
    '''
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data:
            print(texts, '-', labels)
            # Debug helper: map the ids in texts[0] back to characters via the vocab
            # for ss in texts[0]:
            #     print(ss.item(), "==")
            # str = "那么就要想办法将文字转为数值"
            # for k, v in vocab.items():
            #     # print(v)
            #     for ss in range(len(texts[0])):
            #         if texts[0][ss] == v:
            #             print(k)
            outputs = model(texts)
            loss = loss_function(outputs, labels)
            loss_total += loss.item()
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)
    acc = metrics.accuracy_score(labels_all, predict_all)
    return acc, loss_total / len(data)
tokenizer = lambda x: [y for y in x]
token1 = tokenizer("教育部:严禁发达地区学校到薄弱地区抢挖优秀校长和教师")
# sequence length
seq_len = len(token1)
if config.pad_size:
    # pad with the PAD token if the sentence is shorter than pad_size
    if len(token1) < config.pad_size:
        token1.extend([PAD] * (config.pad_size - len(token1)))
words_line1 = []
contents1 = []
# contents1.append(([1, 2, 3], 3))
# contents1
# map each character to its id; unknown characters fall back to UNK
for word in token1:
    words_line1.append(vocab.get(word, vocab.get(UNK)))
contents1.append((words_line1, 1))
hh = cc(torch.tensor([words_line1]))
hh
predic = torch.max(hh.data, 1)[1].cpu().numpy()
predic, hh, config.class_list[predic[0]]
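The steps above can be wrapped into a small convenience function (a hypothetical helper, not part of the original code), reusing PAD, UNK, vocab and config from earlier:
def predict(model, text, vocab, config):
    # tokenize into characters, truncate/pad to pad_size, map to ids, then classify
    token = [ch for ch in text][:config.pad_size]
    token.extend([PAD] * (config.pad_size - len(token)))
    ids = [vocab.get(ch, vocab.get(UNK)) for ch in token]
    with torch.no_grad():
        logits = model(torch.tensor([ids]))
    return config.class_list[int(torch.max(logits, 1)[1])]

predict(cc, "教育部:严禁发达地区学校到薄弱地区抢挖优秀校长和教师", vocab, config)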
Input: 教育部:严禁发达地区学校到薄弱地区抢挖优秀校长和教师 ("Ministry of Education: schools in developed regions are strictly forbidden from poaching outstanding principals and teachers from weaker regions"); the model answers sports:
(array([7], dtype=int64),
 tensor([[-0.4999,  1.6377, -1.1072,  1.2998, -1.1940,  0.1228,  0.3395,  3.3256,
          -2.3486, -2.5429]], grad_fn=<AddmmBackward0>),
 'sports')
If the input is 台湾多地街头出现反战广告牌 ("anti-war billboards appear on streets in many places across Taiwan"), it answers politics:
(array([6], dtype=int64),
 tensor([[-1.9477,  2.2553,  0.7324, -3.6068,  1.1799,  2.0770,  2.6818, -4.1674,
          -1.5136, -2.2992]], grad_fn=<AddmmBackward0>),
 'politics')
It seems the accuracy still has room for improvement.