PyTorch: how to load a model and apply it

Reference link
https://blog.csdn.net/ifhuke/article/details/127625901

Tencent word vectors
https://www.cnblogs.com/yanqiang/p/13536619.html

Tencent's public word-embedding dataset
https://ai.tencent.com/ailab/nlp/en/embedding.html

The data used in this example has already been saved to Baidu Netdisk.

Other references
https://blog.csdn.net/Kaiyuan_sjtu/article/details/120446703

Common methods for Chinese word segmentation
https://www.jianshu.com/p/6c085bf1086f

Note

Note that you need to create a saved_dict directory under text_classify_data first; otherwise training will raise an error when it tries to save the model checkpoint.
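The directory can also be created from code before training starts; a minimal sketch, assuming the same text_classify_data folder used below (this snippet is not part of the original script):

import os

# create text_classify_data/saved_dict if it does not exist yet, so that
# torch.save(model.state_dict(), config.save_path) has somewhere to write the checkpoint
os.makedirs('text_classify_data/saved_dict', exist_ok=True)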

Key points

1. The difference between dense and sparse vectors

A dense vector stores its values as a plain array of doubles, while a sparse vector consists of two parallel arrays, indices and values. For example, the vector (1.0, 0.0, 1.0, 3.0) is written as [1.0, 0.0, 1.0, 3.0] in dense format and as (4, [0, 2, 3], [1.0, 1.0, 3.0]) in sparse format: the leading 4 is the vector length (number of elements), [0, 2, 3] is the indices array and [1.0, 1.0, 3.0] is the values array, meaning position 0 holds 1.0, position 2 holds 1.0, position 3 holds 3.0, and every other position is 0.
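A minimal sketch of the two formats with NumPy (the numbers are the ones from the example above; nothing here comes from the project code):

import numpy as np

dense = np.array([1.0, 0.0, 1.0, 3.0])                 # dense: every position is stored

size, indices, values = 4, [0, 2, 3], [1.0, 1.0, 3.0]  # sparse: length + non-zero positions + their values

rebuilt = np.zeros(size)
rebuilt[indices] = values                              # put the non-zero values back into place
print(np.array_equal(dense, rebuilt))                  # True: both describe the same vector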

2. Converting text to vectors

https://blog.csdn.net/weixin_44766179/article/details/103218288
One-hot text vectors are sparse and high-dimensional, so they need dimensionality reduction. Word embedding reduces them to dense, low-dimensional vectors, embedding the text into a numeric space.
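A minimal sketch of the idea with nn.Embedding: a one-hot representation would need one dimension per vocabulary entry, while an embedding layer maps each token ID to a short dense vector (the sizes below just mirror this example's vocabulary of 4762 characters and 300-dimensional vectors; the weights here are random, not the pretrained ones):

import torch
import torch.nn as nn

vocab_size, embed_dim = 4762, 300          # one-hot would need 4762 dims, the embedding uses 300
embedding = nn.Embedding(vocab_size, embed_dim)

token_ids = torch.LongTensor([[5, 6, 7]])  # a "sentence" of three character IDs
print(embedding(token_ids).shape)          # torch.Size([1, 3, 300]): dense vectors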

3. Random seeds

https://blog.csdn.net/weixin_51390582/article/details/124246873
In other words, when we see "random" in code it is not truly random (it is pseudo-random), and setting the seed only takes effect once.
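A small check of what that means in practice: the seed fixes the whole pseudo-random sequence, so each call still returns a new value, and identical results only come from re-seeding before drawing again:

import torch

torch.manual_seed(1)
a = torch.rand(2)
b = torch.rand(2)                      # differs from a: the seed fixes the sequence, not every call

torch.manual_seed(1)
print(torch.equal(a, torch.rand(2)))   # True: re-seeding reproduces the first draw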

import torch
import numpy as np

# folder that holds the data
dataset = 'text_classify_data'

# Sogou news: embedding_SougouNews.npz, Tencent: embedding_Tencent.npz
embedding = 'embedding_SougouNews.npz'

# set the random seeds so every run gives the same results and the model stays reproducible
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # make cuDNN deterministic as well

import numpy as np

class Config(object):

    """配置参数"""
    def __init__(self, dataset, embedding):
        '''
        :param dataset: 数据所在的文件夹路径
        :param embedding: 使用的词嵌入文件名称
        '''
        self.model_name = 'TextRNN'
        self.train_path = dataset + '/data/train.txt'                                # 训练集
        self.dev_path = dataset + '/data/dev.txt'                                    # 验证集
        self.test_path = dataset + '/data/test.txt'                                  # 测试集
        self.class_list = [x.strip() for x in open(
            dataset + '/data/class.txt').readlines()]                                # 类别列表
        self.vocab_path = dataset + '/data/vocab.pkl'                                # 词表
        self.save_path = dataset + '/saved_dict/' + self.model_name + '.ckpt'        # 模型训练结果
        self.embedding_pretrained = torch.tensor(
            np.load(dataset + '/data/' + embedding)["embeddings"].astype('float32')) # 预训练词向量        
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')   # 设备

        self.dropout = 0.5                                              # 随机失活
        self.require_improvement = 1000                                 # 若超过1000batch效果还没提升,则提前结束训练
        self.num_classes = len(self.class_list)                         # 类别数
        self.num_epochs = 10                                            # epoch数
        self.batch_size = 128                                           # mini-batch大小
        self.pad_size = 32                                              # 每句话处理成的长度(短填长切)
        self.learning_rate = 1e-3                                       # 学习率
        self.embed = self.embedding_pretrained.size(1)                  # 词向量维度, 若使用了预训练词向量,则维度统一
        self.hidden_size = 128                                          # lstm隐藏层
        self.num_layers = 2                                             # lstm层数


# set up the hyper-parameters
config = Config(dataset, embedding)
np.load(dataset + '/data/' + embedding)["embeddings"]
array([[ 0.29827962,  0.41063769,  0.89462984, ...,  0.6416691 ,
         0.88055139,  0.16834516],
       [ 0.00102   , -0.133386  , -0.190171  , ..., -0.14429501,
        -0.52121401,  0.206875  ],
       [-0.024858  ,  0.130821  , -0.401039  , ...,  0.34848201,
        -0.50993001, -0.183386  ],
       ...,
       [-0.20301799,  0.144519  , -0.003503  , ..., -0.29272199,
        -0.155543  ,  0.066212  ],
       [ 0.50523703,  0.6514817 ,  0.40988785, ...,  0.63914118,
         0.27362602,  0.79338627],
       [ 0.28897455,  0.88642565,  0.62531905, ...,  0.67309214,
         0.78327786,  0.13400399]])
import pickle as pkl
from tqdm import tqdm

UNK, PAD = '<UNK>', '<PAD>'  # unknown-character and padding symbols


def get_data(config):
    tokenizer = lambda x: [y for y in x]  # character-level tokenizer: split a string into single characters
    vocab = pkl.load(open(config.vocab_path, 'rb'))  # the vocabulary assigns an ID to every character; the embedding layer later maps each ID to its word vector

    print(f"Vocab size: {len(vocab)}")

    train = load_dataset(config.train_path, config.pad_size, tokenizer, vocab)
    dev = load_dataset(config.dev_path, config.pad_size, tokenizer, vocab)
    test = load_dataset(config.test_path, config.pad_size, tokenizer, vocab)
    return vocab, train, dev, test


def load_dataset(path, pad_size, tokenizer, vocab):
    '''
    Tokenize the text file at the given path and return it as a list of tuples
    :param path: file path
    :param pad_size: length every sequence is padded/truncated to
    :param tokenizer: word-level or character-level tokenizer
    :param vocab: vocabulary (character -> ID mapping)
    :return: list of (character-ID list, label) tuples
    '''
    contents = []
    with open(path, 'r', encoding='UTF-8') as f:
        # tqdm shows a progress bar
        for line in tqdm(f):
            lin = line.strip()
            if not lin:
                continue
            content, label = lin.split('\t')

            # words_line stores the ID of every character
            words_line = []
            # split the sentence into single characters
            token = tokenizer(content)
            # number of characters
            seq_len = len(token)
            if pad_size:
                # pad with <PAD> if shorter than pad_size, otherwise truncate
                if len(token) < pad_size:
                    token.extend([PAD] * (pad_size - len(token)))
                else:
                    token = token[:pad_size]
                    seq_len = pad_size
            # map every character to its ID; unknown characters fall back to <UNK>
            for word in token:
                words_line.append(vocab.get(word, vocab.get(UNK)))
            contents.append((words_line, int(label)))
    return contents


vocab, train_data, dev_data, test_data = get_data(config)

# {' ': 0,
#  '0': 1,
#  '1': 2,
#  '2': 3,
#  ':': 4,
#  '大': 5,
#  '国': 6,
#  '图': 7}
Vocab size: 4762


180000it [00:03, 59228.36it/s]
10000it [00:00, 67163.88it/s]
10000it [00:00, 66702.25it/s]
from torch.utils.data import Dataset, DataLoader
class TextDataset(Dataset):
    def __init__(self, data, config):
        self.device = config.device
        # collect all the texts into one tensor
        self.x = torch.LongTensor([x[0] for x in data]).to(self.device)
        # collect all the labels into one tensor
        self.y = torch.LongTensor([x[1] for x in data]).to(self.device)

    def __getitem__(self, index):
        # fetch one text
        self.text = self.x[index]
        # fetch the matching label
        self.label = self.y[index]
        return self.text, self.label

    def __len__(self):
        return len(self.x)
dataloaders = {
    'train': DataLoader(TextDataset(train_data, config), 128, shuffle=True),
    'dev': DataLoader(TextDataset(dev_data, config), 128, shuffle=True),
    'test': DataLoader(TextDataset(test_data, config), 128, shuffle=True)
}


import torch
import torch.nn as nn

class RNNModel(nn.Module):
    def __init__(self, config):
        super(RNNModel, self).__init__()
        # use the pretrained word vectors; freeze=False lets them keep updating during training
        self.embedding = nn.Embedding.from_pretrained(config.embedding_pretrained, freeze=False)
        # bidirectional=True gives a bidirectional LSTM (Long Short-Term Memory, a recurrent network for sequences)
        self.lstm = nn.LSTM(config.embed, config.hidden_size, config.num_layers,
                            bidirectional=True, batch_first=True, dropout=config.dropout)
        # the LSTM is bidirectional, so the classifier input size is config.hidden_size * 2
        self.fc = nn.Linear(config.hidden_size * 2, config.num_classes)

    def forward(self, x):
        out = self.embedding(x)  # [batch_size, seq_len, embedding] = [128, 32, 300]
        # the LSTM input is [batch_size, max_length, embedding_size]; it returns output, (h_n, c_n),
        # where output holds every time step, so the last time step is output[:, -1, :]
        out, _ = self.lstm(out)
        out = self.fc(out[:, -1, :])  # hidden state at the last time step of the sentence
        return out
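A quick shape check of the forward pass under the configuration above; a minimal sketch, assuming config has already been built (a batch of 128 sentences padded to 32 characters should give one score per class):

model_check = RNNModel(config)
dummy = torch.zeros(128, 32, dtype=torch.long)   # [batch_size, pad_size] of character IDs
print(model_check(dummy).shape)                  # torch.Size([128, 10]): one score per class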

import torch.nn as nn

# weight initialization, Xavier by default
def init_network(model, method='xavier', exclude='embedding', seed=123):
    for name, w in model.named_parameters():
        # skip the embedding layer: it already holds the pretrained vectors
        if exclude not in name:
            if 'weight' in name:
                if method == 'xavier':
                    nn.init.xavier_normal_(w)
                elif method == 'kaiming':
                    nn.init.kaiming_normal_(w)
                else:
                    nn.init.normal_(w)
            elif 'bias' in name:
                nn.init.constant_(w, 0)
            else:
                pass

import torch.nn as nn
import time
import torch
import copy
import pandas as pd
import datetime
from sklearn import metrics
import numpy as np

def train_best(config, model, dataloaders, log_step=100):
    '''
    Train the model
    :param config: hyper-parameters
    :param model: the model
    :param dataloaders: the prepared data, containing the train, dev and test loaders
    :param log_step: print metrics every log_step batches, default 100
    :return: DataFrame of training metrics
    '''

    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)
    loss_function = torch.nn.CrossEntropyLoss()
    best_acc = 0
    # best model weights
    best_model = copy.deepcopy(model.state_dict())

    total_step = 0  # how many batches have been processed in total
    dev_best_loss = float('inf')
    last_improve = 0  # batch count at the last improvement of the validation loss
    flag = False  # whether training has gone too long without improvement

    # keeps the most recently logged metrics for each epoch
    dfhistory = pd.DataFrame(columns=["epoch", "train_loss", "train_acc", "dev_loss", "dev_acc"])

    device = config.device

    print("Start Training...\n")
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("==========" * 8 + "%s\n" % nowtime)

    for i in range(config.num_epochs):
        # 1. training loop ----------------------------------------------------------------

        # batch counter within the current epoch
        step = 0

        print('Epoch [{}/{}]\n'.format(i + 1, config.num_epochs))

        for inputs, labels in dataloaders['train']:
            # training mode, so parameters can be updated
            model.train()

            inputs = inputs.to(device)
            labels = labels.to(device)
            # clear the gradients so they do not accumulate
            optimizer.zero_grad()

            outputs = model(inputs)
            loss = loss_function(outputs, labels)
            loss.backward()
            optimizer.step()

            total_step += 1
            step += 1

            if step % log_step == 0:
                true = labels.data.cpu()
                # torch.max returns (max values, indices); [1] keeps only the indices, i.e. the predicted classes
                predic = torch.max(outputs.data, 1)[1].cpu()
                train_loss = loss.item()
                train_acc = metrics.accuracy_score(true, predic)

                # 2. validation on the dev set ----------------------------------------------------------------
                dev_acc, dev_loss = dev_eval(model, dataloaders['dev'], loss_function)

                dfhistory.loc[i] = (i, train_loss, train_acc, dev_loss, dev_acc)

                if dev_loss < dev_best_loss:
                    dev_best_loss = dev_loss
#                     torch.save(model,'save_path')
                    torch.save(model.state_dict(), config.save_path)
                    last_improve = total_step

                print("[step = {} batch]  train_loss = {:.3f}, train_acc = {:.2%}, dev_loss = {:.3f}, dev_acc = {:.2%}".
                      format(step, train_loss, train_acc, dev_loss, dev_acc))

            if total_step - last_improve > config.require_improvement:
                # the validation loss has not dropped for more than 1000 batches, stop training
                print("No optimization for a long time, auto-stopping...")
                flag = True
                break
        if flag:
            break

    # 3. final evaluation loop ----------------------------------------------------------------
    model.load_state_dict(torch.load(config.save_path))
    model.eval()
    start_time = time.time()
    test_acc, test_loss = dev_eval(model, dataloaders['test'], loss_function)
    print('================'*8)
    print('test_loss: {:.3f}      test_acc: {:.2%}'.format(test_loss, test_acc))

    return dfhistory


def dev_eval(model, data, loss_function):
    '''
    Compute accuracy and loss on the dev or test set
    :param model: the model
    :param data: DataLoader for the dev or test set
    :param loss_function: loss function
    :return: accuracy and average loss
    '''
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data:
            outputs = model(texts)
            loss = loss_function(outputs, labels)
            loss_total += loss.item()
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)

    acc = metrics.accuracy_score(labels_all, predict_all)
    return acc, loss_total / len(data)
# train
# config.n_vocab = len(vocab)
# model = RNNModel(config).to(config.device)
# # writer = SummaryWriter(log_dir=config.log_path + '/' + time.strftime('%m-%d_%H.%M', time.localtime()))
# init_network(model)
# print(model.parameters)
# train_best(config, model, dataloaders)
# state_dict = torch.load('./text_classify_data/saved_dict/TextRNN.ckpt')
# model.load_state_dict(state_dict)


How to load and call the trained model, see the following link
https://blog.csdn.net/qq_35925375/article/details/121029494

# the checkpoint was saved with model.state_dict(), so torch.load returns a state_dict
model2 = torch.load("./text_classify_data/saved_dict/TextRNN.ckpt", map_location=torch.device('cpu'))
cc = RNNModel(config)
cc.load_state_dict(model2)  # copy the saved weights into a freshly built model
cc.eval()                   # switch to evaluation mode (disables dropout)

RNNModel(
  (embedding): Embedding(4762, 300)
  (lstm): LSTM(300, 128, num_layers=2, batch_first=True, dropout=0.5, bidirectional=True)
  (fc): Linear(in_features=256, out_features=10, bias=True)
)

Applying the loaded model

def dev_eval2(model, data, loss_function):
    '''
    Compute accuracy and loss on the dev or test set, with some debugging output
    :param model: the model
    :param data: DataLoader for the dev or test set
    :param loss_function: loss function
    :return: accuracy and average loss
    '''
    model.eval()
    loss_total = 0
    predict_all = np.array([], dtype=int)
    labels_all = np.array([], dtype=int)
    with torch.no_grad():
        for texts, labels in data:
            print(texts, '-', labels)

#             # debug: print every token ID in the first sample
#             for ss in texts[0]:
#                 print(ss.item(), "==")

#             # debug: map the token IDs back to characters through the vocab
#             # (the reverse of turning text into numbers)
#             for k, v in vocab.items():
#                 for ss in range(len(texts[0])):
#                     if texts[0][ss] == v:
#                         print(k)

            outputs = model(texts)
            loss = loss_function(outputs, labels)
            loss_total += loss.item()
            labels = labels.data.cpu().numpy()
            predic = torch.max(outputs.data, 1)[1].cpu().numpy()
            labels_all = np.append(labels_all, labels)
            predict_all = np.append(predict_all, predic)

    acc = metrics.accuracy_score(labels_all, predict_all)
    return acc, loss_total / len(data)


# character-level tokenizer, same as in get_data
tokenizer = lambda x: [y for y in x]
token1 = tokenizer("教育部:严禁发达地区学校到薄弱地区抢挖优秀校长和教师")

# number of characters
seq_len = len(token1)
if config.pad_size:
    # pad with <PAD> if shorter than pad_size (the example sentence is short enough, so no truncation is needed)
    if len(token1) < config.pad_size:
        token1.extend([PAD] * (config.pad_size - len(token1)))

words_line1 = []
contents1 = []
# contents1.append(([1, 2, 3], 3))
# contents1

# map every character to its ID; unknown characters fall back to <UNK>
for word in token1:
    words_line1.append(vocab.get(word, vocab.get(UNK)))
contents1.append((words_line1, 1))

# run the model on a batch containing this single sentence
hh = cc(torch.tensor([words_line1]))
hh
# take the index of the highest score as the predicted class
predic = torch.max(hh.data, 1)[1].cpu().numpy()
predic, hh, config.class_list[predic[0]]

Input: 教育部:严禁发达地区学校到薄弱地区抢挖优秀校长和教师 (an education-news headline), and the prediction is 'sports':

(array([7], dtype=int64),
 tensor([[-0.4999,  1.6377, -1.1072,  1.2998, -1.1940,  0.1228,  0.3395,  3.3256,
          -2.3486, -2.5429]], grad_fn=<...>),
 'sports')

Input: 台湾多地街头出现反战广告牌 (a politics-news headline), and the prediction is 'politics':

(array([6], dtype=int64),
 tensor([[-1.9477,  2.2553,  0.7324, -3.6068,  1.1799,  2.0770,  2.6818, -4.1674,
          -1.5136, -2.2992]], grad_fn=<...>),
 'politics')

It looks like the accuracy still leaves room for improvement.
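For repeated use, the manual steps above (tokenize, pad, map to IDs, run the model) can be wrapped into a small helper; a minimal sketch, assuming cc, vocab, config, PAD and UNK from above (predict_label is not part of the original code):

def predict_label(text, model=cc, pad_size=config.pad_size):
    # character-level tokenization, then pad (or truncate) to pad_size
    tokens = [ch for ch in text][:pad_size]
    tokens += [PAD] * (pad_size - len(tokens))
    # map characters to IDs; unknown characters fall back to <UNK>
    ids = [vocab.get(ch, vocab.get(UNK)) for ch in tokens]
    with torch.no_grad():
        logits = model(torch.tensor([ids]))
    return config.class_list[int(logits.argmax(dim=1))]

print(predict_label("教育部:严禁发达地区学校到薄弱地区抢挖优秀校长和教师"))  # e.g. 'sports', as above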
