Preface
🍨 This post is a learning-log article written for the [🔗365天深度学习训练营] (365-day deep learning training camp)
🍖 Original author: [K同学啊]
Before we begin
1) This week's tasks:
- Understand the code logic in this article and run it successfully
- Tune the code based on your own understanding so that accuracy reaches 70%
2) Environment: Python 3.8, PyCharm 2020, torch 1.12.1+cu113
1. Preparation
1.1 Environment Setup
This article implements text classification based on the PyTorch framework.
The code is as follows:
# 1. Preparation
# 1.1 Environment setup
import os, PIL, pathlib, warnings
import torch, torchvision
import torch.nn as nn
import pandas as pd
from torchvision import transforms, datasets

warnings.filterwarnings("ignore")

print(torch.__version__)
print(torchvision.__version__)

# Use the GPU if one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
Output:
2.0.0+cu118
0.15.1+cu118
cuda
1.2 Load the Data
The code is as follows:
# 1.2 Load the data
# Load the custom Chinese dataset (tab-separated: text \t label)
train_data = pd.read_csv('train.csv', sep='\t', header=None)
print(train_data.head())

# Build a dataset iterator that yields (text, label) pairs
def custom_data_iter(texts, labels):
    for x, y in zip(texts, labels):
        yield x, y

train_iter = custom_data_iter(train_data[0].values[:], train_data[1].values[:])
Output:
0 1
0 还有双鸭山到淮阴的汽车票吗13号的 Travel-Query
1 从这里怎么回家 Travel-Query
2 随便播放一首专辑阁楼里的佛里的歌 Music-Play
3 给看一下墓王之王嘛 FilmTele-Play
4 我想看挑战两把s686打突变团竞的游戏视频 Video-Play
2. Data Preprocessing
2.1 Build the Vocabulary
The jieba word-segmentation library is required; install it with pip install jieba.
The code is as follows (example):
# 2. Data preprocessing
# 2.1 Build the vocabulary
from torchtext.vocab import build_vocab_from_iterator
import jieba

# Chinese word segmentation: jieba.lcut returns a list of tokens
tokenizer = jieba.lcut

def yield_tokens(data_iter):
    for text, _ in data_iter:
        yield tokenizer(text)

vocab = build_vocab_from_iterator(yield_tokens(train_iter), specials=["<unk>"])
vocab.set_default_index(vocab["<unk>"])  # unknown tokens map to <unk>
# Example lookup: maps a list of tokens to their ids
vocab(['我', '想', '看', '和平', '精英', '上', '战神', '必备', '技巧', '的', '游戏', '视频'])

label_name = list(set(train_data[1].values[:]))
print('label name:', label_name)

# Pipelines: text -> list of token ids, label -> class index
text_pipeline = lambda x: vocab(tokenizer(x))
label_pipeline = lambda x: label_name.index(x)
print(text_pipeline('我想看和平精英上战神必备技巧的游戏视频'))
print(label_pipeline('Video-Play'))
Output:
Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\XiaoMa\AppData\Local\Temp\jieba.cache
Loading model cost 0.320 seconds.
Prefix dict has been built successfully.
label name: ['Radio-Listen', 'Other', 'Alarm-Update', 'Travel-Query', 'FilmTele-Play', 'Weather-Query', 'Audio-Play', 'HomeAppliance-Control', 'Music-Play', 'Calendar-Query', 'TVProgram-Play', 'Video-Play']
[2, 10, 13, 973, 1079, 146, 7724, 7574, 7793, 1, 186, 28]
11
2.2 Generate Data Batches and an Iterator
The code is as follows:
# 2.2 Generate data batches and an iterator
from torch.utils.data import DataLoader

def collate_batch(batch):
    label_list, text_list, offsets = [], [], [0]
    for (_text, _label) in batch:
        # Label list
        label_list.append(label_pipeline(_label))
        # Text list: convert the sentence into a tensor of token ids
        processed_text = torch.tensor(text_pipeline(_text), dtype=torch.int64)
        text_list.append(processed_text)
        # Offsets: record the length of each sample
        offsets.append(processed_text.size(0))
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = torch.cat(text_list)
    offsets = torch.tensor(offsets[:-1]).cumsum(dim=0)  # cumulative sum of lengths gives each sample's start offset
    return text_list.to(device), label_list.to(device), offsets.to(device)
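To make the offsets easier to picture, here is a minimal sketch (the two sentences are illustrative, not taken from train.csv): all token ids of a batch are concatenated into a single 1-D tensor, and offsets marks where each sample starts, which is exactly the input format nn.EmbeddingBag expects later.

sample_batch = [('随便播放一首歌', 'Music-Play'), ('从这里怎么回家', 'Travel-Query')]
texts, labels, offs = collate_batch(sample_batch)
print(texts)    # one flat tensor of token ids for both sentences
print(labels)   # tensor([index_of_Music-Play, index_of_Travel-Query])
print(offs)     # tensor([0, number_of_tokens_in_the_first_sentence])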
2.3 Build the Dataset
The code is as follows:
# 2.3 Build the dataset
from torch.utils.data.dataset import random_split
from torchtext.data.functional import to_map_style_dataset

BATCH_SIZE = 4

train_iter = custom_data_iter(train_data[0].values[:], train_data[1].values[:])
train_dataset = to_map_style_dataset(train_iter)

# 80% / 20% train / validation split
split_train_, split_valid_ = random_split(train_dataset,
                                          [int(len(train_dataset) * 0.8), int(len(train_dataset) * 0.2)])

train_dataloader = DataLoader(split_train_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid_, batch_size=BATCH_SIZE,
                              shuffle=True, collate_fn=collate_batch)
The to_map_style_dataset() function converts an iterable-style dataset into a map-style dataset, so that elements can be accessed conveniently by index (e.g. an integer). In PyTorch, datasets come in two flavors: iterable-style and map-style.
● An iterable-style dataset implements __iter__(); its elements can be iterated over, but it does not support access by index.
● A map-style dataset implements __getitem__() and __len__(); specific elements can be accessed directly by index, and the size of the dataset can be queried.
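As a quick illustration (a minimal sketch reusing the train_dataset built above), the converted dataset supports len() and integer indexing, which the plain generator returned by custom_data_iter does not:

print(len(train_dataset))   # total number of (text, label) pairs
print(train_dataset[0])     # the first pair, e.g. ('还有双鸭山到淮阴的汽车票吗13号的', 'Travel-Query')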
3. Model Construction
3.1 Define the Positional Encoder
The code is as follows:
# 3. Model construction
# 3.1 Define the positional encoding
import math

# Positional encoding
class PositionalEncoding(nn.Module):
    """Implements the sinusoidal positional encoding."""
    def __init__(self, embed_dim, max_len=500):
        super(PositionalEncoding, self).__init__()
        # Initialize the PE (positional encoding) matrix with shape (max_len, embed_dim)
        pe = torch.zeros(max_len, embed_dim)
        # Position indices, shape (max_len, 1)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        # The term inside sin/cos, rewritten with exp and log for numerical stability
        # (the standard Transformer formula uses the constant 10000.0)
        div_term = torch.exp(torch.arange(0, embed_dim, 2) * (-math.log(10000.0) / embed_dim))
        pe[:, 0::2] = torch.sin(position * div_term)  # PE(pos, 2i)
        pe[:, 1::2] = torch.cos(position * div_term)  # PE(pos, 2i+1)
        pe = pe.unsqueeze(0).transpose(0, 1)  # add a dimension so it broadcasts against the input
        # register_buffer stores a tensor that does not participate in gradient descent
        # but is still saved in the model's state_dict
        self.register_buffer('pe', pe)

    def forward(self, x):
        # Add the positional encoding to x
        x = x + self.pe[:x.size(0)]
        return x
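For reference, the code above computes the standard sinusoidal positional encoding from the original Transformer paper; div_term is simply 1 / 10000^(2i / embed_dim) rewritten through exp and log:

PE(pos, 2i)   = sin( pos / 10000^(2i / embed_dim) )
PE(pos, 2i+1) = cos( pos / 10000^(2i / embed_dim) )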
3.2 Define the Transformer Model
The code is as follows:
# 3.2 Define the Transformer model
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class TransformerModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class, nhead=8, d_hid=256, nlayers=12, dropout=0.1):
        super().__init__()
        # EmbeddingBag pools each sample's token embeddings into a single vector
        self.embedding = nn.EmbeddingBag(vocab_size, embed_dim, sparse=False)
        self.pos_encoder = PositionalEncoding(embed_dim)
        # Define the encoder layers
        encoder_layers = TransformerEncoderLayer(embed_dim, nhead, d_hid, dropout)
        self.transformer_encoder = TransformerEncoder(encoder_layers, nlayers)
        self.embed_dim = embed_dim
        self.linear = nn.Linear(embed_dim * 4, num_class)

    def forward(self, src, offsets, src_mask=None):
        src = self.embedding(src, offsets)            # (batch, embed_dim)
        src = self.pos_encoder(src)
        output = self.transformer_encoder(src, src_mask)
        # Flatten back to (batch, embed_dim * 4); this assumes BATCH_SIZE = 4
        output = output.view(-1, self.embed_dim * 4)
        output = self.linear(output)
        return output
3.3 Initialize the Model
The code is as follows:
# 3.3 Initialize the model
vocab_size = len(vocab)
embed_dim = 64
num_class = len(label_name)
model = TransformerModel(vocab_size, embed_dim, num_class).to(device)
3.4 Define the Training Function
The code is as follows:
# 3.4 Define the training function
import time

def train(dataloader):
    model.train()
    total_acc, train_loss, total_count = 0, 0, 0
    log_interval = 300
    start_time = time.time()

    for idx, (text, label, offsets) in enumerate(dataloader):
        predicted_label = model(text, offsets)
        optimizer.zero_grad()
        loss = criterion(predicted_label, label)
        loss.backward()
        optimizer.step()

        # Record loss and accuracy
        total_acc += (predicted_label.argmax(1) == label).sum().item()
        train_loss += loss.item()
        total_count += label.size(0)

        if idx % log_interval == 0 and idx > 0:
            elapsed = time.time() - start_time
            print('| epoch {:1d} | {:4d}/{:4d} batches '
                  '| train_acc {:4.3f} train_loss {:4.5f}'.format(epoch, idx, len(dataloader),
                                                                  total_acc / total_count,
                                                                  train_loss / total_count))
            total_acc, train_loss, total_count = 0, 0, 0
            start_time = time.time()
3.5 Define the Evaluation Function
The code is as follows:
# 3.5 Define the evaluation function
def evaluate(dataloader):
    model.eval()
    total_acc, train_loss, total_count = 0, 0, 0

    with torch.no_grad():
        for idx, (text, label, offsets) in enumerate(dataloader):
            predicted_label = model(text, offsets)
            loss = criterion(predicted_label, label)
            # Record accuracy and loss
            total_acc += (predicted_label.argmax(1) == label).sum().item()
            train_loss += loss.item()
            total_count += label.size(0)

    return total_acc / total_count, train_loss / total_count
4. Train the Model
4.1 Model Training
The code is as follows:
# 4. Train the model
# 4.1 Model training
epochs = 50
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(train_dataloader)
    val_acc, val_loss = evaluate(valid_dataloader)
    # Get the current learning rate
    lr = optimizer.state_dict()['param_groups'][0]['lr']
    print('-' * 69)