目录
引入包
import torch
from torch import nn
from torch.utils.data import DataLoader,Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
加载预训练模型
# Load the pretrained Chinese BERT checkpoint and wire it up for a
# 10-way sequence-classification head.
from transformers import BertTokenizer,BertForSequenceClassification,BertConfig
config=BertConfig.from_pretrained("D:\\jpdir\\bert\\bertchinese",num_labels=10)  # 10 target classes
tokenizer = BertTokenizer.from_pretrained("D:\\jpdir\\bert\\bertchinese")
model = BertForSequenceClassification.from_pretrained("D:\\jpdir\\bert\\bertchinese",config=config)
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:526: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_qint8 = np.dtype([("qint8", np.int8, 1)])
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:527: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_quint8 = np.dtype([("quint8", np.uint8, 1)])
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:528: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_qint16 = np.dtype([("qint16", np.int16, 1)])
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:529: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_quint16 = np.dtype([("quint16", np.uint16, 1)])
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:530: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
_np_qint32 = np.dtype([("qint32", np.int32, 1)])
d:\python\python37\lib\site-packages\tensorflow\python\framework\dtypes.py:535: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
np_resource = np.dtype([("resource", np.ubyte, 1)])
加载数据文件
# Parse the training file: each line is "<content>\t<label>".
# NOTE(review): the names are misleading — x_train holds the *texts* and
# x_test holds the *labels* of train.txt (this is not a train/test split);
# kept as-is in case later cells rely on these names.
x_train=[]
x_test=[]
with open("D:\\jpdir\\bert\\bertdata\\Multi-classification\\train.txt","r",encoding="utf-8") as f:
    for line in f:
        fields = line.split("\t")
        x_train.append(fields[0])
        x_test.append(fields[1].replace("\n",""))
# Same layout for the test file: y_train = texts, y_test = labels of test.txt.
y_train=[]
y_test=[]
with open("D:\\jpdir\\bert\\bertdata\\Multi-classification\\test.txt","r",encoding="utf-8") as f:
    for line in f:
        fields = line.split("\t")
        y_train.append(fields[0])
        y_test.append(fields[1].replace("\n",""))
定义数据
class CustomDataset(Dataset):
    """Dataset over a tab-separated text file: one "<content>\t<label>" per line."""

    def __init__(self,data_path):
        # Keep the raw lines in memory; parsing is deferred to __getitem__.
        with open(data_path,"r",encoding="utf-8") as f:
            self.data = f.readlines()

    def __len__(self):
        # Number of lines == number of samples.
        return len(self.data)

    def __getitem__(self, index):
        # Split once: first field is the text, second field is the label
        # (trailing newline and stray double quotes stripped).
        fields = self.data[index].split("\t")
        content = fields[0]
        label = fields[1].replace("\n","").replace("\"","")
        return content,label
实例化数据集
# Instantiate the datasets over the tab-separated train/test files.
train_data= CustomDataset("D:\\jpdir\\bert\\bertdata\\Multi-classification\\train.txt")
test_data= CustomDataset("D:\\jpdir\\bert\\bertdata\\Multi-classification\\test.txt")
len(train_data),len(test_data)  # -> (4610, 4768) in the recorded run
(4610, 4768)
使用loader加载数据
设定最大句子长度
# Maximum sentence length (in wordpiece tokens), shared by encoding and
# padding. (Name keeps the original's spelling: other cells reference it.)
maxlenhth=32

def add_padding(data):
    """Right-pad *data* (a list of token ids) with 0 up to maxlenhth, in place.

    0 is the [PAD] token index here. Sequences already at or beyond
    maxlenhth are returned unchanged. Returns the same (mutated) list.
    """
    if len(data)<maxlenhth:
        # list.extend is the idiomatic way to append n zeros; the original
        # looped over torch.arange just to count iterations.
        data.extend([0]*(maxlenhth-len(data)))
    return data
定义加collate_fn函数
这里处理tokenizer和padding
def collate_fn(batchData,tokenizer):
    """Collate (content, label) pairs into tensors for DataLoader.

    batchData: list of (sentence, label-string) pairs from CustomDataset.
    Returns (LongTensor[batch, maxlenhth] of token ids, LongTensor[batch]).
    """
    sentences = [pair[0] for pair in batchData]
    labels = [int(pair[1]) for pair in batchData]
    # encode() adds [CLS]/[SEP] and truncates to maxlenhth (was a hard-coded
    # 32; now kept consistent with the module-level setting), then
    # add_padding right-pads with 0 to a uniform length.
    encoded = [
        add_padding(tokenizer.encode(one, max_length=maxlenhth, add_special_tokens=True))
        for one in sentences
    ]
    return torch.tensor(encoded), torch.tensor(labels)
使用DataLoader加载数据
# Batch size 5; the lambda binds the module-level tokenizer into collate_fn.
loader = DataLoader(train_data, 5, shuffle=True,collate_fn=lambda x:collate_fn(x,tokenizer))
data_iter = iter(loader)
print(len(data_iter))  # number of batches: ceil(4610 / 5) = 922
# Peek at one batch to sanity-check shapes: ids are (5, 32), labels (5,).
data = next(data_iter)
"长度:",len(data[0]),"data[0]:",data[0],"data[1]:",data[1],"data:",data,data[0].size(),data[1].unsqueeze(1).size()
922
('长度:',
5,
'data[0]:',
tensor([[ 101, 517, 682, 1957, 3187, 3127, 518, 3119, 6228, 1086, 1932, 1094,
3209, 3241, 677, 4028, 1920, 5310, 2229, 102, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 2349, 7561, 680, 2357, 3306, 2199, 6158, 5739, 1744, 1957, 4374,
2970, 6224, 1217, 2135, 2196, 4265, 2900, 3189, 1377, 2521, 102, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 4242, 6946, 3215, 3777, 9560, 7555, 4680, 8183, 2398, 6629, 122,
118, 124, 2233, 1762, 1545, 1059, 3621, 8380, 2835, 102, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 3791, 1744, 8226, 674, 782, 7770, 5440, 868, 3152, 1091, 100,
3152, 1265, 3221, 1415, 886, 782, 2814, 3289, 100, 102, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 517, 7987, 722, 6484, 518, 100, 100, 2845, 1399, 2661, 5683,
2458, 1423, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]]),
'data[1]:',
tensor([9, 7, 1, 3, 8]),
'data:',
(tensor([[ 101, 517, 682, 1957, 3187, 3127, 518, 3119, 6228, 1086, 1932, 1094,
3209, 3241, 677, 4028, 1920, 5310, 2229, 102, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 2349, 7561, 680, 2357, 3306, 2199, 6158, 5739, 1744, 1957, 4374,
2970, 6224, 1217, 2135, 2196, 4265, 2900, 3189, 1377, 2521, 102, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 4242, 6946, 3215, 3777, 9560, 7555, 4680, 8183, 2398, 6629, 122,
118, 124, 2233, 1762, 1545, 1059, 3621, 8380, 2835, 102, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 3791, 1744, 8226, 674, 782, 7770, 5440, 868, 3152, 1091, 100,
3152, 1265, 3221, 1415, 886, 782, 2814, 3289, 100, 102, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0],
[ 101, 517, 7987, 722, 6484, 518, 100, 100, 2845, 1399, 2661, 5683,
2458, 1423, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0]]),
tensor([9, 7, 1, 3, 8])),
torch.Size([5, 32]),
torch.Size([5, 1]))
定义模型
测试预训练模型输出
BertForSequenceClassification的输入input_ids size是[batch_size,maxlength],labels的size是[batch_size,1]
input_ids 是中文转成设定的数字
labels是数据的分类标签
测试预训练模型输出
loss 损失值
logits 概率分布
# Smoke-test the pretrained model on one sentence.
# encode() adds [CLS]/[SEP]; unsqueeze(0) makes a batch of size 1.
input_ids = torch.tensor(tokenizer.encode("词汇阅读是关键 08年考研暑期英语复习全指南",max_length=32,add_special_tokens=True)).unsqueeze(0) # Batch size 1
labels = torch.tensor([1]).unsqueeze(0) # Batch size 1
# With labels supplied, this transformers version returns (loss, logits).
outputs = model(input_ids, labels=labels)
print(outputs)
loss, logits = outputs
loss, logits
(tensor(2.2565, grad_fn=<NllLossBackward0>), tensor([[ 0.5478, -0.0462, -0.2125, -0.8165, 0.1208, -0.4684, -0.9593, 0.4391,
0.1320, -1.0400]], grad_fn=<AddmmBackward0>))
(tensor(2.2565, grad_fn=<NllLossBackward0>),
tensor([[ 0.5478, -0.0462, -0.2125, -0.8165, 0.1208, -0.4684, -0.9593, 0.4391,
0.1320, -1.0400]], grad_fn=<AddmmBackward0>))
定义自己的模型
# Define model (NOTE(review): this MLP is never used by the BERT fine-tuning
# below; it appears to be kept from the PyTorch quickstart as a reference).
class NeuralNetwork(nn.Module):
    """A small MLP over flattened 28x28 inputs: 784 -> 512 -> 512 -> 10."""

    def __init__(self):
        super().__init__()
        # Attribute names kept so the printed module repr is unchanged.
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        )

    def forward(self, x):
        flat = self.flatten(x)
        return self.linear_relu_stack(flat)
model1 = NeuralNetwork()  # NOTE(review): model1 is never trained or used below
print(model1)
NeuralNetwork(
(flatten): Flatten(start_dim=1, end_dim=-1)
(linear_relu_stack): Sequential(
(0): Linear(in_features=784, out_features=512, bias=True)
(1): ReLU()
(2): Linear(in_features=512, out_features=512, bias=True)
(3): ReLU()
(4): Linear(in_features=512, out_features=10, bias=True)
)
)
# Fine-tune the BERT classifier for one pass over the loader.
optimizer = torch.optim.AdamW(model.parameters(),lr=0.001)
# NOTE(review): criterion is never used — BertForSequenceClassification
# computes its own CrossEntropyLoss when labels are passed in.
criterion = torch.nn.CrossEntropyLoss()
model.train()
for step, (sentences, labels) in enumerate(loader):
    optimizer.zero_grad()
    # NOTE(review): unsqueeze(1) gives labels shape (batch, 1); this worked
    # in the recorded run — confirm against the installed transformers version.
    loss, logits = model(sentences, labels=labels.unsqueeze(1))
    loss.backward()
    optimizer.step()
    print(step, loss.item())
参考
https://blog.51cto.com/u_15127680/3841198