import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
!nvidia-smi
df = pd.read_csv(
    r'dataset/cola_public/raw/in_domain_train.tsv',
    encoding='utf-8',
    delimiter='\t',
    header=None,
    names=['sentence_source', 'label', 'label_notes', 'sentence'],
)
df.shape
df.sample(10)
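# A quick sanity check (an optional addition, not part of the original flow):
# CoLA labels are binary (0 = unacceptable, 1 = acceptable), so the class
# balance is worth a look before training.
df['label'].value_counts()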
sentences = df['sentence'].values
# Note: encode(..., add_special_tokens=True) below already inserts [CLS] and
# [SEP], so wrapping the raw sentences in them manually would duplicate the
# special tokens.
labels = df['label'].values
sentences
tokenizer = BertTokenizer.from_pretrained(r'model/bert-base-chinese', do_lower_case=True)
tokenized_texts = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]
print("Tokenize the first sentence:")
print(tokenized_texts[0])
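# Optional inspection (an assumption about intent, not in the original):
# map the ids back to tokens to confirm that encode() inserted [CLS]/[SEP].
print(tokenizer.convert_ids_to_tokens(tokenized_texts[0]))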
MAX_LEN = 128
# Pad short sequences with the tokenizer's pad id; truncate long ones to
# MAX_LEN (note: hard truncation drops the trailing [SEP] token).
input_ids = torch.tensor([
    tokens + [tokenizer.pad_token_id] * (MAX_LEN - len(tokens)) if len(tokens) < MAX_LEN else tokens[:MAX_LEN]
    for tokens in tokenized_texts
])
print("Padded input shape:", input_ids.shape)
print(input_ids)
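# An alternative sketch, assuming a recent transformers version: the tokenizer
# itself can pad, truncate, and build attention masks in one call. This is an
# equivalent shortcut, not the original code's approach; `encoded` is a name
# introduced here for illustration.
encoded = tokenizer(
    list(df['sentence'].values),
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)
# encoded['input_ids'] and encoded['attention_mask'] correspond to the manual
# input_ids above and the attention_masks built below.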
attention_masks = []
for seq in input_ids:
    # 1.0 for real tokens, 0.0 for padding (BERT's pad token id is 0).
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)
attention_masks
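# A hedged sketch of the next step, wiring up the so-far-unused imports:
# split into train/validation sets and wrap the tensors in DataLoaders.
# test_size, random_state, and batch_size here are illustrative assumptions,
# not values from the original.
masks = torch.tensor(attention_masks)
labels_t = torch.tensor(labels)
train_idx, val_idx = train_test_split(np.arange(len(labels_t)), test_size=0.1, random_state=42)

train_data = TensorDataset(input_ids[train_idx], masks[train_idx], labels_t[train_idx])
val_data = TensorDataset(input_ids[val_idx], masks[val_idx], labels_t[val_idx])

batch_size = 32
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
validation_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=batch_size)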