【Foundation】(3) transformers: Model

The content of this post and the posts that follow is based on the Bilibili uploader 你可是处女座啊.

1. Introduction

1.1 Model Types

  • Encoder models: autoencoding models. They use only the encoder and have bidirectional attention, so the representation of each token is computed with the full context on both sides.
  • Decoder models: autoregressive models. They use only the decoder and have unidirectional (causal) attention, so the representation of each token can only attend to the preceding context, never to what follows.
  • Encoder-decoder models: sequence-to-sequence models. The encoder uses bidirectional attention while the decoder uses unidirectional attention (see the loading sketch below).
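As a minimal sketch of the three families, each loads through the same Auto API; the checkpoints below are common public models chosen for illustration, not ones used later in this post:

from transformers import AutoModel

# Encoder-only (autoencoding), bidirectional attention, e.g. BERT
encoder_model = AutoModel.from_pretrained('bert-base-chinese')
# Decoder-only (autoregressive), causal attention, e.g. GPT-2
decoder_model = AutoModel.from_pretrained('gpt2')
# Encoder-decoder (sequence-to-sequence), e.g. T5
seq2seq_model = AutoModel.from_pretrained('t5-small')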

1.2 Model Head

The model head is the task-specific layer stack attached on top of the bare Transformer, e.g. a linear classifier over the pooled output. In transformers it corresponds to the AutoModelFor* classes (AutoModelForSequenceClassification, AutoModelForTokenClassification, AutoModelForQuestionAnswering, AutoModelForCausalLM, and so on), while the bare AutoModel returns only hidden states.

2. Loading a Model

from transformers import AutoConfig, AutoModel, AutoTokenizer

# Load online from the Hugging Face Hub
model = AutoModel.from_pretrained('hfl/rbt3')

# Download the model locally
# !git clone "https://huggingface.co/hfl/rbt3"
!git lfs clone "https://huggingface.co/hfl/rbt3" --include="*.bin"

# Load offline from the local directory
model = AutoModel.from_pretrained('rbt3')

# Inspect the model configuration
model.config
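AutoConfig is imported above but never used; as a minimal sketch, it exposes the architecture hyperparameters without instantiating any weights (hfl/rbt3 is a 3-layer distillation of Chinese RoBERTa-wwm-ext, so num_hidden_layers should be 3):

from transformers import AutoConfig

config = AutoConfig.from_pretrained('hfl/rbt3')
print(config.num_hidden_layers)   # 3
print(config.hidden_size)         # 768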

3. Calling a Model

sen = '弱小的我也有大梦想'
tokenizer = AutoTokenizer.from_pretrained('hfl/rbt3')   # output_attentions belongs on the model, not the tokenizer
inputs = tokenizer(sen, return_tensors='pt')
inputs
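The tokenizer returns a BatchEncoding of PyTorch tensors; a quick sanity check (the exact length depends on the tokenization, here 9 characters plus [CLS] and [SEP]):

print(inputs.keys())               # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
print(inputs['input_ids'].shape)   # torch.Size([1, 11])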

3.1 Calling a Model Without a Model Head

model = AutoModel.from_pretrained('hfl/rbt3', output_attentions=True)
output = model(**inputs)
output                              # a BaseModelOutputWithPoolingAndCrossAttentions
output.last_hidden_state.size()     # (batch_size, sequence_length, hidden_size)
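Because output_attentions=True was passed, the output also carries the per-layer attention maps; a minimal check of their shapes (hfl/rbt3 has 3 layers with 12 heads each):

print(len(output.attentions))      # 3, one tensor per layer
print(output.attentions[0].shape)  # (batch_size, num_heads, seq_len, seq_len)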

3.2 Calling a Model With a Model Head

from transformers import AutoModelForSequenceClassification, BertForSequenceClassification

# num_labels controls the size of the randomly initialized classification head
clz_model = AutoModelForSequenceClassification.from_pretrained('hfl/rbt3', num_labels=10)
clz_model(**inputs)

clz_model.config.id2label
clz_model.config.num_labels
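With num_labels=10 the head projects the pooled representation to 10 logits, and id2label falls back to generic names until it is set explicitly; roughly:

out = clz_model(**inputs)
print(out.logits.shape)              # torch.Size([1, 10])
print(clz_model.config.id2label[0])  # 'LABEL_0' by default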

4. Fine-Tuning in Practice

4.1 Imports

# Text classification in practice
import pandas as pd
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments

4.2 Loading the Data

# Approach 1: pandas
data = pd.read_csv('./datasets/ChnSentiCorp_htl_all.csv')
data.head()
data = data.dropna()
data
# Approach 2: datasets
dataset = load_dataset('csv', data_files='datasets/ChnSentiCorp_htl_all.csv', split='train')
dataset = dataset.filter(lambda x: x['review'] is not None)
dataset
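A quick look at one example; each row has a binary 'label' column (1 = positive, 0 = negative, matching the id2label mapping used at the end of this post) and a 'review' text column:

print(dataset[0])   # {'label': ..., 'review': '...'}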

4.3 Building the Dataset

from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self) -> None:
        super().__init__()
        self.data = pd.read_csv('./datasets/ChnSentiCorp_htl_all.csv')
        self.data = self.data.dropna()

    def __getitem__(self, index):
        return self.data.iloc[index]['review'], self.data.iloc[index]['label']

    def __len__(self):
        return len(self.data)

dataset = MyDataset()
for i in range(5):
    print(dataset[i])

4.4 Splitting the Dataset

from torch.utils.data import random_split

# Approach 1: random_split on the custom Dataset (fractional lengths require torch >= 1.13)
trainset, validset = random_split(dataset, lengths=[0.9, 0.1])
len(trainset), len(validset)
# Approach 2: train_test_split on the Hugging Face dataset from 4.2
dataset = dataset.train_test_split(test_size=0.1)
dataset
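Neither split above is seeded, so the partition changes across runs; a minimal sketch of making both approaches deterministic:

import torch
from torch.utils.data import random_split

# Approach 1: pass a seeded generator
trainset, validset = random_split(dataset, lengths=[0.9, 0.1],
                                  generator=torch.Generator().manual_seed(42))
# Approach 2: pass a seed directly
dataset = dataset.train_test_split(test_size=0.1, seed=42)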

4.5 Building the Data Loaders

from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import torch

# Approach 1: tokenize inside a custom collate function
tokenizer = AutoTokenizer.from_pretrained('hfl/rbt3')

def collate_func(batch):
    texts, labels = [], []
    for item in batch:
        texts.append(item[0])
        labels.append(item[1])
    inputs = tokenizer(texts, max_length=128, padding='max_length', truncation=True,
                       return_tensors='pt')
    inputs['labels'] = torch.tensor(labels)
    return inputs

trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=collate_func)
validloader = DataLoader(validset, batch_size=64, shuffle=False, collate_fn=collate_func)

# Approach 2: pre-tokenize with datasets.map and pad dynamically with DataCollatorWithPadding
tokenizer = AutoTokenizer.from_pretrained('rbt3')   # offline copy downloaded in section 2

def process_function(examples):
    tokenized_examples = tokenizer(examples['review'], max_length=128, truncation=True)
    tokenized_examples['labels'] = examples['label']
    return tokenized_examples

tokenized_datasets = dataset.map(process_function, batched=True, remove_columns=dataset['train'].column_names)
tokenized_datasets

from transformers import DataCollatorWithPadding
trainset, validset = tokenized_datasets['train'], tokenized_datasets['test']

trainloader = DataLoader(trainset, batch_size=32, shuffle=True, collate_fn=DataCollatorWithPadding(tokenizer))
validloader = DataLoader(validset, batch_size=32, shuffle=False, collate_fn=DataCollatorWithPadding(tokenizer))
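As a sanity check, one batch from each loader should come out as padded tensors; with Approach 1 the sequence length is always 128 (padding='max_length'), while with Approach 2 it varies per batch (dynamic padding):

batch = next(iter(trainloader))
print(batch['input_ids'].shape)   # e.g. torch.Size([32, 128])
print(batch['labels'].shape)      # torch.Size([32])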

4.6 Building the Model and Optimizer

from torch.optim import Adam
from transformers import AutoModelForSequenceClassification

# Approach 1: a manual training loop
model = AutoModelForSequenceClassification.from_pretrained('hfl/rbt3')
if torch.cuda.is_available():
    model.to('cuda')
opt = Adam(model.parameters(), lr=2e-5)

# Approach 2: the Trainer API
model = AutoModelForSequenceClassification.from_pretrained('rbt3')

# Build the evaluation function
import evaluate

acc_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")

def eval_metric(eval_predict):
    predictions, labels = eval_predict
    predictions = predictions.argmax(axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels)
    acc.update(f1)   # merge the two metric dicts into one result
    return acc
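A quick check of eval_metric on dummy logits (two samples, both predicted correctly, so both metrics are 1.0):

import numpy as np

eval_metric((np.array([[0.1, 0.9], [0.8, 0.2]]), np.array([1, 0])))
# -> {'accuracy': 1.0, 'f1': 1.0}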

4.7 Training the Model

import evaluate
clf_metrics = evaluate.combine(['accuracy', 'f1'])

def train(epoch=3, log_step=100):
    global_step = 0
    for ep in range(epoch):
        model.train()
        for batch in trainloader:
            if torch.cuda.is_available():
                batch = {k: v.to('cuda') for k, v in batch.items()}
            opt.zero_grad()
            outputs = model(**batch)
            loss = outputs.loss
            loss.backward()
            opt.step()
            global_step += 1
            if global_step % log_step == 0:
                print(f'epoch:{ep}, global_step:{global_step}, loss:{loss.item()}')
        acc = evaluate_model()
        print(f'ep:{ep}, {acc}')

# Named evaluate_model so it does not shadow the imported evaluate library
def evaluate_model():
    model.eval()
    with torch.inference_mode():
        for batch in validloader:
            if torch.cuda.is_available():
                batch = {k: v.to('cuda') for k, v in batch.items()}
            outputs = model(**batch)
            pred = torch.argmax(outputs.logits, dim=-1)
            clf_metrics.add_batch(predictions=pred.long(), references=batch['labels'].long())
    return clf_metrics.compute()
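With everything defined, Approach 1 is launched with a single call; the Trainer route below drives its own loop:

train()   # runs 3 epochs by default, printing the loss every 100 steps and metrics per epoch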
# Build the TrainingArguments

train_args = TrainingArguments(output_dir='./checkpoint',       # output directory
                               per_device_eval_batch_size=8,    # eval batch size per device
                               per_device_train_batch_size=8,   # train batch size per device
                               logging_steps=10,                # log every 10 steps
                               eval_strategy="epoch",           # evaluation strategy: epoch or steps
                               save_steps=100,                  # ignored here, since save_strategy='epoch'
                               save_strategy='epoch',           # checkpoint saving strategy
                               save_total_limit=2,              # keep at most 2 checkpoints
                               learning_rate=2e-5,              # learning rate
                               weight_decay=1e-5,               # weight decay
                               metric_for_best_model="f1",      # metric used to select the best model
                               load_best_model_at_end=True,     # reload the best checkpoint at the end
                               )
train_args
# Build the Trainer
from transformers import DataCollatorWithPadding

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['test'],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=eval_metric,
)
trainer.train()

4.8 Evaluating the Model

trainer.evaluate(tokenized_datasets["test"])

4.9 Making Predictions

trainer.predict(tokenized_datasets["test"])

sen = '我觉得这家酒店不错,饭很好吃'
id2label = {0: '差评!', 1: '好评!'}   # 0: negative review, 1: positive review
model.eval()
with torch.inference_mode():
    inputs = tokenizer(sen, return_tensors='pt')
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    print(f'Input: {sen}\nPrediction: {id2label.get(pred.item())}')

from transformers import pipeline

model.config.id2label = id2label
pipe = pipeline('text-classification', model=model, tokenizer=tokenizer,
                device=0 if torch.cuda.is_available() else -1)
pipe(sen)
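The pipeline returns a list of dicts using the id2label mapping set above; the score below is illustrative, not from a real run:

result = pipe(sen)
print(result)   # e.g. [{'label': '好评!', 'score': 0.98}]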