These notes introduce the basic usage of the main components of the Transformers library; almost any NLP task can be assembled from these building blocks.
1. Basic usage of pipeline
# List the task types supported by pipeline
from transformers import pipelines
# Iterate over and print the supported task types
for k, v in pipelines.SUPPORTED_TASKS.items():
    print(k, v)
# Creating and using a Pipeline
from transformers import pipeline
# Load from a local path, specifying a Chinese model at the same time
pipe = pipeline("text-classification", model="../models/bert-base-chinese")
pipe("我感觉还行")
# Load the model first, then create the pipeline; in this case both model and tokenizer must be passed explicitly
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("../models/bert-base-chinese")
tokenizer = AutoTokenizer.from_pretrained("../models/bert-base-chinese")
# Create the pipeline
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
pipe("感觉很好")
# Other tasks (a model actually fine-tuned for question answering is needed for meaningful answers)
qa_pipe = pipeline("question-answering", model="../models/bert-base-chinese")
qa_pipe(question="中国的首都是哪里?", context="中国的首都是北京", max_answer_len=1)
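A pipeline can also be placed on a GPU through the device argument; below is a minimal sketch, assuming CUDA device 0 is the one you want (device=-1 falls back to CPU):
import torch
# Use GPU 0 when available, otherwise stay on CPU
device = 0 if torch.cuda.is_available() else -1
gpu_pipe = pipeline("text-classification", model="../models/bert-base-chinese", device=device)
gpu_pipe("我感觉还行")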
# What a Pipeline does under the hood
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
tokenizer = AutoTokenizer.from_pretrained("../models/bert-base-chinese")
model = AutoModelForSequenceClassification.from_pretrained("../models/bert-base-chinese")
# Preprocess the input with the tokenizer
input_text = "我感觉很好"
inputs = tokenizer(input_text, return_tensors="pt")
# print(inputs)
# Feed the inputs into the model
res = model(**inputs)
# print(res)
# Convert the logits to probabilities
logits = res.logits
logits = torch.softmax(logits, dim=-1)
# print(logits)
# Get the predicted label id
pred = torch.argmax(logits).item()
# print(pred)
# Map the label id to the label name
print(model.config.id2label)
result = model.config.id2label.get(pred)
print(result)
2. Basic usage of tokenizer
from transformers import AutoTokenizer
# Loading and saving a tokenizer
tokenizer = AutoTokenizer.from_pretrained("../models/bert-base-chinese")
# print(tokenizer)
# Save to a local directory
tokenizer.save_pretrained("./save_tokenizer")
# Inspect the vocabulary
print(tokenizer.vocab)
# Vocabulary size
print(tokenizer.vocab_size)
# Tokenizing a sentence
sen = "重庆是个好地方"
# Tokenize
tokens = tokenizer.tokenize(sen)
print(tokens)
# Convert the token sequence to vocabulary ids
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)
# Convert ids back to tokens
new_tokens = tokenizer.convert_ids_to_tokens(ids)
print(new_tokens)
# Convert tokens back to a string
str_sen = tokenizer.convert_tokens_to_string(tokens)
print(str_sen)
# One-step conversion
# tokens to ids --> encode
ids = tokenizer.encode(sen, add_special_tokens=True)
print(ids)
# ids to string --> decode
tokens = tokenizer.decode(ids)
print(tokens)
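decode keeps the special tokens such as [CLS] and [SEP] by default; they can be stripped with skip_special_tokens:
# Decode while dropping [CLS]/[SEP]
print(tokenizer.decode(ids, skip_special_tokens=True))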
# Sentences vary in length, so padding and truncation are needed
# Padding: fill with 0 up to max_length
ids = tokenizer.encode(sen, padding="max_length", max_length=20)
print(ids)
# Truncation: cut down to a unified length
tokens = tokenizer.encode(sen, max_length=5, truncation=True)
print(tokens)
# Manually build the attention_mask and token_type_ids
ids = tokenizer.encode(sen, padding="max_length", max_length=15)
attention_mask = [1 if idx != 0 else 0 for idx in ids]
token_type_ids = [0] * len(ids)
print(ids, "\n",attention_mask,"\n", token_type_ids)
# One call does all of the above
inputs = tokenizer.encode_plus(sen, padding="max_length", max_length=15)
print(inputs)
inputs = tokenizer(sen, padding="max_length", max_length=15)
print(inputs)
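The tokenizer also accepts a list of sentences and pads/truncates them into one batch of tensors in a single call; a small sketch reusing the example sentences from earlier:
# Batch-encode several sentences into padded PyTorch tensors
sens = ["重庆是个好地方", "我感觉还行", "感觉很好"]
batch_inputs = tokenizer(sens, padding=True, truncation=True, max_length=15, return_tensors="pt")
print(batch_inputs["input_ids"].shape)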
# Loading special tokenizers (unofficial or custom ones)
# trust_remote_code=True must be passed in this case
tokenizer = AutoTokenizer.from_pretrained("../chatglm3-6b", trust_remote_code=True)
# tokenizer
tokenizer.decode(tokenizer.encode(sen))
3. Basic usage of datasets
# Loading a dataset from local files
from datasets import load_dataset, Dataset
# Read a local csv file
dataset = load_dataset("csv", data_files="../transformers-code-master/01-Getting Started/04-model/ChnSentiCorp_htl_all.csv", split="train")
print(dataset)
# Convert a pandas DataFrame to a Dataset
import pandas as pd
pdf = pd.read_csv("../transformers-code-master/01-Getting Started/04-model/ChnSentiCorp_htl_all.csv")
dataset = Dataset.from_pandas(pdf)
print(dataset)
# Read the csv directly with Dataset
dataset = Dataset.from_csv("../transformers-code-master/01-Getting Started/04-model/ChnSentiCorp_htl_all.csv")
print(dataset)
# Dataset operations
print(dataset.column_names)
print(dataset.features)
print(dataset[:3])
print(dataset["label"][:3])
# Splitting the dataset
print(dataset.train_test_split(test_size=0.2))
# Select and filter
print(dataset.select([0,3]))
print(dataset.filter(lambda example: example["label"] == 1))
# Filter out null reviews
dataset = dataset.filter(lambda x: x["review"] is not None)
# Mapping a function over the dataset
def add_prefix(example):
    if example["review"] is not None:
        example["review"] = 'Prefix: ' + example["review"]
    return example
map_dataset = dataset.map(add_prefix)
map_dataset["review"][:3]
# Combining map with a tokenizer
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("../models/bert-base-chinese")
def process_fun(example, tokenizer=tokenizer):
    model_inputs = tokenizer(example["review"], max_length=128, truncation=True)
    model_inputs["labels"] = example["label"]
    return model_inputs
processed_dataset = dataset.map(process_fun, batched=True)
print(processed_dataset)
# Remove the original columns that should not be fed to the model
processed_dataset = dataset.map(process_fun, batched=True, remove_columns=dataset.column_names)
print(processed_dataset)
# Working with the built-in DataCollators
from transformers import DataCollatorWithPadding
from datasets import load_dataset
dataset = load_dataset("csv", data_files="../transformers-code-master/01-Getting Started/04-model/ChnSentiCorp_htl_all.csv", split="train")
dataset = dataset.filter(lambda example: example["review"] is not None)
# print(dataset)
def process_fun(example, tokenizer=tokenizer):
    model_inputs = tokenizer(example["review"], max_length=128, truncation=True)
    model_inputs["labels"] = example["label"]
    return model_inputs
tokenized_dataset = dataset.map(process_fun, batched=True, remove_columns=dataset.column_names)
# print(tokenized_dataset)
# Batch the data with a DataLoader
from torch.utils.data import DataLoader
collator = DataCollatorWithPadding(tokenizer=tokenizer)
dl = DataLoader(tokenized_dataset, batch_size=4, collate_fn=collator, shuffle=True)
next(enumerate(dl))[1]
4. Basic usage of model
# Commonly used models fall into three categories: autoregressive, autoencoding, and sequence-to-sequence models (see the sketch at the end of this section)
from transformers import AutoConfig, AutoModel, AutoTokenizer
# Loading a model locally, with or without a model head
# Without a task head (model head): returns only the backbone; rarely used in practice
model = AutoModel.from_pretrained("../models/bert-base-chinese")
# Inspect and modify the model configuration
config = AutoConfig.from_pretrained("../models/bert-base-chinese")
# print(config)
print(config.hidden_size)
# With a task head: the common case
from transformers import AutoConfig, AutoModel, AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("../models/bert-base-chinese")
sen = "重庆是个很好玩的地方!"
tokenizer = AutoTokenizer.from_pretrained("../models/bert-base-chinese")
inputs = tokenizer(sen, return_tensors="pt")
# Call the model
output = model(**inputs)
print(output)
print(model.config.num_labels)
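To make the three categories from the start of this section concrete, each one has its own Auto class with the matching head; a minimal sketch (the commented-out checkpoint paths are placeholders, any compatible local checkpoint would do):
from transformers import AutoModelForCausalLM, AutoModelForMaskedLM, AutoModelForSeq2SeqLM
# Autoencoding (encoder-only), e.g. BERT with a masked-language-modeling head
mlm_model = AutoModelForMaskedLM.from_pretrained("../models/bert-base-chinese")
# Autoregressive (decoder-only), e.g. a GPT-style checkpoint (path is a placeholder)
# clm_model = AutoModelForCausalLM.from_pretrained("../models/some-gpt2-chinese-checkpoint")
# Sequence-to-sequence (encoder-decoder), e.g. a T5/BART-style checkpoint (path is a placeholder)
# s2s_model = AutoModelForSeq2SeqLM.from_pretrained("../models/some-t5-chinese-checkpoint")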
5. Basic usage of evaluate
import evaluate
# List the supported evaluation modules
# print(evaluate.list_evaluation_modules())
# Load an evaluation metric from a local path
accuracy = evaluate.load("../metrics/accuracy")
# View the metric's description
print(accuracy.description)
# Compute over all samples at once
results = accuracy.compute(references=[0, 1, 2, 0, 1, 2], predictions=[0, 1, 1, 2, 1, 0])
print(results)
# Compute iteratively, sample by sample or batch by batch
for ref, pred in zip([0, 1, 0, 1], [1, 0, 0, 1]):
    accuracy.add(references=ref, predictions=pred)
print(accuracy.compute())
for refs, preds in zip([[0, 1], [0, 1]], [[1, 0], [0, 1]]):
    accuracy.add_batch(references=refs, predictions=preds)
print(accuracy.compute())
# Computing multiple metrics at once
# clf_metrics = evaluate.combine(["accuracy", "f1", "recall", "precision"])
# print(clf_metrics)
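evaluate.combine also accepts already-loaded modules, so the combination can be built entirely from local metric scripts without touching the Hub; a minimal sketch, assuming the same ../metrics/ layout as above:
# Combine several locally loaded metrics and compute them in one call
clf_metrics = evaluate.combine([
    evaluate.load("../metrics/accuracy"),
    evaluate.load("../metrics/f1"),
    evaluate.load("../metrics/recall"),
    evaluate.load("../metrics/precision"),
])
print(clf_metrics.compute(predictions=[0, 1, 0, 1], references=[0, 1, 1, 1]))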
6. Basic usage of Trainer and TrainingArguments
from transformers import Trainer, TrainingArguments
# Configure the training arguments
train_args = TrainingArguments(output_dir="./checkpoints",      # output directory
                               per_device_train_batch_size=64,  # per-device batch size for training
                               per_device_eval_batch_size=128,  # per-device batch size for evaluation
                               logging_steps=10,                # how often to log
                               evaluation_strategy="epoch",     # evaluation strategy
                               save_strategy="epoch",           # checkpoint saving strategy
                               save_total_limit=3,              # maximum number of checkpoints to keep
                               learning_rate=2e-5,              # learning rate
                               weight_decay=0.01,               # weight decay
                               metric_for_best_model="f1",      # metric used to pick the best model
                               load_best_model_at_end=True)     # load the best checkpoint when training ends
print(train_args)
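The Trainer below expects a train/test split of tokenized_dataset and an eval_metric function, neither of which was defined earlier in these notes; here is a minimal sketch of those missing pieces (the 9:1 split ratio and the local metric paths are assumptions):
import numpy as np
import evaluate
# Split the tokenized dataset from section 3 into train/test subsets (ratio is an assumption)
tokenized_dataset = tokenized_dataset.train_test_split(test_size=0.1)
# Load accuracy and f1 locally, mirroring section 5 (paths are assumptions)
acc_metric = evaluate.load("../metrics/accuracy")
f1_metric = evaluate.load("../metrics/f1")
def eval_metric(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=-1)
    acc = acc_metric.compute(predictions=predictions, references=labels)
    acc.update(f1_metric.compute(predictions=predictions, references=labels))
    return acc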
# Create the trainer
from transformers import DataCollatorWithPadding
trainer = Trainer(model=model,
                  args=train_args,
                  train_dataset=tokenized_dataset["train"],
                  eval_dataset=tokenized_dataset["test"],
                  data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
                  compute_metrics=eval_metric)
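With the arguments, datasets, collator, and metric wired up, training and evaluation are launched through the trainer:
trainer.train()
trainer.evaluate()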