This task can be handled directly with the transformers library.
Training code
from datetime import datetime

import numpy as np
from datasets import load_dataset
from loguru import logger
from sklearn.metrics import accuracy_score, f1_score
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)


def compute_metrics(eval_pred) -> dict:
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average="micro")
    return {"accuracy": acc, "f1": f1}


def train():
    base_model = "bert-base-uncased"  # alternatives: "distilbert-base-uncased", "bert-base-cased"
    num_train_epochs = 50
    pretrained_cache_dir = "pretrained_cache/"
    pretrained_ckpt = './pretrained_cache/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/'

    logger.info("Building dataset ...")
    # Load by hub name instead if there is no local snapshot:
    # tokenizer = AutoTokenizer.from_pretrained(base_model, cache_dir=pretrained_cache_dir)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_ckpt)
    dataset = load_dataset(
        "json",
        data_files={
            "train": r"./data/train.jsonl",
            "test": r"./data/test.jsonl",
        },
    )
    # print(dataset["train"][100])  # inspect one raw sample

    def tokenize_function(examples):
        # Only the text needs encoding: the "label" column survives
        # dataset.map and is collated into "labels" by the Trainer's
        # default data collator.
        texts = list(examples["text"])
        tokenized = tokenizer(
            texts, padding="max_length", truncation=True, max_length=512
        )
        return tokenized

    tokenized_datasets = dataset.map(tokenize_function, batched=True)
    train_dataset = tokenized_datasets["train"].shuffle(seed=42)
    eval_dataset = tokenized_datasets["test"].shuffle(seed=42)
    print(train_dataset[100])

    logger.info("Building model ...")
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_ckpt, num_labels=9, cache_dir=pretrained_cache_dir,
        # problem_type="multi_label_classification",  # only for multi-label setups
    )

    save_dir_suffix = datetime.now().strftime("%Y%m%d%H%M%S")  # %M (minutes), not a second %m
    training_args = TrainingArguments(
        f"ckpts/{save_dir_suffix}",
        per_device_train_batch_size=4,
        num_train_epochs=num_train_epochs,
        no_cuda=False,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,
        save_total_limit=1,
    )
    # No explicit data_collator: tokenization already pads to max_length,
    # so the default collator is sufficient.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    logger.info("Training ...")
    trainer.train()
    trainer.save_model(f"model/{save_dir_suffix}/")
    result = trainer.evaluate()
    print(result)


if __name__ == "__main__":
    train()
1. Data loading
load_dataset can read dataset files in many formats. It is best to name the keys "label" and "text". The label must be an int, so string categories can be mapped separately to 0, 1, 2, 3, and so on, as in the sketch below.
You can print dataset['train'][1] to check the data format.
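A minimal sketch of that label mapping (the raw file path and the category names here are hypothetical, assuming the raw labels are strings):

import json

label2id = {"news": 0, "sports": 1, "tech": 2}  # hypothetical category names

# hypothetical raw file; writes the int-labeled training file used above
with open("./data/raw.jsonl") as fin, open("./data/train.jsonl", "w") as fout:
    for line in fin:
        item = json.loads(line)
        item["label"] = label2id[item["label"]]  # string category -> int id
        fout.write(json.dumps(item, ensure_ascii=False) + "\n")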
2. Data processing
The text has to be tokenized into a format the model can read, i.e. each token is mapped to its id in the vocabulary.
Calling the tokenizer directly is enough; here the texts are first collected into a list. If the model is given by its hub name (base_model), the weights are downloaded to the local cache on first use.
map then applies the tokenize function across the whole dataset.
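For intuition, each tokenizer call returns input_ids plus an attention_mask that marks real tokens versus padding. A small sketch (exact ids depend on the vocabulary; 101 and 102 are BERT's [CLS] and [SEP]):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
enc = tokenizer("Hello world", padding="max_length", truncation=True, max_length=8)
print(enc["input_ids"])       # e.g. [101, ..., 102, 0, 0, ...]; 101/102 are [CLS]/[SEP]
print(enc["attention_mask"])  # 1 for real tokens, 0 for padding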
3. Model loading and training
Use AutoModelForSequenceClassification with num_labels set to the number of classes; the rest is standard Trainer usage. Class names can optionally be attached to the config, as sketched below.
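For instance, readable class names can be attached so that the inference pipeline reports them instead of LABEL_0 ... LABEL_8 (a sketch; the class names are placeholders, and pretrained_ckpt is the path defined in the training script):

from transformers import AutoModelForSequenceClassification

id2label = {i: f"class_{i}" for i in range(9)}  # placeholder names
label2id = {name: i for i, name in id2label.items()}
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_ckpt, num_labels=9, id2label=id2label, label2id=label2id,
)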
Inference code
import json

from icecream import ic
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          TextClassificationPipeline)

pretrained_ckpt = './pretrained_cache/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/'
finetuned = './model/20230908140920'

tokenizer = AutoTokenizer.from_pretrained(pretrained_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(finetuned)
pipe = TextClassificationPipeline(model=model, tokenizer=tokenizer)

with open('./data/test.jsonl', 'r') as f_file:
    for item in f_file:
        jsonstr = json.loads(item)
        img_add = jsonstr["image"]
        text = jsonstr["text"]
        score_label = jsonstr["label"]
        # return_all_scores=True yields the score of every class
        # (newer transformers versions use top_k=None instead)
        prediction = pipe(text, return_all_scores=True)
        ic(img_add)
        ic(prediction)
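To turn the per-class scores into an overall test metric, take the argmax per example and compare with the gold labels. A minimal sketch reusing pipe and the test file from above, assuming the default LABEL_i class names (i.e. id2label was not customized):

import json

from sklearn.metrics import accuracy_score

golds, preds = [], []
with open('./data/test.jsonl') as f:
    for line in f:
        item = json.loads(line)
        scores = pipe(item["text"], return_all_scores=True)[0]  # all class scores for this text
        best = max(scores, key=lambda s: s["score"])            # highest-probability class
        preds.append(int(best["label"].split("_")[-1]))         # "LABEL_3" -> 3
        golds.append(int(item["label"]))
print("accuracy:", accuracy_score(golds, preds))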