# Reference: https://huggingface.co/docs/transformers/v4.15.0/en/main_classes/output
# Single-example prediction
#载入训练好的模型
import numpy as np
import torch
from transformers import BertTokenizer, BertConfig, BertForSequenceClassification
# Load the fine-tuned model.
model_name = 'bert-base-chinese'
MODEL_PATH = 'your_model_path'
# tokenizer.encode_plus returns the vocabulary ids of the sentence(s);
# tokenizer.tokenize returns the tokens themselves.
# [CLS] has id 101 and [SEP] has id 102.
# a. Build the tokenizer from the vocabulary of the base checkpoint.
tokenizer = BertTokenizer.from_pretrained(model_name)
# b. Load the configuration.
# NOTE(review): the config comes from the base checkpoint, not from MODEL_PATH;
# a fine-tuned model with a different label count would presumably need its
# own config -- confirm against the training setup.
model_config = BertConfig.from_pretrained(model_name)
# Adjust the configuration so hidden states and attentions are returned too.
model_config.output_hidden_states = True
model_config.output_attentions = True
# Build the model from the configuration and the checkpoint path.
model = BertForSequenceClassification.from_pretrained(MODEL_PATH, config=model_config)
print("加载模型")
# Switch to evaluation mode (disables dropout) before predicting.
model.eval()
print("模型加载完成,开始预测")

# First encoding style: call the tokenizer directly on a sentence pair.
encoding = tokenizer('王小二', '王小二可是个好人呐', return_tensors='pt')
print(encoding)
# Passing labels makes the model compute a loss as well; the logits do not
# depend on it.
labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
# Inference only: skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(**encoding, labels=labels)
logits = outputs.logits
print(logits)
sample = logits.detach().numpy()
# argmax over axis 1 picks the highest-scoring class for each input row.
pred = np.argmax(sample, axis=1)
print(pred)

# Second encoding style: encode_plus with fixed-length padding.
# `padding="max_length"` alone requests the padding; the deprecated
# `pad_to_max_length` flag was redundant and has been dropped.
# `truncation=True` keeps inputs longer than max_length from overflowing it.
sen_code = tokenizer.encode_plus(
    '王小二',
    '王小二可是个好人呐',
    add_special_tokens=True,
    max_length=128,
    padding="max_length",
    truncation=True,
    return_tensors='pt',
)
# Second example pair; encoded without return_tensors, so it holds plain
# Python lists. It is not fed to the model below.
sen_code2 = tokenizer.encode_plus(
    '好人',
    '王小二可不是个好人呐',
    add_special_tokens=True,
    max_length=128,
    padding="max_length",
    truncation=True,
)
print(sen_code)
with torch.no_grad():
    outputs = model(**sen_code, labels=labels)
print(outputs.logits)
# Logits are laid out as (number of sentence pairs, number of classes).
seq_relationship_scores = outputs.logits
sample = seq_relationship_scores.detach().numpy()
pred = np.argmax(sample, axis=1)
print(pred)  # predicted class id per sentence pair
# Batch prediction
#基于训练好的模型进行预测
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from accelerate import Accelerator
from transformers import DataCollatorWithPadding,default_data_collator
# Create the Accelerate helper object; it is not referenced by the code visible here.
accelerator = Accelerator()
import time
def preprocess_function(examples):
    """Tokenize a batch of sentence pairs.

    Reads the ``sentence1`` and ``sentence2`` entries of ``examples`` and
    passes them to the module-level ``tokenizer`` as paired inputs with
    truncation enabled. Returns whatever the tokenizer returns.
    """
    first_sentences = examples["sentence1"]
    second_sentences = examples["sentence2"]
    return tokenizer(first_sentences, second_sentences, truncation=True)
def dataset_predict(file_path=None):
    """Predict class ids for every row of a CSV test file, batch by batch.

    The CSV is loaded with ``load_dataset("csv", ...)`` as the ``test`` split,
    tokenized through ``preprocess_function`` (which reads the ``sentence1``
    and ``sentence2`` columns), and fed to the module-level ``model`` in
    batches of 8. The first batch is printed for inspection, then the
    predicted class ids of every batch. Finally the elapsed time (measured
    from just after the dataset is loaded) is printed.

    Args:
        file_path: Path of the CSV test file. When omitted, the module-level
            name ``your_file_path`` is used, as in the original code.
    """
    if file_path is None:
        # Backward-compatible fallback to the name the original relied on.
        file_path = your_file_path
    data_files = {"test": file_path}
    raw_datasets = load_dataset("csv", data_files=data_files)

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # wall-clock adjustments.
    starttime = time.perf_counter()
    processed_datasets = raw_datasets.map(preprocess_function, batched=True)
    # Pads each batch dynamically to its longest sequence.
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
    # Drop the raw text columns so only the tokenizer outputs (and any other
    # remaining columns) reach the collator.
    processed_datasets = processed_datasets.remove_columns(['sentence1', 'sentence2'])
    processed_datasets.set_format('torch')
    test_dataloader = DataLoader(
        processed_datasets['test'],
        batch_size=8,
        collate_fn=data_collator,
    )

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for step, batch in enumerate(test_dataloader):
            if step == 0:
                # Show the first batch once so its layout can be checked.
                print(batch)
            outputs = model(**batch)
            predictions = outputs.logits.argmax(dim=-1)
            print(predictions)

    endtime = time.perf_counter()
    print("times:" + str(endtime - starttime))
def multi_sen_predict():
    """Predict class ids for a small in-memory batch of sentence pairs.

    Encodes two hard-coded sentence pairs with the module-level ``tokenizer``,
    runs them through the module-level ``model`` and prints the encoded batch
    followed by the predicted class id of each pair.
    """
    sen1 = ["中国", "外国"]
    sen2 = ["我是中国人", "他是外国人"]
    # Let the tokenizer pad to the longest pair and return tensors directly.
    # Converting unpadded id lists with torch.LongTensor only works when every
    # pair happens to tokenize to the same length.
    data = tokenizer(sen1, sen2, truncation=True, padding=True, return_tensors='pt')
    print(data)
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        outputs = model(**data)
    predictions = outputs.logits.argmax(dim=-1)
    print(predictions)
# Script entry point: run the CSV-based prediction first, then the in-memory example.
if __name__ == "__main__":
    for run_prediction in (dataset_predict, multi_sen_predict):
        run_prediction()