目录
1--metrics的使用
metrics 意为指标,通过 API 可以快速使用内置的评价指标。
代码:
from datasets import list_metrics, load_metric

if __name__ == "__main__":
    # List every evaluation metric available on the hub.
    metrics_list = list_metrics()
    print(len(metrics_list))
    print(metrics_list)

    # Load one metric: the GLUE benchmark's MRPC subtask.
    # See https://zhuanlan.zhihu.com/p/522017847 for what glue/mrpc mean.
    # NOTE(review): load_metric/list_metrics are deprecated in recent
    # `datasets` releases (removed in 3.0); new code should use the
    # standalone `evaluate` package (evaluate.load("glue", "mrpc")).
    metric = load_metric('glue', 'mrpc')
    print(metric.inputs_description)

    # Compute the metric on toy predictions vs. references.
    predictions = [0, 1, 0]
    references = [0, 1, 1]
    final_score = metric.compute(predictions=predictions, references=references)
    print(final_score)

    print("All done!")
通过 list_metrics 查看所有的评价指标,通过 load_metric 选取合适的评价指标;
2--pipeline的使用
使用 pipeline 可以快速使用预训练好的模型,可以直接进行相关的任务,或作为下游任务的预训练模型。
pipeline 一般可以拆分为 tokenizer(分词)、model(模型) 和 post-process(后处理) 三部分;
from transformers.pipelines import SUPPORTED_TASKS
from transformers import pipeline, AutoModelForSequenceClassification, AutoTokenizer

if __name__ == "__main__":
    # Show every task type a pipeline can be built for.
    for k, v in SUPPORTED_TASKS.items():
        print(k, v)

    # 1) Create a pipeline from the task name only (defaults to an English model).
    pipe = pipeline("text-classification")
    print(pipe(["very good!", "very bad!"]))

    # 2) Create a pipeline from a task name plus an explicit (Chinese) model.
    pipe = pipeline("text-classification", model="uer/roberta-base-finetuned-dianping-chinese")
    # Fix: the original computed this result but never displayed it.
    print(pipe("我觉得不太行!"))

    # 3) Pre-load model and tokenizer yourself, then hand them to the pipeline
    #    (device=0 places the model on the first GPU).
    model = AutoModelForSequenceClassification.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
    tokenizer = AutoTokenizer.from_pretrained("uer/roberta-base-finetuned-dianping-chinese")
    pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)
2-1--正负面文本分类任务
代码:
from transformers import pipeline

if __name__ == "__main__":
    # Classify short texts as positive or negative sentiment.
    classifier = pipeline("sentiment-analysis")
    for text in ("I hate you", "I love you"):
        # The pipeline returns a list with one dict per input text.
        print(classifier(text)[0])
输出结果:
{'label': 'NEGATIVE', 'score': 0.9991129040718079}
{'label': 'POSITIVE', 'score': 0.9998656511306763}
2-2--阅读理解任务
代码:
from transformers import pipeline

if __name__ == "__main__":
    # Extractive question answering: pull the answer span out of a context passage.
    question_answerer = pipeline("question-answering")
    context = r"""
Extractive Question Answering is the task of extracting an answer from a text given a question. \
An example of a question answering dataset is the SQuAD dataset, which is entirely based on that task. \
If you would like to fine-tune a model on a SQuAD task, \
you may leverage the examples/pytorch/question-answering/run_squad.py script.
"""
    questions = (
        "What is extractive question answering?",
        "What is a good example of a question answering dataset?",
    )
    for question in questions:
        print(question_answerer(question=question, context=context))
输出结果:
{'score': 0.6034508347511292, 'start': 42, 'end': 103, 'answer': 'the task of extracting an answer from a text given a question'}
{'score': 0.4721057713031769, 'start': 165, 'end': 178, 'answer': 'SQuAD dataset'}
2-3--完形填空
代码:
from transformers import pipeline

if __name__ == "__main__":
    # Fill-mask ("cloze") task: predict the token hidden behind <mask>.
    fill_mask = pipeline("fill-mask")
    masked_sentence = 'HuggingFace is creating a <mask> that the community uses to solve NLP tasks.'
    candidates = fill_mask(masked_sentence)
    print(candidates)
输出结果:
[
{'score': 0.17927497625350952, 'token': 3944, 'token_str': ' tool', 'sequence': 'HuggingFace is creating a tool that the community uses to solve NLP tasks.'},
{'score': 0.11349403858184814, 'token': 7208, 'token_str': ' framework', 'sequence': 'HuggingFace is creating a framework that the community uses to solve NLP tasks.'},
{'score': 0.05243556201457977, 'token': 5560, 'token_str': ' library', 'sequence': 'HuggingFace is creating a library that the community uses to solve NLP tasks.'},
{'score': 0.03493537753820419, 'token': 8503, 'token_str': ' database', 'sequence': 'HuggingFace is creating a database that the community uses to solve NLP tasks.'},
{'score': 0.02860264666378498, 'token': 17715, 'token_str': ' prototype', 'sequence': 'HuggingFace is creating a prototype that the community uses to solve NLP tasks.'}
]