huggingface：transformers中文文本分词

追梦小愚

已于 2022-09-28 16:29:24 修改

阅读量2.1k

点赞数

分类专栏： NLP 文章标签：人工智能深度学习算法 python

于 2022-09-20 16:49:51 首次发布

本文链接：https://blog.csdn.net/wu0310zh/article/details/126955480

版权

NLP 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

导入相关包

import re
from pathlib import Path
import numpy as np
import torch
from datasets import load_metric
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, BertTokenizerFast

加载数据集

这次用到的数据集是用BIES标注的数据集，如下图所示。

（这里每行的文本和标签之间用空格隔开，每两句之间空一行）

训练自己的数据集时，标签中“-”后面的名称（例如“B-开头”里的“开头”）可以自定义。

把数据读进去，这其中主要就是把token和tag分别读入。

def read_data(file_path):
    """Read a character-level tagging corpus into parallel token/tag lists.

    The file holds one ``token tag`` pair per line, separated by one or more
    spaces, with a blank line between sentences. Tab characters are removed
    before a line is parsed.

    Args:
        file_path: Location (``str`` or ``Path``) of the UTF-8 encoded file.

    Returns:
        A ``(token_docs, tag_docs)`` tuple of equally shaped nested lists:
        ``token_docs[i]`` holds the tokens of the i-th sentence and
        ``tag_docs[i]`` the matching tags. Sentences without any token
        (e.g. produced by several consecutive blank lines) are skipped.

    Raises:
        ValueError: If a non-blank line does not consist of exactly one token
            and one tag.
    """
    raw_text = Path(file_path).read_text(encoding='UTF-8').strip()
    raw_docs = re.split(r'\n\t?\n', raw_text)  # a blank line separates sentences

    token_docs = []
    tag_docs = []
    for doc in raw_docs:
        tokens = []
        tags = []
        for line in doc.split('\n'):  # e.g. '中 B-开头'
            # Drop tabs, then treat any run of spaces as a single separator so
            # doubled or trailing spaces do not break the two-way unpacking.
            fields = [field for field in line.replace('\t', '').split(' ') if field]
            if not fields:
                continue
            if len(fields) != 2:
                raise ValueError(
                    f"expected '<token> <tag>' but got {line!r} in {file_path}"
                )
            token, tag = fields  # '中', 'B-开头'
            tokens.append(token)
            tags.append(tag)
        if tokens:
            # An empty sentence would later break tokenization/label alignment.
            token_docs.append(tokens)  # ['中', '空', '塑', '料', '模', '板']
            tag_docs.append(tags)  # ['B-开头', 'I-中间', 'I-中间', 'I-中间', 'I-中间', 'E-结尾']
    return token_docs, tag_docs

# Load the dataset
data_dir = r"\word_segmentation"  # directory that holds the training data files
# Each call returns (list of token lists, list of tag lists) for one split.
train_texts, train_tags = read_data(data_dir + '/train_BIE.txt')
val_texts, val_tags = read_data(data_dir + '/val_BIE.txt')

定义下标签，方便后续查看。也为了在训练时固定分类的类别。

# Tag inventory; the position of a tag in this list is its class id.
label_list = ['S-单字', 'B-开头', 'I-中间', 'E-结尾']
# Both lookup tables are derived from label_list so the three stay in sync.
id2tag = dict(enumerate(label_list))
tag2id = {tag: idx for idx, tag in id2tag.items()}

数据集处理

处理训练集与测试集

# Tokenize the pre-split characters of both splits. Offsets are requested
# because encode_tags reads encodings.offset_mapping to place the labels.
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True,truncation=True, max_length=512)  # is_split_into_words: the input is already split into tokens
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True,truncation=True, max_length=512)

处理标签

def encode_tags(tags, encodings, tag2id, max_labels=510):
    """Align per-token tags with tokenizer output and convert them to ids.

    Positions that do not start a token of the original text (offset
    ``(0, 0)`` entries, or offsets whose start is not 0) receive ``-100``.

    Args:
        tags: One list of tag strings per sentence.
        encodings: Tokenizer output exposing ``offset_mapping``: for every
            sentence a sequence of ``(start, end)`` pairs.
        tag2id: Mapping from tag string to integer class id.
        max_labels: Maximum number of labels kept per sentence. The default
            of 510 is the value that was previously hard-coded.

    Returns:
        A list with one list of ints per sentence, each as long as that
        sentence's offset mapping.

    Raises:
        KeyError: If a tag is missing from ``tag2id``.
        ValueError: If the number of labels (after truncation) differs from
            the number of positions that can take a label.
    """
    encoded_labels = []
    for doc_index, (doc_tags, doc_offset) in enumerate(zip(tags, encodings.offset_mapping)):
        doc_labels = np.asarray([tag2id[tag] for tag in doc_tags][:max_labels], dtype=int)
        # reshape keeps the column indexing below valid for an empty offset list
        arr_offset = np.asarray(doc_offset).reshape(-1, 2)
        # A label goes where the offset starts at 0 and does not end at 0
        # ((0, 0) marks positions that are not part of the original text).
        takes_label = (arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)
        slot_count = int(takes_label.sum())
        if slot_count != len(doc_labels):
            # Without this check NumPy would broadcast a single label over all
            # positions, or fail with a message that hides the sentence index.
            raise ValueError(
                f"sentence {doc_index}: {len(doc_labels)} labels cannot be aligned "
                f"with {slot_count} token positions"
            )
        doc_enc_labels = np.full(len(arr_offset), -100, dtype=int)
        doc_enc_labels[takes_label] = doc_labels
        encoded_labels.append(doc_enc_labels.tolist())
    return encoded_labels

# Convert the string tags of both splits into id sequences aligned with the encodings.
train_labels = encode_tags(train_tags, train_encodings, tag2id)
val_labels = encode_tags(val_tags, val_encodings, tag2id)

把tokenizer的数据处理一下转化成pytorch可以使用的tensor形式

class NerDataset(torch.utils.data.Dataset):
    """Dataset that serves tokenizer encodings together with their labels.

    Every item is a dict with one tensor per encoding field plus a
    ``'labels'`` tensor.
    """

    def __init__(self, encodings, labels):
        """Store the encodings (a mapping of field name to per-sample values) and the labels."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Return the number of samples, i.e. the number of label sequences."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# offset_mapping must be removed after encode_tags has consumed it and before
# the datasets are built, since NerDataset turns every remaining field into a tensor.
train_encodings.pop("offset_mapping")  # not needed for training
val_encodings.pop("offset_mapping")
train_dataset = NerDataset(train_encodings, train_labels)
val_dataset = NerDataset(val_encodings, val_labels)

评估函数

这里用的是seqeval这个库，可以得到precision，recall，f1，accuracy这四个评价指标。

（之前分隔符等不属于原文的位置被标记成 label=-100，所以这里需要把标签为 -100 的位置剔除掉，不参与评估）

def compute_metrics(p):
    """Compute seqeval precision, recall, F1 and accuracy for one evaluation.

    Args:
        p: A ``(predictions, labels)`` pair. ``predictions`` holds per-class
            scores whose argmax over axis 2 is the predicted class id;
            ``labels`` holds the reference class ids, with ``-100`` marking
            positions to ignore.

    Returns:
        A dict with the keys ``precision``, ``recall``, ``f1`` and
        ``accuracy`` taken from the metric's ``overall_*`` results.
    """
    metric = load_metric("seqeval")
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Turn ids into tag names, dropping every position labelled -100.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    results = metric.compute(predictions=true_predictions, references=true_labels)
    precision = results["overall_precision"]
    recall = results["overall_recall"]
    f1 = results["overall_f1"]
    accuracy = results["overall_accuracy"]
    return {
        "precision": precision,
        "recall": recall,
        "f1": f1,
        "accuracy": accuracy,
    }

配置训练器

加载模型

BIES属于4分类任务，num_labels=4。（这里模型选的hfl/rbt3）

# Load the pretrained encoder with a token-classification head for the 4 tags.
model_dir = 'hfl/rbt3'
model = AutoModelForTokenClassification.from_pretrained(model_dir, num_labels=4,  # 4 classes
                                                        ignore_mismatched_sizes=True,  # NOTE(review): per the transformers docs this suppresses the error for checkpoint weights whose shape differs from the model; it does not skip loading weights
                                                        id2label=id2tag,
                                                        label2id=tag2id
                                                        )

设置训练参数

# Hyper-parameters and checkpointing policy for the Trainer.
training_args = TrainingArguments(
    output_dir='./output',  # where checkpoints are written
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=128,
    num_train_epochs=5,
    weight_decay=0.01,  # weight decay
    logging_steps=10,  # logging interval in steps (loss, learning rate)
    evaluation_strategy="epoch",  # evaluate after every training epoch
    save_strategy="epoch",  # save on the same schedule as evaluation
    save_total_limit=3,  # maximum number of checkpoints kept
    load_best_model_at_end=True,  # reload the best model once training finishes
    metric_for_best_model="f1",  # "best" is decided by the f1 value from compute_metrics
    fp16=True  # half-precision training (faster training)
)

构建训练器

# Wire model, arguments, datasets and the metric function into one Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

训练与评估

# Train, then evaluate on the validation set
trainer.train()
trainer.evaluate()

预测

定义预测函数

这里输出采用的策略是标签为B就在前面填空格，标签为I继续，标签为E就在后面填空格，标签为S就在前后都填空格。

def ws_predict(input_str, tokenizer, model):
    """Segment a string with a BIES token-classification model.

    Spaces are removed from ``input_str`` and every remaining character is
    tagged by the model. The output is built per character: ``B-开头`` puts
    two spaces before the character, ``E-结尾`` two spaces after it,
    ``S-单字`` two spaces on both sides, and any other label leaves the
    character untouched.

    Characters are matched to predictions through ``word_ids`` rather than by
    position, so a character the tokenizer drops or splits into several
    tokens no longer shifts the characters after it. Characters without any
    token (dropped by the tokenizer or cut off by ``max_length``) do not
    appear in the result.

    Args:
        input_str: Text to segment.
        tokenizer: Tokenizer whose output provides ``word_ids``.
        model: Token-classification model exposing ``config.id2label``.

    Returns:
        The text with spaces inserted at the predicted word boundaries, or an
        empty string when ``input_str`` contains nothing but spaces.
    """
    input_char = list(input_str.replace(' ', ''))  # strip spaces, one entry per character
    if not input_char:
        return ''

    encoded = tokenizer(input_char, is_split_into_words=True, padding=True, truncation=True,
                        max_length=512, return_tensors="pt")
    # For every token: index of the character it came from, None for special tokens.
    word_ids = encoded.word_ids(0)

    # Run on whatever device the model lives on (e.g. right after GPU training).
    device = getattr(model, "device", None)
    if device is not None:
        encoded = encoded.to(device)
    with torch.no_grad():  # inference only, no autograd graph needed
        outputs = model(**encoded)
    predictions = outputs.logits.argmax(dim=-1)[0].tolist()

    id2label = model.config.id2label
    pieces = []
    previous_word = None
    for word_id, label_id in zip(word_ids, predictions):
        # Skip special tokens and continuation pieces of an already handled character.
        if word_id is None or word_id == previous_word:
            continue
        previous_word = word_id
        char = input_char[word_id]
        label = id2label[label_id]
        if label == "B-开头":
            pieces.append('  ' + char)
        elif label == "E-结尾":
            pieces.append(char + '  ')
        elif label == "S-单字":
            pieces.append('  ' + char + '  ')
        else:
            # "I-中间" and any unexpected label: keep the character as is.
            pieces.append(char)
    return ''.join(pieces)

demo

# Demo: load a saved checkpoint and segment one sample sentence.
model_dir = './output/checkpoint-100'
model = AutoModelForTokenClassification.from_pretrained(model_dir)
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')
input_str = '测试一个吧字太小啦建议买别的版本'
res = ws_predict(input_str, tokenizer, model)
# `ic` (icecream) is never imported in this file, so the result is shown with print.
print(res)

结果