Text Classification with ALBERT


Preparation

Pre-trained model download: Albert_Large_zh
Dataset download: accident/disaster multi-class classification dataset (collected by a web crawler; corrections are welcome)

I. Prepare the dataset

Put the pre-trained model in the pretraining_model folder.
Split the dataset into a training set, a test set, and a validation set, named train.txt, test.txt, and dev.txt, usually in a 7:2:1 ratio, and put them in the datasets folder.

Sample data:

Each line contains a label and the text, separated by '\t'.
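
For illustration, a couple of made-up lines (the labels and texts below are hypothetical, not taken from the actual dataset):

火灾	某市一仓库昨夜突发火灾,消防部门迅速赶赴现场处置
地震	某县发生4.2级地震,暂无人员伤亡报告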

II. Build the classification task

Project structure

Keras_Bert_Class/
|-- datasets/
|   |-- train.txt
|   |-- test.txt
|   |-- dev.txt
|
|-- model/
|
|-- pretraining_model/
|   |-- albert_large/
|
|-- main.py
|-- requirements.txt
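
The tree above lists a requirements.txt; a plausible one is sketched below. The exact version pins are assumptions (bert4keras targets TF 1.x with Keras 2.3, or TF 2.x with TF_KERAS=1), so pin to whatever combination you actually test:

# Hypothetical requirements.txt; versions are assumptions, not tested pins
bert4keras==0.11.4
tensorflow==1.15.0
keras==2.3.1
scikit-learn
numpy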

1. Import libraries

The code is as follows:

import numpy as np
from sklearn import metrics
from sklearn.model_selection import train_test_split
from bert4keras.tokenizers import Tokenizer
from bert4keras.backend import keras, set_gelu
from bert4keras.models import build_transformer_model
from bert4keras.snippets import DataGenerator, sequence_padding
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from keras.layers import Lambda, Dense
from contextlib import redirect_stdout

2. Configure parameters

The code is as follows:

# Configuration
class Config:
    def __init__(self):
        # Name of the pre-trained model
        self.model_name = "bert"
        # Dataset
        self.train_path = "datasets/train.txt"
        # List of class names
        self.class_list = self.read_class()
        # Number of classes
        self.num_classes = len(self.class_list)
        # Number of epochs
        self.epochs = 10
        # Mini-batch size
        self.batch_size = 8
        # Maximum sequence length (shorter texts are padded, longer ones truncated)
        self.pad_size = 128
        # Learning rate
        self.learning_rate = 1e-5
        # Paths to the pre-trained model
        self.config_path = "./pretraining_model/albert_large/albert_config.json"
        self.checkpoint_path = "./pretraining_model/albert_large/albert_model.ckpt"
        self.dict_path = "./pretraining_model/albert_large/vocab.txt"
        # Tokenizer
        self.tokenizer = Tokenizer(self.dict_path)
        # Label dictionaries
        self.label2id, self.id2label = self.label_dict()

    def read_class(self):
        class_list = []
        for line in open(self.train_path, 'r', encoding='utf-8').readlines():
            line = line.split('\t')
            if line[0] not in class_list:
                class_list.append(line[0])  # keep only the label, not the whole line
        return class_list

    def label_dict(self):
        label2id, id2label = {}, {}
        with open(self.train_path, 'r', encoding="utf-8") as data:
            for line in data:
                line = line.split('\t')
                label, text = line[0], line[1].replace("\n", "")
                if label not in label2id:
                    label2id[label] = len(label2id)
                    id2label[label2id[label]] = label  # use the assigned id, since len() has already grown
        return label2id, id2label


config = Config()
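
A quick sanity check of the two dictionaries (purely illustrative; the label names in the comments are hypothetical):

# Ids and labels should round-trip between the two dictionaries
print(config.label2id)     # e.g. {'火灾': 0, '地震': 1, ...}
print(config.id2label[0])  # e.g. '火灾'
assert all(config.id2label[i] == lab for lab, i in config.label2id.items())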

3. Read and split the data

The code is as follows:

def split_data(ratio=0.2, transmit_data=config.train_path):
    # Parse "label\ttext" lines into parallel lists; labels become integer ids
    x, y = [], []
    with open(transmit_data, 'r', encoding='utf-8') as f:
        for line in f:
            label, text = line.rstrip('\n').split('\t')
            x.append(text)
            y.append(config.label2id[label])
    # Stratified split: 80% train, then the rest halved into test and validation
    train_x, test_val_x, train_y, test_val_y = train_test_split(x, y, test_size=ratio, stratify=y, random_state=42)
    test_x, val_x, test_y, val_y = train_test_split(test_val_x, test_val_y, test_size=0.5, stratify=test_val_y, random_state=42)
    train_data = [(x, y) for x, y in zip(train_x, train_y)]
    test_data = [(x, y) for x, y in zip(test_x, test_y)]
    val_data = [(x, y) for x, y in zip(val_x, val_y)]
    return train_data, test_data, val_data


train_data, test_data, dev_data = split_data(ratio=0.2, transmit_data=config.train_path)


class data_generator(DataGenerator):
    """Data generator: encodes texts and yields padded batches."""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text, label) in self.sample(random):
            token_ids, segment_ids = config.tokenizer.encode(text, maxlen=config.pad_size)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([label])
            if len(batch_token_ids) == config.batch_size or is_end:
                batch_token_ids = sequence_padding(batch_token_ids)
                batch_segment_ids = sequence_padding(batch_segment_ids)
                batch_labels = sequence_padding(batch_labels)
                yield [batch_token_ids, batch_segment_ids], batch_labels
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []


train_generator = data_generator(train_data, config.batch_size)
test_generator = data_generator(test_data, config.batch_size)
dev_generator = data_generator(dev_data, config.batch_size)
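
To sanity-check the generators, you can peek at a single batch; this snippet is illustrative and not part of the training pipeline:

# Inspect the shapes of one padded batch
for (token_ids, segment_ids), labels in train_generator:
    print(token_ids.shape, segment_ids.shape, labels.shape)
    # e.g. (8, L) (8, L) (8, 1), where L is the longest sequence in the batch
    break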

4. Load the pre-trained ALBERT model

Load the pre-trained model with Su Jianlin's bert4keras.
The code is as follows:

bert = build_transformer_model(
    config_path=config.config_path,
    checkpoint_path=config.checkpoint_path,
    model="albert",
    return_keras_model=False
)

output = Lambda(lambda x: x[:, 0], name='CLS-token')(bert.model.output)
output = Dense(
    units=config.num_classes,
    activation='softmax',
    kernel_initializer=bert.initializer
)(output)

model = keras.models.Model(bert.model.input, output)

# Write the model summary to a txt file
with open('model/modelsummary.txt', 'w+') as f:
    with redirect_stdout(f):
        model.summary()

AdamLR = extend_with_piecewise_linear_lr(Adam, name='AdamLR')

model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=AdamLR(lr=config.learning_rate),
    metrics=['accuracy'])
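
extend_with_piecewise_linear_lr also accepts an lr_schedule dict for warmup and decay. The official task_sentiment_albert.py example compiles with a schedule like the one below; the breakpoints are taken from that example, not tuned for this dataset:

# Optional alternative: warm up over the first 1000 steps, then decay to 10%
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=AdamLR(lr=config.learning_rate, lr_schedule={1000: 1, 2000: 0.1}),
    metrics=['accuracy'])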

5. Train the model

The code is as follows:

# Compute accuracy over a generator
def evaluate(data):
    total, right = 0., 0.
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        y_true = y_true[:, 0]
        total += len(y_true)
        right += (y_true == y_pred).sum()
    return right / total


class Evaluator(keras.callbacks.Callback):
    def __init__(self):
        self.best_val_acc = 0.

    def on_epoch_end(self, epoch, logs=None):

        val_acc = evaluate(dev_generator)
        if val_acc > self.best_val_acc:
            self.best_val_acc = val_acc
            model.save_weights('best_model.weights')
        test_acc = evaluate(test_generator)
        print(
            u'val_acc: %.5f, best_val_acc: %.5f, test_acc: %.5f\n' %
            (val_acc, self.best_val_acc, test_acc)
        )


evaluator = Evaluator()


model.fit_generator(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    epochs=config.epochs,
    callbacks=[evaluator]
)

6. Evaluate the model

The code is as follows:

# Produce a classification report
def evaluate1(data):
    y1, y2 = [], []
    for x_true, y_true in data:
        y_pred = model.predict(x_true).argmax(axis=1)
        y_pred = y_pred.tolist()
        y_true = y_true[:, 0]
        y_true = y_true.tolist()
        y1 = y1 + y_pred
        y2 = y2 + y_true

    y1, y2 = np.array(y1), np.array(y2)
    categories = list(config.label2id.keys())
    print(categories)
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y2, y1, target_names=categories))


evaluate1(test_generator)
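
After training, single-sentence inference is straightforward. Below is a minimal sketch that reloads the best weights saved by the Evaluator callback; the input sentence is made up:

# Minimal inference sketch
model.load_weights('best_model.weights')
text = "某地发生山体滑坡,多人被困"  # hypothetical input
token_ids, segment_ids = config.tokenizer.encode(text, maxlen=config.pad_size)
pred_id = model.predict([np.array([token_ids]), np.array([segment_ids])]).argmax(axis=1)[0]
print(config.id2label[int(pred_id)])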

Summary

The model performs well overall. A few classes in the dataset have very few samples, so the class distribution is imbalanced and those classes are recognized noticeably worse. If you can gather more data and balance the samples, accuracy hovers around 97%.
Thanks to '喀拉布喀' in the comments for the pointers.

References

https://github.com/bojone/bert4keras/blob/master/examples/task_sentiment_albert.py
科学空间站 (Scientific Spaces)
