A Quick Trial of ALBERT

Over the past couple of days I ran a quick trial of the newly released ALBERT model and am writing it up here (I mainly looked at model size and speed; I have not done a detailed accuracy comparison yet). If you want to try it yourself, you only need to change the file paths and download the package locally, and the experiment is easy to reproduce.
Mostly following the work of these two authors:

https://github.com/brightmart/albert_zh
https://github.com/bojone/bert4keras
https://kexue.fm/

The code below is essentially taken from the following example (I only changed a few lines and added a speed test):
https://github.com/bojone/bert4keras/blob/master/examples/task_sentiment_albert.py
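Before running the full script below, it is worth double-checking that the three paths point at a complete albert_base_zh download. A minimal sanity check (the directory is the same /xxxx placeholder used later in the script, so substitute your own path):

import os

# the directory below is the placeholder used throughout this post; replace it with your own path
albert_dir = '/xxxx/albert_base_zh/albert_base_zh'
for name in ['bert_config.json', 'vocab.txt', 'bert_model.ckpt.index']:
    path = os.path.join(albert_dir, name)
    print(name, 'found' if os.path.exists(path) else 'MISSING', path)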

#! -*- coding:utf-8 -*-
# Much like the sentiment analysis example; loads the albert_zh weights (https://github.com/brightmart/albert_zh)

import json
import numpy as np
import pandas as pd
import random
from random import choice
import re, os
import codecs
from bert4keras.bert import load_pretrained_model, set_gelu
from bert4keras.utils import SimpleTokenizer, load_vocab
from bert4keras.train import PiecewiseLinearLearningRate
import tensorflow as tf
from keras.callbacks import ModelCheckpoint
set_gelu('tanh')  # switch the gelu version


os.environ["CUDA_VISIBLE_DEVICES"] = '0'
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.8
config.gpu_options.allow_growth = False


maxlen = 80
config_path = '/xxxx/albert_base_zh/albert_base_zh/bert_config.json'
checkpoint_path = '/xxxx/albert_base_zh/albert_base_zh/bert_model.ckpt'
dict_path = '/xxxx/albert_base_zh/albert_base_zh/vocab.txt'


neg = pd.read_excel('datasets/neg.xls', header=None)
pos = pd.read_excel('datasets/pos.xls', header=None)
chars = {}


data = []

for d in neg[0]:
    data.append((d, 0))
    for c in d:
        chars[c] = chars.get(c, 0) + 1

for d in pos[0]:
    data.append((d, 1))
    for c in d:
        chars[c] = chars.get(c, 0) + 1

chars = {i: j for i, j in chars.items() if j >= 4}  # keep only characters that appear at least 4 times


_token_dict = load_vocab(dict_path)  # load the full vocabulary
token_dict, keep_words = {}, []  # reduced vocabulary, and the embedding rows to keep

for c in ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '[unused1]']:
    token_dict[c] = len(token_dict)
    keep_words.append(_token_dict[c])

for c in chars:
    if c in _token_dict:
        token_dict[c] = len(token_dict)
        keep_words.append(_token_dict[c])


tokenizer = SimpleTokenizer(token_dict)  # build the tokenizer


if not os.path.exists('./random_order.json'):
    random_order = [i for i in range(len(data))]
    random.shuffle(random_order)
    json.dump(
        random_order,
        open('./random_order.json', 'w'),
        indent=4
    )
else:
    random_order = json.load(open('./random_order.json'))


# split train/validation 9:1 by the shuffled order (only the first 1000 samples of each are kept for this quick test)
train_data = [data[j] for i, j in enumerate(random_order) if i % 10 != 0][:1000]
valid_data = [data[j] for i, j in enumerate(random_order) if i % 10 == 0][:1000]


def seq_padding(X, padding=0):
    # pad every sequence in X with `padding` up to the length of the longest one
    L = [len(x) for x in X]
    ML = max(L)
    return np.array([
        np.concatenate([x, [padding] * (ML - len(x))]) if len(x) < ML else x for x in X
    ])


class data_generator:
    # simple generator that yields ([token_ids, segment_ids], labels) batches
    def __init__(self, data, batch_size=32):
        self.data = data
        self.batch_size = batch_size
        self.steps = len(self.data) // self.batch_size
        if len(self.data) % self.batch_size != 0:
            self.steps += 1
    def __len__(self):
        return self.steps
    def __iter__(self):
        while True:
            idxs = [i for i in range(len(self.data))]
            random.shuffle(idxs)
            X1, X2, Y = [], [], []
            for i in idxs:
                d = self.data[i]
                text = d[0][:maxlen]
                x1, x2 = tokenizer.encode(first=text)
                y = d[1]
                X1.append(x1)
                X2.append(x2)
                Y.append([y])
                if len(X1) == self.batch_size or i == idxs[-1]:
                    X1 = seq_padding(X1)
                    X2 = seq_padding(X2)
                    Y = seq_padding(Y)
                    yield [X1, X2], Y
                    [X1, X2, Y] = [], [], []


from keras.layers import *
from keras.models import Model
import keras.backend as K
from keras.optimizers import Adam


model = load_pretrained_model(
    config_path,
    checkpoint_path,
    keep_words=keep_words,
    albert=True
)

output = Lambda(lambda x: x[:, 0])(model.output)  # take the [CLS] vector as the sentence representation
output = Dense(1, activation='sigmoid')(output)
model = Model(model.input, output)

model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(1e-3),  # use a sufficiently small learning rate
    # optimizer=PiecewiseLinearLearningRate(Adam(1e-5), {1000: 1e-5, 2000: 6e-5}),
    metrics=['accuracy']
)
model.summary()


train_D = data_generator(train_data)
valid_D = data_generator(valid_data)


checkpoint = ModelCheckpoint('./model/albert_1010.ckpt',
                             monitor='val_acc', verbose=1, save_best_only=True,
                             save_weights_only=True,
                             mode='max')  # monitoring validation accuracy, so keep the maximum
# uncomment to fine-tune (one epoch here); the best weights saved by the checkpoint are loaded below
# model.fit_generator(
#     train_D.__iter__(),
#     steps_per_epoch=len(train_D),
#     epochs=1,
#     validation_data=valid_D.__iter__(),
#     validation_steps=len(valid_D),
#     callbacks=[checkpoint]
# )

model.load_weights('./model/albert_1010.ckpt')
# inference speed test:

import time
for i in range(10000):
    #print(train_data[i])
    atime = time.perf_counter()
    # prepare a small batch; because of how the generator is written, the sentences cannot be identical
    lines = [('我和我的祖国一刻也不能分割大家都是一起的' + str(i), 1)  for i in range(5)]
    #test_sentence = data_generator([train_data[i]]).__iter__().__next__()[0]
    test_sentence = data_generator(lines).__iter__().__next__()[0]
    print(test_sentence[0].shape)
    model.predict(test_sentence)
    print('elapsed time:', time.perf_counter() - atime)
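
The loop above times tokenization, batching, and prediction together, and the generator quirk forces the five sentences to differ. A more direct way to time a single forward pass is to tokenize and pad by hand (a minimal sketch reusing the tokenizer, seq_padding, maxlen, and model defined above; the warm-up call, sentence, and run count are just illustrative):

import time

def time_single_sentence(text, n_runs=100):
    # encode one sentence into token ids and segment ids, then pad to a batch of one
    x1, x2 = tokenizer.encode(first=text[:maxlen])
    X1, X2 = seq_padding([x1]), seq_padding([x2])
    model.predict([X1, X2])  # warm-up call, excluded from the timing
    start = time.perf_counter()
    for _ in range(n_runs):
        model.predict([X1, X2])
    return (time.perf_counter() - start) / n_runs

print('average latency per sentence:', time_single_sentence('我和我的祖国一刻也不能分割'))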

Speed test comparison
bert_base (single sentence, max_len=80):
(screenshot of the timing output omitted)
ALBERT is only slightly faster (single sentence, max_len=80):
(screenshot of the timing output omitted)
The real highlight is the model size:
(screenshot omitted)
nearly a tenfold difference
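
If you want to verify the size gap yourself rather than eyeball the checkpoint files, a rough check is the parameter count of the loaded Keras model plus the size of the checkpoint directory on disk (a small sketch; the directory is again the /xxxx placeholder from above):

import os

print('albert_base parameters:', model.count_params())

ckpt_dir = '/xxxx/albert_base_zh/albert_base_zh'  # placeholder path from above
total_bytes = sum(os.path.getsize(os.path.join(ckpt_dir, f)) for f in os.listdir(ckpt_dir))
print('checkpoint size on disk: %.1f MB' % (total_bytes / 1024 / 1024))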

Accuracy comparison to be added later:

2019/10/16: Without any fine-tuning, on a sequence-type task albert_base's overall F1 is about 0.8% lower than bert_base's; after fine-tuning a small gap remains (around 0.3%). So the choice of model involves a trade-off: if model size is not a concern, BERT_base is the safer pick.
