This project recognizes entities in queries; because of the business scenario, the entities in a query are fairly dense and contiguous.
It is a test run of ALBERT on this task, mainly to get a feel for the ALBERT workflow; the results are acceptable.
It uses the bert4keras package; thanks to its author.
Points to note:
(1) Check the Chinese ALBERT checkpoint carefully; its version has to match what the code expects.
(2) When encoding, the tokenizer adds prefix/suffix tokens ([CLS], [SEP]); the corresponding tag is O (see the short sketch below).
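A minimal sketch of point (2), assuming the same vocab.txt used in the code below; the query and entity labels here are made up for illustration:

from bert4keras.tokenizers import Tokenizer

tokenizer = Tokenizer('albert_large_google_zh/vocab.txt', do_lower_case=True)

chars = "北京大学"                               # a 4-character query
tags = ["B-ORG", "I-ORG", "I-ORG", "I-ORG"]      # illustrative tags, one per character

# encode() on a plain string prepends [CLS] and appends [SEP],
# so the tag sequence has to be padded with "O" on both ends to stay aligned.
token_ids, segment_ids = tokenizer.encode(chars)
tags = ["O"] + tags + ["O"]
assert len(token_ids) == len(tags)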
The implementation is as follows:
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Lambda, Dense
from keras import layers
from keras_contrib.layers import CRF
from keras.models import Model, model_from_json
from keras.utils.np_utils import to_categorical
import random
import math
import os
import json
from keras.optimizers import Adam, SGD
import numpy as np
import keras.backend.tensorflow_backend as KTF
import tensorflow as tf
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
config = tf.ConfigProto()
config.gpu_options.allow_growth = True  # do not grab all GPU memory up front; allocate on demand
sess = tf.Session(config=config)
set_gelu('tanh')  # switch the GeLU version
config_path = 'albert_large_google_zh/albert_config.json'
checkpoint_path = 'albert_large_google_zh/albert_model.ckpt'
dict_path = 'albert_large_google_zh/vocab.txt'
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
class ner_model_resume(object):
    def __init__(self):
        self.max_sentence_len = 15 + 2
        self.class_num = 0  # number of tag classes
        self.word_num = 0  # vocabulary size
        self.word2id = None
        self.tag2id = None
        self.id2tag = {}
        self.batch_size = 128
        self.CONV_SIZE = 256
        self.model_load = None
    def build_model(self):
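        # Overall structure: frozen ALBERT encoder -> BiLSTM -> Dense -> CRF.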
        model_bert = build_transformer_model(
            config_path,
            checkpoint_path,
            model='albert',
        )
        model_bert.trainable = False
        output_layer = 'Transformer-FeedForward-Norm'
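        # bert4keras builds ALBERT with a single shared Transformer block, so this layer
        # is called once per hidden layer; get_output_at(12 - 1) takes the output of its
        # 12th application.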
        output = model_bert.get_layer(output_layer).get_output_at(12 - 1)
        # dense = layers.TimeDistributed(Dense(len(self.word2id), activation="relu"), name="time_distributed")(output)
        output = layers.Bidirectional(layers.LSTM(512, return_sequences=True))(output)
        # dense = layers.TimeDistributed(Dense(len(self.word2id), activation="softmax"), name="time_distributed")(output)
        dense = layers.Dense(len(self.word2id), activation="relu")(output)
        crf = CRF(self.class_num, sparse_target=False)
        crf_res = crf(dense)
        model = Model(model_bert.input, crf_res)
        adam = Adam(lr=0.000005)
        model.compile(optimizer=adam, loss=crf.loss_function, metrics=[crf.accuracy])
        print(model.summary())
        return model
    def gene_batch_data(self, sent_list, tag_list, word2id, tag2id):
        # convert sentences and tags into id sequences
        sent_list_id = []
        seg_list_id = []
        tag_list_id = []
        for sent in sent_list:
            sent = ["[CLS]"] + sent + ["[SEP]"]
            # sent = "".join(sent)
            token_ids, segment_ids = tokenizer.encode(sent, first_length=self.max_sentence_len)
            sent_list_id.append(token_ids)
            seg_list_id.append(segment_ids)
        for tag in tag_list:
            tmp_tag_list_id = []
            tag = ["O"] + tag + ["O"]
            for t in tag:
                tmp_tag_list_id.append(tag2id[t])
            if len(tmp_tag_list_id) < self.max_sentence_len:
                tmp_tag_list_id = tmp_tag_list_id + [tag2id["O"]] * (self.max_sentence_len - len(tmp_tag_list_id))
            if len(tmp_tag_list_id) >= self.max_sentence_len:
                tmp_tag_list_id = tmp_tag_list_id[0:self.max_sentence_len]
            tag_list_id.append(np.array(tmp_tag_list_id))
        train_x = np.stack(sent_list_id, axis=0)
        train_x1 = np.stack(seg_list_id, axis=0)
        train_y = np.stack(tag_list_id, axis=0)
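        # the CRF layer in build_model uses sparse_target=False, so the targets are one-hot encoded here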
        train_y = to_categorical(train_y, num_classes=self.class_num)
        # # shuffle and split
        # cc = list(zip(train_x, train_y))
        # random.shuffle(cc)
        # train_x[:], train_y[:] = zip(*cc)
        #
        return [train_x, train_x1], train_y