#初始化my_backend
label-studio-ml init my_backend --script label-studio-ml-backend\label_studio_ml\bert_bilstm_crf_ner.py
#bert_bilstm_crf_ner代码如下:
import os
import pinyin
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari.embeddings import BertEmbedding,TransformerEmbedding
from label_studio_ml.model import LabelStudioMLBase
from label_studio.core.settings.base import DATA_UNDEFINED_NAME
class Bert_BiLSTM_CRF_NER(LabelStudioMLBase):
    """Label Studio ML backend for Chinese NER with an ALBERT embedding
    feeding a kashgari BiLSTM-CRF sequence tagger.

    Chinese label names from the labeling config are transliterated to
    pinyin initials (via ``pinyin.get_initial``) so they can serve as
    ASCII BIO tag suffixes (``B-XX`` / ``I-XX``) for the tagger.
    """

    def __init__(self, **kwargs):
        # Initialize the Label Studio base class first.
        super().__init__(**kwargs)

        # NOTE(review): hard-coded Windows paths — consider making these
        # configurable (environment variables / kwargs).
        model_folder = 'C:/Python38/Scripts/albert_small_zh_google'
        vocab_path = os.path.join(model_folder, 'vocab.txt')
        config_path = os.path.join(model_folder, 'albert_config_small_google.json')
        checkpoint_path = os.path.join(model_folder, 'albert_model.ckpt')
        self.embedding = TransformerEmbedding(
            vocab_path, config_path, checkpoint_path, 'albert')

        # First (and only) control tag from the labeling configuration.
        self.from_name, self.info = list(self.parsed_label_config.items())[0]
        self.to_name = self.info['to_name'][0]
        self.model_file = 'C:/Users/WXL/bert-bilstm-crf-ner'

        if not self.train_output:
            # Fresh start: labels come from the labeling config,
            # weights from the default model file.
            zh_labels = self.info['labels']
            model_file = self.model_file
            source = 'Initialized'
        else:
            # Otherwise load labels and weights from the latest
            # training results.
            zh_labels = self.train_output['labels']
            model_file = self.train_output['model_file']
            source = 'Loaded from train output'

        self._build_label_maps(zh_labels)
        self.reset_model()
        self.model = self.model.load_model(model_file)
        print('{source} with from_name={from_name}, to_name={to_name}, labels={labels}'.format(
            source=source, from_name=self.from_name, to_name=self.to_name,
            labels=str(self.zh_labels)))

    def _build_label_maps(self, zh_labels):
        """Populate the Chinese <-> pinyin-initial label lookup tables.

        BUGFIX: the original dict comprehensions iterated
        ``for l, l in enumerate(...)`` and only produced the intended
        mapping through accidental variable shadowing; this spells the
        mapping out explicitly.
        """
        self.zh_labels = list(zh_labels)
        self.en_labels = [pinyin.get_initial(l, delimiter="") for l in self.zh_labels]
        self.zh_label2en_label = dict(zip(self.zh_labels, self.en_labels))
        self.en_label2zh_label = dict(zip(self.en_labels, self.zh_labels))

    def reset_model(self):
        """Create a fresh, untrained BiLSTM-CRF model over the embedding."""
        self.model = BiLSTM_CRF_Model(self.embedding)
        print('reset_model')

    def _decode_bio(self, tags):
        """Yield ``(start, end, en_label)`` entity spans from a BIO tag list.

        ``end`` is exclusive. A span is closed when the label changes,
        a ``B-`` tag starts a new entity, or the sequence ends — the
        original implementation dropped entities ending at the last
        character and located spans with ``str.index`` (wrong for
        repeated substrings).
        """
        cur_start, cur_label = None, None
        for i, tag in enumerate(tags):
            label = tag[2:] if len(tag) > 1 else None
            if label not in self.en_label2zh_label:
                label = None
            if cur_label is not None and (label != cur_label or tag.startswith('B')):
                yield cur_start, i, cur_label
                cur_start, cur_label = None, None
            if label is not None and cur_label is None:
                cur_start, cur_label = i, label
        if cur_label is not None:
            yield cur_start, len(tags), cur_label

    def predict(self, tasks, **kwargs):
        """Run NER over each task's text and return Label Studio predictions.

        BUGFIX: results now reference ``self.from_name``/``self.to_name``
        (parsed in ``__init__``) instead of the hard-coded tag names
        ``'label'``/``'text'``.
        """
        predictions = []
        for task in tasks:
            print('task', task)
            sentence = task['data'][DATA_UNDEFINED_NAME]
            print('predict,sentence:', sentence)
            # kashgari expects a batch of character lists.
            tags = self.model.predict([list(sentence)])[0]
            results = []
            for start, end, en_label in self._decode_bio(tags):
                results.append({
                    'from_name': self.from_name,
                    'to_name': self.to_name,
                    'type': 'labels',
                    'value': {
                        'start': start,
                        'end': end,
                        'text': sentence[start:end],
                        'labels': [self.en_label2zh_label.get(en_label)]
                    }
                })
            print('results:', results)
            predictions.append({'result': results})
        return predictions

    def fit(self, completions, workdir=None, **kwargs):
        """Train the tagger on annotated completions.

        Skipped/cancelled/empty annotations are ignored. Each annotated
        region is converted to ``B-``/``I-`` tags over an all-``'O'``
        baseline. Returns the train output dict consumed by ``__init__``.
        """
        train_x, train_y = [], []
        for completion in completions:
            annotation = completion['annotations'][0]
            if annotation.get('skipped') or annotation.get('was_cancelled') \
                    or not annotation['result']:
                continue
            text = completion['data']['ner']
            data_x = list(text)
            data_y = ['O'] * len(text)
            for result in annotation['result']:
                zh_label = result['value']['labels'][0]
                en_label = pinyin.get_initial(zh_label, delimiter="")
                start = result['value']['start']
                for i, _ in enumerate(result['value']['text']):
                    data_y[start + i] = ('B-' if i == 0 else 'I-') + en_label
            train_x.append(data_x)
            train_y.append(data_y)
        # Train from scratch, then persist weights for later predictions.
        self.reset_model()
        self.model.fit(train_x, train_y, epochs=12, batch_size=1)
        self.model.save(self.model_file)
        return {
            'labels': self.zh_labels,
            'model_file': self.model_file
        }
#启动my_backend
label-studio-ml start .\my_backend
#启动label-studio
label-studio start
创建一个工程并导入标注任务数据
设置NER模板
设置backend
然后就可以在标签任务列表中看到backend的预测结果了。
总结:
使用迁移学习进行领域命名实体识别存在的问题:
1.训练和预测效率低(训练速度与样本数量相关,预测5-8条/秒,使用albert相比bert没有多少效率的提升,论文说能提升10倍,实测提升60%的样子)
2.预测的精度不高,需要海量样本,需要做大量的样本标注
相比而言,不如使用模板进行识别的精度高、效率高(3000-5000条/秒)。