实现一个用于领域命名实体识别(NER)的 Label Studio ML backend

#初始化my_backend

label-studio-ml init my_backend --script label-studio-ml-backend\label_studio_ml\bert_bilstm_crf_ner.py

#bert_bilstm_crf_ner代码如下:

import os
import pinyin

from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari.embeddings import BertEmbedding,TransformerEmbedding

from label_studio_ml.model import LabelStudioMLBase
from label_studio.core.settings.base import DATA_UNDEFINED_NAME


class Bert_BiLSTM_CRF_NER(LabelStudioMLBase):
    """Label Studio ML backend for character-level NER with a BiLSTM-CRF model.

    The model is a kashgari ``BiLSTM_CRF_Model`` on top of an ALBERT
    ``TransformerEmbedding``. Label names coming from the labeling config
    (``zh_labels``) are converted to their pinyin initials (``en_labels``)
    before being used as BIO tag suffixes, and mapped back when predictions
    are returned.

    NOTE(review): two different labels whose pinyin initials are identical
    collapse into one tag; the reverse map then keeps only the last of them.
    """

    def __init__(self, **kwargs):
        """Build the embedding, read the label config and load a saved model.

        Args:
            **kwargs: forwarded unchanged to ``LabelStudioMLBase``.
        """
        # don't forget to initialize base class...
        super(Bert_BiLSTM_CRF_NER, self).__init__(**kwargs)
        model_folder = 'C:/Python38/Scripts/albert_small_zh_google'
        vocab_path = os.path.join(model_folder, 'vocab.txt')
        config_path = os.path.join(model_folder, 'albert_config_small_google.json')
        checkpoint_path = os.path.join(model_folder, 'albert_model.ckpt')
        self.embedding = TransformerEmbedding(vocab_path, config_path, checkpoint_path, 'albert')
        # Only the first control tag of the labeling config is used.
        self.from_name, self.info = list(self.parsed_label_config.items())[0]
        self.to_name = self.info['to_name'][0]
        self.zh_label2en_label = {}
        self.en_label2zh_label = {}
        self.en_labels = []
        self.zh_labels = []
        self.model_file = 'C:/Users/WXL/bert-bilstm-crf-ner'
        # True once self.model holds trained weights (loaded from disk or fitted).
        self.model_loaded = False
        if not self.train_output:
            # No training results yet: take the labels from the labeling config.
            self._set_labels(self.info['labels'])
            self.reset_model()
            # Loading a missing path would make the backend unusable before the
            # very first training run, so only load when something was saved.
            if os.path.exists(self.model_file):
                self.model = self.model.load_model(self.model_file)
                self.model_loaded = True
            print('Initialized with from_name={from_name}, to_name={to_name}, labels={labels}'.format(
                from_name=self.from_name, to_name=self.to_name, labels=str(self.zh_labels)
            ))
        else:
            # otherwise load the model from the latest training results
            self.reset_model()
            self.model = self.model.load_model(self.train_output['model_file'])
            self.model_loaded = True
            # and use the labels from training outputs
            self._set_labels(self.train_output['labels'])
            print('Loaded from train output with from_name={from_name}, to_name={to_name}, labels={labels}'.format(
                from_name=self.from_name, to_name=self.to_name, labels=str(self.zh_labels)
            ))

    def _set_labels(self, zh_labels):
        """Store the config labels and build both label <-> pinyin-initial maps."""
        self.zh_labels = list(zh_labels)
        self.en_labels = [pinyin.get_initial(label, delimiter="") for label in self.zh_labels]
        self.zh_label2en_label = dict(zip(self.zh_labels, self.en_labels))
        self.en_label2zh_label = dict(zip(self.en_labels, self.zh_labels))

    def _task_text(self, data, preferred_key):
        """Return the task text from ``data``.

        ``preferred_key`` is tried first, so existing projects behave as
        before. The value keys declared in the label config, the
        "undefined" data key and ``'ner'`` are then tried as fallbacks.

        Raises:
            KeyError: if none of the candidate keys is present.
        """
        candidates = [preferred_key]
        # assumes each parsed "inputs" entry is a dict with a 'value' key -- TODO confirm
        candidates.extend(item.get('value') for item in self.info.get('inputs', []))
        candidates.extend([DATA_UNDEFINED_NAME, 'ner'])
        for key in candidates:
            if key in data:
                return data[key]
        raise KeyError(preferred_key)

    @staticmethod
    def _decode_spans(tags, known_labels):
        """Turn a BIO tag sequence into ``(start, end, label)`` spans.

        ``end`` is exclusive. A span is closed when a non-entity tag, a ``B-``
        tag or a tag with a different label follows, and also at the end of
        the sequence. Tags whose label is not in ``known_labels`` count as
        non-entity tags.
        """
        spans = []
        span_start = None
        span_label = None
        for pos, tag in enumerate(tags):
            prefix, _, label = tag.partition('-')
            is_entity = prefix in ('B', 'I') and label in known_labels
            if span_start is not None and (not is_entity or prefix == 'B' or label != span_label):
                spans.append((span_start, pos, span_label))
                span_start = None
            if is_entity and span_start is None:
                span_start, span_label = pos, label
        if span_start is not None:
            spans.append((span_start, len(tags), span_label))
        return spans

    def reset_model(self):
        """Replace ``self.model`` with a fresh, untrained BiLSTM-CRF model."""
        self.model = BiLSTM_CRF_Model(self.embedding)
        print('reset_model')

    def predict(self, tasks, **kwargs):
        """Predict entity spans for every task.

        Args:
            tasks: iterable of task dicts; the text is read from ``task['data']``.
            **kwargs: accepted for interface compatibility and ignored.

        Returns:
            One ``{'result': [...]}`` dict per task. Each result entry carries
            the character offsets, the covered text and the original label.
        """
        if not self.model_loaded:
            # Nothing has been trained or loaded yet, so there is nothing to predict with.
            print('predict skipped: no trained model available')
            return [{'result': []} for _ in tasks]

        known_labels = set(self.en_labels)
        predictions = []
        for task in tasks:
            print('task', task)
            sentence = self._task_text(task['data'], DATA_UNDEFINED_NAME)
            print('predict,sentence:', sentence)
            # The model works on single characters.
            y = self.model.predict([list(sentence)])
            # Never index past the sentence or past the tags the model returned.
            tags = list(y[0])[:len(sentence)]
            results = []
            for start, end, en_label in self._decode_spans(tags, known_labels):
                results.append({
                    'from_name': self.from_name,
                    'to_name': self.to_name,
                    'type': 'labels',
                    'value': {
                        # Offsets come from the tag positions, so repeated
                        # substrings are located correctly.
                        'start': start,
                        'end': end,
                        'text': sentence[start:end],
                        'labels': [self.en_label2zh_label.get(en_label)]
                    }
                })
            print('results:', results)
            predictions.append({'result': results})
        return predictions

    def fit(self, completions, workdir=None, **kwargs):
        """Train a new model from annotated tasks and save it.

        Args:
            completions: iterable of annotated task dicts; only the first
                annotation of each task is used.
            workdir: accepted for interface compatibility and ignored.
            **kwargs: accepted for interface compatibility and ignored.

        Returns:
            Dict with the label list (``'labels'``) and the path the model was
            saved to (``'model_file'``).
        """
        train_x, train_y = [], []
        for completion in completions:
            annotations = completion.get('annotations') or []
            if not annotations:
                continue
            annotation = annotations[0]
            if annotation.get('skipped') or annotation.get('was_cancelled') or not annotation['result']:
                continue
            text = self._task_text(completion['data'], 'ner')
            data_x = list(text)
            data_y = ['O'] * len(data_x)
            for result in annotation['result']:
                value = result.get('value', {})
                labels = value.get('labels')
                if not labels:
                    # Not a labeled span (nothing to tag).
                    continue
                en_label = pinyin.get_initial(labels[0], delimiter="")
                start = value['start']
                if 'text' in value:
                    length = len(value['text'])
                else:
                    length = value['end'] - start
                # Clip to the text so a span running past the end cannot raise.
                stop = min(start + length, len(data_y))
                for pos in range(start, stop):
                    data_y[pos] = ('B-' if pos == start else 'I-') + en_label
            train_x.append(data_x)
            train_y.append(data_y)

        # train the model
        self.reset_model()
        self.model.fit(train_x, train_y, epochs=12, batch_size=1)
        self.model_loaded = True
        # save output resources
        self.model.save(self.model_file)
        train_output = {
            'labels': self.zh_labels,
            'model_file': self.model_file
        }
        return train_output

#启动my_backend

label-studio-ml start .\my_backend

#启动label-studio

label-studio start

创建一个工程并导入标注任务数据

 设置NER模板

设置backend

 

然后就可以在标签任务列表中看到backend的预测结果了。

总结:

使用迁移学习进行领域命名实体识别存在的问题:

1.训练和预测效率低(训练速度与样本数量相关,预测速度约5-8条/秒;使用albert相比bert没有多少效率的提升,论文说能提升10倍,实测只提升60%左右)

2.预测的精度不高,需要海量样本,需要做大量的样本标注

相比而言,不如使用模板进行识别的精度高、效率高(3000-5000条/秒)。

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 3
    评论
评论 3
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值