Implementing NER with BiLSTM-CRF

Background

	NLP tasks fall into four broad categories:
		Sequence labeling (word segmentation, POS tagging, NER (named entity recognition))
		Classification (text classification, sentiment classification)
		Sentence-pair tasks (sentence relations (entailment, inference), similarity computation)
		Generation (machine translation (seq2seq), ...)

Problem

	This article implements the NER task with a BiLSTM+CRF model.

Implementation

	The full implementation covers data collection, data preprocessing, building the model, adjusting parameters such as the model and data dimensions so that training runs correctly, and recording the training results.

Dataset

Link
This article uses a Kaggle NER dataset (https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus).

Data preprocessing

	The steps are:
		1. Read the dataset
		2. Convert the format
		3. Build word_vocab and tag_vocab
		4. Extract sentences and tags and convert them to numeric IDs
		5. Pad the arrays
import csv
import json
class Data_Loader:
    def __init__(self, raw_data_path='E:/sentiment_classification/dataset/ner_annotated_corpus/ner_dataset.csv'):
        self.data = []
        self.path = raw_data_path
        self.sentences = []
        self.sentences_dict = {}
        self.vocab_word = {}
        self.vocab_label = {}

    def read(self):
        with open(self.path, 'r', encoding='ISO-8859-1') as f:
            reader = csv.reader(f)
            for line in reader:
                self.data.append(line)
        # print(self.data)

    def trans_format(self):
        self.data = self.data[1:]
        sentences = self.sentences
        sentence1 = []
        for line in self.data:
            # if "" in line:   # 有缺陷
            if line[0] == '':
                sentence1.append(line)
            else:
                if sentence1:  # only append once sentence1 is non-empty (an empty list is falsy)
                    sentences.append(sentence1)
                sentence1 = []
                sentence1.append(line)
        sentences.append(sentence1)
        return sentences

    def generate_sentence_and_tag(self):
        sentence_dict = {}
        sentence_indexs, sentence_words, sentence_poss, sentence_tags = [], [], [], []
        for line in self.sentences:
            indexs, words, poss, tags = [], [], [], []
            for (index, word, pos, tag) in line:
                if not (indexs):  # record the sentence index only once, at the first token
                    indexs.append(index)
                words.append(word)
                poss.append(pos)
                tags.append(tag)
            sentence_indexs.append(indexs)
            sentence_words.append(words)
            sentence_poss.append(poss)
            sentence_tags.append(tags)
        sentence_dict['indexs'] = sentence_indexs
        sentence_dict['words'] = sentence_words
        sentence_dict['poss'] = sentence_poss
        sentence_dict['tags'] = sentence_tags
        self.sentences_dict = sentence_dict
        return sentence_dict

    def get_vocab(self):
        word = []
        label = []
        words = self.sentences_dict['words']
        tags = self.sentences_dict['tags']
        for (i, ii) in zip(words, tags):
            for j, jj in zip(i, ii):
                word.append(j)  # keep raw tokens (no regex normalization), so one flat list suffices
                label.append(jj)
        word_set = set(word)
        label_set = set(label)

        word2id = {word: id + 1 for id, word in enumerate(word_set)}
        label2id = {label: id for id, label in enumerate(label_set)}
        word2id['UNK'] = 0  # note: id 0 doubles as both UNK and the padding value
        self.vocab_word = word2id
        self.vocab_label = label2id
        return word2id, label2id

    def save_vocab(self, path1='vocab_word.json', path2='vocab_label.json'):
        # self.vocab_word['unk'] = 0
        with open(path1, 'w') as f1:
            json.dump(self.vocab_word, f1)
        with open(path2, 'w') as f2:
            json.dump(self.vocab_label, f2)
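	For orientation, here is a minimal sketch of the row layout trans_format expects, based on the Kaggle ner_dataset.csv, where only the first token of each sentence carries a "Sentence: N" marker (the rows below are illustrative):
# Rows as read from the CSV (header removed):
rows = [
    ['Sentence: 1', 'Thousands', 'NNS', 'O'],
    ['', 'of', 'IN', 'O'],
    ['', 'demonstrators', 'NNS', 'O'],
    ['Sentence: 2', 'Families', 'NNS', 'O'],
    ['', 'of', 'IN', 'O'],
]
# trans_format() groups consecutive rows into one list per sentence:
# [[rows[0], rows[1], rows[2]], [rows[3], rows[4]]]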
	The code above defines a Data_Loader class implementing the first three steps. Next, a Data_Process class handles steps 4 and 5: converting the data to numeric IDs and padding it.
from keras_preprocessing.sequence import pad_sequences
class Data_Process:
    def __init__(self, word2id, label2id, sentence_dict, maxlen):
        self.word2id, self.label2id = word2id, label2id
        self.words = sentence_dict['words']
        self.labels = sentence_dict['tags']
        self.max_len = maxlen
        self.to_digit()
        self.pad_data(maxlen)

    def to_digit(self):
        # map each word/tag to its numeric id; unseen words fall back to UNK (id 0)
        for words in self.words:
            for i in range(len(words)):
                word = words[i]
                words[i] = self.word2id.get(word, self.word2id['UNK'])
        for labels in self.labels:
            for i in range(len(labels)):
                label = labels[i]
                labels[i] = self.label2id[label]

    def pad_data(self, max_len):
        self.words = pad_sequences(self.words, maxlen=max_len, padding='post', value=0)
        self.labels = pad_sequences(self.labels, maxlen=max_len, padding='post', value=0)
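	A quick illustration of the padding step (a standalone sketch with made-up sequences; note that pad_sequences truncates from the front by default):
from keras_preprocessing.sequence import pad_sequences
seqs = [[4, 8, 15], [16, 23, 42, 7, 1]]
print(pad_sequences(seqs, maxlen=4, padding='post', value=0))
# [[ 4  8 15  0]
#  [23 42  7  1]]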
	Wrap the two data-processing classes in a single helper:
def processed_data():
    data = Data_Loader()
    data.read()
    sentence = data.trans_format()
    sentence_dict = data.generate_sentence_and_tag()
    word2id, label2id = data.get_vocab()
    data.save_vocab()

    max_len = 20
    process_data = Data_Process(word2id, label2id, sentence_dict, max_len)
    return process_data
    # processed_data() returns a Data_Process instance; process_data.words and
    # process_data.labels are the x and y used for training
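	Calling the helper (a usage sketch; the exact sentence count depends on the corpus):
data = processed_data()
print(data.words.shape)   # (num_sentences, 20) -- padded word ids
print(data.labels.shape)  # (num_sentences, 20) -- padded tag ids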

Model

	Next, build the BiLSTM+CRF model used for training. The theory behind the model is not covered here; this article focuses on the Python implementation.
from keras_contrib.layers import CRF
from keras.layers import Embedding, LSTM, Bidirectional, TimeDistributed, Dense, Dropout
from keras import Model

class Build_Bilstm_CRF(Model):
    def __init__(self, max_len, vocab_size, embedding_dim, hidden_dim, num_class, drop_rate):
        super(Build_Bilstm_CRF, self).__init__()
        self.embed = Embedding(vocab_size, embedding_dim, input_length=max_len)
        self.bilstm = Bidirectional(LSTM(hidden_dim, return_sequences=True))
        self.drop = Dropout(drop_rate)
        self.time = TimeDistributed(Dense(num_class, activation='relu'))
        self.crf = CRF(num_class)

    def call(self, inputs):
        x = inputs
        x = self.embed(x)
        x = self.bilstm(x)
        x = self.drop(x)
        x = self.time(x)
        x = self.crf(x)
        return x
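	For orientation, the tensor shapes flowing through the network (a sketch using the hyperparameters set below):
# input:                  (batch, max_len)                 integer word ids
# Embedding:              (batch, max_len, embedding_dim)
# Bidirectional(LSTM):    (batch, max_len, 2 * hidden_dim) forward and backward states concatenated
# TimeDistributed(Dense): (batch, max_len, num_class)
# CRF:                    (batch, max_len, num_class)      per-step tag scores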

Training and results

	NER produces multi-class tagging results (there is more than one label type), so the final labels are represented as one-hot vectors; y therefore has to be converted to one-hot encoding.
from dataset.ner_annotated_corpus.data_process import processed_data
import numpy as np
from keras.utils import to_categorical

dataset = processed_data()
max_len = dataset.max_len
vocab_size = len(dataset.word2id)  # vocab_word
num_class = len(dataset.label2id)  # vocab_label

x_train = dataset.words[: 30000]
y_train = dataset.labels[: 30000].tolist()
x_test = dataset.words[30000:]  # start at 30000, not 30001, so no sample is skipped
y_test = dataset.labels[30000:].tolist()
y_train = [to_categorical(i, num_classes=num_class) for i in y_train]
y_test = [to_categorical(j, num_classes=num_class) for j in y_test]
test_dataset = (x_test, y_test)
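	As a small illustration of the one-hot conversion (a standalone sketch):
from keras.utils import to_categorical
print(to_categorical([2, 0, 1], num_classes=3))
# [[0. 0. 1.]
#  [1. 0. 0.]
#  [0. 1. 0.]]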
	Hyperparameter settings, model construction, and training:
batchsz = 256
Epochs = 10 
embedding_dim = 50
hidden_dim = 64
drop_rate = 0.3
model = Build_Bilstm_CRF(max_len, vocab_size, embedding_dim, hidden_dim, num_class, drop_rate)
model.compile(optimizer='rmsprop',
              loss=model.crf.loss_function,
              metrics=[model.crf.accuracy])
history = model.fit(x_train, np.array(y_train), batch_size=batchsz, epochs=Epochs, validation_data=(x_test, np.array(y_test)))
print(history.history.keys())
for i in history.history.keys():
    print(history.history[i])
	The results after training for 10 epochs:

[Figure: training history after 10 epochs]

	For a more intuitive view, loss and accuracy can be plotted with matplotlib. The plotting code is omitted here; if needed, it is available in my uploaded resources.
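	A minimal plotting sketch (not the original plotting code; the exact metric key names can be checked with history.history.keys(), printed above):
import matplotlib.pyplot as plt
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel('epoch')
plt.legend()
plt.show()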

Notes and lessons from tuning the model

1) The roles of certain layers when building the model:
	1. Flatten(): flattens the data, folding the trailing dimensions into one, which implicitly reduces dimensionality
	2. TimeDistributed(Dense(num_class, activation='relu')): similar to Flatten() but without reducing dimensionality; it applies the same transformation at each time step
	3. LSTM: with return_sequences=True the output has one more dimension than with False (see the sketch after this list)
2) For a multi-class task, use categorical_crossentropy with one-hot encoded y (sparse_categorical_crossentropy expects integer labels instead).
3) If training fails with the error:
	Error when checking model target: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 30000 arrays
	then the x or y passed to model.fit() has the wrong structure; wrapping them with np.array(x) and np.array(y) fixes it.
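	A quick standalone check of point 1.3 above (a sketch; the printed shapes may display as TensorShape objects depending on the Keras version):
from keras.layers import Input, LSTM
inp = Input(shape=(20, 50))
print(LSTM(64, return_sequences=True)(inp).shape)  # (None, 20, 64) -- one output per time step
print(LSTM(64)(inp).shape)                         # (None, 64)     -- only the final output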

Getting the resources

There are three ways to reproduce the code in this article:
		1. Combine the code snippets from this article, adjust some of the import statements, and fix issues as errors come up (the code uses file paths from my machine and has not been further organized).
		2. The organized code, dataset, and related files have been uploaded to the CSDN resource library (https://download.csdn.net/download/qq_39667545/13137638).
		3. The files and code are also on GitHub (https://github.com/yxq1997/bilstm_crf).

Reposting notice

	All resources for this article have been uploaded; if you have questions, feel free to ask, and I normally reply within two days.
	This is an original blog post (https://editor.csdn.net/md?not_checkout=1&articleId=109935676); please credit the source when reposting. The blog is updated continuously, and feedback is welcome.