Background
NLP tasks fall into four broad categories:
Sequence labeling (word segmentation, POS tagging, NER (named entity recognition))
Classification (text classification, sentiment classification)
Sentence-pair tasks (entailment/inference between sentences, similarity computation)
Generation tasks (machine translation (seq2seq), ...)
Problem
This post implements an NER task using a BiLSTM+CRF model.
Implementation Process
The whole process covers collecting the data, preprocessing it, building the model, adjusting parameters such as the model and data dimensions until the model trains properly, and recording the training results.
Dataset
Source
This post uses a NER dataset from Kaggle (https://www.kaggle.com/abhinavwalia95/entity-annotated-corpus).
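For reference, the preprocessing code below assumes each row of ner_dataset.csv has four columns (sentence marker, word, POS tag, NER tag), and that the first column is filled only on the first word of each sentence and left empty on the remaining rows, roughly like this (values are illustrative, not copied from the file):

Sentence #,Word,POS,Tag
Sentence: 1,Thousands,NNS,O
,of,IN,O
,demonstrators,NNS,O
,marched,VBD,O
,through,IN,O
,London,NNP,B-geo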
Data Preprocessing
The steps are:
1. Read the dataset
2. Convert the format
3. Build word_vocab and tag_vocab
4. Get the sentences and tags and convert them to integer ids
5. Pad the arrays
import csv
import json


class Data_Loader:
    def __init__(self, raw_data_path='E:/sentiment_classification/dataset/ner_annotated_corpus/ner_dataset.csv'):
        self.data = []
        self.path = raw_data_path
        self.sentences = []
        self.sentences_dict = {}
        self.vocab_word = {}
        self.vocab_label = {}

    def read(self):
        # read the raw csv rows into self.data
        with open(self.path, 'r', encoding='ISO-8859-1') as f:
            reader = csv.reader(f)
            for line in reader:
                self.data.append(line)
        # print(self.data)

    def trans_format(self):
        # group the rows into sentences: a row with a non-empty first column starts a new sentence
        self.data = self.data[1:]  # drop the csv header row
        sentences = self.sentences
        sentence1 = []
        for line in self.data:
            # if "" in line:  # flawed
            if line[0] == '':
                sentence1.append(line)
            else:
                if sentence1:  # only runs when sentence1 is non-empty (an empty list is falsy)
                    sentences.append(sentence1)
                    sentence1 = []
                sentence1.append(line)
        sentences.append(sentence1)  # append the last sentence as well
        return sentences

    def generate_sentence_and_tag(self):
        sentence_dict = {}
        sentence_indexs, sentence_words, sentence_poss, sentence_tags = [], [], [], []
        for line in self.sentences:
            indexs, words, poss, tags = [], [], [], []
            for (index, word, pos, tag) in line:
                if not indexs:  # add the element only while the list is still empty
                    indexs.append(index)
                words.append(word)
                poss.append(pos)
                tags.append(tag)
            sentence_indexs.append(indexs)
            sentence_words.append(words)
            sentence_poss.append(poss)
            sentence_tags.append(tags)
        sentence_dict['indexs'] = sentence_indexs
        sentence_dict['words'] = sentence_words
        sentence_dict['poss'] = sentence_poss
        sentence_dict['tags'] = sentence_tags
        self.sentences_dict = sentence_dict
        return sentence_dict

    def get_vocab(self):
        word = []
        label = []
        words = self.sentences_dict['words']
        tags = self.sentences_dict['tags']
        for (i, ii) in zip(words, tags):
            for j, jj in zip(i, ii):
                word.append(j)  # no regex cleaning, so a single flat list is enough
                label.append(jj)
        word_set = set(word)
        label_set = set(label)
        word2id = {word: id + 1 for id, word in enumerate(word_set)}  # word ids start at 1; 0 is reserved
        label2id = {label: id for id, label in enumerate(label_set)}
        word2id['UNK'] = 0
        self.vocab_word = word2id
        self.vocab_label = label2id
        return word2id, label2id

    def save_vocab(self, path1='vocab_word.json', path2='vocab_label.json'):
        # self.vocab_word['unk'] = 0
        with open(path1, 'w') as f1:
            json.dump(self.vocab_word, f1)
        with open(path2, 'w') as f2:
            json.dump(self.vocab_label, f2)
The code above defines a Data_Loader class that implements the first three steps. Next, a Data_Process class handles steps 4 and 5: converting words and tags to integer ids and padding the sequences.
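As a rough sketch of the intermediate data the two classes exchange (the words here are illustrative, not taken from the real corpus): trans_format groups the raw rows by sentence, and generate_sentence_and_tag regroups them by field:

sentences = [
    [['Sentence: 1', 'Thousands', 'NNS', 'O'],
     ['', 'of', 'IN', 'O'],
     ['', 'demonstrators', 'NNS', 'O']],
    ...
]
sentence_dict = {
    'indexs': [['Sentence: 1'], ...],                    # only the first marker of each sentence is kept
    'words':  [['Thousands', 'of', 'demonstrators'], ...],
    'poss':   [['NNS', 'IN', 'NNS'], ...],
    'tags':   [['O', 'O', 'O'], ...],
}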
from keras_preprocessing.sequence import pad_sequences


class Data_Process:
    def __init__(self, word2id, label2id, sentence_dict, maxlen):
        self.word2id, self.label2id = word2id, label2id
        self.words = sentence_dict['words']
        self.labels = sentence_dict['tags']
        self.max_len = maxlen
        self.to_digit()
        self.pad_data(maxlen)

    def to_digit(self):
        for words in self.words:
            for i in range(len(words)):
                word = words[i]
                words[i] = self.word2id[word]
        for labels in self.labels:
            for i in range(len(labels)):
                label = labels[i]
                labels[i] = self.label2id[label]

    def pad_data(self, max_len):
        self.words = pad_sequences(self.words, maxlen=max_len, padding='post', value=0)
        self.labels = pad_sequences(self.labels, maxlen=max_len, padding='post', value=0)
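A quick, self-contained check of what pad_data does: pad_sequences with padding='post' and value=0 right-pads every sequence to max_len, and by default truncates longer sequences from the front. Note that, with the vocabularies built above, the padding value 0 coincides with the 'UNK' id for words and with one real tag id for labels; the code in this post simply works with that convention.

from keras_preprocessing.sequence import pad_sequences

seqs = [[3, 7, 2], [5, 1, 4, 9, 8, 6]]
print(pad_sequences(seqs, maxlen=5, padding='post', value=0))
# [[3 7 2 0 0]
#  [1 4 9 8 6]]   <- the longer sequence is truncated from the front (truncating='pre' by default)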
Wrap the two data-processing classes into a single helper:
def processed_data():
    data = Data_Loader()
    data.read()
    sentence = data.trans_format()
    sentence_dict = data.generate_sentence_and_tag()
    word2id, label2id = data.get_vocab()
    data.save_vocab()
    max_len = 20
    process_data = Data_Process(word2id, label2id, sentence_dict, max_len)
    return process_data
    # the returned process_data is a Data_Process instance; process_data.words and
    # process_data.labels are the x and y needed for training
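A quick sanity check of what the helper returns (a sketch; the exact sentence count depends on the corpus):

data = processed_data()
print(type(data))          # Data_Process instance
print(data.words.shape)    # (num_sentences, 20): padded word ids
print(data.labels.shape)   # (num_sentences, 20): padded tag ids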
Model
Next, build the BiLSTM+CRF model used for training. The theory behind the model is not covered here; this post focuses on the Python implementation.
from keras_contrib.layers import CRF
from keras.layers import Embedding, LSTM, Bidirectional, TimeDistributed, Dense, Dropout
from keras import Model


class Bulid_Bilstm_CRF(Model):
    def __init__(self, max_len, vocab_size, embedding_dim, hidden_dim, num_class, drop_rate):
        super(Bulid_Bilstm_CRF, self).__init__()
        self.embed = Embedding(vocab_size, embedding_dim, input_length=max_len)
        self.bilstm = Bidirectional(LSTM(hidden_dim, return_sequences=True))
        self.drop = Dropout(drop_rate)
        self.time = TimeDistributed(Dense(num_class, activation='relu'))
        self.crf = CRF(num_class)

    def call(self, inputs):
        x = inputs
        x = self.embed(x)
        x = self.bilstm(x)
        x = self.drop(x)
        x = self.time(x)
        x = self.crf(x)
        return x
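For reference, these are the tensor shapes expected at each step of call() (a sketch, assuming integer word-id inputs of shape (batch, max_len)):

# inputs:                        (batch, max_len)                 integer word ids
# after Embedding:               (batch, max_len, embedding_dim)
# after Bidirectional(LSTM):     (batch, max_len, 2 * hidden_dim)  return_sequences=True keeps the time axis
# after Dropout:                 (batch, max_len, 2 * hidden_dim)
# after TimeDistributed(Dense):  (batch, max_len, num_class)
# after CRF:                     (batch, max_len, num_class)       per-timestep tag scores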
Model Training and Results
NER assigns one of several possible labels to every token (there is more than one label type), so the final labels are represented as one-hot vectors; y therefore has to be converted to one-hot encoding.
from dataset.ner_annotated_corpus.data_process import processed_data
import numpy as np
from keras.utils import to_categorical

dataset = processed_data()
max_len = dataset.max_len
vocab_size = len(dataset.word2id)   # vocab_word
num_class = len(dataset.label2id)   # vocab_label

x_train = dataset.words[:30000]
y_train = dataset.labels[:30000].tolist()
x_test = dataset.words[30000:]
y_test = dataset.labels[30000:].tolist()

# convert the integer tag sequences to one-hot vectors
y_train = [to_categorical(i, num_classes=num_class) for i in y_train]
y_test = [to_categorical(j, num_classes=num_class) for j in y_test]
test_dataset = (x_test, y_test)
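to_categorical turns each padded tag sequence of length max_len into a (max_len, num_class) one-hot matrix, so y_train ends up as a list of arrays of that shape. A tiny illustration (num_classes=3 is just an illustrative value):

from keras.utils import to_categorical

print(to_categorical([0, 2, 1], num_classes=3))
# [[1. 0. 0.]
#  [0. 0. 1.]
#  [0. 1. 0.]]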
Set the hyperparameters, build the model, and train:
batchsz = 256
Epochs = 10
embedding_dim = 50
hidden_dim = 64
drop_rate = 0.3

model = Bulid_Bilstm_CRF(max_len, vocab_size, embedding_dim, hidden_dim, num_class, drop_rate)
model.compile(optimizer='rmsprop',
              loss=model.crf.loss_function,
              metrics=[model.crf.accuracy])
history = model.fit(x_train, np.array(y_train), batch_size=batchsz, epochs=Epochs,
                    validation_data=(x_test, np.array(y_test)))

print(history.history.keys())
for i in history.history.keys():
    print(history.history[i])
After training for 10 epochs, the loop above prints the recorded loss and accuracy values.
For a more intuitive view, the loss and accuracy can be plotted with matplotlib; the plotting code is omitted in this post and can be found in the uploaded resources if needed.
Notes and Lessons from Tuning the Model
1) What some of the layers do when building the model (see the small shape check at the end of this section):
1. Flatten(): flattens the data by merging the trailing dimensions into one, which implicitly reduces the tensor rank.
2. TimeDistributed(Dense(num_class, activation='relu')): similar in spirit to Flatten(), but it does not reduce the rank; it applies the Dense transformation independently at each timestep.
3. LSTM: with return_sequences=True the output has one more dimension than with return_sequences=False, because the timestep axis is kept.
2) For a multi-class task, the loss function has to match the label encoding: categorical_crossentropy expects one-hot encoded y, while sparse_categorical_crossentropy expects integer labels.
3) If training fails with an error like:
Error when checking model target: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), but instead got the following list of 30000 arrays
then the x or y passed to model.fit() has the wrong structure; wrapping them as np.array(x) and np.array(y) fixes it.
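To make point 1) concrete, here is a small shape check (a minimal sketch that exercises the layers in isolation, with illustrative sizes):

from keras import backend as K
from keras.layers import Input, Flatten, Dense, TimeDistributed, LSTM

x = Input(shape=(20, 8))                                   # (batch, timesteps=20, features=8)
print(K.int_shape(Flatten()(x)))                           # (None, 160): trailing dims merged into one
print(K.int_shape(TimeDistributed(Dense(5))(x)))           # (None, 20, 5): Dense applied at every timestep
print(K.int_shape(LSTM(16, return_sequences=True)(x)))     # (None, 20, 16)
print(K.int_shape(LSTM(16, return_sequences=False)(x)))    # (None, 16): one fewer dimension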
How to Get the Resources
There are three ways to reproduce the code in this post:
1. Combine the code snippets from this post directly, adjust some of the import statements, and make small fixes guided by any error messages (the code uses file paths from my own machine and has not been reorganized).
2. The cleaned-up code and dataset files have been uploaded to the CSDN resource library (https://download.csdn.net/download/qq_39667545/13137638).
3. The files and code are also available on GitHub (https://github.com/yxq1997/bilstm_crf).
Reprint Notice
All resources for this post have been uploaded; if you have questions, feel free to ask and I will normally reply within 2 days.
This is an original blog post, located at https://editor.csdn.net/md?not_checkout=1&articleId=109935676; please credit the source when reposting. This blog is updated continuously, and comments and corrections are welcome.