Project background:
The last missing piece of the puzzle is named entity recognition (NER): extracting sedimentological materials, times, and locations from text. One might ask: part-of-speech tagging is already done, so why is NER still needed? First, the project requires it; if this were a pure engineering deliverable, the earlier stages would already have been enough. Second, NER adds predictive power: it can infer new key entities that never appeared in the training data.
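To make the target concrete, here is a hypothetical illustration of the BIO tag scheme the model predicts. The tokens and labels below are made up for illustration; tag names such as B-geo and B-tim follow the corpus used later in this post, and a dedicated material tag would be project-specific.

# Hypothetical (word, tag) pairs illustrating the BIO scheme:
# B-xxx opens an entity, I-xxx continues it, O marks non-entity tokens.
example = [
    ("deposited", "O"),
    ("in", "O"),
    ("the", "O"),
    ("Late", "B-tim"),        # beginning of a time entity
    ("Cretaceous", "I-tim"),  # continuation of the same entity
    ("in", "O"),
    ("Sichuan", "B-geo"),     # a single-token location entity
]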
Model construction
First, create a utils.py file in the models directory; it implements feature extraction. The code is as follows:
import os
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
def load_data(filename):
    data = pd.read_csv(filename, encoding="latin1")
    data = data.fillna(method="ffill")  # fill each NaN with the previous non-missing value
    return data
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        # Collapse each sentence's rows into a list of (word, POS, tag) tuples
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except KeyError:
            return None
# 2. Feature extraction
def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],
    }
    if i > 0:
        # Features from the previous token
        word1 = sent[i - 1][0]
        postag1 = sent[i - 1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True  # beginning of sentence
    if i < len(sent) - 1:
        # Features from the next token
        word1 = sent[i + 1][0]
        postag1 = sent[i + 1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True  # end of sentence
    return features

def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

# def sent2tokens(sent):
#     return [token for token, postag, label in sent]
def bulid_dataset(ner_dataset_dir, dataset_dir, max_len=50):
    """
    Build the dataset, or load a cached copy if one exists.
    :param ner_dataset_dir: path to the raw CSV corpus
    :param dataset_dir: path of the pickled dataset cache
    :param max_len: fixed sentence length after padding
    :return: n_words, n_tags, max_len, words, tags, X_train, X_test, y_train, y_test
    """
    # Load the cached dataset if it already exists
    # dataset_dir = "../data/dataset.pkl"
    if os.path.exists(dataset_dir):
        print("Loading cached dataset")
        with open(dataset_dir, 'rb') as in_data:
            return pickle.load(in_data)
    data = pd.read_csv(ner_dataset_dir, encoding="latin1")
    data = data.fillna(method="ffill")  # fill each NaN with the previous non-missing value
    # Vocabulary and tag set
    words = list(set(data["Word"].values))
    words.append("ENDPAD")
    n_words = len(words)
    tags = list(set(data["Tag"].values))
    n_tags = len(tags)
    getter = SentenceGetter(data)
    sentences = getter.sentences
    # print(sentences[0])
    # plt.hist([len(s) for s in sentences], bins=50)
    # plt.show()
    # Pad every input to the same length, max_len (default 50)
    word2idx = {w: i for i, w in enumerate(words)}
    tag2idx = {t: i for i, t in enumerate(tags)}
    # print(word2idx['Obama'])
    # print(tag2idx['B-geo'])
    # Pad the sentences
    X = [[word2idx[w[0]] for w in s] for s in sentences]
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words - 1)
    # print(X[1])
    # Pad the labels
    y = [[tag2idx[w[2]] for w in s] for s in sentences]
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])
    # print(y[1])
    # One-hot encode the labels
    y = [to_categorical(i, num_classes=n_tags) for i in y]
    # Split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    print(X_train.shape, np.array(y_test).shape)
    print("Saving dataset")
    with open(dataset_dir, 'wb') as out_data:
        pickle.dump([n_words, n_tags, max_len, words, tags, X_train, X_test, y_train, y_test],
                    out_data, pickle.HIGHEST_PROTOCOL)
    return n_words, n_tags, max_len, words, tags, X_train, X_test, y_train, y_test
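Before moving on to the model, a quick sanity check on these helpers is useful. Below is a minimal sketch, assuming a made-up two-row corpus in the same "Sentence #"/"Word"/"POS"/"Tag" layout as ner_dataset.csv; the tokens and tags are illustrative only.

# Minimal smoke test for the helpers above, using made-up data.
import pandas as pd

demo = pd.DataFrame({
    "Sentence #": ["Sentence: 1", "Sentence: 1"],
    "Word": ["Obama", "spoke"],
    "POS": ["NNP", "VBD"],
    "Tag": ["B-per", "O"],
})
getter = SentenceGetter(demo)
sent = getter.sentences[0]     # [('Obama', 'NNP', 'B-per'), ('spoke', 'VBD', 'O')]
print(sent2features(sent)[0])  # feature dict for the first token, including 'BOS': True
print(sent2labels(sent))       # ['B-per', 'O']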
Next, build the BiLSTM-CRF model:
import argparse
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from keras.models import Model, Input
from keras.layers import LSTM, Embedding, Dense, TimeDistributed, Bidirectional
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils
from ner.models.utils import bulid_dataset

plt.style.use("ggplot")
# 1. Data paths
ner_dataset_dir = '../data/ner_dataset.csv'
dataset_dir = '../assets/dataset.pkl'
# 2. Build the dataset
n_words, n_tags, max_len, words, tags, \
X_train, X_test, y_train, y_test = bulid_dataset(ner_dataset_dir, dataset_dir, max_len=50)
def train():
    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words + 1, output_dim=20,
                      input_length=max_len, mask_zero=False)(input)  # 20-dim embedding
    model = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.1))(model)  # variational biLSTM
    model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output
    model = Model(input, out)
    model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
    model.summary()
    history = model.fit(X_train, np.array(y_train), batch_size=8, epochs=10,
                        validation_split=0.1, verbose=1)
    save_load_utils.save_all_weights(model, filepath="../result/bilstm-crf.h5")
    hist = pd.DataFrame(history.history)
    print("History keys:", history.history.keys())
    plt.figure(figsize=(12, 12))
    plt.plot(hist["crf_viterbi_accuracy"], label='Training set accuracy')
    plt.plot(hist["val_crf_viterbi_accuracy"], label='Validation set accuracy')
    plt.title('Bi-LSTM+CRF training and validation accuracy')
    plt.legend()  # add a legend
    plt.savefig("5examples.png")
    plt.show()
def sample():
    """
    Predict with the already-trained model.
    :return:
    """
    # Re-initialize the model with the same configuration as in train()
    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words + 1, output_dim=20,
                      input_length=max_len, mask_zero=False)(input)  # 20-dim embedding
    model = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.1))(model)  # variational biLSTM
    model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output
    model = Model(input, out)
    # Restore the trained weights
    save_load_utils.load_all_weights(model, filepath="../result/bilstm-crf.h5")
    # Predict on one test sentence
    i = 304
    p = model.predict(np.array([X_test[i]]))
    p = np.argmax(p, axis=-1)
    true = np.argmax(y_test[i], -1)
    print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
    print(30 * "=")
    for w, t, pred in zip(X_test[i], true, p[0]):
        print("{:15}: {:5} {}".format(words[w], tags[t], tags[pred]))
if __name__ == '__main__':
    begin = time.time()
    parser = argparse.ArgumentParser(description="Run training or prediction")
    parser.add_argument('--action', default='train', help="input train or test")
    args = parser.parse_args()
    if args.action == 'train':
        train()
    if args.action == 'test':
        sample()
    end = time.time()
    print(end - begin)
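Beyond the accuracy curves, a token-level report gives a better view of per-tag performance. Below is a minimal sketch, assuming the trained model and the variables returned by bulid_dataset are still in scope; padded positions are masked out via the ENDPAD index n_words - 1.

# Sketch: token-level evaluation on the test set. Assumes `model`, `X_test`,
# `y_test`, `tags`, and `n_words` from the script above are in scope.
import numpy as np
from sklearn.metrics import classification_report

pred = np.argmax(model.predict(X_test), axis=-1)  # shape: (n_samples, max_len)
true = np.argmax(np.array(y_test), axis=-1)       # shape: (n_samples, max_len)
mask = X_test != n_words - 1                      # drop the "ENDPAD" positions
y_pred = [tags[p] for p in pred[mask]]
y_true = [tags[t] for t in true[mask]]
print(classification_report(y_true, y_pred))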
Results
Finally, here are the results produced by the model in this project. The code is on GitHub: https://github.com/zhichen-roger/Sedimentology_Ner.git
At the very end, the results are wrapped in a front-end template found online, shown below.
If this helps with your own writing, please cite my published articles.