Building a dataset for named entity recognition in a specific discipline (Step 4 of the overall pipeline, the final step)

Project scenario:

The last piece of the puzzle is named entity recognition: identifying sedimentological materials, times, and places. One might ask: since part-of-speech tagging is already done, why is NER still needed as the final step? First, the project requires it; for a pure engineering deliverable, the earlier steps would already suffice. Second, an NER model can generalize, inferring additional key entities beyond those covered by the dictionary.


Model construction

First, create a utils.py file under the models directory to extract features. The code is as follows:

import os
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

def load_data(filename):
    data = pd.read_csv(filename, encoding="latin1")
    data = data.fillna(method="ffill")  # forward-fill: use the previous non-missing value
    return data


class SentenceGetter(object):
    """Group the flat word table back into sentences of (word, POS, tag) triples."""

    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, p, t) for w, p, t in zip(s["Word"].values.tolist(),
                                                           s["POS"].values.tolist(),
                                                           s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence #").apply(agg_func)
        self.sentences = [s for s in self.grouped]

    def get_next(self):
        try:
            s = self.grouped["Sentence: {}".format(self.n_sent)]
            self.n_sent += 1
            return s
        except KeyError:
            return None

# 2. Feature extraction: hand-crafted per-token features (word shape, POS, context)
def word2features(sent,i):
    word=sent[i][0]
    postag=sent[i][1]

    features={
        'bias':1.0,
        'word.lower()':word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()':word.isupper(),
        'word.istitle()':word.istitle(),
        'word.isdigit()':word.isdigit(),
        'postag':postag,
        'postag[:2]':postag[:2]
    }

    if i>0:
        word1=sent[i-1][0]
        postag1=sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS']=True

    if i<len(sent)-1:
        word1=sent[i+1][0]
        postag1=sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS']=True

    return features

def sent2features(sent):
    return [word2features(sent,i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token,postag,label in sent]

# def sent2tokens(sent):
#     return [token for token,postag,label in sent]


def bulid_dataset(ner_dataset_dir, dataset_dir, max_len=50):

    """
    Build the dataset, or load it from the pickle cache if one already exists.
    :param ner_dataset_dir: path to the raw CSV file
    :param dataset_dir: path of the pickle cache
    :param max_len: fixed sentence length after padding
    :return: n_words, n_tags, max_len, words, tags, X_train, X_test, y_train, y_test
    """
    # If a cached dataset exists, load and return it directly
    if os.path.exists(dataset_dir):
        print("Loading cached dataset")
        with open(dataset_dir, 'rb') as in_data:
            data = pickle.load(in_data)
            return data

    data = pd.read_csv(ner_dataset_dir, encoding="latin1")
    data = data.fillna(method="ffill")  # forward-fill: use the previous non-missing value


    # Vocabulary and tag set
    words = list(set(data["Word"].values))
    words.append("ENDPAD")
    n_words = len(words)
    tags = list(set(data["Tag"].values))
    n_tags = len(tags)
    getter = SentenceGetter(data)
    sentences = getter.sentences
    # print(sentences[0])

    # plt.hist([len(s) for s in sentences], bins=50)
    # plt.show()

    # Pad every input to the same length (max_len, 50 by default)
    word2idx = {w: i for i, w in enumerate(words)}
    tag2idx = {t: i for i, t in enumerate(tags)}

    # print(word2idx['Obama'])
    # print(tag2idx['B-geo'])

    # Convert words to indices and pad each sentence with the ENDPAD index
    X = [[word2idx[w[0]] for w in s] for s in sentences]
    X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words - 1)
    # print(X[1])

    # Convert tags to indices and pad with the "O" (outside) tag
    y = [[tag2idx[w[2]] for w in s] for s in sentences]
    y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])
    # print(y[1])

    # One-hot encode the labels (num_classes = n_tags)
    y = [to_categorical(i, num_classes=n_tags) for i in y]

    # Split into training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
    print(X_train.shape, np.array(y_test).shape)
    print("正在保存数据")
    with open(dataset_dir,'wb') as out_data:
        pickle.dump([n_words, n_tags, max_len, words,tags,X_train, X_test, y_train, y_test],
                    out_data,pickle.HIGHEST_PROTOCOL)

    return n_words, n_tags, max_len, words,tags,X_train, X_test, y_train, y_test
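
To make the expected input format concrete, here is a minimal usage sketch (not part of the original project): it runs SentenceGetter and sent2features on a toy three-token DataFrame mimicking the ner_dataset.csv layout, where the "Sentence #", "Word", "POS" and "Tag" columns are present and the sentence id appears only on each sentence's first row (hence the forward-fill). Note that word2features/sent2features produce hand-crafted feature dictionaries of the kind a linear-chain CRF such as sklearn-crfsuite consumes; the BiLSTM-CRF script below learns its own representations and only calls bulid_dataset.

import pandas as pd
from ner.models.utils import SentenceGetter, sent2features, sent2labels

# Toy data mimicking ner_dataset.csv; the words and tags are illustrative only
toy = pd.DataFrame({
    "Sentence #": ["Sentence: 1", None, None],
    "Word": ["Obama", "visited", "Paris"],
    "POS": ["NNP", "VBD", "NNP"],
    "Tag": ["B-per", "O", "B-geo"],
}).fillna(method="ffill")

getter = SentenceGetter(toy)
sent = getter.sentences[0]         # [('Obama', 'NNP', 'B-per'), ('visited', 'VBD', 'O'), ('Paris', 'NNP', 'B-geo')]
feats = sent2features(sent)
print(feats[0]["word.lower()"])    # obama
print(feats[0].get("BOS"))         # True  -- first token of the sentence
print(feats[1]["-1:postag"])       # NNP   -- context feature from the previous token
print(sent2labels(sent))           # ['B-per', 'O', 'B-geo']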

Next, build the BiLSTM-CRF model:

import argparse
import numpy as np
import pandas as pd
from ner.models.utils import bulid_dataset
from keras.models import Model,Input
from keras.layers import LSTM,Embedding,Dense,TimeDistributed,Dropout,Bidirectional
from keras_contrib.layers import CRF
from keras_contrib.utils import save_load_utils

import matplotlib.pyplot as plt
import time
plt.style.use("ggplot")

# 1. Data file paths
ner_dataset_dir='../data/ner_dataset.csv'
dataset_dir= '../assets/dataset.pkl'

# 2. Build the dataset (cached to a pickle on the first run)
n_words, n_tags, max_len, words,tags,\
X_train, X_test, y_train, y_test=bulid_dataset(ner_dataset_dir,dataset_dir,max_len=50)


def train():
    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words + 1, output_dim=20,
                      input_length=max_len, mask_zero=False)(input)  # 20-dim embedding
    model = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.1))(model)  # variational biLSTM
    model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output
    model = Model(input, out)


    model.compile(optimizer="rmsprop", loss=crf.loss_function, metrics=[crf.accuracy])
    model.summary()

    history = model.fit(X_train, np.array(y_train), batch_size=8, epochs=10,
                        validation_split=0.1, verbose=1)
    save_load_utils.save_all_weights(model,filepath="../result/bilstm-crf.h5")

    hist = pd.DataFrame(history.history)
    print("History keys:", history.history.keys())
    plt.figure(figsize=(12, 12))
    plt.plot(hist["crf_viterbi_accuracy"], label='Training set accuracy')
    plt.plot(hist["val_crf_viterbi_accuracy"], label='Validation set accuracy')
    plt.title('Bi-LSTM+CRF training and validation accuracy')
    plt.legend()  # add a legend
    plt.savefig("5examples.png")
    plt.show()

def sample():
    """
    Predict with the already-trained weights.
    :return:
    """
    # Re-initialize the model with exactly the same configuration as in train()
    input = Input(shape=(max_len,))
    model = Embedding(input_dim=n_words + 1, output_dim=20,
                      input_length=max_len, mask_zero=False)(input)  # 20-dim embedding
    model = Bidirectional(LSTM(units=50, return_sequences=True,
                               recurrent_dropout=0.1))(model)  # variational biLSTM
    model = TimeDistributed(Dense(50, activation="relu"))(model)  # a dense layer as suggested by neuralNer
    crf = CRF(n_tags)  # CRF layer
    out = crf(model)  # output
    model = Model(input, out)

    # Restore the trained weights
    save_load_utils.load_all_weights(model,filepath="../result/bilstm-crf.h5")

    # Predict on a single test sentence (index 304, chosen arbitrarily)
    i = 304
    p = model.predict(np.array([X_test[i]]))
    p = np.argmax(p, axis=-1)
    true = np.argmax(y_test[i], -1)
    print("{:15}||{:5}||{}".format("Word", "True", "Pred"))
    print(30 * "=")
    for w, t, pred in zip(X_test[i], true, p[0]):
        print("{:15}: {:5} {}".format(words[w], tags[t], tags[pred]))

if __name__ == '__main__':
    begin = time.time()
    parser = argparse.ArgumentParser(description="Run training or prediction")
    parser.add_argument('--action', default='train', help="input train or test")
    args = parser.parse_args()
    if args.action == 'train':
        train()
    if args.action == 'test':
        sample()
    end = time.time()
    print("Elapsed time: {:.2f}s".format(end - begin))
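
Assuming the script above is saved as, say, bilstm_crf.py (the original file name is not given), training is then launched with "python bilstm_crf.py --action train" and prediction with "python bilstm_crf.py --action test". Note that --action defaults to 'train', and any value other than 'train' or 'test' simply does nothing.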

Results

Finally, here are the results the model achieves in this project. GitHub code: https://github.com/zhichen-roger/Sedimentology_Ner.git
(screenshot: the model's prediction results)
Last of all, everything is wrapped in a front-end template found online, shown below.
(screenshot: the front-end demo)

If this work helps you publish a paper, please cite my published article.

