Common approaches include HMM, ME (maximum entropy), CRF, SVM, and MLP.
Methods based on HMM, ME, and CRF treat NER as a sequence-labeling (character/word tagging) task, much like word segmentation and POS tagging.
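For example, under the commonly used BIO scheme, the sentence "Barack Obama visited Paris" would be tagged B-PER I-PER O B-LOC: each token is marked as beginning (B), continuing (I), or lying outside (O) a named entity.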
SVM-based approach
Taking English NER as an example, recognition is cast as a classification problem: for every word in the corpus, features are extracted from the word and its context and combined with the word's label to form one training sample. An SVM classifier is trained on these samples, and the trained classifier can then tag previously unlabeled sentences.
Feature extraction (the features module imported by the scripts below):
<span style="font-size:12px;"><span style="font-size:14px;">import re
import scipy.sparse as sp
def extract_features(sentence, vocabulary):
"""Do feature extraction on a single sentence.
We need a sentence, rather than a token, since some features depend
on the context of tokens.
Parameters
----------
sentence : list of string
vocabulary : dict of (string * int)
Maps terms to indices.
"""
n_tokens = len(sentence)
n_features = n_feature_functions + len(vocabulary)
X = sp.lil_matrix((n_tokens, n_features), dtype=bool)
for i in xrange(n_tokens):
for j, f in enumerate(FEATURE_FUNCTIONS):
X[i, j] = f(sentence, i)
# Vocabulary feature
try:
X[i, n_feature_functions + vocabulary[sentence[i][0].lower()]] = 1
except KeyError:
pass
return X
# Spelling
def first_of_sentence(s, i):
return i == 0
def all_caps(s, i):
return s[i][0].isupper()
def initial_cap(s, i):
return s[i][0][0].isupper()
def has_dash(s, i):
return '-' in s[i]
def has_num(s, i):
return re.search(r"[0-9]", s[i]) is not None
# FIXME Learn POS tags from training data.
def isadj(s, i):
return s[i][1] == "Adj"
def isadv(s, i):
return s[i][1] == "Adv"
def isart(s, i):
return s[i][1] == "Art"
def isconj(s, i):
return s[i][1] == "Conj"
def isint(s, i):
return s[i][1] == "Int"
def ismisc(s, i):
return s[i][1] == "Misc"
def isnoun(s, i):
return s[i][1] == "N"
def isnum(s, i):
return s[i][1] == "Num"
def isprep(s, i):
return s[i][1] == "Prep"
def ispunc(s, i):
return s[i][1] == "Punc"
def ispron(s, i):
return s[i][1] == "Pron"
def isverb(s, i):
return s[i][1] == "V"
# Feature metafunctions
def conj(fs):
"""Conjunction of features fs"""
def feature(s, i):
return all(f(s, i) for f in fs)
return feature
def butnot(f1, f2):
def feature(s, i):
return f1(s, i) and not f2(s, i)
return feature
def nextf(f, offset=1):
"""Next token has feature f"""
def feature(s, i):
i += offset
return i < len(s) and f(s, i)
return feature
def prevf(f, offset=1):
"""Previous token has feature f"""
def feature(s, i):
i -= offset
return i >= 0 and f(s, i)
return feature
FEATURE_FUNCTIONS = [initial_cap, all_caps, first_of_sentence, has_dash,
#has_num,
#butnot(initial_cap, first_of_sentence),
prevf(initial_cap), prevf(all_caps),
isadj, isadv, isart, isconj, isint, ismisc,
isnoun, isnum, isprep, ispunc, ispron, isverb]
n_feature_functions = len(FEATURE_FUNCTIONS)
if __name__ == '__main__':
import conll
import sys
for s in conll.read_file(sys.argv[1]):
for i in xrange(len(s)):
print ' '.join(s[i]),
for f in extract_features(s):
print '%s:%d' % f,
print</span></span>
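For instance, calling the extractor on a toy sentence (the tokens and vocabulary here are made up for illustration):

from features import extract_features

sent = [("George", "N", "B-PER"), ("Bush", "N", "I-PER"), ("spoke", "V", "O")]
vocab = {"george": 0, "bush": 1, "spoke": 2}
X = extract_features(sent, vocab)
print(X.shape)   # (3, n_feature_functions + len(vocab)): one boolean feature row per token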
Training:
<span style="font-size:12px;"><span style="font-size:14px;">import logging
import numpy as np
from scikits.learn.grid_search import GridSearchCV
from scikits.learn.svm.sparse import LinearSVC
import scipy.sparse as sp
import conll
from features import extract_features
from util import bio_int
logger = logging.getLogger()
def train(sentences):
"""Train NER tagger.
Parameters
----------
sentences : iterable over list
A sequence of lists of tokens.
"""
if not isinstance(sentences, list):
sentences = list(sentences)
logger.debug("Extracting features")
vocabulary = dict((t[0], i) for s in sentences for i, t in enumerate(s))
X = []
for i, s in enumerate(sentences):
X.append(extract_features(s, vocabulary))
X = sp.vstack(X, format='csr')
# FIXME Only BIO tags for now
y = np.array([bio_int[tok[2][0]] for s in sentences for tok in s])
params = {
"loss": ["l1", "l2"],
"multi_class": [True, False],
"C": [1., 10., 100.],
}
logger.debug("Training linear SVMs")
clf = GridSearchCV(LinearSVC(), params, n_jobs=-1).fit(X, y)
logger.debug("Done, returning the best one")
return (clf.best_estimator, vocabulary)
if __name__ == "__main__":
# Write pickled classifier to stdout.
import cPickle as pickle
import sys
logging.basicConfig(level=logging.DEBUG)
pickle.dump(train(conll.read_file(sys.argv[1])), sys.stdout)</span></span>
Prediction:
<span style="font-size:12px;"><span style="font-size:14px;">from features import extract_features
from util import int_bio
def predict(clf, sentence, vocabulary):
"""Predict BIO labels for a single sentence."""
X = extract_features(sentence, vocabulary)
pred = [int_bio[y] for y in clf.predict(X)]
# Heuristic repair: make output consistent,
# but never worse than the raw prediction.
for i in xrange(len(pred)):
if pred[i] == "I" and (i == 0 or pred[i - 1] == "O"):
pred[i] = "B"
return pred
if __name__ == "__main__":
from conll import read_file
import cPickle as pickle
import sys
if len(sys.argv) != 3:
print >> sys.stderr, "Usage: %s clf input_file" % sys.argv[0]
sys.exit(1)
clf, vocabulary = pickle.load(open(sys.argv[1]))
for sentence in read_file(sys.argv[2]):
Y_pred = predict(clf, sentence, vocabulary)
for (token, pos, y_true), y_pred in zip(sentence, Y_pred):
print token, pos, y_true[0], y_pred
print</span></span>
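The scripts above also import small conll and util helper modules that are not shown in the post. A minimal sketch of what they might contain, assuming a three-column (token, POS tag, NE tag) CoNLL-style format with blank lines between sentences:

# util.py -- assumed mapping between BIO labels and integer class ids
bio_int = {"B": 0, "I": 1, "O": 2}
int_bio = dict((i, tag) for tag, i in bio_int.items())

# conll.py -- assumed reader: whitespace-separated columns, one token per
# line, sentences separated by blank lines
def read_file(path):
    sentences, current = [], []
    for line in open(path):
        line = line.strip()
        if line:
            current.append(tuple(line.split()))
        elif current:
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)
    return sentences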
CoNLL-2003 dataset download: http://www.cnts.ua.ac.be/conll2003/ner/
This one is a bit of a pain: you also have to download the Reuters corpus separately and run the Perl scripts shipped with the CoNLL-2003 package over it before you get the actual dataset.
Neural-network-based approach
This approach uses a sliding-window model. Each word is represented by a word vector, which must be pre-trained with word2vec. With window size w, say w = 5, and word-vector dimension dim = 50, one input is a span of five tokens [x1, x2, x3, x4, x5], each x being a 50-dimensional word vector. The input passes through a hidden layer and a softmax output layer to produce the tag of the center word x3. The network is trained over the whole corpus; once training is done, the MLP can perform named entity recognition.
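A minimal sketch of the forward pass of such a window tagger (all names, sizes, and the random matrices standing in for pre-trained word2vec vectors and learned weights are illustrative assumptions):

import numpy as np

np.random.seed(0)

# Toy setup (illustrative only): window w = 5, 50-dimensional word vectors,
# one hidden layer, 5 possible tags.
w, dim, hidden, n_tags = 5, 50, 100, 5

vocab = ["<pad>", "only", "france", "and", "britain", "backed", "fischler", "."]
word2id = {t: i for i, t in enumerate(vocab)}
emb = np.random.randn(len(vocab), dim) * 0.1    # stand-in for word2vec vectors

W1 = np.random.randn(w * dim, hidden) * 0.01    # hidden-layer weights
b1 = np.zeros(hidden)
W2 = np.random.randn(hidden, n_tags) * 0.01     # softmax-layer weights
b2 = np.zeros(n_tags)

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def predict_tag(tokens, center):
    """Return a tag distribution for the token at position `center`."""
    half = w // 2
    padded = ["<pad>"] * half + tokens + ["<pad>"] * half
    window = padded[center:center + w]            # w tokens centered on `center`
    x = np.concatenate([emb[word2id.get(t, 0)] for t in window])   # (w * dim,)
    h = np.tanh(x.dot(W1) + b1)                   # hidden layer
    return softmax(h.dot(W2) + b2)                # probability per tag

print(predict_tag(["only", "france", "and", "britain", "backed", "fischler", "."], 1))

In a real system the weight matrices (and optionally the embeddings) would be learned by backpropagation over the labeled corpus, which this sketch omits.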
See these notes:
http://nlp.stanford.edu/~socherr/pa4_ner.pdf
Named-Entity Recognition using Deep Learning
https://github.com/brightmart/name_entity_recognition