Common approaches include HMM, ME (maximum entropy), CRF, SVM, and MLP.
Methods based on HMM, ME, and CRF treat NER as a sequence-labeling (character/word tagging) task, much like word segmentation and POS tagging.
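For example, under the commonly used BIO scheme, the sentence "Barack Obama visited Paris" would be tagged B-PER I-PER O B-LOC: each token is marked as beginning (B), continuing (I), or lying outside (O) a named entity.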
SVM-based approach
Taking English NER as an example, recognition is cast as a classification problem: for every word in the corpus, features are extracted from the word and its context and combined with the word's label to form one training sample. An SVM classifier is trained on these samples, and the trained classifier can then tag previously unlabeled sentences.
Feature extraction (the features module imported by the scripts below):
<span style="font-size:12px;"><span style="font-size:14px;">import re
import scipy.sparse as sp
def extract_features(sentence, vocabulary):
"""Do feature extraction on a single sentence.
We need a sentence, rather than a token, since some features depend
on the context of tokens.
Parameters
----------
sentence : list of string
vocabulary : dict of (string * int)
Maps terms to indices.
"""
n_tokens = len(sentence)
n_features = n_feature_functions + len(vocabulary)
X = sp.lil_matrix((n_tokens, n_features), dtype=bool)
for i in xrange(n_tokens):
for j, f in enumerate(FEATURE_FUNCTIONS):
X[i, j] = f(sentence, i)
# Vocabulary feature
try:
X[i, n_feature_functions + vocabulary[sentence[i][0].lower()]] = 1
except KeyError:
pass
return X
# Spelling
def first_of_sentence(s, i):
return i == 0
def all_caps(s, i):
return s[i][0].isupper()
def initial_cap(s, i):
return s[i][0][0].isupper()
def has_dash(s, i):
return '-' in s[i]
def has_num(s, i):
return re.search(r"[0-9]", s[i]) is not None
# FIXME Learn POS tags from training data.
def isadj(s, i):
return s[i][1] == "Adj"
def isadv(s, i):
return s[i][1] == "Adv"
def isart(s, i):
return s[i][1] == "Art"
def isconj(s, i):
return s[i][1] == "Conj"
def isint(s, i):
return s[i][1] == "Int"
def ismisc(s, i):
return s[i][1] == "Misc"
def isnoun(s, i):
return s[i][1] == "N"
def isnum(s, i):
return s[i][1] == "Num"
def isprep(s, i):
return s[i][1] == "Prep"
def ispunc(s, i):
return s[i][1] == "Punc"
def ispron(s, i):
return s[i][1] == "Pron"
def isverb(s, i):
return s[i][1] == "V"
# Feature metafunctions
def conj(fs):
"""Conjunction of features fs"""
def feature(s, i):
return all(f(s, i) for f in fs)
return feature
def butnot(f1, f2):
def feature(s, i):
return f1(s, i) and not f2(s, i)
return feature
def nextf(f, offset=1):
"""Next token has feature f"""
def feature(s, i):
i += offset
return i < len(s) and f(s, i)
return feature
def prevf(f, offset=1):
"""Previous token has feature f"""
def feature(s, i):
i -= offset
return i >= 0 and f(s, i)
return feature
FEATURE_FUNCTIONS = [initial_cap, all_caps, first_of_sentence, has_dash,
#has_num,
#butnot(initial_cap, first_of_sentence),
prevf(initial_cap), prevf(all_caps),
isadj, isadv, isart, isconj, isint, ismisc,
isnoun, isnum, isprep, ispunc, ispron, isverb]
n_feature_functions = len(FEATURE_FUNCTIONS)
if __name__ == '__main__':
import conll
import sys
for s in conll.read_file(sys.argv[1]):
for i in xrange(len(s)):
print ' '.join(s[i]),
for f in extract_features(s):
print '%s:%d' % f,
print</span></span>
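For instance, calling the extractor on a toy sentence (the tokens and vocabulary here are made up for illustration):

from features import extract_features

sent = [("George", "N", "B-PER"), ("Bush", "N", "I-PER"), ("spoke", "V", "O")]
vocab = {"george": 0, "bush": 1, "spoke": 2}
X = extract_features(sent, vocab)
print(X.shape)   # (3, n_feature_functions + len(vocab)): one boolean feature row per token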
Training:
<span style="font-size:12px;"><span style="font-size:14px;">import logging
import numpy as np
from scikits.learn.grid_search import GridSearchCV
from scikits.learn.svm.sparse import LinearSVC
import scipy.sparse as sp
import conll
from features import extract_features
from util import bio_int
logger = logging.getLogger()
def train(sentences):
"""Train NER tagger.
Parameters
----------
sentences : iterable over list
A sequence of lists of tokens.
"""
if not isinstance(sentences, list):
sentences = list(sentences)
logger.debug("Extracting features")
vocabulary = dict((t[0], i) for s in sentences for i, t in enumerate(s))
X = []
for i, s in enumerate(sentences):
X.append(extract_features(s, vocabulary))
X = sp.vstack(X, format='csr')
# FIXME Only BIO tags for now
y = np.array([bio_int[tok[2][0]] for s in sentences for tok in s])
params = {
"loss": ["l1", "l2"],
"multi_class": [True, False],
"C": [1., 10., 100.],
}
logger.debug("Training linear SVMs")
clf = GridSearchCV(LinearSVC(), params, n_jobs=-1).fit(X, y)
logger.debug("Done, returning the best one")
return (clf.best_estimator, vocabulary)
if __name__ == "__main__":
# Write pickled classifier to stdout.
import cPickle as pickle
import sys
logging.basicConfig(level=logging.DEBUG)
pickle.dump(train(conll.read_file(sys.argv[1])), sys.stdout)</span></span>
Prediction:
<span style="font-size:12px;"><span style="font-size:14px;">from features import extract_features
from util import int_bio
def predict(clf, sentence, vocabulary):
"""Predict BIO labels for a single sentence."""
X = extract_features(sentence, vocabulary)
pred = [int_bio[y] for y in clf.predict(X)]
# Heuristic repair: make output consistent,
# but never worse than the raw prediction.
for i in xrange(len(pred)):
if pred[i] == "I" and (i == 0 or pred[i - 1] == "O"):
pred[i] = "B"
return pred
if __name__ == "__main__":
from conll import read_file
import cPickle as pickle
import sys
if len(sys.argv) != 3:
print >> sys.stderr, "Usage: %s clf input_file" % sys.argv[0]
sys.exit(1)
clf, vocabulary = pickle.load(open(sys.argv[1]))
for sentence in read_file(sys.argv[2]):
Y_pred = predict(clf, sentence, vocabulary)
for (token, pos, y_true), y_pred in zip(sentence, Y_pred):
print token, pos, y_true[0], y_pred
print</span></span>
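The scripts above also import small conll and util helper modules that are not shown in the post. A minimal sketch of what they might contain, assuming a three-column (token, POS tag, NE tag) CoNLL-style format with blank lines between sentences:

# util.py -- assumed mapping between BIO labels and integer class ids
bio_int = {"B": 0, "I": 1, "O": 2}
int_bio = dict((i, tag) for tag, i in bio_int.items())

# conll.py -- assumed reader: whitespace-separated columns, one token per
# line, sentences separated by blank lines
def read_file(path):
    sentences, current = [], []
    for line in open(path):
        line = line.strip()
        if line:
            current.append(tuple(line.split()))
        elif current:
            sentences.append(current)
            current = []
    if current:
        sentences.append(current)
    return sentences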
CoNLL-2003 dataset download: http://www.cnts.ua.ac.be/conll2003/ner/
This one is a bit of a pain: you also have to download the Reuters corpus separately and run the Perl scripts shipped with the CoNLL-2003 package over it before you get the actual dataset.
Neural-network-based approach
This approach uses a sliding-window model. Each word is represented by a word vector, which must be pre-trained with word2vec. With window size w, say w = 5, and word-vector dimension dim = 50, one input is a span of five tokens [x1, x2, x3, x4, x5], each x being a 50-dimensional word vector. The input passes through a hidden layer and a softmax output layer to produce the tag of the center word x3. The network is trained over the whole corpus; once training is done, the MLP can perform named entity recognition.
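A minimal sketch of the forward pass of such a window tagger (all names, sizes, and the random matrices standing in for pre-trained word2vec vectors and learned weights are illustrative assumptions):

import numpy as np

np.random.seed(0)

# Toy setup (illustrative only): window w = 5, 50-dimensional word vectors,
# one hidden layer, 5 possible tags.
w, dim, hidden, n_tags = 5, 50, 100, 5

vocab = ["<pad>", "only", "france", "and", "britain", "backed", "fischler", "."]
word2id = {t: i for i, t in enumerate(vocab)}
emb = np.random.randn(len(vocab), dim) * 0.1    # stand-in for word2vec vectors

W1 = np.random.randn(w * dim, hidden) * 0.01    # hidden-layer weights
b1 = np.zeros(hidden)
W2 = np.random.randn(hidden, n_tags) * 0.01     # softmax-layer weights
b2 = np.zeros(n_tags)

def softmax(z):
    e = np.exp(z - z.max())
    return e / e.sum()

def predict_tag(tokens, center):
    """Return a tag distribution for the token at position `center`."""
    half = w // 2
    padded = ["<pad>"] * half + tokens + ["<pad>"] * half
    window = padded[center:center + w]            # w tokens centered on `center`
    x = np.concatenate([emb[word2id.get(t, 0)] for t in window])   # (w * dim,)
    h = np.tanh(x.dot(W1) + b1)                   # hidden layer
    return softmax(h.dot(W2) + b2)                # probability per tag

print(predict_tag(["only", "france", "and", "britain", "backed", "fischler", "."], 1))

In a real system the weight matrices (and optionally the embeddings) would be learned by backpropagation over the labeled corpus, which this sketch omits.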
See these notes:
http://nlp.stanford.edu/~socherr/pa4_ner.pdf
Named-Entity Recognition using Deep Learning
https://github.com/brightmart/name_entity_recognition