N-gram 标注
1、一元标注器
from nltk.corpus import brown
import nltk
brown_tagged_sents = brown.tagged_sents(categories = 'news') # extract the already-tagged sentences
brown_sents = brown.sents(categories='news') # extract the raw (untagged) sentences
unigramer_tagger = nltk.UnigramTagger(brown_tagged_sents) # train on the tagged sentences!
unigramer_tagger.tag(brown_sents[2007]) # use the trained model to tag sentence 2007!
[('Various', 'JJ'),
('of', 'IN'),
('the', 'AT'),
('apartments', 'NNS'),
('are', 'BER'),
('of', 'IN'),
('the', 'AT'),
......
# NOTE(review): in NLTK >= 3.6 `evaluate` is deprecated in favour of `accuracy` — confirm installed version.
unigramer_tagger.evaluate(brown_tagged_sents) # score the model on the same sentences it was trained on — an optimistic accuracy estimate!
0.9349006503968017
2、分离训练和测试数据
# Hold out the last 10% of the tagged sentences as a test set.
size = int(len(brown_tagged_sents) * 0.9)
size
4160
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:] # split into train / test portions!
unigram_tagger = nltk.UnigramTagger(train_sents) # train the tagger on the training portion only!
unigram_tagger.evaluate(test_sents) # evaluate the model on previously unseen text!
0.8121200039868434
3、一般的N-gram的标注
# A bigram tagger conditions each word's tag on the previous word's tag.
bigram_tagger = nltk.BigramTagger(train_sents)
bigram_tagger.tag(brown_sents[2007])
[('Various', 'JJ'),
('of', 'IN'),
('the', 'AT'),
('apartments', 'NNS'),
('are', 'BER'),
......
unseen_sent = brown_sents[4203]
bigram_tagger.tag(unseen_sent)
[('The', 'AT'),
('population', 'NN'),
('of', 'IN'),
('the', 'AT'),
('Congo', 'NP'),
......
# Accuracy collapses on held-out data: the sparse-data problem —
# contexts never seen during training cannot be tagged.
bigram_tagger.evaluate(test_sents)
0.10206319146815508
4、组合标注器
# Combine taggers with backoff: try the bigram tagger first, fall back to
# the unigram tagger for unseen contexts, and finally to the default 'NN' tag.
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents)
0.8452108043456593
# Same chain with a trigram level added on top.
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t3 = nltk.TrigramTagger(train_sents, backoff=t2)
# Slightly lower than the bigram chain — more context does not help on sparse data.
t3.evaluate(test_sents)
0.843317053722715
5、标注生词
注意:标注生词的方法是回退到正则表达式标注器或默认标注器
6、存储标注器
# Persist the trained tagger to disk, then reload it.
# Python 3's plain `pickle` already delegates to the fast C implementation
# that `cPickle`/`_pickle` used to provide, so the `_pickle` import is unnecessary.
import pickle

# `with` guarantees the file handles are closed even on error
# (the original left them open until explicit .close() calls).
with open('t3.pkl', 'wb') as output:
    pickle.dump(t3, output, -1)  # -1 → highest available pickle protocol

# Renamed the read handle so it no longer shadows the builtin `input`.
with open('t3.pkl', 'rb') as infile:
    tagger = pickle.load(infile)
# Tag a fresh piece of raw text with the reloaded tagger.
text = """The board's action shows what free enterprise is
up against in our complex maze of regulatory laws."""
tokens = text.split()
tagger.tag(tokens)
[('The', 'AT'),
("board's", 'NN$'),
('action', 'NN'),
('shows', 'NNS'),
('what', 'WDT'),
('free', 'JJ'),
('enterprise', 'NN'),
('is', 'BEZ'),
7、性能限制
# Materialize the (lazy) tagged-sentence corpus view as a plain list.
s = list(brown_tagged_sents)
print(s[:1])
[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')]]
# NOTE: `s` is a list of *sentences*, so these are triples of whole
# sentences, not triples of words — the corrected word-level version
# appears further down.
ss = list(nltk.trigrams(s))
print(ss[:10])
[([('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'),......
# Deliberately WRONG version (see the note below): `ss` holds triples of
# whole sentences, so a[0][1] is the second (word, tag) pair of a sentence,
# not a tag — the conditions built here are meaningless.
cfd = nltk.ConditionalFreqDist(((a[0][1], a[1][1], a[2][0]), a[2][1]) for a in ss)
注意:下面这里频率统计要把前面的提取句子全部改成提取单词!!!!!!!!
# Corrected version: take trigrams of (word, tag) pairs *within* each
# sentence, conditioning the third word's tag on the two preceding tags
# plus the word itself.
cfd = nltk.ConditionalFreqDist(
((x[1], y[1], z[0]), z[1])
for sent in brown_tagged_sents
for x, y, z in nltk.trigrams(sent)
)
# Fraction of tokens whose (tag, tag, word) context is ambiguous —
# an upper bound on the error a deterministic trigram tagger cannot avoid.
ambiguous_contexts = [c for c in cfd.conditions() if len(cfd[c]) > 1]
sum(cfd[c].N() for c in ambiguous_contexts) / cfd.N()
0.1027......
# Tag the 'editorial' category with t3 and compare against the gold tags
# via a confusion matrix.
test_tags = [tag for sent in brown.sents(categories='editorial')
for (word, tag) in t3.tag(sent)]
gold_tags = [tag for (word, tag) in brown.tagged_words(categories='editorial')]
print(nltk.ConfusionMatrix(gold_tags, test_tags))
| W |
| N V D
注意:训练数据中的歧义可产生标注器性能的上限
8、跨句子边界标注
# Re-run the full split + backoff-chain training to reproduce the bigram result.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents)# 90% of the Brown corpus is already plenty of training data!
0.8452108043456593