5.4 自动标注
# Load the news portion of the Brown corpus: tagged sentences for
# evaluating taggers, and untagged sentences for tagging demos.
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories = 'news')
brown_sents = brown.sents(categories='news')
1、默认标注器
import nltk

# Find the single most frequent POS tag in the news category; a default
# tagger that assigns this tag to every word is the simplest baseline.
tag_fd = nltk.FreqDist(tag for (word, tag) in brown.tagged_words(categories='news'))
tag_fd.max()
'NN'
# Tagging pipeline: every token is tagged 'NN' by the default tagger.
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(nltk.word_tokenize(raw))
[('I', 'NN'),
('do', 'NN'),
('not', 'NN'),
('like', 'NN'),
('green', 'NN'),
('eggs', 'NN'),
('and', 'NN'),
('ham', 'NN'),
(',', 'NN'),
('I', 'NN'),
('do', 'NN'),
('not', 'NN'),
('like', 'NN'),
('them', 'NN'),
('Sam', 'NN'),
('I', 'NN'),
('am', 'NN'),
('!', 'NN')]
# Accuracy of the all-'NN' tagger on the tagged news corpus (~13%,
# i.e. roughly the proportion of NN tokens in the corpus).
default_tagger.evaluate(brown_tagged_sents)
0.13089484257215028
2、正则表达式标注器
# Regex patterns for the RegexpTagger, tried in order — the first match
# wins, so the catch-all '.*' -> 'NN' rule must stay last.
patterns = [
    (r'.*ing$', 'VBG'),                # gerunds
    (r'.*ed$', 'VBD'),                 # simple past
    (r'.*es$', 'VBZ'),                 # 3rd person singular present
    (r'.*ould$', 'MD'),                # modals: could, would, should
    (r'.*\'s$', 'NN$'),                # possessive nouns
    (r'.*s$', 'NNS'),                  # plural nouns
    # cardinal numbers; the decimal point must be escaped (\.) — the
    # original bare '.' matched ANY character, so e.g. '1a5' was tagged CD
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'),
    (r'.*', 'NN'),                     # default: tag everything else as noun
]
# Build a tagger from the regex patterns and tag a sample sentence;
# words matching no earlier pattern fall through to the final 'NN' rule.
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(brown_sents[3])
[('``', 'NN'),
('Only', 'NN'),
('a', 'NN'),
('relative', 'NN'),
('handful', 'NN'),
('of', 'NN'),
('such', 'NN'),
('reports', 'NNS'),
('was', 'NNS'),
('received', 'VBD'),
("''", 'NN'),
(',', 'NN'),
...
# Accuracy of the regexp tagger on the tagged news corpus (~20%).
regexp_tagger.evaluate(brown_tagged_sents)
0.20326391789486245
3、查询标注器
# Lookup tagger: tag each of the 100 most frequent words with its most
# likely tag; words outside the model are tagged None.
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# NLTK 3: FreqDist is a Counter, so fd.keys() is NOT sorted by frequency.
# The original list(fd.keys())[:100] picked an arbitrary 100 words;
# most_common(100) returns the actual 100 most frequent words.
most_freq_words = [word for word, _ in fd.most_common(100)]
likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
# print(likely_tags)
baseline_tagger = nltk.UnigramTagger(model = likely_tags) # UnigramTagger used as a lookup tagger via its model
baseline_tagger.evaluate(brown_tagged_sents)
0.3329355371243312
# 上面的模型运行在未标注的输入文本上
# Run the lookup tagger on raw (untagged) text; words outside the
# 100-word model receive tag None.
sent = brown.sents(categories='news')[3]
baseline_tagger.tag(sent)
[('``', '``'),
('Only', 'RB'),
('a', 'AT'),
('relative', 'JJ'),
('handful', 'NN'),
('of', 'IN'),
('such', 'JJ'),
('reports', 'NNS'),
('was', 'BEDZ'),
('received', 'VBD'),
("''", "''"),
(',', ','),
...
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN')) # add a default tag to the model: back off to 'NN' for unknown words
最重要的一个项目!
# 基于频率标注的小项目!!!!!
import nltk
import pylab
from nltk.corpus import brown
def performance(cfd, wordlist):
    """Return the accuracy of a lookup tagger built from `wordlist`.

    Builds a UnigramTagger whose model maps each word in `wordlist` to
    its most likely tag (per `cfd`), backing off to 'NN' for unknown
    words, and evaluates it on the tagged news sentences.
    (Body indentation restored — it was lost in the original paste.)
    """
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))
def display():
    """Plot lookup-tagger accuracy against model size (1 .. 2**14 words).

    (Body indentation restored — it was lost in the original paste.)
    """
    # The word list must be ordered by corpus frequency. In NLTK 3,
    # iterating a FreqDist does NOT guarantee frequency order (it is a
    # Counter), so use most_common() instead of list(FreqDist(...)).
    fd = nltk.FreqDist(brown.words(categories='news'))
    words_by_freq = [word for word, _ in fd.most_common()]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()

display()