5.4 Automatic Tagging

# Load tagged and raw sentences from the Brown news corpus
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

1. The Default Tagger

# Find the most frequent tag in the news category
import nltk
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
nltk.FreqDist(tags).max()

'NN'

# Tagging pipeline: every token is tagged 'NN'
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(tokens)

[('I', 'NN'),
('do', 'NN'),
('not', 'NN'),
('like', 'NN'),
('green', 'NN'),
('eggs', 'NN'),
('and', 'NN'),
('ham', 'NN'),
(',', 'NN'),
('I', 'NN'),
('do', 'NN'),
('not', 'NN'),
('like', 'NN'),
('them', 'NN'),
('Sam', 'NN'),
('I', 'NN'),
('am', 'NN'),
('!', 'NN')]

default_tagger.evaluate(brown_tagged_sents)

0.13089484257215028
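The default tag does not have to be hard-coded. A minimal sketch (the helper name below is my own, not from the original) derives it from whatever tagged corpus is at hand, so the same idea carries over to other categories or tagsets:

# Hypothetical helper: derive the default tag from the corpus instead of hard-coding 'NN'
def make_default_tagger(tagged_words):
    most_common_tag = nltk.FreqDist(tag for (word, tag) in tagged_words).max()
    return nltk.DefaultTagger(most_common_tag)

auto_default_tagger = make_default_tagger(brown.tagged_words(categories='news'))
auto_default_tagger.tag(tokens)[:3]   # same behaviour as above: every token becomes 'NN'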

2. The Regular Expression Tagger

patterns = [
    (r'.*ing$', 'VBG'),               # gerunds
    (r'.*ed$', 'VBD'),                # simple past
    (r'.*es$', 'VBZ'),                # 3rd singular present
    (r'.*ould$', 'MD'),               # modals
    (r'.*\'s$', 'NN$'),               # possessive nouns
    (r'.*s$', 'NNS'),                 # plural nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), # cardinal numbers
    (r'.*', 'NN'),                    # nouns (default)
]
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(brown_sents[3])

[('``', 'NN'),
('Only', 'NN'),
('a', 'NN'),
('relative', 'NN'),
('handful', 'NN'),
('of', 'NN'),
('such', 'NN'),
('reports', 'NNS'),
('was', 'NNS'),
('received', 'VBD'),
("''", 'NN'),
(',', 'NN'),
...]

regexp_tagger.evaluate(brown_tagged_sents)

0.20326391789486245
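Because the patterns are tried in order and the first match wins, the tagger can be sharpened by putting more specific suffix rules ahead of the catch-all. A small sketch below reuses the patterns list from above and adds an adverb rule of my own (not part of the original list):

# Hypothetical extension: an adverb rule placed in front of the existing patterns
extended_patterns = [(r'.*ly$', 'RB')] + patterns
extended_tagger = nltk.RegexpTagger(extended_patterns)
extended_tagger.tag(['quickly', 'walked', 'books'])
# expected: [('quickly', 'RB'), ('walked', 'VBD'), ('books', 'NNS')]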

3. The Lookup Tagger

fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))

most_freq_words = list(fd.keys())[:100]
likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
# print(likely_tags)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)  # use the lookup tagger's model!
baseline_tagger.evaluate(brown_tagged_sents)

0.3329355371243312
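One caveat: in NLTK 3, FreqDist.keys() is no longer sorted by frequency, so the 100 words chosen above are essentially the first 100 distinct words of the corpus rather than the most frequent ones. A hedged alternative (variable names are mine) selects the genuinely most frequent words with most_common, which should cover far more tokens and therefore score higher:

# Alternative sketch: build the lookup model from the actual 100 most frequent words
top100 = [word for (word, _) in fd.most_common(100)]
likely_tags_top = dict((word, cfd[word].max()) for word in top100)
top_tagger = nltk.UnigramTagger(model=likely_tags_top)
top_tagger.evaluate(brown_tagged_sents)   # typically higher than the 0.33 above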

# Run the model above on untagged input text
sent = brown.sents(categories='news')[3]
baseline_tagger.tag(sent)

[('``', '``'),
('Only', 'RB'),
('a', 'AT'),
('relative', 'JJ'),
('handful', 'NN'),
('of', 'IN'),
('such', 'JJ'),
('reports', 'NNS'),
('was', 'BEDZ'),
('received', 'VBD'),
("''", "''"),
(',', ','),
...]

baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN'))  # add a default tag to the model as backoff!
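With the backoff in place, words outside the 100-word model now receive 'NN' instead of None, so re-running the evaluation (not a figure quoted from the original) should improve on the 0.33 reported above:

# Words missing from the lookup model fall back to the default 'NN' tag
baseline_tagger.evaluate(brown_tagged_sents)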

The most important project!

# A small project: frequency-based lookup tagging!
import nltk
import pylab
from nltk.corpus import brown

def performance(cfd, wordlist):
    # Build a lookup tagger from the given word list, backing off to 'NN',
    # and return its accuracy on the Brown news corpus.
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

def display():
    # Vocabulary sorted by decreasing frequency
    word_freqs = nltk.FreqDist(brown.words(categories='news')).most_common()
    words_by_freq = [w for (w, _) in word_freqs]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))

    # Model sizes 1, 2, 4, ..., 16384 words
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()

display()

[Plot: lookup tagger performance with varying model size]
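The plot aside, performance() can also be called directly to probe a single model size. A quick usage sketch (the 1000-word size is an arbitrary choice of mine):

# Score one lookup-model size without drawing the plot
words_by_freq = [w for (w, _) in nltk.FreqDist(brown.words(categories='news')).most_common()]
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
print(performance(cfd, words_by_freq[:1000]))   # accuracy of a 1000-word lookup tagger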
