Sections covered: built-in taggers, default dictionaries, automatic tagging, n-gram tagging
Built-in Taggers
NLTK's built-in POS tagger: 'alice' -> ('alice', 'NN')
import nltk
text = nltk.word_tokenize("And now for something completely different")
nltk.pos_tag(text)
nltk.pos_tag(['alice']) #pos_tag expects a list of tokens, not a bare string
Converting an already-tagged token string: 'fly/NN' -> ('fly', 'NN')
tagged_token = nltk.tag.str2tuple('fly/NN')
print(tagged_token)
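The same conversion extends to a whole string of tagged tokens; a small sketch:
sent = 'The/AT grand/JJ jury/NN commented/VBD'
[nltk.tag.str2tuple(t) for t in sent.split()]
#[('The', 'AT'), ('grand', 'JJ'), ('jury', 'NN'), ('commented', 'VBD')]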
Nouns
#Which tags are most common in the news category of the Brown corpus
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
tag_fd.most_common()
#Which tags appear before nouns
word_tag_pairs = nltk.bigrams(brown_news_tagged)
noun_preceder = [a[1] for (a, b) in word_tag_pairs if b[1] == 'NOUN']
fdist = nltk.FreqDist(noun_preceder)
[tag for (tag, _) in fdist.most_common()] #most_common returns (item, count) pairs in descending order of frequency
Verbs
#The most common verbs
wsj = nltk.corpus.treebank.tagged_words(tagset='universal') #using the simplified (universal) tagset
word_tag_fd = nltk.FreqDist(wsj) #counts ((word, tag), frequency) pairs
[wt[0] for (wt, _) in word_tag_fd.most_common() if wt[1] == 'VERB']
#Frequency of each part of speech for a given word
cfd1 = nltk.ConditionalFreqDist(wsj) #the word is the condition, the tag is the event
cfd1['cut'].most_common()
#Likely words for a given part of speech
wsj2 = nltk.corpus.treebank.tagged_words()
cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in wsj2)
list(cfd2['VBN'])
#Words that occur both as past tense (VBD) and past participle (VBN), with their contexts
cfd3 = nltk.ConditionalFreqDist(wsj2) #using the original, unsimplified tagset
[w for w in cfd3.conditions() if 'VBD' in cfd3[w] and 'VBN' in cfd3[w]]
idx1 = wsj2.index(('preferred', 'VBD'))
wsj2[idx1-4:idx1+1]
idx2 = wsj2.index(('preferred', 'VBN'))
wsj2[idx2-4:idx2+1]
Finding the most frequent noun tags (original tagset)
def findtags(tag_prefix, tagged_text):
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text
                                   if tag.startswith(tag_prefix))
    return dict((tag, cfd[tag].most_common(5)) for tag in cfd.conditions())

tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news'))
for tag in sorted(tagdict):
    print(tag, tagdict[tag])
Exploring Tagged Corpora
from nltk.corpus import brown
brown_learned_text = brown.words(categories='learned')
sorted(set(b for (a, b) in nltk.bigrams(brown_learned_text) if a == 'often'))
#Look at the parts of speech of words following 'often'
brown_learned_tagged = brown.tagged_words(categories='learned', tagset='universal')
tags = [b[1] for (a, b) in nltk.bigrams(brown_learned_tagged) if a[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()
#Using POS tags to find v.-to-v. phrases (infinitival complements)
def process(sentence):
    for (w1, t1), (w2, t2), (w3, t3) in nltk.trigrams(sentence):
        if t1.startswith('V') and t2 == 'TO' and t3.startswith('V'):
            print(w1, w2, w3)

for tagged_sent in brown.tagged_sents():
    process(tagged_sent)
#Words with a large number of distinct parts of speech
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
data = nltk.ConditionalFreqDist((word.lower(), tag)
                                for (word, tag) in brown_news_tagged)
for word in sorted(data.conditions()):
    if len(data[word]) > 3:
        tags = [tag for (tag, _) in data[word].most_common()]
        print(word, ' '.join(tags))
Default Dictionaries
Basic usage: a default value is created automatically for a new key
from collections import defaultdict
frequency = defaultdict(int) #the argument is a factory function; a lambda also works
frequency['colorless'] = 4
#frequency['ideas'] #default value is 0
pos = defaultdict(list)
pos['sleep'] = ['NOUN', 'VERB']
#pos['ideas'] #default value is []
Creating a dictionary whose default value is 'NOUN'
pos = defaultdict(lambda: 'NOUN')
pos['colorless'] = 'ADJ'
#pos['blog']
list(pos.items())
#Build a default dictionary that maps frequent words to themselves and rare words to 'UNK'
alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
vocab = nltk.FreqDist(alice)
v1000 = [word for (word, _) in vocab.most_common(1000)]
mapping = defaultdict(lambda: 'UNK')
for v in v1000:
    mapping[v] = v
alice2 = [mapping[v] for v in alice] #map the original text through the dictionary
#alice2[:100]
Incrementally Updating a Dictionary
#sort a {tag: count} mapping by value
from collections import defaultdict
counts = defaultdict(int)
from nltk.corpus import brown
for (word, tag) in brown.tagged_words(categories='news', tagset='universal'):
    counts[tag] += 1
#counts['NOUN']
sorted(counts)
from operator import itemgetter
sorted(counts.items(), key=itemgetter(1), reverse=True)
#itemgetter(1) behaves like a function rather than returning a fixed value;
#it only fetches a value when applied to an object; the argument 1 is the index to extract
[t for t, c in sorted(counts.items(), key=itemgetter(1), reverse=True)]
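A one-line demonstration of that behaviour (the tuple is illustrative):
itemgetter(1)(('NOUN', 5)) #applies the getter to the tuple and returns element 1, i.e. 5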
#Index words by their last two letters, e.g. {'ly': ['abandonedly', 'abasedly', ...]}
last_letter = defaultdict(list)
words = nltk.corpus.words.words('en')
for word in words:
    key = word[-2:]
    last_letter[key].append(word)
#last_letter['ly'] #all words ending in 'ly'
#Group anagrams into one entry keyed by their sorted letters
anagrams = defaultdict(list)
for word in words:
    key = ''.join(sorted(word))
    anagrams[key].append(word)
#anagrams['aeilnrt'] #all words using exactly the letters a, e, i, l, n, r, t once each
Complex Keys and Values
pos = defaultdict(lambda: defaultdict(int))
brown_news_tagged = brown.tagged_words(categories='news', tagset='universal')
for ((w1, t1), (w2, t2)) in nltk.bigrams(brown_news_tagged):
    pos[(t1, w2)][t2] += 1
#pos['DET', 'right']
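To read off the most likely tag of a word in a given left-tag context, a small sketch (our own addition; the inner dict is a plain defaultdict, not a FreqDist, so we use max with key=):
ctx = pos[('DET', 'right')]
max(ctx, key=ctx.get) #the most frequent tag for 'right' after a determiner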
Inverting a Dictionary: Swapping Keys and Values
#Simple inverse lookup
counts = defaultdict(int)
for word in nltk.corpus.gutenberg.words('milton-paradise.txt'):
    counts[word] += 1
[key for (key, value) in counts.items() if value == 32]
#one-to-one: keys and values can simply be swapped
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
pos2 = dict((value, key) for (key, value) in pos.items())
#one-to-many: append each original key under its original value
pos.update({'cats': 'N', 'scratch': 'V', 'peacefully': 'ADV', 'old': 'ADJ'})
#insert the new key-value pairs into the original dictionary
pos2 = defaultdict(list)
for key, value in pos.items():
    pos2[value].append(key)
#one-to-many: NLTK's built-in helper
pos2 = nltk.Index((value, key) for (key, value) in pos.items())
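nltk.Index is essentially a defaultdict(list) populated from (key, value) pairs, so one-to-many lookups work directly; with the pos entries above:
pos2['ADJ'] #['colorless', 'old']
pos2['N'] #['ideas', 'cats']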
Automatic Tagging
The Default Tagger
import nltk
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
brown_tagged_words = brown.tagged_words(categories='news')
tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
fdist = nltk.FreqDist(tags)
tag = fdist.max() #tag is the most frequent part of speech
tagger = nltk.DefaultTagger(tag)
raw = "I do not like green eggs and ham, I do not like them Sam I am!"
tokens = nltk.word_tokenize(raw)
tagger.tag(tokens) #tags a list of tokens (one sentence)
tagger.evaluate(brown_tagged_sents) #evaluation takes a list of tagged sentences
The Regular Expression Tagger
patterns = [
    (r'.*ing$', 'VBG'),               #gerunds
    (r'.*ed$', 'VBD'),                #simple past
    (r'.*es$', 'VBZ'),                #3rd singular present
    (r'.*ould$', 'MD'),               #modals
    (r'.*\'s$', 'NN$'),               #possessive nouns
    (r'^-?[0-9]+(\.[0-9]+)?$', 'CD'), #cardinal numbers
    (r'.*', 'NN')                     #nouns (default)
]
regexp_tagger = nltk.RegexpTagger(patterns)
regexp_tagger.tag(brown_sents[3])
regexp_tagger.evaluate(brown_tagged_sents) #accuracy of the regexp tagger, roughly 0.2
#the argument must be a list of tagged sentences
The Lookup Tagger (unigram tagger)
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
most_freq_words = fd.most_common(100) #returns a list of (word, frequency) pairs
likely_tags = dict((word, cfd[word].max()) for (word, _) in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags) #can tag the 100 most frequent words
baseline_tagger.evaluate(brown_tagged_sents) #evaluate its accuracy
baseline_tagger.tag(brown_sents[3]) #tag the fourth sentence
baseline_tagger2 = nltk.UnigramTagger(model=likely_tags,
                                      backoff=nltk.DefaultTagger('NN')) #the backoff tagger is a default tagger
#words outside the 100 most frequent are handled by the backoff tagger
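To see how accuracy grows with the size of the lookup model, a sketch along the lines of the NLTK book's evaluation loop (the helper name performance is our own), reusing fd and cfd from above:
def performance(cfd, wordlist):
    lt = dict((word, cfd[word].max()) for word in wordlist)
    baseline_tagger = nltk.UnigramTagger(model=lt, backoff=nltk.DefaultTagger('NN'))
    return baseline_tagger.evaluate(brown.tagged_sents(categories='news'))

words_by_freq = [w for (w, _) in fd.most_common()]
performance(cfd, words_by_freq[:1000]) #accuracy with a 1000-word model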
N-Gram Tagging
Unigram Tagging
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
#Split off the training data
size = int(len(brown_tagged_sents) * 0.9)
train_sents = brown_tagged_sents[:size]
test_sents = brown_tagged_sents[size:]
#Train and test
unigram_tagger = nltk.UnigramTagger(train_sents)
unigram_tagger.evaluate(test_sents)
N-gram taggers (high precision, limited coverage)
bigram_tagger = nltk.BigramTagger(train_sents) #same training data as the unigram tagger
seen_sent = brown_sents[2007] #a sentence from the training portion
unseen_sent = brown_sents[4203] #a sentence outside the training portion
bigram_tagger.tag(seen_sent)
bigram_tagger.tag(unseen_sent) #n-gram taggers do poorly on unseen sentences
bigram_tagger.evaluate(test_sents)
Combining Taggers
Use the most precise model wherever possible, but fall back to a model with broader coverage when necessary.
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, cutoff=2, backoff=t1)
#contexts seen no more than 2 times are discarded and not stored in the model
t2.evaluate(test_sents)
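The backoff chain extends naturally one level further; a short sketch in the same pattern:
t3 = nltk.TrigramTagger(train_sents, backoff=t2)
t3.evaluate(test_sents)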
Tagging Unknown Words
#step 1: build a vocabulary; words outside it are replaced by UNK
#step 2: a unigram tagger will assign UNK a fixed tag;
#a bigram tagger can tag UNK from the surrounding context
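A minimal sketch of both steps, reusing the UNK mapping idea from the defaultdict section (names such as train_sents_unk and t2_unk are our own):
from collections import defaultdict
vocab = nltk.FreqDist(w for sent in train_sents for (w, _) in sent)
v1000 = set(w for (w, _) in vocab.most_common(1000))
mapping = defaultdict(lambda: 'UNK')
for v in v1000:
    mapping[v] = v
train_sents_unk = [[(mapping[w], t) for (w, t) in sent] for sent in train_sents]
t1_unk = nltk.UnigramTagger(train_sents_unk, backoff=nltk.DefaultTagger('NN'))
t2_unk = nltk.BigramTagger(train_sents_unk, backoff=t1_unk)
#at tagging time, map the input tokens through mapping first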
The Performance Ceiling of N-gram Taggers
cfd = nltk.ConditionalFreqDist(
    ((x[1], y[1], z[0]), z[1])
    for sent in brown_tagged_sents
    for x, y, z in nltk.trigrams(sent))
ambiguous_contexts = [c for c in cfd.conditions()
                      if len(cfd[c]) > 1] #cfd[c] is a FreqDist mapping events to frequencies
sum(cfd[c].N() for c in ambiguous_contexts) / cfd.N()
#cfd.conditions() lists the distinct conditions; cfd.N() and cfd[c].N() count condition/event occurrences with repetition
test_tags = [tag for sent in brown.sents(categories='news')
             for (word, tag) in t2.tag(sent)] #tag() takes untagged sentences
gold_tags = [tag for (word, tag) in brown.tagged_words(categories='news')]
print(nltk.ConfusionMatrix(gold_tags, test_tags))
Tagging Across Sentence Boundaries
#N-gram taggers normally do not tag across sentences; the context of each sentence-initial word is set to None
#to tag across sentences, the context would be the last word of the previous sentence plus its final punctuation
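A small illustration (our own, using the t2 tagger from above): a tagger trained on sentence lists resets its context at every sentence start, so tagging two sentences separately can differ from tagging their concatenation as a single token list.
s1 = ['I', 'do', 'not', 'like', 'them', '.']
s2 = ['Sam', 'agrees', '.']
t2.tag(s1); t2.tag(s2) #context resets at each sentence boundary
t2.tag(s1 + s2) #here 'Sam' sees the tag of '.' as its left context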