from collections import defaultdict

import nltk
from nltk.corpus import brown
# Split the sentence into word tokens, then tag each token with its part of
# speech (the tagged list is echoed when run interactively).
sentence = 'and now for something completely different'
text = nltk.word_tokenize(sentence)
nltk.pos_tag(text)
2.NLTK中提供了每个标记的文档,可以使用标记来查询,如:nltk.help.upenn_tagset('RB');也可以使用正则表达式,如:nltk.help.upenn_tagset('NN.*')。一些语料库有标记集文档的README文件,见nltk.corpus.name.readme(),用语料库名字替换name。
3.找出最频繁的名词标记。
def findtags(tag_prefix, tagged_text):
    """Return the five most frequent words for each tag starting with a prefix.

    Args:
        tag_prefix: Prefix selecting the tags of interest, e.g. ``'NN'``.
        tagged_text: Iterable of ``(word, tag)`` pairs.

    Returns:
        A dict mapping every matching tag to a list of up to five words,
        most frequent first.
    """
    # Condition on the tag so that each tag gets its own word-frequency table.
    cfd = nltk.ConditionalFreqDist(
        (tag, word)
        for (word, tag) in tagged_text
        if tag.startswith(tag_prefix)
    )
    # keys() is a dict view on Python 3 and cannot be sliced; most_common()
    # states the frequency ranking explicitly.
    return {
        tag: [word for word, _count in cfd[tag].most_common(5)]
        for tag in cfd.conditions()
    }
# Show the most frequent words for every noun tag ('NN...') found in the
# news category of the Brown corpus, one tag per line.
tagdict = findtags('NN', nltk.corpus.brown.tagged_words(categories='news'))
for tag in sorted(tagdict):
    print(tag, tagdict[tag])
# A defaultdict supplies a value for keys that are missing: here every word
# that has not been assigned a tag is treated as a noun ('N').
pos = defaultdict(lambda: 'N')
pos['colorless'] = 'ADJ'
pos['blog']  # -> 'N'; looking up a missing key also stores the default
# Sort the items so the listing does not depend on insertion order.
sorted(pos.items())  # -> [('blog', 'N'), ('colorless', 'ADJ')]
# Build a mapping that keeps the 1000 most frequent words of "Alice in
# Wonderland" unchanged and sends every other word to the token 'UNK'.
alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
vocab = nltk.FreqDist(alice)
v1000 = [word for word, _count in vocab.most_common(1000)]
mapping = defaultdict(lambda: 'UNK')
for v in v1000:
    mapping[v] = v
# Count how often each word occurs in "Paradise Lost"; defaultdict(int)
# starts every unseen word at 0, so it can be incremented directly.
counts = defaultdict(int)
for word in nltk.corpus.gutenberg.words('milton-paradise.txt'):
    counts[word] += 1
# Reverse lookup: all words that occur exactly 32 times (echoed when run
# interactively).
[key for (key, value) in counts.items() if value == 32]
# When reverse lookups are needed often, build a second dictionary that maps
# each value back to its key.
pos = dict(colorless='ADJ', ideas='N', sleep='V')
pos2 = {tag: word for word, tag in pos.items()}
# 6. Updating a dictionary: update() adds more words to pos, so that several
# keys now share the same value.
pos.update({'cast': 'N', 'scratch': 'V', 'peacefully': 'ADV'})
# With shared values a value -> key mapping can hold only one key per value,
# so collect all keys of each value in a list instead.
pos2 = defaultdict(list)
for key, value in pos.items():
    pos2[value].append(key)
# 7. Default tagger: assigns the same tag ('NN') to every token.
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
default_tagger.tag(tokens)
# Evaluation against the tagged sentences of the Brown news category.
brown_tagged_sents = brown.tagged_sents(categories='news')
default_tagger.evaluate(brown_tagged_sents)  # only about one eighth is correct
# Lookup tagger: give each of the 100 most frequent news words the tag it
# carries most often, and use that table as the model of a unigram tagger.
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# fd.keys() cannot be sliced on Python 3; most_common() gives the top 100.
most_freq_words = [word for word, _count in fd.most_common(100)]
likely_tags = {word: cfd[word].max() for word in most_freq_words}
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
brown_tagged_sents = brown.tagged_sents(categories='news')
baseline_tagger.evaluate(brown_tagged_sents)
from nltk.corpus import brown
# Train a unigram tagger on the tagged news sentences, tag one sentence with
# it, then score it.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
unigram_tagger.tag(brown_sents[2007])
# NOTE: the tagger is scored on the same sentences it was trained on.
unigram_tagger.evaluate(brown_tagged_sents)
# Chain of taggers: the bigram tagger falls back to the unigram tagger, which
# falls back to the default tagger.
# NOTE(review): train_sents and test_sents are not defined in the visible part
# of this file - confirm they are created (e.g. by splitting the tagged
# sentences) before this point.
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
t2.evaluate(test_sents)
# backoff names the tagger to fall back on