# Categorizing and tagging words (分类和标注词汇)
import nltk
# Tag two example sentences with the default POS tagger, then explore
# distributional similarity over a lower-cased Brown corpus.
for sentence in ("And now for something completely different",
                 "They refuse to permit us to obtain the refuse permit"):
    text = nltk.word_tokenize(sentence)
    res = nltk.pos_tag(text)
    print(res)
text = nltk.Text(w.lower() for w in nltk.corpus.brown.words())
for probe in ('woman', 'bought', 'over'):
    text.similar(probe)
# str2tuple parses the conventional word/TAG notation into a pair.
tagged_token = nltk.tag.str2tuple('fly/NN')
print(tagged_token)
print(tagged_token[0])
print(tagged_token[1])
sent = """
The/AT grand/JJ jury/NN commented/VBD on/IN a/AT number/NN of/IN
other/AP topics/NNS ,/, AMONG/IN them/PPO the/AT Atlanta/NP and/CC
Fulton/NP-tl County/NN-tl purchasing/VBG departments/NNS which/WDT it/PPS
said/VBD ``/`` ARE/BER well/QL operated/VBN and/CC follow/VB generally/RB
accepted/VBN practices/NNS which/WDT inure/VB to/IN the/AT best/JJT
interest/NN of/IN both/ABX governments/NNS ''/'' ./.
"""
res = [nltk.tag.str2tuple(t) for t in sent.split()]
print(res)
# Several corpora ship with tagged versions; print a sample of tagged
# words from each (same order as the corpora are listed here).
for corpus in (nltk.corpus.brown, nltk.corpus.nps_chat,
               nltk.corpus.conll2000, nltk.corpus.treebank,
               nltk.corpus.sinica_treebank, nltk.corpus.indian,
               nltk.corpus.mac_morpho, nltk.corpus.conll2002,
               nltk.corpus.cess_cat):
    res = corpus.tagged_words()
    print(res)
from nltk.corpus import brown

# Distribution of tags in the Brown "news" category.
brown_news_tagged = brown.tagged_words(categories='news')
tag_fd = nltk.FreqDist(tag for (word, tag) in brown_news_tagged)
# BUG FIX: in NLTK 3 FreqDist is a collections.Counter, so .keys() is
# first-seen order, not frequency order; most_common() lists the tags
# most-frequent-first as this lookup intends.
print([tag for (tag, _) in tag_fd.most_common()])
word_tag_pairs = list(nltk.bigrams(brown_news_tagged))
# Tags that precede a proper noun (NP), ordered most frequent first.
np_preceders = nltk.FreqDist(a[1] for (a, b) in word_tag_pairs if b[1] == 'NP')
res = [tag for (tag, _) in np_preceders.most_common()]
print(res)
# Word/tag pairs in the WSJ sample of the Penn Treebank.
wsj = nltk.corpus.treebank.tagged_words()
word_tag_fd = nltk.FreqDist(wsj)
res = []
for (word, tag) in word_tag_fd:
    if tag.startswith("V"):
        res.append(word + "/" + tag)
print(res)

# Condition on the word: which tags does each word take?
cfd1 = nltk.ConditionalFreqDist(wsj)
for probe in ('yield', 'cut'):
    res = list(cfd1[probe].keys())
    print(res)

# Condition on the tag: which words carry a given tag?
cfd2 = nltk.ConditionalFreqDist((tag, word) for (word, tag) in wsj)
print(list(cfd2))
res = list(cfd2['VBG'].keys())
print(res)

# Words seen both as simple past (VBD) and past participle (VBN),
# with the four tokens of context before each use of "kicked".
res = [w for w in cfd1.conditions() if 'VBD' in cfd1[w] and 'VBN' in cfd1[w]]
print(res)
idx1 = wsj.index(('kicked', 'VBD'))
print(wsj[idx1 - 4:idx1 + 1])
idx2 = wsj.index(('kicked', 'VBN'))
print(wsj[idx2 - 4:idx2 + 1])
def findtags(tag_prefix, tagged_text):
    """Group words by tag for every tag starting with tag_prefix.

    Returns a ConditionalFreqDist whose conditions are the matching tags
    and whose values are frequency distributions over the words carrying
    each tag.
    """
    pairs = ((tag, word) for (word, tag) in tagged_text
             if tag.startswith(tag_prefix))
    return nltk.ConditionalFreqDist(pairs)
# List every noun-like tag (NN*) in the news sample with its words.
tagdict = findtags("NN", list(nltk.corpus.brown.tagged_words(categories='news')))
for tag in sorted(tagdict.conditions()):
    print(tag, tagdict[tag])
from nltk.corpus import brown

# Words that follow "often" in the learned genre, deduplicated and sorted.
brown_learned_text = brown.words(categories='learned')
followers = set()
for (left, right) in nltk.bigrams(brown_learned_text):
    if left == 'often':
        followers.add(right)
res = sorted(followers)
print(res)
# Same question at the tag level: which POS tags follow "often"?
brown_lrnd_tagged = brown.tagged_words(categories='learned')
tags = [right[1] for (left, right) in nltk.bigrams(brown_lrnd_tagged)
        if left[0] == 'often']
fd = nltk.FreqDist(tags)
fd.tabulate()
from nltk.corpus import brown

def process(sentence):
    """Print every verb + TO + verb trigram in a tagged sentence."""
    for (first, ftag), (mid, mtag), (last, ltag) in nltk.trigrams(sentence):
        if ftag.startswith('V') and mtag == 'TO' and ltag.startswith('V'):
            print(first, mid, last)

for tagged_sent in brown.tagged_sents():
    process(tagged_sent)
from nltk.corpus import brown
brown_news_tagged = brown.tagged_words(categories='news')

# Find (lower-cased) words that occur with more than three distinct tags.
data = nltk.ConditionalFreqDist((word.lower(), tag)
                                for (word, tag) in brown_news_tagged)
for word in data.conditions():
    tagset = data[word]
    if len(tagset) > 3:
        print(word, ' '.join(tagset.keys()))
# A tiny part-of-speech lexicon built up incrementally as a plain dict.
pos = {}
print(pos)
pos['colorless'] = 'ADJ'
print(pos)
pos['ideas'] = 'N'
pos['sleep'] = 'V'
pos['furiously'] = 'ADV'
print(pos)
print(pos['ideas'])
print(pos['colorless'])
print(list(pos))
print(sorted(pos))
# Entries whose key ends in 's'.
res = []
for entry in pos:
    if entry.endswith('s'):
        res.append(entry)
print(res)
for word in sorted(pos):
    print(word + ":", pos[word])
print(list(pos.keys()))
print(list(pos.values()))
print(list(pos.items()))
for key, val in sorted(pos.items()):
    print(key + ":", val)
# Re-assigning an existing key overwrites its value.
pos['sleep'] = 'V'
print(pos['sleep'])
pos['sleep'] = 'N'
print(pos['sleep'])
# The same lexicon as a literal, then via dict() keyword arguments.
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
print(pos)
pos1 = dict(colorless='ADJ', ideas='N', sleep='V', furiously='ADV')
print(pos1)
# defaultdict: missing keys get a value from the factory instead of KeyError.
frequency = nltk.defaultdict(int)
frequency['colorless'] = 4
print(frequency['ideas'])
pos = nltk.defaultdict(list)
pos['sleep'] = ['N', 'V']
print(pos['ideas'])

# A custom factory: treat any unknown word as a noun.
def noun_by_default():
    return 'N'

pos = nltk.defaultdict(noun_by_default)
pos['colorless'] = 'ADJ'
print(pos['blog'])
print(list(pos.items()))
# Replace everything outside the 1000 most frequent words with 'UNK'.
alice = nltk.corpus.gutenberg.words('carroll-alice.txt')
vocab = nltk.FreqDist(alice)
# BUG FIX: in NLTK 3 FreqDist is a Counter, so list(vocab)[:1000] yields
# keys in first-appearance order, not frequency order; most_common(1000)
# keeps the 1000 most frequent words as intended.
v1000 = [word for (word, _) in vocab.most_common(1000)]
mapping = nltk.defaultdict(lambda: 'UNK')
for v in v1000:
    mapping[v] = v
alice2 = [mapping[v] for v in alice]
print(alice2[:100])
print(len(set(alice2)))
from operator import itemgetter
from nltk.corpus import brown

# Tally tag frequencies in the news category, then sort them by count.
counts = nltk.defaultdict(int)
for (word, tag) in brown.tagged_words(categories='news'):
    counts[tag] += 1
print(counts['NN'])
print(list(counts))
res = sorted(counts.items(), key=itemgetter(1), reverse=True)
print(res)
res = [tag for tag, _ in sorted(counts.items(), key=itemgetter(1), reverse=True)]
print(res)
# operator.itemgetter(1) builds a callable that does the same job as
# subscripting with [1].
pair = ('NP', 8336)
print(pair[1])
get_count = itemgetter(1)
print(get_count(pair))
# Index the English wordlist by its final two letters, then by a
# sorted-letters anagram key.
last_letters = nltk.defaultdict(list)
words = nltk.corpus.words.words('en')
for word in words:
    last_letters[word[-2:]].append(word)
res = last_letters['ly']
print(res)
print(last_letters['zy'])
anagrams = nltk.defaultdict(list)
for word in words:
    anagrams[''.join(sorted(word))].append(word)
print(anagrams['aeilnrt'])
# nltk.Index builds the same grouping in a single call.
anagrams = nltk.Index((''.join(sorted(w)), w) for w in words)
print(anagrams['aeilnrt'])
# Nested default dicts: condition on (previous tag, current word) and
# count the tags observed for the current word.
pos = nltk.defaultdict(lambda: nltk.defaultdict(int))
brown_news_tagged = brown.tagged_words(categories='news')
for ((prev_word, prev_tag), (cur_word, cur_tag)) in nltk.bigrams(brown_news_tagged):
    pos[(prev_tag, cur_word)][cur_tag] += 1
print(pos[('DT', 'library')])

# Words that occur exactly 32 times in Paradise Lost.
counts = nltk.defaultdict(int)
for word in nltk.corpus.gutenberg.words('milton-paradise.txt'):
    counts[word] += 1
res = []
for token, freq in counts.items():
    if freq == 32:
        res.append(token)
print(res)
# Invert the lexicon (tag -> word); only safe while values are unique.
pos = {'colorless': 'ADJ', 'ideas': 'N', 'sleep': 'V', 'furiously': 'ADV'}
pos2 = {value: key for (key, value) in pos.items()}
print(pos2['N'])
# Adding entries with duplicate tags breaks the one-to-one inversion.
pos.update({'cats': 'N', 'scratch': 'V', 'peacefully': 'ADV', 'old': 'ADJ'})
# With duplicate values, invert into tag -> list of words instead.
pos2 = nltk.defaultdict(list)
for word, tag in pos.items():
    pos2[tag].append(word)
print(pos2['ADV'])
# nltk.Index performs the multi-valued inversion in one expression.
pos2 = nltk.Index((tag, word) for (word, tag) in pos.items())
print(pos2['ADV'])
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

# The single most frequent tag in news ('NN') makes a reasonable guess
# for a default tagger.
tags = [t for (_, t) in brown.tagged_words(categories='news')]
print(nltk.FreqDist(tags).max())
raw = 'I do not like green eggs and ham, I do not like them Sam I am!'
tokens = nltk.word_tokenize(raw)
default_tagger = nltk.DefaultTagger('NN')
res = default_tagger.tag(tokens)
print(res)
# NOTE(review): Tagger.evaluate() was renamed accuracy() in NLTK 3.6+;
# evaluate() still works but emits a deprecation warning.
res = default_tagger.evaluate(brown_tagged_sents)
print(res)
# Regex patterns for a fallback tagger; tried in order, first match wins.
# BUG FIXES vs. the original list:
#   * '.*img$' was a typo for '.*ing$' (gerunds, e.g. "running" -> VBG)
#   * '.*$' matched EVERY word, so the cardinal-number and noun fallbacks
#     below it were unreachable and all remaining words were mis-tagged
#     NNS; the plural-noun rule is '.*s$'
patterns = [
    (r'.*ing$', 'VBG'),                 # gerunds
    (r'.*ed$', 'VBD'),                  # simple past
    (r'.*es$', 'VBZ'),                  # 3rd singular present
    (r'.*ould$', 'MD'),                 # modals
    (r'.*\'s$', 'NN$'),                 # possessive nouns
    (r'.*s$', 'NNS'),                   # plural nouns
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'),    # cardinal numbers
    (r'.*', 'NN')                       # default: noun
]
# Apply the regex tagger to one sample sentence, then score it on the
# whole tagged news corpus.
regexp_tagger = nltk.RegexpTagger(patterns)
res = regexp_tagger.tag(brown_sents[3])
print(res)
print(regexp_tagger.evaluate(brown_tagged_sents))
# Lookup tagger: learn the most likely tag for the 100 most frequent words.
fd = nltk.FreqDist(brown.words(categories='news'))
cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
# BUG FIX: in NLTK 3 FreqDist is a Counter, so list(fd.keys())[:100] is
# the first 100 words *encountered*, not the 100 most frequent; use
# most_common(100) to build the model this baseline intends.
most_freq_words = [word for (word, _) in fd.most_common(100)]
likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
baseline_tagger = nltk.UnigramTagger(model=likely_tags)
res = baseline_tagger.evaluate(brown_tagged_sents)
print(res)
sent = brown.sents(categories='news')[3]
res = baseline_tagger.tag(sent)
print(res)
# Back off so out-of-model words get 'NN' instead of None.
baseline_tagger = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN'))
def performance(cfd, wordlist):
    """Score a lookup tagger built from wordlist on the Brown news corpus.

    Each word in wordlist is mapped to its most likely tag according to
    cfd; words outside the model back off to a default 'NN' tagger.
    """
    model = dict((word, cfd[word].max()) for word in wordlist)
    tagger = nltk.UnigramTagger(model=model, backoff=nltk.DefaultTagger('NN'))
    return tagger.evaluate(brown.tagged_sents(categories='news'))
def display():
    """Plot lookup-tagger accuracy against model size (1, 2, 4, ..., 2**14)."""
    import pylab
    # BUG FIX: list(FreqDist(...)) is first-seen order in NLTK 3, not
    # frequency order; most_common() restores the intended ranking so the
    # size-N model really contains the N most frequent words.
    word_freqs = nltk.FreqDist(brown.words(categories='news')).most_common()
    words_by_freq = [word for (word, _) in word_freqs]
    cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='news'))
    sizes = 2 ** pylab.arange(15)
    perfs = [performance(cfd, words_by_freq[:size]) for size in sizes]
    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()
display()
from nltk.corpus import brown
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')

# Unigram tagger trained and scored on the same data (over-optimistic).
unigram_tagger = nltk.UnigramTagger(brown_tagged_sents)
res = unigram_tagger.tag(brown_sents[2007])
print(res)
res = unigram_tagger.evaluate(brown_tagged_sents)
print(res)

# Hold out the last 10% of sentences for honest evaluation.
size = int(len(brown_tagged_sents) * 0.9)
train_sents, test_sents = brown_tagged_sents[:size], brown_tagged_sents[size:]
unigram_tagger = nltk.UnigramTagger(train_sents)
res = unigram_tagger.evaluate(test_sents)
print(res)

# A bare bigram tagger is precise on seen contexts but tags None as soon
# as it hits an unseen word/context (sparse-data problem).
bigram_tagger = nltk.BigramTagger(train_sents)
res = bigram_tagger.tag(brown_sents[2007])
print(res)
unseen_sent = brown_sents[4203]
res = bigram_tagger.tag(unseen_sent)
print(res)
res = bigram_tagger.evaluate(test_sents)
print(res)

# Combine taggers: try bigram first, back off to unigram, then to 'NN'.
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
res = t2.evaluate(test_sents)
print(res)
from pickle import dump, load

# Persist the trained tagger, then reload it and tag fresh text.
# FIXES: use `with` so the file handles are closed even if dump/load
# raises, and stop shadowing the built-in input().
with open('t2.pkl', 'wb') as output:
    dump(t2, output, -1)  # -1: highest pickle protocol available
with open('t2.pkl', 'rb') as infile:
    tagger = load(infile)
text = """The board's action shows what free enterprise is up against in our complex maze of regulatory laws."""
tokens = text.split()
res = tagger.tag(tokens)
print(res)
# What fraction of tokens sit in an ambiguous trigram context, i.e. a
# (prev tag, prev tag, word) context seen with more than one tag?
cfd = nltk.ConditionalFreqDist(
    ((left[1], mid[1], right[0]), right[1])
    for sent in brown_tagged_sents
    for left, mid, right in nltk.trigrams(sent))
ambiguous_contexts = [ctx for ctx in cfd.conditions() if len(cfd[ctx]) > 1]
ambiguous_total = sum(cfd[ctx].N() for ctx in ambiguous_contexts)
res = ambiguous_total / cfd.N()
print(res)

# Confusion matrix of t2's output against the gold-standard tags.
test_tags = [tag for sent in brown.sents(categories='editorial')
             for (word, tag) in t2.tag(sent)]
gold_tags = [tag for (word, tag) in brown.tagged_words(categories='editorial')]
print(nltk.ConfusionMatrix(gold_tags, test_tags))
# Repeat the 90/10 backoff-chain experiment end-to-end.
brown_tagged_sents = brown.tagged_sents(categories='news')
brown_sents = brown.sents(categories='news')
size = int(len(brown_tagged_sents) * 0.9)
train_sents, test_sents = brown_tagged_sents[:size], brown_tagged_sents[size:]
t0 = nltk.DefaultTagger('NN')
t1 = nltk.UnigramTagger(train_sents, backoff=t0)
t2 = nltk.BigramTagger(train_sents, backoff=t1)
res = t2.evaluate(test_sents)
print(res)
# Brill-tagger rule templates (the 18-template set from the NLTK demo).
res = nltk.tag.brill.nltkdemo18()
print(res)