from __future__ import division
import nltk
from nltk.corpus import gutenberg, brown, state_union, swadesh, names, cmudict, udhr
from nltk.corpus import wordnet as wn
# 1
phrase = ["I", "like", "noodles", "."]
print(phrase+phrase)
print(phrase*3)
print(phrase[2])
print(phrase[:3])
print(sorted(phrase))
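# sorted() orders by code point, so punctuation and capitals come first:
# ['.', 'I', 'like', 'noodles']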
# 2
text = gutenberg.words('austen-persuasion.txt')
print('word tokens:', len(text))
print('word types:', len(set(text)))
# 3
print(brown.categories())
print(brown.words(categories='fiction'))
print(brown.words(categories='humor'))
# 4
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])  # condition on the word, count by year of address
    for fileid in state_union.fileids()
    for word in [w.lower() for w in state_union.words(fileid)]
    for target in ['men', 'women', 'people']
    if target == word
)
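# the plot draws one line per target word, tracking its count across the
# years of the State of the Union addresses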
cfd.plot()
# 5
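# meronyms are parts/members/substances of a concept;
# holonyms are the wholes that a concept belongs to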
print(wn.synsets('rabbit'))
print(wn.synset('rabbit.n.01').member_meronyms())
print(wn.synset('rabbit.n.01').part_meronyms())
print(wn.synset('rabbit.n.01').substance_meronyms())
print(wn.synset('rabbit.n.01').member_holonyms())
print(wn.synset('rabbit.n.01').part_holonyms())
print(wn.synset('rabbit.n.01').substance_holonyms())
paths = wn.synset('rabbit.n.01').hypernym_paths()
print(len(paths), paths[0])
print(wn.synsets('pig'))
print(wn.synset('hog.n.01').member_meronyms())
print(wn.synset('hog.n.01').part_meronyms())
print(wn.synset('hog.n.01').substance_meronyms())
print(wn.synset('hog.n.01').member_holonyms())
print(wn.synset('hog.n.01').part_holonyms())
print(wn.synset('hog.n.01').substance_holonyms())
paths = wn.synset('hog.n.01').hypernym_paths()
print(len(paths), paths[0], paths[1])
print(wn.synset('rabbit.n.01').lowest_common_hypernyms(wn.synset('hog.n.01')))
# 6
print(swadesh.fileids())
print(swadesh.words('en'))
en2fr = swadesh.entries(['en', 'fr'])
translate = dict(en2fr)
# add an entry of our own ('add_lml' is just a placeholder value)
translate['rabbit'] = 'add_lml'
print(translate['rabbit'])
try:
    # a word missing from the Swadesh list raises KeyError
    print(translate['pig'])
except KeyError:
    print('No match!')
# 7
emma = nltk.Text(gutenberg.words('austen-emma.txt'))
# concordance() and similar() print their results and return None,
# so there is no need to wrap them in print()
emma.concordance('However', width=50, lines=5)
emma.similar('However')
# 8
cfd = nltk.ConditionalFreqDist(
    (fileid[:-4], name[0])  # strip '.txt'; count initials per gender file
    for fileid in names.fileids()
    for name in names.words(fileid)
)
cfd.plot()
# 9
from nltk.book import text1, text2
text1.concordance('love', width=50, lines=5)
text2.concordance('love', width=50, lines=5)
text1.similar('love')
text2.similar('love')
# 10
freq1 = nltk.FreqDist(text1)
freq1.plot(50)
# how many of the most frequent word types cover a third of all tokens?
third, cum = len(text1) / 3, 0
for i, (word, count) in enumerate(freq1.most_common(), 1):
    cum += count
    if cum >= third:
        print(i)
        break
# 11
genres = ['news', 'romance']  # don't shadow the built-in name 'type'
emotions = ['like', 'dislike', 'hate', 'love', 'ignore', 'curse', 'enjoy']
cfd = nltk.ConditionalFreqDist(
    (genre, emotion)
    for genre in genres
    for word in brown.words(categories=genre)
    for emotion in emotions
    if emotion == word
)
cfd.plot()
# 12
entries = cmudict.entries()
print(len(entries))
# count pronunciations per word type (far faster than list.count in a loop)
pron_counts = nltk.FreqDist(word for word, pron in entries)
wordlist = set(pron_counts)
print(len(wordlist))
one_pron = [word for word in wordlist if pron_counts[word] == 1]
print(len(one_pron))
print(len(one_pron) / len(wordlist))
# fraction of words with more than one pronunciation
print(1 - len(one_pron) / len(wordlist))
# 13
# note the difference between a generator and a list: wn.all_synsets('n')
# returns a generator, which can only be consumed once
noun_num = len(list(wn.all_synsets('n')))
# call all_synsets() again here; reusing the exhausted generator would give 0
no_hypo = [ss for ss in wn.all_synsets('n') if not ss.hyponyms()]
noun_no_hypo = len(no_hypo)
print(noun_num, noun_no_hypo, noun_no_hypo / noun_num)
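# a minimal illustration of the generator pitfall above (not part of the exercise):
gen = (x for x in range(3))
print(len(list(gen)))  # 3
print(len(list(gen)))  # 0 -- the generator is already exhausted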
# 14
def supergloss(synset):
    # build one string with the definitions of the synset, its hypernyms
    # and its hyponyms (don't shadow the built-in name 'str')
    gloss = ''
    for hyper in synset.hypernyms():
        gloss += hyper.definition() + '\n'
    gloss += '\n\n' + synset.definition() + '\n\n\n'
    for hypo in synset.hyponyms():
        gloss += hypo.definition() + '\n'
    return gloss

print(supergloss(wn.synset('car.n.01')))
# 15
fdist = nltk.FreqDist(brown.words())
# word *types* occurring at least three times (iterating over brown.words()
# would list every qualifying word once per occurrence)
lst = [w for w in fdist if fdist[w] >= 3]
print(len(lst), lst[:5])
# 16
def tokens(text):
    return len(text)

def types(text):
    return len(set(text))

def lexical_diversity(tokens_num, types_num):
    return tokens_num / types_num

import prettytable as pt  # third-party: pip install prettytable

tb = pt.PrettyTable(["Genre", "Tokens", "Types", "Lexical diversity"])
for genre in brown.categories():
    text = brown.words(categories=genre)
    tok = tokens(text)
    typ = types(text)
    div = lexical_diversity(tok, typ)
    tb.add_row([genre, tok, typ, round(div, 1)])
print(tb)
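# note: tokens/types grows with repetition, so a larger value in the last
# column means *lower* lexical diversity (each type is reused more often)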
# 17
from nltk.corpus import stopwords
def most_common(text):
    stop = set(stopwords.words('english'))  # build the set once, not per token
    fdist1 = nltk.FreqDist(w for w in text if w.lower() not in stop)
    print(fdist1.most_common(50))
    fdist1.plot(50)

most_common(gutenberg.words('austen-emma.txt'))
# 18
text = gutenberg.words('austen-emma.txt')
stop = set(stopwords.words('english'))
# omit bigrams that contain a stopword; removing the stopwords first would
# create bigrams of words that were never actually adjacent
bigrams = [b for b in nltk.bigrams(text) if not any(w.lower() in stop for w in b)]
fdist = nltk.FreqDist(bigrams)
print(fdist.most_common(50))
fdist.plot(50)
# 19 (similar to 11)
emotions = ['like', 'hate', 'love', 'mad', 'fear', 'sad', 'happy']
cfd = nltk.ConditionalFreqDist(
    (genre, emotion)
    for genre in brown.categories()
    for word in brown.words(categories=genre)
    for emotion in emotions
    if emotion == word
)
cfd.plot()
cfd.tabulate()
# 20
def word_freq(word, section):
    # frequency (%) of `word` within one section of the Brown corpus
    text = brown.words(categories=section)
    print(100 * text.count(word) / len(text))

word_freq('like', 'romance')
# 21
def syllables_num(text):
    # cmudict.dict() maps each word to a list of pronunciations; in the
    # ARPAbet transcription every vowel phone ends in a stress digit, so
    # counting digit-final phones counts syllables (first pronunciation used)
    prondict = cmudict.dict()
    count = 0
    for w in text:
        w = w.lower()
        if w in prondict:
            count += len([ph for ph in prondict[w][0] if ph[-1].isdigit()])
    return count

print(syllables_num(gutenberg.words('austen-emma.txt')))
# 22
import numpy as np  # only needed for the commented-out alternative below

def hedge(text):
    # insert the word "like" after every third word
    new_version = []
    for i, word in enumerate(text, 1):
        new_version.append(word)
        if i % 3 == 0:
            new_version.append("like")
    # another method, using slicing:
    # i = 0
    # while (i + 3) <= len(text):
    #     new_version = np.concatenate((new_version, text[i:i+3], ["like"]))
    #     i += 3
    # if i < len(text):
    #     new_version = np.concatenate((new_version, text[i:]))
    return new_version
print(hedge(['I', 'love', 'rabbit', 'and', 'peggy']))
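# expected output: ['I', 'love', 'rabbit', 'like', 'and', 'peggy']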
# 23
import random
import re
import matplotlib.pyplot as plt
def zipf(text, outname):
    fdist = nltk.FreqDist(w.lower() for w in text if w.isalpha())
    # word types sorted by frequency, most frequent first
    sorted_words = sorted(fdist, key=fdist.get, reverse=True)
    with open(outname, 'w') as outfile:
        for w in sorted_words:
            outfile.write("%s\t%d\n" % (w, fdist[w]))
    # Zipf's law: frequency vs. rank is roughly linear on log-log axes
    plt.loglog(range(1, len(sorted_words) + 1), [fdist[w] for w in sorted_words])
    plt.xlabel('rank')
    plt.ylabel('frequency')
    plt.show()

zipf(brown.words(), '2_23_1.txt')
# repeat the experiment on randomly generated text
text = ''.join(random.choice("abcdefg ") for i in range(500000))
word_li = re.split(r"\s+", text)  # split on whitespace
zipf(word_li, '2_23_2.txt')
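# the random "words" also show a roughly Zipfian rank-frequency curve
# (Miller's random-typing argument), suggesting the law by itself says
# little about linguistic structure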
# 24
def word_selection(n, text):
    fdist = nltk.FreqDist(text)
    return fdist.most_common(n)

def generate_model(text, word, num=20):
    cfd = nltk.ConditionalFreqDist(nltk.bigrams(text))
    for i in range(num):
        print(word, end=' ')
        # max() always picks the most likely successor, so the output is
        # deterministic and tends to fall into a repeating loop
        word = cfd[word].max()

text = brown.words(categories='fiction')
words = [w for w, num in word_selection(n=10, text=text)]
print(words)
word = random.choice(words)  # random start among the 10 most common words
print(word)
generate_model(text, word, 50)
# 25
def find_language(string):
    # UDHR samples available in Latin-1 whose word list contains the string
    languages = [lang for lang in udhr.fileids() if '-Latin1' in lang]
    return [lang for lang in languages if string in udhr.words(lang)]
print(find_language('I'))
# 26
branch_sum = 0
synset_num = len(list(wn.all_synsets('n')))
branch_list = []
print(synset_num)
for synset in wn.all_synsets('n'):
    branch_num = len(synset.hyponyms())  # hyponyms() already returns a list
    branch_list.append(branch_num)
    branch_sum += branch_num
print(branch_sum, synset_num, branch_sum / synset_num)
# print(branch_list)  # one count per synset (~80k values); uncomment to inspect
fdist = nltk.FreqDist(branch_list)
fdist.plot()
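# note: the average above counts leaf synsets (zero hyponyms); excluding
# them would give the branching factor of internal nodes only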
# 27
noun_set = set([x.name().split(".")[0] for x in wn.all_synsets('n')])
noun_num = len(noun_set)
sem_sum = 0
sem_list = []
for word in noun_set:
    sem_num = len(wn.synsets(word, 'n'))
    sem_sum += sem_num
    sem_list.append(sem_num)
print(sem_sum, noun_num, sem_sum/noun_num)
fdist = nltk.FreqDist(sem_list)
fdist.plot()
# 28
# Hint: the similarity of a pair should be represented by the similarity of the most similar pair of synsets they have.
def similarity(pair):
    word1, word2 = pair.split('-')
    max_sim = 0.0
    for s1 in wn.synsets(word1):
        for s2 in wn.synsets(word2):
            cur_sim = s1.path_similarity(s2)  # None if no connecting path
            if cur_sim is not None and cur_sim > max_sim:
                max_sim = cur_sim
    return max_sim
pairs = ["car-automobile", "gem-jewel", "journey-voyage", "boy-lad", "coast-shore", "asylum-madhouse",
"magician-wizard", "midday-noon", "furnace-stove", "food-fruit", "bird-cock", "bird-crane",
"tool-implement", "brother-monk", "lad-brother", "crane-implement", "journey-car", "monk-oracle",
"cemetery-woodland", "food-rooster", "coast-hill", "forest-graveyard", "shore-woodland", "monk-slave",
"coast-forest", "lad-wizard", "chord-smile", "glass-magician", "rooster-voyage", "noon-string"]
sim_list = [similarity(p) for p in pairs]
print(sim_list)
# sort using the scores already computed instead of calling similarity() again
sim_by_pair = dict(zip(pairs, sim_list))
sorted_pairs = sorted(pairs, key=sim_by_pair.get, reverse=True)
print(sorted_pairs)
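# the pairs are given in decreasing order of human similarity judgments,
# so a good measure should roughly reproduce that ordering in sorted_pairs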