--------------------------------------------------------------------------------------------------------
P58: Working with the news and romance genres of the Brown Corpus, find out which days of the week are the most newsworthy and which are the most romantic.
import nltk
from nltk.corpus import brown
days=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']
cfd=nltk.ConditionalFreqDist(
    (genre,word)
    for genre in ['romance','news']
    for word in brown.words(categories=genre))
cfd.tabulate(conditions=['romance','news'],samples=days)
2. Use the corpus module to process austen-persuasion.txt. How many word tokens does this book contain? How many word types?
from nltk.corpus import gutenberg
austen=gutenberg.words('austen-persuasion.txt')
len(austen)                                          # word tokens
len(set(s.lower() for s in austen if s.isalpha()))   # word types (alphabetic, case-folded)
3. Use the Brown corpus reader or the webtext corpus reader to access some sample text in two different genres.
from nltk.corpus import brown
nltk.corpus.brown.categories()
romance=brown.words(categories='romance')
news=brown.words(categories='news')
from nltk.corpus import webtext
nltk.corpus.webtext.fileids()
grail=webtext.words('grail.txt')
4. Use the state_union corpus reader to access the State of the Union addresses. Count occurrences of men, women, and people in each document. What has happened to the usage of these words over time?
import nltk
from nltk.corpus import state_union
cfd=nltk.ConditionalFreqDist(
    (target,fileid[:4])
    for fileid in state_union.fileids()
    for target in state_union.words(fileid) if target in ['men','women','people'])
cfd.plot()  # figure: chapter 2, exercise 4
The second way (note: this only looks at a single address, so it cannot show change over time):
import nltk
from nltk.corpus import state_union
fdist=nltk.FreqDist(state_union.words('2006-GWBush.txt'))
cfd=nltk.ConditionalFreqDist(
    (word,fdist[word])
    for word in state_union.words('2006-GWBush.txt')
    if word in ['men','women','people'])
cfd.tabulate()
>>>from nltk.corpus import wordnet as wn
>>>wn.synsets('smile')  # synsets
>>>smile=wn.synset('smile.n.01')
>>>smile
Synset('smile.n.01')
>>>smile.member_meronyms()
[]
>>>wn.synset('smile.n.01').definition()
'a facial expression characterized by turning up the corners of the mouth; usually shows pleasure or amusement'
>>>smile.hyponyms()  # hyponyms
[Synset('simper.n.01'), Synset('smirk.n.01')]
>>>smile.hypernyms()  # hypernyms
[Synset('facial_expression.n.01')]
>>>smile.part_meronyms()  # part meronyms
[]
>>>smile.substance_meronyms()  # substance meronyms
[]
>>>smile.member_holonyms()  # member holonyms
[]
#code from the book
from nltk.corpus import swadesh  # Swadesh core vocabulary lists
fr2en=swadesh.entries(['fr','en'])
translate=dict(fr2en)
translate['chien']  # prints 'dog'
If you look up a misspelled word or a word that is not in the dictionary, a KeyError is raised. You should add error handling, or test membership with an if statement before calling translate, as sketched below.
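A minimal sketch of both options, assuming the translate dict built above; the lookup word 'chat' is just an illustrative example:
word='chat'
# Option 1: test membership before looking up
if word in translate:
    print(translate[word])
else:
    print(word, 'is not in the French-English dictionary')
# Option 2: use dict.get() with a default instead of letting KeyError propagate
print(translate.get(word, 'UNKNOWN'))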
7. According to Strunk and White's Elements of Style, the word "however", used at the start of a sentence, means "in whatever way" or "to whatever extent", and not "nevertheless". An example of correct usage: However you advise him, he will probably do as he thinks best. Use the concordance tool to study actual usage of this word in various texts.
however=nltk.Text(gutenberg.words('austen-persuasion.txt'))
however.concordance("However")  # adapted from a reference solution
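A sketch that extends the same check to a few more Gutenberg texts (the file list is an arbitrary sample):
import nltk
from nltk.corpus import gutenberg
for fileid in ['austen-emma.txt','melville-moby_dick.txt','chesterton-thursday.txt']:
    print(fileid)
    nltk.Text(gutenberg.words(fileid)).concordance('However', lines=5)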
8. Define a conditional frequency distribution over the Names corpus that shows which initial letters are more frequent for males vs. females.
import nltk
from nltk.corpus import names
cfd=nltk.ConditionalFreqDist(
    (fileid,name[0])
    for fileid in names.fileids()
    for name in names.words(fileid))
cfd.plot()  # figure: chapter 2, exercise 8
10. Read the BBC News article "UK's Vicky Pollards 'left behind'". For a variety of large texts, how many word types account for a third of all word tokens? What conclusion do you draw from this statistic?
import nltk
from nltk.corpus import gutenberg
gutenberg.fileids()
austen=gutenberg.words('austen-persuasion.txt')
len(set(austen))
len(austen)
austen=nltk.Text(nltk.corpus.gutenberg.words('austen-persuasion.txt'))
austen.concordance("dislike")
emma=gutenberg.words('austen-emma.txt')
len(set(emma))
len(emma)
emma_text=nltk.Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma_text.concordance("dislike")
from nltk.book import *
fdist=FreqDist(text1)
from __future__ import division
counts=len(text1)/3
[(w,fdist[w]) for w in text1 if fdist[w]>counts]   # word types whose individual count exceeds a third of all tokens
What is a pattern distribution table?
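The list comprehension above only finds word types whose individual count exceeds a third of the tokens; a sketch of the cumulative count the exercise actually asks for, using text1 as the sample text:
from nltk.book import text1
from nltk import FreqDist
fdist=FreqDist(text1)
third=len(text1)/3
running,n_types=0,0
for word,count in fdist.most_common():   # types in decreasing order of frequency
    running+=count
    n_types+=1
    if running>=third:
        break
print(n_types,'word types account for a third of the',len(text1),'tokens')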
12. The CMU Pronouncing Dictionary contains multiple pronunciations for certain words. How many distinct words does it contain? What fraction of the words in this dictionary have more than one possible pronunciation?
entries=nltk.corpus.cmudict.entries()
s=' '
for word,pron in entries:
    s=s+' '+word
s=nltk.word_tokenize(s)
len(set(s))   # number of distinct words
############################################################
The second way:
entries=nltk.corpus.cmudict.entries()
words=[w for w,pron in entries]
len(set(words))                     # number of distinct words
fdist=nltk.FreqDist(words)          # one entry per pronunciation, so a word's count = its number of pronunciations
multi=[w for w in fdist if fdist[w]>1]
len(multi)*1.0/len(fdist)           # fraction of words with more than one pronunciation
13. What percentage of noun synsets have no hyponyms?
import nltk
from nltk.corpus import wordnet as wn
all_noun_dict = wn.all_synsets('n')
all_noun_num = len(set(all_noun_dict))
noun_no_hypon = filter(lambda ss: len(ss.hyponyms()) == 0, wn.all_synsets('n'))
noun_no_hypon_num = len(list(noun_no_hypon))
print('There are %d nouns, and %d nouns without hyponyms, the percentage is %f' %
      (all_noun_num, noun_no_hypon_num, noun_no_hypon_num * 1.0 / all_noun_num * 100))
#answer
There are 82115 nouns, and 65422 nouns without hyponyms, the percentage is 79.671193
14. Define a function supergloss(s) that takes a synset s as its argument and returns a string consisting of the definition of s concatenated with the definitions of all of s's hypernyms and hyponyms.
from nltk.corpus import wordnet as wn
def supergloss(s):
    result=s.definition()
    for sup in s.hypernyms():
        result=result+'; '+sup.definition()
    for sub in s.hyponyms():
        result=result+'; '+sub.definition()
    return result
smile=wn.synset('smile.n.01')
print(supergloss(smile))
15. Write a program to find all words that occur at least three times in the Brown Corpus.
from nltk.corpus import brown
fdist=FreqDist(brown.words())
If you get "FreqDist is not defined", run from nltk import * (or use nltk.FreqDist).
Method 1 (drawback: far too much output):
for word in brown.words():
    if fdist[word]>=3:
        print(word)          # or accumulate into a string: s=s+' '+word
Method 2 (drawback: very slow to run, and the plot may hang the interpreter):
cfd=nltk.ConditionalFreqDist(
    (target,fdist[target])
    for target in brown.words() if fdist[target]>=3)
cfd.plot()
Method 3:
words=[w for w in brown.words() if brown.words().count(w)>=3]   # very slow: count() rescans the corpus for every token
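A sketch of a faster variant: build the FreqDist once and filter its keys instead of rescanning the corpus for every token (the case folding is an extra assumption):
import nltk
from nltk.corpus import brown
fdist=nltk.FreqDist(w.lower() for w in brown.words())
frequent=[w for w in fdist if fdist[w]>=3]   # every word type occurring at least three times
len(frequent)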
16. Generate a table of lexical diversity scores (i.e. token/type ratios) for the full set of Brown Corpus genres.
import nltk
from nltk.corpus import brown
def percent(category):
    text=brown.words(categories=category)
    words=[w.lower() for w in text]
    return len(words)*1.0/len(set(words))
cfd=nltk.ConditionalFreqDist(
    (percent(genre),genre)
    for genre in brown.categories())
cfd.tabulate()   # note: the score ends up as the condition label, so the second way below prints a cleaner table
The second way:
from nltk.corpus import brown
def word_diversity(words):
    words = [w.lower() for w in words]
    return len(words)*1.0/len(set(words))
def main():
    for category in brown.categories():
        diversity_sent = word_diversity(brown.words(categories=category))
        print("%s\t%.2f" % (category, diversity_sent))
if __name__ == "__main__":
    main()
17. Write a function to find the 50 most frequently occurring words of a text, excluding stopwords.
import nltk
from nltk.corpus import gutenberg
stopwords=nltk.corpus.stopwords.words('english')
emma=[word for word in gutenberg.words('austen-emma.txt') if word.lower() not in stopwords]
fdist=FreqDist(emma)
fdist.plot(50,cumulative=True)  # figure: chapter 2, exercise 17-1
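The exercise asks for a function; a minimal sketch wrapping the same idea, which additionally drops punctuation (the isalpha() filter is an assumption, not part of the original code):
import nltk
def top50_content_words(words):
    stopwords=set(nltk.corpus.stopwords.words('english'))
    fdist=nltk.FreqDist(w.lower() for w in words
                        if w.isalpha() and w.lower() not in stopwords)
    return [w for w,count in fdist.most_common(50)]
top50_content_words(gutenberg.words('austen-emma.txt'))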
18. Write a function that finds the 50 most frequently occurring bigrams of a text, omitting bigrams that contain stopwords.
import nltk
from nltk.corpus import gutenberg
stopwords=nltk.corpus.stopwords.words('english')
emma=[word for word in gutenberg.words('austen-emma.txt') if word.lower() not in stopwords]
bigrams=nltk.bigrams(emma)
fdist=FreqDist(bigrams)
fdist.plot(50,cumulative=True)  # figure: chapter 2, exercise 18
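Removing stopwords before pairing creates bigrams of words that were never adjacent in the text; a sketch that pairs first and then drops any bigram containing a stopword or punctuation:
import nltk
def top50_bigrams(words):
    stopwords=set(nltk.corpus.stopwords.words('english'))
    pairs=[(a,b) for a,b in nltk.bigrams(w.lower() for w in words)
           if a.isalpha() and b.isalpha() and a not in stopwords and b not in stopwords]
    return nltk.FreqDist(pairs).most_common(50)
top50_bigrams(gutenberg.words('austen-emma.txt'))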
19. Write a program to create a table of word frequencies by genre, like the one given for modals; choose your own words and try to find words whose presence (or absence) is typical of a genre.
import nltk
from nltk.corpus import brown
cfd=nltk.ConditionalFreqDist(
    (genre,word)
    for genre in brown.categories()
    for word in brown.words(categories=genre))
genres=['news','religion','hobbies','science_fiction','romance','humor']
modals=['can','could','may','should','will','would']
cfd.tabulate(conditions=genres,samples=modals)
20. Write a function that takes a word and the name of a section of the Brown Corpus as arguments, and computes the frequency of the word in that section of the corpus.
def freq(word,category):
    text=nltk.Text(brown.words(categories=category))
    return 1.0*text.count(word)/len(text)
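For example (the word and genre here are arbitrary choices):
freq('love','romance')   # relative frequency of 'love' in the romance genre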
21. Write a program to guess the number of syllables contained in a text, making use of the CMU Pronouncing Dictionary.
from nltk.corpus import cmudict
from nltk.book import *
def musiccounts(text):
    prondict=cmudict.dict()   # word -> list of pronunciations (each a list of phones)
    count=0
    for w in text:
        if w.lower() in prondict:
            # vowel phones end in a stress digit, so counting them approximates the syllable count
            count+=len([ph for ph in prondict[w.lower()][0] if ph[-1].isdigit()])
    return count
22. Define a function hedge(text) that processes a text and produces a new version with the word 'like' between every third word.
def hedge(text):
    i=0
    new_text=[]
    while i<len(text):
        new_text+=text[i:i+3]+['like']
        i=i+3
    return nltk.Text(new_text)
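A quick check on a short slice of text1 from nltk.book:
from nltk.book import text1
hedge(text1[:9])   # 'like' is inserted after every third word of the slice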
23. Zipf's Law.
a. Write a function to process a large text and plot word frequency against word rank using pylab.plot. Do you confirm Zipf's law? (Hint: it helps to use a logarithmic scale.) What is going on at the extreme ends of the plotted line?
import nltk
import pylab
from nltk.book import text1
def zipf_plot(text):
    fdist=nltk.FreqDist(w.lower() for w in text if w.isalpha())
    freqs=[count for word,count in fdist.most_common()]   # frequencies in rank order
    pylab.loglog(range(1,len(freqs)+1),freqs)             # log-log scale: Zipf's law predicts a straight line
    pylab.xlabel('rank'); pylab.ylabel('frequency')
    pylab.show()
zipf_plot(text1)
b. Generate random text, e.g. using random.choice("abcdefg "), taking care to include the space character. Tokenize this string, generate the Zipf plot as before, and compare the two plots. What do you make of Zipf's Law in the light of this?
import random
# build a long random character string over "abcdefg " (the space is needed so it splits into "words")
chars=''.join(random.choice('abcdefg ') for _ in range(100000))
words=chars.split()
fdist=nltk.FreqDist(words)
pylab.loglog(range(1,len(fdist)+1),[count for w,count in fdist.most_common()])
pylab.xlabel('rank'); pylab.ylabel('frequency')
pylab.show()
24.
a. Store the n most likely words in a list, and use random.choice() to pick one word from the list at random.
import random
words=['ramdom','rando','ramdon','ramdon']   # placeholder list of candidate words
word=random.choice(words)