《Python自然语言处理》第二章练习题答案

第二章
最近要学 NLTK,这本书的练习题出得很好,自己把答案写下来锻炼一下。
这一章主要涉及词频统计和画图。
2

from nltk.corpus import gutenberg

# Exercise 2: token count and type count of Austen's "Persuasion".
# The token view is fetched once and reused for both measurements.
persuasion_tokens = gutenberg.words('austen-persuasion.txt')
len(persuasion_tokens)        # number of word tokens
len(set(persuasion_tokens))   # number of distinct word types

3

from nltk.corpus import brown

# Exercise 3: access sample text from two Brown corpus genres.
genres = ['news', 'editorial']
brown.words(categories=genres)

4

import nltk
from nltk.corpus import state_union

# Exercise 4: count occurrences of "men", "women" and "people" in each
# State of the Union address and plot them over time.
#
# Conditions are the target words; samples are the first four characters of
# the file id (the year of the address).
#
# The words are matched exactly (case-insensitively). A prefix test such as
# startswith('men') would also count unrelated words like "mention" or
# "mental" and inflate the "men" curve.
targets = {'men', 'women', 'people'}
cfd = nltk.ConditionalFreqDist(
    (w.lower(), fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    if w.lower() in targets
)
# The distribution must be plotted under the same name it was assigned to.
cfd.plot()

8

# Exercise 8: conditional frequency distribution over the Names corpus.
# Conditions are the file ids of the corpus; samples are the first letter
# of each name in that file.
names = nltk.corpus.names
initial_pairs = (
    (fileid, name[0])
    for fileid in names.fileids()
    for name in names.words(fileid)
)
cfd = nltk.ConditionalFreqDist(initial_pairs)
cfd.plot()

15

from nltk.corpus import brown

# Exercise 15: find all words that occur at least three times in the
# Brown corpus.
fd = nltk.FreqDist(brown.words())
# "At least three times" is count >= 3; a strict "> 3" would silently drop
# every word that occurs exactly three times. The count comes straight from
# most_common(), so no second lookup in fd is needed.
print([w for (w, count) in fd.most_common() if count >= 3])

16

# Exercise 16: token/type ratio (lexical diversity) for every Brown genre.
for genre in brown.categories():
    genre_tokens = list(brown.words(categories=genre))
    genre_types = set(genre_tokens)
    word_num = len(genre_tokens)
    type_num = len(genre_types)
    print(genre, ":", word_num / type_num)

17

from nltk.corpus import stopwords

# Exercise 17: the 50 most frequent words of a text that are not stopwords.
#
# The stopword list is stored under the lowercase file id 'english';
# 'English' only resolves on case-insensitive filesystems.
# A set makes the per-token membership test O(1) instead of scanning a list
# for each of the roughly one million Brown tokens.
sw = set(stopwords.words('english'))
fd = nltk.FreqDist(w for w in brown.words() if w.lower() not in sw)
# Ask for the top 50 directly instead of sorting the full distribution twice.
li = fd.most_common(50)
[w for (w, _) in li]   # the 50 words only
li                     # the same 50 entries as (word, count) pairs

18

# Exercise 18: the 50 most frequent bigrams that contain no stopword.
#
# The stopword list is lowercase, so both words are lowercased before the
# membership test; otherwise capitalised stopwords ("The", "In", ...) slip
# through the filter. The bigram itself keeps its original casing.
fd = nltk.FreqDist(
    (w1, w2)
    for (w1, w2) in nltk.bigrams(brown.words())
    if w1.lower() not in sw and w2.lower() not in sw
)
[pair for (pair, _) in fd.most_common(50)]
# Inspect the second-ranked bigram together with its count.
fd.most_common()[1]

19

import re
def cate_count_word(text):
    """Print, per category of ``text``, two entries of its word ranking.

    For every category returned by ``text.categories()`` the words that
    contain at least one ASCII letter and whose lowercase form is not in
    the module-level stopword collection ``sw`` are counted. The entry at
    index 1 and the last entry of the frequency ranking are printed.

    Relies on ``nltk`` and ``sw`` being defined at module level.
    """
    for cate in text.categories():
        kept = []
        for w in text.words(categories=cate):
            if not re.search(r'[A-Za-z]+', w):
                continue  # punctuation / digits only
            if w.lower() in sw:
                continue  # stopword
            kept.append(w)
        ranking = nltk.FreqDist(kept).most_common()
        print(cate, ":", ranking[1], ranking[-1])
cate_count_word(brown)

20

from nltk.corpus import gutenberg
from nltk.text import Text
def word_freq(text, word):
    """Return the relative frequency of ``word`` in a Gutenberg corpus file.

    Args:
        text: File id of a Gutenberg corpus text, e.g. ``'austen-emma.txt'``.
        word: The token to count (exact, case-sensitive match).

    Returns:
        Occurrences of ``word`` divided by the total number of tokens in the
        file, or ``0.0`` when the file contains no tokens.
    """
    tokens = Text(gutenberg.words(text))
    # The denominator must be the number of tokens in the text. ``text`` is
    # only the file-id string, so len(text) would be the length of the file
    # name rather than the size of the corpus file.
    total = len(tokens)
    if total == 0:
        return 0.0
    return tokens.count(word) / total
word_freq('austen-emma.txt', 'will')

23

%matplotlib inline
import matplotlib.pyplot as plt
from nltk.corpus import PlaintextCorpusReader

# Exercise 23 (part 1): plot word frequency against rank for a large text.
corpus_root = r'root'
# The file-id pattern is a regular expression, so the dot before "txt" is
# escaped; otherwise it matches any character.
wordlists = PlaintextCorpusReader(corpus_root, r'.*\.txt')
fdist = nltk.FreqDist(wordlists.words('nkw_all.txt'))
common = fdist.most_common()
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.xticks(rotation=270)
# Frequencies of the (at most) 150 top-ranked words. The rank axis is derived
# from the actual number of points so that x and y always have equal length,
# even when the text has fewer than 150 distinct words.
y = [count for (_, count) in common[:150]]
x = list(range(1, len(y) + 1))
plt.plot(x, y)
plt.show()
import random

import matplotlib.pyplot as plt

# Exercise 23 (part 2): the same rank/frequency plot for random text drawn
# from the characters "abcdefg" plus the space character.
#
# The text is built with a single join instead of repeated string
# concatenation inside the loop.
word_li = ''.join(random.choice('abcdefg ') for _ in range(100000))
# split() without an argument collapses runs of spaces. split(' ') would
# produce an empty string for every pair of adjacent spaces, and that empty
# "word" would then be counted in the distribution.
words = word_li.split()
fdist = nltk.FreqDist(words)
common = fdist.most_common()
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.xticks(rotation=270)
# Keep x and y the same length even if fewer than 150 distinct words occur.
y = [count for (_, count) in common[:150]]
x = list(range(1, len(y) + 1))
plt.plot(x, y)
plt.show()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值