《Python自然语言处理》第二章练习题答案

最新推荐文章于 2024-03-05 08:19:49 发布

heize19

最新推荐文章于 2024-03-05 08:19:49 发布

阅读量822

点赞数 1

分类专栏：自然语言处理　文章标签：python 自然语言处理 nltk

本文链接：https://blog.csdn.net/qq_44715621/article/details/115013831

版权

自然语言处理专栏收录该内容

4 篇文章 0 订阅

订阅专栏

第二章
最近要学 NLTK，这本书的练习题出得很好，自己写下来锻炼一下。
这一章主要涉及词频统计和画图。
2

from nltk.corpus import gutenberg
# Fetch the word sequence of 'austen-persuasion.txt' once and reuse it
# for both measurements below.
persuasion_words = gutenberg.words('austen-persuasion.txt')
len(persuasion_words)       # number of word tokens
len(set(persuasion_words))  # number of distinct word types

from nltk.corpus import brown
# Access the Brown corpus words that belong to two categories at once.
genres = ['news', 'editorial']
brown.words(categories=genres)

import nltk
from nltk.corpus import state_union

# For each target prefix, count the tokens that start with it, grouped by
# the first four characters of the file id (presumably the year of the
# address -- confirm against state_union.fileids()).
# Matching is by prefix, so e.g. "mention" is also counted under "men".
targets = ['men', 'women', 'people']
cfd = nltk.ConditionalFreqDist(
    (target, fileid[:4])
    for fileid in state_union.fileids()
    for w in state_union.words(fileid)
    for target in targets
    if w.lower().startswith(target)
)
# The distribution must be bound to the same name that is plotted; binding
# it to a differently spelled name makes this line fail or plot stale data.
cfd.plot()

names = nltk.corpus.names
# One (file id, first letter) pair per name; the file id is the condition
# and the initial letter is the counted sample.
initial_pairs = (
    (fid, entry[0])
    for fid in names.fileids()
    for entry in names.words(fid)
)
cfd = nltk.ConditionalFreqDist(initial_pairs)
cfd.plot()

from nltk.corpus import brown
fd = nltk.FreqDist(brown.words())
# The exercise asks for words occurring at least three times, so the
# threshold is inclusive. most_common() already yields (word, count)
# pairs, so the count is used directly instead of a second lookup.
print([w for w, count in fd.most_common() if count >= 3])

# Print the token/type ratio (lexical diversity) of every Brown category.
for cate in brown.categories():
    cate_words = brown.words(categories=cate)
    print(cate, ":", len(cate_words) / len(set(cate_words)))

from nltk.corpus import stopwords
# The stopword file id is lowercase; a capitalised id only resolves on
# case-insensitive file systems. A set keeps the membership test cheap
# when filtering every token of the corpus.
sw = set(stopwords.words('english'))
fd = nltk.FreqDist(w for w in brown.words() if w.lower() not in sw)
# The 50 most frequent non-stopword tokens (words only, counts dropped).
[w for (w, _) in fd.most_common(50)]
# The same ranking with counts kept, as (word, count) pairs.
li = list(fd.most_common())
li[:50]

# Count adjacent word pairs in which neither word is a stopword.
# Both words are lowercased before the test, the same way the single-word
# stopword filter in this file does, so capitalised stopwords such as
# "The" are excluded as well.
fd = nltk.FreqDist(
    (w1, w2)
    for (w1, w2) in nltk.bigrams(brown.words())
    if w1.lower() not in sw and w2.lower() not in sw
)
# The 50 most frequent remaining bigrams (pairs only, counts dropped).
[w for (w, _) in fd.most_common(50)]
# Index 1 is the second-ranked (bigram, count) entry.
fd.most_common()[1]

import re
def cate_count_word(text):
    """Print two entries of the word-frequency ranking for each category.

    For every category of ``text``, tokens are kept only if they contain
    at least one ASCII letter and their lowercase form is not in the
    module-level ``sw`` collection. The kept tokens are ranked by
    frequency, and the entry at index 1 (second most frequent) and the
    last entry are printed as (word, count) pairs.

    Args:
        text: A corpus reader exposing ``categories()`` and
            ``words(categories=...)``.

    Raises:
        IndexError: If a category has fewer than two distinct kept tokens.
    """
    for cate in text.categories():
        kept = []
        for token in text.words(categories=cate):
            if not re.search(r'[A-Za-z]+', token):
                continue
            if token.lower() in sw:
                continue
            kept.append(token)
        ranked = nltk.FreqDist(kept).most_common()
        print(cate, ":", ranked[1], ranked[-1])
cate_count_word(brown)

from nltk.corpus import gutenberg
from nltk.text import Text
def word_freq(text,word):
    """Return the relative frequency of ``word`` in a Gutenberg text.

    Args:
        text: File id of a text in the Gutenberg corpus, e.g.
            'austen-emma.txt'.
        word: Token to count; compared by exact (case-sensitive) equality.

    Returns:
        The number of occurrences of ``word`` divided by the total number
        of tokens in the text, or 0.0 if the text has no tokens.
    """
    tokens = Text(gutenberg.words(text))
    # Normalise by the number of tokens in the text; ``len(text)`` would
    # be the length of the file-name string, not the size of the text.
    total = len(tokens)
    if total == 0:
        return 0.0
    return tokens.count(word) / total
word_freq('austen-emma.txt','will')

%matplotlib inline
from nltk.corpus import PlaintextCorpusReader
corpus_root = r'root'
wordlists = PlaintextCorpusReader(corpus_root,'.*.txt')
fdist = nltk.FreqDist(wordlists.words('nkw_all.txt'))
common = fdist.most_common()
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] 
plt.xticks(rotation=270)
x = [i for i in range(1,151)]
y = [w[1] for w in common][:150]
plt.plot(x,y)
plt.show()

import random
import matplotlib.pyplot as plt

# Generate 100000 random characters drawn from seven letters and the
# space, joined in one pass rather than by repeated string concatenation.
word_li = ''.join(random.choice('abcdefg ') for _ in range(100000))
# split() without an argument treats a run of spaces as one separator, so
# consecutive spaces do not produce empty-string "words" that would
# otherwise be counted in the frequency distribution.
words = word_li.split()
fdist = nltk.FreqDist(words)
common = fdist.most_common()
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.xticks(rotation=270)
# Plot frequency against rank for at most the 150 most frequent words;
# the ranks are derived from the data so x and y always match in length.
y = [count for _, count in common[:150]]
x = list(range(1, len(y) + 1))
plt.plot(x, y)
plt.show()