import urllib
import urllib.request

import nltk
from bs4 import BeautifulSoup
# Download the raw HTML of the page. The context manager makes sure the
# HTTP response is closed as soon as its body has been read.
with urllib.request.urlopen('http://python.org/') as response:
    # read() returns the entire response body.
    html = response.read()

# Strip the markup and keep only the visible text. BeautifulSoup is used
# here in place of the older nltk.clean_html() helper.
clean = BeautifulSoup(html, "lxml").get_text()

# Split the text into word tokens and preview the first 100 of them.
tokens = nltk.word_tokenize(clean)
print(tokens[:100])

# Count how often each token occurs.
Freq_dist_nltk = nltk.FreqDist(tokens)
print(Freq_dist_nltk)
for k, v in Freq_dist_nltk.items():
    print(f"{k}:{v}")

# Plot the frequency distribution of the 50 most frequent tokens.
Freq_dist_nltk.plot(50, cumulative=False)
# ---------------------------------------------------------------------------
# Stop-word filtering
# ---------------------------------------------------------------------------
# Manually entered stop words. They are lowercased because every token is
# lowercased before the membership test below; a capitalised entry such as
# 'Welcome' would otherwise never match anything.
manual_stopwords = ['Welcome', '@', 'http', 'and', 'of']
stopwords = {word.lower() for word in manual_stopwords}

# Optional stop-word file, one word per line. You have to obtain
# english.stop.txt yourself and point this path at it. When the file is
# present its words are added to the manual ones; when it is missing the
# manual list is used on its own instead of crashing the script.
stopword_path = "PATH/english.stop.txt"
try:
    with open(stopword_path, encoding="utf-8") as stopword_file:
        stopwords.update(
            line.strip().lower() for line in stopword_file if line.strip()
        )
except FileNotFoundError:
    # Deliberate best effort: the file is optional.
    pass

# Drop single-character tokens and stop words (case-insensitively).
clean_tokens = [
    tok for tok in tokens
    if len(tok.lower()) > 1 and tok.lower() not in stopwords
]

# Recompute the distribution on the filtered tokens and plot the top 5.
Freq_dist_nltk = nltk.FreqDist(clean_tokens)
Freq_dist_nltk.plot(5, cumulative=False)
# NLP study notes
# (Blog page footer: latest recommended article published on 2022-08-07 21:55:00)