import nltk
import urllib.request as urllib
import re
from bs4 import BeautifulSoup
import operator
print("Python and NLTK installed successfully")
# Download the HTML of the python.org front page with urllib.request
# (imported above under the alias `urllib`). The `with` block closes the
# HTTP response as soon as the body has been read, so the connection is not
# left open for the rest of the script.
with urllib.urlopen('http://python.org/') as response:
    # read() returns the entire response body; it is bytes, which is why the
    # later steps call .decode() before doing text processing.
    html = response.read()
print(len(html))
print(html)
# First attempt at tokenising: split the raw page on whitespace only.
# No markup is removed at this stage.
tokens = html.split()
print("Total no of tokens :", len(tokens), sep="")
# Show the first 100 tokens.
print(tokens[:100])
# The tokens still contain an excess of HTML tags and other irrelevant
# characters, so decode the page as UTF-8 and split it on runs of non-word
# characters instead.
tokens = re.compile('\\W+').split(html.decode('utf-8'))
print(len(tokens))
print(tokens[:100])
# Parse the page with BeautifulSoup (html5lib parser) and keep only its text:
# `clean` holds the whole page as one string with the HTML noise removed.
clean = BeautifulSoup(html, "html5lib").get_text()
# Tokenise the cleaned text on whitespace.
tokens = clean.split()
print(len(tokens))
print(tokens[0:100])
freq_dis = {}
for tok in tokens:
if
自然语言处理简介第一章实例代码
最新推荐文章于 2023-02-20 00:30:00 发布