注:输出部分用省略号代替...
爬取网站
'''
import urllib.request

response = urllib.request.urlopen('http://php.net/')
html = response.read()
print(html)
'''
输出:
'''
b'\n\n \n\n \n \n\n PHP: Hypertext Preprocessor\n\n \n \n ......'
'''
转换为干净文本
'''
import urllib.request
from bs4 import BeautifulSoup

response = urllib.request.urlopen('http://php.net/')
html = response.read()
soup = BeautifulSoup(html, "html5lib")  # 这需要安装html5lib模块
text = soup.get_text(strip=True)
# text 即为获取到的干净文本
print(text)
'''
输出为:
'''
PHP: Hypertext PreprocessorDownloadsDocumentationGet InvolvedHelpGetting StartedIntroductionA simple tutorialLanguage ReferenceBasic ......
'''
转换为tokens ''' import urllib.request from bs4