简介
本实例爬取了该网站有关python3的所有教程,并将其结果保存在content.txt文件中(与代码中打开的文件名一致)。
代码:
import requests #导入网页请求库
from bs4 import BeautifulSoup #导入网页解析库
import parser
def start_requests(url):
    """Fetch *url* and return the decoded HTML body as a string.

    Sends a browser-like User-Agent so the site does not reject the
    request as a bot. Raises requests.RequestException on network
    failure or timeout.
    """
    headers = {
        # NOTE: the original key was 'User - Agent' (with spaces), which is an
        # invalid header name that servers ignore — fixed to 'User-Agent'.
        'User-Agent': ('Mozilla/5.0 (Windows NT 6.1; WOW64) '
                       'AppleWebKit/537.36 (KHTML, like Gecko) '
                       'Chrome/63.0.3239.132 Safari/537.36')
    }
    # timeout prevents the crawler from hanging forever on a stalled connection
    response = requests.get(url, headers=headers, timeout=10)
    return response.content.decode()
if __name__ == '__main__':
    # Crawl every tutorial page linked from the python3 index and append all
    # <p> text to content.txt. `with` guarantees the file is closed even if a
    # request raises (the original only closed it on the success path).
    with open('content.txt', 'a+', encoding="utf-8") as f:
        # f.truncate(0)  # 清空原文件中的内容 (uncomment to clear the old contents)
        url = 'https://www.runoob.com/python3/python3-tutorial.html'
        html = start_requests(url)
        soup = BeautifulSoup(html, "html.parser")
        links = soup.find_all('a', target="_top")  # tutorial navigation links
        for link in links:
            # Some anchors may lack an href (or have an empty one) — the
            # original `link['href'][0]` would raise KeyError/IndexError there.
            href = link.get('href', '')
            if not href:
                continue
            # Hrefs starting with '/' are site-absolute; the rest are relative
            # to the python3 section. Everything after the prefix choice was
            # duplicated in the original's two branches — merged here.
            if href.startswith('/'):
                page_url = 'https://www.runoob.com' + href
            else:
                page_url = 'https://www.runoob.com/python3/' + href
            page_html = start_requests(page_url)
            page_soup = BeautifulSoup(page_html, "html.parser")
            # Save the text of every paragraph on the page.
            for paragraph in page_soup.find_all('p'):
                f.write(paragraph.get_text())