"""Scrape the runoob "Python 100 examples" exercises and save them to a text file.

This script pretends to be the Google Chrome browser via a User-Agent header.
For how to obtain your own 'User-Agent' value, see:
https://blog.csdn.net/orange_mask/article/details/97800094
"""
import requests
from bs4 import BeautifulSoup

BASE_URL = 'https://www.runoob.com'
LIST_URL = 'https://www.runoob.com/python/python-100-examples.html'
OUTPUT_FILE = '菜鸟教程py-100.txt'

# Pretend to be a regular browser so the site does not reject the request.
# Replace with your own browser's User-Agent if needed (see module docstring).
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/91.0.4472.124 Safari/537.36'
}


def fetch_html(url):
    """Download *url* with the browser headers and return the body as UTF-8 text."""
    return requests.get(url, headers=HEADERS).content.decode('utf-8')


def get_exercise_links():
    """Return the relative hrefs of all exercise detail pages from the index page.

    The index page lists the exercises as <a> tags inside the first <ul>
    of the #content element.
    """
    soup = BeautifulSoup(fetch_html(LIST_URL), 'lxml')
    return [a.attrs['href'] for a in soup.find(id='content').ul.find_all('a')]


def parse_exercise(html):
    """Extract one exercise from its detail-page HTML.

    Returns a dict with keys:
      headline -- the page's <h1> heading
      title    -- the problem statement (2nd <p> in #content)
      analysis -- the "program analysis" paragraph (3rd <p> in #content)
      code     -- the sample source code
    """
    soup = BeautifulSoup(html, 'lxml')
    content = soup.find(id='content')
    paragraphs = content.find_all('p')
    # Most pages highlight the code in a .hl-main element; pages without it
    # fall back to a plain <pre> block.
    code_node = soup.find(class_='hl-main') or soup.find('pre')
    return {
        'headline': content.h1.text,
        'title': paragraphs[1].text,
        'analysis': paragraphs[2].text,
        'code': code_node.text,
    }


def main():
    """Scrape every exercise and append it to the output file."""
    links = get_exercise_links()
    # Open the output file once instead of re-opening it for every exercise.
    with open(OUTPUT_FILE, 'a+', encoding='utf-8') as file:
        for href in links:
            record = parse_exercise(fetch_html(BASE_URL + href))
            file.write(record['headline'] + '\n')
            file.write(record['title'] + '\n')
            file.write(record['analysis'] + '\n')
            file.write(record['code'] + '\n')
            file.write('=' * 50 + '\n')


if __name__ == '__main__':
    main()