1.环境
python3.8
requests(第三方库,需安装)
lxml(第三方库,需安装)
re、time、base64(标准库,无需安装)
2.代码(未包含保存部分)
import base64
import re
import time

import requests
from lxml import etree
# Homepage address. It is stored base64-encoded in the source; it must be
# decoded before use, otherwise requests.get() receives the raw base64 string.
BASE_url = base64.b64decode("aHR0cHM6Ly93d3cud3hlcHQuY29tLw==").decode("utf-8")
# Impersonate a regular desktop Chrome browser so the site serves normal pages.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/105.0.0.0 Safari/537.36'
}
# Fetch and parse the landing page.
html = requests.get(BASE_url, headers=headers).text
html = etree.HTML(html)
# Total number of listing pages: take the pagination label's text and extract
# the first integer it contains.
page_num = html.xpath('/html/body/section/div[3]/ul/li[9]/span/text()')[0]
page_num = int(re.findall(r'\d+', page_num)[0])
# Build URLs for listing pages 2..page_num (page 1 is BASE_url itself).
# rstrip('/') avoids a double slash, since the base URL ends with '/'.
index_urls = [BASE_url.rstrip('/') + f'/page/{i}' for i in range(2, page_num + 1)]
# Parse gallery names; the homepage markup differs from later listing pages.
# --- homepage (page 1) ---
pic_dir = html.xpath('/html/body/section/div[2]/div/article/h2/a/text()')
# Per-gallery detail-page URLs found on the homepage.
pic_coll_urls = html.xpath('/html/body/section/div[2]/div/article/h2/a/@href')
print(pic_coll_urls)
for pic_coll_url in pic_coll_urls:
    pic_urls = []
    detail_html = etree.HTML(requests.get(pic_coll_url, headers=headers).text)
    # Image URLs on the first page of this gallery.
    pic_urls.extend(detail_html.xpath('/html/body/section/article/p/a/img/@src'))
    # Total pages in this gallery. A single-page gallery has no pagination
    # links, so fall back to 1 instead of raising IndexError on [0].
    page_text = detail_html.xpath('/html/body/section/div[1]/a[last()]/span/text()')
    pic_page_num = int(page_text[0]) if page_text else 1
    print(pic_page_num)
    # Pages 2..N of the gallery live at <gallery-url>/<page-number>.
    for i in range(2, pic_page_num + 1):
        page_url = pic_coll_url.rstrip('/') + f'/{i}'
        detail_html = etree.HTML(requests.get(page_url, headers=headers).text)
        pic_urls.extend(detail_html.xpath('/html/body/section/article/p/a/img/@src'))
    print(pic_urls)
    # Be polite to the server: pause between galleries.
    time.sleep(2)
# --- listing pages 2..N (their markup differs from the homepage) ---
for index_url in index_urls:
    index_html = etree.HTML(requests.get(index_url, headers=headers).text)
    # Gallery names come from the thumbnails' alt text on these pages.
    pic_dir = index_html.xpath('/html/body/section/div[3]/div/article/a/img/@alt')
    # Per-gallery detail-page URLs on this listing page.
    pic_coll_urls = index_html.xpath('/html/body/section/div[3]/div/article/a/@href')
    pic_urls = []
    for pic_coll_url in pic_coll_urls:
        detail_html = etree.HTML(requests.get(pic_coll_url, headers=headers).text)
        # Image URLs on the first page of this gallery.
        pic_urls.extend(detail_html.xpath('/html/body/section/article/p/a/img/@src'))
        # Total pages in this gallery. A single-page gallery has no pagination
        # links, so fall back to 1 instead of raising IndexError on [0].
        page_text = detail_html.xpath('/html/body/section/div[1]/a[last()]/span/text()')
        pic_page_num = int(page_text[0]) if page_text else 1
        print(pic_page_num)
        # Pages 2..N of the gallery live at <gallery-url>/<page-number>.
        for i in range(2, pic_page_num + 1):
            page_url = pic_coll_url.rstrip('/') + f'/{i}'
            detail_html = etree.HTML(requests.get(page_url, headers=headers).text)
            pic_urls.extend(detail_html.xpath('/html/body/section/article/p/a/img/@src'))
        print(pic_urls)
        # Be polite to the server: pause between galleries.
        time.sleep(2)
这个网站没有什么反爬虫措施,但为避免给服务器造成过大压力,在每次循环后面加了休眠2秒。试了一下,应该能够将网站上的图片都扒下来。图片保存的代码需要自己添加,示例如下:
for pic in pic_urls:
    # Download before opening the file, so a failed request does not leave an
    # empty file on disk; timeout prevents hanging forever on a dead link.
    img = requests.get(pic, headers=headers, timeout=30)
    img.raise_for_status()  # skip writing HTML error pages as images
    # NOTE(review): different galleries may reuse the same basename, which
    # silently overwrites earlier files — consider prefixing with pic_dir.
    with open(pic.split('/')[-1], 'wb') as f:
        f.write(img.content)