import requests
import os
import re
import time
# Base URL of the wallpaper site; detail-page hrefs scraped from list pages
# are relative, so they get prefixed with this.
url_base = 'http://www.netbian.com'
# BUG FIX: the original key was 'User-Agent:' (trailing colon), which produces
# a malformed header name; the correct key is 'User-Agent'.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36',
}
# Page URL structure:
#   page 1: http://www.netbian.com/index.htm
#   page i (i >= 2): f'http://www.netbian.com/index_{i}.htm'
# Detail-page markup sample (the full-size image URL is in the img src):
#   <p><a href="/desk/30948-1920x1080.htm" target="_blank"><img src="http://img.netbian.com/file/2023/0427/112808gmtzD.jpg" alt="..." title="..."></a></p>
# Extraction rule for the detail page:
#   <p><a href="/desk/.*?-1920x1080.htm" target="_blank"><img src="(.*?)" alt=".*?" title=".*?"></a></p>
# Sequential counter used to name the downloaded image files 1.jpg, 2.jpg, ...
num = 1
# Ensure the output directory exists before any download.
os.makedirs('./img_bian', exist_ok=True)
# List-page extraction rules differ between page 1 and pages 2+:
#   page 1:  <li><a href="/desk/30922.htm"title="..."        (no space before title)
#   page 2+: <li><a href="/desk/30915.htm" title="..."       (space before title)
# Hence two slightly different regexes are needed in the crawl loop below.
# Detail-page extraction rule: capture the full-size 1920x1080 image URL.
# Identical for every page, so it is hoisted out of the loops.
pattern_detail = '<p><a href="/desk/.*?-1920x1080.htm" target="_blank"><img src="(.*?)" alt=".*?" title=".*?"></a></p>'
# Crawl 20 list pages. Page 1 uses index.htm and a regex without a space
# before `title`; pages 2+ use index_{i}.htm and a regex with the space
# (see the markup notes above the loop).
for i in range(1, 21):
    if i == 1:
        url_page = 'http://www.netbian.com/index.htm'
        pattern = '<li><a href="(.*?)"title=".*?"'
    else:
        url_page = f'http://www.netbian.com/index_{i}.htm'
        pattern = '<li><a href="(.*?)" title=".*?" target="_blank"><img src="'
    # BUG FIX: the original called requests.get(url_page, headers), which binds
    # the headers dict to the positional `params` argument (query string), so
    # the User-Agent header was never sent. It must be headers=headers.
    response = requests.get(url_page, headers=headers)
    # The site serves gbk-encoded pages; set the encoding to avoid mojibake.
    response.encoding = 'gbk'
    # Relative hrefs of each wallpaper's detail page on this list page
    # (22 entries on page 1, 19 on pages 2+).
    res_list = re.findall(pattern, response.text)
    for detail_href in res_list:
        url_detail = url_base + detail_href
        response_detail = requests.get(url_detail, headers=headers)
        # Same encoding fix for the detail page.
        response_detail.encoding = 'gbk'
        time.sleep(1)  # throttle requests to be polite to the server
        matches = re.findall(pattern_detail, response_detail.text)
        if not matches:
            # Robustness: skip detail pages without a 1920x1080 link instead
            # of crashing on findall(...)[0] (IndexError in the original).
            continue
        img_url = matches[0]
        # Download the raw image bytes (headers= fix applies here too).
        image_data = requests.get(img_url, headers=headers).content
        print(f'正在下载第{i}页第{num}张图片......')
        time.sleep(1)
        with open(f'./img_bian/{num}.jpg', 'wb') as file:
            file.write(image_data)
        num += 1
# NOTE: the two lines below are residue from the blog page this script was
# copied from, not code; kept as comments so the file remains valid Python.
# 爬取彼岸壁纸
# 最新推荐文章于 2025-05-17 21:17:31 发布