Preface
The 4K wallpapers on this site look pretty good. Don't scrape the site in bulk all at once; if you won't listen, the consequences are on you ヽ(`⌒´メ)ノ!!!
The main goal is crawling the 4K images. Dependencies:
pip install requests
pip install lxml
Site analysis
- Target site:
https://pic.netbian.com
Page 1: http://pic.netbian.com/4kdongman/
Page 2: https://pic.netbian.com/4kdongman/index_2.html
Page 3: https://pic.netbian.com/4kdongman/index_3.html
The first page's URL differs from the rest: it has no index_N suffix.
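That pattern is easy to wrap in a tiny helper; build_page_url is just an illustrative name of my own, not anything the site defines:

def build_page_url(page: int) -> str:
    # Page 1 has no suffix; page N (N >= 2) is index_N.html
    base = 'https://pic.netbian.com/4kdongman/'
    return base if page == 1 else f'{base}index_{page}.html'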
- Problems and fixes
Q: The script errors out because some image titles contain characters that are not allowed in filenames.
A:
def sanitize_filename(filename):
    """
    Replace characters that are not allowed in filenames.
    params:
        filename: str, the filename to be sanitized
    returns:
        str, the sanitized filename
    """
    invalid_chars = '<>:"/\\|?*'
    for char in invalid_chars:
        filename = filename.replace(char, '_')
    return filename
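A quick check of what the helper does (the title below is made up for illustration):

>>> sanitize_filename('Sword Art Online: Alice?.jpg')
'Sword Art Online_ Alice_.jpg'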
Q: requests.exceptions.SSLError. This is an SSL connection error that usually occurs when connecting to a server over HTTPS; it can appear if the connection is interrupted or the server's SSL certificate is invalid.
A:
- Ignore SSL verification: pass verify=False to requests.get to skip certificate checks. Note: this makes the connection insecure, so use it with care.
- Retry mechanism: add a simple retry so a failed image download is attempted again.
- Use requests.Session together with requests.adapters.HTTPAdapter to build a session that retries automatically (see the sketch after this list).
- Set the retry policy to at most 5 attempts, with a 0.1-second backoff factor between them.
- Catch request exceptions: wrap the calls in try…except requests.exceptions.RequestException and print the error when one occurs.
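A minimal sketch of that retry session, the same setup the full script below uses (the timeout value here is my own addition):

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

session = requests.Session()
# Retry up to 5 times on common server errors, with a 0.1 s backoff factor
retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))
session.mount('http://', HTTPAdapter(max_retries=retries))  # page 1 uses plain http

resp = session.get('https://pic.netbian.com/4kdongman/', verify=False, timeout=10)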
Non-async version
import requests
import os
from lxml import etree
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
def sanitize_filename(filename):
    """
    Replace characters that are not allowed in filenames.
    params:
        filename: str, the filename to be sanitized
    returns:
        str, the sanitized filename
    """
    invalid_chars = '<>:"/\\|?*'
    for char in invalid_chars:
        filename = filename.replace(char, '_')
    return filename
# Create a session with a retry policy
session = requests.Session()
retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
session.mount('https://', HTTPAdapter(max_retries=retries))
session.mount('http://', HTTPAdapter(max_retries=retries))  # the page URLs below use plain http
# (1) Data acquisition: request the page
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0'
}
# Folder where downloaded images are stored
folder_path = 'pic'
# Create the folder if it does not exist yet
if not os.path.exists(folder_path):
    os.makedirs(folder_path)
for page in range(1, 11):  # Mind how many pages you request; better not to go past 30 ヽ(`⌒´メ)ノ
    if page == 1:
        url = 'http://pic.netbian.com/4kdongman/'
    else:
        url = f'http://pic.netbian.com/4kdongman/index_{page}.html'
    print(f'------------------------ DOWNLOADING PAGE {page} PICTURES ------------------------')
    try:
        response = session.get(url, headers=headers, verify=False)
        response.encoding = 'gbk'
        page_text = response.text
        # (2) Data parsing: image URL + image name
        tree = etree.HTML(page_text)
        # a. Global parsing: grab every <li> in the image list
        list_li = tree.xpath('//div[@class="slist"]/ul/li')
        for li in list_li:
            # b. Local parsing: the <b> text inside each <li> is the title
            img_title = li.xpath('./a/b/text()')[0] + '.jpg'
            img_title = sanitize_filename(img_title)  # strip illegal characters from the filename
            # Note: the src attribute is a relative path, so prepend https://pic.netbian.com
            img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
            # (3) Data storage: save the image bytes
            img_data = session.get(img_src, headers=headers, verify=False).content
            img_path = os.path.join(folder_path, img_title)
            with open(img_path, 'wb') as fp:
                fp.write(img_data)
            print(img_path, 'successfully downloaded')
    except requests.exceptions.RequestException as e:
        print(f"Error downloading page {page}: {e}")
print("------------------------- ALL PICTURES DOWNLOADED ------------------------")
Async version
import os
import aiohttp
import asyncio
from lxml import etree
# Helper to strip characters that are not allowed in filenames
def sanitize_filename(filename):
    invalid_chars = '<>:"/\\|?*'
    for char in invalid_chars:
        filename = filename.replace(char, '_')
    return filename
# Base folder for downloaded images
base_folder_path = 'pic'
# Create a per-category folder if it does not exist yet
def create_folder(path):
    if not os.path.exists(path):
        os.makedirs(path)
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0'
}
async def fetch(session, url):
    async with session.get(url) as response:
        text = await response.read()  # read the raw bytes
        return text.decode('gbk')     # the site serves GBK-encoded HTML
async def fetch_image(session, url, img_title, folder_path):
    img_title = sanitize_filename(img_title)
    img_path = os.path.join(folder_path, img_title)
    async with session.get(url) as response:
        img_data = await response.read()
        with open(img_path, 'wb') as f:
            f.write(img_data)
        print(f'{img_path} successfully downloaded')
async def download_images(session, url, folder_path):
    try:
        page_text = await fetch(session, url)
        # Data parsing: image URL + image name
        tree = etree.HTML(page_text)
        list_li = tree.xpath('//div[@class="slist"]/ul/li')
        tasks = []
        for li in list_li:
            img_title = li.xpath('./a/b/text()')[0] + '.jpg'
            img_src = 'https://pic.netbian.com' + li.xpath('./a/img/@src')[0]
            tasks.append(fetch_image(session, img_src, img_title, folder_path))
        await asyncio.gather(*tasks)
    except Exception as e:
        print(f"Error downloading images from {url}: {e}")
async def main():
    # Enable the categories you want to crawl by uncommenting them
    url_map = {
        'https://pic.netbian.com/4kdongman/': 'dongman',
        'https://pic.netbian.com/4kyouxi/': 'youxi',
        'https://pic.netbian.com/4kmeinv/': 'meinv',
        # 'https://pic.netbian.com/4kfengjing/': 'fengjing',
        # 'https://pic.netbian.com/4kyingshi/': 'yingshi',
        # 'https://pic.netbian.com/4kqiche/': 'qiche',
        # 'https://pic.netbian.com/4krenwu/': 'renwu',
        # 'https://pic.netbian.com/4kdongwu/': 'dongwu',
        # 'https://pic.netbian.com/4kzongjiao/': 'zongjiao',
        # 'https://pic.netbian.com/4kbeijing/': 'beijing',
        # 'https://pic.netbian.com/pingban/': 'pingban',
    }
    async with aiohttp.ClientSession(headers=headers) as session:
        tasks = []
        for base_url, folder_name in url_map.items():
            folder_path = os.path.join(base_folder_path, folder_name)
            create_folder(folder_path)
            for page in range(1, 11):
                if page == 1:
                    url = base_url
                else:
                    url = f'{base_url}index_{page}.html'
                print(f'------------------------ DOWNLOADING {base_url} PAGE {page} PICTURES ------------------------')
                tasks.append(download_images(session, url, folder_path))
        await asyncio.gather(*tasks)
# Run the entry point
if __name__ == '__main__':
    asyncio.run(main())
    print("------------------------- ALL PICTURES DOWNLOADED ------------------------")