Crawling the whole site is not recommended. I added asyncio.sleep and a pool of User-Agent strings, but grabbing too much is still risky...
I tested a few image categories and they all worked; other bugs may still be lurking...
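Note that the script below draws one User-Agent at startup and reuses it for the whole run. If you would rather rotate identities and pause between requests, here is a minimal sketch under that assumption (pick_headers and polite_sleep are hypothetical helpers, not part of the script):

    import asyncio
    import random

    def pick_headers(pool):
        # Hypothetical helper: draw a fresh User-Agent for every request
        # instead of fixing one for the entire run.
        return {'User-Agent': random.choice(pool)}

    async def polite_sleep(low=0.5, high=1.5):
        # Hypothetical helper: a jittered pause between requests; a randomized
        # interval is gentler (and less regular) than a fixed sleep.
        await asyncio.sleep(random.uniform(low, high))

The full script: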
import asyncio
import os
import random
import re
import sys

import aiofiles
import aiohttp
import requests
from bs4 import BeautifulSoup


# Collect the URL of every listing page in a category
async def get_second_class_url(get_second_url, head):
    tasks = []
    # Regex that captures the total page count from the pager element
    obj = re.compile('.*?class="shenlue">.*?<b>(?P<page>.*?)</b>', re.S)
    # Holds the URL of every listing page
    all_page_url_list = []
    resp = requests.get(get_second_url, headers=head)
    resp.encoding = 'UTF-8'
    # The page count appears exactly once, so a single match() is enough
    # (a loop over all matches would work too)
    page = obj.match(resp.text)
    # Total number of pages in this category
    all_page_num = page.group('page')
    # Build the URL of every listing page.
    # For testing only the first 2 pages are crawled; swap in the commented
    # line to crawl the whole category (not recommended for a practice run).
    for page in range(1, 3):
    # for page in range(1, int(all_page_num) + 1):
        if page == 1:
            all_page_url_list.append(get_second_url)
        else:
            # Pages after the first follow the pattern xxx_2.html, xxx_3.html, ...
            all_url = get_second_url.split('.html')[0] + '_' + str(page) + '.html'
            all_page_url_list.append(all_url)
    # Launch the async tasks
    print(f'{len(all_page_url_list)} pages of data queued for loading...')
    for p in all_page_url_list:
        print(p + ' - collecting the image URLs on this page...\n')
        tasks.append(asyncio.create_task(get_img_url(p, head)))
    await asyncio.wait(tasks)


# Collect the detail-page URL of every image on one listing page
async def get_img_url(url, head):
    task = []
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=head) as resp:
            soup = BeautifulSoup(await resp.text(), 'html.parser')
            # Grab the div of every image on the page
            divs = soup.find_all('div', class_='bot-div')
            for div in divs:
                # Build the absolute URL of the image's detail page
                img_url = url.split('/tupian')[0] + div.a.get('href')
                # Fetch the download address asynchronously
                print(f'{img_url} fetched; waiting for the download link...')
                task.append(asyncio.create_task(get_all_img_url(session, img_url, head)))
        if task:  # asyncio.wait() raises on an empty task list
            await asyncio.wait(task)


# Fetch the real download address of an image (the session is passed in,
# so the functions below do not create a new one on every call)
async def get_all_img_url(session, url, head):
    task = []
    async with session.get(url, headers=head) as resp1:
        soup1 = BeautifulSoup(await resp1.text(), 'html.parser')
        p_label = soup1.find_all('p', class_='bg-bull btn-p com-right-down-btn')
        for p in p_label:
            # The real download address of the image
            img_url_big = p.a.get('href')
            # Download and save the image asynchronously
            task.append(asyncio.create_task(img_download(session, img_url_big, head)))
    if task:
        await asyncio.wait(task)


# Download one image
async def img_download(session, url, head):
    print(url + ' download link fetched, preparing to download...')
    name = url.rsplit('/', 1)[-1]
    # Request the download address; the session comes from the caller
    try:
        async with session.get(url, headers=head) as resp2:
            async with aiofiles.open(f'img/{name}', 'wb') as f:
                await f.write(await resp2.content.read())
            print(name + ' downloaded.')
            # A short sleep between downloads is recommended
            await asyncio.sleep(0.1)
    except Exception as e:
        print(f'{url} failed to download: {e}')


async def main():
    # User-Agent pool
    user_agent_pool = [
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.167 '
        'Safari/537.36',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 '
        'Safari/537.36 Edg/93.0.961.38',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 '
        'Safari/537.36 Edg/93.0.961.44',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 '
        'Safari/537.36 Edg/93.0.961.47',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 '
        'Safari/537.36 Edg/93.0.961.52',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 '
        'Safari/537.36 Edg/94.0.992.31',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 '
        'Safari/537.36 Edg/94.0.992.37',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 '
        'Safari/537.36 Edg/94.0.992.38',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 '
        'Safari/537.36 Edg/94.0.992.47',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 '
        'Safari/537.36 Edg/94.0.992.50',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 '
        'Safari/537.36 Edg/95.0.1020.30',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 '
        'Safari/537.36 Edg/95.0.1020.40',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 '
        'Safari/537.36 Edg/95.0.1020.44',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/116.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/117.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/118.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/119.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:122.0) Gecko/20100101 Firefox/122.0',
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:123.0) Gecko/20100101 Firefox/123.0',
    ]
    headers = {'User-Agent': random.choice(user_agent_pool)}
    # main_url = 'https://sc.chinaz.com/tupian/dongwutupian.html'  # animal pictures
    main_url = 'https://sc.chinaz.com/tupian/renwutupian.html'  # people pictures
    # main_url = 'https://sc.chinaz.com/tupian/taikongkexuetupian.html'  # space pictures
    # Make sure the output directory exists before any download starts
    os.makedirs('img', exist_ok=True)
    # Collect the URLs of all the image pages in the chosen category
    await get_second_class_url(main_url, headers)
    print('All images downloaded.')


if __name__ == '__main__':
    # On Windows this policy must be set when running the script this way,
    # otherwise it errors out; guard it so the same file runs elsewhere too
    if sys.platform == 'win32':
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    asyncio.run(main())
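One more caveat: the script creates a download task for every image it finds, with no upper bound, so a large category can open a lot of connections at once. Here is a minimal sketch of capping that with asyncio.Semaphore (the wrapper name and the cap of 5 are my assumptions, not part of the original script):

    import asyncio

    async def img_download_limited(sem, session, url, head):
        # Hypothetical wrapper around the script's img_download: the semaphore
        # ensures only a bounded number of downloads are in flight at once.
        async with sem:
            await img_download(session, url, head)

    # Create the semaphore once, e.g. in main(), and pass it down so the cap
    # applies to the whole run:
    #   sem = asyncio.Semaphore(5)  # assumed cap, tune as needed
    #   task.append(asyncio.create_task(img_download_limited(sem, session, img_url_big, head)))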