使用协程爬取“唯一”表情网站(qq.yh31.com),不同的是这个网站的布局不是很好爬。
以下是全部代码:
import requests
from lxml import etree
import aiohttp
import aiofiles
import asyncio
import urllib.parse
import re
import time
def get_page_source(url, headers):
    """Synchronously fetch *url* and return the response body as UTF-8 text.

    Args:
        url: Page address to download.
        headers: HTTP headers (User-Agent etc.) forwarded to requests.

    Returns:
        The decoded HTML of the page.
    """
    response = requests.get(url, headers=headers)
    # Force UTF-8 decoding; the server's declared charset is not trusted.
    response.encoding = "utf-8"
    return response.text
def get_hrefs(rep):
    """Extract the detail-page links from the index page HTML.

    Args:
        rep: Raw HTML text of the index page.

    Returns:
        List of href strings found under the ``qq_risy`` container.
    """
    document = etree.HTML(rep)
    return document.xpath('//div[@id="qq_risy"]/div/dl/dt/a/@href')
async def down_one(task, headers):
    """Download every image URL in *task* to the current directory.

    Fix: the original opened a brand-new ``aiohttp.ClientSession`` for every
    single URL inside the loop; a session is designed to be reused across
    requests, so one session is now created once and shared by all downloads.
    The on-disk result and the per-file progress message are unchanged.

    Args:
        task: List of absolute image URLs.
        headers: HTTP headers forwarded to aiohttp.
    """
    async with aiohttp.ClientSession(headers=headers) as session:
        for i in task:
            # File name = last path segment of the URL.
            name = i.split("/")[-1]
            async with session.get(i) as rep:
                content = await rep.content.read()
            async with aiofiles.open(f"{name}", mode="wb") as f:
                await f.write(content)
            print("搞定")
async def down_load(href, headers):
    """Fetch one detail page and download all images it references.

    The image URLs are taken from ``data-src`` attributes in the raw HTML
    and handed to :func:`down_one` for the actual transfer.

    Args:
        href: Absolute URL of the detail page.
        headers: HTTP headers forwarded to aiohttp.
    """
    async with aiohttp.ClientSession(headers=headers) as session:
        async with session.get(href) as rep:
            page = await rep.text()
    pattern = re.compile(r'data-src="(?P<src>.*?)" alt=', re.S)
    # Collect every data-src value on the page.
    srcs = [match.group("src") for match in pattern.finditer(page)]
    await down_one(srcs, headers)
async def main():
    """Crawl the index page and download the images of every detail page.

    Fixes relative to the original:
    - ``asyncio.gather(*tasks)`` replaces ``asyncio.wait(tasks)``:
      ``wait`` raises ``ValueError`` when the href list is empty, whereas
      ``gather`` handles zero tasks cleanly and also propagates exceptions
      instead of silently collecting them.
    """
    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                             "Chrome/106.0.0.0 Safari/537.36 Edg/106.0.1370.52"}
    url = "https://qq.yh31.com/"
    rep = get_page_source(url, headers)
    hrefs = get_hrefs(rep)
    tasks = []
    for href in hrefs:
        # Detail links on the page are relative; resolve against the site root.
        new_href = urllib.parse.urljoin(url, href)
        tasks.append(asyncio.create_task(down_load(new_href, headers)))
    # Run all page downloads concurrently; safe even if `tasks` is empty.
    await asyncio.gather(*tasks)
if __name__ == '__main__':
    start = time.time()
    # asyncio.run() replaces the deprecated get_event_loop()/run_until_complete
    # pattern (deprecated for this use since Python 3.10); it creates, runs and
    # closes the event loop for us.
    asyncio.run(main())
    # Report total wall-clock time of the crawl.
    print(time.time() - start)