Python 爬取美女头像

最新推荐文章于 2023-11-16 09:59:27 发布

快乐乐学习

最新推荐文章于 2023-11-16 09:59:27 发布

阅读量203

点赞数

文章标签： python html 开发语言

本文链接：https://blog.csdn.net/weixin_47276949/article/details/130037328

版权

import os
import re
import requests
import asyncio
import aiohttp


# Root folder that receives one sub-folder per post.
# makedirs(exist_ok=True) replaces the exists()/mkdir() pair: it is a no-op
# when the folder is already there and has no check-then-create race.
os.makedirs("./美女头像", exist_ok=True)

url = "https://www.ddtouxiang.com/touxiang/c-nvsheng"
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.0.0 Safari/537.36 Edg/112.0.1722.34"
}
# Fetch the listing page once; its HTML is reused below and by get_ulr().
# The timeout keeps the script from hanging forever on a stalled server.
home_page_text = requests.get(url, headers=headers, timeout=30).text

# Extract the post titles with a regex and create one folder per post to hold
# that post's avatars. The first match is not wanted, so iteration starts at
# the second element.
for file in re.findall('title="(.*?)"', home_page_text)[1:]:
    # Create the folder only if it does not exist yet; intermediate folders
    # are created too, so a title containing "/" does not abort the run.
    os.makedirs(f"./美女头像/{file}", exist_ok=True)


async def get_ulr():
    """Visit every post linked from the listing page and download its images.

    Reads the module-level ``home_page_text`` and ``headers``. For each post
    link found in the listing page, the post's HTML is fetched, the image URLs
    and their ``alt`` names are extracted, and ``download_img`` is scheduled
    once per ``(name, url)`` pair. All downloads of one post run concurrently;
    the next post is only started once they have finished.

    A failed download does not abort the run: the failure is printed and the
    remaining images are still processed.
    """
    loop = asyncio.get_running_loop()
    # One shared session for every image request.
    async with aiohttp.ClientSession() as session:
        # Second half of each post URL; the first match is skipped, as is done
        # for the first title match when the folders are created
        # (presumably it is not a post link -- confirm against the page).
        for i in re.findall('<a href="/(.*?)" title=', home_page_text)[1:]:
            # Build the absolute post URL.
            urls = "https://www.ddtouxiang.com/" + i
            # requests is blocking, so it runs in the default executor instead
            # of stalling the event loop while the page is being fetched.
            person_text = await loop.run_in_executor(
                None, lambda page=urls: requests.get(page, headers=headers).text
            )
            # URL of every image in the post.
            img_url = re.findall('<a href="(.*?)" data-lightbox', person_text)
            # alt text of every image; used to derive folder and file name.
            img_name = re.findall('alt="(.*?)"></a>', person_text)
            # Pair names with URLs in page order (zip stops at the shorter list).
            pairs = [(f"{name}", f"{link}") for link, name in zip(img_url, img_name)]
            if not pairs:
                # Nothing matched on this page; move on to the next post.
                continue
            tasks = [asyncio.create_task(download_img(session, pair)) for pair in pairs]
            # return_exceptions=True keeps one bad image from cancelling the rest.
            results = await asyncio.gather(*tasks, return_exceptions=True)
            for (name, _), result in zip(pairs, results):
                if isinstance(result, BaseException):
                    print(f"{name}下载失败：{result!r}")


async def download_img(session, img_dic):
    """Download one image and store it in the folder of the post it belongs to.

    Args:
        session: An open ``aiohttp.ClientSession`` used to issue the request.
        img_dic: A ``(name, url)`` pair. ``name`` is the image's alt text and
            ``url`` is the address of the image.

    Raises:
        aiohttp.ClientResponseError: If the server answers with an HTTP error
            status, so that an error page is never saved as a ``.jpg``.
    """
    name, img_url = img_dic
    # The alt text is split by fixed offsets: everything but the last four
    # characters names the post folder, and the two characters before the
    # final one name the file.
    # NOTE(review): this relies on the site's alt-text format -- confirm.
    folder = name[0:-4].strip()
    file_stem = name[-3:-1].strip()
    # os.path.join keeps the path portable; a hard-coded ".\\" prefix only
    # resolves to nested folders on Windows.
    target_dir = os.path.join(".", "美女头像", folder)
    img_path = os.path.join(target_dir, f"{file_stem}.jpg")

    # Issue the request; ssl=False disables certificate verification and is
    # the non-deprecated spelling of verify_ssl=False.
    async with session.get(img_url, ssl=False) as response:
        response.raise_for_status()
        # Read the raw image bytes.
        content = await response.content.read()

    # The folder derived from the alt text may not have been created up front.
    os.makedirs(target_dir, exist_ok=True)
    with open(img_path, "wb") as fp:
        fp.write(content)
    print(f"{name}下载完成！！！")


if __name__ == '__main__':
    # The original note here said the run ends with "Event loop is closed"
    # and that it can be ignored. That message is the commonly reported
    # symptom of the default proactor event loop on Windows when network
    # transports are finalized after the loop has closed, so the selector
    # loop is selected there instead. Other platforms are left untouched.
    if os.name == "nt" and hasattr(asyncio, "WindowsSelectorEventLoopPolicy"):
        asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
    asyncio.run(get_ulr())