A Python web tool: the DrissionPage automation framework (more flexible than Selenium, no browser driver configuration required)

I recently came across an interesting library that claims to blow Selenium out of the water, and opened up a whole new world out of curiosity. To put it through its paces I picked a dynamic website and used DrissionPage to write an automated crawler that scrapes images from 丽人网 and downloads them to the local disk.
This article covers the main techniques involved: DrissionPage usage, handling Ajax-loaded dynamic pages, XPath parsing, and saving images with asynchronous coroutines.
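
Before the full crawler, here is a minimal quick-start sketch of the DrissionPage calls it relies on (ChromiumPage, get, ele/eles, input, click, attr); the URL and selectors here are placeholders for illustration, not the ones used in the spider below. DrissionPage drives the browser over the devtools protocol, so no webdriver binary is needed.

from DrissionPage import ChromiumPage

page = ChromiumPage()                                 # launch or attach to a Chromium-based browser, no driver needed
page.get("https://example.com")                       # placeholder URL
page.ele("#search-box").input("keyword")              # locate an element by id and type into it
page.ele("xpath://button[@type='submit']").click()    # XPath locators work as well
for img in page.eles("xpath://img"):                  # eles() returns every matching element
    print(img.attr("src"))
page.close()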

import time
import os
import asyncio

import aiohttp
from DrissionPage import ChromiumPage, ChromiumOptions

# Point DrissionPage at the browser executable on your machine (Chrome or Edge both work).
# The setting is persisted, so this only needs to run once.
path = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
ChromiumOptions().set_browser_path(path).save()


class Spider:
    def __init__(self, name="利世", start_page=1, all_page=None):
        self.page = ChromiumPage()
        self.name: str = name
        self.file_div_path: str = name
        self.index = 0

        self.start_page = start_page  # index of the first result to process
        self.all_page = all_page  # maximum number of results to crawl (None = no limit)
        if not os.path.exists(self.file_div_path):
            os.makedirs(self.file_div_path)

    def start(self):
        # Open the site, type the keyword into the search box and submit.
        self.page.get("https://spacemiss.com/")
        ele = self.page.ele("#tdb-search-form-input-tdi_28")
        ele.input(self.name)
        ele = self.page.ele(".wpb_button wpb_btn-inverse tdb-search-form-btn")
        ele.click()

    def analysis_first_page(self):
        height = self.page.run_js_loaded("return document.body.scrollHeight")
        # The result list is loaded lazily via Ajax, so keep scrolling
        # until the page height stops growing.
        while True:
            # Scroll to the bottom of the page
            self.page.run_js_loaded("window.scrollTo(0, document.body.scrollHeight);")

            # Give the newly requested content time to load
            time.sleep(4)

            # Compare the new page height with the previous one
            new_height = self.page.run_js_loaded("return document.body.scrollHeight")
            if new_height == height:
                break
            height = new_height
        links = self.page.eles(
            "xpath:/html/body/div[6]/div[2]/div/div/div/div[2]/div/div/div/div/div[1]/div"
        )
        print(f"Found {len(links)} results")
        i = 0
        for link in links[self.start_page - 1 :]:
            # Each result card links to a detail page; open it in a new tab.
            a = link.ele("xpath:./div/div[1]/div/a")
            url = a.attr("href")
            tab_page = self.page.new_tab()
            tab_page.get(url)
            self.analysis_second_page(tab_page)
            tab_page.close()
            i += 1

            if self.all_page and i >= self.all_page:
                break

    def analysis_second_page(self, tab_page):
        # The detail page sometimes loads without its content; keep refreshing
        # until the title element is present.
        while True:
            try:
                time.sleep(3)
                title = tab_page.ele('xpath://h1[@class="tdb-title-text"]')
                break
            except Exception:
                tab_page.refresh()

        imgs = tab_page.eles(
            "xpath:/html/body/div[6]/div[2]/div/div/article/div/div/div[4]/div/div[2]/div/div/div[2]/img"
        )

        # Collect the direct URL of every image on the detail page
        urls = []
        for img in imgs:
            img_url = img.attr("src")
            urls.append(img_url)

        # Sanitize the title so it can be used as a directory name
        title = title.text.strip()
        title = title.replace(" ", "").replace(".", "").replace("|", "")
        file_path = self.file_div_path + "/" + title

        # Skip albums that have already been downloaded
        if not os.path.exists(file_path):
            os.makedirs(file_path)
        else:
            return
        print(f"{title}: found {len(urls)} images")
        tasks = [self.save_img(url, title) for url in urls]

        # Download all images for this album concurrently
        async def main():
            await asyncio.gather(*tasks)

        asyncio.run(main())

    def close(self):
        self.page.close()

    async def save_img(self, img_url: str, title: str):
        # Download a single image, retrying up to three times on network errors.
        retry_count = 3
        while retry_count > 0:
            try:
                async with aiohttp.ClientSession(
                    timeout=aiohttp.ClientTimeout(total=60)
                ) as session:
                    async with session.get(img_url) as resp:
                        save_path = (
                            self.file_div_path + f"/{title}" + f"/{img_url[-7:]}"
                        )
                        with open(save_path, "wb") as f:
                            # Stream the response to disk in 1 KB chunks
                            while True:
                                chunk = await resp.content.read(1024)
                                if not chunk:
                                    break
                                f.write(chunk)
                        break
            except Exception:
                retry_count -= 1
                print(f"Connection dropped, retrying ({3 - retry_count}/3)...")
                await asyncio.sleep(3)  # wait 3 seconds before trying again

    def run(self):
        self.start()
        self.analysis_first_page()
        self.close()


if __name__ == "__main__":
    spider = Spider(name="日奈娇", start_page=1)
    spider.run()

If the network connection is poor, image downloads may fail; each image is therefore retried up to three times before giving up.
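
If the connection keeps dropping, one common mitigation is to cap how many downloads run at the same time. The sketch below is a hypothetical variation, not part of the original spider: it shares a single aiohttp.ClientSession and throttles requests with an asyncio.Semaphore (here limited to 5 in flight); the download_all name, the save_dir parameter, and the url[-7:] file-naming rule are assumptions carried over from the code above.

# Hypothetical variation: throttle concurrent downloads with a semaphore
import asyncio
import aiohttp

async def download_all(urls, save_dir, max_concurrency=5):
    sem = asyncio.Semaphore(max_concurrency)          # at most max_concurrency requests in flight
    async with aiohttp.ClientSession(
        timeout=aiohttp.ClientTimeout(total=60)
    ) as session:
        async def fetch(url):
            async with sem:                           # wait for a free slot before requesting
                async with session.get(url) as resp:
                    data = await resp.read()
                    with open(f"{save_dir}/{url[-7:]}", "wb") as f:
                        f.write(data)
        await asyncio.gather(*(fetch(u) for u in urls))

# usage: asyncio.run(download_all(urls, "some_dir"))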
