Checking a site for dead links with playwright
author: jayzhen
date: 20240815
Background
- During testing I found a document linked from the site that returned a 404 when visited. Since the site carries many external links, I wanted some way to check every external link on the site in one sweep and then fix the broken ones in a single batch.
Solution
- How should we go about scanning all of the site's external links?
- Use a browser to visit every page on the site and collect the links on each page.
- Once all the links are collected, request each one over the network and check the status code or the response body.
Technical implementation
Question 1: Why use a browser to collect the page links?
- Because the site is built with dynamic loading: without executing and rendering the JavaScript, all you get back is a bare template HTML.
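To see the difference for yourself, here is a minimal sketch, assuming a placeholder URL standing in for any page on the site: it fetches the same page as raw HTML with requests and again through playwright with JavaScript executed. On a dynamically loaded site, the first count is typically near zero while the second reflects the real page.

    # A minimal before/after comparison; the URL is hypothetical.
    import asyncio

    import requests
    from playwright.async_api import async_playwright

    URL = "https://example.com/some-page"  # placeholder: substitute a real page

    async def compare():
        # 1) Plain HTTP fetch: returns the template HTML before any JS runs.
        raw_html = requests.get(URL, timeout=30).text
        print("raw <a href= occurrences:", raw_html.count('<a href='))

        # 2) Browser fetch: playwright executes the JS and renders the full DOM.
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            await page.goto(URL)
            links = await page.query_selector_all("a[href]")
            print("rendered <a href> elements:", len(links))
            await browser.close()

    asyncio.run(compare())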
Question 2: Any automation tool could run this check, so why playwright this time?
- The reason is simple: I want simplicity and async support. If you get the chance, compare puppeteer, selenium, and playwright for yourself; you will probably fall in love with playwright afterwards. (There is also the tool cypress.)
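As one illustration of the "simple and async" point, here is a minimal sketch (placeholder URLs) that loads several pages concurrently in one browser using nothing but asyncio.gather; doing the same with selenium generally means managing threads and drivers by hand.

    # Concurrent page loads on a single event loop; URLs are placeholders.
    import asyncio

    from playwright.async_api import async_playwright

    URLS = ["https://example.com/a", "https://example.com/b", "https://example.com/c"]

    async def title_of(context, url):
        page = await context.new_page()
        await page.goto(url, timeout=30000)
        title = await page.title()
        await page.close()
        return url, title

    async def main():
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            context = await browser.new_context()
            # gather() drives all page loads concurrently
            results = await asyncio.gather(*(title_of(context, u) for u in URLS))
            for url, title in results:
                print(url, "->", title)
            await browser.close()

    asyncio.run(main())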
Code implementation
- Coroutines do the work, queues pass it along, logging records it, and the analysis happens afterwards; the skeleton below shows the core pattern, followed by the full script.
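Before the full script, here is the bare producer/consumer skeleton it builds on, with placeholders for the real work. The detail that matters: every get() must be paired with a task_done(), otherwise queue.join() never returns.

    # Producer feeds an asyncio.Queue, consumer drains it, a None sentinel
    # signals shutdown, and every get() is paired with a task_done().
    import asyncio

    async def producer(queue):
        for item in ("link-a", "link-b", "link-c"):  # stands in for discovered links
            await queue.put(item)
        await queue.put(None)                        # sentinel: no more work

    async def consumer(queue):
        while True:
            item = await queue.get()
            queue.task_done()                        # pair every get() with task_done()
            if item is None:
                break
            print("checked", item)                   # stands in for the HTTP check

    async def main():
        queue = asyncio.Queue(maxsize=500)
        tasks = [asyncio.create_task(producer(queue)),
                 asyncio.create_task(consumer(queue))]
        await queue.join()                           # returns once everything is task_done()
        await asyncio.gather(*tasks)

    asyncio.run(main())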
# -*- coding: utf-8 -*-
"""
@author: jay
@version: Python 3.8+
@file: broken_link_check.py
@time: 2024/8/2 17:30

pip install playwright
playwright install
"""
import asyncio
from urllib import parse

import requests
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from playwright.async_api import async_playwright

from lib.common import is_valid_url  # local helper: validates URL syntax
from lib.log4py import LoggingPorter  # local logging wrapper

logger = LoggingPorter()


class WebLinkScan:

    def __init__(self, url):
        self.checked_set = set()
        self.lock = asyncio.Lock()
        self.base_url = url
        self.queue_site = asyncio.Queue(maxsize=500)   # in-site pages to crawl
        self.queue_other = asyncio.Queue(maxsize=500)  # external links to check
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None

    async def init_scan(self):
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch()  # headless=True
        self.context = await self.browser.new_context()
        self.page = await self.context.new_page()

    async def close(self):
        await self.context.close()
        await self.browser.close()
        await self.playwright.stop()

    @staticmethod
    def is_link_within_site(url):
        parsed_url = parse.urlparse(url)
        netloc = parsed_url.netloc
        if '***' in netloc:
            return True
        return False

    @staticmethod
    def whether_to_filter(url):
        if "*****" in url:
            return True
        return False

    async def scan_links(self, url, index):
        scan_new_links = set()
        real_url = url
        try:
            logger.info(f"scan {index}/{self.queue_site.qsize()} {url} begin")
            resp = await self.page.goto(url, timeout=30000)
            links = await self.page.query_selector_all('a[href]')
            for link in links:
                href = await link.get_attribute('href')
                logger.info(f"detected new link {href}")
                if href.startswith("/"):
                    # resolve relative links on in-site pages against the base URL
                    if '***' in url:
                        href = self.base_url.rstrip("/") + href
                if href in scan_new_links or href in self.checked_set or self.whether_to_filter(href):
                    continue
                if is_valid_url(href):
                    scan_new_links.add(href)
            logger.info(f"scan {index}/{self.queue_site.qsize()} {url} links: {len(scan_new_links)}")
            real_url = resp.url
        except PlaywrightTimeoutError:
            # note: the built-in TimeoutError would not catch playwright's timeout
            logger.error(f"{url} goto timeout")
        return scan_new_links, real_url

    async def producer_links(self):
        self.queue_site.put_nowait(self.base_url)
        url_index = 1
        while self.queue_site.qsize() > 0:
            url = await self.queue_site.get()
            if url in self.checked_set:
                self.queue_site.task_done()  # must pair every get(), or join() hangs
                continue
            new_links, url2 = await self.scan_links(url, url_index)
            logger.info(f"get queue_site pre {url} cur {url2}")
            await self.add_to_set(url)
            await self.add_to_set(url2)
            for i in new_links:
                if i.strip() == url.strip() or i in self.checked_set:
                    continue
                if self.is_link_within_site(i):
                    logger.info(f"add to queue_site {i}")
                    await self.queue_site.put(i)
                else:
                    logger.info(f"add to queue_other {i}")
                    await self.queue_other.put(i)
            self.queue_site.task_done()
            url_index += 1
        self.queue_other.put_nowait(None)  # sentinel: tells the consumer to stop
        logger.info("producer_links ending")

    async def consumer_links(self):
        while True:
            href = await self.queue_other.get()
            if href is None:
                self.queue_other.task_done()  # account for the sentinel too
                break
            if href in self.checked_set:
                self.queue_other.task_done()
                continue
            try:
                # requests.get blocks the event loop; see the aiohttp note at the end
                response = requests.get(href, allow_redirects=True, timeout=30)
                if response.status_code == 200:
                    logger.info(f"check {len(self.checked_set)}/{self.queue_other.qsize()} {href} OK")
                else:
                    logger.info(f"check {len(self.checked_set)}/{self.queue_other.qsize()} {href} ERROR; status_code: {response.status_code}")
            except Exception:
                logger.error(f"Failed link {len(self.checked_set)}/{self.queue_other.qsize()} {href}")  # {str(e)}
            await self.add_to_set(href)
            self.queue_other.task_done()
        logger.info("consumer_links ending")

    async def add_to_set(self, value):
        async with self.lock:
            self.checked_set.add(value)

    async def remove_from_set(self, value):
        async with self.lock:
            if value in self.checked_set:
                self.checked_set.remove(value)
                logger.info(f"Removed {value}, set is now: {self.checked_set}")
            else:
                logger.info(f"{value} not in set, nothing removed")


async def main(url):
    scan = WebLinkScan(url)
    await scan.init_scan()
    producer_task = asyncio.create_task(scan.producer_links())
    consumer_task = asyncio.create_task(scan.consumer_links())
    await scan.queue_site.join()
    await scan.queue_other.join()
    await asyncio.gather(producer_task, consumer_task)
    logger.info(f'all links {len(scan.checked_set)}')
    await scan.close()


if __name__ == "__main__":
    asyncio.run(main("https://****"))
- To optimize: when deciding whether an external link is alive, cross-check the result with requests, aiohttp, and playwright.api.request; a sketch of the aiohttp variant follows below.
- To watch out for: external sites may have anti-crawler measures. Both of these are left for another day.
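As a starting point for the cross-check idea, a minimal aiohttp sketch with a placeholder URL; unlike the blocking requests.get() call in consumer_links, it keeps the event loop free, though anti-crawler responses would still need separate handling.

    # Async link checking with aiohttp; the URL list is a placeholder.
    import asyncio

    import aiohttp

    async def check_link(session, url):
        try:
            async with session.get(url, allow_redirects=True,
                                   timeout=aiohttp.ClientTimeout(total=30)) as resp:
                return url, resp.status
        except Exception as e:
            return url, f"failed: {e}"

    async def check_all(urls):
        # one shared session, all checks in flight at once
        async with aiohttp.ClientSession() as session:
            return await asyncio.gather(*(check_link(session, u) for u in urls))

    for url, status in asyncio.run(check_all(["https://example.com"])):
        print(url, status)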