【Python】Using Playwright for site-wide dead-link checking

author: jayzhen
date: 20240815

Background

  • During testing I found a document linked from the site that returned a 404 when opened. Since the site contains many external links, I wanted a way to check every external link on the site in one pass and then fix them all together

Solution

  • How do we scan all the external links on the site?
  1. Use a browser to visit every page on the site and collect the links found on each page
  2. Once all the links are collected, issue a network request to each one and check the response status code or body

Technical implementation

Question 1: Why use a browser to collect the page links?

  • Because the site is built with dynamic (client-side) loading. Without a runtime that executes and renders the JavaScript, all you get back is a bare template HTML page (see the sketch below)
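
A quick way to see the difference, as a minimal sketch (the URL is a placeholder and compare_link_counts is just an illustrative helper): fetch the same page once with requests and once with Playwright, then compare how many a[href] elements each approach can see.

import asyncio

import requests
from playwright.async_api import async_playwright


async def compare_link_counts(url: str) -> None:
    # 1) Plain HTTP fetch: on a JS-rendered site this usually returns only the template HTML
    static_html = requests.get(url, timeout=30).text
    static_count = static_html.count("<a ")

    # 2) Browser fetch: Playwright executes the JS, so the rendered DOM contains the real links
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(url, timeout=30000)
        rendered_count = len(await page.query_selector_all("a[href]"))
        await browser.close()

    print(f"static HTML links: {static_count}, rendered DOM links: {rendered_count}")


if __name__ == "__main__":
    asyncio.run(compare_link_counts("https://example.com"))  # placeholder URL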

Question 2: Automated checking could be done with several tools, so why Playwright this time?

  • The reason is simple: I want simplicity and async support. If you get the chance, compare puppeteer, selenium and playwright; you will probably end up loving playwright. There is also cypress, of course

Code implementation

  • Build the coroutines, pass work through queues, log everything, and analyze the results afterwards
# -*- coding: utf-8 -*-

"""
@author: jay
@version: Python 3.8+
@file: broken_link_check.py
@time: 2024/8/2 17:30

pip install playwright
playwright install
"""

import asyncio
from urllib import parse

import requests
from playwright.async_api import TimeoutError as PlaywrightTimeoutError, async_playwright

from lib.common import string_lines_to_dict, is_valid_url
from lib.log4py import LoggingPorter

logger = LoggingPorter()

class WebLinkScan:

    def __init__(self, url):
        self.checked_set = set()
        self.lock = asyncio.Lock()
        self.base_url = url

        self.queue_site = asyncio.Queue(maxsize=500)
        self.queue_other = asyncio.Queue(maxsize=500)

        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None

    async def init_scan(self):
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch()  # headless=True
        self.context = await self.browser.new_context()
        self.page = await self.context.new_page()

    async def close(self):
        await self.context.close()
        await self.browser.close()
        await self.playwright.stop()

    @staticmethod
    def is_link_within_site(url):
        parsed_url = parse.urlparse(url)
        netloc = parsed_url.netloc
        if '***' in netloc:
            return True
        return False

    @staticmethod
    def whether_to_filter(url):
        if "*****" in url:
            return True
        return False

    async def scan_links(self, url, index):
        scan_new_links = set()
        real_url = url
        try:
            logger.info(f"scan {index}/{self.queue_site.qsize()} {url} begin")
            resp = await self.page.goto(url, timeout=30000)
            links = await self.page.query_selector_all('a[href]')
            for link in links:
                href = await link.get_attribute('href')
                logger.info(f"detected new link {href}")
                if href.startswith("/"):
                    if '***' in url:
                        href = self.base_url.rstrip("/") + href
                if href in scan_new_links or href in self.checked_set or self.whether_to_filter(href):
                    continue
                if is_valid_url(href):
                    scan_new_links.add(href)
            logger.info(f"scan {index}/{self.queue_site.qsize()} {url} links: {len(scan_new_links)}")
            real_url = resp.url
        except PlaywrightTimeoutError:  # the builtin TimeoutError would not catch Playwright's timeout
            logger.error(f"{url} goto timeout")
        return scan_new_links, real_url

    async def producer_links(self):
        self.queue_site.put_nowait(self.base_url)
        url_index = 1
        while self.queue_site.qsize() > 0:
            url = await self.queue_site.get()
            if url in self.checked_set:
                self.queue_site.task_done()  # mark the skipped item done so queue_site.join() can finish
                continue
            new_links, url2 = await self.scan_links(url, url_index)
            logger.info(f"get queue_site pre {url} cur {url2}")
            await self.add_to_set(url)
            await self.add_to_set(url2)
            for i in new_links:
                if i.strip() == url.strip() or i in self.checked_set:
                    continue
                if self.is_link_within_site(i):
                    logger.info(f"add to queue_site {i}")
                    await self.queue_site.put(i)
                else:
                    logger.info(f"add to queue_other {i}")
                    await self.queue_other.put(i)
            self.queue_site.task_done()
            url_index += 1
        self.queue_other.put_nowait(None)
        logger.info("producer_links ending")

    async def consumer_links(self):
        while True:
            href = await self.queue_other.get()
            if href is None:
                self.queue_other.task_done()  # account for the sentinel so queue_other.join() can finish
                break
            if href in self.checked_set:
                self.queue_other.task_done()
                continue
            try:
                # requests is synchronous, so this call blocks the event loop (and the producer)
                # while waiting; switching to aiohttp is noted below as a follow-up improvement
                response = requests.get(href, allow_redirects=True, timeout=30)
                if response.status_code == 200:
                    logger.info(f"check {len(self.checked_set)}/{self.queue_other.qsize()} {href} OK")
                else:
                    logger.info(f"check {len(self.checked_set)}/{self.queue_other.qsize()} {href} ERROR; status_code: {response.status_code}")
            except Exception:
                logger.error(f"Failed link {len(self.checked_set)}/{self.queue_other.qsize()} {href}")  # {str(e)}
            await self.add_to_set(href)
            self.queue_other.task_done()
        logger.info("consumer_links ending")

    async def add_to_set(self, value):
        async with self.lock:
            self.checked_set.add(value)

    async def remove_from_set(self, value):
        async with self.lock:
            if value in self.checked_set:
                self.checked_set.remove(value)
                logger.info(f"Removed {value}, set is now: {self.checked_set}")
            else:
                logger.info(f"{value} not in set, nothing removed")


async def main(url):
    scan = WebLinkScan(url)
    await scan.init_scan()
    producer_task = asyncio.create_task(scan.producer_links())
    consumer_task = asyncio.create_task(scan.consumer_links())
    await scan.queue_site.join()
    await scan.queue_other.join()
    await asyncio.gather(producer_task, consumer_task)
    logger.info(f'all links {len(scan.checked_set)}')
    await scan.close()


if __name__ == "__main__":
    asyncio.run(main("https://****"))
  • Possible improvement: when checking whether an external link is alive, cross-check the result with requests, aiohttp and Playwright's request API (a rough sketch follows below)
  • Things to watch out for: some external sites have anti-crawler protection; both of these points are left for another time
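
As a rough sketch of that cross-check idea (check_with_aiohttp and check_with_playwright are hypothetical helpers, and aiohttp is an extra dependency not used in the script above), the same link can be probed by two independent clients and flagged only when both report a failure:

import asyncio

import aiohttp
from playwright.async_api import async_playwright


async def check_with_aiohttp(session: aiohttp.ClientSession, url: str) -> bool:
    # True when the link answers with a non-error status
    try:
        async with session.get(url, allow_redirects=True,
                               timeout=aiohttp.ClientTimeout(total=30)) as resp:
            return resp.status < 400
    except Exception:
        return False


async def check_with_playwright(request_ctx, url: str) -> bool:
    # Same probe through Playwright's APIRequestContext (no browser page needed)
    try:
        resp = await request_ctx.get(url, timeout=30000)
        return resp.status < 400
    except Exception:
        return False


async def cross_check(urls):
    async with async_playwright() as p:
        request_ctx = await p.request.new_context()
        async with aiohttp.ClientSession() as session:
            for url in urls:
                ok_aiohttp = await check_with_aiohttp(session, url)
                ok_playwright = await check_with_playwright(request_ctx, url)
                # only call a link broken when both clients agree, to reduce false positives
                print(url, "OK" if (ok_aiohttp or ok_playwright) else "BROKEN")
        await request_ctx.dispose()


if __name__ == "__main__":
    asyncio.run(cross_check(["https://example.com"]))  # placeholder link list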
