Checking a site for dead links with playwright
author: jayzhen
date: 20240815
Background
- During testing I found a document linked from the site that returned a 404 when visited. Since the site carries many external links, I wanted some way to check every external link on the site in one sweep and then fix the broken ones in a single batch.
Solution
- How should we go about scanning all of the site's external links?
- Use a browser to visit every page on the site and collect the links on each page.
- Once all the links are collected, request each one over the network and check the status code or the response body.
Technical implementation
Question 1: Why use a browser to collect the page links?
- Because the site is built with dynamic loading: without executing and rendering the JavaScript, all you get back is a bare template HTML.
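To see the difference for yourself, here is a minimal sketch, assuming a placeholder URL standing in for any page on the site: it fetches the same page as raw HTML with requests and again through playwright with JavaScript executed. On a dynamically loaded site, the first count is typically near zero while the second reflects the real page.

    # A minimal before/after comparison; the URL is hypothetical.
    import asyncio

    import requests
    from playwright.async_api import async_playwright

    URL = "https://example.com/some-page"  # placeholder: substitute a real page

    async def compare():
        # 1) Plain HTTP fetch: returns the template HTML before any JS runs.
        raw_html = requests.get(URL, timeout=30).text
        print("raw <a href= occurrences:", raw_html.count('<a href='))

        # 2) Browser fetch: playwright executes the JS and renders the full DOM.
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            page = await browser.new_page()
            await page.goto(URL)
            links = await page.query_selector_all("a[href]")
            print("rendered <a href> elements:", len(links))
            await browser.close()

    asyncio.run(compare())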
Question 2: Any automation tool could run this check, so why playwright this time?
- The reason is simple: I want simplicity and async support. If you get the chance, compare puppeteer, selenium, and playwright for yourself; you will probably fall in love with playwright afterwards. (There is also the tool cypress.)
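As one illustration of the "simple and async" point, here is a minimal sketch (placeholder URLs) that loads several pages concurrently in one browser using nothing but asyncio.gather; doing the same with selenium generally means managing threads and drivers by hand.

    # Concurrent page loads on a single event loop; URLs are placeholders.
    import asyncio

    from playwright.async_api import async_playwright

    URLS = ["https://example.com/a", "https://example.com/b", "https://example.com/c"]

    async def title_of(context, url):
        page = await context.new_page()
        await page.goto(url, timeout=30000)
        title = await page.title()
        await page.close()
        return url, title

    async def main():
        async with async_playwright() as p:
            browser = await p.chromium.launch()
            context = await browser.new_context()
            # gather() drives all page loads concurrently
            results = await asyncio.gather(*(title_of(context, u) for u in URLS))
            for url, title in results:
                print(url, "->", title)
            await browser.close()

    asyncio.run(main())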
Code implementation
- Coroutines do the work, queues pass it along, logging records it, and the analysis happens afterwards; the skeleton below shows the core pattern, followed by the full script.
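Before the full script, here is the bare producer/consumer skeleton it builds on, with placeholders for the real work. The detail that matters: every get() must be paired with a task_done(), otherwise queue.join() never returns.

    # Producer feeds an asyncio.Queue, consumer drains it, a None sentinel
    # signals shutdown, and every get() is paired with a task_done().
    import asyncio

    async def producer(queue):
        for item in ("link-a", "link-b", "link-c"):  # stands in for discovered links
            await queue.put(item)
        await queue.put(None)                        # sentinel: no more work

    async def consumer(queue):
        while True:
            item = await queue.get()
            queue.task_done()                        # pair every get() with task_done()
            if item is None:
                break
            print("checked", item)                   # stands in for the HTTP check

    async def main():
        queue = asyncio.Queue(maxsize=500)
        tasks = [asyncio.create_task(producer(queue)),
                 asyncio.create_task(consumer(queue))]
        await queue.join()                           # returns once everything is task_done()
        await asyncio.gather(*tasks)

    asyncio.run(main())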
# -*- coding: utf-8 -*-
"""
@author: jay
@version: Python 3.8+
@file: broken_link_check.py
@time: 2024/8/2 17:30

pip install playwright
playwright install
"""
import asyncio
from urllib import parse

import requests
from playwright.async_api import TimeoutError as PlaywrightTimeoutError
from playwright.async_api import async_playwright

from lib.common import is_valid_url  # local helper: validates URL syntax
from lib.log4py import LoggingPorter  # local logging wrapper

logger = LoggingPorter()


class WebLinkScan:

    def __init__(self, url):
        self.checked_set = set()
        self.lock = asyncio.Lock()
        self.base_url = url
        self.queue_site = asyncio.Queue(maxsize=500)   # in-site pages to crawl
        self.queue_other = asyncio.Queue(maxsize=500)  # external links to check
        self.playwright = None
        self.browser = None
        self.context = None
        self.page = None

    async def init_scan(self):
        self.playwright = await async_playwright().start()
        self.browser = await self.playwright.chromium.launch()  # headless=True
        self.context = await self.browser.new_context()
        self.page = await self.context.new_page()

    async def close(self):
        await self.context.close()
        await self.browser.close()
        await self.playwright.stop()

    @staticmethod
    def is_link_within_site(url):
        parsed_url = parse.urlparse(url)
        netloc = parsed_url.netloc
        if '***' in netloc:
            return True
        return False

    @staticmethod
    def whether_to_filter(url):
        if "*****" in url:
            return True
        return False

    async def scan_links(self, url, index):
        scan_new_links = set()
        real_url = url
        try:
            logger.info(f"scan {index}/{self.queue_site.qsize()} {url} begin")
            resp = await self.page.goto(url, timeout=30000)
            links = await self.page.query_selector_all('a[href]')
            for link in links:
                href = await link.get_attribute('href')
                logger.info(f"detected new link {href}")
                if href.startswith("/"):
                    # resolve relative links on in-site pages against the base URL
                    if '***' in url:
                        href = self.base_url.rstrip("/") + href
                if href in scan_new_links or href in self.checked_set or self.whether_to_filter(href):
                    continue
                if is_valid_url(href):
                    scan_new_links.add(href)
            logger.info(f"scan {index}/{self.queue_site.qsize()} {url} links: {len(scan_new_links)}")
            real_url = resp.url
        except PlaywrightTimeoutError:
            # note: the built-in TimeoutError would not catch playwright's timeout
            logger.error(f"{url} goto timeout")
        return scan_new_links, real_url

    async def producer_links(self):
        self.queue_site.put_nowait(self.base_url)
        url_index = 1
        while self.queue_site.qsize() > 0:
            url = await self.queue_site.get()
            if url in self.checked_set:
                self.queue_site.task_done()  # must pair every get(), or join() hangs
                continue
            new_links, url2 = await self.scan_links(url, url_index)
            logger.info(f"get queue_site pre {url} cur {url2}")
            await self.add_to_set(url)
            await self.add_to_set(url2)
            for i in new_links:
                if i.strip() == url.strip() or i in self.checked_set:
                    continue
                if self.is_link_within_site(i):
                    logger.info(f"add to queue_site {i}")
                    await self.queue_site.put(i)
                else:
                    logger.info(f"add to queue_other {i}")
                    await self.queue_other.put(i)
            self.queue_site.task_done()
            url_index += 1
        self.queue_other.put_nowait(None)  # sentinel: tells the consumer to stop
        logger.info("producer_links ending")

    async def consumer_links(self):
        while True:
            href = await self.queue_other.get()
            if href is None:
                self.queue_other.task_done()  # account for the sentinel too
                break
            if href in self.checked_set:
                self.queue_other.task_done()
                continue
            try:
                # requests.get blocks the event loop; see the aiohttp note at the end
                response = requests.get(href, allow_redirects=True, timeout=30)
                if response.status_code == 200:
                    logger.info(f"check {len(self.checked_set)}/{self.queue_other.qsize()} {href} OK")
                else:
                    logger.info(f"check {len(self.checked_set)}/{self.queue_other.qsize()} {href} ERROR; status_code: {response.status_code}")
            except Exception:
                logger.error(f"Failed link {len(self.checked_set)}/{self.queue_other.qsize()} {href}")  # {str(e)}
            await self.add_to_set(href)
            self.queue_other.task_done()
        logger.info("consumer_links ending")

    async def add_to_set(self, value):
        async with self.lock:
            self.checked_set.add(value)

    async def remove_from_set(self, value):
        async with self.lock:
            if value in self.checked_set:
                self.checked_set.remove(value)
                logger.info(f"Removed {value}, set is now: {self.checked_set}")
            else:
                logger.info(f"{value} not in set, nothing removed")


async def main(url):
    scan = WebLinkScan(url)
    await scan.init_scan()
    producer_task = asyncio.create_task(scan.producer_links())
    consumer_task = asyncio.create_task(scan.consumer_links())
    await scan.queue_site.join()
    await scan.queue_other.join()
    await asyncio.gather(producer_task, consumer_task)
    logger.info(f'all links {len(scan.checked_set)}')
    await scan.close()


if __name__ == "__main__":
    asyncio.run(main("https://****"))
- To optimize: when deciding whether an external link is alive, cross-check the result with requests, aiohttp, and playwright.api.request; a sketch of the aiohttp variant follows below.
- To watch out for: external sites may have anti-crawler measures. Both of these are left for another day.
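As a starting point for the cross-check idea, a minimal aiohttp sketch with a placeholder URL; unlike the blocking requests.get() call in consumer_links, it keeps the event loop free, though anti-crawler responses would still need separate handling.

    # Async link checking with aiohttp; the URL list is a placeholder.
    import asyncio

    import aiohttp

    async def check_link(session, url):
        try:
            async with session.get(url, allow_redirects=True,
                                   timeout=aiohttp.ClientTimeout(total=30)) as resp:
                return url, resp.status
        except Exception as e:
            return url, f"failed: {e}"

    async def check_all(urls):
        # one shared session, all checks in flight at once
        async with aiohttp.ClientSession() as session:
            return await asyncio.gather(*(check_link(session, u) for u in urls))

    for url, status in asyncio.run(check_all(["https://example.com"])):
        print(url, status)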