Python批量爬取网页截图【多线程+前端框架加载检测+最大程度呈现网页】

  1. 增加对各类前端框架(Angular、React、Vue.js)的加载完成检测
  2. 跳过、忽略网站SSL/TLS证书错误和安全检查,最大程度呈现网页
  3. 采用多线程访问网站
import concurrent.futures
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import csv
import os
import time
from tqdm import tqdm

# Build the Chrome options used for every browser instance
def set_chrome_options():
    """Return a Chrome ``Options`` object configured for headless capture.

    The browser runs headless at 1920x1080 with the GPU and sandbox disabled,
    and with certificate errors and web security checks turned off.
    """
    flags = (
        '--headless',
        '--disable-gpu',
        '--no-sandbox',
        '--window-size=1920,1080',
        # Make Chrome ignore SSL certificate errors.
        '--ignore-certificate-errors',
        # Bypasses some security checks; this lowers security, use with care.
        '--disable-web-security',
    )
    opts = Options()
    for flag in flags:
        opts.add_argument(flag)
    return opts
# Filesystem path of the ChromeDriver executable
driver_path = '/usr/bin/chromedriver'

# Find URLs whose screenshot file already exists, so they can be skipped
def check_existing_screenshots(urls, output_dir='multi_ss2/'):
    """Return the subset of ``urls`` that already have a screenshot on disk.

    The expected file name is derived from the URL: everything after the last
    ``//``, with each ``/`` replaced by ``_``, plus a ``.png`` suffix.

    Args:
        urls: Iterable of URL strings.
        output_dir: Directory to look in. Defaults to ``'multi_ss2/'``.

    Returns:
        A set containing every URL whose screenshot file exists.
    """
    existing_urls = set()
    for url in urls:
        file_name = url.split('//')[-1].replace('/', '_') + '.png'
        # os.path.join copes with output_dir given with or without a
        # trailing separator.
        if os.path.exists(os.path.join(output_dir, file_name)):
            existing_urls.add(url)
    return existing_urls


# Create a WebDriver instance
def create_driver():
    """Start and return a Chrome WebDriver.

    The driver is launched through a ``Service`` built from ``driver_path``
    and configured with the options from ``set_chrome_options()``.
    """
    return webdriver.Chrome(
        service=Service(driver_path),
        options=set_chrome_options(),
    )

# Read the URL list
def read_urls(filename):
    """Read URLs from the first column of a CSV file.

    Args:
        filename: Path to a UTF-8 encoded CSV file; a leading BOM is tolerated.

    Returns:
        A list of the first-column values in file order, stripped of
        surrounding whitespace. Blank rows and rows whose first cell is empty
        are skipped.
    """
    # newline='' lets the csv module handle line endings itself, as its
    # documentation requires. utf-8-sig drops a BOM (for example from a
    # spreadsheet export) that would otherwise be glued to the first URL.
    with open(filename, mode='r', encoding='utf-8-sig', newline='') as file:
        first_cells = (row[0].strip() for row in csv.reader(file) if row)
        return [cell for cell in first_cells if cell]

# Wait until the page has fully loaded, including JavaScript framework work
def wait_for_page_to_load(driver, timeout=20, min_wait=20):
    """Block until the page currently open in ``driver`` looks fully rendered.

    Four checks run in sequence, each polled through ``WebDriverWait``:

    1. ``document.readyState`` equals ``'complete'``.
    2. AngularJS: ``$http`` has no pending requests.
    3. React: ``ReactDOM.findDOMNode(document.body)`` is not null.
    4. Vue: a ``Vue.nextTick`` callback has fired.

    A framework check passes immediately when that framework's global is
    absent, or when probing it raises inside the page. Afterwards the function
    sleeps so that at least ``min_wait`` seconds have elapsed in total.

    Args:
        driver: The WebDriver whose current page is inspected.
        timeout: Seconds each individual check may take. Defaults to 20.
        min_wait: Minimum total seconds to spend in this function.
            Defaults to 20.

    Raises:
        Whatever ``WebDriverWait.until`` raises when a check does not pass
        within ``timeout`` (selenium's ``TimeoutException``).
    """
    start_time = time.time()  # Record when waiting began

    # Wait for document.readyState to become 'complete'
    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script('return document.readyState') == 'complete'
    )

    # AngularJS: wait for outstanding $http requests to finish.
    # injector() is undefined when no app is bootstrapped on the root
    # element; without the guard the script would throw and abort the capture.
    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script('''
            const angular = window.angular;
            if (!angular || typeof angular.element !== 'function') {
                return true;
            }
            try {
                const root = angular.element(document);
                const injector = typeof root.injector === 'function' ? root.injector() : null;
                if (!injector) {
                    return true;
                }
                return injector.get("$http").pendingRequests.length === 0;
            } catch (e) {
                return true;
            }
        ''')
    )

    # React: wait for ReactDOM.findDOMNode to return a non-null node.
    # Builds that do not expose findDOMNode are treated as nothing to wait for.
    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script('''
            const React = window.React;
            const ReactDOM = window.ReactDOM;
            if (!React || !ReactDOM || typeof ReactDOM.findDOMNode !== 'function') {
                return true;
            }
            try {
                return ReactDOM.findDOMNode(document.body) !== null;
            } catch (e) {
                return true;
            }
        ''')
    )

    # Vue.js: wait for the next tick.
    # The promise must resolve to true: nextTick invokes its callback without
    # arguments, so resolving with the callback's argument yields a falsy
    # result and the wait could never succeed.
    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script('''
            const Vue = window.Vue;
            if (!Vue || typeof Vue.nextTick !== 'function') {
                return true;
            }
            return new Promise(resolve => Vue.nextTick(() => resolve(true)));
        ''')
    )

    elapsed_time = time.time() - start_time  # Time already spent waiting
    remaining_time = max(0, min_wait - elapsed_time)  # Time still owed
    # Sleep out the remainder so the total wait is at least min_wait seconds
    time.sleep(remaining_time)

def process_url(url):
    """Open ``url`` in a fresh Chrome instance and save a screenshot of it.

    The screenshot is written to ``multi_ss2/`` under a name derived from the
    URL: the part after the last ``//``, with ``/`` replaced by ``_``, plus
    ``.png``. Any failure, including failure to start the browser, is printed
    rather than raised, so one bad URL cannot abort a batch run.

    Args:
        url: The address to capture.

    Returns:
        None.
    """
    driver = None
    try:
        output_dir = 'multi_ss2/'
        screenshot_name = f"{output_dir}{url.split('//')[-1].replace('/', '_')}.png"
        # The directory may not exist on a first run; without it the
        # screenshot cannot be written.
        os.makedirs(output_dir, exist_ok=True)

        # Created inside the try block so that a start-up failure is reported
        # like any other error instead of propagating to the caller.
        driver = create_driver()
        driver.get(url)
        wait_for_page_to_load(driver)  # Wait for the page and frameworks to settle
        # save_screenshot signals a failed write through its return value.
        if not driver.save_screenshot(screenshot_name):
            print(f"An error occurred while processing {url}: screenshot could not be written to {screenshot_name}")
    except Exception as e:
        print(f"An error occurred while processing {url}: {str(e)}")
    finally:
        if driver is not None:
            driver.quit()
# Entry point
def main():
    """Capture every URL listed in ``list_3r.csv`` that has no screenshot yet.

    URLs that ``check_existing_screenshots`` reports as already captured are
    dropped; the rest are handed to ``process_url`` on a pool of 10 threads
    while a progress bar tracks completion.
    """
    all_urls = read_urls('list_3r.csv')
    already_done = check_existing_screenshots(all_urls)
    pending = list(filter(lambda candidate: candidate not in already_done, all_urls))

    # Fan the remaining URLs out over a thread pool
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        results = pool.map(process_url, pending)
        progress = tqdm(results, total=len(pending), desc="Processing URLs")
        # Drain the iterator so every task finishes and the bar advances
        for _ in progress:
            pass

    print("All screenshots have been saved.")

# Run the batch only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()


"""
1、等待页面加载完成: 可以检查document.readyState的状态。当它变为"complete"时,表示文档及其子资源(图片、样式表、同步脚本等)已加载完毕;但此后由JavaScript异步发起的请求或渲染仍可能在进行,因此还需要配合下面的框架级检测。

python
WebDriverWait(driver, 10).until(lambda driver: driver.execute_script('return document.readyState') == 'complete')

2、等待Angular、React或Vue.js应用加载完成: 如果页面使用了这些框架,可以等待框架特有的加载完成标志。例如,对于AngularJS,可以等待$http服务的待处理请求数(pendingRequests)变为0。

python
WebDriverWait(driver, 10).until(lambda driver: driver.execute_script('return angular.element(document).injector().get("$http").pendingRequests.length === 0'))

"""


  • 4
    点赞
  • 6
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值