Python批量爬取网页截图【多线程+网站框架加载检测+最大程度呈现网页】

之群害马

已于 2024-07-31 14:39:20 修改

阅读量193

点赞数 4

文章标签：服务器 linux 网络爬虫

于 2024-07-31 11:31:25 首次发布

本文链接：https://blog.csdn.net/Ppandaer/article/details/140818486

版权

增加判断各种网站框架的加载检测
跳过、忽略网站SSL/TLS证书错误和安全检查，最大程度呈现网页
采用多线程访问网站

import concurrent.futures
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import csv
import os
import time
from tqdm import tqdm

# Build the Chrome options shared by every WebDriver instance
def set_chrome_options():
    """Return a Chrome ``Options`` object configured for screenshot capture.

    The browser runs headless at 1920x1080 with the GPU and sandbox disabled,
    and with certificate errors and web-security checks turned off.
    """
    launch_flags = (
        '--headless',
        '--disable-gpu',
        '--no-sandbox',
        '--window-size=1920,1080',
        # Make Chrome ignore SSL certificate errors.
        '--ignore-certificate-errors',
        # Bypasses some security checks; this lowers security, use with care.
        '--disable-web-security',
    )
    opts = Options()
    for flag in launch_flags:
        opts.add_argument(flag)
    return opts
# Path to the ChromeDriver executable
driver_path = '/usr/bin/chromedriver'

# Find the URLs whose screenshot file already exists so they can be skipped
def check_existing_screenshots(urls):
    """Return the set of URLs that already have a screenshot on disk.

    The expected file name is derived from the URL: the part after the last
    ``//`` is kept, every ``/`` is replaced by ``_``, and the result is looked
    up as ``multi_ss2/<name>.png`` relative to the current working directory.
    """
    output_dir = 'multi_ss2/'

    def screenshot_path(url):
        # Same naming scheme the screenshot writer uses.
        return f"{output_dir}{url.split('//')[-1].replace('/', '_')}.png"

    return {url for url in urls if os.path.exists(screenshot_path(url))}


# Create a WebDriver instance
def create_driver():
    """Start and return a new Chrome WebDriver.

    The driver is launched through a ``Service`` built from ``driver_path``
    and configured with the options returned by ``set_chrome_options()``.
    """
    chrome_service = Service(driver_path)
    chrome_options = set_chrome_options()
    return webdriver.Chrome(service=chrome_service, options=chrome_options)

# Read the URL list
def read_urls(filename):
    """Read URLs from the first column of a CSV file.

    Args:
        filename: Path to a UTF-8 encoded CSV file, with or without a
            leading byte-order mark.

    Returns:
        A list of the non-empty first-column values in file order, with
        surrounding whitespace removed. Blank rows and rows whose first cell
        is empty are skipped.
    """
    urls = []
    # 'utf-8-sig' drops a leading BOM that would otherwise be glued to the
    # first URL; newline='' is what the csv module requires so that quoted
    # fields containing line breaks are parsed correctly.
    with open(filename, mode='r', encoding='utf-8-sig', newline='') as file:
        for row in csv.reader(file):
            if not row:
                continue
            url = row[0].strip()
            if url:
                urls.append(url)
    return urls

# Wait until the page has fully loaded, including JavaScript execution
def wait_for_page_to_load(driver, timeout=20, min_wait=20):
    """Block until the page currently open in *driver* looks fully loaded.

    Four checks run in order, each polled for up to *timeout* seconds:
    ``document.readyState`` is ``'complete'``, AngularJS reports no pending
    ``$http`` requests, the React probe passes, and Vue has run its next
    tick. Each framework check passes immediately when that framework's
    global is absent or cannot be inspected. Afterwards the function sleeps
    so that at least *min_wait* seconds have elapsed since it was called.

    Args:
        driver: Selenium WebDriver on which the page has been requested.
        timeout: Maximum seconds to wait for each individual check.
        min_wait: Minimum total seconds to spend in this function.

    Raises:
        selenium.common.exceptions.TimeoutException: Raised by
            ``WebDriverWait.until`` when a check does not pass in time.
    """
    angular_idle_js = '''
        const angular = window.angular;
        if (!angular || typeof angular.element !== 'function') {
            return true;
        }
        try {
            const root = document.querySelector('[ng-app], [data-ng-app]') || document;
            const injector = angular.element(root).injector();
            if (!injector) {
                return true;
            }
            return injector.get("$http").pendingRequests.length === 0;
        } catch (err) {
            return true;
        }
    '''
    react_ready_js = '''
        const React = window.React;
        const ReactDOM = window.ReactDOM;
        if (!React || !ReactDOM || typeof ReactDOM.findDOMNode !== 'function') {
            return true;
        }
        try {
            return ReactDOM.findDOMNode(document.body) !== null;
        } catch (err) {
            return true;
        }
    '''
    vue_tick_js = '''
        const done = arguments[arguments.length - 1];
        const Vue = window.Vue;
        if (!Vue || typeof Vue.nextTick !== 'function') {
            done(true);
            return;
        }
        Vue.nextTick(() => done(true));
    '''

    start_time = time.monotonic()  # monotonic clock: immune to wall-clock jumps

    # Wait for document.readyState to become 'complete'.
    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script('return document.readyState') == 'complete'
    )

    # AngularJS: wait for outstanding $http requests to finish. The injector
    # is undefined when the app is not rooted where we look, so the script
    # guards against that instead of throwing.
    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script(angular_idle_js)
    )

    # React: findDOMNode is not present in every ReactDOM build (it was
    # removed in React 19), so the script checks for it before calling.
    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script(react_ready_js)
    )

    # Vue: nextTick calls its callback without a value, so the script reports
    # True explicitly through the async-script callback. A falsy result would
    # make WebDriverWait keep polling until it times out.
    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_async_script(vue_tick_js)
    )

    # Pad the wait so the total time spent here is at least min_wait seconds.
    remaining_time = max(0, min_wait - (time.monotonic() - start_time))
    time.sleep(remaining_time)

def process_url(url):
    """Open *url* in a fresh Chrome instance and save a screenshot of it.

    The image is written to ``multi_ss2/<name>.png``, where ``<name>`` is the
    part of the URL after the last ``//`` with every ``/`` replaced by ``_``.
    Failures are printed rather than raised so that one bad URL does not
    abort a batch run.

    Args:
        url: Address of the page to capture.
    """
    driver = None
    try:
        output_dir = 'multi_ss2/'
        screenshot_name = f"{output_dir}{url.split('//')[-1].replace('/', '_')}.png"
        # The screenshot cannot be written unless the directory exists.
        os.makedirs(output_dir, exist_ok=True)

        # Created inside the try so a browser start-up failure is reported
        # for this URL instead of propagating to the caller.
        driver = create_driver()
        driver.get(url)
        wait_for_page_to_load(driver)
        # save_screenshot signals a failed write by returning False.
        if not driver.save_screenshot(screenshot_name):
            print(f"An error occurred while processing {url}: could not write {screenshot_name}")
    except Exception as e:
        print(f"An error occurred while processing {url}: {str(e)}")
    finally:
        if driver is not None:
            driver.quit()
# Main entry point
def main(csv_path='list_3r.csv', max_workers=10):
    """Capture screenshots for every URL in the CSV that lacks one.

    Args:
        csv_path: CSV file whose first column holds the URLs.
        max_workers: Number of worker threads, one browser per running task.
    """
    # Drop repeated URLs while keeping file order; duplicates would start
    # two browsers that write to the same output file.
    urls = list(dict.fromkeys(read_urls(csv_path)))
    existing_urls = check_existing_screenshots(urls)
    urls_to_process = [url for url in urls if url not in existing_urls]

    # Fan the work out over a thread pool; list() drains the lazy map so the
    # progress bar advances as results complete.
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        list(tqdm(executor.map(process_url, urls_to_process), total=len(urls_to_process), desc="Processing URLs"))

    print("All screenshots have been saved.")

# Run the batch only when executed as a script, not when imported
if __name__ == "__main__":
    main()


"""
1、等待页面加载完成： 你可以检查document.readyState的状态，当这个状态变为"complete"时，意味着文档及其子资源（图片、样式表、同步脚本等）已经加载完毕；但这并不保证异步请求或前端框架的渲染已经结束，因此还需要下面的框架检测。

python
WebDriverWait(driver, 10).until(lambda driver: driver.execute_script('return document.readyState') == 'complete')

2、等待Angular、React或Vue.js应用加载完成： 如果页面使用了这些框架，你可以等待框架特有的加载完成标志。例如，对于AngularJS，你可以等待$http服务的待处理请求（pendingRequests）数量变为0。

python
WebDriverWait(driver, 10).until(lambda driver: driver.execute_script('return angular.element(document).injector().get("$http").pendingRequests.length === 0'))

"""