# - 增加判断各种网站框架的加载检测
# - 跳过、忽略网站SSL/TLS证书错误和安全检查,最大程度呈现网页
# - 采用多线程访问网站
import concurrent.futures
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import csv
import os
import time
from tqdm import tqdm
def set_chrome_options():
    """Build the Chrome options shared by every screenshot browser.

    Returns:
        An ``Options`` instance carrying the command-line switches listed
        below, added in that order.
    """
    # Headless rendering at a fixed 1920x1080 window, with certificate
    # errors ignored and web security disabled so pages still get rendered.
    switches = (
        '--headless',
        '--disable-gpu',
        '--no-sandbox',
        '--window-size=1920,1080',
        '--ignore-certificate-errors',
        '--disable-web-security',
    )
    opts = Options()
    for switch in switches:
        opts.add_argument(switch)
    return opts
# Filesystem path of the chromedriver executable; create_driver() passes it to Service.
driver_path = '/usr/bin/chromedriver'
def check_existing_screenshots(urls, output_dir='multi_ss2/'):
    """Return the URLs whose screenshot file is already present on disk.

    The file name for a URL is the text after its last ``//`` with every
    ``/`` replaced by ``_`` and ``.png`` appended, looked up inside
    *output_dir*.

    Args:
        urls: Iterable of URL strings to check.
        output_dir: Directory that holds the screenshots. Defaults to
            ``'multi_ss2/'``, the directory used when screenshots are saved.

    Returns:
        A set containing every URL from *urls* whose screenshot file exists.
    """
    return {
        url
        for url in urls
        if os.path.exists(
            os.path.join(output_dir, f"{url.split('//')[-1].replace('/', '_')}.png")
        )
    }
def create_driver():
    """Start a new Chrome WebDriver.

    The chromedriver executable at ``driver_path`` is wrapped in a
    ``Service`` and the browser is configured with ``set_chrome_options()``.

    Returns:
        The newly created ``webdriver.Chrome`` instance.
    """
    chromedriver_service = Service(driver_path)
    browser_options = set_chrome_options()
    return webdriver.Chrome(service=chromedriver_service, options=browser_options)
def read_urls(filename):
    """Read the URL list from the first column of a CSV file.

    Args:
        filename: Path of the CSV file. Only the first cell of each row is
            used; any further columns are ignored.

    Returns:
        A list of URL strings in file order. Surrounding whitespace is
        stripped, and empty rows or rows whose first cell is blank are
        skipped.
    """
    # 'utf-8-sig' drops a leading byte-order mark (otherwise it would end up
    # inside the first URL) and still reads plain UTF-8; newline='' lets the
    # csv module handle line endings itself, as its documentation requires.
    with open(filename, mode='r', encoding='utf-8-sig', newline='') as file:
        first_cells = (row[0].strip() for row in csv.reader(file) if row)
        return [url for url in first_cells if url]
# JavaScript probes run by wait_for_page_to_load(). Each one returns true when
# its framework is absent or cannot be inspected, so a page is never rejected
# merely because a probe could not run.

# AngularJS: true once the $http service has no pending requests.
_ANGULAR_IDLE_JS = '''
try {
    const angular = window.angular;
    if (!angular || typeof angular.element !== 'function') {
        return true;
    }
    const root = document.querySelector('[ng-app], [data-ng-app]') || document;
    const injector = angular.element(root).injector();
    if (!injector) {
        return true;
    }
    return injector.get("$http").pendingRequests.length === 0;
} catch (e) {
    return true;
}
'''

# React: same check as before, guarded for builds without findDOMNode.
_REACT_READY_JS = '''
try {
    const React = window.React;
    const ReactDOM = window.ReactDOM;
    if (!React || !ReactDOM || typeof ReactDOM.findDOMNode !== 'function') {
        return true;
    }
    return ReactDOM.findDOMNode(document.body) !== null;
} catch (e) {
    return true;
}
'''

# Vue: resolves to true after the next tick. Resolving with an explicit value
# matters: a Promise resolved with undefined comes back to Python as None,
# which WebDriverWait treats as "not ready" and retries until it times out.
_VUE_TICK_JS = '''
const Vue = window.Vue;
if (!Vue || typeof Vue.nextTick !== 'function') {
    return true;
}
return new Promise(resolve => {
    try {
        Vue.nextTick(() => resolve(true));
    } catch (e) {
        resolve(true);
    }
});
'''


def wait_for_page_to_load(driver, timeout=20, min_total_wait=20):
    """Block until the page in *driver* looks fully loaded.

    Waits for ``document.readyState`` to become ``'complete'``, then runs the
    AngularJS, React and Vue probes in that order, and finally sleeps so that
    the whole call lasts at least *min_total_wait* seconds.

    Args:
        driver: WebDriver whose current page is being waited on.
        timeout: Seconds each individual wait may take. Defaults to 20.
        min_total_wait: Minimum duration of the whole call in seconds; the
            remainder is slept at the end. Defaults to 20.

    Raises:
        Whatever ``WebDriverWait.until`` raises when a condition is not met
        within *timeout* (selenium documents this as ``TimeoutException``).
    """
    started = time.monotonic()

    WebDriverWait(driver, timeout).until(
        lambda d: d.execute_script('return document.readyState') == 'complete'
    )

    for probe in (_ANGULAR_IDLE_JS, _REACT_READY_JS, _VUE_TICK_JS):
        # Bind the script as a default argument so each lambda keeps its own probe.
        WebDriverWait(driver, timeout).until(
            lambda d, script=probe: d.execute_script(script)
        )

    # Pad the call to the minimum duration to give late content time to render.
    elapsed = time.monotonic() - started
    time.sleep(max(0, min_total_wait - elapsed))
def process_url(url):
    """Open *url* in a fresh browser and save a screenshot of it.

    The screenshot goes to ``multi_ss2/`` under a name derived from the URL:
    the text after the last ``//`` with every ``/`` replaced by ``_``, plus
    ``.png``. Any failure is reported with ``print`` and swallowed so that one
    bad URL does not stop the other workers.

    Args:
        url: The URL to load.

    Returns:
        None.
    """
    output_dir = 'multi_ss2/'
    screenshot_name = f"{output_dir}{url.split('//')[-1].replace('/', '_')}.png"
    driver = None
    try:
        # Without the directory the screenshot cannot be written.
        os.makedirs(output_dir, exist_ok=True)
        # Created inside the try so a failed browser start is reported for
        # this URL instead of propagating out of the thread pool.
        driver = create_driver()
        driver.get(url)
        wait_for_page_to_load(driver)
        # save_screenshot reports an I/O failure by returning False.
        if not driver.save_screenshot(screenshot_name):
            print(f"An error occurred while processing {url}: could not write {screenshot_name}")
    except Exception as e:
        print(f"An error occurred while processing {url}: {str(e)}")
    finally:
        if driver is not None:
            driver.quit()
def main():
    """Screenshot every URL in ``list_3r.csv`` that has no screenshot yet.

    URLs are read with ``read_urls``, those reported by
    ``check_existing_screenshots`` are dropped, and the rest are handed to
    ``process_url`` on a pool of 10 worker threads with a progress bar.
    """
    all_urls = read_urls('list_3r.csv')
    already_done = check_existing_screenshots(all_urls)
    pending = [candidate for candidate in all_urls if candidate not in already_done]

    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as pool:
        results = pool.map(process_url, pending)
        progress = tqdm(results, total=len(pending), desc="Processing URLs")
        # Drain the iterator so the bar advances as each result arrives.
        for _ in progress:
            pass

    print("All screenshots have been saved.")


# Run the batch only when executed as a script.
if __name__ == "__main__":
    main()
"""
1、等待页面加载完成: 你可以检查document.readyState的状态,当这个状态变为"complete"时,意味着文档及其图片、样式表、同步脚本等子资源都已加载完毕(之后由脚本发起的异步请求不在此范围内,需要另外等待)。
python
WebDriverWait(driver, 10).until(lambda driver: driver.execute_script('return document.readyState') == 'complete')
2、等待Angular、React或Vue.js应用加载完成: 如果页面使用了这些框架,你可以等待框架特有的加载完成标志。例如,对于AngularJS,你可以等待$http服务的pendingRequests队列清空(长度为0),表示当前没有未完成的请求。
python
WebDriverWait(driver, 10).until(lambda driver: driver.execute_script('return angular.element(document).injector().get("$http").pendingRequests.length === 0'))
"""