import concurrent.futures
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import csv
import os
import time
from tqdm import tqdm
def set_chrome_options():
    """Build the Chrome options used for every browser instance.

    Returns:
        A new ``Options`` object configured with the headless, disable-gpu,
        no-sandbox and 1920x1080 window-size switches, plus a ``prefs``
        entry for the image content setting.
    """
    opts = Options()
    for switch in ('--headless', '--disable-gpu', '--no-sandbox'):
        opts.add_argument(switch)
    # Image content setting 2 is intended to stop Chrome from downloading
    # images, which speeds up page loads.
    opts.add_experimental_option(
        "prefs",
        {"profile.managed_default_content_settings.images": 2},
    )
    opts.add_argument('--window-size=1920,1080')
    return opts
# Filesystem path of the chromedriver executable; passed to Service() in create_driver()
driver_path = '/usr/bin/chromedriver'
def create_driver():
    """Start a new Chrome WebDriver session.

    The chromedriver binary is taken from the module-level ``driver_path``
    and the browser is configured with ``set_chrome_options()``.

    Returns:
        The ``webdriver.Chrome`` instance. The caller must ``quit()`` it.
    """
    chromedriver_service = Service(driver_path)
    browser_options = set_chrome_options()
    return webdriver.Chrome(
        service=chromedriver_service,
        options=browser_options,
    )
def read_urls(filename):
    """Read the list of URLs from the first column of a CSV file.

    Args:
        filename: Path of a CSV file holding one URL per row in its first
            column. Any further columns are ignored.

    Returns:
        The URLs as a list of strings, in file order, with surrounding
        whitespace removed. Blank rows and rows whose first cell is empty
        are skipped.

    Raises:
        OSError: If the file cannot be opened.
    """
    # utf-8-sig transparently drops a leading BOM (common in CSVs exported by
    # Excel/Notepad) that would otherwise be glued to the first URL.
    # newline='' is what the csv module requires so it can handle line
    # endings and quoted newlines itself.
    with open(filename, mode='r', encoding='utf-8-sig', newline='') as file:
        first_cells = (row[0].strip() for row in csv.reader(file) if row)
        return [url for url in first_cells if url]
def process_url(url):
    """Load ``url`` in a fresh headless Chrome and save a screenshot of it.

    The screenshot is written to ``multi_ss2/`` and named after the URL with
    everything up to the last ``//`` removed and each ``/`` replaced by
    ``_``. Any failure is printed and swallowed so that one bad URL does not
    abort the rest of the batch.

    Args:
        url: Address of the page to capture.

    Returns:
        None.
    """
    # Evaluates to true once AngularJS has no pending $http requests. It also
    # evaluates to true when the page does not use AngularJS (or has no
    # injector on the document), so non-Angular pages are not rejected with a
    # JavaScript error before the screenshot is taken.
    angular_idle_script = (
        'if (!window.angular) { return true; }'
        ' var injector = window.angular.element(document).injector();'
        ' if (!injector) { return true; }'
        ' return injector.get("$http").pendingRequests.length === 0;'
    )
    output_dir = 'multi_ss2/'
    driver = None
    try:
        # Created inside the try block so that a browser start-up failure is
        # reported for this URL instead of propagating out of the worker.
        driver = create_driver()
        driver.get(url)
        # Unconditional grace period of at least 5 seconds for late content.
        time.sleep(5)
        # Wait (up to 10 s) for the document itself to finish loading.
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script('return document.readyState') == 'complete'
        )
        # For AngularJS apps, wait (up to 10 s) for outstanding $http requests.
        WebDriverWait(driver, 10).until(
            lambda d: d.execute_script(angular_idle_script)
        )
        # exist_ok avoids the check-then-create race between worker threads.
        os.makedirs(output_dir, exist_ok=True)
        screenshot_name = f"{output_dir}{url.split('//')[-1].replace('/', '_')}.png"
        # save_screenshot signals a failed write by returning False.
        if not driver.save_screenshot(screenshot_name):
            raise OSError(f"could not write screenshot {screenshot_name}")
    except Exception as e:
        # Deliberate per-URL boundary: report the problem and carry on.
        print(f"An error occurred while processing {url}: {str(e)}")
    finally:
        if driver is not None:
            driver.quit()
def main():
    """Screenshot every URL listed in ``lists_3.csv``.

    The URLs are handed to ``process_url`` on a pool of 10 worker threads
    while a tqdm progress bar tracks completed items. A final message is
    printed once all of them have been processed.
    """
    urls = read_urls('lists_3.csv')
    with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
        outcomes = executor.map(process_url, urls)
        progress = tqdm(outcomes, total=len(urls), desc="Processing URLs")
        # Draining the iterator is what advances the bar; the values yielded
        # by process_url are not used.
        for _ in progress:
            pass
    print("All screenshots have been saved.")
# Start the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
"""
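Notes on the two page-load checks:

1. Wait for the document to finish loading: poll document.readyState until it
   becomes "complete". At that point the document and its sub-resources have
   been loaded; scripts may still issue asynchronous requests afterwards, which
   is why a framework-specific check can be needed as well.

       WebDriverWait(driver, 10).until(lambda driver: driver.execute_script('return document.readyState') == 'complete')

2. Wait for an Angular, React or Vue.js application to settle: if the page uses
   one of these frameworks, wait for a framework-specific "idle" signal. For
   AngularJS, for example, wait until the $http service has no pending
   requests. Note that this snippet raises a JavaScript error on pages that do
   not load AngularJS.

       WebDriverWait(driver, 10).until(lambda driver: driver.execute_script('return angular.element(document).injector().get("$http").pendingRequests.length === 0'))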
"""
# Batch capture of web page screenshots (multithreading + page-load detection)
# Latest recommended article published on 2024-07-31 11:31:25