# Adapted from the static-page scraper so that it can scrape dynamically rendered (JavaScript-driven) pages.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
import time
import re
def fetch_weather_data(url, driver_path='path/to/chromedriver', wait_seconds=5):
    """Scrape per-day forecast entries from a JavaScript-rendered page.

    A headless Chrome session loads ``url``, waits a fixed delay so the
    page scripts can populate the DOM, then reads every element with the
    ``weatherWrap`` class.

    Args:
        url: Address of the forecast page to load.
        driver_path: Filesystem path of the chromedriver executable that
            is handed to ``Service``. The default is the original
            placeholder and must be replaced with a real path.
        wait_seconds: Seconds to sleep after ``driver.get`` before the
            DOM is queried.

    Returns:
        A list of dicts, one per ``weatherWrap`` element, with the keys
        ``date``, ``weather``, ``wind_direction``, ``wind_volume``,
        ``high_temp`` and ``low_temp``. The list is empty when no such
        element is present.

    Raises:
        selenium.common.exceptions.WebDriverException: Propagated
            unchanged if the browser cannot be started or the page cannot
            be loaded, and (as ``NoSuchElementException``) if a card
            lacks one of the required child elements.
    """
    # Configure Chrome to run without a visible window.
    chrome_options = Options()
    for flag in ("--headless", "--disable-gpu", "--no-sandbox"):
        chrome_options.add_argument(flag)

    # Start the browser.
    service = Service(driver_path)
    driver = webdriver.Chrome(service=service, options=chrome_options)

    weather_data = []
    try:
        driver.get(url)
        # Fixed delay: give the page scripts time to render the forecast.
        time.sleep(wait_seconds)

        for wrap in driver.find_elements(By.CLASS_NAME, 'weatherWrap'):
            date = wrap.find_element(By.CLASS_NAME, 'date').text.strip()
            weather_desc = wrap.find_element(By.CLASS_NAME, 'desc').text.strip()
            wind_direction = wrap.find_element(By.CLASS_NAME, 'windd').text.strip()
            wind_volume = wrap.find_element(By.CLASS_NAME, 'winds').text.strip()

            # `[class*="..."]` is a plain substring test on the class
            # attribute, so the previous value "tmp.tmp_lte" (with a
            # literal dot) could never match a space-separated class list
            # and every row fell back to 'N/A'. Matching on "tmp_lte"
            # keeps the substring approach without the stray dot.
            # NOTE(review): presumes the page marks temperatures as
            # class="tmp tmp_lte_NN" -- confirm against the live markup.
            temperature_tags = wrap.find_elements(
                By.CSS_SELECTOR, 'div[class*="tmp_lte"]'
            )
            if len(temperature_tags) >= 2:
                # The first match is taken as the high, the second as the low.
                high_temp = temperature_tags[0].text.strip()
                low_temp = temperature_tags[1].text.strip()
            else:
                high_temp = 'N/A'
                low_temp = 'N/A'

            weather_data.append({
                'date': date,
                'weather': weather_desc,
                'wind_direction': wind_direction,
                'wind_volume': wind_volume,
                'high_temp': high_temp,
                'low_temp': low_temp,
            })
    finally:
        # Always shut the browser down, even when scraping fails.
        driver.quit()
    return weather_data
def save_to_file(data, filename):
    """Write each element of ``data`` to ``filename``, one per line.

    The file is opened in write mode with UTF-8 encoding, so any existing
    content is replaced. Every element is rendered through f-string
    formatting and followed by a newline.

    Args:
        data: Iterable of items to write.
        filename: Path of the output file.
    """
    with open(filename, 'w', encoding='utf-8') as out:
        out.writelines(f"{entry}\n" for entry in data)
if __name__ == "__main__":
    # Script entry point: scrape the forecast page, then either persist
    # and echo the entries or report that nothing was retrieved.
    weather_url = "http://www.nmc.cn/publish/forecast/ASH/pudong.html"
    weather_data = fetch_weather_data(weather_url)
    if not weather_data:
        print("No weather data found or failed to retrieve the page.")
    else:
        save_to_file(weather_data, 'weather.txt')
        print("Weather data has been saved to weather.txt")
        for entry in weather_data:
            print(entry)