1. 爬取单个站点数据
1.1 对AERONET站点网址进行解析,并提取该网址下所有站点信息
手动点击步骤:
点击AERONET官网https://aeronet.gsfc.nasa.gov/
点击左下 AEROSOL OPTICAL DEPTH (V3)-SOLAR -> Data Display
选择你想要的level(本文选择level2): AOD level:level2
缩放图中地图大小到你需要的范围,该页面下的站点信息会根据图中范围显示相应的站点
本文选择以下载AOE_Baotou站点数据为例
代码:从该站点网址提取站点信息和起止时间 (利用代码解析右图Step4结果)
import re
import pandas as pd
from bs4 import BeautifulSoup
import requests
from pathlib import Path
# URL of a single AERONET station's data-display page
single_url = 'https://aeronet.gsfc.nasa.gov/cgi-bin/data_display_aod_v3?site=AOE_Baotou&nachal=2&level=1&place_code=10'

# Fetch the station page and parse the HTML
page = requests.get(single_url)
soup = BeautifulSoup(page.text, 'html.parser')

# Station name comes from the URL; geographic info is not on this page,
# so keep a placeholder
station = 'AOE_Baotou'
geoInfo = 'N/A'

# Use the 'string' keyword (not the deprecated 'text') to avoid a
# DeprecationWarning from BeautifulSoup
pageUrl = soup.find('a', string=re.compile(r'More AERONET Downloadable Products\.{3}')).get('href')

# Splitting the "Start Date ..." text on '-' puts the first year at index 2
# (after stripping everything from ';' on) and the latest year at index 4
# -- presumably the dates are in DD-MON-YYYY form; verify against the page.
date = soup.find(string=re.compile(r'Start Date.+')).split('-')
start_year = re.sub(r'\;.+', '', date[2])
latest_year = date[4]

# One row per station, consumed by the download loop below
results = [[station, geoInfo, pageUrl, start_year, latest_year]]
输出结果
[['AOE_Baotou',
'N/A',
'webtool_aod_v3?stage=3&region=Asia&state=China&site=AOE_Baotou&place_code=10',
'2003',
'2023']]
1.2 选择想要的年份下载数据
类似于上图手动点击下载的例子,主要是在右图网址对相应的zip进行下载,下面的代码尝试利用右图给出的下载链接地址下载多年数据
# Directory to save the downloaded zip files
download_dir = Path('./aeronet_data')
download_dir.mkdir(parents=True, exist_ok=True)

for station, geoInfo, pageUrl, start_year, latest_year in results:
    start_year, latest_year = int(start_year), int(latest_year)
    # Clamp the requested window (2010-2018) to the station's actual data span
    for year in range(max(2010, start_year), min(2018, latest_year) + 1):
        # 构造每个年份的下载URL,这里直接使用预定的下载URL格式
        # (yearly archives follow a fixed naming scheme on the server)
        download_url = f"https://aeronet.gsfc.nasa.gov/zip_files_v3/{year}0101_{year}1231_{station}.zip"
        filename = f"{station}_{year}.zip"
        filepath = download_dir / filename
        if filepath.exists():
            # BUG FIX: these messages previously contained a broken
            # placeholder instead of interpolating the file name.
            print(f"File already exists: {filename}")
            continue
        print(f"Downloading: {filename}")
        # Stream the response so large archives are written in chunks
        response = requests.get(download_url, stream=True)
        if response.status_code == 200:
            with open(filepath, 'wb') as file:
                for chunk in response.iter_content(chunk_size=128):
                    file.write(chunk)
            print(f"Successfully downloaded {filename}")
        else:
            print(f"Failed to download {filename}. Status code: {response.status_code}")
print("Download process completed.")
输出结果
Downloading: AOE_Baotou_2010.zip
Failed to download AOE_Baotou_2010.zip. Status code: 404
Downloading: AOE_Baotou_2011.zip
Failed to download AOE_Baotou_2011.zip. Status code: 404
Downloading: AOE_Baotou_2012.zip
Failed to download AOE_Baotou_2012.zip. Status code: 404
Downloading: AOE_Baotou_2013.zip
Successfully downloaded AOE_Baotou_2013.zip
Downloading: AOE_Baotou_2014.zip
Failed to download AOE_Baotou_2014.zip. Status code: 404
Downloading: AOE_Baotou_2015.zip
Failed to download AOE_Baotou_2015.zip. Status code: 404
Downloading: AOE_Baotou_2016.zip
Failed to download AOE_Baotou_2016.zip. Status code: 404
Downloading: AOE_Baotou_2017.zip
Failed to download AOE_Baotou_2017.zip. Status code: 404
Downloading: AOE_Baotou_2018.zip
Failed to download AOE_Baotou_2018.zip. Status code: 404
Download process completed.
该站点仅2013年有level2的数据
2. 根据以下网址学习python自动下载AERONET站点数据,在此特别感谢博主的文章
AERONET AOD 数据自动化下载 + PYTHON + SELENIUM_aeronet下载-CSDN博客
该作者的github代码地址
https://github.com/SakuraSong001/spider4remotedata
以下为个人笔记
存储中国区域的html (点击Data Display 部分后将图缩放到中国区域,另存为html)
该博客的代码主要修改其chinaAreaPage 以及其他相应需要的存储位置即可用
以下为个人测试代码
注意:下述代码下载的zip文件既包含level 1, level 1.5, level 2 各级产品,如需下载单一产品还需进一步修改。可参考前文博主的代码先筛选和下载。
通过解析网站前文存储的html所包含的数据提取其包含的站点名称、起止时间等存为csv文件
import csv
import os
import re

import chardet
import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium.webdriver.edge.service import Service
from webdriver_manager.microsoft import EdgeChromiumDriverManager
def get_stations(area_file, china_area_page=r'china.html'):
    """Parse a saved AERONET map page and collect per-station metadata.

    For every station link found in *china_area_page*, fetch that station's
    data-display page and extract its download-page URL plus the first and
    latest year with data.  The rows are also written to *area_file* as CSV.

    Args:
        area_file: Path of the CSV file to write the results to.
        china_area_page: Locally saved HTML of the AERONET map view
            (defaults to 'china.html', matching the original script).

    Returns:
        A list of [station, geoInfo, pageUrl, start_year, latest_year] rows;
        an empty list if the HTML file cannot be read.
    """
    result = []
    # Station links on the saved map page all point at the data_display CGI
    pattern = r'https\:\/\/aeronet\.gsfc\.nasa\.gov\/cgi\-bin\/data\_display\_aod\_v3\?site\=.+'
    try:
        # Detect the saved page's encoding; fall back to UTF-8 when the
        # optional chardet package is not installed (or detection fails).
        try:
            import chardet
            with open(china_area_page, 'rb') as file:
                encoding = chardet.detect(file.read())['encoding'] or 'utf-8'
        except ImportError:
            encoding = 'utf-8'
        with open(china_area_page, 'r', encoding=encoding) as file:
            content = file.read()
    except Exception as e:
        print(f"Error reading {china_area_page}: {e}")
        return []
    soup = BeautifulSoup(content, 'html.parser')
    aList = soup.find_all('a', href=re.compile(pattern))
    for item in aList:
        sHref = item.get('href')
        station = re.sub(r'\n', '', item.get_text())
        # The link's parent element carries the coordinate text; strip the
        # leading station name up to the opening parenthesis.
        geoInfo = re.sub(r'\n+.+\(\s', r'(', item.parent.get_text())
        try:
            response = requests.get(sHref)
            beautifulSoup = BeautifulSoup(response.text, 'html.parser')
            # 'string' (not the deprecated 'text') avoids a DeprecationWarning
            pageUrl = beautifulSoup.find('a', string=re.compile(r'More AERONET Downloadable Products\.{3}')).get('href')
            # Splitting "Start Date ..." on '-' puts the first year at
            # index 2 (with trailing ';...' removed) and the latest at index 4
            date = beautifulSoup.find(string=re.compile(r'Start Date.+')).split('-')
            start_year = re.sub(r'\;.+', '', date[2])
            latest_year = date[4]
        except Exception as e:
            # ROBUSTNESS: skip stations whose page lacks the expected
            # markers instead of aborting the whole crawl with an
            # AttributeError on a None find() result.
            print(f"Error parsing {sHref}: {e}")
            continue
        result.append([station, geoInfo, pageUrl, start_year, latest_year])
    dataframe = pd.DataFrame(result, columns=['station', 'geoInfo', 'pageUrl', 'start_year', 'latest_year'])
    dataframe.to_csv(area_file, index=False, sep=',', encoding='utf-8')
    return result
if __name__ == '__main__':
    # Only crawl the stations when the CSV has not been produced yet
    china_area_file = './aeroChinaGeo.csv'
    if not os.path.exists(china_area_file):
        station_list = get_stations(china_area_file)
结果展示
根据上述csv提取出的站点结果并进行分站点分年下载,以下为个人测试代码,下载时间为2005-2012年的数据
import os
import requests
from pathlib import Path
import pandas as pd
def download_aeronet_data(csv_file_path, download_dir_path):
    """Download yearly AERONET V3 zip archives for every station in a CSV.

    The CSV must contain 'station', 'start_year' and 'latest_year' columns
    (as produced by get_stations).  For each station, every year in the
    intersection of [start_year, latest_year] and [2005, 2012] is fetched
    from the AERONET zip_files_v3 area; files already on disk are skipped.

    Args:
        csv_file_path: Path of the station-list CSV.
        download_dir_path: Directory for the downloaded zip files
            (created if missing).
    """
    df = pd.read_csv(csv_file_path)
    download_dir = Path(download_dir_path)
    download_dir.mkdir(parents=True, exist_ok=True)
    for _, row in df.iterrows():
        station, first, latest = row['station'], str(row['start_year']), str(row['latest_year'])
        # Clamp the station's data span to the 2005-2012 window of interest
        for year in range(max(int(first), 2005), min(int(latest), 2012) + 1):
            filename = f"{year}0101_{year}1231_{station}.zip"
            filepath = download_dir / filename
            # BUG FIX: the URL and the messages below previously contained a
            # broken placeholder instead of interpolating the file name.
            url = f'https://aeronet.gsfc.nasa.gov/zip_files_v3/{filename}'
            if filepath.exists():
                print(f"File already exists: {filename}")
                continue
            try:
                # Stream the response so large archives are written in chunks
                response = requests.get(url, stream=True)
                if response.status_code == 200:
                    with open(filepath, 'wb') as file:
                        for chunk in response.iter_content(chunk_size=8192):
                            file.write(chunk)
                    print(f"Successfully downloaded {filename}")
                else:
                    print(f"Failed to download {filename}. Status code: {response.status_code}")
            except Exception as e:
                print(f"Error downloading {filename}: {e}")
if __name__ == '__main__':
    # Station-list CSV produced by get_stations() and the output folder
    # for the yearly zip archives.
    download_aeronet_data('./aeroChinaGeo.csv', './aeronet_data_test')
部分输出结果如下