Selenium自动下载MERRAero数据
由于课题需要MERRAero数据,但是需要的变量及年份有点多,下载地址见下文代码中的链接
这边下载时对数据的大小及时间步有着比较严格的要求,没有办法一次性下载多个数据集,只能一次次地选择,但是太麻烦了,所以尝试用Selenium下载数据
图片中箭头指向的部位就是需要处理的部位,按F12打开开发者工具查看源代码,定位具体元素
变量选择
from selenium import webdriver
from selenium.webdriver.common.by import By

# Start a Chrome session and open the MERRAero web form.
driver = webdriver.Chrome()
driver.get('https://portal.nccs.nasa.gov/cgi-lats4d/webform.cgi?&i=GEOS-5/MERRAero/hourly/inst1hr_2d_hwl_Nx')
# Click the input whose `value` attribute is "all".
# find_element(By..., ...) is used because the find_element_by_* helpers
# are no longer available in newer Selenium releases.
all_variables = driver.find_element(By.CSS_SELECTOR, "input[value=all]")
all_variables.click()
时间选择
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

# (select element name, visible option text) pairs, in the order they are
# applied: the start of the period first, then its end.
time_range = [
    ("year", "2015"),
    ("month", "Jan"),
    ("day", "01"),
    ("hour", "00"),
    ("yearend", "2020"),
    ("monthend", "Dec"),
    ("dayend", "31"),
    ("hourend", "23"),
]
for select_name, option_text in time_range:
    # Locate each drop-down by its `name` attribute and pick the option by label.
    dropdown = Select(driver.find_element(By.CSS_SELECTOR, "select[name={}]".format(select_name)))
    dropdown.select_by_visible_text(option_text)
数据范围选择
from selenium.webdriver.common.by import By

# Click the element with id "boxToggle" before entering the bounds.
driver.find_element(By.ID, "boxToggle").click()

# (input element name, text to type) pairs for the bounding box.
bounds = [
    ("NorthLatitude", "43"),
    ("SouthLatitude", "35"),
    ("WestLongitude", "112"),
    ("EastLongitude", "120"),
]
for input_name, bound_text in bounds:
    field = driver.find_element(By.CSS_SELECTOR, "input[name={}]".format(input_name))
    field.clear()  # remove whatever value is already in the box
    field.send_keys(bound_text)

# Update the displayed latitude/longitude range.
driver.find_element(By.CSS_SELECTOR, "input[id=mapUpdateBtn]").click()
格式选择
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select

# Choose the output data format.
format_dropdown = Select(driver.find_element(By.CSS_SELECTOR, "select[name=format]"))
format_dropdown.select_by_visible_text("netCDF4")
# Start the download.
download_button = driver.find_element(By.CSS_SELECTOR, "input[value=Download]")
download_button.click()
结束
加上对应变量的for循环及数据保存路径的更换,最终变成这样子
import calendar
import datetime
import os
import time

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from tqdm import *
# Years to request; each entry is selected by its visible text in the year drop-downs.
year = ["2015", "2016", "2017", "2018", "2019", "2020"]
# Flattened (start, end) month pairs: Jan-Apr, May-Aug and Sep-Dec.
month = ["Jan", "Apr", "May", "Aug", "Sep", "Dec"]
# Variables to download; each entry is matched against an input's `value` attribute.
data = ["dusmass", "sssmass", "dusmass25", "sssmass25", "so4smass", "so2smass", "bcsmass", "ocsmass"]

FORM_URL = 'https://portal.nccs.nasa.gov/cgi-lats4d/webform.cgi?&i=GEOS-5/MERRAero/hourly/inst1hr_2d_hwl_Nx'
CHROMEDRIVER_PATH = r'C:\Program Files\Google\Chrome\Application\chromedriver.exe'
OUTPUT_ROOT = r'E:\MERRAero'

# Month labels in calendar order; the position gives the month number.
MONTH_ABBREVIATIONS = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                       "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")
# Pause after every form interaction, as in the original step-by-step script.
STEP_PAUSE_SECONDS = 5
# Upper bound on how long to wait for one download before giving up.
DOWNLOAD_TIMEOUT_SECONDS = 600
# File suffixes Chrome uses while a download is still being written.
PARTIAL_DOWNLOAD_SUFFIXES = (".crdownload", ".tmp")


def last_day_of_month(year_text, month_abbr):
    """Return the last day of a month as a two-digit string.

    Args:
        year_text: Year as text, e.g. "2016".
        month_abbr: Three-letter month label from MONTH_ABBREVIATIONS.

    Returns:
        "28", "29", "30" or "31".

    Raises:
        ValueError: If month_abbr is not a known label or year_text is not
            an integer.
    """
    month_number = MONTH_ABBREVIATIONS.index(month_abbr) + 1
    day_count = calendar.monthrange(int(year_text), month_number)[1]
    return "{:02d}".format(day_count)


def wait_for_download(directory, already_present=(),
                      timeout=DOWNLOAD_TIMEOUT_SECONDS, poll_interval=1.0):
    """Wait until a new, fully written file shows up in *directory*.

    A download counts as finished when at least one file that was not in
    *already_present* exists and no new partial-download file remains.

    Args:
        directory: Folder Chrome downloads into.
        already_present: File names that existed before the download was
            started; they are ignored so that files left over from an
            earlier run are not mistaken for the new download.
        timeout: Maximum number of seconds to wait.
        poll_interval: Seconds to sleep between directory scans.

    Returns:
        True if a download finished in time, False on timeout.
    """
    known = set(already_present)
    deadline = time.monotonic() + timeout
    while True:
        new_names = [name for name in os.listdir(directory) if name not in known]
        still_writing = any(name.endswith(PARTIAL_DOWNLOAD_SUFFIXES) for name in new_names)
        finished = any(not name.endswith(PARTIAL_DOWNLOAD_SUFFIXES) for name in new_names)
        if finished and not still_writing:
            return True
        if time.monotonic() >= deadline:
            return False
        time.sleep(poll_interval)


def select_option(driver, select_name, option_text):
    """Pick *option_text* in the drop-down named *select_name*, then pause."""
    dropdown = Select(driver.find_element(By.CSS_SELECTOR, "select[name={}]".format(select_name)))
    dropdown.select_by_visible_text(option_text)
    time.sleep(STEP_PAUSE_SECONDS)


def fill_text_input(driver, input_name, text):
    """Replace the content of the input named *input_name*, then pause."""
    field = driver.find_element(By.CSS_SELECTOR, "input[name={}]".format(input_name))
    field.clear()  # remove whatever value is already in the box
    field.send_keys(text)
    time.sleep(STEP_PAUSE_SECONDS)


def fill_request_form(driver, variable, year_text, start_month, end_month):
    """Fill in the variable, time range, region and format on the open form."""
    # Choose the variable.
    driver.find_element(By.CSS_SELECTOR, "input[value={}]".format(variable)).click()

    # Start time.
    select_option(driver, "year", year_text)
    select_option(driver, "month", start_month)
    select_option(driver, "day", "01")
    select_option(driver, "hour", "00")

    # End time.
    select_option(driver, "yearend", year_text)
    select_option(driver, "monthend", end_month)
    select_option(driver, "dayend", last_day_of_month(year_text, end_month))
    select_option(driver, "hourend", "23")

    # Pan or draw box.
    driver.find_element(By.ID, "boxToggle").click()
    time.sleep(STEP_PAUSE_SECONDS)

    # Latitude/longitude bounds.
    fill_text_input(driver, "NorthLatitude", "43")
    fill_text_input(driver, "SouthLatitude", "35")
    fill_text_input(driver, "WestLongitude", "112")
    fill_text_input(driver, "EastLongitude", "120")

    # Update the displayed latitude/longitude range.
    driver.find_element(By.CSS_SELECTOR, "input[id=mapUpdateBtn]").click()
    time.sleep(STEP_PAUSE_SECONDS)

    # Choose the data format.
    select_option(driver, "format", "netCDF4")


def download_subset(variable, year_text, start_month, end_month):
    """Download one variable for one block of months into its own folder.

    The folder is OUTPUT_ROOT/<variable>/<year><start_month>. A fresh browser
    is started for every request because the download directory is set
    through the browser preferences, and it is always closed afterwards.

    Returns:
        True if a new file finished downloading, False if the wait timed out.
    """
    out_path = os.path.join(OUTPUT_ROOT, variable, year_text + start_month)
    os.makedirs(out_path, exist_ok=True)

    options = webdriver.ChromeOptions()
    prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': out_path}
    options.add_experimental_option('prefs', prefs)

    # NOTE(review): `executable_path` is kept for the Selenium 3 style API this
    # script was written against; newer Selenium releases expect a Service
    # object instead - confirm against the installed version.
    driver = webdriver.Chrome(executable_path=CHROMEDRIVER_PATH, options=options)
    try:
        # Open the page.
        driver.get(FORM_URL)
        fill_request_form(driver, variable, year_text, start_month, end_month)

        # Remember what is already in the folder so only the new file counts.
        existing_files = os.listdir(out_path)
        # Start the download.
        driver.find_element(By.CSS_SELECTOR, "input[value=Download]").click()
        # Keep the browser open until the file is complete; quitting while a
        # partial file is still being written would abort the download.
        return wait_for_download(out_path, already_present=existing_files)
    finally:
        # Close this request's browser even if a step above failed.
        driver.quit()


def main():
    """Download every variable for every year, one block of months at a time."""
    month_pairs = list(zip(month[::2], month[1::2]))
    for variable in data:
        for year_text in tqdm(year):
            for start_month, end_month in month_pairs:
                if not download_subset(variable, year_text, start_month, end_month):
                    print("Download did not finish in time: {} {} {}-{}".format(
                        variable, year_text, start_month, end_month))


if __name__ == "__main__":
    main()
这里依旧存在问题(在一个变量的选择结束后,数据下载要是未完成应该会直接关闭,这就需要后续手动操作几次了),不过暂时是够用的了。