建了一个QQ群,大家可以在里边聊聊水色遥感数据下载和数据处理方面的事情:1087024529
GOCI数据好久不更新了,不知道咋回事。
闲着没事,把GOCI的所有下载链接都爬下来了。
代码:
#-*-coding: utf-8-*-
#@Author: zhaohaiyang
#@E-mail: hyzhao_rs@163.com
#获取GOCI在OceanColor上的所有下载链接
#获取GOCI所有快视图链接
#GOCI所有文件名
#
##########################################################
import http.client
import io
import os
import urllib
import urllib.request

import pandas as pd
import requests
from bs4 import BeautifulSoup
def download(url, num, timeout=60):
    """Fetch *url* and return the raw response body, retrying on failure.

    Args:
        url: Address to fetch; handed to ``urllib.request.urlopen``.
        num: Number of retries allowed after the first attempt, so at most
            ``num + 1`` requests are made. Zero or a negative value means a
            single attempt.
        timeout: Timeout in seconds passed to ``urlopen`` for each attempt,
            so a stalled connection counts as a failed attempt instead of
            blocking forever.

    Returns:
        The response body as ``bytes``, or ``None`` when every attempt failed.
    """
    for _ in range(max(num, 0) + 1):
        try:
            # The context manager closes the response even if read() fails.
            with urllib.request.urlopen(url, timeout=timeout) as response:
                return response.read()
        except ValueError:
            # Malformed URL: another attempt cannot succeed.
            return None
        except (OSError, http.client.HTTPException):
            # Network-level failure (URLError and socket timeouts are
            # OSError subclasses; a truncated body is an HTTPException).
            # KeyboardInterrupt is deliberately not caught so Ctrl-C works.
            continue
    return None
# Collect every GOCI L1 download link listed on OceanColor and write them,
# one URL per line, to a text file.
rootUrl = 'https://oceandata.sci.gsfc.nasa.gov/directaccess/GOCI/L1/'
# Fetch the root listing through the retrying helper, like every other page.
rootHTML = download(rootUrl, 5)
if rootHTML is None:
    raise OSError('failed to fetch the GOCI root listing: ' + rootUrl)
# read_html is given a file-like object: passing raw HTML text/bytes directly
# is deprecated in recent pandas releases.
yearDataFrame = pd.read_html(io.BytesIO(rootHTML))[0]
yearList = list(yearDataFrame['Directory'])
with open('H:/All_GOCI_Data_URL.txt', 'w', encoding='utf-8') as gb:
    for year in yearList:
        yearUrl = rootUrl + year
        yearHTML = download(yearUrl, 5)
        if yearHTML is None:
            # Report the year whose listing could not be fetched and move on.
            print(year)
            continue
        dayDataFrame = pd.read_html(io.BytesIO(yearHTML))[0]
        dayList = list(dayDataFrame['Directory'])
        for day in dayList:
            # NOTE(review): keeps the three characters before the last one of
            # the listing entry - presumably the day-of-year followed by a
            # trailing slash; confirm against the actual listing format.
            dayUrl = yearUrl + day[-4:-1]
            dayHTML = download(dayUrl, 5)
            if dayHTML is None:
                # Report the day whose listing could not be fetched.
                print(' ' + day)
                continue
            tempDataFrame = pd.read_html(io.BytesIO(dayHTML))[0]
            tempDataNameList = list(tempDataFrame['Filename'])
            for tdnl in tempDataNameList:
                dataUrl = 'https://oceandata.sci.gsfc.nasa.gov/goci/getfile/' + tdnl
                # write(), not writelines(): the argument is a single string.
                gb.write(dataUrl + '\n')