python LAADS+Selenium下载MODIS数据

最新推荐文章于 2023-08-15 14:45:29 发布

一只大笨猪

最新推荐文章于 2023-08-15 14:45:29 发布

阅读量635

点赞数

分类专栏：卫星数据下载文章标签： python selenium

本文链接：https://blog.csdn.net/qq_33339770/article/details/117653274

版权

卫星数据下载专栏收录该内容

5 篇文章 0 订阅

订阅专栏

说明：
1、NASA数据的存放地址参照下图，各类数据存放在不同的网站，MODIS的L1级数据存放在LAADS网站上，此次从此网站进行数据查询和下载，需要在Earthdata网站进行注册并登陆，登陆之后在profile拿到token。
在这里插入图片描述
图片链接：https://earthdata.nasa.gov/eosdis/daacs
2、LAADS数据查询需要在网页上选择各项参数，然后显示数据列表，数据列表可下载为csv或者json文件。具体查询可以直接到网站进行操作体验：https://ladsweb.modaps.eosdis.nasa.gov/search/
3、产品号是构建查询URL时用的，不清楚的话直接在网站上查询，查看网站的地址，数据查询用到了MODIS的轨道文件，通过selenium进行网页操作并下载csv（需要谷歌浏览器并安装chromedriver，不细说），MODIS轨道文件在网上可以找到。csv文件打开长这样：
在这里插入图片描述
4、下载地址照样是建立了下层的年份文件夹存储下载文件，下载地址：https://ladsweb.modaps.eosdis.nasa.gov
5、数据下载本来采用的是下面这个脚本，这样下载没有进度显示，参考ASF API的数据下载chunk_read（）加了进度展示，不想看进度的话直接用下面这个就行。

def geturl(url, token=None, out=None):
    headers = { 'user-agent' : USERAGENT }
    if not token is None:
        headers['Authorization'] = 'Bearer ' + token
    try:
        import ssl
        CTX = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
        if sys.version_info.major == 2:
            import urllib2
            try:
                fh = urllib2.urlopen(urllib2.Request(url, headers=headers), context=CTX)
                if out is None:
                    return fh.read()
                else:
                    shutil.copyfileobj(fh, out)
            except urllib2.HTTPError as e:
                print('HTTP GET error code: %d' % e.code(), file=sys.stderr)
                print('HTTP GET error message: %s' % e.message, file=sys.stderr)
            except urllib2.URLError as e:
                print('Failed to make request: %s' % e.reason, file=sys.stderr)
            return None

        else:
            from urllib.request import urlopen, Request, URLError, HTTPError
            try:
                fh = urlopen(Request(url, headers=headers), context=CTX)
                if out is None:
                    return fh.read().decode('utf-8')
                else:
                    shutil.copyfileobj(fh, out)
            except HTTPError as e:
                print('HTTP GET error code: %d' % e.code(), file=sys.stderr)
                print('HTTP GET error message: %s' % e.message, file=sys.stderr)
            except URLError as e:
                print('Failed to make request: %s' % e.reason, file=sys.stderr)
            return None

    except AttributeError:
        # OS X Python 2 and 3 don't support tlsv1.1+ therefore... curl
        import subprocess
        try:
            args = ['curl', '--fail', '-sS', '-L', '--get', url]
            for (k,v) in headers.items():
                args.extend(['-H', ': '.join([k, v])])
            if out is None:
                # python3's subprocess.check_output returns stdout as a byte string
                result = subprocess.check_output(args)
                return result.decode('utf-8') if isinstance(result, bytes) else result
            else:
                subprocess.call(args, stdout=out)
        except subprocess.CalledProcessError as e:
            print('curl GET error message: %' + (e.message if hasattr(e, 'message') else e.output), file=sys.stderr)
        return None

完整下载代码：

from selenium import webdriver
from time import sleep
import tempfile
import os,sys
import pandas as pd
import geopandas as gpd
import time

# 构建查询地址
def GetURL(ProductID,StartTime,EndTime,search_file):
    #查询边界
    data = gpd.GeoDataFrame.from_file(search_file)
    bbox = (data.bounds.values)[0].tolist()
    #研究区范围，左上角和右下角。根据需要构造字符串
    Area = str(round(bbox[0],1))+','+str(round(bbox[3],1))+','+str(round(bbox[2],1))+','+str(round(bbox[1],1))
    # 输入MODIS轨道矢量
    modis_grid_file = 'E:\***\modis_WGS84_grid_world.shp'
    modis_grid = gpd.GeoDataFrame.from_file(modis_grid_file)
    # 查询边界覆盖的轨道中心坐标
    modis_intersection = modis_grid[modis_grid.intersects(data.geometry[0])]
    path_row = 'Tile:'
    for mv in modis_intersection.values:
        path_row += "H"+str(mv[1])+"V"+str(mv[2])+","
    #根据以上信息构建Search页的网址
    path_row = path_row[0:-1]
    url='https://ladsweb.modaps.eosdis.nasa.gov/search/order/4/'+ProductID+StartTime+'..'+EndTime+'/DB/'+path_row#Area
    return url

# 使用Selenium查询影像
def SearchFileList(url):
    # 创建文件夹，命名规则为程序运行的时刻
    # 将使用selenium下载的文件使用该文件夹存储
    csvdir = 'E:\\***\\' + str(time.time()).replace('.','')
    os.mkdir(csvdir)
    #配置selenium的参数
    options = webdriver.ChromeOptions()
    prefs = {'profile.default_content_settings.popups': 0, 'download.default_directory': csvdir}
    options.add_experimental_option('prefs', prefs)
    chromedriver = r"C:\***\Google\Chrome\Application\chromedriver.exe"#chromedriver.exe的本地路径
    # options.add_argument('--headless')  #有无浏览器界面模式，根据需要设置
    driver = webdriver.Chrome(executable_path=chromedriver,options=options)
    #自动打开Search页
    driver.get(url)
    #浏览器打开Search页后，还要留足时间让服务器进行数据检索
    #所以这里sleep50秒，可以根据网速自行设定
    sleep(50)
    #当然也可以判断搜索结果，也就是包含csv的标签是否出现
    # WebDriverWait(driver, 

    #下载csv文件
    # 找到文本包含csv的标签
    # csvElement = driver.find_element_by_link_text('csv')
    csvElement = driver.find_element_by_xpath('// *[ @ id = "tab4download"] / a[2]')
    # 点击下载
    csvElement.click()
    # 留下下载csv文件的时间
    sleep(20)
    #关闭浏览器
    driver.quit()
    return csvdir

# 下载影像
def MODISDown(FileDir):

    # 获取下载的csv文件的文件名
    csvfilename = os.listdir(FileDir)[0]
    # 构造文件路径
    csvfilepath = os.path.join(FileDir, csvfilename)
    # print(csvfilepath)
    csvvalues = pd.read_csv(csvfilepath).values
    os.remove(csvfilepath)
    os.rmdir(FileDir)
    # 下载数据
    file_count = 0
    for cv in csvvalues:
        file_count += 1
        #构建数据的下载链接
        modislink='https://ladsweb.modaps.eosdis.nasa.gov'+cv[1]
        outdir = 'E:/***/MODIS/'+(cv[1].split("/"))[5]
        # outdir = 'E:/Temp/' + (cv[1].split("/"))[5]
        if not os.path.isdir(outdir):
            os.mkdir(outdir)
        path = outdir + '/' + (cv[1].split("/"))[7]
        if not os.path.exists(path):
            print("({0}/{1}) Downloading {2}".format(file_count, len(csvvalues), modislink.split("/")[-1]))
            with open(path, 'w+b') as out:
                geturl(modislink, out)

# 获取下载链接并下载影像数据
def geturl(url,out=None):
    USERAGENT = 'tis/download.py_1.0--' + sys.version.replace('\n', '').replace('\r', '')
    headers = { 'user-agent' : USERAGENT }
    token = '******' # 你的token,可登陆Earthdata网站后在profile中得到
    headers['Authorization'] = 'Bearer ' + token
    try:
        import ssl
        CTX = ssl.SSLContext(ssl.PROTOCOL_TLSv1_2)
        from urllib.request import urlopen, Request, URLError, HTTPError
        try:
            response = urlopen(Request(url, headers=headers), context=CTX)
            if out is None:
                return response.read().decode('utf-8')
            else:
                start = time.time()
                # 将连接中的下载文件写入临时文件 并返回文件写入进度
                chunk_read(response, out, report_hook=chunk_report)
                elapsed = max(time.time() - start,1.0)
                # 平均下载速度
                rate = (get_total_size(response) / 1024 ** 2) / elapsed
                print("Downloaded {0}b in {1:.2f}secs, Average Rate: {2:.2f}MB/sec".format(get_total_size(response), elapsed, rate))
                # shutil.copyfileobj(response, out)
        except HTTPError as e:
            print('HTTP GET error code: %d' % e.code(), file=sys.stderr)
            print('HTTP GET error message: %s' % e.message, file=sys.stderr)
        except URLError as e:
            print('Failed to make request: %s' % e.reason, file=sys.stderr)
        return None

    except AttributeError:
        # OS X Python 2 and 3 don't support tlsv1.1+ therefore... curl
        import subprocess
        try:
            args = ['curl', '--fail', '-sS', '-L', '--get', url]
            for (k,v) in headers.items():
                args.extend(['-H', ': '.join([k, v])])
            if out is None:
                # python3's subprocess.check_output returns stdout as a byte string
                result = subprocess.check_output(args)
                return result.decode('utf-8') if isinstance(result, bytes) else result
            else:
                subprocess.call(args, stdout=out)
        except subprocess.CalledProcessError as e:
            print('curl GET error message: %' + (e.message if hasattr(e, 'message') else e.output), file=sys.stderr)
        return None

#  chunk_read modified from http://stackoverflow.com/questions/2028517/python-urllib2-progress-hook
def chunk_read( response, local_file, chunk_size=10240, report_hook=None):
    # 完整文件大小
    file_size = get_total_size(response)
    # 下载文件大小
    bytes_so_far = 0
    # 文件写入本地
    while 1:
        try:
            # 从地址中读取固定大小文件对象
            chunk = response.read(chunk_size)
        except:
            sys.stdout.write("\n > There was an error reading data. \n")
            break

        try:
            # 将读取出的文件对象写入本地文件
            local_file.write(chunk)
        except TypeError:
            local_file.write(chunk.decode(local_file.encoding))
        # 写入完成即更新已下载文件大小
        bytes_so_far += len(chunk)

        if not chunk:
            break
        if report_hook:
            # 获取下载进度
            report_hook(bytes_so_far, file_size)

    return bytes_so_far

def chunk_report( bytes_so_far, file_size):
    if file_size is not None:
        # 计算下载进度百分比
        percent = float(bytes_so_far) / file_size
        percent = round(percent * 100, 2)
        sys.stdout.write(" > Downloaded %d of %d bytes (%0.2f%%)\r" %
                         (bytes_so_far, file_size, percent))
    else:
        # We couldn't figure out the size.
        sys.stdout.write(" > Downloaded %d of unknown Size\r" % (bytes_so_far))

def get_total_size(response):
   try:
      file_size = response.info().getheader('Content-Length').strip()
   except AttributeError:
      try:
         file_size = response.getheader('Content-Length').strip()
      except AttributeError:
         print ("> Problem getting size")
         return None
   return int(file_size)

if __name__ == "__main__":
    # 定义要下载数据的信息
    ProductID = 'MOD021KM--61/'  # 产品号#sys.argv[1]#
    # 设置数据的起始和截至时间。其实就是根据需要构造一个简单的字符串
    StartTime = '2020-06-01'  # 开始时间#sys.argv[2]#
    EndTime = '2020-06-03'  # 截至日期#sys.argv[3]#
    search_file = r'E:\***\北京市.shp'  # 查询范围#sys.argv[4]#

    # 构建查询地址
    url = GetURL(ProductID,StartTime,EndTime,search_file)
    # 获取数据列表
    csvdir = SearchFileList(url)
    # 根据列表下载数据
    MODISDown(csvdir)