[爬虫] 哨兵数据的精轨数据|自动下载|爬取链接并下载文件

最新推荐文章于 2024-07-30 14:00:44 发布

geodoer

最新推荐文章于 2024-07-30 14:00:44 发布

阅读量1.1w

点赞数 7

分类专栏： python 遥感文章标签： python 爬虫

本文链接：https://blog.csdn.net/summer_dew/article/details/79053421

版权

遥感同时被 2 个专栏收录

41 篇文章 10 订阅

订阅专栏

python

25 篇文章 3 订阅

订阅专栏

功能：自动下载文件夹下哨兵数据对应的轨道数据

以下载哨兵的精轨数据为例：

网址：https://qc.sentinel1.eo.esa.int/aux_poeorb/

url搜索参数：

key	value	例子
mission	哨兵1A：S1A 哨兵1B：S1B	查询1A的精轨数据： https://qc.sentinel1.eo.esa.int/aux_poeorb/?mission=S1A
validity_start_time	年：2014 月：2014-01 日：2017-04-07..2017-04-10	查询2017-03-16到2017-03-19的精轨数据： https://qc.sentinel1.eo.esa.int/aux_poeorb/?validity_start_time=2017&validity_start_time=2017-03&validity_start_time=2017-03-16..2017-03-19

搜索参数的问题：

这里写图片描述
表单中下载的链接为：
https://qc.sentinel1.eo.esa.int/aux_poeorb/S1B_OPER_AUX_POEORB_OPOD_20170221T111232_V20170131T225942_20170202T005942.EOF

Python代码：

# -*- coding:utf-8 -*-
# Author:PasserQi
# Time:2019-4-5
# 下载文件夹下哨兵数据的精轨数据
# 须知：文件夹下的哨兵数据需解压。不想解压可以修改下方的参数 FILE_TYPE，将 .SAFE 改为 .zip
import urllib
from bs4 import BeautifulSoup
import re
import os
import datetime
import time

# User-editable parameters
dir_path = r'G:\Sentinel-original data\Orbit40-path40\Frame75-11\added_20180105\SourceData' # directory that holds the Sentinel products to scan
out_path = r'C:\Users\PasserQi\Desktop' # directory where the precise-orbit files (or the link list) are saved
FILE_TYPE = ".SAFE" # product suffix to look for: .SAFE or .zip
IsDownload = True # True: download the files; False: only write the links to a text file

# Shared state filled in while the script runs
download_urls = []  # EOF links collected when IsDownload is False
error_url = []  # URLs whose download failed (maintained by download())
url_prefix = 'https://qc.sentinel1.eo.esa.int/aux_poeorb/' # base address of the precise-orbit service
def download(dest_dir, url):
    """Download ``url`` and save it to the local file ``dest_dir``.

    Despite its name, ``dest_dir`` is the full path of the output file: it is
    passed unchanged to ``urllib.urlretrieve`` as the target filename.
    Progress is reported through ``callbackfunc``.

    The module-level ``error_url`` list is kept up to date: a URL that fails
    is recorded there (once), and a URL that succeeds is removed from it.

    @dest_dir: path of the file to write
    @url: address of the remote file
    """
    print("正在下载：{}\n\t至{}\n".format(url, dest_dir))
    try:
        urllib.urlretrieve(url, dest_dir, callbackfunc)
    except Exception:
        # Catch Exception rather than using a bare except so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit can still stop the script.
        if url not in error_url:
            error_url.append(url)
        # Report the URL that failed, not the local destination path.
        print('\tError retrieving the URL: {}'.format(url))
    else:  # no exception raised
        print("\t[done]")
        if url in error_url:  # a previously failed URL has now succeeded
            error_url.remove(url)
def callbackfunc(blocknum, blocksize, totalsize):
    '''Progress hook for urlretrieve: print how much has been downloaded.

    @blocknum: number of blocks transferred so far
    @blocksize: size of one block in bytes
    @totalsize: size of the remote file in bytes; zero or negative when the
        server does not report a size (urlretrieve documents -1 for that case)

    Prints the progress as a percentage capped at 100. When the total size is
    unknown, a percentage cannot be computed, so the byte count received so
    far is printed instead of dividing by a non-positive number.
    '''
    received = blocknum * blocksize
    if totalsize <= 0:
        print("%d bytes" % received)
        return
    # The last block is usually only partly filled, so cap the value at 100.
    percent = min(100.0 * received / totalsize, 100.0)
    print("%.2f%%" % percent)

def get_yestoday(mytime):
    """Return the calendar day before ``mytime``.

    @mytime: string whose first eight characters are a date in YYYYMMDD
        form; anything after those eight characters is ignored
    Returns the previous day as a YYYYMMDD string.
    """
    year, month, day = int(mytime[:4]), int(mytime[4:6]), int(mytime[6:8])
    previous = datetime.datetime(year, month, day) - datetime.timedelta(days=1)
    return previous.strftime('%Y%m%d')

if __name__ == '__main__':
    # For every Sentinel-1 product found in dir_path, query the orbit service
    # for the matching precise-orbit (.EOF) files, then either download them
    # to out_path or write their links to a text file, depending on IsDownload.
    files = os.listdir(dir_path)

    # Product names look like:
    #   S1A_IW_SLC__1SDV_20180201T101712_20180201T101742_020412_022E1C_43FD.SAFE

    for filename in files:
        if not filename.endswith(FILE_TYPE):
            continue

        # ###########################
        # Look up the EOF files using the information in the product name.

        # Build the query parameters.
        url_param_json = {}
        # The first three characters are used as the mission (e.g. S1A, S1B).
        url_param_json['sentinel1__mission'] = filename[0:3]

        # The first run of eight digits in the name is used as the date.
        date_match = re.search(r"\d{8}", filename)
        if date_match is None:
            # Without a date no query can be built; skip instead of crashing.
            print("跳过（文件名中没有日期）：{}".format(filename))
            continue

        # Author's note: a query for 20170316 returns the data of 20170317,
        # so the query date has to be one day earlier than the product date.
        date = get_yestoday(date_match.group())

        # Reformat for the query, e.g. 20170101 --> 2017-01-01
        url_param_json['validity_start'] = "{}-{}-{}".format(
            date[0:4], date[4:6], date[6:8])

        # Fetch the result page and collect the EOF links on it.
        url_param = urllib.urlencode(url_param_json)
        url = '%s?%s' % (url_prefix, url_param)
        print("url：{}".format(url))
        response = urllib.urlopen(url)
        try:
            # Name the parser explicitly so the result does not depend on
            # which optional parsers happen to be installed.
            dom = BeautifulSoup(response, "html.parser")
        finally:
            response.close()
        # href=True skips anchors that have no href attribute at all.
        eof_lists = [a['href'] for a in dom.findAll("a", href=True)
                     if a['href'].endswith('.EOF')]
        for eof in eof_lists:
            if IsDownload:
                eof_name = eof.split('/')[-1]  # file name part of the link
                savefile = os.path.join(out_path, eof_name)
                download(savefile, eof)
            else:
                download_urls.append(eof)

    if IsDownload:
        print("------------------------------------")
        print("开始下载出错的数据")
        # Retry the failed downloads. The number of rounds is bounded so that
        # a permanently unavailable file cannot keep the script running forever.
        max_retry_rounds = 5
        retry_round = 0
        while error_url and retry_round < max_retry_rounds:
            retry_round += 1
            print("出错的数据有")
            print(error_url)
            # Work on a de-duplicated snapshot and reset the shared list:
            # download() modifies error_url, so it must not be iterated
            # directly, and failures of this round are re-added by download().
            pending = []
            for failed in error_url:
                if failed not in pending:
                    pending.append(failed)
            del error_url[:]
            for failed in pending:
                # error_url holds the same full URLs that were passed to
                # download(), so reuse them as-is and derive the file name
                # exactly as in the first pass.
                savefile = os.path.join(out_path, failed.split('/')[-1])
                download(savefile, failed)
        if error_url:
            print("以下文件下载失败：")
            print(error_url)
        else:
            print("全部下载成功，无出错文件")
    else:
        # Download disabled: save the collected links, one per line.
        with open(os.path.join(out_path, u"下载链接.txt"), "w") as f:
            for eof in download_urls:
                f.write(eof)
                f.write("\n")