1. Extract the imaging dates (pandas)
import os
import pandas as pd

def writefilename2excel(folder_path, excel1_path, txt_path):
    # List the image files in the folder
    folder = os.listdir(folder_path)
    satellite_list = []  # satellite identifier
    image_list = []      # imaging (acquisition) mode
    product_list = []    # product type
    time_list = []       # imaging date
    moment1_list = []    # acquisition start time
    moment2_list = []    # acquisition stop time
    orbit_list = []      # absolute orbit number
    task_list = []       # mission data take identifier
    code_list = []       # unique product identifier
    for file in folder:
        parts = file.split('_')
        satellite = parts[0]              # satellite identifier
        satellite_list.append(satellite)
        image = parts[1]                  # imaging mode
        image_list.append(image)
        product = parts[2]                # product type
        product_list.append(product)
        time = parts[5].split('T')[0]     # imaging date
        time_list.append(time)
        moment1 = parts[5].split('T')[1]  # acquisition start time
        moment1_list.append(moment1)
        moment2 = parts[6].split('T')[1]  # acquisition stop time
        moment2_list.append(moment2)
        orbit = parts[7]                  # absolute orbit number
        orbit_list.append(orbit)
        task = parts[8]                   # mission data take identifier
        task_list.append(task)
        code = parts[9]                   # unique product identifier
        code_list.append(code)
    all_list = {
        'satellite_list': satellite_list,
        'image_list': image_list,
        'product_list': product_list,
        'time_list': time_list,
        'moment1_list': moment1_list,
        'moment2_list': moment2_list,
        'orbit_list': orbit_list,
        'task_list': task_list,
        'code_list': code_list,
    }
    df = pd.DataFrame(all_list)
    df.to_excel(excel1_path, index=False)
    # Write the imaging dates to a txt file, one date per line
    with open(txt_path, 'w') as f:
        f.write('\n'.join(time_list))

if __name__ == '__main__':
    folder = r''
    excel = r''
    txt = r''
    writefilename2excel(folder, excel, txt)
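For reference, the split indices used above map onto a standard Sentinel-1 SLC product name as follows. The name below is a made-up example that only illustrates the naming convention, not a real product:

name = 'S1A_IW_SLC__1SDV_20200103T101010_20200103T101037_030698_038401_A1B2.SAFE'
parts = name.split('_')           # the double underscore after 'SLC' leaves an empty element at index 3
print(parts[0])                   # S1A        -> satellite
print(parts[1])                   # IW         -> imaging mode
print(parts[2])                   # SLC        -> product type
print(parts[5].split('T')[0])     # 20200103   -> imaging date
print(parts[5].split('T')[1])     # 101010     -> acquisition start time
print(parts[6].split('T')[1])     # 101037     -> acquisition stop time
print(parts[7])                   # 030698     -> absolute orbit number
print(parts[8])                   # 038401     -> mission data take identifier
print(parts[9])                   # A1B2.SAFE  -> unique product identifier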
2. Generate the date CSV file (Excel)
In Excel, create a date .csv with one acquisition date per row and no header row, and save it as the "CSV UTF-8" file type.
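If you would rather skip Excel, a minimal sketch of the same step with pandas is shown below; it assumes the txt file written in step 1 (one date per line) and uses placeholder paths:

import pandas as pd

# Read the dates produced by step 1 (one per line, no header) and write them
# back out as a single-column CSV with no header row, UTF-8 encoded.
dates = pd.read_csv(r'D:\python\datatime.txt', header=None)
dates.to_csv(r'D:\python\datatime.csv', header=False, index=False, encoding='utf-8')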
3. Get the cookie (HTML)
Open the page source / request headers of 'https://s1qc.asf.alaska.edu/aux_poeorb/' in your browser and copy the Cookie string; it is pasted into the 'Cookie' header fields of the crawler in step 4.
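A quick way to confirm the copied cookie is accepted (a minimal sketch; COOKIE is a placeholder for the string you copied):

from urllib.request import Request, build_opener

COOKIE = ''  # paste the Cookie string copied from the browser here
url = 'https://s1qc.asf.alaska.edu/aux_poeorb/'
req = Request(url, headers={'User-Agent': 'Mozilla/5.0', 'Cookie': COOKIE})
resp = build_opener().open(req)
# A 200 status and '.EOF' links in the body mean the crawler in step 4 will authenticate
print(resp.status, '.EOF' in resp.read().decode())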
4. Data crawling (web crawler)
from urllib.request import Request, build_opener
from urllib.error import URLError
from my_fake_useragent import UserAgent
from time import sleep
import re
import datetime
import pandas
import os
from dateutil.parser import parse
'''
Notes:
1. This program downloads data from 'https://s1qc.asf.alaska.edu/aux_poeorb/'.
2. It needs a csv file whose first column lists the image acquisition dates, one per row (see step 2).
3. First thing to change: the 'Cookie' value in the headers of 'get_info_href()' and 'DownLoad_POD()'.
4. Second thing to change: the csv file path in 'get_datetime()', e.g. 'D:\python\datatime.csv'.
5. The precise orbit files are saved to a POD folder under the directory the program runs in;
   the folder is created automatically and its path is printed to the console.
'''
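# For orientation: the pattern built in get_info_href() matches precise-orbit file names
# whose validity window brackets the acquisition date, e.g. (the timestamps below are a
# made-up illustration, not a real file):
#   S1A_OPER_AUX_POEORB_OPOD_20200123T120706_V20200102T225942_20200104T005942.EOF
# For an image acquired on 2020-01-03, the 'V' start date is the previous day (20200102)
# and the end date is the following day (20200104), which is why the code below shifts
# each csv date by +/- one day before inserting it into the regular expression.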
def get_info_href():  # use a regular expression to find the file name of each precise orbit file
    try:
        ua = UserAgent(family='chrome')
        UA = ua.random()  # pick a random User-Agent string
        headers = {'User-Agent': UA,
                   'Cookie': ''}
        url = 'https://s1qc.asf.alaska.edu/aux_poeorb/'  # page to request
        time = get_datetime()  # load the acquisition dates from the csv file
        day = datetime.timedelta(days=1)  # one-day offset
        href = []  # collects the matched precise-orbit file names
        for i in time:  # loop over the acquisition dates
            # Regular expression for the POEORB naming convention; change S1A/S1B as needed.
            # (i - day) and (i + day) are the day before and after the acquisition date,
            # because the orbit file's validity window brackets the acquisition date.
            pattern = (rf'S1A_OPER_AUX_POEORB_OPOD_\d+\w\d+_'
                       rf'\w{(i - day).strftime("%Y%m%d")}\w\d+_'
                       rf'{(i + day).strftime("%Y%m%d")}\w\d+\.EOF')
            req = Request(url, headers=headers)  # build the request
            sleep(2)  # wait two seconds to avoid being flagged by the site
            opener = build_opener()
            result = opener.open(req)
            a = result.read().decode()  # the full directory listing as one string
            result1 = re.search(pattern, a)  # match the orbit file name for this date
            if result1:
                href.append(result1.group())  # store the matched file name
            else:
                print(f'No precise orbit file found for {i}')
        return href  # list of matched file names
    except URLError as e:
        print('Network error:', e)
def DownLoad_POD():  # download the precise orbit (POD) files
    ua = UserAgent(family='chrome')
    UA = ua.random()  # pick a random User-Agent string
    headers = {
        'User-Agent': UA,
        'Cookie': ''}
    hrefs = get_info_href()
    path()  # create the POD folder and make it the working directory
    num = 0
    for href in hrefs:  # loop over the matched orbit file names
        try:
            url = f'https://s1qc.asf.alaska.edu/aux_poeorb/{href}'
            req = Request(url, headers=headers)
            sleep(2)
            opener = build_opener()
            result = opener.open(req)
            try:
                # save the response body as a text file named after the orbit file
                print('Downloading data')
                with open(href, 'w', encoding='utf-8') as f:
                    f.write(result.read().decode())
                print(f'{href} downloaded')
                num = num + 1
            except BaseException as err:
                print('File write error:', err)
        except URLError as e:
            print('Network error:', e)
    print('Downloaded', num, 'POD files;', f'download folder: {os.getcwd()}')
def path():  # create the POD folder and switch into it
    path = os.getcwd()
    isexist = os.path.exists('POD')
    if not isexist:  # no POD folder under the current directory yet
        os.mkdir('POD')
        os.chdir(os.path.join(path, 'POD'))
        print(f'Created a POD folder under {path}; current directory set to {os.getcwd()}')
    else:
        os.chdir(os.path.join(path, 'POD'))
        print(f'A POD folder already exists under {path}; current directory set to {os.getcwd()}')
def get_datetime():  # read the dates from the csv file and convert them to date objects
    with open(r'C:\Users\123\Desktop\datatime.csv', encoding='utf-8') as a:
        a_csv = pandas.read_csv(a, header=None)
        nrows = a_csv.shape[0]
        ncols = a_csv.columns.size
        values = []
        print('Loading the date list')
        for irow in range(nrows):
            for icol in range(ncols):
                values.append(a_csv.iloc[irow, icol])
        print(f'Loaded {nrows} dates')
        time = []
        for t in values:
            time.append(parse(str(t)).date())
        return time
if __name__ == '__main__':
    import http.client
    # force urllib to speak HTTP/1.0 for these requests
    http.client.HTTPConnection._http_vsn = 10
    http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
    start_time = datetime.datetime.now()
    DownLoad_POD()
    end_time = datetime.datetime.now()
    spend_time = (end_time - start_time).seconds
    print('Download took', spend_time, 's')
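For reference, a quick worked example of how get_datetime() turns a raw csv cell into the dates used in the regular expression (the value 20200103 is a hypothetical sample):

from dateutil.parser import parse
import datetime

cell = 20200103                      # a cell as pandas reads it from the header-less csv
d = parse(str(cell)).date()          # -> datetime.date(2020, 1, 3)
day = datetime.timedelta(days=1)
print((d - day).strftime("%Y%m%d"))  # 20200102, the start date in the POEORB file name
print((d + day).strftime("%Y%m%d"))  # 20200104, the end date in the POEORB file name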