定时爬取邮箱附件(以163邮箱为例)

最新推荐文章于 2023-09-01 17:04:34 发布
木子雨巷
最新推荐文章于 2023-09-01 17:04:34 发布
阅读量583
点赞数
分类专栏： web 文章标签： python numpy 机器学习
本文链接：https://blog.csdn.net/aonong0521/article/details/129344718
版权
web 专栏收录该内容
3 篇文章
订阅专栏
该脚本主要执行一系列数据处理任务，包括从ZIP文件中解压EML邮件，提取附件，使用结构化信息解码二进制文件，将数据转换为NetCDF格式，并根据不同的数据类型（如风速、浪高、温度等）进行存储。同时，脚本还包含了网络爬虫功能，模拟登录邮箱并下载邮件。整个流程自动化处理，适用于气象数据的批量分析和整理。
摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >
import os
import time
import datetime
import struct
import numpy as np
import xarray as xr
import time
from interval import Interval
import json
import requests

Load_Path = "C:\\Users\\iscas\\Downloads\\信件打包.zip"
Eml_Path = "./NewMail"  # 第一个参数为eml所在文件夹
Annex_Path = "./NewRKW"  # 第二个参数为eml附件输出的路径
AWT_Path = "./NewAWT1"  # 第二个参数为eml附件输出的路径
NC_Path = r"Z:\weather\212-data\BVS2"  # 第二个参数为eml附件输出的路径


def mkdir(path):
    """Create directory *path*, including missing parents, unless it exists.

    Surrounding whitespace and trailing backslashes are removed from *path*
    before it is checked.

    Returns:
        True when the directory was created, False when the path was
        already present.
    """
    import os

    target = path.strip().rstrip("\\")
    if os.path.exists(target):
        return False

    os.makedirs(target)
    return True

class AwtHead:
    """Header fields and decoded payload of one AWT grid file.

    The class-level values are defaults; ``decode_File`` overwrites them on
    the instance it is given.
    """

    # Grid step along longitude / latitude, as stored in the file header.
    # decode_File divides these by 100 when building the coordinate axes.
    jdLon = 0
    jdLat = 0
    # Valid time of the grid; decode_File stores a datetime64[ns] array here.
    time = 0
    # Bounding box read from the file header.
    maxlat = 0
    maxlon = 0
    minlat = 0
    minlon = 0
    # Grid dimensions derived from the bounding box and the step.
    rowCount = 0
    columnCount = 0
    # Product name (e.g. "wind", "wave"); set by the caller before decoding.
    type = "none"
    # Flat tuple/list of decoded float values.
    data = []

    def __init__(self):
        # A class-level list would be shared by every instance, so an
        # in-place append on one header would leak into all the others.
        self.data = []


def decode_File(input_file, head, outputdir):
    """Decode one binary AWT grid file and write it out as NetCDF.

    The file starts with a 14-byte header of seven native-order shorts
    (lon step, lat step, one unused value, max lat, min lat, min lon,
    max lon) followed by native-order floats. The valid time is parsed from
    the last two ``_``-separated pieces of *input_file* (``YYYYMMDD`` and the
    first two characters of the final piece as the hour).

    Args:
        input_file: Path of the binary file to decode.
        head: Header object whose ``type`` names the product; its other
            fields are filled in by this function.
        outputdir: Base output directory; a sub-directory named after
            ``head.type`` is created inside it.

    Returns:
        None. A file shorter than the header is reported and skipped.

    Raises:
        struct.error: If the payload size does not match the grid size.
        ValueError: If the time cannot be parsed from the file name.
    """
    # Read everything up front and close the handle immediately; the
    # original left the file open for the lifetime of the process.
    with open(input_file, 'rb') as binary_file:
        data = binary_file.read()

    if len(data) < 14:
        print("文件格式不符合")
        return

    shortlist = struct.unpack('h'*7, data[0:14])
    head.jdLon = shortlist[0]
    head.jdLat = shortlist[1]
    # shortlist[2] is not used.
    filesplit = input_file.split('_')
    filetime = datetime.datetime.strptime(
        filesplit[-2] + filesplit[-1][:2]+':00:00', "%Y%m%d%H:%M:%S")
    head.time = np.array([filetime]).astype('datetime64[ns]')
    head.maxlat = shortlist[3]
    head.minlat = shortlist[4]
    head.minlon = shortlist[5]
    head.maxlon = shortlist[6]
    head.rowCount = int((head.maxlat-head.minlat)*100/head.jdLat+1)
    head.columnCount = int((head.maxlon-head.minlon)*100/head.jdLon+1)

    lat, lon = get_latlon(head.jdLon, head.jdLat,
                          head.maxlon, head.maxlat, head.minlon, head.minlat)

    outputdir = os.path.join(outputdir, head.type)
    mkdir(outputdir)

    # Number of interleaved float components stored per grid cell.
    single_component = ("vesselicing", "pressure", "height500mb", "sst",
                        "visibility", "wxtype", "cloud", "temperature", "rhum")
    if head.type in single_component:
        components = 1
    elif head.type in ("wind", "current"):
        components = 2
    elif head.type in ("wave", "seas", "swell"):
        components = 3
    else:
        # Unknown product: nothing to decode.
        return

    cell_count = int(head.rowCount * head.columnCount)
    head.data = struct.unpack('f' * (components * cell_count), data[14:])
    arrays = get_values(head.data, head.rowCount,
                        head.columnCount, var_nums=components)
    # Map the decoded arrays onto the da1/da2/da3 keyword arguments.
    array_kwargs = dict(zip(('da1', 'da2', 'da3'), arrays))
    write_data_to_nc(input_file, head.time, head.rowCount, head.columnCount,
                     head.type, lat, lon, outputdir, var_nums=components,
                     **array_kwargs)

    return


def get_latlon(jdLon, jdLat, maxlon, maxlat, minlon, minlat):
    """Build the 2-D latitude/longitude grids for a BVS field.

    Latitudes run from *maxlat* down towards *minlat*, longitudes from
    *minlon* up towards *maxlon*; the steps are ``jdLat/100`` and
    ``jdLon/100``. A 0.01 margin is added to each stop value so the end
    point is included.

    Returns:
        ``(lat_2d, lon_2d)`` float32 arrays of shape (n_lat, n_lon).
    """
    lat_step = jdLat/100.0
    lon_step = jdLon/100.0
    latitudes = np.arange(maxlat, minlat-0.01, -lat_step).astype(np.float32)
    longitudes = np.arange(minlon, maxlon+0.01, lon_step).astype(np.float32)
    mesh = np.meshgrid(longitudes, latitudes)
    # meshgrid yields (lon, lat); callers expect latitude first.
    return mesh[1], mesh[0]


def get_values(data, rowCount, columnCount, var_nums=3):
    """Split interleaved values into one 2-D float32 array per variable.

    Variable ``i`` is taken from every ``var_nums``-th element of *data*
    starting at offset ``i`` and reshaped to (rowCount, columnCount).

    Returns:
        A list with ``var_nums`` arrays.
    """
    grid_shape = (rowCount, columnCount)
    return [
        np.array(data[offset::var_nums], dtype=np.float32).reshape(grid_shape)
        for offset in range(var_nums)
    ]


def _nc_variable_names(headtype, var_nums):
    """Return the NetCDF variable names used for *headtype*, in write order."""
    if var_nums == 1:
        return [headtype+'_2m' if headtype == "temperature" else headtype]
    if var_nums == 2:
        if headtype == "wind":
            return ['u10_'+headtype, 'v10_'+headtype]
        if headtype == "current":
            return ['u_'+headtype, 'v_'+headtype]
        raise ValueError(
            "unsupported two-component data type: %r" % (headtype,))
    if var_nums == 3:
        return [headtype+'_height', headtype+'_direction', headtype+'_period']
    # Any other count writes the coordinates only.
    return []


def write_data_to_nc(in_file, headtime, headrowCount, headcolumnCount, headtype, lat, lon, outputdir, var_nums=1, da1=None, da2=None, da3=None):
    """Write one decoded field to ``<outputdir>/<basename(in_file)>.nc``.

    Args:
        in_file: Source file path; only its base name is used for the output.
        headtime: Values of the ``time`` coordinate.
        headrowCount: Number of grid rows (length of the ``xlat`` index).
        headcolumnCount: Number of grid columns (length of ``xlong``).
        headtype: Product name, used to derive the variable names.
        lat: 2-D latitude grid on (xlat, xlong).
        lon: 2-D longitude grid on (xlat, xlong).
        outputdir: Existing directory that receives the ``.nc`` file.
        var_nums: Number of data variables to write (1, 2 or 3).
        da1: First data array on (xlat, xlong).
        da2: Second data array, used when ``var_nums >= 2``.
        da3: Third data array, used when ``var_nums == 3``.

    Raises:
        ValueError: If ``var_nums == 2`` and *headtype* is neither "wind"
            nor "current". The original hit an ``UnboundLocalError`` here
            after the dataset had already been partly built.
    """
    # Resolve the names first so an unsupported type fails before any work.
    variable_names = _nc_variable_names(headtype, var_nums)

    ncfile = xr.Dataset()
    ncfile['time'] = (['time'], headtime)
    ncfile.time.attrs = {'long_name': '时间'}
    ncfile.time.encoding['units'] = "seconds since 1970-01-01 00:00:00"
    # xlat/xlong are plain 0..n-1 index axes; the real coordinates are the
    # 2-D lat/lon variables below.
    ncfile['xlat'] = (['xlat'], np.arange(headrowCount).astype(np.float32))
    ncfile.xlat.attrs = {'long_name': 'Latitude',
                         'axis': 'Y', 'units': "degrees_north"}
    ncfile['xlong'] = (['xlong'], np.arange(
        headcolumnCount).astype(np.float32))
    ncfile.xlong.attrs = {'long_name': 'Longitude',
                          'axis': 'X', 'units': "degrees_east"}
    ncfile['lat'] = (['xlat', 'xlong'], lat)
    ncfile.lat.attrs = {'long_name': 'Latitude', 'units': 'degrees_north'}
    ncfile['lon'] = (['xlat', 'xlong'], lon)
    ncfile.lon.attrs = {'long_name': 'Longitude', 'units': 'degrees_east'}

    # zip stops at the shorter sequence, so only the first var_nums arrays
    # are written.
    for variable_name, values in zip(variable_names, (da1, da2, da3)):
        ncfile[variable_name] = (['xlat', 'xlong'], values)

    ncfilename = os.path.basename(in_file)+'.nc'
    out_file = os.path.join(outputdir, ncfilename)
    ncfile.to_netcdf(out_file)


def decode_Dir(inputdir, outputdir):
    """Decode every file found in the per-product sub-folders of *inputdir*.

    For each known product name that exists as an entry under *inputdir*,
    each listed file is passed to ``decode_File`` together with a fresh
    ``AwtHead`` whose ``type`` is the product name.
    """
    supported_types = (
        "wind", "swell", "seas", "wave", "current", "vesselicing", "pressure",
        "height500mb", "sst", "visibility", "wxtype", "cloud", "temperature",
        "rhum",
    )

    for product in supported_types:
        product_dir = os.path.join(inputdir, product)
        if not os.path.exists(product_dir):
            continue
        for entry in os.listdir(product_dir):
            header = AwtHead()
            header.type = product
            decode_File(os.path.join(product_dir, entry), header, outputdir)

import email
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
import os
import time
import zipfile
import shutil
import subprocess
from ctypes import *
def init_browser():
    """Log in to 163 mail with Chrome and download the packed mail archive.

    Drives the web UI step by step (login form inside an iframe, then a
    series of clicks in the mailbox list module) and then blocks until the
    file at ``Load_Path`` exists.

    The browser is always quit before returning, including on failure; the
    original left one Chrome/chromedriver process running per call. The
    unused ``WebDriverWait`` object and the discarded
    ``current_window_handle`` reads were dropped.
    """
    browser = webdriver.Chrome(r'C:\Program Files\Google\Chrome\Application\chromedriver.exe')
    try:
        browser.implicitly_wait(10)
        browser.get('https://email.163.com/')
        time.sleep(1)
        # The login form lives inside an iframe.
        iframe = browser.find_element(By.XPATH, '//div[@id="urs163Area"]/iframe')
        browser.switch_to.frame(iframe)
        browser.find_element(By.NAME, "email").send_keys('')  # mailbox account
        time.sleep(1)
        browser.find_element(By.NAME, "password").send_keys('')  # mailbox password
        time.sleep(1)
        browser.find_element(By.ID, 'dologin').click()
        time.sleep(3)
        #browser.find_element(By.ID, '_mail_component_147_147').click()
        try:
            browser.find_element(By.ID, '_mail_component_83_83').click()
        except Exception:
            # Fall back to the alternative component id. Exception (not a
            # bare except) so Ctrl-C still interrupts the script.
            browser.find_element(By.ID, '_mail_component_82_82').click()
        time.sleep(1)
        browser.find_element(By.XPATH, '//div[@id="_dvModuleContainer_mbox.ListModule_0"]/header/div/div[1]').click()
        time.sleep(1)

        browser.find_element(By.XPATH, '//div[@id="_dvModuleContainer_mbox.ListModule_0"]/header/div/div[4]/div[3]').click()
        time.sleep(1)
        # NOTE(review): this locator depends on exact pixel offsets in the
        # style attribute and will break if the page layout shifts.
        browser.find_element(By.XPATH, '//div[@style="visibility: visible; left: 528px; top: 124px;"]/div[1]').click()
        time.sleep(10)
        # Poll until the downloaded archive shows up on disk.
        while 1:
            time.sleep(1)
            if os.path.exists(Load_Path):
                break
    finally:
        browser.quit()
    return
def unzip(spath,dpath):
    """Extract archive *spath* into *dpath*, repairing GBK names, then delete it.

    Entries without the ZIP UTF-8 flag are decoded as cp437 by ``zipfile``;
    archives written on a Chinese-locale system actually store GBK bytes, so
    those names are re-decoded before extraction. UTF-8-flagged names and
    names that are not valid GBK are kept as they are.

    Args:
        spath: Path of the zip archive; removed after a successful extract.
        dpath: Destination directory.
    """
    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(spath, 'r') as zip_file:
        for member in zip_file.infolist():
            # Bit 11 (0x800) of the general-purpose flags marks UTF-8 names.
            if not member.flag_bits & 0x800:
                try:
                    member.filename = member.filename.encode('cp437').decode('gbk')
                except UnicodeError:
                    # Not GBK after all; keep the name zipfile produced.
                    pass
            # extract() builds the target from member.filename, so files land
            # directly under the repaired name (nested paths included).
            zip_file.extract(member, dpath)

    os.remove(spath)
    return
'''
rkwdll = CDLL('E:\\BVS\\4-orther\\BVS7\\bin\\decode.dll')
def CdllRun(filePath,dirPath):
    filePath = bytes(filePath, 'utf-8')# 可见光源
    dirPath = bytes(dirPath, 'utf-8')# 可见光源
    rkwdll.decode.restype = c_bool
    rkwdll.decode.argtyps = (c_char_p,c_char_p)
    print(3)
    if rkwdll.decode(filePath,dirPath) == False:
        print("解析文件失败")
    time.sleep(5)
    return
    '''
def CdllRun(filePath,dirPath):
    """Run ``decode.exe <dirPath> <filePath>`` once and wait for it to exit.

    The original launched the decoder twice (``os.system`` followed by
    ``subprocess.Popen``) and built the command by string concatenation, so
    a path containing a space was split into several arguments. Passing an
    argument list lets ``subprocess`` do the quoting.

    Args:
        filePath: Input file passed as the decoder's second argument.
        dirPath: Output directory passed as the decoder's first argument.

    Raises:
        FileNotFoundError: If ``decode.exe`` cannot be found.
    """
    # The exit status is ignored, as it was before.
    subprocess.run(["decode.exe", dirPath, filePath])
def Get_Annex_Message(FilePath, Annex_Path):
    """Save every named attachment of the e-mail at *FilePath* into *Annex_Path*.

    Any file can be passed in; the ``email`` parser decides whether it
    contains MIME parts. Failures are printed and otherwise ignored so one
    bad message does not stop a batch.

    Args:
        FilePath: Path of the (presumed) ``.eml`` file.
        Annex_Path: Existing directory that receives the attachments.
    """
    try:
        # Parse and close the source right away instead of leaking the handle.
        with open(FilePath, 'rb') as eml_file:
            msg = email.message_from_binary_file(eml_file)
        for part in msg.walk():            # visit every MIME part of the message
            if part.get_content_maintype() == 'multipart':
                continue
            Annex_name = part.get_filename()
            if not Annex_name:             # only parts that carry a file name
                continue
            # The name comes from the e-mail, i.e. untrusted input: drop any
            # directory components so it cannot escape Annex_Path.
            safe_name = os.path.basename(Annex_name.replace("\\", "/"))
            if safe_name in ('', '.', '..'):
                continue
            with open(os.path.join(Annex_Path, safe_name), 'wb') as annex_file:
                annex_file.write(part.get_payload(decode=True))
    except Exception as e:
        print(e)
        return

# Process every file under a folder, including sub-folders
def List_Filepath(Eml_Path, Annex_Path):
    """Extract attachments from every file below *Eml_Path*.

    ``os.walk`` already descends into sub-directories, so no manual
    recursion is needed. The original also recursed on the bare directory
    name, which is resolved against the current working directory rather
    than the walked parent and could process an unrelated folder.

    Args:
        Eml_Path: Root folder holding the e-mail files.
        Annex_Path: Directory handed to ``Get_Annex_Message`` for output.
    """
    for parent, _dirnames, filenames in os.walk(Eml_Path):
        for filename in filenames:
            FilePath = os.path.join(parent, filename)
            Get_Annex_Message(FilePath, Annex_Path)

# Create the destination folder
def Create_Dir(Annex_Path):
    """Create *Annex_Path* as a single directory level if it is missing.

    When the path already exists, a notice is printed and nothing is
    created.
    """
    if not os.path.exists(Annex_Path):
        os.mkdir(Annex_Path)
        return
    print("dir exists, Annex file will create in %s" % Annex_Path)


Create_Dir(NC_Path)  # make sure the NetCDF output folder exists at startup



# Daily time windows (HH:MM:SS, local time) in which a download run starts.
# The main loop breaks out of its wait when the current time is in any of them.
time_4 = Interval("06:05:30", "06:06:00")
# time_5 = Interval("05:35:30", "05:36:00")
time_10 = Interval("12:05:30", "12:06:00")
#time_11 = Interval("11:35:30", "11:36:00")
time_16 = Interval("18:05:30", "18:06:00")
#time_17 = Interval("17:35:30", "17:36:00")
time_22 = Interval("00:05:30", "00:06:00")
# NOTE(review): the next line rebinds time_22, so the 00:05:30 window above is
# never used. It looks like a leftover debug window - confirm which is intended.
time_22 = Interval("12:48:30", "12:58:00")


# Maps every "_HH" hour suffix to the suffix of its 6-hourly cycle
# (00-05 -> _00, 06-11 -> _06, 12-17 -> _12, 18-23 -> _18).
_CYCLE_OF_HOUR = {'_%02d' % h: '_%02d' % (h - h % 6) for h in range(24)}


def _snap_to_cycle(name):
    """Return *name* with a trailing ``_HH`` suffix snapped to its cycle hour.

    Only the last three characters are rewritten. The original used
    ``str.replace``, which also rewrote earlier matches: ``_20230301_20``
    became ``_18230301_18``, which is what the ``_1823`` repair pass was
    compensating for.
    """
    cycle_suffix = _CYCLE_OF_HOUR.get(name[-3:])
    if cycle_suffix is None:
        return name
    return name[:-3] + cycle_suffix


def _merge_into(root, src_name, dst_name):
    """Rename ``root/src_name`` to ``root/dst_name``; drop it if that exists."""
    src = os.path.join(root, src_name)
    dst = os.path.join(root, dst_name)
    if not os.path.exists(dst):
        shutil.move(src, dst)
    else:
        shutil.rmtree(src)


def _tidy_nas_folders(root):
    """Repair legacy ``_1823`` names, then snap folder names to cycle hours."""
    # Pass 1: folders damaged by the old replace() bug (kept for existing data).
    for folder in os.listdir(root):
        if '_1823' in folder:
            repaired = folder.replace('_1823', '_2023')
            print('replace:')
            print(repaired)
            _merge_into(root, folder, repaired)

    # Pass 2: list again, because pass 1 may have renamed or removed entries.
    for folder in os.listdir(root):
        snapped = _snap_to_cycle(folder)
        if snapped == folder:
            continue
        print(root + folder)
        print(root + snapped)
        _merge_into(root, folder, snapped)


def _reset_dir(path):
    """Empty *path* by recreating it; a missing directory is not an error."""
    if os.path.exists(path):
        shutil.rmtree(path)
    os.mkdir(path)


def _post_status(schedule, status):
    """Report a processing status for *schedule* to the logging service."""
    msg = {
        "Level": 0,
        "Host": 0,
        "Process": 15,
        "Schedule": schedule,
        "Status": status,
        "Content": "BVS2 data"
    }

    url = "http://159.226.5.166:7894/api/log"
    msg_body = json.dumps(msg).encode(encoding='utf-8')
    result = requests.post(url, msg_body)
    print("Post data is ", result.status_code)


while 1:
    # Wait, polling every 20 seconds, until the local time is inside one of
    # the scheduled windows.
    while 1:
        time.sleep(20)
        now_localtime = time.strftime("%H:%M:%S", time.localtime())
        # The current time expressed as a zero-length interval.
        now_time = Interval(now_localtime, now_localtime)
        if now_time in time_4 or now_time in time_10 or now_time in time_16 or now_time in time_22:
            break

    # Gang Li
    root_path_lg = 'Z:\\weather\\212-data\\BVS2\\'
    _tidy_nas_folders(root_path_lg)

    print(now_localtime+"start download")
    # Start every run from empty working folders. The original crashed on the
    # first run when the Annex/Eml folders did not exist yet.
    _reset_dir(Annex_Path)
    _reset_dir(AWT_Path)
    _reset_dir(Eml_Path)

    init_browser()
    print("init_browser finish")
    unzip(Load_Path, Eml_Path)
    print("unzip finish")
    List_Filepath(Eml_Path, Annex_Path)
    print("List_Filepath finish")

    for allDir in os.listdir(Annex_Path):
        sdir = os.path.join(Annex_Path, allDir)
        # Attachment name without its 4-character extension.
        stem = allDir[0:len(allDir)-4]
        ddir = os.path.join(AWT_Path, stem)
        # By Gang Li.
        print('*'*10)
        print(ddir)
        print(ddir[-3:])
        ddir = _snap_to_cycle(ddir)

        if os.path.exists(ddir):
            # This cycle was already decoded during the current run.
            continue
        print('&'*100)
        print(ddir)

        # Date and cycle hour taken from the last two "_"-separated pieces.
        Schedule = ddir.split('_')[-2] + ddir.split('_')[-1]
        _post_status(Schedule, 2)

        Create_Dir(ddir)
        CdllRun(sdir, ddir)
        print(sdir+"decode finish")

        _post_status(Schedule, 1)

        # Copy under the original (unsnapped) name; the tidy pass after the
        # loop renames it to its cycle hour.
        outdir = os.path.join(NC_Path, stem)
        if os.path.exists(outdir):
            continue
        shutil.copytree(ddir, outdir)
        rkwfile = os.path.join(NC_Path, "rkw", allDir)
        if os.path.exists(rkwfile):
            continue
        shutil.copy(sdir, rkwfile)
        print("copy to nas finish")

    # Gang Li
    _tidy_nas_folders(root_path_lg)

# NOTE(review): unreachable, because the loop above never exits.
print("end")