使用python3处理解码email

最近遇到了个需求,需要使用python来解码email文件(.eml),途中发现了一些问题:虽然网上有一些使用python自带库email来处理邮件的示例代码,但是由于版本问题,部分代码已经跑不通了,因此重新写了个工具类来解码邮件和从邮件里提取信息。

遇到的一些问题:部分邮件的subject,from,to会经过编码,需要一定处理才能提取
还有就是邮件正文本身也可能由多个使用不同编码方式(例如base64、quoted-printable)的数据块拼接而成(一般不会这样),因此需要逐块读取,并按照每个数据块里指定的编码方式和字符集进行解码

Received: from 192.168.208.56 ( 192.168.208.56 [192.168.208.56] ) by
 ajax-webmail-wmsvr37 (Coremail) ; Thu, 12 Apr 2007 12:07:48 +0800 (CST)
Date: Thu, 12 Apr 2007 12:07:48 +0800 (CST)
From: user1 <xxxxxxxx@163.com>
To: zhaowei <zhaoweikid@163.com>
Message-ID: <31571419.200911176350868321.JavaMail.root@bj163app37.163.com>
Subject: =?gbk?B?u+nJtA==?=
MIME-Version: 1.0
Content-Type: multipart/Alternative; 
    boundary="----=_Part_21696_28113972.1176350868319"

------=_Part_21696_28113972.1176350868319
Content-Type: text/plain; charset=gbk
Content-Transfer-Encoding: base64

ztLS0b+qyrzS1M6qysfSu7j20MfG2ru70ru0zqOs1K3AtMrH0ru49tTCtffSu7TOztLDx8/W1NrT
prjDysew67XjssXE3MjI1ebC6bezICAg
------=_Part_21696_28113972.1176350868319
Content-Type: text/html; charset=gbk
Content-Transfer-Encoding: quoted-printable

<DIV>=CE=D2=D2=D1=BF=AA=CA=BC=D2=D4=CE=AA=CA=C7=D2=BB=B8=F6=D0=C7=C6=DA=BB=
=BB=D2=BB=B4=CE=A3=AC=D4=AD=C0=B4=CA=C7=D2=BB=B8=F6=D4=C2=B5=F7=D2=BB=B4=CE=
</DIV>
<DIV>=CE=D2=C3=C7=CF=D6=D4=DA=D3=A6=B8=C3=CA=C7=B0=EB=B5=E3=B2=C5=C4=DC=C8=
=C8</DIV>
<DIV>=D5=E6=C2=E9=B7=B3</DIV>
------=_Part_21696_28113972.1176350868319--

最终工具类:

import email.header
import json
import os
import re
from email.parser import Parser


class MailReader(object):
    """Read an .eml file and expose its headers, text bodies and hyperlinks.

    Built on the standard-library ``email`` package.

    Instance attributes:
        raw_email: text read from the .eml file, or None if nothing is loaded.
        email_content: parsed ``email.message.Message``, or None.
        process_log: accumulated error messages.
        debug: when true, ``toString`` also prints ``process_log``.
        header_dict: cache filled by ``toDict``.
        mail_text: ``""`` until ``getContent`` runs, then a list of str.
        all_links: cache filled by ``getLinks``.
    """

    # Anchor tag with a quoted href. The URL is captured up to its own closing
    # quote (group 1 for double quotes, group 2 for single quotes) so that
    # attributes following href are not swallowed; group 3 is the link text.
    _LINK_PATTERN = re.compile(
        r"""<a\b[^>]*?\shref\s*=\s*(?:"([^"]+)"|'([^']+)')[^>]*>(.*?)</a>""",
        re.IGNORECASE | re.DOTALL,
    )

    def __init__(self, eml_path="", debug=False):
        """Initialise empty state and, if ``eml_path`` is given, parse it."""
        self.raw_email = None
        self.email_content = None
        self.process_log = ""
        self.debug = debug
        self.header_dict = {}
        self.mail_text = ""
        self.all_links = []
        if eml_path:
            self.__MailReader(eml_path)

    @staticmethod
    def decodeHeader(header_str):
        """Decode an RFC 2047 encoded header value.

        Returns an ``email.header.Header``; call ``str()`` on it to get text.
        """
        chunks = email.header.decode_header(header_str)
        return email.header.make_header(chunks)

    @staticmethod
    def _decodeHeaderText(header_str):
        """Return the decoded header as str, or the raw value if undecodable."""
        try:
            return str(MailReader.decodeHeader(header_str))
        except (LookupError, UnicodeError):
            # Unknown charset or bytes that do not fit the declared charset:
            # keep the raw value rather than failing the whole header dump.
            return str(header_str)

    def toString(self):
        """Print the parsed message (and the log in debug mode); return it."""
        print("email内容:", self.email_content)
        if self.debug:
            print("process_log:", self.process_log)
        return self.email_content

    def toDict(self):
        """Return the headers as ``{name: [values, ...]}``.

        From, To and Subject values are decoded to readable text when present.
        The result is cached in ``header_dict``. Returns an empty dict when no
        message is loaded.
        """
        if self.header_dict != {}:
            return self.header_dict
        if self.email_content is None:
            return self.header_dict

        for each_key in set(self.email_content.keys()):
            self.header_dict[each_key] = self.email_content.get_all(each_key)

        for each_key in ("From", "To", "Subject"):
            values = self.header_dict.get(each_key)
            if values is None:
                # Header absent from this message; nothing to decode.
                continue
            self.header_dict[each_key] = [
                self._decodeHeaderText(value) for value in values
            ]
        return self.header_dict

    def toJson(self):
        """Return the header dictionary serialised as a JSON string."""
        return json.dumps(self.toDict())

    def __MailReader(self, eml_path):
        """Load and parse the file at ``eml_path``; return ``self``.

        All state derived from a previously parsed message is cleared first,
        so a failed or repeated parse never serves stale headers, text or
        links. Failures are appended to ``process_log`` instead of raised.
        """
        self.raw_email = None
        self.email_content = None
        self.header_dict = {}
        self.mail_text = ""
        self.all_links = []
        try:
            if os.path.exists(eml_path):
                # NOTE(review): the file is read with the platform default
                # text encoding, as before; confirm that suits the input.
                with open(eml_path) as fp:
                    self.raw_email = fp.read()
                self.email_content = Parser().parsestr(self.raw_email)
            else:
                self.process_log += "读取邮件失败:" + "file not found: " + str(eml_path)
        except Exception as e:
            self.process_log += "读取邮件失败:" + str(e)
            self.toString()
        return self

    def parseMail(self, eml_path):
        """Parse the message at ``eml_path`` with the email library."""
        self.header_dict = {}
        return self.__MailReader(eml_path)

    def getContent(self):
        """Decode every text/plain and text/html part and return them as a list.

        Each part is decoded with its declared charset. UTF-8 is used when the
        part declares none or declares an unknown one, and undecodable bytes
        are replaced rather than raising. Returns an empty list when no
        message is loaded. The result is also stored in ``mail_text``.
        """
        all_content = []
        if self.email_content is None:
            self.mail_text = all_content
            return all_content

        for par in self.email_content.walk():
            # Multipart containers carry no payload of their own.
            if par.is_multipart():
                continue
            if par.get_content_type() not in ('text/plain', 'text/html'):
                continue
            payload = par.get_payload(decode=True)
            if payload is None:
                continue
            charset = par.get_content_charset(failobj=None) or "utf-8"
            try:
                text = payload.decode(charset)
            except LookupError:
                text = payload.decode("utf-8", errors="replace")
            except UnicodeDecodeError:
                text = payload.decode(charset, errors="replace")
            all_content.append(text)
        self.mail_text = all_content
        return all_content

    def getLinks(self):
        """Return ``(href, link_text)`` tuples for anchors in the text parts.

        Example result:
         [('https://rashangharper.com/wp-admin/user/welllz/display/login.html', 'wellsfargo.com')]

        The result is cached in ``all_links``.
        """
        if self.all_links:
            return self.all_links
        if self.mail_text == "":
            self.getContent()
        all_links = [
            (match.group(1) or match.group(2), match.group(3))
            for part in self.mail_text
            for match in self._LINK_PATTERN.finditer(part)
        ]
        self.all_links = all_links
        return all_links


if __name__ == '__main__':
    # Demo: parse a sample message and print the (href, text) pairs found in it.
    reader = MailReader("fakeherf.eml")
    print(reader.getLinks())

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值