最近遇到了个需求,需要使用python来解码email文件(.eml),途中发现了一些问题,虽然网上有一些使用python自带库email来处理的,但是由于版本问题,部分代码已经跑不通了,因此重新写了个工具类来解码邮件和从邮件里提取信息。
遇到的一些问题:部分邮件的Subject、From、To会按照RFC 2047进行编码(例如下面示例中的 `=?gbk?B?u+nJtA==?=`),需要先解码才能提取出可读的内容
还有就是邮件正文本身也可能是由多个采用不同编码方式(注意是编码而不是加密,例如base64、quoted-printable)的数据块拼接而成的(一般不会这样),因此需要逐块读取,并按照每个数据块自己指定的Content-Transfer-Encoding和charset进行解码
Received: from 192.168.208.56 ( 192.168.208.56 [192.168.208.56] ) by
ajax-webmail-wmsvr37 (Coremail) ; Thu, 12 Apr 2007 12:07:48 +0800 (CST)
Date: Thu, 12 Apr 2007 12:07:48 +0800 (CST)
From: user1 <xxxxxxxx@163.com>
To: zhaowei <zhaoweikid@163.com>
Message-ID: <31571419.200911176350868321.JavaMail.root@bj163app37.163.com>
Subject: =?gbk?B?u+nJtA==?=
MIME-Version: 1.0
Content-Type: multipart/Alternative;
boundary="----=_Part_21696_28113972.1176350868319"
------=_Part_21696_28113972.1176350868319
Content-Type: text/plain; charset=gbk
Content-Transfer-Encoding: base64
ztLS0b+qyrzS1M6qysfSu7j20MfG2ru70ru0zqOs1K3AtMrH0ru49tTCtffSu7TOztLDx8/W1NrT
prjDysew67XjssXE3MjI1ebC6bezICAg
------=_Part_21696_28113972.1176350868319
Content-Type: text/html; charset=gbk
Content-Transfer-Encoding: quoted-printable
<DIV>=CE=D2=D2=D1=BF=AA=CA=BC=D2=D4=CE=AA=CA=C7=D2=BB=B8=F6=D0=C7=C6=DA=BB=
=BB=D2=BB=B4=CE=A3=AC=D4=AD=C0=B4=CA=C7=D2=BB=B8=F6=D4=C2=B5=F7=D2=BB=B4=CE=
</DIV>
<DIV>=CE=D2=C3=C7=CF=D6=D4=DA=D3=A6=B8=C3=CA=C7=B0=EB=B5=E3=B2=C5=C4=DC=C8=
=C8</DIV>
<DIV>=D5=E6=C2=E9=B7=B3</DIV>
------=_Part_21696_28113972.1176350868319--
最终工具类:
import email.header
import json
import os
import re
from email.parser import Parser
class MailReader(object):
    """Read an ``.eml`` file and expose its headers, text bodies and links.

    Uses the ``email`` package bundled with Python 3.7.

    Loading is best-effort: a failure while reading or parsing is appended
    to ``process_log`` instead of being raised, and the accessors then
    return empty results.

    Attributes:
        raw_email: Text of the file as read from disk, or ``None``.
        email_content: Parsed ``email.message.Message``, or ``None`` when
            nothing has been loaded successfully.
        process_log: Accumulated error messages.
        debug: When true, ``toString`` also prints ``process_log``.
        header_dict: Cache filled by ``toDict``.
        mail_text: ``""`` until ``getContent`` runs, then a list of decoded
            text bodies.
        all_links: Cache filled by ``getLinks``.
    """

    # One <a ... href="URL" ...>TEXT</a> element. The href may be double- or
    # single-quoted (groups 1 and 2), TEXT is group 3. Non-greedy matching
    # and [^>] keep a match inside a single tag; DOTALL lets TEXT span lines.
    _LINK_PATTERN = re.compile(
        r'<a\s+(?:[^>]*?\s)?href\s*=\s*'
        r'(?:"([^"]+)"|\'([^\']+)\')'
        r'[^>]*>(.*?)</a>',
        re.IGNORECASE | re.DOTALL,
    )

    # Headers that commonly carry RFC 2047 encoded words and get decoded.
    _ENCODED_HEADERS = ("From", "To", "Subject")

    def __init__(self, eml_path="", debug=False):
        """Initialise the attributes and optionally load ``eml_path``.

        Args:
            eml_path: Path of the ``.eml`` file to load; empty to load later
                through ``parseMail``.
            debug: Whether ``toString`` should also print the process log.
        """
        self.raw_email = None
        self.email_content = None
        self.process_log = ""
        self.debug = debug
        self.header_dict = {}
        self.mail_text = ""
        self.all_links = []
        if eml_path:
            self.__MailReader(eml_path)

    @staticmethod
    def decodeHeader(header_str):
        """Decode an RFC 2047 encoded header value.

        Args:
            header_str: Raw header value, e.g. ``=?gbk?B?u+nJtA==?=``.

        Returns:
            An ``email.header.Header``; call ``str()`` on it to get the text.
        """
        decoded_chunks = email.header.decode_header(header_str)
        return email.header.make_header(decoded_chunks)

    def _decodeHeaderText(self, value):
        """Return ``value`` decoded to text, or unchanged text on failure."""
        try:
            return str(self.decodeHeader(value))
        except (LookupError, UnicodeError) as exc:
            # Unknown charset or undecodable bytes: keep the raw value rather
            # than losing the whole header dictionary.
            self.process_log += "解码邮件头失败:" + str(exc)
            return str(value)

    @staticmethod
    def _decodePayload(payload, charset):
        """Decode the bytes of one body part to text.

        Falls back to UTF-8 when the part declares no charset or one Python
        does not know; undecodable bytes are replaced instead of raising.
        """
        try:
            return payload.decode(charset or "utf-8", errors="replace")
        except LookupError:
            return payload.decode("utf-8", errors="replace")

    def toString(self):
        """Print the whole message (and the log when ``debug`` is set).

        Returns:
            The parsed message object, or ``None`` if nothing is loaded.
        """
        print("email内容:", self.email_content)
        if self.debug:
            print("process_log:", self.process_log)
        return self.email_content

    def toDict(self):
        """Return the headers as a dict mapping name to list of values.

        ``From``, ``To`` and ``Subject`` are decoded to readable text and
        stored under exactly those names; they are looked up
        case-insensitively and skipped when the message lacks them. The
        result is cached in ``header_dict``. Returns an empty dict when no
        message is loaded.
        """
        if self.header_dict:
            return self.header_dict
        message = self.email_content
        if message is None:
            return self.header_dict

        # Build into a local dict so a failure cannot leave a half-filled
        # cache behind.
        headers = {name: message.get_all(name) for name in set(message.keys())}
        for name in self._ENCODED_HEADERS:
            values = message.get_all(name)
            if values is None:
                continue
            headers[name] = [self._decodeHeaderText(value) for value in values]
        self.header_dict = headers
        return self.header_dict

    def toJson(self):
        """Return the header dictionary serialised as a JSON string."""
        return json.dumps(self.toDict())

    def __MailReader(self, eml_path):
        """Load and parse the mail at ``eml_path``.

        All state derived from a previously loaded mail is discarded first,
        so a failed load never leaves stale data behind. Errors (including a
        missing file) are appended to ``process_log`` rather than raised.

        Returns:
            ``self``, to allow chaining.
        """
        self.raw_email = None
        self.email_content = None
        self.header_dict = {}
        self.mail_text = ""
        self.all_links = []
        try:
            # NOTE: the file is read in text mode with the platform default
            # encoding, unchanged from the original behaviour.
            with open(eml_path) as fp:
                self.raw_email = fp.read()
            self.email_content = Parser().parsestr(self.raw_email)
        except Exception as e:
            self.process_log += "读取邮件失败:" + str(e)
        self.toString()
        return self

    def parseMail(self, eml_path):
        """Load another mail from ``eml_path``, replacing the current one.

        Returns:
            ``self``, to allow chaining.
        """
        self.header_dict = {}
        return self.__MailReader(eml_path)

    def getContent(self):
        """Decode every ``text/plain`` and ``text/html`` part of the mail.

        Multipart containers are skipped because they hold no data of their
        own. Each leaf part is transfer-decoded and then decoded with its
        declared charset.

        Returns:
            The list of decoded bodies, also stored in ``mail_text``; empty
            when no message is loaded.
        """
        bodies = []
        if self.email_content is not None:
            for part in self.email_content.walk():
                if part.is_multipart():
                    continue
                if part.get_content_type() not in ('text/plain', 'text/html'):
                    continue
                payload = part.get_payload(decode=True)
                if payload is None:
                    continue
                charset = part.get_content_charset(failobj=None)
                bodies.append(self._decodePayload(payload, charset))
        self.mail_text = bodies
        return bodies

    def getLinks(self):
        """Extract hyperlinks and their displayed text from the mail bodies.

        Returns:
            A list of ``(href, displayed_text)`` tuples, for example
            ``[('https://rashangharper.com/wp-admin/user/welllz/display/login.html', 'wellsfargo.com')]``.
            The result is cached in ``all_links``.
        """
        if self.all_links:
            return self.all_links
        if not self.mail_text:
            self.getContent()

        links = []
        for body in self.mail_text:
            for match in self._LINK_PATTERN.finditer(body):
                double_quoted, single_quoted, shown_text = match.groups()
                links.append((double_quoted or single_quoted, shown_text))
        self.all_links = links
        return links
if __name__ == '__main__':
    # Demo entry point: load the sample mail and print every
    # (href, displayed text) pair found in its text bodies.
    reader = MailReader("fakeherf.eml")
    print(reader.getLinks())