介绍用SMTP发送邮件的博文很多,但完整讲解如何读取邮件的较少。本文主要给出用Python3通过IMAP读取邮件的代码,并使用BeautifulSoup解析邮件内容。
Python版本信息,如下:
Python 3.8.2 (tags/v3.8.2:7b3ab59, Feb 25 2020, 23:03:10) [MSC v.1916 64 bit (AMD64)] on win32
代码
import email
import email.header
import imaplib

from bs4 import BeautifulSoup
def _decode_bytes(data, charset):
    """Decode *data* with *charset*, using UTF-8 when it is missing or unknown.

    Undecodable bytes are replaced rather than raising, so one malformed
    message cannot abort the whole run.
    """
    try:
        return data.decode(charset or 'utf-8', errors='replace')
    except LookupError:
        # The header named a codec Python does not know (e.g. 'unknown-8bit').
        return data.decode('utf-8', errors='replace')


def _decode_subject(msg):
    """Decode the Subject header of *msg*.

    Returns:
        A ``(text, charset)`` tuple. ``text`` is the full decoded subject
        ('' when the header is absent). ``charset`` is the first charset
        declared inside the header, or None when the header declares none.
    """
    raw = msg.get('subject')
    if raw is None:
        return '', None
    pieces = []
    first_charset = None
    for value, charset in email.header.decode_header(raw):
        if charset and first_charset is None:
            first_charset = charset
        # decode_header yields str for a plain unencoded header and bytes
        # for encoded (or mixed) headers; only bytes need decoding.
        if isinstance(value, bytes):
            value = _decode_bytes(value, charset)
        pieces.append(value)
    return ''.join(pieces), first_charset


def _collect_text_parts(msg, fallback_charset):
    """Return the decoded text bodies of *msg*, preferring HTML parts.

    Walks the whole MIME tree so nested multipart containers are covered.
    Attachments and non-text parts are skipped. If the message has at least
    one HTML part only the HTML parts are returned; otherwise the remaining
    text parts are returned. *fallback_charset* is used for parts that do
    not declare a charset of their own.
    """
    html_parts = []
    other_parts = []
    for part in msg.walk():
        if part.is_multipart():
            continue
        if part.get_content_maintype() != 'text':
            continue
        if part.get_content_disposition() == 'attachment':
            continue
        # decode=True undoes base64 / quoted-printable transfer encoding.
        payload = part.get_payload(decode=True)
        if not payload:
            continue
        charset = part.get_content_charset() or fallback_charset
        text = _decode_bytes(payload, charset)
        if 'html' in part.get_content_type():
            html_parts.append(text)
        else:
            other_parts.append(text)
    return html_parts or other_parts


def _print_body_text(markup):
    """Parse *markup* with BeautifulSoup and print the text of each <body>."""
    soup = BeautifulSoup(markup, "lxml")
    for body in soup.select('body'):
        print(body.get_text(strip=True))


def main():
    """Log in to an IMAP server and print every INBOX message, newest first.

    For each message the decoded subject is printed, followed by the text
    extracted from its HTML (or, failing that, plain-text) body parts.
    Any error is printed rather than propagated; the connection is always
    released, whether or not the run succeeded.
    """
    conn = None
    selected = False
    try:
        # Fill in the IMAP host and port of the mail server to read from;
        # ask the administrator if unknown.
        conn = imaplib.IMAP4_SSL(host='imap.xxx.com', port=993)
        # User name and password of the mailbox.
        conn.login('xxx@qq.com', 'your password')
        # select() defaults to the INBOX mailbox.
        status, _ = conn.select()
        if status != 'OK':
            raise Exception('读取邮件发生错误')
        selected = True
        # Original author's note: the Recent/Seen criteria did not work,
        # so every message is read for now.
        status, data = conn.search(None, 'ALL')
        if status != 'OK':
            raise Exception('读取邮件发生错误')
        emailids = (data[0] or b'').split()
        # Newest first; reversed() also visits the very first message,
        # which an index range stopping before 0 would skip.
        for mail_id in reversed(emailids):
            status, edata = conn.fetch(mail_id, '(RFC822)')
            # A successful fetch yields a (header, body-bytes) tuple first;
            # anything else (e.g. the message vanished) is skipped.
            if status != 'OK' or not edata or not isinstance(edata[0], tuple):
                continue
            msg = email.message_from_bytes(edata[0][1])
            subject, default_code = _decode_subject(msg)
            print('subject', subject)
            # The subject's charset serves as a fallback for body parts
            # that do not declare one.
            for markup in _collect_text_parts(msg, default_code):
                _print_body_text(markup)
    except Exception as ex:
        print(ex)
    finally:
        # conn stays None when the connection itself could not be opened.
        if conn is not None:
            if selected:
                # CLOSE is only valid once a mailbox has been selected.
                try:
                    conn.close()
                except (imaplib.IMAP4.error, OSError) as ex:
                    print(ex)
            try:
                conn.logout()
            except (imaplib.IMAP4.error, OSError) as ex:
                print(ex)
# Run the mail reader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()