# Log in to a mailbox over POP3 and fetch its messages.
import re
import poplib
import time
from email.parser import Parser
from email.header import decode_header
from email.utils import parseaddr
from datetime import datetime
from selenium import webdriver
# Mailbox login name and password/authorization code used by
# connect_email_by_pop3; both are empty placeholders in this file.
user_email_address = ''
user_password = ''
# POP3-over-SSL endpoint the script connects to.
pop_server_host = 'pop3.mxhichina.com'
pop_server_port = 995
# A Chrome session is started as soon as this module is loaded and is kept as
# a module-level global; parse_email_server hands it to parser_content.
driver = webdriver.Chrome()
driver.maximize_window()
def connect_email_by_pop3(number, key_words):
    """Log in to the configured POP3 mailbox and process its newest messages.

    Connects to ``pop_server_host``:``pop_server_port`` over SSL, authenticates
    with ``user_email_address`` / ``user_password`` and then hands the session
    to ``parse_email_server``.  A status line is printed after every step.

    Args:
        number: How many of the most recent messages to inspect.
        key_words: Keywords forwarded unchanged to ``parse_email_server``.

    Raises:
        SystemExit: With exit code 1 when connecting or authenticating fails.
    """
    # Only transport failures (OSError, which covers socket timeouts and TLS
    # errors) and "-ERR" replies from the server (poplib.error_proto) mean
    # "this step failed".  Anything else - Ctrl-C, a programming error - must
    # propagate instead of being reported as a login problem.
    pop3_errors = (poplib.error_proto, OSError)

    try:
        email_server = poplib.POP3_SSL(
            host=pop_server_host, port=pop_server_port, timeout=10)
    except pop3_errors:
        print("连接pop服务器-------异常,退出")
        raise SystemExit(1) from None
    print("连接pop服务器-------正常,开始验证用户邮箱")

    try:
        email_server.user(user_email_address)
    except pop3_errors:
        print("用户邮箱验证-------异常,退出")
        raise SystemExit(1) from None
    print("用户邮箱验证-------正常,开始验证邮箱授权码")

    try:
        email_server.pass_(user_password)
    except pop3_errors:
        print("邮箱授权码验证-------异常,退出")
        raise SystemExit(1) from None
    print("邮箱授权码验证-------正常,开始接受邮箱以及附件")

    parse_email_server(email_server, number, key_words)
def parse_email_server(email_server, number, key_words):
    """Fetch the newest messages from an authenticated POP3 session.

    The messages are retrieved newest first, parsed, and passed one by one to
    ``parser_content`` together with the module-level ``driver``.  The session
    is always closed with ``quit()`` before returning, even when processing a
    message raises.

    Args:
        email_server: Authenticated ``poplib`` connection.
        number: Maximum number of most recent messages to process.
        key_words: Subject keywords forwarded to ``parser_content``.
    """
    try:
        _, mails, _ = email_server.list()
        newest = len(mails)
        # Clamp at 0 so that asking for more messages than the mailbox holds
        # never requests message 0 or a negative message number.
        stop = max(newest - number, 0)
        for i in range(newest, stop, -1):
            _, lines, _ = email_server.retr(i)
            # Raw mail is not guaranteed to be valid UTF-8; replace undecodable
            # bytes instead of aborting the whole run on one odd message.
            msg_content = b'\r\n'.join(lines).decode('utf-8', errors='replace')
            msg = Parser().parsestr(msg_content)
            parser_content(msg, 0, key_words, driver)
    finally:
        email_server.quit()
def parser_content(msg, indent, key_words, driver, head=None):
    """Open the resume link of a matching e-mail in the browser.

    At the top level (``indent == 0``) the subject is checked against
    ``key_words`` through ``parser_email_header``; messages whose subject does
    not match are skipped.  Multipart messages are walked recursively until a
    ``text/plain`` or ``text/html`` part that contains the marker "2023年毕业"
    yields a link via ``parse_content``; that link is then loaded in
    ``driver``.

    Args:
        msg: Parsed e-mail message, or one of its sub-parts.
        indent: Nesting depth; 0 for the top-level message.
        key_words: Subject keywords, only consulted when ``indent == 0``.
        driver: Selenium WebDriver used to open the link.
        head: Decoded subject of the top-level message.  It is handed down to
            nested parts so it can be printed next to the link; top-level
            callers can omit it.

    Returns:
        1 if a link was opened, otherwise None.
    """
    if indent == 0:
        is_chendu, head = parser_email_header(msg, key_words)
        if not is_chendu:
            return None

    if msg.is_multipart():
        # Look at every part (the HTML body is frequently not the first one)
        # and stop at the first part that produced a link, so one message
        # opens at most one tab.
        for part in msg.get_payload():
            if parser_content(part, indent + 1, key_words, driver, head):
                return 1
        return None

    if msg.get_content_type() not in ('text/plain', 'text/html'):
        return None

    content = msg.get_payload(decode=True)
    if content is None:
        return None

    # The charset may be missing, quoted, or unknown to Python; fall back to
    # UTF-8 so that `content` is always text before the substring test below.
    charset = str(guess_charset(msg) or 'utf-8').strip().strip('"\'')
    try:
        content = content.decode(charset, errors='replace')
    except LookupError:
        content = content.decode('utf-8', errors='replace')

    if "2023年毕业" not in content:
        return None

    try:
        ms = parse_content(content)
    except IndexError:
        # Marker present but no link in this part (for example the plain-text
        # alternative of an HTML mail).
        return None
    if not ms:
        return None

    print(head, ms)
    if driver.current_url != 'data:,':
        # The browser already shows a page: open a new tab and switch to it.
        # The handle list is read *after* opening the tab; a list captured
        # beforehand would not contain the new tab.
        driver.execute_script("window.open('','_blank');")
        driver.switch_to.window(driver.window_handles[-1])
    driver.get(ms)
    return 1
def open_chrome(url):
    """Placeholder that accepts a URL and intentionally does nothing."""
    return None
def parse_content(content):
    """Extract the "view full resume" link from an HTML mail body.

    Args:
        content: Decoded HTML text of the e-mail.

    Returns:
        The first ``https://hr.shixiseng.com/#/email/view/...`` URL that is
        the ``href`` of the "查看完整简历" button.

    Raises:
        IndexError: If the body contains no such button.
    """
    # `[^"]+` keeps the capture inside the href attribute.  A greedy `.+`
    # would run on to the last button when several share one line, which is
    # common for single-line HTML mails.
    rpattern = (
        r'<a href="(https://hr\.shixiseng\.com/#/email/view/[^"]+)" '
        r'style="display:block;width:120px;text-align:center;padding: 10px 0;'
        r'vertical-align:middle; color: #ffffff;background-color: #00a9e0;'
        r'text-decoration: none;">查看完整简历</a></td></tr>'
    )
    match = re.search(rpattern, content)
    if match is None:
        raise IndexError("no resume link found in mail content")
    return match.group(1)
def parser_email_header(msg, keywords):
    """Decode the Subject header and test it against a list of keywords.

    Args:
        msg: Parsed e-mail message.
        keywords: Iterable of strings to look for in the subject.

    Returns:
        A ``(matched, subject)`` tuple.  ``matched`` is 1 when at least one
        keyword occurs in the decoded subject and 0 otherwise; ``subject`` is
        the decoded subject text ('' when the message has no Subject header).
    """
    subject = msg['Subject']
    if subject is None:
        return 0, ''

    # A header may consist of several chunks with different charsets; decode
    # every chunk rather than only the first so no part of the subject is lost.
    pieces = []
    for chunk, charset in decode_header(subject):
        if isinstance(chunk, bytes):
            try:
                chunk = chunk.decode(charset or 'ascii', errors='replace')
            except LookupError:
                # Charset name unknown to Python.
                chunk = chunk.decode('utf-8', errors='replace')
        pieces.append(chunk)
    value = ''.join(pieces)

    # Use the `keywords` argument itself, not a same-sounding module global.
    matched = any(word in value for word in keywords)
    return (1 if matched else 0), value
def decode_str(s):
    """Decode a possibly RFC 2047 encoded header value to text.

    Args:
        s: Raw header value.

    Returns:
        The decoded header as a ``str``.  All chunks of the header are decoded
        and concatenated, not just the first one.
    """
    pieces = []
    for chunk, charset in decode_header(s):
        if isinstance(chunk, bytes):
            try:
                chunk = chunk.decode(charset or 'ascii', errors='replace')
            except LookupError:
                # Charset name unknown to Python.
                chunk = chunk.decode('utf-8', errors='replace')
        pieces.append(chunk)
    return ''.join(pieces)
def guess_charset(msg):
    """Return the character set of a message part, if one is declared.

    Args:
        msg: Parsed e-mail message or sub-part.

    Returns:
        ``msg.get_charset()`` when that is set; otherwise the lower-cased
        ``charset`` parameter of the Content-Type header, or None when the
        header or the parameter is absent.
    """
    charset = msg.get_charset()
    if charset is None:
        # get_content_charset() parses the header properly: surrounding
        # quotes are removed and a parameter without "=" no longer raises.
        charset = msg.get_content_charset()
    return charset
def parse_mail_time(mail_datetime):
    """Parse the value of an e-mail ``Date`` header.

    Args:
        mail_datetime: Date string such as
            ``"Mon, 20 Nov 2023 10:30:00 +0800"``.

    Returns:
        A naive ``datetime`` holding the wall-clock time exactly as written in
        the header; the UTC offset, if any, is discarded.

    Raises:
        Exception: If the string cannot be parsed as a mail date.
    """
    # Imported here because the module only imports parseaddr from email.utils.
    from email.utils import parsedate

    try:
        # parsedate() is locale independent and accepts any zone suffix
        # ("-0500", "GMT", ...), not only offsets starting with " +0".
        fields = parsedate(mail_datetime)
        if fields is None:
            raise ValueError("unrecognised date: %r" % (mail_datetime,))
        return datetime(*fields[:6])
    except (TypeError, ValueError) as err:
        raise Exception("邮件时间格式解析错误") from err
if __name__ == "__main__":
    # Script entry point: scan the most recent messages for subjects that
    # mention one of the keywords below.
    key_words = ["成都", "四川", "西南"]
    number = 50
    connect_email_by_pop3(number, key_words)