import chardet,poplib,email.parser,email.policy,base64
from email.parser import Parser
from email.header import decode_header
from email.utils import parseaddr
from bs4 import BeautifulSoup
"""
Author: Thomas zhu
Date:2023-11-18
Function: Parse information from a mailbox via POP3
"""
# Module-level slots holding the most recently parsed From / To / Subject.
value = ['', '', '']


def parsse_heard(msg):
    """Parse the From, To and Subject headers of ``msg`` into ``value``.

    The module-level list ``value`` is overwritten in place: index 0 holds
    the sender, index 1 the recipient and index 2 the subject.  Address
    headers are stored as ``name<address>`` with the display name passed
    through ``decode_str``; the subject is passed through ``decode_str``
    directly.  A missing header leaves an empty string in its slot.

    Args:
        msg: Object exposing ``get(name, default)`` for header lookup.
    """
    global value
    for slot, header in enumerate(('From', 'To', 'Subject')):
        raw = msg.get(header, '')
        value[slot] = raw
        if not raw:
            continue
        if header == 'Subject':
            value[slot] = decode_str(raw)
            continue
        display_name, address = parseaddr(raw)
        value[slot] = u'%s<%s>' % (decode_str(display_name), address)
    print("parsse hearder:" + value[0] + " " + value[2])
def decode_str(s):
    """Decode an RFC 2047 encoded header value into text.

    ``decode_header`` may split one header into several chunks (for example
    an encoded word followed by plain text, or encoded words in different
    charsets).  Every chunk is decoded and the pieces are concatenated, so
    the whole header is returned rather than only its first chunk.

    Args:
        s: Raw header value as accepted by ``email.header.decode_header``.

    Returns:
        The decoded header as ``str``.  Bytes that cannot be decoded are
        replaced instead of raising.
    """
    pieces = []
    for chunk, charset in decode_header(s):
        if isinstance(chunk, bytes):
            try:
                # Chunks without a charset are the unencoded parts of a
                # mixed header; treat them as ASCII.
                chunk = chunk.decode(charset or 'ascii', errors='replace')
            except LookupError:
                # The mail declared a charset Python has no codec for.
                chunk = chunk.decode('utf-8', errors='replace')
        pieces.append(chunk)
    return ''.join(pieces)
def set_charset(msg):
    """Return the codec name declared for ``msg``, or ``None`` if absent.

    Despite its name this function only reads the message.  An explicitly
    attached charset (``msg.get_charset()``) takes precedence; otherwise the
    ``charset`` parameter of the Content-Type header is used.

    Args:
        msg: An ``email.message.Message`` instance.

    Returns:
        The lower-cased charset name as ``str``, suitable for
        ``bytes.decode``, or ``None`` when the message declares none.
    """
    charset = msg.get_charset()
    if charset is not None:
        # get_charset() yields a Charset object; callers need the codec name.
        return str(charset)
    # get_content_charset() parses the header properly: it strips quotes and
    # ignores parameters that follow ``charset=`` (e.g. ``; format=flowed``).
    return msg.get_content_charset()
# Inline ``style`` attribute values that identify the statement table cells
# in the HTML mail; BeautifulSoup matches them verbatim.
_SUBJECT_CELL_STYLE = "font-size: 13px;width: 540px; color: #222222;padding-left: 20px;word-break: break-all;"
_DATE1_CELL_STYLE = "font-size: 13px;width: 130px; color: #222222;"
_DATE2_CELL_STYLE = "font-size: 13px; width: 130px;color: #222222;padding-right: 20px"
_PRICE_CELL_STYLE = "font-size: 13px;width: 90px;color: #222222;padding-left: 10px;"


def _decode_summary_headers(msg):
    """Return the decoded From, To and Subject headers of ``msg`` as a list."""
    decoded = []
    for header in ('From', 'To', 'Subject'):
        # str() normalises Header objects, which the email package returns
        # for headers containing raw non-ASCII bytes.
        raw = str(msg.get(header, ''))
        if not raw:
            decoded.append('')
        elif header == 'Subject':
            decoded.append(decode_str(raw))
        else:
            display_name, address = parseaddr(raw)
            decoded.append(u'%s<%s>' % (decode_str(display_name), address))
    return decoded


def _print_statement_rows(markup):
    """Print one line per statement row found in ``markup`` (str or bytes)."""
    soup = BeautifulSoup(markup, 'html.parser')

    def cell_texts(style):
        return [cell.get_text() for cell in soup.find_all('td', style=style)]

    subjectList = cell_texts(_SUBJECT_CELL_STYLE)
    date1 = cell_texts(_DATE1_CELL_STYLE)
    date2 = cell_texts(_DATE2_CELL_STYLE)
    price = cell_texts(_PRICE_CELL_STYLE)
    # zip() stops at the shortest column, so a mail whose columns differ in
    # length no longer raises IndexError part-way through the output.
    for first_date, second_date, amount, subject in zip(date1, date2, price, subjectList):
        print(first_date + " " + second_date + " " + amount + " " + subject)


def parsse_body_msg(msg):
    """Print the headers of ``msg`` and the statement rows of its text parts.

    Multipart messages are walked recursively.  For ``text/plain`` and
    ``text/html`` leaf parts the payload is decoded and scanned for the
    statement table cells; any other leaf part is only reported as an
    attachment.

    Args:
        msg: An ``email.message.Message`` (a whole mail or one of its parts).
    """
    headers = _decode_summary_headers(msg)
    print("{} =====From: {} To: {} Subject: {} ".format(len(headers), *headers))

    if msg.is_multipart():
        # get_payload() on a multipart message is the list of sub-messages.
        for part in msg.get_payload():
            parsse_body_msg(part)
        return

    content_type = msg.get_content_type()
    if content_type not in ('text/plain', 'text/html'):
        # Nothing to scrape in a non-text part.
        print('Attachement:%s' % (content_type))
        return

    print('22222part')
    payload = msg.get_payload(decode=True)  # transfer-decoded bytes
    if payload is None:
        return

    charset = set_charset(msg)
    if charset:
        try:
            payload = payload.decode(charset, errors='replace')
        except (LookupError, TypeError):
            # Unusable charset label: hand the raw bytes to BeautifulSoup,
            # which performs its own encoding detection.
            pass
    _print_statement_rows(payload)
# Connect to the 126 mail server over POP3-SSL and scan the newest mails.
# NOTE(review): the account name and password below are hard-coded in the
# source file; they should be moved to configuration or the environment.
bankstr1 = 'creditcard@service.pingan'
bankstr2 = 'cebbank@cd'
bankstr3 = '95574@'
bankstr4 = 'creditcard@message'
bankstr5 = "95555@me"
b6 = "master@"

server = poplib.POP3_SSL('pop.126.com', 995)
try:
    server.user("liuhu@126.com")
    server.pass_('SDOPOPSOIVKNMBCC')
    emailNum, size = server.stat()
    # One LIST call provides both the response and the per-message entries,
    # e.g. [b'1 329199', b'2 54441', ...].
    response, maillist, r = server.list()
    numEmail = len(maillist)
    print("Messages:%s. size:%s" % (emailNum, size))  # message count and mailbox size
    print("numEmail=", numEmail)

    # Walk the newest (at most 500) messages.  POP3 message numbers start at
    # 1, so the lower bound is clamped for mailboxes with fewer mails.
    for i in range(emailNum, max(emailNum - 500, 0), -1):
        try:
            # retr() returns (response, list of raw lines, size in octets).
            response, message, octets = server.retr(i)
            # Parse the raw bytes directly so that mails which are not valid
            # UTF-8 are still processed instead of being dropped.
            msg = email.parser.BytesParser().parsebytes(b'\r\n'.join(message))
            mailbankstr = str(msg.get('From', ''))
            # Only the first sender pattern is active; the remaining
            # bankstr* constants are kept for optional filtering.
            if bankstr1 in mailbankstr:
                parsse_body_msg(msg)
        except Exception as e:
            # Best effort: report the failure and continue with the next mail.
            print("Skipping message %s: %r" % (i, e))
finally:
    # Always end the session so the server releases the mailbox lock.
    server.quit()