使用 Python 内置模块登录邮箱(SMTP/POP3)。
使用 pdfminer 解析 PDF 合同附件,简单地提取其中的内容。
压缩包已经上传!
poplib_emain.py内容(完整版):
# coding:utf-8
# date:2018/4/19
# PDFParser : pdf解析类
# PDFDocument : pdf 文本存储
# PDFResourceManager : pdf 存储资源类(图片文本)
# PDFPageInterpreter : pdf 处理页面内容将PDFDevice翻译成想要的内容
import functools
import io
import os
import poplib
import re
import traceback
from email.header import decode_header
from email.parser import Parser
from email.utils import parseaddr

import mysql.connector
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFDocument, PDFParser

from emails.settings import *
from emails.Logger import *
# Module-wide logger shared by deco() and the Email class; created with the
# log name 'log5.txt' and the logger name 'email'.
# NOTE(review): Logger presumably comes from the `emails.Logger` star import
# and the meaning of loglevel=1 is defined there -- confirm.
logger = Logger(logname='log5.txt', loglevel=1, logger='email').getlog()
def deco(func):
    """Decorate a method so that exceptions are logged instead of propagated.

    On success the wrapper returns whatever ``func`` returns.  If ``func``
    raises an ``Exception`` the full traceback is written to the module
    ``logger`` and the wrapper returns ``None``, so callers must be prepared
    for a ``None`` result.  Exceptions that do not derive from ``Exception``
    (e.g. ``SystemExit``) still propagate.

    :param func: callable taking ``self`` as its first positional argument.
    :return: the wrapping function; it keeps ``func``'s name and docstring.
    """
    @functools.wraps(func)
    def wrapper(self, *args, **kwargs):
        try:
            return func(self, *args, **kwargs)
        except Exception:
            logger.error('\n%s\n方法 %s发生错误,原因是: %s\n' % ('-' * 100, func.__name__, traceback.format_exc()))
            # Explicit: a failed call yields None.
            return None
    return wrapper
class Email:
    """Fetch mail over POP3, parse PDF contract attachments and store rows in MySQL."""

    # Kept as class-level defaults for backward compatibility.  print_info()
    # rebinds fresh per-instance lists for every message, so extracted text no
    # longer leaks from one message (or one instance) into the next.
    name_lis = []
    money_lis = []
    mingxi_lis = []

    # Address fragments that are served by the Sina POP3 host.
    _SINA_DOMAINS = ('@sina.com', '@2008.sina', '@51uc.com')

    _CREATE_TABLE_SQL = "create table if not exists email_sina(username char(255) not null,froms char(255) not null,tos char(255) not null,subject char(255) not null primary key,dates char(255),fu_url char(255),contents char(255),names varchar(1000),money varchar(1000),detail varchar(1000))"

    # Values are bound by the driver (%s placeholders) instead of being
    # concatenated into the statement.
    _REPLACE_SQL = (
        "replace into email_sina(`username`,`froms`,`tos`,`subject`,`dates`,`fu_url`,`contents`,`names`,`money`,`detail`) "
        "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
    )

    def __init__(self):
        """Open the MySQL connection from HOST, USER, PASSWORD, PORT and DBS and create a cursor."""
        self.db = mysql.connector.Connect(host=HOST, user=USER, password=PASSWORD, port=PORT, db=DBS)
        self.cursor = self.db.cursor()

    @deco
    def pdf_text(self, path):
        """Collect the text of every horizontal text box in a PDF file.

        :param path: location of the PDF file.
        :return: list of text fragments in page order, or ``None`` when
            parsing failed (``deco`` logs the error instead of raising).
        """
        content_ = []
        # Keep the file open for the whole parse; ``with`` closes it even if
        # pdfminer raises.
        with open(path, 'rb') as fp:
            parse = PDFParser(fp)
            document = PDFDocument(parse)
            # Link the parser and the document to each other.
            parse.set_document(document)
            document.set_parser(parse)
            # No password is supplied.
            document.initialize()
            if not document.is_extractable:
                raise PDFTextExtractionNotAllowed
            rsrcmgr = PDFResourceManager()
            laparams = LAParams()
            device = PDFPageAggregator(rsrcmgr, laparams=laparams)
            interpreter = PDFPageInterpreter(rsrcmgr, device)
            for page in document.get_pages():
                interpreter.process_page(page)
                layout = device.get_result()
                for x in layout:
                    if isinstance(x, LTTextBoxHorizontal):
                        content_.append(x.get_text())
        return content_

    def turn_on_off(self, lines, status, start, end):
        """Update an on/off flag from one line of text.

        :param lines: the current line (a single string).
        :param status: the flag value before this line.
        :param start: prefix that switches the flag on.
        :param end: exact line that switches the flag off when it is on.
        :return: the flag value after this line.
        """
        if lines.startswith(start):
            return True
        if status and lines == end:
            return False
        return status

    def savefile(self, filename, data, path):
        """Write ``data`` to the file ``path + filename``.

        :param filename: file name appended to ``path``.
        :param data: bytes to write.
        :param path: directory prefix; it is concatenated as-is, so it must
            already end with a path separator.
        :return: ``True`` when the file was written, ``False`` when it could
            not be opened.
        """
        filepath = path + filename
        print('Save as: ' + filepath)
        try:
            f = open(filepath, 'wb')
        except OSError:
            print(filepath + ' open failed')
            return False
        with f:
            f.write(data)
        return True

    def decode_str(self, s):
        """Decode a possibly RFC 2047 encoded header value to text.

        Every fragment returned by ``decode_header`` is decoded and joined,
        so values mixing several charsets are not truncated to their first
        fragment, and the result is always ``str``.

        :param s: raw header value.
        :return: decoded text.
        """
        fragments = []
        for value, charset in decode_header(s):
            if isinstance(value, bytes):
                try:
                    value = value.decode(charset or 'utf-8', errors='replace')
                except LookupError:
                    # Unknown charset label: fall back to UTF-8 rather than fail.
                    value = value.decode('utf-8', errors='replace')
            fragments.append(value)
        return ''.join(fragments)

    def guess_charset(self, msg):
        """Return the charset name of a message part.

        :param msg: message (part) object.
        :return: charset name as ``str``, or ``None`` when none is declared.
        """
        charset = msg.get_charset()
        if charset is not None:
            return str(charset)
        # Fall back to the Content-Type ``charset`` parameter; the library
        # call strips quotes and ignores any following parameters.
        return msg.get_content_charset()

    def _collect_section(self, lines, start, end):
        """Return the stripped lines after a line starting with ``start`` up to and including ``end``."""
        collected = []
        active = False
        for line in lines:
            if active:
                collected.append(line.strip())
            active = self.turn_on_off(line, active, start, end)
        return collected

    def _part_text(self, part):
        """Return the decoded text of a part, '英文' if decoding fails, or None if no charset is declared."""
        charset = self.guess_charset(part)
        if not charset:
            return None
        try:
            return part.get_payload(decode=True).decode(charset)
        except (LookupError, UnicodeError, AttributeError):
            # Unknown charset, undecodable bytes, or no decodable payload.
            return '英文'

    def _parse_pdf_attachment(self, part, filename, mypath, headers):
        """Save a PDF attachment and copy its contract sections into ``headers``."""
        # The name is chosen by the sender: drop directory components so it
        # cannot escape ``mypath``.
        filename = os.path.basename(filename.replace('\\', '/'))
        if not filename.lower().endswith('.pdf'):
            return
        print('Accessory: ' + filename)
        headers['url'] = mypath + filename
        if not self.savefile(filename, part.get_payload(decode=True), mypath):
            return
        fragments = self.pdf_text(mypath + filename)
        if not fragments:
            # pdf_text() already logged the failure, or the PDF has no text.
            return
        # newline=None gives the same universal-newline line splitting that
        # reading the text back from a file would, without a temporary file.
        lines = list(io.StringIO(''.join(fragments), newline=None))
        self.name_lis.extend(self._collect_section(lines, '甲方(借款人):', '鉴于:\n'))
        self.money_lis.extend(self._collect_section(lines, '第一条 借款金额、期限及利息、借款类型、借款提现', '第二条 还款\n'))
        self.mingxi_lis.extend(self._collect_section(
            lines, '2.4 还款计划明细\n',
            '甲方还款日如发生变化的,甲方同意服务方以录音电话或电子邮件并辅助以短信的方\n'))
        headers['names'] = ''.join(self.name_lis)
        headers['money'] = ''.join(self.money_lis)
        headers['mingxi'] = ''.join(self.mingxi_lis)

    @deco
    def print_info(self, msg, mypath):
        """Parse one message into a flat dict.

        :param msg: parsed message object.
        :param mypath: directory prefix where PDF attachments are saved.
        :return: dict with the keys ``from``, ``to``, ``Subject``, ``date``,
            ``url``, ``contents``, ``names``, ``money`` and ``mingxi`` (every
            key is always present, ``''`` when unknown), or ``None`` when
            parsing failed (``deco`` logs the error).
        """
        # Fresh accumulators per message; several PDFs in one message still
        # accumulate into the same lists.
        self.name_lis = []
        self.money_lis = []
        self.mingxi_lis = []
        headers = {
            'from': '', 'to': '', 'Subject': '', 'date': '',
            'url': '', 'contents': '', 'names': '', 'money': '', 'mingxi': '',
        }
        # From, To, Subject and Date live on the root message object.
        for header, key in (('From', 'from'), ('To', 'to'), ('Subject', 'Subject'), ('Date', 'date')):
            value = msg.get(header, '')
            if not value:
                continue
            if header == 'Date':
                headers[key] = value
            elif header == 'Subject':
                headers[key] = self.decode_str(value)
            else:
                # Address headers: decode the display name, keep the address.
                hdr, addr = parseaddr(value)
                headers[key] = u'%s <%s>' % (self.decode_str(hdr), addr)
        for part in msg.walk():
            filename = part.get_filename()
            if filename:
                self._parse_pdf_attachment(part, self.decode_str(filename), mypath, headers)
                continue
            content = self._part_text(part)
            if content is None:
                # Parts without a charset must not wipe text found earlier.
                continue
            # Keep only the runs of CJK characters, minus the font name.
            headers['contents'] = ','.join(re.findall(u'[\u4e00-\u9fa5]+', content)).replace('宋体', '')
        return headers

    def _pop3_host(self, email):
        """Return the POP3 host for an address, or ``None`` for an unsupported provider."""
        if '@163.' in email:
            return 'pop3.163.com'
        if '@126.' in email:
            return 'pop3.126.com'
        if '@qq.com' in email:
            return 'pop.qq.com'
        # Each fragment is tested separately; ``'a' or 'b' in email`` is
        # always true.
        if any(domain in email for domain in self._SINA_DOMAINS):
            return 'pop.sina.com'
        if '@188.com' in email:
            return 'pop3.188.com'
        return None

    def _ensure_table(self):
        """Create the ``email_sina`` table if it does not exist yet."""
        try:
            self.cursor.execute(self._CREATE_TABLE_SQL)
            self.db.commit()
            print('创建email_sina表')
        except mysql.connector.Error as e:
            # ``if not exists`` never fails for an existing table, so this is
            # a real error; report it.
            print('创建email_sina表失败: %s' % e)

    def _store_message(self, email, dic):
        """Insert or replace the row for one parsed message; on failure print the error and roll back."""
        params = (
            email, dic['from'], dic['to'], dic['Subject'], dic['date'],
            dic['url'], dic['contents'], dic['names'], dic['money'], dic['mingxi'],
        )
        try:
            self.cursor.execute(self._REPLACE_SQL, params)
            self.db.commit()
            print(dic['Subject'], '插入')
        except mysql.connector.Error as e:
            print(e)
            self.db.rollback()

    def _fetch_all(self, server, email, password, mypath):
        """Log in, then retrieve, parse and store every message in the mailbox."""
        # Print the protocol dialogue for debugging.
        server.set_debuglevel(1)
        print(server.getwelcome().decode('utf-8'))
        server.user(email)
        server.pass_(password)
        self._ensure_table()
        resp, mails, octets = server.list()
        # POP3 message numbers start at 1.
        for i in range(1, len(mails) + 1):
            resp, lines, octets = server.retr(i)
            try:
                msg_content = b'\r\n'.join(lines).decode('utf-8')
            except UnicodeDecodeError:
                # Still skipped, but no longer silently.
                logger.info('第%s封邮件不是UTF-8编码,已跳过' % i)
                continue
            msg = Parser().parsestr(msg_content)
            dic = self.print_info(msg, mypath)
            if dic is None:
                # print_info() failed and was logged; go on with the next one.
                continue
            self._store_message(email, dic)

    @deco
    def save_file(self):
        """Prompt for an account, download its mailbox and store every message.

        Reads the address and the authorization code from standard input,
        picks the POP3 host from the address, then parses and stores each
        message.  The process exits when the provider is not supported.
        The POP3 session and the database connection are closed even when an
        error occurs.

        :return: ``None``.
        """
        email = input('请输入账号:')
        password = input('请输入授权码:')
        pop3_server = self._pop3_host(email)
        if pop3_server is None:
            logger.info('不支持此邮箱%s,或程序出错' % email)
            print('暂不支持此邮箱')
            raise SystemExit
        mypath = PATH
        print(pop3_server)
        logger.info('传入的账号是%s' % email)
        try:
            server = poplib.POP3_SSL(pop3_server)
            try:
                self._fetch_all(server, email, password, mypath)
            finally:
                try:
                    server.quit()
                except (poplib.error_proto, OSError):
                    # The session is already broken; just drop the socket.
                    server.close()
            logger.info('账号解析完成')
        finally:
            self.db.close()
if __name__ == '__main__':
    # Script entry point: build the client (this opens the database
    # connection) and run the interactive download.
    mail_client = Email()
    mail_client.save_file()