python pop3_python通过pop3方式登录邮箱(qq,新浪,网易)

该博客介绍了一个Python脚本,它使用内置模块poplib和email库通过POP3协议登录邮箱服务器并下载邮件,然后利用pdfminer库解析PDF附件,从中提取合同内容。脚本还包含了错误处理和日志记录功能,可以解析邮件头信息,将PDF附件和提取出的文本保存到文件,并把解析结果写入MySQL数据库。最后,脚本能够从PDF文本中提取关键信息,如姓名、金额和明细。
摘要由CSDN通过智能技术生成

使用Python内置模块poplib通过POP3方式登录邮箱(脚本只收取邮件,并未用到SMTP)

使用pdfminer解析pdf合同附件,简单的提取一下里面的内容

压缩包已经上传!

poplib_emain.py内容(完整版):

# coding:utf-8

# date:2018/4/19

# PDFParser : pdf解析类

# PDFDocument : pdf 文本存储

# PDFResourceManager : pdf 存储资源类(图片文本)

# PDFPageInterpreter : pdf 处理页面内容将PDFDevice翻译成想要的内容

from email.parser import Parser

from email.header import decode_header

from email.utils import parseaddr

import re

import poplib

import traceback

import mysql.connector

from pdfminer.pdfparser import PDFDocument, PDFParser

from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter, PDFTextExtractionNotAllowed

from pdfminer.converter import PDFPageAggregator

from pdfminer.layout import LTTextBoxHorizontal, LAParams

from emails.settings import *

from emails.Logger import *

# Module-level logger used by the `deco` decorator and by `Email.save_file`.
# `Logger` is a project helper (presumably exported by the `emails.Logger`
# wildcard import above -- confirm); it is configured here with the log file
# name 'log5.txt', loglevel=1 and the logger name 'email'.
logger = Logger(logname='log5.txt', loglevel=1, logger='email').getlog()

def deco(func):
    """Decorate a method so that exceptions are logged instead of propagated.

    The wrapped method is called unchanged.  If it raises any ``Exception``
    the full traceback is written to the module-level ``logger`` and the
    wrapper returns ``None``; callers must therefore be prepared to receive
    ``None`` in place of the normal result.  Exceptions that do not derive
    from ``Exception`` (e.g. ``SystemExit``) still propagate.

    :param func: the method to protect; it is invoked as ``func(self, ...)``.
    :return: the wrapping function, carrying ``func``'s name and docstring.
    """
    # Imported locally so this block does not need a new module-level import.
    from functools import wraps

    @wraps(func)  # keep __name__/__doc__ of the decorated method intact
    def wrapper(self, *args, **kwargs):
        try:
            return func(self, *args, **kwargs)
        except Exception:
            logger.error('\n%s\n方法 %s发生错误,原因是: %s\n' % ('-' * 100, func.__name__, traceback.format_exc()))
            # Made explicit: a failed call yields None rather than raising.
            return None

    return wrapper

class Email:
    """Download mail over POP3, parse PDF attachments and store results in MySQL."""

    # Class-level defaults, kept so that ``Email.name_lis`` etc. remain
    # accessible.  They are mutable and would be shared by every instance,
    # so ``__init__`` shadows them with per-instance lists.
    name_lis = []

    money_lis = []

    mingxi_lis = []

    def __init__(self):
        """Create per-instance result lists and open the MySQL connection.

        The connection parameters (``HOST``, ``USER``, ``PASSWORD``, ``PORT``,
        ``DBS``) are module-level names -- presumably supplied by the
        ``emails.settings`` wildcard import; confirm.
        """
        # Per-instance accumulators: lines extracted from PDF attachments.
        self.name_lis = []
        self.money_lis = []
        self.mingxi_lis = []

        self.db = mysql.connector.Connect(host=HOST, user=USER, password=PASSWORD, port=PORT, db=DBS)
        self.cursor = self.db.cursor()

@deco
def pdf_text(self, path):
    """Extract the text of every horizontal text box in a PDF file.

    Because the method is wrapped by ``deco``, any exception raised here
    (including ``PDFTextExtractionNotAllowed``) is logged and the caller
    receives ``None`` instead of a list.

    :param path: location of the PDF file on disk.
    :return: list of text fragments, one per ``LTTextBoxHorizontal`` found,
        in page order.
    :raises PDFTextExtractionNotAllowed: when the document forbids text
        extraction (intercepted by ``deco``).
    """
    content_ = []
    # The context manager guarantees the file handle is released even when
    # parsing fails part-way through.
    with open(path, 'rb') as fp:
        # Parser that reads the raw PDF stream.
        parse = PDFParser(fp)
        # Document object that stores the parsed structure.
        document = PDFDocument(parse)
        # Link parser and document to each other.
        parse.set_document(document)
        document.set_parser(parse)
        # Initialise without arguments, i.e. no password is supplied.
        document.initialize()
        # Refuse documents that do not allow text extraction.
        if not document.is_extractable:
            raise PDFTextExtractionNotAllowed
        # Resource manager handed to both the device and the interpreter.
        rsrcmgr = PDFResourceManager()
        # Layout-analysis parameters (library defaults).
        laparams = LAParams()
        # Aggregating device that collects the layout of each page.
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        # Interpreter that renders pages onto the device.
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        for page in document.get_pages():
            interpreter.process_page(page)
            # Layout of the page that was just processed.
            layout = device.get_result()
            for x in layout:
                if isinstance(x, LTTextBoxHorizontal):
                    content_.append(x.get_text())
    return content_

def turn_on_off(self, lines, status, start, end):
    """Update an on/off capture flag for one line of text.

    The flag switches on when the line begins with ``start`` and switches
    off when it is currently on and the line equals ``end`` exactly.  The
    start test takes precedence over the end test.

    :param lines: the current line of text.
    :param status: the flag value before this line.
    :param start: prefix that turns the flag on.
    :param end: full line that turns the flag off.
    :return: the flag value after this line.
    """
    # A start marker always wins, whatever the previous state was.
    if lines.startswith(start):
        return True
    # Only an active flag can be switched off, and only by the exact end line.
    if status and lines == end:
        return False
    # Otherwise the state carries over unchanged.
    return status

def savefile(self, filename, data, path):
    """Write ``data`` to the file ``path + filename``.

    The target is built by plain string concatenation, so ``path`` must
    already end with a path separator.  Failure to open the file is
    reported on stdout and otherwise ignored (best effort); errors while
    writing propagate to the caller.

    :param filename: name of the file to create.
    :param data: bytes to write.
    :param path: directory prefix, including the trailing separator.
    :return: None
    """
    filepath = path + filename
    print('Save as: ' + filepath)
    try:
        f = open(filepath, 'wb')
    except OSError:
        # Best effort: a missing directory or permission problem is reported
        # and the attachment is skipped.
        print(filepath + ' open failed')
        return
    # Close the handle even if the write itself fails.
    with f:
        f.write(data)

def decode_str(self, s):
    """Decode an RFC 2047 encoded header value into text.

    Every fragment returned by ``decode_header`` is decoded and the pieces
    are concatenated, so headers made of several encoded words are no
    longer truncated to their first fragment.

    :param s: raw header value.
    :return: the decoded value as ``str``.
    """
    fragments = []
    for value, charset in decode_header(s):
        if isinstance(value, bytes):
            try:
                # Fragments without a charset are the unencoded parts of the
                # header, which decode_header hands back as
                # raw-unicode-escape bytes.
                value = value.decode(charset or 'raw-unicode-escape')
            except (LookupError, UnicodeDecodeError):
                # Unknown or wrong charset label: keep what can be read
                # rather than losing the whole header.
                value = value.decode('utf-8', errors='replace')
        fragments.append(value)
    return ''.join(fragments)

def guess_charset(self, msg):
    """Determine the character set of a message part.

    The charset attached to the message object is preferred.  When there is
    none, the ``charset`` parameter of the ``Content-Type`` header is used;
    it is parsed by the standard library, so surrounding quotes and any
    following parameters (e.g. ``; format=flowed``) are not included.

    :param msg: an ``email.message.Message`` (or part of one).
    :return: the charset, or ``None`` when none is declared.
    """
    charset = msg.get_charset()
    if charset is None:
        # Returns the lower-cased charset parameter, or None if absent.
        charset = msg.get_content_charset()
    return charset

@deco
def print_info(self, msg, mypath):
    """Parse one e-mail message into a flat dictionary.

    The result always contains the keys ``from``, ``to``, ``Subject``,
    ``date``, ``url``, ``names``, ``money``, ``mingxi`` and ``contents``;
    values that cannot be determined are empty strings.  PDF attachments are
    saved under ``mypath`` and scanned for the contract sections.

    Because the method is wrapped by ``deco``, an unexpected exception is
    logged and ``None`` is returned instead of the dictionary.

    :param msg: parsed ``email.message.Message``.
    :param mypath: directory prefix (with trailing separator) where PDF
        attachments are stored.
    :return: dictionary of header values and extracted text.
    """
    import os  # local import: only needed to sanitise attachment names

    # Every key a caller indexes is present from the start.
    headers = {
        'from': '', 'to': '', 'Subject': '', 'date': '',
        'url': '', 'names': '', 'money': '', 'mingxi': '', 'contents': '',
    }
    # Fresh accumulators per message, so lines extracted from earlier
    # messages do not leak into this one.
    self.name_lis, self.money_lis, self.mingxi_lis = [], [], []

    # From, To, Subject and Date live on the root message object.
    for header in ('From', 'To', 'Subject', 'Date'):
        value = msg.get(header, '')
        if not value:
            continue
        if header == 'Date':
            headers['date'] = value
        elif header == 'Subject':
            headers['Subject'] = self.decode_str(value)
        else:
            # Address headers: decode the display name, keep the address.
            hdr, addr = parseaddr(value)
            name = self.decode_str(hdr)
            headers[header.lower()] = u'%s <%s>' % (name, addr)

    for part in msg.walk():
        filename = part.get_filename()
        charset = self.guess_charset(part)

        if filename:
            filename = self.decode_str(filename)
            # The name comes from the sender: drop any directory components
            # so the attachment cannot be written outside ``mypath``.
            filename = os.path.basename(filename.replace('\\', '/'))
            if '.pdf' in filename.lower():
                print('Accessory: ' + filename)
                pdf_path = mypath + filename
                headers['url'] = pdf_path
                self.savefile(filename, part.get_payload(decode=True), mypath)
                self._collect_pdf_sections(pdf_path)
                headers['names'] = ''.join(self.name_lis)
                headers['money'] = ''.join(self.money_lis)
                headers['mingxi'] = ''.join(self.mingxi_lis)

        if charset:
            try:
                content = part.get_payload(decode=True).decode(charset)
            except (LookupError, UnicodeError, AttributeError, TypeError):
                # Undecodable body: keep the original placeholder text.
                content = '英文'
            # Keep only runs of Chinese characters from the body text.
            headers['contents'] = ','.join(re.findall(u'[\u4e00-\u9fa5]+', content)).replace('宋体', '')

    return headers

def _collect_pdf_sections(self, pdf_path):
    """Append the name, amount and repayment-detail lines of a PDF to the lists.

    The PDF text is written to ``pdf.txt`` in the working directory and then
    read back line by line; lines between each start marker and its end
    marker are appended to ``name_lis``, ``money_lis`` and ``mingxi_lis``.
    """
    # pdf_text yields None when extraction failed; treat that as "no text".
    texts = self.pdf_text(pdf_path) or []
    with open('pdf.txt', 'wb') as f:
        f.write(''.join(texts).encode('utf-8'))

    names = money = mingxi = False
    with open('pdf.txt', 'r', encoding='utf-8') as openfile:
        for line in openfile:
            # Each flag is tested before it is updated, so the start line is
            # skipped and the end line is still captured.
            if names:
                self.name_lis.append(line.strip())
            names = self.turn_on_off(line, names, '甲方(借款人):', '鉴于:\n')
            if money:
                self.money_lis.append(line.strip())
            money = self.turn_on_off(line, money, '第一条 借款金额、期限及利息、借款类型、借款提现', '第二条 还款\n')
            if mingxi:
                self.mingxi_lis.append(line.strip())
            mingxi = self.turn_on_off(line, mingxi, '2.4 还款计划明细\n',
                                      '甲方还款日如发生变化的,甲方同意服务方以录音电话或电子邮件并辅助以短信的方\n')

@deco
def save_file(self):
    """Log in to a POP3 mailbox and store every message in ``email_sina``.

    Prompts for the account and authorisation code, picks the POP3 host from
    the address, makes sure the ``email_sina`` table exists, then retrieves
    each message, parses it with ``print_info`` and writes one row per
    message.  Messages that cannot be decoded or parsed are skipped.  The
    POP3 and database connections are closed even when an error occurs.

    :raises SystemExit: when the mail provider is not supported (this is not
        an ``Exception`` subclass, so ``deco`` does not intercept it).
    """
    email = str(input('请输入账号:'))
    password = str(input('请输入授权码:'))

    pop3_server = self._pop3_server_for(email)
    if pop3_server is None:
        logger.info('不支持此邮箱%s,或程序出错' % email)
        print('暂不支持此邮箱')
        raise SystemExit

    mypath = PATH
    print(pop3_server)
    logger.info('传入的账号是%s' % email)

    # Placeholders let the driver quote the values; the previous string
    # concatenation broke on quotes and allowed SQL injection from mail data.
    insert_sql = ("replace into email_sina(`username`,`froms`,`tos`,`subject`,`dates`,"
                  "`fu_url`,`contents`,`names`,`money`,`detail`) "
                  "values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)")
    row_keys = ('from', 'to', 'Subject', 'date', 'url', 'contents', 'names', 'money', 'mingxi')

    server = poplib.POP3_SSL(pop3_server)
    try:
        # NOTE(review): debug level 1 echoes the POP3 commands to stdout,
        # which appears to include the PASS line -- confirm and consider
        # lowering it.
        server.set_debuglevel(1)
        # Optional: show the server's welcome banner.
        print(server.getwelcome().decode('utf-8'))
        # Authenticate.
        server.user(email)
        server.pass_(password)

        try:
            sql = "create table if not exists email_sina(username char(255) not null,froms char(255) not null,tos char(255) not null,subject char(255) not null primary key,dates char(255),fu_url char(255),contents char(255),names varchar(1000),money varchar(1000),detail varchar(1000))"
            self.cursor.execute(sql)
            self.db.commit()
            print('创建email_sina表')
        except Exception as e:
            # "if not exists" already covers an existing table, so reaching
            # this point is a real failure; report the actual cause.
            print('创建email_sina表失败: %s' % e)

        # list() returns one entry per message; numbering starts at 1.
        resp, mails, octets = server.list()
        for i in range(1, len(mails) + 1):
            resp, lines, octets = server.retr(i)
            # ``lines`` holds the raw message, one bytes object per line.
            try:
                msg_content = b'\r\n'.join(lines).decode('utf-8')
            except UnicodeDecodeError:
                logger.info('第%s封邮件不是UTF-8编码,已跳过' % i)
                continue

            msg = Parser().parsestr(msg_content)
            dic = self.print_info(msg, mypath)
            if not dic:
                # print_info is wrapped by deco and yields None on failure.
                continue

            try:
                params = (email,) + tuple(dic.get(key, '') for key in row_keys)
                self.cursor.execute(insert_sql, params)
                self.db.commit()
                print(dic.get('Subject', ''), '插入')
            except Exception as e:
                print(e)
                self.db.rollback()
                continue

        logger.info('账号解析完成')
    finally:
        server.close()
        self.db.close()

def _pop3_server_for(self, email):
    """Return the POP3 host for ``email``, or None if the provider is unsupported."""
    if '@163.' in email:
        return 'pop3.163.com'
    if '@126.' in email:
        return 'pop3.126.com'  # NetEase
    if '@qq.com' in email:
        return 'pop.qq.com'  # Tencent
    # Each marker must be tested against the address; a bare
    # "'a' or 'b' in email" is always true.
    if any(marker in email for marker in ('@sina.com', '@2008.sina', '@51uc.com')):
        return 'pop.sina.com'  # Sina
    if '@188.com' in email:
        return 'pop3.188.com'
    return None

# Script entry point: build the mailbox client and run the download/store job.
if __name__ == '__main__':
    mailbox = Email()
    mailbox.save_file()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值