1.将pdf文件内容写入txt文件:
利用PDFminer3k
模块来抽取PDF内容,包括文本、图像、曲线等:
# -*- coding: utf-8 -*-
import sys
import importlib
importlib.reload(sys)
from pdfminer.pdfparser import PDFParser,PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import *
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
'''
解析pdf文件,获取文件中包含的各种对象
'''
# 解析pdf文件函数
def parse(pdf_path):
fp = open(pdf_path, 'rb') # 以二进制读模式打开
# 用文件对象来创建一个pdf文档分析器
parser = PDFParser(fp)
# 创建一个PDF文档
doc = PDFDocument()
# 连接分析器 与文档对象
parser.set_document(doc)
doc.set_parser(parser)
# 提供初始化密码
# 如果没有密码 就创建一个空的字符串
doc.initialize()
# 检测文档是否提供txt转换,不提供就忽略
if not doc.is_extractable:
raise PDFTextExtractionNotAllowed
else:
# 创建PDf 资源管理器 来管理共享资源
rsrcmgr = PDFResourceManager()
# 创建一个PDF设备对象
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)
# 创建一个PDF解释器对象
interpreter = PDFPageInterpreter(rsrcmgr, device)
# 用来计数页面,图片,曲线,figure,水平文本框等对象的数量
num_page, num_image, num_curve, num_figure, num_TextBoxHorizontal = 0, 0, 0, 0, 0
# 循环遍历列表,每次处理一个page的内容
for page in doc.get_pages(): # doc.get_pages() 获取page列表
num_page += 1 # 页面增一
interpreter.process_page(page)
# 接受该页面的LTPage对象
layout = device.get_result()
for x in layout:
if isinstance(x,LTImage): # 图片对象
num_image += 1
if isinstance(x,LTCurve): # 曲线对象
num_curve += 1
if isinstance(x,LTFigure): # figure对象
num_figure += 1
if isinstance(x, LTTextBoxHorizontal): # 获取文本内容
num_TextBoxHorizontal += 1 # 水平文本框对象增一
# 保存文本内容
with open(r'test.txt', 'a') as f:
results = x.get_text()
print(results,end='')
f.write(results)
print('对象数量:\n','页面数:%s\n'%num_page,'图片数:%s\n'%num_image,'曲线数:%s\n'%num_curve,'水平文本框:%s\n'
%num_TextBoxHorizontal)
if __name__ == '__main__':
pdf_path = r'D:\python tests\ZQfd_paiming\pdf\12.pdf'
parse(pdf_path)
2.利用pdf2htmlEX工具,将pdf转化为html文件,分析源码,根据格式提取出需要的内容:
下载pdf2htmlEX,将需要处理的pdf转化为html:
(1)先处理前两页,转化得到 13.html:
# -*- coding: UTF-8 -*-
# -*- coding: gbk -*-
from bs4 import BeautifulSoup
import re
file = open('C:/Users/wdh/Desktop/pdf2htmlEX-win32-0.14.6-upx-with-poppler-data/13.html', 'rb')
# html=str(file.read()) #str类型,中文->十六进制
html = file.read() # byte类型,直接显示中文,但是下文的定位是根据十六进制来的
div_bf = BeautifulSoup(html, 'lxml')
div = div_bf.find_all('div', class_='c x0 y0 w2 h2') # 每一页
# div=div_bf.find_all('div',class_='pf w0 h0')
pattern_id = re.compile('<div class="t m0 x8 h3 y.*? ff1 fs0 fc1 sc1 ls0 ws0">') # 以学号开始的每个人的信息
pattern_name = re.compile('<div class="t m0 x9 h3 y.*? ff1 fs0 fc1 sc1 ls0 ws0">') # 姓名
pattern_grade = re.compile('<div class="t m0 xb h3 y.*? ff1 fs0 fc1 sc1 ls0 ws0">') # 分数:初试、复试、总成绩
ID = []
NAME = []
GRADE1 = [] # 初试
GRADE2 = [] # 复试
GRADE3 = [] # 总成绩
COLLEGE = []
page_num = len(div)
for i in range(page_num):
ID += [[]]
NAME += [[]]
GRADE1 += [[]]
GRADE2 += [[]]
GRADE3 += [[]]
COLLEGE += [[]]
str_each = str(div[i]) # 分析每页信息
per_id = re.findall(pattern_id, str_each)
per_name = re.findall(pattern_name, str_each)
per_grade = re.findall(pattern_grade, str_each)
per_num = len(per_id) # 每页人数
for j in range(per_num):
start_id = str_each.find(per_id[j]) + 50
if str_each[start_id] == '>':
id = str_each[start_id + 1:start_id + 6]
else:
id = str_each[start_id + 2:start_id + 7]
ID[i].append(id)
start_name = str_each.find(per_name[j]) + 50
end_name = str_each.find('<', start_name)
if str_each[start_name] == '>':
name = str_each[start_name + 1:end_name]
else:
name = str_each[start_name + 2:end_name]
NAME[i].append(name.replace(' ', '')) # 避免名字中有空格出现
start_grade = str_each.find(per_grade[j]) + 50
if str_each[start_grade] != '>':
start_grade += 1
grade1 = str_each[start_grade + 1:start_grade + 4]
grade2 = str_each[start_grade + 31:start_grade + 36]
grade3 = str_each[start_grade + 63:start_grade + 68]
GRADE1[i].append(grade1)
GRADE2[i].append(grade2)
GRADE3[i].append(grade3)
# 有的学院和专业是跟在姓名div后面的,有的是另外开辟了div,处理起来比较麻烦
# 处理姓名到得分之间的部分,将这部分中诸如<span class="_ _3"></span>
# 或者</div><div class="t m0 xd h3 y13 ff1 fs0 fc1 sc1 ls0 ws0">去掉
college_info = str_each[end_name:start_grade + 1]
pattern_useless1 = re.compile('<span class="_ _[0-9|a-f]"></span>')
pattern_useless2 = re.compile('<span class="_ _[0-9|a-f]"> </span>')
pattern_useless3 = re.compile('</div><div class="t m0 x.*? h3 y.*? ff1 fs0 fc1 sc1 ls0 ws0">')
useless1 = re.findall(pattern_useless1, college_info)
useless2 = re.findall(pattern_useless2, college_info)
useless3 = re.findall(pattern_useless3, college_info)
for each in useless1:
college_info = college_info.replace(each, '')
for each in useless2:
college_info = college_info.replace(each, '')
for each in useless3:
college_info = college_info.replace(each, '')
COLLEGE[i].append(college_info)
# print('%-10s %-10s %-50s %-5s %-7s %-7s' % (ID[i][j],NAME[i][j],COLLEGE[i][j],GRADE1[i][j],GRADE2[i][j],GRADE3[i][j]))
print('|%-10s|%s' % (ID[i][j], NAME[i][j]), end='')
for x in range(10 - len(NAME[i][j])):
print('\u3000', end='')
print('|%s' % COLLEGE[i][j], end='')
for x in range(30 - len(COLLEGE[i][j])):
print('\u3000', end='')
print('|%-7s|%-7s|%-7s|' % (GRADE1[i][j], GRADE2[i][j], GRADE3[i][j]))
(2)再处理全部 168 页。各页结构不完全相同,使用的方法也有很大差别。转化后得到 zong1.html:
效果截图:
# -*- coding: UTF-8 -*-
# -*- coding: gbk -*-
from bs4 import BeautifulSoup
import re
file=open('C:/Users/wdh/Desktop/pdf2htmlEX-win32-0.14.6-upx-with-poppler-data/zong1.html','rb')
#html=str(file.read()) #str类型,中文->十六进制
html=file.read() #byte类型,直接显示中文,但是下文的定位是根据十六进制来的
div_bf=BeautifulSoup(html,'lxml')
div=div_bf.find_all('div',class_='pf w0 h0')
pattern_id=re.compile('<div class="t m0 x4 h3 y[0-9|a-f]+ ff2 fs0 fc0 sc1 ls1 ws0">') #以学号开始的每个人的信息
pattern_name=re.compile('<span class="ff1">.*?</span>') #姓名
#pattern_college1=re.compile('<div class="t m0 x9 h3 y[0-9|a-f]+ ff1 fs0 fc0 sc1 ls1 ws0">') #学院
pattern_college1=re.compile('<div class=".*?"')
pattern_college2=re.compile('<span class="_ _[0-9|a-f]+">')
pattern_grade=re.compile('<div class="t m0 xb h3 y.*? ff2 fs0 fc0 sc1 ls1 ws0">') #分数:初试、复试、总成绩
ID=[]
NAME=[]
GRADE1=[] #初试
GRADE2=[] #复试
GRADE3=[] #总成绩
COLLEGE=[] #学院
REMARKS=[] #备注
page_num=len(div)
for i in range(page_num):
ID+=[[]]
NAME+=[[]]
GRADE1+=[[]]
GRADE2+=[[]]
GRADE3+=[[]]
COLLEGE+=[[]]
REMARKS+=[[]]
str_each=str(div[i]) #分析每页信息
per_id=re.findall(pattern_id,str_each)
per_name=re.findall(pattern_name,str_each)
per_college1=re.findall(pattern_college1,str_each)
per_college2 = re.findall(pattern_college2, str_each)
per_grade=re.findall(pattern_grade,str_each)
per_num=len(per_id) #每页人数
for j in range(per_num):
start_id=str_each.find(per_id[j])+50
if str_each[start_id]=='>':
id=str_each[start_id+1:start_id+6]
else:
id=str_each[start_id+2:start_id+7]
ID[i].append(id)
#学号是规整的,姓名有的跟在学号后面:<span class="ff1">.*?</span>'
# 有的:<div class="t m0 x4 h3 y.*? ff1 fs0 fc0 sc1 ls1 ws0">.*?</div>
start_name1=str_each.find('<span class="ff1">',start_id)
start_name2=str_each.find('<div class="t m0 x4 h3 y',start_id)
if start_name1==-1:
if str_each[start_name2+50]!='>':
start_name=start_name2+51
else:
start_name=start_name2+50
elif start_name2==-1:
start_name=start_name1+18
elif start_name1<start_name2:
start_name=start_name1+18
else:
if str_each[start_name2+50]!='>':
start_name=start_name2+51
else:
start_name=start_name2+50
end_name=str_each.find('</',start_name)
name=str_each[start_name:end_name]
NAME[i].append(name.replace(' ','').replace('>','').replace('·','\u3000')) #避免名字中有空格出现
#只有学号是规整的,分数有的跟在学院后面:<span class="ff2">
#有的:<div class="t m0 xb h3 y22 ff2 fs0 fc0 sc1 ls1 ws0">
start_grade1 = str_each.find('<span class="ff2">', start_id)
start_grade2 = str_each.find('<div class="t m0 xb h3 y', start_id)
start_grade3=str_each.find('<div class="t m0 x10 h3 y',start_id)
if start_grade3!=-1 and start_grade3<start_grade2:
start_grade2=start_grade3
if start_grade1 == -1:
if str_each[start_grade2 + 50] != '>':
start_grade = start_grade2 + 52
else:
start_grade = start_grade2 + 51
elif start_grade2 == -1:
start_grade = start_grade1 + 18
elif start_grade1 < start_grade2:
start_grade = start_grade1 + 18
else:
if str_each[start_grade2 + 50] != '>':
start_grade = start_grade2 + 52
else:
start_grade = start_grade2 + 51
end_grade = str_each.find('</div', start_grade)
GRADE1[i].append(str_each[start_grade:start_grade+3].replace('>',''))
if str_each[start_grade+30].isdigit():
GRADE2[i].append(str_each[start_grade+30:start_grade+35])
GRADE3[i].append(str_each[start_grade + 62:start_grade + 67])
else:
GRADE2[i].append('-')
GRADE3[i].append('-')
'''
if str_each[start_grade:end_grade].count('<span class=')>=3: #有备注
start_remarks=str_each[start_grade:end_grade].rfind('>')
remarks=str_each[start_remarks+1:end_grade]
print(remarks)
REMARKS[i].append(remarks)
else:
REMARKS[i].append('-')
'''
# 学院专业信息在姓名和成绩之间,再利用正则表达式,将格式信息等去掉
college = str_each[end_name:start_grade]
for each in per_college1:
college=college.replace(each,'')
for each in per_college2:
college=college.replace(each,'')
college=college.replace('<span class="ff2">','').replace('</span>','').replace('</div>','').replace('>','').replace(' ','').replace('(','(').replace(')',')')
COLLEGE[i].append(college)
print('|%-10s|%s' % (ID[i][j],NAME[i][j]),end='')
for x in range(12-len(NAME[i][j])):
print('\u3000',end='')
print('|%s' % COLLEGE[i][j],end='')
for x in range(30-len(COLLEGE[i][j])):
print('\u3000',end='')
print('|%-7s|%-7s|%-7s|' % (GRADE1[i][j],GRADE2[i][j],GRADE3[i][j]))
3.在上述代码的基础上添加几行,使用xlwt将字典写入excel中:
效果截图:
# coding=utf-8
from xlwt import *
# 需要xlwt库的支持
# -*- coding: UTF-8 -*-
# -*- coding: gbk -*-
from bs4 import BeautifulSoup
import re
file=open('C:/Users/wdh/Desktop/pdf2htmlEX-win32-0.14.6-upx-with-poppler-data/zong1.html','rb')
#html=str(file.read()) #str类型,中文->十六进制
html=file.read() #byte类型,直接显示中文,但是下文的定位是根据十六进制来的
div_bf=BeautifulSoup(html,'lxml')
div=div_bf.find_all('div',class_='pf w0 h0')
pattern_id=re.compile('<div class="t m0 x4 h3 y[0-9|a-f]+ ff2 fs0 fc0 sc1 ls1 ws0">') #以学号开始的每个人的信息
pattern_name=re.compile('<span class="ff1">.*?</span>') #姓名
#pattern_college1=re.compile('<div class="t m0 x9 h3 y[0-9|a-f]+ ff1 fs0 fc0 sc1 ls1 ws0">') #学院
pattern_college1=re.compile('<div class=".*?"')
pattern_college2=re.compile('<span class="_ _[0-9|a-f]+">')
pattern_grade=re.compile('<div class="t m0 xb h3 y.*? ff2 fs0 fc0 sc1 ls1 ws0">') #分数:初试、复试、总成绩
ID=[]
NAME=[]
GRADE1=[] #初试
GRADE2=[] #复试
GRADE3=[] #总成绩
COLLEGE=[] #学院
REMARKS=[] #备注
page_num=len(div)
file_excel = Workbook(encoding='utf-8')
# 指定file以utf-8的格式打开
table = file_excel.add_sheet('data')
# 指定打开的文件名
data = {'考生编号 (后五位)':['姓名','拟录取院系/专业','初试成绩','复试成绩','总成绩']}
# 字典数据
for i in range(page_num):
ID+=[[]]
NAME+=[[]]
GRADE1+=[[]]
GRADE2+=[[]]
GRADE3+=[[]]
COLLEGE+=[[]]
REMARKS+=[[]]
str_each=str(div[i]) #分析每页信息
per_id=re.findall(pattern_id,str_each)
per_name=re.findall(pattern_name,str_each)
per_college1=re.findall(pattern_college1,str_each)
per_college2 = re.findall(pattern_college2, str_each)
per_grade=re.findall(pattern_grade,str_each)
per_num=len(per_id) #每页人数
for j in range(per_num):
start_id=str_each.find(per_id[j])+50
if str_each[start_id]=='>':
id=str_each[start_id+1:start_id+6]
else:
id=str_each[start_id+2:start_id+7]
ID[i].append(id)
#学号是规整的,姓名有的跟在学号后面:<span class="ff1">.*?</span>'
# 有的:<div class="t m0 x4 h3 y.*? ff1 fs0 fc0 sc1 ls1 ws0">.*?</div>
start_name1=str_each.find('<span class="ff1">',start_id)
start_name2=str_each.find('<div class="t m0 x4 h3 y',start_id)
if start_name1==-1:
if str_each[start_name2+50]!='>':
start_name=start_name2+51
else:
start_name=start_name2+50
elif start_name2==-1:
start_name=start_name1+18
elif start_name1<start_name2:
start_name=start_name1+18
else:
if str_each[start_name2+50]!='>':
start_name=start_name2+51
else:
start_name=start_name2+50
end_name=str_each.find('</',start_name)
name=str_each[start_name:end_name]
NAME[i].append(name.replace(' ','').replace('>','').replace('·','\u3000')) #避免名字中有空格出现
#只有学号是规整的,分数有的跟在学院后面:<span class="ff2">
#有的:<div class="t m0 xb h3 y22 ff2 fs0 fc0 sc1 ls1 ws0">
start_grade1 = str_each.find('<span class="ff2">', start_id)
start_grade2 = str_each.find('<div class="t m0 xb h3 y', start_id)
start_grade3=str_each.find('<div class="t m0 x10 h3 y',start_id)
if start_grade3!=-1 and start_grade3<start_grade2:
start_grade2=start_grade3
if start_grade1 == -1:
if str_each[start_grade2 + 50] != '>':
start_grade = start_grade2 + 52
else:
start_grade = start_grade2 + 51
elif start_grade2 == -1:
start_grade = start_grade1 + 18
elif start_grade1 < start_grade2:
start_grade = start_grade1 + 18
else:
if str_each[start_grade2 + 50] != '>':
start_grade = start_grade2 + 52
else:
start_grade = start_grade2 + 51
end_grade = str_each.find('</div', start_grade)
GRADE1[i].append(str_each[start_grade:start_grade+3].replace('>',''))
if str_each[start_grade+30].isdigit():
GRADE2[i].append(str_each[start_grade+30:start_grade+35])
GRADE3[i].append(str_each[start_grade + 62:start_grade + 67])
else:
GRADE2[i].append('-')
GRADE3[i].append('-')
'''
if str_each[start_grade:end_grade].count('<span class=')>=3: #有备注
start_remarks=str_each[start_grade:end_grade].rfind('>')
remarks=str_each[start_remarks+1:end_grade]
print(remarks)
REMARKS[i].append(remarks)
else:
REMARKS[i].append('-')
'''
# 学院专业信息在姓名和成绩之间,再利用正则表达式,将格式信息等去掉
college = str_each[end_name:start_grade]
for each in per_college1:
college=college.replace(each,'')
for each in per_college2:
college=college.replace(each,'')
college=college.replace('<span class="ff2">','').replace('</span>','').replace('</div>','').replace('>','').replace(' ','').replace('(','(').replace(')',')')
COLLEGE[i].append(college)
target=[]
target.append(NAME[i][j])
target.append(COLLEGE[i][j])
target.append(GRADE1[i][j])
target.append(GRADE2[i][j])
target.append(GRADE3[i][j])
data[ID[i][j]]=target
ldata=[]
num=[a for a in data]
#for循环指定取出key值存入num中
for x in num:
#for循环将data字典中的键和值分批的保存在ldata中
t=[x]
for a in data[x]:
t.append(a)
ldata.append(t)
for i,p in enumerate(ldata):
#将数据写入文件,i是enumerate()函数返回的序号数
for j,q in enumerate(p):
table.write(i,j,q)
file_excel.save('data.xlsx')