PDF里面包含多篇报告,每篇报告以文章编号结尾,部分报告带有原文链接。目的是提取报告的标题,版号,版面,作者,文章编号,原文链接,以及文章主体部分的大概字数
代码如下:
blocks:
import fitz
from PyPDF2 import PdfFileReader, PdfFileWriter
import pandas as pd
# Clip-book PDF that holds many articles; its table of contents marks where
# each article starts.
SOURCE_PDF = r'C:\Users\Crown\Desktop\WS6.0下载文本样式PDF 15Nov2022-clipbook.pdf'
# Scratch file that receives one article at a time before its text is read.
TEMP_PDF = '../测试/temp.pdf'


def value_after_colon(text):
    """Return what follows the first colon in *text*, with newlines removed.

    Both the ASCII ':' and the full-width ':' count as the label separator.
    Only the first one is used as the cut point, so colons inside the value
    (for example the scheme or port of a URL) are preserved.
    Returns '' when *text* contains no colon at all.
    """
    cut_points = [pos for pos in (text.find(':'), text.find(':')) if pos != -1]
    if not cut_points:
        return ''
    return text[min(cut_points) + 1:].replace('\n', '')


def rotate_date(raw):
    """Move the first dash-separated component of *raw* to the end.

    'a-b-c' becomes 'b-c-a'. Text with fewer than three components is
    returned unchanged instead of raising IndexError.
    """
    pieces = raw.split('-')
    if len(pieces) < 3:
        return raw
    return '-'.join((pieces[1], pieces[2], pieces[0]))


def parse_header_line(text):
    """Split a '|'-separated header block into named fields.

    Field 0 is the media name and the first line of field 1 is the date.
    With more than four fields, field 2 is the issue number (banhao),
    field 3 the section (banmian), and the last field may carry 'By <author>'.
    Otherwise field 2 is taken as the section and the issue number is empty.

    Returns a dict with keys 'meiti', 'riqi', 'banhao', 'banmian' and, only
    when an author was found, 'zuozhi'. Returns None when *text* has fewer
    than three fields and therefore cannot be a header.
    """
    parts = text.split('|')
    if len(parts) < 3:
        return None
    fields = {
        'meiti': parts[0].strip(),
        'riqi': rotate_date(parts[1].split('\n')[0].strip()),
    }
    if len(parts) > 4:
        fields['banhao'] = parts[2].strip()
        fields['banmian'] = parts[3].strip()
        if 'By' in parts[-1]:
            # Split the author field itself, once, so a 'By' elsewhere in the
            # line or inside the name does not truncate the result.
            fields['zuozhi'] = parts[-1].split('By', 1)[1].strip()
    else:
        # NOTE(review): a four-field header never yields an author, even if
        # its last field starts with 'By' - confirm against real headers.
        fields['banhao'] = ''
        fields['banmian'] = parts[2].strip()
    return fields


def scan_article(page_blocks, title_index):
    """Collect metadata and a rough character count for one article.

    page_blocks: one list per page; each list holds the block tuples produced
        by ``get_text('blocks')``, of which item 4 is read as the block text.
    title_index: index of the block on the first page that holds the headline.

    Returns a dict with keys 'meiti', 'riqi', 'banhao', 'banmian', 'biaoti',
    'zuozhi', 'Docld', 'lianjie' (all '' when not found) and 'chars', the sum
    of ``len(text) - 1`` over the counted blocks.
    """
    found = {
        'meiti': '', 'riqi': '', 'banhao': '', 'banmian': '',
        'biaoti': '', 'zuozhi': '', 'Docld': '', 'lianjie': '',
    }
    chars = 0
    for page_no, blocks in enumerate(page_blocks):
        # The last block of every page is skipped; the original author notes
        # that it is an annotation rather than article text.
        for idx, block in enumerate(blocks[:-1]):
            text = block[4]
            if page_no == 0:
                if '文章总数' in text:
                    continue
                if '|' in text:
                    header = parse_header_line(text)
                    if header is not None:
                        found.update(header)
                if idx == title_index:
                    found['biaoti'] = text.replace('\n', '').strip()
                if '原文' in text and '/' in text:
                    found['lianjie'] = value_after_colon(text) or found['lianjie']
                if '文章编号' in text:
                    found['Docld'] = value_after_colon(text) or found['Docld']
                # On the first page every block except the '文章总数' line is
                # counted, including the header, link and id blocks.
                chars += len(text) - 1
            else:
                # On later pages header-like, id and link blocks are left out
                # of the count.
                if '|' in text:
                    continue
                if '文章编号' in text:
                    found['Docld'] = value_after_colon(text) or found['Docld']
                    continue
                if '原文' in text and '/' in text:
                    found['lianjie'] = value_after_colon(text) or found['lianjie']
                    continue
                chars += len(text) - 1
    found['chars'] = chars
    return found


if __name__ == '__main__':
    doc = fitz.Document(SOURCE_PDF)  # used for the table of contents
    # NOTE(review): PdfFileReader / PdfFileWriter / addPage / getPage are the
    # pre-3.0 PyPDF2 names; PyPDF2 3.x replaced them with PdfReader /
    # PdfWriter / add_page / pages[i] - confirm the installed version.
    pdf = PdfFileReader(SOURCE_PDF)  # used for cutting out page ranges
    title = []      # article titles taken from the table of contents
    pages = []      # 1-based start page of every article
    list_data = []  # one dict per article, later turned into a DataFrame
    for entry in doc.get_toc():
        pages.append(entry[2])
        title.append(entry[1])

    for pg in range(len(pages)):
        # An article runs from its own start page up to the next article's
        # start page; the last article runs to the end of the PDF.
        first_page = pages[pg] - 1
        if pg == len(pages) - 1:
            stop_page = len(pdf.pages)
        else:
            stop_page = pages[pg + 1] - 1

        pdf_writer = PdfFileWriter()
        for j in range(first_page, stop_page):
            pdf_writer.addPage(pdf.getPage(j))
        with open(TEMP_PDF, 'wb') as page_data:
            pdf_writer.write(page_data)

        # Close the scratch document before the next iteration rewrites the
        # same path.
        temp = fitz.Document(TEMP_PDF)
        try:
            page_blocks = [temp[n].get_text('blocks') for n in range(len(temp))]
        finally:
            temp.close()

        # The very first page of the PDF carries an extra '文章总数' line, so
        # the headline block sits one position later there.
        found = scan_article(page_blocks, 2 if pg == 0 else 1)

        data = {}
        data['标题'] = title[pg]  # the TOC title is exported, not found['biaoti']
        data['日期'] = found['riqi']
        data['媒体类型'] = '报刊'
        data['媒体'] = found['meiti']
        data['版号'] = found['banhao']
        data['版面'] = found['banmian']
        data['作者'] = found['zuozhi']
        # Fixed offset of 20 kept as written; presumably it discounts
        # non-body text - TODO confirm.
        data['字数'] = found['chars'] - 20
        data['Docld'] = found['Docld']
        data['原文链接'] = found['lianjie']
        print(data)
        list_data.append(data)

    datas = pd.DataFrame(list_data)
    datas.to_excel('1.xlsx')
data: