我就废话不多说了,大家还是看代码吧!
import PyPDF2
import re
# Read every page of 'xxx.pdf', collect all whitespace-separated tokens into
# line_list, then flatten them into the single string line_buf.
line_list = []
# The context manager closes the file even when PyPDF2 raises part-way
# through; a trailing close() call would be skipped in that case.
with open('xxx.pdf', mode='rb') as pdf_file:
    # NOTE(review): PdfFileReader / getNumPages / getPage / extractText are
    # the legacy PyPDF2 names; newer releases rename them (PdfReader, .pages,
    # extract_text) -- confirm against the installed PyPDF2 version.
    read_pdf = PyPDF2.PdfFileReader(pdf_file)
    # Total number of pages in the PDF.
    number_of_pages = read_pdf.getNumPages()
    # Pages are extracted inside the `with` block because the reader works
    # on the still-open file object.
    for i in range(number_of_pages):
        # Extract the text of the current page.
        page = read_pdf.getPage(i)
        page_content = page.extractText()
        # Split the page text on whitespace and append its tokens to the
        # running list covering all pages.
        line_list += page_content.split()
# Prefix each token with a single space and concatenate in one pass; the
# result therefore starts with a space whenever line_list is non-empty.
line_buf = ''.join(' ' + buf for buf in line_list)
# 匹配数据:第一列和第二列 如:000069.sz 和 100
# print(line_buf)
a = re.findall('([0-9]+[0-9]+[0-9]+[0-9]+[0-