import pandas as pd
from selenium import webdriver
import time
# import pymysql
# import pymssql
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings('ignore')
import datetime
from datetime import timedelta
from urllib import parse
import xlwings as xw
import fitz
import pdfplumber
部分摘录:https://blog.csdn.net/weixin_39588206/article/details/113707858
模块安装
-
首先需要安装两个模块,第一个是pdfplumber,在命令行使用pip安装即可
-
pip install pdfplumber
-
第二个是fitz, 它是pymupdf中的一个模块,同样可以使用pip轻松安装
-
pip install pymupdf
文字信息提取
使用python提取PDF中文字代码思路如下
1、利用pdfplumber打开一个 PDF 文件
2、获取指定的页,或者遍历每一页
3、利用.extract_text()方法提取当前页的文字
4、利用.extract_tables()方法提取当前页的所有表格
5、利用.extract_table()方法提取当前页中最大的一张表格
def IL_HAWB_PDF(pdf_path):
    """Extract the cost and the HAWB number from the first page of an IL PDF.

    The table returned by ``page.extract_table()`` for the first page is
    loaded into a DataFrame, and the text of cell ``(19, 0)`` is searched for
    two fields, each delimited by fixed ``(cid:N)`` marker strings.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A ``(cost_, hawb_)`` tuple of strings. Marker text and spaces are
        removed from both; in the cost, ``(cid:15)`` is replaced by ``,``.

    Raises:
        ValueError: If cell (19, 0) does not hold text, or if one of the
            marker strings cannot be found in it.
    """
    with pdfplumber.open(pdf_path) as pdf:
        doc_table = pdf.pages[0].extract_table()

    df = pd.DataFrame(doc_table)
    # The needed information is located in cell (19, 0); read it only once.
    cell_txt = df.loc[19, 0]
    if not isinstance(cell_txt, str):
        raise ValueError(f"cell (19, 0) of {pdf_path!r} contains no text")

    def _marked_span(start_marker, end_marker):
        """Return the cell text from start_marker (inclusive) up to end_marker."""
        # str.find returns -1 when the marker is absent; using -1 as a slice
        # index would silently produce a wrong value, so fail loudly instead.
        start = cell_txt.find(start_marker)
        if start == -1:
            raise ValueError(
                f"marker {start_marker!r} not found in cell (19, 0) of {pdf_path!r}"
            )
        # Look for the closing marker only after the opening one.
        end = cell_txt.find(end_marker, start + len(start_marker))
        if end == -1:
            raise ValueError(
                f"marker {end_marker!r} not found after {start_marker!r} "
                f"in cell (19, 0) of {pdf_path!r}"
            )
        return cell_txt[start:end]

    # Find location --> cost
    cost_txt_chr_start = '₪(cid:3)'
    cost_txt_chr_end = '(cid:676)(cid:689)(cid:688)'
    cost_ = (
        _marked_span(cost_txt_chr_start, cost_txt_chr_end)
        .replace(cost_txt_chr_start, '')
        .replace(' ', '')
        .replace('(cid:15)', ',')
    )

    # Find location --> hawb
    hawb_txt_chr_start = '(cid:3)(cid:674)(cid:677)(cid:689)'
    hawb_txt_chr_end = '(cid:696)(cid:679)(cid:672)'
    hawb_ = (
        _marked_span(hawb_txt_chr_start, hawb_txt_chr_end)
        .replace(hawb_txt_chr_start, '')
        .replace(' ', '')
    )
    return cost_, hawb_
# Example run of IL_HAWB_PDF on a sample file (absolute path on the author's machine).
IL_HAWB_PDF(r"C:\Users\chenjx29\Desktop\PDF 爬取信息\IL.PDF")
# The tuple below is a bare expression with no effect; it appears to be the
# pasted result of the call above, in (cost, hawb) order -- TODO confirm.
('1,357', '1283995790')
def AU_HAWB_PDF(pdf_path):
    """Extract the payable amount and the reference number from an AU PDF.

    The text of the first page is extracted with pdfplumber and two fields
    are cut out from between fixed label strings:

    * reference: the text between ``'O/REF'`` and ``'FOB'``, with spaces and
      colons removed;
    * amount: the text between ``'Total AMT. PAYABLE'`` and
      ``'OFFICIAL USE ONLY'``, with spaces, colons and newlines removed.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A ``(cost_, hawb_)`` tuple of strings.

    Raises:
        ValueError: If the first page yields no text or a label is missing.
    """
    with pdfplumber.open(pdf_path) as pdf:
        doc_txt = pdf.pages[0].extract_text()
    if not doc_txt:
        raise ValueError(f"no text could be extracted from page 1 of {pdf_path!r}")

    def _between(start_label, end_label):
        """Return the page text located between the two labels."""
        # str.find returns -1 when the label is absent; using -1 as a slice
        # index would silently produce a wrong value, so fail loudly instead.
        start = doc_txt.find(start_label)
        if start == -1:
            raise ValueError(f"label {start_label!r} not found in {pdf_path!r}")
        begin = start + len(start_label)
        # Look for the closing label only after the opening one.
        end = doc_txt.find(end_label, begin)
        if end == -1:
            raise ValueError(
                f"label {end_label!r} not found after {start_label!r} in {pdf_path!r}"
            )
        return doc_txt[begin:end]

    hawb_ = _between('O/REF', 'FOB').replace(' ', '').replace(':', '')
    cost_ = (
        _between('Total AMT. PAYABLE', 'OFFICIAL USE ONLY')
        .replace(' ', '')
        .replace(':', '')
        .replace('\n', '')
    )
    return cost_, hawb_
# Example run of AU_HAWB_PDF on a sample file (absolute path on the author's machine).
AU_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\AU.PDF')
# The tuple below is a bare expression with no effect; it appears to be the
# pasted result of the call above, in (cost, hawb) order -- TODO confirm.
('190.00', '9807420291')
def JP_HAWB_PDF(pdf_path):
    """Extract the total tax amount and the AWB number from a JP PDF.

    The text of the first page is extracted with pdfplumber and two fields
    are cut out from between fixed label strings:

    * AWB number: the text between ``'AWB番号'`` and ``'蔵置税関'``, with
      spaces removed;
    * amount: the text between ``'納税額合計'`` and ``'通貨レート'``, with
      spaces and backslashes removed.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A ``(cost_, hawb_)`` tuple of strings.

    Raises:
        ValueError: If the first page yields no text or a label is missing.
    """
    with pdfplumber.open(pdf_path) as pdf:
        doc_txt = pdf.pages[0].extract_text()
    if not doc_txt:
        raise ValueError(f"no text could be extracted from page 1 of {pdf_path!r}")

    def _between(start_label, end_label):
        """Return the page text located between the two labels."""
        # str.find returns -1 when the label is absent; using -1 as a slice
        # index would silently produce a wrong value, so fail loudly instead.
        start = doc_txt.find(start_label)
        if start == -1:
            raise ValueError(f"label {start_label!r} not found in {pdf_path!r}")
        begin = start + len(start_label)
        # Look for the closing label only after the opening one.
        end = doc_txt.find(end_label, begin)
        if end == -1:
            raise ValueError(
                f"label {end_label!r} not found after {start_label!r} in {pdf_path!r}"
            )
        return doc_txt[begin:end]

    hawb_ = _between('AWB番号', '蔵置税関').replace(' ', '')
    # Backslashes are stripped from the amount; presumably this is how the
    # currency sign comes out of text extraction -- TODO confirm.
    cost_ = _between('納税額合計', '通貨レート').replace(' ', '').replace('\\', '')
    return cost_, hawb_
# Example run of JP_HAWB_PDF on a sample file (absolute path on the author's machine).
JP_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\JP.PDF')
# The tuple below is a bare expression with no effect; it appears to be the
# pasted result of the call above, in (cost, hawb) order -- TODO confirm.
('4,200', '218-9476096')
def KR_HAWB_PDF(pdf_path):
    """Extract the total tax amount and the waybill number from a KR PDF.

    The text of the first page is extracted with pdfplumber, then:

    * waybill number: the text following ``'징수형태'`` up to the first
      space (or to the end of the text when no space follows), with
      newlines removed;
    * amount: the text between ``'총세액합계'`` and ``'67 담당자'``, with
      spaces and newlines removed.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A ``(cost_, hawb_)`` tuple of strings.

    Raises:
        ValueError: If the first page yields no text or a label is missing.
    """
    with pdfplumber.open(pdf_path) as pdf:
        doc_txt = pdf.pages[0].extract_text()
    if not doc_txt:
        raise ValueError(f"no text could be extracted from page 1 of {pdf_path!r}")

    def _after(label):
        """Return the index just past the first occurrence of label."""
        # str.find returns -1 when the label is absent; using -1 as a slice
        # index would silently produce a wrong value, so fail loudly instead.
        pos = doc_txt.find(label)
        if pos == -1:
            raise ValueError(f"label {label!r} not found in {pdf_path!r}")
        return pos + len(label)

    # Waybill number: runs from the label to the first space. partition()
    # keeps the whole tail when no space follows, instead of dropping the
    # last character as slicing with find() == -1 would.
    hawb_txt_end = ' '
    hawb_doc_txt = doc_txt[_after('징수형태'):]
    hawb_ = (
        hawb_doc_txt.partition(hawb_txt_end)[0]
        .replace(' ', '')
        .replace('\n', '')
    )

    # Amount: look for the closing label only after the opening one.
    cost_txt_end = '67 담당자'
    cost_begin = _after('총세액합계')
    cost_end = doc_txt.find(cost_txt_end, cost_begin)
    if cost_end == -1:
        raise ValueError(
            f"label {cost_txt_end!r} not found after the amount label in {pdf_path!r}"
        )
    cost_ = doc_txt[cost_begin:cost_end].replace(' ', '').replace('\n', '')
    return cost_, hawb_
# Example run of KR_HAWB_PDF on a sample file (absolute path on the author's machine).
KR_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\KR.PDF')
# The tuple below is a bare expression with no effect; it appears to be the
# pasted result of the call above, in (cost, hawb) order -- TODO confirm.
('293,830', '1069158160')
def SG_HAWB_PDF(pdf_path):
    """Extract the GST amount and the waybill number from an SG PDF.

    The text of the first page is extracted with pdfplumber and two fields
    are cut out from between fixed label strings, with spaces and newlines
    removed from both:

    * waybill number: the text between ``'B/L No. (Waybill)'`` and
      ``'Declaration Number'``;
    * amount: the text between ``'GST (7% of CIF & DUTY)'`` and
      ``'ACCESS Permit Date'``.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A ``(cost_, hawb_)`` tuple of strings.

    Raises:
        ValueError: If the first page yields no text or a label is missing.
    """
    with pdfplumber.open(pdf_path) as pdf:
        doc_txt = pdf.pages[0].extract_text()
    if not doc_txt:
        raise ValueError(f"no text could be extracted from page 1 of {pdf_path!r}")

    def _between(start_label, end_label):
        """Return the page text between the labels, without spaces/newlines."""
        # str.find returns -1 when the label is absent; using -1 as a slice
        # index would silently produce a wrong value, so fail loudly instead.
        start = doc_txt.find(start_label)
        if start == -1:
            raise ValueError(f"label {start_label!r} not found in {pdf_path!r}")
        begin = start + len(start_label)
        # Look for the closing label only after the opening one.
        end = doc_txt.find(end_label, begin)
        if end == -1:
            raise ValueError(
                f"label {end_label!r} not found after {start_label!r} in {pdf_path!r}"
            )
        return doc_txt[begin:end].replace(' ', '').replace('\n', '')

    hawb_ = _between('B/L No. (Waybill)', 'Declaration Number')
    cost_ = _between('GST (7% of CIF & DUTY)', 'ACCESS Permit Date')
    return cost_, hawb_
def NZ_HAWB_PDF(pdf_path):
    """Extract the payable amount and the bill number from an NZ PDF.

    All tables of the first page are extracted with pdfplumber and loaded
    into a DataFrame. The cells found at ``df.loc[0, 5]`` are scanned for
    the text ``'Bill Number'`` and those at ``df.loc[0, 8]`` for
    ``'Total Amount Payable'``; the label, spaces and newlines are removed
    from the matching cell to obtain the value.

    Args:
        pdf_path: Path of the PDF file to open with pdfplumber.

    Returns:
        A ``(cost_, hawb_)`` tuple of strings.

    Raises:
        ValueError: If no cell in the expected position contains the label.
            (Previously this surfaced as an UnboundLocalError at ``return``.)
    """
    with pdfplumber.open(pdf_path) as pdf:
        doc_table = pdf.pages[0].extract_tables()

    # extract_tables() returns a list of tables, each a list of rows, so each
    # DataFrame cell holds one row (a list of cell texts) of one table.
    df = pd.DataFrame(doc_table)

    def _labelled_value(cells, label):
        """Return the text of the last cell containing label, label stripped."""
        value = None
        for cell in cells:
            # Empty table cells are not strings; skip them.
            if isinstance(cell, str) and label in cell:
                value = cell.replace(label, '').replace(' ', '').replace('\n', '')
        if value is None:
            raise ValueError(f"label {label!r} not found in {pdf_path!r}")
        return value

    hawb_ = _labelled_value(df.loc[0, 5], 'Bill Number')
    cost_ = _labelled_value(df.loc[0, 8], 'Total Amount Payable')
    return cost_, hawb_
# Example runs of every extractor on sample files (absolute paths on the
# author's machine). Each tuple following a call is a bare expression with no
# effect; they appear to be the pasted results of the calls, in (cost, hawb)
# order -- TODO confirm.
NZ_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\NZ.PDF')
('$331.43', '5553678922')
AU_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\AU.PDF')
('190.00', '9807420291')
JP_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\JP.PDF')
('4,200', '218-9476096')
KR_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\KR.PDF')
('293,830', '1069158160')
SG_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\SG.PDF')
('966.02', '8415998430')
IL_HAWB_PDF(r"C:\Users\chenjx29\Desktop\PDF 爬取信息\IL.PDF")
('1,357', '1283995790')