Python使用pdfplumber获取PDF所需信息

import pandas as pd 
from selenium import webdriver
import time
# import pymysql
# import pymssql
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings('ignore')
import datetime
from datetime import timedelta
from urllib import parse
import xlwings as xw
import fitz
import pdfplumber

部分摘录:https://blog.csdn.net/weixin_39588206/article/details/113707858

模块安装

  • 首先需要安装两个模块,第一个是pdfplumber,在命令行使用pip安装即可

  • pip install pdfplumber

  • 第二个是fitz, 它是pymupdf中的一个模块,同样可以使用pip轻松安装

  • pip install pymupdf

文字信息提取

使用Python提取PDF中文字和表格信息的代码思路如下:

1、利用pdfplumber打开一个 PDF 文件

2、获取指定的页,或者遍历每一页

3、利用.extract_text()方法提取当前页的文字

4、利用.extract_tables()方法提取当前页的所有表格

5、利用.extract_table()方法提取当前页中最大的一个表格


def IL_HAWB_PDF(pdf_path):
    """Extract the cost and the HAWB number from an IL-layout PDF.

    Only the first page is read. The table returned by
    ``page.extract_table()`` is expected to hold both values in the cell at
    row 19, column 0, each enclosed between a pair of ``(cid:N)`` marker
    sequences (the way the glyphs of this document come out of the text
    extraction).

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A ``(cost, hawb)`` tuple of strings. Spaces are removed from both
        values and ``(cid:15)`` is turned into ``,`` in the cost.

    Raises:
        ValueError: If the expected table cell is missing or empty, or if
            one of the markers cannot be found in it.
    """
    row_idx, col_idx = 19, 0  # cell that carries the wanted information

    with pdfplumber.open(pdf_path) as pdf:
        doc_table = pdf.pages[0].extract_table()

    try:
        cell_txt = doc_table[row_idx][col_idx]
    except (TypeError, IndexError):
        # TypeError: no table was detected (None); IndexError: table too small.
        cell_txt = None
    if not cell_txt:
        raise ValueError(
            'no text in table cell ({}, {}) of the first page'.format(
                row_idx, col_idx))

    def between(start, end):
        # Text enclosed by `start` and the first `end` that follows it.
        _, found, tail = cell_txt.partition(start)
        if not found:
            raise ValueError('start marker {!r} not found'.format(start))
        value, found, _ = tail.partition(end)
        if not found:
            raise ValueError(
                'end marker {!r} not found after {!r}'.format(end, start))
        return value

    # Find location -- > cost
    cost_txt_chr_start = '₪(cid:3)'
    cost_txt_chr_end = '(cid:676)(cid:689)(cid:688)'
    cost_ = (
        between(cost_txt_chr_start, cost_txt_chr_end)
        # Drop any repeated start marker left inside the value.
        .replace(cost_txt_chr_start, '')
        .replace(' ', '')
        .replace('(cid:15)', ',')
    )

    # Find location -- > hawb
    hawb_txt_chr_start = '(cid:3)(cid:674)(cid:677)(cid:689)'
    hawb_txt_chr_end = '(cid:696)(cid:679)(cid:672)'
    hawb_ = (
        between(hawb_txt_chr_start, hawb_txt_chr_end)
        .replace(hawb_txt_chr_start, '')
        .replace(' ', '')
    )
    return cost_, hawb_
IL_HAWB_PDF(r"C:\Users\chenjx29\Desktop\PDF 爬取信息\IL.PDF")
('1,357', '1283995790')

def AU_HAWB_PDF(pdf_path):
    """Extract the payable amount and the HAWB number from an AU-layout PDF.

    Only the first page is read. The HAWB number is the text between the
    labels ``O/REF`` and ``FOB``; the cost is the text between
    ``Total AMT. PAYABLE`` and ``OFFICIAL USE ONLY``.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A ``(cost, hawb)`` tuple of strings. Spaces and colons are removed
        from both values; newlines are additionally removed from the cost.

    Raises:
        ValueError: If one of the labels is missing from the page text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can yield None/'' for a page without a text layer.
        doc_txt = pdf.pages[0].extract_text() or ''

    def between(start, end):
        # Text enclosed by `start` and the first `end` that follows it.
        _, found, tail = doc_txt.partition(start)
        if not found:
            raise ValueError('label {!r} not found'.format(start))
        value, found, _ = tail.partition(end)
        if not found:
            raise ValueError(
                'label {!r} not found after {!r}'.format(end, start))
        return value

    hawb_ = between('O/REF', 'FOB').replace(' ', '').replace(':', '')
    cost_ = (
        between('Total AMT. PAYABLE', 'OFFICIAL USE ONLY')
        .replace(' ', '')
        .replace(':', '')
        .replace('\n', '')
    )
    return cost_, hawb_
AU_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\AU.PDF')
('190.00', '9807420291')
def JP_HAWB_PDF(pdf_path):
    """Extract the tax total and the AWB number from a JP-layout PDF.

    Only the first page is read. The AWB number is the text between the
    labels ``AWB番号`` and ``蔵置税関``; the cost is the text between
    ``納税額合計`` and ``通貨レート``.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A ``(cost, hawb)`` tuple of strings. Spaces are removed from both
        values; backslashes are additionally removed from the cost.

    Raises:
        ValueError: If one of the labels is missing from the page text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can yield None/'' for a page without a text layer.
        doc_txt = pdf.pages[0].extract_text() or ''

    def between(start, end):
        # Text enclosed by `start` and the first `end` that follows it.
        _, found, tail = doc_txt.partition(start)
        if not found:
            raise ValueError('label {!r} not found'.format(start))
        value, found, _ = tail.partition(end)
        if not found:
            raise ValueError(
                'label {!r} not found after {!r}'.format(end, start))
        return value

    hawb_ = between('AWB番号', '蔵置税関').replace(' ', '')
    cost_ = (
        between('納税額合計', '通貨レート')
        .replace(' ', '')
        .replace('\\', '')
    )
    return cost_, hawb_
JP_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\JP.PDF')
('4,200', '218-9476096')

def KR_HAWB_PDF(pdf_path):
    """Extract the tax total and the HAWB number from a KR-layout PDF.

    Only the first page is read. The HAWB number is the text that follows
    the label ``징수형태`` up to the next space (or to the end of the text
    when no space follows); the cost is the text between ``총세액합계`` and
    ``67 담당자``.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A ``(cost, hawb)`` tuple of strings with spaces and newlines
        removed.

    Raises:
        ValueError: If one of the labels is missing from the page text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can yield None/'' for a page without a text layer.
        doc_txt = pdf.pages[0].extract_text() or ''

    hawb_label = '징수형태'
    _, found, after_hawb_label = doc_txt.partition(hawb_label)
    if not found:
        raise ValueError('label {!r} not found'.format(hawb_label))
    # The value ends at the first space after the label; partition() keeps
    # the whole remainder when there is none instead of cutting a character.
    hawb_ = after_hawb_label.partition(' ')[0].replace('\n', '')

    cost_label_start = '총세액합계'
    cost_label_end = '67 담당자'
    _, found, after_cost_label = doc_txt.partition(cost_label_start)
    if not found:
        raise ValueError('label {!r} not found'.format(cost_label_start))
    raw_cost, found, _ = after_cost_label.partition(cost_label_end)
    if not found:
        raise ValueError(
            'label {!r} not found after {!r}'.format(
                cost_label_end, cost_label_start))
    cost_ = raw_cost.replace(' ', '').replace('\n', '')
    return cost_, hawb_
KR_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\KR.PDF')
('293,830', '1069158160')

def SG_HAWB_PDF(pdf_path):
    """Extract the GST amount and the waybill number from an SG-layout PDF.

    Only the first page is read. The waybill number is the text between
    the labels ``B/L No. (Waybill)`` and ``Declaration Number``; the cost
    is the text between ``GST (7% of CIF & DUTY)`` and
    ``ACCESS Permit Date``.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A ``(cost, hawb)`` tuple of strings with spaces and newlines
        removed.

    Raises:
        ValueError: If one of the labels is missing from the page text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() can yield None/'' for a page without a text layer.
        doc_txt = pdf.pages[0].extract_text() or ''

    def between(start, end):
        # Text enclosed by `start` and the first `end` that follows it,
        # stripped of spaces and newlines.
        _, found, tail = doc_txt.partition(start)
        if not found:
            raise ValueError('label {!r} not found'.format(start))
        value, found, _ = tail.partition(end)
        if not found:
            raise ValueError(
                'label {!r} not found after {!r}'.format(end, start))
        return value.replace(' ', '').replace('\n', '')

    hawb_ = between('B/L No. (Waybill)', 'Declaration Number')
    cost_ = between('GST (7% of CIF & DUTY)', 'ACCESS Permit Date')
    return cost_, hawb_

def _nz_labelled_value(row, label):
    """Return the value of the last cell in *row* that contains *label*.

    The label itself, spaces and newlines are removed from the cell text.
    Raises ``ValueError`` when no cell of the row contains the label.
    """
    match = None
    for cell in row:
        # Empty table cells come back as None; skip them.
        if cell is not None and label in cell:
            match = cell
    if match is None:
        raise ValueError('label {!r} not found in table row'.format(label))
    return match.replace(label, '').replace(' ', '').replace('\n', '')


def NZ_HAWB_PDF(pdf_path):
    """Extract the payable amount and the bill number from an NZ-layout PDF.

    Only the first page is read. Within the first table returned by
    ``page.extract_tables()``, the bill number is taken from the cell of
    row 5 containing ``Bill Number`` and the cost from the cell of row 8
    containing ``Total Amount Payable``.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        A ``(cost, hawb)`` tuple of strings with the labels, spaces and
        newlines removed.

    Raises:
        ValueError: If the page has no table, the table has too few rows,
            or a label is missing from its expected row.
    """
    hawb_row_idx, cost_row_idx = 5, 8  # rows of the first table to inspect

    with pdfplumber.open(pdf_path) as pdf:
        tables = pdf.pages[0].extract_tables()

    if not tables:
        raise ValueError('no table found on the first page')
    first_table = tables[0]
    if len(first_table) <= max(hawb_row_idx, cost_row_idx):
        raise ValueError(
            'first table has only {} rows'.format(len(first_table)))

    hawb_ = _nz_labelled_value(first_table[hawb_row_idx], 'Bill Number')
    cost_ = _nz_labelled_value(
        first_table[cost_row_idx], 'Total Amount Payable')
    return cost_, hawb_

NZ_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\NZ.PDF')
('$331.43', '5553678922')
AU_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\AU.PDF')
('190.00', '9807420291')
JP_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\JP.PDF')
('4,200', '218-9476096')
KR_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\KR.PDF')
('293,830', '1069158160')
SG_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\SG.PDF')
('966.02', '8415998430')
IL_HAWB_PDF(r"C:\Users\chenjx29\Desktop\PDF 爬取信息\IL.PDF")
('1,357', '1283995790')

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值