Python使用pdfplumber获取PDF所需信息

最新推荐文章于 2024-04-30 11:11:45 发布

大米2H

最新推荐文章于 2024-04-30 11:11:45 发布

阅读量695

点赞数

分类专栏：自动化办公　文章标签：python pdf 开发语言

本文链接：https://blog.csdn.net/m0_57446978/article/details/128374637

版权

自动化办公专栏收录该内容

6 篇文章 1 订阅

订阅专栏

import pandas as pd 
from selenium import webdriver
import time
# import pymysql
# import pymssql
from sqlalchemy import create_engine
import warnings
warnings.filterwarnings('ignore')
import datetime
from datetime import timedelta
from urllib import parse
import xlwings as xw
import fitz
import pdfplumber

部分摘录：https://blog.csdn.net/weixin_39588206/article/details/113707858

模块安装

首先需要安装两个模块，第一个是pdfplumber，在命令行使用pip安装即可
pip install pdfplumber
第二个是fitz, 它是pymupdf中的一个模块，同样可以使用pip轻松安装
pip install pymupdf

文字信息提取

使用python提取PDF中文字代码思路如下

1、利用pdfplumber打开一个 PDF 文件

2、获取指定的页，或者遍历每一页

3、利用.extract_text()方法提取当前页的文字

4、利用.extract_tables()方法提取当前页的所有表格

5、利用.extract_table()方法提取当前页中最大的那张表格

def IL_HAWB_PDF(pdf_path):
    """Extract the cost and HAWB number from the first page of an IL PDF.

    The table on page one is read with pdfplumber and the text of the cell at
    row 19, column 0 is searched for hard-coded marker strings. The markers
    contain ``(cid:N)`` tokens (presumably glyphs that the PDF text layer
    could not map to Unicode -- confirm against a sample document).

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A ``(cost, hawb)`` tuple of strings.

    Raises:
        ValueError: If the expected table cell is missing or empty, or if a
            marker string cannot be found in it.
    """
    with pdfplumber.open(pdf_path) as pdf:
        doc_table = pdf.pages[0].extract_table()

    # Both values live in the cell at row 19, column 0 of this layout.
    row_idx, col_idx = 19, 0
    cell = None
    if doc_table and len(doc_table) > row_idx and len(doc_table[row_idx]) > col_idx:
        cell = doc_table[row_idx][col_idx]
    if not cell:
        raise ValueError(
            'no text in table cell ({}, {}) of {!r}'.format(row_idx, col_idx, pdf_path)
        )

    def between(start_marker, end_marker):
        # Return the text from start_marker up to end_marker with the start
        # marker and all spaces removed. str.find() signals "not found" with
        # -1, which would otherwise be used as a slice index and silently
        # produce a wrong value.
        start = cell.find(start_marker)
        if start == -1:
            raise ValueError(
                'start marker {!r} not found in {!r}'.format(start_marker, pdf_path)
            )
        # Only accept an end marker that comes after the start marker.
        end = cell.find(end_marker, start + len(start_marker))
        if end == -1:
            raise ValueError(
                'end marker {!r} not found in {!r}'.format(end_marker, pdf_path)
            )
        return cell[start:end].replace(start_marker, '').replace(' ', '')

    # '(cid:15)' appears where the thousands separator belongs in the amount.
    cost_ = between('₪(cid:3)', '(cid:676)(cid:689)(cid:688)').replace('(cid:15)', ',')
    hawb_ = between('(cid:3)(cid:674)(cid:677)(cid:689)', '(cid:696)(cid:679)(cid:672)')
    return cost_, hawb_

IL_HAWB_PDF(r"C:\Users\chenjx29\Desktop\PDF 爬取信息\IL.PDF")

('1,357', '1283995790')

def AU_HAWB_PDF(pdf_path):
    """Extract the payable amount and HAWB number from an AU PDF.

    The text of the first page is read with pdfplumber. The HAWB is the text
    between ``'O/REF'`` and ``'FOB'``; the cost is the text between
    ``'Total AMT. PAYABLE'`` and ``'OFFICIAL USE ONLY'``.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A ``(cost, hawb)`` tuple of strings.

    Raises:
        ValueError: If one of the marker strings is not present in the page
            text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Treat a page without extractable text as an empty string so the
        # marker checks below report the problem.
        doc_txt = pdf.pages[0].extract_text() or ''

    def between(start_marker, end_marker):
        # Return the raw text strictly between the two markers. str.find()
        # returns -1 when a marker is missing; using that as a slice index
        # would silently yield a wrong value, so fail loudly instead.
        start = doc_txt.find(start_marker)
        if start == -1:
            raise ValueError(
                'start marker {!r} not found in {!r}'.format(start_marker, pdf_path)
            )
        start += len(start_marker)
        # Only accept an end marker that comes after the start marker.
        end = doc_txt.find(end_marker, start)
        if end == -1:
            raise ValueError(
                'end marker {!r} not found in {!r}'.format(end_marker, pdf_path)
            )
        return doc_txt[start:end]

    hawb_ = between('O/REF', 'FOB').replace(' ', '').replace(':', '')
    cost_ = (
        between('Total AMT. PAYABLE', 'OFFICIAL USE ONLY')
        .replace(' ', '')
        .replace(':', '')
        .replace('\n', '')
    )
    return cost_, hawb_

AU_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\AU.PDF')

('190.00', '9807420291')

def JP_HAWB_PDF(pdf_path):
    """Extract the total tax amount and AWB number from a JP PDF.

    The text of the first page is read with pdfplumber. The HAWB is the text
    between ``'ＡＷＢ番号'`` and ``'蔵置税関'``; the cost is the text between
    ``'納税額合計'`` and ``'通貨レート'``.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A ``(cost, hawb)`` tuple of strings.

    Raises:
        ValueError: If one of the marker strings is not present in the page
            text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Treat a page without extractable text as an empty string so the
        # marker checks below report the problem.
        doc_txt = pdf.pages[0].extract_text() or ''

    def between(start_marker, end_marker):
        # Return the text strictly between the two markers with spaces
        # removed. str.find() returns -1 when a marker is missing; using that
        # as a slice index would silently yield a wrong value.
        start = doc_txt.find(start_marker)
        if start == -1:
            raise ValueError(
                'start marker {!r} not found in {!r}'.format(start_marker, pdf_path)
            )
        start += len(start_marker)
        # Only accept an end marker that comes after the start marker.
        end = doc_txt.find(end_marker, start)
        if end == -1:
            raise ValueError(
                'end marker {!r} not found in {!r}'.format(end_marker, pdf_path)
            )
        return doc_txt[start:end].replace(' ', '')

    hawb_ = between('ＡＷＢ番号', '蔵置税関')
    # The amount is prefixed with a backslash in the extracted text
    # (presumably the yen sign -- confirm against a sample document).
    cost_ = between('納税額合計', '通貨レート').replace('\\', '')
    return cost_, hawb_

JP_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\JP.PDF')

('4,200', '218-9476096')

def KR_HAWB_PDF(pdf_path):
    """Extract the total tax amount and HAWB number from a KR PDF.

    The text of the first page is read with pdfplumber. The HAWB is the text
    that follows ``'징수형태'`` up to the next space; the cost is the text
    between ``'총세액합계'`` and ``'67 담당자'``.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A ``(cost, hawb)`` tuple of strings.

    Raises:
        ValueError: If one of the marker strings is not present in the page
            text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Treat a page without extractable text as an empty string so the
        # marker checks below report the problem.
        doc_txt = pdf.pages[0].extract_text() or ''

    def text_after(marker, search_from=0):
        # Return the index just past `marker`. str.find() returns -1 when the
        # marker is missing; using that as a slice index would silently
        # yield a wrong value, so fail loudly instead.
        pos = doc_txt.find(marker, search_from)
        if pos == -1:
            raise ValueError('marker {!r} not found in {!r}'.format(marker, pdf_path))
        return pos + len(marker)

    # HAWB: everything after the label up to the first space. partition()
    # keeps the whole remainder when no space follows, instead of dropping
    # the last character the way slicing with find() == -1 would.
    hawb_start = text_after('징수형태')
    hawb_ = doc_txt[hawb_start:].partition(' ')[0].replace('\n', '')

    # Cost: text between the two labels; the end label must come after the
    # start label.
    cost_end_marker = '67 담당자'
    cost_start = text_after('총세액합계')
    cost_end = text_after(cost_end_marker, cost_start) - len(cost_end_marker)
    cost_ = doc_txt[cost_start:cost_end].replace(' ', '').replace('\n', '')
    return cost_, hawb_

KR_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\KR.PDF')

('293,830', '1069158160')

def SG_HAWB_PDF(pdf_path):
    """Extract the GST amount and waybill number from an SG PDF.

    The text of the first page is read with pdfplumber. The HAWB is the text
    between ``'B/L No. (Waybill)'`` and ``'Declaration Number'``; the cost is
    the text between ``'GST (7% of CIF & DUTY)'`` and
    ``'ACCESS Permit Date'``.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A ``(cost, hawb)`` tuple of strings.

    Raises:
        ValueError: If one of the marker strings is not present in the page
            text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Treat a page without extractable text as an empty string so the
        # marker checks below report the problem.
        doc_txt = pdf.pages[0].extract_text() or ''

    def between(start_marker, end_marker):
        # Return the text strictly between the two markers with spaces and
        # newlines removed. str.find() returns -1 when a marker is missing;
        # using that as a slice index would silently yield a wrong value.
        start = doc_txt.find(start_marker)
        if start == -1:
            raise ValueError(
                'start marker {!r} not found in {!r}'.format(start_marker, pdf_path)
            )
        start += len(start_marker)
        # Only accept an end marker that comes after the start marker.
        end = doc_txt.find(end_marker, start)
        if end == -1:
            raise ValueError(
                'end marker {!r} not found in {!r}'.format(end_marker, pdf_path)
            )
        return doc_txt[start:end].replace(' ', '').replace('\n', '')

    hawb_ = between('B/L No. (Waybill)', 'Declaration Number')
    cost_ = between('GST (7% of CIF & DUTY)', 'ACCESS Permit Date')
    return cost_, hawb_

def NZ_HAWB_PDF(pdf_path):
    """Extract the payable amount and bill number from an NZ PDF.

    All tables on the first page are read with pdfplumber. In the first
    table, row 5 is searched for a cell containing ``'Bill Number'`` and
    row 8 for a cell containing ``'Total Amount Payable'``; the label, spaces
    and newlines are stripped from the matching cell.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        A ``(cost, hawb)`` tuple of strings.

    Raises:
        ValueError: If the page has no table, the first table lacks the
            expected row, or no cell in that row contains the label.
    """
    with pdfplumber.open(pdf_path) as pdf:
        doc_tables = pdf.pages[0].extract_tables()

    first_table = doc_tables[0] if doc_tables else []

    def labelled_value(row_idx, label):
        # Return the content of the cell in row `row_idx` that carries
        # `label`, with the label and all whitespace separators removed.
        if row_idx >= len(first_table):
            raise ValueError(
                'first table of {!r} has no row {}'.format(pdf_path, row_idx)
            )
        # Empty cells are None (or ''), so skip anything falsy before the
        # substring test.
        matches = [c for c in first_table[row_idx] if c and label in c]
        if not matches:
            raise ValueError(
                'label {!r} not found in row {} of {!r}'.format(label, row_idx, pdf_path)
            )
        # When several cells carry the label, the last one is used.
        return matches[-1].replace(label, '').replace(' ', '').replace('\n', '')

    hawb_ = labelled_value(5, 'Bill Number')
    cost_ = labelled_value(8, 'Total Amount Payable')
    return cost_, hawb_

NZ_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\NZ.PDF')

('$331.43', '5553678922')

AU_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\AU.PDF')

('190.00', '9807420291')

JP_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\JP.PDF')

('4,200', '218-9476096')

KR_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\KR.PDF')

('293,830', '1069158160')

SG_HAWB_PDF(r'C:\Users\chenjx29\Desktop\PDF 爬取信息\SG.PDF')

('966.02', '8415998430')

IL_HAWB_PDF(r"C:\Users\chenjx29\Desktop\PDF 爬取信息\IL.PDF")

('1,357', '1283995790')

大米2H

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
Python使用pdfplumber获取PDF所需信息

首先需要安装两个模块，第一个是pdfplumber，在命令行使用pip安装即可。第二个是fitz, 它是pymupdf中的一个模块，同样可以使用pip轻松安装。4、利用.extract_tables()方法提取当前页的所有表格。5、利用.extract_table()方法提取当前页中最大的那张表格。3、利用.extract_text()方法提取当前页的文字。1、利用pdfplumber打开一个 PDF 文件。使用python提取PDF中文字代码思路如下。2、获取指定的页，或者遍历每一页。
复制链接

扫一扫