Python多线程批量识别PDF,并写入txt文件

本文介绍了一个Python脚本,利用`pdfplumber`库从PDF文件中提取文本,并使用`concurrent.futures`实现多线程处理大量PDF,同时记录并输出处理过程中的错误路径。
摘要由CSDN通过智能技术生成

```python
import concurrent.futures
import os
import re
import threading

import pdfplumber
from pdfminer.high_level import extract_text

def contains_element(string, elements):
    """Report whether any item of ``string`` is a member of ``elements``.

    ``string`` is iterated item by item, so when it is a ``str`` each
    single character is tested on its own; a multi-character entry in
    ``elements`` can therefore never match a plain string input.

    Args:
        string: Iterable whose items are looked up.
        elements: Iterable of hashable values to look for.

    Returns:
        True as soon as one item of ``string`` is found in ``elements``,
        otherwise False (including when either argument is empty).
    """
    lookup = set(elements)
    for item in string:
        if item in lookup:
            return True
    return False

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF using pdfplumber.

    The previous implementation rebound ``text`` on each iteration and so
    returned only the last page (and raised ``UnboundLocalError`` for a
    document without pages). All pages are now collected and joined.

    Args:
        pdf_path: Path of the PDF file to open.

    Returns:
        The page texts in page order, separated by newlines. Returns an
        empty string when the document has no pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract page by page; ``or ""`` guards against a page for which
        # extract_text() yields None instead of a string.
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts)


# Serializes updates to the module-level ``err_path`` string. process_pdf is
# submitted to a thread pool, and "read, concatenate, rebind" on a shared
# global is not atomic, so concurrent failures could otherwise drop entries.
_err_path_lock = threading.Lock()


def process_pdf(pdf_path):
    """Extract the text of one PDF, recording the path if extraction fails.

    Uses ``extract_text`` imported from ``pdfminer.high_level``. Failures are
    handled on a best-effort basis: the exception is printed, the path is
    appended (followed by a newline) to the global ``err_path``, and an
    empty string is returned so the caller can keep concatenating results.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        The extracted text followed by a newline, or ``''`` on failure.
    """
    global err_path
    try:
        extracted_text = extract_text(pdf_path)
    except Exception as e:
        # Report the failure and remember which file caused it.
        print(f"An error occurred: {e}")
        with _err_path_lock:
            err_path = err_path + pdf_path + '\n'
        print("错误路径",pdf_path)
        return ''
    return extracted_text +"\n"


# Newline-separated paths of PDFs that failed; appended to by process_pdf.
err_path = ''
pdf_folder_path = r'C:\Users\Desktop\pdf提取数据\data'
output_folder_path = r'C:\Users\Desktop\pdf提取数据\test'
# Number of PDFs whose extracted text is merged into one numbered .txt file.
_BATCH_SIZE = 5

# Create the destination up front so the first write cannot fail on a
# missing directory.
os.makedirs(output_folder_path, exist_ok=True)


def _write_batch(batch_futures, batch_number):
    """Wait for a batch of tasks and write their joined text to ``<n>.txt``."""
    # Block until every task of this batch has finished.
    concurrent.futures.wait(batch_futures)
    combined_result = "".join(f.result() for f in batch_futures)
    output_file = os.path.join(output_folder_path, f"{batch_number}.txt")
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(combined_result)


# Create a thread pool and feed it one task per PDF file.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    m = 1  # index of the next output file
    t = 1  # running count of submitted PDFs
    futures = []
    for filename in os.listdir(pdf_folder_path):
        # Accept ".PDF" as well as ".pdf".
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(pdf_folder_path, filename)
        futures.append(executor.submit(process_pdf, pdf_path))
        # NOTE: this counts submitted tasks; the work itself may still be
        # running when the message is printed.
        print("已处理完" + str(t) + "条")
        t += 1
        if len(futures) == _BATCH_SIZE:
            _write_batch(futures, m)
            futures = []
            m += 1
    # Flush the trailing partial batch; previously these results were
    # dropped whenever the file count was not a multiple of the batch size.
    if futures:
        _write_batch(futures, m)

# The pool has shut down here, so every task has had the chance to record
# its failure before the error list is written.
output_file = os.path.join(output_folder_path, "错误文件.txt")
with open(output_file, "w", encoding="utf-8") as file:
    # Write the paths of the files that could not be processed.
    file.write(err_path)


评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值