```python
import concurrent.futures
import os
import re
import threading

import pdfplumber
from pdfminer.high_level import extract_text
def contains_element(string, elements):
    """Report whether any item of *string* also occurs in *elements*.

    Args:
        string: Iterable to scan; for a ``str`` each character is tested.
        elements: Iterable of hashable candidates to look for.

    Returns:
        True as soon as one item of *string* is found in *elements*,
        otherwise False (including when either argument is empty).
    """
    # Build the lookup set once so each membership test is O(1).
    wanted = set(elements)
    for item in string:
        if item in wanted:
            return True
    return False
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, joined by newlines.

    The previous implementation rebound a single ``text`` variable on each
    page, so only one page's text was ever returned, and a document with no
    pages raised ``UnboundLocalError``.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to
            ``pdfplumber.open``.

    Returns:
        The text of all pages in page order, separated by ``"\n"``. An empty
        string when the document has no pages.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # Extract page by page. `or ""` keeps the join safe should
        # extract_text() yield None for a page without text
        # (NOTE(review): depends on the installed pdfplumber version - confirm).
        page_texts = [page.extract_text() or "" for page in pdf.pages]
    return "\n".join(page_texts)
# Serializes updates to the module-level ``err_path`` string, which is
# modified from several worker threads at once.
_err_path_lock = threading.Lock()


def process_pdf(pdf_path):
    """Extract the text of one PDF, recording its path if extraction fails.

    Args:
        pdf_path: Path of the PDF file; passed unchanged to ``extract_text``.

    Returns:
        The extracted text followed by a newline, or an empty string when
        extraction raised. On failure the path is appended (newline
        terminated) to the module-level ``err_path`` string.
    """
    global err_path
    try:
        extracted_text = extract_text(pdf_path)
    except Exception as e:
        # Best-effort boundary: report the failure and carry on so a single
        # unreadable PDF does not abort the whole run.
        print(f"An error occurred: {e}")
        # The concatenation is a read-modify-write on a shared global; without
        # the lock two failing workers could overwrite each other's entry.
        with _err_path_lock:
            err_path = err_path + pdf_path + '\n'
        print("错误路径",pdf_path)
        return ''
    return extracted_text + "\n"
# Newline-separated paths of PDFs that failed to parse; process_pdf appends to
# this string through `global err_path`.
err_path = ''
pdf_folder_path = r'C:\Users\Desktop\pdf提取数据\data'
output_folder_path = r'C:\Users\Desktop\pdf提取数据\test'

# Number of PDFs whose text is combined into one numbered output file.
BATCH_SIZE = 5


def _write_batch(batch, batch_index):
    """Wait for all futures in *batch* and write their joined text to ``<batch_index>.txt``."""
    # Block until every task of this batch has finished.
    concurrent.futures.wait(batch)
    combined_result = "".join(f.result() for f in batch)
    batch_file = os.path.join(output_folder_path, f"{batch_index}.txt")
    with open(batch_file, "w", encoding="utf-8") as file:
        file.write(combined_result)


# Create the output directory up front so the first write cannot fail on a
# missing folder after PDFs have already been processed.
os.makedirs(output_folder_path, exist_ok=True)

# Create a thread pool and hand it one task per PDF file.
with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
    m = 1  # index of the next output file
    t = 1  # running count of submitted PDFs
    futures = []
    for filename in os.listdir(pdf_folder_path):
        if not filename.endswith(".pdf"):
            continue
        pdf_path = os.path.join(pdf_folder_path, filename)
        futures.append(executor.submit(process_pdf, pdf_path))
        # NOTE: printed right after submission, i.e. before the task has
        # necessarily finished.
        print("已处理完" + str(t) + "条")
        t += 1
        if len(futures) == BATCH_SIZE:
            _write_batch(futures, m)
            futures = []
            m += 1
    # Flush the final partial batch; previously these results were dropped
    # whenever the number of PDFs was not a multiple of the batch size.
    if futures:
        _write_batch(futures, m)

# Write the paths of the PDFs that could not be processed.
output_file = os.path.join(output_folder_path, "错误文件.txt")
with open(output_file, "w", encoding="utf-8") as file:
    file.write(err_path)