# 1、解析 PDF 中的内容(OCR 同步提取图片内容)
# -*- coding: utf-8 -*-
'''
解析pdf文件拿到内容
执行流程:
(1)从oss拿到pdf原始文件
(2)通过插件读取pdf数据
'''
import pathlib,sys
sys.path.append(str(pathlib.Path(__file__).absolute().parent.parent.parent.parent.parent.parent.parent))
from PIL import Image
import fitz # pip3 install pymupdf
import time
import pytesseract
from utils.color_print_utils import *
def pytesseract_ocr(pdf_path, img_dir='/Users/lvjun/Desktop/test/pdf/ocr'):
    """Render each page of a document to a PNG and print the text OCR finds on it.

    Every page is rasterised with PyMuPDF at 5x zoom, saved as
    ``<img_dir>/<page index>.png`` and handed to Tesseract with the
    ``chi_sim`` language model. The recognised text and the cumulative
    elapsed time after each stage are printed.

    Args:
        pdf_path: Path handed to ``fitz.open``.
        img_dir: Directory that receives the page images. It is not created
            here; the default is the location that used to be hard-coded.

    Returns:
        True when every page was processed without an error, False otherwise.
        Errors are printed, never raised.
    """
    target_check = False
    doc = None
    started_at = time.time()
    try:
        doc = fitz.open(pdf_path)
        print(f'fitz.open:耗时 {time.time() - started_at} 秒')
        # Zoom factor 5 on both axes, no rotation; identical for every page.
        trans = fitz.Matrix(5, 5).prerotate(0)
        for i in range(doc.page_count):
            # Rasterise the page so that it can be fed to the OCR engine.
            pm = doc[i].get_pixmap(matrix=trans, alpha=False)
            img_path = str(pathlib.Path(img_dir) / '{}.png'.format(i))
            pm.save(img_path)
            print(f'pm.save-转image:耗时 {time.time() - started_at} 秒')
            # Plain PIL + pytesseract (no OpenCV); the context manager
            # releases the image file handle once OCR is done.
            with Image.open(img_path) as page_image:
                content = pytesseract.image_to_string(page_image, lang='chi_sim')
            print(f'ocr读取文本内容:耗时 {time.time() - started_at} 秒')
            print("-------第[{}]页面,读取图片内容---------------".format(i))
            print(f"{green_content(content)}")
            print("------------------------------------------")
        # Reached only when no page raised.
        target_check = True
    except Exception as e:
        # Best-effort boundary: report the failure and fall through to cleanup.
        print("OCR读取本地PDF文件的的内容,异常:{}".format(e))
    finally:
        # Single cleanup point for both the success and the error path.
        if doc is not None:
            doc.close()
    return target_check
def opencv_pytesseract_ocr():
    """Unimplemented placeholder; takes no arguments and returns None.

    NOTE(review): the name suggests an OpenCV-based OCR variant was planned.
    """
    return None
# Manual test entry point
if __name__ == "__main__":
    # Sample locations on the author's machine.
    local_pdf = "/Users/lvjun/Desktop/test/pdf/0.pdf"
    # The next two paths are defined but not used by this entry point.
    put_md = "/Users/lvjun/Desktop/test/pdf/3.md"
    image_path = "/Users/lvjun/Desktop/test/pdf/imgs/"
    pytesseract_ocr(local_pdf)
# 2、解析 PDF 中的内容(OCR 同步提取图片内容),转换成 Markdown 格式,并通过 TTS 朗读出来
import pyttsx3
# https://pypi.org/project/py3-tts/
class TTS_Engine:
    """Thin wrapper around a pyttsx3 engine with a preset voice, rate and volume.

    Args:
        _voice: Voice id written to the engine's ``voice`` property. The
            default is an Apple voice id; call show_voices() to list the ids
            the current machine offers.
        _rate: Value written to the engine's ``rate`` property (speaking rate).
        _volume: Value written to the engine's ``volume`` property
            (0.0 to 1.0 according to the original note).
    """

    def __init__(self, _voice='com.apple.speech.synthesis.voice.mei-jia.premium', _rate=200, _volume=0.5):
        # The engine lives under a private name so that the public ``engine``
        # accessor below is not shadowed by an instance attribute.
        self._engine = pyttsx3.init()
        self.voice = _voice
        self.rate = _rate
        self.volume = _volume
        # Voice type
        self._engine.setProperty('voice', self.voice)
        # Speaking rate (original note: usually somewhere in 0~500)
        self._engine.setProperty('rate', self.rate)
        # Volume (original note: valid range is 0.0 to 1.0)
        self._engine.setProperty('volume', self.volume)

    @property
    def engine(self):
        """The underlying pyttsx3 engine object."""
        return self._engine

    @engine.setter
    def engine(self, value):
        # Assignment stays possible, as it was with the plain attribute.
        self._engine = value

    def show_voices(self):
        """Print id, name, languages, gender and age of every available voice."""
        voices = self._engine.getProperty('voices')
        for voice in voices:
            print("\nvoice:")
            print("- id: %s" % voice.id)
            print("- name: %s" % voice.name)
            print("- languages: %s" % voice.languages)
            print("- gender: %s" % voice.gender)
            print("- age: %s" % voice.age)

    def say(self, info):
        """Queue ``info`` for speaking and block until the engine has processed it."""
        self._engine.say(info)
        # Run the queued command and wait for it to finish.
        self._engine.runAndWait()

    def stop_engine(self):
        """Call ``stop()`` on the underlying engine."""
        self._engine.stop()
import sys
import re
import time
import pathlib

import pymupdf4llm
import fitz
from PIL import Image
import pytesseract

# Extend sys.path BEFORE any ``utils`` import: the directory seven levels
# above this file (presumably the project root) is what makes the ``utils``
# package importable.
sys.path.append(str(pathlib.Path(__file__).absolute().parent.parent.parent.parent.parent.parent.parent))

# Project-local imports, resolved through the sys.path entry added above
from utils.tts_voice_utils import TTS_Engine
from utils.color_print_utils import *

# Speech engine used at the end of the script to read the result aloud
tts = TTS_Engine()
print(f'==================》读取任务·开始《==================')
def pdf_2_images_4_txt_ocr_content(pdf_path, img_dir='/Users/lvjun/Desktop/test/pdf/ocr'):
    """OCR every page of a document and return the recognised text.

    Each page is rasterised with PyMuPDF at 5x zoom, saved as
    ``<img_dir>/<page index>.png`` and read by Tesseract with the ``chi_sim``
    language model. Per-page text and cumulative timings are printed.

    Args:
        pdf_path: Path handed to ``fitz.open``.
        img_dir: Directory that receives the page images. It is not created
            here; the default is the location that used to be hard-coded.

    Returns:
        The OCR text of all pages concatenated in page order. Previously only
        the last page's text survived; single-page input is unaffected. If an
        error occurs, it is printed and the text collected so far is returned
        (an empty string when nothing was read).
    """
    page_texts = []
    doc = None
    started_at = time.time()
    try:
        doc = fitz.open(pdf_path)
        print(f'fitz.open:耗时 {time.time() - started_at} 秒')
        # Zoom factor 5 on both axes, no rotation; identical for every page.
        trans = fitz.Matrix(5, 5).prerotate(0)
        for i in range(doc.page_count):
            # Rasterise the page so that it can be fed to the OCR engine.
            pm = doc[i].get_pixmap(matrix=trans, alpha=False)
            img_path = str(pathlib.Path(img_dir) / '{}.png'.format(i))
            pm.save(img_path)
            print(f'pm.save-转image:耗时 {time.time() - started_at} 秒')
            # Plain PIL + pytesseract (no OpenCV); the context manager
            # releases the image file handle once OCR is done.
            with Image.open(img_path) as page_image:
                content = pytesseract.image_to_string(page_image, lang='chi_sim')
            page_texts.append(content)
            print(f'ocr读取文本内容:耗时 {time.time() - started_at} 秒')
            print("-------第[{}]页面,读取图片内容---------------".format(i))
            print(f"{green_content(content)}")
            print("------------------------------------------")
    except Exception as e:
        # Best-effort boundary: report the failure and return what we have.
        print("OCR读取本地PDF文件的的内容,异常:{}".format(e))
    finally:
        # Single cleanup point for both the success and the error path.
        if doc is not None:
            doc.close()
    return ''.join(page_texts)
def clear(info: str):
    """Return ``info`` with each non-overlapping pair of newlines reduced to one.

    This is a single left-to-right pass, so a run of three newlines comes out
    as two, not one.
    """
    pieces = info.split('\n\n')
    return '\n'.join(pieces)
def extract_and_replace_images(md_string):
    """Replace every Markdown image reference with the OCR text of that image.

    Each ``![alt](path)`` occurrence is replaced, in place, by the result of
    ``clear(pdf_2_images_4_txt_ocr_content(path))``.

    Args:
        md_string: Markdown text that may contain image references.

    Returns:
        The Markdown text with all image references substituted. Text without
        image references is returned unchanged.
    """
    pattern = r'!\[.*?\]\((.*?)\)'

    def _ocr_replacement(match):
        # Group 1 is the image path between the parentheses.
        return clear(pdf_2_images_4_txt_ocr_content(match.group(1)))

    # A callable replacement is inserted literally, so backslashes or group
    # references in the OCR output are not interpreted as a template. One
    # pass over the input also means inserted text is never scanned again.
    return re.sub(pattern, _ocr_replacement, md_string)
# Input document, Markdown output file and directory for extracted images
local_pdf = '/Users/lvjun/Desktop/test/pdf/2.pdf'
put_md = '/Users/lvjun/Desktop/test/pdf/t.md'
image_path ='/Users/lvjun/Desktop/test/pdf/imgs/'
# Upper bound on the number of pages converted. The code always limited to 1
# while the log line claimed 5; the log now reports the limit actually used.
max_pages = 1
# Open the PDF with fitz (PyMuPDF)
fitz_doc = fitz.open(local_pdf)
try:
    src_pages = fitz_doc.page_count
    # Never ask for more pages than the document has
    min_pages = min(src_pages, max_pages)
    print(f'文件原始页数:{src_pages}\t限制页数:{max_pages}\t提取页数:{min_pages}')
    # Convert the selected pages to Markdown
    md_text = pymupdf4llm.to_markdown(
        doc=fitz_doc,             # document object to convert
        pages=range(min_pages),   # page indices 0..min_pages-1
        write_images=True,        # also write embedded images to disk
        image_path=image_path     # where those images are stored
    )
finally:
    # Release the document even when the conversion fails
    fitz_doc.close()
# Replace each image reference in the Markdown with the OCR text of that image
md_text = extract_and_replace_images(md_text)
# Write the resulting Markdown to the output file
with open(put_md, "w", encoding="utf-8") as f:
    f.write(md_text)
# Read the result aloud
tts.say(md_text)