0.介绍
使用 MinerU(magic-pdf)将 PDF 转换为 Markdown
1.安装依赖包
conda create -n mineru python=3.10
conda activate mineru
pip install -U "magic-pdf[full]" --extra-index-url https://wheels.myhloli.com
magic-pdf --version
#版本>=0.7.0
pip install tqdm
conda install -c conda-forge ccache
2.下载模型
pip install huggingface_hub
wget https://gcore.jsdelivr.net/gh/opendatalab/MinerU@master/scripts/download_models_hf.py -O download_models_hf.py
python download_models_hf.py
3.使用
wget https://github.com/opendatalab/MinerU/raw/master/demo/small_ocr.pdf
#修改用户目录下的 magic-pdf.json(~/magic-pdf.json),将 device-mode 设为 cuda:
{
"device-mode": "cuda"
}
#为OCR开启CUDA加速
python -m pip install paddlepaddle-gpu==3.0.0rc1 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
#测试
magic-pdf -p small_ocr.pdf -o ./output
4.批量处理
import os
from tqdm import tqdm
from magic_pdf.data.data_reader_writer import FileBasedDataWriter, FileBasedDataReader
from magic_pdf.data.dataset import PymuDocDataset
from magic_pdf.model.doc_analyze_by_custom_model import doc_analyze
from magic_pdf.config.enums import SupportedPdfParseMethod
# 获取input中文件路径
def get_files_path(folder_path):
    """Return the paths of the PDF/JPEG files directly inside *folder_path*.

    Only regular files whose extension (case-insensitive) is ``.pdf``,
    ``.jpg`` or ``.jpeg`` are returned; sub-directories are not descended
    into, and directories that merely carry one of those suffixes are skipped.

    Args:
        folder_path: Directory to scan.

    Returns:
        A list of ``os.path.join(folder_path, filename)`` strings, sorted by
        file name so that repeated runs process files in the same order.

    Raises:
        OSError: Propagated from ``os.listdir`` when *folder_path* cannot be
            listed (for example, it does not exist).
    """
    supported_exts = {'.pdf', '.jpg', '.jpeg'}
    target_files = []
    # os.listdir returns entries in arbitrary order, hence the explicit sort.
    for filename in sorted(os.listdir(folder_path)):
        ext = os.path.splitext(filename)[1].lower()
        if ext not in supported_exts:
            continue
        full_path = os.path.join(folder_path, filename)
        # Skip directories that happen to be named like "something.pdf".
        if os.path.isfile(full_path):
            target_files.append(full_path)
    return target_files
# Batch driver: convert every supported file found in input_dir. Each input
# gets its own sub-directory under output_dir containing the markdown, the
# JSON dumps, the debug PDFs and an "images" folder.
input_dir = "input"
files_path = get_files_path(input_dir)
output_dir = "output"
os.makedirs(output_dir, exist_ok=True)

# Created once and reused for every file; file_path is passed to read() as-is.
reader1 = FileBasedDataReader("")

for file_path in tqdm(files_path, desc="基础进度条"):
    print("当前处理文件路径:", file_path)
    # Drop only the final extension. Splitting the whole path on the first "."
    # would truncate names such as "report.v1.pdf" (colliding with
    # "report.v2.pdf") and break on paths like "./input/a.pdf".
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    print("当前处理文件名:", file_name)

    file_outputMd_dir = os.path.join(output_dir, file_name)
    file_outputImg_dir = os.path.join(file_outputMd_dir, "images")
    # Creating the images directory also creates its parent markdown directory.
    os.makedirs(file_outputImg_dir, exist_ok=True)
    print("当前处理文件Md写入路径:", file_outputMd_dir)
    print("当前处理文件Img写入路径:", file_outputImg_dir)

    # The markdown and content list are written inside file_outputMd_dir, so
    # the image directory they reference must be relative to that directory
    # ("images"), not the path relative to the current working directory.
    image_rel_dir = os.path.basename(file_outputImg_dir)

    image_writer = FileBasedDataWriter(file_outputImg_dir)
    md_writer = FileBasedDataWriter(file_outputMd_dir)

    # NOTE(review): .jpg/.jpeg inputs are also routed through PymuDocDataset;
    # confirm it accepts raw image bytes, otherwise images need a separate
    # dataset type.
    pdf_bytes = reader1.read(file_path)
    ds = PymuDocDataset(pdf_bytes)

    # Choose the OCR or text pipeline based on the dataset's own classification.
    if ds.classify() == SupportedPdfParseMethod.OCR:
        infer_result = ds.apply(doc_analyze, ocr=True)
        pipe_result = infer_result.pipe_ocr_mode(image_writer)
    else:
        infer_result = ds.apply(doc_analyze, ocr=False)
        pipe_result = infer_result.pipe_txt_mode(image_writer)

    # Debug PDFs, written next to the markdown.
    infer_result.draw_model(os.path.join(file_outputMd_dir, f"{file_name}_model.pdf"))
    pipe_result.draw_layout(os.path.join(file_outputMd_dir, f"{file_name}_layout.pdf"))
    pipe_result.draw_span(os.path.join(file_outputMd_dir, f"{file_name}_spans.pdf"))

    # Persist the conversion results through md_writer.
    pipe_result.dump_md(md_writer, f"{file_name}.md", image_rel_dir)
    pipe_result.dump_content_list(md_writer, f"{file_name}_content_list.json", image_rel_dir)
    pipe_result.dump_middle_json(md_writer, f'{file_name}_middle.json')