.\marker\benchmark.py
import argparse
import tempfile
import time
from collections import defaultdict
from tqdm import tqdm
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
from marker.benchmark.scoring import score_text
from marker.extract_text import naive_get_text
import json
import os
import subprocess
import shutil
import fitz as pymupdf
from tabulate import tabulate
configure_logging()
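# Baseline predictor: shells out to the nougat CLI, writes its markdown into a
# temporary directory, and returns the generated text. Assumes the `nougat`
# executable is on PATH and produces exactly one output file per input PDF.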
def nougat_prediction(pdf_filename, batch_size=1):
out_dir = tempfile.mkdtemp()
subprocess.run(["nougat", pdf_filename, "-o", out_dir, "--no-skipping", "--recompute", "--batchsize", str(batch_size)], check=True)
md_file = os.listdir(out_dir)[0]
with open(os.path.join(out_dir, md_file), "r") as f:
data = f.read()
shutil.rmtree(out_dir)
return data
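# Illustrative invocation (paths are placeholders):
#   python benchmark.py data/pdfs data/references report.json --nougat --md_out_path out_md
# Each PDF in in_folder is expected to have a matching <name>.md file in reference_folder.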
def main():
parser = argparse.ArgumentParser(description="Benchmark PDF to MD conversion. Needs source pdfs, and a refernece folder with the correct markdown.")
parser.add_argument("in_folder", help="Input PDF files")
parser.add_argument("reference_folder", help="Reference folder with reference markdown files")
parser.add_argument("out_file", help="Output filename")
parser.add_argument("--nougat", action="store_true", help="Run nougat and compare", default=False)
parser.add_argument("--nougat_batch_size", type=int, default=1, help="Batch size to use for nougat when making predictions.")
parser.add_argument("--marker_parallel_factor", type=int, default=1, help="How much to multiply default parallel OCR workers and model batch sizes by.")
parser.add_argument("--md_out_path", type=str, default=None, help="Output path for generated markdown files")
args = parser.parse_args()
methods = ["naive", "marker"]
if args.nougat:
methods.append("nougat")
model_lst = load_all_models()
scores = defaultdict(dict)
benchmark_files = os.listdir(args.in_folder)
benchmark_files = [b for b in benchmark_files if b.endswith(".pdf")]
times = defaultdict(dict)
pages = defaultdict(int)
for fname in tqdm(benchmark_files):
md_filename = fname.rsplit(".", 1)[0] + ".md"
reference_filename = os.path.join(args.reference_folder, md_filename)
with open(reference_filename, "r") as f:
reference = f.read()
pdf_filename = os.path.join(args.in_folder, fname)
doc = pymupdf.open(pdf_filename)
pages[fname] = len(doc)
for method in methods:
start = time.time()
if method == "marker":
full_text, out_meta = convert_single_pdf(pdf_filename, model_lst, parallel_factor=args.marker_parallel_factor)
elif method == "nougat":
full_text = nougat_prediction(pdf_filename, batch_size=args.nougat_batch_size)
elif method == "naive":
full_text = naive_get_text(doc)
else:
raise ValueError(f"Unknown method {method}")
times[method][fname] = time.time() - start
score = score_text(full_text, reference)
scores[method][fname] = score
if args.md_out_path:
md_out_filename = f"{method}_{md_filename}"
with open(os.path.join(args.md_out_path, md_out_filename), "w+") as f:
f.write(full_text)
total_pages = sum(pages.values())
with open(args.out_file, "w+") as f:
write_data = defaultdict(dict)
for method in methods:
total_time = sum(times[method].values())
file_stats = {
fname:
{
"time": times[method][fname],
"score": scores[method][fname],
"pages": pages[fname]
}
for fname in benchmark_files
}
write_data[method] = {
"files": file_stats,
"avg_score": sum(scores[method].values()) / len(scores[method]),
"time_per_page": total_time / total_pages,
"time_per_doc": total_time / len(scores[method])
}
json.dump(write_data, f, indent=4)
summary_table = []
score_table = []
score_headers = benchmark_files
for method in methods:
summary_table.append([method, write_data[method]["avg_score"], write_data[method]["time_per_page"], write_data[method]["time_per_doc"]])
score_table.append([method, *[write_data[method]["files"][h]["score"] for h in score_headers]])
print(tabulate(summary_table, headers=["Method", "Average Score", "Time per page", "Time per document"]))
print("")
print("Scores by file")
print(tabulate(score_table, headers=["Method", *score_headers]))
if __name__ == "__main__":
main()
.\marker\chunk_convert.py
import argparse
import subprocess
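# Thin wrapper that defers all work to the chunk_convert.sh shell script in the
# repository root. Illustrative usage (placeholder paths):
#   python chunk_convert.py in_pdfs out_md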
def main():
parser = argparse.ArgumentParser(description="Convert a folder of PDFs to a folder of markdown files in chunks.")
parser.add_argument("in_folder", help="Input folder with pdfs.")
parser.add_argument("out_folder", help="Output folder")
args = parser.parse_args()
cmd = f"./chunk_convert.sh {args.in_folder} {args.out_folder}"
subprocess.run(cmd, shell=True, check=True)
if __name__ == "__main__":
main()
.\marker\convert.py
import argparse
import os
from typing import Dict, Optional
import ray
from tqdm import tqdm
import math
from marker.convert import convert_single_pdf, get_length_of_text
from marker.models import load_all_models
from marker.settings import settings
from marker.logger import configure_logging
import traceback
import json
configure_logging()
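# Each PDF is converted inside a Ray task. The num_gpus value below is only a default;
# at call time it is overridden via .options(num_gpus=gpu_frac), where gpu_frac is
# derived from settings.VRAM_PER_TASK / settings.INFERENCE_RAM (see main below).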
@ray.remote(num_cpus=settings.RAY_CORES_PER_WORKER, num_gpus=.05 if settings.CUDA else 0)
def process_single_pdf(fname: str, out_folder: str, model_refs, metadata: Optional[Dict] = None, min_length: Optional[int] = None):
out_filename = fname.rsplit(".", 1)[0] + ".md"
out_filename = os.path.join(out_folder, os.path.basename(out_filename))
out_meta_filename = out_filename.rsplit(".", 1)[0] + "_meta.json"
if os.path.exists(out_filename):
return
try:
if min_length:
length = get_length_of_text(fname)
if length < min_length:
return
full_text, out_metadata = convert_single_pdf(fname, model_refs, metadata=metadata)
if len(full_text.strip()) > 0:
with open(out_filename, "w+", encoding='utf-8') as f:
f.write(full_text)
with open(out_meta_filename, "w+") as f:
f.write(json.dumps(out_metadata, indent=4))
else:
print(f"Empty file: {fname}. Could not convert.")
except Exception as e:
print(f"Error converting {fname}: {e}")
print(traceback.format_exc())
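# Illustrative invocation (paths and counts are placeholders):
#   python convert.py in_pdfs out_md --workers 8 --num_chunks 4 --chunk_idx 0 --min_length 1000
# Existing .md outputs are skipped, and PDFs whose extracted text is shorter than
# --min_length characters are ignored.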
def main():
parser = argparse.ArgumentParser(description="Convert multiple pdfs to markdown.")
parser.add_argument("in_folder", help="Input folder with pdfs.")
parser.add_argument("out_folder", help="Output folder")
parser.add_argument("--chunk_idx", type=int, default=0, help="Chunk index to convert")
parser.add_argument("--num_chunks", type=int, default=1, help="Number of chunks being processed in parallel")
parser.add_argument("--max", type=int, default=None, help="Maximum number of pdfs to convert")
parser.add_argument("--workers", type=int, default=5, help="Number of worker processes to use")
parser.add_argument("--metadata_file", type=str, default=None, help="Metadata json file to use for filtering")
parser.add_argument("--min_length", type=int, default=None, help="Minimum length of pdf to convert")
args = parser.parse_args()
in_folder = os.path.abspath(args.in_folder)
out_folder = os.path.abspath(args.out_folder)
files = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
os.makedirs(out_folder, exist_ok=True)
chunk_size = math.ceil(len(files) / args.num_chunks)
start_idx = args.chunk_idx * chunk_size
end_idx = start_idx + chunk_size
files_to_convert = files[start_idx:end_idx]
if args.max:
files_to_convert = files_to_convert[:args.max]
metadata = {}
if args.metadata_file:
metadata_file = os.path.abspath(args.metadata_file)
with open(metadata_file, "r") as f:
metadata = json.load(f)
total_processes = min(len(files_to_convert), args.workers)
ray.init(
num_cpus=total_processes,
num_gpus=1 if settings.CUDA else 0,
storage=settings.RAY_CACHE_PATH,
_temp_dir=settings.RAY_CACHE_PATH,
log_to_driver=settings.DEBUG
)
model_lst = load_all_models()
model_refs = ray.put(model_lst)
gpu_frac = settings.VRAM_PER_TASK / settings.INFERENCE_RAM if settings.CUDA else 0
print(f"Converting {len(files_to_convert)} pdfs in chunk {args.chunk_idx + 1}/{args.num_chunks} with {total_processes} processes, and storing in {out_folder}")
futures = [
process_single_pdf.options(num_gpus=gpu_frac).remote(
filename,
out_folder,
model_refs,
metadata=metadata.get(os.path.basename(filename)),
min_length=args.min_length
) for filename in files_to_convert
]
progress_bar = tqdm(total=len(futures))
while len(futures) > 0:
finished, futures = ray.wait(
futures, timeout=7.0
)
finished_lst = ray.get(finished)
if isinstance(finished_lst, list):
progress_bar.update(len(finished_lst))
else:
progress_bar.update(1)
ray.shutdown()
if __name__ == "__main__":
main()
.\marker\convert_single.py
import argparse
from marker.convert import convert_single_pdf
from marker.logger import configure_logging
from marker.models import load_all_models
import json
configure_logging()
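# Illustrative invocation (filenames are placeholders):
#   python convert_single.py paper.pdf paper.md --max_pages 10
# A paper_meta.json file with conversion metadata is written alongside the output.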
def main():
parser = argparse.ArgumentParser()
parser.add_argument("filename", help="PDF file to parse")
parser.add_argument("output", help="Output file name")
parser.add_argument("--max_pages", type=int, default=None, help="Maximum number of pages to parse")
parser.add_argument("--parallel_factor", type=int, default=1, help="How much to multiply default parallel OCR workers and model batch sizes by.")
args = parser.parse_args()
fname = args.filename
model_lst = load_all_models()
full_text, out_meta = convert_single_pdf(fname, model_lst, max_pages=args.max_pages, parallel_factor=args.parallel_factor)
with open(args.output, "w+", encoding='utf-8') as f:
f.write(full_text)
out_meta_filename = args.output.rsplit(".", 1)[0] + "_meta.json"
with open(out_meta_filename, "w+") as f:
f.write(json.dumps(out_meta, indent=4))
if __name__ == "__main__":
main()
.\marker\marker\bbox.py
import fitz as pymupdf
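# Bounding boxes throughout this module are (x0, y0, x1, y1) tuples in PyMuPDF page
# coordinates, with y increasing down the page.
# Illustrative example: two spans on the same text line that nearly touch merge into one box:
#   should_merge_blocks((10, 100, 50, 112), (52, 101, 90, 113))  # -> True
#   merge_boxes((10, 100, 50, 112), (52, 101, 90, 113))          # -> (10, 100, 90, 113)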
def should_merge_blocks(box1, box2, tol=5):
merge = [
box2[0] > box1[0],
abs(box2[1] - box1[1]) < tol,
abs(box2[3] - box1[3]) < tol,
abs(box2[0] - box1[2]) < tol,
]
return all(merge)
def merge_boxes(box1, box2):
return (min(box1[0], box2[0]), min(box1[1], box2[1]), max(box2[2], box1[2]), max(box1[3], box2[3]))
def boxes_intersect(box1, box2):
return box1[0] < box2[2] and box1[2] > box2[0] and box1[1] < box2[3] and box1[3] > box2[1]
def boxes_intersect_pct(box1, box2, pct=.9):
x_left = max(box1[0], box2[0])
y_top = max(box1[1], box2[1])
x_right = min(box1[2], box2[2])
y_bottom = min(box1[3], box2[3])
if x_right < x_left or y_bottom < y_top:
return 0.0
intersection_area = (x_right - x_left) * (y_bottom - y_top)
bb1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
bb2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
iou = intersection_area / float(bb1_area + bb2_area - intersection_area)
return iou > pct
def multiple_boxes_intersect(box1, boxes):
for box2 in boxes:
if boxes_intersect(box1, box2):
return True
return False
def box_contained(box1, box2):
return box1[0] > box2[0] and box1[1] > box2[1] and box1[2] < box2[2] and box1[3] < box2[3]
def unnormalize_box(bbox, width, height):
return [
width * (bbox[0] / 1000),
height * (bbox[1] / 1000),
width * (bbox[2] / 1000),
height * (bbox[3] / 1000),
]
def correct_rotation(bbox, page):
rotation = page.rotation
if rotation == 0:
return bbox
tl = pymupdf.Point(bbox[0], bbox[1]) * page.rotation_matrix
br = pymupdf.Point(bbox[2], bbox[3]) * page.rotation_matrix
if rotation == 90:
bbox = [br[0], tl[1], tl[0], br[1]]
elif rotation == 180:
bbox = [br[0], br[1], tl[0], tl[1]]
elif rotation == 270:
bbox = [tl[0], br[1], br[0], tl[1]]
return bbox
.\marker\marker\benchmark\scoring.py
import math
from rapidfuzz import fuzz, distance
import re
CHUNK_MIN_CHARS = 25
def tokenize(text):
pattern = r'([^\w\s\d\'])|([\w\']+)|(\d+)|(\n+)|( +)'
result = re.findall(pattern, text)
flattened_result = [item for sublist in result for item in sublist if item]
return flattened_result
def chunk_text(text):
chunks = text.split("\n")
chunks = [c for c in chunks if c.strip() and len(c) > CHUNK_MIN_CHARS]
return chunks
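# Alignment-based scoring: each hypothesis chunk is fuzzy-matched (rapidfuzz) against
# reference chunks inside a sliding window around its expected position, keeping the best
# ratio. Chunks are weighted by sqrt(len(best reference chunk)), so the final score is
#   sum_i(weight_i * best_ratio_i) / sum_i(weight_i)
# which yields a value between 0 and 1.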
def overlap_score(hypothesis_chunks, reference_chunks):
length_modifier = len(hypothesis_chunks) / len(reference_chunks)
search_distance = max(len(reference_chunks) // 5, 10)
chunk_scores = []
chunk_weights = []
for i, hyp_chunk in enumerate(hypothesis_chunks):
max_score = 0
chunk_weight = 1
i_offset = int(i * length_modifier)
chunk_range = range(max(0, i_offset-search_distance), min(len(reference_chunks), i_offset+search_distance))
for j in chunk_range:
ref_chunk = reference_chunks[j]
score = fuzz.ratio(hyp_chunk, ref_chunk, score_cutoff=30) / 100
if score > max_score:
max_score = score
chunk_weight = math.sqrt(len(ref_chunk))
chunk_scores.append(max_score)
chunk_weights.append(chunk_weight)
chunk_scores = [chunk_scores[i] * chunk_weights[i] for i in range(len(chunk_scores))]
return chunk_scores, chunk_weights
def score_text(hypothesis, reference):
hypothesis_chunks = chunk_text(hypothesis)
reference_chunks = chunk_text(reference)
chunk_scores, chunk_weights = overlap_score(hypothesis_chunks, reference_chunks)
return sum(chunk_scores) / sum(chunk_weights)
.\marker\marker\cleaners\bullets.py
import re
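# Normalizes unicode bullet characters to markdown dashes when they are followed by a
# space, e.g. "• item" -> "- item".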
def replace_bullets(text):
bullet_pattern = r"(^|[\n ])[•●○■▪▫–—]( )"
replaced_string = re.sub(bullet_pattern, r"\1-\2", text)
return replaced_string
.\marker\marker\cleaners\code.py
from marker.schema import Span, Line, Page
import re
from typing import List
import fitz as pymupdf
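# Heuristic code-block detection. A Text block is relabeled as Code when it has more than
# 3 lines, most of its spans use a non-dominant font, the average alphanumeric characters
# per line fall below `thresh`, and a fair share of lines are either indented past the
# page's minimum line start or begin with a comment marker. A block is also relabeled
# when the previous block was Code and nearly all of its lines are indented.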
def is_code_linelen(lines, thresh=60):
total_alnum_chars = sum(len(re.findall(r'\w', line.prelim_text)) for line in lines)
total_newlines = max(len(lines) - 1, 1)
if total_alnum_chars == 0:
return False
ratio = total_alnum_chars / total_newlines
return ratio < thresh
def comment_count(lines):
pattern = re.compile(r"^(//|#|'|--|/\*|'''|\"\"\"|--\[\[|<!--|%|%{|\(\*)")
return sum([1 for line in lines if pattern.match(line)])
def identify_code_blocks(blocks: List[Page]):
code_block_count = 0
font_info = None
for p in blocks:
stats = p.get_font_stats()
if font_info is None:
font_info = stats
else:
font_info += stats
try:
most_common_font = font_info.most_common(1)[0][0]
except IndexError:
print(f"Could not find most common font")
most_common_font = None
last_block = None
for page in blocks:
try:
min_start = page.get_min_line_start()
except IndexError:
continue
for block in page.blocks:
if block.most_common_block_type() != "Text":
last_block = block
continue
is_indent = []
line_fonts = []
for line in block.lines:
fonts = [span.font for span in line.spans]
line_fonts += fonts
line_start = line.bbox[0]
if line_start > min_start:
is_indent.append(True)
else:
is_indent.append(False)
comment_lines = comment_count([line.prelim_text for line in block.lines])
is_code = [
len(block.lines) > 3,
sum([f != most_common_font for f in line_fonts]) > len(line_fonts) * .8,
is_code_linelen(block.lines),
(
sum(is_indent) > len(block.lines) * .2
or
comment_lines > len(block.lines) * .2
),
]
is_code_prev = [
last_block and last_block.most_common_block_type() == "Code",
sum(is_indent) >= len(block.lines) * .8
]
if all(is_code) or all(is_code_prev):
code_block_count += 1
block.set_block_type("Code")
last_block = block
return code_block_count
def indent_blocks(blocks: List[Page]):
span_counter = 0
for page in blocks:
for block in page.blocks:
block_types = [span.block_type for line in block.lines for span in line.spans]
if "Code" not in block_types:
continue
lines = []
min_left = 1000
col_width = 0
for line in block.lines:
text = ""
min_left = min(line.bbox[0], min_left)
for span in line.spans:
if col_width == 0 and len(span.text) > 0:
col_width = (span.bbox[2] - span.bbox[0]) / len(span.text)
text += span.text
lines.append((pymupdf.Rect(line.bbox), text))
block_text = ""
blank_line = False
for line in lines:
text = line[1]
prefix = " " * int((line[0].x0 - min_left) / col_width)
current_line_blank = len(text.strip()) == 0
if blank_line and current_line_blank:
continue
block_text += prefix + text + "\n"
blank_line = current_line_blank
new_span = Span(
text=block_text,
bbox=block.bbox,
color=block.lines[0].spans[0].color,
span_id=f"{span_counter}_fix_code",
font=block.lines[0].spans[0].font,
block_type="Code"
)
span_counter += 1
block.lines = [Line(spans=[new_span], bbox=block.bbox)]
.\marker\marker\cleaners\equations.py
import io
from copy import deepcopy
from functools import partial
from typing import List
import torch
from texify.inference import batch_inference
from texify.model.model import load_model
from texify.model.processor import load_processor
import re
from PIL import Image, ImageDraw
from marker.bbox import should_merge_blocks, merge_boxes
from marker.debug.data import dump_equation_debug_data
from marker.settings import settings
from marker.schema import Page, Span, Line, Block, BlockType
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
processor = load_processor()
def load_texify_model():
texify_model = load_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=settings.TORCH_DEVICE_MODEL, dtype=settings.TEXIFY_DTYPE)
return texify_model
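# Equation pipeline: formula regions are rendered to images (with everything outside the
# selected bboxes masked to white), batched through the texify model, and the resulting
# LaTeX replaces the original blocks. Predictions are discarded when they hit the token
# limit or come back much shorter than the source text (see replace_blocks_with_latex).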
def mask_bbox(png_image, bbox, selected_bboxes):
mask = Image.new('L', png_image.size, 0)
draw = ImageDraw.Draw(mask)
first_x = bbox[0]
first_y = bbox[1]
bbox_height = bbox[3] - bbox[1]
bbox_width = bbox[2] - bbox[0]
for box in selected_bboxes:
new_box = (box[0] - first_x, box[1] - first_y, box[2] - first_x, box[3] - first_y)
resized = (
new_box[0] / bbox_width * png_image.size[0],
new_box[1] / bbox_height * png_image.size[1],
new_box[2] / bbox_width * png_image.size[0],
new_box[3] / bbox_height * png_image.size[1]
)
draw.rectangle(resized, fill=255)
result = Image.composite(png_image, Image.new('RGBA', png_image.size, 'white'), mask)
return result
def get_masked_image(page, bbox, selected_bboxes):
pix = page.get_pixmap(dpi=settings.TEXIFY_DPI, clip=bbox)
png = pix.pil_tobytes(format="PNG")
png_image = Image.open(io.BytesIO(png))
png_image = mask_bbox(png_image, bbox, selected_bboxes)
png_image = png_image.convert("RGB")
return png_image
def get_latex_batched(images, reformat_region_lens, texify_model, batch_size):
if len(images) == 0:
return []
predictions = [""] * len(images)
for i in range(0, len(images), batch_size):
min_idx = i
max_idx = min(min_idx + batch_size, len(images))
max_length = max(reformat_region_lens[min_idx:max_idx])
max_length = min(max_length, settings.TEXIFY_MODEL_MAX)
max_length += settings.TEXIFY_TOKEN_BUFFER
model_output = batch_inference(images[min_idx:max_idx], texify_model, processor, max_tokens=max_length)
for j, output in enumerate(model_output):
token_count = get_total_texify_tokens(output)
if token_count >= max_length - 1:
output = ""
image_idx = i + j
predictions[image_idx] = output
return predictions
def get_total_texify_tokens(text):
tokenizer = processor.tokenizer
tokens = tokenizer(text)
return len(tokens["input_ids"])
def find_page_equation_regions(pnum, page, block_types):
i = 0
equation_boxes = [b.bbox for b in block_types[pnum] if b.block_type == "Formula"]
reformatted_blocks = set()
reformat_regions = []
block_lens = []
return reformat_regions, block_lens
def get_bboxes_for_region(page, region):
bboxes = []
merged_box = None
for idx in region:
block = page.blocks[idx]
bbox = block.bbox
if merged_box is None:
merged_box = bbox
else:
merged_box = merge_boxes(merged_box, bbox)
bboxes.append(bbox)
return bboxes, merged_box
def replace_blocks_with_latex(page_blocks: Page, merged_boxes, reformat_regions, predictions, pnum):
new_blocks = []
converted_spans = []
current_region = 0
idx = 0
success_count = 0
fail_count = 0
while idx < len(page_blocks.blocks):
block = page_blocks.blocks[idx]
if current_region >= len(reformat_regions) or idx < reformat_regions[current_region][0]:
new_blocks.append(block)
idx += 1
continue
orig_block_text = " ".join([page_blocks.blocks[i].prelim_text for i in reformat_regions[current_region]])
latex_text = predictions[current_region]
conditions = [
len(latex_text) > 0,
get_total_texify_tokens(latex_text) < settings.TEXIFY_MODEL_MAX,
len(latex_text) > len(orig_block_text) * .8,
len(latex_text.strip()) > 0
]
idx = reformat_regions[current_region][-1] + 1
if not all(conditions):
fail_count += 1
converted_spans.append(None)
for i in reformat_regions[current_region]:
new_blocks.append(page_blocks.blocks[i])
else:
success_count += 1
block_line = Line(
spans=[
Span(
text=latex_text,
bbox=merged_boxes[current_region],
span_id=f"{pnum}_{idx}_fixeq",
font="Latex",
color=0,
block_type="Formula"
)
],
bbox=merged_boxes[current_region]
)
converted_spans.append(deepcopy(block_line.spans[0]))
new_blocks.append(Block(
lines=[block_line],
bbox=merged_boxes[current_region],
pnum=pnum
))
current_region += 1
return new_blocks, success_count, fail_count, converted_spans
def replace_equations(doc, blocks: List[Page], block_types: List[List[BlockType]], texify_model, batch_size=settings.TEXIFY_BATCH_SIZE):
unsuccessful_ocr = 0
successful_ocr = 0
reformat_regions = []
reformat_region_lens = []
for pnum, page in enumerate(blocks):
regions, region_lens = find_page_equation_regions(pnum, page, block_types)
reformat_regions.append(regions)
reformat_region_lens.append(region_lens)
eq_count = sum([len(x) for x in reformat_regions])
flat_reformat_region_lens = [item for sublist in reformat_region_lens for item in sublist]
images = []
merged_boxes = []
for page_idx, reformat_regions_page in enumerate(reformat_regions):
page_obj = doc[page_idx]
for reformat_region in reformat_regions_page:
bboxes, merged_box = get_bboxes_for_region(blocks[page_idx], reformat_region)
png_image = get_masked_image(page_obj, merged_box, bboxes)
images.append(png_image)
merged_boxes.append(merged_box)
predictions = get_latex_batched(images, flat_reformat_region_lens, texify_model, batch_size)
page_start = 0
converted_spans = []
for page_idx, reformat_regions_page in enumerate(reformat_regions):
page_predictions = predictions[page_start:page_start + len(reformat_regions_page)]
page_boxes = merged_boxes[page_start:page_start + len(reformat_regions_page)]
new_page_blocks, success_count, fail_count, converted_span = replace_blocks_with_latex(
blocks[page_idx],
page_boxes,
reformat_regions_page,
page_predictions,
page_idx
)
converted_spans.extend(converted_span)
blocks[page_idx].blocks = new_page_blocks
page_start += len(reformat_regions_page)
successful_ocr += success_count
unsuccessful_ocr += fail_count
dump_equation_debug_data(doc, images, converted_spans)
return blocks, {"successful_ocr": successful_ocr, "unsuccessful_ocr": unsuccessful_ocr, "equations": eq_count}
.\marker\marker\cleaners\headers.py
import re
from collections import Counter, defaultdict
from itertools import chain
from thefuzz import fuzz
from sklearn.cluster import DBSCAN
import numpy as np
from marker.schema import Page, FullyMergedBlock
from typing import List, Tuple
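# Header/footer removal in this module uses two signals: span text that repeats on more
# than ~60% of pages within the first/last lines of each page, and a DBSCAN clustering
# over span (bbox, text length) features where everything outside the dominant cluster is
# treated as a likely page artifact. Both return span ids to filter out downstream.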
def filter_common_elements(lines, page_count):
text = [s.text for line in lines for s in line.spans if len(s.text) > 4]
counter = Counter(text)
common = [k for k, v in counter.items() if v > page_count * .6]
bad_span_ids = [s.span_id for line in lines for s in line.spans if s.text in common]
return bad_span_ids
def filter_header_footer(all_page_blocks, max_selected_lines=2):
first_lines = []
last_lines = []
for page in all_page_blocks:
nonblank_lines = page.get_nonblank_lines()
first_lines.extend(nonblank_lines[:max_selected_lines])
last_lines.extend(nonblank_lines[-max_selected_lines:])
bad_span_ids = filter_common_elements(first_lines, len(all_page_blocks))
bad_span_ids += filter_common_elements(last_lines, len(all_page_blocks))
return bad_span_ids
def categorize_blocks(all_page_blocks: List[Page]):
spans = list(chain.from_iterable([p.get_nonblank_spans() for p in all_page_blocks]))
X = np.array(
[(*s.bbox, len(s.text)) for s in spans]
)
dbscan = DBSCAN(eps=.1, min_samples=5)
dbscan.fit(X)
labels = dbscan.labels_
label_chars = defaultdict(int)
for i, label in enumerate(labels):
label_chars[label] += len(spans[i].text)
most_common_label = None
most_chars = 0
for i in label_chars.keys():
if label_chars[i] > most_chars:
most_common_label = i
most_chars = label_chars[i]
labels = [0 if label == most_common_label else 1 for label in labels]
bad_span_ids = [spans[i].span_id for i in range(len(spans)) if labels[i] == 1]
return bad_span_ids
def replace_leading_trailing_digits(string, replacement):
string = re.sub(r'^\d+', replacement, string)
string = re.sub(r'\d+$', replacement, string)
return string
def find_overlap_elements(lst: List[Tuple[str, int]], string_match_thresh=.9, min_overlap=.05) -> List[int]:
result = []
titles = [l[0] for l in lst]
for i, (str1, id_num) in enumerate(lst):
overlap_count = 0
for j, str2 in enumerate(titles):
if i != j and fuzz.ratio(str1, str2) >= string_match_thresh * 100:
overlap_count += 1
if overlap_count >= max(3.0, len(lst) * min_overlap):
result.append(id_num)
return result
def filter_common_titles(merged_blocks: List[FullyMergedBlock]) -> List[FullyMergedBlock]:
titles = []
for i, block in enumerate(merged_blocks):
if block.block_type in ["Title", "Section-header"]:
text = block.text
if text.strip().startswith("#"):
text = re.sub(r'#+', '', text)
text = text.strip()
text = replace_leading_trailing_digits(text, "").strip()
titles.append((text, i))
bad_block_ids = find_overlap_elements(titles)
new_blocks = []
for i, block in enumerate(merged_blocks):
if i in bad_block_ids:
continue
new_blocks.append(block)
return new_blocks
.\marker\marker\cleaners\table.py
from marker.bbox import merge_boxes
from marker.schema import Line, Span, Block, Page
from copy import deepcopy
from tabulate import tabulate
from typing import List
import re
import textwrap
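# Table handling: adjacent blocks typed as Table are first merged into a single block,
# then rows are rebuilt by grouping spans that share a y_start coordinate and rendered
# with tabulate's "github" format. Oversized or degenerate tables (very long rows, more
# than 8 or fewer than 2 columns) are left untouched.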
def merge_table_blocks(blocks: List[Page]):
current_lines = []
current_bbox = None
for page in blocks:
new_page_blocks = []
pnum = page.pnum
for block in page.blocks:
if block.most_common_block_type() != "Table":
if len(current_lines) > 0:
new_block = Block(
lines=deepcopy(current_lines),
pnum=pnum,
bbox=current_bbox
)
new_page_blocks.append(new_block)
current_lines = []
current_bbox = None
new_page_blocks.append(block)
continue
current_lines.extend(block.lines)
if current_bbox is None:
current_bbox = block.bbox
else:
current_bbox = merge_boxes(current_bbox, block.bbox)
if len(current_lines) > 0:
new_block = Block(
lines=deepcopy(current_lines),
pnum=pnum,
bbox=current_bbox
)
new_page_blocks.append(new_block)
current_lines = []
current_bbox = None
page.blocks = new_page_blocks
def create_new_tables(blocks: List[Page]):
table_idx = 0
dot_pattern = re.compile(r'(\s*\.\s*){4,}')
dot_multiline_pattern = re.compile(r'.*(\s*\.\s*){4,}.*', re.DOTALL)
for page in blocks:
for block in page.blocks:
if block.most_common_block_type() != "Table" or len(block.lines) < 3:
continue
table_rows = []
y_coord = None
row = []
for line in block.lines:
for span in line.spans:
if y_coord != span.y_start:
if len(row) > 0:
table_rows.append(row)
row = []
y_coord = span.y_start
text = span.text
if dot_multiline_pattern.match(text):
text = dot_pattern.sub(' ', text)
row.append(text)
if len(row) > 0:
table_rows.append(row)
if max([len("".join(r)) for r in table_rows]) > 300 or len(table_rows[0]) > 8 or len(table_rows[0]) < 2:
continue
new_text = tabulate(table_rows, headers="firstrow", tablefmt="github")
new_span = Span(
bbox=block.bbox,
span_id=f"{table_idx}_fix_table",
font="Table",
color=0,
block_type="Table",
text=new_text
)
new_line = Line(
bbox=block.bbox,
spans=[new_span]
)
block.lines = [new_line]
table_idx += 1
return table_idx
.\marker\marker\convert.py
import fitz as pymupdf
from marker.cleaners.table import merge_table_blocks, create_new_tables
from marker.debug.data import dump_bbox_debug_data
from marker.extract_text import get_text_blocks
from marker.cleaners.headers import filter_header_footer, filter_common_titles
from marker.cleaners.equations import replace_equations
from marker.ordering import order_blocks
from marker.postprocessors.editor import edit_full_text
from marker.segmentation import detect_document_block_types
from marker.cleaners.code import identify_code_blocks, indent_blocks
from marker.cleaners.bullets import replace_bullets
from marker.markdown import merge_spans, merge_lines, get_full_text
from marker.schema import Page, BlockType
from typing import List, Dict, Tuple, Optional
import re
import magic
from marker.settings import settings
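# High-level conversion pipeline (see convert_single_pdf below): extract or OCR text
# blocks, detect layout block types, reorder blocks, clean up code/table/equation blocks,
# merge spans and lines into markdown, then run the editor model as a postprocessing
# pass. Each stage records statistics in out_meta.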
def find_filetype(fpath):
mimetype = magic.from_file(fpath).lower()
if "pdf" in mimetype:
return "pdf"
elif "epub" in mimetype:
return "epub"
elif "mobi" in mimetype:
return "mobi"
elif mimetype in settings.SUPPORTED_FILETYPES:
return settings.SUPPORTED_FILETYPES[mimetype]
else:
print(f"Found nonstandard filetype {mimetype}")
return "other"
def annotate_spans(blocks: List[Page], block_types: List[BlockType]):
for i, page in enumerate(blocks):
page_block_types = block_types[i]
page.add_block_types(page_block_types)
def get_length_of_text(fname: str) -> int:
filetype = find_filetype(fname)
if filetype == "other":
return 0
doc = pymupdf.open(fname, filetype=filetype)
full_text = ""
for page in doc:
full_text += page.get_text("text", sort=True, flags=settings.TEXT_FLAGS)
return len(full_text)
def convert_single_pdf(
fname: str,
model_lst: List,
max_pages=None,
metadata: Optional[Dict]=None,
parallel_factor: int = 1
) -> Tuple[str, Dict]:
lang = settings.DEFAULT_LANG
if metadata:
lang = metadata.get("language", settings.DEFAULT_LANG)
tess_lang = settings.TESSERACT_LANGUAGES.get(lang, "eng")
spell_lang = settings.SPELLCHECK_LANGUAGES.get(lang, None)
if "eng" not in tess_lang:
tess_lang = f"eng+{tess_lang}"
out_meta = {"language": lang}
filetype = find_filetype(fname)
if filetype == "other":
return "", out_meta
out_meta["filetype"] = filetype
doc = pymupdf.open(fname, filetype=filetype)
if filetype != "pdf":
conv = doc.convert_to_pdf()
doc = pymupdf.open("pdf", conv)
blocks, toc, ocr_stats = get_text_blocks(
doc,
tess_lang,
spell_lang,
max_pages=max_pages,
parallel=int(parallel_factor * settings.OCR_PARALLEL_WORKERS)
)
out_meta["toc"] = toc
out_meta["pages"] = len(blocks)
out_meta["ocr_stats"] = ocr_stats
if len([b for p in blocks for b in p.blocks]) == 0:
print(f"Could not extract any text blocks for {fname}")
return "", out_meta
texify_model, layoutlm_model, order_model, edit_model = model_lst
block_types = detect_document_block_types(
doc,
blocks,
layoutlm_model,
batch_size=int(settings.LAYOUT_BATCH_SIZE * parallel_factor)
)
bad_span_ids = filter_header_footer(blocks)
out_meta["block_stats"] = {"header_footer": len(bad_span_ids)}
annotate_spans(blocks, block_types)
dump_bbox_debug_data(doc, blocks)
blocks = order_blocks(
doc,
blocks,
order_model,
batch_size=int(settings.ORDERER_BATCH_SIZE * parallel_factor)
)
code_block_count = identify_code_blocks(blocks)
out_meta["block_stats"]["code"] = code_block_count
indent_blocks(blocks)
merge_table_blocks(blocks)
table_count = create_new_tables(blocks)
out_meta["block_stats"]["table"] = table_count
for page in blocks:
for block in page.blocks:
block.filter_spans(bad_span_ids)
block.filter_bad_span_types()
filtered, eq_stats = replace_equations(
doc,
blocks,
block_types,
texify_model,
batch_size=int(settings.TEXIFY_BATCH_SIZE * parallel_factor)
)
out_meta["block_stats"]["equations"] = eq_stats
merged_lines = merge_spans(filtered)
text_blocks = merge_lines(merged_lines, filtered)
text_blocks = filter_common_titles(text_blocks)
full_text = get_full_text(text_blocks)
full_text = re.sub(r'\n{3,}', '\n\n', full_text)
full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
full_text = replace_bullets(full_text)
full_text, edit_stats = edit_full_text(
full_text,
edit_model,
batch_size=settings.EDITOR_BATCH_SIZE * parallel_factor
)
out_meta["postprocess_stats"] = {"edit": edit_stats}
return full_text, out_meta
.\marker\marker\debug\data.py
import base64
import json
import os
import zlib
from typing import List
from marker.schema import Page
from marker.settings import settings
from PIL import Image
import io
def dump_equation_debug_data(doc, images, converted_spans):
if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL == 0:
return
if len(images) == 0:
return
assert len(converted_spans) == len(images)
data_lines = []
for idx, (pil_image, converted_span) in enumerate(zip(images, converted_spans)):
if converted_span is None:
continue
img_bytes = io.BytesIO()
pil_image.save(img_bytes, format="WEBP", lossless=True)
b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
data_lines.append({
"image": b64_image,
"text": converted_span.text,
"bbox": converted_span.bbox
})
doc_base = os.path.basename(doc.name).rsplit(".", 1)[0]
debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_equations.json")
with open(debug_file, "w+") as f:
json.dump(data_lines, f)
def dump_bbox_debug_data(doc, blocks: List[Page]):
if not settings.DEBUG_DATA_FOLDER or settings.DEBUG_LEVEL < 2:
return
doc_base = os.path.basename(doc.name).rsplit(".", 1)[0]
debug_file = os.path.join(settings.DEBUG_DATA_FOLDER, f"{doc_base}_bbox.json")
debug_data = []
for idx, page_blocks in enumerate(blocks):
page = doc[idx]
pix = page.get_pixmap(dpi=settings.TEXIFY_DPI, annots=False, clip=page_blocks.bbox)
png = pix.pil_tobytes(format="PNG")
png_image = Image.open(io.BytesIO(png))
width, height = png_image.size
max_dimension = 6000
if width > max_dimension or height > max_dimension:
scaling_factor = min(max_dimension / width, max_dimension / height)
png_image = png_image.resize((int(width * scaling_factor), int(height * scaling_factor)), Image.LANCZOS)
img_bytes = io.BytesIO()
png_image.save(img_bytes, format="WEBP", lossless=True, quality=100)
b64_image = base64.b64encode(img_bytes.getvalue()).decode("utf-8")
page_data = page_blocks.model_dump()
page_data["image"] = b64_image
debug_data.append(page_data)
with open(debug_file, "w+") as f:
json.dump(debug_data, f)
.\marker\marker\extract_text.py
import os
from typing import Tuple, List, Optional
from spellchecker import SpellChecker
from marker.bbox import correct_rotation
from marker.ocr.page import ocr_entire_page
from marker.ocr.utils import detect_bad_ocr, font_flags_decomposer
from marker.settings import settings
from marker.schema import Span, Line, Block, Page
from concurrent.futures import ThreadPoolExecutor
os.environ["TESSDATA_PREFIX"] = settings.TESSDATA_PREFIX
def sort_rotated_text(page_blocks, tolerance=1.25):
vertical_groups = {}
for block in page_blocks:
group_key = round(block.bbox[1] / tolerance) * tolerance
if group_key not in vertical_groups:
vertical_groups[group_key] = []
vertical_groups[group_key].append(block)
sorted_page_blocks = []
for _, group in sorted(vertical_groups.items()):
sorted_group = sorted(group, key=lambda x: x.bbox[0])
sorted_page_blocks.extend(sorted_group)
return sorted_page_blocks
def get_single_page_blocks(doc, pnum: int, tess_lang: str, spellchecker: Optional[SpellChecker] = None, ocr=False) -> List[Block]:
page = doc[pnum]
rotation = page.rotation
if ocr:
blocks = ocr_entire_page(page, tess_lang, spellchecker)
else:
blocks = page.get_text("dict", sort=True, flags=settings.TEXT_FLAGS)["blocks"]
page_blocks = []
span_id = 0
for block_idx, block in enumerate(blocks):
block_lines = []
for l in block["lines"]:
spans = []
for i, s in enumerate(l["spans"]):
block_text = s["text"]
bbox = s["bbox"]
span_obj = Span(
text=block_text,
bbox=correct_rotation(bbox, page),
span_id=f"{pnum}_{span_id}",
font=f"{s['font']}_{font_flags_decomposer(s['flags'])}",
color=s["color"],
ascender=s["ascender"],
descender=s["descender"],
)
spans.append(span_obj)
span_id += 1
line_obj = Line(
spans=spans,
bbox=correct_rotation(l["bbox"], page),
)
if line_obj.area > 0:
block_lines.append(line_obj)
block_obj = Block(
lines=block_lines,
bbox=correct_rotation(block["bbox"], page),
pnum=pnum
)
if len(block_lines) > 0:
page_blocks.append(block_obj)
if rotation > 0:
page_blocks = sort_rotated_text(page_blocks)
return page_blocks
def convert_single_page(doc, pnum, tess_lang: str, spell_lang: Optional[str], no_text: bool, disable_ocr: bool = False, min_ocr_page: int = 2):
ocr_pages = 0
ocr_success = 0
ocr_failed = 0
spellchecker = None
page_bbox = doc[pnum].bound()
if spell_lang:
spellchecker = SpellChecker(language=spell_lang)
blocks = get_single_page_blocks(doc, pnum, tess_lang, spellchecker)
page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox)
conditions = [
(
no_text
or
(len(page_obj.prelim_text) > 0 and detect_bad_ocr(page_obj.prelim_text, spellchecker))
),
min_ocr_page < pnum < len(doc) - 1,
not disable_ocr
]
if all(conditions) or settings.OCR_ALL_PAGES:
page = doc[pnum]
blocks = get_single_page_blocks(doc, pnum, tess_lang, spellchecker, ocr=True)
page_obj = Page(blocks=blocks, pnum=pnum, bbox=page_bbox, rotation=page.rotation)
ocr_pages = 1
if len(blocks) == 0:
ocr_failed = 1
else:
ocr_success = 1
return page_obj, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
def get_text_blocks(doc, tess_lang: str, spell_lang: Optional[str], max_pages: Optional[int] = None, parallel: int = settings.OCR_PARALLEL_WORKERS):
all_blocks = []
toc = doc.get_toc()
ocr_pages = 0
ocr_failed = 0
ocr_success = 0
range_end = len(doc)
no_text = len(naive_get_text(doc).strip()) == 0
if max_pages:
range_end = min(max_pages, len(doc))
with ThreadPoolExecutor(max_workers=parallel) as pool:
args_list = [(doc, pnum, tess_lang, spell_lang, no_text) for pnum in range(range_end)]
if parallel == 1:
func = map
else:
func = pool.map
results = func(lambda a: convert_single_page(*a), args_list)
for result in results:
page_obj, ocr_stats = result
all_blocks.append(page_obj)
ocr_pages += ocr_stats["ocr_pages"]
ocr_failed += ocr_stats["ocr_failed"]
ocr_success += ocr_stats["ocr_success"]
return all_blocks, toc, {"ocr_pages": ocr_pages, "ocr_failed": ocr_failed, "ocr_success": ocr_success}
def naive_get_text(doc):
full_text = ""
for page in doc:
full_text += page.get_text("text", sort=True, flags=settings.TEXT_FLAGS)
full_text += "\n"
return full_text
.\marker\marker\logger.py
import logging
import warnings
import fitz as pymupdf
def configure_logging():
logging.basicConfig(level=logging.WARNING)
logging.getLogger('pdfminer').setLevel(logging.ERROR)
logging.getLogger('PIL').setLevel(logging.ERROR)
logging.getLogger('fitz').setLevel(logging.ERROR)
logging.getLogger('ocrmypdf').setLevel(logging.ERROR)
pymupdf.TOOLS.mupdf_display_errors(False)
warnings.simplefilter(action='ignore', category=FutureWarning)
.\marker\marker\markdown.py
from marker.schema import MergedLine, MergedBlock, FullyMergedBlock, Page
import re
from typing import List
def surround_text(s, char_to_insert):
leading_whitespace = re.match(r'^(\s*)', s).group(1)
trailing_whitespace = re.search(r'(\s*)$', s).group(1)
stripped_string = s.strip()
modified_string = char_to_insert + stripped_string + char_to_insert
final_string = leading_whitespace + modified_string + trailing_whitespace
return final_string
def merge_spans(blocks):
merged_blocks = []
return merged_blocks
def block_surround(text, block_type):
if block_type == "Section-header":
if not text.startswith("#"):
text = "\n## " + text.strip().title() + "\n"
elif block_type == "Title":
if not text.startswith("#"):
text = "# " + text.strip().title() + "\n"
elif block_type == "Table":
text = "\n" + text + "\n"
elif block_type == "List-item":
pass
elif block_type == "Code":
text = "\n" + text + "\n"
return text
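# line_separator decides how two consecutive lines are joined. Hyphenated words are
# stitched back together, e.g. line_separator("exam-", "ple text", "Text") -> "example text",
# while sentence-ending lines in Text blocks get a paragraph break.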
def line_separator(line1, line2, block_type, is_continuation=False):
lowercase_letters = "a-zà-öø-ÿа-яşćăâđêôơưþðæøå"
uppercase_letters = "A-ZÀ-ÖØ-ßА-ЯŞĆĂÂĐÊÔƠƯÞÐÆØÅ"
hyphen_pattern = re.compile(rf'.*[{lowercase_letters}][-]\s?$', re.DOTALL)
if line1 and hyphen_pattern.match(line1) and re.match(rf"^[{lowercase_letters}]", line2):
line1 = re.split(r"[-—]\s?$", line1)[0]
return line1.rstrip() + line2.lstrip()
lowercase_pattern1 = re.compile(rf'.*[{lowercase_letters},]\s?$', re.DOTALL)
lowercase_pattern2 = re.compile(rf'^\s?[{uppercase_letters}{lowercase_letters}]', re.DOTALL)
end_pattern = re.compile(r'.*[.?!]\s?$', re.DOTALL)
if block_type in ["Title", "Section-header"]:
return line1.rstrip() + " " + line2.lstrip()
elif lowercase_pattern1.match(line1) and lowercase_pattern2.match(line2) and block_type == "Text":
return line1.rstrip() + " " + line2.lstrip()
elif is_continuation:
return line1.rstrip() + " " + line2.lstrip()
elif block_type == "Text" and end_pattern.match(line1):
return line1 + "\n\n" + line2
elif block_type == "Formula":
return line1 + " " + line2
else:
return line1 + "\n" + line2
def block_separator(line1, line2, block_type1, block_type2):
sep = "\n"
if block_type1 == "Text":
sep = "\n\n"
return sep + line2
def merge_lines(blocks, page_blocks: List[Page]):
text_blocks = []
prev_type = None
prev_line = None
block_text = ""
block_type = ""
common_line_heights = [p.get_line_height_stats() for p in page_blocks]
for page in blocks:
for block in page:
block_type = block.most_common_block_type()
if block_type != prev_type and prev_type:
text_blocks.append(
FullyMergedBlock(
text=block_surround(block_text, prev_type),
block_type=prev_type
)
)
block_text = ""
prev_type = block_type
for i, line in enumerate(block.lines):
line_height = line.bbox[3] - line.bbox[1]
prev_line_height = prev_line.bbox[3] - prev_line.bbox[1] if prev_line else 0
prev_line_x = prev_line.bbox[0] if prev_line else 0
prev_line = line
is_continuation = line_height == prev_line_height and line.bbox[0] == prev_line_x
if block_text:
block_text = line_separator(block_text, line.text, block_type, is_continuation)
else:
block_text = line.text
text_blocks.append(
FullyMergedBlock(
text=block_surround(block_text, prev_type),
block_type=block_type
)
)
return text_blocks
def get_full_text(text_blocks):
full_text = ""
prev_block = None
for block in text_blocks:
if prev_block:
full_text += block_separator(prev_block.text, block.text, prev_block.block_type, block.block_type)
else:
full_text += block.text
prev_block = block
return full_text