Skip to content

Commit 64c73b9

Browse files
committed
Fix tests, add way to disable ocr
1 parent bcb60b8 commit 64c73b9

6 files changed

Lines changed: 64 additions & 38 deletions

File tree

marker/builders/line.py

Lines changed: 12 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -36,10 +36,6 @@ class LineBuilder(BaseBuilder):
3636
"The batch size to use for the ocr error detection model.",
3737
"Default is None, which will use the default batch size for the model.",
3838
] = None
39-
enable_table_ocr: Annotated[
40-
bool,
41-
"Whether to skip OCR on tables. The TableProcessor will re-OCR them. Only enable if the TableProcessor is not running.",
42-
] = False
4339
layout_coverage_min_lines: Annotated[
4440
int,
4541
"The minimum number of PdfProvider lines that must be covered by the layout model",
@@ -54,17 +50,10 @@ class LineBuilder(BaseBuilder):
5450
float,
5551
"If less pages than this threshold are good, OCR will happen in the document. Otherwise it will not.",
5652
] = 0.85
57-
provider_line_detected_line_min_overlap_pct: Annotated[
58-
float,
59-
"The percentage of a provider line that has to be covered by a detected line",
60-
] = 0.1
6153
provider_line_provider_line_min_overlap_pct: Annotated[
6254
float,
6355
"The percentage of a provider line that has to be covered by a detected line",
64-
] = 0.1
65-
line_vertical_merge_threshold: Annotated[
66-
int, "The maximum pixel distance between y1s for two lines to be merged"
67-
] = 8
56+
] = 0.15
6857
excluded_for_coverage: Annotated[
6958
Tuple[BlockTypes],
7059
"A list of block types to exclude from the layout coverage check.",
@@ -86,6 +75,10 @@ class LineBuilder(BaseBuilder):
8675
bool,
8776
"Disable tqdm progress bars.",
8877
] = False
78+
disable_ocr: Annotated[
79+
bool,
80+
"Disable OCR for the document. This will only use the lines from the provider.",
81+
] = False
8982
keep_chars: Annotated[bool, "Keep individual characters."] = False
9083

9184
def __init__(
@@ -169,6 +162,9 @@ def get_all_lines(self, document: Document, provider: PdfProvider):
169162
), # Ensure provider lines don't overflow the page or intersect
170163
]
171164
)
165+
if self.disable_ocr:
166+
provider_lines_good = True
167+
172168
layout_good.append(provider_lines_good)
173169

174170
run_detection = [not good for good in layout_good]
@@ -191,12 +187,12 @@ def get_all_lines(self, document: Document, provider: PdfProvider):
191187
)
192188

193189
# Setup detection results
190+
detection_boxes = []
194191
if detection_result:
195192
detection_boxes = [
196193
PolygonBox(polygon=box.polygon) for box in detection_result.bboxes
197194
]
198-
else:
199-
detection_boxes = []
195+
200196
detection_boxes = sort_text_lines(detection_boxes)
201197

202198
if provider_lines_good:
@@ -257,6 +253,7 @@ def check_line_overlaps(
257253
provider_bboxes = [line.line.polygon.bbox for line in provider_lines]
258254
# Add a small margin to account for minor overflows
259255
page_bbox = document_page.polygon.expand(5, 5).bbox
256+
260257
for bbox in provider_bboxes:
261258
if bbox[0] < page_bbox[0]:
262259
return False
@@ -275,7 +272,7 @@ def check_line_overlaps(
275272
)
276273

277274
# There should be one intersection with itself
278-
if intersect_counts > 1:
275+
if intersect_counts > 2:
279276
return False
280277

281278
return True

tests/builders/test_document_builder.py

Lines changed: 25 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -4,20 +4,40 @@
44
from marker.schema.text.line import Line
55

66

7+
@pytest.mark.filename("thinkpython.pdf")
78
@pytest.mark.config({"page_range": [0]})
89
def test_document_builder(pdf_document):
910
first_page = pdf_document.pages[0]
10-
assert first_page.structure[0] == '/page/0/SectionHeader/0'
11+
assert first_page.structure[0] == "/page/0/SectionHeader/0"
1112

1213
first_block = first_page.get_block(first_page.structure[0])
1314
assert first_block.block_type == BlockTypes.SectionHeader
14-
assert first_block.text_extraction_method == 'pdftext'
15+
assert first_block.text_extraction_method == "pdftext"
1516

1617
first_text_block: Line = first_page.get_block(first_block.structure[0])
1718
assert first_text_block.block_type == BlockTypes.Line
1819

1920
first_span = first_page.get_block(first_text_block.structure[0])
2021
assert first_span.block_type == BlockTypes.Span
21-
assert first_span.text == 'Subspace Adversarial Training'
22-
assert first_span.font == 'NimbusRomNo9L-Medi'
23-
assert first_span.formats == ['plain']
22+
assert first_span.text == "Think Python"
23+
assert first_span.font == "URWPalladioL-Roma"
24+
assert first_span.formats == ["plain"]
25+
26+
27+
@pytest.mark.config({"page_range": [0]})
28+
def test_document_builder_inline_eq(pdf_document):
29+
first_page = pdf_document.pages[0]
30+
assert first_page.structure[0] == "/page/0/SectionHeader/0"
31+
32+
first_block = first_page.get_block(first_page.structure[0])
33+
assert first_block.block_type == BlockTypes.SectionHeader
34+
assert first_block.text_extraction_method == "surya"
35+
36+
first_text_block: Line = first_page.get_block(first_block.structure[0])
37+
assert first_text_block.block_type == BlockTypes.Line
38+
39+
first_span = first_page.get_block(first_text_block.structure[0])
40+
assert first_span.block_type == BlockTypes.Span
41+
assert first_span.text == "Subspace Adversarial Training"
42+
assert first_span.font == "NimbusRomNo9L-Medi"
43+
assert first_span.formats == ["plain"]

tests/builders/test_layout_replace.py

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,11 @@
88
from marker.schema.registry import get_block_class
99

1010

11+
@pytest.mark.filename("thinkpython.pdf")
1112
@pytest.mark.config({"page_range": [0]})
12-
def test_layout_replace(request, config, doc_provider, layout_model, ocr_error_model, detection_model):
13+
def test_layout_replace(
14+
request, config, doc_provider, layout_model, ocr_error_model, detection_model
15+
):
1316
# The llm layout builder replaces blocks - this makes sure text is still merged properly
1417
layout_builder = LayoutBuilder(layout_model, config)
1518
line_builder = LineBuilder(detection_model, ocr_error_model, config)
@@ -35,8 +38,4 @@ def test_layout_replace(request, config, doc_provider, layout_model, ocr_error_m
3538
renderer = MarkdownRenderer(config)
3639
rendered = renderer(document)
3740

38-
assert "worst-case perturbations" in rendered.markdown
39-
assert "projected gradient descent" in rendered.markdown
40-
41-
42-
41+
assert "Think Python" in rendered.markdown

tests/converters/test_ocr_converter.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@ def check_bboxes(page: OCRJSONPageOutput, lines):
3535

3636
@pytest.mark.config({"page_range": [0]})
3737
def test_ocr_converter(config, model_dict, temp_doc):
38-
_ocr_converter(config, model_dict, temp_doc, 83, 2)
38+
_ocr_converter(config, model_dict, temp_doc, 85, 2)
3939

4040

4141
@pytest.mark.filename("pres.pdf")

tests/converters/test_pdf_converter.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66

77

88
@pytest.mark.output_format("markdown")
9-
@pytest.mark.config({"page_range": [0, 1, 2, 3, 7]})
9+
@pytest.mark.config({"page_range": [0, 1, 2, 3, 7], "disable_ocr": True})
1010
def test_pdf_converter(pdf_converter: PdfConverter, temp_doc):
1111
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
1212
markdown = markdown_output.markdown
@@ -79,7 +79,7 @@ def test_pptx_converter(pdf_converter: PdfConverter, temp_doc):
7979

8080

8181
@pytest.mark.output_format("markdown")
82-
@pytest.mark.config({"page_range": [0, 1, 2, 3, 7]})
82+
@pytest.mark.config({"page_range": [0, 1, 2, 3, 7], "disable_ocr": True})
8383
def test_pdf_converter_bytes(pdf_converter: PdfConverter, temp_doc):
8484
with open(temp_doc.name, "rb") as f:
8585
data = f.read()

tests/renderers/test_markdown_renderer.py

Lines changed: 19 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -5,13 +5,22 @@
55
from marker.schema.blocks import TableCell
66

77

8-
@pytest.mark.config({"page_range": [0]})
8+
@pytest.mark.config({"page_range": [0], "disable_ocr": True})
99
def test_markdown_renderer(pdf_document):
1010
renderer = MarkdownRenderer()
1111
md = renderer(pdf_document).markdown
1212

1313
# Verify markdown
14-
assert '# Subspace Adversarial Training' in md
14+
assert "# Subspace Adversarial Training" in md
15+
16+
17+
@pytest.mark.config({"page_range": [0]})
18+
def test_markdown_renderer_auto_ocr(pdf_document):
19+
renderer = MarkdownRenderer()
20+
md = renderer(pdf_document).markdown
21+
22+
# Verify markdown
23+
assert "Subspace Adversarial Training" in md
1524

1625

1726
@pytest.mark.config({"page_range": [0, 1], "paginate_output": True})
@@ -29,12 +38,14 @@ def test_markdown_renderer_pagination_blank_last_page(pdf_document):
2938
last_page = pdf_document.pages[-1]
3039
last_page.children = []
3140
last_page.structure = []
32-
41+
3342
renderer = MarkdownRenderer({"paginate_output": True})
3443
md = renderer(pdf_document).markdown
35-
44+
3645
# Should end with pagination marker and preserve trailing newlines
37-
assert md.endswith("}\n\n") or md.endswith("}------------------------------------------------\n\n")
46+
assert md.endswith("}\n\n") or md.endswith(
47+
"}------------------------------------------------\n\n"
48+
)
3849

3950

4051
@pytest.mark.config({"page_range": [0, 1]})
@@ -48,9 +59,10 @@ def test_markdown_renderer_metadata(pdf_document):
4859
def test_markdown_renderer_images(pdf_document):
4960
renderer = MarkdownRenderer({"extract_images": False})
5061
markdown_output = renderer(pdf_document)
51-
62+
5263
assert len(markdown_output.images) == 0
53-
assert '![](' not in markdown_output.markdown
64+
assert "![](" not in markdown_output.markdown
65+
5466

5567
@pytest.mark.config({"page_range": [5]})
5668
def test_markdown_renderer_tables(pdf_document):
@@ -74,5 +86,3 @@ def test_markdown_renderer_tables(pdf_document):
7486
renderer = MarkdownRenderer()
7587
md = renderer(pdf_document).markdown
7688
assert "54 <i>.45</i> 67<br>89 $x$" in md
77-
78-

0 commit comments

Comments
 (0)