Skip to content

Commit fa9c18f

Browse files
committed
Fix remaining tests
1 parent 64c73b9 commit fa9c18f

File tree

4 files changed

+20
-11
lines changed

4 files changed

+20
-11
lines changed

marker/builders/line.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -53,7 +53,7 @@ class LineBuilder(BaseBuilder):
5353
provider_line_provider_line_min_overlap_pct: Annotated[
5454
float,
5555
"The percentage of a provider line that has to be covered by a detected line",
56-
] = 0.15
56+
] = 0.1
5757
excluded_for_coverage: Annotated[
5858
Tuple[BlockTypes],
5959
"A list of block types to exclude from the layout coverage check.",

tests/builders/test_document_builder.py

Lines changed: 2 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -38,6 +38,5 @@ def test_document_builder_inline_eq(pdf_document):
3838

3939
first_span = first_page.get_block(first_text_block.structure[0])
4040
assert first_span.block_type == BlockTypes.Span
41-
assert first_span.text == "Subspace Adversarial Training"
42-
assert first_span.font == "NimbusRomNo9L-Medi"
43-
assert first_span.formats == ["plain"]
41+
assert first_span.text.strip() == "Subspace Adversarial Training"
42+
assert "bold" in first_span.formats

tests/builders/test_pdf_links.py

Lines changed: 16 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -11,6 +11,7 @@
1111

1212
@pytest.mark.filename("arxiv_test.pdf")
1313
@pytest.mark.output_format("markdown")
14+
@pytest.mark.config({"disable_ocr": True})
1415
def test_pdf_links(pdf_document: Document, config, renderer, model_dict, temp_doc):
1516
first_page = pdf_document.pages[1]
1617

@@ -19,26 +20,35 @@ def test_pdf_links(pdf_document: Document, config, renderer, model_dict, temp_do
1920
artifact_dict=model_dict,
2021
processor_list=processors,
2122
renderer=classes_to_strings([renderer])[0],
22-
config=config
23+
config=config,
2324
)
2425

25-
for section_header_span in first_page.contained_blocks(pdf_document, (BlockTypes.Span,)):
26+
for section_header_span in first_page.contained_blocks(
27+
pdf_document, (BlockTypes.Span,)
28+
):
2629
if "II." in section_header_span.text:
2730
assert section_header_span.url == "#page-1-0"
2831
break
2932
else:
3033
raise ValueError("Could not find II. in the first page")
3134

32-
section_header_block = first_page.contained_blocks(pdf_document, (BlockTypes.SectionHeader,))[0]
33-
assert section_header_block.raw_text(pdf_document) == 'II. THEORETICAL FRAMEWORK\n'
35+
section_header_block = first_page.contained_blocks(
36+
pdf_document, (BlockTypes.SectionHeader,)
37+
)[0]
38+
assert section_header_block.raw_text(pdf_document) == "II. THEORETICAL FRAMEWORK\n"
3439

3540
assert first_page.refs[0].ref == "page-1-0"
3641

3742
markdown_output: MarkdownOutput = pdf_converter(temp_doc.name)
3843
markdown = markdown_output.markdown
3944

40-
assert '[II.](#page-1-0)' in markdown
45+
assert "[II.](#page-1-0)" in markdown
4146
assert '<span id="page-1-0"></span>II. THEORETICAL FRAMEWORK' in markdown
4247

43-
for ref in set([f'<span id="page-{m[0]}-{m[1]}">' for m in re.findall(r'\]\(#page-(\d+)-(\d+)\)', markdown)]):
48+
for ref in set(
49+
[
50+
f'<span id="page-{m[0]}-{m[1]}">'
51+
for m in re.findall(r"\]\(#page-(\d+)-(\d+)\)", markdown)
52+
]
53+
):
4454
assert ref in markdown, f"Reference {ref} not found in markdown"

tests/builders/test_rotated_bboxes.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,7 @@ def test_rotated_bboxes(pdf_document):
1313
text_blocks = first_page.contained_blocks(
1414
pdf_document, (BlockTypes.Text, BlockTypes.TextInlineMath)
1515
)
16-
assert len(text_lines) == 85
16+
assert len(text_lines) == 84
1717

1818
# Ensure the bbox sizes match up
1919
max_line_position = max([line.polygon.x_end for line in text_lines])

0 commit comments

Comments
 (0)