diff --git a/README.md b/README.md
index 21c876bbf..3e1bc2c4e 100644
--- a/README.md
+++ b/README.md
@@ -373,6 +373,7 @@ As mentioned above, processing a pdf using `hi_res` is currently a slow operatio
 * `UNSTRUCTURED_PARALLEL_MODE_THREADS` - the number of threads making requests at once, default is `3`.
 * `UNSTRUCTURED_PARALLEL_MODE_SPLIT_SIZE` - the number of pages to be processed in one request, default is `1`.
 * `UNSTRUCTURED_PARALLEL_RETRY_ATTEMPTS` - the number of retry attempts on a retryable error, default is `2`. (i.e. 3 attempts are made in total)
+* `UNSTRUCTURED_PDF_HI_RES_MAX_PAGES` - the maximum number of pages allowed in a PDF when using the `hi_res` strategy (larger documents are rejected with a `422` error), default is `300`.
 
 Due to the overhead associated with file splitting, parallel processing mode is only recommended for the `hi_res` strategy. Additionally users of the official [Python client](https://github.com/Unstructured-IO/unstructured-python-client?tab=readme-ov-file#splitting-pdf-by-pages) can enable client-side splitting by setting `split_pdf_page=True`.
 
diff --git a/prepline_general/api/general.py b/prepline_general/api/general.py
index eb2868624..b54a42e14 100644
--- a/prepline_general/api/general.py
+++ b/prepline_general/api/general.py
@@ -35,6 +35,7 @@
 
 from prepline_general.api.models.form_params import GeneralFormParams
 from prepline_general.api.filetypes import get_validated_mimetype
+from unstructured.errors import PageCountExceededError
 from unstructured.documents.elements import Element
 from unstructured.partition.auto import partition
 from unstructured.staging.base import (
@@ -328,6 +329,7 @@ def pipeline_api(
 
     if file_content_type == "application/pdf":
         _check_pdf(file)
 
+    pdf_hi_res_max_pages = int(os.environ.get("UNSTRUCTURED_PDF_HI_RES_MAX_PAGES", 300))
     hi_res_model_name = _validate_hi_res_model_name(hi_res_model_name, coordinates)
     strategy = _validate_strategy(strategy)
@@ -373,6 +375,7 @@
                         "extract_image_block_types": extract_image_block_types,
                         "extract_image_block_to_payload": extract_image_block_to_payload,
                         "unique_element_ids": unique_element_ids,
+                        "pdf_hi_res_max_pages": pdf_hi_res_max_pages,
                     },
                     default=str,
                 )
@@ -403,6 +406,7 @@
             "extract_image_block_to_payload": extract_image_block_to_payload,
             "unique_element_ids": unique_element_ids,
             "starting_page_number": starting_page_number,
+            "pdf_hi_res_max_pages": pdf_hi_res_max_pages,
         }
 
         if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
@@ -437,6 +441,13 @@
             status_code=500,
             detail=str(e),
         )
+    except PageCountExceededError as e:
+        raise HTTPException(
+            status_code=422,
+            detail=f"{e} Check the split_pdf_page functionality of unstructured_client to send the file "
+            f"in smaller chunks.",
+        )
+
     except ValueError as e:
         if "Invalid file" in e.args[0]:
             raise HTTPException(
diff --git a/test_general/api/test_app.py b/test_general/api/test_app.py
index f2fa3211b..e33f6265e 100644
--- a/test_general/api/test_app.py
+++ b/test_general/api/test_app.py
@@ -1158,3 +1158,25 @@ def test__set_pdf_infer_table_structure(
         )
         is expected
     )
+
+
+@pytest.mark.parametrize(
+    ("strategy", "test_file", "pdf_hi_res_max_pages", "expect_code"),
+    [
+        ("hi_res", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200),
+        ("hi_res", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 422),
+        ("auto", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200),
+        ("auto", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 422),
+        ("fast", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200),
+        ("fast", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 200),
+    ],
+)
+def test_max_pages_in_hi_res(monkeypatch, strategy, test_file, pdf_hi_res_max_pages, expect_code):
+    monkeypatch.setenv("UNSTRUCTURED_PDF_HI_RES_MAX_PAGES", pdf_hi_res_max_pages)
+    client = TestClient(app)
+    response = client.post(
+        MAIN_API_ROUTE,
+        files=[("files", (str(test_file), open(test_file, "rb")))],
+        data={"strategy": strategy},
+    )
+    assert response.status_code == expect_code