Skip to content
Draft
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -373,6 +373,7 @@ As mentioned above, processing a pdf using `hi_res` is currently a slow operatio
* `UNSTRUCTURED_PARALLEL_MODE_THREADS` - the number of threads making requests at once, default is `3`.
* `UNSTRUCTURED_PARALLEL_MODE_SPLIT_SIZE` - the number of pages to be processed in one request, default is `1`.
* `UNSTRUCTURED_PARALLEL_RETRY_ATTEMPTS` - the number of retry attempts on a retryable error, default is `2`. (i.e. 3 attempts are made in total)
* `UNSTRUCTURED_MAX_PDF_PAGES` - the maximum number of pages a PDF file may contain and still be accepted with the `hi_res` strategy (larger files are rejected), default is `300`.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's name it more precisely - UNSTRUCTURED_PDF_HI_RES_MAX_PAGES


Due to the overhead associated with file splitting, parallel processing mode is only recommended for the `hi_res` strategy. Additionally, users of the official [Python client](https://github.com/Unstructured-IO/unstructured-python-client?tab=readme-ov-file#splitting-pdf-by-pages) can enable client-side splitting by setting `split_pdf_page=True`.

Expand Down
11 changes: 11 additions & 0 deletions prepline_general/api/general.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@

from prepline_general.api.models.form_params import GeneralFormParams
from prepline_general.api.filetypes import get_validated_mimetype
from unstructured.errors import PageCountExceededError
from unstructured.documents.elements import Element
from unstructured.partition.auto import partition
from unstructured.staging.base import (
Expand Down Expand Up @@ -328,6 +329,7 @@ def pipeline_api(

if file_content_type == "application/pdf":
_check_pdf(file)
max_pages = int(os.environ.get("UNSTRUCTURED_MAX_PDF_PAGES", 300))

hi_res_model_name = _validate_hi_res_model_name(hi_res_model_name, coordinates)
strategy = _validate_strategy(strategy)
Expand Down Expand Up @@ -373,6 +375,7 @@ def pipeline_api(
"extract_image_block_types": extract_image_block_types,
"extract_image_block_to_payload": extract_image_block_to_payload,
"unique_element_ids": unique_element_ids,
"max_pages": max_pages,
},
default=str,
)
Expand Down Expand Up @@ -403,6 +406,7 @@ def pipeline_api(
"extract_image_block_to_payload": extract_image_block_to_payload,
"unique_element_ids": unique_element_ids,
"starting_page_number": starting_page_number,
"max_pages": max_pages,
}

if file_content_type == "application/pdf" and pdf_parallel_mode_enabled:
Expand Down Expand Up @@ -437,6 +441,13 @@ def pipeline_api(
status_code=500,
detail=str(e),
)
except PageCountExceededError as e:
raise HTTPException(
status_code=422,
detail=f"{e} Check the split_pdf_page functionality of unstructured_client to send the file "
f"in smaller chunks.",
)

except ValueError as e:
if "Invalid file" in e.args[0]:
raise HTTPException(
Expand Down
22 changes: 22 additions & 0 deletions test_general/api/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -1158,3 +1158,25 @@ def test__set_pdf_infer_table_structure(
)
is expected
)


@pytest.mark.parametrize(
    ("strategy", "test_file", "max_pages", "expect_code"),
    [
        ("hi_res", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200),
        ("hi_res", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 422),
        ("auto", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200),
        ("auto", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 422),
        ("fast", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "300", 200),
        ("fast", Path("sample-docs") / "DA-1p-with-duplicate-pages.pdf", "2", 200),
    ],
)
def test_max_pages_in_hi_res(monkeypatch, strategy, test_file, max_pages, expect_code):
    """Check the HTTP status returned for a PDF under different page limits.

    Sets ``UNSTRUCTURED_MAX_PDF_PAGES`` to ``max_pages``, posts ``test_file``
    to ``MAIN_API_ROUTE`` with the given ``strategy``, and asserts that the
    response status equals ``expect_code``.

    Per the parameter table, a limit of ``"2"`` is expected to yield 422 for
    the ``hi_res`` and ``auto`` strategies and 200 for ``fast``; a limit of
    ``"300"`` is expected to yield 200 for every strategy.
    """
    # The environment variable must be a string; monkeypatch restores it
    # after the test so the limit does not leak into other tests.
    monkeypatch.setenv("UNSTRUCTURED_MAX_PDF_PAGES", max_pages)
    client = TestClient(app)

    # Open the sample inside a context manager so the handle is closed even
    # if the request or the assertion below fails.
    with open(test_file, "rb") as pdf_file:
        response = client.post(
            MAIN_API_ROUTE,
            files=[("files", (str(test_file), pdf_file))],
            data={"strategy": strategy},
        )

    assert response.status_code == expect_code