Skip to content

Commit eb6eadb

Browse files
committed
init conftest.py and move fixtures, move csv test to test_split_pdf_hook
1 parent 390038d commit eb6eadb

File tree

5 files changed

+88
-51
lines changed

5 files changed

+88
-51
lines changed

_test_unstructured_client/conftest.py

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
from __future__ import annotations
2+
3+
import os
4+
from pathlib import Path
5+
from typing import Generator
6+
import pytest
7+
8+
from unstructured_client.sdk import UnstructuredClient
9+
10+
11+
@pytest.fixture(scope="module")
def client() -> Generator[UnstructuredClient, None, None]:
    """Yield one SDK client per test module, pointed at the 'free-api' server.

    The API key is read from the ``UNSTRUCTURED_API_KEY`` environment
    variable; ``os.getenv`` hands back ``None`` when it is not set.
    """
    api_key = os.getenv("UNSTRUCTURED_API_KEY")
    yield UnstructuredClient(api_key_auth=api_key, server='free-api')
15+
16+
17+
@pytest.fixture(scope="module")
def doc_path() -> Path:
    """Return the ``_sample_docs`` directory in the parent of this file's directory."""
    tests_dir = Path(__file__).resolve().parent
    return tests_dir.parent / "_sample_docs"

_test_unstructured_client/integration/test_integration_freemium.py

Lines changed: 0 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -3,29 +3,16 @@
33
import asyncio
44
import json
55
import os
6-
from pathlib import Path
76

87
import pytest
98
from deepdiff import DeepDiff
109

1110
from unstructured_client import UnstructuredClient
1211
from unstructured_client.models import shared, operations
1312
from unstructured_client.models.errors import SDKError, ServerError, HTTPValidationError
14-
from unstructured_client.models.shared.partition_parameters import OutputFormat
1513
from unstructured_client.utils.retries import BackoffStrategy, RetryConfig
1614

1715

18-
@pytest.fixture(scope="module")
19-
def client() -> UnstructuredClient:
20-
_client = UnstructuredClient(api_key_auth=os.getenv("UNSTRUCTURED_API_KEY"), server='free-api')
21-
yield _client
22-
23-
24-
@pytest.fixture(scope="module")
25-
def doc_path() -> Path:
26-
return Path(__file__).resolve().parents[2] / "_sample_docs"
27-
28-
2916
@pytest.mark.parametrize("split_pdf", [True, False])
3017
@pytest.mark.parametrize("strategy", ["fast", "ocr_only", "hi_res"])
3118
def test_partition_strategies(split_pdf, strategy, client, doc_path):
@@ -223,29 +210,3 @@ async def call_api():
223210
uvloop.install()
224211
elements = asyncio.run(call_api())
225212
assert len(elements) > 0
226-
227-
228-
def test_partition_csv_response(client, doc_path):
229-
filename = "layout-parser-paper-fast.pdf"
230-
with open(doc_path / filename, "rb") as f:
231-
files = shared.Files(
232-
content=f.read(),
233-
file_name=filename,
234-
)
235-
236-
req = operations.PartitionRequest(
237-
partition_parameters=shared.PartitionParameters(
238-
files=files,
239-
output_format=OutputFormat.TEXT_CSV,
240-
)
241-
)
242-
243-
response = client.general.partition(request=req)
244-
assert response.status_code == 200
245-
assert response.content_type == "text/csv; charset=utf-8"
246-
assert response.elements is None
247-
assert response.csv_elements is not None
248-
assert response.csv_elements.startswith(
249-
"type,element_id,text,filetype,languages,page_number,filename,parent_id"
250-
"\nTitle,6aa0ff22f91bbe7e26e8e25ca8052acd,Layout"
251-
)

_test_unstructured_client/unit/test_request_utils.py

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1+
from __future__ import annotations
2+
13
# Get unit tests for request_utils.py module
24
import httpx
35
import pytest
46

5-
from unstructured_client._hooks.custom.request_utils import create_pdf_chunk_request_params, get_multipart_stream_fields
6-
from unstructured_client.models import shared
7+
from unstructured_client._hooks.custom.request_utils import create_pdf_chunk_request_params, create_response, get_multipart_stream_fields
78

89

910
# make the above test using @pytest.mark.parametrize
@@ -30,6 +31,7 @@ def test_get_multipart_stream_fields(input_request, expected):
3031
fields = get_multipart_stream_fields(input_request)
3132
assert fields == expected
3233

34+
3335
def test_multipart_stream_fields_raises_value_error_when_filename_is_not_set():
3436
with pytest.raises(ValueError):
3537
get_multipart_stream_fields(httpx.Request(
@@ -40,6 +42,7 @@ def test_multipart_stream_fields_raises_value_error_when_filename_is_not_set():
4042
headers={"Content-Type": "multipart/form-data; boundary=----WebKitFormBoundary7MA4YWxkTrZu0gW"}),
4143
)
4244

45+
4346
@pytest.mark.parametrize(("input_form_data", "page_number", "expected_form_data"), [
4447
(
4548
{"hello": "world"},
@@ -70,3 +73,24 @@ def test_multipart_stream_fields_raises_value_error_when_filename_is_not_set():
7073
def test_create_pdf_chunk_request_params(input_form_data, page_number, expected_form_data):
7174
form_data = create_pdf_chunk_request_params(input_form_data, page_number)
7275
assert form_data == expected_form_data
76+
77+
78+
def test_create_response_for_json():
    """A list of element dicts should come back as a 200 JSON response."""
    expected_elements = [
        {"type": "Title", "text": "Hello, World!"},
        {"type": "NarrativeText", "text": "Goodbye!"},
    ]

    json_response = create_response(expected_elements)

    assert json_response.status_code == 200
    assert json_response.headers["Content-Type"] == "application/json"
    # The payload must round-trip unchanged through JSON encoding.
    assert json_response.json() == expected_elements
87+
88+
89+
def test_create_response_for_csv():
    """A list of CSV byte chunks should come back as a 200 ``text/csv`` response.

    ``create_response`` only takes its CSV branch when it receives a list
    whose items are all ``bytes``; a plain ``str`` would be JSON-encoded
    instead. The input is therefore a list of bytes, and the expected header
    matches the value that branch sets, including the charset.
    """
    csv_chunks = [
        b'type,element_id,text,languages,page_number,filename,filetype,parent_id'
        b'\nTitle,f73329878fbbb0bb131a83e7b6daacbe,Module One - Introduction to Product'
        b' Development and Quality Assurance,[\'eng\'],1,list-item-example-1.pdf,application/pdf,'
    ]

    response = create_response(csv_chunks)

    assert response.status_code == 200
    assert response.headers["Content-Type"] == "text/csv; charset=utf-8"
    # A CSV body is not valid JSON, so decoding it must fail rather than
    # return None (json.JSONDecodeError is a ValueError subclass).
    with pytest.raises(ValueError):
        response.json()

_test_unstructured_client/unit/test_split_pdf_hook.py

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,17 +1,13 @@
11
from __future__ import annotations
22

33
import asyncio
4-
import io
5-
import logging
64
from asyncio import Task
75
from collections import Counter
86
from functools import partial
9-
from typing import Coroutine
107

11-
import httpx
128
import pytest
139
import requests
14-
from requests_toolbelt import MultipartDecoder, MultipartEncoder
10+
from requests_toolbelt import MultipartDecoder
1511

1612
from unstructured_client._hooks.custom import form_utils, pdf_utils, request_utils
1713
from unstructured_client._hooks.custom.form_utils import (
@@ -28,7 +24,9 @@
2824
SplitPdfHook,
2925
get_optimal_split_size, run_tasks,
3026
)
31-
from unstructured_client.models import shared
27+
from unstructured_client.models import operations, shared
28+
from unstructured_client.models.shared.partition_parameters import OutputFormat
29+
from unstructured_client.sdk import UnstructuredClient
3230

3331

3432
def test_unit_clear_operation():
@@ -434,3 +432,37 @@ async def test_remaining_tasks_cancelled_when_fails_disallowed():
434432
await asyncio.sleep(1)
435433
print("Cancelled amount: ", cancelled_counter["cancelled"])
436434
assert len(tasks) > cancelled_counter["cancelled"] > 0
435+
436+
437+
@pytest.mark.parametrize("split_pdf_page", [True, False])
def test_integration_get_split_csv_response(split_pdf_page, doc_path):
    """Partition a PDF against a local API and expect a CSV response.

    Runs once with PDF splitting enabled and once without. The test needs
    an unstructured-api instance listening on 127.0.0.1:8000 and fails with
    an explicit message when that server cannot be reached.

    Args:
        split_pdf_page: Value forwarded as ``split_pdf_page`` in the request.
        doc_path: Directory containing the sample documents.
    """
    # Probe the server first; the timeout keeps an unresponsive host from
    # hanging the whole test run.
    try:
        response = requests.get("http://127.0.0.1:8000/general/docs", timeout=10)
    except (requests.exceptions.ConnectionError, requests.exceptions.Timeout):
        # pytest.fail is explicit and, unlike `assert False`, is not
        # stripped when Python runs with -O.
        pytest.fail("The unstructured-api is not running on localhost:8000")
    assert response.status_code == 200

    client = UnstructuredClient(api_key_auth="", server_url="127.0.0.1:8000")
    filename = "layout-parser-paper.pdf"
    with open(doc_path / filename, "rb") as f:
        files = shared.Files(
            content=f.read(),
            file_name=filename,
        )
    req = operations.PartitionRequest(
        partition_parameters=shared.PartitionParameters(
            files=files,
            output_format=OutputFormat.TEXT_CSV,
            split_pdf_page=split_pdf_page,
        )
    )

    resp = client.general.partition(request=req)

    assert resp.status_code == 200
    assert resp.content_type == "text/csv; charset=utf-8"
    # CSV output populates csv_elements and leaves the JSON elements unset.
    assert resp.elements is None
    assert resp.csv_elements is not None
    assert resp.csv_elements.startswith(
        "type,element_id,text,filetype,languages,page_number,filename,parent_id"
    )

src/unstructured_client/_hooks/custom/request_utils.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,8 @@ def prepare_request_headers(
207207
new_headers.pop("Content-Length", None)
208208
return new_headers
209209

210-
def create_response(elements: list) -> httpx.Response:
210+
211+
def create_response(elements: list[dict[str, Any] | bytes]) -> httpx.Response:
211212
"""
212213
Creates a modified response object with updated content.
213214
@@ -218,9 +219,9 @@ def create_response(elements: list) -> httpx.Response:
218219
Returns:
219220
The modified response object with updated content.
220221
"""
221-
if not isinstance(elements[0], dict):
222-
response = httpx.Response(status_code=200, headers={"Content-Type": "text/csv"})
223-
content = b''.join(elements)
222+
if isinstance(elements, list) and all(isinstance(element, bytes) for element in elements):
223+
response = httpx.Response(status_code=200, headers={"Content-Type": "text/csv; charset=utf-8"})
224+
content = b''.join(elements) # type: ignore
224225
else:
225226
response = httpx.Response(status_code=200, headers={"Content-Type": "application/json"})
226227
content = json.dumps(elements).encode()

0 commit comments

Comments
 (0)