Skip to content

Commit 265c9c1

Browse files
committed
feat(client): route parse() via DWS Extract key and reject text+spatial
DWS Extract is a separate product from DWS Processor with its own API key and credit pool. Calling /extraction/parse with the Processor key returns 403. Add an optional extract_api_key constructor parameter (str or async callable) that parse() prefers over api_key when set; non-parse methods keep using api_key. Falling back to api_key keeps a single-key setup working once tenants get global DWS keys. Also reject mode='text' + output_format='spatial' before the request goes out — the text mode only produces markdown, so the combination would 502 on the server side. Surface it as a ValidationError with guidance. Addresses PR #47 review feedback from HungKNguyen.
1 parent ba5fb0d commit 265c9c1

6 files changed

Lines changed: 193 additions & 10 deletions

File tree

README.md

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,15 @@ For a complete list of available methods with examples, see the [Methods Documen
9494
**content-extraction workflows** where you need to feed document content into a
9595
downstream pipeline rather than render or transform the document itself:
9696

97+
> **Heads up — separate API key.** DWS Extract is a different product from
98+
> DWS Processor and has its own API key. Pass it as
99+
> `NutrientClient(api_key=..., extract_api_key=...)`; the Extract key is
100+
> used only for `parse()`, while every other method continues to use the
101+
> Processor key. Using the Processor key against `/extraction/parse`
102+
> returns `403`. If `extract_api_key` is omitted, `parse()` falls back to
103+
> the main `api_key` — that path works once your tenant moves to global
104+
> DWS API keys.
105+
97106
- **RAG (retrieval-augmented generation) pipelines** — pull a clean Markdown
98107
representation of a document for chunking, embedding, and indexing in a
99108
vector store.
@@ -114,14 +123,21 @@ downstream pipeline rather than render or transform the document itself:
114123
| `markdown` | RAG, search indexing, content migration — anywhere structured text beats spatial data | One whole-document Markdown string at `response['output']['markdown']` |
115124
| `spatial` (default) | Form/invoice extraction, layout reconstruction, flows that need per-element confidence | Flat list of typed elements at `response['output']['elements']` |
116125

126+
Spatial output requires an OCR-capable mode (`structure`, `understand`, or
127+
`agentic`); `mode='text'` is markdown-only and the client rejects the
128+
`text` + `spatial` combination before the request goes out.
129+
117130
### Quick start
118131

119132
```python
120133
import asyncio
121134
from nutrient_dws import NutrientClient
122135

123136
async def main():
124-
client = NutrientClient(api_key='your_api_key')
137+
client = NutrientClient(
138+
api_key='your_processor_key',
139+
extract_api_key='your_extract_key',
140+
)
125141

126142
# Spatial elements (default) — paragraphs, tables, formulas, pictures, etc.
127143
response = await client.parse('contract.pdf', mode='understand')

src/nutrient_dws/client.py

Lines changed: 61 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -117,26 +117,47 @@ async def get_token():
117117
118118
client = NutrientClient(api_key=get_token)
119119
```
120+
121+
Data Extraction requires a separate DWS Extract API key — supply it
122+
alongside the Processor key:
123+
124+
```python
125+
client = NutrientClient(
126+
api_key='your_processor_key',
127+
extract_api_key='your_extract_key',
128+
)
129+
```
120130
"""
121131

122132
def __init__(
123133
self,
124134
api_key: str | Callable[[], str | Awaitable[str]],
125135
base_url: str | None = None,
126136
timeout: int | None = None,
137+
extract_api_key: str | Callable[[], str | Awaitable[str]] | None = None,
127138
) -> None:
128139
"""Create a new NutrientClient instance.
129140
130141
Args:
131-
api_key: API key or API key getter
142+
api_key: API key or API key getter for the DWS Processor product
143+
(used by every method except `parse()`).
132144
base_url: DWS Base url
133145
timeout: DWS request timeout
146+
extract_api_key: Optional API key or getter for the DWS Extract
147+
product. Used by `parse()` because DWS Extract is a
148+
separate product with its own credit pool and API key — using
149+
the Processor key will return 403. If omitted, `parse()`
150+
falls back to `api_key`, which works once DWS rolls out
151+
global API keys.
134152
135153
Raises:
136154
ValidationError: If options are invalid
137155
"""
138156
options = NutrientClientOptions(
139-
apiKey=api_key, baseUrl=base_url, timeout=timeout
157+
apiKey=api_key,
158+
baseUrl=base_url,
159+
timeout=timeout,
160+
extractApiKey=extract_api_key,
140161
)
141162
self._validate_options(options)
142163
self.options = options
@@ -166,6 +187,14 @@ def _validate_options(self, options: NutrientClientOptions) -> None:
166187
if base_url is not None and not isinstance(base_url, str):
167188
raise ValidationError("Base URL must be a string")
168189

190+
extract_api_key = options.get("extractApiKey")
191+
if extract_api_key is not None and not (
192+
isinstance(extract_api_key, str) or callable(extract_api_key)
193+
):
194+
raise ValidationError(
195+
"Extract API key must be a string or a function that returns a string"
196+
)
197+
169198
async def get_account_info(self) -> AccountInfo:
170199
"""Get account information for the current API key.
171200
@@ -784,6 +813,11 @@ async def parse(
784813
See the README's Data Extraction section for worked recipes (RAG
785814
ingestion, form extraction) and per-mode positioning.
786815
816+
DWS Extract is a separate product from DWS Processor and uses its own
817+
API key. Pass it via `NutrientClient(extract_api_key=...)`. If omitted,
818+
the method falls back to the main `api_key`, which only succeeds when
819+
the key is a global DWS key.
820+
787821
The Data Extraction API is billed against **extraction credits**, which
788822
are a separate billing bucket from the **processor API credits**
789823
consumed by `/build`, `/sign`, OCR, and other Processor API endpoints.
@@ -803,7 +837,9 @@ async def parse(
803837
804838
- `spatial` (default): `output.elements` — typed elements (paragraph,
805839
table, formula, picture, keyValueRegion, handwriting) with bounds,
806-
confidence, and reading order.
840+
confidence, and reading order. Requires an OCR-capable mode
841+
(`structure`, `understand`, or `agentic`); `text` mode does not
842+
produce spatial output.
807843
- `markdown`: `output.markdown` — a whole-document Markdown string,
808844
well suited for RAG / search indexing pipelines.
809845
@@ -822,12 +858,17 @@ async def parse(
822858
to `"structure"`.
823859
output_format: Output shape — `"spatial"` for typed elements or
824860
`"markdown"` for a Markdown document. Defaults to
825-
`"spatial"`.
861+
`"spatial"`. `mode="text"` is incompatible with
862+
`output_format="spatial"`.
826863
827864
Returns:
828865
The full parse response envelope, including `output`, `metrics`,
829866
`usage` (the extraction-credit accounting), and `configuration`.
830867
868+
Raises:
869+
ValidationError: If `mode="text"` is combined with
870+
`output_format="spatial"`.
871+
831872
Example:
832873
```python
833874
# Spatial elements with full layout analysis (9 extraction credits / page)
@@ -848,6 +889,13 @@ async def parse(
848889
f"(remaining: {usage['remainingCredits']})")
849890
```
850891
"""
892+
if mode == "text" and output_format == "spatial":
893+
raise ValidationError(
894+
"mode='text' is not supported with output_format='spatial'. "
895+
"Use output_format='markdown', or choose mode='structure' / "
896+
"'understand' / 'agentic' for spatial elements."
897+
)
898+
851899
# Multipart-only endpoint; only local file inputs are supported.
852900
normalized_file = await process_file_input(file)
853901

@@ -861,14 +909,22 @@ async def parse(
861909
"instructions": instructions,
862910
}
863911

912+
# DWS Extract uses a separate API key. Route the request via a
913+
# per-call options copy so the rest of the client (which talks to
914+
# the Processor API) keeps using the main key.
915+
parse_options = self.options.copy()
916+
extract_key = parse_options.get("extractApiKey")
917+
if extract_key is not None:
918+
parse_options["apiKey"] = extract_key
919+
864920
response: Any = await send_request(
865921
{
866922
"method": "POST",
867923
"endpoint": "/extraction/parse",
868924
"data": request_data,
869925
"headers": None,
870926
},
871-
self.options,
927+
parse_options,
872928
)
873929
return cast("ParseResponse", response["data"])
874930

src/nutrient_dws/http.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,9 @@ class NutrientClientOptions(TypedDict):
190190
apiKey: str | Callable[[], str | Awaitable[str]]
191191
baseUrl: str | None
192192
timeout: int | None
193+
# DWS Extract is a separate product with its own API key; parse() prefers
194+
# this when set, otherwise falls back to apiKey.
195+
extractApiKey: NotRequired[str | Callable[[], str | Awaitable[str]] | None]
193196

194197

195198
async def resolve_api_key(api_key: str | Callable[[], str | Awaitable[str]]) -> str:

tests/conftest.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
from unittest.mock import AsyncMock
22

33
import pytest
4+
45
from nutrient_dws import NutrientClient
56
from tests.helpers import TestDocumentGenerator
67

8+
79
@pytest.fixture
810
def mock_workflow_instance():
911
"""Create a mock workflow instance for testing."""
@@ -40,7 +42,12 @@ def mock_workflow_instance():
4042
@pytest.fixture
4143
def valid_client_options():
4244
"""Valid client options for testing."""
43-
return {"apiKey": "test-api-key", "baseUrl": "https://api.test.com/v1", "timeout": None}
45+
return {
46+
"apiKey": "test-api-key",
47+
"baseUrl": "https://api.test.com/v1",
48+
"timeout": None,
49+
"extractApiKey": None,
50+
}
4451

4552
@pytest.fixture
4653
def unit_client():

tests/unit/test_client.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,12 @@ def test_create_workflow_instance(
6565

6666
@patch("nutrient_dws.client.StagedWorkflowBuilder")
6767
def test_pass_client_options_to_workflow(self, mock_staged_workflow_builder):
68-
custom_options = {"apiKey": "custom-key", "baseUrl": "https://custom.api.com", "timeout": None}
68+
custom_options = {
69+
"apiKey": "custom-key",
70+
"baseUrl": "https://custom.api.com",
71+
"timeout": None,
72+
"extractApiKey": None,
73+
}
6974
client = NutrientClient(api_key=custom_options["apiKey"], base_url=custom_options["baseUrl"])
7075

7176
client.workflow()

tests/unit/test_parse.py

Lines changed: 98 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,97 @@ def test_prepare_request_body_omits_instructions_when_absent(self) -> None:
167167
assert "data" not in prepared
168168

169169

170+
class TestParseClientSideValidation:
171+
"""Combinations rejected before any network round-trip."""
172+
173+
@pytest.mark.asyncio
174+
async def test_text_mode_with_spatial_output_raises(
175+
self, parse_client: NutrientClient
176+
) -> None:
177+
with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send:
178+
with pytest.raises(ValidationError, match="mode='text'"):
179+
await parse_client.parse(
180+
b"%PDF-1.7", mode="text", output_format="spatial"
181+
)
182+
send.assert_not_called()
183+
184+
185+
class TestParseApiKeyRouting:
186+
"""`/extraction/parse` is served by DWS Extract, which uses a separate
187+
API key from DWS Processor. When `extract_api_key` is set we route via
188+
that key; otherwise we fall back to the main `api_key`.
189+
"""
190+
191+
@pytest.mark.asyncio
192+
async def test_parse_uses_extract_api_key_when_set(self) -> None:
193+
client = NutrientClient(
194+
api_key="processor-key",
195+
extract_api_key="extract-key",
196+
)
197+
with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send:
198+
send.return_value = _make_response(
199+
{"status": 200, "requestId": "r", "output": {"elements": []}}
200+
)
201+
202+
await client.parse(b"%PDF-1.7", mode="structure")
203+
204+
sent_options = send.call_args[0][1]
205+
206+
assert sent_options["apiKey"] == "extract-key"
207+
# The client's own options are untouched — other methods still see the
208+
# processor key.
209+
assert client.options["apiKey"] == "processor-key"
210+
211+
@pytest.mark.asyncio
212+
async def test_parse_falls_back_to_main_api_key_when_extract_key_unset(
213+
self, parse_client: NutrientClient
214+
) -> None:
215+
with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send:
216+
send.return_value = _make_response(
217+
{"status": 200, "requestId": "r", "output": {"elements": []}}
218+
)
219+
220+
await parse_client.parse(b"%PDF-1.7", mode="structure")
221+
222+
sent_options = send.call_args[0][1]
223+
224+
assert sent_options["apiKey"] == "pdf_test_unit"
225+
226+
@pytest.mark.asyncio
227+
async def test_non_parse_methods_keep_processor_key(self) -> None:
228+
"""A sibling endpoint (`/account/info`) must not see the Extract
229+
key — only `parse()` swaps.
230+
"""
231+
client = NutrientClient(
232+
api_key="processor-key",
233+
extract_api_key="extract-key",
234+
)
235+
with patch("nutrient_dws.client.send_request", new_callable=AsyncMock) as send:
236+
send.return_value = _make_response({"subscriptionType": "live"})
237+
238+
await client.get_account_info()
239+
240+
sent_options = send.call_args[0][1]
241+
242+
assert sent_options["apiKey"] == "processor-key"
243+
244+
def test_invalid_extract_api_key_type_raises(self) -> None:
245+
with pytest.raises(
246+
ValidationError,
247+
match="Extract API key must be a string or a function that returns a string",
248+
):
249+
NutrientClient(api_key="processor-key", extract_api_key=123) # type: ignore[arg-type]
250+
251+
def test_async_extract_api_key_callable_accepted(self) -> None:
252+
async def get_extract_key() -> str:
253+
return "async-extract-key"
254+
255+
client = NutrientClient(
256+
api_key="processor-key", extract_api_key=get_extract_key
257+
)
258+
assert callable(client.options["extractApiKey"])
259+
260+
170261
class TestParseResponseHandling:
171262
"""Verify the client returns the raw response envelope to the caller."""
172263

@@ -315,7 +406,9 @@ async def test_authentication_error_propagates(
315406
)
316407

317408
with pytest.raises(AuthenticationError) as exc_info:
318-
await parse_client.parse(b"%PDF-1.7", mode="text")
409+
await parse_client.parse(
410+
b"%PDF-1.7", mode="text", output_format="markdown"
411+
)
319412

320413
assert exc_info.value.status_code == 401
321414
assert (exc_info.value.details or {}).get("requestId") == "req_e_401"
@@ -342,7 +435,10 @@ async def test_validation_error_propagates(
342435

343436
with pytest.raises(ValidationError) as exc_info:
344437
await parse_client.parse(
345-
b"%PDF-1.7", mode="text" # mode is fine; server-side fail
438+
# client-side validation passes; failure is the mocked server response
439+
b"%PDF-1.7",
440+
mode="text",
441+
output_format="markdown",
346442
)
347443

348444
details = exc_info.value.details or {}

0 commit comments

Comments
 (0)