feat: Update v1beta1 sdk to support llmparser in import file functions

PiperOrigin-RevId: 708932562

vertex-sdk-bot authored and copybara-github committed Dec 23, 2024
1 parent 2224c83 commit 1eb493b
Showing 6 changed files with 160 additions and 2 deletions.
24 changes: 24 additions & 0 deletions tests/unit/vertex_rag/test_rag_constants_preview.py
@@ -23,6 +23,7 @@
Filter,
HybridSearch,
LayoutParserConfig,
LlmParserConfig,
LlmRanker,
Pinecone,
RagCorpus,
@@ -623,6 +624,12 @@
max_parsing_requests_per_min=100,
)

TEST_LLM_PARSER_CONFIG = LlmParserConfig(
model_name="gemini-1.5-pro-002",
max_parsing_requests_per_min=500,
custom_parsing_prompt="test-custom-parsing-prompt",
)

TEST_IMPORT_FILES_CONFIG_SHARE_POINT_SOURCE_NO_FOLDERS = ImportRagFilesConfig(
rag_file_transformation_config=TEST_RAG_FILE_TRANSFORMATION_CONFIG,
share_point_sources=GapicSharePointSources(
@@ -677,6 +684,23 @@
import_rag_files_config=TEST_IMPORT_FILES_CONFIG_LAYOUT_PARSER_WITH_PROCESSOR_VERSION_PATH,
)

TEST_IMPORT_FILES_CONFIG_LLM_PARSER = ImportRagFilesConfig(
TEST_IMPORT_FILES_CONFIG_DRIVE_FOLDER
)

TEST_IMPORT_FILES_CONFIG_LLM_PARSER.rag_file_parsing_config = RagFileParsingConfig(
llm_parser=RagFileParsingConfig.LlmParser(
model_name="gemini-1.5-pro-002",
max_parsing_requests_per_min=500,
custom_parsing_prompt="test-custom-parsing-prompt",
)
)

TEST_IMPORT_REQUEST_LLM_PARSER = ImportRagFilesRequest(
parent=TEST_RAG_CORPUS_RESOURCE_NAME,
import_rag_files_config=TEST_IMPORT_FILES_CONFIG_LLM_PARSER,
)

# Retrieval
TEST_QUERY_TEXT = "What happen to the fox and the dog?"
TEST_CONTEXTS = RagContexts(
64 changes: 64 additions & 0 deletions tests/unit/vertex_rag/test_rag_data_preview.py
@@ -1156,6 +1156,70 @@ async def test_advanced_pdf_parsing_and_layout_parser_both_set_error_async(self)
"passed in at a time"
)

def test_prepare_import_files_request_llm_parser(self):
request = prepare_import_files_request(
corpus_name=test_rag_constants_preview.TEST_RAG_CORPUS_RESOURCE_NAME,
paths=[test_rag_constants_preview.TEST_DRIVE_FOLDER],
transformation_config=create_transformation_config(),
llm_parser=test_rag_constants_preview.TEST_LLM_PARSER_CONFIG,
)
import_files_request_eq(
request,
test_rag_constants_preview.TEST_IMPORT_REQUEST_LLM_PARSER,
)

def test_advanced_pdf_parsing_and_llm_parser_both_set_error(self):
with pytest.raises(ValueError) as e:
rag.import_files(
corpus_name=test_rag_constants_preview.TEST_RAG_CORPUS_RESOURCE_NAME,
paths=[test_rag_constants_preview.TEST_DRIVE_FOLDER],
transformation_config=create_transformation_config(),
use_advanced_pdf_parsing=True,
llm_parser=test_rag_constants_preview.TEST_LLM_PARSER_CONFIG,
)
e.match(
"Only one of use_advanced_pdf_parsing or llm_parser may be "
"passed in at a time"
)

def test_layout_parser_and_llm_parser_both_set_error(self):
with pytest.raises(ValueError) as e:
rag.import_files(
corpus_name=test_rag_constants_preview.TEST_RAG_CORPUS_RESOURCE_NAME,
paths=[test_rag_constants_preview.TEST_DRIVE_FOLDER],
transformation_config=create_transformation_config(),
layout_parser=test_rag_constants_preview.TEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH_CONFIG,
llm_parser=test_rag_constants_preview.TEST_LLM_PARSER_CONFIG,
)
e.match("Only one of layout_parser or llm_parser may be passed in at a time")

@pytest.mark.asyncio
async def test_advanced_pdf_parsing_and_llm_parser_both_set_error_async(self):
with pytest.raises(ValueError) as e:
await rag.import_files_async(
corpus_name=test_rag_constants_preview.TEST_RAG_CORPUS_RESOURCE_NAME,
paths=[test_rag_constants_preview.TEST_DRIVE_FOLDER],
transformation_config=create_transformation_config(),
use_advanced_pdf_parsing=True,
llm_parser=test_rag_constants_preview.TEST_LLM_PARSER_CONFIG,
)
e.match(
"Only one of use_advanced_pdf_parsing or llm_parser may be "
"passed in at a time"
)

@pytest.mark.asyncio
async def test_layout_parser_and_llm_parser_both_set_error_async(self):
with pytest.raises(ValueError) as e:
await rag.import_files_async(
corpus_name=test_rag_constants_preview.TEST_RAG_CORPUS_RESOURCE_NAME,
paths=[test_rag_constants_preview.TEST_DRIVE_FOLDER],
transformation_config=create_transformation_config(),
layout_parser=test_rag_constants_preview.TEST_LAYOUT_PARSER_WITH_PROCESSOR_PATH_CONFIG,
llm_parser=test_rag_constants_preview.TEST_LLM_PARSER_CONFIG,
)
e.match("Only one of layout_parser or llm_parser may be passed in at a time")

def test_set_embedding_model_config_set_both_error(self):
embedding_model_config = rag.EmbeddingModelConfig(
publisher_model="whatever",
2 changes: 2 additions & 0 deletions vertexai/preview/rag/__init__.py
@@ -41,6 +41,7 @@
JiraQuery,
JiraSource,
LayoutParserConfig,
LlmParserConfig,
LlmRanker,
Pinecone,
RagCorpus,
@@ -72,6 +73,7 @@
"JiraQuery",
"JiraSource",
"LayoutParserConfig",
"LlmParserConfig",
"LlmRanker",
"Pinecone",
"RagCorpus",
33 changes: 31 additions & 2 deletions vertexai/preview/rag/rag_data.py
@@ -46,6 +46,7 @@
EmbeddingModelConfig,
JiraSource,
LayoutParserConfig,
LlmParserConfig,
Pinecone,
RagCorpus,
RagFile,
@@ -475,6 +476,7 @@ def import_files(
use_advanced_pdf_parsing: Optional[bool] = False,
partial_failures_sink: Optional[str] = None,
layout_parser: Optional[LayoutParserConfig] = None,
llm_parser: Optional[LlmParserConfig] = None,
) -> ImportRagFilesResponse:
"""
Import files to an existing RagCorpus, wait until completion.
@@ -592,7 +594,10 @@
to the table.
layout_parser: Configuration for the Document AI Layout Parser Processor
to use for document parsing. Optional.
If not None, `use_advanced_pdf_parsing` must be False.
If not None, the other parser configs must be None.
llm_parser: Configuration for the LLM Parser to use for document parsing.
Optional.
If not None, the other parser configs must be None.
Returns:
ImportRagFilesResponse.
"""
@@ -605,6 +610,15 @@
"Only one of use_advanced_pdf_parsing or layout_parser may be "
"passed in at a time"
)
if use_advanced_pdf_parsing and llm_parser is not None:
raise ValueError(
"Only one of use_advanced_pdf_parsing or llm_parser may be "
"passed in at a time"
)
if layout_parser is not None and llm_parser is not None:
raise ValueError(
"Only one of layout_parser or llm_parser may be passed in at a time"
)
corpus_name = _gapic_utils.get_corpus_name(corpus_name)
request = _gapic_utils.prepare_import_files_request(
corpus_name=corpus_name,
@@ -617,6 +631,7 @@
use_advanced_pdf_parsing=use_advanced_pdf_parsing,
partial_failures_sink=partial_failures_sink,
layout_parser=layout_parser,
llm_parser=llm_parser,
)
client = _gapic_utils.create_rag_data_service_client()
try:
@@ -638,6 +653,7 @@ async def import_files_async(
use_advanced_pdf_parsing: Optional[bool] = False,
partial_failures_sink: Optional[str] = None,
layout_parser: Optional[LayoutParserConfig] = None,
llm_parser: Optional[LlmParserConfig] = None,
) -> operation_async.AsyncOperation:
"""
Import files to an existing RagCorpus asynchronously.
@@ -755,7 +771,10 @@ async def import_files_async(
to the table.
layout_parser: Configuration for the Document AI Layout Parser Processor
to use for document parsing. Optional.
If not None, `use_advanced_pdf_parsing` must be False.
If not None, the other parser configs must be None.
llm_parser: Configuration for the LLM Parser to use for document parsing.
Optional.
If not None, the other parser configs must be None.
Returns:
operation_async.AsyncOperation.
"""
@@ -768,6 +787,15 @@
"Only one of use_advanced_pdf_parsing or layout_parser may be "
"passed in at a time"
)
if use_advanced_pdf_parsing and llm_parser is not None:
raise ValueError(
"Only one of use_advanced_pdf_parsing or llm_parser may be "
"passed in at a time"
)
if layout_parser is not None and llm_parser is not None:
raise ValueError(
"Only one of layout_parser or llm_parser may be passed in at a time"
)
corpus_name = _gapic_utils.get_corpus_name(corpus_name)
request = _gapic_utils.prepare_import_files_request(
corpus_name=corpus_name,
@@ -780,6 +808,7 @@
use_advanced_pdf_parsing=use_advanced_pdf_parsing,
partial_failures_sink=partial_failures_sink,
layout_parser=layout_parser,
llm_parser=llm_parser,
)
async_client = _gapic_utils.create_rag_data_service_async_client()
try:
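The guards added above make `use_advanced_pdf_parsing`, `layout_parser`, and `llm_parser` mutually exclusive in both the sync and async entry points. A minimal sketch of what a caller sees when two parsers are combined; the corpus name, folder URL, processor name, and model name below are placeholders, not values taken from this commit:

from vertexai.preview import rag

layout_parser = rag.LayoutParserConfig(
    # Hypothetical Document AI processor resource name.
    processor_name="projects/my-project/locations/us-central1/processors/abc123",
)
llm_parser = rag.LlmParserConfig(
    # Hypothetical model resource name, following the format documented in LlmParserConfig below.
    model_name=(
        "projects/my-project/locations/us-central1/"
        "publishers/google/models/gemini-1.5-pro-002"
    ),
)

try:
    rag.import_files(
        corpus_name="projects/my-project/locations/us-central1/ragCorpora/123",
        paths=["https://drive.google.com/drive/folders/my-folder-id"],
        layout_parser=layout_parser,
        llm_parser=llm_parser,  # combining two parsers trips the new validation
    )
except ValueError as err:
    print(err)  # "Only one of layout_parser or llm_parser may be passed in at a time"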
15 changes: 15 additions & 0 deletions vertexai/preview/rag/utils/_gapic_utils.py
@@ -43,6 +43,7 @@
EmbeddingModelConfig,
VertexPredictionEndpoint,
LayoutParserConfig,
LlmParserConfig,
Pinecone,
RagCorpus,
RagFile,
@@ -450,6 +451,7 @@ def prepare_import_files_request(
use_advanced_pdf_parsing: bool = False,
partial_failures_sink: Optional[str] = None,
layout_parser: Optional[LayoutParserConfig] = None,
llm_parser: Optional[LlmParserConfig] = None,
) -> ImportRagFilesRequest:
if len(corpus_name.split("/")) != 6:
raise ValueError(
@@ -479,6 +481,19 @@
processor_name=layout_parser.processor_name,
max_parsing_requests_per_min=layout_parser.max_parsing_requests_per_min,
)
if llm_parser is not None:
rag_file_parsing_config.llm_parser = RagFileParsingConfig.LlmParser(
model_name=llm_parser.model_name
)
if llm_parser.max_parsing_requests_per_min is not None:
rag_file_parsing_config.llm_parser.max_parsing_requests_per_min = (
llm_parser.max_parsing_requests_per_min
)
if llm_parser.custom_parsing_prompt is not None:
rag_file_parsing_config.llm_parser.custom_parsing_prompt = (
llm_parser.custom_parsing_prompt
)

local_chunk_size = chunk_size
local_chunk_overlap = chunk_overlap
if transformation_config and transformation_config.chunking_config:
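To illustrate the conditional assignments above, here is a sketch, assuming `RagFileParsingConfig` is importable from `google.cloud.aiplatform_v1beta1` as the GAPIC layer exposes it: optional `LlmParser` fields the caller never supplied are simply left unset, so they read back as proto defaults rather than being assigned `None`.

from google.cloud.aiplatform_v1beta1 import RagFileParsingConfig

# Minimal case: the user's LlmParserConfig only carried a model name,
# so the helper sets nothing beyond llm_parser.model_name.
parsing_config = RagFileParsingConfig(
    llm_parser=RagFileParsingConfig.LlmParser(model_name="gemini-1.5-pro-002")
)

print(parsing_config.llm_parser.model_name)                    # gemini-1.5-pro-002
print(parsing_config.llm_parser.max_parsing_requests_per_min)  # 0 (proto default)
print(parsing_config.llm_parser.custom_parsing_prompt)         # "" (proto default)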
24 changes: 24 additions & 0 deletions vertexai/preview/rag/utils/resources.py
@@ -500,3 +500,27 @@ class LayoutParserConfig:

processor_name: str
max_parsing_requests_per_min: Optional[int] = None


@dataclasses.dataclass
class LlmParserConfig:
"""Configuration for the Document AI Layout Parser Processor.
Attributes:
model_name (str):
The full resource name of a Vertex AI model. Format:
- `projects/{project_id}/locations/{location}/publishers/google/models/{model_id}`
- `projects/{project_id}/locations/{location}/models/{model_id}`
max_parsing_requests_per_min (int):
The maximum number of requests the job is allowed to make to the
Vertex AI model per minute. Consult
https://cloud.google.com/vertex-ai/generative-ai/docs/quotas and
the Quota page for your project to set an appropriate value here.
If unspecified, a default value of 120 QPM will be used.
custom_parsing_prompt (str):
A custom prompt to use for parsing.
"""

model_name: str
max_parsing_requests_per_min: Optional[int] = None
custom_parsing_prompt: Optional[str] = None
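Putting the pieces together, the new LLM-parser path can be exercised roughly as follows. This is a minimal sketch; the project, location, corpus, Drive folder, and prompt values are placeholders rather than anything from this commit:

import vertexai
from vertexai.preview import rag

# Hypothetical project and location, for illustration only.
vertexai.init(project="my-project", location="us-central1")

llm_parser = rag.LlmParserConfig(
    # Full model resource name, per the model_name format documented above.
    model_name=(
        "projects/my-project/locations/us-central1/"
        "publishers/google/models/gemini-1.5-pro-002"
    ),
    max_parsing_requests_per_min=500,
    custom_parsing_prompt="Summarize each table before extracting its rows.",
)

response = rag.import_files(
    corpus_name="projects/my-project/locations/us-central1/ragCorpora/123",
    paths=["https://drive.google.com/drive/folders/my-folder-id"],
    llm_parser=llm_parser,  # mutually exclusive with the other parser options
)
print(response.imported_rag_files_count)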
