diff --git a/CHANGELOG.md b/CHANGELOG.md index b5b762d7f6..f67b64a5ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,9 @@ +## 0.18.20 + +### Enhancement +- Improve the VoyageAI integration +- Add voyage-context-3 support + ## 0.18.19-dev0 ### Enhancement diff --git a/test_unstructured/embed/test_voyageai.py b/test_unstructured/embed/test_voyageai.py index f0a24bde7c..1273d9b283 100644 --- a/test_unstructured/embed/test_voyageai.py +++ b/test_unstructured/embed/test_voyageai.py @@ -10,6 +10,7 @@ def test_embed_documents_does_not_break_element_to_dict(mocker): embed_response.embeddings = [[1], [2]] mock_client = mocker.MagicMock() mock_client.embed.return_value = embed_response + mock_client.tokenize.return_value = [[1], [1]] # Mock token counts # Mock get_client to return our mock_client mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=mock_client) @@ -23,3 +24,219 @@ def test_embed_documents_does_not_break_element_to_dict(mocker): assert len(elements) == 2 assert elements[0].to_dict()["text"] == "This is sentence 1" assert elements[1].to_dict()["text"] == "This is sentence 2" + + +def test_embed_documents_voyage_3_5(mocker): + """Test embedding with voyage-3.5 model.""" + embed_response = Mock() + embed_response.embeddings = [[1.0] * 1024, [2.0] * 1024] + mock_client = mocker.MagicMock() + mock_client.embed.return_value = embed_response + mock_client.tokenize.return_value = [[1, 2, 3], [1, 2]] # Mock token counts + + mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=mock_client) + + encoder = VoyageAIEmbeddingEncoder( + config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-3.5") + ) + elements = encoder.embed_documents( + elements=[Text("Test document 1"), Text("Test document 2")], + ) + assert len(elements) == 2 + assert len(elements[0].embeddings) == 1024 + assert len(elements[1].embeddings) == 1024 + + +def test_embed_documents_voyage_3_5_lite(mocker): + """Test embedding with voyage-3.5-lite model.""" + embed_response = Mock() + embed_response.embeddings = [[1.0] * 512, [2.0] * 512, [3.0] * 512] + mock_client = mocker.MagicMock() + mock_client.embed.return_value = embed_response + mock_client.tokenize.return_value = [[1], [1], [1]] # Mock token counts + + mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=mock_client) + + encoder = VoyageAIEmbeddingEncoder( + config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-3.5-lite") + ) + elements = encoder.embed_documents( + elements=[Text("Test 1"), Text("Test 2"), Text("Test 3")], + ) + assert len(elements) == 3 + assert all(len(e.embeddings) == 512 for e in elements) + + +def test_embed_documents_contextual_model(mocker): + """Test embedding with voyage-context-3 model.""" + # Mock contextualized_embed response + contextualized_response = Mock() + result_item = Mock() + result_item.embeddings = [[1.0] * 1024, [2.0] * 1024] + contextualized_response.results = [result_item] + + mock_client = mocker.MagicMock() + mock_client.contextualized_embed.return_value = contextualized_response + mock_client.tokenize.return_value = [[1, 2], [1, 2, 3]] # Mock token counts + + mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=mock_client) + + encoder = VoyageAIEmbeddingEncoder( + config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-context-3") + ) + elements = encoder.embed_documents( + elements=[Text("Context document 1"), Text("Context document 2")], + ) + assert len(elements) == 2 + assert len(elements[0].embeddings) == 1024 + assert len(elements[1].embeddings) == 1024 + # Verify contextualized_embed was called + mock_client.contextualized_embed.assert_called_once() + + +def test_count_tokens(mocker): + """Test token counting functionality.""" + mock_client = mocker.MagicMock() + mock_client.tokenize.return_value = [[1, 2], [1, 2, 3, 4, 5]] # Different token counts + + mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=mock_client) + + encoder = VoyageAIEmbeddingEncoder( + config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-3.5") + ) + texts = ["short text", "this is a longer text with more tokens"] + token_counts = encoder.count_tokens(texts) + + assert len(token_counts) == 2 + assert token_counts[0] == 2 + assert token_counts[1] == 5 + + +def test_count_tokens_empty_list(mocker): + """Test token counting with empty list.""" + mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=mocker.MagicMock()) + + encoder = VoyageAIEmbeddingEncoder( + config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-3.5") + ) + token_counts = encoder.count_tokens([]) + assert token_counts == [] + + +def test_get_token_limit(mocker): + """Test getting token limit for different models.""" + mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=mocker.MagicMock()) + + # Test voyage-3.5 model + config = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-3.5") + assert config.get_token_limit() == 320_000 + + # Test voyage-3.5-lite model + config_lite = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-3.5-lite") + assert config_lite.get_token_limit() == 1_000_000 + + # Test context model + config_context = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-context-3") + assert config_context.get_token_limit() == 32_000 + + # Test voyage-2 model + config_v2 = VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-2") + assert config_v2.get_token_limit() == 320_000 + + # Test unknown model (should use default) + config_unknown = VoyageAIEmbeddingConfig(api_key="api_key", model_name="unknown-model") + assert config_unknown.get_token_limit() == 120_000 + + +def test_is_context_model(mocker): + """Test the _is_context_model helper method.""" + mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=mocker.MagicMock()) + + # Test with context model + encoder_context = VoyageAIEmbeddingEncoder( + config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-context-3") + ) + assert encoder_context._is_context_model() is True + + # Test with regular model + encoder_regular = VoyageAIEmbeddingEncoder( + config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-3.5") + ) + assert encoder_regular._is_context_model() is False + + +def test_build_batches_with_token_limits(mocker): + """Test that batching respects token limits.""" + mock_client = mocker.MagicMock() + # Simulate different token counts for each text + mock_client.tokenize.return_value = [[1] * 10, [1] * 20, [1] * 15, [1] * 25] + + mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=mock_client) + + encoder = VoyageAIEmbeddingEncoder( + config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-2") + ) + texts = ["text1", "text2", "text3", "text4"] + batches = list(encoder._build_batches(texts, mock_client)) + + # Should create at least one batch + assert len(batches) >= 1 + # Total texts should be preserved + total_texts = sum(len(batch) for batch in batches) + assert total_texts == len(texts) + + +def test_embed_query(mocker): + """Test embedding a single query.""" + embed_response = Mock() + embed_response.embeddings = [[1.0] * 1024] + mock_client = mocker.MagicMock() + mock_client.embed.return_value = embed_response + + mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=mock_client) + + encoder = VoyageAIEmbeddingEncoder( + config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-3.5") + ) + embedding = encoder.embed_query("test query") + + assert len(embedding) == 1024 + # Verify embed was called with input_type="query" + mock_client.embed.assert_called_once() + call_kwargs = mock_client.embed.call_args[1] + assert call_kwargs["input_type"] == "query" + + +def test_embed_documents_with_output_dimension(mocker): + """Test embedding with custom output dimension.""" + embed_response = Mock() + embed_response.embeddings = [[1.0] * 512, [2.0] * 512] + mock_client = mocker.MagicMock() + mock_client.embed.return_value = embed_response + mock_client.tokenize.return_value = [[1], [1]] + + mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=mock_client) + + encoder = VoyageAIEmbeddingEncoder( + config=VoyageAIEmbeddingConfig( + api_key="api_key", model_name="voyage-3.5", output_dimension=512 + ) + ) + elements = encoder.embed_documents( + elements=[Text("Test 1"), Text("Test 2")], + ) + assert len(elements) == 2 + # Verify output_dimension was passed + call_kwargs = mock_client.embed.call_args[1] + assert call_kwargs["output_dimension"] == 512 + + +def test_embed_documents_empty_list(mocker): + """Test embedding empty list of documents.""" + mocker.patch.object(VoyageAIEmbeddingConfig, "get_client", return_value=mocker.MagicMock()) + + encoder = VoyageAIEmbeddingEncoder( + config=VoyageAIEmbeddingConfig(api_key="api_key", model_name="voyage-3.5") + ) + elements = encoder.embed_documents(elements=[]) + assert elements == [] diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 15c3dcdee3..cc6c3b30a5 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.19-dev0" # pragma: no cover +__version__ = "0.18.20" # pragma: no cover diff --git a/unstructured/embed/voyageai.py b/unstructured/embed/voyageai.py index 35d231884d..8a496a68a3 100644 --- a/unstructured/embed/voyageai.py +++ b/unstructured/embed/voyageai.py @@ -1,5 +1,5 @@ from dataclasses import dataclass -from typing import TYPE_CHECKING, Iterable, List, Optional, cast +from typing import TYPE_CHECKING, Iterable, List, Optional import numpy as np from pydantic import Field, SecretStr @@ -11,10 +11,29 @@ if TYPE_CHECKING: from voyageai import Client -DEFAULT_VOYAGE_2_BATCH_SIZE = 72 -DEFAULT_VOYAGE_3_LITE_BATCH_SIZE = 30 -DEFAULT_VOYAGE_3_BATCH_SIZE = 10 -DEFAULT_BATCH_SIZE = 7 +# Token limits for different VoyageAI models +VOYAGE_TOTAL_TOKEN_LIMITS = { + "voyage-context-3": 32_000, + "voyage-3.5-lite": 1_000_000, + "voyage-3.5": 320_000, + "voyage-2": 320_000, + "voyage-02": 320_000, + "voyage-3-large": 120_000, + "voyage-code-3": 120_000, + "voyage-large-2-instruct": 120_000, + "voyage-finance-2": 120_000, + "voyage-multilingual-2": 120_000, + "voyage-law-2": 120_000, + "voyage-large-2": 120_000, + "voyage-3": 120_000, + "voyage-3-lite": 120_000, + "voyage-code-2": 120_000, + "voyage-3-m-exp": 120_000, + "voyage-multimodal-3": 120_000, +} + +# Batch size for embedding requests (max documents per batch) +MAX_BATCH_SIZE = 1000 class VoyageAIEmbeddingConfig(EmbeddingConfig): @@ -37,17 +56,9 @@ def get_client(self) -> "Client": api_key=self.api_key.get_secret_value(), ) - def get_batch_size(self): - if self.batch_size is None: - if self.model_name in ["voyage-2", "voyage-02"]: - self.batch_size = DEFAULT_VOYAGE_2_BATCH_SIZE - elif self.model_name == "voyage-3-lite": - self.batch_size = DEFAULT_VOYAGE_3_LITE_BATCH_SIZE - elif self.model_name == "voyage-3": - self.batch_size = DEFAULT_VOYAGE_3_BATCH_SIZE - else: - self.batch_size = DEFAULT_BATCH_SIZE - return self.batch_size + def get_token_limit(self) -> int: + """Get the token limit for the current model.""" + return VOYAGE_TOTAL_TOKEN_LIMITS.get(self.model_name, 120_000) @dataclass @@ -70,53 +81,157 @@ def is_unit_vector(self) -> bool: exemplary_embedding = self.get_exemplary_embedding() return np.isclose(np.linalg.norm(exemplary_embedding), 1.0) - def embed_documents(self, elements: List[Element]) -> List[Element]: - client = self.config.get_client() - embeddings: List[List[float]] = [] - - _iter = self._get_batch_iterator(elements) - for i in _iter: - r = client.embed( - texts=[str(e) for e in elements[i : i + self.config.get_batch_size()]], + def _is_context_model(self) -> bool: + """Check if the model is a contextualized embedding model.""" + return "context" in self.config.model_name + + def _build_batches(self, texts: List[str], client: "Client") -> Iterable[List[str]]: + """ + Generate batches of texts based on token limits. + + Args: + texts: List of texts to batch. + client: VoyageAI client instance to use for tokenization. + + Yields: + Batches of texts as lists. + """ + if not texts: + return + + max_tokens_per_batch = self.config.get_token_limit() + current_batch: List[str] = [] + current_batch_tokens = 0 + + # Tokenize all texts in one API call + all_token_lists = client.tokenize(texts, model=self.config.model_name) + token_counts = [len(tokens) for tokens in all_token_lists] + + for i, text in enumerate(texts): + n_tokens = token_counts[i] + + # Check if adding this text would exceed limits + if current_batch and ( + len(current_batch) >= MAX_BATCH_SIZE + or (current_batch_tokens + n_tokens > max_tokens_per_batch) + ): + # Yield the current batch and start a new one + yield current_batch + current_batch = [] + current_batch_tokens = 0 + + current_batch.append(text) + current_batch_tokens += n_tokens + + # Yield the last batch (always has at least one text) + if current_batch: + yield current_batch + + def _embed_batch( + self, batch: List[str], client: "Client", input_type: str = "document" + ) -> List[List[float]]: + """ + Embed a batch of texts using the appropriate method for the model. + + Args: + batch: List of texts to embed. + client: VoyageAI client instance to use for embedding. + input_type: Type of input ("document" or "query"). + + Returns: + List of embedding vectors. + """ + if self._is_context_model(): + result = client.contextualized_embed( + inputs=[batch], + model=self.config.model_name, + input_type=input_type, + output_dimension=self.config.output_dimension, + ) + return [list(emb) for emb in result.results[0].embeddings] + else: + result = client.embed( + texts=batch, model=self.config.model_name, - input_type="document", + input_type=input_type, truncation=self.config.truncation, output_dimension=self.config.output_dimension, - ).embeddings - embeddings.extend(cast(Iterable[List[float]], r)) - return self._add_embeddings_to_elements(elements, embeddings) + ) + return [list(emb) for emb in result.embeddings] + + def embed_documents(self, elements: List[Element]) -> List[Element]: + """ + Embed documents with automatic batching based on token limits. + + Args: + elements: List of elements to embed. + + Returns: + List of elements with embeddings added. + """ + if not elements: + return [] - def embed_query(self, query: str) -> List[float]: client = self.config.get_client() - return client.embed( - texts=[query], - model=self.config.model_name, - input_type="query", - truncation=self.config.truncation, - output_dimension=self.config.output_dimension, - ).embeddings[0] + texts = [str(e) for e in elements] + all_embeddings: List[List[float]] = [] - @staticmethod - def _add_embeddings_to_elements(elements, embeddings) -> List[Element]: - assert len(elements) == len(embeddings) - elements_w_embedding = [] - for i, element in enumerate(elements): - element.embeddings = embeddings[i] - elements_w_embedding.append(element) - return elements + # Process each batch + batches = list(self._build_batches(texts, client)) - def _get_batch_iterator(self, elements: List[Element]) -> Iterable: if self.config.show_progress_bar: try: from tqdm.auto import tqdm # type: ignore + + batches = tqdm(batches, desc="Embedding batches") except ImportError as e: raise ImportError( "Must have tqdm installed if `show_progress_bar` is set to True. " "Please install with `pip install tqdm`." ) from e - _iter = tqdm(range(0, len(elements), self.config.get_batch_size())) - else: - _iter = range(0, len(elements), self.config.get_batch_size()) # type: ignore + for batch in batches: + batch_embeddings = self._embed_batch(batch, client, input_type="document") + all_embeddings.extend(batch_embeddings) + + return self._add_embeddings_to_elements(elements, all_embeddings) + + def embed_query(self, query: str) -> List[float]: + """ + Embed a single query string. + + Args: + query: Query string to embed. + + Returns: + Embedding vector. + """ + client = self.config.get_client() + batch_embeddings = self._embed_batch([query], client, input_type="query") + return batch_embeddings[0] + + def count_tokens(self, texts: List[str]) -> List[int]: + """ + Count tokens for the given texts. + + Args: + texts: List of texts to count tokens for. + + Returns: + List of token counts for each text. + """ + if not texts: + return [] - return _iter + client = self.config.get_client() + token_lists = client.tokenize(texts, model=self.config.model_name) + return [len(token_list) for token_list in token_lists] + + @staticmethod + def _add_embeddings_to_elements(elements, embeddings) -> List[Element]: + assert len(elements) == len(embeddings) + elements_w_embedding = [] + for i, element in enumerate(elements): + element.embeddings = embeddings[i] + elements_w_embedding.append(element) + return elements