|
1 | 1 | # SPDX-FileCopyrightText: 2023-present deepset GmbH <info@deepset.ai> |
2 | 2 | # |
3 | 3 | # SPDX-License-Identifier: Apache-2.0 |
| 4 | + |
4 | 5 | from copy import copy |
5 | 6 | from typing import Any, Literal, Optional, Union |
6 | 7 |
|
@@ -376,6 +377,165 @@ async def delete_all_documents_async(self) -> None: |
376 | 377 | # Namespace doesn't exist (empty collection), which is fine - nothing to delete |
377 | 378 | logger.debug("Namespace '{namespace}' not found. Nothing to delete.", namespace=self.namespace or "default") |
378 | 379 |
|
@staticmethod
def _update_documents_metadata(documents: list[Document], meta: dict[str, Any]) -> None:
    """
    Merges `meta` into the metadata of every document in `documents`, in place.

    Keys present in `meta` overwrite keys of the same name already stored on a document;
    all other existing keys are left untouched.

    :param documents: Documents whose metadata is modified in place.
    :param meta: Metadata fields to merge into each document's existing metadata.
    """
    for doc in documents:
        current_meta = doc.meta
        if current_meta is None:
            # Give the document an empty metadata dict so there is something to merge into.
            current_meta = doc.meta = {}
        current_meta.update(meta)
| 392 | + |
def delete_by_filter(self, filters: dict[str, Any]) -> int:
    """
    Deletes every document that matches the given filters.

    The deletion is performed client-side: matching documents are first retrieved with
    `filter_documents`, and their IDs are then passed to `delete_documents`.

    :param filters: The filters to apply to select documents for deletion.
        For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
    :returns: The number of documents deleted (0 when nothing matches).
    """
    _validate_filters(filters)

    self._initialize_index()
    assert self._index is not None, "Index is not initialized"

    matches = self.filter_documents(filters=filters)
    if matches:
        ids_to_delete = [match.id for match in matches]
        self.delete_documents(ids_to_delete)

        n_deleted = len(ids_to_delete)
        logger.info(
            "Deleted {n_docs} documents from index '{index}' using filters.",
            n_docs=n_deleted,
            index=self.index_name,
        )
        return n_deleted

    # Nothing matched the filters, so there is nothing to delete.
    return 0
| 425 | + |
async def delete_by_filter_async(self, filters: dict[str, Any]) -> int:
    """
    Asynchronously deletes every document that matches the given filters.

    The deletion is performed client-side: matching documents are first retrieved with
    `filter_documents_async`, and their IDs are then passed to `delete_documents_async`.

    :param filters: The filters to apply to select documents for deletion.
        For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
    :returns: The number of documents deleted (0 when nothing matches).
    """
    _validate_filters(filters)

    await self._initialize_async_index()
    assert self._async_index is not None, "Index is not initialized"

    matches = await self.filter_documents_async(filters=filters)
    if matches:
        ids_to_delete = [match.id for match in matches]
        await self.delete_documents_async(ids_to_delete)

        n_deleted = len(ids_to_delete)
        logger.info(
            "Deleted {n_docs} documents from index '{index}' using filters.",
            n_docs=n_deleted,
            index=self.index_name,
        )
        return n_deleted

    # Nothing matched the filters, so there is nothing to delete.
    return 0
| 458 | + |
def update_by_filter(self, filters: dict[str, Any], meta: dict[str, Any]) -> int:
    """
    Updates the metadata of every document that matches the given filters.

    The update is performed client-side: matching documents are retrieved with
    `filter_documents`, `meta` is merged into their metadata, and the documents are
    written back with the `OVERWRITE` duplicate policy.

    :param filters: The filters to apply to select documents for updating.
        For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
    :param meta: The metadata fields to update. This will be merged with existing metadata.
    :returns: The number of documents updated (0 when nothing matches).
    :raises ValueError: If `meta` is not a dictionary.
    """
    _validate_filters(filters)

    if not isinstance(meta, dict):
        error_message = "meta must be a dictionary"
        raise ValueError(error_message)

    self._initialize_index()
    assert self._index is not None, "Index is not initialized"

    matches = self.filter_documents(filters=filters)
    if matches:
        self._update_documents_metadata(matches, meta)

        # OVERWRITE replaces the stored copies with the freshly merged documents.
        self.write_documents(matches, policy=DuplicatePolicy.OVERWRITE)

        n_updated = len(matches)
        logger.info(
            "Updated {n_docs} documents in index '{index}' using filters.",
            n_docs=n_updated,
            index=self.index_name,
        )
        return n_updated

    # Nothing matched the filters, so there is nothing to update.
    return 0
| 498 | + |
async def update_by_filter_async(self, filters: dict[str, Any], meta: dict[str, Any]) -> int:
    """
    Asynchronously updates the metadata of every document that matches the given filters.

    The update is performed client-side: matching documents are retrieved with
    `filter_documents_async`, `meta` is merged into their metadata, and the documents are
    written back with the `OVERWRITE` duplicate policy.

    :param filters: The filters to apply to select documents for updating.
        For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
    :param meta: The metadata fields to update. This will be merged with existing metadata.
    :returns: The number of documents updated (0 when nothing matches).
    :raises ValueError: If `meta` is not a dictionary.
    """
    _validate_filters(filters)

    if not isinstance(meta, dict):
        error_message = "meta must be a dictionary"
        raise ValueError(error_message)

    await self._initialize_async_index()
    assert self._async_index is not None, "Index is not initialized"

    matches = await self.filter_documents_async(filters=filters)
    if matches:
        self._update_documents_metadata(matches, meta)

        # OVERWRITE replaces the stored copies with the freshly merged documents.
        await self.write_documents_async(matches, policy=DuplicatePolicy.OVERWRITE)

        n_updated = len(matches)
        logger.info(
            "Updated {n_docs} documents in index '{index}' using filters.",
            n_docs=n_updated,
            index=self.index_name,
        )
        return n_updated

    # Nothing matched the filters, so there is nothing to update.
    return 0
| 538 | + |
379 | 539 | def _embedding_retrieval( |
380 | 540 | self, |
381 | 541 | query_embedding: list[float], |
|
0 commit comments