-
Notifications
You must be signed in to change notification settings - Fork 204
feat: adding delete_by_filter and update_by_filter to WeaviateDocumentStore
#2656
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
davidsbatista
merged 15 commits into
main
from
feat/add-update-delete-by-filter-to-Weaviate
Jan 8, 2026
Merged
Changes from all commits
Commits
Show all changes
15 commits
Select commit
Hold shift + click to select a range
793c7a5
adding delete_by_filter and updated_by_filter
davidsbatista a641fd3
formatting
davidsbatista ea4d4de
adding tests
davidsbatista 5ae8c20
cleaning up
davidsbatista b12bf76
fixing tests
davidsbatista dc0ec94
trying to fix tests
davidsbatista 6469748
refactoring to reduce duplicated code + fixing typos
davidsbatista c1bd3e8
reusing query_with_filters or the sync version + removing redundant c…
davidsbatista c092447
adding tests to assert pagination is workin with updates
davidsbatista f9bb772
Merge branch 'main' into feat/add-update-delete-by-filter-to-Weaviate
davidsbatista bc11fbf
removing unused imports
davidsbatista 92770fc
removing unused imports
davidsbatista f7e696d
formatting
davidsbatista ab024b2
fixing tests, docs were not being written before being filtered/updated
davidsbatista c8d6ff9
Merge branch 'main' into feat/add-update-delete-by-filter-to-Weaviate
davidsbatista File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,6 +1,7 @@ | ||
| # SPDX-FileCopyrightText: 2023-present deepset GmbH <[email protected]> | ||
| # | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| import base64 | ||
| import datetime | ||
| import json | ||
|
|
@@ -19,7 +20,7 @@ | |
| from weaviate.embedded import EmbeddedOptions | ||
| from weaviate.util import generate_uuid5 | ||
|
|
||
| from ._filters import convert_filters | ||
| from ._filters import convert_filters, validate_filters | ||
| from .auth import AuthCredentials | ||
|
|
||
| logger = logging.getLogger(__name__) | ||
|
|
@@ -419,7 +420,7 @@ def _query_with_filters(self, filters: dict[str, Any]) -> list[DataObject[dict[s | |
| # | ||
| # Nonetheless there's also another issue, paginating with limit and offset is not efficient | ||
| # and it's still restricted by the QUERY_MAXIMUM_RESULTS environment variable. | ||
| # If the sum of limit and offest is greater than QUERY_MAXIMUM_RESULTS an error is raised. | ||
| # If the sum of limit and offset is greater than QUERY_MAXIMUM_RESULTS an error is raised. | ||
| # See the official docs for more: | ||
| # https://weaviate.io/developers/weaviate/api/graphql/additional-operators#performance-considerations | ||
| offset = 0 | ||
|
|
@@ -452,9 +453,7 @@ def filter_documents(self, filters: Optional[dict[str, Any]] = None) -> list[Doc | |
| :param filters: The filters to apply to the document list. | ||
| :returns: A list of Documents that match the given filters. | ||
| """ | ||
| if filters and "operator" not in filters and "conditions" not in filters: | ||
| msg = "Invalid filter syntax. See https://docs.haystack.deepset.ai/docs/metadata-filtering for details." | ||
| raise ValueError(msg) | ||
| validate_filters(filters) | ||
|
|
||
| result = [] | ||
| if filters: | ||
|
|
@@ -483,7 +482,7 @@ def _batch_write(self, documents: list[Document]) -> int: | |
| vector=doc.embedding, | ||
| ) | ||
| if failed_objects := self.client.batch.failed_objects: | ||
| # We fallback to use the UUID if the _original_id is not present, this is just to be | ||
| # We fall back to use the UUID if the _original_id is not present, this is just to be | ||
| mapped_objects = {} | ||
| for obj in failed_objects: | ||
| properties = obj.object_.properties or {} | ||
|
|
@@ -507,7 +506,7 @@ def _batch_write(self, documents: list[Document]) -> int: | |
| def _write(self, documents: list[Document], policy: DuplicatePolicy) -> int: | ||
| """ | ||
| Writes documents to Weaviate using the specified policy. | ||
| This doesn't uses the batch API, so it's slower than _batch_write. | ||
| This doesn't use the batch API, so it's slower than _batch_write. | ||
| If policy is set to SKIP it will skip any document that already exists. | ||
| If policy is set to FAIL it will raise an exception if any of the documents already exists. | ||
| """ | ||
|
|
@@ -610,6 +609,231 @@ def delete_all_documents(self, *, recreate_index: bool = False, batch_size: int | |
| "Make sure to specify a deletion `batch_size` which is less than `QUERY_MAXIMUM_RESULTS`.", | ||
| ) | ||
|
|
||
def delete_by_filter(self, filters: dict[str, Any]) -> int:
    """
    Removes every document in the collection that satisfies the given filters.

    :param filters: The filters selecting the documents to delete.
        For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
    :returns: The number of documents reported as successfully deleted.
    :raises DocumentStoreError: If converting the filters or running the delete request fails.
    """
    # Validation sits outside the try block, so whatever it raises propagates unchanged.
    validate_filters(filters)

    try:
        where_clause = convert_filters(filters)
        # NOTE(review): a single delete_many call may be capped by the server-side
        # QUERY_MAXIMUM_RESULTS setting - confirm whether very large matches need repeated calls.
        outcome = self.collection.data.delete_many(where=where_clause)
        n_deleted = outcome.successful
        logger.info(
            "Deleted {n_docs} documents from collection '{collection}' using filters.",
            n_docs=n_deleted,
            collection=self.collection.name,
        )
    except weaviate.exceptions.WeaviateQueryError as e:
        msg = f"Failed to delete documents by filter in Weaviate. Error: {e.message}"
        raise DocumentStoreError(msg) from e
    except Exception as e:
        # Any other failure is surfaced as a store-level error with the original as its cause.
        msg = f"Failed to delete documents by filter in Weaviate: {e!s}"
        raise DocumentStoreError(msg) from e
    return n_deleted
|
|
||
async def delete_by_filter_async(self, filters: dict[str, Any]) -> int:
    """
    Asynchronously removes every document in the collection that satisfies the given filters.

    :param filters: The filters selecting the documents to delete.
        For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
    :returns: The number of documents reported as successfully deleted.
    :raises DocumentStoreError: If obtaining the collection, converting the filters or running
        the delete request fails.
    """
    # Validation sits outside the try block, so whatever it raises propagates unchanged.
    validate_filters(filters)

    try:
        async_coll = await self.async_collection
        where_clause = convert_filters(filters)
        # NOTE(review): a single delete_many call may be capped by the server-side
        # QUERY_MAXIMUM_RESULTS setting - confirm whether very large matches need repeated calls.
        outcome = await async_coll.data.delete_many(where=where_clause)
        n_deleted = outcome.successful
        logger.info(
            "Deleted {n_docs} documents from collection '{collection}' using filters.",
            n_docs=n_deleted,
            collection=async_coll.name,
        )
    except weaviate.exceptions.WeaviateQueryError as e:
        msg = f"Failed to delete documents by filter in Weaviate. Error: {e.message}"
        raise DocumentStoreError(msg) from e
    except Exception as e:
        # Any other failure is surfaced as a store-level error with the original as its cause.
        msg = f"Failed to delete documents by filter in Weaviate: {e!s}"
        raise DocumentStoreError(msg) from e
    return n_deleted
|
|
||
def update_by_filter(self, filters: dict[str, Any], meta: dict[str, Any]) -> int:
    """
    Updates the metadata of all documents that match the provided filters.

    Every matching object is fetched, its properties are overlaid with `meta`, and the object is
    written back with `replace`, passing along the vector that was fetched with it.

    :param filters: The filters to apply to select documents for updating.
        For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
    :param meta: The metadata fields to update. These will be merged with existing metadata.
    :returns: The number of documents updated.
    :raises ValueError: If `meta` is not a dictionary.
    :raises DocumentStoreError: If the query fails, or if one or more objects could not be updated.
        In the latter case the message lists every failed object; objects that were updated
        successfully before the error was raised stay updated.
    """
    validate_filters(filters)

    if not isinstance(meta, dict):
        msg = "Meta must be a dictionary"
        raise ValueError(msg)

    try:
        matching_objects = self._query_with_filters(filters)
        if not matching_objects:
            return 0

        updated_count = 0
        failed_updates = []

        for obj in matching_objects:
            try:
                # Metadata is stored flattened in the Weaviate properties, so the new values
                # are merged straight into a copy of the object's current properties.
                current_properties = obj.properties.copy() if obj.properties else {}
                current_properties.update(meta)

                # `replace` overwrites the whole object, so the fetched vector is passed back in.
                # A dict-shaped vector is read through its "default" entry.
                vector = None
                if isinstance(obj.vector, list):
                    vector = obj.vector
                elif isinstance(obj.vector, dict):
                    vector = obj.vector.get("default")

                self.collection.data.replace(
                    uuid=obj.uuid,
                    properties=current_properties,
                    vector=vector,
                )
                updated_count += 1
            except Exception as e:
                # Collect failed updates but continue with others
                obj_properties = obj.properties or {}
                id_ = obj_properties.get("_original_id", obj.uuid)
                failed_updates.append((id_, str(e)))

        if failed_updates:
            msg = "\n".join(
                f"Failed to update object with id '{id_}'. Error: '{error}'" for id_, error in failed_updates
            )
            raise DocumentStoreError(msg)

        logger.info(
            "Updated {n_docs} documents in collection '{collection}' using filters.",
            n_docs=updated_count,
            collection=self.collection.name,
        )
        return updated_count
    except DocumentStoreError:
        # Already a store-level error (e.g. the per-object failure report raised above).
        # Let it through untouched instead of re-wrapping it with a second, misleading prefix.
        raise
    except weaviate.exceptions.WeaviateQueryError as e:
        msg = f"Failed to update documents by filter in Weaviate. Error: {e.message}"
        raise DocumentStoreError(msg) from e
    except Exception as e:
        msg = f"Failed to update documents by filter in Weaviate: {e!s}"
        raise DocumentStoreError(msg) from e
|
|
||
async def update_by_filter_async(self, filters: dict[str, Any], meta: dict[str, Any]) -> int:
    """
    Asynchronously updates the metadata of all documents that match the provided filters.

    Matching objects are fetched page by page together with their vectors; each object's
    properties are then overlaid with `meta` and written back with `replace`.

    :param filters: The filters to apply to select documents for updating.
        For filter syntax, see [Haystack metadata filtering](https://docs.haystack.deepset.ai/docs/metadata-filtering)
    :param meta: The metadata fields to update. These will be merged with existing metadata.
    :returns: The number of documents updated.
    :raises ValueError: If `meta` is not a dictionary.
    :raises DocumentStoreError: If the query fails, or if one or more objects could not be updated.
        In the latter case the message lists every failed object; objects that were updated
        successfully before the error was raised stay updated.
    """
    validate_filters(filters)

    if not isinstance(meta, dict):
        msg = "Meta must be a dictionary"
        raise ValueError(msg)

    try:
        collection = await self.async_collection
        weaviate_filter = convert_filters(filters)
        config = await collection.config.get()
        properties = [p.name for p in config.properties]

        # Page through all matching objects. A page shorter than DEFAULT_QUERY_LIMIT means
        # there is nothing left to fetch. Vectors are requested so they can be passed back
        # to `replace` below.
        matching_objects = []
        offset = 0
        partial_result = None
        while partial_result is None or len(partial_result.objects) == DEFAULT_QUERY_LIMIT:
            partial_result = await collection.query.fetch_objects(
                filters=weaviate_filter,
                include_vector=True,
                limit=DEFAULT_QUERY_LIMIT,
                offset=offset,
                return_properties=properties,
            )
            matching_objects.extend(partial_result.objects)
            offset += DEFAULT_QUERY_LIMIT

        if not matching_objects:
            return 0

        updated_count = 0
        failed_updates = []

        for obj in matching_objects:
            try:
                # Metadata is stored flattened in the Weaviate properties, so the new values
                # are merged straight into a copy of the object's current properties.
                current_properties = obj.properties.copy() if obj.properties else {}
                current_properties.update(meta)

                # `replace` overwrites the whole object, so the fetched vector is passed back in.
                # A dict-shaped vector is read through its "default" entry.
                vector = None
                if isinstance(obj.vector, list):
                    vector = obj.vector
                elif isinstance(obj.vector, dict):
                    vector = obj.vector.get("default")

                await collection.data.replace(
                    uuid=obj.uuid,
                    properties=current_properties,
                    vector=vector,
                )
                updated_count += 1
            except Exception as e:
                # Collect failed updates but continue with others
                obj_properties = obj.properties or {}
                id_ = obj_properties.get("_original_id", obj.uuid)
                failed_updates.append((id_, str(e)))

        if failed_updates:
            msg = "\n".join(
                f"Failed to update object with id '{id_}'. Error: '{error}'" for id_, error in failed_updates
            )
            raise DocumentStoreError(msg)

        logger.info(
            "Updated {n_docs} documents in collection '{collection}' using filters.",
            n_docs=updated_count,
            collection=collection.name,
        )
        return updated_count
    except DocumentStoreError:
        # Already a store-level error (e.g. the per-object failure report raised above).
        # Let it through untouched instead of re-wrapping it with a second, misleading prefix.
        raise
    except weaviate.exceptions.WeaviateQueryError as e:
        msg = f"Failed to update documents by filter in Weaviate. Error: {e.message}"
        raise DocumentStoreError(msg) from e
    except Exception as e:
        msg = f"Failed to update documents by filter in Weaviate: {e!s}"
        raise DocumentStoreError(msg) from e
|
|
||
| def _bm25_retrieval( | ||
| self, query: str, filters: Optional[dict[str, Any]] = None, top_k: Optional[int] = None | ||
| ) -> list[Document]: | ||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.