diff --git a/bugbug/tools/code_review.py b/bugbug/tools/code_review.py
index d450715703..9d1b021dd8 100644
--- a/bugbug/tools/code_review.py
+++ b/bugbug/tools/code_review.py
@@ -1285,8 +1285,15 @@ def clean_comment(self, comment):
         return comment
 
     def add_comments_by_hunk(self, items: Iterable[tuple[Hunk, InlineComment]]):
+        # Comments whose ID is already stored in the vector DB are skipped
+        # below, before any embedding is computed for them.
+        point_ids = self.vector_db.get_existing_ids()
+
         def vector_points():
             for hunk, comment in items:
+                if comment.id in point_ids:
+                    continue
+
                 str_hunk = str(hunk)
                 vector = self.embeddings.embed_query(str_hunk)
                 payload = {
diff --git a/bugbug/vectordb.py b/bugbug/vectordb.py
index 7f8be1097d..689ba773dd 100644
--- a/bugbug/vectordb.py
+++ b/bugbug/vectordb.py
@@ -46,6 +46,11 @@ def insert(self, points: Iterable[VectorPoint]):
     def search(self, query: list[float]) -> Iterable[PayloadScore]:
         ...
 
+    @abstractmethod
+    def get_existing_ids(self) -> set:
+        """Return the IDs of all points currently stored in the database."""
+        ...
+
 
 class QdrantVectorDB(VectorDB):
     def __init__(self, collection_name: str, *args, **kwargs):
@@ -83,3 +88,32 @@ def insert(self, points: Iterable[VectorPoint]):
     def search(self, query: list[float]) -> Iterable[PayloadScore]:
         for item in self.client.search(self.collection_name, query):
             yield PayloadScore(item.score, item.id, item.payload)
+
+    def get_existing_ids(self) -> set:
+        """Return the IDs of every point stored in the collection.
+
+        The collection is read page by page through the scroll API, fetching
+        neither payloads nor vectors, and the IDs are gathered into a set.
+        """
+        point_ids = set()
+        offset = None
+
+        while True:
+            points, next_page_offset = self.client.scroll(
+                collection_name=self.collection_name,
+                limit=100,
+                with_payload=False,
+                with_vectors=False,
+                offset=offset,
+            )
+
+            point_ids.update(point.id for point in points)
+
+            # Compare against None rather than truthiness: the offset is the
+            # ID of the next point, and a point ID of 0 is falsy yet valid.
+            if next_page_offset is None:
+                break
+
+            offset = next_page_offset
+
+        return point_ids