feat(community): add mmr search to pgvector (#7438)

Co-authored-by: jacoblee93 <[email protected]>
langchain-ai · Jan 2, 2025 · c132cf9 · c132cf9
1 parent d0fb84f
commit c132cf9
Show file tree

Hide file tree

Showing 3 changed files with 103 additions and 10 deletions.
diff --git a/libs/langchain-community/src/vectorstores/pgvector.ts b/libs/langchain-community/src/vectorstores/pgvector.ts
@@ -1,8 +1,12 @@
 import pg, { type Pool, type PoolClient, type PoolConfig } from "pg";
-import { VectorStore } from "@langchain/core/vectorstores";
+import {
+  MaxMarginalRelevanceSearchOptions,
+  VectorStore,
+} from "@langchain/core/vectorstores";
 import type { EmbeddingsInterface } from "@langchain/core/embeddings";
 import { Document } from "@langchain/core/documents";
 import { getEnvironmentVariable } from "@langchain/core/utils/env";
+import { maximalMarginalRelevance } from "@langchain/core/utils/math";
 
 type Metadata = Record<string, unknown>;
 
@@ -602,19 +606,18 @@ export class PGVectorStore extends VectorStore {
   }
 
   /**
-   * Method to perform a similarity search in the vector store. It returns
-   * the `k` most similar documents to the query vector, along with their
-   * similarity scores.
-   *
+   * Method to perform a similarity search in the vector store. It returns the `k` most similar documents to the query vector, along with their similarity scores.
    * @param query - Query vector.
    * @param k - Number of most similar documents to return.
    * @param filter - Optional filter to apply to the search.
+   * @param includeEmbedding Whether to include the embedding vectors in the results.
    * @returns Promise that resolves with an array of tuples, each containing a `Document` and its similarity score.
    */
-  async similaritySearchVectorWithScore(
+  private async searchPostgres(
     query: number[],
     k: number,
-    filter?: this["FilterType"]
+    filter?: this["FilterType"],
+    includeEmbedding?: boolean
   ): Promise<[Document, number][]> {
     const embeddingString = `[${query.join(",")}]`;
     const _filter: this["FilterType"] = filter ?? {};
@@ -694,12 +697,32 @@ export class PGVectorStore extends VectorStore {
           metadata: doc[this.metadataColumnName],
           id: doc[this.idColumnName],
         });
+        if (includeEmbedding) {
+          document.metadata[this.vectorColumnName] = doc[this.vectorColumnName];
+        }
         results.push([document, doc._distance]);
       }
     }
     return results;
   }
 
+  /**
+   * Runs a vector similarity search against the store and resolves with the
+   * `k` closest documents, each paired with its similarity score. Delegates
+   * to `searchPostgres` with embedding inclusion switched off, so the stored
+   * vectors are not attached to the returned documents.
+   * @param query - Query vector.
+   * @param k - Number of most similar documents to return.
+   * @param filter - Optional filter to apply to the search.
+   * @returns Promise that resolves with an array of tuples, each containing a `Document` and its similarity score.
+   */
+  async similaritySearchVectorWithScore(
+    query: number[],
+    k: number,
+    filter?: this["FilterType"]
+  ): Promise<[Document, number][]> {
+    const includeEmbedding = false;
+    const scoredDocuments = await this.searchPostgres(
+      query,
+      k,
+      filter,
+      includeEmbedding
+    );
+    return scoredDocuments;
+  }
+
   /**
    * Method to ensure the existence of the table in the database. It creates
    * the table if it does not already exist.
@@ -885,4 +908,46 @@ export class PGVectorStore extends VectorStore {
       );
     }
   }
+
+  /**
+   * Return documents selected using the maximal marginal relevance.
+   * Maximal marginal relevance optimizes for similarity to the query AND
+   * diversity among selected documents.
+   *
+   * The candidates are fetched together with their stored embeddings, which
+   * are only needed for the MMR ranking; the embeddings are removed from the
+   * metadata of the documents that are returned.
+   * @param query Text to look up documents similar to.
+   * @param options.k=4 Number of documents to return.
+   * @param options.fetchK=20 Number of documents to fetch before passing to
+   *     the MMR algorithm.
+   * @param options.lambda=0.5 Number between 0 and 1 that determines the
+   *     degree of diversity among the results, where 0 corresponds to maximum
+   *     diversity and 1 to minimum diversity.
+   * @param options.filter Optional filter forwarded to the candidate search.
+   * @returns List of documents selected by maximal marginal relevance.
+   */
+  async maxMarginalRelevanceSearch(
+    query: string,
+    options: MaxMarginalRelevanceSearchOptions<this["FilterType"]>
+  ): Promise<Document[]> {
+    const { k = 4, fetchK = 20, lambda = 0.5, filter } = options;
+    const queryEmbedding = await this.embeddings.embedQuery(query);
+
+    // Ask for the embeddings as well: they are attached to each candidate
+    // under `metadata[this.vectorColumnName]`.
+    const docs = await this.searchPostgres(
+      queryEmbedding,
+      fetchK,
+      filter,
+      true
+    );
+
+    // Nothing to rank.
+    if (docs.length === 0) {
+      return [];
+    }
+
+    const embeddingList: number[][] = docs.map(([doc]) => {
+      const rawEmbedding = doc.metadata[this.vectorColumnName];
+      // The vector column normally arrives as a JSON-style string ("[1,2,3]");
+      // only parse in that case so an already-parsed array is used as-is.
+      return typeof rawEmbedding === "string"
+        ? JSON.parse(rawEmbedding)
+        : rawEmbedding;
+    });
+
+    const mmrIndexes = maximalMarginalRelevance(
+      queryEmbedding,
+      embeddingList,
+      lambda,
+      k
+    );
+
+    return mmrIndexes.map((index) => {
+      const doc = docs[index][0];
+      // The embedding was only attached for ranking; do not leak it into the
+      // metadata handed back to the caller.
+      delete doc.metadata[this.vectorColumnName];
+      return doc;
+    });
+  }
 }
diff --git a/libs/langchain-community/src/vectorstores/tests/pgvector/docker-compose.yml b/libs/langchain-community/src/vectorstores/tests/pgvector/docker-compose.yml
@@ -1,10 +1,9 @@
 # Run this command to start the database:
-# docker-compose up --build
-version: "3"
+# docker compose up --build
 services:
   db:
     hostname: 127.0.0.1
-    image: ankane/pgvector
+    image: pgvector/pgvector:pg16
     ports:
       - 5432:5432
     restart: always

diff --git a/libs/langchain-community/src/vectorstores/tests/pgvector/pgvector.int.test.ts b/libs/langchain-community/src/vectorstores/tests/pgvector/pgvector.int.test.ts
@@ -74,6 +74,35 @@ describe("PGVectorStore", () => {
     expect(results[0].pageContent).toEqual("Cat drinks milk");
   });
 
+  // Inserts four documents and checks that an MMR search for "milk" with
+  // k = 2 returns exactly two documents in the expected order.
+  // Registered with plain `test` (not `test.only`) so the rest of this suite
+  // keeps running.
+  test("Test MMR search", async () => {
+    const documents = [
+      {
+        pageContent: "hello",
+        metadata: { a: 1 },
+      },
+      {
+        pageContent: "Cat drinks milk",
+        metadata: { a: 2 },
+      },
+      {
+        pageContent: "foo",
+        metadata: { a: 2 },
+      },
+      { pageContent: "hi", metadata: { a: 1 } },
+    ];
+    await pgvectorVectorStore.addDocuments(documents);
+    const results = await pgvectorVectorStore.maxMarginalRelevanceSearch(
+      "milk",
+      {
+        k: 2,
+      }
+    );
+
+    expect(results).toHaveLength(2);
+    expect(results[0].pageContent).toEqual("Cat drinks milk");
+    expect(results[1].pageContent).toEqual("foo");
+  });
+
   test("PGvector can save documents with a list greater than default chunk size", async () => {
     // Extract the default chunk size and add one.
     const docsToGenerate = pgvectorVectorStore.chunkSize + 1;