Skip to content

Commit

Permalink
feat(community): add mmr search to pgvector (#7438)
Browse files Browse the repository at this point in the history
Co-authored-by: jacoblee93 <[email protected]>
  • Loading branch information
anadi45 and jacoblee93 authored Jan 2, 2025
1 parent d0fb84f commit c132cf9
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 10 deletions.
79 changes: 72 additions & 7 deletions libs/langchain-community/src/vectorstores/pgvector.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,12 @@
import pg, { type Pool, type PoolClient, type PoolConfig } from "pg";
import { VectorStore } from "@langchain/core/vectorstores";
import {
MaxMarginalRelevanceSearchOptions,
VectorStore,
} from "@langchain/core/vectorstores";
import type { EmbeddingsInterface } from "@langchain/core/embeddings";
import { Document } from "@langchain/core/documents";
import { getEnvironmentVariable } from "@langchain/core/utils/env";
import { maximalMarginalRelevance } from "@langchain/core/utils/math";

type Metadata = Record<string, unknown>;

Expand Down Expand Up @@ -602,19 +606,18 @@ export class PGVectorStore extends VectorStore {
}

/**
* Method to perform a similarity search in the vector store. It returns
* the `k` most similar documents to the query vector, along with their
* similarity scores.
*
* Method to perform a similarity search in the vector store. It returns the `k` most similar documents to the query vector, each paired with its score.
* @param query - Query vector.
* @param k - Number of most similar documents to return.
* @param filter - Optional filter to apply to the search.
* @param includeEmbedding Whether to include the embedding vectors in the results.
* @returns Promise that resolves with an array of tuples, each containing a `Document` and its similarity score.
*/
async similaritySearchVectorWithScore(
private async searchPostgres(
query: number[],
k: number,
filter?: this["FilterType"]
filter?: this["FilterType"],
includeEmbedding?: boolean
): Promise<[Document, number][]> {
const embeddingString = `[${query.join(",")}]`;
const _filter: this["FilterType"] = filter ?? {};
Expand Down Expand Up @@ -694,12 +697,32 @@ export class PGVectorStore extends VectorStore {
metadata: doc[this.metadataColumnName],
id: doc[this.idColumnName],
});
if (includeEmbedding) {
document.metadata[this.vectorColumnName] = doc[this.vectorColumnName];
}
results.push([document, doc._distance]);
}
}
return results;
}

/**
 * Runs a vector similarity search against the store and resolves with the
 * `k` closest documents, each paired with its score. The stored embedding
 * vectors are not attached to the returned documents.
 * @param query - Embedding vector to search with.
 * @param k - Maximum number of matching documents to return.
 * @param filter - Optional filter restricting which rows are considered.
 * @returns Promise resolving to an array of `[Document, score]` tuples.
 */
async similaritySearchVectorWithScore(
  query: number[],
  k: number,
  filter?: this["FilterType"]
): Promise<[Document, number][]> {
  // Plain similarity search never needs the raw vectors back.
  const includeEmbedding = false;
  const scoredDocuments = await this.searchPostgres(
    query,
    k,
    filter,
    includeEmbedding
  );
  return scoredDocuments;
}

/**
* Method to ensure the existence of the table in the database. It creates
* the table if it does not already exist.
Expand Down Expand Up @@ -885,4 +908,46 @@ export class PGVectorStore extends VectorStore {
);
}
}

/**
 * Return documents selected using the maximal marginal relevance.
 * Maximal marginal relevance optimizes for similarity to the query AND
 * diversity among selected documents.
 *
 * The query text is embedded, the `fetchK` nearest rows are fetched together
 * with their stored embeddings, and `maximalMarginalRelevance` picks `k` of
 * them. The stored embedding is only needed for that selection step, so it
 * is stripped from each document's metadata before the documents are
 * returned.
 * @param query Text to look up documents similar to.
 * @param options.k=4 Number of documents to return.
 * @param options.fetchK=20 Number of documents to fetch before passing to
 * the MMR algorithm.
 * @param options.lambda=0.5 Number between 0 and 1 that determines the
 * degree of diversity among the results, where 0 corresponds to maximum
 * diversity and 1 to minimum diversity.
 * @param options.filter Optional filter forwarded to the underlying search.
 * @returns List of documents selected by maximal marginal relevance.
 * @throws Error if a fetched row does not carry an embedding vector.
 */
async maxMarginalRelevanceSearch(
  query: string,
  options: MaxMarginalRelevanceSearchOptions<this["FilterType"]>
): Promise<Document[]> {
  const { k = 4, fetchK = 20, lambda = 0.5, filter } = options;
  const queryEmbedding = await this.embeddings.embedQuery(query);

  // Ask for the embeddings as well: MMR compares candidates to each other.
  const candidates = await this.searchPostgres(
    queryEmbedding,
    fetchK,
    filter,
    true
  );
  if (candidates.length === 0) {
    return [];
  }

  const embeddingList: number[][] = candidates.map(([document]) => {
    const rawEmbedding: unknown = document.metadata[this.vectorColumnName];
    // The vector was smuggled through metadata purely for this method;
    // remove it so callers do not receive it as if it were user metadata.
    delete document.metadata[this.vectorColumnName];

    // The driver normally hands the vector back as text ("[1,2,3]"), but
    // tolerate an already-parsed array (e.g. a registered type parser).
    const parsedEmbedding: unknown =
      typeof rawEmbedding === "string"
        ? JSON.parse(rawEmbedding)
        : rawEmbedding;
    if (!Array.isArray(parsedEmbedding)) {
      throw new Error(
        `Expected column "${this.vectorColumnName}" to contain an embedding vector.`
      );
    }
    return parsedEmbedding as number[];
  });

  const mmrIndexes = maximalMarginalRelevance(
    queryEmbedding,
    embeddingList,
    lambda,
    k
  );

  return mmrIndexes.map((index) => candidates[index][0]);
}
}
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
# Run this command to start the database:
# docker-compose up --build
version: "3"
# docker compose up --build
services:
db:
hostname: 127.0.0.1
image: ankane/pgvector
image: pgvector/pgvector:pg16
ports:
- 5432:5432
restart: always
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,35 @@ describe("PGVectorStore", () => {
expect(results[0].pageContent).toEqual("Cat drinks milk");
});

// Integration test for maxMarginalRelevanceSearch: inserts four documents and
// asks for the two MMR picks for the query "milk".
// NOTE: registered with plain `test` (not `test.only`) so the rest of this
// suite keeps running.
test("Test MMR search", async () => {
  const documents = [
    {
      pageContent: "hello",
      metadata: { a: 1 },
    },
    {
      pageContent: "Cat drinks milk",
      metadata: { a: 2 },
    },
    {
      pageContent: "foo",
      metadata: { a: 2 },
    },
    { pageContent: "hi", metadata: { a: 1 } },
  ];
  await pgvectorVectorStore.addDocuments(documents);
  const results = await pgvectorVectorStore.maxMarginalRelevanceSearch(
    "milk",
    {
      k: 2,
    }
  );

  expect(results).toHaveLength(2);
  expect(results[0].pageContent).toEqual("Cat drinks milk");
  expect(results[1].pageContent).toEqual("foo");
});

test("PGvector can save documents with a list greater than default chunk size", async () => {
// Extract the default chunk size and add one.
const docsToGenerate = pgvectorVectorStore.chunkSize + 1;
Expand Down

0 comments on commit c132cf9

Please sign in to comment.