diff --git a/libs/langchain-community/src/vectorstores/couchbase_query.ts b/libs/langchain-community/src/vectorstores/couchbase_query.ts new file mode 100644 index 000000000000..fa847b44c26d --- /dev/null +++ b/libs/langchain-community/src/vectorstores/couchbase_query.ts @@ -0,0 +1,718 @@ +/* eslint-disable @typescript-eslint/no-explicit-any */ +/* eslint-disable import/no-extraneous-dependencies */ +import { EmbeddingsInterface } from "@langchain/core/embeddings"; +import { VectorStore } from "@langchain/core/vectorstores"; +import { Bucket, Cluster, Collection, Scope } from "couchbase"; +import { Document } from "@langchain/core/documents"; +import { v4 as uuid } from "uuid"; + +/** + * Enum for different distance strategies supported by Couchbase vector search + */ +export enum DistanceStrategy { + DOT = "dot", + COSINE = "cosine", + EUCLIDEAN = "euclidean", + EUCLIDEAN_SQUARED = "euclidean_squared", +} + +export enum IndexType { + COMPOSITE = "composite", + HYPERSCALE = "hyperscale", +} + +/** + * Interface for createIndex method parameters + */ +export interface CreateIndexOptions { + indexType: IndexType; + indexDescription: string; + distanceMetric?: DistanceStrategy; + indexName?: string; + vectorField?: string; + vectorDimension?: number; + fields?: string[]; + whereClause?: string; + indexScanNprobes?: number; + indexTrainlist?: number; +} + +/** + * This interface defines the optional fields for adding vectors + * - `ids` - list of ids for each document. If undefined, a uuid will be generated for each document + * - `metadata` - list of metadata objects for each document + */ +export interface AddVectorOptions { + ids?: string[]; + metadata?: Record<string, any>[]; +} + +/** + * This interface defines the fields required to initialize a query vector store. + * These fields are part of the config: + * @property {Cluster} cluster - The Couchbase cluster that the store will interact with. + * @property {string} bucketName - The name of the bucket in the Couchbase cluster. + * @property {string} scopeName - The name of the scope within the bucket. + * @property {string} collectionName - The name of the collection within the scope. + * @property {string} textKey - The key to be used for text in the documents. Defaults to "text". + * @property {string} embeddingKey - The key to be used for embeddings in the documents. Defaults to "embedding". + * @property {DistanceStrategy} distanceStrategy - The distance strategy to use for vector similarity calculations. Defaults to DOT. + * @property {AddVectorOptions} addVectorOptions - Options for adding vectors with specific ids/metadata + */ +export interface CouchbaseQueryVectorStoreArgs { + cluster: Cluster; + bucketName: string; + scopeName: string; + collectionName: string; + textKey?: string; + embeddingKey?: string; + distanceStrategy?: DistanceStrategy; + addVectorOptions?: AddVectorOptions; +} + +/** + * This type defines the search filters used in Couchbase query vector search + * - `where`: Optional WHERE clause conditions for the SQL++ query + * - `fields`: Optional list of fields to include in the results + */ +type CouchbaseQueryVectorStoreFilter = { + where?: string; + fields?: string[]; +}; + +/** + * Class for interacting with the Couchbase database using the Query service for vector search. + * It extends the VectorStore class and provides methods for adding vectors and + * documents, and searching for similar vectors using SQL++ queries. + * Instantiate the class using the static initialize() method.
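+ *
+ * A minimal usage sketch, assuming an existing `cluster` connection and an
+ * embeddings implementation such as `OpenAIEmbeddings`; the bucket, scope, and
+ * collection names below are placeholders.
+ * @example
+ * ```typescript
+ * const store = await CouchbaseQueryVectorStore.initialize(new OpenAIEmbeddings(), {
+ *   cluster,
+ *   bucketName: "my-bucket",
+ *   scopeName: "_default",
+ *   collectionName: "_default",
+ * });
+ * // Add a document, then query for its nearest neighbour
+ * await store.addDocuments([new Document({ pageContent: "hello world" })]);
+ * const results = await store.similaritySearch("hello world", 1);
+ * ```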
+ */ +export class CouchbaseQueryVectorStore extends VectorStore { + declare FilterType: CouchbaseQueryVectorStoreFilter; + + private metadataKey = "metadata"; + + private readonly defaultTextKey = "text"; + + private readonly defaultEmbeddingKey = "embedding"; + + private readonly defaultDistanceStrategy = DistanceStrategy.DOT; + + private cluster: Cluster; + + private _bucket: Bucket; + + private _scope: Scope; + + private _collection: Collection; + + private bucketName: string; + + private scopeName: string; + + private collectionName: string; + + private textKey = this.defaultTextKey; + + private embeddingKey = this.defaultEmbeddingKey; + + private distanceStrategy = this.defaultDistanceStrategy; + + /** + * The private constructor used to provide embedding to parent class. + * Initialize the class using static initialize() method + * @param embedding - object to generate embedding + * @param config - the fields required to initialize a vector store + */ + private constructor( + embedding: EmbeddingsInterface, + config: CouchbaseQueryVectorStoreArgs + ) { + super(embedding, config); + } + + _vectorstoreType(): string { + return "couchbase_query"; + } + + /** + * initialize class for interacting with the Couchbase database using Query service. + * It extends the VectorStore class and provides methods + * for adding vectors and documents, and searching for similar vectors. + * This also verifies the params + * + * @param embeddings - object to generate embedding + * @param config - the fields required to initialize a vector store + */ + static async initialize( + embeddings: EmbeddingsInterface, + config: CouchbaseQueryVectorStoreArgs + ) { + const store = new CouchbaseQueryVectorStore(embeddings, config); + + const { + cluster, + bucketName, + scopeName, + collectionName, + textKey, + embeddingKey, + distanceStrategy, + } = config; + + store.cluster = cluster; + store.bucketName = bucketName; + store.scopeName = scopeName; + store.collectionName = collectionName; + if (textKey) { + store.textKey = textKey; + } else { + store.textKey = store.defaultTextKey; + } + + if (embeddingKey) { + store.embeddingKey = embeddingKey; + } else { + store.embeddingKey = store.defaultEmbeddingKey; + } + + if (distanceStrategy) { + store.distanceStrategy = distanceStrategy; + } else { + store.distanceStrategy = store.defaultDistanceStrategy; + } + + try { + store._bucket = store.cluster.bucket(store.bucketName); + store._scope = store._bucket.scope(store.scopeName); + store._collection = store._scope.collection(store.collectionName); + } catch (err) { + throw new Error( + `Error connecting to couchbase, Please check connection and credentials. ${err}` + ); + } + + try { + if ( + !(await store.checkBucketExists()) || + !(await store.checkScopeAndCollectionExists()) + ) { + throw new Error("Error while initializing vector store"); + } + } catch (err) { + throw new Error(`Error while initializing vector store: ${err}`); + } + return store; + } + + /** + * An asynchronous method to verify the bucket exists. + * It retrieves bucket information and checks if the bucket is present. + * + * @throws - If the specified bucket does not exist in the database. + * + * @returns - returns promise true if no error is found + */ + private async checkBucketExists(): Promise { + try { + await this.cluster.buckets().getBucket(this.bucketName); + return true; + } catch (err) { + throw new Error( + `Bucket with name ${this.bucketName} does not exist. 
Error: ${err}` + ); + } + } + + /** + * An asynchronous method to verify the scope and collection exist. + * It checks if the specified scope and collection are present. + * + * @throws - If the specified scope or collection does not exist in the database. + * + * @returns - returns promise true if no error is found + */ + private async checkScopeAndCollectionExists(): Promise { + try { + const scopes = await this._bucket.collections().getAllScopes(); + const scope = scopes.find((s: any) => s.name === this.scopeName); + if (!scope) { + throw new Error(`Scope ${this.scopeName} does not exist`); + } + + const collection = scope.collections.find( + (c: any) => c.name === this.collectionName + ); + if (!collection) { + throw new Error(`Collection ${this.collectionName} does not exist`); + } + + return true; + } catch (err) { + throw new Error( + `Scope ${this.scopeName} or Collection ${this.collectionName} does not exist. Error: ${err}` + ); + } + } + + /** + * Method to add vectors and documents to the vector store. + * + * @param vectors - Vectors to be added to the vector store. + * @param documents - Documents to be added to the vector store. + * @param options - Optional parameters for adding vectors. + * + * @returns - Promise that resolves to an array of document IDs. + */ + async addVectors( + vectors: number[][], + documents: Document[], + options?: AddVectorOptions + ): Promise { + if (vectors.length === 0) { + return []; + } + + if (vectors.length !== documents.length) { + throw new Error("Vectors and documents must have the same length"); + } + + const documentIds = options?.ids || documents.map(() => uuid()); + const documentsToInsert: { [key: string]: any }[] = []; + + for (let index = 0; index < vectors.length; index += 1) { + const vector = vectors[index]; + const document = documents[index]; + const documentId = documentIds[index]; + + const documentToInsert = { + [documentId]: { + [this.textKey]: document.pageContent, + [this.embeddingKey]: vector, + [this.metadataKey]: document.metadata, + }, + }; + + documentsToInsert.push(documentToInsert); + } + + const docIds = await this.upsertDocuments(documentsToInsert); + return docIds; + } + + /** + * Method to add documents to the vector store. It first converts + * the documents to vectors using the embeddings and then adds them to the vector store. + * + * @param documents - Documents to be added to the vector store. + * @param options - Optional parameters for adding documents. + * + * @returns - Promise that resolves to an array of document IDs. + */ + async addDocuments( + documents: Document[], + options?: AddVectorOptions + ): Promise { + const texts = documents.map(({ pageContent }) => pageContent); + const vectors = await this.embeddings.embedDocuments(texts); + return this.addVectors(vectors, documents, options); + } + + /** + * Method to delete documents from the vector store. + * + * @param ids - Array of document IDs to be deleted. + * + * @returns - Promise that resolves when the deletion is complete. + */ + async delete(options: { ids: string[] }): Promise { + const { ids } = options; + const deletePromises = ids.map((id) => + this._collection.remove(id).catch((e: any) => { + throw new Error(`Delete failed with error: ${e}`); + }) + ); + + await Promise.all(deletePromises); + } + + /** + * Return documents that are most similar to the vector embedding using SQL++ query. + * + * @param queryEmbeddings - Embedding vector to look up documents similar to. + * @param k - Number of documents to return. Defaults to 4. 
+ * @param filter - Optional search filter that are passed to Couchbase query. Defaults to empty object. + * - `where`: Optional WHERE clause conditions for the SQL++ query + * - `fields`: Optional list of fields to include in the results + * + * @returns - Promise of list of [document, score] that are the most similar to the query vector. + * + * @throws If the search operation fails. + */ + async similaritySearchVectorWithScore( + queryEmbeddings: number[], + k = 4, + filter: CouchbaseQueryVectorStoreFilter = {} + ): Promise<[Document, number][]> { + const { where, fields } = filter; + + // Build the SELECT clause + let selectClause = `META().id, ${this.textKey}, ${this.metadataKey}`; + if (fields && fields.length > 0) { + selectClause = fields.join(", "); + if (!fields.includes(this.textKey)) { + selectClause += `, ${this.textKey}`; + } + if (!fields.includes(this.metadataKey)) { + selectClause += `, ${this.metadataKey}`; + } + if (!fields.includes("META().id")) { + selectClause += `, META().id`; + } + } + + // Build the WHERE clause + let whereClause = ""; + if (where) { + whereClause = `AND ${where}`; + } + + // Build the SQL++ query with vector search using APPROX_VECTOR_DISTANCE function + // Using the configured distance metric for similarity scoring + + const distanceMetric = this.distanceStrategy; + const query = ` + SELECT ${selectClause}, + APPROX_VECTOR_DISTANCE(${this.embeddingKey}, [${queryEmbeddings}], "${distanceMetric}") as distance + FROM \`${this.bucketName}\`.\`${this.scopeName}\`.\`${this.collectionName}\` + WHERE ${this.embeddingKey} IS NOT NULL ${whereClause} + ORDER BY APPROX_VECTOR_DISTANCE(${this.embeddingKey}, [${queryEmbeddings}], "${distanceMetric}") + LIMIT ${k} + `; + + const docsWithScore: [Document, number][] = []; + try { + const result = await this.cluster.query(query, { + parameters: { + queryVector: queryEmbeddings, + k, + }, + }); + + for (const row of result.rows) { + const text = row[this.textKey]; + const metadata = row[this.metadataKey] || {}; + // Convert distance to similarity score (lower distance = higher similarity) + const distance = row.distance || 0; + const doc = new Document({ + pageContent: text, + metadata, + }); + docsWithScore.push([doc, distance]); + } + } catch (err) { + throw new Error(`Query failed with error: ${err}`); + } + return docsWithScore; + } + + /** + * Return documents that are most similar to the vector embedding. + * + * @param queryEmbeddings - Embedding to look up documents similar to. + * @param k - The number of similar documents to return. Defaults to 4. + * @param filter - Optional search filter that are passed to Couchbase query. Defaults to empty object. + * - `where`: Optional WHERE clause conditions for the SQL++ query + * - `fields`: Optional list of fields to include in the results + * + * @returns - A promise that resolves to an array of documents that match the similarity search. + */ + async similaritySearchByVector( + queryEmbeddings: number[], + k = 4, + filter: CouchbaseQueryVectorStoreFilter = {} + ): Promise { + const docsWithScore = await this.similaritySearchVectorWithScore( + queryEmbeddings, + k, + filter + ); + const docs = []; + for (const doc of docsWithScore) { + docs.push(doc[0]); + } + return docs; + } + + /** + * Return documents that are most similar to the query. + * + * @param query - Query to look up for similar documents + * @param k - The number of similar documents to return. Defaults to 4. + * @param filter - Optional search filter that are passed to Couchbase query. 
Defaults to empty object. + * - `where`: Optional WHERE clause conditions for the SQL++ query + * - `fields`: Optional list of fields to include in the results + * + * @returns - Promise of list of documents that are most similar to the query. + */ + async similaritySearch( + query: string, + k = 4, + filter: CouchbaseQueryVectorStoreFilter = {} + ): Promise { + const queryEmbeddings = await this.embeddings.embedQuery(query); + const docsWithScore = await this.similaritySearchVectorWithScore( + queryEmbeddings, + k, + filter + ); + const docs = []; + for (const doc of docsWithScore) { + docs.push(doc[0]); + } + return docs; + } + + /** + * Return documents that are most similar to the query with their scores. + * + * @param query - Query to look up for similar documents + * @param k - The number of similar documents to return. Defaults to 4. + * @param filter - Optional search filter that are passed to Couchbase query. Defaults to empty object. + * - `where`: Optional WHERE clause conditions for the SQL++ query + * - `fields`: Optional list of fields to include in the results + * + * @returns - Promise of list of documents that are most similar to the query. + */ + async similaritySearchWithScore( + query: string, + k = 4, + filter: CouchbaseQueryVectorStoreFilter = {} + ): Promise<[Document, number][]> { + const queryEmbeddings = await this.embeddings.embedQuery(query); + const docsWithScore = await this.similaritySearchVectorWithScore( + queryEmbeddings, + k, + filter + ); + return docsWithScore; + } + + /** + * upsert documents asynchronously into a couchbase collection + * @param documentsToInsert Documents to be inserted into couchbase collection with embeddings, original text and metadata + * @returns DocIds of the inserted documents + */ + private async upsertDocuments( + documentsToInsert: { + [x: string]: any; + }[] + ) { + // Create promises for each document to be upserted + const upsertDocumentsPromises = documentsToInsert.map((document) => { + const currentDocumentKey = Object.keys(document)[0]; + return this._collection + .upsert(currentDocumentKey, document[currentDocumentKey]) + .then(() => currentDocumentKey) + .catch((e: any) => { + throw new Error(`Upsert failed with error: ${e}`); + }); + }); + + // Upsert all documents asynchronously + const docIds = await Promise.all(upsertDocumentsPromises); + const successfulDocIds: string[] = []; + for (const id of docIds) { + if (id) { + successfulDocIds.push(id); + } + } + return successfulDocIds; + } + + /** + * Create a new vector index for the Query vector store. + * + * @param options - Configuration options for creating the index + * @param options.indexType - Type of the index (HYPERSCALE or COMPOSITE) to create + * @param options.indexDescription - Description of the index like "IVF,SQ8" + * @param options.distanceMetric - Distance metric to use for the index. Defaults to the distance metric in the constructor + * @param options.indexName - Name of the index to create. Defaults to "langchain_{indexType}_query_index" + * @param options.vectorField - Name of the vector field to use for the index. Defaults to the embedding key in the constructor + * @param options.vectorDimension - Dimension of the vector field. If not provided, it will be determined from the embedding object + * @param options.fields - List of fields to include in the index. 
Defaults to the text field in the constructor + * @param options.whereClause - Optional where clause to filter the documents to index + * @param options.indexScanNprobes - Number of probes to use for the index + * @param options.indexTrainlist - Number of training samples to use for the index + * + * @throws {Error} If index creation fails or invalid parameters are provided + */ + async createIndex(options: CreateIndexOptions): Promise { + const { + indexType, + indexDescription, + distanceMetric, + indexName, + vectorField, + vectorDimension, + fields, + whereClause, + indexScanNprobes, + indexTrainlist, + } = options; + + if (!Object.values(IndexType).includes(indexType)) { + throw new Error( + `Invalid index type. Got ${indexType}. Expected one of: ${Object.values( + IndexType + ).join(", ")}` + ); + } + + if (!indexDescription) { + throw new Error( + "Index description is required for creating Vector Query index." + ); + } + + const similarityMetric = distanceMetric || this.distanceStrategy; + const vectorFieldName = vectorField || this.embeddingKey; + + // Get the vector dimension for the index + let vectorDim = vectorDimension; + if (!vectorDim) { + try { + const testEmbedding = await this.embeddings.embedQuery( + "check the size of the vector embeddings" + ); + vectorDim = testEmbedding.length; + } catch (e) { + throw new Error( + "Vector dimension is required for creating Query index. " + + "Unable to determine the dimension from the embedding object. " + + `Error: ${e}` + ); + } + } + + // Create the index parameters for the index creation query + const indexParams: Record = { + dimension: vectorDim, + similarity: similarityMetric, + description: indexDescription, + }; + + if (indexScanNprobes) { + indexParams.scan_nprobes = indexScanNprobes; + } + if (indexTrainlist) { + indexParams.train_list = indexTrainlist; + } + + // Add the text field to the fields if empty or if it is not present + const includeFields = fields || [this.textKey]; + if (!includeFields.includes(this.textKey)) { + includeFields.push(this.textKey); + } + + // Build where clause if provided + const whereClauseStr = whereClause ? 
`WHERE ${whereClause}` : ""; + + // Convert index params to WITH clause format + const withClause = `WITH ${JSON.stringify(indexParams).replace(/"/g, "'")}`; + + let indexQuery: string; + let finalIndexName: string; + + if (indexType === IndexType.HYPERSCALE) { + finalIndexName = indexName || "langchain_hyperscale_query_index"; + // HYPERSCALE: Specialized vector index with INCLUDE clause for additional fields + indexQuery = + `CREATE VECTOR INDEX \`${finalIndexName}\` ON \`${this.bucketName}\`.\`${this.scopeName}\`.\`${this.collectionName}\` ` + + `(\`${vectorFieldName}\` VECTOR) INCLUDE (${includeFields + .map((f) => `\`${f}\``) + .join(", ")}) ` + + `${whereClauseStr} USING GSI ${withClause}`; + } else if (indexType === IndexType.COMPOSITE) { + finalIndexName = indexName || "langchain_composite_query_index"; + // COMPOSITE: General GSI index that includes vector field alongside other fields with VECTOR keyword + indexQuery = + `CREATE INDEX \`${finalIndexName}\` ON \`${this.bucketName}\`.\`${this.scopeName}\`.\`${this.collectionName}\` ` + + `(${includeFields + .map((f) => `\`${f}\``) + .join(", ")}, \`${vectorFieldName}\` VECTOR) ` + + `${whereClauseStr} USING GSI ${withClause}`; + } else { + throw new Error(`Unsupported index type: ${indexType}`); + } + + try { + await this.cluster.query(indexQuery); + } catch (e) { + if ( + e && + typeof e === "object" && + "cause" in e && + e.cause && + typeof e.cause === "object" && + "first_error_message" in e.cause + ) { + throw new Error( + `Index creation failed with error: ${e.cause.first_error_message}` + ); + } + throw new Error(`Index creation failed with error: ${e}`); + } + } + + /** + * Static method to create a new CouchbaseQueryVectorStore from an array of texts. + * It first converts the texts to vectors using the embeddings and then creates a new vector store. + * + * @param texts - Array of texts to be converted to vectors. + * @param metadatas - Array of metadata objects corresponding to the texts. + * @param embeddings - Embeddings to be used for converting texts to vectors. + * @param config - Configuration for the vector store. + * + * @returns - Promise that resolves to a new CouchbaseQueryVectorStore instance. + */ + static async fromTexts( + texts: string[], + metadatas: object[] | object, + embeddings: EmbeddingsInterface, + config: CouchbaseQueryVectorStoreArgs + ): Promise { + const docs: Document[] = []; + for (let i = 0; i < texts.length; i += 1) { + const metadata = Array.isArray(metadatas) ? metadatas[i] : metadatas; + const newDoc = new Document({ + pageContent: texts[i], + metadata, + }); + docs.push(newDoc); + } + return CouchbaseQueryVectorStore.fromDocuments(docs, embeddings, config); + } + + /** + * Static method to create a new CouchbaseQueryVectorStore from an array of documents. + * It first converts the documents to vectors using the embeddings and then creates a new vector store. + * + * @param docs - Array of documents to be converted to vectors. + * @param embeddings - Embeddings to be used for converting documents to vectors. + * @param config - Configuration for the vector store. + * + * @returns - Promise that resolves to a new CouchbaseQueryVectorStore instance. 
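+ *
+ * A brief sketch of the expected call shape, assuming an existing `cluster`
+ * connection and an embeddings implementation such as `OpenAIEmbeddings`; the
+ * bucket, scope, and collection names below are placeholders.
+ * @example
+ * ```typescript
+ * const store = await CouchbaseQueryVectorStore.fromDocuments(
+ *   [new Document({ pageContent: "some text", metadata: { source: "demo" } })],
+ *   new OpenAIEmbeddings(),
+ *   {
+ *     cluster,
+ *     bucketName: "my-bucket",
+ *     scopeName: "_default",
+ *     collectionName: "_default",
+ *   }
+ * );
+ * ```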
+ */ + static async fromDocuments( + docs: Document[], + embeddings: EmbeddingsInterface, + config: CouchbaseQueryVectorStoreArgs + ): Promise { + const instance = await CouchbaseQueryVectorStore.initialize( + embeddings, + config + ); + await instance.addDocuments(docs); + return instance; + } +} diff --git a/libs/langchain-community/src/vectorstores/couchbase_search.ts b/libs/langchain-community/src/vectorstores/couchbase_search.ts index 350ee0414180..24e47a1084f8 100644 --- a/libs/langchain-community/src/vectorstores/couchbase_search.ts +++ b/libs/langchain-community/src/vectorstores/couchbase_search.ts @@ -1,4 +1,3 @@ -/* eslint-disable no-param-reassign */ /* eslint-disable @typescript-eslint/no-explicit-any */ /* eslint-disable import/no-extraneous-dependencies */ import { EmbeddingsInterface } from "@langchain/core/embeddings"; diff --git a/libs/langchain-community/src/vectorstores/tests/couchbase_query.test.ts b/libs/langchain-community/src/vectorstores/tests/couchbase_query.test.ts new file mode 100644 index 000000000000..f23decff9352 --- /dev/null +++ b/libs/langchain-community/src/vectorstores/tests/couchbase_query.test.ts @@ -0,0 +1,608 @@ +/* eslint-disable @typescript-eslint/no-explicit-any */ +import { + describe, + test, + beforeEach, + afterAll, + expect, + beforeAll, +} from "@jest/globals"; +import { Cluster } from "couchbase"; +import { OpenAIEmbeddings } from "@langchain/openai"; +import { Document } from "@langchain/core/documents"; +import { faker } from "@faker-js/faker"; + +import { + CouchbaseQueryVectorStore, + CouchbaseQueryVectorStoreArgs, + DistanceStrategy, + IndexType, +} from "../couchbase_query.js"; + +describe.skip("CouchbaseQueryVectorStore", () => { + // Configuration + const config = { + // **Note** user must have permissions to create buckets and indexes, and must be able to flush buckets + // unfortunately, Couchbase Capella doesn't support this level of access for database users, + // so these tests must run against a local Couchbase server + cluster: process.env.COUCHBASE_CLUSTER || "couchbase://localhost", + username: process.env.COUCHBASE_USERNAME || "Administrator", + password: process.env.COUCHBASE_PASSWORD || "password", + bucketName: "test-bucket", + indexTestBucketName: "test-index-bucket", + scopeName: "_default", + collectionName: "_default", + textKey: "text", + embeddingKey: "embedding", + distanceStrategy: DistanceStrategy.COSINE, + }; + + let cluster: Cluster; + let store: CouchbaseQueryVectorStore; + let indexTestStore: CouchbaseQueryVectorStore; + let embeddings: OpenAIEmbeddings; + + beforeAll(async () => { + // Create embeddings instance + embeddings = new OpenAIEmbeddings({ + openAIApiKey: process.env.OPENAI_API_KEY, + }); + + // Connect to Couchbase + cluster = await Cluster.connect(config.cluster, { + username: config.username, + password: config.password, + }); + + // Create bucket if it doesn't exist + try { + const buckets = await cluster.buckets().getAllBuckets(); + if (!buckets.some((bucket) => bucket.name === config.bucketName)) { + await cluster.buckets().createBucket({ + name: config.bucketName, + ramQuotaMB: 2000, + flushEnabled: true, + }); + } + // create a separate bucket for index testing + if ( + !buckets.some((bucket) => bucket.name === config.indexTestBucketName) + ) { + await cluster.buckets().createBucket({ + name: config.indexTestBucketName, + ramQuotaMB: 2000, + flushEnabled: true, + }); + } + } catch (err: any) { + if (err.code !== 605) { + // 605 is bucket_exists error + console.error("Error creating 
bucket:", err); + throw err; + } + } + }); + + beforeEach(async () => { + try { + await cluster.buckets().flushBucket(config.bucketName); + } catch (error: any) { + console.warn("Could not flush bucket during cleanup:", error.message); + } + // Initialize store + try { + const storeConfig: CouchbaseQueryVectorStoreArgs = { + cluster, + bucketName: config.bucketName, + scopeName: config.scopeName, + collectionName: config.collectionName, + textKey: config.textKey, + embeddingKey: config.embeddingKey, + }; + + store = await CouchbaseQueryVectorStore.initialize( + embeddings, + storeConfig + ); + + const indexTestStoreConfig: CouchbaseQueryVectorStoreArgs = { + cluster, + bucketName: config.indexTestBucketName, + scopeName: config.scopeName, + collectionName: config.collectionName, + textKey: config.textKey, + embeddingKey: config.embeddingKey, + }; + + indexTestStore = await CouchbaseQueryVectorStore.initialize( + embeddings, + indexTestStoreConfig + ); + } catch (error) { + console.error("Failed to initialize test suite:", error); + throw error; + } + }); + + afterAll(async () => { + if (cluster) { + try { + await cluster.buckets().flushBucket(config.bucketName); + } catch (error: any) { + console.warn( + "Could not flush bucket during aterAll cleanup:", + error.message + ); + } + await cluster.close(); + } + }); + + // Helper function to create test data + const createTestData = (count: number) => { + const texts = Array.from({ length: count }, () => faker.lorem.paragraph()); + const metadatas = Array.from({ length: count }, () => ({ + source: faker.system.fileName(), + author: faker.person.fullName(), + })); + return { texts, metadatas }; + }; + + // Helper function to create bulk test data for index training + const createBulkTestData = (count: number) => { + const documents = []; + for (let i = 0; i < count; i += 1) { + documents.push( + new Document({ + pageContent: `Document ${i}: ${faker.hacker.phrase()}! 
${faker.company.catchPhrase()}`, + metadata: { + source: "bulk_test", + index: i, + category: faker.helpers.arrayElement([ + "tech", + "business", + "science", + "art", + ]), + rating: faker.number.int({ min: 1, max: 5 }), + }, + }) + ); + } + return documents; + }; + + // Helper function to add documents in batches for better performance + const addDocumentsInBatches = async ( + documents: Document[], + batchSize = 50 + ) => { + const allIds = []; + for (let i = 0; i < documents.length; i += batchSize) { + const batch = documents.slice(i, i + batchSize); + const ids = await indexTestStore.addDocuments(batch); + allIds.push(...ids); + } + return allIds; + }; + + describe("Initialization", () => { + test("should initialize with default values", async () => { + expect(store).toBeDefined(); + expect(store.embeddings).toBeDefined(); + }); + }); + + describe("Document Operations", () => { + test("should add documents with metadata", async () => { + const { texts, metadatas } = createTestData(2); + const documents = texts.map( + (text, i) => new Document({ pageContent: text, metadata: metadatas[i] }) + ); + + const ids = await store.addDocuments(documents); + expect(ids).toHaveLength(2); + + const results = await store.similaritySearch(texts[0], 1); + expect(results).toHaveLength(1); + expect(results[0].pageContent).toBe(texts[0]); + expect(results[0].metadata).toEqual(metadatas[0]); + }); + + test("should add documents with custom IDs", async () => { + const { texts } = createTestData(2); + const documents = texts.map( + (text) => new Document({ pageContent: text }) + ); + const customIds = ["doc1", "doc2"]; + + const ids = await store.addDocuments(documents, { ids: customIds }); + expect(ids).toEqual(customIds); + + const results = await store.similaritySearch(texts[0], 1); + expect(results).toHaveLength(1); + expect(results[0].pageContent).toBe(texts[0]); + }); + + test("should delete documents", async () => { + const { texts } = createTestData(2); + const documents = texts.map( + (text) => new Document({ pageContent: text }) + ); + + const ids = await store.addDocuments(documents); + expect(ids).toHaveLength(2); + + await store.delete({ ids }); + const results = await store.similaritySearch(texts[0], 1); + expect(results).toHaveLength(0); + }); + }); + + describe("Search Operations", () => { + test("should perform similarity search", async () => { + const { texts } = createTestData(2); + const documents = texts.map( + (text) => new Document({ pageContent: text }) + ); + + await store.addDocuments(documents); + + const results = await store.similaritySearch(texts[0], 1); + expect(results).toHaveLength(1); + expect(results[0].pageContent).toBe(texts[0]); + }); + + test("should perform similarity search with score", async () => { + const { texts } = createTestData(2); + const documents = texts.map( + (text) => new Document({ pageContent: text }) + ); + + await store.addDocuments(documents); + + const results = await store.similaritySearchWithScore(texts[0], 1); + expect(results).toHaveLength(1); + expect(results[0][0].pageContent).toBe(texts[0]); + expect(typeof results[0][1]).toBe("number"); + }); + + test("should perform similarity search by vector", async () => { + const { texts } = createTestData(2); + const documents = texts.map( + (text) => new Document({ pageContent: text }) + ); + + await store.addDocuments(documents); + + const queryEmbedding = await embeddings.embedQuery(texts[0]); + const results = await store.similaritySearchByVector(queryEmbedding, 1); + 
expect(results).toHaveLength(1); + expect(results[0].pageContent).toBe(texts[0]); + }); + + test("should perform similarity search with filters", async () => { + const { texts, metadatas } = createTestData(2); + const documents = texts.map( + (text, i) => new Document({ pageContent: text, metadata: metadatas[i] }) + ); + + await store.addDocuments(documents); + + const results = await store.similaritySearch(texts[0], 1, { + fields: ["text", "metadata.author"], + }); + expect(results).toHaveLength(1); + expect(results[0].pageContent).toBe(texts[0]); + expect(results[0].metadata.author).toBe(metadatas[0].author); + }); + }); + + describe("Factory Methods", () => { + test("should create store from texts", async () => { + const { texts, metadatas } = createTestData(2); + + const newStore = await CouchbaseQueryVectorStore.fromTexts( + texts, + metadatas, + embeddings, + { + cluster, + bucketName: config.bucketName, + scopeName: config.scopeName, + collectionName: config.collectionName, + textKey: config.textKey, + embeddingKey: config.embeddingKey, + } + ); + + const results = await newStore.similaritySearch(texts[0], 1); + expect(results).toHaveLength(1); + expect(results[0].pageContent).toBe(texts[0]); + expect(results[0].metadata).toEqual(metadatas[0]); + }); + + test("should create store from documents", async () => { + const { texts, metadatas } = createTestData(2); + const documents = texts.map( + (text, i) => new Document({ pageContent: text, metadata: metadatas[i] }) + ); + + const newStore = await CouchbaseQueryVectorStore.fromDocuments( + documents, + embeddings, + { + cluster, + bucketName: config.bucketName, + scopeName: config.scopeName, + collectionName: config.collectionName, + textKey: config.textKey, + embeddingKey: config.embeddingKey, + } + ); + + const results = await newStore.similaritySearch(texts[0], 1); + expect(results).toHaveLength(1); + expect(results[0].pageContent).toBe(texts[0]); + expect(results[0].metadata).toEqual(metadatas[0]); + }); + }); + + describe("Index Creation", () => { + const MINIMUM_DOCS_FOR_TRAINING = 1200; // Slightly above the 1024 minimum + let bulkDocumentIds: string[] = []; + + beforeAll(async () => { + // Create bulk test data + const bulkDocuments = createBulkTestData(MINIMUM_DOCS_FOR_TRAINING); + + // Add documents in batches for better performance + bulkDocumentIds = await addDocumentsInBatches(bulkDocuments, 100); + }); + + afterAll(async () => { + // Clean up bulk documents + if (bulkDocumentIds.length > 0) { + try { + await indexTestStore.delete({ ids: bulkDocumentIds }); + } catch (error) { + console.warn("Error cleaning up bulk documents:", error); + } + } + + // Clean up indexes + await dropAllIndexesWithManager(cluster, config.indexTestBucketName); + }); + + async function dropAllIndexesWithManager( + cluster: Cluster, + bucketName: string + ) { + const queryIndexManager = cluster.queryIndexes(); + + try { + // Get all indexes + const indexes = await queryIndexManager.getAllIndexes(bucketName); + + // Drop all secondary indexes + for (const index of indexes) { + if (!index.isPrimary) { + await queryIndexManager.dropIndex(bucketName, index.name); + } + } + } catch (error) { + console.error("Error:", error); + } + } + + test("should create HYPERSCALE vector index", async () => { + const createHyperscaleIndexOptions = { + indexType: IndexType.HYPERSCALE, + indexDescription: "IVF1024,SQ8", + distanceMetric: DistanceStrategy.COSINE, + indexName: "my_hyperscale_vector_index", + vectorDimension: 1536, + fields: ["text", "metadata"], + 
whereClause: "metadata.source = 'bulk_test'", + indexScanNprobes: 10, + indexTrainlist: 1024, + }; + + // Test that createIndex doesn't throw an error + await expect( + indexTestStore.createIndex(createHyperscaleIndexOptions) + ).resolves.not.toThrow(); + + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect( + indexes.some( + (index) => index.name === createHyperscaleIndexOptions.indexName + ) + ).toBe(true); + }); + + test("should create COMPOSITE vector index", async () => { + const createCompositeIndexOptions = { + indexType: IndexType.COMPOSITE, + indexDescription: "IVF1024,SQ8", + distanceMetric: DistanceStrategy.COSINE, + indexName: "my_composite_vector_index", + vectorDimension: 1536, + fields: ["text", "metadata.category"], + whereClause: "metadata.source = 'bulk_test'", + indexScanNprobes: 3, + indexTrainlist: 1024, + }; + + // Test that createIndex doesn't throw an error + await expect( + indexTestStore.createIndex(createCompositeIndexOptions) + ).resolves.not.toThrow(); + + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect( + indexes.some( + (index) => index.name === createCompositeIndexOptions.indexName + ) + ).toBe(true); + }); + + test("should create index with minimal options", async () => { + const minimalOptions = { + indexType: IndexType.HYPERSCALE, + indexDescription: "IVF,SQ8", + indexName: "minimal_options_index", + whereClause: "metadata.source = 'bulk_test'", + }; + + // Test that createIndex works with minimal options + await expect( + indexTestStore.createIndex(minimalOptions) + ).resolves.not.toThrow(); + + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect( + indexes.some((index) => index.name === minimalOptions.indexName) + ).toBe(true); + }); + + test("should auto-detect vector dimension from embeddings", async () => { + const optionsWithoutDimension = { + indexType: IndexType.HYPERSCALE, + indexDescription: "IVF,SQ8", + indexName: "auto_dimension_index", + whereClause: "metadata.source = 'bulk_test'", + }; + + // Test that createIndex works without specifying dimension + await expect( + indexTestStore.createIndex(optionsWithoutDimension) + ).resolves.not.toThrow(); + + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect( + indexes.some( + (index) => index.name === optionsWithoutDimension.indexName + ) + ).toBe(true); + }); + + test("should handle index creation errors gracefully", async () => { + const invalidOptions = { + indexType: IndexType.HYPERSCALE, + indexDescription: "", // Empty description should cause an error + indexName: "invalid_index", + }; + + // Test that createIndex handles errors gracefully + await expect( + indexTestStore.createIndex(invalidOptions) + ).rejects.toThrow(); + + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect( + indexes.some((index) => index.name === invalidOptions.indexName) + ).toBe(false); + }); + + test("should create both HYPERSCALE and COMPOSITE indexes sequentially", async () => { + const createHyperscaleIndexOptions = { + indexType: IndexType.HYPERSCALE, + indexDescription: "IVF1024,SQ8", + distanceMetric: DistanceStrategy.COSINE, + indexName: "sequential_hyperscale_index", + whereClause: "metadata.source = 'bulk_test'", + }; + + const createCompositeIndexOptions = { + indexType: IndexType.COMPOSITE, + indexDescription: "IVF1024,SQ8", + distanceMetric: 
DistanceStrategy.COSINE, + indexName: "sequential_composite_index", + whereClause: "metadata.source = 'bulk_test'", + }; + + // Test creating both index types sequentially + await expect( + indexTestStore.createIndex(createHyperscaleIndexOptions) + ).resolves.not.toThrow(); + await expect( + indexTestStore.createIndex(createCompositeIndexOptions) + ).resolves.not.toThrow(); + + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect( + indexes.some( + (index) => index.name === createHyperscaleIndexOptions.indexName + ) + ).toBe(true); + expect( + indexes.some( + (index) => index.name === createCompositeIndexOptions.indexName + ) + ).toBe(true); + }); + + test("should use default distance strategy when not specified", async () => { + const optionsWithoutDistance = { + indexType: IndexType.HYPERSCALE, + indexDescription: "IVF,SQ8", + indexName: "default_distance_index", + whereClause: "metadata.source = 'bulk_test'", + }; + + // Test that createIndex uses default distance strategy + await expect( + indexTestStore.createIndex(optionsWithoutDistance) + ).resolves.not.toThrow(); + + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect( + indexes.some((index) => index.name === optionsWithoutDistance.indexName) + ).toBe(true); + }); + + test("should handle different distance strategies", async () => { + const distanceStrategies = [ + DistanceStrategy.DOT, + DistanceStrategy.EUCLIDEAN, + DistanceStrategy.COSINE, + DistanceStrategy.EUCLIDEAN_SQUARED, + ]; + + for (let i = 0; i < distanceStrategies.length; i += 1) { + const options = { + indexType: IndexType.HYPERSCALE, + indexDescription: "IVF,SQ8", + distanceMetric: distanceStrategies[i], + indexName: `distance_test_index_${i}`, + whereClause: "metadata.source = 'bulk_test'", + }; + + await expect( + indexTestStore.createIndex(options) + ).resolves.not.toThrow(); + const indexes = await cluster + .queryIndexes() + .getAllIndexes(config.indexTestBucketName); + expect(indexes.some((index) => index.name === options.indexName)).toBe( + true + ); + } + }, 60000); + }); +}); diff --git a/libs/langchain-community/src/vectorstores/tests/couchbase_search.int.test.ts b/libs/langchain-community/src/vectorstores/tests/couchbase_search.int.test.ts index fff633a64d6f..5e0ce903c135 100644 --- a/libs/langchain-community/src/vectorstores/tests/couchbase_search.int.test.ts +++ b/libs/langchain-community/src/vectorstores/tests/couchbase_search.int.test.ts @@ -1,5 +1,4 @@ /* eslint-disable @typescript-eslint/no-explicit-any */ -/* eslint-disable no-process-env */ import { describe, test,