From 1f9fd98c6f4a481c168c4a29329dbff9d30a78c2 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Thu, 18 Jul 2024 20:41:19 +0700 Subject: [PATCH 01/17] feat: add filter for query in ts templates --- .../engines/typescript/agent/chat.ts | 38 ++++++++++++++++++- .../typescript/documents/pipeline.ts | 1 + .../typescript/streaming/annotations.ts | 4 +- .../llamaindex/typescript/streaming/events.ts | 6 +-- .../types/streaming/express/package.json | 2 +- .../types/streaming/nextjs/next.config.mjs | 3 +- templates/types/streaming/nextjs/package.json | 2 +- 7 files changed, 46 insertions(+), 10 deletions(-) diff --git a/templates/components/engines/typescript/agent/chat.ts b/templates/components/engines/typescript/agent/chat.ts index 516b92517..77e425462 100644 --- a/templates/components/engines/typescript/agent/chat.ts +++ b/templates/components/engines/typescript/agent/chat.ts @@ -1,4 +1,9 @@ -import { BaseToolWithCall, OpenAIAgent, QueryEngineTool } from "llamaindex"; +import { + BaseToolWithCall, + MetadataFilters, + OpenAIAgent, + QueryEngineTool, +} from "llamaindex"; import fs from "node:fs/promises"; import path from "node:path"; import { getDataSource } from "./index"; @@ -14,7 +19,7 @@ export async function createChatEngine(documentIds?: string[]) { tools.push( new QueryEngineTool({ queryEngine: index.asQueryEngine({ - preFilters: undefined, // TODO: Add filters once LITS supports it (getQueryFilters) + preFilters: generateFilters(documentIds || []), }), metadata: { name: "data_query_engine", @@ -41,3 +46,32 @@ export async function createChatEngine(documentIds?: string[]) { systemPrompt: process.env.SYSTEM_PROMPT, }); } + +function generateFilters(documentIds: string[]): MetadataFilters | undefined { + if (!documentIds.length) { + return { + filters: [ + { + key: "private", + value: ["true"], + operator: "nin", + }, + ], + }; + } + return { + filters: [ + { + key: "private", + value: "true", + operator: "!=", + }, + { + key: "doc_id", + value: documentIds, + operator: "in", + }, + ], + condition: "or", + }; +} diff --git a/templates/components/llamaindex/typescript/documents/pipeline.ts b/templates/components/llamaindex/typescript/documents/pipeline.ts index c5353efd7..436410cd1 100644 --- a/templates/components/llamaindex/typescript/documents/pipeline.ts +++ b/templates/components/llamaindex/typescript/documents/pipeline.ts @@ -18,6 +18,7 @@ export async function runPipeline(documents: Document[], filename: string) { for (const document of documents) { document.metadata = { ...document.metadata, + doc_id: document.id_, file_name: filename, private: "true", // to separate from other public documents }; diff --git a/templates/components/llamaindex/typescript/streaming/annotations.ts b/templates/components/llamaindex/typescript/streaming/annotations.ts index dba4bfae4..211886a1e 100644 --- a/templates/components/llamaindex/typescript/streaming/annotations.ts +++ b/templates/components/llamaindex/typescript/streaming/annotations.ts @@ -35,9 +35,9 @@ export function retrieveDocumentIds(annotations?: JSONValue[]): string[] { ) { const files = data.files as DocumentFile[]; for (const file of files) { - if (Array.isArray(file.content)) { + if (Array.isArray(file.content.value)) { // it's an array, so it's an array of doc IDs - for (const id of file.content) { + for (const id of file.content.value) { ids.push(id); } } diff --git a/templates/components/llamaindex/typescript/streaming/events.ts b/templates/components/llamaindex/typescript/streaming/events.ts index 41f3ec991..054e10e01 100644 --- a/templates/components/llamaindex/typescript/streaming/events.ts +++ b/templates/components/llamaindex/typescript/streaming/events.ts @@ -77,7 +77,7 @@ export function createCallbackManager(stream: StreamData) { const callbackManager = new CallbackManager(); callbackManager.on("retrieve-end", async (data) => { - const { nodes, query } = data.detail.payload; + const { nodes, query } = data.detail; await appendSourceData(stream, nodes); appendEventData(stream, `Retrieving context for query: '${query}'`); appendEventData( @@ -87,7 +87,7 @@ export function createCallbackManager(stream: StreamData) { }); callbackManager.on("llm-tool-call", (event) => { - const { name, input } = event.detail.payload.toolCall; + const { name, input } = event.detail.toolCall; const inputString = Object.entries(input) .map(([key, value]) => `${key}: ${value}`) .join(", "); @@ -98,7 +98,7 @@ export function createCallbackManager(stream: StreamData) { }); callbackManager.on("llm-tool-result", (event) => { - const { toolCall, toolResult } = event.detail.payload; + const { toolCall, toolResult } = event.detail; appendToolData(stream, toolCall, toolResult); }); diff --git a/templates/types/streaming/express/package.json b/templates/types/streaming/express/package.json index 0d4055968..cb8c6d469 100644 --- a/templates/types/streaming/express/package.json +++ b/templates/types/streaming/express/package.json @@ -20,7 +20,7 @@ "dotenv": "^16.3.1", "duck-duck-scrape": "^2.2.5", "express": "^4.18.2", - "llamaindex": "0.4.14", + "llamaindex": "0.5.5", "pdf2json": "3.0.5", "ajv": "^8.12.0", "@e2b/code-interpreter": "^0.0.5", diff --git a/templates/types/streaming/nextjs/next.config.mjs b/templates/types/streaming/nextjs/next.config.mjs index 124122bfa..7e4bf29ab 100644 --- a/templates/types/streaming/nextjs/next.config.mjs +++ b/templates/types/streaming/nextjs/next.config.mjs @@ -1,8 +1,9 @@ /** @type {import('next').NextConfig} */ import fs from "fs"; +import withLlamaIndex from "llamaindex/next"; import webpack from "./webpack.config.mjs"; const nextConfig = JSON.parse(fs.readFileSync("./next.config.json", "utf-8")); nextConfig.webpack = webpack; -export default nextConfig; +export default withLlamaIndex(nextConfig); diff --git a/templates/types/streaming/nextjs/package.json b/templates/types/streaming/nextjs/package.json index 293f7fecc..799f318d1 100644 --- a/templates/types/streaming/nextjs/package.json +++ b/templates/types/streaming/nextjs/package.json @@ -24,7 +24,7 @@ "duck-duck-scrape": "^2.2.5", "formdata-node": "^6.0.3", "got": "^14.4.1", - "llamaindex": "0.4.14", + "llamaindex": "0.5.5", "lucide-react": "^0.294.0", "next": "^14.2.4", "react": "^18.2.0", From 3b9a4780a0c941a1a6c06cad3317a23eca74e580 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Fri, 19 Jul 2024 12:36:20 +0700 Subject: [PATCH 02/17] fix: use != operator to filter public documents --- templates/components/engines/typescript/agent/chat.ts | 4 ++-- templates/types/streaming/express/package.json | 2 +- templates/types/streaming/nextjs/package.json | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/templates/components/engines/typescript/agent/chat.ts b/templates/components/engines/typescript/agent/chat.ts index 77e425462..e63cd7ce5 100644 --- a/templates/components/engines/typescript/agent/chat.ts +++ b/templates/components/engines/typescript/agent/chat.ts @@ -53,8 +53,8 @@ function generateFilters(documentIds: string[]): MetadataFilters | undefined { filters: [ { key: "private", - value: ["true"], - operator: "nin", + value: "true", + operator: "!=", }, ], }; diff --git a/templates/types/streaming/express/package.json b/templates/types/streaming/express/package.json index cb8c6d469..9a3d982ad 100644 --- a/templates/types/streaming/express/package.json +++ b/templates/types/streaming/express/package.json @@ -20,7 +20,7 @@ "dotenv": "^16.3.1", "duck-duck-scrape": "^2.2.5", "express": "^4.18.2", - "llamaindex": "0.5.5", + "llamaindex": "0.5.6", "pdf2json": "3.0.5", "ajv": "^8.12.0", "@e2b/code-interpreter": "^0.0.5", diff --git a/templates/types/streaming/nextjs/package.json b/templates/types/streaming/nextjs/package.json index 799f318d1..8c51b50df 100644 --- a/templates/types/streaming/nextjs/package.json +++ b/templates/types/streaming/nextjs/package.json @@ -24,7 +24,7 @@ "duck-duck-scrape": "^2.2.5", "formdata-node": "^6.0.3", "got": "^14.4.1", - "llamaindex": "0.5.5", + "llamaindex": "0.5.6", "lucide-react": "^0.294.0", "next": "^14.2.4", "react": "^18.2.0", From 059bc1ac9e00c36e9b2faa39974361aac36e0d21 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Fri, 19 Jul 2024 12:36:42 +0700 Subject: [PATCH 03/17] Create curvy-penguins-work.md --- .changeset/curvy-penguins-work.md | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 .changeset/curvy-penguins-work.md diff --git a/.changeset/curvy-penguins-work.md b/.changeset/curvy-penguins-work.md new file mode 100644 index 000000000..9078602e6 --- /dev/null +++ b/.changeset/curvy-penguins-work.md @@ -0,0 +1,5 @@ +--- +"create-llama": patch +--- + +add filter for query in ts templates From 4e240757c40cf48b0785310362d9e150100b6cc3 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Fri, 19 Jul 2024 13:04:14 +0700 Subject: [PATCH 04/17] feat: use next config without withLlamaIndex for fullstack template --- helpers/typescript.ts | 7 +++++++ templates/types/streaming/nextjs/next.config.simple.mjs | 8 ++++++++ 2 files changed, 15 insertions(+) create mode 100644 templates/types/streaming/nextjs/next.config.simple.mjs diff --git a/helpers/typescript.ts b/helpers/typescript.ts index def104da4..3ebd486dc 100644 --- a/helpers/typescript.ts +++ b/helpers/typescript.ts @@ -55,6 +55,12 @@ export const installTSTemplate = async ({ nextConfigJson.output = "export"; nextConfigJson.images = { unoptimized: true }; console.log("\nUsing static site generation\n"); + + // if having backend, copy overwrite next.config.simple.mjs to next.config.mjs + await fs.copyFile( + path.join(root, "next.config.simple.mjs"), + path.join(root, "next.config.mjs"), + ); } else { if (vectorDb === "milvus") { nextConfigJson.experimental.serverComponentsExternalPackages = @@ -64,6 +70,7 @@ export const installTSTemplate = async ({ ); } } + await fs.rm(path.join(root, "next.config.simple.mjs")); await fs.writeFile( nextConfigJsonFile, JSON.stringify(nextConfigJson, null, 2) + os.EOL, diff --git a/templates/types/streaming/nextjs/next.config.simple.mjs b/templates/types/streaming/nextjs/next.config.simple.mjs new file mode 100644 index 000000000..124122bfa --- /dev/null +++ b/templates/types/streaming/nextjs/next.config.simple.mjs @@ -0,0 +1,8 @@ +/** @type {import('next').NextConfig} */ +import fs from "fs"; +import webpack from "./webpack.config.mjs"; + +const nextConfig = JSON.parse(fs.readFileSync("./next.config.json", "utf-8")); +nextConfig.webpack = webpack; + +export default nextConfig; From ae3be5d113e7daf299a52e732852fd88b5f8fac3 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Fri, 19 Jul 2024 16:23:48 +0700 Subject: [PATCH 05/17] download llamacloud file from nodes --- .../llamaindex/typescript/streaming/events.ts | 6 +- .../typescript/streaming/service.ts | 135 ++++++++++-------- 2 files changed, 78 insertions(+), 63 deletions(-) diff --git a/templates/components/llamaindex/typescript/streaming/events.ts b/templates/components/llamaindex/typescript/streaming/events.ts index 054e10e01..18a22791d 100644 --- a/templates/components/llamaindex/typescript/streaming/events.ts +++ b/templates/components/llamaindex/typescript/streaming/events.ts @@ -84,6 +84,7 @@ export function createCallbackManager(stream: StreamData) { stream, `Retrieved ${nodes.length} sources to use as context for the query`, ); + LLamaCloudFileService.downloadFiles(nodes); // don't await to avoid blocking chat streaming }); callbackManager.on("llm-tool-call", (event) => { @@ -118,9 +119,8 @@ async function getNodeUrl(metadata: Metadata) { const pipelineId = metadata["pipeline_id"]; if (pipelineId && !isLocalFile) { // file is from LlamaCloud and was not ingested locally - // TODO trigger but don't await file download and just use convention to generate the URL (see Python code) - // return `${process.env.FILESERVER_URL_PREFIX}/output/llamacloud/${pipelineId}\$${fileName}`; - return await LLamaCloudFileService.getFileUrl(fileName, pipelineId); + const name = LLamaCloudFileService.toDownloadedName(pipelineId, fileName); + return `${process.env.FILESERVER_URL_PREFIX}/output/llamacloud/${name}`; } const isPrivate = metadata["private"] === "true"; const folder = isPrivate ? "output/uploaded" : "data"; diff --git a/templates/components/llamaindex/typescript/streaming/service.ts b/templates/components/llamaindex/typescript/streaming/service.ts index e6e02a871..c3176c72e 100644 --- a/templates/components/llamaindex/typescript/streaming/service.ts +++ b/templates/components/llamaindex/typescript/streaming/service.ts @@ -1,86 +1,66 @@ +import { Metadata, NodeWithScore } from "llamaindex"; import fs from "node:fs"; import https from "node:https"; import path from "node:path"; const LLAMA_CLOUD_OUTPUT_DIR = "output/llamacloud"; const LLAMA_CLOUD_BASE_URL = "https://cloud.llamaindex.ai/api/v1"; +const FILE_DELIMITER = "$"; // delimiter between pipelineId and filename -export interface LlamaCloudFile { +interface LlamaCloudFile { name: string; file_id: string; project_id: string; } export class LLamaCloudFileService { - static async getFiles(pipelineId: string): Promise { - const url = `${LLAMA_CLOUD_BASE_URL}/pipelines/${pipelineId}/files`; - const headers = { - Accept: "application/json", - Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`, - }; - const response = await fetch(url, { method: "GET", headers }); - const data = await response.json(); - return data; + public static async downloadFiles(nodes: NodeWithScore[]) { + console.log("Downloading files from LlamaCloud..."); + const files = this.nodesToDownloadFiles(nodes); + for (const file of files) { + await this.downloadFile(file.pipelineId, file.fileName); + } } - static async getFileDetail( - projectId: string, - fileId: string, - ): Promise<{ url: string }> { - const url = `${LLAMA_CLOUD_BASE_URL}/files/${fileId}/content?project_id=${projectId}`; - const headers = { - Accept: "application/json", - Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`, - }; - const response = await fetch(url, { method: "GET", headers }); - const data = (await response.json()) as { url: string }; - return data; + public static toDownloadedName(pipelineId: string, fileName: string) { + return `${pipelineId}${FILE_DELIMITER}${fileName}`; } - static async getFileUrl( - name: string, - pipelineId: string, - ): Promise { - try { - const files = await this.getFiles(pipelineId); - for (const file of files) { - if (file.name === name) { - const fileId = file.file_id; - const projectId = file.project_id; - const fileDetail = await this.getFileDetail(projectId, fileId); - const localFileUrl = this.downloadFile(fileDetail.url, fileId, name); - return localFileUrl; - } + private static nodesToDownloadFiles(nodes: NodeWithScore[]) { + const downloadFiles: Array<{ + pipelineId: string; + fileName: string; + }> = []; + for (const node of nodes) { + const isLocalFile = node.node.metadata["is_local_file"] === "true"; + if (isLocalFile) continue; + const pipelineId = node.node.metadata["pipeline_id"]; + const fileName = node.node.metadata["file_name"]; + if (!pipelineId || !fileName) continue; + const isDuplicate = downloadFiles.some( + (f) => f.pipelineId === pipelineId && f.fileName === fileName, + ); + if (!isDuplicate) { + downloadFiles.push({ pipelineId, fileName }); } - return null; - } catch (error) { - console.error("Error fetching file from LlamaCloud:", error); - return null; } + return downloadFiles; } - static downloadFile(url: string, fileId: string, filename: string) { - const FILE_DELIMITER = "$"; // delimiter between fileId and filename - const downloadedFileName = `${fileId}${FILE_DELIMITER}${filename}`; - const downloadedFilePath = path.join( - LLAMA_CLOUD_OUTPUT_DIR, - downloadedFileName, - ); - const urlPrefix = `${process.env.FILESERVER_URL_PREFIX}/${LLAMA_CLOUD_OUTPUT_DIR}`; - const fileUrl = `${urlPrefix}/${downloadedFileName}`; - + private static async downloadFile(pipelineId: string, fileName: string) { try { + const downloadedName = this.toDownloadedName(pipelineId, fileName); + const downloadedPath = path.join(LLAMA_CLOUD_OUTPUT_DIR, downloadedName); + // Check if file already exists - if (fs.existsSync(downloadedFilePath)) return fileUrl; + if (fs.existsSync(downloadedPath)) return; - // Create directory if it doesn't exist - if (!fs.existsSync(LLAMA_CLOUD_OUTPUT_DIR)) { - fs.mkdirSync(LLAMA_CLOUD_OUTPUT_DIR, { recursive: true }); - } + const urlToDownload = await this.getFileUrlByName(pipelineId, fileName); + if (!urlToDownload) throw new Error("File not found in LlamaCloud"); - const file = fs.createWriteStream(downloadedFilePath); + const file = fs.createWriteStream(downloadedPath); https - .get(url, (response) => { + .get(urlToDownload, (response) => { response.pipe(file); file.on("finish", () => { file.close(() => { @@ -89,15 +69,50 @@ export class LLamaCloudFileService { }); }) .on("error", (err) => { - fs.unlink(downloadedFilePath, () => { + fs.unlink(downloadedPath, () => { console.error("Error downloading file:", err); throw err; }); }); - - return fileUrl; } catch (error) { throw new Error(`Error downloading file from LlamaCloud: ${error}`); } } + + private static async getFileUrlByName( + pipelineId: string, + name: string, + ): Promise { + const files = await this.getAllFiles(pipelineId); + const file = files.find((file) => file.name === name); + if (!file) return null; + return await this.getFileUrlById(file.project_id, file.file_id); + } + + private static async getFileUrlById( + projectId: string, + fileId: string, + ): Promise { + const url = `${LLAMA_CLOUD_BASE_URL}/files/${fileId}/content?project_id=${projectId}`; + const headers = { + Accept: "application/json", + Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`, + }; + const response = await fetch(url, { method: "GET", headers }); + const data = (await response.json()) as { url: string }; + return data.url; + } + + private static async getAllFiles( + pipelineId: string, + ): Promise { + const url = `${LLAMA_CLOUD_BASE_URL}/pipelines/${pipelineId}/files`; + const headers = { + Accept: "application/json", + Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`, + }; + const response = await fetch(url, { method: "GET", headers }); + const data = await response.json(); + return data; + } } From 6bdffd299119b2abe4d3f2085de80a3c095dd941 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Fri, 19 Jul 2024 16:46:15 +0700 Subject: [PATCH 06/17] fix: add private to metadata when generating docs --- templates/components/engines/typescript/agent/chat.ts | 4 ++-- templates/components/vectordbs/python/llamacloud/generate.py | 2 ++ templates/components/vectordbs/python/none/generate.py | 3 +++ .../components/vectordbs/typescript/llamacloud/generate.ts | 2 ++ templates/components/vectordbs/typescript/none/generate.ts | 5 +++++ 5 files changed, 14 insertions(+), 2 deletions(-) diff --git a/templates/components/engines/typescript/agent/chat.ts b/templates/components/engines/typescript/agent/chat.ts index e63cd7ce5..77e425462 100644 --- a/templates/components/engines/typescript/agent/chat.ts +++ b/templates/components/engines/typescript/agent/chat.ts @@ -53,8 +53,8 @@ function generateFilters(documentIds: string[]): MetadataFilters | undefined { filters: [ { key: "private", - value: "true", - operator: "!=", + value: ["true"], + operator: "nin", }, ], }; diff --git a/templates/components/vectordbs/python/llamacloud/generate.py b/templates/components/vectordbs/python/llamacloud/generate.py index 157ffbe5b..e291ef9ce 100644 --- a/templates/components/vectordbs/python/llamacloud/generate.py +++ b/templates/components/vectordbs/python/llamacloud/generate.py @@ -31,8 +31,10 @@ def generate_datasource(): documents = get_documents() # Set is_local_file=true to distinguish locally ingested files from LlamaCloud files + # Set private=false to mark the document as public (required for filtering) for doc in documents: doc.metadata["is_local_file"] = "true" + doc.metadata["private"] = "false" LlamaCloudIndex.from_documents( documents=documents, diff --git a/templates/components/vectordbs/python/none/generate.py b/templates/components/vectordbs/python/none/generate.py index 3016db1d0..c9e08f946 100644 --- a/templates/components/vectordbs/python/none/generate.py +++ b/templates/components/vectordbs/python/none/generate.py @@ -21,6 +21,9 @@ def generate_datasource(): storage_dir = os.environ.get("STORAGE_DIR", "storage") # load the documents and create the index documents = get_documents() + # Set private=false to mark the document as public (required for filtering) + for doc in documents: + doc.metadata["private"] = "false" index = VectorStoreIndex.from_documents( documents, ) diff --git a/templates/components/vectordbs/typescript/llamacloud/generate.ts b/templates/components/vectordbs/typescript/llamacloud/generate.ts index 05f41ad43..330f670e8 100644 --- a/templates/components/vectordbs/typescript/llamacloud/generate.ts +++ b/templates/components/vectordbs/typescript/llamacloud/generate.ts @@ -10,10 +10,12 @@ dotenv.config(); async function loadAndIndex() { const documents = await getDocuments(); // Set is_local_file=true to distinguish locally ingested files from LlamaCloud files + // Set private=false to mark the document as public (required for filtering) for (const document of documents) { document.metadata = { ...document.metadata, is_local_file: "true", + private: "false", }; } await getDataSource(); diff --git a/templates/components/vectordbs/typescript/none/generate.ts b/templates/components/vectordbs/typescript/none/generate.ts index 8c162805b..cb9881ee6 100644 --- a/templates/components/vectordbs/typescript/none/generate.ts +++ b/templates/components/vectordbs/typescript/none/generate.ts @@ -25,6 +25,11 @@ async function generateDatasource() { persistDir: STORAGE_CACHE_DIR, }); const documents = await getDocuments(); + // Set private=false to mark the document as public (required for filtering) + documents.forEach((doc) => { + doc.metadata["private"] = "false"; + }); + await VectorStoreIndex.fromDocuments(documents, { storageContext, }); From e73681405c27374b7429d6faaad4d8af9df2b593 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Fri, 19 Jul 2024 17:02:55 +0700 Subject: [PATCH 07/17] fix: only download when file length > 0 --- .../components/llamaindex/typescript/streaming/service.ts | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/templates/components/llamaindex/typescript/streaming/service.ts b/templates/components/llamaindex/typescript/streaming/service.ts index c3176c72e..0634054c2 100644 --- a/templates/components/llamaindex/typescript/streaming/service.ts +++ b/templates/components/llamaindex/typescript/streaming/service.ts @@ -15,8 +15,9 @@ interface LlamaCloudFile { export class LLamaCloudFileService { public static async downloadFiles(nodes: NodeWithScore[]) { - console.log("Downloading files from LlamaCloud..."); const files = this.nodesToDownloadFiles(nodes); + if (!files.length) return; + console.log("Downloading files from LlamaCloud..."); for (const file of files) { await this.downloadFile(file.pipelineId, file.fileName); } From c04438fa85f32de8e04789d112a0df78d366032b Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Fri, 19 Jul 2024 17:18:10 +0700 Subject: [PATCH 08/17] feat: support filter and download private doc from llamacloud --- .../components/llamaindex/typescript/documents/pipeline.ts | 1 + .../components/llamaindex/typescript/streaming/service.ts | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/components/llamaindex/typescript/documents/pipeline.ts b/templates/components/llamaindex/typescript/documents/pipeline.ts index 436410cd1..6d3b92eb3 100644 --- a/templates/components/llamaindex/typescript/documents/pipeline.ts +++ b/templates/components/llamaindex/typescript/documents/pipeline.ts @@ -21,6 +21,7 @@ export async function runPipeline(documents: Document[], filename: string) { doc_id: document.id_, file_name: filename, private: "true", // to separate from other public documents + is_local_file: "true", // to distinguish from cloud data sources }; } diff --git a/templates/components/llamaindex/typescript/streaming/service.ts b/templates/components/llamaindex/typescript/streaming/service.ts index 0634054c2..0eee63a7a 100644 --- a/templates/components/llamaindex/typescript/streaming/service.ts +++ b/templates/components/llamaindex/typescript/streaming/service.ts @@ -34,10 +34,9 @@ export class LLamaCloudFileService { }> = []; for (const node of nodes) { const isLocalFile = node.node.metadata["is_local_file"] === "true"; - if (isLocalFile) continue; const pipelineId = node.node.metadata["pipeline_id"]; const fileName = node.node.metadata["file_name"]; - if (!pipelineId || !fileName) continue; + if (isLocalFile || !pipelineId || !fileName) continue; const isDuplicate = downloadFiles.some( (f) => f.pipelineId === pipelineId && f.fileName === fileName, ); From a6d3e89324575922477d03368926bf462fd2800f Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:56:14 +0700 Subject: [PATCH 09/17] refactor: remove is_local_file when generate or upload --- .../llamaindex/typescript/documents/pipeline.ts | 1 - .../llamaindex/typescript/streaming/service.ts | 12 +++++++++++- .../vectordbs/typescript/llamacloud/generate.ts | 2 -- 3 files changed, 11 insertions(+), 4 deletions(-) diff --git a/templates/components/llamaindex/typescript/documents/pipeline.ts b/templates/components/llamaindex/typescript/documents/pipeline.ts index 6d3b92eb3..436410cd1 100644 --- a/templates/components/llamaindex/typescript/documents/pipeline.ts +++ b/templates/components/llamaindex/typescript/documents/pipeline.ts @@ -21,7 +21,6 @@ export async function runPipeline(documents: Document[], filename: string) { doc_id: document.id_, file_name: filename, private: "true", // to separate from other public documents - is_local_file: "true", // to distinguish from cloud data sources }; } diff --git a/templates/components/llamaindex/typescript/streaming/service.ts b/templates/components/llamaindex/typescript/streaming/service.ts index 0eee63a7a..6b6c4206c 100644 --- a/templates/components/llamaindex/typescript/streaming/service.ts +++ b/templates/components/llamaindex/typescript/streaming/service.ts @@ -27,13 +27,23 @@ export class LLamaCloudFileService { return `${pipelineId}${FILE_DELIMITER}${fileName}`; } + /** + * This function will return an array of unique files to download from LlamaCloud + * We only download files that are uploaded directly in LlamaCloud datasources (don't have `private` in metadata) + * Files are uploaded directly in LlamaCloud datasources don't have `private` in metadata (public docs) + * Files are uploaded from local via `generate` command will have `private=false` (public docs) + * Files are uploaded from local via `/chat/upload` endpoint will have `private=true` (private docs) + * + * @param nodes + * @returns list of unique files to download + */ private static nodesToDownloadFiles(nodes: NodeWithScore[]) { const downloadFiles: Array<{ pipelineId: string; fileName: string; }> = []; for (const node of nodes) { - const isLocalFile = node.node.metadata["is_local_file"] === "true"; + const isLocalFile = node.node.metadata["private"] != null; const pipelineId = node.node.metadata["pipeline_id"]; const fileName = node.node.metadata["file_name"]; if (isLocalFile || !pipelineId || !fileName) continue; diff --git a/templates/components/vectordbs/typescript/llamacloud/generate.ts b/templates/components/vectordbs/typescript/llamacloud/generate.ts index 330f670e8..2c9f4968d 100644 --- a/templates/components/vectordbs/typescript/llamacloud/generate.ts +++ b/templates/components/vectordbs/typescript/llamacloud/generate.ts @@ -9,12 +9,10 @@ dotenv.config(); async function loadAndIndex() { const documents = await getDocuments(); - // Set is_local_file=true to distinguish locally ingested files from LlamaCloud files // Set private=false to mark the document as public (required for filtering) for (const document of documents) { document.metadata = { ...document.metadata, - is_local_file: "true", private: "false", }; } From 2261863105575b0b6e20eee4d18379e69a33b945 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:57:42 +0700 Subject: [PATCH 10/17] fix: remove async and do not get file url by is_local_file --- .../llamaindex/typescript/streaming/events.ts | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/templates/components/llamaindex/typescript/streaming/events.ts b/templates/components/llamaindex/typescript/streaming/events.ts index 18a22791d..d06afe307 100644 --- a/templates/components/llamaindex/typescript/streaming/events.ts +++ b/templates/components/llamaindex/typescript/streaming/events.ts @@ -8,20 +8,18 @@ import { } from "llamaindex"; import { LLamaCloudFileService } from "./service"; -export async function appendSourceData( +export function appendSourceData( data: StreamData, sourceNodes?: NodeWithScore[], ) { if (!sourceNodes?.length) return; try { - const nodes = await Promise.all( - sourceNodes.map(async (node) => ({ - ...node.node.toMutableJSON(), - id: node.node.id_, - score: node.score ?? null, - url: await getNodeUrl(node.node.metadata), - })), - ); + const nodes = sourceNodes.map((node) => ({ + ...node.node.toMutableJSON(), + id: node.node.id_, + score: node.score ?? null, + url: getNodeUrl(node.node.metadata), + })); data.appendMessageAnnotation({ type: "sources", data: { @@ -76,9 +74,9 @@ export function createStreamTimeout(stream: StreamData) { export function createCallbackManager(stream: StreamData) { const callbackManager = new CallbackManager(); - callbackManager.on("retrieve-end", async (data) => { + callbackManager.on("retrieve-end", (data) => { const { nodes, query } = data.detail; - await appendSourceData(stream, nodes); + appendSourceData(stream, nodes); appendEventData(stream, `Retrieving context for query: '${query}'`); appendEventData( stream, @@ -106,7 +104,7 @@ export function createCallbackManager(stream: StreamData) { return callbackManager; } -async function getNodeUrl(metadata: Metadata) { +function getNodeUrl(metadata: Metadata) { if (!process.env.FILESERVER_URL_PREFIX) { console.warn( "FILESERVER_URL_PREFIX is not set. File URLs will not be generated.", @@ -115,9 +113,8 @@ async function getNodeUrl(metadata: Metadata) { const fileName = metadata["file_name"]; if (fileName && process.env.FILESERVER_URL_PREFIX) { // file_name exists and file server is configured - const isLocalFile = metadata["is_local_file"] === "true"; const pipelineId = metadata["pipeline_id"]; - if (pipelineId && !isLocalFile) { + if (pipelineId && metadata["private"] == null) { // file is from LlamaCloud and was not ingested locally const name = LLamaCloudFileService.toDownloadedName(pipelineId, fileName); return `${process.env.FILESERVER_URL_PREFIX}/output/llamacloud/${name}`; From dfecff92a71de87dd15c03f097ae01b0f466bc99 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Mon, 22 Jul 2024 14:58:23 +0700 Subject: [PATCH 11/17] refactor: detach private and public filter --- .../engines/typescript/agent/chat.ts | 42 +++++++++---------- 1 file changed, 19 insertions(+), 23 deletions(-) diff --git a/templates/components/engines/typescript/agent/chat.ts b/templates/components/engines/typescript/agent/chat.ts index 77e425462..88819c815 100644 --- a/templates/components/engines/typescript/agent/chat.ts +++ b/templates/components/engines/typescript/agent/chat.ts @@ -1,5 +1,6 @@ import { BaseToolWithCall, + MetadataFilter, MetadataFilters, OpenAIAgent, QueryEngineTool, @@ -48,30 +49,25 @@ export async function createChatEngine(documentIds?: string[]) { } function generateFilters(documentIds: string[]): MetadataFilters | undefined { - if (!documentIds.length) { - return { - filters: [ - { - key: "private", - value: ["true"], - operator: "nin", - }, - ], - }; - } + // public documents don't have the "private" field or it's set to "false" + const publicDocumentsFilter: MetadataFilter = { + key: "private", + value: ["true"], + operator: "nin", + }; + + // if no documentIds are provided, only retrieve information from public documents + if (!documentIds.length) return { filters: [publicDocumentsFilter] }; + + const privateDocumentsFilter: MetadataFilter = { + key: "doc_id", + value: documentIds, + operator: "in", + }; + + // if documentIds are provided, retrieve information from public and private documents return { - filters: [ - { - key: "private", - value: "true", - operator: "!=", - }, - { - key: "doc_id", - value: documentIds, - operator: "in", - }, - ], + filters: [publicDocumentsFilter, privateDocumentsFilter], condition: "or", }; } From 41861bba1398bf62e90c113b78b198300d6efd00 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Mon, 22 Jul 2024 15:28:31 +0700 Subject: [PATCH 12/17] refactor: check private null for llamacloud datasource files --- .../components/vectordbs/python/llamacloud/generate.py | 2 -- .../types/streaming/fastapi/app/api/routers/models.py | 8 +++----- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/templates/components/vectordbs/python/llamacloud/generate.py b/templates/components/vectordbs/python/llamacloud/generate.py index e291ef9ce..f494941ae 100644 --- a/templates/components/vectordbs/python/llamacloud/generate.py +++ b/templates/components/vectordbs/python/llamacloud/generate.py @@ -30,10 +30,8 @@ def generate_datasource(): documents = get_documents() - # Set is_local_file=true to distinguish locally ingested files from LlamaCloud files # Set private=false to mark the document as public (required for filtering) for doc in documents: - doc.metadata["is_local_file"] = "true" doc.metadata["private"] = "false" LlamaCloudIndex.from_documents( diff --git a/templates/types/streaming/fastapi/app/api/routers/models.py b/templates/types/streaming/fastapi/app/api/routers/models.py index 8ba519154..d7247b249 100644 --- a/templates/types/streaming/fastapi/app/api/routers/models.py +++ b/templates/types/streaming/fastapi/app/api/routers/models.py @@ -192,8 +192,7 @@ def get_url_from_metadata(cls, metadata: Dict[str, Any]) -> str: if file_name and url_prefix: # file_name exists and file server is configured pipeline_id = metadata.get("pipeline_id") - is_local_file = metadata.get("is_local_file") - if pipeline_id and not is_local_file: + if pipeline_id and metadata.get("private") is None: # file is from LlamaCloud and was not ingested locally file_name = f"{pipeline_id}${file_name}" return f"{url_prefix}/output/llamacloud/{file_name}" @@ -219,9 +218,8 @@ def get_download_files(nodes: List[NodeWithScore]) -> Set[LlamaCloudFile]: ) for node in source_nodes if ( - not node.metadata.get( - "is_local_file" - ) # Download the file of the node flagged as not local + node.metadata.get("private") + is None # Only download files are from LlamaCloud and were not ingested locally and node.metadata.get("pipeline_id") is not None and node.metadata.get("file_name") is not None ) From cd0ca41047259cfa03586070b5d82170a1f1741e Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Mon, 22 Jul 2024 15:48:07 +0700 Subject: [PATCH 13/17] fix: use nin for filtering public docs in Python --- templates/types/streaming/fastapi/app/api/routers/chat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/templates/types/streaming/fastapi/app/api/routers/chat.py b/templates/types/streaming/fastapi/app/api/routers/chat.py index f81d1f166..b1e6b6b50 100644 --- a/templates/types/streaming/fastapi/app/api/routers/chat.py +++ b/templates/types/streaming/fastapi/app/api/routers/chat.py @@ -76,8 +76,8 @@ def generate_filters(doc_ids): filters=[ MetadataFilter( key="private", - value="true", - operator="!=", # type: ignore + value=["true"], + operator="nin", # type: ignore ), MetadataFilter( key="doc_id", From 614a92f0b93f74a28ba70c51f0df58f7783842e9 Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Mon, 22 Jul 2024 16:08:27 +0700 Subject: [PATCH 14/17] feat: append doc_id for metadata --- templates/types/streaming/fastapi/app/api/services/file.py | 1 + 1 file changed, 1 insertion(+) diff --git a/templates/types/streaming/fastapi/app/api/services/file.py b/templates/types/streaming/fastapi/app/api/services/file.py index a14dfa89f..8b965d15e 100644 --- a/templates/types/streaming/fastapi/app/api/services/file.py +++ b/templates/types/streaming/fastapi/app/api/services/file.py @@ -79,6 +79,7 @@ def store_and_parse_file(file_data, extension) -> List[Document]: documents = reader.load_data(file_path) # Add custom metadata for doc in documents: + doc.metadata["doc_id"] = doc.doc_id doc.metadata["file_name"] = file_name doc.metadata["private"] = "true" return documents From c3322dc49d6d9f0b581324544c74ffcb0e72a7bd Mon Sep 17 00:00:00 2001 From: Thuc Pham <51660321+thucpn@users.noreply.github.com> Date: Mon, 22 Jul 2024 18:41:01 +0700 Subject: [PATCH 15/17] don't need to doc_id because it's always in vector_store --- templates/components/llamaindex/typescript/documents/pipeline.ts | 1 - templates/types/streaming/fastapi/app/api/services/file.py | 1 - 2 files changed, 2 deletions(-) diff --git a/templates/components/llamaindex/typescript/documents/pipeline.ts b/templates/components/llamaindex/typescript/documents/pipeline.ts index 436410cd1..c5353efd7 100644 --- a/templates/components/llamaindex/typescript/documents/pipeline.ts +++ b/templates/components/llamaindex/typescript/documents/pipeline.ts @@ -18,7 +18,6 @@ export async function runPipeline(documents: Document[], filename: string) { for (const document of documents) { document.metadata = { ...document.metadata, - doc_id: document.id_, file_name: filename, private: "true", // to separate from other public documents }; diff --git a/templates/types/streaming/fastapi/app/api/services/file.py b/templates/types/streaming/fastapi/app/api/services/file.py index 8b965d15e..a14dfa89f 100644 --- a/templates/types/streaming/fastapi/app/api/services/file.py +++ b/templates/types/streaming/fastapi/app/api/services/file.py @@ -79,7 +79,6 @@ def store_and_parse_file(file_data, extension) -> List[Document]: documents = reader.load_data(file_path) # Add custom metadata for doc in documents: - doc.metadata["doc_id"] = doc.doc_id doc.metadata["file_name"] = file_name doc.metadata["private"] = "true" return documents From 0f143024c82f36b2a044f5ed0262b137736c0322 Mon Sep 17 00:00:00 2001 From: Marcus Schiesser Date: Mon, 22 Jul 2024 14:45:12 +0200 Subject: [PATCH 16/17] fix: removed unused code --- templates/types/streaming/fastapi/app/api/services/file.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/templates/types/streaming/fastapi/app/api/services/file.py b/templates/types/streaming/fastapi/app/api/services/file.py index a14dfa89f..a478570a0 100644 --- a/templates/types/streaming/fastapi/app/api/services/file.py +++ b/templates/types/streaming/fastapi/app/api/services/file.py @@ -19,12 +19,6 @@ from llama_index.readers.file import FlatReader -def file_metadata_func(*args, **kwargs) -> Dict: - default_meta = default_file_metadata_func(*args, **kwargs) - default_meta["private"] = "true" - return default_meta - - def get_llamaparse_parser(): from app.engine.loaders import load_configs from app.engine.loaders.file import FileLoaderConfig, llama_parse_parser From 60c7479f1bbe1284d15336f2662ecbebda7a97cc Mon Sep 17 00:00:00 2001 From: Marcus Schiesser Date: Mon, 22 Jul 2024 16:19:20 +0200 Subject: [PATCH 17/17] fix: next.config --- .changeset/curvy-penguins-work.md | 2 +- helpers/typescript.ts | 7 ------- templates/types/streaming/nextjs/next.config.json | 3 +-- templates/types/streaming/nextjs/next.config.mjs | 1 + templates/types/streaming/nextjs/next.config.simple.mjs | 8 -------- 5 files changed, 3 insertions(+), 18 deletions(-) delete mode 100644 templates/types/streaming/nextjs/next.config.simple.mjs diff --git a/.changeset/curvy-penguins-work.md b/.changeset/curvy-penguins-work.md index 9078602e6..52ec8313e 100644 --- a/.changeset/curvy-penguins-work.md +++ b/.changeset/curvy-penguins-work.md @@ -2,4 +2,4 @@ "create-llama": patch --- -add filter for query in ts templates +Filter private documents for Typescript (Using MetadataFilters) and update to LlamaIndexTS 0.5.6 diff --git a/helpers/typescript.ts b/helpers/typescript.ts index 3ebd486dc..def104da4 100644 --- a/helpers/typescript.ts +++ b/helpers/typescript.ts @@ -55,12 +55,6 @@ export const installTSTemplate = async ({ nextConfigJson.output = "export"; nextConfigJson.images = { unoptimized: true }; console.log("\nUsing static site generation\n"); - - // if having backend, copy overwrite next.config.simple.mjs to next.config.mjs - await fs.copyFile( - path.join(root, "next.config.simple.mjs"), - path.join(root, "next.config.mjs"), - ); } else { if (vectorDb === "milvus") { nextConfigJson.experimental.serverComponentsExternalPackages = @@ -70,7 +64,6 @@ export const installTSTemplate = async ({ ); } } - await fs.rm(path.join(root, "next.config.simple.mjs")); await fs.writeFile( nextConfigJsonFile, JSON.stringify(nextConfigJson, null, 2) + os.EOL, diff --git a/templates/types/streaming/nextjs/next.config.json b/templates/types/streaming/nextjs/next.config.json index 856fb100f..264e20ef3 100644 --- a/templates/types/streaming/nextjs/next.config.json +++ b/templates/types/streaming/nextjs/next.config.json @@ -2,7 +2,6 @@ "experimental": { "outputFileTracingIncludes": { "/*": ["./cache/**/*"] - }, - "serverComponentsExternalPackages": ["sharp", "onnxruntime-node"] + } } } diff --git a/templates/types/streaming/nextjs/next.config.mjs b/templates/types/streaming/nextjs/next.config.mjs index 7e4bf29ab..64bdff27a 100644 --- a/templates/types/streaming/nextjs/next.config.mjs +++ b/templates/types/streaming/nextjs/next.config.mjs @@ -6,4 +6,5 @@ import webpack from "./webpack.config.mjs"; const nextConfig = JSON.parse(fs.readFileSync("./next.config.json", "utf-8")); nextConfig.webpack = webpack; +// use withLlamaIndex to add necessary modifications for llamaindex library export default withLlamaIndex(nextConfig); diff --git a/templates/types/streaming/nextjs/next.config.simple.mjs b/templates/types/streaming/nextjs/next.config.simple.mjs deleted file mode 100644 index 124122bfa..000000000 --- a/templates/types/streaming/nextjs/next.config.simple.mjs +++ /dev/null @@ -1,8 +0,0 @@ -/** @type {import('next').NextConfig} */ -import fs from "fs"; -import webpack from "./webpack.config.mjs"; - -const nextConfig = JSON.parse(fs.readFileSync("./next.config.json", "utf-8")); -nextConfig.webpack = webpack; - -export default nextConfig;