feat: add filter for query in ts templates #172

Merged · 17 commits · Jul 22, 2024
Changes from 8 commits
5 changes: 5 additions & 0 deletions .changeset/curvy-penguins-work.md
@@ -0,0 +1,5 @@
---
"create-llama": patch
---

add filter for query in ts templates
7 changes: 7 additions & 0 deletions helpers/typescript.ts
@@ -55,6 +55,12 @@ export const installTSTemplate = async ({
nextConfigJson.output = "export";
nextConfigJson.images = { unoptimized: true };
console.log("\nUsing static site generation\n");

+      // static export has no backend, so overwrite next.config.mjs with next.config.simple.mjs
+      await fs.copyFile(
+        path.join(root, "next.config.simple.mjs"),
+        path.join(root, "next.config.mjs"),
+      );
} else {
if (vectorDb === "milvus") {
nextConfigJson.experimental.serverComponentsExternalPackages =
@@ -64,6 +70,7 @@ export const installTSTemplate = async ({
);
}
}
+  await fs.rm(path.join(root, "next.config.simple.mjs"));
await fs.writeFile(
nextConfigJsonFile,
JSON.stringify(nextConfigJson, null, 2) + os.EOL,
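Note on the two hunks above: the static-export branch copies the plain config over next.config.mjs, and the template's next.config.simple.mjs is removed in every case afterwards, so exactly one config ships. A minimal sketch of the intended flow (the `staticExport` flag and `selectNextConfig` helper are illustrative, not part of the PR):

```ts
import fs from "node:fs/promises";
import path from "node:path";

// Sketch: pick the Next.js config at install time (hypothetical helper).
async function selectNextConfig(root: string, staticExport: boolean) {
  if (staticExport) {
    // no backend: use the plain config without the withLlamaIndex wrapper
    await fs.copyFile(
      path.join(root, "next.config.simple.mjs"),
      path.join(root, "next.config.mjs"),
    );
  }
  // either way, remove the template copy so it doesn't ship with the project
  await fs.rm(path.join(root, "next.config.simple.mjs"));
}
```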
38 changes: 36 additions & 2 deletions templates/components/engines/typescript/agent/chat.ts
@@ -1,4 +1,9 @@
-import { BaseToolWithCall, OpenAIAgent, QueryEngineTool } from "llamaindex";
+import {
+  BaseToolWithCall,
+  MetadataFilters,
+  OpenAIAgent,
+  QueryEngineTool,
+} from "llamaindex";
import fs from "node:fs/promises";
import path from "node:path";
import { getDataSource } from "./index";
@@ -14,7 +19,7 @@ export async function createChatEngine(documentIds?: string[]) {
tools.push(
new QueryEngineTool({
queryEngine: index.asQueryEngine({
-        preFilters: undefined, // TODO: Add filters once LITS supports it (getQueryFilters)
+        preFilters: generateFilters(documentIds || []),
}),
metadata: {
name: "data_query_engine",
@@ -41,3 +46,32 @@ export async function createChatEngine(documentIds?: string[]) {
systemPrompt: process.env.SYSTEM_PROMPT,
});
}

+function generateFilters(documentIds: string[]): MetadataFilters | undefined {
+  if (!documentIds.length) {
+    return {
+      filters: [
+        {
+          key: "private",
+          value: ["true"],
+          operator: "nin",
+        },
+      ],
+    };
+  }
+  return {
+    filters: [
+      {
+        key: "private",
+        value: "true",
+        operator: "!=",
+      },
+      {
+        key: "doc_id",
+        value: documentIds,
+        operator: "in",
+      },
+    ],
+    condition: "or",
+  };
+}
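In effect: with no selected documents, retrieval is restricted to public documents; with selected documents, the `or` condition admits everything public plus the explicitly chosen private ones. A sketch of the expected outputs (hypothetical assertions — `generateFilters` is module-private in the template):

```ts
import { deepStrictEqual } from "node:assert";

// No document IDs: exclude anything marked private.
deepStrictEqual(generateFilters([]), {
  filters: [{ key: "private", value: ["true"], operator: "nin" }],
});

// With document IDs: public documents OR the selected private ones.
deepStrictEqual(generateFilters(["doc-1"]), {
  filters: [
    { key: "private", value: "true", operator: "!=" },
    { key: "doc_id", value: ["doc-1"], operator: "in" },
  ],
  condition: "or",
});
```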
@@ -18,8 +18,10 @@ export async function runPipeline(documents: Document[], filename: string) {
for (const document of documents) {
document.metadata = {
...document.metadata,
doc_id: document.id_,
file_name: filename,
private: "true", // to separate from other public documents
is_local_file: "true", // to distinguish from cloud data sources
};
}

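With this hunk, a privately uploaded file carries metadata like the following (keys from the diff; the doc_id and file_name values are illustrative):

```ts
// Illustrative metadata for one uploaded document:
const metadata = {
  doc_id: "doc-123", // document.id_, matched by the "in" filter in chat.ts
  file_name: "report.pdf",
  private: "true", // hidden unless its doc_id is explicitly selected
  is_local_file: "true", // ingested locally, so never fetched from LlamaCloud
};
```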
@@ -35,9 +35,9 @@ export function retrieveDocumentIds(annotations?: JSONValue[]): string[] {
) {
const files = data.files as DocumentFile[];
for (const file of files) {
-        if (Array.isArray(file.content)) {
+        if (Array.isArray(file.content.value)) {
// it's an array, so it's an array of doc IDs
-          for (const id of file.content) {
+          for (const id of file.content.value) {
ids.push(id);
}
}
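The uploaded-file annotation now nests its payload under `content.value`. A hedged sketch of the shape this code assumes (field names inferred from the access pattern here, not from a published type):

```ts
// Inferred shape (assumption): when content.value is an array, it holds the
// doc IDs of the uploaded file; the non-array case is skipped by this loop.
interface DocumentFile {
  content: {
    value: string | string[];
  };
}
```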
12 changes: 6 additions & 6 deletions templates/components/llamaindex/typescript/streaming/events.ts
@@ -77,17 +77,18 @@ export function createCallbackManager(stream: StreamData) {
const callbackManager = new CallbackManager();

callbackManager.on("retrieve-end", async (data) => {
-    const { nodes, query } = data.detail.payload;
+    const { nodes, query } = data.detail;
await appendSourceData(stream, nodes);
appendEventData(stream, `Retrieving context for query: '${query}'`);
appendEventData(
stream,
`Retrieved ${nodes.length} sources to use as context for the query`,
);
+    LLamaCloudFileService.downloadFiles(nodes); // don't await to avoid blocking chat streaming
});

callbackManager.on("llm-tool-call", (event) => {
-    const { name, input } = event.detail.payload.toolCall;
+    const { name, input } = event.detail.toolCall;
const inputString = Object.entries(input)
.map(([key, value]) => `${key}: ${value}`)
.join(", ");
@@ -98,7 +99,7 @@ export function createCallbackManager(stream: StreamData) {
});

callbackManager.on("llm-tool-result", (event) => {
-    const { toolCall, toolResult } = event.detail.payload;
+    const { toolCall, toolResult } = event.detail;
appendToolData(stream, toolCall, toolResult);
});

@@ -118,9 +119,8 @@ async function getNodeUrl(metadata: Metadata) {
const pipelineId = metadata["pipeline_id"];
if (pipelineId && !isLocalFile) {
// file is from LlamaCloud and was not ingested locally
-    // TODO trigger but don't await file download and just use convention to generate the URL (see Python code)
-    // return `${process.env.FILESERVER_URL_PREFIX}/output/llamacloud/${pipelineId}\$${fileName}`;
-    return await LLamaCloudFileService.getFileUrl(fileName, pipelineId);
+    const name = LLamaCloudFileService.toDownloadedName(pipelineId, fileName);
+    return `${process.env.FILESERVER_URL_PREFIX}/output/llamacloud/${name}`;
}
const isPrivate = metadata["private"] === "true";
const folder = isPrivate ? "output/uploaded" : "data";
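getNodeUrl no longer awaits a per-node LlamaCloud lookup: the URL is derived purely from the `<pipelineId>$<fileName>` naming convention, while `downloadFiles` fills in the file in the background. A minimal sketch (illustrative values):

```ts
// Convention-based URL, mirroring toDownloadedName in service.ts:
const FILE_DELIMITER = "$";
const pipelineId = "pipe-1"; // illustrative
const fileName = "guide.pdf"; // illustrative
const name = `${pipelineId}${FILE_DELIMITER}${fileName}`; // "pipe-1$guide.pdf"
const url = `${process.env.FILESERVER_URL_PREFIX}/output/llamacloud/${name}`;
```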
135 changes: 75 additions & 60 deletions templates/components/llamaindex/typescript/streaming/service.ts
@@ -1,86 +1,66 @@
import { Metadata, NodeWithScore } from "llamaindex";
import fs from "node:fs";
import https from "node:https";
import path from "node:path";

const LLAMA_CLOUD_OUTPUT_DIR = "output/llamacloud";
const LLAMA_CLOUD_BASE_URL = "https://cloud.llamaindex.ai/api/v1";
+const FILE_DELIMITER = "$"; // delimiter between pipelineId and filename

-export interface LlamaCloudFile {
+interface LlamaCloudFile {
  name: string;
  file_id: string;
  project_id: string;
}

export class LLamaCloudFileService {
-  static async getFiles(pipelineId: string): Promise<LlamaCloudFile[]> {
-    const url = `${LLAMA_CLOUD_BASE_URL}/pipelines/${pipelineId}/files`;
-    const headers = {
-      Accept: "application/json",
-      Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
-    };
-    const response = await fetch(url, { method: "GET", headers });
-    const data = await response.json();
-    return data;
+  public static async downloadFiles(nodes: NodeWithScore<Metadata>[]) {
+    const files = this.nodesToDownloadFiles(nodes);
+    if (!files.length) return;
+    console.log("Downloading files from LlamaCloud...");
+    for (const file of files) {
+      await this.downloadFile(file.pipelineId, file.fileName);
+    }
  }

-  static async getFileDetail(
-    projectId: string,
-    fileId: string,
-  ): Promise<{ url: string }> {
-    const url = `${LLAMA_CLOUD_BASE_URL}/files/${fileId}/content?project_id=${projectId}`;
-    const headers = {
-      Accept: "application/json",
-      Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
-    };
-    const response = await fetch(url, { method: "GET", headers });
-    const data = (await response.json()) as { url: string };
-    return data;
+  public static toDownloadedName(pipelineId: string, fileName: string) {
+    return `${pipelineId}${FILE_DELIMITER}${fileName}`;
  }

-  static async getFileUrl(
-    name: string,
-    pipelineId: string,
-  ): Promise<string | null> {
-    try {
-      const files = await this.getFiles(pipelineId);
-      for (const file of files) {
-        if (file.name === name) {
-          const fileId = file.file_id;
-          const projectId = file.project_id;
-          const fileDetail = await this.getFileDetail(projectId, fileId);
-          const localFileUrl = this.downloadFile(fileDetail.url, fileId, name);
-          return localFileUrl;
-        }
-      }
-      return null;
-    } catch (error) {
-      console.error("Error fetching file from LlamaCloud:", error);
-      return null;
-    }
+  private static nodesToDownloadFiles(nodes: NodeWithScore<Metadata>[]) {
+    const downloadFiles: Array<{
+      pipelineId: string;
+      fileName: string;
+    }> = [];
+    for (const node of nodes) {
+      const isLocalFile = node.node.metadata["is_local_file"] === "true";
+      const pipelineId = node.node.metadata["pipeline_id"];
+      const fileName = node.node.metadata["file_name"];
+      if (isLocalFile || !pipelineId || !fileName) continue;
+      const isDuplicate = downloadFiles.some(
+        (f) => f.pipelineId === pipelineId && f.fileName === fileName,
+      );
+      if (!isDuplicate) {
+        downloadFiles.push({ pipelineId, fileName });
+      }
+    }
+    return downloadFiles;
  }

-  static downloadFile(url: string, fileId: string, filename: string) {
-    const FILE_DELIMITER = "$"; // delimiter between fileId and filename
-    const downloadedFileName = `${fileId}${FILE_DELIMITER}${filename}`;
-    const downloadedFilePath = path.join(
-      LLAMA_CLOUD_OUTPUT_DIR,
-      downloadedFileName,
-    );
-    const urlPrefix = `${process.env.FILESERVER_URL_PREFIX}/${LLAMA_CLOUD_OUTPUT_DIR}`;
-    const fileUrl = `${urlPrefix}/${downloadedFileName}`;
-
+  private static async downloadFile(pipelineId: string, fileName: string) {
    try {
+      const downloadedName = this.toDownloadedName(pipelineId, fileName);
+      const downloadedPath = path.join(LLAMA_CLOUD_OUTPUT_DIR, downloadedName);
+
      // Check if file already exists
-      if (fs.existsSync(downloadedFilePath)) return fileUrl;
+      if (fs.existsSync(downloadedPath)) return;

      // Create directory if it doesn't exist
      if (!fs.existsSync(LLAMA_CLOUD_OUTPUT_DIR)) {
        fs.mkdirSync(LLAMA_CLOUD_OUTPUT_DIR, { recursive: true });
      }
+      const urlToDownload = await this.getFileUrlByName(pipelineId, fileName);
+      if (!urlToDownload) throw new Error("File not found in LlamaCloud");

-      const file = fs.createWriteStream(downloadedFilePath);
+      const file = fs.createWriteStream(downloadedPath);
      https
-        .get(url, (response) => {
+        .get(urlToDownload, (response) => {
          response.pipe(file);
          file.on("finish", () => {
            file.close(() => {
@@ -89,15 +69,50 @@ export class LLamaCloudFileService {
          });
        })
        .on("error", (err) => {
-          fs.unlink(downloadedFilePath, () => {
+          fs.unlink(downloadedPath, () => {
            console.error("Error downloading file:", err);
            throw err;
          });
        });
-
-      return fileUrl;
    } catch (error) {
      throw new Error(`Error downloading file from LlamaCloud: ${error}`);
    }
  }
+
+  private static async getFileUrlByName(
+    pipelineId: string,
+    name: string,
+  ): Promise<string | null> {
+    const files = await this.getAllFiles(pipelineId);
+    const file = files.find((file) => file.name === name);
+    if (!file) return null;
+    return await this.getFileUrlById(file.project_id, file.file_id);
+  }
+
+  private static async getFileUrlById(
+    projectId: string,
+    fileId: string,
+  ): Promise<string> {
+    const url = `${LLAMA_CLOUD_BASE_URL}/files/${fileId}/content?project_id=${projectId}`;
+    const headers = {
+      Accept: "application/json",
+      Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
+    };
+    const response = await fetch(url, { method: "GET", headers });
+    const data = (await response.json()) as { url: string };
+    return data.url;
+  }
+
+  private static async getAllFiles(
+    pipelineId: string,
+  ): Promise<LlamaCloudFile[]> {
+    const url = `${LLAMA_CLOUD_BASE_URL}/pipelines/${pipelineId}/files`;
+    const headers = {
+      Accept: "application/json",
+      Authorization: `Bearer ${process.env.LLAMA_CLOUD_API_KEY}`,
+    };
+    const response = await fetch(url, { method: "GET", headers });
+    const data = await response.json();
+    return data;
+  }
}
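Taken together, the refactor inverts the old flow: instead of resolving and downloading per node while rendering source URLs, the retrieve-end callback fires one deduplicated batch download and the URL is computed by convention. A hedged usage sketch (the node literal and cast are illustrative):

```ts
import { Metadata, NodeWithScore } from "llamaindex";
import { LLamaCloudFileService } from "./service"; // hypothetical import path

const nodes = [
  {
    node: { metadata: { pipeline_id: "pipe-1", file_name: "guide.pdf" } },
    score: 0.9,
  },
] as unknown as NodeWithScore<Metadata>[];

// Skips is_local_file nodes, dedupes by (pipelineId, fileName), and caches
// each file at output/llamacloud/<pipelineId>$<fileName>.
LLamaCloudFileService.downloadFiles(nodes); // not awaited, as in events.ts
```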
2 changes: 2 additions & 0 deletions templates/components/vectordbs/python/llamacloud/generate.py
@@ -31,8 +31,10 @@ def generate_datasource():
documents = get_documents()

# Set is_local_file=true to distinguish locally ingested files from LlamaCloud files
+    # Set private=false to mark the document as public (required for filtering)
for doc in documents:
doc.metadata["is_local_file"] = "true"
doc.metadata["private"] = "false"

LlamaCloudIndex.from_documents(
documents=documents,
3 changes: 3 additions & 0 deletions templates/components/vectordbs/python/none/generate.py
@@ -21,6 +21,9 @@ def generate_datasource():
storage_dir = os.environ.get("STORAGE_DIR", "storage")
# load the documents and create the index
documents = get_documents()
+    # Set private=false to mark the document as public (required for filtering)
+    for doc in documents:
+        doc.metadata["private"] = "false"
index = VectorStoreIndex.from_documents(
documents,
)
@@ -10,10 +10,12 @@ dotenv.config();
async function loadAndIndex() {
const documents = await getDocuments();
// Set is_local_file=true to distinguish locally ingested files from LlamaCloud files
+  // Set private=false to mark the document as public (required for filtering)
for (const document of documents) {
document.metadata = {
...document.metadata,
is_local_file: "true",
private: "false",
};
}
await getDataSource();
5 changes: 5 additions & 0 deletions templates/components/vectordbs/typescript/none/generate.ts
@@ -25,6 +25,11 @@ async function generateDatasource() {
persistDir: STORAGE_CACHE_DIR,
});
const documents = await getDocuments();
+  // Set private=false to mark the document as public (required for filtering)
+  documents.forEach((doc) => {
+    doc.metadata["private"] = "false";
+  });

await VectorStoreIndex.fromDocuments(documents, {
storageContext,
});
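With these generate scripts updated, the metadata contract is consistent across data sources: generated/public documents carry private: "false", uploads carry private: "true" plus a doc_id, and the filters in chat.ts key off exactly these values. A small type sketch of that contract (an assumption drawn from the diffs, not a type in the codebase):

```ts
// Assumed metadata contract used by generateFilters:
interface FilterableMetadata {
  private: "true" | "false"; // "false" for generated/public data
  doc_id?: string; // set for uploaded files; matched by the "in" filter
}
```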
2 changes: 1 addition & 1 deletion templates/types/streaming/express/package.json
@@ -20,7 +20,7 @@
"dotenv": "^16.3.1",
"duck-duck-scrape": "^2.2.5",
"express": "^4.18.2",
"llamaindex": "0.4.14",
"llamaindex": "0.5.6",
"pdf2json": "3.0.5",
"ajv": "^8.12.0",
"@e2b/code-interpreter": "^0.0.5",
3 changes: 2 additions & 1 deletion templates/types/streaming/nextjs/next.config.mjs
@@ -1,8 +1,9 @@
/** @type {import('next').NextConfig} */
import fs from "fs";
+import withLlamaIndex from "llamaindex/next";
import webpack from "./webpack.config.mjs";

const nextConfig = JSON.parse(fs.readFileSync("./next.config.json", "utf-8"));
nextConfig.webpack = webpack;

-export default nextConfig;
+export default withLlamaIndex(nextConfig);
8 changes: 8 additions & 0 deletions templates/types/streaming/nextjs/next.config.simple.mjs
@@ -0,0 +1,8 @@
/** @type {import('next').NextConfig} */
import fs from "fs";
import webpack from "./webpack.config.mjs";

const nextConfig = JSON.parse(fs.readFileSync("./next.config.json", "utf-8"));
nextConfig.webpack = webpack;

export default nextConfig;