
Commit 455ab68

thucpn, leehuwuj, and marcusschiesser authored

feat: display files from llamacloud (#153)

---------
Co-authored-by: leehuwuj <[email protected]>
Co-authored-by: Marcus Schiesser <[email protected]>
1 parent 58e6c15 commit 455ab68

File tree

24 files changed: +528 −213 lines changed

.changeset/happy-hairs-kick.md

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+---
+"create-llama": patch
+---
+
+Display files in sources using LlamaCloud indexes.

helpers/index.ts

Lines changed: 3 additions & 4 deletions
@@ -177,10 +177,9 @@ export const installTemplate = async (
     }
 
     // Create outputs directory
-    if (props.tools && props.tools.length > 0) {
-      await makeDir(path.join(props.root, "output/tools"));
-      await makeDir(path.join(props.root, "output/uploaded"));
-    }
+    await makeDir(path.join(props.root, "output/tools"));
+    await makeDir(path.join(props.root, "output/uploaded"));
+    await makeDir(path.join(props.root, "output/llamacloud"));
   } else {
     // this is a frontend for a full-stack app, create .env file with model information
     await createFrontendEnvFile(props.root, {
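
A side note on the new output/llamacloud directory (an inference from this diff, based on the commented-out URL convention in events.ts further below): files downloaded from LlamaCloud appear to be served from that folder, keyed by pipeline id and file name. A small sketch of that assumed path convention:

// Assumed convention from the commented-out line in events.ts:
// <FILESERVER_URL_PREFIX>/output/llamacloud/<pipelineId>$<fileName>
function llamaCloudFilePath(pipelineId: string, fileName: string): string {
  return `output/llamacloud/${pipelineId}$${fileName}`;
}

console.log(llamaCloudFilePath("pl_123", "report.pdf")); // "output/llamacloud/pl_123$report.pdf"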

templates/components/llamaindex/typescript/documents/documents.ts

Lines changed: 0 additions & 104 deletions
This file was deleted.
New file (filename not shown in this view; imported elsewhere in this commit as "./helper")

Lines changed: 44 additions & 0 deletions

@@ -0,0 +1,44 @@
+import fs from "fs";
+import crypto from "node:crypto";
+import { getExtractors } from "../../engine/loader";
+
+const MIME_TYPE_TO_EXT: Record<string, string> = {
+  "application/pdf": "pdf",
+  "text/plain": "txt",
+  "application/vnd.openxmlformats-officedocument.wordprocessingml.document":
+    "docx",
+};
+
+const UPLOADED_FOLDER = "output/uploaded";
+
+export async function loadDocuments(fileBuffer: Buffer, mimeType: string) {
+  const extractors = getExtractors();
+  const reader = extractors[MIME_TYPE_TO_EXT[mimeType]];
+
+  if (!reader) {
+    throw new Error(`Unsupported document type: ${mimeType}`);
+  }
+  console.log(`Processing uploaded document of type: ${mimeType}`);
+  return await reader.loadDataAsContent(fileBuffer);
+}
+
+export async function saveDocument(fileBuffer: Buffer, mimeType: string) {
+  const fileExt = MIME_TYPE_TO_EXT[mimeType];
+  if (!fileExt) throw new Error(`Unsupported document type: ${mimeType}`);
+
+  const filename = `${crypto.randomUUID()}.${fileExt}`;
+  const filepath = `${UPLOADED_FOLDER}/${filename}`;
+  const fileurl = `${process.env.FILESERVER_URL_PREFIX}/${filepath}`;
+
+  if (!fs.existsSync(UPLOADED_FOLDER)) {
+    fs.mkdirSync(UPLOADED_FOLDER, { recursive: true });
+  }
+  await fs.promises.writeFile(filepath, fileBuffer);
+
+  console.log(`Saved document file to ${filepath}.\nURL: ${fileurl}`);
+  return {
+    filename,
+    filepath,
+    fileurl,
+  };
+}
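
As a quick orientation (not part of the diff): a minimal sketch of calling these helpers directly, assuming a local sample.pdf test fixture and a FILESERVER_URL_PREFIX environment variable are available:

import fs from "fs";
import { loadDocuments, saveDocument } from "./helper";

async function ingestLocalPdf() {
  // Read a sample file into a Buffer (sample.pdf is an assumed test fixture)
  const fileBuffer = fs.readFileSync("sample.pdf");
  const mimeType = "application/pdf";

  // Parse the buffer into LlamaIndex Document objects via the registered extractor
  const documents = await loadDocuments(fileBuffer, mimeType);

  // Persist the raw upload under output/uploaded and get back its generated name and URL
  const { filename, fileurl } = await saveDocument(fileBuffer, mimeType);
  console.log(`Parsed ${documents.length} document(s); stored as ${filename} (${fileurl})`);
}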
New file (filename not shown in this view; imported elsewhere in this commit as "./pipeline")

Lines changed: 65 additions & 0 deletions

@@ -0,0 +1,65 @@
+import {
+  BaseNode,
+  Document,
+  IngestionPipeline,
+  Metadata,
+  Settings,
+  SimpleNodeParser,
+  storageContextFromDefaults,
+  VectorStoreIndex,
+} from "llamaindex";
+import { LlamaCloudIndex } from "llamaindex/cloud/LlamaCloudIndex";
+import { getDataSource } from "../../engine";
+
+export async function runPipeline(documents: Document[], filename: string) {
+  const currentIndex = await getDataSource();
+
+  // Update documents with metadata
+  for (const document of documents) {
+    document.metadata = {
+      ...document.metadata,
+      file_name: filename,
+      private: "true", // to separate from other public documents
+    };
+  }
+
+  if (currentIndex instanceof LlamaCloudIndex) {
+    // LlamaCloudIndex processes the documents automatically
+    // so we don't need ingestion pipeline, just insert the documents directly
+    for (const document of documents) {
+      await currentIndex.insert(document);
+    }
+  } else {
+    // Use ingestion pipeline to process the documents into nodes and add them to the vector store
+    const pipeline = new IngestionPipeline({
+      transformations: [
+        new SimpleNodeParser({
+          chunkSize: Settings.chunkSize,
+          chunkOverlap: Settings.chunkOverlap,
+        }),
+        Settings.embedModel,
+      ],
+    });
+    const nodes = await pipeline.run({ documents });
+    await addNodesToVectorStore(nodes, currentIndex);
+  }
+
+  return documents.map((document) => document.id_);
+}
+
+async function addNodesToVectorStore(
+  nodes: BaseNode<Metadata>[],
+  currentIndex: VectorStoreIndex | null,
+) {
+  if (currentIndex) {
+    await currentIndex.insertNodes(nodes);
+  } else {
+    // Not using vectordb and haven't generated local index yet
+    const storageContext = await storageContextFromDefaults({
+      persistDir: "./cache",
+    });
+    currentIndex = await VectorStoreIndex.init({ nodes, storageContext });
+  }
+  currentIndex.storageContext.docStore.persist();
+  console.log("Added nodes to the vector store.");
+}
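
A minimal usage sketch for runPipeline (illustrative only; it assumes getDataSource resolves to either a LlamaCloudIndex or a local VectorStoreIndex, as handled in this file):

import { Document } from "llamaindex";
import { runPipeline } from "./pipeline";

async function indexUserNote() {
  // Wrap arbitrary text as a Document; real callers pass documents produced by loadDocuments
  const documents = [new Document({ text: "Quarterly revenue grew 12% year over year." })];

  // runPipeline tags each document as private, then either inserts it into LlamaCloud
  // or chunks and embeds it locally via the IngestionPipeline branch
  const ids = await runPipeline(documents, "note.txt");
  console.log("Indexed document ids:", ids);
}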
New file (filename not shown in this view; exports the uploadDocument entry point)

Lines changed: 11 additions & 0 deletions

@@ -0,0 +1,11 @@
+import { loadDocuments, saveDocument } from "./helper";
+import { runPipeline } from "./pipeline";
+
+export async function uploadDocument(raw: string): Promise<string[]> {
+  const [header, content] = raw.split(",");
+  const mimeType = header.replace("data:", "").replace(";base64", "");
+  const fileBuffer = Buffer.from(content, "base64");
+  const documents = await loadDocuments(fileBuffer, mimeType);
+  const { filename } = await saveDocument(fileBuffer, mimeType);
+  return await runPipeline(documents, filename);
+}
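
For context, uploadDocument expects a base64 data URL. A minimal sketch of invoking it with a local file (the module name "./index" and the notes.txt fixture are assumptions, not part of this diff):

import fs from "fs";
import { uploadDocument } from "./index"; // assumed module name for the file above

async function uploadLocalTxt() {
  // Build a data URL of the form "data:<mime>;base64,<content>" that uploadDocument parses
  const fileBuffer = fs.readFileSync("notes.txt");
  const raw = `data:text/plain;base64,${fileBuffer.toString("base64")}`;

  // Returns the ids of the documents that were parsed, saved, and indexed
  const ids = await uploadDocument(raw);
  console.log("Uploaded and indexed:", ids);
}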

templates/components/llamaindex/typescript/streaming/events.ts

Lines changed: 34 additions & 18 deletions
@@ -6,23 +6,31 @@ import {
   ToolCall,
   ToolOutput,
 } from "llamaindex";
+import { LLamaCloudFileService } from "./service";
 
-export function appendSourceData(
+export async function appendSourceData(
   data: StreamData,
   sourceNodes?: NodeWithScore<Metadata>[],
 ) {
   if (!sourceNodes?.length) return;
-  data.appendMessageAnnotation({
-    type: "sources",
-    data: {
-      nodes: sourceNodes.map((node) => ({
+  try {
+    const nodes = await Promise.all(
+      sourceNodes.map(async (node) => ({
         ...node.node.toMutableJSON(),
         id: node.node.id_,
         score: node.score ?? null,
-        url: getNodeUrl(node.node.metadata),
+        url: await getNodeUrl(node.node.metadata),
       })),
-    },
-  });
+    );
+    data.appendMessageAnnotation({
+      type: "sources",
+      data: {
+        nodes,
+      },
+    });
+  } catch (error) {
+    console.error("Error appending source data:", error);
+  }
 }
 
 export function appendEventData(data: StreamData, title?: string) {

@@ -68,9 +76,9 @@ export function createStreamTimeout(stream: StreamData) {
 export function createCallbackManager(stream: StreamData) {
   const callbackManager = new CallbackManager();
 
-  callbackManager.on("retrieve-end", (data) => {
+  callbackManager.on("retrieve-end", async (data) => {
     const { nodes, query } = data.detail.payload;
-    appendSourceData(stream, nodes);
+    await appendSourceData(stream, nodes);
     appendEventData(stream, `Retrieving context for query: '${query}'`);
     appendEventData(
       stream,

@@ -97,19 +105,27 @@ export function createCallbackManager(stream: StreamData) {
   return callbackManager;
 }
 
-function getNodeUrl(metadata: Metadata) {
-  const url = metadata["URL"];
-  if (url) return url;
-  const fileName = metadata["file_name"];
+async function getNodeUrl(metadata: Metadata) {
   if (!process.env.FILESERVER_URL_PREFIX) {
     console.warn(
       "FILESERVER_URL_PREFIX is not set. File URLs will not be generated.",
     );
-    return undefined;
   }
-  if (fileName) {
-    const folder = metadata["private"] ? "output/uploaded" : "data";
+  const fileName = metadata["file_name"];
+  if (fileName && process.env.FILESERVER_URL_PREFIX) {
+    // file_name exists and file server is configured
+    const isLocalFile = metadata["is_local_file"] === "true";
+    const pipelineId = metadata["pipeline_id"];
+    if (pipelineId && !isLocalFile) {
+      // file is from LlamaCloud and was not ingested locally
+      // TODO trigger but don't await file download and just use convention to generate the URL (see Python code)
+      // return `${process.env.FILESERVER_URL_PREFIX}/output/llamacloud/${pipelineId}\$${fileName}`;
+      return await LLamaCloudFileService.getFileUrl(fileName, pipelineId);
+    }
+    const isPrivate = metadata["private"] === "true";
+    const folder = isPrivate ? "output/uploaded" : "data";
     return `${process.env.FILESERVER_URL_PREFIX}/${folder}/${fileName}`;
   }
-  return undefined;
+  // fallback to URL in metadata (e.g. for websites)
+  return metadata["URL"];
 }
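
To illustrate the new resolution order (a sketch, not part of the diff): getNodeUrl now prefers the LlamaCloud file service when a node carries a pipeline_id and was not ingested locally, falls back to a local file-server URL for uploaded or data-folder files, and finally to any URL stored in the metadata. The metadata shapes below are assumptions based only on the fields read in this diff:

// Node retrieved from a LlamaCloud pipeline -> resolved via LLamaCloudFileService.getFileUrl
const llamaCloudMetadata = {
  file_name: "report.pdf",
  pipeline_id: "pl_123", // hypothetical pipeline id
  is_local_file: "false",
};

// Locally uploaded private file -> `${FILESERVER_URL_PREFIX}/output/uploaded/report.pdf`
const uploadedMetadata = {
  file_name: "report.pdf",
  private: "true",
};

// Website source with no file_name -> falls back to metadata["URL"]
const webMetadata = {
  URL: "https://example.com/article",
};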
