/**
 * A single candidate encoding reported for a file.
 */
export interface FileEncoding {
  /** Lower-cased encoding name reported by the detector, or `null` when none was reported. */
  encoding: BufferEncoding | null;
  /** Detector confidence for this candidate, as reported by jschardet. */
  confidence: number;
}

/** Default upper bound, in milliseconds, for a single detection run. */
const EXECUTION_TIMEOUT = 5000;

/**
 * Detects candidate text encodings for the file at `filePath`.
 *
 * The file is read asynchronously and raced against a timer, so a slow read
 * can be abandoned after `timeout` milliseconds. The timer is always cleared
 * once the race settles, so a finished call never keeps the event loop alive.
 * Note that the jschardet analysis itself is synchronous and cannot be
 * interrupted once it has started.
 *
 * @param filePath - Path to the file
 * @param timeout - Timeout in milliseconds
 * @returns Detected encodings (entries without an encoding name removed),
 *   ordered by descending confidence
 * @throws Error when the file cannot be read, when no encoding is detected,
 *   or when the timeout elapses first. The underlying error is available on
 *   the thrown error's `cause` property.
 */
export async function detectFileEncodings(
  filePath: fs.PathLike,
  timeout: number = EXECUTION_TIMEOUT
): Promise<FileEncoding[]> {
  let timer: ReturnType<typeof setTimeout> | undefined;

  // Rejects once `timeout` ms have elapsed; never resolves.
  const timeoutPromise = new Promise<never>((_, reject) => {
    timer = setTimeout(() => {
      reject(
        new Error(`Timeout reached while detecting encoding for ${filePath}`)
      );
    }, timeout);
  });

  const detect = async (): Promise<FileEncoding[]> => {
    // Asynchronous read: a synchronous read would block the event loop and
    // make it impossible for the timeout above to ever win the race.
    const buffer = await fs.promises.readFile(filePath);
    const results = jschardet.detectAll(buffer);

    const detected: FileEncoding[] = [];
    for (const result of results ?? []) {
      if (result.encoding === null) {
        continue;
      }
      detected.push({
        // NOTE(review): unchecked cast - jschardet can report names (e.g.
        // "windows-1252") that Node does not accept as a BufferEncoding;
        // callers must be prepared for reads with such a name to fail.
        encoding: result.encoding.toLowerCase() as BufferEncoding,
        confidence: result.confidence,
      });
    }

    if (detected.length === 0) {
      throw new Error(`Could not detect encoding for ${filePath}`);
    }

    // Guarantee the documented ordering independently of the detector.
    return detected.sort((a, b) => b.confidence - a.confidence);
  };

  try {
    return await Promise.race([detect(), timeoutPromise]);
  } catch (error) {
    const wrapped = new Error(
      `An unknown error occurred during encoding detection ${error}`
    );
    // Keep the original error reachable instead of only its string form.
    Object.defineProperty(wrapped, "cause", {
      value: error,
      writable: true,
      configurable: true,
    });
    throw wrapped;
  } finally {
    // Without this the pending timer would hold the process open for up to
    // `timeout` ms after every call.
    clearTimeout(timer);
  }
}