diff --git a/ui/__tests__/scientific-rag.test.ts b/ui/__tests__/scientific-rag.test.ts new file mode 100644 index 0000000..015fd76 --- /dev/null +++ b/ui/__tests__/scientific-rag.test.ts @@ -0,0 +1,66 @@ +import { + buildCitationKey, + buildScientificMetadata, + detectScientificSection, + formatRetrievedDocument, +} from '@/utils/server/scientific-rag'; +import { describe, expect, it } from 'vitest'; + +describe('scientific-rag helpers', () => { + it('detects scientific sections from document chunks', () => { + expect(detectScientificSection('Abstract\nThis paper studies retrieval.')).toBe( + 'abstract', + ); + expect(detectScientificSection('METHODS\nWe used a benchmark.')).toBe( + 'methods', + ); + }); + + it('builds stable citation keys', () => { + expect( + buildCitationKey({ + title: 'Scientific RAG for Papers!', + page: 4, + chunkIndex: 2, + }), + ).toBe('scientific-rag-for-papers:p4:c3'); + }); + + it('builds metadata with title fallback and section', () => { + const metadata = buildScientificMetadata( + { + pageContent: 'Results\nThe model improved citation accuracy.', + metadata: { + loc: { pageNumber: 7 }, + pdf: { info: { Title: '' } }, + source: '/tmp/paper.pdf', + }, + }, + 'paper.pdf', + 0, + ); + + expect(metadata).toMatchObject({ + title: 'paper.pdf', + page: 7, + section: 'results', + citationKey: 'paper-pdf:p7:c1', + }); + }); + + it('formats retrieved documents with citation metadata', () => { + expect( + formatRetrievedDocument({ + content: 'Citation-aware answer context.', + metadata: { + title: 'Paper', + page: 2, + section: 'discussion', + citationKey: 'paper:p2:c1', + }, + distance: 0.12345, + index: 0, + }), + ).toContain('Source 1 [paper:p2:c1]'); + }); +}); diff --git a/ui/package-lock.json b/ui/package-lock.json index 998e32c..8345168 100644 --- a/ui/package-lock.json +++ b/ui/package-lock.json @@ -13,7 +13,7 @@ "@radix-ui/react-select": "^1.2.2", "@radix-ui/react-tooltip": "^1.0.6", "@tabler/icons-react": "^2.9.0", - "@xenova/transformers": "^2.5.2", + "@xenova/transformers": "^2.17.2", "chromadb": "^1.5.6", "class-variance-authority": "^0.7.0", "clsx": "^2.0.0", @@ -449,30 +449,6 @@ "integrity": "sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==", "dev": true }, - "node_modules/@cspotcode/source-map-support": { - "version": "0.8.1", - "resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz", - "integrity": "sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==", - "optional": true, - "peer": true, - "dependencies": { - "@jridgewell/trace-mapping": "0.3.9" - }, - "engines": { - "node": ">=12" - } - }, - "node_modules/@cspotcode/source-map-support/node_modules/@jridgewell/trace-mapping": { - "version": "0.3.9", - "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.9.tgz", - "integrity": "sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==", - "optional": true, - "peer": true, - "dependencies": { - "@jridgewell/resolve-uri": "^3.0.3", - "@jridgewell/sourcemap-codec": "^1.4.10" - } - }, "node_modules/@dqbd/tiktoken": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/@dqbd/tiktoken/-/tiktoken-1.0.2.tgz", @@ -920,6 +896,15 @@ "resolved": "https://registry.npmjs.org/@floating-ui/utils/-/utils-0.1.1.tgz", "integrity": "sha512-m0G6wlnhm/AX0H12IOWtK8gASEMffnX08RtKkCgTdHb9JpHKGloI7icFfLg9ZmQeavcvR0PKmzxClyuFPSjKWw==" }, + "node_modules/@huggingface/jinja": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.2.2.tgz", + "integrity": "sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA==", + "license": "MIT", + "engines": { + "node": ">=18" + } + }, "node_modules/@humanwhocodes/config-array": { "version": "0.11.8", "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.11.8.tgz", @@ -1958,34 +1943,6 @@ } } }, - "node_modules/@tsconfig/node10": { - "version": "1.0.9", - "resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.9.tgz", - "integrity": "sha512-jNsYVVxU8v5g43Erja32laIDHXeoNvFEpX33OK4d6hljo3jDhCBDhx5dhCCTMWUojscpAagGiRkBKxpdl9fxqA==", - "optional": true, - "peer": true - }, - "node_modules/@tsconfig/node12": { - "version": "1.0.11", - "resolved": "https://registry.npmjs.org/@tsconfig/node12/-/node12-1.0.11.tgz", - "integrity": "sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag==", - "optional": true, - "peer": true - }, - "node_modules/@tsconfig/node14": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/@tsconfig/node14/-/node14-1.0.3.tgz", - "integrity": "sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow==", - "optional": true, - "peer": true - }, - "node_modules/@tsconfig/node16": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.3.tgz", - "integrity": "sha512-yOlFc+7UtL/89t2ZhjPvvB/DeAr3r+Dq58IgzsFkOAvVC6NMJXmCGjbptdXdR9qsX7pKcTL+s87FtYREi2dEEQ==", - "optional": true, - "peer": true - }, "node_modules/@types/chai": { "version": "4.3.4", "resolved": "https://registry.npmjs.org/@types/chai/-/chai-4.3.4.tgz", @@ -2372,10 +2329,12 @@ } }, "node_modules/@xenova/transformers": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.5.2.tgz", - "integrity": "sha512-zmCKv6xMoYPlLlH1f9gWYTsnKfjGDfVcoH0POnqIjT0leB+CljnN+YrPSE5B1/xnXQCGOF03e/SknOCqlLRzRw==", + "version": "2.17.2", + "resolved": "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.17.2.tgz", + "integrity": "sha512-lZmHqzrVIkSvZdKZEx7IYY51TK0WDrC8eR0c5IMnBsO8di8are1zzw8BlLhyO2TklZKLN5UffNGs1IJwT6oOqQ==", + "license": "Apache-2.0", "dependencies": { + "@huggingface/jinja": "^0.2.2", "onnxruntime-web": "1.14.0", "sharp": "^0.32.0" }, @@ -3339,13 +3298,6 @@ "url": "https://opencollective.com/core-js" } }, - "node_modules/create-require": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/create-require/-/create-require-1.1.1.tgz", - "integrity": "sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==", - "optional": true, - "peer": true - }, "node_modules/cross-spawn": { "version": "7.0.3", "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz", @@ -6721,13 +6673,6 @@ "semver": "bin/semver.js" } }, - "node_modules/make-error": { - "version": "1.3.6", - "resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz", - "integrity": "sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==", - "optional": true, - "peer": true - }, "node_modules/markdown-table": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.3.tgz", @@ -10247,77 +10192,6 @@ "url": "https://github.com/sponsors/wooorm" } }, - "node_modules/ts-node": { - "version": "10.9.1", - "resolved": "https://registry.npmjs.org/ts-node/-/ts-node-10.9.1.tgz", - "integrity": "sha512-NtVysVPkxxrwFGUUxGYhfux8k78pQB3JqYBXlLRZgdGUqTO5wU/UyHop5p70iEbGhB7q5KmiZiU0Y3KlJrScEw==", - "optional": true, - "peer": true, - "dependencies": { - "@cspotcode/source-map-support": "^0.8.0", - "@tsconfig/node10": "^1.0.7", - "@tsconfig/node12": "^1.0.7", - "@tsconfig/node14": "^1.0.0", - "@tsconfig/node16": "^1.0.2", - "acorn": "^8.4.1", - "acorn-walk": "^8.1.1", - "arg": "^4.1.0", - "create-require": "^1.1.0", - "diff": "^4.0.1", - "make-error": "^1.1.1", - "v8-compile-cache-lib": "^3.0.1", - "yn": "3.1.1" - }, - "bin": { - "ts-node": "dist/bin.js", - "ts-node-cwd": "dist/bin-cwd.js", - "ts-node-esm": "dist/bin-esm.js", - "ts-node-script": "dist/bin-script.js", - "ts-node-transpile-only": "dist/bin-transpile.js", - "ts-script": "dist/bin-script-deprecated.js" - }, - "peerDependencies": { - "@swc/core": ">=1.2.50", - "@swc/wasm": ">=1.2.50", - "@types/node": "*", - "typescript": ">=2.7" - }, - "peerDependenciesMeta": { - "@swc/core": { - "optional": true - }, - "@swc/wasm": { - "optional": true - } - } - }, - "node_modules/ts-node/node_modules/acorn-walk": { - "version": "8.2.0", - "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.2.0.tgz", - "integrity": "sha512-k+iyHEuPgSw6SbuDpGQM+06HQUa04DZ3o+F6CSzXMvvI5KMvnaEqXe+YVe555R9nn6GPt404fos4wcgpw12SDA==", - "optional": true, - "peer": true, - "engines": { - "node": ">=0.4.0" - } - }, - "node_modules/ts-node/node_modules/arg": { - "version": "4.1.3", - "resolved": "https://registry.npmjs.org/arg/-/arg-4.1.3.tgz", - "integrity": "sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==", - "optional": true, - "peer": true - }, - "node_modules/ts-node/node_modules/diff": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/diff/-/diff-4.0.2.tgz", - "integrity": "sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==", - "optional": true, - "peer": true, - "engines": { - "node": ">=0.3.1" - } - }, "node_modules/tsconfig-paths": { "version": "3.14.2", "resolved": "https://registry.npmjs.org/tsconfig-paths/-/tsconfig-paths-3.14.2.tgz", @@ -10691,13 +10565,6 @@ "node": ">=8" } }, - "node_modules/v8-compile-cache-lib": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", - "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==", - "optional": true, - "peer": true - }, "node_modules/v8-to-istanbul": { "version": "9.1.0", "resolved": "https://registry.npmjs.org/v8-to-istanbul/-/v8-to-istanbul-9.1.0.tgz", @@ -11198,16 +11065,6 @@ "node": ">=10" } }, - "node_modules/yn": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz", - "integrity": "sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==", - "optional": true, - "peer": true, - "engines": { - "node": ">=6" - } - }, "node_modules/yocto-queue": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz", @@ -11580,29 +11437,6 @@ "integrity": "sha512-0hYQ8SB4Db5zvZB4axdMHGwEaQjkZzFjQiN9LVYvIFB2nSUHW9tYpxWriPrWDASIxiaXax83REcLxuSdnGPZtw==", "dev": true }, - "@cspotcode/source-map-support": { - "version": "0.8.1", - "resolved": "https://registry.npmjs.org/@cspotcode/source-map-support/-/source-map-support-0.8.1.tgz", - "integrity": "sha512-IchNf6dN4tHoMFIn/7OE8LWZ19Y6q/67Bmf6vnGREv8RSbBVb9LPJxEcnwrcwX6ixSvaiGoomAUvu4YSxXrVgw==", - "optional": true, - "peer": true, - "requires": { - "@jridgewell/trace-mapping": "0.3.9" - }, - "dependencies": { - "@jridgewell/trace-mapping": { - "version": "0.3.9", - "resolved": "https://registry.npmjs.org/@jridgewell/trace-mapping/-/trace-mapping-0.3.9.tgz", - "integrity": "sha512-3Belt6tdc8bPgAtbcmdtNJlirVoTmEb5e2gC94PnkwEW9jI6CAHUeoG85tjWP5WquqfavoMtMwiG4P926ZKKuQ==", - "optional": true, - "peer": true, - "requires": { - "@jridgewell/resolve-uri": "^3.0.3", - "@jridgewell/sourcemap-codec": "^1.4.10" - } - } - } - }, "@dqbd/tiktoken": { "version": "1.0.2", "resolved": "https://registry.npmjs.org/@dqbd/tiktoken/-/tiktoken-1.0.2.tgz", @@ -11830,6 +11664,11 @@ "resolved": "https://registry.npmjs.org/@floating-ui/utils/-/utils-0.1.1.tgz", "integrity": "sha512-m0G6wlnhm/AX0H12IOWtK8gASEMffnX08RtKkCgTdHb9JpHKGloI7icFfLg9ZmQeavcvR0PKmzxClyuFPSjKWw==" }, + "@huggingface/jinja": { + "version": "0.2.2", + "resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.2.2.tgz", + "integrity": "sha512-/KPde26khDUIPkTGU82jdtTW9UAuvUTumCAbFs/7giR0SxsvZC4hru51PBvpijH6BVkHcROcvZM/lpy5h1jRRA==" + }, "@humanwhocodes/config-array": { "version": "0.11.8", "resolved": "https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.11.8.tgz", @@ -12418,34 +12257,6 @@ "lodash": "^4.17.21" } }, - "@tsconfig/node10": { - "version": "1.0.9", - "resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.9.tgz", - "integrity": "sha512-jNsYVVxU8v5g43Erja32laIDHXeoNvFEpX33OK4d6hljo3jDhCBDhx5dhCCTMWUojscpAagGiRkBKxpdl9fxqA==", - "optional": true, - "peer": true - }, - "@tsconfig/node12": { - "version": "1.0.11", - "resolved": "https://registry.npmjs.org/@tsconfig/node12/-/node12-1.0.11.tgz", - "integrity": "sha512-cqefuRsh12pWyGsIoBKJA9luFu3mRxCA+ORZvA4ktLSzIuCUtWVxGIuXigEwO5/ywWFMZ2QEGKWvkZG1zDMTag==", - "optional": true, - "peer": true - }, - "@tsconfig/node14": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/@tsconfig/node14/-/node14-1.0.3.tgz", - "integrity": "sha512-ysT8mhdixWK6Hw3i1V2AeRqZ5WfXg1G43mqoYlM2nc6388Fq5jcXyr5mRsqViLx/GJYdoL0bfXD8nmF+Zn/Iow==", - "optional": true, - "peer": true - }, - "@tsconfig/node16": { - "version": "1.0.3", - "resolved": "https://registry.npmjs.org/@tsconfig/node16/-/node16-1.0.3.tgz", - "integrity": "sha512-yOlFc+7UtL/89t2ZhjPvvB/DeAr3r+Dq58IgzsFkOAvVC6NMJXmCGjbptdXdR9qsX7pKcTL+s87FtYREi2dEEQ==", - "optional": true, - "peer": true - }, "@types/chai": { "version": "4.3.4", "resolved": "https://registry.npmjs.org/@types/chai/-/chai-4.3.4.tgz", @@ -12767,10 +12578,11 @@ } }, "@xenova/transformers": { - "version": "2.5.2", - "resolved": "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.5.2.tgz", - "integrity": "sha512-zmCKv6xMoYPlLlH1f9gWYTsnKfjGDfVcoH0POnqIjT0leB+CljnN+YrPSE5B1/xnXQCGOF03e/SknOCqlLRzRw==", + "version": "2.17.2", + "resolved": "https://registry.npmjs.org/@xenova/transformers/-/transformers-2.17.2.tgz", + "integrity": "sha512-lZmHqzrVIkSvZdKZEx7IYY51TK0WDrC8eR0c5IMnBsO8di8are1zzw8BlLhyO2TklZKLN5UffNGs1IJwT6oOqQ==", "requires": { + "@huggingface/jinja": "^0.2.2", "onnxruntime-node": "1.14.0", "onnxruntime-web": "1.14.0", "sharp": "^0.32.0" @@ -13447,13 +13259,6 @@ "resolved": "https://registry.npmjs.org/core-js/-/core-js-3.29.1.tgz", "integrity": "sha512-+jwgnhg6cQxKYIIjGtAHq2nwUOolo9eoFZ4sHfUH09BLXBgxnH4gA0zEd+t+BO2cNB8idaBtZFcFTRjQJRJmAw==" }, - "create-require": { - "version": "1.1.1", - "resolved": "https://registry.npmjs.org/create-require/-/create-require-1.1.1.tgz", - "integrity": "sha512-dcKFX3jn0MpIaXjisoRvexIJVEKzaq7z2rZKxf+MSr9TkdmHmsU4m2lcLojrj/FHl8mk5VxMmYA+ftRkP/3oKQ==", - "optional": true, - "peer": true - }, "cross-spawn": { "version": "7.0.3", "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.3.tgz", @@ -15709,13 +15514,6 @@ } } }, - "make-error": { - "version": "1.3.6", - "resolved": "https://registry.npmjs.org/make-error/-/make-error-1.3.6.tgz", - "integrity": "sha512-s8UhlNe7vPKomQhC1qFelMokr/Sc3AgNbso3n74mVPA5LTZwkB9NlXf4XPamLxJE8h0gh73rM94xvwRT2CVInw==", - "optional": true, - "peer": true - }, "markdown-table": { "version": "3.0.3", "resolved": "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.3.tgz", @@ -18072,51 +17870,6 @@ "resolved": "https://registry.npmjs.org/trough/-/trough-2.1.0.tgz", "integrity": "sha512-AqTiAOLcj85xS7vQ8QkAV41hPDIJ71XJB4RCUrzo/1GM2CQwhkJGaf9Hgr7BOugMRpgGUrqRg/DrBDl4H40+8g==" }, - "ts-node": { - "version": "10.9.1", - "resolved": "https://registry.npmjs.org/ts-node/-/ts-node-10.9.1.tgz", - "integrity": "sha512-NtVysVPkxxrwFGUUxGYhfux8k78pQB3JqYBXlLRZgdGUqTO5wU/UyHop5p70iEbGhB7q5KmiZiU0Y3KlJrScEw==", - "optional": true, - "peer": true, - "requires": { - "@cspotcode/source-map-support": "^0.8.0", - "@tsconfig/node10": "^1.0.7", - "@tsconfig/node12": "^1.0.7", - "@tsconfig/node14": "^1.0.0", - "@tsconfig/node16": "^1.0.2", - "acorn": "^8.4.1", - "acorn-walk": "^8.1.1", - "arg": "^4.1.0", - "create-require": "^1.1.0", - "diff": "^4.0.1", - "make-error": "^1.1.1", - "v8-compile-cache-lib": "^3.0.1", - "yn": "3.1.1" - }, - "dependencies": { - "acorn-walk": { - "version": "8.2.0", - "resolved": "https://registry.npmjs.org/acorn-walk/-/acorn-walk-8.2.0.tgz", - "integrity": "sha512-k+iyHEuPgSw6SbuDpGQM+06HQUa04DZ3o+F6CSzXMvvI5KMvnaEqXe+YVe555R9nn6GPt404fos4wcgpw12SDA==", - "optional": true, - "peer": true - }, - "arg": { - "version": "4.1.3", - "resolved": "https://registry.npmjs.org/arg/-/arg-4.1.3.tgz", - "integrity": "sha512-58S9QDqG0Xx27YwPSt9fJxivjYl432YCwfDMfZ+71RAqUrZef7LrKQZ3LHLOwCS4FLNBplP533Zx895SeOCHvA==", - "optional": true, - "peer": true - }, - "diff": { - "version": "4.0.2", - "resolved": "https://registry.npmjs.org/diff/-/diff-4.0.2.tgz", - "integrity": "sha512-58lmxKSA4BNyLz+HHMUzlOEpg09FV+ev6ZMe3vJihgdxzgcwZ8VoEEPmALCZG9LmqfVoNMMKpttIYTVG6uDY7A==", - "optional": true, - "peer": true - } - } - }, "tsconfig-paths": { "version": "3.14.2", "resolved": "https://registry.npmjs.org/tsconfig-paths/-/tsconfig-paths-3.14.2.tgz", @@ -18372,13 +18125,6 @@ "sade": "^1.7.3" } }, - "v8-compile-cache-lib": { - "version": "3.0.1", - "resolved": "https://registry.npmjs.org/v8-compile-cache-lib/-/v8-compile-cache-lib-3.0.1.tgz", - "integrity": "sha512-wa7YjyUGfNZngI/vtK0UHAN+lgDCxBPCylVXGp0zu59Fz5aiGtNXaq3DhIov063MorB+VfufLh3JlF2KdTK3xg==", - "optional": true, - "peer": true - }, "v8-to-istanbul": { "version": "9.1.0", "resolved": "https://registry.npmjs.org/v8-to-istanbul/-/v8-to-istanbul-9.1.0.tgz", @@ -18689,13 +18435,6 @@ "integrity": "sha512-y11nGElTIV+CT3Zv9t7VKl+Q3hTQoT9a1Qzezhhl6Rp21gJ/IVTW7Z3y9EWXhuUBC2Shnf+DX0antecpAwSP8w==", "dev": true }, - "yn": { - "version": "3.1.1", - "resolved": "https://registry.npmjs.org/yn/-/yn-3.1.1.tgz", - "integrity": "sha512-Ux4ygGWsu2c7isFWe8Yu1YluJmqVhxqK2cLXNQA5AcC3QfbGNpM7fu0Y8b/z16pXLnFxZYvWhd3fhBY9DLmC6Q==", - "optional": true, - "peer": true - }, "yocto-queue": { "version": "0.1.0", "resolved": "https://registry.npmjs.org/yocto-queue/-/yocto-queue-0.1.0.tgz", diff --git a/ui/package.json b/ui/package.json index e3fd3d7..06ed642 100644 --- a/ui/package.json +++ b/ui/package.json @@ -17,7 +17,7 @@ "@radix-ui/react-select": "^1.2.2", "@radix-ui/react-tooltip": "^1.0.6", "@tabler/icons-react": "^2.9.0", - "@xenova/transformers": "^2.5.2", + "@xenova/transformers": "^2.17.2", "chromadb": "^1.5.6", "class-variance-authority": "^0.7.0", "clsx": "^2.0.0", diff --git a/ui/pages/api/fetch-documents.ts b/ui/pages/api/fetch-documents.ts index 9304e48..c660075 100644 --- a/ui/pages/api/fetch-documents.ts +++ b/ui/pages/api/fetch-documents.ts @@ -1,25 +1,91 @@ -import type { NextApiRequest, NextApiResponse } from "next"; -import { ChromaClient, TransformersEmbeddingFunction } from "chromadb"; +import type { NextApiRequest, NextApiResponse } from 'next'; +import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; +import { pipeline } from '@xenova/transformers'; + +// Singleton for reranker to avoid reloading model on every request +class RerankerPipeline { + static instance: any = null; + static async getInstance() { + if (!this.instance) { + // Use a fast and efficient cross-encoder model for reranking + this.instance = pipeline('text-classification', 'Xenova/ms-marco-MiniLM-L-6-v2'); + } + return this.instance; + } +} export default async function handler(req: NextApiRequest, res: NextApiResponse) { try { const client = new ChromaClient({ - path: "http://chroma-server:8000", + path: process.env.CHROMA_PATH || 'http://chroma-server:8000', }); const query = req.body.input; + const finalNResults = Math.min(Number(req.body.nResults ?? 6), 10); + // Fetch more results initially to have a good pool for reranking + const fetchNResults = finalNResults * 3; + + // Metadata filtering support + const where = req.body.where || undefined; const embedder = new TransformersEmbeddingFunction(); - const collection = await client.getOrCreateCollection({ name: "default-collection", embeddingFunction: embedder }); + const collection = await client.getOrCreateCollection({ + name: 'default-collection', + embeddingFunction: embedder, + }); + + // 1. Retrieval Stage + const results = await collection.query({ + nResults: fetchNResults, + queryTexts: [query], + where, + include: ['documents', 'metadatas', 'distances'] as any, + }); + + const documents = results.documents[0] || []; + const metadatas = results.metadatas[0] || []; + const distances = results.distances?.[0] || []; + + if (documents.length === 0) { + return res.status(200).json(results); + } + + // 2. Reranking Stage + const reranker = await RerankerPipeline.getInstance(); + + // Format pairs for the cross encoder: [query, document] + const pairs = documents.map((doc) => [query, doc]); + + // Score the pairs + const scores = await reranker(pairs); + + // Combine all data to sort + const scoredResults = documents.map((doc, i) => ({ + document: doc, + metadata: metadatas[i], + distance: distances[i], + // For text-classification pipelines, the output is usually [{label: "LABEL_0", score: 0.99}, ...] + // Note: for some models we might need to extract the score for the positive class. + // ms-marco models output a single score or positive/negative. We use the raw score. + rerankScore: scores[i]?.score ?? 0, + })); + + // Sort by rerank score descending + scoredResults.sort((a, b) => b.rerankScore - a.rerankScore); + + // Take top K + const topResults = scoredResults.slice(0, finalNResults); - // query the collection - const results = await collection.query({ - nResults: 4, - queryTexts: [query] - }) + // Format back to ChromaDB query result format + const finalResult = { + documents: [topResults.map((r) => r.document)], + metadatas: [topResults.map((r) => r.metadata)], + distances: [topResults.map((r) => r.distance)], + rerankScores: [topResults.map((r) => r.rerankScore)], + }; - res.status(200).json(results); + res.status(200).json(finalResult); } catch (error) { if (error instanceof Error) { console.error('Error message:', error.message); diff --git a/ui/pages/api/inject-documents.ts b/ui/pages/api/inject-documents.ts index 532a635..729dfd9 100644 --- a/ui/pages/api/inject-documents.ts +++ b/ui/pages/api/inject-documents.ts @@ -3,10 +3,14 @@ import type { NextApiRequest, NextApiResponse } from 'next'; import { ChromaClient, TransformersEmbeddingFunction } from 'chromadb'; import { IncomingForm } from 'formidable'; import { PDFLoader } from 'langchain/document_loaders/fs/pdf'; -import { RecursiveCharacterTextSplitter } from "langchain/text_splitter"; +import { RecursiveCharacterTextSplitter } from 'langchain/text_splitter'; import path from 'path'; import { v4 as uuidv4 } from 'uuid'; +import { + buildScientificMetadata, + SCIENTIFIC_TEXT_SEPARATORS, +} from '@/utils/server/scientific-rag'; export const config = { api: { @@ -37,18 +41,22 @@ export default async function handler( const originalDocs = await loader.load(); - console.log(JSON.stringify(originalDocs)); - + console.log('Original docs loaded:', originalDocs.length); const splitter = new RecursiveCharacterTextSplitter({ chunkSize: 500, chunkOverlap: 100, - }); + separators: SCIENTIFIC_TEXT_SEPARATORS, + }); const docs = await splitter.splitDocuments(originalDocs); - + + // Extract filename for fallback title + const originalFileName = Array.isArray(files.pdf) ? files.pdf[0].originalFilename : (files.pdf as any).originalFilename; + const fallbackTitle = originalFileName || path.basename(files.pdf[0].filepath); + // Process the documents and perform other logic - const { ids, metadatas, documentContents } = processDocuments(docs); + const { ids, metadatas, documentContents } = processDocuments(docs, fallbackTitle); const embedder = new TransformersEmbeddingFunction(); const collection = await client.getOrCreateCollection({ @@ -75,28 +83,33 @@ export default async function handler( } } -function processDocuments(docs: any) { +function processDocuments(docs: any[], fallbackTitle: string) { const ids = []; const metadatas = []; const documentContents = []; - for (const document of docs) { + for (let i = 0; i < docs.length; i++) { + const document = docs[i]; + // Generate an ID for each document, or use some existing unique identifier const id = uuidv4(); ids.push(id); - const fallbackTitle = path.basename(document.metadata.source); - const titleFromMetadata = document.metadata.pdf.info.Title; - - const title = titleFromMetadata && titleFromMetadata.length > 0 ? titleFromMetadata : fallbackTitle; + // Build rich scientific metadata + const metadata = buildScientificMetadata(document, fallbackTitle, i); + + // Ensure ChromaDB only receives supported metadata types (string, boolean, number) + // We convert everything to strings/numbers + const safeMetadata: any = {}; + for (const [key, value] of Object.entries(metadata)) { + if (typeof value === 'string' || typeof value === 'number' || typeof value === 'boolean') { + safeMetadata[key] = value; + } else if (value !== undefined && value !== null) { + safeMetadata[key] = String(value); + } + } - - const metadata = { - title: title, - page: document.metadata.loc.pageNumber, // Define this function to extract chapter info - source: document.metadata.source, // Define this function to extract verse info - }; - metadatas.push(metadata); + metadatas.push(safeMetadata); // Add the page content to the documents array documentContents.push(document.pageContent); diff --git a/ui/pages/api/rag-chat.ts b/ui/pages/api/rag-chat.ts index ce84d67..b289b4b 100644 --- a/ui/pages/api/rag-chat.ts +++ b/ui/pages/api/rag-chat.ts @@ -29,8 +29,16 @@ async function fetchAndFormatDocuments(lastMessageContent: string) { } const data = await response.json(); - const result = data.metadatas[0].map((metadata: any, index: number) => { - return `Source ${index + 1}) Title: ${metadata.title}, Page: ${metadata.page}, Content: ${data.documents[0][index]}\n`; + const { formatRetrievedDocument } = require('@/utils/server/scientific-rag'); + + const result = data.documents[0].map((content: string, index: number) => { + return formatRetrievedDocument({ + content, + metadata: data.metadatas[0][index], + distance: data.distances?.[0]?.[index], + rerankScore: data.rerankScores?.[0]?.[index], + index, + }) + '\\n\\n'; }).join(''); console.log(result); @@ -107,7 +115,7 @@ const handler = async (req: Request): Promise => { { role: "user", content: codeBlock` - Here is the relevant documentation: + Here is the relevant documentation with scientific citations and relevance scores: ${relevantDocuments} `, }, @@ -130,7 +138,10 @@ const handler = async (req: Request): Promise => { - Prefer splitting your response into multiple paragraphs. `} ${oneLine` - - Output as markdown with citations based on the documentation. + - You MUST cite your sources exactly using the provided bracket format, e.g. [paper-title:pX:cY]. + `} + ${oneLine` + - Pay attention to the Relevance Score and Section metadata. Give priority to context with higher scores. `} `, }, diff --git a/ui/utils/server/scientific-rag.ts b/ui/utils/server/scientific-rag.ts new file mode 100644 index 0000000..1b55177 --- /dev/null +++ b/ui/utils/server/scientific-rag.ts @@ -0,0 +1,159 @@ +export type ScientificDocument = { + pageContent: string; + metadata: { + loc?: { + pageNumber?: number; + }; + pdf?: { + info?: { + Title?: string; + }; + }; + source?: string; + [key: string]: unknown; + }; +}; + +export type ScientificChunkMetadata = { + title: string; + page: number | string; + source: string; + section: string; + chunkIndex: number; + citationKey: string; +}; + +const SCIENTIFIC_SECTIONS = [ + 'abstract', + 'introduction', + 'background', + 'methods', + 'methodology', + 'materials and methods', + 'results', + 'discussion', + 'limitations', + 'conclusion', + 'references', +]; + +export const SCIENTIFIC_TEXT_SEPARATORS = [ + '\nAbstract', + '\nABSTRACT', + '\nIntroduction', + '\nINTRODUCTION', + '\nMethods', + '\nMETHODS', + '\nMaterials and Methods', + '\nResults', + '\nRESULTS', + '\nDiscussion', + '\nDISCUSSION', + '\nConclusion', + '\nCONCLUSION', + '\nReferences', + '\nREFERENCES', + '\n\n', + '\n', + '. ', + ' ', + '', +]; + +export const normalizeTitle = ( + titleFromMetadata: string | undefined, + fallbackTitle: string, +) => { + const title = titleFromMetadata?.trim(); + + return title && title.length > 0 ? title : fallbackTitle; +}; + +export const detectScientificSection = (content: string) => { + const firstLines = content + .split('\n') + .slice(0, 8) + .join(' ') + .toLowerCase(); + + for (const section of SCIENTIFIC_SECTIONS) { + const sectionRegex = new RegExp(`\\b${section}\\b`, 'i'); + + if (sectionRegex.test(firstLines)) { + return section; + } + } + + return 'body'; +}; + +export const buildCitationKey = ({ + title, + page, + chunkIndex, +}: { + title: string; + page: number | string; + chunkIndex: number; +}) => { + const slug = title + .toLowerCase() + .replace(/[^a-z0-9]+/g, '-') + .replace(/(^-|-$)/g, '') + .slice(0, 40); + + return `${slug || 'document'}:p${page}:c${chunkIndex + 1}`; +}; + +export const buildScientificMetadata = ( + document: ScientificDocument, + fallbackTitle: string, + chunkIndex: number, +): ScientificChunkMetadata => { + const title = normalizeTitle(document.metadata.pdf?.info?.Title, fallbackTitle); + const page = document.metadata.loc?.pageNumber ?? 'unknown'; + const section = detectScientificSection(document.pageContent); + + return { + title, + page, + source: document.metadata.source ?? fallbackTitle, + section, + chunkIndex, + citationKey: buildCitationKey({ title, page, chunkIndex }), + }; +}; + +export const formatRetrievedDocument = ({ + content, + metadata, + distance, + index, + rerankScore, +}: { + content: string; + metadata: Partial; + distance?: number; + index: number; + rerankScore?: number; +}) => { + const citationKey = metadata.citationKey ?? `source-${index + 1}`; + const page = metadata.page ?? 'unknown'; + const section = metadata.section ?? 'body'; + const scoreLine = typeof rerankScore === 'number' + ? `Relevance Score: ${rerankScore.toFixed(4)}\n` + : typeof distance === 'number' + ? `Distance: ${distance.toFixed(4)}\n` + : ''; + + return [ + `Source ${index + 1} [${citationKey}]`, + `Title: ${metadata.title ?? 'Untitled'}`, + `Page: ${page}`, + `Section: ${section}`, + scoreLine.trim(), + `Content: ${content}`, + ] + .filter(Boolean) + .join('\n'); +};