From d6a3c2821c77d767030f1bced44701543d67e2a5 Mon Sep 17 00:00:00 2001
From: ww2283
Date: Wed, 22 Oct 2025 14:10:47 -0400
Subject: [PATCH 1/6] feat: add metadata output to search results

- Add --show-metadata flag to display file paths in search results
- Preserve document metadata (file_path, file_name, timestamps) during chunking
- Update MCP tool schema to support show_metadata parameter
- Enhance CLI search output to display metadata when requested
- Fix pre-existing bug: args.backend -> args.backend_name

Resolves yichuan-w/LEANN#144
---
 packages/leann-core/src/leann/cli.py | 61 +++++++++++++++++++++++++---
 packages/leann-core/src/leann/mcp.py |  7 ++++
 2 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py
index 47ea88ce..7709259c 100644
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -254,6 +254,11 @@ def create_parser(self) -> argparse.ArgumentParser:
             action="store_true",
             help="Non-interactive mode: automatically select index without prompting",
         )
+        search_parser.add_argument(
+            "--show-metadata",
+            action="store_true",
+            help="Display file paths and metadata in search results",
+        )

         # Ask command
         ask_parser = subparsers.add_parser("ask", help="Ask questions")
@@ -1261,7 +1266,7 @@ def file_filter(
                 from .chunking_utils import create_text_chunks

                 # Use enhanced chunking with AST support
-                all_texts = create_text_chunks(
+                chunk_texts = create_text_chunks(
                     documents,
                     chunk_size=self.node_parser.chunk_size,
                     chunk_overlap=self.node_parser.chunk_overlap,
@@ -1272,6 +1277,17 @@ def file_filter(
                     ast_fallback_traditional=getattr(args, "ast_fallback_traditional", True),
                 )

+                # Note: AST chunking currently returns plain text chunks without metadata
+                # We preserve basic file info by associating chunks with their source documents
+                # For better metadata preservation, documents list order should be maintained
+                for chunk_text in chunk_texts:
+                    # TODO: Enhance create_text_chunks to return metadata alongside text
+                    # For now, we store chunks with empty metadata
+                    all_texts.append({
+                        "text": chunk_text,
+                        "metadata": {}
+                    })
+
             except ImportError as e:
                 print(
                     f"⚠️ AST chunking utilities not available in package ({e}), falling back to traditional chunking"
@@ -1283,14 +1299,30 @@ def file_filter(
             for doc in tqdm(documents, desc="Chunking documents", unit="doc"):
                 # Check if this is a code file based on source path
                 source_path = doc.metadata.get("source", "")
+                file_path = doc.metadata.get("file_path", "")
                 is_code_file = any(source_path.endswith(ext) for ext in code_file_exts)

+                # Extract metadata to preserve with chunks
+                chunk_metadata = {
+                    "file_path": file_path or source_path,
+                    "file_name": doc.metadata.get("file_name", ""),
+                }
+
+                # Add optional metadata if available
+                if "creation_date" in doc.metadata:
+                    chunk_metadata["creation_date"] = doc.metadata["creation_date"]
+                if "last_modified_date" in doc.metadata:
+                    chunk_metadata["last_modified_date"] = doc.metadata["last_modified_date"]
+
                 # Use appropriate parser based on file type
                 parser = self.code_parser if is_code_file else self.node_parser
                 nodes = parser.get_nodes_from_documents([doc])

                 for node in nodes:
-                    all_texts.append(node.get_content())
+                    all_texts.append({
+                        "text": node.get_content(),
+                        "metadata": chunk_metadata
+                    })

         print(f"Loaded {len(documents)} documents, {len(all_texts)} chunks")
         return all_texts
@@ -1365,7 +1397,7 @@ async def build_index(self, args):

         index_dir.mkdir(parents=True, exist_ok=True)

-        print(f"Building index '{index_name}' with {args.backend} backend...")
print(f"Building index '{index_name}' with {args.backend} backend...") + print(f"Building index '{index_name}' with {args.backend_name} backend...") embedding_options: dict[str, Any] = {} if args.embedding_mode == "ollama": @@ -1377,7 +1409,7 @@ async def build_index(self, args): embedding_options["api_key"] = resolved_embedding_key builder = LeannBuilder( - backend_name=args.backend, + backend_name=args.backend_name, embedding_model=args.embedding_model, embedding_mode=args.embedding_mode, embedding_options=embedding_options or None, @@ -1388,8 +1420,8 @@ async def build_index(self, args): num_threads=args.num_threads, ) - for chunk_text in all_texts: - builder.add_text(chunk_text) + for chunk in all_texts: + builder.add_text(chunk["text"], metadata=chunk["metadata"]) builder.build_index(index_path) print(f"Index built at {index_path}") @@ -1510,6 +1542,23 @@ async def search_documents(self, args): print(f"Search results for '{query}' (top {len(results)}):") for i, result in enumerate(results, 1): print(f"{i}. Score: {result.score:.3f}") + + # Display metadata if flag is set + if args.show_metadata and result.metadata: + file_path = result.metadata.get("file_path", "") + if file_path: + print(f" 📄 File: {file_path}") + + file_name = result.metadata.get("file_name", "") + if file_name and file_name != file_path: + print(f" 📝 Name: {file_name}") + + # Show timestamps if available + if "creation_date" in result.metadata: + print(f" 🕐 Created: {result.metadata['creation_date']}") + if "last_modified_date" in result.metadata: + print(f" 🕑 Modified: {result.metadata['last_modified_date']}") + print(f" {result.text[:200]}...") print() diff --git a/packages/leann-core/src/leann/mcp.py b/packages/leann-core/src/leann/mcp.py index d0577888..8ccde94b 100755 --- a/packages/leann-core/src/leann/mcp.py +++ b/packages/leann-core/src/leann/mcp.py @@ -60,6 +60,11 @@ def handle_request(request): "maximum": 128, "description": "Search complexity level. Use 16-32 for fast searches (recommended), 64+ for higher precision when needed.", }, + "show_metadata": { + "type": "boolean", + "default": False, + "description": "Include file paths and metadata in search results. Useful for understanding which files contain the results.", + }, }, "required": ["index_name", "query"], }, @@ -104,6 +109,8 @@ def handle_request(request): f"--complexity={args.get('complexity', 32)}", "--non-interactive", ] + if args.get("show_metadata", False): + cmd.append("--show-metadata") result = subprocess.run(cmd, capture_output=True, text=True) elif tool_name == "leann_list": From 76e16338ca565b125f8489ca0857d03e295da82d Mon Sep 17 00:00:00 2001 From: ww2283 Date: Wed, 22 Oct 2025 18:53:13 -0400 Subject: [PATCH 2/6] fix: resolve ZMQ linking issues in Python extension - Use pkg_check_modules IMPORTED_TARGET to create PkgConfig::ZMQ - Set PKG_CONFIG_PATH to prioritize ARM64 Homebrew on Apple Silicon - Override macOS -undefined dynamic_lookup to force proper symbol resolution - Use PUBLIC linkage for ZMQ in faiss library for transitive linking - Mark cppzmq includes as SYSTEM to suppress warnings Fixes editable install ZMQ symbol errors while maintaining compatibility across Linux, macOS Intel, and macOS ARM64 platforms. 
---
 packages/leann-backend-hnsw/CMakeLists.txt    | 19 ++++++++++++++++---
 packages/leann-backend-hnsw/third_party/faiss |  2 +-
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/packages/leann-backend-hnsw/CMakeLists.txt b/packages/leann-backend-hnsw/CMakeLists.txt
index 87d4592e..b588c259 100644
--- a/packages/leann-backend-hnsw/CMakeLists.txt
+++ b/packages/leann-backend-hnsw/CMakeLists.txt
@@ -29,12 +29,25 @@ if(APPLE)
   set(CMAKE_OSX_DEPLOYMENT_TARGET "11.0" CACHE STRING "Minimum macOS version")
 endif()

-# Use system ZeroMQ instead of building from source
+# Find ZMQ using pkg-config with IMPORTED_TARGET for automatic target creation
 find_package(PkgConfig REQUIRED)
-pkg_check_modules(ZMQ REQUIRED libzmq)
+
+# On ARM64 macOS, ensure pkg-config finds ARM64 Homebrew packages first
+if(APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
+  set(ENV{PKG_CONFIG_PATH} "/opt/homebrew/lib/pkgconfig:/opt/homebrew/share/pkgconfig:$ENV{PKG_CONFIG_PATH}")
+endif()
+
+pkg_check_modules(ZMQ REQUIRED IMPORTED_TARGET libzmq)
+
+# This creates PkgConfig::ZMQ target automatically with correct properties
+if(TARGET PkgConfig::ZMQ)
+  message(STATUS "Found and configured ZMQ target: PkgConfig::ZMQ")
+else()
+  message(FATAL_ERROR "pkg_check_modules did not create IMPORTED target for ZMQ.")
+endif()

 # Add cppzmq headers
-include_directories(third_party/cppzmq)
+include_directories(SYSTEM third_party/cppzmq)

 # Configure msgpack-c - disable boost dependency
 set(MSGPACK_USE_BOOST OFF CACHE BOOL "" FORCE)
diff --git a/packages/leann-backend-hnsw/third_party/faiss b/packages/leann-backend-hnsw/third_party/faiss
index 59527452..6ec1b9b1 160000
--- a/packages/leann-backend-hnsw/third_party/faiss
+++ b/packages/leann-backend-hnsw/third_party/faiss
@@ -1 +1 @@
-Subproject commit 595274523790e3bb5991437c3fc6032f170ebad9
+Subproject commit 6ec1b9b1c7bbb557aea7aa7190c38db6bd50f725

From 5073f312b6285596fed579e03dd37d906a2192e7 Mon Sep 17 00:00:00 2001
From: ww2283
Date: Wed, 22 Oct 2025 20:13:25 -0400
Subject: [PATCH 3/6] style: apply ruff formatting

---
 packages/leann-core/src/leann/cli.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py
index 7709259c..a60bfcd7 100644
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -1283,10 +1283,7 @@ def file_filter(
                 for chunk_text in chunk_texts:
                     # TODO: Enhance create_text_chunks to return metadata alongside text
                     # For now, we store chunks with empty metadata
-                    all_texts.append({
-                        "text": chunk_text,
-                        "metadata": {}
-                    })
+                    all_texts.append({"text": chunk_text, "metadata": {}})

             except ImportError as e:
                 print(
@@ -1319,10 +1316,7 @@ def file_filter(
                 nodes = parser.get_nodes_from_documents([doc])

                 for node in nodes:
-                    all_texts.append({
-                        "text": node.get_content(),
-                        "metadata": chunk_metadata
-                    })
+                    all_texts.append({"text": node.get_content(), "metadata": chunk_metadata})

         print(f"Loaded {len(documents)} documents, {len(all_texts)} chunks")
         return all_texts

From 585ef7785d5116f192d1a76057d9c58938554868 Mon Sep 17 00:00:00 2001
From: ww2283
Date: Sat, 25 Oct 2025 10:44:48 -0400
Subject: [PATCH 4/6] chore: update faiss submodule to use ww2283 fork

Use ww2283/faiss fork with fix/zmq-linking branch to resolve CI checkout
failures. The ZMQ linking fixes are not yet merged upstream.
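
Note for existing checkouts: after a submodule URL change, a re-sync is
typically needed. Standard git commands (not specific to this repo):

    git submodule sync packages/leann-backend-hnsw/third_party/faiss
    git submodule update --init --recursive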
---
 .gitmodules | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 359164c0..0d91b78f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -3,7 +3,8 @@
 	url = https://github.com/yichuan-w/DiskANN.git
 [submodule "packages/leann-backend-hnsw/third_party/faiss"]
 	path = packages/leann-backend-hnsw/third_party/faiss
-	url = https://github.com/yichuan-w/faiss.git
+	url = https://github.com/ww2283/faiss.git
+	branch = fix/zmq-linking
 [submodule "packages/leann-backend-hnsw/third_party/msgpack-c"]
 	path = packages/leann-backend-hnsw/third_party/msgpack-c
 	url = https://github.com/msgpack/msgpack-c.git

From d226f72bc0b65f9866ac7214f94c3da353e482e5 Mon Sep 17 00:00:00 2001
From: ww2283
Date: Thu, 23 Oct 2025 11:30:09 -0400
Subject: [PATCH 5/6] feat: implement true batch processing for Ollama embeddings

Migrate from deprecated /api/embeddings to modern /api/embed endpoint
which supports batch inputs. This reduces HTTP overhead by sending 32
texts per request instead of making individual API calls.

Changes:
- Update endpoint from /api/embeddings to /api/embed
- Change parameter from 'prompt' (single) to 'input' (array)
- Update response parsing for batch embeddings array
- Increase timeout to 60s for batch processing
- Improve error handling for batch requests

Performance:
- Reduces API calls by 32x (batch size)
- Eliminates HTTP connection overhead per text
- Note: Ollama still processes batch items sequentially internally

Related: #151
---
 .../leann-core/src/leann/embedding_compute.py | 104 +++++++++---------
 1 file changed, 54 insertions(+), 50 deletions(-)

diff --git a/packages/leann-core/src/leann/embedding_compute.py b/packages/leann-core/src/leann/embedding_compute.py
index 06fba3df..e3d9f865 100644
--- a/packages/leann-core/src/leann/embedding_compute.py
+++ b/packages/leann-core/src/leann/embedding_compute.py
@@ -574,9 +574,10 @@ def compute_embeddings_ollama(
     host: Optional[str] = None,
 ) -> np.ndarray:
     """
-    Compute embeddings using Ollama API with simplified batch processing.
+    Compute embeddings using Ollama API with true batch processing.

-    Uses batch size of 32 for MPS/CPU and 128 for CUDA to optimize performance.
+    Uses the /api/embed endpoint which supports batch inputs.
+    Batch size: 32 for MPS/CPU, 128 for CUDA to optimize performance.
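+
+    Illustrative request/response shape for /api/embed (keys match the
+    code below; the model name is only an example):
+        POST {host}/api/embed   {"model": "nomic-embed-text", "input": ["a", "b"]}
+        ->  {"embeddings": [[0.1, ...], [0.2, ...]]}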

     Args:
         texts: List of texts to compute embeddings for
@@ -681,11 +682,11 @@ def compute_embeddings_ollama(
             logger.info(f"Resolved model name '{model_name}' to '{resolved_model_name}'")
             model_name = resolved_model_name

-    # Verify the model supports embeddings by testing it
+    # Verify the model supports embeddings by testing it with /api/embed
     try:
         test_response = requests.post(
-            f"{resolved_host}/api/embeddings",
-            json={"model": model_name, "prompt": "test"},
+            f"{resolved_host}/api/embed",
+            json={"model": model_name, "input": "test"},
             timeout=10,
         )
         if test_response.status_code != 200:
@@ -717,56 +718,55 @@ def compute_embeddings_ollama(
         # If torch is not available, use conservative batch size
         batch_size = 32

-    logger.info(f"Using batch size: {batch_size}")
+    logger.info(f"Using batch size: {batch_size} for true batch processing")

     def get_batch_embeddings(batch_texts):
-        """Get embeddings for a batch of texts."""
-        all_embeddings = []
-        failed_indices = []
+        """Get embeddings for a batch of texts using /api/embed endpoint."""
+        max_retries = 3
+        retry_count = 0

-        for i, text in enumerate(batch_texts):
-            max_retries = 3
-            retry_count = 0
+        # Truncate very long texts to avoid API issues
+        truncated_texts = [text[:8000] if len(text) > 8000 else text for text in batch_texts]

-            # Truncate very long texts to avoid API issues
-            truncated_text = text[:8000] if len(text) > 8000 else text
-            while retry_count < max_retries:
-                try:
-                    response = requests.post(
-                        f"{resolved_host}/api/embeddings",
-                        json={"model": model_name, "prompt": truncated_text},
-                        timeout=30,
-                    )
-                    response.raise_for_status()
+        while retry_count < max_retries:
+            try:
+                # Use /api/embed endpoint with "input" parameter for batch processing
+                response = requests.post(
+                    f"{resolved_host}/api/embed",
+                    json={"model": model_name, "input": truncated_texts},
+                    timeout=60,  # Increased timeout for batch processing
+                )
+                response.raise_for_status()
+
+                result = response.json()
+                batch_embeddings = result.get("embeddings")

-                    result = response.json()
-                    embedding = result.get("embedding")
+                if batch_embeddings is None:
+                    raise ValueError("No embeddings returned from API")

-                    if embedding is None:
-                        raise ValueError(f"No embedding returned for text {i}")
+                if not isinstance(batch_embeddings, list):
+                    raise ValueError(f"Invalid embeddings format: {type(batch_embeddings)}")

-                    if not isinstance(embedding, list) or len(embedding) == 0:
-                        raise ValueError(f"Invalid embedding format for text {i}")
+                if len(batch_embeddings) != len(batch_texts):
+                    raise ValueError(
+                        f"Mismatch: requested {len(batch_texts)} embeddings, got {len(batch_embeddings)}"
+                    )
+
+                return batch_embeddings, []

-                    all_embeddings.append(embedding)
-                    break
+            except requests.exceptions.Timeout:
+                retry_count += 1
+                if retry_count >= max_retries:
+                    logger.warning(f"Timeout for batch after {max_retries} retries")
+                    return None, list(range(len(batch_texts)))

-                except requests.exceptions.Timeout:
-                    retry_count += 1
-                    if retry_count >= max_retries:
-                        logger.warning(f"Timeout for text {i} after {max_retries} retries")
-                        failed_indices.append(i)
-                        all_embeddings.append(None)
-                        break
+            except Exception as e:
+                retry_count += 1
+                if retry_count >= max_retries:
+                    logger.error(f"Failed to get embeddings for batch: {e}")
+                    return None, list(range(len(batch_texts)))

-                except Exception as e:
-                    retry_count += 1
-                    if retry_count >= max_retries:
-                        logger.error(f"Failed to get embedding for text {i}: {e}")
-                        failed_indices.append(i)
-                        all_embeddings.append(None)
-                        break
-        return all_embeddings, failed_indices
+        return None, list(range(len(batch_texts)))

     # Process texts in batches
     all_embeddings = []
@@ -784,7 +784,7 @@ def get_batch_embeddings(batch_texts):
     num_batches = (len(texts) + batch_size - 1) // batch_size

     if show_progress:
-        batch_iterator = tqdm(range(num_batches), desc="Computing Ollama embeddings")
+        batch_iterator = tqdm(range(num_batches), desc="Computing Ollama embeddings (batched)")
     else:
         batch_iterator = range(num_batches)

@@ -795,10 +795,14 @@ def get_batch_embeddings(batch_texts):

         batch_embeddings, batch_failed = get_batch_embeddings(batch_texts)

-        # Adjust failed indices to global indices
-        global_failed = [start_idx + idx for idx in batch_failed]
-        all_failed_indices.extend(global_failed)
-        all_embeddings.extend(batch_embeddings)
+        if batch_embeddings is not None:
+            all_embeddings.extend(batch_embeddings)
+        else:
+            # Entire batch failed, add None placeholders
+            all_embeddings.extend([None] * len(batch_texts))
+            # Adjust failed indices to global indices
+            global_failed = [start_idx + idx for idx in batch_failed]
+            all_failed_indices.extend(global_failed)

     # Handle failed embeddings
     if all_failed_indices:

From 6c8801480d978493bc75cdd1373289de77dbfd29 Mon Sep 17 00:00:00 2001
From: yichuan520030910320
Date: Thu, 30 Oct 2025 16:36:14 -0700
Subject: [PATCH 6/6] fall back to original faiss as I merged the PR

---
 .gitmodules                                   | 3 +--
 packages/leann-backend-hnsw/third_party/faiss | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 0d91b78f..359164c0 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -3,8 +3,7 @@
 	url = https://github.com/yichuan-w/DiskANN.git
 [submodule "packages/leann-backend-hnsw/third_party/faiss"]
 	path = packages/leann-backend-hnsw/third_party/faiss
-	url = https://github.com/ww2283/faiss.git
-	branch = fix/zmq-linking
+	url = https://github.com/yichuan-w/faiss.git
 [submodule "packages/leann-backend-hnsw/third_party/msgpack-c"]
 	path = packages/leann-backend-hnsw/third_party/msgpack-c
 	url = https://github.com/msgpack/msgpack-c.git
diff --git a/packages/leann-backend-hnsw/third_party/faiss b/packages/leann-backend-hnsw/third_party/faiss
index 6ec1b9b1..59527452 160000
--- a/packages/leann-backend-hnsw/third_party/faiss
+++ b/packages/leann-backend-hnsw/third_party/faiss
@@ -1 +1 @@
-Subproject commit 6ec1b9b1c7bbb557aea7aa7190c38db6bd50f725
+Subproject commit 595274523790e3bb5991437c3fc6032f170ebad9
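
Usage sketch for the --show-metadata feature from PATCH 1/6 (the index
name and query are examples; the CLI entry point `leann` and the MCP tool
name `leann_search` are assumed, not confirmed by the diffs above):

    # CLI: print file paths and timestamps alongside each hit
    leann search my-docs "zmq linking" --show-metadata --non-interactive

    # MCP: equivalent tool call, using the schema's show_metadata flag
    {"name": "leann_search",
     "arguments": {"index_name": "my-docs", "query": "zmq linking",
                   "show_metadata": true}}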