From d6a3c2821c77d767030f1bced44701543d67e2a5 Mon Sep 17 00:00:00 2001
From: ww2283
Date: Wed, 22 Oct 2025 14:10:47 -0400
Subject: [PATCH 1/6] feat: add metadata output to search results

- Add --show-metadata flag to display file paths in search results
- Preserve document metadata (file_path, file_name, timestamps) during chunking
- Update MCP tool schema to support show_metadata parameter
- Enhance CLI search output to display metadata when requested
- Fix pre-existing bug: args.backend -> args.backend_name

Resolves yichuan-w/LEANN#144
---
 packages/leann-core/src/leann/cli.py | 61 +++++++++++++++++++++++++---
 packages/leann-core/src/leann/mcp.py |  7 ++++
 2 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py
index 47ea88ce..7709259c 100644
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -254,6 +254,11 @@ def create_parser(self) -> argparse.ArgumentParser:
             action="store_true",
             help="Non-interactive mode: automatically select index without prompting",
         )
+        search_parser.add_argument(
+            "--show-metadata",
+            action="store_true",
+            help="Display file paths and metadata in search results",
+        )

         # Ask command
         ask_parser = subparsers.add_parser("ask", help="Ask questions")
@@ -1261,7 +1266,7 @@ def file_filter(
                 from .chunking_utils import create_text_chunks

                 # Use enhanced chunking with AST support
-                all_texts = create_text_chunks(
+                chunk_texts = create_text_chunks(
                     documents,
                     chunk_size=self.node_parser.chunk_size,
                     chunk_overlap=self.node_parser.chunk_overlap,
@@ -1272,6 +1277,17 @@ def file_filter(
                     ast_fallback_traditional=getattr(args, "ast_fallback_traditional", True),
                 )

+                # Note: AST chunking currently returns plain text chunks without metadata
+                # We preserve basic file info by associating chunks with their source documents
+                # For better metadata preservation, documents list order should be maintained
+                for chunk_text in chunk_texts:
+                    # TODO: Enhance create_text_chunks to return metadata alongside text
+                    # For now, we store chunks with empty metadata
+                    all_texts.append({
+                        "text": chunk_text,
+                        "metadata": {}
+                    })
+
             except ImportError as e:
                 print(
                     f"⚠️ AST chunking utilities not available in package ({e}), falling back to traditional chunking"
@@ -1283,14 +1299,30 @@ def file_filter(
             for doc in tqdm(documents, desc="Chunking documents", unit="doc"):
                 # Check if this is a code file based on source path
                 source_path = doc.metadata.get("source", "")
+                file_path = doc.metadata.get("file_path", "")
                 is_code_file = any(source_path.endswith(ext) for ext in code_file_exts)

+                # Extract metadata to preserve with chunks
+                chunk_metadata = {
+                    "file_path": file_path or source_path,
+                    "file_name": doc.metadata.get("file_name", ""),
+                }
+
+                # Add optional metadata if available
+                if "creation_date" in doc.metadata:
+                    chunk_metadata["creation_date"] = doc.metadata["creation_date"]
+                if "last_modified_date" in doc.metadata:
+                    chunk_metadata["last_modified_date"] = doc.metadata["last_modified_date"]
+
                 # Use appropriate parser based on file type
                 parser = self.code_parser if is_code_file else self.node_parser
                 nodes = parser.get_nodes_from_documents([doc])

                 for node in nodes:
-                    all_texts.append(node.get_content())
+                    all_texts.append({
+                        "text": node.get_content(),
+                        "metadata": chunk_metadata
+                    })

         print(f"Loaded {len(documents)} documents, {len(all_texts)} chunks")
         return all_texts
@@ -1365,7 +1397,7 @@ async def build_index(self, args):

         index_dir.mkdir(parents=True, exist_ok=True)

-        print(f"Building index '{index_name}' with {args.backend} backend...")
print(f"Building index '{index_name}' with {args.backend} backend...") + print(f"Building index '{index_name}' with {args.backend_name} backend...") embedding_options: dict[str, Any] = {} if args.embedding_mode == "ollama": @@ -1377,7 +1409,7 @@ async def build_index(self, args): embedding_options["api_key"] = resolved_embedding_key builder = LeannBuilder( - backend_name=args.backend, + backend_name=args.backend_name, embedding_model=args.embedding_model, embedding_mode=args.embedding_mode, embedding_options=embedding_options or None, @@ -1388,8 +1420,8 @@ async def build_index(self, args): num_threads=args.num_threads, ) - for chunk_text in all_texts: - builder.add_text(chunk_text) + for chunk in all_texts: + builder.add_text(chunk["text"], metadata=chunk["metadata"]) builder.build_index(index_path) print(f"Index built at {index_path}") @@ -1510,6 +1542,23 @@ async def search_documents(self, args): print(f"Search results for '{query}' (top {len(results)}):") for i, result in enumerate(results, 1): print(f"{i}. Score: {result.score:.3f}") + + # Display metadata if flag is set + if args.show_metadata and result.metadata: + file_path = result.metadata.get("file_path", "") + if file_path: + print(f" 📄 File: {file_path}") + + file_name = result.metadata.get("file_name", "") + if file_name and file_name != file_path: + print(f" 📝 Name: {file_name}") + + # Show timestamps if available + if "creation_date" in result.metadata: + print(f" 🕐 Created: {result.metadata['creation_date']}") + if "last_modified_date" in result.metadata: + print(f" 🕑 Modified: {result.metadata['last_modified_date']}") + print(f" {result.text[:200]}...") print() diff --git a/packages/leann-core/src/leann/mcp.py b/packages/leann-core/src/leann/mcp.py index d0577888..8ccde94b 100755 --- a/packages/leann-core/src/leann/mcp.py +++ b/packages/leann-core/src/leann/mcp.py @@ -60,6 +60,11 @@ def handle_request(request): "maximum": 128, "description": "Search complexity level. Use 16-32 for fast searches (recommended), 64+ for higher precision when needed.", }, + "show_metadata": { + "type": "boolean", + "default": False, + "description": "Include file paths and metadata in search results. Useful for understanding which files contain the results.", + }, }, "required": ["index_name", "query"], }, @@ -104,6 +109,8 @@ def handle_request(request): f"--complexity={args.get('complexity', 32)}", "--non-interactive", ] + if args.get("show_metadata", False): + cmd.append("--show-metadata") result = subprocess.run(cmd, capture_output=True, text=True) elif tool_name == "leann_list": From 76e16338ca565b125f8489ca0857d03e295da82d Mon Sep 17 00:00:00 2001 From: ww2283 Date: Wed, 22 Oct 2025 18:53:13 -0400 Subject: [PATCH 2/6] fix: resolve ZMQ linking issues in Python extension - Use pkg_check_modules IMPORTED_TARGET to create PkgConfig::ZMQ - Set PKG_CONFIG_PATH to prioritize ARM64 Homebrew on Apple Silicon - Override macOS -undefined dynamic_lookup to force proper symbol resolution - Use PUBLIC linkage for ZMQ in faiss library for transitive linking - Mark cppzmq includes as SYSTEM to suppress warnings Fixes editable install ZMQ symbol errors while maintaining compatibility across Linux, macOS Intel, and macOS ARM64 platforms. 
---
 packages/leann-backend-hnsw/CMakeLists.txt    | 19 ++++++++++++++++---
 packages/leann-backend-hnsw/third_party/faiss |  2 +-
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/packages/leann-backend-hnsw/CMakeLists.txt b/packages/leann-backend-hnsw/CMakeLists.txt
index 87d4592e..b588c259 100644
--- a/packages/leann-backend-hnsw/CMakeLists.txt
+++ b/packages/leann-backend-hnsw/CMakeLists.txt
@@ -29,12 +29,25 @@ if(APPLE)
   set(CMAKE_OSX_DEPLOYMENT_TARGET "11.0" CACHE STRING "Minimum macOS version")
 endif()

-# Use system ZeroMQ instead of building from source
+# Find ZMQ using pkg-config with IMPORTED_TARGET for automatic target creation
 find_package(PkgConfig REQUIRED)
-pkg_check_modules(ZMQ REQUIRED libzmq)
+
+# On ARM64 macOS, ensure pkg-config finds ARM64 Homebrew packages first
+if(APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
+  set(ENV{PKG_CONFIG_PATH} "/opt/homebrew/lib/pkgconfig:/opt/homebrew/share/pkgconfig:$ENV{PKG_CONFIG_PATH}")
+endif()
+
+pkg_check_modules(ZMQ REQUIRED IMPORTED_TARGET libzmq)
+
+# This creates PkgConfig::ZMQ target automatically with correct properties
+if(TARGET PkgConfig::ZMQ)
+  message(STATUS "Found and configured ZMQ target: PkgConfig::ZMQ")
+else()
+  message(FATAL_ERROR "pkg_check_modules did not create IMPORTED target for ZMQ.")
+endif()

 # Add cppzmq headers
-include_directories(third_party/cppzmq)
+include_directories(SYSTEM third_party/cppzmq)

 # Configure msgpack-c - disable boost dependency
 set(MSGPACK_USE_BOOST OFF CACHE BOOL "" FORCE)
diff --git a/packages/leann-backend-hnsw/third_party/faiss b/packages/leann-backend-hnsw/third_party/faiss
index 59527452..6ec1b9b1 160000
--- a/packages/leann-backend-hnsw/third_party/faiss
+++ b/packages/leann-backend-hnsw/third_party/faiss
@@ -1 +1 @@
-Subproject commit 595274523790e3bb5991437c3fc6032f170ebad9
+Subproject commit 6ec1b9b1c7bbb557aea7aa7190c38db6bd50f725

From 5073f312b6285596fed579e03dd37d906a2192e7 Mon Sep 17 00:00:00 2001
From: ww2283
Date: Wed, 22 Oct 2025 20:13:25 -0400
Subject: [PATCH 3/6] style: apply ruff formatting

---
 packages/leann-core/src/leann/cli.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/packages/leann-core/src/leann/cli.py b/packages/leann-core/src/leann/cli.py
index 7709259c..a60bfcd7 100644
--- a/packages/leann-core/src/leann/cli.py
+++ b/packages/leann-core/src/leann/cli.py
@@ -1283,10 +1283,7 @@ def file_filter(
                 for chunk_text in chunk_texts:
                     # TODO: Enhance create_text_chunks to return metadata alongside text
                     # For now, we store chunks with empty metadata
-                    all_texts.append({
-                        "text": chunk_text,
-                        "metadata": {}
-                    })
+                    all_texts.append({"text": chunk_text, "metadata": {}})

             except ImportError as e:
                 print(
@@ -1319,10 +1316,7 @@ def file_filter(
                 nodes = parser.get_nodes_from_documents([doc])

                 for node in nodes:
-                    all_texts.append({
-                        "text": node.get_content(),
-                        "metadata": chunk_metadata
-                    })
+                    all_texts.append({"text": node.get_content(), "metadata": chunk_metadata})

         print(f"Loaded {len(documents)} documents, {len(all_texts)} chunks")
         return all_texts

From 585ef7785d5116f192d1a76057d9c58938554868 Mon Sep 17 00:00:00 2001
From: ww2283
Date: Sat, 25 Oct 2025 10:44:48 -0400
Subject: [PATCH 4/6] chore: update faiss submodule to use ww2283 fork

Use ww2283/faiss fork with fix/zmq-linking branch to resolve CI checkout
failures. The ZMQ linking fixes are not yet merged upstream.
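
Note for existing checkouts: after a submodule URL change, a re-sync is
typically needed. Standard git commands (not specific to this repo):

    git submodule sync packages/leann-backend-hnsw/third_party/faiss
    git submodule update --init --recursive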
---
 .gitmodules | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.gitmodules b/.gitmodules
index 359164c0..0d91b78f 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -3,7 +3,8 @@
 	url = https://github.com/yichuan-w/DiskANN.git
 [submodule "packages/leann-backend-hnsw/third_party/faiss"]
 	path = packages/leann-backend-hnsw/third_party/faiss
-	url = https://github.com/yichuan-w/faiss.git
+	url = https://github.com/ww2283/faiss.git
+	branch = fix/zmq-linking
 [submodule "packages/leann-backend-hnsw/third_party/msgpack-c"]
 	path = packages/leann-backend-hnsw/third_party/msgpack-c
 	url = https://github.com/msgpack/msgpack-c.git

From d226f72bc0b65f9866ac7214f94c3da353e482e5 Mon Sep 17 00:00:00 2001
From: ww2283
Date: Thu, 23 Oct 2025 11:30:09 -0400
Subject: [PATCH 5/6] feat: implement true batch processing for Ollama embeddings

Migrate from deprecated /api/embeddings to modern /api/embed endpoint
which supports batch inputs. This reduces HTTP overhead by sending 32
texts per request instead of making individual API calls.

Changes:
- Update endpoint from /api/embeddings to /api/embed
- Change parameter from 'prompt' (single) to 'input' (array)
- Update response parsing for batch embeddings array
- Increase timeout to 60s for batch processing
- Improve error handling for batch requests

Performance:
- Reduces API calls by 32x (batch size)
- Eliminates HTTP connection overhead per text
- Note: Ollama still processes batch items sequentially internally

Related: #151
---
 .../leann-core/src/leann/embedding_compute.py | 104 +++++++++---------
 1 file changed, 54 insertions(+), 50 deletions(-)

diff --git a/packages/leann-core/src/leann/embedding_compute.py b/packages/leann-core/src/leann/embedding_compute.py
index 06fba3df..e3d9f865 100644
--- a/packages/leann-core/src/leann/embedding_compute.py
+++ b/packages/leann-core/src/leann/embedding_compute.py
@@ -574,9 +574,10 @@ def compute_embeddings_ollama(
     host: Optional[str] = None,
 ) -> np.ndarray:
     """
-    Compute embeddings using Ollama API with simplified batch processing.
+    Compute embeddings using Ollama API with true batch processing.

-    Uses batch size of 32 for MPS/CPU and 128 for CUDA to optimize performance.
+    Uses the /api/embed endpoint which supports batch inputs.
+    Batch size: 32 for MPS/CPU, 128 for CUDA to optimize performance.
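+
+    Illustrative request/response shape for /api/embed (keys match the
+    code below; the model name is only an example):
+        POST {host}/api/embed   {"model": "nomic-embed-text", "input": ["a", "b"]}
+        ->  {"embeddings": [[0.1, ...], [0.2, ...]]}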

     Args:
         texts: List of texts to compute embeddings for
@@ -681,11 +682,11 @@ def compute_embeddings_ollama(
             logger.info(f"Resolved model name '{model_name}' to '{resolved_model_name}'")
             model_name = resolved_model_name

-    # Verify the model supports embeddings by testing it
+    # Verify the model supports embeddings by testing it with /api/embed
     try:
         test_response = requests.post(
-            f"{resolved_host}/api/embeddings",
-            json={"model": model_name, "prompt": "test"},
+            f"{resolved_host}/api/embed",
+            json={"model": model_name, "input": "test"},
             timeout=10,
         )
         if test_response.status_code != 200:
@@ -717,56 +718,55 @@ def compute_embeddings_ollama(
         # If torch is not available, use conservative batch size
         batch_size = 32

-    logger.info(f"Using batch size: {batch_size}")
+    logger.info(f"Using batch size: {batch_size} for true batch processing")

     def get_batch_embeddings(batch_texts):
-        """Get embeddings for a batch of texts."""
-        all_embeddings = []
-        failed_indices = []
+        """Get embeddings for a batch of texts using /api/embed endpoint."""
+        max_retries = 3
+        retry_count = 0

-        for i, text in enumerate(batch_texts):
-            max_retries = 3
-            retry_count = 0
+        # Truncate very long texts to avoid API issues
+        truncated_texts = [text[:8000] if len(text) > 8000 else text for text in batch_texts]

-            # Truncate very long texts to avoid API issues
-            truncated_text = text[:8000] if len(text) > 8000 else text
-            while retry_count < max_retries:
-                try:
-                    response = requests.post(
-                        f"{resolved_host}/api/embeddings",
-                        json={"model": model_name, "prompt": truncated_text},
-                        timeout=30,
-                    )
-                    response.raise_for_status()
+        while retry_count < max_retries:
+            try:
+                # Use /api/embed endpoint with "input" parameter for batch processing
+                response = requests.post(
+                    f"{resolved_host}/api/embed",
+                    json={"model": model_name, "input": truncated_texts},
+                    timeout=60,  # Increased timeout for batch processing
+                )
+                response.raise_for_status()
+
+                result = response.json()
+                batch_embeddings = result.get("embeddings")

-                    result = response.json()
-                    embedding = result.get("embedding")
+                if batch_embeddings is None:
+                    raise ValueError("No embeddings returned from API")

-                    if embedding is None:
-                        raise ValueError(f"No embedding returned for text {i}")
+                if not isinstance(batch_embeddings, list):
+                    raise ValueError(f"Invalid embeddings format: {type(batch_embeddings)}")

-                    if not isinstance(embedding, list) or len(embedding) == 0:
-                        raise ValueError(f"Invalid embedding format for text {i}")
+                if len(batch_embeddings) != len(batch_texts):
+                    raise ValueError(
+                        f"Mismatch: requested {len(batch_texts)} embeddings, got {len(batch_embeddings)}"
+                    )
+
+                return batch_embeddings, []

-                    all_embeddings.append(embedding)
-                    break
+            except requests.exceptions.Timeout:
+                retry_count += 1
+                if retry_count >= max_retries:
+                    logger.warning(f"Timeout for batch after {max_retries} retries")
+                    return None, list(range(len(batch_texts)))

-                except requests.exceptions.Timeout:
-                    retry_count += 1
-                    if retry_count >= max_retries:
-                        logger.warning(f"Timeout for text {i} after {max_retries} retries")
-                        failed_indices.append(i)
-                        all_embeddings.append(None)
-                        break
+            except Exception as e:
+                retry_count += 1
+                if retry_count >= max_retries:
+                    logger.error(f"Failed to get embeddings for batch: {e}")
+                    return None, list(range(len(batch_texts)))

-                except Exception as e:
-                    retry_count += 1
-                    if retry_count >= max_retries:
-                        logger.error(f"Failed to get embedding for text {i}: {e}")
-                        failed_indices.append(i)
-                        all_embeddings.append(None)
-                        break
-        return all_embeddings, failed_indices
+        return None, list(range(len(batch_texts)))

     # Process texts in batches
     all_embeddings = []
@@ -784,7 +784,7 @@ def get_batch_embeddings(batch_texts):
     num_batches = (len(texts) + batch_size - 1) // batch_size

     if show_progress:
-        batch_iterator = tqdm(range(num_batches), desc="Computing Ollama embeddings")
+        batch_iterator = tqdm(range(num_batches), desc="Computing Ollama embeddings (batched)")
     else:
         batch_iterator = range(num_batches)

@@ -795,10 +795,14 @@ def get_batch_embeddings(batch_texts):

         batch_embeddings, batch_failed = get_batch_embeddings(batch_texts)

-        # Adjust failed indices to global indices
-        global_failed = [start_idx + idx for idx in batch_failed]
-        all_failed_indices.extend(global_failed)
-        all_embeddings.extend(batch_embeddings)
+        if batch_embeddings is not None:
+            all_embeddings.extend(batch_embeddings)
+        else:
+            # Entire batch failed, add None placeholders
+            all_embeddings.extend([None] * len(batch_texts))
+            # Adjust failed indices to global indices
+            global_failed = [start_idx + idx for idx in batch_failed]
+            all_failed_indices.extend(global_failed)

     # Handle failed embeddings
     if all_failed_indices:

From 6c8801480d978493bc75cdd1373289de77dbfd29 Mon Sep 17 00:00:00 2001
From: yichuan520030910320
Date: Thu, 30 Oct 2025 16:36:14 -0700
Subject: [PATCH 6/6] fall back to original faiss as I merged the PR

---
 .gitmodules                                   | 3 +--
 packages/leann-backend-hnsw/third_party/faiss | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 0d91b78f..359164c0 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -3,8 +3,7 @@
 	url = https://github.com/yichuan-w/DiskANN.git
 [submodule "packages/leann-backend-hnsw/third_party/faiss"]
 	path = packages/leann-backend-hnsw/third_party/faiss
-	url = https://github.com/ww2283/faiss.git
-	branch = fix/zmq-linking
+	url = https://github.com/yichuan-w/faiss.git
 [submodule "packages/leann-backend-hnsw/third_party/msgpack-c"]
 	path = packages/leann-backend-hnsw/third_party/msgpack-c
 	url = https://github.com/msgpack/msgpack-c.git
diff --git a/packages/leann-backend-hnsw/third_party/faiss b/packages/leann-backend-hnsw/third_party/faiss
index 6ec1b9b1..59527452 160000
--- a/packages/leann-backend-hnsw/third_party/faiss
+++ b/packages/leann-backend-hnsw/third_party/faiss
@@ -1 +1 @@
-Subproject commit 6ec1b9b1c7bbb557aea7aa7190c38db6bd50f725
+Subproject commit 595274523790e3bb5991437c3fc6032f170ebad9
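
Usage sketch for the --show-metadata feature from PATCH 1/6 (the index
name and query are examples; the CLI entry point `leann` and the MCP tool
name `leann_search` are assumed, not confirmed by the diffs above):

    # CLI: print file paths and timestamps alongside each hit
    leann search my-docs "zmq linking" --show-metadata --non-interactive

    # MCP: equivalent tool call, using the schema's show_metadata flag
    {"name": "leann_search",
     "arguments": {"index_name": "my-docs", "query": "zmq linking",
                   "show_metadata": true}}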