Merged
Changes from 6 commits
3 changes: 2 additions & 1 deletion .gitmodules
@@ -3,7 +3,8 @@
url = https://github.com/yichuan-w/DiskANN.git
[submodule "packages/leann-backend-hnsw/third_party/faiss"]
path = packages/leann-backend-hnsw/third_party/faiss
url = https://github.com/yichuan-w/faiss.git
url = https://github.com/ww2283/faiss.git
branch = fix/zmq-linking
[submodule "packages/leann-backend-hnsw/third_party/msgpack-c"]
path = packages/leann-backend-hnsw/third_party/msgpack-c
url = https://github.com/msgpack/msgpack-c.git
19 changes: 16 additions & 3 deletions packages/leann-backend-hnsw/CMakeLists.txt
@@ -29,12 +29,25 @@ if(APPLE)
set(CMAKE_OSX_DEPLOYMENT_TARGET "11.0" CACHE STRING "Minimum macOS version")
endif()

# Use system ZeroMQ instead of building from source
# Find ZMQ using pkg-config with IMPORTED_TARGET for automatic target creation
find_package(PkgConfig REQUIRED)
pkg_check_modules(ZMQ REQUIRED libzmq)

# On ARM64 macOS, ensure pkg-config finds ARM64 Homebrew packages first
if(APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64")
set(ENV{PKG_CONFIG_PATH} "/opt/homebrew/lib/pkgconfig:/opt/homebrew/share/pkgconfig:$ENV{PKG_CONFIG_PATH}")
endif()

pkg_check_modules(ZMQ REQUIRED IMPORTED_TARGET libzmq)

# This creates PkgConfig::ZMQ target automatically with correct properties
if(TARGET PkgConfig::ZMQ)
message(STATUS "Found and configured ZMQ target: PkgConfig::ZMQ")
else()
message(FATAL_ERROR "pkg_check_modules did not create IMPORTED target for ZMQ.")
endif()

# Add cppzmq headers
include_directories(third_party/cppzmq)
include_directories(SYSTEM third_party/cppzmq)

# Configure msgpack-c - disable boost dependency
set(MSGPACK_USE_BOOST OFF CACHE BOOL "" FORCE)
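For reference, the ZMQ discovery above depends on pkg-config resolving the ARM64 Homebrew libzmq. Below is a minimal sketch, outside CMake, of checking what pkg-config resolves once that path is prepended; the Homebrew prefix is an assumption and may differ on other installs.

# Sanity check (not part of the PR): ask pkg-config which libzmq it resolves
# with the ARM64 Homebrew path prepended, mirroring the CMakeLists.txt change.
import os
import subprocess

env = dict(os.environ)
# Assumed Homebrew prefix for Apple silicon; adjust if your install differs.
env["PKG_CONFIG_PATH"] = (
    "/opt/homebrew/lib/pkgconfig:/opt/homebrew/share/pkgconfig:"
    + env.get("PKG_CONFIG_PATH", "")
)

result = subprocess.run(
    ["pkg-config", "--cflags", "--libs", "libzmq"],
    env=env,
    capture_output=True,
    text=True,
    check=True,  # raises if libzmq is not found
)
print(result.stdout.strip())  # e.g. -I/opt/homebrew/include ... -L/opt/homebrew/lib -lzmq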
2 changes: 1 addition & 1 deletion packages/leann-backend-hnsw/third_party/faiss
60 changes: 49 additions & 11 deletions packages/leann-core/src/leann/cli.py
@@ -255,6 +255,11 @@ def create_parser(self) -> argparse.ArgumentParser:
action="store_true",
help="Non-interactive mode: automatically select index without prompting",
)
search_parser.add_argument(
"--show-metadata",
action="store_true",
help="Display file paths and metadata in search results",
)

# Ask command
ask_parser = subparsers.add_parser("ask", help="Ask questions")
@@ -1263,7 +1268,7 @@ def file_filter(
from .chunking_utils import create_text_chunks

# Use enhanced chunking with AST support
all_texts = create_text_chunks(
chunk_texts = create_text_chunks(
documents,
chunk_size=self.node_parser.chunk_size,
chunk_overlap=self.node_parser.chunk_overlap,
@@ -1274,6 +1279,14 @@
ast_fallback_traditional=getattr(args, "ast_fallback_traditional", True),
)

# Note: AST chunking currently returns plain text chunks without metadata
# We preserve basic file info by associating chunks with their source documents
# For better metadata preservation, documents list order should be maintained
for chunk_text in chunk_texts:
# TODO: Enhance create_text_chunks to return metadata alongside text
# For now, we store chunks with empty metadata
all_texts.append({"text": chunk_text, "metadata": {}})

except ImportError as e:
print(
f"⚠️ AST chunking utilities not available in package ({e}), falling back to traditional chunking"
@@ -1285,17 +1298,27 @@
for doc in tqdm(documents, desc="Chunking documents", unit="doc"):
# Check if this is a code file based on source path
source_path = doc.metadata.get("source", "")
file_path = doc.metadata.get("file_path", "")
is_code_file = any(source_path.endswith(ext) for ext in code_file_exts)

# Extract metadata to preserve with chunks
chunk_metadata = {
"file_path": file_path or source_path,
"file_name": doc.metadata.get("file_name", ""),
}

# Add optional metadata if available
if "creation_date" in doc.metadata:
chunk_metadata["creation_date"] = doc.metadata["creation_date"]
if "last_modified_date" in doc.metadata:
chunk_metadata["last_modified_date"] = doc.metadata["last_modified_date"]

# Use appropriate parser based on file type
parser = self.code_parser if is_code_file else self.node_parser
nodes = parser.get_nodes_from_documents([doc])

for node in nodes:
text_with_source = (
"Chunk source:" + source_path + "\n" + node.get_content().replace("\n", " ")
)
all_texts.append(text_with_source)
all_texts.append({"text": node.get_content(), "metadata": chunk_metadata})

print(f"Loaded {len(documents)} documents, {len(all_texts)} chunks")
return all_texts
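For reference, after this change each entry in all_texts is a dict rather than a bare string with the source prepended. A rough illustration of the chunk shape is below; the values are invented, and the date fields appear only when the document loader supplies them.

# Hypothetical example of one entry in all_texts (values invented):
chunk = {
    "text": "def load_config(path): ...",
    "metadata": {
        "file_path": "src/config.py",
        "file_name": "config.py",
        # Present only when the loader provides them:
        "creation_date": "2024-05-01",
        "last_modified_date": "2024-06-12",
    },
}
# build_index() later passes these through as
# builder.add_text(chunk["text"], metadata=chunk["metadata"])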
@@ -1370,7 +1393,7 @@ async def build_index(self, args):

index_dir.mkdir(parents=True, exist_ok=True)

print(f"Building index '{index_name}' with {args.backend} backend...")
print(f"Building index '{index_name}' with {args.backend_name} backend...")

embedding_options: dict[str, Any] = {}
if args.embedding_mode == "ollama":
@@ -1382,7 +1405,7 @@
embedding_options["api_key"] = resolved_embedding_key

builder = LeannBuilder(
backend_name=args.backend,
backend_name=args.backend_name,
embedding_model=args.embedding_model,
embedding_mode=args.embedding_mode,
embedding_options=embedding_options or None,
@@ -1393,10 +1416,8 @@
num_threads=args.num_threads,
)

for chunk_text_with_source in all_texts:
chunk_source = chunk_text_with_source.split("\n")[0].split(":")[1]
chunk_text = chunk_text_with_source.split("\n")[1]
builder.add_text(chunk_text, {"source": chunk_source})
for chunk in all_texts:
builder.add_text(chunk["text"], metadata=chunk["metadata"])

builder.build_index(index_path)
print(f"Index built at {index_path}")
@@ -1517,6 +1538,23 @@ async def search_documents(self, args):
print(f"Search results for '{query}' (top {len(results)}):")
for i, result in enumerate(results, 1):
print(f"{i}. Score: {result.score:.3f}")

# Display metadata if flag is set
if args.show_metadata and result.metadata:
file_path = result.metadata.get("file_path", "")
if file_path:
print(f" 📄 File: {file_path}")

file_name = result.metadata.get("file_name", "")
if file_name and file_name != file_path:
print(f" 📝 Name: {file_name}")

# Show timestamps if available
if "creation_date" in result.metadata:
print(f" 🕐 Created: {result.metadata['creation_date']}")
if "last_modified_date" in result.metadata:
print(f" 🕑 Modified: {result.metadata['last_modified_date']}")

print(f" {result.text[:200]}...")
print(f" Source: {result.metadata.get('source', '')}")
print()
104 changes: 54 additions & 50 deletions packages/leann-core/src/leann/embedding_compute.py
@@ -574,9 +574,10 @@ def compute_embeddings_ollama(
host: Optional[str] = None,
) -> np.ndarray:
"""
Compute embeddings using Ollama API with simplified batch processing.
Compute embeddings using Ollama API with true batch processing.

Uses batch size of 32 for MPS/CPU and 128 for CUDA to optimize performance.
Uses the /api/embed endpoint which supports batch inputs.
Batch size: 32 for MPS/CPU, 128 for CUDA to optimize performance.

Args:
texts: List of texts to compute embeddings for
@@ -681,11 +682,11 @@
logger.info(f"Resolved model name '{model_name}' to '{resolved_model_name}'")
model_name = resolved_model_name

# Verify the model supports embeddings by testing it
# Verify the model supports embeddings by testing it with /api/embed
try:
test_response = requests.post(
f"{resolved_host}/api/embeddings",
json={"model": model_name, "prompt": "test"},
f"{resolved_host}/api/embed",
json={"model": model_name, "input": "test"},
timeout=10,
)
if test_response.status_code != 200:
@@ -717,56 +718,55 @@
# If torch is not available, use conservative batch size
batch_size = 32

logger.info(f"Using batch size: {batch_size}")
logger.info(f"Using batch size: {batch_size} for true batch processing")

def get_batch_embeddings(batch_texts):
"""Get embeddings for a batch of texts."""
all_embeddings = []
failed_indices = []
"""Get embeddings for a batch of texts using /api/embed endpoint."""
max_retries = 3
retry_count = 0

for i, text in enumerate(batch_texts):
max_retries = 3
retry_count = 0
# Truncate very long texts to avoid API issues
truncated_texts = [text[:8000] if len(text) > 8000 else text for text in batch_texts]

# Truncate very long texts to avoid API issues
truncated_text = text[:8000] if len(text) > 8000 else text
while retry_count < max_retries:
try:
response = requests.post(
f"{resolved_host}/api/embeddings",
json={"model": model_name, "prompt": truncated_text},
timeout=30,
)
response.raise_for_status()
while retry_count < max_retries:
try:
# Use /api/embed endpoint with "input" parameter for batch processing
response = requests.post(
f"{resolved_host}/api/embed",
json={"model": model_name, "input": truncated_texts},
timeout=60, # Increased timeout for batch processing
Review thread on this line:

Owner: I am wondering if this will result in OOM? If you test on a large scale, I think I am fine with this.

Contributor Author: I will keep this in mind for the next step and will closely monitor its behavior. Thanks for the merge! I will double-check that the conflicts are resolved before the next step. Currently Ollama has a limitation: the batched request is received correctly, but it is not actually batched internally, unlike other clients such as LM Studio. LM Studio uses the OpenAI-compatible endpoint and does not OOM, so I assume Ollama should be fine even once they implement proper batching; for now, the batching path is ready on the Ollama side. Sadly, a headless server that auto-loads and unloads models with proper JIT behavior is still smoothest with Ollama; the next closest solution is llama-swap, but it is not as convenient. Currently, the fastest option on Apple silicon is either Ollama with an MoE embedding model (of which we currently only have that nomad v2) or LM Studio with embeddinggemma, which offers speed equivalent to the Ollama-hosted MoE. embeddinggemma has two big advantages: longer sequence-length support (2048 vs 512) and template prepending, which should theoretically matter for better results.

Contributor Author: On a side note, speed is important, at least to me, because I use a post-tool hook in Claude Code that re-embeds whenever it sees a git commit, to keep the codebase index up to date. So embedding in LEANN has to be fast.

)
response.raise_for_status()

result = response.json()
batch_embeddings = result.get("embeddings")

result = response.json()
embedding = result.get("embedding")
if batch_embeddings is None:
raise ValueError("No embeddings returned from API")

if embedding is None:
raise ValueError(f"No embedding returned for text {i}")
if not isinstance(batch_embeddings, list):
raise ValueError(f"Invalid embeddings format: {type(batch_embeddings)}")

if not isinstance(embedding, list) or len(embedding) == 0:
raise ValueError(f"Invalid embedding format for text {i}")
if len(batch_embeddings) != len(batch_texts):
raise ValueError(
f"Mismatch: requested {len(batch_texts)} embeddings, got {len(batch_embeddings)}"
)

return batch_embeddings, []

all_embeddings.append(embedding)
break
except requests.exceptions.Timeout:
retry_count += 1
if retry_count >= max_retries:
logger.warning(f"Timeout for batch after {max_retries} retries")
return None, list(range(len(batch_texts)))

except requests.exceptions.Timeout:
retry_count += 1
if retry_count >= max_retries:
logger.warning(f"Timeout for text {i} after {max_retries} retries")
failed_indices.append(i)
all_embeddings.append(None)
break
except Exception as e:
retry_count += 1
if retry_count >= max_retries:
logger.error(f"Failed to get embeddings for batch: {e}")
return None, list(range(len(batch_texts)))

except Exception as e:
retry_count += 1
if retry_count >= max_retries:
logger.error(f"Failed to get embedding for text {i}: {e}")
failed_indices.append(i)
all_embeddings.append(None)
break
return all_embeddings, failed_indices
return None, list(range(len(batch_texts)))

# Process texts in batches
all_embeddings = []
@@ -784,7 +784,7 @@ def get_batch_embeddings(batch_texts):
num_batches = (len(texts) + batch_size - 1) // batch_size

if show_progress:
batch_iterator = tqdm(range(num_batches), desc="Computing Ollama embeddings")
batch_iterator = tqdm(range(num_batches), desc="Computing Ollama embeddings (batched)")
else:
batch_iterator = range(num_batches)

@@ -795,10 +795,14 @@ def get_batch_embeddings(batch_texts):

batch_embeddings, batch_failed = get_batch_embeddings(batch_texts)

# Adjust failed indices to global indices
global_failed = [start_idx + idx for idx in batch_failed]
all_failed_indices.extend(global_failed)
all_embeddings.extend(batch_embeddings)
if batch_embeddings is not None:
all_embeddings.extend(batch_embeddings)
else:
# Entire batch failed, add None placeholders
all_embeddings.extend([None] * len(batch_texts))
# Adjust failed indices to global indices
global_failed = [start_idx + idx for idx in batch_failed]
all_failed_indices.extend(global_failed)

# Handle failed embeddings
if all_failed_indices:
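For reference, the core of this change is moving from one /api/embeddings request per text to a single batched /api/embed request. Below is a minimal standalone sketch of that batched call; the host, model name, and input texts are placeholders, and the retry handling from the PR is omitted.

# Minimal sketch of the batched Ollama call the PR switches to (not the full function).
import requests

host = "http://localhost:11434"          # assumed default Ollama host
texts = ["first chunk", "second chunk"]  # placeholder inputs

resp = requests.post(
    f"{host}/api/embed",
    json={"model": "nomic-embed-text", "input": texts},  # "input" accepts a list of texts
    timeout=60,
)
resp.raise_for_status()
embeddings = resp.json()["embeddings"]   # one vector per input text
assert len(embeddings) == len(texts)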
7 changes: 7 additions & 0 deletions packages/leann-core/src/leann/mcp.py
@@ -60,6 +60,11 @@ def handle_request(request):
"maximum": 128,
"description": "Search complexity level. Use 16-32 for fast searches (recommended), 64+ for higher precision when needed.",
},
"show_metadata": {
"type": "boolean",
"default": False,
"description": "Include file paths and metadata in search results. Useful for understanding which files contain the results.",
},
},
"required": ["index_name", "query"],
},
@@ -104,6 +109,8 @@ def handle_request(request):
f"--complexity={args.get('complexity', 32)}",
"--non-interactive",
]
if args.get("show_metadata", False):
cmd.append("--show-metadata")
result = subprocess.run(cmd, capture_output=True, text=True)

elif tool_name == "leann_list":
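For reference, the MCP tool simply forwards show_metadata to the CLI as a flag. A rough sketch of that flag mapping in isolation is below; the base command and the rest of the handler are omitted, and build_search_flags is a hypothetical helper, not part of the PR.

# Hypothetical helper illustrating how tool arguments map onto CLI flags.
def build_search_flags(args: dict) -> list[str]:
    flags = [
        f"--complexity={args.get('complexity', 32)}",
        "--non-interactive",
    ]
    if args.get("show_metadata", False):
        flags.append("--show-metadata")
    return flags

# Example: a tool call with show_metadata enabled
print(build_search_flags({"index_name": "my-docs", "query": "zmq linking", "show_metadata": True}))
# ['--complexity=32', '--non-interactive', '--show-metadata']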