Skip to content

Commit ef3b889

Browse files
committed
fix: merge EMBEDDING_MODEL_LIMITS and remove redundant validation
- Merged upstream's model list with our corrected token limits
- Kept our corrected limit for nomic-embed-text: 2048 (not 512)
- Removed post-chunking validation (redundant with embedding-time truncation)
- All tests passing except 2 pre-existing integration test failures
1 parent 2868a3a commit ef3b889

File tree

2 files changed

+9
-22
lines changed

2 files changed

+9
-22
lines changed

packages/leann-core/src/leann/chunking_utils.py

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -406,13 +406,6 @@ def create_text_chunks(
406406

407407
logger.info(f"Total chunks created: {len(all_chunks)}")
408408

409-
# Validate chunk token limits (default to 512 for safety)
410-
# This provides a safety net for embedding models with token limits
411-
validated_chunks, num_truncated = validate_chunk_token_limits(all_chunks, max_tokens=512)
412-
413-
if num_truncated > 0:
414-
logger.info(
415-
f"Post-chunking validation: {num_truncated} chunks were truncated to fit 512 token limit"
416-
)
417-
418-
return validated_chunks
409+
# Note: Token truncation is now handled at embedding time with dynamic model limits
410+
# See get_model_token_limit() and truncate_to_token_limit() in embedding_compute.py
411+
return all_chunks

packages/leann-core/src/leann/embedding_compute.py

Lines changed: 6 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -109,9 +109,14 @@ def get_model_token_limit(model_name: str) -> int:
109109
# Ollama models use dynamic discovery via /api/show
110110
EMBEDDING_MODEL_LIMITS = {
111111
# Nomic models (common across servers)
112-
"nomic-embed-text": 2048,
112+
"nomic-embed-text": 2048, # Corrected from 512 - verified via /api/show
113113
"nomic-embed-text-v1.5": 2048,
114114
"nomic-embed-text-v2": 512,
115+
# Other embedding models
116+
"mxbai-embed-large": 512,
117+
"all-minilm": 512,
118+
"bge-m3": 8192,
119+
"snowflake-arctic-embed": 512,
115120
# OpenAI models
116121
"text-embedding-3-small": 8192,
117122
"text-embedding-3-large": 8192,
@@ -216,17 +221,6 @@ def _query_ollama_context_limit(model_name: str, base_url: str) -> Optional[int]
216221
# Global model cache to avoid repeated loading
217222
_model_cache: dict[str, Any] = {}
218223

219-
# Known embedding model token limits
220-
EMBEDDING_MODEL_LIMITS = {
221-
"nomic-embed-text": 512,
222-
"nomic-embed-text-v2": 512,
223-
"mxbai-embed-large": 512,
224-
"all-minilm": 512,
225-
"bge-m3": 8192,
226-
"snowflake-arctic-embed": 512,
227-
# Add more models as needed
228-
}
229-
230224

231225
def compute_embeddings(
232226
texts: list[str],

0 commit comments

Comments
 (0)