Skip to content

Commit d6ed618

Browse files
committed
Fix chunking so chunk token counts stay within embedding models' token limits
1 parent a85d0ad commit d6ed618

File tree

4 files changed

+267
-24
lines changed

4 files changed

+267
-24
lines changed

apps/base_rag_example.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -180,14 +180,14 @@ def _create_parser(self) -> argparse.ArgumentParser:
180180
ast_group.add_argument(
181181
"--ast-chunk-size",
182182
type=int,
183-
default=512,
184-
help="Maximum characters per AST chunk (default: 512)",
183+
default=300,
184+
help="Maximum CHARACTERS per AST chunk (default: 300). Final chunks may be larger due to overlap. For 512 token models: recommended 300 chars",
185185
)
186186
ast_group.add_argument(
187187
"--ast-chunk-overlap",
188188
type=int,
189189
default=64,
190-
help="Overlap between AST chunks (default: 64)",
190+
help="Overlap between AST chunks in CHARACTERS (default: 64). Added to chunk size, not included in it",
191191
)
192192
ast_group.add_argument(
193193
"--code-file-extensions",

packages/leann-core/src/leann/chunking_utils.py

Lines changed: 135 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,119 @@
1111

1212
logger = logging.getLogger(__name__)
1313

14+
15+
# Cached tiktoken encoder: resolved lazily on first use so the import cost
# and encoder construction are paid at most once per process.
_TOKEN_ENCODER = None
_TOKEN_ENCODER_RESOLVED = False


def _get_token_encoder():
    """Return the cached cl100k_base tiktoken encoder, or None if tiktoken is unavailable."""
    global _TOKEN_ENCODER, _TOKEN_ENCODER_RESOLVED
    if not _TOKEN_ENCODER_RESOLVED:
        _TOKEN_ENCODER_RESOLVED = True
        try:
            import tiktoken

            _TOKEN_ENCODER = tiktoken.get_encoding("cl100k_base")
        except ImportError:
            _TOKEN_ENCODER = None
    return _TOKEN_ENCODER


def estimate_token_count(text: str) -> int:
    """
    Estimate the token count of a text string.

    Uses tiktoken's cl100k_base encoding for an exact count when tiktoken
    is installed; otherwise falls back to a conservative character-based
    estimate of ~1.2 tokens per character (worst case for source code).

    Args:
        text: Input text to estimate tokens for

    Returns:
        Estimated token count
    """
    encoder = _get_token_encoder()
    if encoder is not None:
        return len(encoder.encode(text))
    # Fallback: conservative character-based estimation, assuming the
    # worst case for code of ~1.2 tokens per character.
    return int(len(text) * 1.2)
36+
37+
38+
def calculate_safe_chunk_size(
    model_token_limit: int,
    overlap_tokens: int,
    chunking_mode: str = "traditional",
    safety_factor: float = 0.9,
) -> int:
    """
    Calculate a safe chunk size accounting for overlap and a safety margin.

    Args:
        model_token_limit: Maximum tokens supported by the embedding model
        overlap_tokens: Configured overlap, in TOKENS for both modes
            (for AST mode it is converted to characters internally)
        chunking_mode: "traditional" (result in tokens) or "ast"
            (result in characters)
        safety_factor: Safety margin (0.9 = 10% safety margin)

    Returns:
        Safe chunk size: tokens for traditional mode, characters for AST mode
    """
    safe_limit = int(model_token_limit * safety_factor)

    if chunking_mode == "traditional":
        # Traditional chunking is sized in tokens.
        # Max emitted chunk = chunk_size + overlap, so chunk_size = limit - overlap.
        return max(1, safe_limit - overlap_tokens)
    else:  # AST chunking
        # AST chunking is sized in characters, so convert the token budget.
        # Over-estimate the overlap's character cost (~3 chars per token) and
        # under-estimate the character budget (~1.2 tokens per char, worst
        # case for code) so the final chunk stays under the model limit.
        overlap_chars = int(overlap_tokens * 3)  # ~3 chars per token for code
        safe_chars = int(safe_limit / 1.2)
        return max(1, safe_chars - overlap_chars)
68+
69+
70+
def validate_chunk_token_limits(chunks: list[str], max_tokens: int = 512) -> tuple[list[str], int]:
    """
    Validate that chunks don't exceed token limits and truncate if necessary.

    Uses tiktoken's cl100k_base encoding for exact token counts when
    available; otherwise falls back to a conservative character limit of
    max_tokens / 1.2 (worst case for code of ~1.2 tokens per character).

    Args:
        chunks: List of text chunks to validate
        max_tokens: Maximum tokens allowed per chunk

    Returns:
        Tuple of (validated_chunks, num_truncated)
    """
    # Resolve the encoder once, outside the loop, instead of re-running the
    # try/import for every oversized chunk.
    try:
        import tiktoken

        encoder = tiktoken.get_encoding("cl100k_base")
    except ImportError:
        encoder = None

    validated_chunks: list[str] = []
    num_truncated = 0

    for i, chunk in enumerate(chunks):
        if encoder is not None:
            # Exact path: encode once and reuse the token list for both the
            # limit check and the truncation.
            tokens = encoder.encode(chunk)
            if len(tokens) > max_tokens:
                truncated_chunk = encoder.decode(tokens[:max_tokens])
                validated_chunks.append(truncated_chunk)
                num_truncated += 1
                logger.warning(
                    f"Truncated chunk {i} from {len(tokens)} to {max_tokens} tokens "
                    f"(from {len(chunk)} to {len(truncated_chunk)} characters)"
                )
            else:
                validated_chunks.append(chunk)
        else:
            # Fallback: conservative character truncation (~1.2 tokens/char).
            char_limit = int(max_tokens / 1.2)
            if len(chunk) > char_limit:
                truncated_chunk = chunk[:char_limit]
                validated_chunks.append(truncated_chunk)
                num_truncated += 1
                logger.warning(
                    f"Truncated chunk {i} from {len(chunk)} to {char_limit} characters "
                    f"(conservative estimate for {max_tokens} tokens)"
                )
            else:
                validated_chunks.append(chunk)

    if num_truncated > 0:
        logger.warning(f"Truncated {num_truncated}/{len(chunks)} chunks to fit token limits")

    return validated_chunks, num_truncated
125+
126+
14127
# Code file extensions supported by astchunk
15128
CODE_EXTENSIONS = {
16129
".py": "python",
@@ -82,6 +195,17 @@ def create_ast_chunks(
82195
continue
83196

84197
try:
198+
# Warn if AST chunk size + overlap might exceed common token limits
199+
estimated_max_tokens = int(
200+
(max_chunk_size + chunk_overlap) * 1.2
201+
) # Conservative estimate
202+
if estimated_max_tokens > 512:
203+
logger.warning(
204+
f"AST chunk size ({max_chunk_size}) + overlap ({chunk_overlap}) = {max_chunk_size + chunk_overlap} chars "
205+
f"may exceed 512 token limit (~{estimated_max_tokens} tokens estimated). "
206+
f"Consider reducing --ast-chunk-size to {int(400 / 1.2)} or --ast-chunk-overlap to {int(50 / 1.2)}"
207+
)
208+
85209
configs = {
86210
"max_chunk_size": max_chunk_size,
87211
"language": language,
@@ -217,4 +341,14 @@ def create_text_chunks(
217341
all_chunks = create_traditional_chunks(documents, chunk_size, chunk_overlap)
218342

219343
logger.info(f"Total chunks created: {len(all_chunks)}")
220-
return all_chunks
344+
345+
# Validate chunk token limits (default to 512 for safety)
346+
# This provides a safety net for embedding models with token limits
347+
validated_chunks, num_truncated = validate_chunk_token_limits(all_chunks, max_tokens=512)
348+
349+
if num_truncated > 0:
350+
logger.info(
351+
f"Post-chunking validation: {num_truncated} chunks were truncated to fit 512 token limit"
352+
)
353+
354+
return validated_chunks

packages/leann-core/src/leann/cli.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -181,25 +181,25 @@ def create_parser(self) -> argparse.ArgumentParser:
181181
"--doc-chunk-size",
182182
type=int,
183183
default=256,
184-
help="Document chunk size in tokens/characters (default: 256)",
184+
help="Document chunk size in TOKENS (default: 256). Final chunks may be larger due to overlap. For 512 token models: recommended 350 tokens (350 + 128 overlap = 478 max)",
185185
)
186186
build_parser.add_argument(
187187
"--doc-chunk-overlap",
188188
type=int,
189189
default=128,
190-
help="Document chunk overlap (default: 128)",
190+
help="Document chunk overlap in TOKENS (default: 128). Added to chunk size, not included in it",
191191
)
192192
build_parser.add_argument(
193193
"--code-chunk-size",
194194
type=int,
195195
default=512,
196-
help="Code chunk size in tokens/lines (default: 512)",
196+
help="Code chunk size in TOKENS (default: 512). Final chunks may be larger due to overlap. For 512 token models: recommended 400 tokens (400 + 50 overlap = 450 max)",
197197
)
198198
build_parser.add_argument(
199199
"--code-chunk-overlap",
200200
type=int,
201201
default=50,
202-
help="Code chunk overlap (default: 50)",
202+
help="Code chunk overlap in TOKENS (default: 50). Added to chunk size, not included in it",
203203
)
204204
build_parser.add_argument(
205205
"--use-ast-chunking",
@@ -209,14 +209,14 @@ def create_parser(self) -> argparse.ArgumentParser:
209209
build_parser.add_argument(
210210
"--ast-chunk-size",
211211
type=int,
212-
default=768,
213-
help="AST chunk size in characters (default: 768)",
212+
default=300,
213+
help="AST chunk size in CHARACTERS (non-whitespace) (default: 300). Final chunks may be larger due to overlap and expansion. For 512 token models: recommended 300 chars (300 + 64 overlap ~= 480 tokens)",
214214
)
215215
build_parser.add_argument(
216216
"--ast-chunk-overlap",
217217
type=int,
218-
default=96,
219-
help="AST chunk overlap in characters (default: 96)",
218+
default=64,
219+
help="AST chunk overlap in CHARACTERS (default: 64). Added to chunk size, not included in it. ~1.2 tokens per character for code",
220220
)
221221
build_parser.add_argument(
222222
"--ast-fallback-traditional",

0 commit comments

Comments
 (0)