Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 28 additions & 14 deletions src/memos/chunkers/sentence_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,22 +21,36 @@ def __init__(self, config: SentenceChunkerConfig):

self.config = config

# Try new API first (v1.4.0+)
try:
self.chunker = ChonkieSentenceChunker(
tokenizer=config.tokenizer_or_token_counter,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
min_sentences_per_chunk=config.min_sentences_per_chunk,
common_kwargs = {
"chunk_size": config.chunk_size,
"chunk_overlap": config.chunk_overlap,
"min_sentences_per_chunk": config.min_sentences_per_chunk,
}
self.chunker = None
last_error: Exception | None = None
# Try chonkie >=1.4.0 API first, then the pre-1.4 signature.
for kwarg in ("tokenizer", "tokenizer_or_token_counter"):
try:
self.chunker = ChonkieSentenceChunker(
**{kwarg: config.tokenizer_or_token_counter}, **common_kwargs
)
break
except (TypeError, AttributeError, ValueError) as e:
last_error = e
continue

# If the configured tokenizer can't be loaded (no tiktoken, no
# HuggingFace access, etc.), fall back to chonkie's built-in
# 'character' counter so the chunker still works offline. Note:
# chunk_size semantics change from token count to character count
# for fallback runs.
if self.chunker is None:
logger.warning(
f"Tokenizer '{config.tokenizer_or_token_counter}' unavailable "
f"({last_error!r}); falling back to 'character'"
)
except (TypeError, AttributeError) as e:
# Fallback to old API (<v1.4.0)
logger.debug(f"Falling back to old chonkie API: {e}")
self.chunker = ChonkieSentenceChunker(
tokenizer_or_token_counter=config.tokenizer_or_token_counter,
chunk_size=config.chunk_size,
chunk_overlap=config.chunk_overlap,
min_sentences_per_chunk=config.min_sentences_per_chunk,
tokenizer_or_token_counter="character", **common_kwargs
)

logger.info(f"Initialized SentenceChunker with config: {config}")
Expand Down
83 changes: 83 additions & 0 deletions tests/chunkers/test_sentence_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,86 @@ def test_sentence_chunker(self):
self.assertEqual(chunks[0].text, "This is the first sentence.")
self.assertEqual(chunks[0].token_count, 6)
self.assertEqual(chunks[0].sentences, ["This is the first sentence."])

def test_sentence_chunker_falls_back_to_character(self):
    """Check that an unloadable tokenizer degrades to the 'character' counter.

    Regression scenario: in environments without tiktoken and without
    HuggingFace access, chonkie raises ValueError while loading tokenizers
    such as 'gpt2'. Rather than propagating that error, the chunker is
    expected to retry with chonkie's built-in 'character' counter.
    """
    fallback_chunker = MagicMock()

    def fake_chonkie_ctor(*args, **kwargs):
        # Simulate a pre-1.4 chonkie: the newer 'tokenizer=' keyword is rejected.
        if "tokenizer" in kwargs:
            raise TypeError("unexpected keyword argument 'tokenizer'")
        value = kwargs.get("tokenizer_or_token_counter")
        # Any tokenizer other than 'character' is treated as unavailable.
        if value != "character":
            raise ValueError(
                f"Tokenizer not found in transformers/tokenizers/tiktoken: {value}"
            )
        return fallback_chunker

    raw_config = {
        "backend": "sentence",
        "config": {
            "tokenizer_or_token_counter": "gpt2",
            "chunk_size": 10,
            "chunk_overlap": 2,
        },
    }

    with patch("chonkie.SentenceChunker", side_effect=fake_chonkie_ctor) as patched_cls:
        parsed_config = ChunkerConfigFactory.model_validate(raw_config)
        chunker = ChunkerFactory.from_config(parsed_config)

    # The wrapper must end up holding the object built by the fallback call.
    self.assertIs(chunker.chunker, fallback_chunker)

    # The most recent constructor call should be the 'character' fallback.
    final_call_kwargs = patched_cls.call_args.kwargs
    self.assertEqual(
        final_call_kwargs.get("tokenizer_or_token_counter"),
        "character",
    )

def test_sentence_chunker_no_warning_when_character_configured(self):
    """Check that explicitly configuring 'character' emits no fallback warning.

    Protects against a regression in which users who deliberately chose the
    character counter would still see the "falling back" warning.
    """
    import logging

    def fake_chonkie_ctor(*args, **kwargs):
        # Reject the 'tokenizer=' keyword; accept every other call.
        if "tokenizer" in kwargs:
            raise TypeError("unexpected keyword argument 'tokenizer'")
        return MagicMock()

    captured: list[logging.LogRecord] = []

    class _RecordCollector(logging.Handler):
        """Logging handler that stores each record it receives in `captured`."""

        def emit(self, record: logging.LogRecord) -> None:
            captured.append(record)

    raw_config = {
        "backend": "sentence",
        "config": {
            "tokenizer_or_token_counter": "character",
            "chunk_size": 10,
        },
    }

    collector = _RecordCollector(level=logging.WARNING)
    target_logger = logging.getLogger("memos.chunkers.sentence_chunker")
    target_logger.addHandler(collector)
    try:
        with patch("chonkie.SentenceChunker", side_effect=fake_chonkie_ctor):
            parsed_config = ChunkerConfigFactory.model_validate(raw_config)
            ChunkerFactory.from_config(parsed_config)
    finally:
        # Always detach the handler so it cannot leak into other tests.
        target_logger.removeHandler(collector)

    fallback_warnings = []
    for record in captured:
        if "falling back to 'character'" in record.getMessage():
            fallback_warnings.append(record)

    self.assertEqual(
        fallback_warnings,
        [],
        f"Unexpected fallback warning when 'character' was configured: {fallback_warnings}",
    )