From d24beed6de44e2a3d405595ddc10e47a1d4c449e Mon Sep 17 00:00:00 2001 From: aeneasr <3372410+aeneasr@users.noreply.github.com> Date: Wed, 20 May 2026 12:56:17 +0200 Subject: [PATCH] fix(stdio): short-circuit semantic_search when index is being rebuilt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a background indexer holds the index flock, ensureIndexed sets out.StaleWarning and returns immediately, but handleSemanticSearch then proceeded to call embedQuery. On a single-instance embedding backend (e.g. LM Studio) saturated by the indexer's 32-chunk batches, the query embed queues behind those batches and the MCP call hangs well past Claude Code's timeout — the user sees an infinite "loading" state and never receives the warning text telling them to use standard tools for the next 10 calls. Reproduced reliably with two parallel semantic_search calls into different subdirectories of a monorepo while a background lumen index of the monorepo root holds the flock. After the fix both responses return in ~0.4s with the StaleWarning text instead of timing out at 100s. Co-Authored-By: Claude Opus 4.7 --- cmd/stdio.go | 14 ++++ cmd/stdio_concurrency_test.go | 153 ++++++++++++++++++++++++++++++++++ 2 files changed, 167 insertions(+) diff --git a/cmd/stdio.go b/cmd/stdio.go index e052185a..031ceb37 100644 --- a/cmd/stdio.go +++ b/cmd/stdio.go @@ -537,6 +537,20 @@ func (ic *indexerCache) handleSemanticSearch(ctx context.Context, req *mcp.CallT } out.SeedWarning = seedWarning + // When the index is being rebuilt by a concurrent indexer, the + // StaleWarning text already instructs the caller to skip semantic_search + // for the next 10 tool calls. Embedding and searching now would (a) waste + // work the caller is told to ignore, and (b) contend with the busy + // indexer for the embedding backend — on a single-instance LM Studio, + // the query embed can queue behind the indexer's batches indefinitely + // and hang the MCP call. 
Return the warning immediately instead. + if out.StaleWarning != "" { + text := formatSearchResults(input.Path, out) + return &mcp.CallToolResult{ + Content: []mcp.Content{&mcp.TextContent{Text: text}}, + }, nil, nil + } + queryVec, err := ic.embedQuery(ctx, input.Query) if err != nil { return nil, nil, err diff --git a/cmd/stdio_concurrency_test.go b/cmd/stdio_concurrency_test.go index 67e04d8e..fb34ad34 100644 --- a/cmd/stdio_concurrency_test.go +++ b/cmd/stdio_concurrency_test.go @@ -19,11 +19,15 @@ import ( "fmt" "os" "path/filepath" + "strings" "sync" "testing" "time" + "github.com/modelcontextprotocol/go-sdk/mcp" + "github.com/ory/lumen/internal/config" "github.com/ory/lumen/internal/index" + "github.com/ory/lumen/internal/indexlock" ) // --------------------------------------------------------------------------- @@ -324,6 +328,155 @@ done: ic.Close() } +// TestHandleSemanticSearch_StaleWarningShortCircuits is a regression test for +// the hang observed against /Users/aeneas/workspace/go/cloud: when a background +// indexer process holds the index flock, `ensureIndexed` correctly sets +// `out.StaleWarning` and returns immediately, but `handleSemanticSearch` then +// proceeded to call `embedQuery` against the saturated embedding server. With +// LM Studio (single-instance) being hammered by the background indexer's +// 32-chunk batches, the query embed never completes within Claude Code's MCP +// timeout, and the user sees an infinite hang. +// +// The fix is to short-circuit `handleSemanticSearch` when `StaleWarning` is set +// — the warning text already tells the caller to skip semantic searches for +// the next 10 tool calls, so embedding+searching is pure waste anyway. +// +// This test holds the flock externally, points the cache at an indexer whose +// embedder blocks forever, and asserts the handler returns quickly with the +// warning text — and that Embed was never called. 
+func TestHandleSemanticSearch_StaleWarningShortCircuits(t *testing.T) { + const dims = 4 + + // Resolve symlinks up-front: validateSearchInput will EvalSymlinks the + // path, and on macOS t.TempDir() returns /var/folders/... which resolves + // to /private/var/folders/... — without this, the test's pre-acquired + // lock would be on the unresolved path while handleSemanticSearch looks + // up the resolved one, hiding the bug behind a different dbPath. + rawDir := t.TempDir() + projectDir, err := filepath.EvalSymlinks(rawDir) + if err != nil { + t.Fatal(err) + } + writeTestGoFile(t, projectDir, "main.go", `package main + +// Demo gives the indexer something to index. +func Demo() {} +`) + + // Step 1: pre-create the index with a fast embedder so the DB exists and + // has at least one chunk. This isolates the test from the chunking path + // and lets the search call against the cache succeed if (incorrectly) + // reached. NOTE(review): the stub's model name presumably matches blockEmb.ModelName() so both embedders resolve the same dbPath — confirm. + fastEmb := &stubEmbedder{model: "blocking-stub"} + dbPath := config.DBPathForProject(projectDir, fastEmb.ModelName()) + if err := os.MkdirAll(filepath.Dir(dbPath), 0o755); err != nil { + t.Fatal(err) + } + // Ensure the DB file (and any lock) does not leak across test runs. + t.Cleanup(func() { + _ = os.RemoveAll(filepath.Dir(dbPath)) + }) + + idx, err := index.NewIndexer(dbPath, fastEmb, 512) + if err != nil { + t.Fatal(err) + } + if _, err := idx.Index(context.Background(), projectDir, false, nil); err != nil { + t.Fatal(err) + } + _ = idx.Close() + + // Step 2: hold the flock from "another process" — indexlock.TryAcquire (presumably via flock.New; confirm) opens a + // new file descriptor so the same-process check in indexlock.IsHeld is expected to see + // it as foreign; if it does not, the test skips below (matches the pattern in TestEnsureIndexed_FlockHeldSkipsReindex). 
+ lockPath := indexlock.LockPathForDB(dbPath) + lk, lockErr := indexlock.TryAcquire(lockPath) + if lockErr != nil { + t.Fatal(lockErr) + } + if lk == nil { + t.Fatal("expected to acquire indexlock for test setup") + } + defer lk.Release() + if !indexlock.IsHeld(lockPath) { + t.Skip("flock TryAcquire+IsHeld is reentrant in the same process on this OS — test cannot simulate background indexer holding lock") + } + + // Step 3: re-open the indexer with a blocking embedder. If + // handleSemanticSearch wrongly reaches embedQuery, it will block on + // Embed() forever. + blockEmb := newBlockingStubEmbedder(dims) + idx, err = index.NewIndexer(dbPath, blockEmb, 512) + if err != nil { + t.Fatal(err) + } + defer func() { + blockEmb.Unblock() + _ = idx.Close() + }() + + ic := &indexerCache{ + cache: map[string]cacheEntry{ + projectDir: {idx: idx, effectiveRoot: projectDir, model: blockEmb.ModelName()}, + }, + embedder: blockEmb, + cfg: newTestConfigService(t, 512), + log: discardLog, + freshnessTTL: 1 * time.Nanosecond, // force the merkle/flock path; do not trust LastIndexedAt + } + + // Step 4: call handleSemanticSearch with a deadline that's much shorter + // than the embed timeout. Before the fix, this call hangs on blockEmb.Embed + // forever; after the fix it returns immediately with the warning. 
+ ctx, cancel := context.WithTimeout(context.Background(), 3*time.Second) + defer cancel() + + done := make(chan struct { + result *mcp.CallToolResult + err error + }, 1) + + start := time.Now() + go func() { + req := &mcp.CallToolRequest{Params: &mcp.CallToolParamsRaw{}} + result, _, callErr := ic.handleSemanticSearch(ctx, req, SemanticSearchInput{ + Cwd: projectDir, + Path: projectDir, + Query: "demo", + Limit: 3, + }) + done <- struct { + result *mcp.CallToolResult + err error + }{result, callErr} + }() + + select { + case out := <-done: + elapsed := time.Since(start) + if out.err != nil { + t.Fatalf("handleSemanticSearch returned error: %v (elapsed %v)", out.err, elapsed) + } + if elapsed > 1*time.Second { + t.Fatalf("handleSemanticSearch took %v — expected sub-second short-circuit when StaleWarning is set", elapsed) + } + text := mustTextResult(t, out.result) + if !strings.Contains(text, "Index is being updated in the background") { + t.Fatalf("expected StaleWarning text in result, got:\n%s", text) + } + case <-time.After(3 * time.Second): + t.Fatal("handleSemanticSearch did not return within 3s — bug: embedQuery contends with background indexer even when StaleWarning is set") + } + + // The embedder must NEVER have been called: the short-circuit must + // happen before embedQuery. + select { + case <-blockEmb.started: + t.Fatal("embedQuery was called even though StaleWarning was set — handleSemanticSearch must short-circuit before embedding") + default: + } +} + // writeTestGoFile creates a Go source file in dir for test setup. func writeTestGoFile(t *testing.T, dir, name, content string) { t.Helper()