diff --git a/apps/memos-local-plugin/adapters/openclaw/bridge.ts b/apps/memos-local-plugin/adapters/openclaw/bridge.ts
index 9c527f30c..f8932d86d 100644
--- a/apps/memos-local-plugin/adapters/openclaw/bridge.ts
+++ b/apps/memos-local-plugin/adapters/openclaw/bridge.ts
@@ -189,10 +189,15 @@ export function flattenMessages(input: unknown[] | undefined): FlatMessage[] {
     }
     for (const tc of inlineToolCalls) out.push(tc);
-    // OpenAI legacy: assistant has a top-level `tool_calls` array
-    // (separate from content). Fold these in after pi-ai inline
-    // tool calls so order is preserved when both shapes coexist.
-    if (Array.isArray(m.tool_calls)) {
+    // OpenAI-legacy fallback only: when the message has NO pi-ai
+    // inline tool calls but does have a top-level `tool_calls` array
+    // (pure OpenAI Function-Calling shape). When both shapes coexist
+    // (as OpenClaw's pi-ai bundled OpenAI adapter does), pi-ai
+    // already populated `content[].toolCall`, so re-reading the
+    // top-level field would emit each call twice — which in turn
+    // causes `extractTurn`'s `pendingCalls.set(key, …)` to clobber
+    // the first stub's `thinkingBefore` with an empty second stub.
+    if (inlineToolCalls.length === 0 && Array.isArray(m.tool_calls)) {
       for (const tc of m.tool_calls as Array<Record<string, unknown>>) {
         const fn = tc.function as Record<string, unknown> | undefined;
         if (!fn) continue;
@@ -476,35 +481,51 @@ export function extractTurn(messages: FlatMessage[], now: number): CapturedTurn
   const userText = messages[lastUserIdx].content.trim();
   const tail = messages.slice(lastUserIdx + 1);
-  const assistantParts: string[] = [];
-  const thinkingParts: string[] = [];
   const pendingCalls = new Map<string, Partial<ToolCallDTO> & { _id?: string }>();
   const toolCalls: ToolCallDTO[] = [];
+  // Two separate buffers accumulate content not yet assigned to a tool.
+  //
+  // `pendingThinking`: Claude extended-thinking blocks (`ThinkingContent`)
+  // `pendingAssistant`: regular model text (`TextContent`)
+  //
+  // When a `tool_call` arrives, BOTH buffers are flushed together into
+  // that tool's `thinkingBefore` — this is the reasoning (structured OR
+  // natural language) the model did before deciding to invoke the tool.
+  //
+  // After all messages are processed, whatever remains in the buffers
+  // forms the final output: `pendingAssistant` → `agentText` (the
+  // reply) and `pendingThinking` → `agentThinking` (model reasoning
+  // shown in a dedicated bubble for non-tool turns).
+  let pendingThinking: string[] = [];
+  let pendingAssistant: string[] = [];
+
   for (const m of tail) {
     if (m.role === "assistant") {
-      if (m.content) assistantParts.push(m.content);
+      if (m.content) pendingAssistant.push(m.content);
       continue;
     }
     if (m.role === "thinking") {
-      if (m.content) thinkingParts.push(m.content);
+      if (m.content) pendingThinking.push(m.content);
       continue;
     }
     if (m.role === "tool_call" && m.toolName) {
-      // Assistant decided to call a tool. Stash until the matching
-      // tool_result lands so we can stitch the full ToolCallDTO.
+      const parts = [...pendingThinking, ...pendingAssistant];
+      const thinkingBefore = parts.join("\n\n").trim() || undefined;
+      pendingThinking = [];
+      pendingAssistant = [];
+
       const key = m.toolCallId ?? m.toolName;
       pendingCalls.set(key, {
         _id: m.toolCallId,
         name: m.toolName,
         input: m.toolInput,
         startedAt: m.ts ?? now,
+        thinkingBefore,
       });
       continue;
     }
     if (m.role === "tool_result") {
-      // Pair by id (preferred — works even when two parallel calls hit
-      // the same tool name) or fall back to toolName.
      const key = m.toolCallId ?? m.toolName ??
""; const stub = pendingCalls.get(key); const errorCode = stub @@ -517,16 +538,13 @@ export function extractTurn(messages: FlatMessage[], now: number): CapturedTurn errorCode, startedAt: stub?.startedAt ?? (m.ts ?? now), endedAt: m.ts ?? now, + thinkingBefore: stub?.thinkingBefore, }); if (key) pendingCalls.delete(key); continue; } - // system / unknown: ignore for the purpose of extractTurn. } - // Any tool call that never received a paired tool_result still lands - // in the trace (with `output: undefined`) so the viewer can show - // "tool was invoked but produced no result". for (const stub of pendingCalls.values()) { if (!stub.name) continue; toolCalls.push({ @@ -535,14 +553,15 @@ export function extractTurn(messages: FlatMessage[], now: number): CapturedTurn output: undefined, startedAt: stub.startedAt ?? now, endedAt: now, + thinkingBefore: stub.thinkingBefore, }); } - const agentThinking = thinkingParts.join("\n\n").trim(); + const agentThinking = pendingThinking.join("\n\n").trim(); return { userText, - agentText: assistantParts.join("\n\n").trim(), - agentThinking: agentThinking ? agentThinking : undefined, + agentText: pendingAssistant.join("\n\n").trim(), + agentThinking: agentThinking || undefined, toolCalls, }; } @@ -796,6 +815,7 @@ export function createOpenClawBridge(opts: BridgeOptions): BridgeHandle { hasError: !!event.error, }); + try { // Legacy adapter parity: even when `success === false` we still // enqueue the user's message (and whatever the assistant managed diff --git a/apps/memos-local-plugin/agent-contract/dto.ts b/apps/memos-local-plugin/agent-contract/dto.ts index a76beead3..7b2353a71 100644 --- a/apps/memos-local-plugin/agent-contract/dto.ts +++ b/apps/memos-local-plugin/agent-contract/dto.ts @@ -40,6 +40,16 @@ export interface ToolCallDTO { errorCode?: string; startedAt: EpochMs; endedAt: EpochMs; + /** + * LLM-native thinking emitted *before* the model decided to invoke this + * tool — e.g. "I got an error from tool_1, let me try a different + * approach". Populated by the adapter when the model interleaves + * thinking blocks between tool calls. `undefined` for legacy data or + * when no thinking preceded this particular call. + * + * Stored inside `tool_calls_json` (no schema migration needed). + */ + thinkingBefore?: string; } export interface TurnInputDTO { diff --git a/apps/memos-local-plugin/core/capture/normalizer.ts b/apps/memos-local-plugin/core/capture/normalizer.ts index bb4bebdb8..2e773f673 100644 --- a/apps/memos-local-plugin/core/capture/normalizer.ts +++ b/apps/memos-local-plugin/core/capture/normalizer.ts @@ -36,15 +36,25 @@ export function normalizeSteps( continue; } - const last = out[out.length - 1]; - if ( - last && - last.agentText === agentText && - last.userText === userText && - sameToolCalls(last.toolCalls, toolCalls) - ) { - log.debug("normalize.skip_duplicate", { key: step.key }); - continue; + // Sub-steps produced by the per-tool-call extractor (V7 §0.1) have + // intentionally-identical userText="" / agentText="" and carry only + // a single tool call each — but two different tools can still share + // a short input fingerprint, which the generic dedup path below + // would incorrectly collapse. Skip dedup for sub-steps; the key + // uniqueness guarantees they can't be genuine duplicates. 
+      const isSubStep = (step.meta as Record<string, unknown> | undefined)?.subStep === true;
+
+      if (!isSubStep) {
+        const last = out[out.length - 1];
+        if (
+          last &&
+          last.agentText === agentText &&
+          last.userText === userText &&
+          sameToolCalls(last.toolCalls, toolCalls)
+        ) {
+          log.debug("normalize.skip_duplicate", { key: step.key });
+          continue;
+        }
+      }
 
     out.push({
diff --git a/apps/memos-local-plugin/core/capture/step-extractor.ts b/apps/memos-local-plugin/core/capture/step-extractor.ts
index 89bca5aa2..2e566930f 100644
--- a/apps/memos-local-plugin/core/capture/step-extractor.ts
+++ b/apps/memos-local-plugin/core/capture/step-extractor.ts
@@ -175,7 +175,11 @@ function segmentToSteps(
     out.push({
       key: `${episode.id}:${ts}:tool:${i}`,
       ts,
-      userText,
+      // Only the first sub-step carries the user query; subsequent
+      // sub-steps leave `userText` empty so the viewer's flattenChat
+      // doesn't render the same user bubble N times. The turn's
+      // provenance (episodeId) still links them together.
+      userText: i === 0 ? userText : "",
       agentText: "",
       agentThinking: i === 0 ? fullThinking : null,
       toolCalls: [tc],
@@ -232,6 +236,7 @@ function toolCallFromTurn(turn: EpisodeTurn): ToolCallDTO | null {
   const endedAt = typeof meta.endedAt === "number" ? meta.endedAt : turn.ts;
   const input = meta.input ?? meta.args ?? undefined;
   const errorCode = typeof meta.errorCode === "string" ? meta.errorCode : undefined;
+  const thinkingBefore = typeof meta.thinkingBefore === "string" ? meta.thinkingBefore : undefined;
   return {
     name,
     input,
@@ -239,6 +244,7 @@
     errorCode,
     startedAt,
     endedAt,
+    thinkingBefore,
   };
 }
@@ -264,7 +270,8 @@ function coerceToolCall(raw: unknown): ToolCallDTO | null {
   const startedAt = typeof r.startedAt === "number" ? r.startedAt : Date.now();
   const endedAt = typeof r.endedAt === "number" ? r.endedAt : startedAt;
-  return { name, input, output, errorCode, startedAt, endedAt };
+  const thinkingBefore = typeof r.thinkingBefore === "string" ? r.thinkingBefore : undefined;
+  return { name, input, output, errorCode, startedAt, endedAt, thinkingBefore };
 }
 
 function depthFromMeta(meta: Record<string, unknown>): number {
diff --git a/apps/memos-local-plugin/core/config/defaults.ts b/apps/memos-local-plugin/core/config/defaults.ts
index 4e24e7436..971f86a66 100644
--- a/apps/memos-local-plugin/core/config/defaults.ts
+++ b/apps/memos-local-plugin/core/config/defaults.ts
@@ -157,21 +157,27 @@ export const DEFAULT_CONFIG: ResolvedConfig = {
     episodeGoalMinSim: 0.45,
     tagFilter: "auto",
     keywordTopK: 20,
-    relativeThresholdFloor: 0.4,
+    // Lowered from 0.4 → 0.2 with the 2026 ranker overhaul: the new
+    // base relevance already uses channel rank as a first-class
+    // signal, so the old 0.4 floor was over-pruning keyword hits
+    // with modest V·decay.
+    relativeThresholdFloor: 0.2,
     skillEtaBlend: 0.15,
     smartSeed: true,
+    smartSeedRatio: 0.7,
+    multiChannelBypass: true,
     skillInjectionMode: "summary",
     skillSummaryChars: 200,
     llmFilterEnabled: true,
     // Tighter than the legacy default (5) so the LLM filter has a
-    // budget that forces "drop, don't pad". Combined with the
-    // few-shot prompt this dramatically improves precision.
+    // small budget; combined with the richer prompt (v3) this keeps
+    // packets concise without over-dropping.
     llmFilterMaxKeep: 4,
-    // Lowered from 3 → 2: small packets (e.g. just a Tier-1 skill +
-    // a Tier-2 trace) used to skip the LLM filter entirely and ship
-    // both items even when one was tangential.
Now anything > 1 - // candidate gets a precision pass. - llmFilterMinCandidates: 2, + // Lowered from 2 → 1: even a single candidate gets a precision + // pass. Mirrors `memos-local-openclaw`'s tool-level filter and + // prevents a lone off-topic memory from sneaking through unchecked. + llmFilterMinCandidates: 1, + llmFilterCandidateBodyChars: 500, }, }, hub: { diff --git a/apps/memos-local-plugin/core/config/schema.ts b/apps/memos-local-plugin/core/config/schema.ts index 0736ceff9..11280b23c 100644 --- a/apps/memos-local-plugin/core/config/schema.ts +++ b/apps/memos-local-plugin/core/config/schema.ts @@ -322,8 +322,14 @@ const AlgorithmSchema = Type.Object({ * `minTraceSim` — when the best hit is weak, we keep more (lower * absolute floor); when there's a clear winner, we drop noise. * Set to 0 to disable the relative cutoff entirely. + * + * Default lowered to 0.2 with the 2026 ranker overhaul: the new + * base formula already weighs channel-rank evidence (so a raw + * FTS-only hit lands in a comparable range to a cosine-0.8 hit), + * and the old 0.4 floor was over-pruning keyword matches with + * modest V·decay. */ - relativeThresholdFloor: NumberInRange(0.4, 0, 1), + relativeThresholdFloor: NumberInRange(0.2, 0, 1), /** * Tier-1 skill relevance blend weight for `η` (skill reliability). * Old default `0.4` made well-trodden skills outrank obviously-more- @@ -333,12 +339,28 @@ const AlgorithmSchema = Type.Object({ skillEtaBlend: NumberInRange(0.15, 0, 1), /** * MMR Phase-A seed-by-tier policy. When `true`, only seed a tier - * if its best candidate's relevance ≥ `relativeThresholdFloor * - * topRelevance`. This prevents the ranker from force-injecting a - * stale Tier-1 skill / Tier-3 world-model just because it cleared - * the absolute floors. + * if its best candidate's relevance ≥ `poolTopRelevance * + * smartSeedRatio` (see below). This prevents the ranker from + * force-injecting a stale Tier-1 skill / Tier-3 world-model just + * because it cleared the absolute floors. */ smartSeed: Bool(true), + /** + * Seed cutoff for smart-seed MMR — tier is seeded iff its best + * candidate's relevance ≥ `poolTopRelevance * smartSeedRatio`. + * Independent of `relativeThresholdFloor` so the seed gate can be + * stricter than the generic drop floor (0.7 is "within 30% of the + * best available candidate anywhere in the pool"). + */ + smartSeedRatio: NumberInRange(0.7, 0, 1), + /** + * When a candidate is surfaced by ≥ 2 retrieval channels (e.g. + * both vec and fts hit the same trace), bypass the relative + * threshold. Multi-channel agreement is a strong signal, and + * without this keyword-only matches with modest V·decay often + * get dropped by a noisy `topRelevance`. + */ + multiChannelBypass: Bool(true), /** * How Tier-1 skills are surfaced in the injected prompt: * - "summary" (default): inject only `name + η + 1-line summary + @@ -368,10 +390,21 @@ const AlgorithmSchema = Type.Object({ /** Keep at most this many candidates after the LLM filter. */ llmFilterMaxKeep: NumberInRange(5, 1, 30), /** - * Skip the filter when the ranked list already has ≤ this many - * items — no point paying an LLM round-trip to prune 3 candidates. + * Skip the filter when the ranked list has fewer than this many + * items. Default 1 — even a single candidate gets a precision + * pass, matching `memos-local-openclaw`'s tool-level filter and + * preventing a lone off-topic memory from sneaking through + * unchecked. 
+ */ + llmFilterMinCandidates: NumberInRange(1, 1, 50), + /** + * Body-text budget per candidate when building the LLM filter + * prompt. Higher = more context for precise judgement, at the + * cost of more tokens per round-trip. Default 500 (openclaw uses + * 300 without tags/channels; we include richer metadata, so a + * slightly larger window pays for itself). */ - llmFilterMinCandidates: NumberInRange(4, 1, 50), + llmFilterCandidateBodyChars: NumberInRange(500, 120, 2000), }, { default: {} }), }, { default: {} }); diff --git a/apps/memos-local-plugin/core/llm/prompts/retrieval-filter.ts b/apps/memos-local-plugin/core/llm/prompts/retrieval-filter.ts index a73c2fbc9..3db213888 100644 --- a/apps/memos-local-plugin/core/llm/prompts/retrieval-filter.ts +++ b/apps/memos-local-plugin/core/llm/prompts/retrieval-filter.ts @@ -3,89 +3,128 @@ import type { PromptDef } from "./index.js"; /** * Relevance-filter prompt for retrieved candidates. * - * Mirrors the legacy `memos-local-openclaw` `unifiedLLMFilter`, but with - * three deliberate changes baked into the prompt itself: + * Mirrors the legacy `memos-local-openclaw` `unifiedLLMFilter`, but + * tuned for the plugin's tier-aware candidate labels (SKILL / TRACE / + * EPISODE / WORLD-MODEL). Key design choices: * - * 1. **Few-shot examples.** Two cases — one ACCEPT, one REJECT — pin - * down what "tangentially-related but should drop" means. Without - * this LLMs often pad to the maximum allowed selection. - * 2. **"Drop, don't pad" instruction.** Explicit: returning fewer - * items (or `[]`) is preferred over including marginal hits. - * 3. **Hard upper bound on output.** We say `≤ 4 items` (caller still - * enforces via `llmFilterMaxKeep`). + * 1. **Four few-shot examples** — two KEEP, two DROP — so the model + * sees both "useful fact that should survive" and "surface-similar + * but wrong sub-problem". Earlier two-example versions were too + * conservative and dropped genuinely relevant traces. + * 2. **Informational tone, not strict gatekeeping.** The filter is + * the *precision* pass, not a second retrieval — we lean towards + * keeping anything that could plausibly help, because the ranker + * already pruned the obvious noise. + * 3. **`sufficient` self-report.** The model reports whether the + * kept set is enough to answer the query; callers surface this so + * the agent can decide whether to widen recall. * - * Bumping `version` here also rotates the prompt-fingerprint id used by - * `core/llm` audit trails. + * Bumping `version` rotates the prompt-fingerprint id used by + * `core/llm` audit trails, so A/B data from v2 and v3 stays separable. */ export const RETRIEVAL_FILTER_PROMPT: PromptDef = { id: "retrieval.filter", - version: 2, + version: 3, description: - "Pick only the candidates that are genuinely useful for the user query before injection.", - system: `You are a strict relevance gatekeeper for an AI agent's memory retrieval. - -Given: -- QUERY: the user's current request -- CANDIDATES: a numbered list of items the retriever surfaced, each - labelled with a kind (SKILL / TRACE / EPISODE / WORLD-MODEL). - -Your job: pick ONLY the candidates that are genuinely useful for answering -THIS query. Vector retrieval over-matches on surface similarity — most of -your candidates will be tangentially related and should be DROPPED. - -Decision rules (apply in order): -- KEEP a SKILL only if its name + description directly addresses the - exact sub-problem the user is asking about, NOT just the same domain. 
-- KEEP a WORLD-MODEL only if its title's domain matches the query's - domain AND the body provides a structural fact the agent would - otherwise need to re-discover. -- KEEP a TRACE / EPISODE only if its content contains specific evidence - (a fact, a command, a snippet, a name) the agent could cite or reuse - verbatim. Vague topical similarity is NOT enough. -- DROP items in the same broad area but on a different sub-problem - (e.g. query asks "write a pytest test", candidate is "write a Python - JWT validator" — same language, different problem → DROP). -- DROP "scaffolding" memories (greetings, throwaway acks, capability - questions) even when topically related. - -PREFERENCE: drop, don't pad. Returning 1 truly useful item is better -than returning 4 marginal ones. Returning [] is the right answer when -nothing is genuinely relevant. - -HARD LIMITS: keep at most 4 candidates total. - -──── Example 1 ──── + "Pick the retrieved candidates that are plausibly useful for the user query, and report whether that set is sufficient.", + system: `You are the relevance check for an AI agent's memory retrieval. A +mechanical retriever has already surfaced candidates by vector / keyword +hit. Your job is to keep the ones that a helpful assistant would want to +read before answering, and drop the ones that merely share surface +keywords. + +Input: +- QUERY: the user's current request (or a tool-driven retrieval query). +- CANDIDATES: a numbered list. Each item is labelled with a kind + (SKILL / TRACE / EPISODE / WORLD-MODEL) and metadata such as + \`time\`, \`tags\`, \`via\` (which channels hit — vec / fts / pattern), + and \`score\` (the ranker's relevance). + +Decision guidance: +- KEEP a TRACE / EPISODE when it carries a concrete fact the agent + could use: a name, number, file path, command, preference, or a + specific past exchange that answers the query. Surface-similar chat + without such facts should be dropped. +- KEEP a SKILL when its name / description plausibly addresses the + user's sub-problem. The agent decides later whether to call + \`skill_get\` for the full procedure — err on the side of keeping + one candidate skill. +- KEEP a WORLD-MODEL when its topic matches the domain of the query + and the body contains structural information the agent would + otherwise have to re-derive. +- DROP items in the same broad area but a different sub-problem + (e.g. query asks "write a pytest test", candidate is "write a + Python JWT validator" — same language, different problem). +- DROP scaffolding chatter (greetings, capability questions, acks) + unless the query is explicitly about the chat history. +- Prefer keeping an item when uncertain — you are the precision pass, + not a second retriever. + +After choosing, self-report whether the kept set is enough: +- \`sufficient: true\` when the kept items plausibly answer the QUERY + as-is. +- \`sufficient: false\` when the kept items are only a starting point + and the agent should broaden recall (e.g. run \`memory_search\` with + a different query). + +──── Example 1 (React dark mode, KEEP 2) ──── QUERY: 把这个 React 组件改成支持暗黑模式 CANDIDATES: -1. [SKILL] React Tailwind dark-mode toggle — adds class="dark" toggling and useTheme hook for any React project -2. [TRACE] [user] 我喜欢的运动是游泳 [assistant] 记住了 -3. [SKILL] Python JWT validator — verifies HS256 / RS256 tokens via PyJWT -4. [TRACE] [user] 上次我们用 React Context 写了 ThemeProvider,文件在 src/theme/ [assistant] 记得,要继续用同样的模式吗? +1. 
[SKILL time=2026-03-01 10:00 via=vec+fts score=0.84] React Tailwind dark-mode toggle · η=0.82 · active
+   adds class="dark" toggling and useTheme hook for any React project
+2. [TRACE time=2026-02-14 09:30 tags=[chit-chat] via=vec score=0.41] [user] 我喜欢的运动是游泳 [assistant] 记住了
+3. [SKILL time=2026-01-11 08:10 via=vec score=0.51] Python JWT validator · η=0.75 · active
+   verifies HS256 / RS256 tokens via PyJWT
+4. [TRACE time=2026-03-04 14:20 tags=[react,theme] via=vec+pattern score=0.79] [user] 上次我们用 React Context 写了 ThemeProvider,文件在 src/theme/ [assistant] 记得,要继续用同样的模式吗?
+
+Correct output: {"selected": [1, 4], "sufficient": true}
+
+──── Example 2 (phone number lookup, KEEP 1 via FTS only) ────
+QUERY: 还记得我的手机号吗?
+
+CANDIDATES:
+1. [TRACE time=2026-02-20 21:05 tags=[profile] via=fts score=0.18] [user] 我的手机号是 13800001234 [assistant] 已记住
+2. [TRACE time=2026-02-10 09:30 tags=[chit-chat] via=vec score=0.35] [user] 今天天气怎么样 [assistant] 杭州小雨
+3. [SKILL time=2025-12-01 11:00 via=vec score=0.22] phone-number-validator · η=0.88
 
-Correct output: {"selected": [1, 4]}
-Reasoning: 1 directly addresses dark-mode in React; 4 contains the
-exact file path the agent will need. 2 is unrelated. 3 is wrong language
-+ wrong sub-problem.
+Correct output: {"selected": [1], "sufficient": true}
+Reasoning: candidate 1 is only surfaced by FTS with a modest score, but
+it carries the exact fact the user is asking about. Keep it.
 
-──── Example 2 ────
+──── Example 3 (weather lookup, KEEP 1 fact) ────
 QUERY: 帮我看下今天天气
 
 CANDIDATES:
-1. [TRACE] [user] 我住在杭州 [assistant] 已记住
-2. [SKILL] Docker container syslib install fix
-3. [WORLD-MODEL] React project layout — components in src/components/
+1. [TRACE time=2026-01-04 18:05 tags=[profile] via=fts score=0.22] [user] 我住在杭州 [assistant] 已记住
+2. [SKILL time=2025-10-02 09:10 via=vec score=0.31] Docker container syslib install fix · η=0.77
+3. [WORLD-MODEL time=2025-09-11 16:00 via=vec score=0.29] React project layout — components in src/components/
+
+Correct output: {"selected": [1], "sufficient": false}
+Reasoning: only 1 carries a fact the agent needs (location). The agent
+still needs a live weather lookup tool, so the kept set alone is not
+enough.
+
+──── Example 4 (quicksort, DROP everything) ────
+QUERY: 写一个快速排序的 Python 实现
+
+CANDIDATES:
+1. [TRACE time=2026-03-02 11:00 tags=[chit-chat] via=vec score=0.40] [user] 你好 [assistant] 你好!今天想做什么?
+2. [TRACE time=2026-01-19 22:00 tags=[japanese] via=fts score=0.21] [user] 「クイック」は何の意味? [assistant] fast / quick
+3. [SKILL time=2025-08-01 09:00 via=vec score=0.33] Python JWT validator · η=0.70
 
-Correct output: {"selected": [1]}
-Reasoning: only 1 carries a fact the agent needs (location for weather
-lookup). 2 and 3 are completely unrelated.
+Correct output: {"selected": [], "sufficient": false}
+Reasoning: no candidate carries information the agent needs to produce
+the answer. The chit-chat and translation traces share only surface
+keywords. Drop all and let the agent answer from its own knowledge.
 
 ──── Output format ────
 Return JSON only, no prose:
 {
-  "selected": [1, 3]
+  "selected": [1, 3],
+  "sufficient": true
 }
-where each number is the 1-based index in the CANDIDATES list.
+where each number is the 1-based index into CANDIDATES.
-If nothing is truly relevant, return {"selected": []}.`,
+If nothing is truly relevant, return {"selected": [], "sufficient": false}.`,
 };
diff --git a/apps/memos-local-plugin/core/pipeline/deps.ts b/apps/memos-local-plugin/core/pipeline/deps.ts
index 5322bf0d4..28fa9f79c 100644
--- a/apps/memos-local-plugin/core/pipeline/deps.ts
+++ b/apps/memos-local-plugin/core/pipeline/deps.ts
@@ -147,12 +147,15 @@ export function extractAlgorithmConfig(
       relativeThresholdFloor: alg.retrieval.relativeThresholdFloor,
       skillEtaBlend: alg.retrieval.skillEtaBlend,
       smartSeed: alg.retrieval.smartSeed,
+      smartSeedRatio: alg.retrieval.smartSeedRatio,
+      multiChannelBypass: alg.retrieval.multiChannelBypass,
       skillInjectionMode: alg.retrieval.skillInjectionMode,
       skillSummaryChars: alg.retrieval.skillSummaryChars,
       decayHalfLifeDays: alg.reward.decayHalfLifeDays,
       llmFilterEnabled: alg.retrieval.llmFilterEnabled,
       llmFilterMaxKeep: alg.retrieval.llmFilterMaxKeep,
       llmFilterMinCandidates: alg.retrieval.llmFilterMinCandidates,
+      llmFilterCandidateBodyChars: alg.retrieval.llmFilterCandidateBodyChars,
     },
     session: {
       followUpMode: alg.session.followUpMode,
diff --git a/apps/memos-local-plugin/core/pipeline/memory-core.ts b/apps/memos-local-plugin/core/pipeline/memory-core.ts
index bcb599898..2d26d5670 100644
--- a/apps/memos-local-plugin/core/pipeline/memory-core.ts
+++ b/apps/memos-local-plugin/core/pipeline/memory-core.ts
@@ -824,6 +824,22 @@ export function createMemoryCore(
       snippet: string;
     }> = [];
     let filtered: typeof candidates = [];
+    let retrievalStats: {
+      raw?: number;
+      ranked?: number;
+      droppedByThreshold?: number;
+      thresholdFloor?: number;
+      topRelevance?: number;
+      llmFilter?: {
+        outcome?: string;
+        kept?: number;
+        dropped?: number;
+        sufficient?: boolean | null;
+      };
+      channelHits?: Record<string, number>;
+      queryTokens?: number;
+      queryTags?: string[];
+    } | undefined;
     try {
       const result = await turnStartRetrieve(deps, {
         reason: "turn_start",
@@ -857,6 +873,29 @@
         snippet: h.snippet,
       }));
       filtered = candidates; // post-filter is what we return → same list.
+
+      // Three-stage observability — surfaced verbatim so the viewer's
+      // Logs page can render "raw → threshold → ranked → LLM filter"
+      // funnels. All fields are optional on the producer side so older
+      // consumers keep working.
+      const s = result.stats;
+      retrievalStats = {
+        raw: s.rawCandidateCount,
+        ranked: s.rankedCount,
+        droppedByThreshold: s.droppedByThresholdCount,
+        thresholdFloor: s.thresholdFloor,
+        topRelevance: s.topRelevance,
+        llmFilter: {
+          outcome: s.llmFilterOutcome,
+          kept: s.llmFilterKept,
+          dropped: s.llmFilterDropped,
+          sufficient: s.llmFilterSufficient ??
null,
+        },
+        channelHits: s.channelHits as Record<string, number> | undefined,
+        queryTokens: s.queryTokens,
+        queryTags: s.queryTags,
+      };
+
       return {
         query,
         hits,
@@ -883,6 +922,7 @@
             candidates,
             hubCandidates: [] as unknown[],
             filtered,
+            stats: retrievalStats,
           }
         : { error: "retrieval_failed" },
       durationMs: Date.now() - startedAt,
diff --git a/apps/memos-local-plugin/core/pipeline/orchestrator.ts b/apps/memos-local-plugin/core/pipeline/orchestrator.ts
index 672f1cc9b..8c68571f9 100644
--- a/apps/memos-local-plugin/core/pipeline/orchestrator.ts
+++ b/apps/memos-local-plugin/core/pipeline/orchestrator.ts
@@ -687,6 +687,11 @@ export function createPipeline(deps: PipelineDeps): PipelineHandle {
           errorCode: tc.errorCode,
           startedAt: tc.startedAt,
           endedAt: tc.endedAt,
+          // V7 §0.1: preserve the model's "Thought for X" narration that
+          // precedes this call so `step-extractor` can re-attach it to
+          // the captured ToolCallDTO. Without this, chained tool calls
+          // lose the natural-language bridge between steps.
+          thinkingBefore: tc.thinkingBefore,
         },
       });
     }
diff --git a/apps/memos-local-plugin/core/retrieval/llm-filter.ts b/apps/memos-local-plugin/core/retrieval/llm-filter.ts
index a42771908..f103c240d 100644
--- a/apps/memos-local-plugin/core/retrieval/llm-filter.ts
+++ b/apps/memos-local-plugin/core/retrieval/llm-filter.ts
@@ -2,29 +2,31 @@
 * LLM-based relevance filter — post-processing step after `rank()`.
 *
 * Motivation (ported from legacy `memos-local-openclaw::unifiedLLMFilter`):
- * cosine retrieval is greedy — any Python prompt pulls back every
+ * mechanical retrieval is greedy — any Python prompt pulls back every
 * Python-tagged trace even when the sub-problem doesn't match. A small
 * LLM call ("given this query, pick the truly relevant candidates")
 * removes most of the noise with a single round-trip.
 *
 * Design constraints:
- *  - One LLM call per turn, bounded output (just the index list).
+ *  - One LLM call per turn, bounded output (index list + `sufficient`).
 *  - Totally opt-in: if the LLM is null, or the config flag is off,
- *    or the candidate list is small enough, we pass through the
- *    ranked list unchanged.
+ *    or the candidate list is empty, we pass through unchanged.
- *  - On ANY failure (network, schema, timeout) we fall back to the
- *    ranked list. A missing filter must never crash retrieval.
+ *  - On ANY failure (network, schema, timeout) we fall back to a
+ *    mechanical cutoff. A broken filter must never crash retrieval.
 *  - Returns both kept and dropped candidates so callers can log
 *    exactly what the LLM pruned (feeds the Logs page).
+ *  - Rich candidate labels — we include role/time/tags/channels/score
+ *    because openclaw's filter runs on those fields and loses precision
+ *    without them.
 */
import type { LlmClient } from "../llm/index.js";
import type { Logger } from "../logger/types.js";
import { RETRIEVAL_FILTER_PROMPT } from "../llm/prompts/index.js";
import type { RankedCandidate } from "./ranker.js";
-import type { RetrievalConfig } from "./types.js";
+import type { RetrievalConfig, TierCandidate } from "./types.js";
 
-const MAX_CANDIDATE_CONTENT_CHARS = 240;
+const DEFAULT_CANDIDATE_BODY_CHARS = 500;
 
 export interface FilterInput {
   query: string;
@@ -36,7 +38,10 @@ export interface FilterDeps {
   log: Logger;
   config: Pick<
     RetrievalConfig,
-    "llmFilterEnabled" | "llmFilterMaxKeep" | "llmFilterMinCandidates"
+    | "llmFilterEnabled"
+    | "llmFilterMaxKeep"
+    | "llmFilterMinCandidates"
+    | "llmFilterCandidateBodyChars"
   >;
 }
@@ -59,6 +64,13 @@ export interface FilterResult {
   //   `relativeThresholdFloor · topRelevance`) instead of dumping the
   //   entire ranked list into the prompt.
   | "llm_failed_safe_cutoff";
+  /**
+   * The LLM's self-report on whether the *kept* candidates are enough
+   * to answer `query`, or whether the caller should widen recall /
+   * run a follow-up `memory_search`. `null` when the filter didn't
+   * run (disabled / passthrough / failure paths).
+   */
+  sufficient: boolean | null;
 }
 
 export async function llmFilterCandidates(
@@ -69,12 +81,17 @@
   if (!deps.config.llmFilterEnabled) {
     return passthrough(ranked, "disabled");
   }
+  if (ranked.length === 0) {
+    return passthrough(ranked, "below_threshold");
+  }
-  // `llmFilterMinCandidates` is "minimum candidates required to RUN the
-  // filter". `<` so a packet with exactly the threshold count still gets
-  // a precision pass (the most useful case — small but noisy packets).
+  // `llmFilterMinCandidates` is the *minimum* list length required to
+  // RUN the filter. Default is 1, meaning even a single candidate gets
+  // a precision pass — openclaw behaviour, and matches the user
+  // reports that "a single off-topic memory sneaks through when the
+  // filter skips the check".
   if (ranked.length < deps.config.llmFilterMinCandidates) {
     return passthrough(ranked, "below_threshold");
   }
   if (!query || !query.trim()) {
     return passthrough(ranked, "empty_query");
   }
   if (!deps.llm) {
     return passthrough(ranked, "no_llm");
   }
 
+  const bodyChars =
+    deps.config.llmFilterCandidateBodyChars ?? DEFAULT_CANDIDATE_BODY_CHARS;
   const items = ranked.map((r, i) => ({
     index: i,
-    label: describeCandidate(r),
+    label: describeCandidate(r, bodyChars),
   }));
-  const list = items
-    .map((x) => `${x.index + 1}. ${x.label}`)
-    .join("\n");
+  const list = items.map((x) => `${x.index + 1}. ${x.label}`).join("\n");
 
   try {
-    const rsp = await deps.llm.completeJson<{ selected?: unknown }>(
+    const rsp = await deps.llm.completeJson<{
+      selected?: unknown;
+      sufficient?: unknown;
+    }>(
       [
         { role: "system", content: RETRIEVAL_FILTER_PROMPT.system },
         {
@@ -105,22 +125,18 @@
${list}`,
       {
         op: `retrieval.${RETRIEVAL_FILTER_PROMPT.id}.v${RETRIEVAL_FILTER_PROMPT.version}`,
         temperature: 0,
-        // Short output — we only need an array of integers. Keep the
-        // token cap tight so a misbehaving model can't blow budgets.
-        maxTokens: 120,
+        // Short output — indices + one bool. Kept tight so a misbehaving
+        // model can't blow budgets.
+        maxTokens: 160,
         malformedRetries: 1,
       },
     );
     const raw = (rsp.value?.selected ??
[]) as unknown;
+    const sufficient = coerceBool(rsp.value?.sufficient);
     if (!Array.isArray(raw)) {
-      deps.log.debug("llm_filter.malformed", {
-        got: typeof raw,
-      });
-      // Same fallback policy as throw — we'd rather lean conservative
-      // than dump the whole ranked list into the prompt.
+      deps.log.debug("llm_filter.malformed", { got: typeof raw });
       return safeCutoff(ranked, deps);
     }
-    // Convert 1-based indices → 0-based, drop duplicates and out-of-range.
     const keepIndices = new Set<number>();
     for (const v of raw) {
       const n = typeof v === "number" ? v : Number(v);
@@ -131,14 +147,14 @@
       if (keepIndices.size >= deps.config.llmFilterMaxKeep) break;
     }
     if (keepIndices.size === 0) {
-      // Model asked us to drop everything — we honour it even when the
-      // ranked list was non-empty. Surface this explicitly so the Logs
-      // page can show "LLM found nothing relevant" instead of silently
-      // injecting a partial packet.
+      // Model asked us to drop everything — honoured. Surface this
+      // explicitly so the Logs page can show "LLM found nothing
+      // relevant" instead of silently injecting a partial packet.
       return {
         kept: [],
         dropped: [...ranked],
         outcome: "llm_filtered",
+        sufficient: sufficient ?? false,
       };
     }
     const kept: RankedCandidate[] = [];
@@ -151,6 +167,7 @@
       dropped,
       outcome:
         kept.length === ranked.length ? "llm_kept_all" : "llm_filtered",
+      sufficient,
     };
   } catch (err) {
     deps.log.warn("llm_filter.failed", {
@@ -165,32 +182,39 @@
 function passthrough(
   ranked: readonly RankedCandidate[],
   outcome: FilterResult["outcome"],
 ): FilterResult {
-  return { kept: [...ranked], dropped: [], outcome };
+  return { kept: [...ranked], dropped: [], outcome, sufficient: null };
 }
 
 /**
  * Mechanical fail-closed: when the LLM is unavailable / errored,
  * apply a relative-relevance cutoff so we don't dump the entire ranked
  * list into the prompt. Keeps:
- *   1. items whose score ≥ `topScore · relativeThresholdFloor`
+ *   1. items whose score ≥ `topScore · 0.7`
 *   2. capped at `llmFilterMaxKeep` so the prompt stays small.
 *
- * The ranker already applied an initial cutoff with the same floor,
- * but the LLM is expected to prune further (because cosine + RRF still
- * over-includes); this fallback uses a slightly tighter ratio so the
- * "fail" path doesn't ship as much noise as the success path.
+ * The ranker already applied an initial cutoff with the same family of
+ * floors, but the LLM is expected to prune further (because the
+ * ranker is tuned for recall). This fallback uses a slightly tighter
+ * ratio so the "fail" path doesn't ship as much noise as the success
+ * path.
 */
function safeCutoff(
  ranked: readonly RankedCandidate[],
  deps: FilterDeps,
): FilterResult {
  if (ranked.length === 0) {
-    return { kept: [], dropped: [], outcome: "llm_failed_safe_cutoff" };
+    return {
+      kept: [],
+      dropped: [],
+      outcome: "llm_failed_safe_cutoff",
+      sufficient: null,
+    };
  }
-  // Tighter than the ranker's relativeThresholdFloor — when LLM has
-  // failed, lean conservative.
  const ratio = 0.7;
-  const topScore = ranked.reduce((m, c) => Math.max(m, c.score ?? c.relevance), 0);
+  const topScore = ranked.reduce(
+    (m, c) => Math.max(m, c.score ?? c.relevance),
+    0,
+  );
  const cutoff = topScore > 0 ? topScore * ratio : 0;
  const keepCap = Math.max(1, deps.config.llmFilterMaxKeep);
  const kept: RankedCandidate[] = [];
@@ -201,29 +225,52 @@
    else dropped.push(c);
  }
  // If the cutoff would have dropped everything, keep the single best
-  // candidate so the agent at least sees one option. Better than 0.
+ // candidate so the agent at least sees one option. if (kept.length === 0 && ranked.length > 0) { kept.push(ranked[0]!); dropped.shift(); } - return { kept, dropped, outcome: "llm_failed_safe_cutoff" }; + return { + kept, + dropped, + outcome: "llm_failed_safe_cutoff", + sufficient: null, + }; +} + +function coerceBool(v: unknown): boolean | null { + if (typeof v === "boolean") return v; + if (v === "true" || v === "yes" || v === 1) return true; + if (v === "false" || v === "no" || v === 0) return false; + return null; } -function describeCandidate(r: RankedCandidate): string { +/** + * Render a ranked candidate into a single labelled string for the LLM. + * Much richer than the old 240-char summary — now includes time, role, + * tags, which channels surfaced the row, and the ranker's score. This + * mirrors what openclaw's `filterRelevant` receives and lets the model + * reason over "fresh vs stale", "skill vs memory", "keyword vs vector + * hit" without guessing. + */ +function describeCandidate(r: RankedCandidate, bodyChars: number): string { const c = r.candidate; + const meta = metaOf(r, c); switch (c.tier) { case "tier1": { const skill = c as { skillName?: string; invocationGuide?: string; eta?: number; + status?: string; }; - const name = skill.skillName ?? "(skill)"; - const hint = (skill.invocationGuide ?? "") - .replace(/\s+/g, " ") - .trim() - .slice(0, MAX_CANDIDATE_CONTENT_CHARS); - return `[SKILL] ${name} — ${hint}`; + const head = `${skill.skillName ?? "(skill)"}${ + typeof skill.eta === "number" + ? ` · η=${skill.eta.toFixed(2)}` + : "" + }${skill.status ? ` · ${skill.status}` : ""}`; + const hint = squashBody(skill.invocationGuide ?? "", bodyChars); + return `[SKILL ${meta}] ${head}${hint ? `\n ${hint}` : ""}`; } case "tier2": { if (c.refKind === "trace") { @@ -233,29 +280,62 @@ function describeCandidate(r: RankedCandidate): string { agentText?: string; reflection?: string | null; }; - const body = (tr.summary || tr.userText || tr.agentText || "") - .replace(/\s+/g, " ") - .trim() - .slice(0, MAX_CANDIDATE_CONTENT_CHARS); - return `[TRACE] ${body}`; + const parts: string[] = []; + if (tr.summary?.trim()) parts.push(tr.summary.trim()); + if (tr.userText?.trim()) parts.push(`[user] ${tr.userText.trim()}`); + if (tr.agentText?.trim()) + parts.push(`[assistant] ${tr.agentText.trim()}`); + if (tr.reflection?.trim()) + parts.push(`[note] ${tr.reflection.trim()}`); + const body = squashBody(parts.join(" "), bodyChars); + return `[TRACE ${meta}] ${body}`; } const ep = c as { summary?: string }; - const body = (ep.summary ?? "") - .replace(/\s+/g, " ") - .trim() - .slice(0, MAX_CANDIDATE_CONTENT_CHARS); - return `[EPISODE] ${body}`; + const body = squashBody(ep.summary ?? "", bodyChars); + return `[EPISODE ${meta}] ${body}`; } case "tier3": { const wm = c as { title?: string; body?: string }; const head = wm.title ?? "(world-model)"; - const hint = (wm.body ?? "") - .replace(/\s+/g, " ") - .trim() - .slice(0, MAX_CANDIDATE_CONTENT_CHARS); - return `[WORLD-MODEL] ${head} — ${hint}`; + const body = squashBody(wm.body ?? "", bodyChars); + return `[WORLD-MODEL ${meta}] ${head}${body ? `\n ${body}` : ""}`; } default: - return "[UNKNOWN]"; + return `[UNKNOWN ${meta}]`; + } +} + +function metaOf(r: RankedCandidate, c: TierCandidate): string { + const bits: string[] = []; + if (typeof c.ts === "number" && c.ts > 0) { + bits.push(`time=${formatTime(c.ts)}`); + } + if (Array.isArray((c as { tags?: readonly string[] }).tags)) { + const tags = ((c as { tags?: readonly string[] }).tags ?? 
[]) + .filter(Boolean) + .slice(0, 6); + if (tags.length) bits.push(`tags=[${tags.join(",")}]`); + } + const channels = (c.channels ?? []) + .map((ch) => ch.channel) + .filter(Boolean) + .slice(0, 4); + if (channels.length) bits.push(`via=${channels.join("+")}`); + const score = r.score ?? r.relevance; + if (Number.isFinite(score)) bits.push(`score=${score.toFixed(3)}`); + return bits.join(" "); +} + +function squashBody(s: string, max: number): string { + const cleaned = s.replace(/\s+/g, " ").trim(); + if (cleaned.length <= max) return cleaned; + return cleaned.slice(0, Math.max(0, max - 1)) + "…"; +} + +function formatTime(ts: number): string { + try { + return new Date(ts).toISOString().slice(0, 16).replace("T", " "); + } catch { + return String(ts); } } diff --git a/apps/memos-local-plugin/core/retrieval/ranker.ts b/apps/memos-local-plugin/core/retrieval/ranker.ts index 083d10fe1..453fdbeb1 100644 --- a/apps/memos-local-plugin/core/retrieval/ranker.ts +++ b/apps/memos-local-plugin/core/retrieval/ranker.ts @@ -1,28 +1,35 @@ /** * Ranker — fuses candidates across tiers and enforces diversity. * - * Three passes: + * Design (2026 overhaul, aligned with `memos-local-openclaw::recall/engine`): * - * 1. **Per-channel RRF.** Each `RankedCandidate` carries one - * `ChannelRank` per channel that contributed it (vec_summary, - * vec_action, fts, pattern, structural). The fused score is - * `Σ 1 / (k + rank_i + 1)` over those ranks. A row that surfaces - * in 3 channels gets a much bigger lift than a vector-only hit. - * This is what plugs the "single-channel false positive" hole. + * 1. **Base = best channel score.** A candidate's base evidence is the + * strongest single-channel hit it has — cosine for vector, `1/(rank+1)` + * for FTS / pattern, `0.9` synthetic for structural error-signature. + * This puts all channels on a comparable (0, 1] footing without the + * "cosine=0 for keyword hits" trap the old formula had. * - * 2. **Relative threshold drop.** After computing per-candidate - * `relevance`, drop everyone whose `relevance < topRelevance · - * relativeThresholdFloor`. Adaptive: a strong query (top score 0.9) - * keeps only items ≥ 0.36; a weak query (top 0.4) keeps items ≥ 0.16. + * 2. **RRF bonus across channels.** Multi-channel matches add + * `rrfWeight · Σ 1/(k + rank_i + 1)`. A row confirmed by 2+ channels + * gets a clear lift over single-channel false-positives. * - * 3. **MMR with smart per-tier seed.** Seed at most one candidate per - * non-empty tier (so a packet is never a single-tier monoculture) - * — but only seed a tier if its best candidate clears the relative - * threshold. This kills the "irrelevant skill / world-model gets - * force-injected" failure mode. + * 3. **Tier-specific additive boosts.** V·decay (Tier-2) and η + * (Tier-1) are add-ons that differentiate rows *within* the same + * base-score band — not a dominant term that washes out the RRF + * signal. * - * This module is pure and framework-agnostic — no storage, no embedder, - * no side effects. Unit testable by passing in plain arrays. + * 4. **Multi-channel bypass.** Any candidate surfaced by ≥ 2 channels + * is exempt from the relative-threshold drop (it can still lose in + * MMR on redundancy). This is the backstop that guarantees a + * keyword-only hit confirmed by vector can never be silently + * dropped because a noisy topRelevance dragged the floor up. + * + * 5. 
**Smart-seed MMR.** Phase A seeds at most one candidate per tier,
+ *    and only if its relevance is within `smartSeedRatio` of the pool
+ *    top. Prevents "force-inject an irrelevant Tier-1 / Tier-3 just
+ *    because the tier had a candidate".
+ *
+ * The module stays pure — no storage, no embedder, no side effects.
 */
 
 import { cosinePrenormed, norm2 } from "../storage/vector.js";
@@ -31,6 +38,7 @@
 import { priorityFor } from "../reward/backprop.js";
 import type {
   ChannelRank,
   EpisodeCandidate,
+  RetrievalChannel,
   RetrievalConfig,
   SkillCandidate,
   TierCandidate,
@@ -53,18 +61,20 @@ export interface RankerInput {
 export interface RankedCandidate {
   candidate: TierCandidate;
   /**
-   * Base relevance used by MMR. Blends:
-   *   - cosine + priority (vector-aware tiers)
-   *   - small η nudge for Tier-1
-   *   - per-channel RRF lift (so multi-channel matches surface)
+   * Base relevance used by MMR.
+   *   relevance = bestChannelScore + rrfWeight · Σ 1/(k+rank+1)
+   *               + priorityBoost (tier2) + etaBoost (tier1)
   */
  relevance: number;
-  /** Fused RRF score across channels. */
+  /** Fused RRF score across channels (pre-weighting). */
  rrf: number;
  /** Final MMR-adjusted score. */
  score: number;
  /** `||vec||²`, cached for MMR. `null` means "no vec → treat as fully diverse". */
  normSq: number | null;
+  /** True when this candidate was allowed past the threshold via the
+   * multi-channel bypass (useful for logs / "why did this survive?"). */
+  bypassedThreshold?: boolean;
 }
 
 export interface RankerResult {
@@ -77,10 +87,23 @@
   topRelevance: number;
   /** Number of candidates the relative-threshold cut. */
   droppedByThreshold: number;
+  /** Absolute floor applied (`topRelevance · floor`). */
+  thresholdFloor: number;
+  /** Channel hit counts aggregated across all candidates. */
+  channelHits: Partial<Record<RetrievalChannel, number>>;
 }
 
-const DEFAULT_RELATIVE_THRESHOLD = 0.4;
+const DEFAULT_RELATIVE_THRESHOLD = 0.2;
+const DEFAULT_SMART_SEED_RATIO = 0.7;
 const DEFAULT_SKILL_ETA_BLEND = 0.15;
+/**
+ * How much each channel's RRF contribution is scaled by in the base
+ * relevance formula. Kept small so that "best-channel-score" dominates
+ * per-candidate but multi-channel agreement still gets a clear lift.
+ */
+const RRF_WEIGHT = 0.4;
+/** Default priority blend — V·decay contributes this much at V=1. */
+const DEFAULT_PRIORITY_BLEND = 0.3;
 
 export function rank(input: RankerInput): RankerResult {
   const tierSizes: Record<TierKind, number> = {
@@ -89,6 +112,7 @@
     tier2: input.tier2.length,
     tier3: input.tier3.length,
   };
   const kept: Record<TierKind, number> = { tier1: 0, tier2: 0, tier3: 0 };
+  const channelHits: Partial<Record<RetrievalChannel, number>> = {};
 
   // ─── 1. Bag every candidate with relevance + RRF ──────────────────────────
   const bag: RankedCandidate[] = [];
@@ -97,6 +121,13 @@
   pushAll(bag, input.tier2Episodes, (c) => relevanceFor(c, input));
   pushAll(bag, input.tier3, (c) => relevanceFor(c, input));
 
+  // Tally channel hits for observability.
+  for (const c of bag) {
+    for (const ch of c.candidate.channels ?? []) {
+      channelHits[ch.channel] = (channelHits[ch.channel] ?? 0) + 1;
+    }
+  }
+
   if (bag.length === 0) {
     return {
       ranked: [],
@@ -104,24 +135,44 @@
       kept,
       topRelevance: 0,
       droppedByThreshold: 0,
+      thresholdFloor: 0,
+      channelHits,
     };
   }
 
   assignChannelRrf(bag, input.config.rrfConstant);
-  // Fold the channel-RRF into relevance so MMR + threshold both honour it.
- for (const c of bag) c.relevance += c.rrf; + for (const c of bag) c.relevance += RRF_WEIGHT * c.rrf; - // ─── 2. Relative threshold cut ──────────────────────────────────────────── + // ─── 2. Relative threshold cut (with multi-channel bypass) ──────────────── const topRelevance = bag.reduce((m, c) => Math.max(m, c.relevance), 0); const floorRatio = input.config.relativeThresholdFloor ?? DEFAULT_RELATIVE_THRESHOLD; const cutoff = topRelevance > 0 ? topRelevance * floorRatio : 0; - const droppedByThreshold = bag.filter((c) => c.relevance < cutoff).length; - const survivors = - cutoff > 0 ? bag.filter((c) => c.relevance >= cutoff) : [...bag]; + const bypassEnabled = input.config.multiChannelBypass !== false; + + let droppedByThreshold = 0; + const survivors: RankedCandidate[] = []; + for (const c of bag) { + const channels = c.candidate.channels ?? []; + const multiChannel = bypassEnabled && channels.length >= 2; + if (multiChannel) c.bypassedThreshold = true; + if (cutoff > 0 && c.relevance < cutoff && !multiChannel) { + droppedByThreshold += 1; + continue; + } + survivors.push(c); + } if (survivors.length === 0) { - return { ranked: [], tierSizes, kept, topRelevance, droppedByThreshold }; + return { + ranked: [], + tierSizes, + kept, + topRelevance, + droppedByThreshold, + thresholdFloor: cutoff, + channelHits, + }; } // ─── 3. MMR-style greedy pick ───────────────────────────────────────────── @@ -132,13 +183,18 @@ export function rank(input: RankerInput): RankerResult { const pool = [...survivors]; const limit = Math.min(input.limit, survivors.length); const smartSeed = input.config.smartSeed !== false; - // Smart-seed cutoff: only seed a tier if its best candidate beats this. - // Falls back to plain `cutoff` so we never seed an item we'd otherwise - // drop. Setting `smartSeed = false` reverts to the legacy "seed best - // of every non-empty tier". - const seedCutoff = smartSeed ? cutoff : 0; + const seedRatio = smartSeed + ? input.config.smartSeedRatio ?? DEFAULT_SMART_SEED_RATIO + : 0; + const poolTop = pool.reduce((m, c) => Math.max(m, c.relevance), 0); + const seedCutoff = smartSeed ? poolTop * seedRatio : 0; // Phase A — seeded picks per tier (preserves cross-tier diversity). + // V7 §2.6: each tier answers a different question — we keep at most + // one seed per tier so a packet is never a monoculture, but we only + // seed if the tier's best candidate is within `smartSeedRatio` of the + // pool top. Irrelevant Tier-1 / Tier-3 candidates no longer slip in + // just because the tier was non-empty. const seedTiers: TierKind[] = ["tier1", "tier2", "tier3"]; for (const tk of seedTiers) { if (out.length >= limit) break; @@ -185,35 +241,100 @@ export function rank(input: RankerInput): RankerResult { // Sort the final list by score desc (MMR scores are not guaranteed // monotone during the loop because Phase A seeds get their raw relevance). out.sort((a, b) => b.score - a.score || b.rrf - a.rrf); - return { ranked: out, tierSizes, kept, topRelevance, droppedByThreshold }; + return { + ranked: out, + tierSizes, + kept, + topRelevance, + droppedByThreshold, + thresholdFloor: cutoff, + channelHits, + }; } // ─── Helpers ──────────────────────────────────────────────────────────────── +/** + * Per-candidate base relevance. New design: + * + * relevance = bestChannelScore + * + priorityBlend · priorityForLive (trace / episode) + * + skillEtaBlend · η (skill) + * + * RRF across channels is added *after* this function runs (so we have + * access to `rrfConstant`). 
We start from `bestChannelScore` — which for
+ * vec hits is cosine, for fts/pattern is `1/(rank+1)`, for structural is
+ * the synthetic 0.9 — meaning an exact keyword hit at rank 0 starts at
+ * the same base (1.0) as a cosine-1.0 hit. Without this, pure-keyword
+ * hits with cosine=0 would score essentially zero and get guillotined
+ * by the relative threshold.
+ */
 function relevanceFor(c: TierCandidate, input: RankerInput): number {
-  const cosW = input.config.weightCosine;
-  const priW = input.config.weightPriority;
-  const cos = clamp(c.cosine, -1, 1);
+  const base = bestChannelScore(c);
   if (c.tier === "tier1") {
     const sk = c as SkillCandidate;
-    const etaBlend =
-      input.config.skillEtaBlend ?? DEFAULT_SKILL_ETA_BLEND;
-    // Cosine still dominates; η is a small reliability nudge.
-    return cosW * cos + etaBlend * clamp(sk.eta, 0, 1);
+    const etaBlend = input.config.skillEtaBlend ?? DEFAULT_SKILL_ETA_BLEND;
+    return base + etaBlend * clamp(sk.eta, 0, 1);
   }
   if (c.refKind === "trace") {
     const tc = c as TraceCandidate;
-    const live = priorityFor(tc.value, tc.ts, input.config.decayHalfLifeDays, input.now);
-    return cosW * cos + priW * live;
+    const live = priorityFor(
+      tc.value,
+      tc.ts,
+      input.config.decayHalfLifeDays,
+      input.now,
+    );
+    const blend = priorityBlendFor(input.config);
+    return base + blend * live;
   }
   if (c.refKind === "episode") {
     const ep = c as EpisodeCandidate;
-    const live = priorityFor(ep.maxValue, ep.ts, input.config.decayHalfLifeDays, input.now);
-    return cosW * cos + priW * live;
+    const live = priorityFor(
+      ep.maxValue,
+      ep.ts,
+      input.config.decayHalfLifeDays,
+      input.now,
+    );
+    const blend = priorityBlendFor(input.config);
+    return base + blend * live;
+  }
+  // Tier 3 world-model — no V signal; rely on base + RRF.
+  return base;
+}
+
+/**
+ * `weightPriority` is kept in config for backwards-compat, but the new
+ * default semantics is: "how much priority lifts relevance at V=1".
+ * Historically this was used as a linear weight on a `cos + priority`
+ * blend where `cos` was already in 0~1; now `base` already carries a
+ * 0~1 signal so we scale priority to a non-dominating floor (default
+ * 0.3). Configs that set `weightPriority` below that cap are honoured
+ * as-is; anything above it is clamped, so priority stays a tie-breaker
+ * rather than a dominant term.
+ */
+function priorityBlendFor(config: RetrievalConfig): number {
+  const w = config.weightPriority;
+  if (w == null || w <= 0) return 0;
+  // Cap the effective blend so priority can't single-handedly push a
+  // V=1 trace above a channel-confirmed keyword hit — priority is a
+  // tie-breaker, not a dominant term.
+  return Math.min(w, DEFAULT_PRIORITY_BLEND);
+}
+
+function bestChannelScore(c: TierCandidate): number {
+  const channels = c.channels ?? [];
+  if (channels.length === 0) {
+    // Legacy path — callers that build candidates without `channels`
+    // (unit tests, older fixtures) fall back to the raw cosine.
+    return clamp(c.cosine, 0, 1);
+  }
+  let best = 0;
+  for (const ch of channels) {
+    if (ch.score > best) best = ch.score;
+  }
-  // Tier 3 — cosine only; world-models have no V.
-  return cosW * cos;
+  // If the candidate also carries a cosine (e.g. structural bumped),
+  // honour it as a floor — structural hits set cosine=0.9 synthetically.
+  return Math.max(best, clamp(c.cosine, 0, 1));
+}
 
 function pushAll(
diff --git a/apps/memos-local-plugin/core/retrieval/retrieve.ts b/apps/memos-local-plugin/core/retrieval/retrieve.ts
index 2377359f9..f621d7209 100644
--- a/apps/memos-local-plugin/core/retrieval/retrieve.ts
+++ b/apps/memos-local-plugin/core/retrieval/retrieve.ts
@@ -264,6 +264,8 @@ async function runAll(
   const tier3LatencyMs = plan.wantTier3 ? Date.now() - tier3Start : 0;
 
   const fuseStart = Date.now();
+  const rawCandidateCount =
+    tier1.length + tier2.traces.length + tier2.episodes.length + tier3.length;
   const ranked = rank({
     tier1,
     tier2Traces: tier2.traces,
@@ -281,7 +283,8 @@
   // items that share surface keywords with the query but aren't
   // actually relevant. Fails open — on any error we keep the
   // mechanical ranking.
-  const queryText = (ctx as { userText?: string }).userText ?? compiled.text ?? "";
+  const queryText =
+    (ctx as { userText?: string }).userText ?? compiled.text ?? "";
   const filtered = await llmFilterCandidates(
     { query: queryText, ranked: ranked.ranked },
     {
@@ -292,9 +295,15 @@
   );
   log.debug("llm_filter.done", {
     outcome: filtered.outcome,
-    before: ranked.ranked.length,
+    sufficient: filtered.sufficient,
+    raw: rawCandidateCount,
+    afterThreshold: ranked.ranked.length,
+    droppedByThreshold: ranked.droppedByThreshold,
+    thresholdFloor: round(ranked.thresholdFloor, 3),
+    topRelevance: round(ranked.topRelevance, 3),
     kept: filtered.kept.length,
     dropped: filtered.dropped.length,
+    channels: ranked.channelHits,
   });
 
   const { packet } = toPacket({
@@ -342,6 +351,16 @@
     queryTokens: approxTokens(compiled.text),
     queryTags: compiled.tags,
     emptyPacket: packet.snippets.length === 0,
+    rawCandidateCount,
+    droppedByThresholdCount: ranked.droppedByThreshold,
+    thresholdFloor: ranked.thresholdFloor,
+    topRelevance: ranked.topRelevance,
+    rankedCount: ranked.ranked.length,
+    llmFilterOutcome: filtered.outcome,
+    llmFilterSufficient: filtered.sufficient ?? undefined,
+    llmFilterKept: filtered.kept.length,
+    llmFilterDropped: filtered.dropped.length,
+    channelHits: ranked.channelHits,
   };
 
   log.info("done", {
@@ -432,6 +451,12 @@
   return Math.ceil(s.length / 4);
 }
 
+function round(n: number, d: number): number {
+  if (!Number.isFinite(n)) return n;
+  const f = 10 ** d;
+  return Math.round(n * f) / f;
+}
+
 /** Thin façade so pipelines can `new Retriever(deps)` if they prefer OO. */
 export class Retriever {
   constructor(private readonly deps: RetrievalDeps) {}
diff --git a/apps/memos-local-plugin/core/retrieval/types.ts b/apps/memos-local-plugin/core/retrieval/types.ts
index 2b72c697a..5f68fb3ea 100644
--- a/apps/memos-local-plugin/core/retrieval/types.ts
+++ b/apps/memos-local-plugin/core/retrieval/types.ts
@@ -213,9 +213,26 @@ export interface RetrievalConfig {
   skillEtaBlend?: number;
   /**
    * Smart MMR seeding — only seed a tier if its best candidate clears
-   * `topRelevance · relativeThresholdFloor`. Default true.
+   * `poolTopRelevance · smartSeedRatio` (see below). Default true.
+   * `smartSeed: false` restores the legacy "seed best of every non-empty
+   * tier" behaviour regardless of relevance.
   */
  smartSeed?: boolean;
+  /**
+   * When `smartSeed` is on, only seed a tier whose best candidate's
+   * relevance is ≥ `poolTopRelevance · smartSeedRatio`. Default 0.7.
+   * Independent of `relativeThresholdFloor` so the seed gate can be
+   * stricter than the generic drop floor.
+   */
+  smartSeedRatio?: number;
+  /**
+   * If a candidate is surfaced by ≥ 2 channels, bypass the relative
+   * threshold (it still participates in MMR). This compensates for
+   * the ranker's base formula being "max channel score + additive
+   * boosts" — a two-channel agreement is a strong signal even when
+   * the absolute score falls below the drop floor. Default true.
+   */
+  multiChannelBypass?: boolean;

   /**
    * V7 §2.6 Tier-1 rendering mode.
@@ -254,8 +271,16 @@ export interface RetrievalConfig {
   llmFilterEnabled: boolean;
   /** Keep at most N candidates after the LLM filter. */
   llmFilterMaxKeep: number;
-  /** Skip the filter entirely when the ranked list has ≤ this many items. */
+  /** Skip the filter entirely when the ranked list has fewer than this many items. */
   llmFilterMinCandidates: number;
+  /**
+   * Max chars of body text to show to the LLM filter for each candidate.
+   * Higher = more context for precise judgement, at the cost of more
+   * tokens per round-trip. Default 500 (openclaw uses 300 without
+   * tags/channels; we include richer metadata so a slightly bigger
+   * window pays for itself).
+   */
+  llmFilterCandidateBodyChars?: number;
 }

 /**
@@ -514,6 +539,44 @@ export interface RetrievalStats {
   queryTokens: number;
   queryTags: string[];
   emptyPacket: boolean;
+  /**
+   * Observability breakdown — populated so the Logs page (and
+   * api_logs) can show "how many candidates survived each stage" and
+   * operators can spot "this stage is the lossy one" at a glance.
+   * All fields are optional so legacy callers / older RetrievalStats
+   * consumers keep compiling.
+   */
+  rawCandidateCount?: number;
+  droppedByThresholdCount?: number;
+  thresholdFloor?: number;
+  topRelevance?: number;
+  rankedCount?: number;
+  llmFilterOutcome?:
+    | "disabled"
+    | "no_llm"
+    | "below_threshold"
+    | "empty_query"
+    | "llm_kept_all"
+    | "llm_filtered"
+    | "llm_failed_safe_cutoff";
+  llmFilterSufficient?: boolean;
+  llmFilterKept?: number;
+  llmFilterDropped?: number;
+  /**
+   * Channel hit counts across all tiers, e.g.
+   * `{ vec_summary: 12, fts: 7, pattern: 3, structural: 0 }`. Helps
+   * identify queries that got hits only through one channel (likely
+   * fragile).
+   */
+  channelHits?: Partial<Record<string, number>>;
 }

 /** Discriminated context union — one per entry point in `retrieve.ts`. */
diff --git a/apps/memos-local-plugin/tests/unit/adapters/openclaw-bridge.test.ts b/apps/memos-local-plugin/tests/unit/adapters/openclaw-bridge.test.ts
index 267d56899..94298d7be 100644
--- a/apps/memos-local-plugin/tests/unit/adapters/openclaw-bridge.test.ts
+++ b/apps/memos-local-plugin/tests/unit/adapters/openclaw-bridge.test.ts
@@ -180,6 +180,41 @@ describe("flattenMessages", () => {
     expect(flat[3].content).toBe("file.txt");
   });

+  it("does NOT double-emit tool calls when content[] and top-level tool_calls coexist (pi-ai + OpenAI bundle)", () => {
+    // Regression for the "tool call rows duplicated 2x" bug. OpenAI
+    // messages plumbed through pi-ai carry the canonical pi-ai
+    // `content[{type:"toolCall"}]` shape AND the legacy OpenAI
+    // `tool_calls` top-level array. Pre-fix, flattenMessages emitted
+    // BOTH, which made extractTurn's `pendingCalls.set(key, …)`
+    // overwrite the first stub (with its `thinkingBefore`) with an
+    // empty second stub — so `thinkingBefore` silently went missing
+    // AND the trace ended up with 2× rows per tool.
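+    // The fixture below intentionally carries the SAME call in both
+    // shapes — `content: [{ type: "toolCall", id: "call_X", … }]`
+    // (pi-ai) and `tool_calls: [{ id: "call_X", function: { … } }]`
+    // (OpenAI legacy). Exactly one flattened tool_call row must survive.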
+ const flat = flattenMessages([ + { role: "user", content: "deploy" }, + { + role: "assistant", + content: [ + { type: "text", text: "running" }, + { type: "toolCall", id: "call_X", name: "sh", arguments: { cmd: "deploy" } }, + ], + tool_calls: [ + { + id: "call_X", + function: { name: "sh", arguments: JSON.stringify({ cmd: "deploy" }) }, + }, + ], + }, + ]); + const toolCallEntries = flat.filter((m) => m.role === "tool_call"); + expect(toolCallEntries).toHaveLength(1); + expect(toolCallEntries[0].toolName).toBe("sh"); + expect(toolCallEntries[0].toolCallId).toBe("call_X"); + // Ensure the assistant text emitted for the SAME message is + // preserved — it's the `thinkingBefore` source for this call. + const assistantText = flat.find((m) => m.role === "assistant"); + expect(assistantText?.content).toBe("running"); + }); + it("does NOT coerce unknown roles into 'user' (the bug that captured tool stdout as user input)", () => { const flat = flattenMessages([ { role: "user", content: "real user input" }, @@ -257,18 +292,15 @@ describe("extractTurn", () => { const turn = extractTurn(flat, 1_700_000_000_000); expect(turn).not.toBeNull(); expect(turn!.userText).toBe("how many files?"); - expect(turn!.agentText).toContain("2 files"); + expect(turn!.agentText).toBe("2 files"); expect(turn!.toolCalls).toHaveLength(1); expect(turn!.toolCalls[0].name).toBe("sh"); expect(turn!.toolCalls[0].input).toEqual({ cmd: "ls" }); expect(turn!.toolCalls[0].output).toContain("a.txt"); + expect(turn!.toolCalls[0].thinkingBefore).toBe("running ls"); }); it("captures sysctl-style exec invocation: tool stdout lands in tool output, NOT in userText", () => { - // Regression for the user's bug: an exec tool with a complex - // command + multi-line stdout used to be parsed as a fresh user - // turn whose content was the stdout. Lock down that pi-ai's - // toolResult shape now keeps the boundaries straight. const flat = flattenMessages([ { role: "user", content: "帮我看下当前运行的系统是几个核心多少内存" }, { @@ -304,12 +336,11 @@ describe("extractTurn", () => { expect(turn!.userText).toBe("帮我看下当前运行的系统是几个核心多少内存"); expect(turn!.userText).not.toContain("17179869184"); expect(turn!.userText).not.toContain("Hardware:"); - // Both assistant texts are kept in chronological order — the - // lead-in ("I'll check the system.") and the final answer - // ("10 核 / 16 GB"). What we explicitly forbid is tool stdout - // leaking back into agentText. - expect(turn!.agentText).toContain("I'll check the system."); - expect(turn!.agentText).toContain("10 核 / 16 GB"); + // "I'll check the system." is the model's pre-tool reasoning and + // is captured in the tool's thinkingBefore. The final reply after + // the tool result is agentText. + expect(turn!.toolCalls[0].thinkingBefore).toBe("I'll check the system."); + expect(turn!.agentText).toBe("10 核 / 16 GB"); expect(turn!.agentText).not.toContain("17179869184"); expect(turn!.agentText).not.toContain("Hardware:"); expect(turn!.toolCalls).toHaveLength(1); @@ -338,6 +369,173 @@ describe("extractTurn", () => { expect(turn!.agentThinking).toBe("Let me read the issue first."); }); + it("assigns interleaved thinking to each tool call's thinkingBefore", () => { + // OpenClaw's PI agent alternates: think → tool → result → think → tool. + // Both thinking blocks and regular text before a tool call are + // captured in thinkingBefore. + const flat = flattenMessages([ + { role: "user", content: "fix the build" }, + { + role: "assistant", + content: [ + { type: "thinking", thinking: "Let me check the error log first." 
}, + { type: "text", text: "checking" }, + { type: "toolCall", id: "c1", name: "sh", arguments: { cmd: "cat error.log" } }, + ], + }, + { + role: "toolResult", + toolCallId: "c1", + toolName: "sh", + content: "pg_config not found", + isError: false, + }, + { + role: "assistant", + content: [ + { + type: "thinking", + thinking: "The error says pg_config is missing. I need to install libpq-dev.", + }, + { type: "toolCall", id: "c2", name: "sh", arguments: { cmd: "apt-get install libpq-dev" } }, + ], + }, + { + role: "toolResult", + toolCallId: "c2", + toolName: "sh", + content: "ok", + isError: false, + }, + { + role: "assistant", + content: [ + { type: "thinking", thinking: "Good, now let me retry the build." }, + { type: "toolCall", id: "c3", name: "sh", arguments: { cmd: "make build" } }, + ], + }, + { + role: "toolResult", + toolCallId: "c3", + toolName: "sh", + content: "BUILD SUCCESSFUL", + isError: false, + }, + { + role: "assistant", + content: [{ type: "text", text: "Fixed — the build passes now." }], + }, + ]); + const turn = extractTurn(flat, 0); + expect(turn).not.toBeNull(); + expect(turn!.toolCalls).toHaveLength(3); + // First tool: thinking + text merged into thinkingBefore + expect(turn!.toolCalls[0].thinkingBefore).toBe( + "Let me check the error log first.\n\nchecking", + ); + expect(turn!.toolCalls[1].thinkingBefore).toBe( + "The error says pg_config is missing. I need to install libpq-dev.", + ); + expect(turn!.toolCalls[2].thinkingBefore).toBe("Good, now let me retry the build."); + // All thinking was flushed into tool calls; none left over + expect(turn!.agentThinking).toBeUndefined(); + expect(turn!.agentText).toBe("Fixed — the build passes now."); + }); + + it("tool call has no thinkingBefore when model goes directly to the tool", () => { + const flat = flattenMessages([ + { role: "user", content: "list files" }, + { + role: "assistant", + content: [ + { type: "toolCall", id: "c1", name: "sh", arguments: { cmd: "ls" } }, + ], + }, + { + role: "toolResult", + toolCallId: "c1", + toolName: "sh", + content: "a.txt", + isError: false, + }, + { + role: "assistant", + content: [{ type: "text", text: "found a.txt" }], + }, + ]); + const turn = extractTurn(flat, 0); + expect(turn!.toolCalls[0].thinkingBefore).toBeUndefined(); + expect(turn!.agentText).toBe("found a.txt"); + }); + + it("captures regular assistant text between tool calls as thinkingBefore (most models)", () => { + // Most models (non-Claude, or Claude without extended thinking) + // produce regular text between tool calls, not ThinkingContent. + // This text is the model's reasoning and must be captured. + const flat = flattenMessages([ + { role: "user", content: "帮我查下当前系统有几个cpu有多少g内存" }, + { + role: "assistant", + content: [ + { type: "text", text: "Let me check the CPU count first." }, + { type: "toolCall", id: "c1", name: "exec", arguments: { command: "sysctl -n hw.ncpu" } }, + ], + }, + { + role: "toolResult", + toolCallId: "c1", + toolName: "exec", + content: "10", + isError: false, + }, + { + role: "assistant", + content: [ + { type: "text", text: "OK, 10 CPUs. Now let me check the memory." }, + { type: "toolCall", id: "c2", name: "exec", arguments: { command: "sysctl -n hw.memsize" } }, + ], + }, + { + role: "toolResult", + toolCallId: "c2", + toolName: "exec", + content: "17179869184", + isError: false, + }, + { + role: "assistant", + content: [ + { type: "text", text: "Now let me check disk space." 
}, + { type: "toolCall", id: "c3", name: "exec", arguments: { command: "df -h /" } }, + ], + }, + { + role: "toolResult", + toolCallId: "c3", + toolName: "exec", + content: "/dev/disk1s1 466Gi 200Gi 266Gi 43% /", + isError: false, + }, + { + role: "assistant", + content: [ + { type: "text", text: "Your system has 10 CPUs, 16 GB RAM, and 266 GB free disk space." }, + ], + }, + ]); + const turn = extractTurn(flat, 0); + expect(turn).not.toBeNull(); + expect(turn!.toolCalls).toHaveLength(3); + expect(turn!.toolCalls[0].thinkingBefore).toBe("Let me check the CPU count first."); + expect(turn!.toolCalls[1].thinkingBefore).toBe("OK, 10 CPUs. Now let me check the memory."); + expect(turn!.toolCalls[2].thinkingBefore).toBe("Now let me check disk space."); + expect(turn!.agentText).toBe( + "Your system has 10 CPUs, 16 GB RAM, and 266 GB free disk space.", + ); + // No thinking blocks used, so agentThinking is empty + expect(turn!.agentThinking).toBeUndefined(); + }); + it("falls back gracefully when assistant.toolCall has no matching toolResult", () => { const flat = flattenMessages([ { role: "user", content: "do x" }, diff --git a/apps/memos-local-plugin/tests/unit/retrieval/integration.test.ts b/apps/memos-local-plugin/tests/unit/retrieval/integration.test.ts index b0676e911..73e6c2e77 100644 --- a/apps/memos-local-plugin/tests/unit/retrieval/integration.test.ts +++ b/apps/memos-local-plugin/tests/unit/retrieval/integration.test.ts @@ -148,6 +148,9 @@ function makeDeps(handle: TmpDbHandle): RetrievalDeps { minTraceSim: 0.3, tagFilter: "auto", decayHalfLifeDays: 30, + llmFilterEnabled: false, + llmFilterMaxKeep: 4, + llmFilterMinCandidates: 1, }, now: () => NOW as never, }; diff --git a/apps/memos-local-plugin/tests/unit/retrieval/llm-filter.test.ts b/apps/memos-local-plugin/tests/unit/retrieval/llm-filter.test.ts index a8b05bee9..4097c5f06 100644 --- a/apps/memos-local-plugin/tests/unit/retrieval/llm-filter.test.ts +++ b/apps/memos-local-plugin/tests/unit/retrieval/llm-filter.test.ts @@ -9,16 +9,17 @@ import type { const cfg: Pick< RetrievalConfig, - "llmFilterEnabled" | "llmFilterMaxKeep" | "llmFilterMinCandidates" + | "llmFilterEnabled" + | "llmFilterMaxKeep" + | "llmFilterMinCandidates" + | "llmFilterCandidateBodyChars" > = { llmFilterEnabled: true, llmFilterMaxKeep: 4, - llmFilterMinCandidates: 2, + llmFilterMinCandidates: 1, + llmFilterCandidateBodyChars: 500, }; -// Minimal Logger stub — `llm-filter` only calls `.warn`, `.debug`, `.info`. -// We use `as any` rather than implementing the full `Logger` interface, -// since the missing methods are never invoked in this filter path. 
const log = { trace: vi.fn(), debug: vi.fn(), @@ -33,18 +34,19 @@ function trace(id: string, score: number): RankedCandidate { refKind: "trace", refId: id as never, cosine: score, - ts: 1 as never, + ts: 1_700_000_000_000 as never, vec: null, value: 0.5 as never, priority: 0.5 as never, episodeId: "e1" as never, sessionId: "s1" as never, vecKind: "summary", - userText: "u", - agentText: "a", - summary: "summary text", + userText: `user ${id}`, + agentText: `agent ${id}`, + summary: `summary ${id}`, reflection: null, - tags: [], + tags: ["sample"], + channels: [{ channel: "vec_summary", rank: 0, score }], }; return { candidate: cand, @@ -56,28 +58,46 @@ function trace(id: string, score: number): RankedCandidate { } describe("retrieval/llm-filter", () => { - it("disabled → passthrough", async () => { + it("disabled → passthrough with null sufficient", async () => { const result = await llmFilterCandidates( { query: "anything", ranked: [trace("a", 0.9), trace("b", 0.5)] }, { llm: null, log, config: { ...cfg, llmFilterEnabled: false } }, ); expect(result.outcome).toBe("disabled"); expect(result.kept.length).toBe(2); + expect(result.sufficient).toBeNull(); }); - it("below threshold → passthrough", async () => { + it("below threshold → passthrough (minCandidates can lift the gate)", async () => { const result = await llmFilterCandidates( { query: "x", ranked: [trace("only", 0.9)] }, - { llm: null, log, config: cfg }, + { llm: null, log, config: { ...cfg, llmFilterMinCandidates: 5 } }, ); expect(result.outcome).toBe("below_threshold"); expect(result.kept.length).toBe(1); + expect(result.sufficient).toBeNull(); + }); + + it("single candidate → filter still runs at minCandidates=1 default", async () => { + const llm: any = { + completeJson: vi.fn().mockResolvedValue({ + value: { selected: [1], sufficient: true }, + servedBy: "fake", + }), + }; + const result = await llmFilterCandidates( + { query: "q", ranked: [trace("solo", 0.9)] }, + { llm, log, config: cfg }, + ); + expect(result.outcome).toBe("llm_kept_all"); + expect(result.kept.map((r) => String(r.candidate.refId))).toEqual(["solo"]); + expect(result.sufficient).toBe(true); }); - it("LLM returns selected indices → filters precisely", async () => { + it("LLM returns selected indices → filters precisely and surfaces sufficient", async () => { const llm: any = { completeJson: vi.fn().mockResolvedValue({ - value: { selected: [1, 3] }, + value: { selected: [1, 3], sufficient: false }, servedBy: "fake", }), }; @@ -89,12 +109,13 @@ describe("retrieval/llm-filter", () => { expect(result.outcome).toBe("llm_filtered"); expect(result.kept.map((r) => String(r.candidate.refId))).toEqual(["a", "c"]); expect(result.dropped.map((r) => String(r.candidate.refId))).toEqual(["b"]); + expect(result.sufficient).toBe(false); }); - it("LLM returns empty selection → keeps nothing (drops the whole packet)", async () => { + it("LLM returns empty selection → drops everything and marks insufficient", async () => { const llm: any = { completeJson: vi.fn().mockResolvedValue({ - value: { selected: [] }, + value: { selected: [], sufficient: false }, servedBy: "fake", }), }; @@ -106,6 +127,21 @@ describe("retrieval/llm-filter", () => { expect(result.outcome).toBe("llm_filtered"); expect(result.kept.length).toBe(0); expect(result.dropped.length).toBe(2); + expect(result.sufficient).toBe(false); + }); + + it("coerces string / number `sufficient` fields sent by lax models", async () => { + const llm: any = { + completeJson: vi.fn().mockResolvedValue({ + value: { selected: [1], 
sufficient: "yes" }, + servedBy: "fake", + }), + }; + const result = await llmFilterCandidates( + { query: "q", ranked: [trace("a", 0.9)] }, + { llm, log, config: cfg }, + ); + expect(result.sufficient).toBe(true); }); it("LLM throws → mechanical safe cutoff (NOT passthrough)", async () => { @@ -115,16 +151,16 @@ describe("retrieval/llm-filter", () => { const ranked = [ trace("strong", 0.9), trace("middle", 0.6), - trace("weak", 0.05), // far below 0.7·top → cut by safeCutoff + trace("weak", 0.05), ]; const result = await llmFilterCandidates( { query: "q", ranked }, { llm, log, config: cfg }, ); expect(result.outcome).toBe("llm_failed_safe_cutoff"); + expect(result.sufficient).toBeNull(); const ids = result.kept.map((r) => String(r.candidate.refId)); expect(ids).toContain("strong"); - // weak is far below the relative cutoff → dropped expect(ids).not.toContain("weak"); }); @@ -132,16 +168,11 @@ describe("retrieval/llm-filter", () => { const llm: any = { completeJson: vi.fn().mockRejectedValue(new Error("boom")), }; - const ranked = [trace("only", 0.05)]; - // Below threshold gates the LLM call entirely, so this exercises - // the safeCutoff path indirectly by raising the cutoff via cfg - // override: const result = await llmFilterCandidates( { query: "q", ranked: [trace("a", 0.5), trace("b", 0.49)] }, { llm, log, config: cfg }, ); expect(result.outcome).toBe("llm_failed_safe_cutoff"); - // both are above 0.7 · 0.5 = 0.35, so both kept expect(result.kept.length).toBeGreaterThanOrEqual(1); }); @@ -149,14 +180,13 @@ describe("retrieval/llm-filter", () => { const llm: any = { completeJson: vi.fn().mockRejectedValue(new Error("boom")), }; - // 6 candidates all above threshold, llmFilterMaxKeep=2 → kept ≤ 2. const ranked = [ trace("a", 0.95), trace("b", 0.94), trace("c", 0.93), trace("d", 0.92), trace("e", 0.91), - trace("f", 0.90), + trace("f", 0.9), ]; const result = await llmFilterCandidates( { query: "q", ranked }, @@ -168,10 +198,32 @@ describe("retrieval/llm-filter", () => { it("no LLM at all → passthrough (not safe-cutoff, since the call never happens)", async () => { const result = await llmFilterCandidates( - { query: "q", ranked: [trace("a", 0.9), trace("b", 0.8), trace("c", 0.7)] }, + { + query: "q", + ranked: [trace("a", 0.9), trace("b", 0.8), trace("c", 0.7)], + }, { llm: null, log, config: cfg }, ); expect(result.outcome).toBe("no_llm"); expect(result.kept.length).toBe(3); + expect(result.sufficient).toBeNull(); + }); + + it("candidate description includes time / tags / channels / score metadata", async () => { + const seen: string[] = []; + const llm: any = { + completeJson: vi.fn().mockImplementation(async (messages: any[]) => { + seen.push(messages[1].content); + return { value: { selected: [1], sufficient: true }, servedBy: "fake" }; + }), + }; + await llmFilterCandidates( + { query: "q", ranked: [trace("a", 0.9)] }, + { llm, log, config: cfg }, + ); + expect(seen[0]).toContain("time="); + expect(seen[0]).toContain("tags=[sample]"); + expect(seen[0]).toContain("via=vec_summary"); + expect(seen[0]).toContain("score="); }); }); diff --git a/apps/memos-local-plugin/tests/unit/retrieval/ranker.test.ts b/apps/memos-local-plugin/tests/unit/retrieval/ranker.test.ts index da4f53d34..ec7efc6c2 100644 --- a/apps/memos-local-plugin/tests/unit/retrieval/ranker.test.ts +++ b/apps/memos-local-plugin/tests/unit/retrieval/ranker.test.ts @@ -23,6 +23,9 @@ const cfg: RetrievalConfig = { minTraceSim: 0.35, tagFilter: "auto", decayHalfLifeDays: 30, + llmFilterEnabled: false, + llmFilterMaxKeep: 4, 
+ llmFilterMinCandidates: 1, }; const NOW = 1_700_000_000_000; @@ -110,34 +113,37 @@ describe("retrieval/ranker", () => { expect(out.ranked.length).toBe(0); }); - it("seeds at least one pick per non-empty tier", () => { + it("smart-seed picks every tier when all tier-bests are close to pool top", () => { const out = rank({ tier1: [skill("sk1", 0.9, 0.9)], - tier2Traces: [trace("t1", 0.8, 0.5)], + tier2Traces: [trace("t1", 0.85, 0.5)], tier2Episodes: [], - tier3: [world("w1", 0.7)], + tier3: [world("w1", 0.8)], limit: 3, - config: cfg, + config: { ...cfg, relativeThresholdFloor: 0, smartSeedRatio: 0.7 }, now: NOW, }); - expect(out.ranked.map((r) => r.candidate.tier).sort()).toEqual(["tier1", "tier2", "tier3"]); + expect(out.ranked.map((r) => r.candidate.tier).sort()).toEqual([ + "tier1", + "tier2", + "tier3", + ]); }); - it("tier-2 V-aware order beats pure cosine when weights favor priority", () => { - const highCosLowV = trace("t1", 0.95, 0.0); // high sim, worthless - const highV = trace("t2", 0.4, 0.9); // mediocre sim, high V + it("priority breaks ties within the same base-score band", () => { + // Same cosine → same base. Higher V adds a priority lift. + const lowV = trace("t1", 0.5, 0.0); + const highV = trace("t2", 0.5, 0.9); const out = rank({ tier1: [], - tier2Traces: [highCosLowV, highV], + tier2Traces: [lowV, highV], tier2Episodes: [], tier3: [], limit: 2, - config: { ...cfg, weightCosine: 0.2, weightPriority: 0.8 }, + config: { ...cfg, relativeThresholdFloor: 0 }, now: NOW, }); - // t2 should rank ahead of t1 under priority-heavy weights - const first = out.ranked[0]!.candidate.refId; - expect(String(first)).toBe("t2"); + expect(String(out.ranked[0]!.candidate.refId)).toBe("t2"); }); it("MMR suppresses near-duplicate vectors", () => { @@ -151,7 +157,7 @@ describe("retrieval/ranker", () => { tier2Episodes: [], tier3: [], limit: 2, - config: { ...cfg, mmrLambda: 0 }, // pure diversity + config: { ...cfg, mmrLambda: 0, relativeThresholdFloor: 0 }, // pure diversity now: NOW, }); const picked = out.ranked.map((r) => String(r.candidate.refId)); @@ -179,10 +185,10 @@ describe("retrieval/ranker", () => { tier2Episodes: [episode("ep1", 0.5, 0.9)], tier3: [world("w1", 0.4)], limit: 5, - config: cfg, + config: { ...cfg, relativeThresholdFloor: 0, smartSeedRatio: 0.3 }, now: NOW, }); - // Both tiers are seeded; ep1 should outrank w1 due to its high maxValue. + // ep1 has higher base AND a priority lift from maxValue → should lead. expect(out.ranked[0]!.candidate.refId).toBe("ep1"); }); @@ -192,9 +198,9 @@ describe("retrieval/ranker", () => { const out = rank({ tier1: [], tier2Traces: [ - trace("strong", 0.9, 0.8), // topRelevance ≈ 0.86 + trace("strong", 0.9, 0.8), trace("middle", 0.5, 0.4), - trace("weak", 0.05, 0.0), // ≈ 0.03 → far below floor + trace("weak", 0.05, 0.0), ], tier2Episodes: [], tier3: [], @@ -208,18 +214,23 @@ describe("retrieval/ranker", () => { expect(out.droppedByThreshold).toBeGreaterThanOrEqual(1); }); - it("smart-seed refuses to seed a tier when its best candidate is irrelevant", () => { + it("smart-seed refuses to seed a tier when its best candidate is far from pool top", () => { // Tier-1 + Tier-3 only have weak candidates; Tier-2 has a strong - // signal. With smartSeed=true, the ranker should ship just the - // tier-2 hit and skip the noisy seeds — the previous behaviour - // would have force-injected a marginal Tier-1 + Tier-3 each. + // signal. 
With smartSeedRatio=0.7 AND the relative threshold on, + // the irrelevant tiers should be cut by threshold — smart-seed is + // the Phase-A gate, threshold is the pool-wide gate. const out = rank({ tier1: [skill("sk_irrelevant", 0.05, 0.9)], tier2Traces: [trace("t_strong", 0.9, 0.8)], tier2Episodes: [], tier3: [world("w_irrelevant", 0.05)], limit: 5, - config: { ...cfg, relativeThresholdFloor: 0.4, smartSeed: true }, + config: { + ...cfg, + relativeThresholdFloor: 0.4, + smartSeed: true, + smartSeedRatio: 0.7, + }, now: NOW, }); const ids = out.ranked.map((r) => String(r.candidate.refId)); @@ -228,6 +239,29 @@ describe("retrieval/ranker", () => { expect(ids).not.toContain("w_irrelevant"); }); + it("smart-seed blocks Phase-A tier seeding even when threshold is disabled", () => { + // When threshold=0 the pool keeps everyone, but Phase-A must still + // skip seeding weak tiers. We verify t_strong is seeded first + // (proving Phase-A ran) and that sk_irrelevant / w_irrelevant can + // only appear via Phase-B MMR, not as forced tier seeds. + const out = rank({ + tier1: [skill("sk_irrelevant", 0.05, 0.9)], + tier2Traces: [trace("t_strong", 0.9, 0.8)], + tier2Episodes: [], + tier3: [world("w_irrelevant", 0.05)], + limit: 1, + config: { + ...cfg, + relativeThresholdFloor: 0, + smartSeed: true, + smartSeedRatio: 0.7, + }, + now: NOW, + }); + expect(out.ranked.length).toBe(1); + expect(String(out.ranked[0]!.candidate.refId)).toBe("t_strong"); + }); + it("smartSeed=false restores legacy behaviour (force-seed every tier)", () => { const out = rank({ tier1: [skill("sk_irrelevant", 0.05, 0.9)], @@ -247,14 +281,14 @@ describe("retrieval/ranker", () => { expect(ids).toContain("w_irrelevant"); }); - it("multi-channel hits get an RRF lift over single-channel hits at same cosine", () => { + it("multi-channel hits get an RRF lift over single-channel hits at same base", () => { const single = trace("single_ch", 0.6, 0.0); single.channels = [{ channel: "vec_summary", rank: 0, score: 0.6 }]; const multi = trace("multi_ch", 0.6, 0.0); multi.channels = [ { channel: "vec_summary", rank: 0, score: 0.6 }, - { channel: "fts", rank: 0, score: 1 / 61 }, - { channel: "pattern", rank: 1, score: 1 / 62 }, + { channel: "fts", rank: 0, score: 1 }, + { channel: "pattern", rank: 1, score: 0.5 }, ]; const out = rank({ tier1: [], @@ -268,10 +302,58 @@ describe("retrieval/ranker", () => { expect(String(out.ranked[0]!.candidate.refId)).toBe("multi_ch"); }); + it("multi-channel bypass lets low-relevance keyword hits survive threshold", () => { + // Strong candidate pulls topRelevance up; keyword-only single-channel + // hit would be guillotined by the relative floor, BUT a multi-channel + // hit with the same base should survive via the bypass. 
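+    // Arithmetic for the fixture below: topRelevance tracks `strong`
+    // (base 0.9 plus its priority lift), so the floor is roughly
+    // 0.4 · 0.9 ≈ 0.36 or slightly above. fts_only's base of 0.25 is a
+    // lone channel below the floor → guillotined; confirmed's base of
+    // 0.33 is also below it, but two agreeing channels trigger the bypass.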
+ const strong = trace("strong", 0.9, 0.9); + strong.channels = [{ channel: "vec_summary", rank: 0, score: 0.9 }]; + const ftsOnly = trace("fts_only", 0.1, 0.0); + ftsOnly.channels = [{ channel: "fts", rank: 3, score: 0.25 }]; + const confirmed = trace("confirmed", 0.12, 0.0); + confirmed.channels = [ + { channel: "fts", rank: 3, score: 0.25 }, + { channel: "pattern", rank: 2, score: 0.33 }, + ]; + const out = rank({ + tier1: [], + tier2Traces: [strong, ftsOnly, confirmed], + tier2Episodes: [], + tier3: [], + limit: 5, + config: { ...cfg, relativeThresholdFloor: 0.4, multiChannelBypass: true }, + now: NOW, + }); + const ids = out.ranked.map((r) => String(r.candidate.refId)); + expect(ids).toContain("strong"); + expect(ids).toContain("confirmed"); + // The single-channel weak FTS hit should still get cut. + expect(ids).not.toContain("fts_only"); + }); + + it("multiChannelBypass=false restores strict threshold for multi-channel hits", () => { + const strong = trace("strong", 0.9, 0.9); + strong.channels = [{ channel: "vec_summary", rank: 0, score: 0.9 }]; + const confirmed = trace("confirmed", 0.12, 0.0); + confirmed.channels = [ + { channel: "fts", rank: 3, score: 0.25 }, + { channel: "pattern", rank: 2, score: 0.33 }, + ]; + const out = rank({ + tier1: [], + tier2Traces: [strong, confirmed], + tier2Episodes: [], + tier3: [], + limit: 5, + config: { ...cfg, relativeThresholdFloor: 0.5, multiChannelBypass: false }, + now: NOW, + }); + const ids = out.ranked.map((r) => String(r.candidate.refId)); + expect(ids).toContain("strong"); + expect(ids).not.toContain("confirmed"); + }); + it("skill η no longer dominates cosine — the more-relevant skill wins", () => { - // Old behaviour blended `0.4·η`, so a high-η stale skill could - // outrank a fresh, query-aligned one. With the new default - // `skillEtaBlend=0.15`, cosine dominates. 
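+    // With the default skillEtaBlend of 0.15 (and assuming stale's base
+    // falls back to its 0.2 cosine), the intended ordering is:
+    //   fresh_match:    0.85 + 0.15 · 0.50 ≈ 0.925
+    //   stale_high_eta: 0.20 + 0.15 · 0.95 ≈ 0.343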
const fresh = skill("fresh_match", 0.85, 0.5); fresh.channels = [{ channel: "vec", rank: 0, score: 0.85 }]; const stale = skill("stale_high_eta", 0.2, 0.95); @@ -287,4 +369,28 @@ describe("retrieval/ranker", () => { }); expect(String(out.ranked[0]!.candidate.refId)).toBe("fresh_match"); }); + + it("tallies channel hits for observability", () => { + const a = trace("a", 0.8, 0.5); + a.channels = [ + { channel: "vec_summary", rank: 0, score: 0.8 }, + { channel: "fts", rank: 1, score: 0.5 }, + ]; + const b = trace("b", 0.6, 0.5); + b.channels = [{ channel: "pattern", rank: 0, score: 0.9 }]; + const out = rank({ + tier1: [], + tier2Traces: [a, b], + tier2Episodes: [], + tier3: [], + limit: 5, + config: { ...cfg, relativeThresholdFloor: 0 }, + now: NOW, + }); + expect(out.channelHits.vec_summary).toBe(1); + expect(out.channelHits.fts).toBe(1); + expect(out.channelHits.pattern).toBe(1); + expect(out.topRelevance).toBeGreaterThan(0); + expect(out.thresholdFloor).toBe(0); + }); }); diff --git a/apps/memos-local-plugin/tests/unit/web/tasks-chat.test.ts b/apps/memos-local-plugin/tests/unit/web/tasks-chat.test.ts index 7cb81b876..a1c3a23ee 100644 --- a/apps/memos-local-plugin/tests/unit/web/tasks-chat.test.ts +++ b/apps/memos-local-plugin/tests/unit/web/tasks-chat.test.ts @@ -31,14 +31,11 @@ function trace(part: Partial): TimelineTrace { } describe("flattenChat", () => { - it("emits user → thinking → tools → assistant in that order; reflection is dropped", () => { + it("emits user → [thinking+tool pairs] → assistant; reflection is dropped", () => { const t = trace({ id: "tr1", userText: "go fix the deploy", agentText: "done — see PR #42", - // LLM-native thinking — must surface as a chat bubble. - agentThinking: "Looking at the error chain, pg_config is missing.", - // Plugin-internal reflection — must NEVER appear in the chat log. reflection: "INTERNAL: scoring note — α should be high because this step pinpointed the root cause.", toolCalls: [ @@ -49,6 +46,7 @@ describe("flattenChat", () => { startedAt: T0 + 10, endedAt: T0 + 200, errorCode: "EXIT_1", + thinkingBefore: "Looking at the error chain, pg_config is missing.", }, { name: "bash", @@ -67,20 +65,15 @@ describe("flattenChat", () => { "tool", "assistant", ]); - // The thinking bubble is the model's NATIVE reasoning, NOT - // reflection (which is the plugin's scoring scratchpad). expect(msgs[1]!.text).toContain("pg_config is missing"); expect(msgs[1]!.text).not.toContain("INTERNAL: scoring note"); - // Both tools point back at the same trace and carry their full payload. expect(msgs[2]!.traceId).toBe("tr1"); expect(msgs[2]!.toolName).toBe("bash"); expect(msgs[2]!.toolInput).toContain("pip install psycopg2"); expect(msgs[2]!.toolOutput).toContain("pg_config not found"); expect(msgs[2]!.errorCode).toBe("EXIT_1"); expect(msgs[2]!.toolDurationMs).toBe(190); - // Assistant is the agent text. expect(msgs[4]!.text).toBe("done — see PR #42"); - // No bubble's text leaks the reflection content anywhere. 
for (const m of msgs) { expect(m.text).not.toContain("INTERNAL: scoring note"); } @@ -210,6 +203,107 @@ describe("flattenChat", () => { ]); }); + it("interleaves per-tool thinking when thinkingBefore is present", () => { + const t = trace({ + id: "tr_interleave", + userText: "fix the build", + agentText: "Fixed — build passes now.", + agentThinking: "Check error log.\n\nNeed libpq-dev.\n\nRetry the build.", + toolCalls: [ + { + name: "sh", + input: "cat error.log", + output: "pg_config not found", + startedAt: T0 + 10, + endedAt: T0 + 200, + thinkingBefore: "Check error log.", + }, + { + name: "sh", + input: "apt-get install libpq-dev", + output: "ok", + startedAt: T0 + 300, + endedAt: T0 + 800, + thinkingBefore: "Need libpq-dev.", + }, + { + name: "sh", + input: "make build", + output: "BUILD SUCCESSFUL", + startedAt: T0 + 900, + endedAt: T0 + 1500, + thinkingBefore: "Retry the build.", + }, + ], + }); + const msgs = flattenChat([t]); + expect(msgs.map((m) => m.role)).toEqual([ + "user", + "thinking", // before tool 0 + "tool", + "thinking", // before tool 1 + "tool", + "thinking", // before tool 2 + "tool", + "assistant", + ]); + expect(msgs[1]!.text).toBe("Check error log."); + expect(msgs[3]!.text).toBe("Need libpq-dev."); + expect(msgs[5]!.text).toBe("Retry the build."); + }); + + it("no thinking bubbles when tools lack thinkingBefore (agentThinking only shown for no-tool turns)", () => { + const t = trace({ + id: "tr_no_tb", + userText: "go", + agentText: "done", + agentThinking: "Some thinking.", + toolCalls: [ + { name: "tool_a", startedAt: T0 + 10, endedAt: T0 + 100 }, + { name: "tool_b", startedAt: T0 + 200, endedAt: T0 + 300 }, + ], + }); + const msgs = flattenChat([t]); + expect(msgs.map((m) => m.role)).toEqual([ + "user", + "tool", + "tool", + "assistant", + ]); + }); + + it("only some tools have thinkingBefore — those without get no bubble", () => { + const t = trace({ + id: "tr_partial", + userText: "go", + agentText: "done", + agentThinking: "initial\n\nsecond thought", + toolCalls: [ + { + name: "tool_a", + startedAt: T0 + 10, + endedAt: T0 + 100, + thinkingBefore: "initial", + }, + { + name: "tool_b", + startedAt: T0 + 200, + endedAt: T0 + 300, + // no thinkingBefore — model went straight to the next tool + }, + ], + }); + const msgs = flattenChat([t]); + expect(msgs.map((m) => m.role)).toEqual([ + "user", + "thinking", // before tool_a + "tool", + "tool", // no thinking before tool_b + "assistant", + ]); + expect(msgs[1]!.text).toBe("initial"); + }); + it("returns empty array for empty input", () => { expect(flattenChat([])).toEqual([]); }); diff --git a/apps/memos-local-plugin/web/src/stores/i18n.ts b/apps/memos-local-plugin/web/src/stores/i18n.ts index 47c1a306f..c5192b082 100644 --- a/apps/memos-local-plugin/web/src/stores/i18n.ts +++ b/apps/memos-local-plugin/web/src/stores/i18n.ts @@ -524,6 +524,7 @@ const en = { "logs.search.droppedByLlm": "Dropped by LLM", "logs.search.noCandidates": "No candidates.", "logs.search.noneRelevant": "Candidates were returned but the LLM dropped them all.", + "logs.search.funnel": "Retrieval funnel", "logs.add.warnings": "Warnings", "logs.add.details": "Per-turn items", "pager.pageN": "Page {n} / {total}", @@ -1067,6 +1068,7 @@ const zh: Record = { "logs.search.droppedByLlm": "LLM 剔除", "logs.search.noCandidates": "没有候选。", "logs.search.noneRelevant": "有候选但被 LLM 全部剔除。", + "logs.search.funnel": "召回漏斗", "logs.add.warnings": "警告", "logs.add.details": "每轮条目", "pager.pageN": "第 {n} 页 / 共 {total} 页", diff --git 
a/apps/memos-local-plugin/web/src/views/LogsView.tsx b/apps/memos-local-plugin/web/src/views/LogsView.tsx
index a090fb38f..828ddb5b8 100644
--- a/apps/memos-local-plugin/web/src/views/LogsView.tsx
+++ b/apps/memos-local-plugin/web/src/views/LogsView.tsx
@@ -354,8 +354,25 @@ interface SearchOutput {
   hubCandidates?: SearchCandidate[];
   filtered?: SearchCandidate[];
   droppedByLlm?: SearchCandidate[];
+  stats?: RetrievalStatsPayload;
   error?: string;
 }
+interface RetrievalStatsPayload {
+  raw?: number;
+  ranked?: number;
+  droppedByThreshold?: number;
+  thresholdFloor?: number;
+  topRelevance?: number;
+  llmFilter?: {
+    outcome?: string;
+    kept?: number;
+    dropped?: number;
+    sufficient?: boolean | null;
+  };
+  channelHits?: Record<string, number>;
+  queryTokens?: number;
+  queryTags?: string[];
+}
 interface SearchCandidate {
   tier?: number;
   refKind?: string;
@@ -404,6 +421,7 @@ function MemorySearchDetail({
       ) : (
         <>
+          {out.stats && <RetrievalFunnel stats={out.stats} />}
@@ … @@
+/**
+ * Stage-by-stage funnel readout for a memory_search log entry. (The
+ * component name, element nesting and class names below are a
+ * best-effort sketch; the fields shown match RetrievalStatsPayload.)
+ */
+function RetrievalFunnel({ stats }: { stats: RetrievalStatsPayload }) {
+  const lf = stats.llmFilter ?? {};
+  const raw = stats.raw ?? 0;
+  const ranked = stats.ranked ?? 0;
+  const dropped = stats.droppedByThreshold ?? 0;
+  const kept = lf.kept;
+  const outcome = lf.outcome ?? "—";
+  const fmtNum = (n?: number, digits = 3) =>
+    typeof n === "number" && Number.isFinite(n) ? n.toFixed(digits) : "—";
+  const channelEntries = Object.entries(stats.channelHits ?? {}).filter(
+    ([, v]) => typeof v === "number" && v > 0,
+  );
+  return (
+    <div className="search-funnel">
+      <div className="search-funnel-title">
+        <span>{t("logs.search.funnel")}</span>
+      </div>
+      <div className="search-funnel-badges">
+        <span>raw {raw}</span>
+        <span>ranked {ranked}</span>
+        {dropped > 0 && (
+          <span>dropped≥floor {dropped}</span>
+        )}
+        {typeof kept === "number" && (
+          <span>llm kept {kept}</span>
+        )}
+        <span>outcome {outcome}</span>
+        {lf.sufficient !== null && lf.sufficient !== undefined && (
+          <span>
+            sufficient {String(lf.sufficient)}
+          </span>
+        )}
+        <span>
+          floor {fmtNum(stats.thresholdFloor)} · top {fmtNum(stats.topRelevance)}
+        </span>
+      </div>
+      {channelEntries.length > 0 && (
+        <div className="search-funnel-channels">
+          {channelEntries.map(([ch, n]) => (
+            <span key={ch}>
+              {ch} · {n}
+            </span>
+          ))}
+        </div>
+      )}
+    </div>
+ ); +} + function CandidateSection({ title, count, diff --git a/apps/memos-local-plugin/web/src/views/tasks-chat-data.ts b/apps/memos-local-plugin/web/src/views/tasks-chat-data.ts index 7dff8502a..04fd297be 100644 --- a/apps/memos-local-plugin/web/src/views/tasks-chat-data.ts +++ b/apps/memos-local-plugin/web/src/views/tasks-chat-data.ts @@ -15,6 +15,7 @@ export interface TimelineToolCall { errorCode?: string; startedAt?: number; endedAt?: number; + thinkingBefore?: string | null; } export interface TimelineTrace { @@ -71,13 +72,10 @@ const TOOL_OUTPUT_PREVIEW_CHARS = 1_600; * recognise, in pi-ai's natural emission order: * * 1. `user` — the user query that opened the step (if non-empty). - * 2. `thinking` — LLM-native thinking blocks the model emitted - * before its visible reply (Claude extended, - * pi-ai `ThinkingContent`). Sourced from - * `trace.agentThinking`. Never from `reflection`. - * 3. `tool` × N — every tool call the assistant made, sorted by - * `startedAt` so the chain reads chronologically. - * 4. `assistant` — the assistant's final text reply (if non-empty). + * 2. Interleaved `thinking` + `tool` blocks — each tool call's + * `thinkingBefore` is rendered as a thinking bubble directly + * before its tool, faithfully mirroring the model's think→act loop. + * 3. `assistant` — the assistant's final text reply (if non-empty). * * `trace.reflection` is **deliberately not** turned into a chat bubble. * Reflection is the MemOS plugin's own post-hoc note used to compute @@ -104,21 +102,38 @@ export function flattenChat(traces: readonly TimelineTrace[]): ChatMsg[] { }); } - const thinking = (tr.agentThinking ?? "").trim(); - if (thinking) { - out.push({ - role: "thinking", - text: thinking, - ts: tr.ts, - key: `${tr.id}:thinking`, - traceId: tr.id, - }); - } - const tools = [...(tr.toolCalls ?? [])].sort( (a, b) => (a.startedAt ?? tr.ts) - (b.startedAt ?? tr.ts), ); + + // When there are no tool calls, agentThinking (if present) appears + // as a standalone thinking bubble. When tools exist, the per-tool + // `thinkingBefore` fields carry the interleaved reasoning instead. + if (tools.length === 0) { + const thinking = (tr.agentThinking ?? "").trim(); + if (thinking) { + out.push({ + role: "thinking", + text: thinking, + ts: tr.ts, + key: `${tr.id}:thinking`, + traceId: tr.id, + }); + } + } + tools.forEach((tc, idx) => { + const tb = (tc.thinkingBefore ?? "").trim(); + if (tb) { + out.push({ + role: "thinking", + text: tb, + ts: tc.startedAt ?? tr.ts, + key: `${tr.id}:thinking:${idx}`, + traceId: tr.id, + }); + } + const inputStr = serializeToolPayload(tc.input); const outputStr = serializeToolPayload(tc.output); const dur =