From 7597b09e9af179664e6c4ae214241010c8d3f105 Mon Sep 17 00:00:00 2001 From: Rodrigo Fernandez Date: Fri, 19 Jun 2026 05:58:39 -0400 Subject: [PATCH] fix(deriver): prevent AI agent speech misattribution to human users (Option C + A) - Add SQL filter in queue_manager.py to only include messages from the observed peer (plus preceding context) - Harden minimal_deriver_prompt with explicit speaker attribution rules and doubled emphasis - Require JSON-only output from minimal_deriver_prompt and document the expected response shape ({"explicit": [{"content": ...}]}) - Fixes contamination where messages from other speakers were being processed as if spoken by the observed peer Fixes #817 --- src/deriver/prompts.py | 8 ++++++-- src/deriver/queue_manager.py | 6 ++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/deriver/prompts.py b/src/deriver/prompts.py index 402bcae71..8c978dce6 100644 --- a/src/deriver/prompts.py +++ b/src/deriver/prompts.py @@ -55,7 +55,7 @@ def minimal_deriver_prompt( custom_instructions_section = _custom_instructions_section(custom_instructions) return c( f""" -Analyze messages to extract **explicit atomic facts** about the target peer. +Analyze messages to extract **explicit atomic facts** about the target peer. Output MUST be valid JSON. [EXPLICIT] DEFINITION: Facts about the target peer that can be derived directly from their messages. - Transform statements into one or multiple conclusions @@ -66,11 +66,15 @@ def minimal_deriver_prompt( - The target peer is the peer identified below under `Target peer:`. - A peer can be a human user, AI agent, bot, service, or other actor. - Use the exact peer id from `Target peer:` in final observations, not the phrase "the target peer". +- CRITICAL: Only extract facts from messages where the target peer is the SPEAKER (the name before the colon in each line, e.g. "target_peer: ..."). Messages from other speakers provide conversational context but must NOT generate observations about the target peer unless the target peer actually said them. 
- Properly attribute observations to the correct subject: if it is about the target peer, use the exact peer id as the subject. If the target peer is referencing someone or something else, make that clear. - Observations should make sense on their own. Each observation will be used in the future to better understand the target peer. -- Extract ALL observations from the target peer's messages, using others as context. +- Extract ALL observations from the target peer's own messages (where they are the speaker). Do not extract facts from messages spoken by other speakers. - Contextualize each observation sufficiently (e.g. "Ann is nervous about the job interview at the pharmacy" not just "Ann is nervous") +OUTPUT FORMAT — Respond with ONLY a JSON object (no markdown, no explanation): +{{"explicit": [{{"content": "fact 1"}}, {{"content": "fact 2"}}]}} + EXAMPLES (using `alice` as the target peer id): - EXPLICIT: "I just had my 25th birthday last Saturday" → "alice is 25 years old", "alice's birthday is June 21st" - EXPLICIT: "I took my dog for a walk in NYC" → "alice has a dog", "alice lives in NYC" diff --git a/src/deriver/queue_manager.py b/src/deriver/queue_manager.py index e97e67ed4..07b6567d0 100644 --- a/src/deriver/queue_manager.py +++ b/src/deriver/queue_manager.py @@ -864,6 +864,12 @@ async def get_queue_item_batch( .where(models.Message.session_name == parsed_key.session_name) .where(models.Message.workspace_name == parsed_key.workspace_name) .where(models.Message.id >= effective_start_id) + .where( + or_( + models.Message.peer_name == parsed_key.observed, + models.Message.id == preceding_message_id_subq, + ) + ) .subquery() )