From 7597b09e9af179664e6c4ae214241010c8d3f105 Mon Sep 17 00:00:00 2001
From: Rodrigo Fernandez <rodrigo@nxtlevelsaas.com>
Date: Fri, 19 Jun 2026 05:58:39 -0400
Subject: [PATCH] fix(deriver): prevent AI agent speech misattribution to human
 users (Option C + A)

- Add a SQL filter in get_queue_item_batch (queue_manager.py) so the batch only includes messages authored by the observed peer, plus the preceding message, which is kept as conversational context
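- Harden minimal_deriver_prompt (prompts.py): state the speaker-attribution rule explicitly in two places for emphasis, and require the response to be a JSON-only object of the form {"explicit": [{"content": ...}]}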
- Fixes contamination where messages from other speakers were being processed as if spoken by the observed peer

Fixes #817
---
 src/deriver/prompts.py       | 8 ++++++--
 src/deriver/queue_manager.py | 6 ++++++
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/src/deriver/prompts.py b/src/deriver/prompts.py
index 402bcae71..8c978dce6 100644
--- a/src/deriver/prompts.py
+++ b/src/deriver/prompts.py
@@ -55,7 +55,7 @@ def minimal_deriver_prompt(
     custom_instructions_section = _custom_instructions_section(custom_instructions)
     return c(
         f"""
-Analyze messages to extract **explicit atomic facts** about the target peer.
+Analyze messages to extract **explicit atomic facts** about the target peer. Output MUST be valid JSON.
 
 [EXPLICIT] DEFINITION: Facts about the target peer that can be derived directly from their messages.
    - Transform statements into one or multiple conclusions
@@ -66,11 +66,15 @@ def minimal_deriver_prompt(
 - The target peer is the peer identified below under `Target peer:`.
 - A peer can be a human user, AI agent, bot, service, or other actor.
 - Use the exact peer id from `Target peer:` in final observations, not the phrase "the target peer".
+- CRITICAL: Only extract facts from messages where the target peer is the SPEAKER (the name before the colon in each line, e.g. "target_peer: ..."). Messages from other speakers provide conversational context but must NOT generate observations about the target peer unless the target peer actually said them.
 - Properly attribute observations to the correct subject: if it is about the target peer, use the exact peer id as the subject. If the target peer is referencing someone or something else, make that clear.
 - Observations should make sense on their own. Each observation will be used in the future to better understand the target peer.
-- Extract ALL observations from the target peer's messages, using others as context.
+- Extract ALL observations from the target peer's own messages (where they are the speaker). Do not extract facts from messages spoken by other speakers.
 - Contextualize each observation sufficiently (e.g. "Ann is nervous about the job interview at the pharmacy" not just "Ann is nervous")
 
+OUTPUT FORMAT — Respond with ONLY a JSON object (no markdown, no explanation):
+{{"explicit": [{{"content": "fact 1"}}, {{"content": "fact 2"}}]}}
+
 EXAMPLES (using `alice` as the target peer id):
 - EXPLICIT: "I just had my 25th birthday last Saturday" → "alice is 25 years old", "alice's birthday is June 21st"
 - EXPLICIT: "I took my dog for a walk in NYC" → "alice has a dog", "alice lives in NYC"
diff --git a/src/deriver/queue_manager.py b/src/deriver/queue_manager.py
index e97e67ed4..07b6567d0 100644
--- a/src/deriver/queue_manager.py
+++ b/src/deriver/queue_manager.py
@@ -864,6 +864,12 @@ async def get_queue_item_batch(
                 .where(models.Message.session_name == parsed_key.session_name)
                 .where(models.Message.workspace_name == parsed_key.workspace_name)
                 .where(models.Message.id >= effective_start_id)
+                .where(
+                    or_(
+                        models.Message.peer_name == parsed_key.observed,
+                        models.Message.id == preceding_message_id_subq,
+                    )
+                )
                 .subquery()
             )