# memorium/debug_agent.py -- from the small-cactus/memorium repository (master branch)

import json
import os
import ssl
import sys
import time
import urllib.error
import urllib.parse
import urllib.request

# Config from App.jsx
# API key read from the GEMINI_API_KEY environment variable; defaults to an
# empty string, in which case call_gemini() raises RuntimeError.
GEMINI_KEY = os.environ.get("GEMINI_API_KEY", "")
# Model name interpolated into the generateContent endpoint URL.
GEMINI_MODEL = "gemini-2.5-flash"
# Base URL of the HTTP server that perform_vector_search() queries at /api/search.
API_BASE = "http://localhost:8765"

def perform_vector_search(query, limit=5, timeout=10):
    """Query the search endpoint under API_BASE and return its result list.

    Issues ``GET {API_BASE}/api/search?q=<query>&k=<limit>`` and returns the
    value stored under the ``"results"`` key of the JSON response.

    Args:
        query: Free-text search string; percent-encoded before being sent.
        limit: Value sent as the ``k`` query parameter.
        timeout: Seconds to wait on the socket before giving up, so a stalled
            server cannot hang the script indefinitely.

    Returns:
        The list found under ``"results"``, or ``[]`` when the request fails,
        the body is not a JSON object, or ``"results"`` is missing or null.
    """
    q = urllib.parse.quote(query)
    url = f"{API_BASE}/api/search?q={q}&k={limit}"
    try:
        with urllib.request.urlopen(url, timeout=timeout) as response:
            data = json.loads(response.read().decode())
        # `or []` normalises an explicit JSON null so callers can always
        # iterate and take len() of the return value.
        return data.get("results") or []
    except Exception as e:
        # Deliberately best-effort: this is a debug script, so any failure
        # (connection refused, timeout, bad JSON, unexpected shape) is
        # reported and treated as "no results" rather than aborting the run.
        print(f"[Error] Search failed: {e}")
        return []

def call_gemini(messages, tools=None, timeout=60):
    """Send *messages* to the Gemini ``generateContent`` endpoint.

    Args:
        messages: Iterable of dicts with ``"role"`` and ``"parts"`` keys.
            Entries whose role is ``"system"`` are skipped; the prompt builder
            in this file folds system text into the user message instead.
        tools: Optional tool declarations, added to the payload under
            ``"tools"`` when truthy.
        timeout: Seconds to wait on the socket before giving up.

    Returns:
        The decoded JSON response as a dict, or ``None`` when the request
        fails (HTTP error status, network error, TLS failure, or timeout).

    Raises:
        RuntimeError: If GEMINI_KEY is empty.
    """
    if not GEMINI_KEY:
        raise RuntimeError("Set GEMINI_API_KEY before running this debug script.")
    url = (
        "https://generativelanguage.googleapis.com/v1beta/models/"
        f"{GEMINI_MODEL}:generateContent"
    )

    # The API expects "contents": [ { "role": ..., "parts": [...] } ].
    contents = [
        {"role": m["role"], "parts": m["parts"]}
        for m in messages
        if m["role"] != "system"
    ]

    payload = {
        "contents": contents,
        "generationConfig": {
            "temperature": 0.3,
            "maxOutputTokens": 500,
        },
    }
    if tools:
        payload["tools"] = tools

    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers={
            "Content-Type": "application/json",
            # Send the key as a header rather than a URL query parameter so it
            # does not end up in URLs that get logged or echoed in errors.
            "x-goog-api-key": GEMINI_KEY,
        },
    )

    # Verify the server certificate: this request carries the API key, so an
    # unverified context would expose it to a man-in-the-middle.
    context = ssl.create_default_context()
    try:
        with urllib.request.urlopen(req, timeout=timeout, context=context) as response:
            return json.loads(response.read().decode())
    except urllib.error.HTTPError as e:
        body = e.read().decode(errors="replace")
        print(f"[Gemini API Error] {e.code}: {body}")
        return None
    except OSError as e:
        # URLError, TLS verification failures and socket timeouts are all
        # OSError subclasses; report them instead of crashing the script.
        print(f"[Gemini Network Error] {e}")
        return None

def build_prompt_system(query, results):
    """Build the single user-role message that carries the whole prompt.

    The first part holds the system instructions (joined with spaces, the
    Python equivalent of the JS ``.join(" ")``), the user's query and a
    ``Context:`` header. Each search result then contributes one text part
    describing its index, truncated capture id, timestamp and predictions.
    No image data is attached; the model sees text descriptions only.

    Args:
        query: The user's request, as a string.
        results: Search results; each needs ``capture_id`` and ``created_at``
            and may carry ``predictions`` (dicts with ``rank`` and ``label``).

    Returns:
        A one-element list: ``[{"role": "user", "parts": [...]}]``.
    """
    # System prompt under test; keep the wording in sync with App.jsx.
    instructions = (
        "You are Memorium, a visual match agent.",
        "Goal: find the best screenshot for the user's request.",
        "IMPORTANT: The database stores Moondream2 captions for each screenshot; text search works best when you use words likely to appear in those captions (visible UI elements, app names, on-screen text).",
        "When using `search_vector_db`, differentiate your queries from the user's abstract request:",
        "1. Translate concepts into descriptive phrases (e.g., instead of 'coding', search for 'dark VS Code window with Python code and a file explorer on the left').",
        "2. Mention specific UI elements or on-screen text (e.g., 'Run button', 'terminal prompt', 'blue send button').",
        "3. Keep queries concrete and visual, not abstract intentions.",
        "Use `search_vector_db` if the current context is insufficient.",
        "Use `provide_answer` ONLY when you have found the matching memory.",
        "chosen_index refers to the Context list below.",
    )
    preamble = " ".join(instructions)

    # One text part per result, in the order the results were given.
    context_entries = []
    for position, hit in enumerate(results):
        labels = []
        for pred in hit.get('predictions', []):
            labels.append(f"{pred['rank']}. {pred['label']}")
        joined_labels = "; ".join(labels)
        short_id = hit['capture_id'][:6]
        line = f"[Index {position}] ID:{short_id} Time:{hit['created_at']} Preds:{joined_labels}"
        context_entries.append({"text": line})

    header = {"text": preamble + "\n\nUser: " + query + "\n\nContext:\n"}
    return [{"role": "user", "parts": [header, *context_entries]}]

# Function declarations passed as the `tools` payload to call_gemini() by
# run_test(). Two functions are declared:
#   - search_vector_db: asks for another search (query required; limit,
#     start_time and end_time optional).
#   - provide_answer: the final pick (chosen_index, answer and reason required).
# This script only prints any function call the model returns; it does not
# execute it. perform_vector_search() in this file accepts only a query and a
# limit (default 5), so the time-range parameters and the "default 30" wording
# presumably describe the app-side executor -- TODO confirm against App.jsx.
TOOLS_DEF = [{
  "function_declarations": [{
    "name": "search_vector_db",
    "description": "Search the vector database for screenshots matching a query. Supports filtering by time range.",
    "parameters": {
      "type": "OBJECT",
      "properties": {
        "query": { "type": "STRING", "description": "The detailed search query for the vector database." },
        "limit": { "type": "INTEGER", "description": "Number of results to return (default 30)." },
        "start_time": { "type": "STRING", "description": "Optional ISO timestamp" },
        "end_time": { "type": "STRING", "description": "Optional ISO timestamp" }
      },
      "required": ["query"]
    }
  }, {
    "name": "provide_answer",
    "description": "Provide the final answer to the user.",
    "parameters": {
      "type": "OBJECT",
      "properties": {
        "chosen_index": { "type": "INTEGER", "description": "The 0-based index of the best screenshot in the Context list." },
        "answer": { "type": "STRING", "description": "The concise answer." },
        "reason": { "type": "STRING", "description": "Why this image was chosen." }
      },
      "required": ["chosen_index", "answer", "reason"]
    }
  }]
}]

def run_test(query):
    """Run one search -> prompt -> Gemini round trip and print each stage.

    Steps:
      1. Search the local API for *query* and list the hits.
      2. Build the prompt from those hits and send it to Gemini together
         with TOOLS_DEF.
      3. Print every text part and function call in the first candidate.

    Args:
        query: The user request to test.

    Returns:
        None. All output goes to stdout.
    """
    print(f"--- Testing Query: '{query}' ---")

    # 1. Initial Local Search
    print("1. Performing initial local search...")
    # `or []` guards against a null result set so len()/iteration are safe.
    initial_results = perform_vector_search(query, limit=5) or []
    print(f"   Found {len(initial_results)} results.")
    for i, r in enumerate(initial_results):
        # .get() keeps the listing alive when a hit lacks an optional field.
        print(f"   [{i}] {r.get('created_at', '?')} - {r.get('image_path', '?')}")

    # 2. Build Prompt
    messages = build_prompt_system(query, initial_results)

    # 3. Call Gemini
    print("2. Calling Gemini...")
    resp = call_gemini(messages, tools=TOOLS_DEF)

    if not resp:
        print("   No response from Gemini.")
        return

    # "candidates" may be absent OR present-but-empty; a plain
    # .get(..., [{}])[0] raises IndexError on the empty-list case.
    candidates = resp.get("candidates") or [{}]
    cand = candidates[0]
    parts = (cand.get("content") or {}).get("parts") or []

    print("3. Gemini Response:")
    if not parts:
        # Surface whatever the API returned about why there is no content.
        print(
            "   (no content parts; "
            f"finishReason={cand.get('finishReason')}, "
            f"promptFeedback={resp.get('promptFeedback')})"
        )
    for p in parts:
        if "text" in p:
            print(f"   [Text]: {p['text']}")
        if "functionCall" in p:
            fc = p["functionCall"]
            # .get() avoids a KeyError if "name" or "args" is missing.
            print(f"   [Function Call]: {fc.get('name')} args={fc.get('args', {})}")

if __name__ == "__main__":
    # Use the first command-line argument as the query, else a sample query.
    q = sys.argv[1] if len(sys.argv) > 1 else "coding in python"
    run_test(q)