Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 69 additions & 0 deletions server.py
Original file line number Diff line number Diff line change
Expand Up @@ -795,6 +795,71 @@ def _parse_antigravity_transcript(text: str) -> list[dict[str, str]]:
return pairs


def _content_to_text(content: Any) -> str:
"""Extract readable text from Claude Code message content blocks."""
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree. Since this parser is used by both the standalone server and the production memory route, please move the Claude transcript parsing into src/utils/transcripts.py and have both server.py and src/api/routes/memory.py import the shared parser from there.

if isinstance(content, str):
return content.strip()
if isinstance(content, list):
chunks: list[str] = []
for item in content:
if isinstance(item, str):
chunks.append(item)
elif isinstance(item, dict) and item.get("type") == "text":
chunks.append(str(item.get("text", "")))
return "\n".join(chunk.strip() for chunk in chunks if chunk.strip()).strip()
return ""
Comment on lines +798 to +810
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The logic for _content_to_text and _parse_claude_code_transcript is duplicated between server.py and src/api/routes/memory.py. This increases maintenance overhead and the risk of inconsistencies as the parsing logic evolves. Consider moving these utilities to a shared module (e.g., src/utils/transcripts.py) that both files can import from.



def _parse_claude_code_transcript(text: str) -> list[dict[str, str]]:
"""Parse Claude Code JSONL transcripts into message pairs."""
pairs: list[dict[str, str]] = []
current_user_query: str | None = None
assistant_chunks: list[str] = []

for raw_line in text.splitlines():
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The current implementation of _parse_claude_code_transcript iterates through every line of the input text and attempts to parse it as JSON. This can be inefficient for large non-JSON transcripts (e.g., standard markdown files that don't match Cursor or Antigravity formats). Since Claude Code transcripts are JSONL files, adding a quick heuristic check at the beginning of the function can avoid unnecessary processing.

Suggested change
- for raw_line in text.splitlines():
+ if not text.strip().startswith("{"):
+     return []
+ for raw_line in text.splitlines():

raw_line = raw_line.strip()
if not raw_line:
continue

try:
event = json.loads(raw_line)
except json.JSONDecodeError:
continue

if not isinstance(event, dict):
continue

has_nested_message = isinstance(event.get("message"), dict)
message = event["message"] if has_nested_message else event
role = message.get("role") or (event.get("type") if not has_nested_message else None)
content = _content_to_text(message.get("content"))
if not content:
continue

if role == "user":
if current_user_query and assistant_chunks:
pairs.append({
"user_query": current_user_query,
"agent_response": "\n\n".join(assistant_chunks).strip(),
})
current_user_query = content
assistant_chunks = []
elif current_user_query:
current_user_query = f"{current_user_query}\n\n{content}"
else:
current_user_query = content
elif role == "assistant" and current_user_query:
assistant_chunks.append(content)

if current_user_query and assistant_chunks:
pairs.append({
"user_query": current_user_query,
"agent_response": "\n\n".join(assistant_chunks).strip(),
})

return pairs
Comment on lines +798 to +860
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 _content_to_text and _parse_claude_code_transcript are duplicated verbatim

Both functions are copied character-for-character between server.py and src/api/routes/memory.py, continuing the existing duplication pattern for the Cursor and Antigravity parsers. Any future bug fix or format change will need to be applied in two places. Centralising the logic in a shared module (e.g. src/utils/transcript_parsers.py) and importing from both files would remove the maintenance risk.

Fix in Cursor Fix in Codex Fix in Claude Code



async def _parse_transcript_with_llm(text: str) -> list[dict[str, str]]:
"""Use an LLM to parse transcript text when format detection fails."""
from src.models import get_model
Expand Down Expand Up @@ -854,6 +919,10 @@ def _parse_transcript_text(text: str) -> tuple[str, list[dict[str, str]]]:
if pairs:
return "antigravity", pairs

pairs = _parse_claude_code_transcript(text)
if pairs:
return "claude_code", pairs

return "unknown", []


Expand Down
75 changes: 75 additions & 0 deletions src/api/routes/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -447,6 +447,77 @@ def _parse_antigravity_transcript(text: str) -> List[MessagePair]:
return pairs


def _content_to_text(content: Any) -> str:
"""Extract readable text from Claude Code message content blocks."""
if isinstance(content, str):
return content.strip()
if isinstance(content, list):
chunks: List[str] = []
for item in content:
if isinstance(item, str):
chunks.append(item)
elif isinstance(item, dict) and item.get("type") == "text":
chunks.append(str(item.get("text", "")))
return "\n".join(chunk.strip() for chunk in chunks if chunk.strip()).strip()
return ""


def _parse_claude_code_transcript(text: str) -> List[MessagePair]:
"""Parse Claude Code JSONL transcripts into message pairs.

Claude Code stores/exports transcripts as newline-delimited JSON objects. User
and assistant turns live under ``message.role`` and ``message.content``.
Tool calls/results are intentionally ignored so only conversational text is
sent to the memory pipeline.
"""
pairs: List[MessagePair] = []
current_user_query: str | None = None
assistant_chunks: List[str] = []

for raw_line in text.splitlines():
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

The current implementation of _parse_claude_code_transcript iterates through every line of the input text and attempts to parse it as JSON. This can be inefficient for large non-JSON transcripts. Adding a quick heuristic check at the beginning of the function can avoid unnecessary processing for files that are clearly not in JSONL format.

Suggested change
- for raw_line in text.splitlines():
+ if not text.strip().startswith("{"):
+     return []
+ for raw_line in text.splitlines():

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good call. Since Claude Code transcripts are JSONL, the shared parser should first reject obvious non-JSONL input before iterating through every line. This should be fixed in the shared parser rather than separately in both files.

raw_line = raw_line.strip()
if not raw_line:
continue

try:
event = json.loads(raw_line)
except json.JSONDecodeError:
continue

if not isinstance(event, dict):
continue

has_nested_message = isinstance(event.get("message"), dict)
message = event["message"] if has_nested_message else event
role = message.get("role") or (event.get("type") if not has_nested_message else None)
Comment on lines +490 to +492
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 event.type not used as role fallback when message dict is present but lacks role

event.get("type") is only tried as a fallback when has_nested_message is False. When a nested message dict exists but contains no "role" key (e.g. {"type":"user","message":{"content":"..."}} — a plausible compact variant), role resolves to None and the event is silently dropped. Using the outer type field as a fallback in both branches would be safer.

Fix in Cursor Fix in Codex Fix in Claude Code

content = _content_to_text(message.get("content"))
if not content:
continue

if role == "user":
if current_user_query and assistant_chunks:
pairs.append(MessagePair(
user_query=current_user_query,
agent_response="\n\n".join(assistant_chunks).strip(),
))
current_user_query = content
assistant_chunks = []
elif current_user_query:
current_user_query = f"{current_user_query}\n\n{content}"
else:
current_user_query = content
elif role == "assistant" and current_user_query:
assistant_chunks.append(content)
Comment on lines +497 to +510
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Tool-only assistant turns silently merge the preceding and following user messages

When an assistant turn contains only tool calls, _content_to_text returns "" and the line is skipped entirely via continue. From the state machine's perspective there is no assistant response between the two user messages, so the elif current_user_query branch fires and concatenates them. For example: User "Add tests" → Assistant [tool_use only] → User "Run them" → Assistant "Done." produces the pair user_query = "Add tests\n\nRun them" instead of two separate pairs, misrepresenting the conversation structure in stored memories.

Fix in Cursor Fix in Codex Fix in Claude Code


if current_user_query and assistant_chunks:
pairs.append(MessagePair(
user_query=current_user_query,
agent_response="\n\n".join(assistant_chunks).strip(),
))

return pairs


async def _parse_transcript_with_llm(text: str) -> List[MessagePair]:
"""Use an LLM to parse transcript text when format detection fails."""
from src.models import get_model
Expand Down Expand Up @@ -507,6 +578,10 @@ def _parse_transcript_text(text: str) -> tuple[str, List[MessagePair]]:
if pairs:
return "antigravity", pairs

pairs = _parse_claude_code_transcript(text)
if pairs:
return "claude_code", pairs
Comment on lines +581 to +583
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Catch-all JSONL parser with no format guard

Unlike the Cursor and Antigravity parsers, _parse_claude_code_transcript has no upfront marker check, so it runs on any input that isn't matched by the earlier detectors. Any JSONL file containing objects with role: "user" / role: "assistant" (or matching type values) and text content — such as an OpenAI or Gemini API log, a generic chat export, or a database dump — will be silently misidentified as a Claude Code transcript and its contents ingested as memories. Consider adding a lightweight guard, e.g. checking that at least one line is a dict containing a "message" key wrapping another dict with a "role" field, before committing to the "claude_code" format. The same issue exists in server.py at the equivalent call site.

Fix in Cursor Fix in Codex Fix in Claude Code


return "unknown", []


Expand Down
73 changes: 73 additions & 0 deletions tests/test_claude_code_transcript.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
import ast
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, List


@dataclass
class MessagePair:
    """Lightweight stand-in for the ``MessagePair`` type used by the parsers.

    The parser functions are executed in a hand-built namespace, so they need
    some ``MessagePair`` that accepts ``user_query`` / ``agent_response``
    keyword arguments and exposes them as attributes for the assertions.
    """

    # Text of the user turn(s) that opened the exchange.
    user_query: str
    # Assistant text produced in reply to ``user_query``.
    agent_response: str


def load_parser():
    """Load ``_parse_transcript_text`` from the memory route without importing it.

    The route module's source is parsed with ``ast``; only the synchronous
    top-level parser functions named in ``wanted`` are kept, compiled, and
    executed in a small namespace that supplies the few names they reference.
    NOTE(review): presumably this avoids pulling in the route module's other
    imports during the test run; confirm against the project setup.

    Returns:
        The ``_parse_transcript_text`` function defined by the extracted code.
    """
    # Anchor on this test file instead of the current working directory so the
    # tests behave the same no matter where pytest is invoked from.
    repo_root = Path(__file__).resolve().parents[1]
    memory_module = repo_root / "src" / "api" / "routes" / "memory.py"
    source = memory_module.read_text(encoding="utf-8")
    tree = ast.parse(source)

    wanted = {
        "_parse_cursor_transcript",
        "_parse_antigravity_transcript",
        "_content_to_text",
        "_parse_claude_code_transcript",
        "_parse_transcript_text",
    }
    # ast.FunctionDef does not match ``async def`` nodes, so async helpers in
    # the module are never selected here.
    selected = [
        node
        for node in tree.body
        if isinstance(node, ast.FunctionDef) and node.name in wanted
    ]
    module = ast.Module(body=selected, type_ignores=[])
    ast.fix_missing_locations(module)

    # Globals the extracted functions look up at call time.
    namespace = {
        "Any": Any,
        "List": List,
        "MessagePair": MessagePair,
        "json": json,
        "re": re,
    }
    exec(compile(module, "memory_parser_subset", "exec"), namespace)
    return namespace["_parse_transcript_text"]


def test_parse_claude_code_jsonl_transcript():
    """Two user/assistant exchanges in JSONL form produce two ordered pairs."""
    jsonl_lines = [
        '{"type":"user","message":{"role":"user","content":"Add tests for login"}}',
        '{"type":"assistant","message":{"role":"assistant","content":[{"type":"text","text":"I added the login tests."},{"type":"tool_use","name":"Bash"}]}}',
        '{"type":"user","message":{"role":"user","content":[{"type":"text","text":"Run them"}]}}',
        '{"type":"assistant","message":{"role":"assistant","content":"All tests passed."}}',
    ]
    parse = load_parser()

    detected, parsed = parse("\n".join(jsonl_lines))

    assert detected == "claude_code"
    assert len(parsed) == 2

    first, second = parsed
    # The tool_use block in the first assistant turn contributes no text.
    assert first.user_query == "Add tests for login"
    assert first.agent_response == "I added the login tests."
    assert second.user_query == "Run them"
    assert second.agent_response == "All tests passed."


def test_parse_claude_code_ignores_tool_only_blocks():
    """An assistant turn holding only a tool_use block adds nothing to the pair."""
    jsonl_lines = [
        '{"message":{"role":"user","content":"Inspect the repo"}}',
        '{"message":{"role":"assistant","content":[{"type":"tool_use","name":"Read"}]}}',
        '{"message":{"role":"assistant","content":[{"type":"text","text":"The repo uses FastAPI."}]}}',
    ]
    parse = load_parser()

    detected, parsed = parse("\n".join(jsonl_lines))

    assert detected == "claude_code"
    assert len(parsed) == 1

    only_pair = parsed[0]
    assert only_pair.user_query == "Inspect the repo"
    assert only_pair.agent_response == "The repo uses FastAPI."
Loading