fix(langchain): preserve anthropic cache metrics (#455)

AbhiPrasad · web-flow · commit 0ca914ebadb9 · 2026-05-22T17:53:52.000Z
LangChain Anthropic responses report cache reads and cache writes separately from normal input tokens, including TTL-specific cache creation buckets. The previous cached-token fix avoided OpenAI double counting, but it could drop Anthropic cache-write detail from spans and produce totals that were less useful for cost analysis. Preserve the cache creation metrics users need to understand prompt-cache spend and keep token totals aligned with the prompt-cache semantics, while continuing to avoid double counting OpenAI cached input tokens. ref https://github.com/braintrustdata/braintrust-spec/blob/main/docs/features/prompt-cache.md
diff --git a/examples/langchain/README.md b/examples/langchain/README.md
@@ -11,3 +11,20 @@ export OPENAI_API_KEY=...
 uv sync
 uv run python example.py
 ```
+
+## Anthropic prompt-cache metrics demo
+
+Run `anthropic_prompt_cache.py` to verify Anthropic cache reads/writes and token totals on real Braintrust spans.
+
+```bash
+# Loads BRAINTRUST_API_KEY and ANTHROPIC_API_KEY from ../../.env automatically.
+uv sync
+uv run python anthropic_prompt_cache.py
+```
+
+To inspect the logged spans with the Braintrust CLI:
+
+```bash
+bt projects list --json | jq '.[] | select(.name == "py-sdk-demo-langchain-anthropic-cache")'
+bt view logs --object-ref project_logs:<project-id> --list-mode spans --limit 10 --json
+```
diff --git a/examples/langchain/anthropic_prompt_cache.py b/examples/langchain/anthropic_prompt_cache.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python
+"""Verify LangChain Anthropic prompt-cache metrics in Braintrust.
+
+This sends two Anthropic requests through LangChain with a cacheable system
+prompt. The resulting Braintrust spans should show Anthropic cache reads and
+cache writes, including TTL-specific cache creation metrics when Anthropic
+returns them.
+"""
+
+import os
+import uuid
+from pathlib import Path
+
+import braintrust
+from braintrust.integrations.langchain import BraintrustCallbackHandler
+from dotenv import load_dotenv
+from langchain_anthropic import ChatAnthropic
+from langchain_core.messages import BaseMessage, HumanMessage, SystemMessage
+
+
+ROOT = Path(__file__).resolve().parents[2]
+load_dotenv(ROOT / ".env")
+
+PROJECT_NAME = os.environ.get("BRAINTRUST_PROJECT", "py-sdk-demo-langchain-anthropic-cache")
+MODEL = os.environ.get("ANTHROPIC_MODEL", "claude-sonnet-4-5-20250929")
+
+# Anthropic prompt caching requires a sufficiently long cacheable prefix.
+CACHEABLE_SYSTEM_PROMPT = "\n".join(
+    [
+        "You are helping validate prompt-cache accounting in an SDK integration.",
+        "Always answer briefly and mention the requested section title.",
+        "",
+        "Reference document:",
+        *[
+            f"Section {i}: This paragraph describes stable product guidance, tracing semantics, "
+            "token accounting, and prompt-cache behavior for repeat requests."
+            for i in range(1, 90)
+        ],
+        f"Stable cache key: {os.environ.get('CACHE_DEMO_KEY', 'langchain-anthropic-cache-demo')}",
+    ]
+)
+
+
+def main() -> None:
+    logger = braintrust.init_logger(project=PROJECT_NAME)
+    handler = BraintrustCallbackHandler(logger=logger)
+    model = ChatAnthropic(model=MODEL, max_tokens=64)
+
+    messages: list[BaseMessage] = [
+        SystemMessage(
+            content=[
+                {
+                    "type": "text",
+                    "text": CACHEABLE_SYSTEM_PROMPT,
+                    "cache_control": {"type": "ephemeral"},
+                }
+            ]
+        ),
+        HumanMessage(content=f"What is this document for? Run id: {uuid.uuid4().hex}"),
+    ]
+
+    for label in ("cache write", "cache read"):
+        result = model.invoke(messages, config={"callbacks": [handler]})
+        print(f"{label}: {result.content}")
+
+    braintrust.flush()
+    print(f"Logged demo spans to Braintrust project: {PROJECT_NAME}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/langchain/pyproject.toml b/examples/langchain/pyproject.toml
@@ -5,8 +5,10 @@ description = "LangChain chain traced via BraintrustCallbackHandler"
 requires-python = ">=3.10"
 dependencies = [
     "braintrust",
+    "langchain-anthropic",
     "langchain-core",
     "langchain-openai",
+    "python-dotenv",
 ]
 
 [tool.uv.sources]
diff --git a/py/src/braintrust/integrations/langchain/callbacks.py b/py/src/braintrust/integrations/langchain/callbacks.py
@@ -661,36 +661,40 @@ def _get_metrics_from_response(response: LLMResult):
             input_token_details = usage_metadata.get("input_token_details")
             if input_token_details and isinstance(input_token_details, dict):
                 cache_read = input_token_details.get("cache_read")
-                # langchain-anthropic >= 1.4.0 maps cache_creation_input_tokens to
-                # ephemeral tier fields (ephemeral_5m_input_tokens, ephemeral_1h_input_tokens)
-                # rather than the top-level cache_creation field. Sum both for compat.
                 cache_creation = input_token_details.get("cache_creation")
-                if not cache_creation and (
-                    "ephemeral_5m_input_tokens" in input_token_details
-                    or "ephemeral_1h_input_tokens" in input_token_details
-                ):
-                    cache_creation = input_token_details.get("ephemeral_5m_input_tokens", 0) + input_token_details.get(
-                        "ephemeral_1h_input_tokens", 0
-                    )
+                cache_creation_5m = input_token_details.get("ephemeral_5m_input_tokens")
+                cache_creation_1h = input_token_details.get("ephemeral_1h_input_tokens")
+                has_cache_creation_breakdown = cache_creation_5m is not None or cache_creation_1h is not None
 
                 if cache_read is not None:
                     metrics["prompt_cached_tokens"] = cache_read
-                if cache_creation is not None:
-                    metrics["prompt_cache_creation_tokens"] = cache_creation
-
-                cache_tokens = (cache_read or 0) + (cache_creation or 0)
+                if has_cache_creation_breakdown:
+                    # Anthropic exposes TTL-specific cache creation buckets. Preserve the
+                    # split so downstream cost tooling can price 5m vs 1h writes correctly.
+                    if cache_creation_5m is not None:
+                        metrics["prompt_cache_creation_5m_tokens"] = cache_creation_5m
+                    if cache_creation_1h is not None:
+                        metrics["prompt_cache_creation_1h_tokens"] = cache_creation_1h
+                    effective_cache_creation = (cache_creation_5m or 0) + (cache_creation_1h or 0)
+                else:
+                    if cache_creation is not None:
+                        metrics["prompt_cache_creation_tokens"] = cache_creation
+                    effective_cache_creation = cache_creation or 0
+                cache_tokens = (cache_read or 0) + effective_cache_creation
                 prompt_tokens = metrics.get("prompt_tokens")
                 completion_tokens = metrics.get("completion_tokens")
                 total_tokens = metrics.get("total_tokens")
-                if (
-                    cache_tokens
-                    and prompt_tokens is not None
-                    and completion_tokens is not None
-                    and total_tokens == prompt_tokens + completion_tokens
-                    and _cache_tokens_are_separate_from_input_tokens(input_token_details)
-                ):
-                    metrics["prompt_tokens"] = prompt_tokens + cache_tokens
-                    metrics["total_tokens"] = total_tokens + cache_tokens
+                if prompt_tokens is not None and completion_tokens is not None:
+                    if (
+                        cache_tokens
+                        and total_tokens == prompt_tokens + completion_tokens
+                        and _cache_tokens_are_separate_from_input_tokens(input_token_details)
+                    ):
+                        prompt_tokens += cache_tokens
+                        metrics["prompt_tokens"] = prompt_tokens
+                        if total_tokens is not None:
+                            metrics["total_tokens"] = total_tokens + cache_tokens
+                    metrics["tokens"] = prompt_tokens + completion_tokens
 
     if not metrics or not any(metrics.values()):
         llm_output: dict[str, Any] = response.llm_output or {}
diff --git a/py/src/braintrust/integrations/langchain/test_callbacks.py b/py/src/braintrust/integrations/langchain/test_callbacks.py
@@ -8,12 +8,10 @@
 import pytest
 from braintrust import logger
 from braintrust.integrations.langchain import BraintrustCallbackHandler
-from braintrust.integrations.langchain.callbacks import _get_metrics_from_response
 from braintrust.logger import flush
 from braintrust.test_helpers import init_test_logger
 from langchain_core.callbacks import BaseCallbackHandler
 from langchain_core.messages import AIMessage, BaseMessage, HumanMessage, SystemMessage, ToolMessage
-from langchain_core.outputs import ChatGeneration, LLMResult
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_core.prompts.prompt import PromptTemplate
 from langchain_core.runnables import RunnableMap, RunnableSerializable
@@ -908,34 +906,6 @@ def test_streaming_ttft(logger_memory_logger):
     )
 
 
-def test_openai_cached_tokens_are_not_folded_into_prompt_tokens():
-    response = LLMResult(
-        generations=[
-            [
-                ChatGeneration(
-                    message=AIMessage(
-                        content="Done",
-                        response_metadata={"model_name": "gpt-4o-mini-2024-07-18"},
-                        usage_metadata={
-                            "input_tokens": 1000,
-                            "output_tokens": 200,
-                            "total_tokens": 1200,
-                            "input_token_details": {"cache_read": 500},
-                        },
-                    )
-                )
-            ]
-        ]
-    )
-
-    assert _get_metrics_from_response(response) == {
-        "prompt_tokens": 1000,
-        "completion_tokens": 200,
-        "total_tokens": 1200,
-        "prompt_cached_tokens": 500,
-    }
-
-
 @pytest.mark.vcr
 def test_prompt_caching_tokens(logger_memory_logger):
     from langchain_anthropic import ChatAnthropic
@@ -1114,11 +1084,19 @@ def test_prompt_caching_tokens(logger_memory_logger):
     assert "prompt_tokens" in first_metrics
     assert first_metrics["prompt_tokens"] > 0
 
-    assert "prompt_cache_creation_tokens" in first_metrics
-    assert first_metrics["prompt_cache_creation_tokens"] > 0
+    first_has_cache_creation_split = (
+        "prompt_cache_creation_5m_tokens" in first_metrics or "prompt_cache_creation_1h_tokens" in first_metrics
+    )
+    first_cache_creation_split = first_metrics.get("prompt_cache_creation_5m_tokens", 0) + first_metrics.get(
+        "prompt_cache_creation_1h_tokens", 0
+    )
+    first_cache_creation_tokens = first_cache_creation_split or first_metrics.get("prompt_cache_creation_tokens", 0)
+    assert first_cache_creation_tokens > 0
+    if first_has_cache_creation_split:
+        assert "prompt_cache_creation_tokens" not in first_metrics
     assert first_metrics["prompt_cached_tokens"] == 0
-    assert first_metrics["prompt_tokens"] >= first_metrics["prompt_cache_creation_tokens"]
-    assert first_metrics["total_tokens"] == first_metrics["prompt_tokens"] + first_metrics["completion_tokens"]
+    assert first_metrics["prompt_tokens"] >= first_cache_creation_tokens
+    assert first_metrics["tokens"] == first_metrics["prompt_tokens"] + first_metrics["completion_tokens"]
 
     second_metrics = None
     for attempt in range(3):
@@ -1147,9 +1125,14 @@ def test_prompt_caching_tokens(logger_memory_logger):
             time.sleep(1)
 
     assert second_metrics is not None
+    second_has_cache_creation_split = (
+        "prompt_cache_creation_5m_tokens" in second_metrics or "prompt_cache_creation_1h_tokens" in second_metrics
+    )
+    if second_has_cache_creation_split:
+        assert "prompt_cache_creation_tokens" not in second_metrics
     assert second_metrics["prompt_cached_tokens"] > 0
     assert second_metrics["prompt_tokens"] >= second_metrics["prompt_cached_tokens"]
-    assert second_metrics["total_tokens"] == second_metrics["prompt_tokens"] + second_metrics["completion_tokens"]
+    assert second_metrics["tokens"] == second_metrics["prompt_tokens"] + second_metrics["completion_tokens"]
 
 
 @pytest.mark.vcr