fix(llm): add fallback extraction for reasoning traces from <think> tags (#1474)

Pouyanpi · tgasser-nv · commit d2413233d9bf · 2025-10-28T16:12:07.000-05:00
Adds a compatibility layer for LLM providers that don't properly populate reasoning_content in additional_kwargs. When reasoning_content is missing or empty, the system now falls back to extracting the reasoning trace from <think>...</think> tags in the response content and strips the tagged block (the tags and the reasoning they enclose) from the final output.

This fixes compatibility with certain NVIDIA models (e.g., nvidia/llama-3.3-nemotron-super-49b-v1.5) in langchain-nvidia-ai-endpoints that include reasoning traces in <think> tags but fail to populate the reasoning_content field.

With this change, all reasoning models using ChatNVIDIA should expose reasoning content consistently through the same interface.
diff --git a/nemoguardrails/actions/llm/utils.py b/nemoguardrails/actions/llm/utils.py
@@ -13,9 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import logging
 import re
 from typing import Any, Dict, List, Optional, Sequence, Union
 
+logger = logging.getLogger(__name__)
+
 from langchain.base_language import BaseLanguageModel
 from langchain.callbacks.base import AsyncCallbackHandler, BaseCallbackManager
 from langchain_core.runnables import RunnableConfig
@@ -238,15 +241,78 @@ def _convert_messages_to_langchain_format(prompt: List[dict]) -> List:
 
 
 def _store_reasoning_traces(response) -> None:
+    """Store reasoning traces from response in context variable.
+
+    Extracts reasoning content from response.additional_kwargs["reasoning_content"]
+    if available. Otherwise, falls back to extracting from <think> tags in the
+    response content (and removes the tags from content).
+
+    Args:
+        response: The LLM response object
+    """
+
+    reasoning_content = _extract_reasoning_content(response)
+
+    if not reasoning_content:
+        # Some LLM providers (e.g., certain NVIDIA models) embed reasoning in <think> tags
+        # instead of properly populating reasoning_content in additional_kwargs, so we need
+        # both extraction methods to support different provider implementations.
+        reasoning_content = _extract_and_remove_think_tags(response)
+
+    if reasoning_content:
+        reasoning_trace_var.set(reasoning_content)
+
+
+def _extract_reasoning_content(response):
     if hasattr(response, "additional_kwargs"):
         additional_kwargs = response.additional_kwargs
         if (
             isinstance(additional_kwargs, dict)
             and "reasoning_content" in additional_kwargs
         ):
-            reasoning_content = additional_kwargs["reasoning_content"]
-            if reasoning_content:
-                reasoning_trace_var.set(reasoning_content)
+            return additional_kwargs["reasoning_content"]
+    return None
+
+
+def _extract_and_remove_think_tags(response) -> Optional[str]:
+    """Extract reasoning from <think> tags and remove them from `response.content`.
+
+    This function looks for <think>...</think> tags in the response content,
+    and if found, extracts the reasoning content inside the tags. It has a side-effect:
+    it removes the full reasoning trace and tags from response.content.
+
+    Args:
+        response: The LLM response object
+
+    Returns:
+        The extracted reasoning content, or None if no <think> tags found
+    """
+    if not hasattr(response, "content"):
+        return None
+
+    content = response.content
+    has_opening_tag = "<think>" in content
+    has_closing_tag = "</think>" in content
+
+    if not has_opening_tag and not has_closing_tag:
+        return None
+
+    if has_opening_tag != has_closing_tag:
+        logger.warning(
+            "Malformed <think> tags detected: missing %s tag. "
+            "Skipping reasoning extraction to prevent corrupted content.",
+            "closing" if has_opening_tag else "opening",
+        )
+        return None
+
+    match = re.search(r"<think>(.*?)</think>", content, re.DOTALL)
+    if match:
+        reasoning_content = match.group(1).strip()
+        response.content = re.sub(
+            r"<think>.*?</think>", "", content, flags=re.DOTALL
+        ).strip()
+        return reasoning_content
+    return None
 
 
 def _store_tool_calls(response) -> None:
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -22,5 +22,15 @@
 )
 
 
+@pytest.fixture(autouse=True)
+def reset_reasoning_trace_var():
+    """Reset reasoning_trace_var before each test to prevent state leakage."""
+    from nemoguardrails.context import reasoning_trace_var
+
+    reasoning_trace_var.set(None)
+    yield
+    reasoning_trace_var.set(None)
+
+
 def pytest_configure(config):
     patch("prompt_toolkit.PromptSession", autospec=True).start()
diff --git a/tests/test_actions_llm_utils.py b/tests/test_actions_llm_utils.py
@@ -13,7 +13,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from nemoguardrails.actions.llm.utils import _infer_provider_from_module
+from nemoguardrails.actions.llm.utils import (
+    _extract_and_remove_think_tags,
+    _infer_provider_from_module,
+    _store_reasoning_traces,
+)
+from nemoguardrails.context import reasoning_trace_var
 
 
 class MockOpenAILLM:
@@ -123,3 +128,179 @@ class Wrapper3(Wrapper2):
     llm = Wrapper3()
     provider = _infer_provider_from_module(llm)
     assert provider == "anthropic"
+
+
+class MockResponse:
+    def __init__(self, content="", additional_kwargs=None):
+        self.content = content
+        self.additional_kwargs = additional_kwargs or {}
+
+
+def test_store_reasoning_traces_from_additional_kwargs():
+    reasoning_trace_var.set(None)
+
+    response = MockResponse(
+        content="The answer is 42",
+        additional_kwargs={"reasoning_content": "Let me think about this..."},
+    )
+
+    _store_reasoning_traces(response)
+
+    assert reasoning_trace_var.get() == "Let me think about this..."
+
+
+def test_store_reasoning_traces_from_think_tags():
+    reasoning_trace_var.set(None)
+
+    response = MockResponse(
+        content="<think>Let me think about this...</think>The answer is 42"
+    )
+
+    _store_reasoning_traces(response)
+
+    assert reasoning_trace_var.get() == "Let me think about this..."
+    assert response.content == "The answer is 42"
+
+
+def test_store_reasoning_traces_multiline_think_tags():
+    reasoning_trace_var.set(None)
+
+    response = MockResponse(
+        content="<think>Step 1: Analyze the problem\nStep 2: Consider options\nStep 3: Choose solution</think>The answer is 42"
+    )
+
+    _store_reasoning_traces(response)
+
+    assert (
+        reasoning_trace_var.get()
+        == "Step 1: Analyze the problem\nStep 2: Consider options\nStep 3: Choose solution"
+    )
+    assert response.content == "The answer is 42"
+
+
+def test_store_reasoning_traces_prefers_additional_kwargs():
+    reasoning_trace_var.set(None)
+
+    response = MockResponse(
+        content="<think>This should not be used</think>The answer is 42",
+        additional_kwargs={"reasoning_content": "This should be used"},
+    )
+
+    _store_reasoning_traces(response)
+
+    assert reasoning_trace_var.get() == "This should be used"
+
+
+def test_store_reasoning_traces_no_reasoning_content():
+    reasoning_trace_var.set(None)
+
+    response = MockResponse(content="The answer is 42")
+
+    _store_reasoning_traces(response)
+
+    assert reasoning_trace_var.get() is None
+
+
+def test_store_reasoning_traces_empty_reasoning_content():
+    reasoning_trace_var.set(None)
+
+    response = MockResponse(
+        content="The answer is 42", additional_kwargs={"reasoning_content": ""}
+    )
+
+    _store_reasoning_traces(response)
+
+    assert reasoning_trace_var.get() is None
+
+
+def test_store_reasoning_traces_incomplete_think_tags():
+    reasoning_trace_var.set(None)
+
+    response = MockResponse(content="<think>This is incomplete")
+
+    _store_reasoning_traces(response)
+
+    assert reasoning_trace_var.get() is None
+
+
+def test_store_reasoning_traces_no_content_attribute():
+    reasoning_trace_var.set(None)
+
+    class ResponseWithoutContent:
+        def __init__(self):
+            self.additional_kwargs = {}
+
+    response = ResponseWithoutContent()
+
+    _store_reasoning_traces(response)
+
+    assert reasoning_trace_var.get() is None
+
+
+def test_store_reasoning_traces_removes_think_tags_with_whitespace():
+    reasoning_trace_var.set(None)
+
+    response = MockResponse(
+        content="  <think>reasoning here</think>  \n\n  Final answer  "
+    )
+
+    _store_reasoning_traces(response)
+
+    assert reasoning_trace_var.get() == "reasoning here"
+    assert response.content == "Final answer"
+
+
+def test_extract_and_remove_think_tags_basic():
+    response = MockResponse(content="<think>reasoning</think>answer")
+
+    result = _extract_and_remove_think_tags(response)
+
+    assert result == "reasoning"
+    assert response.content == "answer"
+
+
+def test_extract_and_remove_think_tags_multiline():
+    response = MockResponse(content="<think>line1\nline2\nline3</think>final answer")
+
+    result = _extract_and_remove_think_tags(response)
+
+    assert result == "line1\nline2\nline3"
+    assert response.content == "final answer"
+
+
+def test_extract_and_remove_think_tags_no_tags():
+    response = MockResponse(content="just a normal response")
+
+    result = _extract_and_remove_think_tags(response)
+
+    assert result is None
+    assert response.content == "just a normal response"
+
+
+def test_extract_and_remove_think_tags_incomplete():
+    response = MockResponse(content="<think>incomplete")
+
+    result = _extract_and_remove_think_tags(response)
+
+    assert result is None
+    assert response.content == "<think>incomplete"
+
+
+def test_extract_and_remove_think_tags_no_content_attribute():
+    class ResponseWithoutContent:
+        pass
+
+    response = ResponseWithoutContent()
+
+    result = _extract_and_remove_think_tags(response)
+
+    assert result is None
+
+
+def test_extract_and_remove_think_tags_wrong_order():
+    response = MockResponse(content="</think> text here <think>")
+
+    result = _extract_and_remove_think_tags(response)
+
+    assert result is None
+    assert response.content == "</think> text here <think>"
diff --git a/tests/test_reasoning_trace_extraction.py b/tests/test_reasoning_trace_extraction.py
@@ -304,3 +304,94 @@ async def test_reasoning_content_with_other_additional_kwargs(self):
         assert stored_trace == test_reasoning
 
         reasoning_trace_var.set(None)
+
+    @pytest.mark.asyncio
+    async def test_llm_call_extracts_reasoning_from_think_tags(self):
+        test_reasoning = "Let me analyze this step by step"
+
+        mock_llm = AsyncMock()
+        mock_response = AIMessage(
+            content=f"<think>{test_reasoning}</think>The answer is 42",
+            additional_kwargs={},
+        )
+        mock_llm.ainvoke = AsyncMock(return_value=mock_response)
+
+        from nemoguardrails.actions.llm.utils import llm_call
+
+        reasoning_trace_var.set(None)
+        result = await llm_call(mock_llm, "What is the answer?")
+
+        assert result == "The answer is 42"
+        assert "<think>" not in result
+        stored_trace = reasoning_trace_var.get()
+        assert stored_trace == test_reasoning
+
+        reasoning_trace_var.set(None)
+
+    @pytest.mark.asyncio
+    async def test_llm_call_prefers_additional_kwargs_over_think_tags(self):
+        reasoning_from_kwargs = "This should be used"
+        reasoning_from_tags = "This should be ignored"
+
+        mock_llm = AsyncMock()
+        mock_response = AIMessage(
+            content=f"<think>{reasoning_from_tags}</think>Response",
+            additional_kwargs={"reasoning_content": reasoning_from_kwargs},
+        )
+        mock_llm.ainvoke = AsyncMock(return_value=mock_response)
+
+        from nemoguardrails.actions.llm.utils import llm_call
+
+        reasoning_trace_var.set(None)
+        result = await llm_call(mock_llm, "Query")
+
+        assert result == f"<think>{reasoning_from_tags}</think>Response"
+        stored_trace = reasoning_trace_var.get()
+        assert stored_trace == reasoning_from_kwargs
+
+        reasoning_trace_var.set(None)
+
+    @pytest.mark.asyncio
+    async def test_llm_call_extracts_multiline_reasoning_from_think_tags(self):
+        multiline_reasoning = """Step 1: Understand the question
+Step 2: Break down the problem
+Step 3: Formulate the answer"""
+
+        mock_llm = AsyncMock()
+        mock_response = AIMessage(
+            content=f"<think>{multiline_reasoning}</think>Final answer",
+            additional_kwargs={},
+        )
+        mock_llm.ainvoke = AsyncMock(return_value=mock_response)
+
+        from nemoguardrails.actions.llm.utils import llm_call
+
+        reasoning_trace_var.set(None)
+        result = await llm_call(mock_llm, "Question")
+
+        assert result == "Final answer"
+        assert "<think>" not in result
+        stored_trace = reasoning_trace_var.get()
+        assert stored_trace == multiline_reasoning
+
+        reasoning_trace_var.set(None)
+
+    @pytest.mark.asyncio
+    async def test_llm_call_handles_incomplete_think_tags(self):
+        mock_llm = AsyncMock()
+        mock_response = AIMessage(
+            content="<think>This is incomplete",
+            additional_kwargs={},
+        )
+        mock_llm.ainvoke = AsyncMock(return_value=mock_response)
+
+        from nemoguardrails.actions.llm.utils import llm_call
+
+        reasoning_trace_var.set(None)
+        result = await llm_call(mock_llm, "Query")
+
+        assert result == "<think>This is incomplete"
+        stored_trace = reasoning_trace_var.get()
+        assert stored_trace is None
+
+        reasoning_trace_var.set(None)