From 27b1daf5d64a186f99dc169e3a399cb21888193b Mon Sep 17 00:00:00 2001 From: Nandan Priyadarshi Date: Sat, 28 Mar 2026 23:51:59 +0800 Subject: [PATCH] fix: [#90] Add OpenTelemetry tracing support as optional backend Add OpenTelemetry-based tracing alongside the existing JSON file recorder. Key changes: - New trae_agent/utils/otel_recorder.py module with: - setup_otel_tracing() for configuring OTLP exporter - OTelTrajectoryRecorder that creates spans for agent runs, LLM calls, and agent steps - Graceful fallback when opentelemetry packages are not installed - Modified Agent.__init__() to detect OTEL_EXPORTER_OTLP_ENDPOINT env var and initialize OTel tracing when available - Modified Agent.run() to start/finalize OTel traces around agent execution - Added 'otel' optional dependency group in pyproject.toml: uv sync --extra otel Usage: OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 trae-cli run "..." The JSON file recorder continues to work unchanged - OTel is purely additive and requires explicit opt-in via env var + installed packages. --- pyproject.toml | 5 + trae_agent/agent/agent.py | 35 ++++++ trae_agent/utils/otel_recorder.py | 171 ++++++++++++++++++++++++++++++ 3 files changed, 211 insertions(+) create mode 100644 trae_agent/utils/otel_recorder.py diff --git a/pyproject.toml b/pyproject.toml index 35993509..46818fb7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,6 +40,11 @@ evaluation = [ "pexpect>=4.9.0", "unidiff>=0.7.5", ] +otel = [ + "opentelemetry-api>=1.20.0", + "opentelemetry-sdk>=1.20.0", + "opentelemetry-exporter-otlp-proto-grpc>=1.20.0", +] [project.scripts] trae-cli = "trae_agent.cli:main" diff --git a/trae_agent/agent/agent.py b/trae_agent/agent/agent.py index bbca94f0..54b0cf03 100644 --- a/trae_agent/agent/agent.py +++ b/trae_agent/agent/agent.py @@ -1,5 +1,6 @@ import asyncio import contextlib +import os from enum import Enum from trae_agent.utils.cli.cli_console import CLIConsole @@ -34,6 +35,24 @@ def __init__( self.trajectory_recorder = TrajectoryRecorder() self.trajectory_file = self.trajectory_recorder.get_trajectory_path() + # Set up OpenTelemetry tracing (optional, enabled via env var or OTEL config) + self._otel_recorder = None + try: + from trae_agent.utils.otel_recorder import ( + OTelTrajectoryRecorder, + is_otel_available, + setup_otel_tracing, + ) + + otel_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT") + if is_otel_available() and otel_endpoint: + tracer = setup_otel_tracing(service_name="trae-agent", endpoint=otel_endpoint) + if tracer: + self._otel_recorder = OTelTrajectoryRecorder(tracer=tracer) + except Exception: + # OpenTelemetry is optional – silently skip if unavailable + pass + match self.agent_type: case AgentType.TraeAgent: if config.trae_agent is None: @@ -83,6 +102,15 @@ async def run( task_details[key.capitalize()] = value self.agent.cli_console.print_task_details(task_details) + # Start OpenTelemetry trace if available + if self._otel_recorder: + self._otel_recorder.start_recording( + task=task, + provider=self.agent_config.model.model_provider.provider, + model=self.agent_config.model.model, + max_steps=self.agent_config.max_steps, + ) + cli_console_task = ( asyncio.create_task(self.agent.cli_console.start()) if self.agent.cli_console else None ) @@ -97,4 +125,11 @@ async def run( if cli_console_task: await cli_console_task + # Finalize OpenTelemetry trace + if self._otel_recorder: + self._otel_recorder.finalize_recording( + success=execution.success, + final_result=execution.final_result, + ) + return execution diff --git a/trae_agent/utils/otel_recorder.py b/trae_agent/utils/otel_recorder.py new file mode 100644 index 00000000..301f9aac --- /dev/null +++ b/trae_agent/utils/otel_recorder.py @@ -0,0 +1,171 @@ +# Copyright (c) 2025 ByteDance Ltd. and/or its affiliates +# SPDX-License-Identifier: MIT + +# pyright: reportExplicitAny=false +# pyright: reportArgumentType=false +# pyright: reportAny=false + +"""OpenTelemetry-based tracing for Trae Agent. + +This module provides an alternative trajectory recorder that emits +OpenTelemetry spans instead of (or in addition to) writing JSON files. +It is designed to be a drop-in enhancement: when enabled, spans are +created for each LLM interaction and agent step, allowing integration +with any OTLP-compatible backend (Jaeger, Zipkin, Grafana Tempo, etc.). + +Enable via environment variables: + OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4317 + OTEL_SERVICE_NAME=trae-agent + +Or programmatically by calling setup_otel_tracing() before agent execution. +""" + +import os +from typing import Any + +try: + from opentelemetry import trace + from opentelemetry.sdk.trace import TracerProvider + from opentelemetry.sdk.trace.export import BatchSpanProcessor + from opentelemetry.sdk.resources import Resource, SERVICE_NAME + + _OTEL_AVAILABLE = True +except ImportError: + _OTEL_AVAILABLE = False + + +def is_otel_available() -> bool: + """Check if OpenTelemetry packages are installed.""" + return _OTEL_AVAILABLE + + +def setup_otel_tracing( + service_name: str = "trae-agent", + endpoint: str | None = None, +) -> "trace.Tracer | None": + """Initialise OpenTelemetry tracing. + + Args: + service_name: Logical service name shown in the tracing backend. + endpoint: OTLP gRPC endpoint (e.g. ``http://localhost:4317``). + Falls back to the ``OTEL_EXPORTER_OTLP_ENDPOINT`` + environment variable. + + Returns: + A ``Tracer`` instance, or ``None`` if OpenTelemetry is not installed. + """ + if not _OTEL_AVAILABLE: + return None + + resource = Resource.create({SERVICE_NAME: service_name}) + provider = TracerProvider(resource=resource) + + # Use OTLP gRPC exporter if endpoint is available + otlp_endpoint = endpoint or os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT") + if otlp_endpoint: + try: + from opentelemetry.exporter.otlp.proto.grpc.trace_exporter import OTLPSpanExporter + + exporter = OTLPSpanExporter(endpoint=otlp_endpoint) + provider.add_span_processor(BatchSpanProcessor(exporter)) + except ImportError: + # Fallback: just record spans in memory (no export) + pass + + trace.set_tracer_provider(provider) + return trace.get_tracer("trae-agent") + + +class OTelTrajectoryRecorder: + """Records agent trajectory as OpenTelemetry spans. + + Can be used alongside or instead of the JSON-based + ``TrajectoryRecorder``. Each agent run produces a root span; + LLM interactions and tool calls become child spans. + """ + + def __init__(self, tracer: "trace.Tracer | None" = None): + if not _OTEL_AVAILABLE: + raise RuntimeError( + "OpenTelemetry packages are not installed. " + "Install with: uv add opentelemetry-api opentelemetry-sdk " + "opentelemetry-exporter-otlp-proto-grpc" + ) + self._tracer = tracer or trace.get_tracer("trae-agent") + self._root_span: Any | None = None + self._step_spans: dict[int, Any] = {} + + def start_recording(self, task: str, provider: str, model: str, max_steps: int) -> None: + """Start the root span for an agent run.""" + self._root_span = self._tracer.start_span( + "agent.run", + attributes={ + "agent.task": task, + "agent.provider": provider, + "agent.model": model, + "agent.max_steps": max_steps, + }, + ) + + def record_llm_interaction( + self, + messages: list[Any], + response: Any, + provider: str, + model: str, + tools: list[Any] | None = None, + ) -> None: + """Record an LLM call as a span under the current root span.""" + if not self._root_span: + return + + ctx = trace.set_span_in_context(self._root_span) + with self._tracer.start_as_current_span( + "llm.call", + context=ctx, + attributes={ + "llm.provider": provider, + "llm.model": model, + "llm.input_message_count": len(messages), + "llm.output_tokens": response.usage.output_tokens if response.usage else 0, + "llm.input_tokens": response.usage.input_tokens if response.usage else 0, + "llm.finish_reason": response.finish_reason or "", + "llm.tool_call_count": len(response.tool_calls) if response.tool_calls else 0, + }, + ): + pass # span auto-ends + + def record_agent_step( + self, + step_number: int, + state: str, + tool_calls: list[Any] | None = None, + tool_results: list[Any] | None = None, + error: str | None = None, + **_kwargs: Any, + ) -> None: + """Record an agent step as a span.""" + if not self._root_span: + return + + ctx = trace.set_span_in_context(self._root_span) + attrs: dict[str, Any] = { + "step.number": step_number, + "step.state": state, + } + if tool_calls: + attrs["step.tool_call_count"] = len(tool_calls) + attrs["step.tool_names"] = [tc.name for tc in tool_calls] + if error: + attrs["step.error"] = error + + span = self._tracer.start_as_current_span("agent.step", context=ctx, attributes=attrs) + self._step_spans[step_number] = span + + def finalize_recording(self, success: bool, final_result: str | None = None) -> None: + """End the root span.""" + if self._root_span: + self._root_span.set_attribute("agent.success", success) + if final_result: + self._root_span.set_attribute("agent.final_result", final_result[:2048]) + self._root_span.end()