diff --git a/sdks/python/pyproject.toml b/sdks/python/pyproject.toml index 0e1d03e..172b141 100644 --- a/sdks/python/pyproject.toml +++ b/sdks/python/pyproject.toml @@ -24,6 +24,7 @@ classifiers = [ ] dependencies = [ # LLM access is via LangChain; the langchain-* packages pull in provider SDKs as needed. + "httpx>=0.27.0", "pydantic>=2.0.0", "textstat>=0.7.0", "langchain-anthropic>=0.2.0", diff --git a/sdks/python/src/learning_commons_evaluators/evaluators/base.py b/sdks/python/src/learning_commons_evaluators/evaluators/base.py index 9954c98..998988b 100644 --- a/sdks/python/src/learning_commons_evaluators/evaluators/base.py +++ b/sdks/python/src/learning_commons_evaluators/evaluators/base.py @@ -42,6 +42,7 @@ TokenUsage, prompt_settings_to_extras_value, ) +from learning_commons_evaluators.telemetry import schedule_send_telemetry InputT = TypeVar("InputT", bound=EvaluationInput) OutputT = TypeVar("OutputT", bound=EvaluationResult) @@ -145,8 +146,7 @@ async def evaluate( "evaluation end", extra={"evaluation_metadata": evaluation_metadata}, ) - # TODO: add full input to telemetry if enabled - # TODO: send_telemetry(evaluation_metadata) + schedule_send_telemetry(evaluation_metadata, input, self.config) def evaluate_sync( self, diff --git a/sdks/python/src/learning_commons_evaluators/schemas/__init__.py b/sdks/python/src/learning_commons_evaluators/schemas/__init__.py index f28b750..123781e 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/__init__.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/__init__.py @@ -42,6 +42,13 @@ from learning_commons_evaluators.schemas.text_complexity import ( TextComplexityEvaluationInput, ) +from learning_commons_evaluators.schemas.ts_telemetry import ( + EvaluationTelemetryStatus, + TelemetryEvent, + TelemetryMetadataPayload, + TelemetryStageDetail, + TelemetryTokenUsage, +) __all__ = [ "AnyInputSpec", @@ -56,6 +63,7 @@ "EvaluationMetadata", "EvaluationResult", "EvaluationSettings", + "EvaluationTelemetryStatus", "EvaluatorMetadata", "EvaluatorMaturity", "GradeInputField", @@ -67,6 +75,10 @@ "PROMPT_STEP_EXTRA_TOKEN_USAGE", "Status", "StepMetadata", + "TelemetryEvent", + "TelemetryMetadataPayload", + "TelemetryStageDetail", + "TelemetryTokenUsage", "TextComplexityEvaluationInput", "TextInputField", "TokenUsage", diff --git a/sdks/python/src/learning_commons_evaluators/schemas/config.py b/sdks/python/src/learning_commons_evaluators/schemas/config.py index 8d1d035..22d5e6d 100644 --- a/sdks/python/src/learning_commons_evaluators/schemas/config.py +++ b/sdks/python/src/learning_commons_evaluators/schemas/config.py @@ -6,6 +6,7 @@ create_config_no_telemetry, create_config_telemetry_with_full_input). """ +import uuid from dataclasses import dataclass, field from enum import Enum @@ -13,6 +14,11 @@ from learning_commons_evaluators.logger import Logger, get_logger +DEFAULT_TELEMETRY_EVENTS_ENDPOINT = "https://api.learningcommons.org/evaluators-telemetry/v1/events" + +# Shared per process so multiple :class:`EvaluatorConfig` instances derive the same client id. +_PROCESS_CLIENT_ID_SEED = uuid.uuid4() + # --- LLM provider configs (for LLM calls in prompt steps) --- @@ -79,6 +85,7 @@ class EvaluationSettings(BaseModel): class TelemetryConfig: """Config for telemetry.""" + endpoint: str = DEFAULT_TELEMETRY_EVENTS_ENDPOINT telemetry_partner_id: str | None = None send_full_input_with_telemetry: bool = False @@ -106,6 +113,11 @@ class EvaluatorConfig: logger: Logger = field(default_factory=get_logger) telemetry: TelemetryConfig = field(default_factory=TelemetryConfig) + # Temporary until we finalize the telemetry API key/client id strategy. + #: UUID v5 namespace for deriving ``X-Client-ID`` when ``telemetry_partner_id`` is an API key. + #: Defaults to a single per-process seed so all configs in one run share the same derived id. + client_id_seed: uuid.UUID = field(default=_PROCESS_CLIENT_ID_SEED) + def create_config( *, @@ -129,38 +141,95 @@ def create_config( ) -def create_config_no_telemetry( +def create_config_telemetry_with_full_input( *, google_llm_provider_config: GoogleLLMProviderConfig | None = None, openai_llm_provider_config: OpenAILLMProviderConfig | None = None, anthropic_llm_provider_config: AnthropicLLMProviderConfig | None = None, logger: Logger | None = None, + telemetry_partner_id: str, ) -> EvaluatorConfig: - """Create evaluator config with telemetry disabled.""" + """Create evaluator config with telemetry and full input sent with telemetry.""" return EvaluatorConfig( google_llm_provider_config=google_llm_provider_config, openai_llm_provider_config=openai_llm_provider_config, anthropic_llm_provider_config=anthropic_llm_provider_config, logger=get_logger() if logger is None else logger, - telemetry=TelemetryConfig(telemetry_partner_id=None, send_full_input_with_telemetry=False), + telemetry=TelemetryConfig( + telemetry_partner_id=telemetry_partner_id, send_full_input_with_telemetry=True + ), ) -def create_config_telemetry_with_full_input( +def create_config_anonymous_telemetry( *, google_llm_provider_config: GoogleLLMProviderConfig | None = None, openai_llm_provider_config: OpenAILLMProviderConfig | None = None, anthropic_llm_provider_config: AnthropicLLMProviderConfig | None = None, logger: Logger | None = None, - telemetry_partner_id: str, + send_full_input_with_telemetry: bool = False, ) -> EvaluatorConfig: - """Create evaluator config with telemetry and full input sent with telemetry.""" + """Create evaluator config with anonymous telemetry.""" + anonymous_telemetry_id = str(uuid.uuid4()) + return create_config_with_telemetry_config( + google_llm_provider_config=google_llm_provider_config, + openai_llm_provider_config=openai_llm_provider_config, + anthropic_llm_provider_config=anthropic_llm_provider_config, + logger=logger, + telemetry_config=TelemetryConfig( + telemetry_partner_id=anonymous_telemetry_id, + send_full_input_with_telemetry=send_full_input_with_telemetry, + ), + ) + + +def create_config_anonymous_telemetry_with_full_input( + *, + google_llm_provider_config: GoogleLLMProviderConfig | None = None, + openai_llm_provider_config: OpenAILLMProviderConfig | None = None, + anthropic_llm_provider_config: AnthropicLLMProviderConfig | None = None, + logger: Logger | None = None, +) -> EvaluatorConfig: + """Create evaluator config with anonymous telemetry and full input sent with telemetry.""" + return create_config_anonymous_telemetry( + google_llm_provider_config=google_llm_provider_config, + openai_llm_provider_config=openai_llm_provider_config, + anthropic_llm_provider_config=anthropic_llm_provider_config, + logger=logger, + send_full_input_with_telemetry=True, + ) + + +def create_config_with_telemetry_config( + *, + google_llm_provider_config: GoogleLLMProviderConfig | None = None, + openai_llm_provider_config: OpenAILLMProviderConfig | None = None, + anthropic_llm_provider_config: AnthropicLLMProviderConfig | None = None, + logger: Logger | None = None, + telemetry_config: TelemetryConfig, +) -> EvaluatorConfig: + """Create evaluator config with telemetry. telemetry_config is required.""" return EvaluatorConfig( google_llm_provider_config=google_llm_provider_config, openai_llm_provider_config=openai_llm_provider_config, anthropic_llm_provider_config=anthropic_llm_provider_config, logger=get_logger() if logger is None else logger, - telemetry=TelemetryConfig( - telemetry_partner_id=telemetry_partner_id, send_full_input_with_telemetry=True - ), + telemetry=telemetry_config, + ) + + +def create_config_no_telemetry( + *, + google_llm_provider_config: GoogleLLMProviderConfig | None = None, + openai_llm_provider_config: OpenAILLMProviderConfig | None = None, + anthropic_llm_provider_config: AnthropicLLMProviderConfig | None = None, + logger: Logger | None = None, +) -> EvaluatorConfig: + """Create evaluator config with telemetry disabled.""" + return EvaluatorConfig( + google_llm_provider_config=google_llm_provider_config, + openai_llm_provider_config=openai_llm_provider_config, + anthropic_llm_provider_config=anthropic_llm_provider_config, + logger=get_logger() if logger is None else logger, + telemetry=TelemetryConfig(telemetry_partner_id=None, send_full_input_with_telemetry=False), ) diff --git a/sdks/python/src/learning_commons_evaluators/schemas/ts_telemetry.py b/sdks/python/src/learning_commons_evaluators/schemas/ts_telemetry.py new file mode 100644 index 0000000..2374304 --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/schemas/ts_telemetry.py @@ -0,0 +1,69 @@ +"""Wire types aligned with ``sdks/typescript/src/telemetry/types.ts``. + +Hand-maintained; keep in sync with the TypeScript SDK until a shared schema exists. +""" + +from __future__ import annotations + +from typing import Literal + +from pydantic import BaseModel, ConfigDict + +__all__ = [ + "EvaluationTelemetryStatus", + "TelemetryEvent", + "TelemetryMetadataPayload", + "TelemetryStageDetail", + "TelemetryTokenUsage", +] + +# Mirrors TS ``EvaluationStatus`` +EvaluationTelemetryStatus = Literal["success", "error"] + + +class TelemetryTokenUsage(BaseModel): + """Mirrors TS ``TokenUsage``.""" + + model_config = ConfigDict(extra="forbid") + + input_tokens: int + output_tokens: int + + +class TelemetryStageDetail(BaseModel): + """Mirrors TS ``StageDetail``.""" + + model_config = ConfigDict(extra="forbid") + + stage: str + provider: str + latency_ms: float + token_usage: TelemetryTokenUsage | None = None + schema_validation_failed: bool | None = None + + +class TelemetryMetadataPayload(BaseModel): + """Mirrors TS ``TelemetryMetadata``.""" + + model_config = ConfigDict(extra="forbid") + + stage_details: list[TelemetryStageDetail] | None = None + + +class TelemetryEvent(BaseModel): + """Mirrors TS ``TelemetryEvent`` (JSON field names match the TS interface).""" + + model_config = ConfigDict(extra="forbid") + + timestamp: str + sdk_version: str + evaluator_type: str + grade: str | None = None + status: EvaluationTelemetryStatus + error_code: str | None = None + latency_ms: float + text_length_chars: int + provider: str + token_usage: TelemetryTokenUsage | None = None + metadata: TelemetryMetadataPayload | None = None + input_text: str | None = None diff --git a/sdks/python/src/learning_commons_evaluators/telemetry/__init__.py b/sdks/python/src/learning_commons_evaluators/telemetry/__init__.py new file mode 100644 index 0000000..f9d850f --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/telemetry/__init__.py @@ -0,0 +1,121 @@ +"""Telemetry: schedule and send evaluation events (fire-and-forget HTTP POST).""" + +from __future__ import annotations + +import asyncio +import threading +import uuid +from concurrent.futures import ThreadPoolExecutor +from datetime import datetime, timezone + +import httpx + +from learning_commons_evaluators.schemas.config import EvaluatorConfig +from learning_commons_evaluators.schemas.evaluator import EvaluationInput +from learning_commons_evaluators.schemas.metadata import EvaluationMetadata +from learning_commons_evaluators.telemetry.adapter import evaluation_to_typescript_telemetry_event +from learning_commons_evaluators.telemetry.utils import client_id_from_seed, iso_utc_z + +__all__ = [ + "evaluation_to_typescript_telemetry_event", + "schedule_send_telemetry", + "send_telemetry", + "should_send_telemetry", +] + +_TELEMETRY_EXECUTOR: ThreadPoolExecutor | None = None +_TELEMETRY_EXECUTOR_LOCK = threading.Lock() + + +def _get_telemetry_executor() -> ThreadPoolExecutor: + global _TELEMETRY_EXECUTOR + with _TELEMETRY_EXECUTOR_LOCK: + if _TELEMETRY_EXECUTOR is None: + _TELEMETRY_EXECUTOR = ThreadPoolExecutor( + max_workers=2, + thread_name_prefix="lc-telemetry", + ) + return _TELEMETRY_EXECUTOR + + +def should_send_telemetry(config: EvaluatorConfig) -> bool: + """Return True when telemetry is configured with a non-empty partner / client id.""" + partner_id = config.telemetry.telemetry_partner_id + return bool(partner_id and partner_id.strip()) + + +def _is_uuid(value: str | None) -> bool: + if value is None: + return False + try: + uuid.UUID(value) + return True + except (ValueError, TypeError, AttributeError): + return False + + +async def send_telemetry( + evaluation_metadata: EvaluationMetadata, + inp: EvaluationInput | None, + config: EvaluatorConfig, +) -> None: + """POST a TypeScript-shaped telemetry JSON payload. Never raises to callers (logs failures).""" + if not should_send_telemetry(config): + return + + try: + partner_id = config.telemetry.telemetry_partner_id + assert ( + partner_id is not None + ) # for mypy: ``should_send_telemetry`` guarantees non-empty after strip. + telemetry_partner_id = partner_id.strip() + + event = evaluation_to_typescript_telemetry_event(evaluation_metadata, inp, config) + # TS SDK sets timestamp at send time (`new Date().toISOString()`), not evaluation start. + event = event.model_copy(update={"timestamp": iso_utc_z(datetime.now(timezone.utc))}) + payload = event.model_dump(mode="json", exclude_none=True) + + api_key = telemetry_partner_id if not _is_uuid(telemetry_partner_id) else None + client_id = ( + telemetry_partner_id + if _is_uuid(telemetry_partner_id) + else client_id_from_seed(telemetry_partner_id, config.client_id_seed) + ) + + headers: dict[str, str] = { + "Content-Type": "application/json", + "X-Client-ID": client_id, + } + if api_key is not None: + headers["X-API-Key"] = api_key + + timeout = httpx.Timeout(5.0) + async with httpx.AsyncClient(timeout=timeout) as client: + response = await client.post(config.telemetry.endpoint, json=payload, headers=headers) + if response.is_error: + # Log status only; response bodies may echo input text or other sensitive data. + config.logger.warning( + "telemetry send failed: HTTP %s", + response.status_code, + ) + except Exception as e: + # Log exception type only; ``str(e)`` may include payload fields (e.g. input_text). + config.logger.warning( + "telemetry send failed: %s", + type(e).__qualname__, + ) + + +def schedule_send_telemetry( + evaluation_metadata: EvaluationMetadata, + inp: EvaluationInput | None, + config: EvaluatorConfig, +) -> None: + """Fire-and-forget: run :func:`send_telemetry` on a shared worker when telemetry is enabled.""" + if not should_send_telemetry(config): + return + + def _run() -> None: + asyncio.run(send_telemetry(evaluation_metadata, inp, config)) + + _get_telemetry_executor().submit(_run) diff --git a/sdks/python/src/learning_commons_evaluators/telemetry/adapter.py b/sdks/python/src/learning_commons_evaluators/telemetry/adapter.py new file mode 100644 index 0000000..7d62b17 --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/telemetry/adapter.py @@ -0,0 +1,189 @@ +"""Map Python evaluation context to TypeScript-shaped :class:`~learning_commons_evaluators.schemas.ts_telemetry.TelemetryEvent`. + +Information intentionally dropped or simplified when adapting: + +- Full :class:`~learning_commons_evaluators.schemas.metadata.EvaluatorMetadata` (name, + version, description, maturity, input specs) is not sent; only ``evaluator_metadata.id`` + becomes ``evaluator_type`` and ``evaluator_metadata.sdk_version`` maps to ``sdk_version``. +- The entire ``input_metadata`` dict is not reproduced; ``grade`` and ``text_length_chars`` + are inferred from common shapes when possible. +- :class:`~learning_commons_evaluators.schemas.metadata.Status` ``processing`` is mapped to + ``status`` ``"error"`` (non-terminal) the same as failures, since TS only allows + success/error. +- ``error_code`` is derived from ``error_details`` text, not a structured error taxonomy. +- :attr:`~learning_commons_evaluators.schemas.metadata.EvaluationMetadata.evaluation_settings` + is omitted. +- Per-step ``StepMetadata.error_details`` and most of ``extras`` beyond prompt settings / + token usage are not mapped to ``stage_details``. +""" + +from __future__ import annotations + +from typing import Any + +from learning_commons_evaluators.schemas.config import EvaluatorConfig, LLMProvider +from learning_commons_evaluators.schemas.evaluator import EvaluationInput +from learning_commons_evaluators.schemas.metadata import ( + PROMPT_STEP_EXTRA_PROMPT_SETTINGS, + PROMPT_STEP_EXTRA_TOKEN_USAGE, + EvaluationMetadata, + Status, + StepMetadata, +) +from learning_commons_evaluators.schemas.metadata import ( + TokenUsage as EvaluationTokenUsage, +) +from learning_commons_evaluators.schemas.ts_telemetry import ( + EvaluationTelemetryStatus, + TelemetryEvent, + TelemetryMetadataPayload, + TelemetryStageDetail, + TelemetryTokenUsage, +) +from learning_commons_evaluators.telemetry.utils import iso_utc_z + + +def _map_status(status: Status) -> EvaluationTelemetryStatus: + if status is Status.succeeded: + return "success" + return "error" + + +def _grade_from_input_metadata(input_metadata: dict[str, Any]) -> str | None: + gl = input_metadata.get("grade_level") + if gl is None: + return None + if isinstance(gl, dict) and "grade" in gl: + return str(gl["grade"]) + if isinstance(gl, (int, float, str)): + return str(gl) + return None + + +def _extract_primary_input_text(inp: EvaluationInput | None) -> str | None: + if inp is None: + return None + field = getattr(inp, "text", None) + if field is None: + return None + value = getattr(field, "value", None) + return value if isinstance(value, str) else None + + +def _text_length_chars( + input_metadata: dict[str, Any], + input_text: str | None, + inp: EvaluationInput | None, +) -> int: + if input_text is not None: + return len(input_text) + text_meta = input_metadata.get("text") + if isinstance(text_meta, dict): + for key in ("textLength", "length", "charCount"): + v = text_meta.get(key) + if isinstance(v, int): + return v + extracted = _extract_primary_input_text(inp) + if extracted is not None: + return len(extracted) + return 0 + + +def _error_code(error_details: str | None) -> str | None: + if not error_details: + return None + line = error_details.strip().split("\n", 1)[0].strip() + return line[:512] if line else None + + +def _provider_label(usage: EvaluationTokenUsage) -> str: + return f"{usage.provider_type.value}:{usage.model}" + + +def _aggregate_provider(total: dict[LLMProvider, EvaluationTokenUsage]) -> str: + if not total: + return "unknown" + parts = [_provider_label(u) for u in total.values()] + return " + ".join(sorted(parts)) + + +def _aggregate_token_usage( + total: dict[LLMProvider, EvaluationTokenUsage], +) -> TelemetryTokenUsage | None: + if not total: + return None + it = sum(u.input_tokens for u in total.values()) + ot = sum(u.output_tokens for u in total.values()) + return TelemetryTokenUsage(input_tokens=it, output_tokens=ot) + + +def _provider_from_step_extras(step: StepMetadata) -> str: + raw = step.extras.get(PROMPT_STEP_EXTRA_PROMPT_SETTINGS) + if isinstance(raw, dict): + pt = raw.get("provider_type") + model = raw.get("model") + if isinstance(pt, str) and isinstance(model, str): + return f"{pt}:{model}" + return "unknown" + + +def _token_usage_from_step_extras(step: StepMetadata) -> TelemetryTokenUsage | None: + raw = step.extras.get(PROMPT_STEP_EXTRA_TOKEN_USAGE) + if not isinstance(raw, dict): + return None + try: + it = int(raw["input_tokens"]) + ot = int(raw["output_tokens"]) + except (KeyError, TypeError, ValueError): + return None + return TelemetryTokenUsage(input_tokens=it, output_tokens=ot) + + +def _stage_details(evaluation_metadata: EvaluationMetadata) -> list[TelemetryStageDetail] | None: + if not evaluation_metadata.step_details: + return None + details: list[TelemetryStageDetail] = [] + for step_id, step in evaluation_metadata.step_details.items(): + details.append( + TelemetryStageDetail( + stage=step_id, + provider=_provider_from_step_extras(step), + latency_ms=step.processing_time_ms, + token_usage=_token_usage_from_step_extras(step), + ) + ) + return details or None + + +def evaluation_to_typescript_telemetry_event( + evaluation_metadata: EvaluationMetadata, + inp: EvaluationInput | None, + config: EvaluatorConfig, +) -> TelemetryEvent: + """Build a TS-shaped :class:`TelemetryEvent` from Python evaluation state.""" + input_text: str | None = None + if config.telemetry.send_full_input_with_telemetry and inp is not None: + input_text = _extract_primary_input_text(inp) + + meta = evaluation_metadata.input_metadata + grade = _grade_from_input_metadata(meta) + status = _map_status(evaluation_metadata.status) + stages = _stage_details(evaluation_metadata) + metadata_payload = ( + TelemetryMetadataPayload(stage_details=stages) if stages is not None else None + ) + + return TelemetryEvent( + timestamp=iso_utc_z(evaluation_metadata.timestamp), + sdk_version=evaluation_metadata.evaluator_metadata.sdk_version, + evaluator_type=evaluation_metadata.evaluator_metadata.id, + grade=grade, + status=status, + error_code=_error_code(evaluation_metadata.error_details) if status == "error" else None, + latency_ms=evaluation_metadata.processing_time_ms, + text_length_chars=_text_length_chars(meta, input_text, inp), + provider=_aggregate_provider(evaluation_metadata.total_token_usage), + token_usage=_aggregate_token_usage(evaluation_metadata.total_token_usage), + metadata=metadata_payload, + input_text=input_text, + ) diff --git a/sdks/python/src/learning_commons_evaluators/telemetry/utils.py b/sdks/python/src/learning_commons_evaluators/telemetry/utils.py new file mode 100644 index 0000000..5e1623e --- /dev/null +++ b/sdks/python/src/learning_commons_evaluators/telemetry/utils.py @@ -0,0 +1,22 @@ +"""Telemetry helpers.""" + +from __future__ import annotations + +import uuid +from datetime import datetime, timezone + + +def iso_utc_z(dt: datetime) -> str: + """Format *dt* as an ISO 8601 UTC string with a ``Z`` suffix (TS wire format).""" + s = dt.astimezone(timezone.utc).isoformat() + return s.replace("+00:00", "Z") + + +def client_id_from_seed(learning_commons_api_key: str, client_id_seed: uuid.UUID) -> str: + """Return a deterministic UUID string for this API key within the given namespace. + + ``client_id_seed`` is used as the RFC 4122 UUIDv5 *namespace*; the stripped API key is + the *name*. The same :class:`~learning_commons_evaluators.schemas.config.EvaluatorConfig` + (same seed + same telemetry partner API key) always yields the same ``X-Client-ID``. + """ + return str(uuid.uuid5(client_id_seed, learning_commons_api_key.strip())) diff --git a/sdks/python/tests/evaluators/test_base.py b/sdks/python/tests/evaluators/test_base.py index f595af9..c2370af 100644 --- a/sdks/python/tests/evaluators/test_base.py +++ b/sdks/python/tests/evaluators/test_base.py @@ -1,8 +1,8 @@ """Tests for :class:`~learning_commons_evaluators.evaluators.base.BaseEvaluator`. -Covers ``__init__``, ``evaluate`` / ``evaluate_sync``, metadata and settings override, -success/failure paths, ``update_total_token_usage``, ``execute_step``, and -``execute_prompt_chain_step``. +Covers ``__init__``, ``evaluate`` / ``evaluate_sync`` (metadata, settings override, +success/failure paths, ``schedule_send_telemetry`` wiring), ``update_total_token_usage``, +``execute_step``, and ``execute_prompt_chain_step``. ``EvaluationMetadata`` always uses ``input.input_metadata()`` (including when ``send_full_input_with_telemetry`` is enabled). Helpers use both a minimal stub evaluator and conventionality-oriented fixtures where useful. @@ -220,7 +220,12 @@ def test_evaluate_sets_metadata_from_input_metadata(self, stub_evaluator): assert result.metadata.input_metadata["text"] == {"textLength": 11} assert result.metadata.input_metadata["grade_level"] == {"grade": 3} - def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, stub_evaluator): + +class TestEvaluateTelemetry: + """``evaluate`` always calls :func:`schedule_send_telemetry` (send vs skip lives in telemetry).""" + + @patch("learning_commons_evaluators.evaluators.base.schedule_send_telemetry") + def test_send_full_input_config_preserves_input_metadata(self, mock_schedule): """``send_full_input_with_telemetry`` does not replace ``input_metadata`` with raw values.""" cfg = create_config(telemetry_partner_id="test", send_full_input_with_telemetry=True) ev = _StubEvaluator(cfg) @@ -229,6 +234,56 @@ def test_full_telemetry_config_still_uses_input_metadata_not_raw_values(self, st assert result.metadata.input_metadata == inp.input_metadata() assert result.metadata.input_metadata["text"] == {"textLength": 11} assert result.metadata.input_metadata["grade_level"] == {"grade": 3} + mock_schedule.assert_called_once() + meta_arg, input_arg, cfg_arg = mock_schedule.call_args.args + assert meta_arg is result.metadata + assert input_arg is inp + assert cfg_arg is cfg + + @patch("learning_commons_evaluators.evaluators.base.schedule_send_telemetry") + def test_invokes_schedule_with_metadata_input_and_config(self, mock_schedule, stub_evaluator): + inp = _stub_input() + stub_evaluator.evaluate_sync(inp) + mock_schedule.assert_called_once() + meta, passed_inp, cfg = mock_schedule.call_args.args + assert cfg is stub_evaluator.config + assert passed_inp is inp + assert meta.status == Status.succeeded + assert meta.evaluator_metadata.id == "stub-evaluator" + + @patch("learning_commons_evaluators.evaluators.base.schedule_send_telemetry") + def test_passes_failed_metadata_to_schedule_on_error(self, mock_schedule): + cfg = create_config(telemetry_partner_id="tid") + ev = _StubEvaluator(cfg) + inp = _stub_input() + with ( + patch.object(ev, "evaluate_impl", side_effect=RuntimeError("boom")), + pytest.raises(RuntimeError, match="boom"), + ): + ev.evaluate_sync(inp) + mock_schedule.assert_called_once() + meta, passed_inp, out_cfg = mock_schedule.call_args.args + assert out_cfg is ev.config + assert passed_inp is inp + assert meta.status == Status.failed + assert "boom" in (meta.error_details or "") + + @patch("learning_commons_evaluators.evaluators.base.schedule_send_telemetry") + def test_invokes_schedule_on_validation_error(self, mock_schedule, stub_evaluator): + inp = TextComplexityEvaluationInput( + text=TextInputField( + spec=TextInputSpec(name="text", min_text_length=100), + value="short", + ), + grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3), + ) + with pytest.raises(ValidationError): + stub_evaluator.evaluate_sync(inp) + mock_schedule.assert_called_once() + meta, passed_inp, cfg = mock_schedule.call_args.args + assert cfg is stub_evaluator.config + assert passed_inp is inp + assert meta.status == Status.failed class TestStubEvaluateErrorHandling: diff --git a/sdks/python/tests/schemas/test_config.py b/sdks/python/tests/schemas/test_config.py index 92942f4..4a4a619 100644 --- a/sdks/python/tests/schemas/test_config.py +++ b/sdks/python/tests/schemas/test_config.py @@ -1,19 +1,24 @@ """Tests for EvaluatorConfig, LLMProviderConfig subclasses, and factory functions.""" +import uuid from dataclasses import FrozenInstanceError import pytest from learning_commons_evaluators.logger import SDK_LOGGER_NAME, get_logger from learning_commons_evaluators.schemas.config import ( + DEFAULT_TELEMETRY_EVENTS_ENDPOINT, AnthropicLLMProviderConfig, GoogleLLMProviderConfig, LLMProvider, OpenAILLMProviderConfig, TelemetryConfig, create_config, + create_config_anonymous_telemetry, + create_config_anonymous_telemetry_with_full_input, create_config_no_telemetry, create_config_telemetry_with_full_input, + create_config_with_telemetry_config, ) @@ -49,6 +54,7 @@ class TestEvaluatorConfigFactory: def test_create_config_no_telemetry_defaults(self): config = create_config_no_telemetry() assert config.telemetry.telemetry_partner_id is None + assert config.telemetry.endpoint == DEFAULT_TELEMETRY_EVENTS_ENDPOINT assert config.telemetry.send_full_input_with_telemetry is False assert config.logger.name == SDK_LOGGER_NAME @@ -63,13 +69,76 @@ def test_create_config_no_telemetry_accepts_providers(self): def test_create_config_sets_telemetry_partner_id(self): config = create_config(telemetry_partner_id="tid-123") assert config.telemetry.telemetry_partner_id == "tid-123" + assert config.telemetry.endpoint == DEFAULT_TELEMETRY_EVENTS_ENDPOINT assert config.telemetry.send_full_input_with_telemetry is False + def test_default_client_id_seed_shared_across_configs(self): + a = create_config(telemetry_partner_id="tid-a") + b = create_config(telemetry_partner_id="tid-b") + assert a.client_id_seed == b.client_id_seed + def test_create_config_telemetry_with_full_input_sets_flag(self): config = create_config_telemetry_with_full_input(telemetry_partner_id="tid") assert config.telemetry.telemetry_partner_id == "tid" assert config.telemetry.send_full_input_with_telemetry is True + def test_create_config_with_telemetry_config_uses_given_telemetry_config(self): + telemetry = TelemetryConfig( + endpoint="https://example.com/events", + telemetry_partner_id="partner-1", + send_full_input_with_telemetry=True, + ) + config = create_config_with_telemetry_config(telemetry_config=telemetry) + assert config.telemetry is telemetry + assert config.telemetry.endpoint == "https://example.com/events" + assert config.telemetry.telemetry_partner_id == "partner-1" + assert config.telemetry.send_full_input_with_telemetry is True + + def test_create_config_with_telemetry_config_accepts_providers(self): + config = create_config_with_telemetry_config( + google_llm_provider_config=GoogleLLMProviderConfig(api_key="gk"), + telemetry_config=TelemetryConfig(telemetry_partner_id="p"), + ) + assert config.google_llm_provider_config.api_key == "gk" + + def test_create_config_with_telemetry_config_preserves_logger(self): + custom = get_logger("custom_with_telemetry") + telemetry = TelemetryConfig(telemetry_partner_id="p") + config = create_config_with_telemetry_config(logger=custom, telemetry_config=telemetry) + assert config.logger is custom + + def test_create_config_anonymous_telemetry_uuid_partner_id(self): + config = create_config_anonymous_telemetry() + pid = config.telemetry.telemetry_partner_id + assert pid is not None + uuid.UUID(pid) + assert config.telemetry.send_full_input_with_telemetry is False + assert config.telemetry.endpoint == DEFAULT_TELEMETRY_EVENTS_ENDPOINT + + def test_create_config_anonymous_telemetry_distinct_partner_ids(self): + a = create_config_anonymous_telemetry() + b = create_config_anonymous_telemetry() + assert a.telemetry.telemetry_partner_id != b.telemetry.telemetry_partner_id + + def test_create_config_anonymous_telemetry_accepts_providers(self): + config = create_config_anonymous_telemetry( + openai_llm_provider_config=OpenAILLMProviderConfig(api_key="ok"), + ) + assert config.openai_llm_provider_config.api_key == "ok" + + def test_create_config_anonymous_telemetry_with_full_input(self): + config = create_config_anonymous_telemetry_with_full_input() + pid = config.telemetry.telemetry_partner_id + assert pid is not None + uuid.UUID(pid) + assert config.telemetry.send_full_input_with_telemetry is True + assert config.telemetry.endpoint == DEFAULT_TELEMETRY_EVENTS_ENDPOINT + + def test_create_config_anonymous_telemetry_with_full_input_distinct_partner_ids(self): + a = create_config_anonymous_telemetry_with_full_input() + b = create_config_anonymous_telemetry_with_full_input() + assert a.telemetry.telemetry_partner_id != b.telemetry.telemetry_partner_id + def test_explicit_logger_is_preserved(self): custom = get_logger("custom_test") config = create_config_no_telemetry(logger=custom) @@ -80,5 +149,6 @@ def test_config_is_frozen(self): config = create_config_no_telemetry() with pytest.raises(FrozenInstanceError): config.telemetry = TelemetryConfig( - telemetry_partner_id="x", send_full_input_with_telemetry=False + telemetry_partner_id="x", + send_full_input_with_telemetry=False, ) diff --git a/sdks/python/tests/telemetry/test_telemetry.py b/sdks/python/tests/telemetry/test_telemetry.py new file mode 100644 index 0000000..cc252df --- /dev/null +++ b/sdks/python/tests/telemetry/test_telemetry.py @@ -0,0 +1,207 @@ +"""Tests for :mod:`learning_commons_evaluators.telemetry` (send path, scheduling, guards).""" + +from __future__ import annotations + +import asyncio +from datetime import datetime, timezone +from unittest.mock import AsyncMock, MagicMock, patch + +from learning_commons_evaluators import ( + create_config, + create_config_telemetry_with_full_input, +) +from learning_commons_evaluators.schemas.common_inputs import GradeInputField, TextInputField +from learning_commons_evaluators.schemas.input_specs import GradeInputSpec, TextInputSpec +from learning_commons_evaluators.schemas.metadata import Status +from learning_commons_evaluators.schemas.text_complexity import TextComplexityEvaluationInput +from learning_commons_evaluators.telemetry import ( + schedule_send_telemetry, + send_telemetry, + should_send_telemetry, +) +from learning_commons_evaluators.telemetry.utils import client_id_from_seed + + +def _sample_evaluation_input() -> TextComplexityEvaluationInput: + return TextComplexityEvaluationInput( + text=TextInputField(spec=TextInputSpec(name="text"), value="hello world"), + grade_level=GradeInputField(spec=GradeInputSpec(name="grade_level"), value=3), + ) + + +def _mock_async_httpx_client(mock_client_class: MagicMock) -> MagicMock: + mock_client = MagicMock() + mock_client_class.return_value.__aenter__.return_value = mock_client + mock_client_class.return_value.__aexit__.return_value = None + mock_response = MagicMock() + mock_response.is_error = False + mock_response.text = "" + mock_client.post = AsyncMock(return_value=mock_response) + return mock_client + + +class TestShouldSendTelemetry: + def test_false_without_partner_id(self, config): + assert should_send_telemetry(config) is False + + def test_true_with_partner_id(self): + cfg = create_config(telemetry_partner_id="tid") + assert should_send_telemetry(cfg) is True + + +class TestScheduleSendTelemetry: + def test_does_not_submit_when_partner_missing(self, config, evaluation_metadata): + with patch("learning_commons_evaluators.telemetry._get_telemetry_executor") as mock_exec: + schedule_send_telemetry(evaluation_metadata, None, config) + mock_exec.assert_not_called() + + @patch("learning_commons_evaluators.telemetry._get_telemetry_executor") + def test_submits_to_shared_executor(self, mock_get_executor, evaluation_metadata): + cfg = create_config(telemetry_partner_id="tid") + mock_executor = MagicMock() + mock_get_executor.return_value = mock_executor + + schedule_send_telemetry(evaluation_metadata, None, cfg) + + mock_get_executor.assert_called_once() + mock_executor.submit.assert_called_once() + + +class TestSendTelemetryHttp: + @patch("learning_commons_evaluators.telemetry.httpx.AsyncClient") + def test_posts_json_body_on_success(self, mock_client_class, evaluation_metadata): + mock_client = _mock_async_httpx_client(mock_client_class) + + cfg = create_config(telemetry_partner_id="tid") + evaluation_metadata.status = Status.succeeded + evaluation_metadata.input_metadata = {"grade_level": {"grade": 3}} + + asyncio.run(send_telemetry(evaluation_metadata, None, cfg)) + + mock_client_class.assert_called_once() + mock_client.post.assert_awaited_once() + call_args = mock_client.post.call_args + assert call_args[0][0] == cfg.telemetry.endpoint + payload = call_args[1]["json"] + assert payload["evaluator_type"] == "test-evaluator" + assert payload["status"] == "success" + assert payload["grade"] == "3" + assert "learning-commons-evaluators-python" in payload["sdk_version"] + assert "input_text" not in payload + headers = call_args[1]["headers"] + assert headers["X-Client-ID"] == client_id_from_seed("tid", cfg.client_id_seed) + assert headers["X-API-Key"] == "tid" + + @patch("learning_commons_evaluators.telemetry.httpx.AsyncClient") + def test_uuid_partner_uses_same_value_as_client_id_only( + self, mock_client_class, evaluation_metadata + ): + partner_uuid = "550e8400-e29b-41d4-a716-446655440000" + mock_client = _mock_async_httpx_client(mock_client_class) + cfg = create_config(telemetry_partner_id=partner_uuid) + evaluation_metadata.status = Status.succeeded + + asyncio.run(send_telemetry(evaluation_metadata, None, cfg)) + + headers = mock_client.post.call_args[1]["headers"] + assert headers["X-Client-ID"] == partner_uuid + assert "X-API-Key" not in headers + + @patch("learning_commons_evaluators.telemetry.httpx.AsyncClient") + def test_includes_input_text_when_full_input_enabled( + self, mock_client_class, evaluation_metadata + ): + mock_client = _mock_async_httpx_client(mock_client_class) + + cfg = create_config_telemetry_with_full_input(telemetry_partner_id="tid") + evaluation_metadata.status = Status.succeeded + inp = _sample_evaluation_input() + + asyncio.run(send_telemetry(evaluation_metadata, inp, cfg)) + + payload = mock_client.post.call_args[1]["json"] + assert payload["input_text"] == "hello world" + + @patch("learning_commons_evaluators.telemetry.httpx.AsyncClient") + def test_omits_input_text_when_full_input_disabled( + self, mock_client_class, evaluation_metadata + ): + mock_client = _mock_async_httpx_client(mock_client_class) + + cfg = create_config(telemetry_partner_id="tid", send_full_input_with_telemetry=False) + evaluation_metadata.status = Status.succeeded + inp = _sample_evaluation_input() + + asyncio.run(send_telemetry(evaluation_metadata, inp, cfg)) + + payload = mock_client.post.call_args[1]["json"] + assert "input_text" not in payload + + @patch("learning_commons_evaluators.telemetry.httpx.AsyncClient") + def test_posts_json_body_on_failed_run(self, mock_client_class, evaluation_metadata): + mock_client = _mock_async_httpx_client(mock_client_class) + + cfg = create_config(telemetry_partner_id="tid") + evaluation_metadata.status = Status.failed + evaluation_metadata.error_details = "boom" + + asyncio.run(send_telemetry(evaluation_metadata, None, cfg)) + + payload = mock_client.post.call_args[1]["json"] + assert payload["status"] == "error" + assert "boom" in (payload.get("error_code") or "") + + @patch("learning_commons_evaluators.telemetry.httpx.AsyncClient") + def test_send_telemetry_posts_via_asyncio_run(self, mock_client_class, evaluation_metadata): + _mock_async_httpx_client(mock_client_class) + + cfg = create_config(telemetry_partner_id="tid") + evaluation_metadata.status = Status.succeeded + + asyncio.run(send_telemetry(evaluation_metadata, None, cfg)) + + mock_client_class.assert_called_once() + + @patch("learning_commons_evaluators.telemetry.httpx.AsyncClient") + def test_send_telemetry_no_op_without_partner( + self, mock_client_class, config, evaluation_metadata + ): + asyncio.run(send_telemetry(evaluation_metadata, None, config)) + mock_client_class.assert_not_called() + + @patch("learning_commons_evaluators.telemetry.httpx.AsyncClient") + def test_timestamp_is_set_at_send_time_not_evaluation_start( + self, mock_client_class, evaluation_metadata + ): + _mock_async_httpx_client(mock_client_class) + cfg = create_config(telemetry_partner_id="tid") + evaluation_metadata.timestamp = datetime(2020, 1, 1, tzinfo=timezone.utc) + evaluation_metadata.status = Status.succeeded + + before = datetime.now(timezone.utc) + asyncio.run(send_telemetry(evaluation_metadata, None, cfg)) + after = datetime.now(timezone.utc) + + payload = mock_client_class.return_value.__aenter__.return_value.post.call_args[1]["json"] + sent = datetime.fromisoformat(payload["timestamp"].replace("Z", "+00:00")) + assert before <= sent <= after + + @patch( + "learning_commons_evaluators.telemetry.evaluation_to_typescript_telemetry_event", + side_effect=RuntimeError("adapter blew up"), + ) + def test_send_telemetry_swallows_non_http_errors(self, _mock_adapter, evaluation_metadata): + mock_logger = MagicMock() + cfg = create_config(telemetry_partner_id="tid", logger=mock_logger) + asyncio.run(send_telemetry(evaluation_metadata, None, cfg)) + mock_logger.warning.assert_called_once() + + +class TestClientIdSeed: + def test_same_partner_id_across_configs_yields_same_client_id(self): + cfg_a = create_config(telemetry_partner_id="my-key") + cfg_b = create_config(telemetry_partner_id="my-key") + assert cfg_a.client_id_seed == cfg_b.client_id_seed + assert client_id_from_seed("my-key", cfg_a.client_id_seed) == client_id_from_seed( + "my-key", cfg_b.client_id_seed + ) diff --git a/sdks/python/tests/telemetry/test_telemetry_adapter.py b/sdks/python/tests/telemetry/test_telemetry_adapter.py new file mode 100644 index 0000000..6a4f078 --- /dev/null +++ b/sdks/python/tests/telemetry/test_telemetry_adapter.py @@ -0,0 +1,78 @@ +"""Tests for :func:`evaluation_to_typescript_telemetry_event`.""" + +from __future__ import annotations + +from learning_commons_evaluators import create_config +from learning_commons_evaluators.schemas.config import LLMProvider, PromptSettings +from learning_commons_evaluators.schemas.metadata import ( + PROMPT_STEP_EXTRA_PROMPT_SETTINGS, + PROMPT_STEP_EXTRA_TOKEN_USAGE, + Status, + StepMetadata, + TokenUsage, +) +from learning_commons_evaluators.telemetry.adapter import evaluation_to_typescript_telemetry_event + + +def test_adapter_maps_success_and_aggregate_tokens(evaluation_metadata): + cfg = create_config(telemetry_partner_id="tid") + evaluation_metadata.status = Status.succeeded + evaluation_metadata.input_metadata = {"grade_level": {"grade": 5}} + evaluation_metadata.total_token_usage[LLMProvider.OPENAI] = TokenUsage( + provider_type=LLMProvider.OPENAI, + model="gpt-4o", + input_tokens=10, + output_tokens=20, + ) + evaluation_metadata.total_token_usage[LLMProvider.GOOGLE] = TokenUsage( + provider_type=LLMProvider.GOOGLE, + model="gemini-pro", + input_tokens=5, + output_tokens=5, + ) + + event = evaluation_to_typescript_telemetry_event(evaluation_metadata, None, cfg) + + assert event.status == "success" + assert event.evaluator_type == "test-evaluator" + assert event.grade == "5" + assert event.provider == "google:gemini-pro + openai:gpt-4o" + assert event.token_usage is not None + assert event.token_usage.input_tokens == 15 + assert event.token_usage.output_tokens == 25 + + +def test_adapter_stage_details_from_step_extras(evaluation_metadata): + cfg = create_config(telemetry_partner_id="tid") + evaluation_metadata.status = Status.succeeded + ps = PromptSettings(provider_type=LLMProvider.OPENAI, model="gpt-4o-mini", temperature=0.0) + evaluation_metadata.step_details["step_a"] = StepMetadata( + step_id="step_a", + status=Status.succeeded, + processing_time_ms=12.5, + extras={ + PROMPT_STEP_EXTRA_PROMPT_SETTINGS: { + "provider_type": ps.provider_type.value, + "model": ps.model, + "temperature": ps.temperature, + }, + PROMPT_STEP_EXTRA_TOKEN_USAGE: { + "provider_type": "openai", + "model": "gpt-4o-mini", + "input_tokens": 3, + "output_tokens": 4, + }, + }, + ) + + event = evaluation_to_typescript_telemetry_event(evaluation_metadata, None, cfg) + + assert event.metadata is not None + assert event.metadata.stage_details is not None + assert len(event.metadata.stage_details) == 1 + sd = event.metadata.stage_details[0] + assert sd.stage == "step_a" + assert sd.provider == "openai:gpt-4o-mini" + assert sd.token_usage is not None + assert sd.token_usage.input_tokens == 3 + assert sd.token_usage.output_tokens == 4 diff --git a/sdks/python/tests/telemetry/test_telemetry_utils.py b/sdks/python/tests/telemetry/test_telemetry_utils.py new file mode 100644 index 0000000..b38f361 --- /dev/null +++ b/sdks/python/tests/telemetry/test_telemetry_utils.py @@ -0,0 +1,27 @@ +"""Tests for :mod:`learning_commons_evaluators.telemetry.utils`.""" + +from __future__ import annotations + +import uuid + +from learning_commons_evaluators.telemetry.utils import client_id_from_seed + + +class TestClientIdFromSeed: + def test_deterministic_for_same_namespace_and_key(self): + ns = uuid.UUID("018f1234-5678-7abc-8def-0123456789ab") + assert client_id_from_seed("my-api-key", ns) == client_id_from_seed("my-api-key", ns) + + def test_different_api_keys_differ(self): + ns = uuid.UUID("018f1234-5678-7abc-8def-0123456789ab") + assert client_id_from_seed("a", ns) != client_id_from_seed("b", ns) + + def test_same_api_key_different_namespace_differs(self): + assert client_id_from_seed("k", uuid.uuid4()) != client_id_from_seed("k", uuid.uuid4()) + + def test_returns_valid_uuid_string(self): + uuid.UUID(client_id_from_seed("any-key", uuid.uuid4())) + + def test_strips_whitespace_on_api_key(self): + ns = uuid.UUID("018f1234-5678-7abc-8def-0123456789ab") + assert client_id_from_seed(" k ", ns) == client_id_from_seed("k", ns)