Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 34 additions & 0 deletions sdks/python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,31 @@ config = create_config(
)
```

### Provider config validation

When you construct an evaluator (or call `evaluate` / `evaluate_sync`), the SDK checks
that every `LLMProvider` referenced by a `PromptSettings` field on the active
`EvaluationSettings` has a matching provider config on `EvaluatorConfig`. For example,
the vocabulary evaluator’s defaults use both Google and OpenAI, so both
`google_llm_provider_config` and `openai_llm_provider_config` must be set; the
conventionality evaluator’s defaults only need Google.

Validation runs:

- In `BaseEvaluator.__init__` against the resolved default evaluation settings (constructor
override or the subclass class attribute).
- At the start of each `evaluate()` call against the settings used for that run (including
per-call `evaluation_settings` overrides).

If a required provider is missing, construction or evaluation raises `ConfigurationError`
with the same message used at LLM call time (for example,
`Google provider config is not set on EvaluatorConfig`). If several providers are missing,
their messages are combined into a single error, separated by `; `. You can also call
`config.validate_supports_evaluation_settings(settings)` directly before constructing an
evaluator.

Only providers actually used in the settings object are required — you do not need to
configure every provider on every evaluator.

### Per-instance default evaluation settings

Every `BaseEvaluator` subclass defines **class-level** `default_evaluation_settings`
Expand Down Expand Up @@ -261,6 +286,11 @@ result = evaluator.evaluate_sync(input)
result = evaluator.evaluate_sync(input, evaluation_settings=other_settings)
```

If `other_settings` references a provider that is not on `config`, `evaluate_sync` raises
`ConfigurationError` before any LLM call. The same applies when you pass
`default_evaluation_settings` at construction: every provider in those settings must be
configured on `config`.

If you omit `default_evaluation_settings` at construction, attribute lookup uses the
subclass class attribute, same as before. Whenever you call `evaluate_sync()` or
`await evaluator.evaluate(...)` without `evaluation_settings`, the SDK uses
Expand Down Expand Up @@ -437,6 +467,10 @@ class MyEvaluator(BaseEvaluator[MyInput, EvaluationResult, MySettings]):

If you override `__init__` on the subclass, accept the same keyword-only argument and forward it: `super().__init__(config, default_evaluation_settings=default_evaluation_settings)`.

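Declare each prompt step as a `PromptSettings` field on your settings model (typically named
`prompt_settings_*`). The base class uses those fields to determine which provider configs
must be present on `EvaluatorConfig`. Only fields whose value is itself a `PromptSettings`
instance are inspected; prompt settings nested inside a list, dict, or another model are not
detected by this check.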

## License

MIT
12 changes: 11 additions & 1 deletion sdks/python/src/learning_commons_evaluators/evaluators/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ class BaseEvaluator(ABC, Generic[InputT, OutputT, SettingsT]):
Pass ``default_evaluation_settings`` at construction to override the class-level
defaults for that instance (used when :meth:`evaluate` is called without
``evaluation_settings``).

Raises:
ConfigurationError: Default evaluation settings require an LLM provider
that is not configured on ``config``.
"""

config: EvaluatorConfig
Expand All @@ -79,7 +83,12 @@ def __init__(
self.config = config
if default_evaluation_settings is not None:
self.default_evaluation_settings = default_evaluation_settings
# TODO: validate config
settings_for_validation = (
default_evaluation_settings
if default_evaluation_settings is not None
else self.__class__.default_evaluation_settings
)
config.validate_supports_evaluation_settings(settings_for_validation)

async def evaluate(
self,
Expand Down Expand Up @@ -130,6 +139,7 @@ async def evaluate(
extra={"evaluation_metadata": evaluation_metadata},
)
try:
self.config.validate_supports_evaluation_settings(evaluation_settings)
input.validate()
result = await self.evaluate_impl(input, evaluation_settings, evaluation_metadata)
evaluation_metadata.status = Status.succeeded
Expand Down
35 changes: 35 additions & 0 deletions sdks/python/src/learning_commons_evaluators/schemas/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from pydantic import BaseModel, ConfigDict

from learning_commons_evaluators.logger import Logger, get_logger
from learning_commons_evaluators.schemas.errors import ConfigurationError

# --- LLM provider configs (for LLM calls in prompt steps) ---

Expand Down Expand Up @@ -75,6 +76,29 @@ class EvaluationSettings(BaseModel):
model_config = ConfigDict(arbitrary_types_allowed=True)


# Maps each LLM provider to the name of the EvaluatorConfig attribute that holds
# that provider's config; the attribute is looked up with getattr() during validation.
_PROVIDER_CONFIG_ATTR: dict[LLMProvider, str] = {
LLMProvider.GOOGLE: "google_llm_provider_config",
LLMProvider.OPENAI: "openai_llm_provider_config",
LLMProvider.ANTHROPIC: "anthropic_llm_provider_config",
}

# Maps each LLM provider to the message reported in ConfigurationError when the
# corresponding config attribute is None.
_PROVIDER_MISSING_MESSAGE: dict[LLMProvider, str] = {
LLMProvider.GOOGLE: "Google provider config is not set on EvaluatorConfig",
LLMProvider.OPENAI: "OpenAI provider config is not set on EvaluatorConfig",
LLMProvider.ANTHROPIC: "Anthropic provider config is not set on EvaluatorConfig",
}


def _required_llm_providers(settings: EvaluationSettings) -> set[LLMProvider]:
    """Return the set of LLM providers used by the PromptSettings fields of ``settings``.

    Only fields declared on the settings model whose current value is a
    ``PromptSettings`` instance contribute; every other field is ignored.
    """
    # Lazily read each declared field's current value off the instance.
    field_values = (
        getattr(settings, field_name) for field_name in type(settings).model_fields
    )
    return {
        candidate.provider_type
        for candidate in field_values
        if isinstance(candidate, PromptSettings)
    }


@dataclass(frozen=True)
class TelemetryConfig:
"""Config for telemetry."""
Expand Down Expand Up @@ -106,6 +130,17 @@ class EvaluatorConfig:
logger: Logger = field(default_factory=get_logger)
telemetry: TelemetryConfig = field(default_factory=TelemetryConfig)

def validate_supports_evaluation_settings(self, settings: EvaluationSettings) -> None:
    """Check that every LLM provider required by ``settings`` is configured on this config.

    Raises:
        ConfigurationError: One or more required providers have no config set
            (the attribute is ``None``). The per-provider messages are joined
            with ``"; "``, ordered by provider value.
    """
    problems: list[str] = []
    # Sort by enum value so the combined message is deterministic.
    for provider in sorted(_required_llm_providers(settings), key=lambda p: p.value):
        provider_config = getattr(self, _PROVIDER_CONFIG_ATTR[provider])
        if provider_config is None:
            problems.append(_PROVIDER_MISSING_MESSAGE[provider])
    if problems:
        raise ConfigurationError("; ".join(problems))


def create_config(
*,
Expand Down
23 changes: 22 additions & 1 deletion sdks/python/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
import pytest

from learning_commons_evaluators import create_config_no_telemetry
from learning_commons_evaluators.schemas.config import EvaluationSettings
from learning_commons_evaluators.schemas.config import (
EvaluationSettings,
GoogleLLMProviderConfig,
OpenAILLMProviderConfig,
)
from learning_commons_evaluators.schemas.metadata import (
EvaluationMetadata,
EvaluatorMaturity,
Expand Down Expand Up @@ -42,3 +46,20 @@ def evaluation_metadata(evaluator_metadata):
def config():
"""EvaluatorConfig with no telemetry, suitable for unit tests."""
return create_config_no_telemetry()


@pytest.fixture
def config_with_google():
    """Provide a no-telemetry EvaluatorConfig with the Google provider config set.

    Intended for evaluators that need Google, such as conventionality.
    """
    google_provider = GoogleLLMProviderConfig(api_key="test-google-key")
    return create_config_no_telemetry(google_llm_provider_config=google_provider)


@pytest.fixture
def config_with_google_and_openai():
    """Provide a no-telemetry EvaluatorConfig with Google and OpenAI provider configs set.

    Intended for the vocabulary evaluator.
    """
    provider_kwargs = {
        "google_llm_provider_config": GoogleLLMProviderConfig(api_key="test-google-key"),
        "openai_llm_provider_config": OpenAILLMProviderConfig(api_key="test-openai-key"),
    }
    return create_config_no_telemetry(**provider_kwargs)
11 changes: 3 additions & 8 deletions sdks/python/tests/contract_tests/test_conventionality.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,7 @@
copy.
"""

from learning_commons_evaluators import (
ConventionalityEvaluationInput,
ConventionalityEvaluator,
create_config_no_telemetry,
)
from learning_commons_evaluators import ConventionalityEvaluationInput, ConventionalityEvaluator
from learning_commons_evaluators.schemas.metadata import Status

from .conventionality import (
Expand All @@ -38,7 +34,7 @@


class TestConventionalityContract:
def test_turnip_grade4(self) -> None:
def test_turnip_grade4(self, config_with_google) -> None:
"""Turnip classroom narrative, grade 4.

Verifies:
Expand All @@ -49,8 +45,7 @@ def test_turnip_grade4(self) -> None:
"""
case = load_conventionality_turnip_case()

config = create_config_no_telemetry()
evaluator = ConventionalityEvaluator(config)
evaluator = ConventionalityEvaluator(config_with_google)
inp = ConventionalityEvaluationInput(
text=case.input["text"],
grade=case.input["grade"],
Expand Down
16 changes: 5 additions & 11 deletions sdks/python/tests/contract_tests/test_vocabulary.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,11 +32,7 @@
the ``user_prompt`` and ``llm_response`` fields need to be populated.
"""

from learning_commons_evaluators import (
VocabularyEvaluationInput,
VocabularyEvaluator,
create_config_no_telemetry,
)
from learning_commons_evaluators import VocabularyEvaluationInput, VocabularyEvaluator
from learning_commons_evaluators.schemas.metadata import Status

from .harness import ContractTestHarness
Expand All @@ -49,7 +45,7 @@


class TestVocabularyContractGrades34:
def test_marco_polo_grade3(self) -> None:
def test_marco_polo_grade3(self, config_with_google_and_openai) -> None:
"""Marco Polo passage, grade 3 — grades 3–4 Gemini path.

Verifies:
Expand All @@ -60,8 +56,7 @@ def test_marco_polo_grade3(self) -> None:
"""
case = load_vocabulary_grade34_case()

config = create_config_no_telemetry()
evaluator = VocabularyEvaluator(config)
evaluator = VocabularyEvaluator(config_with_google_and_openai)
inp = VocabularyEvaluationInput(
text=case.input["text"],
grade=case.input["grade"],
Expand Down Expand Up @@ -94,7 +89,7 @@ def test_marco_polo_grade3(self) -> None:


class TestVocabularyContractOtherGrades:
def test_hurricanes_grade7(self) -> None:
def test_hurricanes_grade7(self, config_with_google_and_openai) -> None:
"""Hurricane formation passage, grade 7 — grades 5–12 GPT path.

Verifies:
Expand All @@ -105,8 +100,7 @@ def test_hurricanes_grade7(self) -> None:
"""
case = load_vocabulary_other_grades_case()

config = create_config_no_telemetry()
evaluator = VocabularyEvaluator(config)
evaluator = VocabularyEvaluator(config_with_google_and_openai)
inp = VocabularyEvaluationInput(
text=case.input["text"],
grade=case.input["grade"],
Expand Down
Loading