 from .eval_metrics import EvalMetricResult
 from .eval_metrics import EvalMetricResultDetails
 from .eval_metrics import EvalMetricResultPerInvocation
+from .eval_metrics import Rubric
 from .eval_result import EvalCaseResult
 from .eval_set import EvalCase
 from .eval_set_results_manager import EvalSetResultsManager
@@ -67,6 +68,46 @@ def _get_session_id() -> str:
   return f'{EVAL_SESSION_ID_PREFIX}{str(uuid.uuid4())}'


+def _add_rubrics_to_invocation(
+    invocation: Invocation, rubrics_to_add: list[Rubric]
+):
+  """Adds rubrics to invocation, throwing ValueError on duplicate rubric_id."""
+  if not invocation.rubrics:
+    invocation.rubrics = []
+  existing_ids = {r.rubric_id for r in invocation.rubrics}
+  for rubric in rubrics_to_add:
+    if rubric.rubric_id in existing_ids:
+      raise ValueError(
+          f"Rubric with rubric_id '{rubric.rubric_id}' already exists."
+      )
+    invocation.rubrics.append(rubric)
+    existing_ids.add(rubric.rubric_id)
+
+
+def _copy_eval_case_rubrics_to_actual_invocations(
+    eval_case: EvalCase, actual_invocations: list[Invocation]
+):
+  """Copies EvalCase level rubrics to all actual invocations."""
+  if hasattr(eval_case, 'rubrics') and eval_case.rubrics:
+    for invocation in actual_invocations:
+      _add_rubrics_to_invocation(invocation, eval_case.rubrics)
+
+
+def _copy_invocation_rubrics_to_actual_invocations(
+    expected_invocations: Optional[list[Invocation]],
+    actual_invocations: list[Invocation],
+):
+  """Copies invocation level rubrics to corresponding actual invocations."""
+  if expected_invocations:
+    for actual_invocation, expected_invocation in zip(
+        actual_invocations, expected_invocations
+    ):
+      if expected_invocation.rubrics:
+        _add_rubrics_to_invocation(
+            actual_invocation, expected_invocation.rubrics
+        )
+
+
 @experimental
 class LocalEvalService(BaseEvalService):
   """An implementation of BaseEvalService, that runs the evals locally."""
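
Note: a rough usage sketch of how the new helpers combine, assuming the three
functions from the hunk above are in scope and substituting plain dataclasses
(with hypothetical field names) for the real Rubric/Invocation/EvalCase pydantic
models. EvalCase-level rubrics are applied to every actual invocation first,
then invocation-level rubrics from the expected conversation; a repeated
rubric_id raises ValueError.

# Sketch only: stand-in types, not the real ADK models.
from dataclasses import dataclass
from typing import Optional

@dataclass
class Rubric:
  rubric_id: str
  text: str = ''  # hypothetical field; the real model differs

@dataclass
class Invocation:
  rubrics: Optional[list[Rubric]] = None

@dataclass
class EvalCase:
  rubrics: Optional[list[Rubric]] = None
  conversation: Optional[list[Invocation]] = None

actual = [Invocation(), Invocation()]
case = EvalCase(
    rubrics=[Rubric('case_r1', 'Is the answer polite?')],
    conversation=[
        Invocation(rubrics=[Rubric('inv_r1', 'Mentions the refund policy?')]),
        Invocation(),
    ],
)

# 1. EvalCase-level rubrics land on every actual invocation.
_copy_eval_case_rubrics_to_actual_invocations(case, actual)
# 2. Invocation-level rubrics are zipped onto the corresponding actual turn.
_copy_invocation_rubrics_to_actual_invocations(case.conversation, actual)

assert [r.rubric_id for r in actual[0].rubrics] == ['case_r1', 'inv_r1']
assert [r.rubric_id for r in actual[1].rubrics] == ['case_r1']
# Adding an id that is already present raises ValueError:
# _add_rubrics_to_invocation(actual[0], [Rubric('case_r1')])
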
@@ -249,76 +290,27 @@ async def _evaluate_single_inference_result(
           )
       )

-    for eval_metric in evaluate_config.eval_metrics:
-      # Perform evaluation of the metric.
-      try:
-        with client_label_context(EVAL_CLIENT_LABEL):
-          evaluation_result = await self._evaluate_metric(
-              eval_metric=eval_metric,
-              actual_invocations=inference_result.inferences,
-              expected_invocations=eval_case.conversation,
-              conversation_scenario=eval_case.conversation_scenario,
-          )
-      except Exception as e:
-        # We intentionally catch the Exception as we don't want failures to
-        # affect other metric evaluation.
-        logger.error(
-            "Metric evaluation failed for metric `%s` for eval case id '%s'"
-            ' with following error `%s`',
-            eval_metric.metric_name,
-            eval_case.eval_id,
-            e,
-            exc_info=True,
-        )
-        # We use an empty result.
-        evaluation_result = EvaluationResult(
-            overall_eval_status=EvalStatus.NOT_EVALUATED
-        )
+    actual_invocations = inference_result.inferences
+    expected_invocations = eval_case.conversation

-      # Track overall score across all invocations.
-      eval_metric_result_details = EvalMetricResultDetails(
-          rubric_scores=evaluation_result.overall_rubric_scores
-      )
-      overall_eval_metric_results.append(
-          EvalMetricResult(
-              score=evaluation_result.overall_score,
-              eval_status=evaluation_result.overall_eval_status,
-              details=eval_metric_result_details,
-              **eval_metric.model_dump(),
-          )
-      )
+    # 1. Copy EvalCase level rubrics to all actual invocations.
+    _copy_eval_case_rubrics_to_actual_invocations(eval_case, actual_invocations)

-      if (
-          evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
-          and len(evaluation_result.per_invocation_results)
-          != len(eval_metric_result_per_invocation)
-      ):
-        raise ValueError(
-            'Eval metric should return results for each invocation. Found '
-            f'{len(evaluation_result.per_invocation_results)} results for '
-            f'{len(eval_metric_result_per_invocation)} invocations.'
-        )
+    # 2. If expected invocations are present, copy invocation level
+    # rubrics to corresponding actual invocations.
+    _copy_invocation_rubrics_to_actual_invocations(
+        expected_invocations, actual_invocations
+    )

-      # Track score across individual invocations.
-      for idx, invocation in enumerate(eval_metric_result_per_invocation):
-        invocation_result = (
-            evaluation_result.per_invocation_results[idx]
-            if evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
-            else PerInvocationResult(
-                actual_invocation=invocation.actual_invocation
-            )
-        )
-        eval_metric_result_details = EvalMetricResultDetails(
-            rubric_scores=invocation_result.rubric_scores
-        )
-        invocation.eval_metric_results.append(
-            EvalMetricResult(
-                score=invocation_result.score,
-                eval_status=invocation_result.eval_status,
-                details=eval_metric_result_details,
-                **eval_metric.model_dump(),
-            )
-        )
+    for eval_metric in evaluate_config.eval_metrics:
+      # Perform evaluation of the metric.
+      await self._evaluate_metric_for_eval_case(
+          eval_metric,
+          eval_case,
+          inference_result,
+          eval_metric_result_per_invocation,
+          overall_eval_metric_results,
+      )

     final_eval_status = self._generate_final_eval_status(
         overall_eval_metric_results
@@ -342,6 +334,84 @@ async def _evaluate_single_inference_result(

     return (inference_result, eval_case_result)

+  async def _evaluate_metric_for_eval_case(
+      self,
+      eval_metric: EvalMetric,
+      eval_case: EvalCase,
+      inference_result: InferenceResult,
+      eval_metric_result_per_invocation: list[EvalMetricResultPerInvocation],
+      overall_eval_metric_results: list[EvalMetricResult],
+  ):
+    """Performs evaluation of a metric for a given eval case and inference result."""
+    try:
+      with client_label_context(EVAL_CLIENT_LABEL):
+        evaluation_result = await self._evaluate_metric(
+            eval_metric=eval_metric,
+            actual_invocations=inference_result.inferences,
+            expected_invocations=eval_case.conversation,
+            conversation_scenario=eval_case.conversation_scenario,
+        )
+    except Exception as e:
+      # We intentionally catch the Exception as we don't want failures to
+      # affect other metric evaluation.
+      logger.error(
+          "Metric evaluation failed for metric `%s` for eval case id '%s'"
+          ' with following error `%s`',
+          eval_metric.metric_name,
+          eval_case.eval_id,
+          e,
+          exc_info=True,
+      )
+      # We use an empty result.
+      evaluation_result = EvaluationResult(
+          overall_eval_status=EvalStatus.NOT_EVALUATED
+      )
+
+    # Track overall score across all invocations.
+    eval_metric_result_details = EvalMetricResultDetails(
+        rubric_scores=evaluation_result.overall_rubric_scores
+    )
+    overall_eval_metric_results.append(
+        EvalMetricResult(
+            score=evaluation_result.overall_score,
+            eval_status=evaluation_result.overall_eval_status,
+            details=eval_metric_result_details,
+            **eval_metric.model_dump(),
+        )
+    )
+
+    if (
+        evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
+        and len(evaluation_result.per_invocation_results)
+        != len(eval_metric_result_per_invocation)
+    ):
+      raise ValueError(
+          'Eval metric should return results for each invocation. Found '
+          f'{len(evaluation_result.per_invocation_results)} results for '
+          f'{len(eval_metric_result_per_invocation)} invocations.'
+      )
+
+    # Track score across individual invocations.
+    for idx, invocation in enumerate(eval_metric_result_per_invocation):
+      invocation_result = (
+          evaluation_result.per_invocation_results[idx]
+          if evaluation_result.overall_eval_status != EvalStatus.NOT_EVALUATED
+          else PerInvocationResult(
+              actual_invocation=invocation.actual_invocation
+          )
+      )
+      eval_metric_result_details = EvalMetricResultDetails(
+          rubric_scores=invocation_result.rubric_scores
+      )
+      invocation.eval_metric_results.append(
+          EvalMetricResult(
+              score=invocation_result.score,
+              eval_status=invocation_result.eval_status,
+              details=eval_metric_result_details,
+              **eval_metric.model_dump(),
+          )
+      )
+
   async def _evaluate_metric(
       self,
       eval_metric: EvalMetric,
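
Note: a minimal sketch (pydantic v2 stub models, not the real ADK classes, with
illustrative field sets and values) of how _evaluate_metric_for_eval_case
assembles each EvalMetricResult: the metric's own configuration fields are
carried over via eval_metric.model_dump(), while the evaluator output supplies
score and status.

# Sketch only: stub models under pydantic v2; field sets are illustrative.
from enum import Enum
from typing import Optional
from pydantic import BaseModel

class EvalStatus(str, Enum):
  PASSED = 'PASSED'
  FAILED = 'FAILED'
  NOT_EVALUATED = 'NOT_EVALUATED'

class EvalMetric(BaseModel):
  metric_name: str
  threshold: float

class EvalMetricResult(EvalMetric):
  score: Optional[float] = None
  eval_status: EvalStatus = EvalStatus.NOT_EVALUATED

eval_metric = EvalMetric(metric_name='response_match_score', threshold=0.8)
result = EvalMetricResult(
    score=0.92,
    eval_status=EvalStatus.PASSED,
    **eval_metric.model_dump(),  # copies metric_name/threshold into the result
)
print(result.metric_name, result.score)  # response_match_score 0.92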