
Commit f8277b1

Clean up RagasEvaluator interfaces

Refactor the RagasEvaluator class for use with the `ilab` interface.

Signed-off-by: Ali Maredia <[email protected]>

1 parent 207ecc6 commit f8277b1

File tree

1 file changed (+38, -58)


src/instructlab/eval/ragas.py

Lines changed: 38 additions & 58 deletions
```diff
@@ -74,7 +74,7 @@ class ModelConfig(BaseModel):
     max_tokens: int = 768
 
     # Random seed for reproducibility. Caution: this isn't supported by all model serving runtimes.
-    seed: int = DEFAULT_SEED
+    seed: int = 42
 
 
 class RagasEvaluator(Evaluator):
```
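For orientation, a minimal sketch of constructing the `ModelConfig` this hunk touches; the field names are taken from this diff, but every value below is hypothetical:

```python
# Not from the commit: a hypothetical ModelConfig, using fields visible in this diff.
from instructlab.eval.ragas import ModelConfig

config = ModelConfig(
    model_name="my-model",                         # hypothetical
    system_prompt="You are a helpful assistant.",  # hypothetical
    temperature=0.0,                               # read by generate_answers_from_model below
    max_tokens=768,                                # the default shown earlier in this file
    seed=1337,                                     # overrides the new default of 42
)
```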
```diff
@@ -96,29 +96,25 @@ def __init__(
         self.judge_openai_api_key = judge_openai_api_key
 
     @staticmethod
-    def _validate_dataset(df: DataFrame):
+    def validate_dataset(df: DataFrame):
         """
         Validates whether or not the given `df` is a valid dataset of `Sample` objects.
 
         Args:
-            df (DataFrame): DataFrame containing the dataset to be evaluated.
+            df (DataFrame): DataFrame containing the dataset to be evaluated.
         """
-        # We have to hardcode these fields because the automated way of resolving the required fields from a TypedDict
-        # is only included by default in Python3.11+. For earlier versions, the `typing_extensions` package is required.
-        # See: https://docs.python.org/3/whatsnew/3.11.html#pep-655-marking-individual-typeddict-items-as-required-or-not-required
-        required_keys = {"user_input", "reference"}
-        missing_keys = required_keys - set(df.columns)
-        if missing_keys:
+        required_keys = {"user_input", "reference", "response"}
+
+        columns_list = set(df.columns)
+        if not columns_list.issubset(required_keys):
             raise ValueError(
-                f"invalid dataset provided, missing the following keys: {', '.join(missing_keys)}"
+                f"Dataset can only have the following keys: {', '.join(required_keys)}. Keys provided were: {', '.join(columns_list)}"
             )
 
     def run(
         self,
-        dataset: List[Sample] | Path,
-        student_model: ModelConfig | None = None,
+        dataset: List[Sample] | DataFrame,
         run_config: RunConfig | None = None,
-        student_openai_client: OpenAIClient | None = None,
         judge_model_name: str | None = None,
         judge_openai_api_key: str | None = None,
     ) -> EvaluationResult:
```
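The renamed `validate_dataset` now checks that the columns are a subset of the allowed keys rather than computing missing keys, so extra columns are rejected while missing ones pass. A minimal sketch, assuming pandas is available and using made-up sample rows:

```python
# Sketch of the new subset-based validation; the sample data is hypothetical.
from pandas import DataFrame

from instructlab.eval.ragas import RagasEvaluator

ok = DataFrame([{"user_input": "What is RAG?", "reference": "Retrieval-augmented generation."}])
RagasEvaluator.validate_dataset(ok)  # passes: columns are a subset of the allowed keys

extra = ok.assign(notes="scratch")
RagasEvaluator.validate_dataset(extra)  # raises ValueError: "notes" is not an allowed key
```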
```diff
@@ -132,17 +128,12 @@ def run(
             dataset (List[Sample] | Path):
                 Can be either a list of `Sample` objects or a path to a jsonl file containing
                 records matching `Sample`.
-            student_model: (StudentModelConfig):
-                When this parameter is provided, we'll attempt to use the described model in order to
                 generate the responses from the given list of questions.
             run_config (RunConfig | None, optional):
                 Configuration to use when running evaluations. If none is provided, then
                 a default one is created containing extremely permissive settings when handling
                 timeouts. This is because by default, OpenAI tier-1 usage accounts have very high
                 rate limits resulting in heavy throttling during evaluations.
-            student_openai_client (openai.Client | None, optional):
-                The client to use when generating questions from the student model, must be compatible with the OpenAI API.
-                This field is required when `student_model` is provided.
             judge_model_name (str | None, optional):
                 Name of the OpenAI model to use as the judge model. Defaults to "gpt-4o" when none is specified.
             judge_openai_api_key (str | None, optional):
```
```diff
@@ -158,50 +149,29 @@ def run(
         judge_openai_api_key = (
             judge_openai_api_key if judge_openai_api_key else self.judge_openai_api_key
         )
-        student_model = student_model if student_model else self.student_model
         run_config = run_config if run_config else self.run_config
-        student_openai_client = (
-            student_openai_client
-            if student_openai_client
-            else self.student_openai_client
-        )
 
         # ensure we are in the dataframe format
-        input_df = None
+        input_df = dataset
         if isinstance(dataset, list):
             input_df = DataFrame(dataset)
-        elif isinstance(dataset, Path):
-            input_df = read_json(dataset, orient="records", lines=True)
-        else:
+        elif not isinstance(dataset, DataFrame):
             raise TypeError(f"invalid type of dataset: {type(dataset)}")
 
         # this should never happen, but pylint is not smart enough to detect it
         if TYPE_CHECKING:
             assert input_df is not None
 
         # ensure the dataset is in the format we expect it
-        self._validate_dataset(input_df)
-
-        need_to_generate_questions = "response" not in input_df.columns
-        if need_to_generate_questions:
-            logger.debug(
-                "`response` is missing in the input dataframe columns, generating questions from the model is required."
-            )
-            if not student_model or not student_openai_client:
-                raise ValueError(
-                    "provided dataset doesn't contain the model `response`, but either `student_model` or `student_openai_client` wasn't provided for inference"
-                )
-
-        # if the student model was provided then we always generate regardless
-        if student_model:
-            if not student_openai_client:
-                raise ValueError(
-                    "`student_model` was specified but `student_openai_client` was not provided"
-                )
-            input_df = self._generate_answers_from_model(
-                input_df, student_model, student_openai_client
+        # this looks similar to validate_dataset but here we want an exact match, not a subset
+        required_keys = {"user_input", "reference", "response"}
+        columns = set(input_df.columns)
+        if columns != required_keys:
+            raise ValueError(
+                f"Input Dataset can only have the following keys: {', '.join(required_keys)}. Keys provided were: {', '.join(columns)}"
             )
 
+
         if not run_config:
             # we set extreme timeout/retry values by default since OpenAI tier-1 rate limits
             # are horrible and will result in half of our evaluation results being NaN or 0
```
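Note the comment in the hunk: unlike `validate_dataset`, `run` demands an exact column match, so a DataFrame that passes validation can still be rejected here. A small sketch of the check itself, with made-up rows:

```python
# Sketch of the exact-match check run() now performs; sample rows are hypothetical.
from pandas import DataFrame

required_keys = {"user_input", "reference", "response"}

partial = DataFrame([{"user_input": "What is RAG?", "reference": "..."}])
print(set(partial.columns) == required_keys)   # False: run() would raise ValueError

complete = partial.assign(response="Retrieval-augmented generation ...")
print(set(complete.columns) == required_keys)  # True: run() would proceed
```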
```diff
@@ -229,15 +199,25 @@ def run(
         )
         return results
 
-    def _generate_answers_from_model(
-        self,
+    @staticmethod
+    def generate_answers_from_model(
         questions: DataFrame,
-        student_model: ModelConfig,
-        student_openai_client: OpenAIClient,
+        model_config: ModelConfig,
+        openai_client: OpenAIClient,
     ) -> DataFrame:
         """
         Given a DataFrame containing `user_input` columns, generates responses from the given model
         and returns a new DataFrame containing its answers in the `response` column.
+
+        Args:
+            questions: (DataFrame):
+                Questions and reference answers to be returned with the responses from the model
+            model_config: (ModelConfig):
+                Configuration settings for the model when getting responses.
+            openai_client (openai.Client | None, optional):
+                The client to use when generating questions from the model, must be compatible with the OpenAI API.
+        Returns:
+            DataFrame with user_input, reference, and response columns; the responses for each user_input come from the model
         """
         # initialize response to write into
         updated_df = questions.copy()
```
```diff
@@ -247,17 +227,17 @@ def _generate_answers_from_model(
             messages: List[ChatCompletionMessageParam] = [
                 {
                     "role": "system",
-                    "content": student_model.system_prompt,
+                    "content": model_config.system_prompt,
                 },
                 {"role": "user", "content": qna["user_input"]},
             ]
-            response = student_openai_client.chat.completions.create(
+            response = openai_client.chat.completions.create(
                 messages=messages,
-                model=student_model.model_name,
+                model=model_config.model_name,
                 # specify the seed so we can at least try to have some reproducibility when the clients support it
-                seed=42,
-                max_tokens=student_model.max_tokens,
-                temperature=student_model.temperature,
+                seed=model_config.seed,
+                max_tokens=model_config.max_tokens,
+                temperature=model_config.temperature,
             )
             updated_df.at[i, "response"] = response.choices[0].message.content
         return updated_df
```
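Taken together, the commit decouples answer generation from evaluation, which is presumably what makes the class easier to drive from `ilab`. An end-to-end sketch under stated assumptions: the endpoint, keys, and model names are placeholders, and the no-argument `RagasEvaluator()` construction is an assumption, since the constructor isn't shown in this diff:

```python
# End-to-end sketch, not from the commit. Placeholders: base_url, api keys, model names.
from openai import OpenAI
from pandas import DataFrame

from instructlab.eval.ragas import ModelConfig, RagasEvaluator

questions = DataFrame(
    [{"user_input": "What is RAG?", "reference": "Retrieval-augmented generation ..."}]
)

# Step 1: generation is now an explicit static call instead of something run() does implicitly.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="none")  # hypothetical local server
answered = RagasEvaluator.generate_answers_from_model(
    questions,
    model_config=ModelConfig(model_name="my-model", system_prompt="You are a helpful assistant."),
    openai_client=client,
)

# Step 2: run() takes the fully populated DataFrame directly.
evaluator = RagasEvaluator()  # assumption: defaults suffice; the constructor isn't shown here
result = evaluator.run(
    dataset=answered,
    judge_model_name="gpt-4o",      # the documented default judge
    judge_openai_api_key="sk-...",  # placeholder
)
```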
