DigitalHarborFoundation · levon003 · Jun 13, 2026 · Jun 13, 2026
diff --git a/.github/workflows/validate.yaml b/.github/workflows/validate.yaml
@@ -55,9 +55,10 @@ jobs:
         run: |
           uv sync --group docs
 
-      # Builds the same way github-pages.yml deploys, so doc-build breakage
-      # (and, with -W, doc warnings) is caught on PRs instead of only on the
-      # deploy-to-main step. This job does not deploy.
+      # Builds the same way github-pages.yml deploys, so doc-build breakage is
+      # caught on PRs instead of only on the deploy-to-main step. -W turns doc
+      # warnings into errors (--keep-going reports them all before failing) so
+      # they can't silently accumulate. This job does not deploy.
       - name: Build docs
         run: |
-          make html
+          make html SPHINXOPTS="-W --keep-going"
diff --git a/docs/conf.py b/docs/conf.py
@@ -177,6 +177,11 @@ def linkcode_resolve(domain, info):
 # "stub file not found" warnings those tables' :toctree: would otherwise emit.
 numpydoc_show_class_members = False
 autodoc_typehints = "signature"
+# Some models hold fields that have no JSON-schema representation (e.g.
+# FunctionsCollection.functions is a list[Callable]). Coerce rather than warn so
+# autodoc-pydantic renders a schema for the serializable fields instead of
+# emitting a build warning for the model.
+autodoc_pydantic_model_show_json_error_strategy = "coerce"
 autodoc_default_options = {
     "members": True,
     "undoc-members": True,
@@ -189,23 +194,28 @@ def linkcode_resolve(domain, info):
 def skip_peewee_internals(app, what, name, obj, skip, options):
     """Hide peewee-generated noise from the API docs.
 
-    peewee's model metaclass adds two kinds of members to every model class
+    peewee's model metaclass adds several kinds of members to every model class
     that aren't useful in the generated reference:
 
-    - a per-model ``DoesNotExist`` exception (e.g. ``MetricDoesNotExist``), and
+    - a per-model ``DoesNotExist`` exception (e.g. ``MetricDoesNotExist``),
     - a ``<fk>_id`` alias for every foreign key (e.g. ``dataset_id`` alongside
       ``dataset``). The alias shares the same ``Field`` object as the FK, whose
-      ``.name`` is the FK field name, so we can detect it by name mismatch.
+      ``.name`` is the FK field name, so we can detect it by name mismatch, and
+    - a back-reference accessor for every relation pointing at the model (e.g.
+      ``Dataset.messages`` from ``Message.dataset``). These render as bare names
+      with no docstring or type, and they trip docutils warnings, so we drop them.
 
     Genuine fields and methods are left untouched. (Inherited members are
-    excluded separately via ``inherited-members: False`` below — note that
-    peewee's per-model ``DoesNotExist`` and ``_id`` accessors are defined on the
-    model class itself, not inherited, which is why they need explicit skipping.)
+    excluded separately via ``inherited-members: False`` below — note that these
+    generated members are defined on the model class itself, not inherited, which
+    is why they need explicit skipping.)
     """
     if name == "DoesNotExist":
         return True
     if isinstance(obj, pw.ForeignKeyField) and name != obj.name:
         return True
+    if isinstance(obj, pw.BackrefAccessor):
+        return True
     return skip
 
 

diff --git a/src/flexeval/__about__.py b/src/flexeval/__about__.py
@@ -1 +1 @@
-__version__ = "0.5.2"
+__version__ = "0.5.3"
diff --git a/src/flexeval/classes/jsonview.py b/src/flexeval/classes/jsonview.py
@@ -70,10 +70,11 @@ def refresh_from_model(self):
 class JsonView:
     """Descriptor that provides dict-like access to a JSON text field.
 
-    Example:
-    class SomeModel(pw.Model):
-        some_field = pw.TextField(default="{}")
-        some_field_dict = JsonView(text_field_attr_name="some_field")
+    Example::
+
+        class SomeModel(pw.Model):
+            some_field = pw.TextField(default="{}")
+            some_field_dict = JsonView(text_field_attr_name="some_field")
     """
 
     def __init__(self, text_field_attr_name):

diff --git a/src/flexeval/compute_metrics.py b/src/flexeval/compute_metrics.py
@@ -407,15 +407,16 @@ def process_thread_dependency_graph(
         return evaluated_metrics
 
     def compute_metrics(self, object: Union[Thread, Turn, Message, ToolCall]):
-        """we've defined a variable called metrics_to_evaluate
-        it's a list we need to loop through
-        each entry looks like this
-        {
-            'name': 'string_length',
-            'type': 'function',
-            'kwargs': {},
-            'depends_on': []
-        }
+        """Compute metrics for ``object`` from the ``metrics_to_evaluate`` list.
+
+        Each entry looks like this::
+
+            {
+                'name': 'string_length',
+                'type': 'function',
+                'kwargs': {},
+                'depends_on': []
+            }
         """
         # we'll keep the results in a list
         # for each new metric, if it has dependencies, we'll need to make sure they're met - otherwise we won't run it

diff --git a/src/flexeval/configuration/completion_functions.py b/src/flexeval/configuration/completion_functions.py
@@ -2,25 +2,27 @@
 and produce conversational turns (aka completions) as output.
 
 When writing a new function, the arguments must include, at minimum:
-* conversation_history - list of dictionaries with keys ("role","content"), whose values are strings
-* kwargs - dictionary of optional values that can probably be ignored
+
+* ``conversation_history`` - list of dictionaries with keys ("role", "content"), whose values are strings
+* ``kwargs`` - dictionary of optional values that can probably be ignored
+
 Other arguments can be added, but then must also be specified
 in the "completion_llm" section of the evals.yaml config.
 
-The outputs must conform to the structure described here:
-https://platform.openai.com/docs/guides/text-generation/chat-completions-api
-with the following format:
+The outputs must conform to the `structure described here
+<https://platform.openai.com/docs/guides/text-generation/chat-completions-api>`_,
+with the following format::
+
     completion = {
         "choices": [
             {
-                "message":{
+                "message": {
                     "content": MY_CONTENT_HERE,
-                    "role":"assistant"
+                    "role": "assistant"
                 }
             }
         ]
     }
-
 """
 
 import json

diff --git a/src/flexeval/configuration/function_metrics.py b/src/flexeval/configuration/function_metrics.py
@@ -39,18 +39,17 @@
 def process_single_message(
     message: str,
 ) -> Union[int, float, dict[str, Union[int, float]]]:
-    """
-        Process a single conversational message and return the desired output
-
-        Args: 
-        message (str): a single conversational message as a string
-                NOTE: Metrics that take a string as input are valid at the Turn
-                      and Message levels.
-
-        Returns:
-        an integer (e.g., 2), \
-        or a floating point number (e.g., 2.8), \
-        or a dictionary of metric/value pairs (e.g. {'metric1':value1, 'metric2':value2})
+    """Process a single conversational message and return the desired output.
+
+    Args:
+        message (str): a single conversational message as a string.
+            NOTE: Metrics that take a string as input are valid at the Turn
+            and Message levels.
+
+    Returns:
+        An integer (e.g., ``2``), a floating point number (e.g., ``2.8``), or a
+        dictionary of metric/value pairs
+        (e.g., ``{'metric1': value1, 'metric2': value2}``).
     """
     pass
 
@@ -61,19 +60,19 @@ def process_conversation(
 ) -> Union[
     int, float, dict[str, Union[int, float]], list[dict[str, Union[int, float]]]
 ]:
-    """
-        Process an entire conversation and return the desired output
-        
-        Args: 
-        conversation (list): an entire conversation as a list
-                NOTE: Metrics that take a list as input are valid at the Thread
-                      and Turn levels.
-        Returns: 
-        an integer, e.g., 2 \
-        or a floating point number, e.g., 2.8 \
-        or a dictionary of metric/value pairs, e.g. {'metric1':value1, 'metric2':value2}\
-        or a list of dictionaries. The key can be either 'role' or 'metric'. \
-            e.g., [{"role":role1, "value":value1}, {"role":role2, "value":value2}, ...]
+    """Process an entire conversation and return the desired output.
+
+    Args:
+        conversation (list): an entire conversation as a list.
+            NOTE: Metrics that take a list as input are valid at the Thread
+            and Turn levels.
+
+    Returns:
+        An integer (e.g., ``2``), a floating point number (e.g., ``2.8``),
+        a dictionary of metric/value pairs
+        (e.g., ``{'metric1': value1, 'metric2': value2}``), or a list of
+        dictionaries where the key can be either 'role' or 'metric'
+        (e.g., ``[{"role": role1, "value": value1}, ...]``).
     """
     pass
 
@@ -511,13 +510,14 @@ def count_errors(object: Union[Thread, Turn, Message, ToolCall]) -> dict:
     If a Turn, Message, or ToolCall, ditto.
 
     It does this by iterating through ToolCalls and identifying whether there are
-    entries like "*_errors" in tool_call.additional_kwargs
+    entries like ``*_errors`` in ``tool_call.additional_kwargs``.
+
+    If a ToolCall, returns 1 for each type of error that is present::
 
-    If a ToolCall, returns 1 if there is an error of each type
-    {
-        "python_errors": 3,
-        "javascript_errors": 1
-    }
+        {
+            "python_errors": 3,
+            "javascript_errors": 1
+        }
     """
     if isinstance(object, ToolCall):
         return {