Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions .github/workflows/validate.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -55,9 +55,10 @@ jobs:
run: |
uv sync --group docs

# Builds the same way github-pages.yml deploys, so doc-build breakage
# (and, with -W, doc warnings) is caught on PRs instead of only on the
# deploy-to-main step. This job does not deploy.
# Builds the same way github-pages.yml deploys, so doc-build breakage is
# caught on PRs instead of only on the deploy-to-main step. -W turns doc
# warnings into errors (--keep-going reports them all before failing) so
# they can't silently accumulate. This job does not deploy.
- name: Build docs
run: |
make html
make html SPHINXOPTS="-W --keep-going"
22 changes: 16 additions & 6 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,11 @@ def linkcode_resolve(domain, info):
# "stub file not found" warnings those tables' :toctree: would otherwise emit.
numpydoc_show_class_members = False
autodoc_typehints = "signature"
# Some models hold fields that have no JSON-schema representation (e.g.
# FunctionsCollection.functions is a list[Callable]). Coerce rather than warn so
# autodoc-pydantic renders a schema for the serializable fields instead of
# emitting a build warning for the model.
autodoc_pydantic_model_show_json_error_strategy = "coerce"
autodoc_default_options = {
"members": True,
"undoc-members": True,
Expand All @@ -189,23 +194,28 @@ def linkcode_resolve(domain, info):
def skip_peewee_internals(app, what, name, obj, skip, options):
    """Hide peewee-generated noise from the API docs.

    peewee's model metaclass adds several kinds of members to every model
    class that aren't useful in the generated reference:

    - a per-model ``DoesNotExist`` exception (e.g. ``MetricDoesNotExist``),
    - a ``<fk>_id`` alias for every foreign key (e.g. ``dataset_id`` alongside
      ``dataset``). The alias shares the same ``Field`` object as the FK, whose
      ``.name`` is the FK field name, so we can detect it by name mismatch, and
    - a back-reference accessor for every relation pointing at the model (e.g.
      ``Dataset.messages`` from ``Message.dataset``). These render as bare
      names with no docstring or type, and they trip docutils warnings, so we
      drop them.

    Genuine fields and methods are left untouched. (Inherited members are
    excluded separately via ``inherited-members: False`` below — note that
    these generated members are defined on the model class itself, not
    inherited, which is why they need explicit skipping.)

    NOTE(review): the signature matches Sphinx's ``autodoc-skip-member``
    event handler; presumably it is connected to that event elsewhere in
    this file — confirm.

    Args:
        app: Unused by this function.
        what: Unused by this function.
        name: Name of the member being considered.
        obj: The member object itself.
        skip: The skip decision passed in by the caller; returned unchanged
            when none of the rules above match.
        options: Unused by this function.

    Returns:
        ``True`` when the member is one of the peewee-generated members
        described above, otherwise the incoming ``skip`` value.
    """
    # Per-model exception class generated by peewee.
    if name == "DoesNotExist":
        return True
    # ``<fk>_id`` alias: same Field object as the FK, but exposed under a
    # different attribute name than the field's own ``.name``.
    if isinstance(obj, pw.ForeignKeyField) and name != obj.name:
        return True
    # Back-reference accessor created for relations pointing at the model.
    if isinstance(obj, pw.BackrefAccessor):
        return True
    return skip


Expand Down
2 changes: 1 addition & 1 deletion src/flexeval/__about__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.5.2"
__version__ = "0.5.3"
9 changes: 5 additions & 4 deletions src/flexeval/classes/jsonview.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,10 +70,11 @@ def refresh_from_model(self):
class JsonView:
"""Descriptor that provides dict-like access to a JSON text field.

Example:
class SomeModel(pw.Model):
some_field = pw.TextField(default="{}")
some_field_dict = JsonView(text_field_attr_name="some_field")
Example::

class SomeModel(pw.Model):
some_field = pw.TextField(default="{}")
some_field_dict = JsonView(text_field_attr_name="some_field")
"""

def __init__(self, text_field_attr_name):
Expand Down
19 changes: 10 additions & 9 deletions src/flexeval/compute_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -407,15 +407,16 @@ def process_thread_dependency_graph(
return evaluated_metrics

def compute_metrics(self, object: Union[Thread, Turn, Message, ToolCall]):
"""we've defined a variable called metrics_to_evaluate
it's a list we need to loop through
each entry looks like this
{
'name': 'string_length',
'type': 'function',
'kwargs': {},
'depends_on': []
}
"""Loop through ``metrics_to_evaluate``.

Each entry looks like this::

{
'name': 'string_length',
'type': 'function',
'kwargs': {},
'depends_on': []
}
"""
# we'll keep the results in a list
# for each new metric, if it has dependencies, we'll need to make sure they're met - otherwise we won't run it
Expand Down
18 changes: 10 additions & 8 deletions src/flexeval/configuration/completion_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,27 @@
and produce conversational turns (aka completions) as output.

When writing a new function, the arguments must include, at minimum:
* conversation_history - list of dictionaries with keys ("role","content"), whose values are strings
* kwargs - dictionary of optional values that can probably be ignored

* ``conversation_history`` - list of dictionaries with keys ("role", "content"), whose values are strings
* ``kwargs`` - dictionary of optional values that can probably be ignored

Other arguments can be added, but then must also be specified
in the "completion_llm" section of the evals.yaml config.

The outputs must conform to the structure described here:
https://platform.openai.com/docs/guides/text-generation/chat-completions-api
with the following format:
The outputs must conform to the `structure described here
<https://platform.openai.com/docs/guides/text-generation/chat-completions-api>`_,
with the following format::

completion = {
"choices": [
{
"message":{
"message": {
"content": MY_CONTENT_HERE,
"role":"assistant"
"role": "assistant"
}
}
]
}

"""

import json
Expand Down
62 changes: 31 additions & 31 deletions src/flexeval/configuration/function_metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,18 +39,17 @@
def process_single_message(
    message: str,
) -> Union[int, float, dict[str, Union[int, float]]]:
    """Process a single conversational message and return the desired output.

    This definition has an empty body (``pass``) and therefore returns
    ``None`` when called; presumably it serves as a signature template for
    message-level metric functions — confirm against the rest of the module.

    Args:
        message (str): a single conversational message as a string.
            NOTE: Metrics that take a string as input are valid at the Turn
            and Message levels.

    Returns:
        An integer (e.g., ``2``), a floating point number (e.g., ``2.8``), or a
        dictionary of metric/value pairs
        (e.g., ``{'metric1': value1, 'metric2': value2}``).
    """
    pass

Expand All @@ -61,19 +60,19 @@ def process_conversation(
) -> Union[
int, float, dict[str, Union[int, float]], list[dict[str, Union[int, float]]]
]:
"""
Process an entire conversation and return the desired output
Args:
conversation (list): an entire conversation as a list
NOTE: Metrics that take a list as input are valid at the Thread
and Turn levels.
Returns:
an integer, e.g., 2 \
or a floating point number, e.g., 2.8 \
or a dictionary of metric/value pairs, e.g. {'metric1':value1, 'metric2':value2}\
or a list of dictionaries. The key can be either 'role' or 'metric'. \
e.g., [{"role":role1, "value":value1}, {"role":role2, "value":value2}, ...]
"""Process an entire conversation and return the desired output.

Args:
conversation (list): an entire conversation as a list.
NOTE: Metrics that take a list as input are valid at the Thread
and Turn levels.

Returns:
An integer (e.g., ``2``), a floating point number (e.g., ``2.8``),
a dictionary of metric/value pairs
(e.g., ``{'metric1': value1, 'metric2': value2}``), or a list of
dictionaries where the key can be either 'role' or 'metric'
(e.g., ``[{"role": role1, "value": value1}, ...]``).
"""
pass

Expand Down Expand Up @@ -511,13 +510,14 @@ def count_errors(object: Union[Thread, Turn, Message, ToolCall]) -> dict:
If a Turn, Message, or ToolCall, ditto.

It does this by iterating through ToolCalls and identifying whether there are
entries like "*_errors" in tool_call.additional_kwargs
entries like ``*_errors`` in tool_call.additional_kwargs

If a ToolCall, returns 1 if there is an error of each type::

If a ToolCall, returns 1 if there is an error of each type
{
"python_errors": 3,
"javascript_errors": 1
}
{
"python_errors": 3,
"javascript_errors": 1
}
"""
if isinstance(object, ToolCall):
return {
Expand Down
Loading