diff --git a/prepare/cards/bfcl.py b/prepare/cards/bfcl.py index 1b5159937c..2d8ae2ddff 100644 --- a/prepare/cards/bfcl.py +++ b/prepare/cards/bfcl.py @@ -5,9 +5,11 @@ from unitxt.operators import ( Copy, ExecuteExpression, + FilterByExpression, + FixJsonSchemaOfParameterTypes, Set, ) -from unitxt.stream_operators import JoinStreams +from unitxt.stream_operators import DeleteSplits, JoinStreams from unitxt.test_utils.card import test_card base_path = "https://raw.githubusercontent.com/ShishirPatil/gorilla/70b6a4a2144597b1f99d1f4d3185d35d7ee532a4/berkeley-function-call-leaderboard/data/" @@ -31,9 +33,10 @@ on="id", new_stream_name="test", ), + DeleteSplits(splits=["questions", "answers"]), Copy(field="question/0/0/content", to_field="query"), Copy(field="function", to_field="tools"), - "operators.fix_json_schema", + FixJsonSchemaOfParameterTypes(main_field="tools"), # Process ground truth data in this dataset, which is a provided as a list of options per field, # and convert it into a list of explicit tool calls # @@ -100,11 +103,15 @@ on="id", new_stream_name="test", ), + DeleteSplits(splits=["questions", "answers"]), Copy(field="question/*/0", to_field="dialog"), Copy(field="function", to_field="tools"), - "operators.fix_json_schema", + FixJsonSchemaOfParameterTypes(main_field="tools"), + FilterByExpression( + expression="all(isinstance(v, dict) for d in ground_truth for k, v in d.items())" + ), ExecuteExpression( - expression='[{"name": k, "arguments": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*v.values())]', + expression='[{"name": k, "arguments": dict(zip(v.keys(), vals))} for d in ground_truth for k, v in d.items() for vals in itertools.product(*[[w for w in vval if w != ""] for vval in v.values()])]', to_field="reference_calls", imports_list=["itertools"], ), @@ -138,52 +145,52 @@ test_card(card, strict=False) add_to_catalog(card, f"cards.bfcl.multi_turn.{subset}_v3", overwrite=True) - for subset in [ - "live_relevance", - "live_irrelevance", - ]: - card = TaskCard( - loader=LoadJsonFile( - files={ - "test": base_path + f"BFCL_v3_{subset}.json", - }, - lines=True, - data_classification_policy=["public"], - ), - preprocess_steps=[ - Copy(field="question/*/0", to_field="dialog"), - Copy(field="function", to_field="tools"), - "operators.fix_json_schema", - Set(fields={"reference_calls": []}), - ], - task="tasks.tool_calling.multi_turn", - templates=["templates.tool_calling.multi_turn"], - __description__=( - """The Berkeley function calling leaderboard is a live leaderboard to evaluate the ability of different LLMs to call functions (also referred to as tools). We built this dataset from our learnings to be representative of most users' function calling use-cases, for example, in agents, as a part of enterprise workflows, etc. To this end, our evaluation dataset spans diverse categories, and across multiple languages.""" - ), - __title__=f"""Berkeley Function Calling Leaderboard (Multi Turn Setup) - {subset.replace("_", " ").title()} V3""", - __tags__={ - "annotations_creators": "expert-generated", - "language": ["en"], - "license": "apache-2.0", - "size_categories": ["10K