1414import java .util .function .Function ;
1515import javax .annotation .Nonnull ;
1616import javax .annotation .Nullable ;
17- import lombok .Getter ;
1817import lombok .SneakyThrows ;
1918
2019/**
@@ -33,7 +32,7 @@ public final class Eval<INPUT, OUTPUT> {
3332 private final @ Nonnull BraintrustApiClient client ;
3433 private final @ Nonnull BraintrustApiClient .OrganizationAndProjectInfo orgAndProject ;
3534 private final @ Nonnull Tracer tracer ;
36- private final @ Nonnull List < EvalCase < INPUT , OUTPUT >> evalCases ;
35+ private final @ Nonnull Dataset < INPUT , OUTPUT > dataset ;
3736 private final @ Nonnull Task <INPUT , OUTPUT > task ;
3837 private final @ Nonnull List <Scorer <INPUT , OUTPUT >> scorers ;
3938
@@ -52,13 +51,13 @@ private Eval(Builder<INPUT, OUTPUT> builder) {
5251 "invalid project id: " + builder .projectId ));
5352 }
5453 this .tracer = Objects .requireNonNull (builder .tracer );
55- this .evalCases = List . copyOf ( builder .evalCases ) ;
54+ this .dataset = builder .dataset ;
5655 this .task = Objects .requireNonNull (builder .task );
5756 this .scorers = List .copyOf (builder .scorers );
5857 }
5958
6059 /** Runs the evaluation and returns results. */
61- public Result run () {
60+ public EvalResult run () {
6261 var experiment =
6362 client .getOrCreateExperiment (
6463 new BraintrustApiClient .CreateExperimentRequest (
@@ -67,36 +66,48 @@ public Result run() {
6766 Optional .empty (),
6867 Optional .empty ()));
6968 var experimentID = experiment .id ();
70- var evalCaseResults =
71- evalCases .stream ().map (evalCase -> evalOne (experimentID , evalCase )).toList ();
72- return new Result ();
69+
70+ try (var cursor = dataset .openCursor ()) {
71+ for (var datsetCase = cursor .next ();
72+ datsetCase .isPresent ();
73+ datsetCase = cursor .next ()) {
74+ evalOne (experimentID , datsetCase .get ());
75+ }
76+ }
77+ var experimentUrl =
78+ "%s/experiments/%s"
79+ .formatted (
80+ BraintrustUtils .createProjectURI (config .appUrl (), orgAndProject )
81+ .toASCIIString (),
82+ experimentName );
83+ return new EvalResult (experimentUrl );
7384 }
7485
75- private EvalCase .Result <INPUT , OUTPUT > evalOne (
76- String experimentId , EvalCase <INPUT , OUTPUT > evalCase ) {
86+ @ SneakyThrows
87+ private void evalOne (String experimentId , DatasetCase <INPUT , OUTPUT > datasetCase ) {
88+ JSON_MAPPER .writeValueAsString (Map .of ("type" , "eval" ));
7789 var rootSpan =
7890 tracer .spanBuilder ("eval" ) // TODO: allow names for eval cases
7991 .setNoParent () // each eval case is its own trace
8092 .setSpanKind (SpanKind .CLIENT )
8193 .setAttribute (PARENT , "experiment_id:" + experimentId )
82- .setAttribute ("braintrust.span_attributes" , "{\" type\" :\" eval\" }" )
83- // FIXME: use proper object mapper for json stuff
94+ .setAttribute ("braintrust.span_attributes" , json (Map .of ("type" , "eval" )))
8495 .setAttribute (
85- "braintrust.input_json" ,
86- "{ \" input\" :\" " + evalCase .input () + "\" }" )
87- .setAttribute ("braintrust.expected" , "\" " + evalCase .expected () + "\" " )
96+ "braintrust.input_json" , json (Map .of ("input" , datasetCase .input ())))
97+ .setAttribute ("braintrust.expected" , json (datasetCase .expected ()))
8898 .startSpan ();
8999 try (var rootScope = BraintrustContext .ofExperiment (experimentId , rootSpan ).makeCurrent ()) {
90- final OUTPUT result ;
100+ final TaskResult < INPUT , OUTPUT > result ;
91101 { // run task
92102 var taskSpan =
93103 tracer .spanBuilder ("task" )
94104 .setAttribute (PARENT , "experiment_id:" + experimentId )
95- .setAttribute ("braintrust.span_attributes" , "{\" type\" :\" task\" }" )
105+ .setAttribute (
106+ "braintrust.span_attributes" , json (Map .of ("type" , "task" )))
96107 .startSpan ();
97108 try (var unused =
98109 BraintrustContext .ofExperiment (experimentId , taskSpan ).makeCurrent ()) {
99- result = task .apply (evalCase );
110+ result = task .apply (datasetCase );
100111 } finally {
101112 taskSpan .end ();
102113 }
@@ -113,66 +124,40 @@ private EvalCase.Result<INPUT, OUTPUT> evalOne(
113124 var scoreSpan =
114125 tracer .spanBuilder ("score" )
115126 .setAttribute (PARENT , "experiment_id:" + experimentId )
116- .setAttribute ("braintrust.span_attributes" , "{\" type\" :\" score\" }" )
127+ .setAttribute (
128+ "braintrust.span_attributes" , json (Map .of ("type" , "score" )))
117129 .startSpan ();
118130 try (var unused =
119131 BraintrustContext .ofExperiment (experimentId , scoreSpan ).makeCurrent ()) {
120- // NOTE: linked hash map to preserve ordering. Not in the spec but nice user
121- // experience
122- final HashMap <String , Double > nameToScore = new LinkedHashMap <>();
123- var scores =
124- scorers .stream ()
125- .map (
126- scorer -> {
127- var score = scorer .score (evalCase , result );
128- if (score < 0.0 || score > 1.0 ) {
129- throw new RuntimeException (
130- "score must be between 0 and 1: "
131- + scorer .getName ()
132- + " : "
133- + score );
134- }
135- nameToScore .put (scorer .getName (), score );
136- return score ;
137- })
138- .toList ();
139- try {
140- scoreSpan .setAttribute (
141- "braintrust.scores" , JSON_MAPPER .writeValueAsString (nameToScore ));
142- } catch (JsonProcessingException e ) {
143- throw new RuntimeException (e );
144- }
132+ // linked map to preserve ordering. Not in the spec but nice user experience
133+ final Map <String , Double > nameToScore = new LinkedHashMap <>();
134+ scorers .forEach (
135+ scorer -> {
136+ var scores = scorer .score (result );
137+ scores .forEach (
138+ score -> {
139+ if (score .value () < 0.0 || score .value () > 1.0 ) {
140+ throw new RuntimeException (
141+ "score must be between 0 and 1: %s : %s"
142+ .formatted (
143+ scorer .getName (), score ));
144+ }
145+ nameToScore .put (score .name (), score .value ());
146+ });
147+ });
148+ scoreSpan .setAttribute ("braintrust.scores" , json (nameToScore ));
145149 } finally {
146150 scoreSpan .end ();
147151 }
148152 }
149- return new EvalCase .Result <>(evalCase , result );
150153 } finally {
151154 rootSpan .end ();
152155 }
153156 }
154157
155- /** Results of all eval cases of an experiment. */
156- public class Result {
157- @ Getter private final String experimentUrl ;
158-
159- @ SneakyThrows
160- private Result () {
161- this .experimentUrl =
162- "%s/experiments/%s"
163- .formatted (
164- BraintrustUtils .createProjectURI (config .appUrl (), orgAndProject )
165- .toASCIIString (),
166- experimentName );
167- }
168-
169- public String createReportString () {
170- try {
171- return "Experiment complete. View results in braintrust: " + experimentUrl ;
172- } catch (Exception e ) {
173- throw new RuntimeException (e );
174- }
175- }
158+ @ SneakyThrows
159+ private String json (Object o ) {
160+ return JSON_MAPPER .writeValueAsString (o );
176161 }
177162
178163 /** Creates a new eval builder. */
@@ -182,12 +167,12 @@ public static <INPUT, OUTPUT> Builder<INPUT, OUTPUT> builder() {
182167
183168 /** Builder for creating evaluations with fluent API. */
184169 public static final class Builder <INPUT , OUTPUT > {
170+ public @ Nonnull Dataset <INPUT , OUTPUT > dataset ;
185171 private @ Nonnull String experimentName = "unnamed-java-eval" ;
186172 private @ Nullable BraintrustConfig config ;
187173 private @ Nullable BraintrustApiClient apiClient ;
188174 private @ Nullable String projectId ;
189175 private @ Nullable Tracer tracer = null ;
190- private @ Nonnull List <EvalCase <INPUT , OUTPUT >> evalCases = List .of ();
191176 private @ Nullable Task <INPUT , OUTPUT > task ;
192177 private @ Nonnull List <Scorer <INPUT , OUTPUT >> scorers = List .of ();
193178
@@ -201,15 +186,13 @@ public Eval<INPUT, OUTPUT> build() {
201186 if (projectId == null ) {
202187 projectId = config .defaultProjectId ().orElse (null );
203188 }
204- if (evalCases .isEmpty ()) {
205- throw new RuntimeException ("must provide at least one eval case" );
206- }
207189 if (scorers .isEmpty ()) {
208190 throw new RuntimeException ("must provide at least one scorer" );
209191 }
210192 if (null == apiClient ) {
211193 apiClient = BraintrustApiClient .of (config );
212194 }
195+ Objects .requireNonNull (dataset );
213196 Objects .requireNonNull (task );
214197 return new Eval <>(this );
215198 }
@@ -239,10 +222,28 @@ public Builder<INPUT, OUTPUT> tracer(Tracer tracer) {
239222 return this ;
240223 }
241224
225+ public Builder <INPUT , OUTPUT > dataset (Dataset <INPUT , OUTPUT > dataset ) {
226+ this .dataset = dataset ;
227+ return this ;
228+ }
229+
230+ /** Deprecated. Use {@link #cases(DatasetCase[])} or {@link #dataset(Dataset)} instead */
231+ @ Deprecated
242232 @ SafeVarargs
243233 public final Builder <INPUT , OUTPUT > cases (EvalCase <INPUT , OUTPUT >... cases ) {
244- this .evalCases = List .of (cases );
245- return this ;
234+ return cases (
235+ Arrays .stream (cases )
236+ .map (evalCase -> DatasetCase .of (evalCase .input (), evalCase .expected ()))
237+ .toList ()
238+ .toArray (new DatasetCase [0 ]));
239+ }
240+
241+ @ SafeVarargs
242+ public final Builder <INPUT , OUTPUT > cases (DatasetCase <INPUT , OUTPUT >... cases ) {
243+ if (cases .length == 0 ) {
244+ throw new RuntimeException ("must provide at least one case" );
245+ }
246+ return dataset (Dataset .of (cases ));
246247 }
247248
248249 public Builder <INPUT , OUTPUT > task (Task <INPUT , OUTPUT > task ) {
@@ -251,7 +252,15 @@ public Builder<INPUT, OUTPUT> task(Task<INPUT, OUTPUT> task) {
251252 }
252253
253254 public Builder <INPUT , OUTPUT > taskFunction (Function <INPUT , OUTPUT > taskFn ) {
254- return task (evalCase -> taskFn .apply (evalCase .input ()));
255+ return task (
256+ new Task <>() {
257+ @ Override
258+ public TaskResult <INPUT , OUTPUT > apply (
259+ DatasetCase <INPUT , OUTPUT > datasetCase ) {
260+ var result = taskFn .apply (datasetCase .input ());
261+ return new TaskResult <>(result , datasetCase );
262+ }
263+ });
255264 }
256265
257266 @ SafeVarargs
0 commit comments