Skip to content

Commit 178f6be

Browse files
committed
Rework evals api
- datasets interface - currently only in-memory datasets, but future-proof for fetching from Braintrust - task and scorers output to record classes - to support passing secondary data
1 parent 6a96b07 commit 178f6be

12 files changed

Lines changed: 268 additions & 99 deletions

File tree

examples/src/main/java/dev/braintrust/examples/ExperimentExample.java

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import com.openai.models.ChatModel;
55
import com.openai.models.chat.completions.ChatCompletionCreateParams;
66
import dev.braintrust.Braintrust;
7-
import dev.braintrust.eval.EvalCase;
7+
import dev.braintrust.eval.DatasetCase;
88
import dev.braintrust.eval.Scorer;
99
import dev.braintrust.instrumentation.openai.BraintrustOpenAI;
1010
import java.util.function.Function;
@@ -37,10 +37,10 @@ public static void main(String[] args) throws Exception {
3737
// will append new cases to
3838
// the same experiment
3939
.cases(
40-
EvalCase.of("strawberry", "fruit"),
41-
EvalCase.of("asparagus", "vegetable"),
42-
EvalCase.of("apple", "fruit"),
43-
EvalCase.of("banana", "fruit"))
40+
DatasetCase.of("strawberry", "fruit"),
41+
DatasetCase.of("asparagus", "vegetable"),
42+
DatasetCase.of("apple", "fruit"),
43+
DatasetCase.of("banana", "fruit"))
4444
.taskFunction(getFoodType)
4545
.scorers(
4646
Scorer.of(
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
package dev.braintrust.eval;
2+
3+
import java.util.List;
4+
import java.util.Optional;
5+
6+
public interface Dataset<INPUT, OUTPUT> {
7+
Cursor<DatasetCase<INPUT, OUTPUT>> openCursor();
8+
9+
String id();
10+
11+
String version();
12+
13+
interface Cursor<CASE> extends AutoCloseable {
14+
/**
15+
* Fetch the next case. Returns empty if there are no more cases to fetch.
16+
*
17+
* <p>Implementations may make external requests to fetch data.
18+
*
19+
* <p>If this method is invoked after {@link #close()} an IllegalStateException will be
20+
* thrown
21+
*/
22+
Optional<CASE> next();
23+
24+
/** close all cursor resources */
25+
void close();
26+
}
27+
28+
/** TODO: document */
29+
@SafeVarargs
30+
static <INPUT, OUTPUT> Dataset<INPUT, OUTPUT> of(DatasetCase<INPUT, OUTPUT>... cases) {
31+
return new DatasetInMemoryImpl<>(List.of(cases));
32+
}
33+
}
Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
package dev.braintrust.eval;
2+
3+
import java.util.List;
4+
import java.util.Map;
5+
import javax.annotation.Nonnull;
6+
7+
/** A single row in a dataset. */
8+
public record DatasetCase<INPUT, OUTPUT>(
9+
INPUT input,
10+
OUTPUT expected,
11+
@Nonnull List<String> tags,
12+
@Nonnull Map<String, Object> metadata) {
13+
public DatasetCase {
14+
if (!metadata.isEmpty()) {
15+
throw new RuntimeException("TODO: metadata support not yet implemented");
16+
}
17+
if (!tags.isEmpty()) {
18+
throw new RuntimeException("TODO: tags support not yet implemented");
19+
}
20+
}
21+
22+
public static <INPUT, OUTPUT> DatasetCase<INPUT, OUTPUT> of(INPUT input, OUTPUT expected) {
23+
return new DatasetCase<>(input, expected, List.of(), Map.of());
24+
}
25+
}
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
package dev.braintrust.eval;
2+
3+
import java.util.List;
4+
import java.util.Optional;
5+
6+
/** A dataset held entirely in memory */
7+
class DatasetInMemoryImpl<INPUT, OUTPUT> implements Dataset<INPUT, OUTPUT> {
8+
private final List<DatasetCase<INPUT, OUTPUT>> cases;
9+
private final String id;
10+
11+
public DatasetInMemoryImpl(List<DatasetCase<INPUT, OUTPUT>> cases) {
12+
this.cases = List.copyOf(cases);
13+
id = "in-memory-dataset<" + this.cases.hashCode() + ">";
14+
}
15+
16+
@Override
17+
public Cursor<DatasetCase<INPUT, OUTPUT>> openCursor() {
18+
return new Cursor<>() {
19+
int nextIndex = 0;
20+
boolean closed = false;
21+
22+
@Override
23+
public Optional<DatasetCase<INPUT, OUTPUT>> next() {
24+
if (closed) {
25+
throw new IllegalStateException("this method may not be invoked after close");
26+
} else if (nextIndex < cases.size()) {
27+
return Optional.of(cases.get(nextIndex++));
28+
} else {
29+
return Optional.empty();
30+
}
31+
}
32+
33+
@Override
34+
public void close() {
35+
closed = true;
36+
}
37+
};
38+
}
39+
40+
@Override
41+
public String id() {
42+
return id;
43+
}
44+
45+
@Override
46+
public String version() {
47+
return "0";
48+
}
49+
}

src/main/java/dev/braintrust/eval/Eval.java

Lines changed: 81 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414
import java.util.function.Function;
1515
import javax.annotation.Nonnull;
1616
import javax.annotation.Nullable;
17-
import lombok.Getter;
1817
import lombok.SneakyThrows;
1918

2019
/**
@@ -33,7 +32,7 @@ public final class Eval<INPUT, OUTPUT> {
3332
private final @Nonnull BraintrustApiClient client;
3433
private final @Nonnull BraintrustApiClient.OrganizationAndProjectInfo orgAndProject;
3534
private final @Nonnull Tracer tracer;
36-
private final @Nonnull List<EvalCase<INPUT, OUTPUT>> evalCases;
35+
private final @Nonnull Dataset<INPUT, OUTPUT> dataset;
3736
private final @Nonnull Task<INPUT, OUTPUT> task;
3837
private final @Nonnull List<Scorer<INPUT, OUTPUT>> scorers;
3938

@@ -52,13 +51,13 @@ private Eval(Builder<INPUT, OUTPUT> builder) {
5251
"invalid project id: " + builder.projectId));
5352
}
5453
this.tracer = Objects.requireNonNull(builder.tracer);
55-
this.evalCases = List.copyOf(builder.evalCases);
54+
this.dataset = builder.dataset;
5655
this.task = Objects.requireNonNull(builder.task);
5756
this.scorers = List.copyOf(builder.scorers);
5857
}
5958

6059
/** Runs the evaluation and returns results. */
61-
public Result run() {
60+
public EvalResult run() {
6261
var experiment =
6362
client.getOrCreateExperiment(
6463
new BraintrustApiClient.CreateExperimentRequest(
@@ -67,36 +66,48 @@ public Result run() {
6766
Optional.empty(),
6867
Optional.empty()));
6968
var experimentID = experiment.id();
70-
var evalCaseResults =
71-
evalCases.stream().map(evalCase -> evalOne(experimentID, evalCase)).toList();
72-
return new Result();
69+
70+
try (var cursor = dataset.openCursor()) {
71+
for (var datsetCase = cursor.next();
72+
datsetCase.isPresent();
73+
datsetCase = cursor.next()) {
74+
evalOne(experimentID, datsetCase.get());
75+
}
76+
}
77+
var experimentUrl =
78+
"%s/experiments/%s"
79+
.formatted(
80+
BraintrustUtils.createProjectURI(config.appUrl(), orgAndProject)
81+
.toASCIIString(),
82+
experimentName);
83+
return new EvalResult(experimentUrl);
7384
}
7485

75-
private EvalCase.Result<INPUT, OUTPUT> evalOne(
76-
String experimentId, EvalCase<INPUT, OUTPUT> evalCase) {
86+
@SneakyThrows
87+
private void evalOne(String experimentId, DatasetCase<INPUT, OUTPUT> datasetCase) {
88+
JSON_MAPPER.writeValueAsString(Map.of("type", "eval"));
7789
var rootSpan =
7890
tracer.spanBuilder("eval") // TODO: allow names for eval cases
7991
.setNoParent() // each eval case is its own trace
8092
.setSpanKind(SpanKind.CLIENT)
8193
.setAttribute(PARENT, "experiment_id:" + experimentId)
82-
.setAttribute("braintrust.span_attributes", "{\"type\":\"eval\"}")
83-
// FIXME: use proper object mapper for json stuff
94+
.setAttribute("braintrust.span_attributes", json(Map.of("type", "eval")))
8495
.setAttribute(
85-
"braintrust.input_json",
86-
"{ \"input\":\"" + evalCase.input() + "\"}")
87-
.setAttribute("braintrust.expected", "\"" + evalCase.expected() + "\"")
96+
"braintrust.input_json", json(Map.of("input", datasetCase.input())))
97+
.setAttribute("braintrust.expected", json(datasetCase.expected()))
8898
.startSpan();
8999
try (var rootScope = BraintrustContext.ofExperiment(experimentId, rootSpan).makeCurrent()) {
90-
final OUTPUT result;
100+
final TaskResult<INPUT, OUTPUT> result;
91101
{ // run task
92102
var taskSpan =
93103
tracer.spanBuilder("task")
94104
.setAttribute(PARENT, "experiment_id:" + experimentId)
95-
.setAttribute("braintrust.span_attributes", "{\"type\":\"task\"}")
105+
.setAttribute(
106+
"braintrust.span_attributes", json(Map.of("type", "task")))
96107
.startSpan();
97108
try (var unused =
98109
BraintrustContext.ofExperiment(experimentId, taskSpan).makeCurrent()) {
99-
result = task.apply(evalCase);
110+
result = task.apply(datasetCase);
100111
} finally {
101112
taskSpan.end();
102113
}
@@ -113,66 +124,40 @@ private EvalCase.Result<INPUT, OUTPUT> evalOne(
113124
var scoreSpan =
114125
tracer.spanBuilder("score")
115126
.setAttribute(PARENT, "experiment_id:" + experimentId)
116-
.setAttribute("braintrust.span_attributes", "{\"type\":\"score\"}")
127+
.setAttribute(
128+
"braintrust.span_attributes", json(Map.of("type", "score")))
117129
.startSpan();
118130
try (var unused =
119131
BraintrustContext.ofExperiment(experimentId, scoreSpan).makeCurrent()) {
120-
// NOTE: linked hash map to preserve ordering. Not in the spec but nice user
121-
// experience
122-
final HashMap<String, Double> nameToScore = new LinkedHashMap<>();
123-
var scores =
124-
scorers.stream()
125-
.map(
126-
scorer -> {
127-
var score = scorer.score(evalCase, result);
128-
if (score < 0.0 || score > 1.0) {
129-
throw new RuntimeException(
130-
"score must be between 0 and 1: "
131-
+ scorer.getName()
132-
+ " : "
133-
+ score);
134-
}
135-
nameToScore.put(scorer.getName(), score);
136-
return score;
137-
})
138-
.toList();
139-
try {
140-
scoreSpan.setAttribute(
141-
"braintrust.scores", JSON_MAPPER.writeValueAsString(nameToScore));
142-
} catch (JsonProcessingException e) {
143-
throw new RuntimeException(e);
144-
}
132+
// linked map to preserve ordering. Not in the spec but nice user experience
133+
final Map<String, Double> nameToScore = new LinkedHashMap<>();
134+
scorers.forEach(
135+
scorer -> {
136+
var scores = scorer.score(result);
137+
scores.forEach(
138+
score -> {
139+
if (score.value() < 0.0 || score.value() > 1.0) {
140+
throw new RuntimeException(
141+
"score must be between 0 and 1: %s : %s"
142+
.formatted(
143+
scorer.getName(), score));
144+
}
145+
nameToScore.put(score.name(), score.value());
146+
});
147+
});
148+
scoreSpan.setAttribute("braintrust.scores", json(nameToScore));
145149
} finally {
146150
scoreSpan.end();
147151
}
148152
}
149-
return new EvalCase.Result<>(evalCase, result);
150153
} finally {
151154
rootSpan.end();
152155
}
153156
}
154157

155-
/** Results of all eval cases of an experiment. */
156-
public class Result {
157-
@Getter private final String experimentUrl;
158-
159-
@SneakyThrows
160-
private Result() {
161-
this.experimentUrl =
162-
"%s/experiments/%s"
163-
.formatted(
164-
BraintrustUtils.createProjectURI(config.appUrl(), orgAndProject)
165-
.toASCIIString(),
166-
experimentName);
167-
}
168-
169-
public String createReportString() {
170-
try {
171-
return "Experiment complete. View results in braintrust: " + experimentUrl;
172-
} catch (Exception e) {
173-
throw new RuntimeException(e);
174-
}
175-
}
158+
@SneakyThrows
159+
private String json(Object o) {
160+
return JSON_MAPPER.writeValueAsString(o);
176161
}
177162

178163
/** Creates a new eval builder. */
@@ -182,12 +167,12 @@ public static <INPUT, OUTPUT> Builder<INPUT, OUTPUT> builder() {
182167

183168
/** Builder for creating evaluations with fluent API. */
184169
public static final class Builder<INPUT, OUTPUT> {
170+
public @Nonnull Dataset<INPUT, OUTPUT> dataset;
185171
private @Nonnull String experimentName = "unnamed-java-eval";
186172
private @Nullable BraintrustConfig config;
187173
private @Nullable BraintrustApiClient apiClient;
188174
private @Nullable String projectId;
189175
private @Nullable Tracer tracer = null;
190-
private @Nonnull List<EvalCase<INPUT, OUTPUT>> evalCases = List.of();
191176
private @Nullable Task<INPUT, OUTPUT> task;
192177
private @Nonnull List<Scorer<INPUT, OUTPUT>> scorers = List.of();
193178

@@ -201,15 +186,13 @@ public Eval<INPUT, OUTPUT> build() {
201186
if (projectId == null) {
202187
projectId = config.defaultProjectId().orElse(null);
203188
}
204-
if (evalCases.isEmpty()) {
205-
throw new RuntimeException("must provide at least one eval case");
206-
}
207189
if (scorers.isEmpty()) {
208190
throw new RuntimeException("must provide at least one scorer");
209191
}
210192
if (null == apiClient) {
211193
apiClient = BraintrustApiClient.of(config);
212194
}
195+
Objects.requireNonNull(dataset);
213196
Objects.requireNonNull(task);
214197
return new Eval<>(this);
215198
}
@@ -239,10 +222,28 @@ public Builder<INPUT, OUTPUT> tracer(Tracer tracer) {
239222
return this;
240223
}
241224

225+
/**
 * Sets the dataset whose cases the eval will iterate over.
 *
 * <p>Replaces any dataset set earlier on this builder. {@code build()} rejects a builder whose
 * dataset is still {@code null}.
 *
 * @param dataset the dataset to evaluate
 * @return this builder
 */
public Builder<INPUT, OUTPUT> dataset(Dataset<INPUT, OUTPUT> dataset) {
226+
this.dataset = dataset;
227+
return this;
228+
}
229+
230+
/** Deprecated. Use {@link #cases(DatasetCase[])} or {@link #dataset(Dataset)} instead */
231+
@Deprecated
242232
@SafeVarargs
243233
public final Builder<INPUT, OUTPUT> cases(EvalCase<INPUT, OUTPUT>... cases) {
244-
this.evalCases = List.of(cases);
245-
return this;
234+
return cases(
235+
Arrays.stream(cases)
236+
.map(evalCase -> DatasetCase.of(evalCase.input(), evalCase.expected()))
237+
.toList()
238+
.toArray(new DatasetCase[0]));
239+
}
240+
241+
@SafeVarargs
242+
public final Builder<INPUT, OUTPUT> cases(DatasetCase<INPUT, OUTPUT>... cases) {
243+
if (cases.length == 0) {
244+
throw new RuntimeException("must provide at least one case");
245+
}
246+
return dataset(Dataset.of(cases));
246247
}
247248

248249
public Builder<INPUT, OUTPUT> task(Task<INPUT, OUTPUT> task) {
@@ -251,7 +252,15 @@ public Builder<INPUT, OUTPUT> task(Task<INPUT, OUTPUT> task) {
251252
}
252253

253254
public Builder<INPUT, OUTPUT> taskFunction(Function<INPUT, OUTPUT> taskFn) {
254-
return task(evalCase -> taskFn.apply(evalCase.input()));
255+
return task(
256+
new Task<>() {
257+
@Override
258+
public TaskResult<INPUT, OUTPUT> apply(
259+
DatasetCase<INPUT, OUTPUT> datasetCase) {
260+
var result = taskFn.apply(datasetCase.input());
261+
return new TaskResult<>(result, datasetCase);
262+
}
263+
});
255264
}
256265

257266
@SafeVarargs

0 commit comments

Comments
 (0)