Skip to content

Commit 986009c

Browse files
authored
Updates the baseline evals with embedded 3 large (#2533)
1 parent ef11a09 commit 986009c

17 files changed

+209
-63
lines changed

evals/results/baseline/README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# Baseline Evaluation
2+
3+
This evaluation was done with the application using the following models:
4+
5+
* Chat completion: gpt-4o-mini
6+
* Embedding: text-embedding-3-large (with binary quantization, 1024 dimension reduction, and oversampling)
7+
8+
These are the default models and settings as of May 8, 2025.

evals/results/baseline/config.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"testdata_path": "ground_truth.jsonl",
3-
"results_dir": "results/gpt-4o-mini",
3+
"results_dir": "results/experiment<TIMESTAMP>",
44
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched", "any_citation"],
55
"target_url": "http://localhost:50505/chat",
66
"target_parameters": {

evals/results/baseline/eval_results.jsonl

Lines changed: 50 additions & 50 deletions
Large diffs are not rendered by default.

evals/results/baseline/evaluate_parameters.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"evaluation_gpt_model": "gpt-4o",
3-
"evaluation_timestamp": 1744920281,
3+
"evaluation_timestamp": 1746818372,
44
"testdata_path": "/Users/pamelafox/azure-search-openai-demo/evals/ground_truth.jsonl",
55
"target_url": "http://localhost:50505/chat",
66
"target_parameters": {

evals/results/baseline/summary.json

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,27 +1,27 @@
11
{
22
"gpt_groundedness": {
3-
"pass_count": 44,
4-
"pass_rate": 0.88,
5-
"mean_rating": 4.62
3+
"pass_count": 43,
4+
"pass_rate": 0.86,
5+
"mean_rating": 4.5
66
},
77
"gpt_relevance": {
88
"pass_count": 42,
99
"pass_rate": 0.84,
10-
"mean_rating": 4.12
10+
"mean_rating": 4.22
1111
},
1212
"answer_length": {
13-
"mean": 922.42,
14-
"max": 1616,
13+
"mean": 919.26,
14+
"max": 1647,
1515
"min": 193
1616
},
1717
"latency": {
18-
"mean": 3.14,
19-
"max": 7.583068,
20-
"min": 1.598833
18+
"mean": 4.46,
19+
"max": 15.129978,
20+
"min": 2.465542
2121
},
2222
"citations_matched": {
23-
"total": 25,
24-
"rate": 0.5
23+
"total": 24,
24+
"rate": 0.49
2525
},
2626
"any_citation": {
2727
"total": 50,
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
{
2+
"testdata_path": "ground_truth.jsonl",
3+
"results_dir": "results/gpt-4o-mini",
4+
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched", "any_citation"],
5+
"target_url": "http://localhost:50505/chat",
6+
"target_parameters": {
7+
"overrides": {
8+
"top": 3,
9+
"temperature": 0.3,
10+
"minimum_reranker_score": 0,
11+
"minimum_search_score": 0,
12+
"retrieval_mode": "hybrid",
13+
"semantic_ranker": true,
14+
"semantic_captions": false,
15+
"suggest_followup_questions": false,
16+
"use_oid_security_filter": false,
17+
"use_groups_security_filter": false,
18+
"vector_fields": [
19+
"embedding"
20+
],
21+
"use_gpt4v": false,
22+
"gpt4v_input": "textAndImages",
23+
"seed": 1
24+
}
25+
},
26+
"target_response_answer_jmespath": "message.content",
27+
"target_response_context_jmespath": "context.data_points.text"
28+
}

evals/results/gpt4omini-ada002/eval_results.jsonl

Lines changed: 50 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,27 @@
1+
{
2+
"evaluation_gpt_model": "gpt-4o",
3+
"evaluation_timestamp": 1744920281,
4+
"testdata_path": "/Users/pamelafox/azure-search-openai-demo/evals/ground_truth.jsonl",
5+
"target_url": "http://localhost:50505/chat",
6+
"target_parameters": {
7+
"overrides": {
8+
"top": 3,
9+
"temperature": 0.3,
10+
"minimum_reranker_score": 0,
11+
"minimum_search_score": 0,
12+
"retrieval_mode": "hybrid",
13+
"semantic_ranker": true,
14+
"semantic_captions": false,
15+
"suggest_followup_questions": false,
16+
"use_oid_security_filter": false,
17+
"use_groups_security_filter": false,
18+
"vector_fields": [
19+
"embedding"
20+
],
21+
"use_gpt4v": false,
22+
"gpt4v_input": "textAndImages",
23+
"seed": 1
24+
}
25+
},
26+
"num_questions": null
27+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
{
2+
"gpt_groundedness": {
3+
"pass_count": 44,
4+
"pass_rate": 0.88,
5+
"mean_rating": 4.62
6+
},
7+
"gpt_relevance": {
8+
"pass_count": 42,
9+
"pass_rate": 0.84,
10+
"mean_rating": 4.12
11+
},
12+
"answer_length": {
13+
"mean": 922.42,
14+
"max": 1616,
15+
"min": 193
16+
},
17+
"latency": {
18+
"mean": 3.14,
19+
"max": 7.583068,
20+
"min": 1.598833
21+
},
22+
"citations_matched": {
23+
"total": 25,
24+
"rate": 0.5
25+
},
26+
"any_citation": {
27+
"total": 50,
28+
"rate": 1.0
29+
},
30+
"num_questions": {
31+
"total": 50
32+
}
33+
}

0 commit comments

Comments
 (0)