Add table driven evals example #29

Merged · 4 commits · Jan 14, 2025
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
@@ -79,6 +79,10 @@ jobs:
      run: |
        go test -run TestEval ./...
        evals | tee evals.txt >> $GITHUB_STEP_SUMMARY
+     env:
+       OPENAI_KEY: ${{ secrets.OPENAI_KEY }}
+       ANTHROPIC_KEY: ${{ secrets.ANTHROPIC_KEY }}
+       GOOGLE_KEY: ${{ secrets.GOOGLE_KEY }}

    - name: Upload evals.db
      uses: actions/upload-artifact@v4
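The three keys map to the repository secrets referenced above, so they are empty on forks and in local clones without an .env.test.local file. A minimal sketch of a guard an eval test could use to skip rather than fail in that case (skipWithoutKey is a hypothetical helper, not something this PR adds):

package examples_test

import (
    "os"
    "testing"
)

// skipWithoutKey skips the calling test when the named API key is not set,
// so "go test -run TestEval ./..." degrades gracefully without secrets.
// Hypothetical helper, not part of this PR.
func skipWithoutKey(t *testing.T, name string) {
    t.Helper()
    if os.Getenv(name) == "" {
        t.Skipf("%s is not set, skipping eval", name)
    }
}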
6 changes: 3 additions & 3 deletions README.md
@@ -36,7 +36,7 @@ func TestEvalPrompt(t *testing.T) {
    // Evals only run if "go test" is being run with "-test.run=TestEval", e.g.: "go test -test.run=TestEval ./..."
    eval.Run(t, "answers with a pong", func(e *eval.E) {
        // Initialize our intensely powerful LLM.
-       llm := &llm{response: "plong"}
+       llm := &powerfulLLM{response: "plong"}

        // Send our input to the LLM and get an output back.
        input := "ping"
@@ -58,11 +58,11 @@
    })
}

-type llm struct {
+type powerfulLLM struct {
    response string
}

-func (l *llm) Prompt(request string) string {
+func (l *powerfulLLM) Prompt(request string) string {
    return l.response
}
```
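Nothing in the README example depends on a real model: the test only ever calls Prompt(request string) string, so any type with that method can stand in. Spelled out as an implicit interface (illustrative only; the README never names such an interface):

// Anything with this method can play the LLM's role in the eval,
// which is what makes the fake powerfulLLM above work.
// Illustrative only; this interface is not defined in the PR.
type prompter interface {
    Prompt(request string) string
}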
3 changes: 2 additions & 1 deletion anthropic_test.go
@@ -38,6 +38,7 @@ func TestAnthropicClientCompletion(t *testing.T) {
            MaxTokens: anthropic.F(int64(4)),
        })
        is.NotError(t, err)
-       is.True(t, strings.Contains(fmt.Sprint(res.Content), "Hi"))
+       is.True(t, len(res.Content) > 0)
+       is.True(t, strings.Contains(fmt.Sprint(res.Content[0].Text), "Hi"))
    })
}
2 changes: 1 addition & 1 deletion google_test.go
@@ -32,6 +32,6 @@ func TestGoogleClientCompletion(t *testing.T) {
        res, err := model.GenerateContent(context.Background(), genai.Text("Hi."))
        is.NotError(t, err)
        is.True(t, len(res.Candidates) > 0)
-       is.True(t, strings.Contains(fmt.Sprint(res.Candidates[0].Content.Parts), "Hi"))
+       is.True(t, strings.Contains(fmt.Sprint(res.Candidates[0].Content.Parts[0]), "Hi"))
    })
}
102 changes: 102 additions & 0 deletions internal/examples/llm_test.go
@@ -0,0 +1,102 @@
package examples_test

import (
    "context"
    "fmt"
    "strings"
    "testing"

    "github.com/anthropics/anthropic-sdk-go"
    "github.com/google/generative-ai-go/genai"
    "github.com/openai/openai-go"
    "maragu.dev/env"
    "maragu.dev/llm"
    "maragu.dev/llm/eval"
)

// TestEvalLLMs evaluates different LLMs with the same prompts.
func TestEvalLLMs(t *testing.T) {
    _ = env.Load("../../.env.test.local")

    tests := []struct {
        name     string
        prompt   func(prompt string) string
        expected string
    }{
        {
            name:     "gpt-4o-mini",
            prompt:   gpt4oMini,
            expected: "Hello! How can I assist you today?",
        },
        {
            name:     "gemini-1.5-flash",
            prompt:   gemini15Flash,
            expected: "Hi there! How can I help you today?",
        },
        {
            name:     "claude-3.5-haiku",
            prompt:   claude35Haiku,
            expected: "Hello! How are you doing today? Is there anything I can help you with?",
        },
    }

    for _, test := range tests {
        eval.Run(t, test.name, func(e *eval.E) {
            input := "Hi!"
            output := test.prompt(input)

            sample := eval.Sample{
                Input:    input,
                Output:   output,
                Expected: test.expected,
            }

            result := e.Score(sample, eval.LevenshteinDistanceScorer())

            e.Log(sample, result)
        })
    }
}

func gpt4oMini(prompt string) string {
    client := llm.NewOpenAIClient(llm.NewOpenAIClientOptions{Key: env.GetStringOrDefault("OPENAI_KEY", "")})
    res, err := client.Client.Chat.Completions.New(context.Background(), openai.ChatCompletionNewParams{
        Messages: openai.F([]openai.ChatCompletionMessageParamUnion{
            openai.UserMessage(prompt),
        }),
        Model:       openai.F(openai.ChatModelGPT4oMini),
        Temperature: openai.F(0.0),
    })
    if err != nil {
        panic(err)
    }
    return res.Choices[0].Message.Content
}

func gemini15Flash(prompt string) string {
    client := llm.NewGoogleClient(llm.NewGoogleClientOptions{Key: env.GetStringOrDefault("GOOGLE_KEY", "")})
    model := client.Client.GenerativeModel("models/gemini-1.5-flash-latest")
    var temperature float32 = 0
    model.Temperature = &temperature
    res, err := model.GenerateContent(context.Background(), genai.Text(prompt))
    if err != nil {
        panic(err)
    }
    return strings.TrimSpace(fmt.Sprint(res.Candidates[0].Content.Parts[0]))
}

func claude35Haiku(prompt string) string {
    client := llm.NewAnthropicClient(llm.NewAnthropicClientOptions{Key: env.GetStringOrDefault("ANTHROPIC_KEY", "")})
    res, err := client.Client.Messages.New(context.Background(), anthropic.MessageNewParams{
        Messages: anthropic.F([]anthropic.MessageParam{
            anthropic.NewUserMessage(anthropic.NewTextBlock(prompt)),
        }),
        Model:       anthropic.F(anthropic.ModelClaude3_5HaikuLatest),
        MaxTokens:   anthropic.F(int64(1024)),
        Temperature: anthropic.F(0.0),
    })
    if err != nil {
        panic(err)
    }
    return fmt.Sprint(res.Content[0].Text)
}
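Because the suite is table driven, covering another model only takes one more table row and a matching helper. A hypothetical sketch mirroring gpt4oMini above, reusing the same imports (the gpt4o helper and any expected string are placeholders, not part of this PR):

// gpt4o is a hypothetical helper for another OpenAI model,
// structured exactly like gpt4oMini above. Not part of this PR.
func gpt4o(prompt string) string {
    client := llm.NewOpenAIClient(llm.NewOpenAIClientOptions{Key: env.GetStringOrDefault("OPENAI_KEY", "")})
    res, err := client.Client.Chat.Completions.New(context.Background(), openai.ChatCompletionNewParams{
        Messages: openai.F([]openai.ChatCompletionMessageParamUnion{
            openai.UserMessage(prompt),
        }),
        Model:       openai.F(openai.ChatModelGPT4o),
        Temperature: openai.F(0.0),
    })
    if err != nil {
        panic(err)
    }
    return res.Choices[0].Message.Content
}

The matching table row would then be {name: "gpt-4o", prompt: gpt4o, expected: "..."}, with the expected string filled in from a recorded baseline run.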
6 changes: 3 additions & 3 deletions internal/examples/prompt_test.go
@@ -12,7 +12,7 @@ func TestEvalPrompt(t *testing.T) {
    // Evals only run if "go test" is being run with "-test.run=TestEval", e.g.: "go test -test.run=TestEval ./..."
    eval.Run(t, "answers with a pong", func(e *eval.E) {
        // Initialize our intensely powerful LLM.
-       llm := &llm{response: "plong"}
+       llm := &powerfulLLM{response: "plong"}

        // Send our input to the LLM and get an output back.
        input := "ping"
@@ -34,10 +34,10 @@
    })
}

-type llm struct {
+type powerfulLLM struct {
    response string
}

-func (l *llm) Prompt(request string) string {
+func (l *powerfulLLM) Prompt(request string) string {
    return l.response
}