Add table driven evals example #29

Merged · 4 commits · Jan 14, 2025
4 changes: 4 additions & 0 deletions .github/workflows/ci.yml
@@ -79,6 +79,10 @@ jobs:
      run: |
        go test -run TestEval ./...
        evals | tee evals.txt >> $GITHUB_STEP_SUMMARY
+     env:
+       OPENAI_KEY: ${{ secrets.OPENAI_KEY }}
+       ANTHROPIC_KEY: ${{ secrets.ANTHROPIC_KEY }}
+       GOOGLE_KEY: ${{ secrets.GOOGLE_KEY }}

    - name: Upload evals.db
      uses: actions/upload-artifact@v4
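The three keys map to the repository secrets referenced above, so they are empty on forks and in local clones without an .env.test.local file. A minimal sketch of a guard an eval test could use to skip rather than fail in that case (skipWithoutKey is a hypothetical helper, not something this PR adds):

package examples_test

import (
    "os"
    "testing"
)

// skipWithoutKey skips the calling test when the named API key is not set,
// so "go test -run TestEval ./..." degrades gracefully without secrets.
// Hypothetical helper, not part of this PR.
func skipWithoutKey(t *testing.T, name string) {
    t.Helper()
    if os.Getenv(name) == "" {
        t.Skipf("%s is not set, skipping eval", name)
    }
}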
6 changes: 3 additions & 3 deletions README.md
@@ -36,7 +36,7 @@ func TestEvalPrompt(t *testing.T) {
    // Evals only run if "go test" is being run with "-test.run=TestEval", e.g.: "go test -test.run=TestEval ./..."
    eval.Run(t, "answers with a pong", func(e *eval.E) {
        // Initialize our intensely powerful LLM.
-       llm := &llm{response: "plong"}
+       llm := &powerfulLLM{response: "plong"}

        // Send our input to the LLM and get an output back.
        input := "ping"
@@ -58,11 +58,11 @@
    })
}

-type llm struct {
+type powerfulLLM struct {
    response string
}

-func (l *llm) Prompt(request string) string {
+func (l *powerfulLLM) Prompt(request string) string {
    return l.response
}
```
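Nothing in the README example depends on a real model: the test only ever calls Prompt(request string) string, so any type with that method can stand in. Spelled out as an implicit interface (illustrative only; the README never names such an interface):

// Anything with this method can play the LLM's role in the eval,
// which is what makes the fake powerfulLLM above work.
// Illustrative only; this interface is not defined in the PR.
type prompter interface {
    Prompt(request string) string
}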
3 changes: 2 additions & 1 deletion anthropic_test.go
@@ -38,6 +38,7 @@ func TestAnthropicClientCompletion(t *testing.T) {
            MaxTokens: anthropic.F(int64(4)),
        })
        is.NotError(t, err)
-       is.True(t, strings.Contains(fmt.Sprint(res.Content), "Hi"))
+       is.True(t, len(res.Content) > 0)
+       is.True(t, strings.Contains(fmt.Sprint(res.Content[0].Text), "Hi"))
    })
}
2 changes: 1 addition & 1 deletion google_test.go
@@ -32,6 +32,6 @@ func TestGoogleClientCompletion(t *testing.T) {
        res, err := model.GenerateContent(context.Background(), genai.Text("Hi."))
        is.NotError(t, err)
        is.True(t, len(res.Candidates) > 0)
-       is.True(t, strings.Contains(fmt.Sprint(res.Candidates[0].Content.Parts), "Hi"))
+       is.True(t, strings.Contains(fmt.Sprint(res.Candidates[0].Content.Parts[0]), "Hi"))
    })
}
102 changes: 102 additions & 0 deletions internal/examples/llm_test.go
@@ -0,0 +1,102 @@
package examples_test

import (
    "context"
    "fmt"
    "strings"
    "testing"

    "github.com/anthropics/anthropic-sdk-go"
    "github.com/google/generative-ai-go/genai"
    "github.com/openai/openai-go"
    "maragu.dev/env"
    "maragu.dev/llm"
    "maragu.dev/llm/eval"
)

// TestEvalLLMs evaluates different LLMs with the same prompts.
func TestEvalLLMs(t *testing.T) {
    _ = env.Load("../../.env.test.local")

    tests := []struct {
        name     string
        prompt   func(prompt string) string
        expected string
    }{
        {
            name:     "gpt-4o-mini",
            prompt:   gpt4oMini,
            expected: "Hello! How can I assist you today?",
        },
        {
            name:     "gemini-1.5-flash",
            prompt:   gemini15Flash,
            expected: "Hi there! How can I help you today?",
        },
        {
            name:     "claude-3.5-haiku",
            prompt:   claude35Haiku,
            expected: "Hello! How are you doing today? Is there anything I can help you with?",
        },
    }

    for _, test := range tests {
        eval.Run(t, test.name, func(e *eval.E) {
            input := "Hi!"
            output := test.prompt(input)

            sample := eval.Sample{
                Input:    input,
                Output:   output,
                Expected: test.expected,
            }

            result := e.Score(sample, eval.LevenshteinDistanceScorer())

            e.Log(sample, result)
        })
    }
}

func gpt4oMini(prompt string) string {
    client := llm.NewOpenAIClient(llm.NewOpenAIClientOptions{Key: env.GetStringOrDefault("OPENAI_KEY", "")})
    res, err := client.Client.Chat.Completions.New(context.Background(), openai.ChatCompletionNewParams{
        Messages: openai.F([]openai.ChatCompletionMessageParamUnion{
            openai.UserMessage(prompt),
        }),
        Model:       openai.F(openai.ChatModelGPT4oMini),
        Temperature: openai.F(0.0),
    })
    if err != nil {
        panic(err)
    }
    return res.Choices[0].Message.Content
}

func gemini15Flash(prompt string) string {
    client := llm.NewGoogleClient(llm.NewGoogleClientOptions{Key: env.GetStringOrDefault("GOOGLE_KEY", "")})
    model := client.Client.GenerativeModel("models/gemini-1.5-flash-latest")
    var temperature float32 = 0
    model.Temperature = &temperature
    res, err := model.GenerateContent(context.Background(), genai.Text(prompt))
    if err != nil {
        panic(err)
    }
    return strings.TrimSpace(fmt.Sprint(res.Candidates[0].Content.Parts[0]))
}

func claude35Haiku(prompt string) string {
    client := llm.NewAnthropicClient(llm.NewAnthropicClientOptions{Key: env.GetStringOrDefault("ANTHROPIC_KEY", "")})
    res, err := client.Client.Messages.New(context.Background(), anthropic.MessageNewParams{
        Messages: anthropic.F([]anthropic.MessageParam{
            anthropic.NewUserMessage(anthropic.NewTextBlock(prompt)),
        }),
        Model:       anthropic.F(anthropic.ModelClaude3_5HaikuLatest),
        MaxTokens:   anthropic.F(int64(1024)),
        Temperature: anthropic.F(0.0),
    })
    if err != nil {
        panic(err)
    }
    return fmt.Sprint(res.Content[0].Text)
}
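Because the suite is table driven, covering another model only takes one more table row and a matching helper. A hypothetical sketch mirroring gpt4oMini above, reusing the same imports (the gpt4o helper and any expected string are placeholders, not part of this PR):

// gpt4o is a hypothetical helper for another OpenAI model,
// structured exactly like gpt4oMini above. Not part of this PR.
func gpt4o(prompt string) string {
    client := llm.NewOpenAIClient(llm.NewOpenAIClientOptions{Key: env.GetStringOrDefault("OPENAI_KEY", "")})
    res, err := client.Client.Chat.Completions.New(context.Background(), openai.ChatCompletionNewParams{
        Messages: openai.F([]openai.ChatCompletionMessageParamUnion{
            openai.UserMessage(prompt),
        }),
        Model:       openai.F(openai.ChatModelGPT4o),
        Temperature: openai.F(0.0),
    })
    if err != nil {
        panic(err)
    }
    return res.Choices[0].Message.Content
}

The matching table row would then be {name: "gpt-4o", prompt: gpt4o, expected: "..."}, with the expected string filled in from a recorded baseline run.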
6 changes: 3 additions & 3 deletions internal/examples/prompt_test.go
@@ -12,7 +12,7 @@ func TestEvalPrompt(t *testing.T) {
    // Evals only run if "go test" is being run with "-test.run=TestEval", e.g.: "go test -test.run=TestEval ./..."
    eval.Run(t, "answers with a pong", func(e *eval.E) {
        // Initialize our intensely powerful LLM.
-       llm := &llm{response: "plong"}
+       llm := &powerfulLLM{response: "plong"}

        // Send our input to the LLM and get an output back.
        input := "ping"
@@ -34,10 +34,10 @@
    })
}

-type llm struct {
+type powerfulLLM struct {
    response string
}

-func (l *llm) Prompt(request string) string {
+func (l *powerfulLLM) Prompt(request string) string {
    return l.response
}