-
Notifications
You must be signed in to change notification settings - Fork 2
/
embedder.go
225 lines (209 loc) · 7.01 KB
/
embedder.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
// Package raggo provides a high-level interface for text embedding and retrieval
// operations in RAG (Retrieval-Augmented Generation) systems. It simplifies the
// process of converting text into vector embeddings using various providers.
package raggo
import (
"context"
"fmt"
"github.com/teilomillet/raggo/rag"
"github.com/teilomillet/raggo/rag/providers"
)
// EmbeddedChunk is an alias of rag.EmbeddedChunk: a piece of text together
// with its embedding vectors and descriptive metadata.
//
// Fields populated by this package:
//   - Text: the original text content that was embedded
//   - Embeddings: embedding vectors keyed by name (map[string][]float64)
//   - Metadata: additional information about the chunk (map[string]interface{})
//
// Example:
//
//	chunk := EmbeddedChunk{
//		Text: "Sample text content",
//		Embeddings: map[string][]float64{
//			"default": []float64{0.1, 0.2, 0.3},
//		},
//		Metadata: map[string]interface{}{
//			"source":    "document1.txt",
//			"timestamp": time.Now(),
//		},
//	}
type EmbeddedChunk = rag.EmbeddedChunk
// EmbedderOption is an alias of rag.EmbedderOption, the option type accepted
// by NewEmbedder to configure an Embedder.
//
// Options are built with the helpers in this package:
//   - SetEmbedderProvider: choose the embedding service provider
//   - SetEmbedderModel: select the specific embedding model
//   - SetEmbedderAPIKey: configure authentication
//   - SetOption: set a custom, provider-specific option
type EmbedderOption = rag.EmbedderOption
// SetEmbedderProvider returns an EmbedderOption that selects which embedding
// provider NewEmbedder should use. It delegates to rag.SetProvider.
//
// Provider names referenced by this package's documentation are "openai",
// "cohere" and "local"; the rag package decides which names are actually
// accepted.
//
// Example:
//
//	embedder, err := NewEmbedder(
//		SetEmbedderProvider("openai"),
//		SetEmbedderModel("text-embedding-ada-002"),
//	)
func SetEmbedderProvider(provider string) EmbedderOption {
	providerOpt := rag.SetProvider(provider)
	return providerOpt
}
// SetEmbedderModel returns an EmbedderOption that selects the embedding model
// by name. It delegates to rag.SetModel.
//
// Which model names are valid depends on the chosen provider. Names referenced
// by this package's documentation:
//   - OpenAI: "text-embedding-ada-002"
//   - Cohere: "embed-multilingual-v2.0"
//   - Local: depends on the configured models
//
// Example:
//
//	embedder, err := NewEmbedder(
//		SetEmbedderProvider("openai"),
//		SetEmbedderModel("text-embedding-ada-002"),
//	)
func SetEmbedderModel(model string) EmbedderOption {
	modelOpt := rag.SetModel(model)
	return modelOpt
}
// SetEmbedderAPIKey returns an EmbedderOption carrying the API key used to
// authenticate against the embedding service. It delegates to rag.SetAPIKey.
//
// Security note: keep API keys out of version control; read them from the
// environment or a secret manager instead of hard-coding them.
//
// Example:
//
//	embedder, err := NewEmbedder(
//		SetEmbedderProvider("openai"),
//		SetEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
//	)
func SetEmbedderAPIKey(apiKey string) EmbedderOption {
	keyOpt := rag.SetAPIKey(apiKey)
	return keyOpt
}
// SetOption returns an EmbedderOption that stores an arbitrary key/value pair
// for settings not covered by the dedicated Set* helpers. It delegates to
// rag.SetOption; how a given key is interpreted is up to the rag package and
// the selected provider.
//
// Example:
//
//	embedder, err := NewEmbedder(
//		SetEmbedderProvider("openai"),
//		SetOption("timeout", 30*time.Second),
//		SetOption("max_retries", 3),
//	)
func SetOption(key string, value interface{}) EmbedderOption {
	customOpt := rag.SetOption(key, value)
	return customOpt
}
// Embedder is an alias of providers.Embedder, the contract every embedding
// backend satisfies so that providers can be used interchangeably.
//
// This package calls it as Embed(ctx, text), which yields the embedding as a
// []float64 together with an error.
type Embedder = providers.Embedder
// NewEmbedder builds an Embedder from the supplied options by delegating to
// rag.NewEmbedder. Both the Embedder and any error reported by the rag package
// are returned unchanged.
//
// Always check the returned error before using the Embedder.
//
// Example:
//
//	embedder, err := NewEmbedder(
//		SetEmbedderProvider("openai"),
//		SetEmbedderModel("text-embedding-ada-002"),
//		SetEmbedderAPIKey(os.Getenv("OPENAI_API_KEY")),
//	)
//	if err != nil {
//		log.Fatal(err)
//	}
func NewEmbedder(opts ...EmbedderOption) (Embedder, error) {
	embedder, err := rag.NewEmbedder(opts...)
	return embedder, err
}
// EmbeddingService turns text into embeddings using one or more named
// Embedders. Create it with NewEmbeddingService.
type EmbeddingService struct {
// embedders maps a field name to the Embedder used for that field.
// NewEmbeddingService registers a single entry under the key "default".
embedders map[string]Embedder
}
// NewEmbeddingService returns an EmbeddingService whose only registered
// embedder is the given one, stored under the key "default".
//
// The embedder is stored as-is and is not validated here, so make sure
// NewEmbedder succeeded before passing its result in.
//
// Example:
//
//	embedder, err := NewEmbedder(SetEmbedderProvider("openai"))
//	if err != nil {
//		log.Fatal(err)
//	}
//	service := NewEmbeddingService(embedder)
func NewEmbeddingService(embedder Embedder) *EmbeddingService {
	registry := make(map[string]Embedder, 1)
	registry["default"] = embedder
	return &EmbeddingService{embedders: registry}
}
// EmbedChunks generates embeddings for every chunk in chunks and returns one
// rag.EmbeddedChunk per input chunk, in input order.
//
// For each chunk, the chunk's Text is passed to every registered embedder and
// the resulting vector is stored in Embeddings under that embedder's field
// name. The chunk's TokenSize, StartSentence and EndSentence are copied into
// Metadata under the keys "token_size", "start_sentence" and "end_sentence".
//
// The call is all-or-nothing: the first failure aborts processing and returns
// a nil slice with an error naming the field. A nil embedder registered for a
// field is reported as an error rather than causing a panic. An empty input
// yields an empty, non-nil slice and a nil error.
//
// Example:
//
//	chunks := []rag.Chunk{
//		{Text: "First chunk", TokenSize: 10},
//		{Text: "Second chunk", TokenSize: 12},
//	}
//	embedded, err := service.EmbedChunks(ctx, chunks)
func (s *EmbeddingService) EmbedChunks(ctx context.Context, chunks []rag.Chunk) ([]rag.EmbeddedChunk, error) {
	embeddedChunks := make([]rag.EmbeddedChunk, 0, len(chunks))
	for _, chunk := range chunks {
		// One vector per registered embedder, so the final size is known.
		embeddings := make(map[string][]float64, len(s.embedders))
		for field, embedder := range s.embedders {
			// Calling a method on a nil interface panics. This happens when
			// the service was built from an Embedder whose construction
			// error was ignored, so surface it as an ordinary error.
			if embedder == nil {
				return nil, fmt.Errorf("no embedder configured for field %s", field)
			}
			embedding, err := embedder.Embed(ctx, chunk.Text)
			if err != nil {
				return nil, fmt.Errorf("error embedding chunk for field %s: %w", field, err)
			}
			embeddings[field] = embedding
		}
		embeddedChunks = append(embeddedChunks, rag.EmbeddedChunk{
			Text:       chunk.Text,
			Embeddings: embeddings,
			Metadata: map[string]interface{}{
				"token_size":     chunk.TokenSize,
				"start_sentence": chunk.StartSentence,
				"end_sentence":   chunk.EndSentence,
			},
		})
	}
	return embeddedChunks, nil
}
// Embed returns the embedding of a single text string, computed by the
// embedder registered under the key "default".
//
// It returns an error when no default embedder is registered, or when the
// registered value is nil (for example because the service was built from an
// Embedder whose construction error was ignored). Errors from the embedder
// itself are wrapped and returned.
//
// Example:
//
//	text := "Sample text to embed"
//	embedding, err := service.Embed(ctx, text)
//	if err != nil {
//		log.Fatal(err)
//	}
func (s *EmbeddingService) Embed(ctx context.Context, text string) ([]float64, error) {
	// A nil entry counts as missing: calling Embed on a nil interface would
	// panic instead of returning an error the caller can handle.
	embedder, ok := s.embedders["default"]
	if !ok || embedder == nil {
		return nil, fmt.Errorf("no default embedder found")
	}

	embedding, err := embedder.Embed(ctx, text)
	if err != nil {
		return nil, fmt.Errorf("error embedding text: %w", err)
	}
	return embedding, nil
}