Skip to content

Commit 6428470

Browse files
authored
[registry-facade] Do retry if copying a blob fails mid way (#20879)
1 parent c568be5 commit 6428470

File tree

5 files changed

+200
-25
lines changed

5 files changed

+200
-25
lines changed

components/registry-facade/pkg/registry/blob.go

Lines changed: 42 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,13 @@ import (
3434
"github.com/gitpod-io/gitpod/registry-facade/api"
3535
)
3636

37-
var backoffParams = wait.Backoff{
38-
Duration: 100 * time.Millisecond,
39-
Factor: 1.5,
37+
// retrievalBackoffParams defines the backoff parameters for blob retrieval.
38+
// Aiming at ~10 seconds total time for retries
39+
var retrievalBackoffParams = wait.Backoff{
40+
Duration: 1 * time.Second,
41+
Factor: 1.2,
4042
Jitter: 0.2,
41-
Steps: 4,
43+
Steps: 5,
4244
}
4345

4446
func (reg *Registry) handleBlob(ctx context.Context, r *http.Request) http.Handler {
@@ -213,35 +215,49 @@ func (bh *blobHandler) getBlob(w http.ResponseWriter, r *http.Request) {
213215

214216
func (bh *blobHandler) retrieveFromSource(ctx context.Context, src BlobSource, w http.ResponseWriter, r *http.Request) (handled, dontCache bool, err error) {
215217
log.Debugf("retrieving blob %s from %s", bh.Digest, src.Name())
216-
dontCache, mediaType, url, rc, err := src.GetBlob(ctx, bh.Spec, bh.Digest)
217-
if err != nil {
218-
return false, true, xerrors.Errorf("cannnot fetch the blob from source %s: %v", src.Name(), err)
219-
}
220-
if rc != nil {
221-
defer rc.Close()
222-
}
223218

224-
if url != "" {
225-
http.Redirect(w, r, url, http.StatusPermanentRedirect)
226-
return true, true, nil
227-
}
219+
var n int64
220+
t0 := time.Now()
221+
var body bytes.Buffer
222+
var finalMediaType string
223+
224+
// The entire operation is now inside the backoff loop
225+
err = wait.ExponentialBackoffWithContext(ctx, retrievalBackoffParams, func(ctx context.Context) (done bool, err error) {
226+
// 1. GetBlob is now INSIDE the retry loop
227+
var url string
228+
var rc io.ReadCloser
229+
dontCache, finalMediaType, url, rc, err = src.GetBlob(ctx, bh.Spec, bh.Digest)
230+
if err != nil {
231+
log.WithField("blobSource", src.Name()).WithError(err).Warn("error fetching blob from source, retrying...")
232+
return false, nil
233+
}
234+
if rc != nil {
235+
defer rc.Close()
236+
}
228237

229-
w.Header().Set("Content-Type", mediaType)
238+
if url != "" {
239+
http.Redirect(w, r, url, http.StatusPermanentRedirect)
240+
dontCache = true
241+
return true, nil
242+
}
230243

231-
bp := bufPool.Get().(*[]byte)
232-
defer bufPool.Put(bp)
244+
body.Reset()
245+
bp := bufPool.Get().(*[]byte)
246+
defer bufPool.Put(bp)
233247

234-
var n int64
235-
t0 := time.Now()
236-
err = wait.ExponentialBackoffWithContext(ctx, backoffParams, func(ctx context.Context) (done bool, err error) {
237-
n, err = io.CopyBuffer(w, rc, *bp)
248+
// 2. CopyBuffer is also inside the retry loop
249+
n, err = io.CopyBuffer(&body, rc, *bp)
238250
if err == nil {
239251
return true, nil
240252
}
253+
254+
// Check for retryable errors during copy
241255
if errors.Is(err, syscall.ECONNRESET) || errors.Is(err, syscall.EPIPE) {
242-
log.WithField("blobSource", src.Name()).WithField("baseRef", bh.Spec.BaseRef).WithError(err).Warn("retry get blob because of error")
256+
// TODO(gpl): current error seem to be captured by this - but does it make sense to widen this condition?
257+
log.WithField("blobSource", src.Name()).WithField("baseRef", bh.Spec.BaseRef).WithError(err).Warn("retry get blob because of streaming error")
243258
return false, nil
244259
}
260+
245261
return true, err
246262
})
247263

@@ -252,6 +268,9 @@ func (bh *blobHandler) retrieveFromSource(ctx context.Context, src BlobSource, w
252268
return false, true, err
253269
}
254270

271+
w.Header().Set("Content-Type", finalMediaType)
272+
w.Write(body.Bytes())
273+
255274
if bh.Metrics != nil {
256275
bh.Metrics.BlobDownloadCounter.WithLabelValues(src.Name(), "true").Inc()
257276
bh.Metrics.BlobDownloadSpeedHist.WithLabelValues(src.Name()).Observe(float64(n) / time.Since(t0).Seconds())

components/registry-facade/pkg/registry/blob_test.go

Lines changed: 142 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,18 @@ package registry
77
import (
88
"bytes"
99
"context"
10+
"errors"
1011
"fmt"
1112
"io"
1213
"net/http"
1314
"net/http/httptest"
1415
"os"
1516
"path/filepath"
17+
"strings"
1618
"sync"
1719
"syscall"
1820
"testing"
21+
"time"
1922

2023
"github.com/alicebob/miniredis/v2"
2124
"github.com/containerd/containerd/remotes"
@@ -30,6 +33,9 @@ import (
3033
ma "github.com/multiformats/go-multiaddr"
3134
"github.com/opencontainers/go-digest"
3235
redis "github.com/redis/go-redis/v9"
36+
"github.com/stretchr/testify/assert"
37+
"github.com/stretchr/testify/require"
38+
"k8s.io/apimachinery/pkg/util/wait"
3339

3440
rfapi "github.com/gitpod-io/gitpod/registry-facade/api"
3541
)
@@ -250,3 +256,139 @@ func (rw *failFirstResponseWriter) Write(buf []byte) (int, error) {
250256
func (rw *failFirstResponseWriter) WriteHeader(code int) {
251257
rw.code = code
252258
}
259+
260+
// mockBlobSource allows faking BlobSource behavior for tests.
261+
type mockBlobSource struct {
262+
// How many times GetBlob should fail before succeeding.
263+
failCount int
264+
// The error to return on failure.
265+
failError error
266+
267+
// Internal counter for calls.
268+
callCount int
269+
// The data to return on success.
270+
successData string
271+
272+
// Whether to use a reader that fails mid-stream on the first call.
273+
failReaderOnFirstCall bool
274+
// The number of bytes to read successfully before the reader fails.
275+
failAfterBytes int
276+
}
277+
278+
func (m *mockBlobSource) Name() string { return "mock" }
279+
func (m *mockBlobSource) HasBlob(ctx context.Context, details *rfapi.ImageSpec, dgst digest.Digest) bool {
280+
return true
281+
}
282+
283+
func (m *mockBlobSource) GetBlob(ctx context.Context, details *rfapi.ImageSpec, dgst digest.Digest) (dontCache bool, mediaType string, url string, data io.ReadCloser, err error) {
284+
m.callCount++
285+
if m.callCount <= m.failCount {
286+
return false, "", "", nil, m.failError
287+
}
288+
289+
if m.failReaderOnFirstCall && m.callCount == 1 {
290+
return false, "application/octet-stream", "", io.NopCloser(&failingReader{
291+
reader: strings.NewReader(m.successData),
292+
failAfterBytes: m.failAfterBytes,
293+
failError: m.failError,
294+
}), nil
295+
}
296+
297+
return false, "application/octet-stream", "", io.NopCloser(strings.NewReader(m.successData)), nil
298+
}
299+
300+
// failingReader is a reader that fails after a certain point.
301+
type failingReader struct {
302+
reader io.Reader
303+
failAfterBytes int
304+
failError error
305+
bytesRead int
306+
}
307+
308+
func (fr *failingReader) Read(p []byte) (n int, err error) {
309+
if fr.bytesRead >= fr.failAfterBytes {
310+
return 0, fr.failError
311+
}
312+
n, err = fr.reader.Read(p)
313+
if err != nil {
314+
return n, err
315+
}
316+
fr.bytesRead += n
317+
if fr.bytesRead >= fr.failAfterBytes {
318+
// Return the error, but also the bytes read in this call.
319+
return n, fr.failError
320+
}
321+
return n, nil
322+
}
323+
324+
func TestRetrieveFromSource_RetryOnGetBlob(t *testing.T) {
325+
// Arrange
326+
mockSource := &mockBlobSource{
327+
failCount: 2,
328+
failError: errors.New("transient network error"),
329+
successData: "hello world",
330+
}
331+
332+
bh := &blobHandler{
333+
Digest: "sha256:dummy",
334+
Spec: &rfapi.ImageSpec{},
335+
}
336+
337+
// Use short backoff for testing
338+
originalBackoff := retrievalBackoffParams
339+
retrievalBackoffParams = wait.Backoff{
340+
Duration: 1 * time.Millisecond,
341+
Steps: 3,
342+
}
343+
defer func() { retrievalBackoffParams = originalBackoff }()
344+
345+
w := httptest.NewRecorder()
346+
r := httptest.NewRequest("GET", "/v2/...", nil)
347+
348+
// Act
349+
handled, dontCache, err := bh.retrieveFromSource(context.Background(), mockSource, w, r)
350+
351+
// Assert
352+
require.NoError(t, err)
353+
assert.True(t, handled)
354+
assert.False(t, dontCache)
355+
assert.Equal(t, "hello world", w.Body.String())
356+
assert.Equal(t, 3, mockSource.callCount, "Expected GetBlob to be called 3 times (2 failures + 1 success)")
357+
}
358+
359+
func TestRetrieveFromSource_RetryOnCopy(t *testing.T) {
360+
// Arrange
361+
mockSource := &mockBlobSource{
362+
failCount: 0, // GetBlob succeeds immediately
363+
failReaderOnFirstCall: true,
364+
failAfterBytes: 5,
365+
failError: syscall.EPIPE,
366+
successData: "hello world",
367+
}
368+
369+
bh := &blobHandler{
370+
Digest: "sha256:dummy",
371+
Spec: &rfapi.ImageSpec{},
372+
}
373+
374+
// Use short backoff for testing
375+
originalBackoff := retrievalBackoffParams
376+
retrievalBackoffParams = wait.Backoff{
377+
Duration: 1 * time.Millisecond,
378+
Steps: 3,
379+
}
380+
defer func() { retrievalBackoffParams = originalBackoff }()
381+
382+
w := httptest.NewRecorder()
383+
r := httptest.NewRequest("GET", "/v2/...", nil)
384+
385+
// Act
386+
handled, dontCache, err := bh.retrieveFromSource(context.Background(), mockSource, w, r)
387+
388+
// Assert
389+
require.NoError(t, err)
390+
assert.True(t, handled)
391+
assert.False(t, dontCache)
392+
assert.Equal(t, "hello world", w.Body.String())
393+
assert.Equal(t, 2, mockSource.callCount, "Expected GetBlob to be called twice (1st succeeds, copy fails, 2nd succeeds)")
394+
}

memory-bank/activeContext.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,11 +18,13 @@ Building a comprehensive knowledge base of the Gitpod codebase and architecture
1818
- Documented 33 service components and 11 API components
1919
- Enhanced API component documentation with code generation information
2020
- Implemented server readiness probe with database, SpiceDB, and Redis connectivity checks
21+
- **Improved `registry-facade` resilience by implementing a comprehensive retry mechanism for blob retrieval, addressing transient network errors.**
2122

2223
## Next Steps
2324

24-
1. **Component Interactions**: Understand inter-component communication
25-
2. **Development Environment**: Configure local development setup
25+
1. **Monitor `registry-facade`:** Observe the component's behavior with the new retry logic to ensure it correctly handles the previously identified network issues.
26+
2. **Component Interactions**: Understand inter-component communication
27+
3. **Development Environment**: Configure local development setup
2628
3. **Build System**: Gain experience with in-tree and Leeway builds
2729
4. **Component Builds**: Practice building different component types
2830
5. **Initial Tasks**: Identify specific improvement areas

memory-bank/components/registry-facade.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,12 @@ The component acts as an "image layer smuggler," inserting layers into container
3333
- `cmd/run.go`: Implements the main registry service
3434
- `cmd/setup.go`: Handles service setup and configuration
3535
- `pkg/registry/`: Core registry implementation
36+
- `blob.go`: Handles blob retrieval from various sources (local store, IPFS, upstream registries). Contains a resilient retry mechanism to handle transient network errors during both connection and data transfer phases.
37+
38+
## Key Implementation Details
39+
40+
### Blob Retrieval Retry Logic
41+
The `retrieveFromSource` function in `pkg/registry/blob.go` implements an exponential backoff retry mechanism that wraps the entire blob retrieval process. This ensures that transient network errors, such as `TLS handshake timeout` or `connection reset`, that occur during either the initial connection (`GetBlob`) or the data streaming (`io.CopyBuffer`) are retried. This makes the service more resilient to intermittent network issues when fetching blobs from upstream sources like S3.
3642

3743
## Dependencies
3844

memory-bank/progress.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,12 @@ No specific blockers identified yet.
6868

6969
## Recent Progress
7070

71+
### 6/6/2025
72+
- Investigated `registry-facade` 500 errors (CLC-195).
73+
- Analyzed logs and identified TLS handshake timeouts as a root cause.
74+
- Implemented a more resilient retry mechanism in `pkg/registry/blob.go` to handle transient network errors during blob retrieval.
75+
- Updated `registry-facade` component documentation.
76+
7177
### 3/17/2025
7278
- Implemented server readiness probe with database, SpiceDB, and Redis checks
7379
- Created PRD document for the implementation

0 commit comments

Comments
 (0)