Skip to content

Commit c1ad1ce

Browse files
committed
add better error handling and retry
1 parent 96503c9 commit c1ad1ce

File tree

6 files changed

+92
-9
lines changed

6 files changed

+92
-9
lines changed

.github/workflows/validation-lambdalabs.yml

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ on:
99
pull_request:
1010
paths:
1111
- 'internal/lambdalabs/**'
12-
- 'pkg/v1/**'
1312
branches: [ main ]
1413

1514
jobs:
@@ -44,7 +43,7 @@ jobs:
4443
LAMBDALABS_API_KEY: ${{ secrets.LAMBDALABS_API_KEY }}
4544
run: |
4645
cd internal/lambdalabs
47-
go test -v -short=false -timeout=20m ./...
46+
go test -v -short=false -timeout=30m ./...
4847
4948
- name: Upload test results
5049
uses: actions/upload-artifact@v4

internal/collections/collections.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,11 @@
11
package collections
22

3+
import (
4+
"fmt"
5+
6+
"github.com/cenkalti/backoff/v4"
7+
)
8+
39
func Flatten[T any](listOfLists [][]T) []T {
410
result := []T{}
511
for _, list := range listOfLists {
@@ -78,3 +84,15 @@ func Filter[T any](list []T, f func(T) bool) []T {
7884
// Ptr returns a pointer to a fresh copy of x. It is handy for taking the
// address of a literal or of a function result.
func Ptr[T any](x T) *T {
	p := new(T)
	*p = x
	return p
}
87+
88+
func RetryWithDataAndAttemptCount[T any](o backoff.OperationWithData[T], b backoff.BackOff) (T, error) {
89+
attemptCount := 0
90+
t, err := backoff.RetryWithData(func() (T, error) {
91+
attemptCount++
92+
return o()
93+
}, b)
94+
if err != nil {
95+
return t, fmt.Errorf("attemptCount %d: %w", attemptCount, err)
96+
}
97+
return t, nil
98+
}

internal/lambdalabs/v1/client.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,11 @@ import (
55
"crypto/sha256"
66
"fmt"
77
"net/http"
8+
"time"
89

910
openapi "github.com/brevdev/cloud/internal/lambdalabs/gen/lambdalabs"
1011
v1 "github.com/brevdev/cloud/pkg/v1"
12+
"github.com/cenkalti/backoff/v4"
1113
)
1214

1315
// LambdaLabsCredential implements the CloudCredential interface for Lambda Labs
@@ -118,3 +120,10 @@ func (c *LambdaLabsClient) makeAuthContext(ctx context.Context) context.Context
118120
UserName: c.apiKey,
119121
})
120122
}
123+
124+
func getBackoff() backoff.BackOff {
125+
bo := backoff.NewExponentialBackOff()
126+
bo.InitialInterval = 1000 * time.Millisecond
127+
bo.MaxElapsedTime = 120 * time.Second
128+
return bo
129+
}

internal/lambdalabs/v1/errors.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
package v1
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"io"
7+
"net/http"
8+
"strings"
9+
10+
openapi "github.com/brevdev/cloud/internal/lambdalabs/gen/lambdalabs"
11+
v1 "github.com/brevdev/cloud/pkg/v1"
12+
"github.com/cenkalti/backoff/v4"
13+
)
14+
15+
func handleAPIError(ctx context.Context, resp *http.Response, err error) error {
16+
body := ""
17+
e, ok := err.(openapi.GenericOpenAPIError)
18+
if ok {
19+
body = string(e.Body())
20+
}
21+
if body == "" {
22+
bodyBytes, errr := io.ReadAll(resp.Body)
23+
if errr != nil {
24+
fmt.Printf("Error reading response body: %v\n", errr)
25+
}
26+
body = string(bodyBytes)
27+
}
28+
outErr := fmt.Errorf("LambdaLabs API error\n%s\n%s:\nErr: %s\n%s", resp.Request.URL, resp.Status, err.Error(), body)
29+
if strings.Contains(body, "instance does not exist") { //nolint:gocritic // ignore
30+
return backoff.Permanent(v1.ErrInstanceNotFound)
31+
} else if strings.Contains(body, "banned you temporarily") {
32+
return outErr
33+
} else if resp.StatusCode < 500 && resp.StatusCode != 429 { // 429 Too Many Requests (use back off)
34+
return backoff.Permanent(outErr)
35+
} else {
36+
return outErr
37+
}
38+
}
39+
40+
func handleErrToCloudErr(e error) error {
41+
if e == nil {
42+
return nil
43+
}
44+
if strings.Contains(e.Error(), "Not enough capacity") || strings.Contains(e.Error(), "insufficient-capacity") { //nolint:gocritic // ignore
45+
return v1.ErrInsufficientResources
46+
} else if strings.Contains(e.Error(), "global/invalid-parameters") && strings.Contains(e.Error(), "Region") && strings.Contains(e.Error(), "does not exist") {
47+
return v1.ErrInsufficientResources
48+
} else {
49+
return e
50+
}
51+
}

internal/lambdalabs/v1/instance.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ func (c *LambdaLabsClient) CreateInstance(ctx context.Context, attrs v1.CreateIn
6666
defer func() { _ = httpResp.Body.Close() }()
6767
}
6868
if err != nil {
69-
return nil, fmt.Errorf("failed to launch instance: %w", err)
69+
return nil, fmt.Errorf("failed to launch instance: %w", handleErrToCloudErr(err))
7070
}
7171

7272
if len(resp.Data.InstanceIds) != 1 {

internal/lambdalabs/v1/instancetype.go

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -90,15 +90,21 @@ func (c *LambdaLabsClient) GetInstanceTypes(ctx context.Context, args v1.GetInst
9090
}
9191

9292
func (c *LambdaLabsClient) getInstanceTypes(ctx context.Context) (*openapi.InstanceTypes200Response, error) {
93-
resp, httpResp, err := c.client.DefaultAPI.InstanceTypes(c.makeAuthContext(ctx)).Execute()
94-
if httpResp != nil {
95-
defer func() { _ = httpResp.Body.Close() }()
96-
}
93+
ilr, err := collections.RetryWithDataAndAttemptCount(func() (*openapi.InstanceTypes200Response, error) {
94+
res, resp, err := c.client.DefaultAPI.InstanceTypes(c.makeAuthContext(ctx)).Execute()
95+
if resp != nil {
96+
defer resp.Body.Close() //nolint:errcheck // ignore because using defer (for some reason HandleErrDefer)
97+
}
98+
if err != nil {
99+
return &openapi.InstanceTypes200Response{}, handleAPIError(ctx, resp, err)
100+
}
101+
return res, nil
102+
}, getBackoff())
97103
if err != nil {
98-
return nil, fmt.Errorf("failed to get instance types: %w", err)
104+
return nil, err
99105
}
100106

101-
return resp, nil
107+
return ilr, nil
102108
}
103109

104110
func parseGPUFromDescription(input string) (v1.GPU, error) {

0 commit comments

Comments
 (0)