Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions BASETEN.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Baseten gpu-operator fork
This fork is a temporary workaround for a race condition that causes crashes (`fatal error: concurrent map read and map write`).

## Custom build image
Use the Dockerfile in the root directory. We use the official base image of the compatible version (e.g. nvcr.io/nvidia/gpu-operator:v23.9.1) and then override the gpu-operator binary. This removes the need to do all the dependency packaging that is required to build a working image (see the /docker folder for the upstream Dockerfile).

Edit the `make` target in the Dockerfile depending on whether you want the normal binary (`gpu-operator`) or the binary with the race detector (`gpu-operator-race`).


## Compile binary with golang race detector
Use the `gpu-operator-race` target.

The race detector requires CGO. To get around glibc dynamic-linking issues (the image would need a compatible library installed), we use static linking, which forces the binary to include its dependencies.
14 changes: 14 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Two-stage build: compile the gpu-operator binary in a Go toolchain image, then
# drop it on top of the official gpu-operator image so that only the binary is replaced.

# Stage 1: build the binary.
FROM golang:1.21 AS builder

WORKDIR /
# Copy the module files and download dependencies before copying the rest of the
# sources, so the download step is a separate image layer from the source copy.
COPY go.mod go.sum ./
RUN go mod download
COPY . .
# Make target that produces the binary; this one is the race-detector build.
RUN make gpu-operator-race

# Stage 2: start from the official operator image and override its binary.
FROM nvcr.io/nvidia/gpu-operator:v23.9.1
WORKDIR /
# The build above writes the binary to /gpu-operator in the builder stage.
COPY --from=builder /gpu-operator /usr/bin/

# NOTE(review): the gpu-operator user is presumably provided by the base image — confirm.
USER gpu-operator
ENTRYPOINT ["/usr/bin/gpu-operator"]
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,11 @@ gpu-operator:
CGO_ENABLED=0 GOOS=$(GOOS) \
go build -ldflags "-s -w -X $(VERSION_PKG).gitCommit=$(GIT_COMMIT) -X $(VERSION_PKG).version=$(VERSION)" -o gpu-operator main.go

# Build the operator binary with the Go race detector enabled.
# -race is what actually instruments the binary; it requires cgo, hence CGO_ENABLED=1.
# CGO_LDFLAGS="-static" requests static linking of the C dependencies so the binary
# does not rely on a matching C library being installed in the runtime image.
gpu-operator-race:
	CGO_ENABLED=1 GOOS=$(GOOS) CGO_LDFLAGS="-static" \
	go build -race -ldflags "-s -w -X $(VERSION_PKG).gitCommit=$(GIT_COMMIT) -X $(VERSION_PKG).version=$(VERSION)" -o gpu-operator ./cmd/gpu-operator/...


# Run against the configured Kubernetes cluster in ~/.kube/config
run: generate check manifests
go run ./main.go
Expand Down
17 changes: 17 additions & 0 deletions controllers/clusterpolicy_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,13 +21,18 @@ import (
"fmt"

"github.com/go-logr/logr"
apiconfigv1 "github.com/openshift/api/config/v1"
apiimagev1 "github.com/openshift/api/image/v1"
secv1 "github.com/openshift/api/security/v1"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/client-go/util/workqueue"

"time"
Expand Down Expand Up @@ -403,5 +408,17 @@ func (r *ClusterPolicyReconciler) SetupWithManager(ctx context.Context, mgr ctrl
return fmt.Errorf("failed to add index key: %w", err)
}

// [BASETEN]
// AddToScheme writes to the maps inside the Scheme object. That write races with concurrent
// reads, so it must not be called once the controller has started reading the Scheme.
//
// The controller-runtime pattern is to build the scheme as part of manager/controller setup.
// https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/scheme
// "in the entrypoint for your manager, assemble the scheme containing exactly the types you need, panicing if scheme registration failed"
utilruntime.Must(promv1.AddToScheme(r.Scheme))
utilruntime.Must(secv1.Install(r.Scheme))
utilruntime.Must(apiconfigv1.Install(r.Scheme))
utilruntime.Must(apiimagev1.Install(r.Scheme))

return nil
}
10 changes: 0 additions & 10 deletions controllers/state_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,10 @@ import (
"path/filepath"
"strings"

secv1 "github.com/openshift/api/security/v1"
promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"

gpuv1 "github.com/NVIDIA/gpu-operator/api/v1"

"github.com/go-logr/logr"
apiconfigv1 "github.com/openshift/api/config/v1"
apiimagev1 "github.com/openshift/api/image/v1"
configv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1"
"golang.org/x/mod/semver"
corev1 "k8s.io/api/core/v1"
Expand Down Expand Up @@ -783,11 +778,6 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP
n.k8sVersion = k8sVersion
n.rec.Log.Info("Kubernetes version detected", "version", k8sVersion)

utilruntime.Must(promv1.AddToScheme(reconciler.Scheme))
utilruntime.Must(secv1.Install(reconciler.Scheme))
utilruntime.Must(apiconfigv1.Install(reconciler.Scheme))
utilruntime.Must(apiimagev1.Install(reconciler.Scheme))

n.operatorMetrics = initOperatorMetrics(n)
n.rec.Log.Info("Operator metrics initialized.")

Expand Down