diff --git a/BASETEN.md b/BASETEN.md new file mode 100644 index 000000000..3742761ed --- /dev/null +++ b/BASETEN.md @@ -0,0 +1,13 @@ +# Baseten gpu-operator fork +This fork is a temporary workaround for a race condition that is causing crashes (fatal error: concurrent map read and map write). + +## Custom build image +Use the Dockerfile in the root directory. We use the official base image of the compatible version (e.g. nvcr.io/nvidia/gpu-operator:v23.9.1) and then override the gpu-operator binary. By doing this we remove the need to do all the dependency packaging that is required to build a working image (see the /docker folder for their Dockerfile). + +Modify the Dockerfile with the make target depending on whether you want the normal binary or the binary with the race detector. + + +## Compile binary with the Go race detector +Use the `gpu-operator-race` target. + +The race detector requires CGO. To get around the glibc dynamic linking issues (the image needs a compatible lib installed) we use static linking, which forces the binary to include the dependencies. diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 000000000..959297aeb --- /dev/null +++ b/Dockerfile @@ -0,0 +1,14 @@ +FROM golang:1.21 AS builder + +WORKDIR / +COPY go.mod go.sum ./ +RUN go mod download +COPY . . +RUN make gpu-operator-race + +FROM nvcr.io/nvidia/gpu-operator:v23.9.1 +WORKDIR / +COPY --from=builder /gpu-operator /usr/bin/ + +USER gpu-operator +ENTRYPOINT ["/usr/bin/gpu-operator"] diff --git a/Makefile b/Makefile index 858bfebd2..75195e523 100644 --- a/Makefile +++ b/Makefile @@ -100,6 +100,11 @@ gpu-operator: CGO_ENABLED=0 GOOS=$(GOOS) \ go build -ldflags "-s -w -X $(VERSION_PKG).gitCommit=$(GIT_COMMIT) -X $(VERSION_PKG).version=$(VERSION)" -o gpu-operator main.go +gpu-operator-race: + CGO_ENABLED=1 GOOS=$(GOOS) CGO_LDFLAGS="-static" \ + go build -ldflags "-s -w -X $(VERSION_PKG).gitCommit=$(GIT_COMMIT) -X $(VERSION_PKG).version=$(VERSION)" -o gpu-operator ./cmd/gpu-operator/... 
+ + # Run against the configured Kubernetes cluster in ~/.kube/config run: generate check manifests go run ./main.go diff --git a/controllers/clusterpolicy_controller.go b/controllers/clusterpolicy_controller.go index 30b7dec0c..4c65f3e0b 100644 --- a/controllers/clusterpolicy_controller.go +++ b/controllers/clusterpolicy_controller.go @@ -21,6 +21,10 @@ import ( "fmt" "github.com/go-logr/logr" + apiconfigv1 "github.com/openshift/api/config/v1" + apiimagev1 "github.com/openshift/api/image/v1" + secv1 "github.com/openshift/api/security/v1" + promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" appsv1 "k8s.io/api/apps/v1" corev1 "k8s.io/api/core/v1" @@ -28,6 +32,7 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/client-go/util/workqueue" "time" @@ -403,5 +408,17 @@ func (r *ClusterPolicyReconciler) SetupWithManager(ctx context.Context, mgr ctrl return fmt.Errorf("failed to add index key: %w", err) } + // [BASETEN] + // AddToScheme modifies the map in the Scheme object. This write causes race conditions thus + // should not be called once the controller starts using (read) the Scheme. + // + // The controller-runtime pattern is to build the scheme as part of manager/controller setup. 
+ // https://pkg.go.dev/sigs.k8s.io/controller-runtime/pkg/scheme + // "in the entrypoint for your manager, assemble the scheme containing exactly the types you need, panicing if scheme registration failed" + utilruntime.Must(promv1.AddToScheme(r.Scheme)) + utilruntime.Must(secv1.Install(r.Scheme)) + utilruntime.Must(apiconfigv1.Install(r.Scheme)) + utilruntime.Must(apiimagev1.Install(r.Scheme)) + return nil } diff --git a/controllers/state_manager.go b/controllers/state_manager.go index 24e53d584..11902093e 100644 --- a/controllers/state_manager.go +++ b/controllers/state_manager.go @@ -23,15 +23,10 @@ import ( "path/filepath" "strings" - secv1 "github.com/openshift/api/security/v1" - promv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" - utilruntime "k8s.io/apimachinery/pkg/util/runtime" - gpuv1 "github.com/NVIDIA/gpu-operator/api/v1" "github.com/go-logr/logr" apiconfigv1 "github.com/openshift/api/config/v1" - apiimagev1 "github.com/openshift/api/image/v1" configv1 "github.com/openshift/client-go/config/clientset/versioned/typed/config/v1" "golang.org/x/mod/semver" corev1 "k8s.io/api/core/v1" @@ -783,11 +778,6 @@ func (n *ClusterPolicyController) init(ctx context.Context, reconciler *ClusterP n.k8sVersion = k8sVersion n.rec.Log.Info("Kubernetes version detected", "version", k8sVersion) - utilruntime.Must(promv1.AddToScheme(reconciler.Scheme)) - utilruntime.Must(secv1.Install(reconciler.Scheme)) - utilruntime.Must(apiconfigv1.Install(reconciler.Scheme)) - utilruntime.Must(apiimagev1.Install(reconciler.Scheme)) - n.operatorMetrics = initOperatorMetrics(n) n.rec.Log.Info("Operator metrics initialized.")