Skip to content

Commit

Permalink
⚠ Recover panics per default (#2905)
Browse files Browse the repository at this point in the history
* Recover panics per default

Signed-off-by: Stefan Büringer [email protected]

* fix review findings

---------

Signed-off-by: Stefan Büringer [email protected]
  • Loading branch information
sbueringer authored Aug 6, 2024
1 parent 250454b commit 9516c0f
Show file tree
Hide file tree
Showing 10 changed files with 158 additions and 24 deletions.
31 changes: 24 additions & 7 deletions pkg/builder/webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ type WebhookBuilder struct {
gvk schema.GroupVersionKind
mgr manager.Manager
config *rest.Config
recoverPanic bool
recoverPanic *bool
logConstructor func(base logr.Logger, req *admission.Request) logr.Logger
err error
}
Expand Down Expand Up @@ -84,8 +84,9 @@ func (blder *WebhookBuilder) WithLogConstructor(logConstructor func(base logr.Lo
}

// RecoverPanic indicates whether panics caused by the webhook should be recovered.
func (blder *WebhookBuilder) RecoverPanic() *WebhookBuilder {
blder.recoverPanic = true
// Defaults to true.
func (blder *WebhookBuilder) RecoverPanic(recoverPanic bool) *WebhookBuilder {
blder.recoverPanic = &recoverPanic
return blder
}

Expand Down Expand Up @@ -169,10 +170,18 @@ func (blder *WebhookBuilder) registerDefaultingWebhook() {

func (blder *WebhookBuilder) getDefaultingWebhook() *admission.Webhook {
if defaulter := blder.customDefaulter; defaulter != nil {
return admission.WithCustomDefaulter(blder.mgr.GetScheme(), blder.apiType, defaulter).WithRecoverPanic(blder.recoverPanic)
w := admission.WithCustomDefaulter(blder.mgr.GetScheme(), blder.apiType, defaulter)
if blder.recoverPanic != nil {
w = w.WithRecoverPanic(*blder.recoverPanic)
}
return w
}
if defaulter, ok := blder.apiType.(admission.Defaulter); ok {
return admission.DefaultingWebhookFor(blder.mgr.GetScheme(), defaulter).WithRecoverPanic(blder.recoverPanic)
w := admission.DefaultingWebhookFor(blder.mgr.GetScheme(), defaulter)
if blder.recoverPanic != nil {
w = w.WithRecoverPanic(*blder.recoverPanic)
}
return w
}
log.Info(
"skip registering a mutating webhook, object does not implement admission.Defaulter or WithDefaulter wasn't called",
Expand Down Expand Up @@ -200,10 +209,18 @@ func (blder *WebhookBuilder) registerValidatingWebhook() {

func (blder *WebhookBuilder) getValidatingWebhook() *admission.Webhook {
if validator := blder.customValidator; validator != nil {
return admission.WithCustomValidator(blder.mgr.GetScheme(), blder.apiType, validator).WithRecoverPanic(blder.recoverPanic)
w := admission.WithCustomValidator(blder.mgr.GetScheme(), blder.apiType, validator)
if blder.recoverPanic != nil {
w = w.WithRecoverPanic(*blder.recoverPanic)
}
return w
}
if validator, ok := blder.apiType.(admission.Validator); ok {
return admission.ValidatingWebhookFor(blder.mgr.GetScheme(), validator).WithRecoverPanic(blder.recoverPanic)
w := admission.ValidatingWebhookFor(blder.mgr.GetScheme(), validator)
if blder.recoverPanic != nil {
w = w.WithRecoverPanic(*blder.recoverPanic)
}
return w
}
log.Info(
"skip registering a validating webhook, object does not implement admission.Validator or WithValidator wasn't called",
Expand Down
4 changes: 2 additions & 2 deletions pkg/builder/webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ func runTests(admissionReviewVersion string) {

err = WebhookManagedBy(m).
For(&TestDefaulter{Panic: true}).
RecoverPanic().
// RecoverPanic defaults to true.
Complete()
ExpectWithOffset(1, err).NotTo(HaveOccurred())
svr := m.GetWebhookServer()
Expand Down Expand Up @@ -369,7 +369,7 @@ func runTests(admissionReviewVersion string) {

err = WebhookManagedBy(m).
For(&TestValidator{Panic: true}).
RecoverPanic().
RecoverPanic(true).
Complete()
ExpectWithOffset(1, err).NotTo(HaveOccurred())
svr := m.GetWebhookServer()
Expand Down
1 change: 1 addition & 0 deletions pkg/config/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ type Controller struct {

// RecoverPanic indicates whether the panic caused by reconcile should be recovered.
// Defaults to the Controller.RecoverPanic setting from the Manager if unset.
// Defaults to true if Controller.RecoverPanic setting from the Manager is also unset.
RecoverPanic *bool

// NeedLeaderElection indicates whether the controller needs to use leader election.
Expand Down
1 change: 1 addition & 0 deletions pkg/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ type TypedOptions[request comparable] struct {

// RecoverPanic indicates whether the panic caused by reconcile should be recovered.
// Defaults to the Controller.RecoverPanic setting from the Manager if unset.
// Defaults to true if Controller.RecoverPanic setting from the Manager is also unset.
RecoverPanic *bool

// NeedLeaderElection indicates whether the controller needs to use leader election.
Expand Down
11 changes: 8 additions & 3 deletions pkg/internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ type Controller[request comparable] struct {
LogConstructor func(request *request) logr.Logger

// RecoverPanic indicates whether the panic caused by reconcile should be recovered.
// Defaults to true.
RecoverPanic *bool

// LeaderElected indicates whether the controller is leader elected or always running.
Expand All @@ -97,7 +98,9 @@ type Controller[request comparable] struct {
func (c *Controller[request]) Reconcile(ctx context.Context, req request) (_ reconcile.Result, err error) {
defer func() {
if r := recover(); r != nil {
if c.RecoverPanic != nil && *c.RecoverPanic {
ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Inc()

if c.RecoverPanic == nil || *c.RecoverPanic {
for _, fn := range utilruntime.PanicHandlers {
fn(ctx, r)
}
Expand Down Expand Up @@ -269,13 +272,15 @@ const (
)

func (c *Controller[request]) initMetrics() {
ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Set(0)
ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Add(0)
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelError).Add(0)
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeueAfter).Add(0)
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeue).Add(0)
ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelSuccess).Add(0)
ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Add(0)
ctrlmetrics.TerminalReconcileErrors.WithLabelValues(c.Name).Add(0)
ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Add(0)
ctrlmetrics.WorkerCount.WithLabelValues(c.Name).Set(float64(c.MaxConcurrentReconciles))
ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Set(0)
}

func (c *Controller[request]) reconcileHandler(ctx context.Context, req request) {
Expand Down
21 changes: 20 additions & 1 deletion pkg/internal/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,14 @@ var _ = Describe("controller", func() {
Expect(result).To(Equal(reconcile.Result{Requeue: true}))
})

It("should not recover panic if RecoverPanic is false by default", func() {
It("should not recover panic if RecoverPanic is false", func() {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

defer func() {
Expect(recover()).ShouldNot(BeNil())
}()
ctrl.RecoverPanic = ptr.To(false)
ctrl.Do = reconcile.Func(func(context.Context, reconcile.Request) (reconcile.Result, error) {
var res *reconcile.Result
return *res, nil
Expand All @@ -105,6 +106,24 @@ var _ = Describe("controller", func() {
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
})

It("should recover panic if RecoverPanic is true by default", func() {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

defer func() {
Expect(recover()).To(BeNil())
}()
// RecoverPanic defaults to true.
ctrl.Do = reconcile.Func(func(context.Context, reconcile.Request) (reconcile.Result, error) {
var res *reconcile.Result
return *res, nil
})
_, err := ctrl.Reconcile(ctx,
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
Expect(err).To(HaveOccurred())
Expect(err.Error()).To(ContainSubstring("[recovered]"))
})

It("should recover panic if RecoverPanic is true", func() {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
Expand Down
8 changes: 8 additions & 0 deletions pkg/internal/controller/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@ var (
Help: "Total number of terminal reconciliation errors per controller",
}, []string{"controller"})

// ReconcilePanics is a prometheus counter metrics which holds the total
// number of panics from the Reconciler.
ReconcilePanics = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "controller_runtime_reconcile_panics_total",
Help: "Total number of reconciliation panics per controller",
}, []string{"controller"})

// ReconcileTime is a prometheus metric which keeps track of the duration
// of reconciliations.
ReconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{
Expand Down Expand Up @@ -75,6 +82,7 @@ func init() {
ReconcileTotal,
ReconcileErrors,
TerminalReconcileErrors,
ReconcilePanics,
ReconcileTime,
WorkerCount,
ActiveWorkers,
Expand Down
39 changes: 39 additions & 0 deletions pkg/webhook/admission/metrics/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/*
Copyright 2024 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package metrics

import (
"github.com/prometheus/client_golang/prometheus"
"sigs.k8s.io/controller-runtime/pkg/metrics"
)

var (
// WebhookPanics is a prometheus counter metrics which holds the total
// number of panics from webhooks.
WebhookPanics = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "controller_runtime_webhook_panics_total",
Help: "Total number of webhook panics",
}, []string{})
)

func init() {
metrics.Registry.MustRegister(
WebhookPanics,
)
// Init metric.
WebhookPanics.WithLabelValues().Add(0)
}
31 changes: 23 additions & 8 deletions pkg/webhook/admission/webhook.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"k8s.io/apimachinery/pkg/util/json"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
"k8s.io/klog/v2"
admissionmetrics "sigs.k8s.io/controller-runtime/pkg/webhook/admission/metrics"

logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/webhook/internal/metrics"
Expand Down Expand Up @@ -123,7 +124,8 @@ type Webhook struct {
Handler Handler

// RecoverPanic indicates whether the panic caused by webhook should be recovered.
RecoverPanic bool
// Defaults to true.
RecoverPanic *bool

// WithContextFunc will allow you to take the http.Request.Context() and
// add any additional information such as passing the request path or
Expand All @@ -141,8 +143,9 @@ type Webhook struct {
}

// WithRecoverPanic takes a bool flag which indicates whether the panic caused by webhook should be recovered.
// Defaults to true.
func (wh *Webhook) WithRecoverPanic(recoverPanic bool) *Webhook {
wh.RecoverPanic = recoverPanic
wh.RecoverPanic = &recoverPanic
return wh
}

Expand All @@ -151,25 +154,37 @@ func (wh *Webhook) WithRecoverPanic(recoverPanic bool) *Webhook {
// If the webhook is validating type, it delegates the AdmissionRequest to each handler and
// deny the request if anyone denies.
func (wh *Webhook) Handle(ctx context.Context, req Request) (response Response) {
if wh.RecoverPanic {
defer func() {
if r := recover(); r != nil {
defer func() {
if r := recover(); r != nil {
admissionmetrics.WebhookPanics.WithLabelValues().Inc()

if wh.RecoverPanic == nil || *wh.RecoverPanic {
for _, fn := range utilruntime.PanicHandlers {
fn(ctx, r)
}
response = Errored(http.StatusInternalServerError, fmt.Errorf("panic: %v [recovered]", r))
// Note: We explicitly have to set the response UID. Usually that is done via resp.Complete below,
// but if we encounter a panic in wh.Handler.Handle we are never going to reach resp.Complete.
response.UID = req.UID
return
}
}()
}

log := logf.FromContext(ctx)
log.Info(fmt.Sprintf("Observed a panic in webhook: %v", r))
panic(r)
}
}()

reqLog := wh.getLogger(&req)
ctx = logf.IntoContext(ctx, reqLog)

resp := wh.Handler.Handle(ctx, req)
if err := resp.Complete(req); err != nil {
reqLog.Error(err, "unable to encode response")
return Errored(http.StatusInternalServerError, errUnableToEncodeResponse)
resp := Errored(http.StatusInternalServerError, errUnableToEncodeResponse)
// Note: We explicitly have to set the response UID. Usually that is done via resp.Complete.
resp.UID = req.UID
return resp
}

return resp
Expand Down
35 changes: 32 additions & 3 deletions pkg/webhook/admission/webhook_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
authenticationv1 "k8s.io/api/authentication/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
machinerytypes "k8s.io/apimachinery/pkg/types"
"k8s.io/utils/ptr"

logf "sigs.k8s.io/controller-runtime/pkg/log"
"sigs.k8s.io/controller-runtime/pkg/log/zap"
Expand Down Expand Up @@ -199,6 +200,33 @@ var _ = Describe("Admission Webhooks", func() {
})

Describe("panic recovery", func() {
It("should recover panic if RecoverPanic is true by default", func() {
panicHandler := func() *Webhook {
handler := &fakeHandler{
fn: func(ctx context.Context, req Request) Response {
panic("fake panic test")
},
}
webhook := &Webhook{
Handler: handler,
// RecoverPanic defaults to true.
}

return webhook
}

By("setting up a webhook with a panicking handler")
webhook := panicHandler()

By("invoking the webhook")
resp := webhook.Handle(context.Background(), Request{})

By("checking that it errored the request")
Expect(resp.Allowed).To(BeFalse())
Expect(resp.Result.Code).To(Equal(int32(http.StatusInternalServerError)))
Expect(resp.Result.Message).To(Equal("panic: fake panic test [recovered]"))
})

It("should recover panic if RecoverPanic is true", func() {
panicHandler := func() *Webhook {
handler := &fakeHandler{
Expand All @@ -208,7 +236,7 @@ var _ = Describe("Admission Webhooks", func() {
}
webhook := &Webhook{
Handler: handler,
RecoverPanic: true,
RecoverPanic: ptr.To[bool](true),
}

return webhook
Expand All @@ -226,15 +254,16 @@ var _ = Describe("Admission Webhooks", func() {
Expect(resp.Result.Message).To(Equal("panic: fake panic test [recovered]"))
})

It("should not recover panic if RecoverPanic is false by default", func() {
It("should not recover panic if RecoverPanic is false", func() {
panicHandler := func() *Webhook {
handler := &fakeHandler{
fn: func(ctx context.Context, req Request) Response {
panic("fake panic test")
},
}
webhook := &Webhook{
Handler: handler,
Handler: handler,
RecoverPanic: ptr.To[bool](false),
}

return webhook
Expand Down

0 comments on commit 9516c0f

Please sign in to comment.