diff --git a/cmd/activator/main.go b/cmd/activator/main.go
index 8593ccbed5dd..9167ad389078 100644
--- a/cmd/activator/main.go
+++ b/cmd/activator/main.go
@@ -203,7 +203,8 @@ func main() {
 	// Open a WebSocket connection to the autoscaler.
 	autoscalerEndpoint := "ws://" + pkgnet.GetServiceHostname("autoscaler", system.Namespace()) + autoscalerPort
 	logger.Info("Connecting to Autoscaler at ", autoscalerEndpoint)
-	statSink := websocket.NewDurableSendingConnection(autoscalerEndpoint, logger)
+	statSink := websocket.NewDurableSendingConnection(autoscalerEndpoint, logger,
+		activator.AutoscalerConnectionOptions(logger, mp)...) // connect/disconnect callbacks that log and record metrics
 	defer statSink.Shutdown()
 	go activator.ReportStats(logger, statSink, statCh)
 
diff --git a/pkg/activator/metrics.go b/pkg/activator/metrics.go
new file mode 100644
index 000000000000..802bd7c2c285
--- /dev/null
+++ b/pkg/activator/metrics.go
@@ -0,0 +1,70 @@
+/*
+Copyright 2024 The Knative Authors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package activator
+
+import (
+	"go.opentelemetry.io/otel"
+	"go.opentelemetry.io/otel/attribute"
+	"go.opentelemetry.io/otel/metric"
+)
+
+var scopeName = "knative.dev/serving/pkg/activator" // name handed to MeterProvider.Meter when the instruments are created
+
+// peerAttrKey is the attribute key ("peer") that identifies which connection peer a measurement refers to.
+var peerAttrKey = attribute.Key("peer")
+
+// PeerAutoscaler is the peer="autoscaler" attribute attached to measurements about the autoscaler connection.
+var PeerAutoscaler = peerAttrKey.String("autoscaler")
+
+type statReporterMetrics struct { // instruments describing the activator's connection to a peer
+	reachable        metric.Int64Gauge   // 1 = peer reachable, 0 = not reachable
+	connectionErrors metric.Int64Counter // running count of connection errors
+}
+
+func newStatReporterMetrics(mp metric.MeterProvider) *statReporterMetrics { // mp may be nil; panics if an instrument cannot be created
+	var (
+		m        statReporterMetrics
+		err      error
+		provider = mp
+	)
+
+	if provider == nil { // no provider supplied: use the globally registered OpenTelemetry one
+		provider = otel.GetMeterProvider()
+	}
+
+	meter := provider.Meter(scopeName)
+
+	m.reachable, err = meter.Int64Gauge(
+		"kn.activator.reachable",
+		metric.WithDescription("Whether a peer is reachable from the activator (1 = reachable, 0 = not reachable)"),
+		metric.WithUnit("{reachable}"),
+	)
+	if err != nil {
+		panic(err) // instrument creation failure is not recoverable here
+	}
+
+	m.connectionErrors, err = meter.Int64Counter(
+		"kn.activator.connection_errors",
+		metric.WithDescription("Number of connection errors from the activator"),
+		metric.WithUnit("{error}"),
+	)
+	if err != nil {
+		panic(err) // instrument creation failure is not recoverable here
+	}
+
+	return &m
+}
diff --git a/pkg/activator/stat_reporter.go b/pkg/activator/stat_reporter.go
index bfd60719c955..1989bc89c53d 100644
--- a/pkg/activator/stat_reporter.go
+++ b/pkg/activator/stat_reporter.go
@@ -17,9 +17,13 @@ limitations under the License.
 package activator
 
 import (
+	"context"
+
 	"github.com/gorilla/websocket"
+	"go.opentelemetry.io/otel/metric"
 	"go.uber.org/zap"
-	"knative.dev/serving/pkg/autoscaler/metrics"
+	pkgwebsocket "knative.dev/pkg/websocket"
+	asmetrics "knative.dev/serving/pkg/autoscaler/metrics"
 )
 
 // RawSender sends raw byte array messages with a message type
@@ -28,13 +32,31 @@ type RawSender interface {
 	SendRaw(msgType int, msg []byte) error
 }
 
+// AutoscalerConnectionOptions returns two websocket connection options whose callbacks log the state change
+// and record metrics tagged peer="autoscaler": on connect kn.activator.reachable is set to 1; on disconnect it is
+// set to 0 and kn.activator.connection_errors is incremented. A nil mp falls back to otel.GetMeterProvider().
+func AutoscalerConnectionOptions(logger *zap.SugaredLogger, mp metric.MeterProvider) []pkgwebsocket.ConnectionOption {
+	metrics := newStatReporterMetrics(mp) // created once; both callbacks below share these instruments
+	return []pkgwebsocket.ConnectionOption{
+		pkgwebsocket.WithOnConnect(func() {
+			logger.Info("Autoscaler connection established")
+			metrics.reachable.Record(context.Background(), 1, metric.WithAttributes(PeerAutoscaler)) // mark the autoscaler reachable
+		}),
+		pkgwebsocket.WithOnDisconnect(func(err error) {
+			logger.Errorw("Autoscaler connection lost", zap.Error(err))
+			metrics.reachable.Record(context.Background(), 0, metric.WithAttributes(PeerAutoscaler)) // mark the autoscaler unreachable
+			metrics.connectionErrors.Add(context.Background(), 1, metric.WithAttributes(PeerAutoscaler)) // every disconnect counts as one connection error
+		}),
+	}
+}
+
 // ReportStats sends any messages received on the source channel to the sink.
 // The messages are sent on a goroutine to avoid blocking, which means that
 // messages may arrive out of order.
-func ReportStats(logger *zap.SugaredLogger, sink RawSender, source <-chan []metrics.StatMessage) {
+func ReportStats(logger *zap.SugaredLogger, sink RawSender, source <-chan []asmetrics.StatMessage) {
 	for sms := range source {
-		go func(sms []metrics.StatMessage) {
-			wsms := metrics.ToWireStatMessages(sms)
+		go func(sms []asmetrics.StatMessage) {
+			wsms := asmetrics.ToWireStatMessages(sms)
 			b, err := wsms.Marshal()
 			if err != nil {
 				logger.Errorw("Error while marshaling stats", zap.Error(err))
@@ -42,7 +64,9 @@ func ReportStats(logger *zap.SugaredLogger, sink RawSender, source <-chan []metr
 			}
 
 			if err := sink.SendRaw(websocket.BinaryMessage, b); err != nil {
-				logger.Errorw("Error while sending stats", zap.Error(err))
+				logger.Errorw("Autoscaler is not reachable from activator. Stats were not sent.",
+					zap.Error(err),
+					zap.Int("stat_message_count", len(sms))) // size of the batch that was not sent
 			}
 		}(sms)
 	}
diff --git a/pkg/activator/stat_reporter_test.go b/pkg/activator/stat_reporter_test.go
index 785f28d1fa4b..487fb0a007f6 100644
--- a/pkg/activator/stat_reporter_test.go
+++ b/pkg/activator/stat_reporter_test.go
@@ -17,6 +17,7 @@ limitations under the License.
 package activator
 
 import (
+	"errors"
 	"testing"
 	"time"
 
@@ -95,3 +96,41 @@ type sendRawFunc func(msgType int, msg []byte) error
 func (fn sendRawFunc) SendRaw(msgType int, msg []byte) error {
 	return fn(msgType, msg)
 }
+
+func TestReportStatsSendFailure(t *testing.T) {
+	logger := logtesting.TestLogger(t)
+	ch := make(chan []metrics.StatMessage)
+
+	sendErr := errors.New("connection refused")
+	errorReceived := make(chan struct{})
+	sink := sendRawFunc(func(msgType int, msg []byte) error {
+		close(errorReceived) // signal the test that SendRaw was reached (a second call would panic on double close)
+		return sendErr
+	})
+
+	defer close(ch)
+	go ReportStats(logger, sink, ch)
+
+	// Send one batch; ReportStats hands each batch to sink.SendRaw on its own goroutine.
+	ch <- []metrics.StatMessage{{
+		Key: types.NamespacedName{Name: "test-revision"},
+	}}
+
+	// Wait until the sink has been called. NOTE(review): the signal fires before SendRaw returns, so the Errorw in ReportStats may run after this test exits - confirm the test logger tolerates that.
+	select {
+	case <-errorReceived:
+		// Success - SendRaw was invoked and returns sendErr.
+	case <-time.After(2 * time.Second):
+		t.Fatal("SendRaw was not called within timeout")
+	}
+}
+
+func TestAutoscalerConnectionOptions(t *testing.T) {
+	logger := logtesting.TestLogger(t)
+
+	opts := AutoscalerConnectionOptions(logger, nil) // nil provider exercises the otel.GetMeterProvider() fallback
+
+	if len(opts) != 2 { // one on-connect option plus one on-disconnect option
+		t.Errorf("Expected 2 connection options, got %d", len(opts))
+	}
+}