diff --git a/poetry.lock b/poetry.lock index b6578ec161..feed59c717 100644 --- a/poetry.lock +++ b/poetry.lock @@ -606,7 +606,7 @@ description = "Common protobufs used in Google APIs" optional = true python-versions = ">=3.7" groups = ["main"] -markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"" +markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"opentelemetry-metrics\" or extra == \"all\"" files = [ {file = "googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8"}, {file = "googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257"}, @@ -625,7 +625,7 @@ description = "HTTP/2-based RPC framework" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"" +markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"opentelemetry-metrics\" or extra == \"all\"" files = [ {file = "grpcio-1.78.0-cp310-cp310-linux_armv7l.whl", hash = "sha256:7cc47943d524ee0096f973e1081cb8f4f17a4615f2116882a5f1416e4cfe92b5"}, {file = "grpcio-1.78.0-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:c3f293fdc675ccba4db5a561048cca627b5e7bd1c8a6973ffedabe7d116e22e2"}, @@ -990,7 +990,7 @@ files = [ {file = "importlib_metadata-8.7.1-py3-none-any.whl", hash = "sha256:5a1f80bf1daa489495071efbb095d75a634cf28a8bc299581244063b53176151"}, {file = "importlib_metadata-8.7.1.tar.gz", hash = "sha256:49fef1ae6440c182052f407c8d34a68f72efc36db9ca90dc0113398f2fdde8bb"}, ] -markers = {main = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"", dev = "python_version < \"3.12\" and platform_machine != \"ppc64le\" and platform_machine != \"s390x\""} +markers = {main = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"opentelemetry-metrics\" or extra == \"all\"", dev = "python_version < \"3.12\" and platform_machine != \"ppc64le\" and platform_machine != \"s390x\""} [package.dependencies] zipp = ">=3.20" @@ -1931,7 +1931,7 @@ description = "OpenTelemetry Python API" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"" +markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"opentelemetry-metrics\" or extra == \"all\"" files = [ {file = "opentelemetry_api-1.40.0-py3-none-any.whl", hash = "sha256:82dd69331ae74b06f6a874704be0cfaa49a1650e1537d4a813b86ecef7d0ecf9"}, {file = "opentelemetry_api-1.40.0.tar.gz", hash = "sha256:159be641c0b04d11e9ecd576906462773eb97ae1b657730f0ecf64d32071569f"}, @@ -1948,7 +1948,7 @@ description = "OpenTelemetry Collector Exporters" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"" +markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"opentelemetry-metrics\" or extra == \"all\"" files = [ {file = "opentelemetry_exporter_otlp-1.40.0-py3-none-any.whl", hash = "sha256:48c87e539ec9afb30dc443775a1334cc5487de2f72a770a4c00b1610bf6c697d"}, {file = "opentelemetry_exporter_otlp-1.40.0.tar.gz", hash = "sha256:7caa0870b95e2fcb59d64e16e2b639ecffb07771b6cd0000b5d12e5e4fef765a"}, @@ -1965,7 +1965,7 @@ description = "OpenTelemetry Protobuf encoding" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"" +markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"opentelemetry-metrics\" or extra == \"all\"" files = [ {file = "opentelemetry_exporter_otlp_proto_common-1.40.0-py3-none-any.whl", hash = "sha256:7081ff453835a82417bf38dccf122c827c3cbc94f2079b03bba02a3165f25149"}, {file = "opentelemetry_exporter_otlp_proto_common-1.40.0.tar.gz", hash = "sha256:1cbee86a4064790b362a86601ee7934f368b81cd4cc2f2e163902a6e7818a0fa"}, @@ -1981,7 +1981,7 @@ description = "OpenTelemetry Collector Protobuf over gRPC Exporter" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"" +markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"opentelemetry-metrics\" or extra == \"all\"" files = [ {file = "opentelemetry_exporter_otlp_proto_grpc-1.40.0-py3-none-any.whl", hash = "sha256:2aa0ca53483fe0cf6405087a7491472b70335bc5c7944378a0a8e72e86995c52"}, {file = "opentelemetry_exporter_otlp_proto_grpc-1.40.0.tar.gz", hash = "sha256:bd4015183e40b635b3dab8da528b27161ba83bf4ef545776b196f0fb4ec47740"}, @@ -2010,7 +2010,7 @@ description = "OpenTelemetry Collector Protobuf over HTTP Exporter" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"" +markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"opentelemetry-metrics\" or extra == \"all\"" files = [ {file = "opentelemetry_exporter_otlp_proto_http-1.40.0-py3-none-any.whl", hash = "sha256:a8d1dab28f504c5d96577d6509f80a8150e44e8f45f82cdbe0e34c99ab040069"}, {file = "opentelemetry_exporter_otlp_proto_http-1.40.0.tar.gz", hash = "sha256:db48f5e0f33217588bbc00274a31517ba830da576e59503507c839b38fa0869c"}, @@ -2053,7 +2053,7 @@ description = "OpenTelemetry Python Proto" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"" +markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"opentelemetry-metrics\" or extra == \"all\"" files = [ {file = "opentelemetry_proto-1.40.0-py3-none-any.whl", hash = "sha256:266c4385d88923a23d63e353e9761af0f47a6ed0d486979777fe4de59dc9b25f"}, {file = "opentelemetry_proto-1.40.0.tar.gz", hash = "sha256:03f639ca129ba513f5819810f5b1f42bcb371391405d99c168fe6937c62febcd"}, @@ -2069,7 +2069,7 @@ description = "OpenTelemetry Python SDK" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"" +markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"opentelemetry-metrics\" or extra == \"all\"" files = [ {file = "opentelemetry_sdk-1.40.0-py3-none-any.whl", hash = "sha256:787d2154a71f4b3d81f20524a8ce061b7db667d24e46753f32a7bc48f1c1f3f1"}, {file = "opentelemetry_sdk-1.40.0.tar.gz", hash = "sha256:18e9f5ec20d859d268c7cb3c5198c8d105d073714db3de50b593b8c1345a48f2"}, @@ -2087,7 +2087,7 @@ description = "OpenTelemetry Semantic Conventions" optional = true python-versions = ">=3.9" groups = ["main"] -markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"" +markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"opentelemetry-metrics\" or extra == \"all\"" files = [ {file = "opentelemetry_semantic_conventions-0.61b0-py3-none-any.whl", hash = "sha256:fa530a96be229795f8cef353739b618148b0fe2b4b3f005e60e262926c4d38e2"}, {file = "opentelemetry_semantic_conventions-0.61b0.tar.gz", hash = "sha256:072f65473c5d7c6dc0355b27d6c9d1a679d63b6d4b4b16a9773062cb7e31192a"}, @@ -2302,7 +2302,7 @@ description = "" optional = true python-versions = ">=3.8" groups = ["main"] -markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"" +markers = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"opentelemetry-metrics\" or extra == \"all\"" files = [ {file = "protobuf-5.29.4-cp310-abi3-win32.whl", hash = "sha256:13eb236f8eb9ec34e63fc8b1d6efd2777d062fa6aaa68268fb67cf77f6839ad7"}, {file = "protobuf-5.29.4-cp310-abi3-win_amd64.whl", hash = "sha256:bcefcdf3976233f8a502d265eb65ea740c989bacc6c30a58290ed0e519eb4b8d"}, @@ -3939,7 +3939,7 @@ files = [ {file = "zipp-3.23.0-py3-none-any.whl", hash = "sha256:071652d6115ed432f5ce1d34c336c0adfd6a884660d1e9712a256d3d3bd4b14e"}, {file = "zipp-3.23.0.tar.gz", hash = "sha256:a07157588a12518c9d4034df3fbbee09c814741a33ff63c05fa29d26a2404166"}, ] -markers = {main = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"all\"", dev = "python_version < \"3.12\" and platform_machine != \"ppc64le\" and platform_machine != \"s390x\""} +markers = {main = "extra == \"opentracing-otlp\" or extra == \"opentelemetry-log-handler\" or extra == \"opentelemetry-metrics\" or extra == \"all\"", dev = "python_version < \"3.12\" and platform_machine != \"ppc64le\" and platform_machine != \"s390x\""} [package.extras] check = ["pytest-checkdocs (>=2.4)", "pytest-ruff (>=0.2.1) ; sys_platform != \"cygwin\""] @@ -4038,6 +4038,7 @@ jwt = ["authlib"] matrix-synapse-ldap3 = ["matrix-synapse-ldap3"] oidc = ["authlib"] opentelemetry-log-handler = ["opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk"] +opentelemetry-metrics = ["opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-sdk"] opentracing-jaeger = ["jaeger-client", "opentracing", "thrift", "tornado"] opentracing-otlp = ["opentelemetry-api", "opentelemetry-exporter-otlp", "opentelemetry-opentracing-shim", "opentelemetry-sdk", "opentracing"] postgres = ["psycopg2", "psycopg2cffi", "psycopg2cffi-compat"] @@ -4050,4 +4051,4 @@ url-preview = ["lxml"] [metadata] lock-version = "2.1" python-versions = ">=3.10.0,<4.0.0" -content-hash = "6b14e814692dc21ef76e136cceb6b778a18ba9e54c2766a38da71c1cbb87a779" +content-hash = "90446b4a9b78e5f3166173ef19089cadc70f2b92dcb49e2c0dcecce84976f280" diff --git a/pyproject.toml b/pyproject.toml index 4d6566ab36..03e1ae96ec 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -173,6 +173,11 @@ opentelemetry-log-handler = [ "opentelemetry-sdk==1.40.0", "opentelemetry-exporter-otlp==1.40.0", ] +opentelemetry-metrics = [ + "opentelemetry-api==1.40.0", + "opentelemetry-sdk==1.40.0", + "opentelemetry-exporter-otlp==1.40.0", +] # The duplication here is awful. # diff --git a/synapse/api/auth/__init__.py b/synapse/api/auth/__init__.py index 201c295f06..269106631e 100644 --- a/synapse/api/auth/__init__.py +++ b/synapse/api/auth/__init__.py @@ -20,13 +20,12 @@ # from typing import TYPE_CHECKING, Protocol -from prometheus_client import Histogram - from twisted.web.server import Request from synapse.appservice import ApplicationService from synapse.http.site import SynapseRequest from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Histogram from synapse.types import Requester if TYPE_CHECKING: diff --git a/synapse/app/_base.py b/synapse/app/_base.py index a9a3411b89..850312a7e2 100644 --- a/synapse/app/_base.py +++ b/synapse/app/_base.py @@ -337,9 +337,21 @@ def listen_metrics( Twisted reactor thread between bytecode boundaries and the metrics thread gets scheduled with roughly equal priority to the Twisted reactor thread. + When ``SYNAPSE_METRICS_BACKEND=otlp`` the Prometheus scrape endpoint is not + needed – metrics are pushed via OTLP instead – so this is a no-op. + Returns: List of WSGIServer with the thread they are running on. """ + from synapse.metrics.instruments import METRICS_BACKEND + + if METRICS_BACKEND == "otlp": + logger.info( + "Not starting Prometheus metrics listener because " + "SYNAPSE_METRICS_BACKEND=otlp; metrics are exported via OTLP" + ) + return [] + from prometheus_client import start_http_server as start_http_server_prometheus from synapse.metrics import RegistryProxy @@ -764,6 +776,16 @@ def log_shutdown() -> None: setup_sentry(hs) setup_sdnotify(hs) + # Flush pending OTLP metrics on shutdown. + from synapse.metrics.instruments import METRICS_BACKEND + + if METRICS_BACKEND == "otlp": + from synapse.metrics._otel import shutdown as shutdown_otel_metrics + + hs.get_clock().add_system_event_trigger( + "during", "shutdown", shutdown_otel_metrics + ) + # Register background tasks required by this server. This must be done # somewhat manually due to the background tasks not being registered # unless handlers are instantiated. diff --git a/synapse/app/generic_worker.py b/synapse/app/generic_worker.py index 159cd44237..388004c7b4 100644 --- a/synapse/app/generic_worker.py +++ b/synapse/app/generic_worker.py @@ -193,7 +193,15 @@ def _listen_http(self, listener_config: ListenerConfig) -> None: for res in listener_config.http_options.resources: for name in res.names: if name == "metrics": - resources[METRICS_PREFIX] = MetricsResource(RegistryProxy) + from synapse.metrics.instruments import METRICS_BACKEND + + if METRICS_BACKEND == "otlp": + logger.info( + "Not exposing Prometheus /metrics resource because " + "SYNAPSE_METRICS_BACKEND=otlp; metrics are exported via OTLP" + ) + else: + resources[METRICS_PREFIX] = MetricsResource(RegistryProxy) elif name == "client": resource: Resource = ClientRestResource(self) diff --git a/synapse/app/homeserver.py b/synapse/app/homeserver.py index 2b1760416b..843c65e6c0 100644 --- a/synapse/app/homeserver.py +++ b/synapse/app/homeserver.py @@ -257,10 +257,18 @@ def _configure_named_resource( resources[SERVER_KEY_PREFIX] = KeyResource(self) if name == "metrics" and self.config.metrics.enable_metrics: - metrics_resource: Resource = MetricsResource(RegistryProxy) - if compress: - metrics_resource = gz_wrap(metrics_resource) - resources[METRICS_PREFIX] = metrics_resource + from synapse.metrics.instruments import METRICS_BACKEND + + if METRICS_BACKEND == "otlp": + logger.info( + "Not exposing Prometheus /metrics resource because " + "SYNAPSE_METRICS_BACKEND=otlp; metrics are exported via OTLP" + ) + else: + metrics_resource: Resource = MetricsResource(RegistryProxy) + if compress: + metrics_resource = gz_wrap(metrics_resource) + resources[METRICS_PREFIX] = metrics_resource if name == "replication": resources[REPLICATION_PREFIX] = ReplicationRestResource(self) diff --git a/synapse/app/phone_stats_home.py b/synapse/app/phone_stats_home.py index 7b4bf25c28..e97af3e150 100644 --- a/synapse/app/phone_stats_home.py +++ b/synapse/app/phone_stats_home.py @@ -24,11 +24,10 @@ import sys from typing import TYPE_CHECKING, Mapping, Sized -from prometheus_client import Gauge - from twisted.internet import defer from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Gauge from synapse.types import JsonDict from synapse.util.duration import Duration diff --git a/synapse/appservice/api.py b/synapse/appservice/api.py index 66c962e17d..c394859e79 100644 --- a/synapse/appservice/api.py +++ b/synapse/appservice/api.py @@ -29,7 +29,6 @@ TypeVar, ) -from prometheus_client import Counter from typing_extensions import ParamSpec, TypeGuard from synapse.api.constants import ThirdPartyEntityKind @@ -44,6 +43,7 @@ from synapse.http.client import SimpleHttpClient, is_unknown_endpoint from synapse.logging import opentracing from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter from synapse.types import DeviceListUpdates, JsonDict, JsonMapping, ThirdPartyInstanceID from synapse.util.caches.response_cache import ResponseCache from synapse.util.duration import Duration diff --git a/synapse/config/metrics.py b/synapse/config/metrics.py index d6ff367617..8485870dbe 100644 --- a/synapse/config/metrics.py +++ b/synapse/config/metrics.py @@ -20,6 +20,7 @@ # # +import os from typing import Any import attr @@ -50,6 +51,25 @@ class MetricsConfig(Config): def read_config(self, config: JsonDict, **kwargs: Any) -> None: self.enable_metrics = config.get("enable_metrics", False) + # The metrics backend is authoritative from the env var because metric + # objects are created at import time (before config is read). The yaml + # value is accepted for documentation / validation purposes. + env_backend = os.environ.get("SYNAPSE_METRICS_BACKEND", "prometheus").lower() + yaml_backend = config.get("metrics_backend", "prometheus").lower() + if yaml_backend not in ("prometheus", "otlp"): + raise ConfigError( + "Invalid metrics_backend %r (must be 'prometheus' or 'otlp')" + % yaml_backend + ) + if yaml_backend == "otlp" and env_backend != "otlp": + raise ConfigError( + "metrics_backend is set to 'otlp' in config but the " + "SYNAPSE_METRICS_BACKEND env var is %r. The env var must also " + "be set to 'otlp' because metrics are initialised at import " + "time (before config is read)." % env_backend + ) + self.metrics_backend: str = env_backend + self.report_stats = config.get("report_stats", None) self.report_stats_endpoint = config.get( "report_stats_endpoint", "https://matrix.org/report-usage-stats/push" diff --git a/synapse/federation/federation_client.py b/synapse/federation/federation_client.py index 78a1900c73..2dc9727bc0 100644 --- a/synapse/federation/federation_client.py +++ b/synapse/federation/federation_client.py @@ -40,7 +40,6 @@ ) import attr -from prometheus_client import Counter from synapse.api.constants import Direction, EventContentFields, EventTypes, Membership from synapse.api.errors import ( @@ -71,6 +70,7 @@ from synapse.http.types import QueryParams from synapse.logging.opentracing import SynapseTags, log_kv, set_tag, tag_args, trace from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter from synapse.types import JsonDict, StrCollection, UserID, get_domain_from_id from synapse.util.async_helpers import concurrently_execute from synapse.util.caches.expiringcache import ExpiringCache diff --git a/synapse/federation/federation_server.py b/synapse/federation/federation_server.py index a869f231f3..5268bad188 100644 --- a/synapse/federation/federation_server.py +++ b/synapse/federation/federation_server.py @@ -31,8 +31,6 @@ Mapping, ) -from prometheus_client import Counter, Gauge, Histogram - from twisted.python import failure from synapse.api.constants import ( @@ -80,6 +78,7 @@ ) from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import wrap_as_background_process +from synapse.metrics.instruments import Counter, Gauge, Histogram from synapse.replication.http.federation import ( ReplicationFederationSendEduRestServlet, ) diff --git a/synapse/federation/sender/__init__.py b/synapse/federation/sender/__init__.py index f7240c2f7f..4450a294b6 100644 --- a/synapse/federation/sender/__init__.py +++ b/synapse/federation/sender/__init__.py @@ -141,7 +141,6 @@ ) import attr -from prometheus_client import Counter from twisted.internet import defer @@ -166,6 +165,7 @@ from synapse.metrics.background_process_metrics import ( wrap_as_background_process, ) +from synapse.metrics.instruments import Counter from synapse.types import ( JsonDict, ReadReceipt, diff --git a/synapse/federation/sender/per_destination_queue.py b/synapse/federation/sender/per_destination_queue.py index cdacf16d72..1eedbb051a 100644 --- a/synapse/federation/sender/per_destination_queue.py +++ b/synapse/federation/sender/per_destination_queue.py @@ -26,7 +26,6 @@ from typing import TYPE_CHECKING, Hashable, Iterable import attr -from prometheus_client import Counter from twisted.internet import defer @@ -44,6 +43,7 @@ from synapse.logging.context import PreserveLoggingContext from synapse.logging.opentracing import SynapseTags, set_tag from synapse.metrics import SERVER_NAME_LABEL, sent_transactions_counter +from synapse.metrics.instruments import Counter from synapse.types import JsonDict, ReadReceipt from synapse.util.retryutils import NotRetryingDestination, get_retry_limiter from synapse.visibility import filter_events_for_server diff --git a/synapse/federation/sender/transaction_manager.py b/synapse/federation/sender/transaction_manager.py index 99aa05ebd6..281466398b 100644 --- a/synapse/federation/sender/transaction_manager.py +++ b/synapse/federation/sender/transaction_manager.py @@ -20,8 +20,6 @@ import logging from typing import TYPE_CHECKING -from prometheus_client import Gauge - from synapse.api.constants import EduTypes from synapse.api.errors import HttpResponseException from synapse.events import EventBase @@ -35,6 +33,7 @@ whitelisted_homeserver, ) from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Gauge from synapse.types import JsonDict from synapse.util.json import json_decoder from synapse.util.metrics import measure_func diff --git a/synapse/handlers/appservice.py b/synapse/handlers/appservice.py index c91d2adbe1..fad1166afd 100644 --- a/synapse/handlers/appservice.py +++ b/synapse/handlers/appservice.py @@ -26,8 +26,6 @@ Mapping, ) -from prometheus_client import Counter - from twisted.internet import defer import synapse @@ -44,6 +42,7 @@ from synapse.metrics.background_process_metrics import ( wrap_as_background_process, ) +from synapse.metrics.instruments import Counter from synapse.storage.databases.main.directory import RoomAliasMapping from synapse.types import ( DeviceListUpdates, diff --git a/synapse/handlers/auth.py b/synapse/handlers/auth.py index e2c41fc168..ee8d108f7d 100644 --- a/synapse/handlers/auth.py +++ b/synapse/handlers/auth.py @@ -39,7 +39,6 @@ import attr import bcrypt import unpaddedbase64 -from prometheus_client import Counter from twisted.internet.defer import CancelledError from twisted.web.server import Request @@ -66,6 +65,7 @@ from synapse.logging.context import defer_to_thread from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import run_as_background_process +from synapse.metrics.instruments import Counter from synapse.storage.databases.main.registration import ( LoginTokenExpired, LoginTokenLookupResult, diff --git a/synapse/handlers/federation.py b/synapse/handlers/federation.py index f18cfce9e7..83938c8724 100644 --- a/synapse/handlers/federation.py +++ b/synapse/handlers/federation.py @@ -34,7 +34,6 @@ ) import attr -from prometheus_client import Histogram from signedjson.key import decode_verify_key_bytes from signedjson.sign import verify_signed_json from unpaddedbase64 import decode_base64 @@ -66,6 +65,7 @@ from synapse.logging.context import nested_logging_context from synapse.logging.opentracing import SynapseTags, set_tag, tag_args, trace from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Histogram from synapse.module_api import NOT_SPAM from synapse.storage.databases.main.events_worker import EventRedactBehaviour from synapse.storage.invite_rule import InviteRule diff --git a/synapse/handlers/federation_event.py b/synapse/handlers/federation_event.py index e314180e12..f2803eb54b 100644 --- a/synapse/handlers/federation_event.py +++ b/synapse/handlers/federation_event.py @@ -31,8 +31,6 @@ Sequence, ) -from prometheus_client import Counter, Histogram - from synapse import event_auth from synapse.api.constants import ( EventContentFields, @@ -76,6 +74,7 @@ trace, ) from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter, Histogram from synapse.replication.http.federation import ( ReplicationFederationSendEventsRestServlet, ) diff --git a/synapse/handlers/presence.py b/synapse/handlers/presence.py index 4c3adca46e..9b1f5fccce 100644 --- a/synapse/handlers/presence.py +++ b/synapse/handlers/presence.py @@ -90,8 +90,6 @@ Iterable, ) -from prometheus_client import Counter - import synapse.metrics from synapse.api.constants import EduTypes, EventTypes, Membership, PresenceState from synapse.api.errors import SynapseError @@ -103,6 +101,7 @@ from synapse.metrics.background_process_metrics import ( wrap_as_background_process, ) +from synapse.metrics.instruments import Counter from synapse.replication.http.presence import ( ReplicationBumpPresenceActiveTime, ReplicationPresenceSetState, diff --git a/synapse/handlers/register.py b/synapse/handlers/register.py index 139c14dcf4..a65f4d5e38 100644 --- a/synapse/handlers/register.py +++ b/synapse/handlers/register.py @@ -29,8 +29,6 @@ TypedDict, ) -from prometheus_client import Counter - from synapse import types from synapse.api.constants import ( MAX_USERID_LENGTH, @@ -50,6 +48,7 @@ from synapse.config.server import is_threepid_reserved from synapse.http.servlet import assert_params_in_dict from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter from synapse.replication.http.login import RegisterDeviceReplicationServlet from synapse.replication.http.register import ( ReplicationPostRegisterActionsServlet, diff --git a/synapse/handlers/sliding_sync/__init__.py b/synapse/handlers/sliding_sync/__init__.py index 1cc587d4a7..d8c207e064 100644 --- a/synapse/handlers/sliding_sync/__init__.py +++ b/synapse/handlers/sliding_sync/__init__.py @@ -18,7 +18,6 @@ from typing import TYPE_CHECKING, AbstractSet, Mapping import attr -from prometheus_client import Histogram from typing_extensions import assert_never from synapse.api.constants import Direction, EventTypes, Membership @@ -40,6 +39,7 @@ trace, ) from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Histogram from synapse.storage.databases.main.roommember import extract_heroes_from_room_summary from synapse.storage.databases.main.state_deltas import StateDelta from synapse.storage.databases.main.stream import PaginateFunction diff --git a/synapse/handlers/sync.py b/synapse/handlers/sync.py index c88f703ae9..56e2e5d306 100644 --- a/synapse/handlers/sync.py +++ b/synapse/handlers/sync.py @@ -29,7 +29,6 @@ ) import attr -from prometheus_client import Counter from synapse.api.constants import ( AccountDataTypes, @@ -55,6 +54,7 @@ trace, ) from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter from synapse.storage.databases.main.event_push_actions import RoomNotifCounts from synapse.storage.databases.main.roommember import extract_heroes_from_room_summary from synapse.storage.databases.main.stream import PaginateFunction diff --git a/synapse/http/client.py b/synapse/http/client.py index 05c5f13a87..671213a914 100644 --- a/synapse/http/client.py +++ b/synapse/http/client.py @@ -36,7 +36,6 @@ import treq from canonicaljson import encode_canonical_json from netaddr import AddrFormatError, IPAddress, IPSet -from prometheus_client import Counter from zope.interface import implementer from OpenSSL import SSL @@ -85,6 +84,7 @@ ) from synapse.logging.opentracing import set_tag, start_active_span, tags from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter from synapse.types import ISynapseReactor, StrSequence from synapse.util.async_helpers import timeout_deferred from synapse.util.clock import Clock diff --git a/synapse/http/matrixfederationclient.py b/synapse/http/matrixfederationclient.py index dbd4f1e4b6..c8a3bc1c79 100644 --- a/synapse/http/matrixfederationclient.py +++ b/synapse/http/matrixfederationclient.py @@ -43,7 +43,6 @@ import attr import treq from canonicaljson import encode_canonical_json -from prometheus_client import Counter from signedjson.sign import sign_json from twisted.internet import defer @@ -84,6 +83,7 @@ from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.logging.opentracing import set_tag, start_active_span, tags from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter from synapse.types import JsonDict from synapse.util.async_helpers import AwakenableSleeper, Linearizer, timeout_deferred from synapse.util.clock import Clock diff --git a/synapse/http/request_metrics.py b/synapse/http/request_metrics.py index 5cc8a2ebd8..b4dd721da2 100644 --- a/synapse/http/request_metrics.py +++ b/synapse/http/request_metrics.py @@ -24,10 +24,9 @@ import traceback from typing import Mapping -from prometheus_client.core import Counter, Histogram - from synapse.logging.context import current_context from synapse.metrics import SERVER_NAME_LABEL, LaterGauge +from synapse.metrics.instruments import Counter, Histogram logger = logging.getLogger(__name__) diff --git a/synapse/logging/handlers.py b/synapse/logging/handlers.py index 32279c250f..fa856f0e9c 100644 --- a/synapse/logging/handlers.py +++ b/synapse/logging/handlers.py @@ -112,7 +112,7 @@ def close(self) -> None: class RealOtlpHandler(LoggingHandler): def __init__(self, level: int = logging.NOTSET) -> None: self.logger_provider = LoggerProvider( - resource=Resource(attributes={"service.name": "synapse"}) + resource=Resource.create(attributes={"service.name": "synapse"}) ) self.logger_provider.add_log_record_processor( BatchLogRecordProcessor(OTLPLogExporter()) diff --git a/synapse/logging/opentracing.py b/synapse/logging/opentracing.py index 709f893240..5d0d2937a9 100644 --- a/synapse/logging/opentracing.py +++ b/synapse/logging/opentracing.py @@ -518,7 +518,7 @@ def init_tracer_otlp(hs: "HomeServer") -> None: from opentelemetry.shim.opentracing_shim import create_tracer tracer_provider = TracerProvider( - resource=Resource(attributes={"service.name": "synapse"}) + resource=Resource.create(attributes={"service.name": "synapse"}) ) tracer_provider.add_span_processor(BatchSpanProcessor(OTLPSpanExporter())) trace.set_tracer_provider(tracer_provider) diff --git a/synapse/metrics/__init__.py b/synapse/metrics/__init__.py index a86debf9f2..11386be6e9 100644 --- a/synapse/metrics/__init__.py +++ b/synapse/metrics/__init__.py @@ -41,15 +41,10 @@ import attr from packaging.version import parse as parse_version from prometheus_client import ( - CollectorRegistry, - Counter, - Gauge, - Histogram, Metric, generate_latest, ) from prometheus_client.core import ( - REGISTRY, GaugeHistogramMetricFamily, GaugeMetricFamily, ) @@ -62,6 +57,13 @@ import synapse.metrics._reactor_metrics # noqa: F401 from synapse.metrics._gc import MIN_TIME_BETWEEN_GCS, install_gc_manager from synapse.metrics._types import Collector +from synapse.metrics.instruments import ( + REGISTRY, + CollectorRegistry, + Counter, + Gauge, + Histogram, +) from synapse.synapse_rust import get_rustc_version from synapse.types import StrSequence from synapse.util import SYNAPSE_VERSION diff --git a/synapse/metrics/_gc.py b/synapse/metrics/_gc.py index 1da871f18f..cbee51d2b9 100644 --- a/synapse/metrics/_gc.py +++ b/synapse/metrics/_gc.py @@ -27,17 +27,15 @@ from typing import Iterable from prometheus_client.core import ( - REGISTRY, CounterMetricFamily, - Gauge, GaugeMetricFamily, - Histogram, Metric, ) from twisted.internet import task from synapse.metrics._types import Collector +from synapse.metrics.instruments import REGISTRY, Gauge, Histogram """Prometheus metrics for garbage collection""" diff --git a/synapse/metrics/_otel.py b/synapse/metrics/_otel.py new file mode 100644 index 0000000000..c8e55c6098 --- /dev/null +++ b/synapse/metrics/_otel.py @@ -0,0 +1,615 @@ +# +# This file is licensed under the Affero General Public License (AGPL) version 3. +# +# Copyright (C) 2025 Famedly GmbH +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# See the GNU Affero General Public License for more details: +# . +# +""" +OTLP-backed metric instruments with an API compatible with ``prometheus_client``. + +Provides :class:`Counter`, :class:`Gauge` and :class:`Histogram` drop-in +replacements that forward measurements to an OpenTelemetry OTLP exporter +instead of the Prometheus scrape endpoint. + +The OTLP exporter picks up its configuration from the standard ``OTEL_*`` +environment variables (``OTEL_EXPORTER_OTLP_ENDPOINT``, etc.). + +This module is only imported when ``SYNAPSE_METRICS_BACKEND=otlp``. +""" + +from __future__ import annotations + +import contextlib +import gc +import logging +import os +import resource +import threading +import time +from typing import ( + Any, + Callable, + Generator, + Iterable, + Sequence, +) + +from opentelemetry.exporter.otlp.proto.http.metric_exporter import OTLPMetricExporter +from opentelemetry.metrics import Observation +from opentelemetry.sdk.metrics import MeterProvider +from opentelemetry.sdk.metrics.export import PeriodicExportingMetricReader +from opentelemetry.sdk.resources import Resource + +logger = logging.getLogger(__name__) + +# Global OTel meter – created eagerly so that module-level metric definitions +# (the common pattern in Synapse) can use it immediately. + +_resource = Resource.create(attributes={"service.name": "synapse"}) +_exporter = OTLPMetricExporter() +_reader = PeriodicExportingMetricReader(_exporter) +_meter_provider = MeterProvider(resource=_resource, metric_readers=[_reader]) +_meter = _meter_provider.get_meter("synapse") + + +def shutdown() -> None: + """Flush pending metrics and release resources. Call on server shutdown.""" + _meter_provider.shutdown() + + +# Pre-collect hook machinery +# +# Some metrics in Synapse (e.g. cache metrics) are updated lazily via hooks +# that normally run during a Prometheus scrape. When exporting via OTLP we +# need to trigger these hooks ourselves before the periodic metric reader +# collects observations. +# +# Each :class:`Gauge` observable callback calls :func:`_run_pre_collect_hooks` +# which, at most once per collection cycle, invokes every registered hook. + +_pre_collect_hooks: list[Callable[[], None]] = [] +_pre_collect_lock = threading.Lock() +_last_pre_collect_time: float = 0.0 +_PRE_COLLECT_MIN_INTERVAL: float = 0.5 # seconds + + +def register_pre_collect_hook(hook: Callable[[], None]) -> None: + """Register a callable invoked before observable metrics are read. + + This is intended for things like + ``DynamicCollectorRegistry.run_hooks``, which must run before Gauge + observation callbacks so that the stored values are up-to-date. + """ + with _pre_collect_lock: + _pre_collect_hooks.append(hook) + + +def _run_pre_collect_hooks() -> None: + """Run registered hooks, but at most once per ``_PRE_COLLECT_MIN_INTERVAL``.""" + global _last_pre_collect_time + now = time.monotonic() + if now - _last_pre_collect_time < _PRE_COLLECT_MIN_INTERVAL: + return + _last_pre_collect_time = now + for hook in _pre_collect_hooks: + try: + hook() + except Exception: + logger.debug("Pre-collect hook %s failed", hook, exc_info=True) + + +class _GaugeChild: + """A single label-set projection of an :class:`Gauge`.""" + + __slots__ = ("_parent", "_attrs", "_key") + + def __init__(self, parent: Gauge, attrs: dict[str, str], key: tuple) -> None: + self._parent = parent + self._attrs = attrs + self._key = key + + def set(self, value: float) -> None: + self._parent._values[self._key] = float(value) + + def inc(self, amount: float = 1) -> None: + k = self._key + vals = self._parent._values + vals[k] = vals.get(k, 0.0) + float(amount) + + def dec(self, amount: float = 1) -> None: + k = self._key + vals = self._parent._values + vals[k] = vals.get(k, 0.0) - float(amount) + + def set_function(self, fn: Callable[[], float]) -> None: + self._parent._functions[self._key] = (self._attrs, fn) + self._parent._values.pop(self._key, None) + + def set_to_current_time(self) -> None: + self.set(time.time()) + + @contextlib.contextmanager + def track_inprogress(self) -> Generator[None, None, None]: + """Context-manager that increments the gauge on entry and decrements on exit.""" + self.inc() + try: + yield + finally: + self.dec() + + +class Gauge: + """OTLP-backed drop-in replacement for ``prometheus_client.Gauge``. + + Internally this creates an OTel *ObservableGauge* whose callback returns + the most recently stored values. This is a natural fit because Synapse + gauges are written to sporadically (e.g. from hooks) and read periodically. + """ + + def __init__( + self, + name: str, + documentation: str = "", + labelnames: Iterable[str] = (), + namespace: str = "", + subsystem: str = "", + unit: str = "", + registry: Any = None, + _labelvalues: Any = None, + multiprocess_mode: str = "all", + ) -> None: + self._name = name + self._labelnames = list(labelnames) + + # {attrs_key_tuple: float} + self._values: dict[tuple, float] = {} + # {attrs_key_tuple: (attrs_dict, callable)} + self._functions: dict[tuple, tuple[dict[str, str], Callable[[], float]]] = {} + self._children: dict[tuple, _GaugeChild] = {} + + # When there are no label names the metric is used directly (.set(), .inc(), …). + if not self._labelnames: + self._no_label_child: _GaugeChild | None = _GaugeChild(self, {}, ()) + else: + self._no_label_child = None + + def _callback(options: Any) -> Sequence[Observation]: + _run_pre_collect_hooks() + obs: list[Observation] = [] + for key, value in list(self._values.items()): + obs.append(Observation(value, dict(key))) + for attrs, fn in list(self._functions.values()): + try: + obs.append(Observation(fn(), attrs)) + except Exception: + pass + return obs + + self._instrument = _meter.create_observable_gauge( + name, + callbacks=[_callback], + description=documentation, + ) + + def labels(self, *args: str, **kwargs: str) -> _GaugeChild: + if args: + attrs = dict(zip(self._labelnames, args)) + else: + attrs = kwargs + key = tuple(sorted(attrs.items())) + child = self._children.get(key) + if child is None: + child = _GaugeChild(self, attrs, key) + self._children[key] = child + return child + + def remove(self, *labelvalues: str) -> None: + """Remove the child and all stored data for the given label values. + + This mirrors ``prometheus_client.Gauge.remove`` so that callers like + ``BatchingQueue.shutdown()`` work identically under the OTLP backend. + """ + attrs = dict(zip(self._labelnames, labelvalues)) + key = tuple(sorted(attrs.items())) + self._values.pop(key, None) + self._functions.pop(key, None) + self._children.pop(key, None) + + def set(self, value: float) -> None: + assert self._no_label_child is not None, "Must call .labels() first" + self._no_label_child.set(value) + + def inc(self, amount: float = 1) -> None: + assert self._no_label_child is not None, "Must call .labels() first" + self._no_label_child.inc(amount) + + def dec(self, amount: float = 1) -> None: + assert self._no_label_child is not None, "Must call .labels() first" + self._no_label_child.dec(amount) + + def set_function(self, fn: Callable[[], float]) -> None: + assert self._no_label_child is not None, "Must call .labels() first" + self._no_label_child.set_function(fn) + + def set_to_current_time(self) -> None: + assert self._no_label_child is not None, "Must call .labels() first" + self._no_label_child.set_to_current_time() + + def describe(self) -> list: + # prometheus compat stub (not needed for OTLP) + return [] + + def collect(self) -> list: + # prometheus compat stub (not needed for OTLP) + return [] + + +class _CounterChild: + """A single label-set projection of a :class:`Counter`.""" + + __slots__ = ("_instrument", "_attrs") + + def __init__(self, instrument: Any, attrs: dict[str, str]) -> None: + self._instrument = instrument + self._attrs = attrs + + def inc(self, amount: float = 1) -> None: + if amount < 0: + raise ValueError( + "Counter.inc amount must not be negative (got %s)" % amount + ) + self._instrument.add(amount, self._attrs) + + +class Counter: + """OTLP-backed drop-in replacement for ``prometheus_client.Counter``.""" + + def __init__( + self, + name: str, + documentation: str = "", + labelnames: Iterable[str] = (), + namespace: str = "", + subsystem: str = "", + unit: str = "", + registry: Any = None, + _labelvalues: Any = None, + ) -> None: + self._name = name + self._labelnames = list(labelnames) + self._instrument = _meter.create_counter(name, description=documentation) + self._children: dict[tuple, _CounterChild] = {} + + if not self._labelnames: + self._no_label_child: _CounterChild | None = _CounterChild( + self._instrument, + {}, + ) + else: + self._no_label_child = None + + def labels(self, *args: str, **kwargs: str) -> _CounterChild: + if args: + attrs = dict(zip(self._labelnames, args)) + else: + attrs = kwargs + key = tuple(sorted(attrs.items())) + child = self._children.get(key) + if child is None: + child = _CounterChild(self._instrument, attrs) + self._children[key] = child + return child + + def inc(self, amount: float = 1) -> None: + assert self._no_label_child is not None, "Must call .labels() first" + self._no_label_child.inc(amount) + + def describe(self) -> list: + return [] + + def collect(self) -> list: + return [] + + +class _HistogramTimer: + """Context-manager returned by ``Histogram.time()``.""" + + __slots__ = ("_child", "_start") + + def __init__(self, child: _HistogramChild) -> None: + self._child = child + self._start: float | None = None + + def __enter__(self) -> _HistogramTimer: + self._start = time.monotonic() + return self + + def __exit__(self, *args: Any) -> None: + assert self._start is not None + self._child.observe(time.monotonic() - self._start) + + +class _HistogramChild: + """A single label-set projection of a :class:`Histogram`.""" + + __slots__ = ("_instrument", "_attrs") + + def __init__(self, instrument: Any, attrs: dict[str, str]) -> None: + self._instrument = instrument + self._attrs = attrs + + def observe(self, value: float) -> None: + self._instrument.record(value, self._attrs) + + def time(self) -> _HistogramTimer: + """Return a context-manager that observes the elapsed wall-clock time.""" + return _HistogramTimer(self) + + +class Histogram: + """OTLP-backed drop-in replacement for ``prometheus_client.Histogram``. + + .. note:: + + The *buckets* parameter is accepted for API compatibility but is + **not** forwarded to the OTel instrument. Bucket boundaries in + OpenTelemetry are configured via *Views* on the ``MeterProvider``; + the SDK default boundaries apply unless overridden there. + """ + + DEFAULT_BUCKETS = ( + 0.005, + 0.01, + 0.025, + 0.05, + 0.075, + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 2.5, + 5.0, + 7.5, + 10.0, + float("inf"), + ) + + def __init__( + self, + name: str, + documentation: str = "", + labelnames: Iterable[str] = (), + namespace: str = "", + subsystem: str = "", + unit: str = "", + registry: Any = None, + _labelvalues: Any = None, + buckets: Sequence[float | str] = DEFAULT_BUCKETS, + ) -> None: + self._name = name + self._labelnames = list(labelnames) + self._instrument = _meter.create_histogram(name, description=documentation) + self._children: dict[tuple, _HistogramChild] = {} + + if not self._labelnames: + self._no_label_child: _HistogramChild | None = _HistogramChild( + self._instrument, + {}, + ) + else: + self._no_label_child = None + + def labels(self, *args: str, **kwargs: str) -> _HistogramChild: + if args: + attrs = dict(zip(self._labelnames, args)) + else: + attrs = kwargs + key = tuple(sorted(attrs.items())) + child = self._children.get(key) + if child is None: + child = _HistogramChild(self._instrument, attrs) + self._children[key] = child + return child + + def observe(self, value: float) -> None: + assert self._no_label_child is not None, "Must call .labels() first" + self._no_label_child.observe(value) + + def time(self) -> _HistogramTimer: + assert self._no_label_child is not None, "Must call .labels() first" + return self._no_label_child.time() + + def describe(self) -> list: + return [] + + def collect(self) -> list: + return [] + + +# Process-level metrics +# +# Replicate the metrics normally provided by prometheus_client's +# built-in ProcessCollector and Synapse's CPUMetrics / GCCounts +# custom collectors, which only feed the Prometheus REGISTRY. + +_HAVE_PROC_SELF_STAT = os.path.exists("/proc/self/stat") + +try: + _PAGESIZE: int = os.sysconf("SC_PAGESIZE") +except (ValueError, OSError, AttributeError): + _PAGESIZE = 4096 + +try: + _TICKS_PER_SEC: int = os.sysconf("SC_CLK_TCK") +except (ValueError, OSError, AttributeError): + _TICKS_PER_SEC = 100 + +# Boot time (seconds since epoch) for process_start_time_seconds. +_BOOT_TIME: float | None = None +try: + with open("/proc/stat") as _f: + for _line in _f: + if _line.startswith("btime "): + _BOOT_TIME = float(_line.split()[1]) + break +except OSError: + pass + + +def _read_proc_self_stat() -> list[str] | None: + """Return fields of ``/proc/self/stat`` after the *comm* field. + + Index 11 = utime, 12 = stime, 19 = starttime, 20 = vsize, 21 = rss. + """ + try: + with open("/proc/self/stat") as fh: + data = fh.read() + return data.split(") ", 1)[1].split(" ") + except Exception: + return None + + +if _HAVE_PROC_SELF_STAT: + + def _observe_cpu_seconds(options: Any) -> Sequence[Observation]: + fields = _read_proc_self_stat() + if fields is None: + return [] + utime = float(fields[11]) / _TICKS_PER_SEC + stime = float(fields[12]) / _TICKS_PER_SEC + return [Observation(utime + stime)] + + _meter.create_observable_gauge( + "process_cpu_seconds_total", + callbacks=[_observe_cpu_seconds], + description="Total user and system CPU time spent in seconds.", + ) + + def _observe_cpu_user(options: Any) -> Sequence[Observation]: + fields = _read_proc_self_stat() + if fields is None: + return [] + return [Observation(float(fields[11]) / _TICKS_PER_SEC)] + + _meter.create_observable_gauge( + "process_cpu_user_seconds_total", + callbacks=[_observe_cpu_user], + description="Total user CPU time spent in seconds.", + ) + + def _observe_cpu_system(options: Any) -> Sequence[Observation]: + fields = _read_proc_self_stat() + if fields is None: + return [] + return [Observation(float(fields[12]) / _TICKS_PER_SEC)] + + _meter.create_observable_gauge( + "process_cpu_system_seconds_total", + callbacks=[_observe_cpu_system], + description="Total system CPU time spent in seconds.", + ) + + def _observe_resident_memory(options: Any) -> Sequence[Observation]: + fields = _read_proc_self_stat() + if fields is None: + return [] + # Index 21 = rss in pages. + return [Observation(float(fields[21]) * _PAGESIZE)] + + _meter.create_observable_gauge( + "process_resident_memory_bytes", + callbacks=[_observe_resident_memory], + description="Resident memory size in bytes.", + ) + + def _observe_virtual_memory(options: Any) -> Sequence[Observation]: + fields = _read_proc_self_stat() + if fields is None: + return [] + # Index 20 = vsize in bytes. + return [Observation(float(fields[20]))] + + _meter.create_observable_gauge( + "process_virtual_memory_bytes", + callbacks=[_observe_virtual_memory], + description="Virtual memory size in bytes.", + ) + + if _BOOT_TIME is not None: + + def _observe_start_time(options: Any) -> Sequence[Observation]: + fields = _read_proc_self_stat() + if fields is None: + return [] + # Index 19 = starttime in ticks since boot. + return [Observation(float(fields[19]) / _TICKS_PER_SEC + _BOOT_TIME)] + + _meter.create_observable_gauge( + "process_start_time_seconds", + callbacks=[_observe_start_time], + description="Start time of the process since unix epoch in seconds.", + ) + + def _observe_open_fds(options: Any) -> Sequence[Observation]: + try: + return [Observation(float(len(os.listdir("/proc/self/fd"))))] + except OSError: + return [] + + _meter.create_observable_gauge( + "process_open_fds", + callbacks=[_observe_open_fds], + description="Number of open file descriptors.", + ) + + def _observe_max_fds(options: Any) -> Sequence[Observation]: + try: + soft, _hard = resource.getrlimit(resource.RLIMIT_NOFILE) + return [Observation(float(soft))] + except (ValueError, OSError): + return [] + + _meter.create_observable_gauge( + "process_max_fds", + callbacks=[_observe_max_fds], + description="Maximum number of open file descriptors.", + ) + + +def _observe_gc_counts(options: Any) -> Sequence[Observation]: + return [ + Observation(float(count), {"gen": str(gen)}) + for gen, count in enumerate(gc.get_count()) + ] + + +_meter.create_observable_gauge( + "python_gc_counts", + callbacks=[_observe_gc_counts], + description="GC object counts per generation.", +) + + +class CollectorRegistry: + def collect(self): + return [] + + +class Registry: + def register(self, other): + def _drain(): + for _ in other.collect(): + pass + + register_pre_collect_hook(_drain) + + +REGISTRY = Registry() diff --git a/synapse/metrics/_reactor_metrics.py b/synapse/metrics/_reactor_metrics.py index d528b7c5e5..93755f1a7b 100644 --- a/synapse/metrics/_reactor_metrics.py +++ b/synapse/metrics/_reactor_metrics.py @@ -24,14 +24,15 @@ from selectors import SelectSelector, _PollLikeSelector # type: ignore[attr-defined] from typing import Any, Callable, Iterable -from prometheus_client import Histogram, Metric -from prometheus_client.core import REGISTRY, GaugeMetricFamily +from prometheus_client import Metric +from prometheus_client.core import GaugeMetricFamily from twisted.internet import reactor, selectreactor from twisted.internet.asyncioreactor import AsyncioSelectorReactor from synapse.app.complement_fork_proxied_reactor import ProxiedReactor from synapse.metrics._types import Collector +from synapse.metrics.instruments import REGISTRY, Histogram try: from selectors import KqueueSelector # type: ignore[attr-defined] diff --git a/synapse/metrics/background_process_metrics.py b/synapse/metrics/background_process_metrics.py index 8ff2803455..61a8db1889 100644 --- a/synapse/metrics/background_process_metrics.py +++ b/synapse/metrics/background_process_metrics.py @@ -37,7 +37,6 @@ ) from prometheus_client import Metric -from prometheus_client.core import REGISTRY, Counter, Gauge from typing_extensions import Concatenate, ParamSpec from twisted.internet import defer @@ -55,6 +54,7 @@ ) from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics._types import Collector +from synapse.metrics.instruments import REGISTRY, Counter, Gauge if TYPE_CHECKING: import resource diff --git a/synapse/metrics/common_usage_metrics.py b/synapse/metrics/common_usage_metrics.py index ea2cdecf51..603c56d1dc 100644 --- a/synapse/metrics/common_usage_metrics.py +++ b/synapse/metrics/common_usage_metrics.py @@ -29,7 +29,7 @@ if TYPE_CHECKING: from synapse.server import HomeServer -from prometheus_client import Gauge +from synapse.metrics.instruments import Gauge # Gauge to expose daily active users metrics current_dau_gauge = Gauge( diff --git a/synapse/metrics/instruments.py b/synapse/metrics/instruments.py new file mode 100644 index 0000000000..2bc76e0bd8 --- /dev/null +++ b/synapse/metrics/instruments.py @@ -0,0 +1,62 @@ +# +# This file is licensed under the Affero General Public License (AGPL) version 3. +# +# Copyright (C) 2025 Famedly GmbH +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# See the GNU Affero General Public License for more details: +# . +# +""" +Re-exports ``Counter``, ``Gauge`` and ``Histogram`` from the appropriate +backend. + +When the environment variable ``SYNAPSE_METRICS_BACKEND`` is set to ``otlp``, +the classes come from :mod:`synapse.metrics._otel` and measurements are +exported via OTLP (configured through the standard ``OTEL_*`` environment +variables). + +Otherwise the classes are the stock ``prometheus_client`` implementations and +metrics are exposed on the Prometheus scrape endpoint as usual. +""" + +import os + +METRICS_BACKEND = os.environ.get("SYNAPSE_METRICS_BACKEND", "prometheus").lower() + +if METRICS_BACKEND == "otlp": + try: + from synapse.metrics._otel import ( + REGISTRY, + CollectorRegistry, + Counter, + Gauge, + Histogram, + ) # noqa: F401 + except ImportError: + raise ImportError( + "SYNAPSE_METRICS_BACKEND is set to 'otlp' but the required " + "OpenTelemetry packages are not installed. " + "Install them with: pip install matrix-synapse[opentelemetry-metrics]" + ) +else: + from prometheus_client import ( # noqa: F401 + REGISTRY, + CollectorRegistry, + Counter, + Gauge, + Histogram, + ) + +__all__ = [ + "Counter", + "Gauge", + "Histogram", + "CollectorRegistry", + "REGISTRY", + "METRICS_BACKEND", +] diff --git a/synapse/metrics/jemalloc.py b/synapse/metrics/jemalloc.py index 03cecec3ca..76c39fecdd 100644 --- a/synapse/metrics/jemalloc.py +++ b/synapse/metrics/jemalloc.py @@ -26,10 +26,11 @@ from typing import Iterable, Literal, overload import attr -from prometheus_client import REGISTRY, Metric +from prometheus_client import Metric from synapse.metrics import GaugeMetricFamily from synapse.metrics._types import Collector +from synapse.metrics.instruments import REGISTRY logger = logging.getLogger(__name__) diff --git a/synapse/notifier.py b/synapse/notifier.py index f1cec74462..3963a3c8c8 100644 --- a/synapse/notifier.py +++ b/synapse/notifier.py @@ -33,7 +33,6 @@ ) import attr -from prometheus_client import Counter from twisted.internet import defer from twisted.internet.defer import Deferred @@ -47,6 +46,7 @@ from synapse.logging.context import PreserveLoggingContext from synapse.logging.opentracing import log_kv, start_active_span from synapse.metrics import SERVER_NAME_LABEL, LaterGauge +from synapse.metrics.instruments import Counter from synapse.streams.config import PaginationConfig from synapse.types import ( ISynapseReactor, diff --git a/synapse/push/bulk_push_rule_evaluator.py b/synapse/push/bulk_push_rule_evaluator.py index 7cf89200a8..567cbcb35c 100644 --- a/synapse/push/bulk_push_rule_evaluator.py +++ b/synapse/push/bulk_push_rule_evaluator.py @@ -29,8 +29,6 @@ cast, ) -from prometheus_client import Counter - from twisted.internet.defer import Deferred from synapse.api.constants import ( @@ -46,6 +44,7 @@ from synapse.events.snapshot import EventContext, EventPersistencePair from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter from synapse.state import CREATE_KEY, POWER_KEY from synapse.storage.databases.main.roommember import EventIdMembership from synapse.storage.invite_rule import InviteRule diff --git a/synapse/push/httppusher.py b/synapse/push/httppusher.py index ca63a99e3e..566ed2c392 100644 --- a/synapse/push/httppusher.py +++ b/synapse/push/httppusher.py @@ -23,8 +23,6 @@ import urllib.parse from typing import TYPE_CHECKING, Optional -from prometheus_client import Counter - from twisted.internet.error import AlreadyCalled, AlreadyCancelled from twisted.internet.interfaces import IDelayedCall @@ -32,6 +30,7 @@ from synapse.events import EventBase from synapse.logging import opentracing from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter from synapse.push import Pusher, PusherConfig, PusherConfigException from synapse.storage.databases.main.event_push_actions import HttpPushAction from synapse.types import JsonDict, JsonMapping diff --git a/synapse/push/mailer.py b/synapse/push/mailer.py index 1ebbc6d4f3..17edf762f1 100644 --- a/synapse/push/mailer.py +++ b/synapse/push/mailer.py @@ -26,13 +26,13 @@ import bleach import jinja2 from markupsafe import Markup -from prometheus_client import Counter from synapse.api.constants import EventContentFields, EventTypes, Membership, RoomTypes from synapse.api.errors import StoreError from synapse.config.emailconfig import EmailSubjectConfig from synapse.events import EventBase from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter from synapse.push.presentable_names import ( calculate_room_name, descriptor_from_member_events, diff --git a/synapse/push/pusherpool.py b/synapse/push/pusherpool.py index 7b5b06db83..e2609bce20 100644 --- a/synapse/push/pusherpool.py +++ b/synapse/push/pusherpool.py @@ -22,13 +22,12 @@ import logging from typing import TYPE_CHECKING, Iterable -from prometheus_client import Gauge - from synapse.api.errors import Codes, SynapseError from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import ( wrap_as_background_process, ) +from synapse.metrics.instruments import Gauge from synapse.push import Pusher, PusherConfig, PusherConfigException from synapse.push.pusher import PusherFactory from synapse.replication.http.push import ( diff --git a/synapse/replication/http/_base.py b/synapse/replication/http/_base.py index 87d6e80898..840b194d87 100644 --- a/synapse/replication/http/_base.py +++ b/synapse/replication/http/_base.py @@ -25,8 +25,6 @@ from inspect import signature from typing import TYPE_CHECKING, Any, Awaitable, Callable, ClassVar -from prometheus_client import Counter, Gauge - from twisted.internet.error import ConnectError, DNSLookupError from twisted.web.server import Request @@ -39,6 +37,7 @@ from synapse.logging import opentracing from synapse.logging.opentracing import trace_with_opname from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter, Gauge from synapse.types import JsonDict from synapse.util.caches.response_cache import ResponseCache from synapse.util.cancellation import is_function_cancellable diff --git a/synapse/replication/tcp/external_cache.py b/synapse/replication/tcp/external_cache.py index ca959a7aae..bbb8b7d181 100644 --- a/synapse/replication/tcp/external_cache.py +++ b/synapse/replication/tcp/external_cache.py @@ -22,11 +22,10 @@ import logging from typing import TYPE_CHECKING, Any -from prometheus_client import Counter, Histogram - from synapse.logging import opentracing from synapse.logging.context import make_deferred_yieldable from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter, Histogram from synapse.util.json import json_decoder, json_encoder if TYPE_CHECKING: diff --git a/synapse/replication/tcp/handler.py b/synapse/replication/tcp/handler.py index ad9fed72dd..e6fee05c6a 100644 --- a/synapse/replication/tcp/handler.py +++ b/synapse/replication/tcp/handler.py @@ -29,11 +29,10 @@ TypeVar, ) -from prometheus_client import Counter - from twisted.internet.protocol import ReconnectingClientFactory from synapse.metrics import SERVER_NAME_LABEL, LaterGauge +from synapse.metrics.instruments import Counter from synapse.replication.tcp.commands import ( CancelTaskCommand, ClearUserSyncsCommand, diff --git a/synapse/replication/tcp/protocol.py b/synapse/replication/tcp/protocol.py index 489a2c76a6..514c158745 100644 --- a/synapse/replication/tcp/protocol.py +++ b/synapse/replication/tcp/protocol.py @@ -30,7 +30,6 @@ from inspect import isawaitable from typing import TYPE_CHECKING, Any, Collection -from prometheus_client import Counter from zope.interface import Interface, implementer from twisted.internet import task @@ -43,6 +42,7 @@ from synapse.metrics.background_process_metrics import ( BackgroundProcessLoggingContext, ) +from synapse.metrics.instruments import Counter from synapse.replication.tcp.commands import ( VALID_CLIENT_COMMANDS, VALID_SERVER_COMMANDS, diff --git a/synapse/replication/tcp/resource.py b/synapse/replication/tcp/resource.py index 36dd39ed67..7b392b7cc8 100644 --- a/synapse/replication/tcp/resource.py +++ b/synapse/replication/tcp/resource.py @@ -24,12 +24,11 @@ import random from typing import TYPE_CHECKING -from prometheus_client import Counter - from twisted.internet.interfaces import IAddress from twisted.internet.protocol import ServerFactory from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter from synapse.replication.tcp.commands import PositionCommand from synapse.replication.tcp.protocol import ServerReplicationStreamProtocol from synapse.replication.tcp.streams import EventsStream diff --git a/synapse/rest/client/room.py b/synapse/rest/client/room.py index 83664814a6..33e638f179 100644 --- a/synapse/rest/client/room.py +++ b/synapse/rest/client/room.py @@ -29,7 +29,6 @@ from urllib import parse as urlparse import attr -from prometheus_client.core import Histogram from twisted.web.server import Request @@ -75,6 +74,7 @@ from synapse.logging.context import make_deferred_yieldable, run_in_background from synapse.logging.opentracing import set_tag, trace from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Histogram from synapse.rest.client._base import client_patterns from synapse.rest.client.transactions import HttpTransactionCache from synapse.state import CREATE_KEY, POWER_KEY diff --git a/synapse/state/__init__.py b/synapse/state/__init__.py index 2f0e3f2c3e..a71f47f65d 100644 --- a/synapse/state/__init__.py +++ b/synapse/state/__init__.py @@ -33,7 +33,6 @@ import attr from immutabledict import immutabledict -from prometheus_client import Counter, Histogram from synapse.api.constants import EventTypes from synapse.api.room_versions import KNOWN_ROOM_VERSIONS, StateResolutionVersions @@ -46,6 +45,7 @@ from synapse.logging.context import ContextResourceUsage from synapse.logging.opentracing import tag_args, trace from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter, Histogram from synapse.replication.http.state import ReplicationUpdateCurrentStateRestServlet from synapse.state import v1, v2 from synapse.storage.databases.main.event_federation import StateDifference diff --git a/synapse/storage/controllers/persist_events.py b/synapse/storage/controllers/persist_events.py index 7cc6a39639..c23a370d71 100644 --- a/synapse/storage/controllers/persist_events.py +++ b/synapse/storage/controllers/persist_events.py @@ -39,7 +39,6 @@ ) import attr -from prometheus_client import Counter, Histogram from twisted.internet import defer @@ -58,6 +57,7 @@ trace, ) from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter, Histogram from synapse.storage.controllers.state import StateStorageController from synapse.storage.databases import Databases from synapse.storage.databases.main.events import DeltaState diff --git a/synapse/storage/database.py b/synapse/storage/database.py index 6e38b55686..156922994f 100644 --- a/synapse/storage/database.py +++ b/synapse/storage/database.py @@ -42,7 +42,6 @@ ) import attr -from prometheus_client import Counter, Histogram from typing_extensions import Concatenate, ParamSpec from twisted.enterprise import adbapi @@ -57,6 +56,7 @@ make_deferred_yieldable, ) from synapse.metrics import SERVER_NAME_LABEL, register_threadpool +from synapse.metrics.instruments import Counter, Histogram from synapse.storage.background_updates import BackgroundUpdater from synapse.storage.engines import BaseDatabaseEngine, PostgresEngine, Sqlite3Engine from synapse.storage.types import Connection, Cursor, SQLQueryParameters diff --git a/synapse/storage/databases/main/event_federation.py b/synapse/storage/databases/main/event_federation.py index 415926eb0a..5d4a207d03 100644 --- a/synapse/storage/databases/main/event_federation.py +++ b/synapse/storage/databases/main/event_federation.py @@ -32,7 +32,6 @@ ) import attr -from prometheus_client import Counter, Gauge from synapse.api.constants import MAX_DEPTH from synapse.api.errors import StoreError @@ -41,6 +40,7 @@ from synapse.logging.opentracing import tag_args, trace from synapse.metrics import SERVER_NAME_LABEL from synapse.metrics.background_process_metrics import wrap_as_background_process +from synapse.metrics.instruments import Counter, Gauge from synapse.storage._base import db_to_json, make_in_list_sql_clause from synapse.storage.background_updates import ForeignKeyConstraint from synapse.storage.database import ( diff --git a/synapse/storage/databases/main/events.py b/synapse/storage/databases/main/events.py index 12c918eca6..1bfb6c9b10 100644 --- a/synapse/storage/databases/main/events.py +++ b/synapse/storage/databases/main/events.py @@ -35,7 +35,6 @@ ) import attr -from prometheus_client import Counter import synapse.metrics from synapse.api.constants import ( @@ -58,6 +57,7 @@ from synapse.events.utils import parse_stripped_state_event from synapse.logging.opentracing import trace from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Counter from synapse.storage._base import db_to_json, make_in_list_sql_clause from synapse.storage.database import ( DatabasePool, diff --git a/synapse/storage/databases/main/events_worker.py b/synapse/storage/databases/main/events_worker.py index cc79b8042b..f047b0cd5a 100644 --- a/synapse/storage/databases/main/events_worker.py +++ b/synapse/storage/databases/main/events_worker.py @@ -36,7 +36,6 @@ ) import attr -from prometheus_client import Gauge from twisted.internet import defer @@ -67,6 +66,7 @@ from synapse.metrics.background_process_metrics import ( wrap_as_background_process, ) +from synapse.metrics.instruments import Gauge from synapse.replication.tcp.streams import BackfillStream, UnPartialStatedEventStream from synapse.replication.tcp.streams.events import EventsStream from synapse.replication.tcp.streams.partial_state import UnPartialStatedEventStreamRow diff --git a/synapse/util/batching_queue.py b/synapse/util/batching_queue.py index 43eefcb7f1..6a6c518ea1 100644 --- a/synapse/util/batching_queue.py +++ b/synapse/util/batching_queue.py @@ -29,12 +29,11 @@ TypeVar, ) -from prometheus_client import Gauge - from twisted.internet import defer from synapse.logging.context import PreserveLoggingContext, make_deferred_yieldable from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Gauge from synapse.util.clock import Clock from synapse.util.duration import Duration diff --git a/synapse/util/caches/__init__.py b/synapse/util/caches/__init__.py index bd52c0c0ed..c182dcd901 100644 --- a/synapse/util/caches/__init__.py +++ b/synapse/util/caches/__init__.py @@ -27,11 +27,10 @@ from typing import Any, Callable, Sized, TypeVar import attr -from prometheus_client import REGISTRY -from prometheus_client.core import Gauge from synapse.config.cache import add_resizable_cache from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import REGISTRY, Gauge from synapse.util.metrics import DynamicCollectorRegistry logger = logging.getLogger(__name__) diff --git a/synapse/util/caches/deferred_cache.py b/synapse/util/caches/deferred_cache.py index e7e21d0de9..c04984d9ba 100644 --- a/synapse/util/caches/deferred_cache.py +++ b/synapse/util/caches/deferred_cache.py @@ -34,13 +34,12 @@ cast, ) -from prometheus_client import Gauge - from twisted.internet import defer from twisted.python.failure import Failure from synapse.logging.context import PreserveLoggingContext from synapse.metrics import SERVER_NAME_LABEL +from synapse.metrics.instruments import Gauge from synapse.util.async_helpers import ObservableDeferred from synapse.util.caches.lrucache import LruCache from synapse.util.caches.treecache import TreeCache, iterate_tree_cache_entry diff --git a/synapse/util/metrics.py b/synapse/util/metrics.py index 5e86939e37..d19ba0ed77 100644 --- a/synapse/util/metrics.py +++ b/synapse/util/metrics.py @@ -37,7 +37,7 @@ TypeVar, ) -from prometheus_client import CollectorRegistry, Counter, Metric +from prometheus_client import Metric from typing_extensions import Concatenate, ParamSpec from synapse.logging.context import ( @@ -46,6 +46,7 @@ current_context, ) from synapse.metrics import SERVER_NAME_LABEL, InFlightGauge +from synapse.metrics.instruments import CollectorRegistry, Counter from synapse.util.clock import Clock logger = logging.getLogger(__name__) diff --git a/synapse/util/ratelimitutils.py b/synapse/util/ratelimitutils.py index d1053d227b..b40f76d06e 100644 --- a/synapse/util/ratelimitutils.py +++ b/synapse/util/ratelimitutils.py @@ -34,8 +34,6 @@ ) from weakref import WeakSet -from prometheus_client.core import Counter - from twisted.internet import defer from synapse.api.errors import LimitExceededError @@ -47,6 +45,7 @@ ) from synapse.logging.opentracing import start_active_span from synapse.metrics import SERVER_NAME_LABEL, Histogram, LaterGauge +from synapse.metrics.instruments import Counter from synapse.util.clock import Clock from synapse.util.duration import Duration