
Commit c684b3b

Fix native FFI memory leak and improve memory management
- Fix memory leak in list_consumer_group_offsets() where the request object was incorrectly nulled after rd_kafka_ListConsumerGroupOffsets(), preventing the RAII cleanup guard from calling rd_kafka_ListConsumerGroupOffsets_destroy(). The caller retains ownership per the librdkafka API contract.
- Make KafkaClient recyclable by wrapping AdminClient and BaseConsumer in Mutex<Arc<...>>. Add a recycle() method that destroys and recreates the internal clients to release accumulated librdkafka metadata handles (rd_kafka_topic_t objects that are never freed until client destruction).
- Add TimestampConsumer::recycle_pool() to periodically recreate pooled BaseConsumer instances for the same reason.
- Add configurable client_recycle_interval to PerformanceConfig (default: 50 cycles, set 0 to disable).
- Add jemalloc as the default allocator with tuned configuration (narenas:2, dirty/muzzy_decay_ms:5000, background_thread:true) for better memory return behavior vs glibc malloc.
- Tune librdkafka consumer properties for monitoring use: queued.min.messages=100, queued.max.messages.kbytes=1024, topic.metadata.refresh.interval.ms=600000 (or disabled for pool consumers).
- Reduce the default max_concurrent_fetches from 10 to 5, since each pooled consumer is a full librdkafka client (~5-15 MB).
- Add RSS instrumentation to collection cycles for memory diagnostics.
- Bump the Dockerfile.dev rust image from 1.83 to 1.85 (needed for clap_lex edition2024 support).
- Update README and config.example.toml with memory management docs.

Closes #56
1 parent 4fe677d commit c684b3b
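The FFI fix in the first bullet hinges on who owns the request object after the call. A minimal std-only sketch of the bug and the RAII guard pattern (all types here are mocks; `mock_destroy` stands in for `rd_kafka_ListConsumerGroupOffsets_destroy`, which the real code reaches through rdkafka-sys):

```rust
use std::cell::Cell;

thread_local! {
    // Counts how many times the destroy function ran.
    static DESTROY_CALLS: Cell<u32> = Cell::new(0);
}

/// Stand-in for rd_kafka_ListConsumerGroupOffsets_destroy().
fn mock_destroy(_req: usize) {
    DESTROY_CALLS.with(|c| c.set(c.get() + 1));
}

/// RAII guard: owns the request handle and destroys it on drop,
/// mirroring the caller-retains-ownership contract of librdkafka.
struct RequestGuard {
    handle: Option<usize>,
}

impl Drop for RequestGuard {
    fn drop(&mut self) {
        if let Some(h) = self.handle.take() {
            mock_destroy(h);
        }
    }
}

fn list_offsets(leak: bool) {
    let mut guard = RequestGuard { handle: Some(0xdead) };
    // ... the async admin call borrows the handle here ...
    if leak {
        // The bug: nulling the handle after the call, as if librdkafka
        // took ownership. The guard then has nothing to destroy => leak.
        guard.handle = None;
    }
    // guard is dropped here; destroy runs only if the handle survived
}

fn main() {
    list_offsets(true);
    assert_eq!(DESTROY_CALLS.with(|c| c.get()), 0); // handle leaked
    list_offsets(false);
    assert_eq!(DESTROY_CALLS.with(|c| c.get()), 1); // handle freed
    println!("destroy calls: {}", DESTROY_CALLS.with(|c| c.get()));
}
```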

File tree

12 files changed: +276 −42 lines

Cargo.lock

Lines changed: 21 additions & 0 deletions
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 3 additions & 1 deletion
```diff
@@ -30,6 +30,7 @@ config = "0.14"
 tracing = "0.1"
 tracing-subscriber = { version = "0.3", features = ["env-filter"] }
 
+tikv-jemallocator = { version = "0.6", optional = true }
 regex = "1"
 dashmap = "6"
 thiserror = "1"
@@ -46,7 +47,8 @@ k8s-openapi = { version = "0.24", features = ["v1_32"], optional = true }
 kube-lease-manager = { version = "0.8", optional = true }
 
 [features]
-default = []
+default = ["jemalloc"]
+jemalloc = ["tikv-jemallocator"]
 kubernetes = ["kube", "k8s-openapi", "kube-lease-manager"]
 
 [dev-dependencies]
```

Dockerfile.dev

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,5 +1,5 @@
 # Build stage
-FROM rust:1.83-bookworm AS builder
+FROM rust:1.85-bookworm AS builder
 
 # Install build dependencies for rdkafka
 RUN apt-get update && apt-get install -y \
```

README.md

Lines changed: 52 additions & 1 deletion
````diff
@@ -513,6 +513,9 @@ max_concurrent_groups = 20 # Default: 10
 
 # Maximum partitions to fetch watermarks for in parallel
 max_concurrent_watermarks = 100 # Default: 50
+
+# Client recycling interval — see Memory Management section below
+client_recycle_interval = 50 # Default: 50 (set 0 to disable)
 ```
 
 ### Recommended Settings by Cluster Size
@@ -546,6 +549,52 @@ max_concurrent_watermarks = 100 # Default: 50
 
 4. **Consider running multiple instances** — Split monitoring across clusters or consumer group subsets using different whitelist patterns.
 
+### Memory Management
+
+#### The librdkafka metadata cache problem
+
+librdkafka (the C library underlying the Rust Kafka client) maintains an internal hash table of topic handles. Every time the exporter touches a topic — via watermark fetches, offset lookups, or config queries — librdkafka creates or reuses a handle for that topic. These handles are **never freed** until the client is destroyed. There is no API to evict individual entries.
+
+On large clusters with thousands of topics, the internal cache grows with every collection cycle. If topics are created and deleted over time (topic churn), the handle count only increases — deleted topics remain as stale entries.
+
+#### Client recycling
+
+To prevent unbounded memory growth, klag-exporter periodically destroys and recreates its internal Kafka clients, releasing all accumulated metadata. This is controlled by the `client_recycle_interval` setting:
+
+```toml
+[exporter.performance]
+# Number of collection cycles between client recycling.
+# Set to 0 to disable (recommended for small/stable clusters).
+client_recycle_interval = 50 # Default: every 50 cycles (~25 min at 30s poll)
+```
+
+| Setting | When to use |
+|---------|-------------|
+| `0` (disabled) | Small clusters with few topics, or stable clusters with no topic churn |
+| `50` (default) | Large clusters with many topics or moderate topic churn |
+| `100+` | Large clusters where you want less frequent recycling overhead |
+
+Recycling is safe — it only runs between collection cycles after all in-flight operations have completed. The trade-off is a brief memory spike (~2-10 MB) while new clients are created before old ones are fully torn down.
+
+#### jemalloc
+
+klag-exporter uses [jemalloc](https://jemalloc.net/) as the default memory allocator (enabled via the `jemalloc` feature flag). jemalloc provides significantly better memory return behavior than glibc malloc, which tends to hold onto freed pages indefinitely in long-running processes.
+
+To disable jemalloc:
+
+```bash
+cargo build --release --no-default-features
+```
+
+#### Timestamp consumer pool sizing
+
+Each entry in the timestamp consumer pool (`max_concurrent_fetches`) is a full librdkafka client with its own background threads and connection state, consuming ~5-15 MB of memory. Size the pool to match your actual concurrency needs, not your topic or partition count:
+
+```toml
+[exporter.timestamp_sampling]
+max_concurrent_fetches = 5 # Default: 5. Each is a full Kafka client.
+```
+
 ## Troubleshooting
 
 ### Time Lag Shows Gaps in Grafana
@@ -564,9 +613,11 @@ This is expected when:
 
 ### High Memory Usage
 
-- Reduce `max_concurrent_fetches`
+- Reduce `max_concurrent_fetches` — each concurrent fetch is a full librdkafka client (~5-15 MB)
 - Use `granularity = "topic"` instead of `"partition"`
 - Add more restrictive `group_blacklist` / `topic_blacklist` patterns
+- On large clusters with topic churn, ensure `client_recycle_interval` is enabled (see the Memory Management section above)
+- jemalloc is the default allocator and provides much better memory behavior than glibc malloc; disable with `--no-default-features` only if needed
 
 ### Connection Errors
 
````

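The jemalloc wiring itself is not shown in this diff. Under the assumption that the `jemalloc` feature gates a standard `tikv-jemallocator` setup, it would look roughly like the sketch below; the `malloc_conf` export syntax is an assumption based on common usage of the crate, so verify against the tikv-jemallocator and jemalloc tuning documentation before relying on it:

```rust
// Sketch only (not part of the diff): requires the tikv-jemallocator crate
// behind the `jemalloc` cargo feature.
#[cfg(feature = "jemalloc")]
#[global_allocator]
static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc;

// Tuning values from the commit message, expressed as jemalloc's
// compile-time malloc_conf override (string must be NUL-terminated).
#[cfg(feature = "jemalloc")]
#[allow(non_upper_case_globals)]
#[export_name = "malloc_conf"]
pub static malloc_conf: &[u8] =
    b"narenas:2,dirty_decay_ms:5000,muzzy_decay_ms:5000,background_thread:true\0";
```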
config.example.toml

Lines changed: 31 additions & 0 deletions
```diff
@@ -23,6 +23,37 @@ cache_ttl = "60s"
 # Maximum concurrent timestamp fetch operations
 max_concurrent_fetches = 10
 
+[exporter.performance]
+# Timeout for Kafka API operations (metadata, watermarks, etc.)
+# kafka_timeout = "30s"
+
+# Timeout for fetching committed offsets per consumer group
+# offset_fetch_timeout = "10s"
+
+# Maximum consumer groups to fetch offsets for in parallel
+# max_concurrent_groups = 10
+
+# Maximum partitions to fetch watermarks for in parallel
+# max_concurrent_watermarks = 50
+
+# Client recycling interval (number of collection cycles).
+#
+# librdkafka caches internal topic handles that are never freed until the
+# client is destroyed. On large clusters with many topics (especially with
+# topic churn — topics being created and deleted), this cache grows
+# unboundedly and can consume gigabytes of memory.
+#
+# Recycling periodically destroys and recreates the internal Kafka clients,
+# releasing all accumulated metadata. The trade-off is a brief allocation
+# spike during the swap (~2-10 MB depending on cluster size).
+#
+# Guidelines:
+#   0 = disabled (recommended for small/stable clusters)
+#  50 = default (~25 min at 30s poll; good for large clusters)
+# 100 = less frequent (lower overhead, more metadata accumulation)
+#
+# client_recycle_interval = 50
+
 [exporter.otel]
 # Enable OpenTelemetry export (default: false)
 enabled = false
```

src/cluster/manager.rs

Lines changed: 28 additions & 0 deletions
```diff
@@ -30,6 +30,7 @@ pub struct ClusterManager {
     max_concurrent_fetches: usize,
     cache_cleanup_interval: Duration,
     collection_timeout: Duration,
+    client_recycle_interval: u64,
 }
 
 impl ClusterManager {
@@ -90,6 +91,7 @@
             max_concurrent_fetches: exporter_config.timestamp_sampling.max_concurrent_fetches,
             cache_cleanup_interval: exporter_config.timestamp_sampling.cache_ttl * 2,
             collection_timeout,
+            client_recycle_interval: exporter_config.performance.client_recycle_interval,
         })
     }
 
@@ -102,6 +104,7 @@
         let mut consecutive_errors = 0u32;
         let mut current_backoff = Duration::from_secs(1);
         let mut was_leader = leadership.is_leader();
+        let mut cycle_count: u64 = 0;
 
         if !was_leader {
             info!(
@@ -145,6 +148,31 @@
                     consecutive_errors = 0;
                     current_backoff = Duration::from_secs(1);
                     self.registry.set_healthy(true);
+
+                    // Periodically recycle Kafka clients to release
+                    // accumulated librdkafka internal metadata
+                    if self.client_recycle_interval > 0 {
+                        cycle_count += 1;
+                        if cycle_count >= self.client_recycle_interval {
+                            cycle_count = 0;
+                            if let Err(e) = self.client.recycle() {
+                                warn!(
+                                    cluster = %self.cluster_name,
+                                    error = %e,
+                                    "Failed to recycle Kafka clients"
+                                );
+                            }
+                            if let Some(ref sampler) = self.timestamp_sampler {
+                                if let Err(e) = sampler.recycle_pool() {
+                                    warn!(
+                                        cluster = %self.cluster_name,
+                                        error = %e,
+                                        "Failed to recycle timestamp consumer pool"
+                                    );
+                                }
+                            }
+                        }
+                    }
                 }
                 Ok(Err(e)) => {
                     consecutive_errors += 1;
```

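The `recycle()` call in the loop above swaps the internal client behind a lock, per the commit message's description of wrapping AdminClient and BaseConsumer in `Mutex<Arc<...>>`. A minimal std-only sketch of that pattern, with a hypothetical `MockClient` standing in for the librdkafka-backed clients:

```rust
use std::sync::{Arc, Mutex};

/// Stand-in for a librdkafka-backed client (AdminClient / BaseConsumer).
struct MockClient {
    generation: u64,
}

/// Mirrors the Mutex<Arc<...>> wrapping described in the commit message.
struct KafkaClient {
    inner: Mutex<Arc<MockClient>>,
}

impl KafkaClient {
    fn new() -> Self {
        KafkaClient {
            inner: Mutex::new(Arc::new(MockClient { generation: 0 })),
        }
    }

    /// Borrow the current client for one operation; in-flight callers
    /// keep their Arc even if a recycle happens underneath them.
    fn client(&self) -> Arc<MockClient> {
        Arc::clone(&self.inner.lock().unwrap())
    }

    /// Destroy-and-recreate: the old client (and its accumulated
    /// librdkafka metadata) is freed once the last Arc to it drops.
    fn recycle(&self) -> Result<(), String> {
        let mut guard = self.inner.lock().unwrap();
        let next = guard.generation + 1;
        *guard = Arc::new(MockClient { generation: next });
        Ok(())
    }
}

fn main() {
    let kc = KafkaClient::new();
    let held = kc.client(); // simulates an in-flight operation
    kc.recycle().unwrap();
    assert_eq!(held.generation, 0); // old client still usable
    assert_eq!(kc.client().generation, 1); // new operations get the fresh one
    println!("recycled to generation {}", kc.client().generation);
}
```

The `Arc` indirection is what makes recycling safe to run between cycles: any operation that grabbed the old client finishes against it, and teardown happens when the last reference drops.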
src/collector/timestamp_sampler.rs

Lines changed: 5 additions & 0 deletions
```diff
@@ -104,6 +104,11 @@ impl TimestampSampler {
             .collect()
     }
 
+    /// Recycle the underlying consumer pool to release accumulated metadata.
+    pub fn recycle_pool(&self) -> Result<()> {
+        self.inner.consumer.recycle_pool()
+    }
+
     pub fn clear_stale_entries(&self) {
         let now = Instant::now();
         self.inner
```

src/config.rs

Lines changed: 12 additions & 1 deletion
```diff
@@ -64,6 +64,12 @@ pub struct PerformanceConfig {
     /// Maximum number of partitions to fetch watermarks for in parallel
     #[serde(default = "default_max_concurrent_watermarks")]
     pub max_concurrent_watermarks: usize,
+    /// Number of collection cycles between Kafka client recycling.
+    /// Recycling destroys and recreates internal librdkafka clients to release
+    /// accumulated metadata that librdkafka never frees on its own.
+    /// Set to 0 to disable. Default: 50 (~25 min at 30s poll interval).
+    #[serde(default = "default_client_recycle_interval")]
+    pub client_recycle_interval: u64,
 }
 
 #[derive(Debug, Deserialize, Clone)]
@@ -154,7 +160,7 @@ fn default_cache_ttl() -> Duration {
 }
 
 fn default_max_concurrent_fetches() -> usize {
-    10
+    5
 }
 
 fn default_kafka_timeout() -> Duration {
@@ -173,6 +179,10 @@ fn default_max_concurrent_watermarks() -> usize {
     50
 }
 
+fn default_client_recycle_interval() -> u64 {
+    50
+}
+
 fn default_otel_endpoint() -> String {
     "http://localhost:4317".to_string()
 }
@@ -250,6 +260,7 @@ impl Default for PerformanceConfig {
             offset_fetch_timeout: default_offset_fetch_timeout(),
             max_concurrent_groups: default_max_concurrent_groups(),
             max_concurrent_watermarks: default_max_concurrent_watermarks(),
+            client_recycle_interval: default_client_recycle_interval(),
         }
     }
 }
```
