Skip to content

Commit f239c37

Browse files
authored
Separate fast and slow lgalloc metrics (#32565)
Switch to lgalloc 0.6, which allows us to separate fast and slow metrics collection. We'll collect the fast stats more frequently than the slow stats, or could even consider disabling collecting the slow stats. --------- Signed-off-by: Moritz Hoffmann <[email protected]>
1 parent 9b101de commit f239c37

File tree

9 files changed

+133
-31
lines changed

9 files changed

+133
-31
lines changed

Cargo.lock

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

src/cluster/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ async-trait = "0.1.88"
1515
crossbeam-channel = "0.5.15"
1616
differential-dataflow = "0.15.2"
1717
futures = "0.3.31"
18-
lgalloc = "0.5.0"
18+
lgalloc = "0.6.0"
1919
mz-cluster-client = { path = "../cluster-client" }
2020
mz-ore = { path = "../ore", features = ["async", "process", "tracing"] }
2121
mz-service = { path = "../service" }

src/compute/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ differential-dataflow = "0.15.2"
2020
differential-dogs3 = "0.1.13"
2121
futures = "0.3.31"
2222
itertools = "0.14.0"
23-
lgalloc = "0.5"
23+
lgalloc = "0.6"
2424
mz-cluster = { path = "../cluster" }
2525
mz-compute-client = { path = "../compute-client" }
2626
mz-compute-types = { path = "../compute-types" }

src/metrics/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ publish = false
1010
workspace = true
1111

1212
[dependencies]
13-
lgalloc = "0.5"
13+
lgalloc = "0.6"
1414
libc = "0.2.172"
1515
mz-dyncfg = { path = "../dyncfg" }
1616
mz-ore = { path = "../ore", features = ["metrics"] }

src/metrics/src/dyncfgs.rs

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,13 @@ use std::time::Duration;
1313

1414
use mz_dyncfg::{Config, ConfigSet};
1515

16+
/// How frequently to refresh lgalloc map stats.
17+
pub(crate) const MZ_METRICS_LGALLOC_MAP_REFRESH_INTERVAL: Config<Duration> = Config::new(
18+
"mz_metrics_lgalloc_map_refresh_interval",
19+
Duration::from_secs(60),
20+
"How frequently to refresh lgalloc stats. A zero duration disables refreshing.",
21+
);
22+
1623
/// How frequently to refresh lgalloc stats.
1724
pub(crate) const MZ_METRICS_LGALLOC_REFRESH_INTERVAL: Config<Duration> = Config::new(
1825
"mz_metrics_lgalloc_refresh_interval",
@@ -30,6 +37,7 @@ pub(crate) const MZ_METRICS_RUSAGE_REFRESH_INTERVAL: Config<Duration> = Config::
3037
/// Adds the full set of all storage `Config`s.
3138
pub fn all_dyncfgs(configs: ConfigSet) -> ConfigSet {
3239
configs
40+
.add(&MZ_METRICS_LGALLOC_MAP_REFRESH_INTERVAL)
3341
.add(&MZ_METRICS_LGALLOC_REFRESH_INTERVAL)
3442
.add(&MZ_METRICS_RUSAGE_REFRESH_INTERVAL)
3543
}

src/metrics/src/lgalloc.rs

Lines changed: 109 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
use std::collections::BTreeMap;
99
use std::ops::AddAssign;
1010

11-
use lgalloc::{FileStats, SizeClassStats};
11+
use lgalloc::{FileStats, MapStats};
1212
use mz_ore::cast::CastFrom;
1313
use mz_ore::metrics::{MetricsRegistry, raw};
1414
use paste::paste;
@@ -48,6 +48,11 @@ struct FileStatsAccum {
4848
pub file_size: usize,
4949
/// Size of the file on disk in bytes.
5050
pub allocated_size: usize,
51+
}
52+
53+
/// An accumulator for [`MapStats`].
54+
#[derive(Default)]
55+
struct MapStatsAccum {
5156
/// Number of mapped bytes, if different from `dirty`. Consult `man 7 numa` for details.
5257
pub mapped: usize,
5358
/// Number of active bytes. Consult `man 7 numa` for details.
@@ -60,6 +65,11 @@ impl AddAssign<&FileStats> for FileStatsAccum {
6065
fn add_assign(&mut self, rhs: &FileStats) {
6166
self.file_size += rhs.file_size;
6267
self.allocated_size += rhs.allocated_size;
68+
}
69+
}
70+
71+
impl AddAssign<&MapStats> for MapStatsAccum {
72+
fn add_assign(&mut self, rhs: &MapStats) {
6373
self.mapped += rhs.mapped;
6474
self.active += rhs.active;
6575
self.dirty += rhs.dirty;
@@ -118,32 +128,25 @@ macro_rules! metrics_size_class {
118128
}
119129
fn update(&mut self) -> Result<(), Error> {
120130
let stats = lgalloc::lgalloc_stats();
121-
for sc in &stats.size_class {
122-
let sc_stats = self.get_size_class(sc.size_class);
123-
$(sc_stats.$metric.set(($conv)(u64::cast_from(sc.$name), sc));)*
131+
for (size_class, sc) in &stats.size_class {
132+
let sc_stats = self.get_size_class(*size_class);
133+
$(sc_stats.$metric.set(($conv)(u64::cast_from(sc.$name), *size_class));)*
124134
}
125-
let mut accums = BTreeMap::new();
126-
match &stats.file_stats {
127-
Ok(file_stats) => {
128-
for file_stat in file_stats {
129-
let accum: &mut FileStatsAccum = accums.entry(file_stat.size_class).or_default();
135+
let mut f_accums = BTreeMap::new();
136+
for (size_class, file_stat) in &stats.file {
137+
let accum: &mut FileStatsAccum = f_accums.entry(*size_class).or_default();
138+
match file_stat {
139+
Ok(file_stat) => {
130140
accum.add_assign(file_stat);
131141
}
132-
}
133-
#[cfg(target_os = "linux")]
134-
Err(err) => {
135-
return Err(Error::new(ErrorKind::FileStatsFailed(err.to_string())));
136-
}
137-
#[cfg(not(target_os = "linux"))]
138-
Err(err) => {
139-
if err.kind() != std::io::ErrorKind::NotFound {
140-
return Err(Error::new(ErrorKind::FileStatsFailed(err.to_string())));
142+
Err(err) => {
143+
return Err(ErrorKind::FileStatsFailed(err.to_string()).into());
141144
}
142145
}
143146
}
144-
for (size_class, accum) in accums {
147+
for (size_class, f_accum) in f_accums {
145148
let sc_stats = self.get_size_class(size_class);
146-
$(sc_stats.$f_metric.set(u64::cast_from(accum.$f_name));)*
149+
$(sc_stats.$f_metric.set(u64::cast_from(f_accum.$f_name));)*
147150
}
148151
Ok(())
149152
}
@@ -152,11 +155,74 @@ macro_rules! metrics_size_class {
152155
};
153156
}
154157

155-
fn normalize_by_size_class(value: u64, stats: &SizeClassStats) -> u64 {
156-
value * u64::cast_from(stats.size_class)
158+
macro_rules! map_metrics {
159+
($namespace:ident
160+
@mem ($(($m_name:ident, $m_metric:ident, $m_desc:expr)),*)
161+
) => {
162+
map_metrics! {
163+
@define $namespace
164+
@mem $(($m_name, $m_metric, $m_desc)),*
165+
}
166+
};
167+
(@define $namespace:ident
168+
@mem $(($m_name:ident, $m_metric:ident, $m_desc:expr)),*
169+
) => {
170+
paste! {
171+
pub(crate) struct LgMapMetrics {
172+
size_class: BTreeMap<usize, LgMapMetricsSC>,
173+
$($m_metric: raw::UIntGaugeVec,)*
174+
}
175+
struct LgMapMetricsSC {
176+
$($m_metric: GenericGauge<AtomicU64>,)*
177+
}
178+
impl LgMapMetrics {
179+
fn new(registry: &MetricsRegistry) -> Self {
180+
Self {
181+
size_class: BTreeMap::default(),
182+
$($m_metric: registry.register(mz_ore::metric!(
183+
name: concat!(stringify!($namespace), "_", stringify!($m_metric)),
184+
help: $m_desc,
185+
var_labels: ["size_class"],
186+
)),)*
187+
}
188+
}
189+
fn get_size_class(&mut self, size_class: usize) -> &LgMapMetricsSC {
190+
self.size_class.entry(size_class).or_insert_with(|| {
191+
let labels: &[&str] = &[&size_class.to_string()];
192+
LgMapMetricsSC {
193+
$($m_metric: self.$m_metric.with_label_values(labels),)*
194+
}
195+
})
196+
}
197+
#[cfg(target_os = "linux")]
198+
fn update(&mut self) -> std::io::Result<()> {
199+
let stats = lgalloc::lgalloc_stats_with_mapping()?;
200+
let mut m_accums = BTreeMap::new();
201+
for (size_class, map_stat) in stats.map.iter().flatten() {
202+
let accum: &mut MapStatsAccum = m_accums.entry(*size_class).or_default();
203+
accum.add_assign(map_stat);
204+
}
205+
for (size_class, m_accum) in m_accums {
206+
let sc_stats = self.get_size_class(size_class);
207+
$(sc_stats.$m_metric.set(u64::cast_from(m_accum.$m_name));)*
208+
}
209+
Ok(())
210+
}
211+
#[cfg(not(target_os = "linux"))]
212+
fn update(&mut self) -> std::io::Result<()> {
213+
// On non-Linux systems, we do not have `numa_maps` stats.
214+
Ok(())
215+
}
216+
}
217+
}
218+
};
157219
}
158220

159-
fn id(value: u64, _stats: &SizeClassStats) -> u64 {
221+
fn normalize_by_size_class(value: u64, size_class: usize) -> u64 {
222+
value * u64::cast_from(size_class)
223+
}
224+
225+
fn id(value: u64, _size_class: usize) -> u64 {
160226
value
161227
}
162228

@@ -184,7 +250,13 @@ metrics_size_class! {
184250
)
185251
@file (
186252
(file_size, file_size_bytes, "Sum of file sizes in size class"),
187-
(allocated_size, file_allocated_size_bytes, "Sum of allocated sizes in size class"),
253+
(allocated_size, file_allocated_size_bytes, "Sum of allocated sizes in size class")
254+
)
255+
}
256+
257+
map_metrics! {
258+
mz_metrics_lgalloc
259+
@mem (
188260
(mapped, vm_mapped_bytes, "Sum of mapped sizes in size class"),
189261
(active, vm_active_bytes, "Sum of active sizes in size class"),
190262
(dirty, vm_dirty_bytes, "Sum of dirty sizes in size class")
@@ -196,10 +268,23 @@ pub(crate) fn register_metrics_into(metrics_registry: &MetricsRegistry) -> LgMet
196268
LgMetrics::new(metrics_registry)
197269
}
198270

271+
/// Register a task to read mapping-related lgalloc stats.
272+
pub(crate) fn register_map_metrics_into(metrics_registry: &MetricsRegistry) -> LgMapMetrics {
273+
LgMapMetrics::new(metrics_registry)
274+
}
275+
199276
impl MetricsUpdate for LgMetrics {
200277
type Error = Error;
201278
const NAME: &'static str = "lgalloc";
202279
fn update(&mut self) -> Result<(), Self::Error> {
203280
self.update()
204281
}
205282
}
283+
284+
impl MetricsUpdate for LgMapMetrics {
285+
type Error = std::io::Error;
286+
const NAME: &'static str = "lgalloc_map";
287+
fn update(&mut self) -> Result<(), Self::Error> {
288+
self.update()
289+
}
290+
}

src/metrics/src/lib.rs

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ pub mod rusage;
3434
pub struct Metrics {
3535
config_set: ConfigSet,
3636
lgalloc: MetricsTask,
37+
lgalloc_map: MetricsTask,
3738
rusage: MetricsTask,
3839
}
3940

@@ -61,6 +62,12 @@ pub async fn register_metrics_into(metrics_registry: &MetricsRegistry, config_se
6162
dyncfgs::MZ_METRICS_LGALLOC_REFRESH_INTERVAL,
6263
&update_duration_metric,
6364
);
65+
let lgalloc_map = Metrics::new_metrics_task(
66+
metrics_registry,
67+
lgalloc::register_map_metrics_into,
68+
dyncfgs::MZ_METRICS_LGALLOC_MAP_REFRESH_INTERVAL,
69+
&update_duration_metric,
70+
);
6471
let rusage = Metrics::new_metrics_task(
6572
metrics_registry,
6673
rusage::register_metrics_into,
@@ -70,6 +77,7 @@ pub async fn register_metrics_into(metrics_registry: &MetricsRegistry, config_se
7077

7178
*METRICS.lock().expect("lock poisoned") = Some(Metrics {
7279
lgalloc,
80+
lgalloc_map,
7381
rusage,
7482
config_set,
7583
});
@@ -89,6 +97,7 @@ impl Metrics {
8997
config_updates.apply(&self.config_set);
9098
// Notify tasks about updated configuration.
9199
self.lgalloc.update_dyncfg(&self.config_set);
100+
self.lgalloc_map.update_dyncfg(&self.config_set);
92101
self.rusage.update_dyncfg(&self.config_set);
93102
}
94103

src/ore/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ either = "1.15.0"
3535
futures = { version = "0.3.31", optional = true }
3636
hibitset = { version = "0.6.4", optional = true }
3737
itertools = "0.14.0"
38-
lgalloc = { version = "0.5", optional = true }
38+
lgalloc = { version = "0.6", optional = true }
3939
libc = { version = "0.2.172", optional = true }
4040
mz-ore-proc = { path = "../ore-proc", default-features = false }
4141
num = "0.4.3"

src/timely-util/Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ columnation = "0.1.0"
1818
differential-dataflow = "0.15.2"
1919
either = "1"
2020
futures-util = "0.3.31"
21-
lgalloc = "0.5"
21+
lgalloc = "0.6"
2222
mz-ore = { path = "../ore", features = ["async", "process", "tracing", "test", "num-traits", "region", "differential-dataflow", "overflowing"] }
2323
num-traits = "0.2"
2424
proptest = { version = "1.6.0", default-features = false, features = ["std"] }

0 commit comments

Comments
 (0)