Skip to content
This repository was archived by the owner on Jan 22, 2025. It is now read-only.

Commit ba04063

Browse files
authored
Add CPU metrics (#25802)
Add CPU utilization metrics: number of vCPUs, clock frequency, average load over the last one, five, and fifteen minutes, and total number of threads.
1 parent d4e4871 commit ba04063

File tree

5 files changed

+65
-2
lines changed

5 files changed

+65
-2
lines changed

core/src/system_monitor_service.rs

Lines changed: 54 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ use {
1212
thread::{self, sleep, Builder, JoinHandle},
1313
time::Duration,
1414
},
15+
sys_info::{Error, LoadAvg},
1516
};
1617

1718
const MS_PER_S: u64 = 1_000;
@@ -20,6 +21,7 @@ const MS_PER_H: u64 = MS_PER_M * 60;
2021
const SAMPLE_INTERVAL_UDP_MS: u64 = 2 * MS_PER_S;
2122
const SAMPLE_INTERVAL_OS_NETWORK_LIMITS_MS: u64 = MS_PER_H;
2223
const SAMPLE_INTERVAL_MEM_MS: u64 = MS_PER_S;
24+
const SAMPLE_INTERVAL_CPU_MS: u64 = MS_PER_S;
2325
const SLEEP_INTERVAL: Duration = Duration::from_millis(500);
2426

2527
#[cfg(target_os = "linux")]
@@ -41,6 +43,13 @@ struct UdpStats {
4143
ignored_multi: usize,
4244
}
4345

46+
struct CpuInfo {
47+
cpu_num: u32,
48+
cpu_freq_mhz: u64,
49+
load_avg: LoadAvg,
50+
num_threads: u64,
51+
}
52+
4453
impl UdpStats {
4554
fn from_map(udp_stats: &HashMap<String, usize>) -> Self {
4655
Self {
@@ -121,12 +130,18 @@ impl SystemMonitorService {
121130
exit: Arc<AtomicBool>,
122131
report_os_memory_stats: bool,
123132
report_os_network_stats: bool,
133+
report_os_cpu_stats: bool,
124134
) -> Self {
125135
info!("Starting SystemMonitorService");
126136
let thread_hdl = Builder::new()
127137
.name("system-monitor".to_string())
128138
.spawn(move || {
129-
Self::run(exit, report_os_memory_stats, report_os_network_stats);
139+
Self::run(
140+
exit,
141+
report_os_memory_stats,
142+
report_os_network_stats,
143+
report_os_cpu_stats,
144+
);
130145
})
131146
.unwrap();
132147

@@ -335,11 +350,45 @@ impl SystemMonitorService {
335350
}
336351
}
337352

338-
pub fn run(exit: Arc<AtomicBool>, report_os_memory_stats: bool, report_os_network_stats: bool) {
353+
fn cpu_info() -> Result<CpuInfo, Error> {
354+
let cpu_num = sys_info::cpu_num()?;
355+
let cpu_freq_mhz = sys_info::cpu_speed()?;
356+
let load_avg = sys_info::loadavg()?;
357+
let num_threads = sys_info::proc_total()?;
358+
359+
Ok(CpuInfo {
360+
cpu_num,
361+
cpu_freq_mhz,
362+
load_avg,
363+
num_threads,
364+
})
365+
}
366+
367+
fn report_cpu_stats() {
368+
if let Ok(info) = Self::cpu_info() {
369+
datapoint_info!(
370+
"cpu-stats",
371+
("cpu_num", info.cpu_num as i64, i64),
372+
("cpu0_freq_mhz", info.cpu_freq_mhz as i64, i64),
373+
("average_load_one_minute", info.load_avg.one, f64),
374+
("average_load_five_minutes", info.load_avg.five, f64),
375+
("average_load_fifteen_minutes", info.load_avg.fifteen, f64),
376+
("total_num_threads", info.num_threads as i64, i64),
377+
)
378+
}
379+
}
380+
381+
pub fn run(
382+
exit: Arc<AtomicBool>,
383+
report_os_memory_stats: bool,
384+
report_os_network_stats: bool,
385+
report_os_cpu_stats: bool,
386+
) {
339387
let mut udp_stats = None;
340388
let network_limits_timer = AtomicInterval::default();
341389
let udp_timer = AtomicInterval::default();
342390
let mem_timer = AtomicInterval::default();
391+
let cpu_timer = AtomicInterval::default();
343392

344393
loop {
345394
if exit.load(Ordering::Relaxed) {
@@ -356,6 +405,9 @@ impl SystemMonitorService {
356405
if report_os_memory_stats && mem_timer.should_update(SAMPLE_INTERVAL_MEM_MS) {
357406
Self::report_mem_stats();
358407
}
408+
if report_os_cpu_stats && cpu_timer.should_update(SAMPLE_INTERVAL_CPU_MS) {
409+
Self::report_cpu_stats();
410+
}
359411
sleep(SLEEP_INTERVAL);
360412
}
361413
}

core/src/validator.rs

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -159,6 +159,7 @@ pub struct ValidatorConfig {
159159
pub no_poh_speed_test: bool,
160160
pub no_os_memory_stats_reporting: bool,
161161
pub no_os_network_stats_reporting: bool,
162+
pub no_os_cpu_stats_reporting: bool,
162163
pub poh_pinned_cpu_core: usize,
163164
pub poh_hashes_per_batch: u64,
164165
pub account_indexes: AccountSecondaryIndexes,
@@ -219,6 +220,7 @@ impl Default for ValidatorConfig {
219220
no_poh_speed_test: true,
220221
no_os_memory_stats_reporting: true,
221222
no_os_network_stats_reporting: true,
223+
no_os_cpu_stats_reporting: true,
222224
poh_pinned_cpu_core: poh_service::DEFAULT_PINNED_CPU_CORE,
223225
poh_hashes_per_batch: poh_service::DEFAULT_HASHES_PER_BATCH,
224226
account_indexes: AccountSecondaryIndexes::default(),
@@ -497,6 +499,7 @@ impl Validator {
497499
Arc::clone(&exit),
498500
!config.no_os_memory_stats_reporting,
499501
!config.no_os_network_stats_reporting,
502+
!config.no_os_cpu_stats_reporting,
500503
));
501504

502505
let (poh_timing_point_sender, poh_timing_point_receiver) = unbounded();

ledger-tool/src/main.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2182,6 +2182,7 @@ fn main() {
21822182
Arc::clone(&exit_signal),
21832183
!no_os_memory_stats_reporting,
21842184
false,
2185+
false,
21852186
);
21862187

21872188
accounts_index_config.index_limit_mb = if let Some(limit) =

local-cluster/src/validator_configs.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig {
4747
no_poh_speed_test: config.no_poh_speed_test,
4848
no_os_memory_stats_reporting: config.no_os_memory_stats_reporting,
4949
no_os_network_stats_reporting: config.no_os_network_stats_reporting,
50+
no_os_cpu_stats_reporting: config.no_os_cpu_stats_reporting,
5051
poh_pinned_cpu_core: config.poh_pinned_cpu_core,
5152
account_indexes: config.account_indexes.clone(),
5253
accounts_db_caching_enabled: config.accounts_db_caching_enabled,

validator/src/main.rs

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -965,6 +965,11 @@ pub fn main() {
965965
.long("no-os-network-stats-reporting")
966966
.help("Disable reporting of OS network statistics.")
967967
)
968+
.arg(
969+
Arg::with_name("no_os_cpu_stats_reporting")
970+
.long("no-os-cpu-stats-reporting")
971+
.help("Disable reporting of OS CPU statistics.")
972+
)
968973
.arg(
969974
Arg::with_name("accounts-hash-interval-slots")
970975
.long("accounts-hash-interval-slots")
@@ -2529,6 +2534,7 @@ pub fn main() {
25292534
no_poh_speed_test: matches.is_present("no_poh_speed_test"),
25302535
no_os_memory_stats_reporting: matches.is_present("no_os_memory_stats_reporting"),
25312536
no_os_network_stats_reporting: matches.is_present("no_os_network_stats_reporting"),
2537+
no_os_cpu_stats_reporting: matches.is_present("no_os_cpu_stats_reporting"),
25322538
poh_pinned_cpu_core: value_of(&matches, "poh_pinned_cpu_core")
25332539
.unwrap_or(poh_service::DEFAULT_PINNED_CPU_CORE),
25342540
poh_hashes_per_batch: value_of(&matches, "poh_hashes_per_batch")

0 commit comments

Comments
 (0)