Skip to content

Commit 00d4c09

Browse files
committed
RocksDB config tuning
1 parent 79dd270 commit 00d4c09

3 files changed

Lines changed: 75 additions & 43 deletions

File tree

Cargo.lock

Lines changed: 9 additions & 16 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,7 +215,7 @@ revm-primitives = { version = "19.1.0", default-features = false, features = [
215215
revm-state = { version = "4.0.1", default-features = false, features = [
216216
"serde",
217217
] }
218-
rocksdb = "0.21.0"
218+
rocksdb = "0.22.0"
219219
# 0.8.2 doesn't build with Rust 1.87. Remove `=` once
220220
# https://github.com/linera-io/linera-protocol/issues/4742 is resolved.
221221
ruzstd = "=0.8.1"

linera-views/src/backends/rocks_db.rs

Lines changed: 65 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ use std::{
1414
};
1515

1616
use linera_base::ensure;
17-
use rocksdb::{BlockBasedOptions, Cache, DBCompactionStyle, SliceTransform};
17+
use rocksdb::{BlockBasedOptions, Cache, DBCompactionStyle, SliceTransform, WriteBufferManager};
1818
use serde::{Deserialize, Serialize};
19-
use sysinfo::{CpuRefreshKind, MemoryRefreshKind, RefreshKind, System};
19+
use sysinfo::{MemoryRefreshKind, RefreshKind, System};
2020
use tempfile::TempDir;
2121
use thiserror::Error;
2222

@@ -51,8 +51,25 @@ const MAX_VALUE_SIZE: usize = 3 * 1024 * 1024 * 1024 - 400;
5151
// For offset reasons we decrease by 400
5252
const MAX_KEY_SIZE: usize = 8 * 1024 * 1024 - 400;
5353

54-
const WRITE_BUFFER_SIZE: usize = 256 * 1024 * 1024; // 256 MiB
55-
const MAX_WRITE_BUFFER_NUMBER: i32 = 6;
54+
// RocksDB tuning constants; per-constant comments note where a value matches or diverges from the RocksDB default
55+
const TARGET_FILE_SIZE_BASE: u64 = 64 * 1024 * 1024; // 64 MiB (RocksDB default)
56+
const MAX_WRITE_BUFFER_NUMBER: i32 = 4; // RocksDB default is 2, we use 4 for more write buffering
57+
58+
/// Returns the available memory for this process, respecting cgroup limits if running in a container.
59+
fn get_available_memory(sys: &System) -> usize {
60+
// Prefer cgroup limit if running in a container (e.g., Kubernetes)
61+
sys.cgroup_limits()
62+
.map_or_else(|| sys.total_memory() as usize, |c| c.total_memory as usize)
63+
}
64+
65+
/// Returns the number of CPUs available to this process, respecting cgroup limits if running in a container.
66+
/// Uses `std::thread::available_parallelism()`, which accounts for cgroup CPU quotas on Linux.
67+
fn get_available_cpus() -> i32 {
68+
std::thread::available_parallelism()
69+
.map(|p| p.get() as i32)
70+
.unwrap_or(1)
71+
}
72+
5673
const HYPER_CLOCK_CACHE_BLOCK_SIZE: usize = 8 * 1024; // 8 KiB
5774

5875
/// The RocksDB client that we use.
@@ -340,32 +357,46 @@ impl RocksDbStoreInternal {
340357
std::fs::create_dir(path_buf.clone())?;
341358
}
342359
let sys = System::new_with_specifics(
343-
RefreshKind::nothing()
344-
.with_cpu(CpuRefreshKind::everything())
345-
.with_memory(MemoryRefreshKind::nothing().with_ram()),
360+
RefreshKind::nothing().with_memory(MemoryRefreshKind::nothing().with_ram()),
346361
);
347-
let num_cpus = sys.cpus().len() as i32;
348-
let total_ram = sys.total_memory() as usize;
362+
// Use cgroup-aware resource detection for containerized environments (e.g., Kubernetes)
363+
let num_cpus = get_available_cpus();
364+
let total_ram = get_available_memory(&sys);
365+
349366
let mut options = rocksdb::Options::default();
350367
options.create_if_missing(true);
351368
options.create_missing_column_families(true);
352-
// Flush in-memory buffer to disk more often
353-
options.set_write_buffer_size(WRITE_BUFFER_SIZE);
369+
370+
// Use smaller memtables (64 MiB) to flush more frequently.
371+
// This reduces the impact of range tombstone fragmentation on reads,
372+
// since tombstones in immutable/flushed memtables are pre-fragmented.
373+
let memtable_size = 64 * 1024 * 1024;
374+
options.set_write_buffer_size(memtable_size);
354375
options.set_max_write_buffer_number(MAX_WRITE_BUFFER_NUMBER);
355-
options.set_compression_type(rocksdb::DBCompressionType::Lz4);
356-
options.set_level_zero_slowdown_writes_trigger(8);
357-
options.set_level_zero_stop_writes_trigger(12);
358-
options.set_level_zero_file_num_compaction_trigger(2);
359-
// We deliberately give RocksDB one background thread *per* CPU so that
360-
// flush + (N-1) compactions can hammer the NVMe at full bandwidth while
361-
// still leaving enough CPU time for the foreground application threads.
362-
options.increase_parallelism(num_cpus);
363-
options.set_max_background_jobs(num_cpus);
364-
options.set_max_subcompactions(num_cpus as u32);
365-
options.set_level_compaction_dynamic_level_bytes(true);
366-
367-
options.set_compaction_style(DBCompactionStyle::Level);
368-
options.set_target_file_size_base(2 * WRITE_BUFFER_SIZE as u64);
376+
options.set_target_file_size_base(TARGET_FILE_SIZE_BASE);
377+
options.set_min_write_buffer_number_to_merge(2);
378+
// No compression at any level: all seven per-level entries below are `None`.
379+
// (Re-enable LZ4/Zstd on the deeper, colder levels if on-disk space becomes a concern.)
380+
options.set_compression_per_level(&[
381+
rocksdb::DBCompressionType::None, // L0
382+
rocksdb::DBCompressionType::None, // L1
383+
rocksdb::DBCompressionType::None, // L2
384+
rocksdb::DBCompressionType::None, // L3
385+
rocksdb::DBCompressionType::None, // L4
386+
rocksdb::DBCompressionType::None, // L5
387+
rocksdb::DBCompressionType::None, // L6
388+
]);
389+
// Give RocksDB more headroom before throttling writes
390+
options.set_level_zero_file_num_compaction_trigger(4);
391+
options.set_level_zero_slowdown_writes_trigger(12);
392+
options.set_level_zero_stop_writes_trigger(24);
393+
394+
let max_background_jobs = (num_cpus / 2).max(4);
395+
options.increase_parallelism(max_background_jobs);
396+
options.set_max_background_jobs(max_background_jobs);
397+
options.set_max_subcompactions(4);
398+
399+
options.set_compaction_style(DBCompactionStyle::Universal);
369400

370401
let mut block_options = BlockBasedOptions::default();
371402
block_options.set_pin_l0_filter_and_index_blocks_in_cache(true);
@@ -376,10 +407,18 @@ impl RocksDbStoreInternal {
376407
// - Follows common practice for database caching in server environments
377408
// - Prevents excessive memory pressure that could lead to swapping or OOM conditions
378409
block_options.set_block_cache(&Cache::new_hyper_clock_cache(
379-
total_ram / 4,
410+
total_ram / 6,
380411
HYPER_CLOCK_CACHE_BLOCK_SIZE,
381412
));
382413

414+
// Limit total memtable memory to 1/4 of available RAM across all column families.
415+
// When memory exceeds this limit, writers stall until flushes complete.
416+
// Smaller limit encourages more frequent flushes, reducing range tombstone
417+
// fragmentation overhead on reads.
418+
let write_buffer_manager =
419+
WriteBufferManager::new_write_buffer_manager(total_ram / 4, true);
420+
options.set_write_buffer_manager(&write_buffer_manager);
421+
383422
// Configure bloom filters for prefix iteration optimization
384423
block_options.set_bloom_filter(10.0, false);
385424
block_options.set_whole_key_filtering(false);

0 commit comments

Comments
 (0)