@@ -14,9 +14,9 @@ use std::{
1414} ;
1515
1616use linera_base:: ensure;
17- use rocksdb:: { BlockBasedOptions , Cache , DBCompactionStyle , SliceTransform } ;
17+ use rocksdb:: { BlockBasedOptions , Cache , DBCompactionStyle , SliceTransform , WriteBufferManager } ;
1818use serde:: { Deserialize , Serialize } ;
19- use sysinfo:: { CpuRefreshKind , MemoryRefreshKind , RefreshKind , System } ;
19+ use sysinfo:: { MemoryRefreshKind , RefreshKind , System } ;
2020use tempfile:: TempDir ;
2121use thiserror:: Error ;
2222
@@ -51,8 +51,25 @@ const MAX_VALUE_SIZE: usize = 3 * 1024 * 1024 * 1024 - 400;
5151// For offset reasons we decrease by 400
5252const MAX_KEY_SIZE : usize = 8 * 1024 * 1024 - 400 ;
5353
54- const WRITE_BUFFER_SIZE : usize = 256 * 1024 * 1024 ; // 256 MiB
55- const MAX_WRITE_BUFFER_NUMBER : i32 = 6 ;
54+ // RocksDB tuning constants - values matching RocksDB defaults are noted explicitly for clarity
55+ const TARGET_FILE_SIZE_BASE : u64 = 64 * 1024 * 1024 ; // 64 MiB (RocksDB default)
56+ const MAX_WRITE_BUFFER_NUMBER : i32 = 4 ; // RocksDB default is 2, we use 4 for more write buffering
57+
58+ /// Returns the available memory for this process, respecting cgroup limits if running in a container.
59+ fn get_available_memory ( sys : & System ) -> usize {
60+ // Prefer cgroup limit if running in a container (e.g., Kubernetes)
61+ sys. cgroup_limits ( )
62+ . map_or_else ( || sys. total_memory ( ) as usize , |c| c. total_memory as usize )
63+ }
64+
/// Returns the number of CPUs available to this process, respecting cgroup limits if running in a container.
///
/// Relies on `std::thread::available_parallelism()`, which on Linux takes
/// cgroup CPU quotas into account, so containerized deployments observe their
/// allotted CPU count rather than the host's. Falls back to 1 when the value
/// cannot be determined.
fn get_available_cpus() -> i32 {
    match std::thread::available_parallelism() {
        Ok(parallelism) => parallelism.get() as i32,
        Err(_) => 1,
    }
}
72+
5673const HYPER_CLOCK_CACHE_BLOCK_SIZE : usize = 8 * 1024 ; // 8 KiB
5774
5875/// The RocksDB client that we use.
@@ -340,32 +357,46 @@ impl RocksDbStoreInternal {
340357 std:: fs:: create_dir ( path_buf. clone ( ) ) ?;
341358 }
342359 let sys = System :: new_with_specifics (
343- RefreshKind :: nothing ( )
344- . with_cpu ( CpuRefreshKind :: everything ( ) )
345- . with_memory ( MemoryRefreshKind :: nothing ( ) . with_ram ( ) ) ,
360+ RefreshKind :: nothing ( ) . with_memory ( MemoryRefreshKind :: nothing ( ) . with_ram ( ) ) ,
346361 ) ;
347- let num_cpus = sys. cpus ( ) . len ( ) as i32 ;
348- let total_ram = sys. total_memory ( ) as usize ;
362+ // Use cgroup-aware resource detection for containerized environments (e.g., Kubernetes)
363+ let num_cpus = get_available_cpus ( ) ;
364+ let total_ram = get_available_memory ( & sys) ;
365+
349366 let mut options = rocksdb:: Options :: default ( ) ;
350367 options. create_if_missing ( true ) ;
351368 options. create_missing_column_families ( true ) ;
352- // Flush in-memory buffer to disk more often
353- options. set_write_buffer_size ( WRITE_BUFFER_SIZE ) ;
369+
370+ // Use smaller memtables (64 MiB) to flush more frequently.
371+ // This reduces the impact of range tombstone fragmentation on reads,
372+ // since tombstones in immutable/flushed memtables are pre-fragmented.
373+ let memtable_size = 64 * 1024 * 1024 ;
374+ options. set_write_buffer_size ( memtable_size) ;
354375 options. set_max_write_buffer_number ( MAX_WRITE_BUFFER_NUMBER ) ;
355- options. set_compression_type ( rocksdb:: DBCompressionType :: Lz4 ) ;
356- options. set_level_zero_slowdown_writes_trigger ( 8 ) ;
357- options. set_level_zero_stop_writes_trigger ( 12 ) ;
358- options. set_level_zero_file_num_compaction_trigger ( 2 ) ;
359- // We deliberately give RocksDB one background thread *per* CPU so that
360- // flush + (N-1) compactions can hammer the NVMe at full bandwidth while
361- // still leaving enough CPU time for the foreground application threads.
362- options. increase_parallelism ( num_cpus) ;
363- options. set_max_background_jobs ( num_cpus) ;
364- options. set_max_subcompactions ( num_cpus as u32 ) ;
365- options. set_level_compaction_dynamic_level_bytes ( true ) ;
366-
367- options. set_compaction_style ( DBCompactionStyle :: Level ) ;
368- options. set_target_file_size_base ( 2 * WRITE_BUFFER_SIZE as u64 ) ;
376+ options. set_target_file_size_base ( TARGET_FILE_SIZE_BASE ) ;
377+ options. set_min_write_buffer_number_to_merge ( 2 ) ;
378+ // Compression is disabled at every level to keep compaction and reads CPU-cheap;
379+ // revisit (e.g. LZ4 on deeper, colder levels) if on-disk size becomes a concern
380+ options. set_compression_per_level ( & [
381+ rocksdb:: DBCompressionType :: None , // L0
382+ rocksdb:: DBCompressionType :: None , // L1
383+ rocksdb:: DBCompressionType :: None , // L2
384+ rocksdb:: DBCompressionType :: None , // L3
385+ rocksdb:: DBCompressionType :: None , // L4
386+ rocksdb:: DBCompressionType :: None , // L5
387+ rocksdb:: DBCompressionType :: None , // L6
388+ ] ) ;
389+ // Give RocksDB more headroom before throttling writes
390+ options. set_level_zero_file_num_compaction_trigger ( 4 ) ;
391+ options. set_level_zero_slowdown_writes_trigger ( 12 ) ;
392+ options. set_level_zero_stop_writes_trigger ( 24 ) ;
393+
394+ let max_background_jobs = ( num_cpus / 2 ) . max ( 4 ) ;
395+ options. increase_parallelism ( max_background_jobs) ;
396+ options. set_max_background_jobs ( max_background_jobs) ;
397+ options. set_max_subcompactions ( 4 ) ;
398+
399+ options. set_compaction_style ( DBCompactionStyle :: Universal ) ;
369400
370401 let mut block_options = BlockBasedOptions :: default ( ) ;
371402 block_options. set_pin_l0_filter_and_index_blocks_in_cache ( true ) ;
@@ -376,10 +407,18 @@ impl RocksDbStoreInternal {
376407 // - Follows common practice for database caching in server environments
377408 // - Prevents excessive memory pressure that could lead to swapping or OOM conditions
378409 block_options. set_block_cache ( & Cache :: new_hyper_clock_cache (
379- total_ram / 4 ,
410+ total_ram / 6 ,
380411 HYPER_CLOCK_CACHE_BLOCK_SIZE ,
381412 ) ) ;
382413
414+ // Limit total memtable memory to 1/4 of available RAM across all column families.
415+ // When memory exceeds this limit, writers stall until flushes complete.
416+ // Smaller limit encourages more frequent flushes, reducing range tombstone
417+ // fragmentation overhead on reads.
418+ let write_buffer_manager =
419+ WriteBufferManager :: new_write_buffer_manager ( total_ram / 4 , true ) ;
420+ options. set_write_buffer_manager ( & write_buffer_manager) ;
421+
383422 // Configure bloom filters for prefix iteration optimization
384423 block_options. set_bloom_filter ( 10.0 , false ) ;
385424 block_options. set_whole_key_filtering ( false ) ;
0 commit comments