Commit 8ac6ed9

feat: supports capacity eviction management for the entire curvine cluster

1 parent: 1349800
File tree: 16 files changed, +779 −43 lines

Cargo.lock

Lines changed: 11 additions & 1 deletion (generated lockfile; diff not rendered)

Cargo.toml

Lines changed: 2 additions & 1 deletion

```diff
@@ -120,4 +120,5 @@ bigdecimal = "0.4.8"
 bitvec = "1.0.1"
 config = "0.13.4"
 tempfile = "3.21.0"
-md-5 = "0.10.6"
+md-5 = "0.10.6"
+lru = "0.16.1"
```

curvine-common/src/conf/master_conf.rs

Lines changed: 18 additions & 0 deletions

```diff
@@ -107,6 +107,15 @@ pub struct MasterConf {
     pub ttl_retry_interval: String,
     #[serde(skip)]
     pub ttl_retry_interval_unit: DurationUnit,
+
+    // Eviction configuration
+    pub enable_quota_eviction: bool,
+    pub quota_eviction_mode: String,
+    pub quota_eviction_policy: String,
+    pub quota_eviction_high_rate: f64,
+    pub quota_eviction_low_rate: f64,
+    pub quota_eviction_scan_page: i32,
+    pub quota_eviction_dry_run: bool,
 }
 
 impl MasterConf {
@@ -252,6 +261,15 @@ impl Default for MasterConf {
 
             ttl_retry_interval: "1s".to_string(),
             ttl_retry_interval_unit: Default::default(),
+
+            // Eviction configuration defaults
+            enable_quota_eviction: false,
+            quota_eviction_mode: "free".to_string(),
+            quota_eviction_policy: "lru".to_string(),
+            quota_eviction_high_rate: 0.8,
+            quota_eviction_low_rate: 0.6,
+            quota_eviction_scan_page: 2,
+            quota_eviction_dry_run: false,
         };
 
         conf.init().unwrap();
```
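
The defaults above suggest a classic watermark scheme: eviction presumably engages once usage crosses `quota_eviction_high_rate` (80% of capacity) and frees space until usage falls back to `quota_eviction_low_rate` (60%), with `quota_eviction_dry_run` reporting candidates without deleting anything. A minimal sketch of that watermark arithmetic, assuming byte-level accounting (the helper `bytes_to_evict` is illustrative, not code from this commit):

```rust
/// Sketch only: translate high/low watermarks into an eviction target.
/// `bytes_to_evict` is a hypothetical helper, not part of this commit.
fn bytes_to_evict(used: u64, capacity: u64, high_rate: f64, low_rate: f64) -> u64 {
    let usage = used as f64 / capacity as f64;
    if usage <= high_rate {
        return 0; // below the high watermark: nothing to evict
    }
    // Evict down to the low watermark, not just below the high one,
    // so the detector does not immediately re-trigger on the next tick.
    let target = (capacity as f64 * low_rate) as u64;
    used.saturating_sub(target)
}

fn main() {
    // With the defaults (0.8 / 0.6), a 100 GiB cluster at 85 GiB used
    // targets roughly 25 GiB for eviction.
    let gib: u64 = 1024 * 1024 * 1024;
    println!("{} bytes", bytes_to_evict(85 * gib, 100 * gib, 0.8, 0.6));
}
```

The gap between the two watermarks is what gives the scheme hysteresis: evicting only to just under the high rate would cause the detector to fire again almost immediately.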

curvine-server/Cargo.toml

Lines changed: 1 addition & 0 deletions

```diff
@@ -45,3 +45,4 @@ chrono = { workspace = true }
 rand = { workspace = true }
 url = { workspace = true }
 parking_lot = { workspace = true }
+lru = { workspace = true }
```
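
The `lru` crate added above (pinned at 0.16.1 in the workspace `Cargo.toml`) provides a capacity-bounded `LruCache`, presumably the backing structure for the new `LRUEvictor`. A self-contained sketch of the crate API this dependency brings in:

```rust
use lru::LruCache;
use std::num::NonZeroUsize;

fn main() {
    // A cache bounded to two entries; the least-recently-used entry is
    // dropped when a third is inserted.
    let mut cache: LruCache<u64, &str> = LruCache::new(NonZeroUsize::new(2).unwrap());
    cache.put(1, "inode-1");
    cache.put(2, "inode-2");
    cache.get(&1);           // touch: key 1 becomes most recently used
    cache.put(3, "inode-3"); // evicts key 2, the current LRU entry
    assert!(cache.get(&2).is_none());
    // pop_lru() hands back eviction candidates, oldest access first.
    assert_eq!(cache.pop_lru(), Some((1, "inode-1")));
}
```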

curvine-server/src/master/fs/heartbeat_checker.rs

Lines changed: 48 additions & 38 deletions

```diff
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 use crate::master::fs::MasterFilesystem;
+use crate::master::quota::QuotaManager;
 use crate::master::replication::master_replication_manager::MasterReplicationManager;
 use crate::master::MasterMonitor;
 use curvine_common::error::FsError;
@@ -30,6 +31,7 @@ pub struct HeartbeatChecker {
     worker_blacklist_ms: u64,
     worker_lost_ms: u64,
     replication_manager: Arc<MasterReplicationManager>,
+    quota_manager: Arc<QuotaManager>,
 }
 
 impl HeartbeatChecker {
@@ -38,6 +40,7 @@ impl HeartbeatChecker {
         monitor: MasterMonitor,
         executor: Arc<GroupExecutor>,
         replication_manager: Arc<MasterReplicationManager>,
+        quota_manager: Arc<QuotaManager>,
     ) -> Self {
         let worker_blacklist_ms = fs.conf.worker_blacklist_interval_ms();
         let worker_lost_ms = fs.conf.worker_lost_interval_ms();
@@ -48,6 +51,7 @@ impl HeartbeatChecker {
             worker_blacklist_ms,
             worker_lost_ms,
             replication_manager,
+            quota_manager,
         }
     }
 }
@@ -60,50 +64,56 @@ impl LoopTask for HeartbeatChecker {
             return Ok(());
         }
 
-        let mut wm = self.fs.worker_manager.write();
-        let workers = wm.get_last_heartbeat();
-        let now = LocalTime::mills();
+        {
+            let mut wm = self.fs.worker_manager.write();
+            let workers = wm.get_last_heartbeat();
+            let now = LocalTime::mills();
 
-        for (id, last_update) in workers {
-            if now > last_update + self.worker_blacklist_ms {
-                // Worker blacklist timeout
-                let worker = wm.add_blacklist_worker(id);
-                warn!(
-                    "Worker {:?} has no heartbeat for more than {} ms and will be blacklisted",
-                    worker, self.worker_blacklist_ms
-                );
-            }
+            for (id, last_update) in workers {
+                if now > last_update + self.worker_blacklist_ms {
+                    // Worker blacklist timeout
+                    let worker = wm.add_blacklist_worker(id);
+                    warn!(
+                        "Worker {:?} has no heartbeat for more than {} ms and will be blacklisted",
+                        worker, self.worker_blacklist_ms
+                    );
+                }
 
-            if now > last_update + self.worker_lost_ms {
-                // Heartbeat timeout
-                let removed = wm.remove_expired_worker(id);
-                warn!(
-                    "Worker {:?} has no heartbeat for more than {} ms and will be removed",
-                    removed, self.worker_lost_ms
-                );
-                // Asynchronously delete all block location data.
-                let fs = self.fs.clone();
-                let rm = self.replication_manager.clone();
-                let res = self.executor.spawn(move || {
-                    let spend = TimeSpent::new();
-                    let block_ids = try_log!(fs.delete_locations(id), vec![]);
-                    let block_num = block_ids.len();
-                    if let Err(e) = rm.report_under_replicated_blocks(id, block_ids) {
-                        error!(
-                            "Errors on reporting under-replicated {} blocks. err: {:?}",
-                            block_num, e
-                        );
-                    }
-                    info!(
-                        "Delete worker {} all locations used {} ms",
-                        id,
-                        spend.used_ms()
+                if now > last_update + self.worker_lost_ms {
+                    // Heartbeat timeout
+                    let removed = wm.remove_expired_worker(id);
+                    warn!(
+                        "Worker {:?} has no heartbeat for more than {} ms and will be removed",
+                        removed, self.worker_lost_ms
                     );
-                });
-                let _ = try_log!(res);
+                    // Asynchronously delete all block location data.
+                    let fs = self.fs.clone();
+                    let rm = self.replication_manager.clone();
+                    let res = self.executor.spawn(move || {
+                        let spend = TimeSpent::new();
+                        let block_ids = try_log!(fs.delete_locations(id), vec![]);
+                        let block_num = block_ids.len();
+                        if let Err(e) = rm.report_under_replicated_blocks(id, block_ids) {
+                            error!(
+                                "Errors on reporting under-replicated {} blocks. err: {:?}",
+                                block_num, e
+                            );
+                        }
+                        info!(
+                            "Delete worker {} all locations used {} ms",
+                            id,
+                            spend.used_ms()
+                        );
+                    });
+                    let _ = try_log!(res);
+                }
             }
         }
 
+        if let Ok(info) = self.fs.master_info() {
+            self.quota_manager.detector(Some(info));
+        };
+
         Ok(())
     }
```
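
Beyond threading `QuotaManager` into the checker, the large hunk above re-indents the worker scan into an inner `{ ... }` block. The point is lock scoping: the `worker_manager` write guard is dropped at the block's closing brace, and only then does the task call `self.quota_manager.detector(Some(info))`. Running the detector while still holding the write lock would stall worker heartbeats for the duration of a capacity scan, and would risk deadlock if eviction ever re-enters the worker manager. A stripped-down sketch of the pattern, with hypothetical types:

```rust
use std::sync::RwLock;

struct State {
    workers: Vec<u64>,
}

// Hypothetical shape of the heartbeat tick; names are illustrative.
fn heartbeat_tick(state: &RwLock<State>, run_detector: impl Fn()) {
    {
        // The write guard lives only inside this block and is released
        // at the closing brace, before any slower follow-up work.
        let mut guard = state.write().unwrap();
        guard.workers.retain(|id| *id != 0);
    } // <- write lock dropped here
    run_detector(); // runs without holding the worker lock
}
```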

curvine-server/src/master/fs/master_actor.rs

Lines changed: 13 additions & 1 deletion

```diff
@@ -17,6 +17,7 @@ use crate::master::fs::master_filesystem::MasterFilesystem;
 use crate::master::meta::inode::ttl::ttl_manager::InodeTtlManager;
 use crate::master::meta::inode::ttl::ttl_scheduler::TtlHeartbeatChecker;
 use crate::master::meta::inode::ttl_scheduler::TtlHeartbeatConfig;
+use crate::master::quota::QuotaManager;
 use crate::master::replication::master_replication_manager::MasterReplicationManager;
 use crate::master::MasterMonitor;
 use curvine_common::executor::ScheduledExecutor;
@@ -30,6 +31,7 @@ pub struct MasterActor {
     pub master_monitor: MasterMonitor,
     pub executor: Arc<GroupExecutor>,
     pub replication_manager: Arc<MasterReplicationManager>,
+    pub quota_manager: Arc<QuotaManager>,
 }
 
 impl MasterActor {
@@ -38,12 +40,14 @@ impl MasterActor {
         master_monitor: MasterMonitor,
         executor: Arc<GroupExecutor>,
         replication_manager: &Arc<MasterReplicationManager>,
+        quota_manager: Arc<QuotaManager>,
     ) -> Self {
         Self {
             fs,
             master_monitor,
             executor,
             replication_manager: replication_manager.clone(),
+            quota_manager,
         }
     }
 
@@ -54,6 +58,7 @@ impl MasterActor {
             self.master_monitor.clone(),
             self.executor.clone(),
             self.replication_manager.clone(),
+            self.quota_manager.clone(),
         )
         .unwrap();
 
@@ -105,11 +110,18 @@
         master_monitor: MasterMonitor,
         executor: Arc<GroupExecutor>,
         replication_manager: Arc<MasterReplicationManager>,
+        quota_manager: Arc<QuotaManager>,
     ) -> CommonResult<()> {
         let check_ms = fs.conf.worker_check_interval_ms();
         let scheduler = ScheduledExecutor::new("worker-heartbeat", check_ms);
 
-        let task = HeartbeatChecker::new(fs, master_monitor, executor, replication_manager);
+        let task = HeartbeatChecker::new(
+            fs,
+            master_monitor,
+            executor,
+            replication_manager,
+            quota_manager,
+        );
 
         scheduler.start(task)?;
         Ok(())
```

curvine-server/src/master/journal/journal_system.rs

Lines changed: 31 additions & 2 deletions

```diff
@@ -12,11 +12,16 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+use crate::master::eviction::evictor::LRUEvictor;
+use crate::master::eviction::types::EvictionPolicy;
+use crate::master::eviction::EvictionConf;
 use crate::master::fs::{MasterFilesystem, WorkerManager};
 use crate::master::journal::{JournalLoader, JournalWriter};
 use crate::master::meta::inode::ttl::ttl_bucket::TtlBucketList;
 use crate::master::meta::FsDir;
-use crate::master::{MasterMonitor, MetaRaftJournal, MountManager, SyncFsDir, SyncWorkerManager};
+use crate::master::{
+    MasterMonitor, MetaRaftJournal, MountManager, QuotaManager, SyncFsDir, SyncWorkerManager,
+};
 use curvine_common::conf::ClusterConf;
 use curvine_common::proto::raft::SnapshotData;
 use curvine_common::raft::storage::{AppStorage, LogStorage, RocksLogStorage};
@@ -39,6 +44,7 @@ pub struct JournalSystem {
     raft_journal: MetaRaftJournal,
     master_monitor: MasterMonitor,
     mount_manager: Arc<MountManager>,
+    quota_manager: Arc<QuotaManager>,
 }
 
 impl JournalSystem {
@@ -49,6 +55,7 @@ impl JournalSystem {
         raft_journal: MetaRaftJournal,
         master_monitor: MasterMonitor,
         mount_manager: Arc<MountManager>,
+        quota_manager: Arc<QuotaManager>,
     ) -> Self {
         Self {
             rt,
@@ -57,6 +64,7 @@ impl JournalSystem {
             raft_journal,
             master_monitor,
             mount_manager,
+            quota_manager,
         }
     }
 
@@ -88,7 +96,20 @@ impl JournalSystem {
         // Create TTL bucket list early with configuration
         let ttl_bucket_list = Arc::new(TtlBucketList::new(conf.master.ttl_bucket_interval_ms()));
 
-        let fs_dir = SyncFsDir::new(FsDir::new(conf, journal_writer, ttl_bucket_list)?);
+        let eviction_conf = EvictionConf::from_conf(conf);
+        let evictor = match eviction_conf.policy {
+            EvictionPolicy::Lru | EvictionPolicy::Lfu | EvictionPolicy::Arc => {
+                Arc::new(LRUEvictor::new(eviction_conf.clone()))
+            }
+        };
+
+        let fs_dir = SyncFsDir::new(FsDir::new(
+            conf,
+            journal_writer,
+            ttl_bucket_list,
+            evictor.clone(),
+        )?);
+
         let fs = MasterFilesystem::new(
             conf,
             fs_dir.clone(),
@@ -98,6 +119,9 @@ impl JournalSystem {
 
         let mount_manager = Arc::new(MountManager::new(fs.clone()));
 
+        let quota_manager =
+            QuotaManager::new(eviction_conf, fs.clone(), evictor.clone(), rt.clone());
+
         let raft_journal = MetaRaftJournal::new(
             rt.clone(),
             log_store,
@@ -113,6 +137,7 @@ impl JournalSystem {
             raft_journal,
             master_monitor,
             mount_manager,
+            quota_manager,
         );
 
         Ok(js)
@@ -149,6 +174,10 @@ impl JournalSystem {
         self.mount_manager.clone()
     }
 
+    pub fn quota_manager(&self) -> Arc<QuotaManager> {
+        self.quota_manager.clone()
+    }
+
     // Create a snapshot manually, dedicated for testing.
     pub fn create_snapshot(&self) -> RaftResult<()> {
         let data = self.raft_journal.app_store().create_snapshot(1, 1)?;
```
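
One detail worth flagging in the hunk above: the `match` on `eviction_conf.policy` maps all three variants (`Lru`, `Lfu`, and `Arc`) to `LRUEvictor`, so as of this commit LRU is the only real eviction implementation and the other policies are placeholders. A sketch of how that dispatch could generalize behind a trait object; the `Evictor` trait below is an assumption for illustration, not this repository's actual definition:

```rust
use std::sync::Arc;

// Hypothetical evictor abstraction; names are illustrative only.
trait Evictor: Send + Sync {
    /// Pick inodes to evict until roughly `bytes_needed` would be freed.
    fn select_victims(&self, bytes_needed: u64) -> Vec<u64>;
}

struct LruEvictor;

impl Evictor for LruEvictor {
    fn select_victims(&self, _bytes_needed: u64) -> Vec<u64> {
        Vec::new() // real logic would walk an LRU list of inodes
    }
}

enum EvictionPolicy {
    Lru,
    Lfu,
    Arc,
}

fn build_evictor(policy: EvictionPolicy) -> Arc<dyn Evictor> {
    match policy {
        // Mirrors the commit: every variant currently falls back to LRU.
        EvictionPolicy::Lru | EvictionPolicy::Lfu | EvictionPolicy::Arc => {
            Arc::new(LruEvictor)
        }
    }
}

fn main() {
    let evictor = build_evictor(EvictionPolicy::Lfu);
    assert!(evictor.select_victims(1024).is_empty());
}
```

Keeping the exhaustive `match` (rather than a `_` arm) means the compiler will force this call site to be revisited when a genuine LFU or ARC evictor lands.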

curvine-server/src/master/master_server.rs

Lines changed: 2 additions & 0 deletions

```diff
@@ -139,6 +139,7 @@ impl Master {
         let fs = journal_system.fs();
         let worker_manager = journal_system.worker_manager();
         let mount_manager = journal_system.mount_manager();
+        let quota_manager = journal_system.quota_manager();
 
         let rt = Arc::new(conf.master_server_conf().create_runtime());
 
@@ -149,6 +150,7 @@ impl Master {
             journal_system.master_monitor(),
             conf.master.new_executor(),
             &replication_manager,
+            quota_manager,
         );
 
         let job_manager = Arc::new(JobManager::from_cluster_conf(
```
