Skip to content

Commit 2daa80e

Browse files
committed
systemd: Add support for managing cgroups through systemd
`SystemdCgroup` takes a `parent`, which is the name of a slice, and a `unit`, which is the name of a slice or a scope unit, and provides methods to start and kill the unit, as well as to set properties for the unit. The modules `cpu`, `memory`, `cpuset`, and `pids` are designed to generate these properties quickly. They hide the differences between cgroups v1 and v2, and perform simple checks of the systemd version and of the arguments. Signed-off-by: Xuewei Niu <[email protected]>
1 parent d0b81d6 commit 2daa80e

File tree

12 files changed

+1523
-0
lines changed

12 files changed

+1523
-0
lines changed

Cargo.toml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ libc = "0.2"
1818
serde = { version = "1.0", features = ["derive"], optional = true }
1919
thiserror = "1"
2020
oci-spec = "0.6"
21+
zbus = "3.12"
22+
bit-vec = "0.6"
2123

2224
[dev-dependencies]
2325
libc = "0.2.76"

src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ pub mod manager;
99
pub use manager::{FsManager, Manager};
1010
pub mod stat;
1111
pub use stat::Stats;
12+
pub mod systemd;
1213

1314
/// The maximum value for CPU shares in cgroups v1
1415
pub const CPU_SHARES_V1_MAX: u64 = 262144;

src/systemd/cgroup.rs

Lines changed: 176 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,176 @@
1+
// Copyright (c) 2025 Ant Group
2+
//
3+
// SPDX-License-Identifier: Apache-2.0 or MIT
4+
//
5+
6+
use libc::SIGKILL;
7+
use zbus::blocking::Connection;
8+
use zbus::zvariant::Value;
9+
use zbus::Error as ZbusError;
10+
11+
use crate::fs::hierarchies;
12+
use crate::systemd::dbus::{
13+
SystemManager, BLOCK_IO_ACCOUNTING, CPU_ACCOUNTING, DEFAULT_DEPENDENCIES, DELEGATE,
14+
DESCRIPTION, IO_ACCOUNTING, MEMORY_ACCOUNTING, NO_SUCH_UNIT, PIDS, SLICE, TASKS_ACCOUNTING,
15+
UNIT_MODE_REPLACE, WANTS, WHO_ENUM_ALL,
16+
};
17+
use crate::systemd::error::{Error, Result};
18+
use crate::systemd::{utils, Property};
19+
use crate::CgroupPid;
20+
21+
/// Handle to a systemd unit (a slice or a scope) together with the slice it
/// is attached to.
///
/// The handle only stores names and the cgroup version flag; it holds no
/// connection to systemd.
#[derive(Debug, Clone)]
pub struct SystemdCgroup {
    /// The name of the slice the unit is attached to.
    slice: String,
    /// The name of the systemd unit (slice or scope).
    unit: String,
    /// Whether the systemd unit is using cgroup v2.
    v2: bool,
}
29+
30+
impl SystemdCgroup {
31+
pub fn new(slice: &str, unit: &str) -> Result<Self> {
32+
Ok(Self {
33+
slice: slice.to_string(),
34+
unit: unit.to_string(),
35+
v2: hierarchies::is_cgroup2_unified_mode(),
36+
})
37+
}
38+
}
39+
40+
impl SystemdCgroup {
41+
fn system_manager(&self) -> Result<SystemManager> {
42+
let system_bus = Connection::system()?;
43+
let proxy = SystemManager::new(&system_bus)?;
44+
45+
Ok(proxy)
46+
}
47+
48+
/// Start a slice or a scope unit controlled and supervised by systemd.
49+
///
50+
/// For more information, see:
51+
/// https://www.freedesktop.org/software/systemd/man/latest/systemd.unit.html
52+
/// https://www.freedesktop.org/software/systemd/man/latest/systemd.slice.html
53+
/// https://www.freedesktop.org/software/systemd/man/latest/systemd.scope.html
54+
pub fn start(&self, pid: CgroupPid) -> Result<()> {
55+
let mut properties: Vec<Property> = vec![
56+
(CPU_ACCOUNTING, Value::Bool(true)),
57+
(DEFAULT_DEPENDENCIES, Value::Bool(false)),
58+
// MemoryAccount is for cgroupsv2 as documented in dbus.
59+
// However, "github.com/opencontainer/runc" uses it for all.
60+
// Shall we follow the same way?
61+
(MEMORY_ACCOUNTING, Value::Bool(true)),
62+
(TASKS_ACCOUNTING, Value::Bool(true)),
63+
(DESCRIPTION, Value::Str("kata-containers unit".into())),
64+
(PIDS, Value::Array(vec![pid.pid as u32].into())),
65+
];
66+
67+
if self.v2() {
68+
properties.push((IO_ACCOUNTING, Value::Bool(true)));
69+
} else {
70+
properties.push((BLOCK_IO_ACCOUNTING, Value::Bool(true)));
71+
}
72+
73+
if utils::is_slice_unit(&self.unit) {
74+
// If we create a slice, the parent is defined via a Wants=.
75+
properties.push((WANTS, Value::Str(self.slice.as_str().into())));
76+
} else {
77+
// Otherwise it's a scope, which we put into a Slice=.
78+
properties.push((SLICE, Value::Str(self.slice.as_str().into())));
79+
// Assume scopes always support delegation (supported since systemd v218).
80+
properties.push((DELEGATE, Value::Bool(true)));
81+
}
82+
83+
let sysmgr = self.system_manager()?;
84+
85+
sysmgr.start_transient_unit(&self.unit, UNIT_MODE_REPLACE, &properties, &[])?;
86+
87+
Ok(())
88+
}
89+
90+
/// Set properties for the unit through dbus `SetUnitProperties`.
91+
pub fn set_properties(&self, properties: &Vec<Property>) -> Result<()> {
92+
let sysmgr = self.system_manager()?;
93+
94+
sysmgr.set_unit_properties(&self.unit, true, properties)?;
95+
96+
Ok(())
97+
}
98+
99+
/// Kill the unit through dbus `KillUnit` with `SIGKILL` signal.
100+
pub fn kill(&self) -> Result<()> {
101+
let sysmgr = self.system_manager()?;
102+
103+
let ret = sysmgr.kill_unit(&self.unit, WHO_ENUM_ALL, SIGKILL);
104+
105+
// Ignore no such unit error
106+
if let Err(ZbusError::MethodError(err_name, _, _)) = &ret {
107+
if err_name.as_str() == NO_SUCH_UNIT {
108+
return Ok(());
109+
}
110+
}
111+
ret?;
112+
113+
Ok(())
114+
}
115+
116+
/// Freeze the unit through dbus `FreezeUnit`.
117+
pub fn freeze(&self) -> Result<()> {
118+
let sysmgr = self.system_manager()?;
119+
120+
sysmgr.freeze_unit(&self.unit)?;
121+
122+
Ok(())
123+
}
124+
125+
/// Thaw the frozen unit through dbus `ThawUnit`.
126+
pub fn thaw(&self) -> Result<()> {
127+
let sysmgr = self.system_manager()?;
128+
129+
sysmgr.thaw_unit(&self.unit)?;
130+
131+
Ok(())
132+
}
133+
134+
/// Get the systemd version.
135+
pub fn systemd_version(&self) -> Result<usize> {
136+
let sysmgr = self.system_manager()?;
137+
138+
let version = sysmgr.version()?;
139+
let version = version
140+
.parse::<usize>()
141+
.map_err(|_| Error::CorruptedSystemdVersion(version.clone()))?;
142+
143+
Ok(version)
144+
}
145+
146+
/// Check if the unit exists.
147+
pub fn exists(&self) -> Result<bool> {
148+
let sysmgr = self.system_manager()?;
149+
150+
let ret = sysmgr.get_unit(&self.unit);
151+
152+
if let Err(ZbusError::MethodError(err_name, _, _)) = &ret {
153+
if err_name.as_str() == NO_SUCH_UNIT {
154+
return Ok(false);
155+
}
156+
}
157+
158+
ret?;
159+
160+
Ok(true)
161+
}
162+
163+
/// Add a process (tgid) to the unit through dbus
164+
/// `AttachProcessesToUnit`.
165+
pub fn add_process(&self, pid: CgroupPid, subcgroup: &str) -> Result<()> {
166+
let sysmgr = self.system_manager()?;
167+
168+
sysmgr.attach_processes_to_unit(&self.unit, subcgroup, &[pid.pid as u32])?;
169+
170+
Ok(())
171+
}
172+
173+
pub fn v2(&self) -> bool {
174+
self.v2
175+
}
176+
}

src/systemd/cpu.rs

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
// Copyright (c) 2025 Ant Group
2+
//
3+
// SPDX-License-Identifier: Apache-2.0 or MIT
4+
//
5+
6+
use crate::systemd::error::{Error, Result};
7+
use crate::systemd::{dbus, CPU_SYSTEMD_VERSION};
8+
use crate::{CPU_SHARES_V1_MAX, CPU_WEIGHT_V2_MAX};
9+
10+
/// Returns the property for CPU shares.
11+
///
12+
/// Please note that if the shares is obtained from OCI runtime spec, it
13+
/// MUST be converted, see [1] and `convert_shares_to_v2()`.
14+
///
15+
/// 1: https://github.com/containers/crun/blob/main/crun.1.md#cgroup-v2
16+
pub fn shares(shares: u64, v2: bool) -> Result<(&'static str, u64)> {
17+
let (id, shares) = if v2 {
18+
let shares = match shares {
19+
0 => 100,
20+
1..=CPU_WEIGHT_V2_MAX => shares,
21+
_ => return Err(Error::InvalidArgument),
22+
};
23+
24+
(dbus::CPU_WEIGHT, shares)
25+
} else {
26+
let shares = match shares {
27+
0 => 1024,
28+
2..=CPU_SHARES_V1_MAX => shares,
29+
_ => return Err(Error::InvalidArgument),
30+
};
31+
32+
(dbus::CPU_SHARES, shares)
33+
};
34+
35+
Ok((id, shares))
36+
}
37+
38+
/// Returns the property for CPU period.
39+
pub fn period(period: u64, systemd_version: usize) -> Result<(&'static str, u64)> {
40+
if systemd_version < CPU_SYSTEMD_VERSION {
41+
return Err(Error::ObsoleteSystemd);
42+
}
43+
44+
Ok((dbus::CPU_QUOTA_PERIOD_US, period))
45+
}
46+
47+
/// Return the property for CPU quota.
48+
pub fn quota(quota: u64) -> Result<(&'static str, u64)> {
49+
Ok((dbus::CPU_QUOTA_PER_SEC_US, quota))
50+
}

src/systemd/cpuset.rs

Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
// Copyright (c) 2025 Ant Group
2+
//
3+
// SPDX-License-Identifier: Apache-2.0 or MIT
4+
//
5+
6+
use bit_vec::BitVec;
7+
8+
use crate::systemd::error::{Error, Result};
9+
use crate::systemd::{dbus, CPUSET_SYSTEMD_VERSION};
10+
11+
const BYTE_IN_BITS: usize = 8;
12+
13+
/// Returns the property for cpuset CPUs.
14+
pub fn cpuset_cpus(cpus: &str, systemd_version: usize) -> Result<(&'static str, Vec<u8>)> {
15+
if systemd_version < CPUSET_SYSTEMD_VERSION {
16+
return Err(Error::ObsoleteSystemd);
17+
}
18+
19+
let mask = convert_list_to_mask(cpus)?;
20+
21+
Ok((dbus::ALLOWED_CPUS, mask))
22+
}
23+
24+
/// Returns the property for cpuset memory nodes.
25+
pub fn cpuset_mems(mems: &str, systemd_version: usize) -> Result<(&'static str, Vec<u8>)> {
26+
if systemd_version < CPUSET_SYSTEMD_VERSION {
27+
return Err(Error::ObsoleteSystemd);
28+
}
29+
30+
let mask = convert_list_to_mask(mems)?;
31+
32+
Ok((dbus::ALLOWED_MEMORY_NODES, mask))
33+
}
34+
35+
/// Convert cpuset cpus/mems from the string in comma-separated list format
36+
/// to bitmask restored in `Vec<u8>`, see [1].
37+
///
38+
/// 1: https://man7.org/linux/man-pages/man7/cpuset.7.html
39+
///
40+
/// # Arguments
41+
///
42+
/// * `list` - A string slice that holds the list of CPUs in the format
43+
/// "0-3,5,7".
44+
fn convert_list_to_mask(list: &str) -> Result<Vec<u8>> {
45+
let mut bit_vec = BitVec::from_elem(8, false);
46+
47+
let local_idx =
48+
|index: usize| -> usize { index / BYTE_IN_BITS * BYTE_IN_BITS + 7 - index % BYTE_IN_BITS };
49+
50+
for part1 in list.split(',') {
51+
let range: Vec<&str> = part1.split('-').collect();
52+
match range.len() {
53+
// x-
54+
1 => {
55+
let left: usize = range[0].parse().map_err(|_| Error::InvalidArgument)?;
56+
57+
while left >= bit_vec.len() {
58+
bit_vec.grow(BYTE_IN_BITS, false);
59+
}
60+
bit_vec.set(local_idx(left), true);
61+
}
62+
// x-y
63+
2 => {
64+
let left: usize = range[0].parse().map_err(|_| Error::InvalidArgument)?;
65+
let right: usize = range[1].parse().map_err(|_| Error::InvalidArgument)?;
66+
67+
while right >= bit_vec.len() {
68+
bit_vec.grow(BYTE_IN_BITS, false);
69+
}
70+
71+
for index in left..=right {
72+
bit_vec.set(local_idx(index), true);
73+
}
74+
}
75+
_ => {
76+
return Err(Error::InvalidArgument);
77+
}
78+
}
79+
}
80+
81+
let mut mask = bit_vec.to_bytes();
82+
mask.reverse();
83+
84+
Ok(mask)
85+
}
86+
87+
#[cfg(test)]
88+
mod tests {
89+
use crate::systemd::cpuset::convert_list_to_mask;
90+
91+
#[test]
92+
fn test_convert_list_to_mask() {
93+
let mask = convert_list_to_mask("2-4").unwrap();
94+
assert_eq!(vec![0b00011100 as u8], mask);
95+
96+
let mask = convert_list_to_mask("1,7").unwrap();
97+
assert_eq!(vec![0b10000010 as u8], mask);
98+
99+
let mask = convert_list_to_mask("0-4,9").unwrap();
100+
assert_eq!(vec![0b00000010 as u8, 0b00011111 as u8], mask);
101+
102+
assert!(convert_list_to_mask("1-3-4").is_err());
103+
104+
assert!(convert_list_to_mask("1-3,,").is_err());
105+
}
106+
}

0 commit comments

Comments
 (0)