diff --git a/src/vmm/src/devices/virtio/block/virtio/device.rs b/src/vmm/src/devices/virtio/block/virtio/device.rs index b11c757d43c..6bfa0fc9d92 100644 --- a/src/vmm/src/devices/virtio/block/virtio/device.rs +++ b/src/vmm/src/devices/virtio/block/virtio/device.rs @@ -25,7 +25,7 @@ use crate::devices::virtio::block::CacheType; use crate::devices::virtio::block::virtio::metrics::{BlockDeviceMetrics, BlockMetricsPerDevice}; use crate::devices::virtio::device::{DeviceState, IrqTrigger, IrqType, VirtioDevice}; use crate::devices::virtio::generated::virtio_blk::{ - VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, + VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_RO, VIRTIO_BLK_ID_BYTES, VIRTIO_BLK_F_DISCARD, }; use crate::devices::virtio::generated::virtio_config::VIRTIO_F_VERSION_1; use crate::devices::virtio::generated::virtio_ring::VIRTIO_RING_F_EVENT_IDX; @@ -295,7 +295,8 @@ impl VirtioBlock { .map_err(VirtioBlockError::RateLimiter)? .unwrap_or_default(); - let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX); + let mut avail_features = (1u64 << VIRTIO_F_VERSION_1) | (1u64 << VIRTIO_RING_F_EVENT_IDX) + | (1u64 << VIRTIO_BLK_F_DISCARD); if config.cache_type == CacheType::Writeback { avail_features |= 1u64 << VIRTIO_BLK_F_FLUSH; diff --git a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs index f83d9dea1df..a0cc1de4454 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/async_io.rs @@ -110,7 +110,7 @@ impl AsyncFileEngine { Ok(()) } - #[cfg(test)] + pub fn file(&self) -> &File { &self.file } @@ -230,6 +230,27 @@ impl AsyncFileEngine { Ok(()) } + pub fn push_discard( + &mut self, + offset: u64, + len: u32, + req: PendingRequest, + ) -> Result<(), RequestError> { + let wrapped_user_data = WrappedRequest::new(req); + + self.ring + .push(Operation::fallocate( + 0, + len, + offset, + wrapped_user_data, + )) + 
.map_err(|(io_uring_error, data)| RequestError { + req: data.req, + error: AsyncIoError::IoUring(io_uring_error), + }) + } + fn do_pop(&mut self) -> Result>, AsyncIoError> { self.ring.pop().map_err(AsyncIoError::IoUring) } diff --git a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs index 09cc7c4e31d..663f1d910c7 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/mod.rs @@ -6,9 +6,12 @@ pub mod sync_io; use std::fmt::Debug; use std::fs::File; +use libc::{c_int, off64_t, FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE}; +use std::os::unix::io::AsRawFd; pub use self::async_io::{AsyncFileEngine, AsyncIoError}; pub use self::sync_io::{SyncFileEngine, SyncIoError}; +use crate::device_manager::mmio::MMIO_LEN; use crate::devices::virtio::block::virtio::PendingRequest; use crate::devices::virtio::block::virtio::device::FileEngineType; use crate::vstate::memory::{GuestAddress, GuestMemoryMmap}; @@ -75,7 +78,7 @@ impl FileEngine { Ok(()) } - #[cfg(test)] + pub fn file(&self) -> &File { match self { FileEngine::Async(engine) => engine.file(), @@ -172,6 +175,32 @@ impl FileEngine { FileEngine::Sync(engine) => engine.flush().map_err(BlockIoError::Sync), } } + + pub fn discard( + &mut self, + offset: u64, + count: u32, + req: PendingRequest, + ) -> Result> { + + match self { + FileEngine::Async(engine) => match engine.push_discard(offset, count, req) { + Ok(_) => Ok(FileEngineOk::Submitted), + Err(err) => Err(RequestError { + req: err.req, + error: BlockIoError::Async(err.error), + }), + }, + FileEngine::Sync(engine) => match engine.discard(offset,count) { + Ok(count) => Ok(FileEngineOk::Executed(RequestOk { req, count })), + Err(err) => Err(RequestError { + req, + error: BlockIoError::Sync(err), + }), + }, + } + } + } #[cfg(test)] diff --git a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs index 
eec3b3d8b8d..b21a34ed29a 100644 --- a/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs +++ b/src/vmm/src/devices/virtio/block/virtio/io/sync_io.rs @@ -3,6 +3,8 @@ use std::fs::File; use std::io::{Seek, SeekFrom, Write}; +use libc::{c_int, off64_t, FALLOC_FL_KEEP_SIZE, FALLOC_FL_PUNCH_HOLE}; +use std::os::unix::io::AsRawFd; use vm_memory::{GuestMemoryError, ReadVolatile, WriteVolatile}; @@ -18,6 +20,8 @@ pub enum SyncIoError { SyncAll(std::io::Error), /// Transfer: {0} Transfer(GuestMemoryError), + /// Discard: {0} + Discard(std::io::Error) } #[derive(Debug)] @@ -33,7 +37,7 @@ impl SyncFileEngine { SyncFileEngine { file } } - #[cfg(test)] + pub fn file(&self) -> &File { &self.file } @@ -81,4 +85,30 @@ impl SyncFileEngine { // Sync data out to physical media on host. self.file.sync_all().map_err(SyncIoError::SyncAll) } + + pub fn discard(&mut self, offset: u64, len: u32) -> Result { + + unsafe { + let ret = libc::fallocate( + self.file.as_raw_fd(), + libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE, + offset as i64, + len as i64, + ); + if ret != 0 { + return Err(SyncIoError::Discard(std::io::Error::last_os_error())); + } + } + Ok(len) + } + + pub fn fallocate(fd: c_int, mode: i32, offset: off64_t, len: off64_t) -> Result<(), std::io::Error> { + let ret: i32 = unsafe { libc::fallocate(fd, mode, offset, len) }; + if ret == 0 { + Ok(()) + } else { + Err(std::io::Error::last_os_error()) + } + } + } diff --git a/src/vmm/src/devices/virtio/block/virtio/metrics.rs b/src/vmm/src/devices/virtio/block/virtio/metrics.rs index 0abe2a58963..44491c56b2d 100644 --- a/src/vmm/src/devices/virtio/block/virtio/metrics.rs +++ b/src/vmm/src/devices/virtio/block/virtio/metrics.rs @@ -157,6 +157,8 @@ pub struct BlockDeviceMetrics { pub invalid_reqs_count: SharedIncMetric, /// Number of flushes operation triggered on this block device. pub flush_count: SharedIncMetric, + /// Number of discard operation triggered on this block device. 
+ pub discard_count: SharedIncMetric, /// Number of events triggered on the queue of this block device. pub queue_event_count: SharedIncMetric, /// Number of events ratelimiter-related. @@ -210,6 +212,7 @@ impl BlockDeviceMetrics { self.invalid_reqs_count .add(other.invalid_reqs_count.fetch_diff()); self.flush_count.add(other.flush_count.fetch_diff()); + self.discard_count.add(other.discard_count.fetch_diff()); self.queue_event_count .add(other.queue_event_count.fetch_diff()); self.rate_limiter_event_count diff --git a/src/vmm/src/devices/virtio/block/virtio/mod.rs b/src/vmm/src/devices/virtio/block/virtio/mod.rs index 8ea59a5aba4..b9ba10761d2 100644 --- a/src/vmm/src/devices/virtio/block/virtio/mod.rs +++ b/src/vmm/src/devices/virtio/block/virtio/mod.rs @@ -63,4 +63,12 @@ pub enum VirtioBlockError { RateLimiter(std::io::Error), /// Persistence error: {0} Persist(crate::devices::virtio::persist::PersistError), + /// Sector overflow in discard segment + SectorOverflow, + /// Discard segment exceeds disk size + BeyondDiskSize, + /// Invalid flags in discard segment + InvalidDiscardFlags, + /// Invalid discard request (e.g., empty segments) + InvalidDiscardRequest, } diff --git a/src/vmm/src/devices/virtio/block/virtio/request.rs b/src/vmm/src/devices/virtio/block/virtio/request.rs index 00aba034943..a807c400d99 100644 --- a/src/vmm/src/devices/virtio/block/virtio/request.rs +++ b/src/vmm/src/devices/virtio/block/virtio/request.rs @@ -9,17 +9,19 @@ use std::convert::From; use vm_memory::GuestMemoryError; +use super::io::{BlockIoError, SyncIoError}; use super::{SECTOR_SHIFT, SECTOR_SIZE, VirtioBlockError, io as block_io}; use crate::devices::virtio::block::virtio::device::DiskProperties; use crate::devices::virtio::block::virtio::metrics::BlockDeviceMetrics; pub use crate::devices::virtio::generated::virtio_blk::{ VIRTIO_BLK_ID_BYTES, VIRTIO_BLK_S_IOERR, VIRTIO_BLK_S_OK, VIRTIO_BLK_S_UNSUPP, VIRTIO_BLK_T_FLUSH, VIRTIO_BLK_T_GET_ID, VIRTIO_BLK_T_IN, VIRTIO_BLK_T_OUT, + 
VIRTIO_BLK_T_DISCARD }; use crate::devices::virtio::queue::DescriptorChain; use crate::logger::{IncMetric, error}; use crate::rate_limiter::{RateLimiter, TokenType}; -use crate::vstate::memory::{ByteValued, Bytes, GuestAddress, GuestMemoryMmap}; +use crate::vstate::memory::{ByteValued, Bytes, GuestAddress, GuestMemoryMmap, Address}; #[derive(Debug, derive_more::From)] pub enum IoErr { @@ -34,6 +36,7 @@ pub enum RequestType { Out, Flush, GetDeviceID, + Discard, Unsupported(u32), } @@ -44,6 +47,7 @@ impl From for RequestType { VIRTIO_BLK_T_OUT => RequestType::Out, VIRTIO_BLK_T_FLUSH => RequestType::Flush, VIRTIO_BLK_T_GET_ID => RequestType::GetDeviceID, + VIRTIO_BLK_T_DISCARD => RequestType::Discard, t => RequestType::Unsupported(t), } } @@ -176,6 +180,12 @@ impl PendingRequest { (Ok(transferred_data_len), RequestType::GetDeviceID) => { Status::from_data(self.data_len, transferred_data_len, true) } + (Ok(_), RequestType::Discard) => { + block_metrics.discard_count.inc(); + Status::Ok { + num_bytes_to_mem: 0, + } + } (_, RequestType::Unsupported(op)) => Status::Unsupported { op }, (Err(err), _) => Status::IoErr { num_bytes_to_mem: 0, @@ -231,6 +241,16 @@ impl RequestHeader { } } +// For this, please see VirtIO v1.2. This struct mimics that of the implementation in Virtio. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq)] +#[repr(C)] +pub struct DiscardSegment { + sector: u64, + num_sectors: u32, + flags: u32, +} +unsafe impl ByteValued for DiscardSegment {} + #[derive(Debug, PartialEq, Eq)] pub struct Request { pub r#type: RequestType, @@ -238,6 +258,7 @@ pub struct Request { pub status_addr: GuestAddress, sector: u64, data_addr: GuestAddress, + discard_segments: Option>, } impl Request { @@ -258,10 +279,11 @@ impl Request { data_addr: GuestAddress(0), data_len: 0, status_addr: GuestAddress(0), + discard_segments: None }; let data_desc; - let status_desc; + let mut status_desc; let desc = avail_desc .next_descriptor() .ok_or(VirtioBlockError::DescriptorChainTooShort)?; @@ -272,6 +294,7 @@ impl Request { if req.r#type != RequestType::Flush { return Err(VirtioBlockError::DescriptorChainTooShort); } + data_desc = desc; } else { data_desc = desc; status_desc = data_desc @@ -287,6 +310,9 @@ impl Request { if !data_desc.is_write_only() && req.r#type == RequestType::GetDeviceID { return Err(VirtioBlockError::UnexpectedReadOnlyDescriptor); } + if data_desc.is_write_only() && req.r#type == RequestType::Discard { + return Err(VirtioBlockError::UnexpectedWriteOnlyDescriptor); + } req.data_addr = data_desc.addr; req.data_len = data_desc.len; @@ -313,6 +339,45 @@ impl Request { return Err(VirtioBlockError::InvalidDataLength); } } + RequestType::Discard => { + // Validate data length + let segment_size = std::mem::size_of::() as u32; + if data_desc.len % segment_size != 0 { + return Err(VirtioBlockError::InvalidDataLength); + } + + // Calculate number of segments + let num_segments = data_desc.len / segment_size; + if num_segments == 0 { + return Err(VirtioBlockError::InvalidDiscardRequest); + } + let mut segments = Vec::with_capacity(num_segments as usize); + + // Populate DiscardSegments vector + for i in 0..num_segments { + let offset = i * segment_size; + let segment: DiscardSegment = mem.read_obj(data_desc.addr.unchecked_add(offset as u64)) + 
.map_err(VirtioBlockError::GuestMemory)?; + if segment.flags & !0x1 != 0 { + return Err(VirtioBlockError::InvalidDiscardFlags); + } + let end_sector = segment.sector.checked_add(segment.num_sectors as u64) + .ok_or(VirtioBlockError::SectorOverflow)?; + if end_sector > num_disk_sectors { + return Err(VirtioBlockError::BeyondDiskSize); + } + segments.push(segment); + } + + // Assign to req.discard_segments + req.discard_segments = Some(segments); + req.data_len = data_desc.len; + status_desc = data_desc.next_descriptor() + .ok_or(VirtioBlockError::DescriptorChainTooShort)?; + if !status_desc.is_write_only() { + return Err(VirtioBlockError::UnexpectedReadOnlyDescriptor); + } + } _ => {} } @@ -390,6 +455,22 @@ impl Request { .map_err(IoErr::GetId); return ProcessingResult::Executed(pending.finish(mem, res, block_metrics)); } + RequestType::Discard => { + let _metric = block_metrics.write_agg.record_latency_metrics(); + // Discard the byte range described by the parsed DiscardSegment; the + // request-header sector and the segment-buffer length (data_len) do + // not describe the range to discard. + // NOTE(review): only the first segment is honored here; either loop + // over all parsed segments or advertise max_discard_seg = 1. + let Some(seg) = self.discard_segments.as_ref().and_then(|s| s.first()) else { + // Unreachable: parse() rejects discard requests with no segments. + return ProcessingResult::Executed(pending.finish(mem, Ok(0), block_metrics)); + }; + // TODO(review): num_sectors << SECTOR_SHIFT truncates for segments + // >= 4 GiB; widen the engine's count parameter to u64. + disk.file_engine + .discard(seg.sector << SECTOR_SHIFT, seg.num_sectors << SECTOR_SHIFT, pending) + } RequestType::Unsupported(_) => { return ProcessingResult::Executed(pending.finish(mem, Ok(0), block_metrics)); } @@ -730,6 +800,7 @@ mod tests { RequestType::Out => VIRTIO_BLK_T_OUT, RequestType::Flush => VIRTIO_BLK_T_FLUSH, RequestType::GetDeviceID => VIRTIO_BLK_T_GET_ID, + RequestType::Discard => VIRTIO_BLK_T_DISCARD, RequestType::Unsupported(id) => id, } } @@ -742,6 +813,7 @@ RequestType::Out => VIRTQ_DESC_F_NEXT, RequestType::Flush => VIRTQ_DESC_F_NEXT, RequestType::GetDeviceID => VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE, + RequestType::Discard => VIRTQ_DESC_F_NEXT, RequestType::Unsupported(_) => VIRTQ_DESC_F_NEXT, } } @@ -833,6 +905,7 @@ status_addr, sector: sector & (NUM_DISK_SECTORS - sectors_len), data_addr, + discard_segments: None }; let mut request_header = RequestHeader::new(virtio_request_id, request.sector); diff --git a/src/vmm/src/io_uring/operation/mod.rs b/src/vmm/src/io_uring/operation/mod.rs index 6307d1413c0..a7dd66a9c36 100644 ---
a/src/vmm/src/io_uring/operation/mod.rs +++ b/src/vmm/src/io_uring/operation/mod.rs @@ -29,6 +29,8 @@ pub enum OpCode { Write = generated::IORING_OP_WRITE as u8, /// Fsync operation. Fsync = generated::IORING_OP_FSYNC as u8, + /// Fallocate operation. + Fallocate = generated::IORING_OP_FALLOCATE as u8, } // Useful for outputting errors. @@ -38,6 +40,7 @@ impl From for &'static str { OpCode::Read => "read", OpCode::Write => "write", OpCode::Fsync => "fsync", + OpCode::Fallocate => "fallocate", } } } @@ -113,6 +116,26 @@ impl Operation { } } + /// Construct a fallocate operation used to discard (punch a hole in) a + /// byte range of the registered file. + /// + /// NOTE(review): for IORING_OP_FALLOCATE the kernel reads the mode from + /// sqe->len and the byte length from sqe->addr; confirm the SQE builder + /// maps this struct's flags/len fields accordingly for this opcode. + pub fn fallocate(fd: FixedFd, len: u32, offset: u64, user_data: T) -> Self { + Self { + fd, + opcode: OpCode::Fallocate, + addr: None, + len: Some(len), + // Deallocate without changing the file size (discard semantics); a + // zero mode would reserve space instead of punching a hole. + flags: (libc::FALLOC_FL_PUNCH_HOLE | libc::FALLOC_FL_KEEP_SIZE) as u32, + offset: Some(offset), + user_data, + } + } + pub(crate) fn fd(&self) -> FixedFd { self.fd } diff --git a/tests/integration_tests/functional/test_discard.py b/tests/integration_tests/functional/test_discard.py new file mode 100644 index 00000000000..05b594ee505 --- /dev/null +++ b/tests/integration_tests/functional/test_discard.py @@ -0,0 +1,38 @@ +import pytest +import os +import host_tools.drive as drive_tools + + +def test_discard_support(uvm_plain_rw): + """ + Test the VIRTIO_BLK_T_DISCARD feature for valid and invalid requests.
+ """ + + vm = uvm_plain_rw + vm.spawn() + vm.basic_config() + vm.add_net_iface() + + fs1 = drive_tools.FilesystemFile(os.path.join(vm.fsfiles, "test_disk"), size=128) + vm.add_drive( + drive_id="test_disk", + path_on_host=fs1.path, + is_root_device=False, + is_read_only=False, + ) + + vm.start() + + exit_code, stdout, _ = vm.ssh.run("cat /sys/block/vda/queue/discard_max_bytes") + + assert exit_code == 0, "Failed to read discard_max_bytes" + assert int(stdout.strip()) > 0, f"discard_max_bytes is 0: {stdout}" + + vm.ssh.run("mount -o remount,discard /") + exit_code, stdout, _ = vm.ssh.run("mount | grep /dev/vda") + assert exit_code == 0, f"Failed to remount root filesystem with discard option: {stdout}" + + exit_code, stdout, _ = vm.ssh.run("fstrim -v /") + assert exit_code == 0, f"fstrim -v failed: {stdout}" + assert "bytes trimmed" in stdout, f"Unexpected fstrim output: {stdout}" + vm.kill() \ No newline at end of file