diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4f43bd83..17ba893f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -5,6 +5,7 @@
 ### Added
 
 - \[[#311](https://github.com/rust-vmm/vm-memory/pull/311)\] Allow compiling without the ReadVolatile and WriteVolatile implementations
+- \[[#327](https://github.com/rust-vmm/vm-memory/pull/327)\] I/O virtual memory support via `IoMemory`, `IommuMemory`, and `Iommu`/`Iotlb`
 
 ### Changed
 
diff --git a/Cargo.toml b/Cargo.toml
index 4c884563..873a85df 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,6 +16,7 @@ default = ["rawfd"]
 backend-bitmap = []
 backend-mmap = ["dep:libc"]
 backend-atomic = ["arc-swap"]
+iommu = ["dep:rangemap"]
 rawfd = ["dep:libc"]
 xen = ["backend-mmap", "bitflags", "vmm-sys-util"]
 
@@ -23,6 +24,7 @@ xen = ["backend-mmap", "bitflags", "vmm-sys-util"]
 libc = { version = "0.2.39", optional = true }
 arc-swap = { version = "1.0.0", optional = true }
 bitflags = { version = "2.4.0", optional = true }
+rangemap = { version = "1.5.1", optional = true }
 thiserror = "1.0.40"
 vmm-sys-util = { version = "0.12.1", optional = true }
 
diff --git a/DESIGN.md b/DESIGN.md
index 5915f50e..c3098d85 100644
--- a/DESIGN.md
+++ b/DESIGN.md
@@ -2,8 +2,8 @@
 
 ## Objectives
 
-- Provide a set of traits for accessing and configuring the physical memory of
-  a virtual machine.
+- Provide a set of traits for accessing and configuring the physical and/or
+  I/O virtual memory of a virtual machine.
 - Provide a clean abstraction of the VM memory such that rust-vmm components
   can use it without depending on the implementation details specific to
   different VMMs.
@@ -122,6 +122,29 @@ let buf = &mut [0u8; 5];
 let result = guest_memory_mmap.write(buf, addr);
 ```
 
+### I/O Virtual Address Space
+
+When an IOMMU is used, there is no longer direct access to the guest (physical)
+address space, only to the I/O virtual address space. In this case:
+
+- `IoMemory` replaces `GuestMemory`: it requires callers to specify the access
+  permissions they need (which are relevant for virtual memory), and it drops
+  interfaces that imply a mostly linear memory layout, because virtual memory is
+  fragmented into many pages instead of a few (large) memory regions.
+  - Any `IoMemory` still has a `GuestMemory` inside as the underlying address
+    space, but if an IOMMU is used, that will generally not be guest physical
+    address space. With vhost-user, for example, it will be the VMM’s user
+    address space instead.
+  - `IommuMemory`, our only `IoMemory` implementation with actual IOMMU support,
+    uses an `Iommu` object to translate I/O virtual addresses (IOVAs) into VMM
+    user addresses (VUAs), which are then passed to the inner `GuestMemory`
+    implementation (like `GuestMemoryMmap`).
+- `GuestAddress` (for compatibility) refers to an address in any of these
+  address spaces:
+  - Guest physical addresses (GPAs) when no IOMMU is used,
+  - I/O virtual addresses (IOVAs),
+  - VMM user addresses (VUAs).
+
 ### Utilities and Helpers
 
 The following utilities and helper traits/macros are imported from the
@@ -143,7 +166,8 @@ with minor changes:
 
 - `Address` inherits `AddressValue`
 - `GuestMemoryRegion` inherits `Bytes`. The `Bytes` trait must be implemented.
-- `GuestMemory` has a generic implementation of `Bytes`.
+- `GuestMemory` has a generic implementation of `IoMemory`.
+- `IoMemory` has a generic implementation of `Bytes`.
 
 **Types**:
 
diff --git a/src/atomic.rs b/src/atomic.rs
index 4b20b2c4..4268b40f 100644
--- a/src/atomic.rs
+++ b/src/atomic.rs
@@ -2,7 +2,7 @@
 // Copyright (C) 2020 Red Hat, Inc.
All rights reserved. // SPDX-License-Identifier: Apache-2.0 -//! A wrapper over an `ArcSwap` struct to support RCU-style mutability. +//! A wrapper over an `ArcSwap` struct to support RCU-style mutability. //! //! With the `backend-atomic` feature enabled, simply replacing `GuestMemoryMmap` //! with `GuestMemoryAtomic` will enable support for mutable memory maps. @@ -15,17 +15,17 @@ use arc_swap::{ArcSwap, Guard}; use std::ops::Deref; use std::sync::{Arc, LockResult, Mutex, MutexGuard, PoisonError}; -use crate::{GuestAddressSpace, GuestMemory}; +use crate::{GuestAddressSpace, IoMemory}; /// A fast implementation of a mutable collection of memory regions. /// /// This implementation uses `ArcSwap` to provide RCU-like snapshotting of the memory map: -/// every update of the memory map creates a completely new `GuestMemory` object, and +/// every update of the memory map creates a completely new `IoMemory` object, and /// readers will not be blocked because the copies they retrieved will be collected once /// no one can access them anymore. Under the assumption that updates to the memory map /// are rare, this allows a very efficient implementation of the `memory()` method. #[derive(Clone, Debug)] -pub struct GuestMemoryAtomic { +pub struct GuestMemoryAtomic { // GuestAddressSpace, which we want to implement, is basically a drop-in // replacement for &M. Therefore, we need to pass to devices the `GuestMemoryAtomic` // rather than a reference to it. To obtain this effect we wrap the actual fields @@ -34,9 +34,9 @@ pub struct GuestMemoryAtomic { inner: Arc<(ArcSwap, Mutex<()>)>, } -impl From> for GuestMemoryAtomic { +impl From> for GuestMemoryAtomic { /// create a new `GuestMemoryAtomic` object whose initial contents come from - /// the `map` reference counted `GuestMemory`. + /// the `map` reference counted `IoMemory`. fn from(map: Arc) -> Self { let inner = (ArcSwap::new(map), Mutex::new(())); GuestMemoryAtomic { @@ -45,9 +45,9 @@ impl From> for GuestMemoryAtomic { } } -impl GuestMemoryAtomic { +impl GuestMemoryAtomic { /// create a new `GuestMemoryAtomic` object whose initial contents come from - /// the `map` `GuestMemory`. + /// the `map` `IoMemory`. pub fn new(map: M) -> Self { Arc::new(map).into() } @@ -75,7 +75,7 @@ impl GuestMemoryAtomic { } } -impl GuestAddressSpace for GuestMemoryAtomic { +impl GuestAddressSpace for GuestMemoryAtomic { type T = GuestMemoryLoadGuard; type M = M; @@ -86,14 +86,14 @@ impl GuestAddressSpace for GuestMemoryAtomic { /// A guard that provides temporary access to a `GuestMemoryAtomic`. This /// object is returned from the `memory()` method. It dereference to -/// a snapshot of the `GuestMemory`, so it can be used transparently to +/// a snapshot of the `IoMemory`, so it can be used transparently to /// access memory. #[derive(Debug)] -pub struct GuestMemoryLoadGuard { +pub struct GuestMemoryLoadGuard { guard: Guard>, } -impl GuestMemoryLoadGuard { +impl GuestMemoryLoadGuard { /// Make a clone of the held pointer and returns it. This is more /// expensive than just using the snapshot, but it allows to hold on /// to the snapshot outside the scope of the guard. 
It also allows @@ -104,7 +104,7 @@ impl GuestMemoryLoadGuard { } } -impl Clone for GuestMemoryLoadGuard { +impl Clone for GuestMemoryLoadGuard { fn clone(&self) -> Self { GuestMemoryLoadGuard { guard: Guard::from_inner(Arc::clone(&*self.guard)), @@ -112,7 +112,7 @@ impl Clone for GuestMemoryLoadGuard { } } -impl Deref for GuestMemoryLoadGuard { +impl Deref for GuestMemoryLoadGuard { type Target = M; fn deref(&self) -> &Self::Target { @@ -125,12 +125,12 @@ impl Deref for GuestMemoryLoadGuard { /// possibly after updating the memory map represented by the /// `GuestMemoryAtomic` that created the guard. #[derive(Debug)] -pub struct GuestMemoryExclusiveGuard<'a, M: GuestMemory> { +pub struct GuestMemoryExclusiveGuard<'a, M: IoMemory> { parent: &'a GuestMemoryAtomic, _guard: MutexGuard<'a, ()>, } -impl GuestMemoryExclusiveGuard<'_, M> { +impl GuestMemoryExclusiveGuard<'_, M> { /// Replace the memory map in the `GuestMemoryAtomic` that created the guard /// with the new memory map, `map`. The lock is then dropped since this /// method consumes the guard. @@ -143,7 +143,7 @@ impl GuestMemoryExclusiveGuard<'_, M> { #[cfg(feature = "backend-mmap")] mod tests { use super::*; - use crate::{GuestAddress, GuestMemory, GuestMemoryRegion, GuestUsize, MmapRegion}; + use crate::{GuestAddress, GuestMemory, GuestMemoryRegion, GuestUsize, IoMemory, MmapRegion}; type GuestMemoryMmap = crate::GuestMemoryMmap<()>; type GuestRegionMmap = crate::GuestRegionMmap<()>; @@ -159,7 +159,8 @@ mod tests { let mut iterated_regions = Vec::new(); let gmm = GuestMemoryMmap::from_ranges(®ions).unwrap(); let gm = GuestMemoryMmapAtomic::new(gmm); - let mem = gm.memory(); + let vmem = gm.memory(); + let mem = vmem.physical_memory().unwrap(); for region in mem.iter() { assert_eq!(region.len(), region_size as GuestUsize); @@ -178,7 +179,7 @@ mod tests { .map(|x| (x.0, x.1)) .eq(iterated_regions.iter().copied())); - let mem2 = mem.into_inner(); + let mem2 = vmem.into_inner(); for region in mem2.iter() { assert_eq!(region.len(), region_size as GuestUsize); } diff --git a/src/guest_memory.rs b/src/guest_memory.rs index 2944169d..08e48f48 100644 --- a/src/guest_memory.rs +++ b/src/guest_memory.rs @@ -44,6 +44,7 @@ use std::convert::From; use std::fs::File; use std::io; +use std::mem::size_of; use std::ops::{BitAnd, BitOr, Deref}; use std::rc::Rc; use std::sync::atomic::Ordering; @@ -53,7 +54,10 @@ use crate::address::{Address, AddressValue}; use crate::bitmap::{Bitmap, BS, MS}; use crate::bytes::{AtomicAccess, Bytes}; use crate::io::{ReadVolatile, WriteVolatile}; +#[cfg(feature = "iommu")] +use crate::iommu::Error as IommuError; use crate::volatile_memory::{self, VolatileSlice}; +use crate::{IoMemory, Permissions}; /// Errors associated with handling guest memory accesses. #[allow(missing_docs)] @@ -82,6 +86,10 @@ pub enum Error { /// The address to be read by `try_access` is outside the address range. #[error("The address to be read by `try_access` is outside the address range")] GuestAddressOverflow, + #[cfg(feature = "iommu")] + /// IOMMU translation error + #[error("IOMMU failed to translate guest address: {0}")] + IommuError(IommuError), } impl From for Error { @@ -353,7 +361,7 @@ pub trait GuestMemoryRegion: Bytes { /// ``` pub trait GuestAddressSpace { /// The type that will be used to access guest memory. - type M: GuestMemory; + type M: IoMemory; /// A type that provides access to the memory. 
type T: Clone + Deref; @@ -364,7 +372,7 @@ pub trait GuestAddressSpace { fn memory(&self) -> Self::T; } -impl GuestAddressSpace for &M { +impl GuestAddressSpace for &M { type M = M; type T = Self; @@ -373,7 +381,7 @@ impl GuestAddressSpace for &M { } } -impl GuestAddressSpace for Rc { +impl GuestAddressSpace for Rc { type M = M; type T = Self; @@ -382,7 +390,7 @@ impl GuestAddressSpace for Rc { } } -impl GuestAddressSpace for Arc { +impl GuestAddressSpace for Arc { type M = M; type T = Self; @@ -501,9 +509,9 @@ pub trait GuestMemory { /// - the error code returned by the callback 'f' /// - the size of the already handled data when encountering the first hole /// - the size of the already handled data when the whole range has been handled - fn try_access(&self, count: usize, addr: GuestAddress, mut f: F) -> Result + fn try_access<'a, F>(&'a self, count: usize, addr: GuestAddress, mut f: F) -> Result where - F: FnMut(usize, usize, MemoryRegionAddress, &Self::R) -> Result, + F: FnMut(usize, usize, MemoryRegionAddress, &'a Self::R) -> Result, { let mut cur = addr; let mut total = 0; @@ -584,15 +592,16 @@ pub trait GuestMemory { } } -impl Bytes for T { +impl Bytes for T { type E = Error; fn write(&self, buf: &[u8], addr: GuestAddress) -> Result { self.try_access( buf.len(), addr, - |offset, _count, caddr, region| -> Result { - region.write(&buf[offset..], caddr) + Permissions::Write, + |offset, count, caddr, region| -> Result { + region.write(&buf[offset..(offset + count)], caddr) }, ) } @@ -601,8 +610,9 @@ impl Bytes for T { self.try_access( buf.len(), addr, - |offset, _count, caddr, region| -> Result { - region.read(&mut buf[offset..], caddr) + Permissions::Read, + |offset, count, caddr, region| -> Result { + region.read(&mut buf[offset..(offset + count)], caddr) }, ) } @@ -668,9 +678,12 @@ impl Bytes for T { where F: ReadVolatile, { - self.try_access(count, addr, |_, len, caddr, region| -> Result { - region.read_volatile_from(caddr, src, len) - }) + self.try_access( + count, + addr, + Permissions::Write, + |_, len, caddr, region| -> Result { region.read_volatile_from(caddr, src, len) }, + ) } fn read_exact_volatile_from( @@ -696,11 +709,16 @@ impl Bytes for T { where F: WriteVolatile, { - self.try_access(count, addr, |_, len, caddr, region| -> Result { - // For a non-RAM region, reading could have side effects, so we - // must use write_all(). - region.write_all_volatile_to(caddr, dst, len).map(|()| len) - }) + self.try_access( + count, + addr, + Permissions::Read, + |_, len, caddr, region| -> Result { + // For a non-RAM region, reading could have side effects, so we + // must use write_all(). + region.write_all_volatile_to(caddr, dst, len).map(|()| len) + }, + ) } fn write_all_volatile_to(&self, addr: GuestAddress, dst: &mut F, count: usize) -> Result<()> @@ -718,17 +736,64 @@ impl Bytes for T { } fn store(&self, val: O, addr: GuestAddress, order: Ordering) -> Result<()> { - // `find_region` should really do what `to_region_addr` is doing right now, except - // it should keep returning a `Result`. 
- self.to_region_addr(addr) - .ok_or(Error::InvalidGuestAddress(addr)) - .and_then(|(region, region_addr)| region.store(val, region_addr, order)) + let expected = size_of::(); + + let completed = self.try_access( + expected, + addr, + Permissions::Write, + |offset, len, region_addr, region| -> Result { + assert_eq!(offset, 0); + if len < expected { + return Err(Error::PartialBuffer { + expected, + completed: len, + }); + } + region.store(val, region_addr, order).map(|()| expected) + }, + )?; + + if completed < expected { + Err(Error::PartialBuffer { + expected, + completed, + }) + } else { + Ok(()) + } } fn load(&self, addr: GuestAddress, order: Ordering) -> Result { - self.to_region_addr(addr) - .ok_or(Error::InvalidGuestAddress(addr)) - .and_then(|(region, region_addr)| region.load(region_addr, order)) + let expected = size_of::(); + let mut result = None::; + + let completed = self.try_access( + expected, + addr, + Permissions::Read, + |offset, len, region_addr, region| -> Result { + assert_eq!(offset, 0); + if len < expected { + return Err(Error::PartialBuffer { + expected, + completed: len, + }); + } + result = Some(region.load(region_addr, order)?); + Ok(expected) + }, + )?; + + if completed < expected { + Err(Error::PartialBuffer { + expected, + completed, + }) + } else { + // Must be set because `completed == expected` + Ok(result.unwrap()) + } } } diff --git a/src/io_memory.rs b/src/io_memory.rs new file mode 100644 index 00000000..e437ee5b --- /dev/null +++ b/src/io_memory.rs @@ -0,0 +1,183 @@ +// Copyright (C) 2025 Red Hat. All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +//! Provides a trait for virtual I/O memory. +//! +//! This trait is more stripped down than `GuestMemory` because the fragmented nature of virtual +//! memory does not allow a direct translation to long continuous regions. +//! +//! In addition, any access to virtual memory must be annotated with the intended access mode (i.e. +//! reading and/or writing). + +use crate::guest_memory::Result; +use crate::{bitmap, GuestAddress, GuestMemory, MemoryRegionAddress, VolatileSlice}; + +/// Permissions for accessing virtual memory. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +pub enum Permissions { + /// No permissions + No, + /// Read-only + Read, + /// Write-only + Write, + /// Allow both reading and writing + ReadWrite, +} + +impl Permissions { + /// Check whether the permissions `self` allow the given `access`. + pub fn allow(&self, access: Self) -> bool { + *self & access == access + } +} + +impl std::ops::BitOr for Permissions { + type Output = Permissions; + + /// Return the union of `self` and `rhs`. + fn bitor(self, rhs: Permissions) -> Self::Output { + use Permissions::*; + + match (self, rhs) { + (No, rhs) => rhs, + (lhs, No) => lhs, + (ReadWrite, _) | (_, ReadWrite) => ReadWrite, + (Read, Read) => Read, + (Read, Write) | (Write, Read) => ReadWrite, + (Write, Write) => Write, + } + } +} + +impl std::ops::BitAnd for Permissions { + type Output = Permissions; + + /// Return the intersection of `self` and `rhs`. + fn bitand(self, rhs: Permissions) -> Self::Output { + use Permissions::*; + + match (self, rhs) { + (No, _) | (_, No) => No, + (ReadWrite, rhs) => rhs, + (lhs, ReadWrite) => lhs, + (Read, Read) => Read, + (Read, Write) | (Write, Read) => No, + (Write, Write) => Write, + } + } +} + +/// Represents virtual I/O memory. 
+/// +/// `IoMemory` is generally backed by some “physical” `GuestMemory`, which then consists for +/// `GuestMemoryRegion` objects. However, the mapping from I/O virtual addresses (IOVAs) to +/// physical addresses may be arbitrarily fragmented. Translation is done via an IOMMU. +/// +/// Note in contrast to `GuestMemory`: +/// - Any IOVA range may consist of arbitrarily many underlying ranges in physical memory. +/// - Accessing an IOVA requires passing the intended access mode, and the IOMMU will check whether +/// the given access mode is permitted for the given IOVA. +/// - The translation result for a given IOVA may change over time (i.e. the physical address +/// associated with an IOVA may change). +pub trait IoMemory { + /// Underlying `GuestMemory` type. + type PhysicalMemory: GuestMemory; + + /// Return `true` if `addr..(addr + count)` is accessible with `access`. + fn range_accessible(&self, addr: GuestAddress, count: usize, access: Permissions) -> bool; + + /// Invokes callback `f` to handle data in the address range `[addr, addr + count)`, with + /// permissions `access`. + /// + /// The address range `[addr, addr + count)` may span more than one + /// [`GuestMemoryRegion`](trait.GuestMemoryRegion.html) object, or even have holes in it. + /// So [`try_access()`](trait.IoMemory.html#method.try_access) invokes the callback 'f' + /// for each [`GuestMemoryRegion`](trait.GuestMemoryRegion.html) object involved and returns: + /// - the error code returned by the callback 'f' + /// - the size of the already handled data when encountering the first hole + /// - the size of the already handled data when the whole range has been handled + fn try_access<'a, F>( + &'a self, + count: usize, + addr: GuestAddress, + access: Permissions, + f: F, + ) -> Result + where + F: FnMut( + usize, + usize, + MemoryRegionAddress, + &'a ::R, + ) -> Result; + + /// Returns a [`VolatileSlice`](struct.VolatileSlice.html) of `count` bytes starting at + /// `addr`. + /// + /// Note that because of the fragmented nature of virtual memory, it can easily happen that the + /// range `[addr, addr + count)` is not backed by a continuous region in our own virtual + /// memory, which will make generating the slice impossible. + fn get_slice( + &self, + addr: GuestAddress, + count: usize, + access: Permissions, + ) -> Result>>; + + /// If this virtual memory is just a plain `GuestMemory` object underneath without an IOMMU + /// translation layer in between, return that `GuestMemory` object. 
+ fn physical_memory(&self) -> Option<&Self::PhysicalMemory> { + None + } +} + +impl IoMemory for M { + type PhysicalMemory = M; + + fn range_accessible(&self, addr: GuestAddress, count: usize, _access: Permissions) -> bool { + if count <= 1 { + ::address_in_range(self, addr) + } else if let Some(end) = addr.0.checked_add(count as u64 - 1) { + ::address_in_range(self, addr) + && ::address_in_range(self, GuestAddress(end)) + } else { + false + } + } + + fn try_access<'a, F>( + &'a self, + count: usize, + addr: GuestAddress, + _access: Permissions, + f: F, + ) -> Result + where + F: FnMut( + usize, + usize, + MemoryRegionAddress, + &'a ::R, + ) -> Result, + { + ::try_access(self, count, addr, f) + } + + fn get_slice( + &self, + addr: GuestAddress, + count: usize, + _access: Permissions, + ) -> Result>> { + ::get_slice(self, addr, count) + } + + fn physical_memory(&self) -> Option<&Self::PhysicalMemory> { + Some(self) + } +} diff --git a/src/iommu.rs b/src/iommu.rs new file mode 100644 index 00000000..1b1fbcf1 --- /dev/null +++ b/src/iommu.rs @@ -0,0 +1,515 @@ +// Copyright (C) 2025 Red Hat. All rights reserved. +// +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE-BSD-3-Clause file. +// +// SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause + +//! Provide an interface for IOMMUs enabling I/O virtual address (IOVA) translation. +//! +//! All IOMMUs consist of an IOTLB ([`Iotlb`]), which is backed by a data source that can deliver +//! all mappings. For example, for vhost-user, that data source is the vhost-user front-end; i.e. +//! IOTLB misses require sending a notification to the front-end and awaiting a reply that supplies +//! the desired mapping. + +use crate::guest_memory::{Error as GuestMemoryError, Result as GuestMemoryResult}; +use crate::{ + bitmap, GuestAddress, GuestMemory, IoMemory, MemoryRegionAddress, Permissions, VolatileSlice, +}; +use rangemap::RangeMap; +use std::cmp; +use std::fmt::Debug; +use std::num::Wrapping; +use std::ops::{Deref, Range}; +use std::sync::Arc; + +/// Errors associated with IOMMU address translation. +#[derive(Debug, thiserror::Error)] +pub enum Error { + /// Lookup cannot be resolved. + #[error( + "Cannot translate I/O virtual address range {:#x}+{}: {reason}", + iova_range.base.0, + iova_range.length, + )] + CannotResolve { + /// IOVA range that could not be resolved + iova_range: IovaRange, + /// Some human-readable specifics about the reason + reason: String, + }, + + /// Wanted to translate an IOVA range into a single slice, but the range is fragmented. + #[error( + "Expected {:#x}+{} to be a continuous I/O virtual address range, but only {continuous_length} bytes are", + iova_range.base.0, + iova_range.length, + )] + Fragmented { + /// Full IOVA range that was to be translated + iova_range: IovaRange, + /// Length of the continuous head (i.e. the first fragment) + continuous_length: usize, + }, + + /// IOMMU is not configured correctly, and so cannot translate addresses. + #[error("IOMMU not configured correctly, cannot operate: {reason}")] + IommuMisconfigured { + /// Some human-readable specifics about the misconfiguration + reason: String, + }, +} + +/// An IOMMU, allowing translation of I/O virtual addresses (IOVAs). +/// +/// Generally, `Iommu` implementaions consist of an [`Iotlb`], which is supposed to be consulted +/// first for lookup requests. 
All misses and access failures then should be resolved by looking +/// up the affected ranges in the actual IOMMU (which has all current mappings) and putting the +/// results back into the IOTLB. A subsequent lookup in the IOTLB should result in a full +/// translation, which can then be returned. +pub trait Iommu: Debug + Send + Sync { + /// `Deref` type associated with the type that internally wraps the `Iotlb`. + /// + /// For example, the `Iommu` may keep the `Iotlb` wrapped in an `RwLock`, making this type + /// `RwLockReadGuard<'a, Iotlb>`. + /// + /// We need this specific type instead of a plain reference so that [`IotlbIterator`] can + /// actually own the reference and prolong its lifetime. + type IotlbGuard<'a>: Deref + 'a + where + Self: 'a; + + /// Translate the given range for the given access into the underlying address space. + /// + /// Any translation request is supposed to be fully served by an internal [`Iotlb`] instance. + /// Any misses or access failures should result in a lookup in the full IOMMU structures, + /// filling the IOTLB with the results, and then repeating the lookup in there. + fn translate( + &self, + iova: GuestAddress, + length: usize, + access: Permissions, + ) -> Result>, Error>; +} + +/// Mapping target in an IOMMU/IOTLB. +/// +/// This is the data to which each entry in an IOMMU/IOTLB maps. +#[derive(Clone, Copy, Debug, Eq, PartialEq)] +struct IommuMapping { + /// Difference between the mapped and the IOVA address, i.e. what to add to an IOVA address to + /// get the mapped adrress. + /// + /// We cannot store the more obvious mapped base address for this range because that would + /// allow rangemap to wrongfully merge consecutive map entries if they are a duplicate mapping + /// (which does happen). Storing the difference ensures that entries are only merged when they + /// are indeed consecutive. + /// + /// Note that we make no granularity restrictions (i.e. do not operate on a unit like pages), + /// so the source and target address may have arbitrary alignment. That is why both fields + /// here need to be separate and we cannot merge the two bits that are `permissions` with this + /// base address into a single `u64` field. + target_source_diff: Wrapping, + /// Allowed access for the mapped range + permissions: Permissions, +} + +/// Provides an IOTLB. +/// +/// The IOTLB caches IOMMU mappings. It must be preemptively updated whenever mappings are +/// restricted or removed; in contrast, adding mappings or making them more permissive does not +/// require preemptive updates, as subsequent accesses that violate the previous (more restrictive) +/// permissions will trigger TLB misses or access failures, which is then supposed to result in an +/// update from the outer [`Iommu`] object that performs the translation. +#[derive(Debug, Default)] +pub struct Iotlb { + /// Mappings of which we know. + /// + /// Note that the vhost(-user) specification makes no mention of a specific page size, even + /// though in practice the IOVA address space will be organized in terms of pages. However, we + /// cannot really rely on that (or any specific page size; it could be 4k, the guest page size, + /// or the host page size), so we need to be able to handle continuous ranges of any + /// granularity. + tlb: RangeMap, +} + +/// Iterates over a range of valid IOTLB mappings that together constitute a continuous range in +/// I/O virtual address space. 
+/// +/// Returned by [`Iotlb::lookup()`] and [`Iommu::translate()`] in case translation was successful +/// (i.e. the whole requested range is mapped and permits the given access). +#[derive(Clone, Debug)] +pub struct IotlbIterator> { + /// IOTLB that provides these mapings + iotlb: D, + /// I/O virtual address range left to iterate over + range: Range, + /// Requested access permissions + access: Permissions, +} + +/// Representation of an IOVA memory range (i.e. in the I/O virtual address space). +#[derive(Clone, Debug)] +pub struct IovaRange { + /// IOVA base address + pub base: GuestAddress, + /// Length (in bytes) of this range + pub length: usize, +} + +/// Representation of a mapped memory range in the underlying address space. +#[derive(Clone, Debug)] +pub struct MappedRange { + /// Base address in the underlying address space + pub base: GuestAddress, + /// Length (in bytes) of this mapping + pub length: usize, +} + +/// Lists the subranges in I/O virtual address space that turned out to not be accessible when +/// trying to access an IOVA range. +#[derive(Clone, Debug)] +pub struct IotlbFails { + /// Subranges not mapped at all + pub misses: Vec, + /// Subranges that are mapped, but do not allow the requested access mode + pub access_fails: Vec, +} + +/// [`IoMemory`] type that consists of an underlying [`GuestMemory`] object plus an [`Iommu`]. +/// +/// The underlying [`GuestMemory`] is basically the physical memory, and the [`Iommu`] translates +/// the I/O virtual address space that `IommuMemory` provides into that underlying physical address +/// space. +#[derive(Debug, Default)] +pub struct IommuMemory { + /// Physical memory + inner: M, + /// IOMMU to translate IOVAs into physical addresses + iommu: Arc, + /// Whether the IOMMU is even to be used or not; disabling it makes this a pass-through to + /// `inner`. + use_iommu: bool, +} + +impl IommuMapping { + /// Create a new mapping. + fn new(source_base: u64, target_base: u64, permissions: Permissions) -> Self { + IommuMapping { + target_source_diff: Wrapping(target_base) - Wrapping(source_base), + permissions, + } + } + + /// Map the given source address (IOVA) to its corresponding target address. + fn map(&self, iova: u64) -> u64 { + (Wrapping(iova) + self.target_source_diff).0 + } + + /// Return the permissions for this mapping. + fn permissions(&self) -> Permissions { + self.permissions + } +} + +impl Iotlb { + /// Create a new empty instance. + pub fn new() -> Self { + Default::default() + } + + /// Change the mapping of the given IOVA range. + pub fn set_mapping( + &mut self, + iova: GuestAddress, + map_to: GuestAddress, + length: usize, + perm: Permissions, + ) -> Result<(), Error> { + // Soft TODO: We may want to evict old entries here once the TLB grows to a certain size, + // but that will require LRU book-keeping. However, this is left for the future, because: + // - this TLB is not implemented in hardware, so we do not really have strong entry count + // constraints, and + // - it seems like at least Linux guests invalidate mappings often, automatically limiting + // our entry count. + + let mapping = IommuMapping::new(iova.0, map_to.0, perm); + self.tlb.insert(iova.0..(iova.0 + length as u64), mapping); + + Ok(()) + } + + /// Remove any mapping in the given IOVA range. + pub fn invalidate_mapping(&mut self, iova: GuestAddress, length: usize) { + self.tlb.remove(iova.0..(iova.0 + length as u64)); + } + + /// Remove all mappings. 
+ pub fn invalidate_all(&mut self) { + self.tlb.clear(); + } + + /// Perform a lookup for the given range and the given `access` mode. + /// + /// If the whole range is mapped and accessible, return an iterator over all mappings. + /// + /// If any part of the range is not mapped or does not permit the given access mode, return an + /// `Err(_)` that contains a list of all such subranges. + pub fn lookup>( + this: D, + iova: GuestAddress, + length: usize, + access: Permissions, + ) -> Result, IotlbFails> { + let full_range = iova.0..(iova.0 + length as u64); + + let has_misses = this.tlb.gaps(&full_range).any(|_| true); + let has_access_fails = this + .tlb + .overlapping(full_range.clone()) + .any(|(_, mapping)| !mapping.permissions().allow(access)); + + if has_misses || has_access_fails { + let misses = this + .tlb + .gaps(&full_range) + .map(|range| { + // Gaps are always cut down to the range given to `gaps()` + debug_assert!(range.start >= full_range.start && range.end <= full_range.end); + range.try_into().unwrap() + }) + .collect::>(); + + let access_fails = this + .tlb + .overlapping(full_range.clone()) + .filter(|(_, mapping)| !mapping.permissions().allow(access)) + .map(|(range, _)| { + let start = cmp::max(range.start, full_range.start); + let end = cmp::min(range.end, full_range.end); + (start..end).try_into().unwrap() + }) + .collect::>(); + + return Err(IotlbFails { + misses, + access_fails, + }); + } + + Ok(IotlbIterator { + iotlb: this, + range: full_range, + access, + }) + } +} + +impl> Iterator for IotlbIterator { + /// Addresses in the underlying address space + type Item = MappedRange; + + fn next(&mut self) -> Option { + // Note that we can expect the whole IOVA range to be mapped with the right access flags. + // The `IotlbIterator` is created by `Iotlb::lookup()` only if the whole range is mapped + // accessibly; we have a permanent reference to `Iotlb`, so the range cannot be invalidated + // in the meantime. + // Another note: It is tempting to have `IotlbIterator` wrap around the + // `rangemap::Overlapping` iterator, but that just takes a (lifetimed) reference to the + // map, not an owned reference (like RwLockReadGuard), which we want to use; so using that + // would probably require self-referential structs. + + if self.range.is_empty() { + return None; + } + + let (range, mapping) = self.iotlb.tlb.get_key_value(&self.range.start).unwrap(); + + assert!(mapping.permissions().allow(self.access)); + + let mapping_iova_start = self.range.start; + let mapping_iova_end = cmp::min(self.range.end, range.end); + let mapping_len = mapping_iova_end - mapping_iova_start; + + self.range.start = mapping_iova_end; + + Some(MappedRange { + base: GuestAddress(mapping.map(mapping_iova_start)), + length: mapping_len.try_into().unwrap(), + }) + } +} + +impl TryFrom> for IovaRange { + type Error = >::Error; + + fn try_from(range: Range) -> Result { + Ok(IovaRange { + base: GuestAddress(range.start), + length: (range.end - range.start).try_into()?, + }) + } +} + +impl IommuMemory { + /// Create a new `IommuMemory` instance. + pub fn new(inner: M, iommu: Arc, use_iommu: bool) -> Self { + IommuMemory { + inner, + iommu, + use_iommu, + } + } + + /// Create a new version of `self` with the underlying physical memory replaced. + /// + /// Note that the inner `Arc` reference to the IOMMU is cloned, i.e. both the existing and the + /// new `IommuMemory` object will share an IOMMU instance. (The `use_iommu` flag however is + /// copied, so is independent between the two instances.) 
+ pub fn inner_replaced(&self, inner: M) -> Self { + IommuMemory { + inner, + iommu: Arc::clone(&self.iommu), + use_iommu: self.use_iommu, + } + } + + /// Enable or disable the IOMMU. + /// + /// Disabling the IOMMU switches to pass-through mode, where every access is done directly on + /// the underlying physical memory. + pub fn set_iommu_enabled(&mut self, enabled: bool) { + self.use_iommu = enabled; + } + + /// Return a reference to the IOMMU. + pub fn iommu(&self) -> &Arc { + &self.iommu + } + + /// Return a reference to the inner physical memory object. + pub fn inner(&self) -> &M { + &self.inner + } +} + +impl Clone for IommuMemory { + fn clone(&self) -> Self { + IommuMemory { + inner: self.inner.clone(), + iommu: Arc::clone(&self.iommu), + use_iommu: self.use_iommu, + } + } +} + +impl IoMemory for IommuMemory { + type PhysicalMemory = M; + + fn range_accessible(&self, addr: GuestAddress, count: usize, access: Permissions) -> bool { + if !self.use_iommu { + return self.inner.range_accessible(addr, count, access); + } + + let Ok(mut translated_iter) = self.iommu.translate(addr, count, access) else { + return false; + }; + + translated_iter.all(|translated| { + self.inner + .range_accessible(translated.base, translated.length, access) + }) + } + + fn try_access<'a, F>( + &'a self, + count: usize, + addr: GuestAddress, + access: Permissions, + mut f: F, + ) -> GuestMemoryResult + where + F: FnMut( + usize, + usize, + MemoryRegionAddress, + &'a ::R, + ) -> GuestMemoryResult, + { + if !self.use_iommu { + return self.inner.try_access(count, addr, f); + } + + let translated = self + .iommu + .translate(addr, count, access) + .map_err(GuestMemoryError::IommuError)?; + + let mut total = 0; + for mapping in translated { + let handled = self.inner.try_access( + mapping.length, + mapping.base, + |inner_offset, count, in_region_addr, region| { + f(total + inner_offset, count, in_region_addr, region) + }, + )?; + + if handled == 0 { + break; + } else if handled > count { + return Err(GuestMemoryError::CallbackOutOfRange); + } + + total += handled; + // `GuestMemory::try_access()` only returns a short count when no more data needs to be + // processed, so we can stop here + if handled < mapping.length { + break; + } + } + + Ok(total) + } + + fn get_slice( + &self, + addr: GuestAddress, + count: usize, + access: Permissions, + ) -> GuestMemoryResult>> { + if !self.use_iommu { + return self.inner.get_slice(addr, count); + } + + // Ensure `count` is at least 1 so we can translate something + let adj_count = cmp::max(count, 1); + + let mut translated = self + .iommu + .translate(addr, adj_count, access) + .map_err(GuestMemoryError::IommuError)?; + + let mapping = translated.next().unwrap(); + if translated.next().is_some() { + return Err(GuestMemoryError::IommuError(Error::Fragmented { + iova_range: IovaRange { + base: addr, + length: count, + }, + continuous_length: mapping.length, + })); + } + + assert!(mapping.length == count || (count == 0 && mapping.length == 1)); + self.inner.get_slice(mapping.base, count) + } + + fn physical_memory(&self) -> Option<&Self::PhysicalMemory> { + if self.use_iommu { + None + } else { + Some(&self.inner) + } + } +} diff --git a/src/lib.rs b/src/lib.rs index b8fe5f40..a90e72e0 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -53,6 +53,14 @@ pub use guest_memory::{ pub mod io; pub use io::{ReadVolatile, WriteVolatile}; +pub mod io_memory; +pub use io_memory::{IoMemory, Permissions}; + +#[cfg(feature = "iommu")] +pub mod iommu; +#[cfg(feature = "iommu")] +pub use 
iommu::{Iommu, IommuMemory, Iotlb}; + #[cfg(feature = "backend-mmap")] pub mod mmap; diff --git a/src/mmap/mod.rs b/src/mmap/mod.rs index b8dd0842..72eceffa 100644 --- a/src/mmap/mod.rs +++ b/src/mmap/mod.rs @@ -77,7 +77,7 @@ pub enum Error { /// in the virtual address space of the calling process. #[derive(Debug)] pub struct GuestRegionMmap { - mapping: MmapRegion, + mapping: Arc>, guest_base: GuestAddress, } @@ -85,13 +85,21 @@ impl Deref for GuestRegionMmap { type Target = MmapRegion; fn deref(&self) -> &MmapRegion { - &self.mapping + self.mapping.as_ref() } } impl GuestRegionMmap { /// Create a new memory-mapped memory region for the guest's physical memory. pub fn new(mapping: MmapRegion, guest_base: GuestAddress) -> result::Result { + Self::with_arc(Arc::new(mapping), guest_base) + } + + /// Same as [`Self::new()`], but takes an `Arc`-wrapped `mapping`. + pub fn with_arc( + mapping: Arc>, + guest_base: GuestAddress, + ) -> result::Result { if guest_base.0.checked_add(mapping.size() as u64).is_none() { return Err(Error::InvalidGuestRegion); } @@ -101,6 +109,16 @@ impl GuestRegionMmap { guest_base, }) } + + /// Return a reference to the inner `Arc` (as opposed to + /// [`.deref()`](Self::deref()), which does not reference the `Arc`). + /// + /// The returned reference can be cloned to construct a new `GuestRegionMmap` with a different + /// base address (e.g. when switching between memory address spaces based on the guest physical + /// address vs. the VMM userspace virtual address). + pub fn get_mapping(&self) -> &Arc> { + &self.mapping + } } #[cfg(not(feature = "xen"))]
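
The following stand-alone sketch illustrates how the pieces introduced above are meant to fit together: the `Permissions` lattice and the `Iotlb` mapping/lookup flow. It is illustrative only and not part of the diff; it assumes a build with the `iommu` feature enabled and that the generic bound on `Iotlb::lookup()` elided by the rendering above is `D: Deref<Target = Iotlb>`, so that a plain `&Iotlb` can be passed.

```rust
use vm_memory::{GuestAddress, Iotlb, Permissions};

fn main() {
    // `Permissions` behaves like a small lattice: `|` is the union of access
    // modes, `&` the intersection, and `allow()` checks inclusion.
    assert!(Permissions::ReadWrite.allow(Permissions::Read));
    assert_eq!(Permissions::Read | Permissions::Write, Permissions::ReadWrite);
    assert_eq!(Permissions::Read & Permissions::Write, Permissions::No);

    // Map the (arbitrary, example-only) IOVA range [0x1000, 0x2000) read-only
    // to 0x8000_1000 in the underlying (physical / VMM user) address space.
    let mut iotlb = Iotlb::new();
    iotlb
        .set_mapping(
            GuestAddress(0x1000),
            GuestAddress(0x8000_1000),
            0x1000,
            Permissions::Read,
        )
        .unwrap();

    // A read lookup inside the mapped range succeeds and yields the translated
    // range(s); `&Iotlb` satisfies the assumed `Deref<Target = Iotlb>` bound.
    let mapped = Iotlb::lookup(&iotlb, GuestAddress(0x1800), 0x100, Permissions::Read)
        .expect("range is mapped and readable");
    for m in mapped {
        println!("IOVA chunk maps to {:#x}+{}", m.base.0, m.length);
    }

    // A write lookup on the same range fails; the offending subranges are
    // reported via `IotlbFails::access_fails`.
    assert!(Iotlb::lookup(&iotlb, GuestAddress(0x1800), 0x100, Permissions::Write).is_err());
}
```

In a full setup, an `Iommu` implementation would own such an `Iotlb` (e.g. behind an `RwLock`), refill it on the misses and access failures reported by `lookup()`, and an `IommuMemory` would then route `try_access()`/`get_slice()` calls through `Iommu::translate()` into the inner `GuestMemory`.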