Skip to content

Commit 0e06506

Browse files
authored
Support guest memory sizes >=1028 GiB (#912)
Fixes #907. This change both produces an E820 table for OVMF to consult for highmem layout, and clamps the RTC CMOS bytes through which we communicate highmem size to OVMF. The E820 table supplants the RTC CMOS for OVMF's purposes, so in theory nothing will check those bytes anymore. Even so, clamping to them to `ffffff` seems more legible.
1 parent 4808da9 commit 0e06506

File tree

5 files changed

+243
-2
lines changed

5 files changed

+243
-2
lines changed

bin/propolis-server/src/lib/initializer.rs

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1112,6 +1112,28 @@ impl MachineInitializer<'_> {
11121112
smb_tables.commit()
11131113
}
11141114

1115+
fn generate_e820(&self) -> Result<Entry, MachineInitError> {
1116+
info!(self.log, "Generating E820 map for guest address space");
1117+
1118+
let mut e820_table = fwcfg::formats::E820Table::new();
1119+
1120+
for (addr, len, kind) in self.machine.map_physmem.mappings().into_iter()
1121+
{
1122+
let addr = addr.try_into().expect("usize should fit into u64");
1123+
let len = len.try_into().expect("usize should fit into u64");
1124+
match kind {
1125+
propolis::vmm::MapType::Dram => {
1126+
e820_table.add_mem(addr, len);
1127+
}
1128+
_ => {
1129+
e820_table.add_reserved(addr, len);
1130+
}
1131+
}
1132+
}
1133+
1134+
Ok(e820_table.finish())
1135+
}
1136+
11151137
fn generate_bootorder(&self) -> Result<Option<Entry>, MachineInitError> {
11161138
info!(
11171139
self.log,
@@ -1212,6 +1234,10 @@ impl MachineInitializer<'_> {
12121234
MachineInitError::FwcfgInsertFailed("bootorder", e)
12131235
})?;
12141236
}
1237+
let e820_entry = self.generate_e820()?;
1238+
fwcfg
1239+
.insert_named("etc/e820", e820_entry)
1240+
.map_err(|e| MachineInitError::FwcfgInsertFailed("e820", e))?;
12151241

12161242
let ramfb = ramfb::RamFb::create(
12171243
self.log.new(slog::o!("component" => "ramfb")),

bin/propolis-standalone/src/main.rs

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -964,6 +964,32 @@ fn generate_smbios(params: SmbiosParams) -> anyhow::Result<smbios::TableBytes> {
964964
Ok(smb_tables.commit())
965965
}
966966

967+
fn generate_e820(
968+
machine: &Machine,
969+
log: &slog::Logger,
970+
) -> anyhow::Result<fwcfg::Entry> {
971+
slog::info!(log, "Generating E820 map for guest address space",);
972+
973+
let mut e820_table = fwcfg::formats::E820Table::new();
974+
975+
use propolis::vmm::MapType;
976+
977+
for (addr, len, kind) in machine.map_physmem.mappings().into_iter() {
978+
let addr = addr.try_into().context("usize should fit into u64")?;
979+
let len = len.try_into().context("usize should fit into u64")?;
980+
match kind {
981+
MapType::Dram => {
982+
e820_table.add_mem(addr, len);
983+
}
984+
_ => {
985+
e820_table.add_reserved(addr, len);
986+
}
987+
}
988+
}
989+
990+
Ok(e820_table.finish())
991+
}
992+
967993
fn generate_bootorder(
968994
config: &config::Config,
969995
log: &slog::Logger,
@@ -1350,6 +1376,8 @@ fn setup_instance(
13501376
{
13511377
fwcfg.insert_named("bootorder", boot_config).unwrap();
13521378
}
1379+
let e820_entry = generate_e820(machine, log).expect("can build E820 table");
1380+
fwcfg.insert_named("etc/e820", e820_entry).unwrap();
13531381

13541382
fwcfg.attach(pio, &machine.acc_mem);
13551383

lib/propolis/src/hw/bhyve/rtc.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,15 @@ impl BhyveRtc {
9494

9595
// High-mem
9696
if high_mem > 0 {
97-
let high = ((high_mem / CHUNK) as u32).to_le_bytes();
97+
// If high_mem is 1TiB or larger, the division below produces a
98+
// number that overflows the 24 bits available at
99+
// `CMOS_OFF_MEM_HIGH`. Clamp the value so guests aren't subjected
100+
// to arbitrary wrapping. OVMF is told about the highmem layout via
101+
// E820 table anyway, so the only thing that might care about these
102+
// bytes are guest OSes that check the RTC CMOS bytes directly.
103+
let chunks =
104+
std::cmp::min(high_mem / CHUNK, u32::MAX as usize) as u32;
105+
let high = chunks.to_le_bytes();
98106

99107
hdl.rtc_write(CMOS_OFF_MEM_HIGH, high[0])?;
100108
hdl.rtc_write(CMOS_OFF_MEM_HIGH + 1, high[1])?;

lib/propolis/src/hw/qemu/fwcfg.rs

Lines changed: 153 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1057,6 +1057,158 @@ mod test {
10571057
pub mod formats {
10581058
use super::Entry;
10591059
use crate::hw::pci;
1060+
use zerocopy::{Immutable, IntoBytes};
1061+
1062+
/// A type for a range described in an E820 map entry.
1063+
///
1064+
/// This is canonically defined as the ACPI "Address Range Types", though we
1065+
/// only define the types we use, which are a subset of the types that EDK2
1066+
/// is known to care about, which itself is a subset of types that ACPI and
1067+
/// OEMs define or guest OSes may care about.
1068+
#[derive(IntoBytes, Immutable)]
1069+
#[repr(u32)]
1070+
enum EfiAcpiMemoryType {
1071+
Memory = 1,
1072+
Reserved = 2,
1073+
// For reference, though these types are unused.
1074+
// Acpi = 3,
1075+
// Nvs = 4,
1076+
}
1077+
1078+
/// One address/length/type entry in the E820 map.
1079+
///
1080+
/// This is... almost defined by ACPI's "Address Range Descriptor Structure"
1081+
/// table, under "INT 15H, E820". Critically, ACPI defines this structure
1082+
/// with an additional "Extended Attributes" field which EDK2 does not know
1083+
/// about and so we do not provide. Consequently the size of this struct is
1084+
/// 20 bytes as defined in `OvmfPkg/Include/IndustryStandard/E820.h` rather
1085+
/// than the ACPI definition's 24 bytes.
1086+
#[derive(IntoBytes, Immutable)]
1087+
#[repr(C, packed)]
1088+
struct E820Entry64 {
1089+
base_addr: u64,
1090+
length: u64,
1091+
ty: EfiAcpiMemoryType,
1092+
}
1093+
1094+
/// A list of E820 memory map entries.
1095+
///
1096+
/// This is not defined by ACPI, but is an EDK2 implementation of a QMEU
1097+
/// construct to communicate an E820 map to the firmware. It is parsed by
1098+
/// EDK2 and added to its EFI memory map; it is not, itself, the memory map
1099+
/// that OVMF presents via UEFI services. It is not required to be sorted,
1100+
/// and EDK2 ignores entries starting below 4 GiB. Adding additional
1101+
/// low-memory entries is not harmful, but not valuable to EDK2 either.
1102+
pub struct E820Table(Vec<E820Entry64>);
1103+
impl E820Table {
1104+
pub fn new() -> Self {
1105+
Self(Vec::new())
1106+
}
1107+
1108+
/// Add an address range corresponding to usable memory.
1109+
pub fn add_mem(&mut self, base_addr: u64, length: u64) {
1110+
self.0.push(E820Entry64 {
1111+
base_addr,
1112+
length,
1113+
ty: EfiAcpiMemoryType::Memory,
1114+
});
1115+
}
1116+
1117+
/// Add a reserved address, not to be used by the guest OS.
1118+
pub fn add_reserved(&mut self, base_addr: u64, length: u64) {
1119+
self.0.push(E820Entry64 {
1120+
base_addr,
1121+
length,
1122+
ty: EfiAcpiMemoryType::Reserved,
1123+
});
1124+
}
1125+
1126+
pub fn finish(self) -> Entry {
1127+
Entry::Bytes(self.0.as_bytes().to_vec())
1128+
}
1129+
}
1130+
1131+
#[cfg(test)]
1132+
mod test_e820 {
1133+
use super::{E820Entry64, E820Table};
1134+
use crate::hw::qemu::fwcfg::Entry;
1135+
1136+
#[test]
1137+
fn entry_size_is_correct() {
1138+
// Compare the size of our definition of an E820 to EDK2's
1139+
// definition. EDK2 interprets our provided bytes by its definition,
1140+
// so they must match.
1141+
assert_eq!(std::mem::size_of::<E820Entry64>(), 20);
1142+
}
1143+
1144+
#[test]
1145+
fn basic() {
1146+
let mut e820_table = E820Table::new();
1147+
1148+
// Arbitrary bit patterns here, just to make eyeballing the layout
1149+
// more straightforward.
1150+
//
1151+
// Also note the E820 table itself does not check if ranges overlap.
1152+
// In practice it is directly constructed from an ASpace, which does
1153+
// perform those checks.
1154+
e820_table.add_mem(0x0102_0304_0506_0010, 0x1122_3344_5566_7788);
1155+
e820_table
1156+
.add_reserved(0x0102_0304_0506_fff0, 0xffee_ddcc_bbaa_9988);
1157+
1158+
// We also don't require the E820 map to be ordered. ACPI does not
1159+
// imply that it should be, nor do EDK2 or guest OSes, even though
1160+
// entries are often enumerated in address order.
1161+
e820_table.add_mem(0x0102_0304_0506_0000, 0x1122_3344_5566_7799);
1162+
1163+
// rustfmt::skip here and below because eight bytes per line helps
1164+
// eyeball with the entries as written above. rustfmt would try to
1165+
// fit ten bytes per row to pack the 80-column width and that's just
1166+
// annoying here.
1167+
#[rustfmt::skip]
1168+
const FIRST_ENTRY: [u8; 20] = [
1169+
0x10, 0x00, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
1170+
0x88, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11,
1171+
0x01, 0x00, 0x00, 0x00,
1172+
];
1173+
1174+
#[rustfmt::skip]
1175+
const SECOND_ENTRY: [u8; 20] = [
1176+
0xf0, 0xff, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
1177+
0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff,
1178+
0x02, 0x00, 0x00, 0x00,
1179+
];
1180+
1181+
#[rustfmt::skip]
1182+
const THIRD_ENTRY: [u8; 20] = [
1183+
0x00, 0x00, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01,
1184+
0x99, 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11,
1185+
0x01, 0x00, 0x00, 0x00,
1186+
];
1187+
1188+
let entry = e820_table.finish();
1189+
let Entry::Bytes(bytes) = entry else {
1190+
panic!("entry did not produce bytes, but instead {:?}", entry);
1191+
};
1192+
1193+
let expected_size =
1194+
FIRST_ENTRY.len() + SECOND_ENTRY.len() + THIRD_ENTRY.len();
1195+
assert_eq!(bytes.len(), expected_size);
1196+
1197+
let tests = [
1198+
(&bytes[0..20], &FIRST_ENTRY, "First E820 entry"),
1199+
(&bytes[20..40], &SECOND_ENTRY, "Second E820 entry"),
1200+
(&bytes[40..60], &THIRD_ENTRY, "Third E820 entry"),
1201+
];
1202+
1203+
for (actual, expected, entry_name) in tests.iter() {
1204+
assert_eq!(
1205+
actual, expected,
1206+
"{} contents are incorrect",
1207+
entry_name
1208+
);
1209+
}
1210+
}
1211+
}
10601212

10611213
/// Collect one or more device elections for use in generating a boot order
10621214
/// `fw_cfg` entry, suitable for consumption by OVMF bootrom.
@@ -1117,7 +1269,7 @@ pub mod formats {
11171269
}
11181270

11191271
#[cfg(test)]
1120-
mod test {
1272+
mod test_bootorder {
11211273
use super::BootOrder;
11221274
use crate::hw::pci::BusLocation;
11231275
use crate::hw::qemu::fwcfg;

lib/propolis/src/vmm/mem.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,22 @@ pub(crate) struct MapEnt {
6161
kind: MapKind,
6262
}
6363

64+
impl MapEnt {
65+
fn map_type(&self) -> MapType {
66+
match &self.kind {
67+
MapKind::Dram(_) => MapType::Dram,
68+
MapKind::Rom(_) => MapType::Rom,
69+
MapKind::MmioReserve => MapType::Mmio,
70+
}
71+
}
72+
}
73+
74+
pub enum MapType {
75+
Dram,
76+
Rom,
77+
Mmio,
78+
}
79+
6480
pub struct PhysMap {
6581
map: Arc<Mutex<ASpace<MapEnt>>>,
6682
hdl: Arc<VmmHdl>,
@@ -162,6 +178,17 @@ impl PhysMap {
162178
Ok(())
163179
}
164180

181+
pub fn mappings(&self) -> Vec<(usize, usize, MapType)> {
182+
let guard = self.map.lock().unwrap();
183+
let mut mappings = Vec::new();
184+
185+
for (addr, len, ent) in guard.iter() {
186+
mappings.push((addr, len, ent.map_type()));
187+
}
188+
189+
mappings
190+
}
191+
165192
pub(crate) fn memctx(&mut self) -> Arc<MemCtx> {
166193
self.memctx.clone()
167194
}

0 commit comments

Comments
 (0)