From 737efa969fc4093b914021be4c3f1cbd7e288f1e Mon Sep 17 00:00:00 2001 From: Lucy Menon <168595099+syntactically@users.noreply.github.com> Date: Tue, 29 Jul 2025 14:49:26 +0100 Subject: [PATCH 1/3] Unify page table manipulation code between the guest and the host Currently, the guest and the host both have code that manipulates architecture-specific page table structures: the guest has a general map operation, and the host has a much more specific routine that builds an identity map. As we move to more complex virtual memory configurations in the guest, the host will need the ability to build more complex mappings in the guest, so this commit removes the simple implementation in the host, and replaces it with calls to the implementation originally written for the guest (now moved to `hyperlight_common` and factored into an architecture-independent interface and architecture-specific code parts). Signed-off-by: Simon Davies --- src/hyperlight_common/src/arch/amd64/vm.rs | 217 ++++++++++++++++ src/hyperlight_common/src/lib.rs | 2 + src/hyperlight_common/src/vm.rs | 131 ++++++++++ src/hyperlight_guest_bin/src/paging.rs | 238 ++++-------------- src/hyperlight_host/src/mem/layout.rs | 35 +-- src/hyperlight_host/src/mem/memory_region.rs | 28 --- src/hyperlight_host/src/mem/mgr.rs | 238 +++++++----------- .../src/sandbox/uninitialized_evolve.rs | 5 +- 8 files changed, 495 insertions(+), 399 deletions(-) create mode 100644 src/hyperlight_common/src/arch/amd64/vm.rs create mode 100644 src/hyperlight_common/src/vm.rs diff --git a/src/hyperlight_common/src/arch/amd64/vm.rs b/src/hyperlight_common/src/arch/amd64/vm.rs new file mode 100644 index 000000000..cb7aff1e6 --- /dev/null +++ b/src/hyperlight_common/src/arch/amd64/vm.rs @@ -0,0 +1,217 @@ +/* +Copyright 2025 The Hyperlight Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +use crate::vm::{Mapping, MappingKind, TableOps}; + +#[inline(always)] +/// Utility function to extract an (inclusive on both ends) bit range +/// from a quadword. 
+fn bits(x: u64) -> u64 { + (x & ((1 << (HIGH_BIT + 1)) - 1)) >> LOW_BIT +} + +/// A helper structure indicating a mapping operation that needs to be +/// performed +struct MapRequest { + table_base: T, + vmin: VirtAddr, + len: u64, +} + +/// A helper structure indicating that a particular PTE needs to be +/// modified +struct MapResponse { + entry_ptr: T, + vmin: VirtAddr, + len: u64, +} + +struct ModifyPteIterator { + request: MapRequest, + n: u64, +} +impl Iterator + for ModifyPteIterator +{ + type Item = MapResponse; + fn next(&mut self) -> Option { + if (self.n << LOW_BIT) >= self.request.len { + return None; + } + // next stage parameters + let mut next_vmin = self.request.vmin + (self.n << LOW_BIT); + let lower_bits_mask = (1 << LOW_BIT) - 1; + if self.n > 0 { + next_vmin &= !lower_bits_mask; + } + let entry_ptr = Op::entry_addr( + self.request.table_base, + bits::(next_vmin) << 3, + ); + let len_from_here = self.request.len - (next_vmin - self.request.vmin); + let max_len = (1 << LOW_BIT) - (next_vmin & lower_bits_mask); + let next_len = core::cmp::min(len_from_here, max_len); + + // update our state + self.n += 1; + + Some(MapResponse { + entry_ptr, + vmin: next_vmin, + len: next_len, + }) + } +} +fn modify_ptes( + r: MapRequest, +) -> ModifyPteIterator { + ModifyPteIterator { request: r, n: 0 } +} + +/// Page-mapping callback to allocate a next-level page table if necessary. +/// # Safety +/// This function modifies page table data structures, and should not be called concurrently +/// with any other operations that modify the page tables. +unsafe fn alloc_pte_if_needed( + op: &Op, + x: MapResponse, +) -> MapRequest { + let pte = unsafe { op.read_entry(x.entry_ptr) }; + let present = pte & 0x1; + if present != 0 { + return MapRequest { + table_base: Op::from_phys(pte & !0xfff), + vmin: x.vmin, + len: x.len, + }; + } + + let page_addr = unsafe { op.alloc_table() }; + + #[allow(clippy::identity_op)] + #[allow(clippy::precedence)] + let pte = Op::to_phys(page_addr) | + 1 << 5 | // A - we don't track accesses at table level + 0 << 4 | // PCD - leave caching enabled + 0 << 3 | // PWT - write-back + 1 << 2 | // U/S - allow user access to everything (for now) + 1 << 1 | // R/W - we don't use block-level permissions + 1 << 0; // P - this entry is present + unsafe { op.write_entry(x.entry_ptr, pte) }; + MapRequest { + table_base: page_addr, + vmin: x.vmin, + len: x.len, + } +} + +/// Map a normal memory page +/// # Safety +/// This function modifies page table data structures, and should not be called concurrently +/// with any other operations that modify the page tables. 
+#[allow(clippy::identity_op)] +#[allow(clippy::precedence)] +unsafe fn map_page(op: &Op, mapping: &Mapping, r: MapResponse) { + let pte = match &mapping.kind { + MappingKind::BasicMapping(bm) => + // TODO: Support not readable + { + (mapping.phys_base + (r.vmin - mapping.virt_base)) | + (!bm.executable as u64) << 63 | // NX - no execute unless allowed + 1 << 7 | // 1 - RES1 according to manual + 1 << 6 | // D - we don't presently track dirty state for anything + 1 << 5 | // A - we don't presently track access for anything + 0 << 4 | // PCD - leave caching enabled + 0 << 3 | // PWT - write-back + 1 << 2 | // U/S - allow user access to everything (for now) + (bm.writable as u64) << 1 | // R/W - for now make everything r/w + 1 << 0 // P - this entry is present + } + }; + unsafe { + op.write_entry(r.entry_ptr, pte); + } +} + +// There are no notable architecture-specific safety considerations +// here, and the general conditions are documented in the +// architecture-independent re-export in vm.rs +#[allow(clippy::missing_safety_doc)] +pub unsafe fn map(op: &Op, mapping: Mapping) { + modify_ptes::<47, 39, Op>(MapRequest { + table_base: op.root_table(), + vmin: mapping.virt_base, + len: mapping.len, + }) + .map(|r| unsafe { alloc_pte_if_needed(op, r) }) + .flat_map(modify_ptes::<38, 30, Op>) + .map(|r| unsafe { alloc_pte_if_needed(op, r) }) + .flat_map(modify_ptes::<29, 21, Op>) + .map(|r| unsafe { alloc_pte_if_needed(op, r) }) + .flat_map(modify_ptes::<20, 12, Op>) + .map(|r| unsafe { map_page(op, &mapping, r) }) + .for_each(drop); +} + +/// # Safety +/// This function traverses page table data structures, and should not +/// be called concurrently with any other operations that modify the +/// page table. +unsafe fn require_pte_exist( + op: &Op, + x: MapResponse, +) -> Option> { + let pte = unsafe { op.read_entry(x.entry_ptr) }; + let present = pte & 0x1; + if present == 0 { + return None; + } + Some(MapRequest { + table_base: Op::from_phys(pte & !0xfff), + vmin: x.vmin, + len: x.len, + }) +} + +// There are no notable architecture-specific safety considerations +// here, and the general conditions are documented in the +// architecture-independent re-export in vm.rs +#[allow(clippy::missing_safety_doc)] +pub unsafe fn vtop(op: &Op, address: u64) -> Option { + modify_ptes::<47, 39, Op>(MapRequest { + table_base: op.root_table(), + vmin: address, + len: 1, + }) + .filter_map(|r| unsafe { require_pte_exist::(op, r) }) + .flat_map(modify_ptes::<38, 30, Op>) + .filter_map(|r| unsafe { require_pte_exist::(op, r) }) + .flat_map(modify_ptes::<29, 21, Op>) + .filter_map(|r| unsafe { require_pte_exist::(op, r) }) + .flat_map(modify_ptes::<20, 12, Op>) + .filter_map(|r| { + let pte = unsafe { op.read_entry(r.entry_ptr) }; + let present = pte & 0x1; + if present == 0 { None } else { Some(pte) } + }) + .next() +} + +pub const PAGE_SIZE: usize = 4096; +pub const PAGE_TABLE_SIZE: usize = 4096; +pub type PageTableEntry = u64; +pub type VirtAddr = u64; +pub type PhysAddr = u64; diff --git a/src/hyperlight_common/src/lib.rs b/src/hyperlight_common/src/lib.rs index 962b0ca25..64aefb2b9 100644 --- a/src/hyperlight_common/src/lib.rs +++ b/src/hyperlight_common/src/lib.rs @@ -38,3 +38,5 @@ pub mod resource; /// cbindgen:ignore pub mod func; +// cbindgen:ignore +pub mod vm; diff --git a/src/hyperlight_common/src/vm.rs b/src/hyperlight_common/src/vm.rs new file mode 100644 index 000000000..e34692304 --- /dev/null +++ b/src/hyperlight_common/src/vm.rs @@ -0,0 +1,131 @@ +/* +Copyright 2025 The Hyperlight Authors. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. + */ + +#[cfg_attr(target_arch = "x86_64", path = "arch/amd64/vm.rs")] +mod arch; + +pub use arch::{PAGE_SIZE, PAGE_TABLE_SIZE, PageTableEntry, PhysAddr, VirtAddr}; +pub const PAGE_TABLE_ENTRIES_PER_TABLE: usize = + PAGE_TABLE_SIZE / core::mem::size_of::(); + +/// The operations used to actually access the page table structures, +/// used to allow the same code to be used in the host and the guest +/// for page table setup +pub trait TableOps { + /// The type of table addresses + type TableAddr: Copy; + + /// Allocate a zeroed table + /// + /// # Safety + /// The current implementations of this function are not + /// inherently unsafe, but the guest implementation will likely + /// become so in the future when a real physical page allocator is + /// implemented. + /// + /// Currently, callers should take care not to call this on + /// multiple threads at the same time. + /// + /// # Panics + /// This function may panic if: + /// - The Layout creation fails + /// - Memory allocation fails + unsafe fn alloc_table(&self) -> Self::TableAddr; + + /// Offset the table address by the u64 entry offset + fn entry_addr(addr: Self::TableAddr, entry_offset: u64) -> Self::TableAddr; + + /// Read a u64 from the given address, used to read existing page + /// table entries + /// + /// # Safety + /// This reads from the given memory address, and so all the usual + /// Rust things about raw pointers apply. This will also be used + /// to update guest page tables, so especially in the guest, it is + /// important to ensure that the page tables updates do not break + /// invariants. The implementor of the trait should ensure that + /// nothing else will be reading/writing the address at the same + /// time as mapping code using the trait. + unsafe fn read_entry(&self, addr: Self::TableAddr) -> PageTableEntry; + + /// Write a u64 to the given address, used to write updated page + /// table entries + /// + /// # Safety + /// This writes to the given memory address, and so all the usual + /// Rust things about raw pointers apply. This will also be used + /// to update guest page tables, so especially in the guest, it is + /// important to ensure that the page tables updates do not break + /// invariants. The implementor of the trait should ensure that + /// nothing else will be reading/writing the address at the same + /// time as mapping code using the trait. + unsafe fn write_entry(&self, addr: Self::TableAddr, x: PageTableEntry); + + /// Convert an abstract physical address to a concrete u64 which + /// can be e.g. written into a table + fn to_phys(addr: Self::TableAddr) -> PhysAddr; + + /// Convert a concrete u64 which may have been e.g. 
read from a + /// table back into an abstract physical address + fn from_phys(addr: PhysAddr) -> Self::TableAddr; + + /// Return the address of the root page table + fn root_table(&self) -> Self::TableAddr; +} + +#[derive(Debug)] +pub struct BasicMapping { + pub readable: bool, + pub writable: bool, + pub executable: bool, +} + +#[derive(Debug)] +pub enum MappingKind { + BasicMapping(BasicMapping), + /* TODO: What useful things other than basic mappings actually + * require touching the tables? */ +} + +#[derive(Debug)] +pub struct Mapping { + pub phys_base: u64, + pub virt_base: u64, + pub len: u64, + pub kind: MappingKind, +} + +/// Assumption: all are page-aligned +/// +/// # Safety +/// This function modifies pages backing a virtual memory range which +/// is inherently unsafe w.r.t. the Rust memory model. +/// +/// When using this function, please note: +/// - No locking is performed before touching page table data structures, +/// as such do not use concurrently with any other page table operations +/// - TLB invalidation is not performed, if previously-mapped ranges +/// are being remapped, TLB invalidation may need to be performed +/// afterwards. +pub use arch::map; +/// This function is not presently used for anything, but is useful +/// for debugging +/// +/// # Safety +/// This function traverses page table data structures, and should not +/// be called concurrently with any other operations that modify the +/// page table. +pub use arch::vtop; diff --git a/src/hyperlight_guest_bin/src/paging.rs b/src/hyperlight_guest_bin/src/paging.rs index 4ee3d827a..a804e1355 100644 --- a/src/hyperlight_guest_bin/src/paging.rs +++ b/src/hyperlight_guest_bin/src/paging.rs @@ -38,20 +38,42 @@ pub fn ptov(x: u64) -> *mut u8 { // virtual address 0, and Rust raw pointer operations can't be // used to read/write from address 0. -/// A helper structure indicating a mapping operation that needs to be -/// performed -struct MapRequest { - table_base: u64, - vmin: *mut u8, - len: u64, -} - -/// A helper structure indicating that a particular PTE needs to be -/// modified -struct MapResponse { - entry_ptr: *mut u64, - vmin: *mut u8, - len: u64, +struct GuestMappingOperations {} +impl hyperlight_common::vm::TableOps for GuestMappingOperations { + type TableAddr = u64; + unsafe fn alloc_table(&self) -> u64 { + let page_addr = unsafe { alloc_phys_pages(1) }; + unsafe { ptov(page_addr).write_bytes(0u8, hyperlight_common::vm::PAGE_TABLE_SIZE) }; + page_addr + } + fn entry_addr(addr: u64, offset: u64) -> u64 { + addr + offset + } + unsafe fn read_entry(&self, addr: u64) -> u64 { + let ret: u64; + unsafe { + asm!("mov {}, qword ptr [{}]", out(reg) ret, in(reg) addr); + } + ret + } + unsafe fn write_entry(&self, addr: u64, x: u64) { + unsafe { + asm!("mov qword ptr [{}], {}", in(reg) addr, in(reg) x); + } + } + fn to_phys(addr: u64) -> u64 { + addr + } + fn from_phys(addr: u64) -> u64 { + addr + } + fn root_table(&self) -> u64 { + let pml4_base: u64; + unsafe { + asm!("mov {}, cr3", out(reg) pml4_base); + } + pml4_base & !0xfff + } } /// Assumption: all are page-aligned @@ -65,64 +87,22 @@ struct MapResponse { /// if previously-unmapped ranges are not being mapped, TLB invalidation may need to be performed afterwards. 
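A usage sketch (hypothetical, assuming the guest's current identity ptov mapping): a guest-side caller would typically pair this with `alloc_phys_pages` and a TLB flush, along the lines of

    let phys = unsafe { alloc_phys_pages(1) };
    // Identity-map the freshly allocated page and make the change visible.
    unsafe { map_region(phys, phys as *mut u8, hyperlight_common::vm::PAGE_SIZE as u64) };
    flush_tlb();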
#[instrument(skip_all, parent = Span::current(), level= "Trace")] pub unsafe fn map_region(phys_base: u64, virt_base: *mut u8, len: u64) { - let mut pml4_base: u64; + use hyperlight_common::vm; unsafe { - asm!("mov {}, cr3", out(reg) pml4_base); - } - pml4_base &= !0xfff; - modify_ptes::<47, 39>(MapRequest { - table_base: pml4_base, - vmin: virt_base, - len, - }) - .map(|r| unsafe { alloc_pte_if_needed(r) }) - .flat_map(modify_ptes::<38, 30>) - .map(|r| unsafe { alloc_pte_if_needed(r) }) - .flat_map(modify_ptes::<29, 21>) - .map(|r| unsafe { alloc_pte_if_needed(r) }) - .flat_map(modify_ptes::<20, 12>) - .map(|r| map_normal(phys_base, virt_base, r)) - .for_each(drop); -} - -#[allow(unused)] -/// This function is not presently used for anything, but is useful -/// for debugging -/// # Safety -/// This function traverses page table data structures, and should not be called concurrently -/// with any other operations that modify the page table. -/// # Panics -/// This function will panic if: -/// - A page map request resolves to multiple page table entries -pub unsafe fn dbg_print_address_pte(address: u64) -> u64 { - let mut pml4_base: u64 = 0; - unsafe { - asm!("mov {}, cr3", out(reg) pml4_base); - } - pml4_base &= !0xfff; - let addrs = modify_ptes::<47, 39>(MapRequest { - table_base: pml4_base, - vmin: address as *mut u8, - len: unsafe { OS_PAGE_SIZE as u64 }, - }) - .map(|r| unsafe { require_pte_exist(r) }) - .flat_map(modify_ptes::<38, 30>) - .map(|r| unsafe { require_pte_exist(r) }) - .flat_map(modify_ptes::<29, 21>) - .map(|r| unsafe { require_pte_exist(r) }) - .flat_map(modify_ptes::<20, 12>) - .map(|r| { - let mut pte: u64 = 0; - unsafe { - asm!("mov {}, qword ptr [{}]", out(reg) pte, in(reg) r.entry_ptr); - } - pte - }) - .collect::>(); - if addrs.len() != 1 { - panic!("impossible: 1 page map request resolved to multiple PTEs"); + vm::map::( + &GuestMappingOperations {}, + vm::Mapping { + phys_base, + virt_base: virt_base as u64, + len, + kind: vm::MappingKind::BasicMapping(vm::BasicMapping { + readable: true, + writable: true, + executable: true, + }), + }, + ); } - addrs[0] } /// Allocate n contiguous physical pages and return the physical @@ -149,124 +129,6 @@ pub unsafe fn alloc_phys_pages(n: u64) -> u64 { } } -/// # Safety -/// This function traverses page table data structures, and should not be called concurrently -/// with any other operations that modify the page table. -unsafe fn require_pte_exist(x: MapResponse) -> MapRequest { - let mut pte: u64; - unsafe { - asm!("mov {}, qword ptr [{}]", out(reg) pte, in(reg) x.entry_ptr); - } - let present = pte & 0x1; - if present == 0 { - panic!("debugging: found not-present pte"); - } - MapRequest { - table_base: pte & !0xfff, - vmin: x.vmin, - len: x.len, - } -} - -/// Page-mapping callback to allocate a next-level page table if necessary. -/// # Safety -/// This function modifies page table data structures, and should not be called concurrently -/// with any other operations that modify the page table. 
-unsafe fn alloc_pte_if_needed(x: MapResponse) -> MapRequest { - let mut pte: u64; - unsafe { - asm!("mov {}, qword ptr [{}]", out(reg) pte, in(reg) x.entry_ptr); - } - let present = pte & 0x1; - if present != 0 { - return MapRequest { - table_base: pte & !0xfff, - vmin: x.vmin, - len: x.len, - }; - } - let page_addr = unsafe { alloc_phys_pages(1) }; - unsafe { ptov(page_addr).write_bytes(0u8, OS_PAGE_SIZE as usize) }; - - #[allow(clippy::identity_op)] - #[allow(clippy::precedence)] - let pte = page_addr | - 1 << 5 | // A - we don't track accesses at table level - 0 << 4 | // PCD - leave caching enabled - 0 << 3 | // PWT - write-back - 1 << 2 | // U/S - allow user access to everything (for now) - 1 << 1 | // R/W - we don't use block-level permissions - 1 << 0; // P - this entry is present - unsafe { - asm!("mov qword ptr [{}], {}", in(reg) x.entry_ptr, in(reg) pte); - } - MapRequest { - table_base: page_addr, - vmin: x.vmin, - len: x.len, - } -} - -/// Map a normal memory page -/// -/// TODO: support permissions; currently mapping is always RWX -fn map_normal(phys_base: u64, virt_base: *mut u8, r: MapResponse) { - #[allow(clippy::identity_op)] - #[allow(clippy::precedence)] - let pte = (phys_base + (r.vmin as u64 - virt_base as u64)) | - 1 << 6 | // D - we don't presently track dirty state for anything - 1 << 5 | // A - we don't presently track access for anything - 0 << 4 | // PCD - leave caching enabled - 0 << 3 | // PWT - write-back - 1 << 2 | // U/S - allow user access to everything (for now) - 1 << 1 | // R/W - for now make everything r/w - 1 << 0; // P - this entry is present - unsafe { - r.entry_ptr.write_volatile(pte); - } -} - -#[inline(always)] -/// Utility function to extract an (inclusive on both ends) bit range -/// from a quadword. -fn bits(x: u64) -> u64 { - (x & ((1 << (HIGH_BIT + 1)) - 1)) >> LOW_BIT -} - -struct ModifyPteIterator { - request: MapRequest, - n: u64, -} -impl Iterator for ModifyPteIterator { - type Item = MapResponse; - fn next(&mut self) -> Option { - if (self.n << LOW_BIT) >= self.request.len { - return None; - } - // next stage parameters - let next_vmin = self.request.vmin.wrapping_add((self.n << LOW_BIT) as usize); - let entry_ptr = ptov(self.request.table_base) - .wrapping_add((bits::(next_vmin as u64) << 3) as usize) - as *mut u64; - let len_from_here = self.request.len - (self.n << LOW_BIT); - let next_len = core::cmp::min(len_from_here, 1 << LOW_BIT); - - // update our state - self.n += 1; - - Some(MapResponse { - entry_ptr, - vmin: next_vmin, - len: next_len, - }) - } -} -fn modify_ptes( - r: MapRequest, -) -> ModifyPteIterator { - ModifyPteIterator { request: r, n: 0 } -} - pub fn flush_tlb() { // Currently this just always flips CR4.PGE back and forth to // trigger a tlb flush. We should use a faster approach where diff --git a/src/hyperlight_host/src/mem/layout.rs b/src/hyperlight_host/src/mem/layout.rs index 32d7b5478..ca64b3211 100644 --- a/src/hyperlight_host/src/mem/layout.rs +++ b/src/hyperlight_host/src/mem/layout.rs @@ -217,27 +217,6 @@ impl SandboxMemoryLayout { /// The offset into the sandbox's memory where the PML4 Table is located. /// See https://www.pagetable.com/?p=14 for more information. pub(crate) const PML4_OFFSET: usize = 0x0000; - /// The offset into the sandbox's memory where the Page Directory Pointer - /// Table starts. - #[cfg(feature = "init-paging")] - pub(super) const PDPT_OFFSET: usize = 0x1000; - /// The offset into the sandbox's memory where the Page Directory starts. 
- #[cfg(feature = "init-paging")] - pub(super) const PD_OFFSET: usize = 0x2000; - /// The offset into the sandbox's memory where the Page Tables start. - #[cfg(feature = "init-paging")] - pub(super) const PT_OFFSET: usize = 0x3000; - /// The address (not the offset) to the start of the page directory - #[cfg(feature = "init-paging")] - pub(super) const PD_GUEST_ADDRESS: usize = Self::BASE_ADDRESS + Self::PD_OFFSET; - /// The address (not the offset) into sandbox memory where the Page - /// Directory Pointer Table starts - #[cfg(feature = "init-paging")] - pub(super) const PDPT_GUEST_ADDRESS: usize = Self::BASE_ADDRESS + Self::PDPT_OFFSET; - /// The address (not the offset) into sandbox memory where the Page - /// Tables start - #[cfg(feature = "init-paging")] - pub(super) const PT_GUEST_ADDRESS: usize = Self::BASE_ADDRESS + Self::PT_OFFSET; /// The maximum amount of memory a single sandbox will be allowed. /// The addressable virtual memory with current paging setup is virtual address 0x0 - 0x40000000 (excl.), /// However, the memory up to Self::BASE_ADDRESS is not used. @@ -478,16 +457,10 @@ impl SandboxMemoryLayout { self.total_page_table_size } - // This function calculates the page table size for the sandbox - // We need enough memory to store the PML4, PDPT, PD and PTs - // The size of a single table is 4K, we can map up to 1GB total memory which requires 1 PML4, 1 PDPT, 1 PD and 512 PTs - // but we only need enough PTs to map the memory we are using. (In other words we only need 512 PTs to map the memory if the memory size is 1GB) - // - // We can calculate the amount of memory needed for the PTs by calculating how much memory is needed for the sandbox configuration in total, - // and then add 3 * 4K (for the PML4, PDPT and PD) to that, - // then add 2MB to that (the maximum size of memory required for the PTs themselves is 2MB when we map 1GB of memory in 4K pages), - // then divide that by 0x200_000 (as we can map 2MB in each PT). - // This will give us the total size of the PTs required for the sandbox to which we can add the size of the PML4, PDPT and PD. + // TODO: This over-counts on small sandboxes (because not all 512 + // PTs may be required), under-counts on sandboxes with more than + // 1GiB memory, and would get unreasonably complicated if we + // needed to support hugepages. #[instrument(skip_all, parent = Span::current(), level= "Trace")] #[cfg(feature = "init-paging")] fn get_total_page_table_size( diff --git a/src/hyperlight_host/src/mem/memory_region.rs b/src/hyperlight_host/src/mem/memory_region.rs index 2b801329b..c7ae6b629 100644 --- a/src/hyperlight_host/src/mem/memory_region.rs +++ b/src/hyperlight_host/src/mem/memory_region.rs @@ -31,9 +31,6 @@ use mshv_bindings::{hv_x64_memory_intercept_message, mshv_user_mem_region}; #[cfg(target_os = "windows")] use windows::Win32::System::Hypervisor::{self, WHV_MEMORY_ACCESS_TYPE}; -#[cfg(feature = "init-paging")] -use super::mgr::{PAGE_NX, PAGE_PRESENT, PAGE_RW, PAGE_USER}; - pub(crate) const DEFAULT_GUEST_BLOB_MEM_FLAGS: MemoryRegionFlags = MemoryRegionFlags::READ; bitflags! { @@ -53,31 +50,6 @@ bitflags! 
{ } } -impl MemoryRegionFlags { - #[cfg(feature = "init-paging")] - pub(crate) fn translate_flags(&self) -> u64 { - let mut page_flags = 0; - - page_flags |= PAGE_PRESENT; // Mark page as present - - if self.contains(MemoryRegionFlags::WRITE) { - page_flags |= PAGE_RW; // Allow read/write - } - - if self.contains(MemoryRegionFlags::STACK_GUARD) { - page_flags |= PAGE_RW; // The guard page is marked RW so that if it gets written to we can detect it in the host - } - - if self.contains(MemoryRegionFlags::EXECUTE) { - page_flags |= PAGE_USER; // Allow user access - } else { - page_flags |= PAGE_NX; // Mark as non-executable if EXECUTE is not set - } - - page_flags - } -} - impl std::fmt::Display for MemoryRegionFlags { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { if self.is_empty() { diff --git a/src/hyperlight_host/src/mem/mgr.rs b/src/hyperlight_host/src/mem/mgr.rs index aa896e726..f3e1f0652 100644 --- a/src/hyperlight_host/src/mem/mgr.rs +++ b/src/hyperlight_host/src/mem/mgr.rs @@ -23,13 +23,18 @@ use hyperlight_common::flatbuffer_wrappers::function_call::{ use hyperlight_common::flatbuffer_wrappers::function_types::FunctionCallResult; use hyperlight_common::flatbuffer_wrappers::guest_log_data::GuestLogData; use hyperlight_common::flatbuffer_wrappers::host_function_details::HostFunctionDetails; +#[cfg(feature = "init-paging")] +use hyperlight_common::vm::{ + self, BasicMapping, Mapping, MappingKind, PAGE_TABLE_ENTRIES_PER_TABLE, PAGE_TABLE_SIZE, + PageTableEntry, PhysAddr, +}; use tracing::{Span, instrument}; use super::exe::ExeInfo; use super::layout::SandboxMemoryLayout; use super::memory_region::MemoryRegion; #[cfg(feature = "init-paging")] -use super::memory_region::{DEFAULT_GUEST_BLOB_MEM_FLAGS, MemoryRegionType}; +use super::memory_region::MemoryRegionFlags; use super::ptr::{GuestPtr, RawPtr}; use super::ptr_offset::Offset; use super::shared_mem::{ExclusiveSharedMemory, GuestSharedMemory, HostSharedMemory, SharedMemory}; @@ -40,16 +45,6 @@ use crate::{Result, log_then_return, new_error}; cfg_if::cfg_if! 
{ if #[cfg(feature = "init-paging")] { - /// Paging Flags - /// - /// See the following links explaining paging, also see paging-development-notes.md in docs: - /// - /// * Very basic description: https://stackoverflow.com/a/26945892 - /// * More in-depth descriptions: https://wiki.osdev.org/Paging - pub(crate) const PAGE_PRESENT: u64 = 1; // Page is Present - pub(crate) const PAGE_RW: u64 = 1 << 1; // Page is Read/Write (if not set page is read only so long as the WP bit in CR0 is set to 1 - which it is in Hyperlight) - pub(crate) const PAGE_USER: u64 = 1 << 2; // User/Supervisor (if this bit is set then the page is accessible by user mode code) - pub(crate) const PAGE_NX: u64 = 1 << 63; // Execute Disable (if this bit is set then data in the page cannot be executed)` // The amount of memory that can be mapped per page table pub(super) const AMOUNT_OF_MEMORY_PER_PT: usize = 0x200_000; } @@ -80,6 +75,62 @@ pub(crate) struct SandboxMemoryManager { pub(crate) abort_buffer: Vec, } +#[cfg(feature = "init-paging")] +struct GuestPageTableBuffer { + buffer: std::cell::RefCell>, +} +#[cfg(feature = "init-paging")] +impl vm::TableOps for GuestPageTableBuffer { + type TableAddr = (usize, usize); + unsafe fn alloc_table(&self) -> (usize, usize) { + let mut b = self.buffer.borrow_mut(); + let page_addr = b.len(); + b.push([0; PAGE_TABLE_ENTRIES_PER_TABLE]); + (page_addr, 0) + } + fn entry_addr(addr: (usize, usize), offset: u64) -> (usize, usize) { + (addr.0, offset as usize >> 3) + } + unsafe fn read_entry(&self, addr: (usize, usize)) -> PageTableEntry { + let b = self.buffer.borrow(); + b[addr.0][addr.1] + } + unsafe fn write_entry(&self, addr: (usize, usize), x: PageTableEntry) { + let mut b = self.buffer.borrow_mut(); + b[addr.0][addr.1] = x; + } + fn to_phys(addr: (usize, usize)) -> PhysAddr { + (addr.0 as u64 * PAGE_TABLE_SIZE as u64) + addr.1 as u64 + } + fn from_phys(addr: PhysAddr) -> (usize, usize) { + ( + addr as usize / PAGE_TABLE_SIZE, + addr as usize % PAGE_TABLE_SIZE, + ) + } + fn root_table(&self) -> (usize, usize) { + (0, 0) + } +} +#[cfg(feature = "init-paging")] +impl GuestPageTableBuffer { + fn new() -> Self { + GuestPageTableBuffer { + buffer: std::cell::RefCell::new(vec![[0; PAGE_TABLE_ENTRIES_PER_TABLE]]), + } + } + fn into_bytes(self) -> Box<[u8]> { + let bx = self.buffer.into_inner().into_boxed_slice(); + let len = bx.len(); + unsafe { + Box::from_raw(std::ptr::slice_from_raw_parts_mut( + Box::into_raw(bx) as *mut u8, + len * PAGE_TABLE_SIZE, + )) + } + } +} + impl SandboxMemoryManager where S: SharedMemory, @@ -121,17 +172,13 @@ where &mut self.shared_mem } - /// Set up the hypervisor partition in the given `SharedMemory` parameter - /// `shared_mem`, with the given memory size `mem_size` + /// Set up the guest page tables in the given `SharedMemory` parameter + /// `shared_mem` // TODO: This should perhaps happen earlier and use an // ExclusiveSharedMemory from the beginning. 
#[instrument(err(Debug), skip_all, parent = Span::current(), level= "Trace")] #[cfg(feature = "init-paging")] - pub(crate) fn set_up_shared_memory( - &mut self, - mem_size: u64, - regions: &mut [MemoryRegion], - ) -> Result { + pub(crate) fn set_up_shared_memory(&mut self, regions: &mut [MemoryRegion]) -> Result { let rsp: u64 = self.layout.get_top_of_user_stack_offset() as u64 + SandboxMemoryLayout::BASE_ADDRESS as u64 + self.layout.stack_size as u64 @@ -142,144 +189,37 @@ where - 0x28; self.shared_mem.with_exclusivity(|shared_mem| { - // Create PDL4 table with only 1 PML4E - shared_mem.write_u64( - SandboxMemoryLayout::PML4_OFFSET, - SandboxMemoryLayout::PDPT_GUEST_ADDRESS as u64 | PAGE_PRESENT | PAGE_RW, - )?; - - // Create PDPT with only 1 PDPTE - shared_mem.write_u64( - SandboxMemoryLayout::PDPT_OFFSET, - SandboxMemoryLayout::PD_GUEST_ADDRESS as u64 | PAGE_PRESENT | PAGE_RW, - )?; - - for i in 0..512 { - let offset = SandboxMemoryLayout::PD_OFFSET + (i * 8); - let val_to_write: u64 = (SandboxMemoryLayout::PT_GUEST_ADDRESS as u64 - + (i * 4096) as u64) - | PAGE_PRESENT - | PAGE_RW; - shared_mem.write_u64(offset, val_to_write)?; + let buffer = GuestPageTableBuffer::new(); + for region in regions.iter() { + let readable = region.flags.contains(MemoryRegionFlags::READ); + let writable = region.flags.contains(MemoryRegionFlags::WRITE) + // Temporary hack: the stack guard page is + // currently checked for in the host, rather than + // the guest, so we need to mark it writable in + // the Stage 1 translation so that the fault + // exception on a write is taken to the + // hypervisor, rather than the guest kernel + || region.flags.contains(MemoryRegionFlags::STACK_GUARD); + let executable = region.flags.contains(MemoryRegionFlags::EXECUTE); + let mapping = Mapping { + phys_base: region.guest_region.start as u64, + virt_base: region.guest_region.start as u64, + len: region.guest_region.len() as u64, + kind: MappingKind::BasicMapping(BasicMapping { + readable, + writable, + executable, + }), + }; + unsafe { vm::map(&buffer, mapping) }; } - - // We only need to create enough PTEs to map the amount of memory we have - // We need one PT for every 2MB of memory that is mapped - // We can use the memory size to calculate the number of PTs we need - // We round up mem_size/2MB - - let mem_size = usize::try_from(mem_size)?; - - let num_pages: usize = mem_size.div_ceil(AMOUNT_OF_MEMORY_PER_PT); - - // Create num_pages PT with 512 PTEs - // Pre-allocate buffer for all page table entries to minimize shared memory writes - let total_ptes = num_pages * 512; - let mut pte_buffer = vec![0u64; total_ptes]; // Pre-allocate u64 buffer directly - let mut cached_region_idx: Option = None; // Cache for optimized region lookup - let mut pte_index = 0; - - for p in 0..num_pages { - for i in 0..512 { - // Each PTE maps a 4KB page - let flags = match Self::get_page_flags(p, i, regions, &mut cached_region_idx) { - Ok(region_type) => match region_type { - // TODO: We parse and load the exe according to its sections and then - // have the correct flags set rather than just marking the entire binary as executable - MemoryRegionType::Code => PAGE_PRESENT | PAGE_RW | PAGE_USER, - MemoryRegionType::InitData => self - .layout - .init_data_permissions - .map(|perm| perm.translate_flags()) - .unwrap_or(DEFAULT_GUEST_BLOB_MEM_FLAGS.translate_flags()), - MemoryRegionType::Stack => PAGE_PRESENT | PAGE_RW | PAGE_USER | PAGE_NX, - #[cfg(feature = "executable_heap")] - MemoryRegionType::Heap => PAGE_PRESENT | PAGE_RW | PAGE_USER, - 
#[cfg(not(feature = "executable_heap"))] - MemoryRegionType::Heap => PAGE_PRESENT | PAGE_RW | PAGE_USER | PAGE_NX, - // The guard page is marked RW and User so that if it gets written to we can detect it in the host - // If/When we implement an interrupt handler for page faults in the guest then we can remove this access and handle things properly there - MemoryRegionType::GuardPage => { - PAGE_PRESENT | PAGE_RW | PAGE_USER | PAGE_NX - } - MemoryRegionType::InputData => PAGE_PRESENT | PAGE_RW | PAGE_NX, - MemoryRegionType::OutputData => PAGE_PRESENT | PAGE_RW | PAGE_NX, - MemoryRegionType::Peb => PAGE_PRESENT | PAGE_RW | PAGE_NX, - // Host Function Definitions are readonly in the guest - MemoryRegionType::HostFunctionDefinitions => PAGE_PRESENT | PAGE_NX, - MemoryRegionType::PageTables => PAGE_PRESENT | PAGE_RW | PAGE_NX, - }, - // If there is an error then the address isn't mapped so mark it as not present - Err(_) => 0, - }; - let val_to_write = ((p << 21) as u64 | (i << 12) as u64) | flags; - // Write u64 directly to buffer - more efficient than converting to bytes - pte_buffer[pte_index] = val_to_write.to_le(); - pte_index += 1; - } - } - - // Write the entire PTE buffer to shared memory in a single operation - // Convert u64 buffer to bytes for writing to shared memory - let pte_bytes = unsafe { - std::slice::from_raw_parts(pte_buffer.as_ptr() as *const u8, pte_buffer.len() * 8) - }; - shared_mem.copy_from_slice(pte_bytes, SandboxMemoryLayout::PT_OFFSET)?; + shared_mem.copy_from_slice(&buffer.into_bytes(), SandboxMemoryLayout::PML4_OFFSET)?; Ok::<(), crate::HyperlightError>(()) })??; Ok(rsp) } - /// Optimized page flags getter that maintains state for sequential access patterns - #[cfg(feature = "init-paging")] - fn get_page_flags( - p: usize, - i: usize, - regions: &[MemoryRegion], - cached_region_idx: &mut Option, - ) -> Result { - let addr = (p << 21) + (i << 12); - - // First check if we're still in the cached region - if let Some(cached_idx) = *cached_region_idx - && cached_idx < regions.len() - && regions[cached_idx].guest_region.contains(&addr) - { - return Ok(regions[cached_idx].region_type); - } - - // If not in cached region, try adjacent regions first (common for sequential access) - if let Some(cached_idx) = *cached_region_idx { - // Check next region - if cached_idx + 1 < regions.len() - && regions[cached_idx + 1].guest_region.contains(&addr) - { - *cached_region_idx = Some(cached_idx + 1); - return Ok(regions[cached_idx + 1].region_type); - } - } - - // Fall back to binary search for non-sequential access - let idx = regions.binary_search_by(|region| { - if region.guest_region.contains(&addr) { - std::cmp::Ordering::Equal - } else if region.guest_region.start > addr { - std::cmp::Ordering::Greater - } else { - std::cmp::Ordering::Less - } - }); - - match idx { - Ok(index) => { - *cached_region_idx = Some(index); - Ok(regions[index].region_type) - } - Err(_) => Err(new_error!("Could not find region for address: {}", addr)), - } - } - /// Create a snapshot with the given mapped regions pub(crate) fn snapshot( &mut self, diff --git a/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs b/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs index 87870b3a2..4369258ac 100644 --- a/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs +++ b/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs @@ -30,7 +30,7 @@ use crate::mem::mgr::SandboxMemoryManager; use crate::mem::ptr::{GuestPtr, RawPtr}; use crate::mem::ptr_offset::Offset; use 
crate::mem::shared_mem::GuestSharedMemory; -#[cfg(any(feature = "init-paging", target_os = "windows"))] +#[cfg(target_os = "windows")] use crate::mem::shared_mem::SharedMemory; #[cfg(gdb)] use crate::sandbox::config::DebugInfo; @@ -108,8 +108,7 @@ pub(crate) fn set_up_hypervisor_partition( #[cfg(feature = "init-paging")] let rsp_ptr = { let mut regions = mgr.layout.get_memory_regions(&mgr.shared_mem)?; - let mem_size = u64::try_from(mgr.shared_mem.mem_size())?; - let rsp_u64 = mgr.set_up_shared_memory(mem_size, &mut regions)?; + let rsp_u64 = mgr.set_up_shared_memory(&mut regions)?; let rsp_raw = RawPtr::from(rsp_u64); GuestPtr::try_from(rsp_raw) }?; From e30e0438a16dd72f9fe95327e64a3f8e681f0deb Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Wed, 10 Dec 2025 11:34:37 +0000 Subject: [PATCH 2/3] Only include vm mod when init-paging is enabled Signed-off-by: Simon Davies --- src/hyperlight_common/Cargo.toml | 1 + src/hyperlight_common/src/lib.rs | 1 + src/hyperlight_guest_bin/Cargo.toml | 2 +- src/hyperlight_host/Cargo.toml | 2 +- 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/hyperlight_common/Cargo.toml b/src/hyperlight_common/Cargo.toml index a10a991d6..199bb1f12 100644 --- a/src/hyperlight_common/Cargo.toml +++ b/src/hyperlight_common/Cargo.toml @@ -30,6 +30,7 @@ fuzzing = ["dep:arbitrary"] trace_guest = [] mem_profile = [] std = ["thiserror/std", "log/std", "tracing/std"] +init-paging = [] [lib] bench = false # see https://bheisler.github.io/criterion.rs/book/faq.html#cargo-bench-gives-unrecognized-option-errors-for-valid-command-line-options diff --git a/src/hyperlight_common/src/lib.rs b/src/hyperlight_common/src/lib.rs index 64aefb2b9..a66383903 100644 --- a/src/hyperlight_common/src/lib.rs +++ b/src/hyperlight_common/src/lib.rs @@ -39,4 +39,5 @@ pub mod resource; /// cbindgen:ignore pub mod func; // cbindgen:ignore +#[cfg(feature = "init-paging")] pub mod vm; diff --git a/src/hyperlight_guest_bin/Cargo.toml b/src/hyperlight_guest_bin/Cargo.toml index 056e636e6..74888db00 100644 --- a/src/hyperlight_guest_bin/Cargo.toml +++ b/src/hyperlight_guest_bin/Cargo.toml @@ -23,7 +23,7 @@ macros = ["dep:hyperlight-guest-macro", "dep:linkme"] [dependencies] hyperlight-guest = { workspace = true, default-features = false } -hyperlight-common = { workspace = true, default-features = false } +hyperlight-common = { workspace = true, default-features = false, features = [ "init-paging" ] } hyperlight-guest-tracing = { workspace = true, default-features = false } hyperlight-guest-macro = { workspace = true, default-features = false, optional = true } buddy_system_allocator = "0.11.0" diff --git a/src/hyperlight_host/Cargo.toml b/src/hyperlight_host/Cargo.toml index 8cd0ef944..19383bee9 100644 --- a/src/hyperlight_host/Cargo.toml +++ b/src/hyperlight_host/Cargo.toml @@ -40,7 +40,7 @@ tracing = { version = "0.1.43", features = ["log"] } tracing-log = "0.2.0" tracing-core = "0.1.35" tracing-opentelemetry = { version = "0.32.0", optional = true } -hyperlight-common = { workspace = true, default-features = true, features = [ "std" ] } +hyperlight-common = { workspace = true, default-features = true, features = [ "std", "init-paging" ] } hyperlight-guest-tracing = { workspace = true, default-features = true, optional = true } vmm-sys-util = "0.15.0" crossbeam-channel = "0.5.15" From 5626c95534f5d8dd1dea6dc75a92fa065002fc45 Mon Sep 17 00:00:00 2001 From: Simon Davies Date: Tue, 16 Dec 2025 15:53:37 +0000 Subject: [PATCH 3/3] Review changes Signed-off-by: Simon Davies --- 
src/hyperlight_common/src/arch/amd64/vm.rs | 600 ++++++++++++++++-- src/hyperlight_common/src/vm.rs | 25 +- src/hyperlight_guest_bin/src/paging.rs | 6 +- src/hyperlight_host/src/mem/layout.rs | 20 +- src/hyperlight_host/src/mem/mgr.rs | 102 ++- .../src/sandbox/uninitialized_evolve.rs | 2 - 6 files changed, 664 insertions(+), 91 deletions(-) diff --git a/src/hyperlight_common/src/arch/amd64/vm.rs b/src/hyperlight_common/src/arch/amd64/vm.rs index cb7aff1e6..a3af9050e 100644 --- a/src/hyperlight_common/src/arch/amd64/vm.rs +++ b/src/hyperlight_common/src/arch/amd64/vm.rs @@ -14,8 +14,63 @@ See the License for the specific language governing permissions and limitations under the License. */ +//! x86-64 4-level page table manipulation code. +//! +//! This module implements page table setup for x86-64 long mode using 4-level paging: +//! - PML4 (Page Map Level 4) - bits 47:39 - 512 entries, each covering 512GB +//! - PDPT (Page Directory Pointer Table) - bits 38:30 - 512 entries, each covering 1GB +//! - PD (Page Directory) - bits 29:21 - 512 entries, each covering 2MB +//! - PT (Page Table) - bits 20:12 - 512 entries, each covering 4KB pages +//! +//! The code uses an iterator-based approach to walk the page table hierarchy, +//! allocating intermediate tables as needed and setting appropriate flags on leaf PTEs. + use crate::vm::{Mapping, MappingKind, TableOps}; +/// Paging Flags +/// +/// See the following links explaining paging: +/// +/// * Very basic description: https://stackoverflow.com/a/26945892 +/// * More in-depth descriptions: https://wiki.osdev.org/Paging +pub const PAGE_PRESENT: u64 = 1; // Page is Present +pub const PAGE_RW: u64 = 1 << 1; // Page is Read/Write (if not set page is read only so long as the WP bit in CR0 is set to 1 - which it is in Hyperlight) +pub const PAGE_NX: u64 = 1 << 63; // Execute Disable (if this bit is set then data in the page cannot be executed)` +/// Mask to extract the physical address from a PTE (bits 51:12) +/// This masks out the lower 12 flag bits AND the upper bits including NX (bit 63) +pub(crate) const PTE_ADDR_MASK: u64 = 0x000F_FFFF_FFFF_F000; +const PAGE_USER_ACCESS_DISABLED: u64 = 0 << 2; // U/S bit not set - supervisor mode only (no code runs in user mode for now) +const PAGE_DIRTY_CLEAR: u64 = 0 << 6; // D - dirty bit cleared (set by CPU when written) +const PAGE_ACCESSED_CLEAR: u64 = 0 << 5; // A - accessed bit cleared (set by CPU when accessed) +const PAGE_CACHE_ENABLED: u64 = 0 << 4; // PCD - page cache disable bit not set (caching enabled) +const PAGE_WRITE_BACK: u64 = 0 << 3; // PWT - page write-through bit not set (write-back caching) +const PAGE_PAT_WB: u64 = 0 << 7; // PAT - page attribute table index bit (0 for write-back memory when PCD=0, PWT=0) + +/// Returns PAGE_RW if writable is true, 0 otherwise +#[inline(always)] +const fn page_rw_flag(writable: bool) -> u64 { + if writable { PAGE_RW } else { 0 } +} + +/// Returns PAGE_NX if executable is false (NX = No Execute), 0 otherwise +#[inline(always)] +const fn page_nx_flag(executable: bool) -> u64 { + if executable { 0 } else { PAGE_NX } +} + +/// Read a page table entry and return it if the present bit is set +/// # Safety +/// The caller must ensure that `entry_ptr` points to a valid page table entry. 
+#[inline(always)] +unsafe fn read_pte_if_present(op: &Op, entry_ptr: Op::TableAddr) -> Option { + let pte = unsafe { op.read_entry(entry_ptr) }; + if (pte & PAGE_PRESENT) != 0 { + Some(pte) + } else { + None + } +} + #[inline(always)] /// Utility function to extract an (inclusive on both ends) bit range /// from a quadword. @@ -39,6 +94,18 @@ struct MapResponse { len: u64, } +/// Iterator that walks through page table entries at a specific level. +/// +/// Given a virtual address range and a table base, this iterator yields +/// `MapResponse` items for each page table entry that needs to be modified. +/// The const generics `HIGH_BIT` and `LOW_BIT` specify which bits of the +/// virtual address are used to index into this level's table. +/// +/// For example: +/// - PML4: HIGH_BIT=47, LOW_BIT=39 (9 bits = 512 entries, each covering 512GB) +/// - PDPT: HIGH_BIT=38, LOW_BIT=30 (9 bits = 512 entries, each covering 1GB) +/// - PD: HIGH_BIT=29, LOW_BIT=21 (9 bits = 512 entries, each covering 2MB) +/// - PT: HIGH_BIT=20, LOW_BIT=12 (9 bits = 512 entries, each covering 4KB) struct ModifyPteIterator { request: MapRequest, n: u64, @@ -48,24 +115,50 @@ impl Iterator { type Item = MapResponse; fn next(&mut self) -> Option { - if (self.n << LOW_BIT) >= self.request.len { - return None; - } - // next stage parameters - let mut next_vmin = self.request.vmin + (self.n << LOW_BIT); + // Each page table entry at this level covers a region of size (1 << LOW_BIT) bytes. + // For example, at the PT level (LOW_BIT=12), each entry covers 4KB (0x1000 bytes). + // At the PD level (LOW_BIT=21), each entry covers 2MB (0x200000 bytes). + // + // This mask isolates the bits below this level's index bits, used for alignment. let lower_bits_mask = (1 << LOW_BIT) - 1; - if self.n > 0 { - next_vmin &= !lower_bits_mask; + + // Calculate the virtual address for this iteration. + // On the first iteration (n=0), start at the requested vmin. + // On subsequent iterations, advance to the next aligned boundary. + // This handles the case where vmin isn't aligned to this level's entry size. + let next_vmin = if self.n == 0 { + self.request.vmin + } else { + // Align to the next boundary by adding one entry's worth and masking off lower bits + (self.request.vmin + (self.n << LOW_BIT)) & !lower_bits_mask + }; + + // Check if we've processed the entire requested range + if next_vmin >= self.request.vmin + self.request.len { + return None; } + + // Calculate the pointer to this level's page table entry. + // bits:: extracts the relevant index bits from the virtual address. + // Shift left by 3 (multiply by 8) because each entry is 8 bytes (u64). let entry_ptr = Op::entry_addr( self.request.table_base, bits::(next_vmin) << 3, ); + + // Calculate how many bytes remain to be mapped from this point let len_from_here = self.request.len - (next_vmin - self.request.vmin); + + // Calculate the maximum bytes this single entry can cover. + // If next_vmin is aligned, this is the full entry size (1 << LOW_BIT). + // If not aligned (only possible on first iteration), it's the remaining + // space until the next boundary. 
let max_len = (1 << LOW_BIT) - (next_vmin & lower_bits_mask); + + // The actual length for this entry is the smaller of what's needed vs what fits let next_len = core::cmp::min(len_from_here, max_len); - // update our state + // Advance iteration counter for next call self.n += 1; Some(MapResponse { @@ -89,11 +182,9 @@ unsafe fn alloc_pte_if_needed( op: &Op, x: MapResponse, ) -> MapRequest { - let pte = unsafe { op.read_entry(x.entry_ptr) }; - let present = pte & 0x1; - if present != 0 { + if let Some(pte) = unsafe { read_pte_if_present(op, x.entry_ptr) } { return MapRequest { - table_base: Op::from_phys(pte & !0xfff), + table_base: Op::from_phys(pte & PTE_ADDR_MASK), vmin: x.vmin, len: x.len, }; @@ -104,12 +195,12 @@ unsafe fn alloc_pte_if_needed( #[allow(clippy::identity_op)] #[allow(clippy::precedence)] let pte = Op::to_phys(page_addr) | - 1 << 5 | // A - we don't track accesses at table level - 0 << 4 | // PCD - leave caching enabled - 0 << 3 | // PWT - write-back - 1 << 2 | // U/S - allow user access to everything (for now) - 1 << 1 | // R/W - we don't use block-level permissions - 1 << 0; // P - this entry is present + PAGE_ACCESSED_CLEAR | // we don't track accesses at table level + PAGE_CACHE_ENABLED | // leave caching enabled + PAGE_WRITE_BACK | // use write-back caching + PAGE_USER_ACCESS_DISABLED |// dont allow user access (no code runs in user mode for now) + PAGE_RW | // R/W - we don't use block-level permissions + PAGE_PRESENT; // P - this entry is present unsafe { op.write_entry(x.entry_ptr, pte) }; MapRequest { table_base: page_addr, @@ -128,17 +219,23 @@ unsafe fn map_page(op: &Op, mapping: &Mapping, r: MapResponse // TODO: Support not readable + // NOTE: On x86-64, there is no separate "readable" bit in the page table entry. + // This means that pages cannot be made write-only or execute-only without also being readable. + // All pages that are mapped as writable or executable are also implicitly readable. + // If support for "not readable" mappings is required in the future, it would need to be + // implemented using additional mechanisms (e.g., page-fault handling or memory protection keys), + // but for now, this architectural limitation is accepted. 
{ (mapping.phys_base + (r.vmin - mapping.virt_base)) | - (!bm.executable as u64) << 63 | // NX - no execute unless allowed - 1 << 7 | // 1 - RES1 according to manual - 1 << 6 | // D - we don't presently track dirty state for anything - 1 << 5 | // A - we don't presently track access for anything - 0 << 4 | // PCD - leave caching enabled - 0 << 3 | // PWT - write-back - 1 << 2 | // U/S - allow user access to everything (for now) - (bm.writable as u64) << 1 | // R/W - for now make everything r/w - 1 << 0 // P - this entry is present + page_nx_flag(bm.executable) | // NX - no execute unless allowed + PAGE_PAT_WB | // PAT index bit for write-back memory + PAGE_DIRTY_CLEAR | // dirty bit (set by CPU when written) + PAGE_ACCESSED_CLEAR | // accessed bit (set by CPU when accessed) + PAGE_CACHE_ENABLED | // leave caching enabled + PAGE_WRITE_BACK | // use write-back caching + PAGE_USER_ACCESS_DISABLED | // dont allow user access (no code runs in user mode for now) + page_rw_flag(bm.writable) | // R/W - set if writable + PAGE_PRESENT // P - this entry is present } }; unsafe { @@ -150,6 +247,17 @@ unsafe fn map_page(op: &Op, mapping: &Mapping, r: MapResponse(op: &Op, mapping: Mapping) { modify_ptes::<47, 39, Op>(MapRequest { table_base: op.root_table(), @@ -174,13 +282,8 @@ unsafe fn require_pte_exist( op: &Op, x: MapResponse, ) -> Option> { - let pte = unsafe { op.read_entry(x.entry_ptr) }; - let present = pte & 0x1; - if present == 0 { - return None; - } - Some(MapRequest { - table_base: Op::from_phys(pte & !0xfff), + unsafe { read_pte_if_present(op, x.entry_ptr) }.map(|pte| MapRequest { + table_base: Op::from_phys(pte & PTE_ADDR_MASK), vmin: x.vmin, len: x.len, }) @@ -190,7 +293,11 @@ unsafe fn require_pte_exist( // here, and the general conditions are documented in the // architecture-independent re-export in vm.rs #[allow(clippy::missing_safety_doc)] -pub unsafe fn vtop(op: &Op, address: u64) -> Option { +/// Translates a virtual address to its physical address by walking the page tables. +/// +/// Returns `Some(pte)` containing the leaf page table entry if the address is mapped, +/// or `None` if any level of the page table hierarchy has a non-present entry. 
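As a sketch (reusing the `op` and `address` parameters below), note that the return value is the leaf PTE itself, so the full physical address is recovered by masking the PTE and re-adding the page offset:

    let phys = unsafe { virt_to_phys(op, address) }
        .map(|pte| (pte & PTE_ADDR_MASK) | (address & 0xfff));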
+pub unsafe fn virt_to_phys(op: &Op, address: u64) -> Option { modify_ptes::<47, 39, Op>(MapRequest { table_base: op.root_table(), vmin: address, @@ -202,11 +309,7 @@ pub unsafe fn vtop(op: &Op, address: u64) -> Option { .flat_map(modify_ptes::<29, 21, Op>) .filter_map(|r| unsafe { require_pte_exist::(op, r) }) .flat_map(modify_ptes::<20, 12, Op>) - .filter_map(|r| { - let pte = unsafe { op.read_entry(r.entry_ptr) }; - let present = pte & 0x1; - if present == 0 { None } else { Some(pte) } - }) + .filter_map(|r| unsafe { read_pte_if_present(op, r.entry_ptr) }) .next() } @@ -215,3 +318,420 @@ pub const PAGE_TABLE_SIZE: usize = 4096; pub type PageTableEntry = u64; pub type VirtAddr = u64; pub type PhysAddr = u64; + +#[cfg(test)] +mod tests { + use alloc::vec; + use alloc::vec::Vec; + use core::cell::RefCell; + + use super::*; + use crate::vm::{BasicMapping, Mapping, MappingKind, PAGE_TABLE_ENTRIES_PER_TABLE, TableOps}; + + /// A mock TableOps implementation for testing that stores page tables in memory + struct MockTableOps { + tables: RefCell>, + } + + impl MockTableOps { + fn new() -> Self { + // Start with one table (the root/PML4) + Self { + tables: RefCell::new(vec![[0u64; PAGE_TABLE_ENTRIES_PER_TABLE]]), + } + } + + fn table_count(&self) -> usize { + self.tables.borrow().len() + } + + fn get_entry(&self, table_idx: usize, entry_idx: usize) -> u64 { + self.tables.borrow()[table_idx][entry_idx] + } + } + + impl TableOps for MockTableOps { + type TableAddr = (usize, usize); // (table_index, entry_index) + + unsafe fn alloc_table(&self) -> Self::TableAddr { + let mut tables = self.tables.borrow_mut(); + let idx = tables.len(); + tables.push([0u64; PAGE_TABLE_ENTRIES_PER_TABLE]); + (idx, 0) + } + + fn entry_addr(addr: Self::TableAddr, entry_offset: u64) -> Self::TableAddr { + // Convert to physical address, add offset, convert back + let phys = Self::to_phys(addr) + entry_offset; + Self::from_phys(phys) + } + + unsafe fn read_entry(&self, addr: Self::TableAddr) -> u64 { + self.tables.borrow()[addr.0][addr.1] + } + + unsafe fn write_entry(&self, addr: Self::TableAddr, entry: u64) { + self.tables.borrow_mut()[addr.0][addr.1] = entry; + } + + fn to_phys(addr: Self::TableAddr) -> PhysAddr { + // Each table is 4KB, entries are 8 bytes + (addr.0 as u64 * PAGE_TABLE_SIZE as u64) + (addr.1 as u64 * 8) + } + + fn from_phys(addr: PhysAddr) -> Self::TableAddr { + let table_idx = (addr / PAGE_TABLE_SIZE as u64) as usize; + let entry_idx = ((addr % PAGE_TABLE_SIZE as u64) / 8) as usize; + (table_idx, entry_idx) + } + + fn root_table(&self) -> Self::TableAddr { + (0, 0) + } + } + + // ==================== bits() function tests ==================== + + #[test] + fn test_bits_extracts_pml4_index() { + // PML4 uses bits 47:39 + // Address 0x0000_0080_0000_0000 should have PML4 index 1 + let addr: u64 = 0x0000_0080_0000_0000; + assert_eq!(bits::<47, 39>(addr), 1); + } + + #[test] + fn test_bits_extracts_pdpt_index() { + // PDPT uses bits 38:30 + // Address with PDPT index 1: bit 30 set = 0x4000_0000 (1GB) + let addr: u64 = 0x4000_0000; + assert_eq!(bits::<38, 30>(addr), 1); + } + + #[test] + fn test_bits_extracts_pd_index() { + // PD uses bits 29:21 + // Address 0x0000_0000_0020_0000 (2MB) should have PD index 1 + let addr: u64 = 0x0000_0000_0020_0000; + assert_eq!(bits::<29, 21>(addr), 1); + } + + #[test] + fn test_bits_extracts_pt_index() { + // PT uses bits 20:12 + // Address 0x0000_0000_0000_1000 (4KB) should have PT index 1 + let addr: u64 = 0x0000_0000_0000_1000; + assert_eq!(bits::<20, 12>(addr), 1); + 
} + + #[test] + fn test_bits_max_index() { + // Maximum 9-bit index is 511 + // PML4 index 511 = bits 47:39 all set = 0x0000_FF80_0000_0000 + let addr: u64 = 0x0000_FF80_0000_0000; + assert_eq!(bits::<47, 39>(addr), 511); + } + + // ==================== PTE flag tests ==================== + + #[test] + fn test_page_rw_flag_writable() { + assert_eq!(page_rw_flag(true), PAGE_RW); + } + + #[test] + fn test_page_rw_flag_readonly() { + assert_eq!(page_rw_flag(false), 0); + } + + #[test] + fn test_page_nx_flag_executable() { + assert_eq!(page_nx_flag(true), 0); // Executable = no NX bit + } + + #[test] + fn test_page_nx_flag_not_executable() { + assert_eq!(page_nx_flag(false), PAGE_NX); + } + + // ==================== map() function tests ==================== + + #[test] + fn test_map_single_page() { + let ops = MockTableOps::new(); + let mapping = Mapping { + phys_base: 0x1000, + virt_base: 0x1000, + len: PAGE_SIZE as u64, + kind: MappingKind::BasicMapping(BasicMapping { + readable: true, + writable: true, + executable: false, + }), + }; + + unsafe { map(&ops, mapping) }; + + // Should have allocated: PML4(exists) + PDPT + PD + PT = 4 tables + assert_eq!(ops.table_count(), 4); + + // Check PML4 entry 0 points to PDPT (table 1) with correct flags + let pml4_entry = ops.get_entry(0, 0); + assert_ne!(pml4_entry & PAGE_PRESENT, 0, "PML4 entry should be present"); + assert_ne!(pml4_entry & PAGE_RW, 0, "PML4 entry should be writable"); + + // Check the leaf PTE has correct flags + // PT is table 3, entry 1 (for virt_base 0x1000) + let pte = ops.get_entry(3, 1); + assert_ne!(pte & PAGE_PRESENT, 0, "PTE should be present"); + assert_ne!(pte & PAGE_RW, 0, "PTE should be writable"); + assert_ne!(pte & PAGE_NX, 0, "PTE should have NX set (not executable)"); + assert_eq!(pte & PTE_ADDR_MASK, 0x1000, "PTE should map to phys 0x1000"); + } + + #[test] + fn test_map_executable_page() { + let ops = MockTableOps::new(); + let mapping = Mapping { + phys_base: 0x2000, + virt_base: 0x2000, + len: PAGE_SIZE as u64, + kind: MappingKind::BasicMapping(BasicMapping { + readable: true, + writable: false, + executable: true, + }), + }; + + unsafe { map(&ops, mapping) }; + + // PT is table 3, entry 2 (for virt_base 0x2000) + let pte = ops.get_entry(3, 2); + assert_ne!(pte & PAGE_PRESENT, 0, "PTE should be present"); + assert_eq!(pte & PAGE_RW, 0, "PTE should be read-only"); + assert_eq!(pte & PAGE_NX, 0, "PTE should NOT have NX set (executable)"); + } + + #[test] + fn test_map_multiple_pages() { + let ops = MockTableOps::new(); + let mapping = Mapping { + phys_base: 0x10000, + virt_base: 0x10000, + len: 4 * PAGE_SIZE as u64, // 4 pages = 16KB + kind: MappingKind::BasicMapping(BasicMapping { + readable: true, + writable: true, + executable: false, + }), + }; + + unsafe { map(&ops, mapping) }; + + // Check all 4 PTEs are present + for i in 0..4 { + let entry_idx = 16 + i; // 0x10000 / 0x1000 = 16 + let pte = ops.get_entry(3, entry_idx); + assert_ne!(pte & PAGE_PRESENT, 0, "PTE {} should be present", i); + let expected_phys = 0x10000 + (i as u64 * PAGE_SIZE as u64); + assert_eq!( + pte & PTE_ADDR_MASK, + expected_phys, + "PTE {} should map to correct phys addr", + i + ); + } + } + + #[test] + fn test_map_reuses_existing_tables() { + let ops = MockTableOps::new(); + + // Map first region + let mapping1 = Mapping { + phys_base: 0x1000, + virt_base: 0x1000, + len: PAGE_SIZE as u64, + kind: MappingKind::BasicMapping(BasicMapping { + readable: true, + writable: true, + executable: false, + }), + }; + unsafe { map(&ops, mapping1) 
}; + let tables_after_first = ops.table_count(); + + // Map second region in same PT (different page) + let mapping2 = Mapping { + phys_base: 0x5000, + virt_base: 0x5000, + len: PAGE_SIZE as u64, + kind: MappingKind::BasicMapping(BasicMapping { + readable: true, + writable: true, + executable: false, + }), + }; + unsafe { map(&ops, mapping2) }; + + // Should NOT allocate new tables (reuses existing hierarchy) + assert_eq!( + ops.table_count(), + tables_after_first, + "Should reuse existing page tables" + ); + } + + // ==================== virt_to_phys() tests ==================== + + #[test] + fn test_virt_to_phys_mapped_address() { + let ops = MockTableOps::new(); + let mapping = Mapping { + phys_base: 0x1000, + virt_base: 0x1000, + len: PAGE_SIZE as u64, + kind: MappingKind::BasicMapping(BasicMapping { + readable: true, + writable: true, + executable: false, + }), + }; + + unsafe { map(&ops, mapping) }; + + let result = unsafe { virt_to_phys(&ops, 0x1000) }; + assert!(result.is_some(), "Should find mapped address"); + let pte = result.unwrap(); + assert_eq!(pte & PTE_ADDR_MASK, 0x1000); + } + + #[test] + fn test_virt_to_phys_unmapped_address() { + let ops = MockTableOps::new(); + // Don't map anything + + let result = unsafe { virt_to_phys(&ops, 0x1000) }; + assert!(result.is_none(), "Should return None for unmapped address"); + } + + #[test] + fn test_virt_to_phys_partially_mapped() { + let ops = MockTableOps::new(); + let mapping = Mapping { + phys_base: 0x1000, + virt_base: 0x1000, + len: PAGE_SIZE as u64, + kind: MappingKind::BasicMapping(BasicMapping { + readable: true, + writable: true, + executable: false, + }), + }; + + unsafe { map(&ops, mapping) }; + + // Query an address in a different PT entry (unmapped) + let result = unsafe { virt_to_phys(&ops, 0x5000) }; + assert!( + result.is_none(), + "Should return None for unmapped address in same PT" + ); + } + + // ==================== ModifyPteIterator tests ==================== + + #[test] + fn test_modify_pte_iterator_single_page() { + let ops = MockTableOps::new(); + let request = MapRequest { + table_base: ops.root_table(), + vmin: 0x1000, + len: PAGE_SIZE as u64, + }; + + let responses: Vec<_> = modify_ptes::<20, 12, MockTableOps>(request).collect(); + assert_eq!(responses.len(), 1, "Single page should yield one response"); + assert_eq!(responses[0].vmin, 0x1000); + assert_eq!(responses[0].len, PAGE_SIZE as u64); + } + + #[test] + fn test_modify_pte_iterator_multiple_pages() { + let ops = MockTableOps::new(); + let request = MapRequest { + table_base: ops.root_table(), + vmin: 0x1000, + len: 3 * PAGE_SIZE as u64, + }; + + let responses: Vec<_> = modify_ptes::<20, 12, MockTableOps>(request).collect(); + assert_eq!(responses.len(), 3, "3 pages should yield 3 responses"); + } + + #[test] + fn test_modify_pte_iterator_zero_length() { + let ops = MockTableOps::new(); + let request = MapRequest { + table_base: ops.root_table(), + vmin: 0x1000, + len: 0, + }; + + let responses: Vec<_> = modify_ptes::<20, 12, MockTableOps>(request).collect(); + assert_eq!(responses.len(), 0, "Zero length should yield no responses"); + } + + #[test] + fn test_modify_pte_iterator_unaligned_start() { + let ops = MockTableOps::new(); + // Start at 0x1800 (mid-page), map 0x1000 bytes + // Should cover 0x1800-0x1FFF (first page) and 0x2000-0x27FF (second page) + let request = MapRequest { + table_base: ops.root_table(), + vmin: 0x1800, + len: 0x1000, + }; + + let responses: Vec<_> = modify_ptes::<20, 12, MockTableOps>(request).collect(); + assert_eq!( + 
responses.len(),
+            2,
+            "Unaligned mapping spanning 2 pages should yield 2 responses"
+        );
+        assert_eq!(responses[0].vmin, 0x1800);
+        assert_eq!(responses[0].len, 0x800); // Remaining in first page
+        assert_eq!(responses[1].vmin, 0x2000);
+        assert_eq!(responses[1].len, 0x800); // Continuing in second page
+    }
+
+    // ==================== TableOps entry_addr tests ====================
+
+    #[test]
+    fn test_entry_addr_from_table_base() {
+        // entry_addr is called with a table base (entry_index = 0) and a byte offset
+        // offset = entry_index * 8, so offset 40 means entry 5
+        let result = MockTableOps::entry_addr((2, 0), 40);
+        assert_eq!(result, (2, 5), "Should return (table 2, entry 5)");
+    }
+
+    #[test]
+    fn test_entry_addr_with_nonzero_base_entry() {
+        // Even though entry_addr is typically called with entry_index=0,
+        // it should handle non-zero base correctly by adding the offset
+        // Base: table 1, entry 10 (phys = 1*4096 + 10*8 = 4176)
+        // Offset: 16 bytes (2 entries)
+        // Result phys: 4176 + 16 = 4192 = 1*4096 + 12*8 → (1, 12)
+        let result = MockTableOps::entry_addr((1, 10), 16);
+        assert_eq!(result, (1, 12), "Should add offset to base entry");
+    }
+
+    #[test]
+    fn test_to_phys_from_phys_roundtrip() {
+        // Verify to_phys and from_phys are inverses
+        let addr = (3, 42);
+        let phys = MockTableOps::to_phys(addr);
+        let back = MockTableOps::from_phys(phys);
+        assert_eq!(back, addr, "to_phys/from_phys should roundtrip");
+    }
+}
diff --git a/src/hyperlight_common/src/vm.rs b/src/hyperlight_common/src/vm.rs
index e34692304..b14d8fe1f 100644
--- a/src/hyperlight_common/src/vm.rs
+++ b/src/hyperlight_common/src/vm.rs
@@ -17,6 +17,8 @@ limitations under the License.
 #[cfg_attr(target_arch = "x86_64", path = "arch/amd64/vm.rs")]
 mod arch;
 
+#[cfg(target_arch = "x86_64")]
+pub use arch::{PAGE_NX, PAGE_PRESENT, PAGE_RW};
 pub use arch::{PAGE_SIZE, PAGE_TABLE_SIZE, PageTableEntry, PhysAddr, VirtAddr};
 
 pub const PAGE_TABLE_ENTRIES_PER_TABLE: usize = PAGE_TABLE_SIZE / core::mem::size_of::<PageTableEntry>();
@@ -45,7 +47,16 @@ pub trait TableOps {
     ///   - Memory allocation fails
     unsafe fn alloc_table(&self) -> Self::TableAddr;
 
-    /// Offset the table address by the u64 entry offset
+    /// Offset the table address by the given offset in bytes.
+    ///
+    /// # Parameters
+    /// - `addr`: The base address of the table.
+    /// - `entry_offset`: The offset in **bytes** within the page table. This is
+    ///   not an entry index; callers must multiply the entry index by the size
+    ///   of a page table entry (typically 8 bytes) to obtain the correct byte offset.
+    ///
+    /// # Returns
+    /// The address of the entry at the given byte offset from the base address.
     fn entry_addr(addr: Self::TableAddr, entry_offset: u64) -> Self::TableAddr;
 
     /// Read a u64 from the given address, used to read existing page
@@ -72,14 +83,14 @@ pub trait TableOps {
     /// invariants. The implementor of the trait should ensure that
     /// nothing else will be reading/writing the address at the same
     /// time as mapping code using the trait.
-    unsafe fn write_entry(&self, addr: Self::TableAddr, x: PageTableEntry);
+    unsafe fn write_entry(&self, addr: Self::TableAddr, entry: PageTableEntry);
 
-    /// Convert an abstract physical address to a concrete u64 which
-    /// can be e.g. written into a table
+    /// Convert an abstract table address to a concrete physical address (u64)
+    /// which can be e.g. written into a page table entry
     fn to_phys(addr: Self::TableAddr) -> PhysAddr;
 
-    /// Convert a concrete u64 which may have been e.g. read from a
-    /// table back into an abstract physical address
+    /// Convert a concrete physical address (u64) which may have been e.g. read
+    /// from a page table entry back into an abstract table address
     fn from_phys(addr: PhysAddr) -> Self::TableAddr;
 
     /// Return the address of the root page table
@@ -128,4 +139,4 @@ pub use arch::map;
 /// This function traverses page table data structures, and should not
 /// be called concurrently with any other operations that modify the
 /// page table.
-pub use arch::vtop;
+pub use arch::virt_to_phys;
diff --git a/src/hyperlight_guest_bin/src/paging.rs b/src/hyperlight_guest_bin/src/paging.rs
index a804e1355..13f6ec6dd 100644
--- a/src/hyperlight_guest_bin/src/paging.rs
+++ b/src/hyperlight_guest_bin/src/paging.rs
@@ -56,9 +56,9 @@ impl hyperlight_common::vm::TableOps for GuestMappingOperations {
         }
         ret
     }
-    unsafe fn write_entry(&self, addr: u64, x: u64) {
+    unsafe fn write_entry(&self, addr: u64, entry: u64) {
         unsafe {
-            asm!("mov qword ptr [{}], {}", in(reg) addr, in(reg) x);
+            asm!("mov qword ptr [{}], {}", in(reg) addr, in(reg) entry);
         }
     }
     fn to_phys(addr: u64) -> u64 {
@@ -89,7 +89,7 @@ impl hyperlight_common::vm::TableOps for GuestMappingOperations {
 pub unsafe fn map_region(phys_base: u64, virt_base: *mut u8, len: u64) {
     use hyperlight_common::vm;
     unsafe {
-        vm::map::(
+        vm::map(
             &GuestMappingOperations {},
             vm::Mapping {
                 phys_base,
diff --git a/src/hyperlight_host/src/mem/layout.rs b/src/hyperlight_host/src/mem/layout.rs
index ca64b3211..40b70ff8d 100644
--- a/src/hyperlight_host/src/mem/layout.rs
+++ b/src/hyperlight_host/src/mem/layout.rs
@@ -28,13 +28,15 @@ use super::memory_region::MemoryRegionType::{
 use super::memory_region::{
     DEFAULT_GUEST_BLOB_MEM_FLAGS, MemoryRegion, MemoryRegionFlags, MemoryRegionVecBuilder,
 };
-#[cfg(feature = "init-paging")]
-use super::mgr::AMOUNT_OF_MEMORY_PER_PT;
 use super::shared_mem::{ExclusiveSharedMemory, GuestSharedMemory, SharedMemory};
 use crate::error::HyperlightError::{GuestOffsetIsInvalid, MemoryRequestTooBig};
 use crate::sandbox::SandboxConfiguration;
 use crate::{Result, new_error};
 
+#[cfg(feature = "init-paging")]
+// The amount of memory that can be mapped per page table
+const AMOUNT_OF_MEMORY_PER_PT: usize = 0x200_000;
+
 // +-------------------------------------------+
 // | Init Data | (GuestBlob size)
 // +-------------------------------------------+
@@ -83,7 +85,6 @@ use crate::{Result, new_error};
 /// the stack might be slightly bigger or smaller than this value since total memory
 /// size is rounded up to the nearest 4K, and there is a 16-byte stack guard written
 /// to the top of the stack. (see below for more details)
-
 #[derive(Copy, Clone)]
 pub(crate) struct SandboxMemoryLayout {
     pub(super) sandbox_memory_config: SandboxConfiguration,
@@ -457,10 +458,21 @@ impl SandboxMemoryLayout {
         self.total_page_table_size
     }
 
+    // This function calculates the page table size for the sandbox.
+    // We need enough memory to store the PML4, PDPT, PD and PTs.
+    // Each table is 4K; mapping the maximum of 1GB of memory requires 1 PML4, 1 PDPT, 1 PD and 512 PTs,
+    // but we only need enough PTs to map the memory we are actually using (in other words, the full 512 PTs are only needed when the memory size is 1GB).
+    //
+    // We can calculate the amount of memory needed for the PTs by taking how much memory is needed for the sandbox configuration in total,
+    // then adding 3 * 4K (for the PML4, PDPT and PD) to that,
+    // then adding 2MB to that (the maximum size of memory required for the PTs themselves is 2MB when we map 1GB of memory in 4K pages),
+    // and then dividing that by 0x200_000 (as we can map 2MB in each PT).
+    // This gives us the total size of the PTs required for the sandbox, to which we add the size of the PML4, PDPT and PD.
     // TODO: This over-counts on small sandboxes (because not all 512
     // PTs may be required), under-counts on sandboxes with more than
-    // 1GiB memory, and would get unreasonably complicated if we
+    // 1GiB memory (which are not currently supported, as we only support MAX_MEMORY_SIZE), and would get unreasonably complicated if we
    // needed to support hugepages.
+
     #[instrument(skip_all, parent = Span::current(), level= "Trace")]
     #[cfg(feature = "init-paging")]
     fn get_total_page_table_size(
diff --git a/src/hyperlight_host/src/mem/mgr.rs b/src/hyperlight_host/src/mem/mgr.rs
index f3e1f0652..9499d6270 100644
--- a/src/hyperlight_host/src/mem/mgr.rs
+++ b/src/hyperlight_host/src/mem/mgr.rs
@@ -43,15 +43,6 @@ use crate::sandbox::snapshot::Snapshot;
 use crate::sandbox::uninitialized::GuestBlob;
 use crate::{Result, log_then_return, new_error};
 
-cfg_if::cfg_if! {
-    if #[cfg(feature = "init-paging")] {
-        // The amount of memory that can be mapped per page table
-        pub(super) const AMOUNT_OF_MEMORY_PER_PT: usize = 0x200_000;
-    }
-}
-
-/// Read/write permissions flag for the 64-bit PDE
-/// The page size for the 64-bit PDE
 /// The size of stack guard cookies
 pub(crate) const STACK_COOKIE_LEN: usize = 16;
 
@@ -89,7 +80,9 @@ impl vm::TableOps for GuestPageTableBuffer {
         (page_addr, 0)
     }
     fn entry_addr(addr: (usize, usize), offset: u64) -> (usize, usize) {
-        (addr.0, offset as usize >> 3)
+        // Convert to physical address, add offset, convert back
+        let phys = Self::to_phys(addr) + offset;
+        Self::from_phys(phys)
     }
     unsafe fn read_entry(&self, addr: (usize, usize)) -> PageTableEntry {
         let b = self.buffer.borrow();
@@ -100,12 +93,12 @@ impl vm::TableOps for GuestPageTableBuffer {
         b[addr.0][addr.1] = x;
     }
     fn to_phys(addr: (usize, usize)) -> PhysAddr {
-        (addr.0 as u64 * PAGE_TABLE_SIZE as u64) + addr.1 as u64
+        (addr.0 as u64 * PAGE_TABLE_SIZE as u64) + (addr.1 as u64 * 8)
     }
     fn from_phys(addr: PhysAddr) -> (usize, usize) {
         (
             addr as usize / PAGE_TABLE_SIZE,
-            addr as usize % PAGE_TABLE_SIZE,
+            (addr as usize % PAGE_TABLE_SIZE) / 8,
         )
     }
     fn root_table(&self) -> (usize, usize) {
@@ -501,17 +494,27 @@ impl SandboxMemoryManager {
 }
 
 #[cfg(test)]
-#[cfg(feature = "init-paging")]
+#[cfg(all(feature = "init-paging", target_arch = "x86_64"))]
 mod tests {
+    use hyperlight_common::vm::{PAGE_NX, PAGE_PRESENT, PAGE_RW};
     use hyperlight_testing::sandbox_sizes::{LARGE_HEAP_SIZE, MEDIUM_HEAP_SIZE, SMALL_HEAP_SIZE};
     use hyperlight_testing::simple_guest_as_string;
 
     use super::*;
     use crate::GuestBinary;
+    use crate::mem::memory_region::MemoryRegionType;
     use crate::mem::shared_mem::GuestSharedMemory;
     use crate::sandbox::SandboxConfiguration;
     use crate::sandbox::uninitialized::UninitializedSandbox;
 
+    pub(crate) const PML4_OFFSET: usize = 0x0000;
+    pub(super) const PDPT_OFFSET: usize = 0x1000;
+    pub(super) const PD_OFFSET: usize = 0x2000;
+    pub(super) const PT_OFFSET: usize = 0x3000;
+    pub(super) const PD_GUEST_ADDRESS: usize = SandboxMemoryLayout::BASE_ADDRESS + PD_OFFSET;
+    pub(super) const PDPT_GUEST_ADDRESS: usize = SandboxMemoryLayout::BASE_ADDRESS + PDPT_OFFSET;
+    pub(super) const PT_GUEST_ADDRESS: usize = SandboxMemoryLayout::BASE_ADDRESS + PT_OFFSET;
+
     /// Helper to create a sandbox with page tables set up and return the manager
     fn create_sandbox_with_page_tables(
         config: Option<SandboxConfiguration>,
@@ -520,8 +523,6 @@ mod tests {
         let sandbox = UninitializedSandbox::new(GuestBinary::FilePath(path), config)
             .expect("failed to create sandbox");
 
-        let mem_size = sandbox.mgr.layout.get_memory_size()?;
-
         // Build the shared memory to get GuestSharedMemory
         let (_host_mem, guest_mem) = sandbox.mgr.shared_mem.build();
         let mut mgr = SandboxMemoryManager {
@@ -536,9 +537,8 @@ mod tests {
 
         // Get regions and set up page tables
         let mut regions = mgr.layout.get_memory_regions(&mgr.shared_mem)?;
-        let mem_size_u64 = u64::try_from(mem_size)?;
         // set_up_shared_memory builds the page tables in shared memory
-        mgr.set_up_shared_memory(mem_size_u64, &mut regions)?;
+        mgr.set_up_shared_memory(&mut regions)?;
 
         Ok(mgr)
     }
@@ -557,7 +557,7 @@ mod tests {
         let p = addr >> 21;
         let i = (addr >> 12) & 0x1ff;
         let pte_idx = p * 512 + i;
-        let offset = SandboxMemoryLayout::PT_OFFSET + (pte_idx * 8);
+        let offset = PT_OFFSET + (pte_idx * 8);
         let pte_val = excl_mem.read_u64(offset)?;
 
         let expected_pte = (addr as u64) | expected_flags;
@@ -579,22 +579,36 @@ mod tests {
     }
 
     /// Get expected flags for a memory region type
+    /// Previously we set the USER flag on some regions, but the new page table building code does
+    /// not do that, since (at present) we do not run code in user mode.
     fn get_expected_flags(region: &MemoryRegion) -> u64 {
         match region.region_type {
-            MemoryRegionType::Code => PAGE_PRESENT | PAGE_RW | PAGE_USER,
-            MemoryRegionType::Stack => PAGE_PRESENT | PAGE_RW | PAGE_USER | PAGE_NX,
+            MemoryRegionType::Code => PAGE_PRESENT | PAGE_RW,
+            MemoryRegionType::Stack => PAGE_PRESENT | PAGE_RW | PAGE_NX,
             #[cfg(feature = "executable_heap")]
-            MemoryRegionType::Heap => PAGE_PRESENT | PAGE_RW | PAGE_USER,
+            MemoryRegionType::Heap => PAGE_PRESENT | PAGE_RW,
             #[cfg(not(feature = "executable_heap"))]
-            MemoryRegionType::Heap => PAGE_PRESENT | PAGE_RW | PAGE_USER | PAGE_NX,
-            MemoryRegionType::GuardPage => PAGE_PRESENT | PAGE_RW | PAGE_USER | PAGE_NX,
+            MemoryRegionType::Heap => PAGE_PRESENT | PAGE_RW | PAGE_NX,
+            MemoryRegionType::GuardPage => PAGE_PRESENT | PAGE_RW | PAGE_NX,
             MemoryRegionType::InputData => PAGE_PRESENT | PAGE_RW | PAGE_NX,
             MemoryRegionType::OutputData => PAGE_PRESENT | PAGE_RW | PAGE_NX,
             MemoryRegionType::Peb => PAGE_PRESENT | PAGE_RW | PAGE_NX,
             MemoryRegionType::HostFunctionDefinitions => PAGE_PRESENT | PAGE_NX,
             MemoryRegionType::PageTables => PAGE_PRESENT | PAGE_RW | PAGE_NX,
-            MemoryRegionType::InitData => region.flags.translate_flags(),
+            MemoryRegionType::InitData => translate_flags(region.flags),
+        }
+    }
+
+    fn translate_flags(flags: MemoryRegionFlags) -> u64 {
+        let mut page_flags = 0;
+
+        page_flags |= PAGE_PRESENT | PAGE_RW; // Mark page as present and writeable
+
+        if !flags.contains(MemoryRegionFlags::EXECUTE) {
+            page_flags |= PAGE_NX; // Mark as non-executable if EXECUTE is not set
         }
+
+        page_flags
     }
 
     /// Verify the complete paging structure for a sandbox configuration
@@ -602,12 +616,18 @@ mod tests {
         let mut mgr = create_sandbox_with_page_tables(config)?;
         let regions = mgr.layout.get_memory_regions(&mgr.shared_mem)?;
+        let mem_size = mgr.layout.get_memory_size()?;
+
+        // Calculate how many PD entries should exist based on memory size
+        // Each PD entry covers 2MB (0x200000 bytes)
+        // Previously we wrote PD entries to cover the maximum memory size of the sandbox, but the new code only writes
+        // enough entries to cover the actual memory size.
+        let num_pd_entries_needed = mem_size.div_ceil(0x200000);
 
         mgr.shared_mem.with_exclusivity(|excl_mem| {
             // Verify PML4 entry (single entry pointing to PDPT)
-            let pml4_val = excl_mem.read_u64(SandboxMemoryLayout::PML4_OFFSET)?;
-            let expected_pml4 =
-                SandboxMemoryLayout::PDPT_GUEST_ADDRESS as u64 | PAGE_PRESENT | PAGE_RW;
+            let pml4_val = excl_mem.read_u64(PML4_OFFSET)?;
+            let expected_pml4 = PDPT_GUEST_ADDRESS as u64 | PAGE_PRESENT | PAGE_RW;
             if pml4_val != expected_pml4 {
                 return Err(new_error!(
                     "{}: PML4[0] incorrect: expected 0x{:x}, got 0x{:x}",
                     name,
@@ -618,9 +638,8 @@ mod tests {
             }
 
             // Verify PDPT entry (single entry pointing to PD)
-            let pdpt_val = excl_mem.read_u64(SandboxMemoryLayout::PDPT_OFFSET)?;
-            let expected_pdpt =
-                SandboxMemoryLayout::PD_GUEST_ADDRESS as u64 | PAGE_PRESENT | PAGE_RW;
+            let pdpt_val = excl_mem.read_u64(PDPT_OFFSET)?;
+            let expected_pdpt = PD_GUEST_ADDRESS as u64 | PAGE_PRESENT | PAGE_RW;
             if pdpt_val != expected_pdpt {
                 return Err(new_error!(
                     "{}: PDPT[0] incorrect: expected 0x{:x}, got 0x{:x}",
                     name,
@@ -630,12 +649,11 @@ mod tests {
                 ));
             }
 
-            // Verify all 512 PD entries (each pointing to a PT)
-            for i in 0..512 {
-                let offset = SandboxMemoryLayout::PD_OFFSET + (i * 8);
+            // Verify PD entries that should be present (based on memory size)
+            for i in 0..num_pd_entries_needed {
+                let offset = PD_OFFSET + (i * 8);
                 let pd_val = excl_mem.read_u64(offset)?;
-                let expected_pt_addr =
-                    SandboxMemoryLayout::PT_GUEST_ADDRESS as u64 + (i as u64 * 4096);
+                let expected_pt_addr = PT_GUEST_ADDRESS as u64 + (i as u64 * 4096);
                 let expected_pd = expected_pt_addr | PAGE_PRESENT | PAGE_RW;
                 if pd_val != expected_pd {
                     return Err(new_error!(
@@ -648,6 +666,20 @@ mod tests {
                 }
             }
 
+            // Verify remaining PD entries are not present (0)
+            for i in num_pd_entries_needed..512 {
+                let offset = PD_OFFSET + (i * 8);
+                let pd_val = excl_mem.read_u64(offset)?;
+                if pd_val != 0 {
+                    return Err(new_error!(
+                        "{}: PD[{}] should be 0 (not present), got 0x{:x}",
+                        name,
+                        i,
+                        pd_val
+                    ));
+                }
+            }
+
             // Verify PTEs for each memory region
             for region in &regions {
                 let start = region.guest_region.start;
diff --git a/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs b/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs
index 4369258ac..0697fc350 100644
--- a/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs
+++ b/src/hyperlight_host/src/sandbox/uninitialized_evolve.rs
@@ -30,8 +30,6 @@ use crate::mem::mgr::SandboxMemoryManager;
 use crate::mem::ptr::{GuestPtr, RawPtr};
 use crate::mem::ptr_offset::Offset;
 use crate::mem::shared_mem::GuestSharedMemory;
-#[cfg(target_os = "windows")]
-use crate::mem::shared_mem::SharedMemory;
 #[cfg(gdb)]
 use crate::sandbox::config::DebugInfo;
 #[cfg(feature = "mem_profile")]