Skip to content

Commit 07076ff

Browse files
committed
fix(windows): prevent race between WHvCancelRunVirtualProcessor and WHvDeletePartition
kill() could call WHvCancelRunVirtualProcessor while WhpVm::drop() was calling WHvDeletePartition, causing use-after-free crashes (STATUS_ACCESS_VIOLATION or STATUS_HEAP_CORRUPTION). Fix by protecting the partition handle with an RwLock. kill() takes a read lock, and set_dropped() takes a write lock to block until all in-flight cancel calls complete before the partition is deleted. Signed-off-by: Ludvig Liljenberg <[email protected]>
1 parent 6550531 commit 07076ff

File tree

4 files changed

+131
-16
lines changed

4 files changed

+131
-16
lines changed

src/hyperlight_host/src/hypervisor/hyperlight_vm.rs

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,20 +19,22 @@ use std::collections::HashMap;
1919
#[cfg(crashdump)]
2020
use std::path::Path;
2121
#[cfg(any(kvm, mshv3))]
22+
use std::sync::atomic::AtomicBool;
23+
use std::sync::atomic::AtomicU8;
24+
#[cfg(any(kvm, mshv3))]
2225
use std::sync::atomic::AtomicU64;
23-
use std::sync::atomic::{AtomicBool, AtomicU8};
2426
use std::sync::{Arc, Mutex};
2527

2628
use log::LevelFilter;
2729
use tracing::{Span, instrument};
2830
#[cfg(feature = "trace_guest")]
2931
use tracing_opentelemetry::OpenTelemetrySpanExt;
3032

31-
#[cfg(target_os = "windows")]
32-
use super::WindowsInterruptHandle;
3333
#[cfg(gdb)]
3434
use super::gdb::{DebugCommChannel, DebugMsg, DebugResponse, DebuggableVm, VcpuStopReason, arch};
3535
use super::regs::{CommonFpu, CommonRegisters};
36+
#[cfg(target_os = "windows")]
37+
use super::{PartitionState, WindowsInterruptHandle};
3638
use crate::HyperlightError::{ExecutionCanceledByHost, NoHypervisorFound};
3739
#[cfg(not(gdb))]
3840
use crate::hypervisor::Hypervisor;
@@ -168,8 +170,10 @@ impl HyperlightVm {
168170
#[cfg(target_os = "windows")]
169171
let interrupt_handle: Arc<dyn InterruptHandleImpl> = Arc::new(WindowsInterruptHandle {
170172
state: AtomicU8::new(0),
171-
partition_handle: vm.partition_handle(),
172-
dropped: AtomicBool::new(false),
173+
partition_state: std::sync::RwLock::new(PartitionState {
174+
handle: vm.partition_handle(),
175+
dropped: false,
176+
}),
173177
});
174178

175179
#[cfg_attr(not(gdb), allow(unused_mut))]

src/hyperlight_host/src/hypervisor/hyperv_windows.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -669,6 +669,11 @@ impl DebuggableVm for WhpVm {
669669

670670
impl Drop for WhpVm {
671671
fn drop(&mut self) {
672+
// HyperlightVm::drop() calls set_dropped() before this runs.
673+
// set_dropped() ensures no WHvCancelRunVirtualProcessor calls are in progress
674+
// or will be made in the future, so it's safe to delete the partition.
675+
// (HyperlightVm::drop() runs before its fields are dropped, so
676+
// set_dropped() completes before this Drop impl runs.)
672677
if let Err(e) = unsafe { WHvDeletePartition(self.partition) } {
673678
log::error!("Failed to delete partition: {}", e);
674679
}

src/hyperlight_host/src/hypervisor/mod.rs

Lines changed: 62 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,7 @@ use std::str::FromStr;
5757
#[cfg(any(kvm, mshv3))]
5858
use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU64, Ordering};
5959
#[cfg(target_os = "windows")]
60-
use std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
60+
use std::sync::atomic::{AtomicU8, Ordering};
6161
#[cfg(any(kvm, mshv3))]
6262
use std::time::Duration;
6363

@@ -413,8 +413,34 @@ pub(super) struct WindowsInterruptHandle {
413413
/// (e.g., during host function calls), but is cleared at the start of each new `VirtualCPU::run()` call.
414414
state: AtomicU8,
415415

416-
partition_handle: windows::Win32::System::Hypervisor::WHV_PARTITION_HANDLE,
417-
dropped: AtomicBool,
416+
/// RwLock protecting the partition handle and dropped state.
417+
///
418+
/// This lock prevents a race condition between `kill()` calling `WHvCancelRunVirtualProcessor`
419+
/// and `WhpVm::drop()` calling `WHvDeletePartition`. These two Windows Hypervisor Platform APIs
420+
/// must not execute concurrently - if `WHvDeletePartition` frees the partition while
421+
/// `WHvCancelRunVirtualProcessor` is still accessing it, the result is a use-after-free
422+
/// causing STATUS_ACCESS_VIOLATION or STATUS_HEAP_CORRUPTION.
423+
///
424+
/// The synchronization works as follows:
425+
/// - `kill()` takes a read lock before calling `WHvCancelRunVirtualProcessor`
426+
/// - `set_dropped()` takes a write lock, which blocks until all in-flight `kill()` calls complete,
427+
/// then sets `dropped = true`. This is called from `HyperlightVm::drop()` before `WhpVm::drop()`
428+
/// runs, ensuring no `kill()` is accessing the partition when `WHvDeletePartition` is called.
429+
partition_state: std::sync::RwLock<PartitionState>,
430+
}
431+
432+
/// State protected by the RwLock in `WindowsInterruptHandle`.
433+
///
434+
/// Contains a copy of the partition handle from `WhpVm` (not an owning reference).
435+
/// The RwLock and `dropped` flag ensure this handle is never used after `WhpVm`
436+
/// deletes the partition.
437+
#[cfg(target_os = "windows")]
438+
#[derive(Debug)]
439+
pub(super) struct PartitionState {
440+
/// Copy of partition handle from `WhpVm`. Only valid while `dropped` is false.
441+
pub(super) handle: windows::Win32::System::Hypervisor::WHV_PARTITION_HANDLE,
442+
/// Set true before partition deletion; prevents further use of `handle`.
443+
pub(super) dropped: bool,
418444
}
419445

420446
#[cfg(target_os = "windows")]
@@ -468,9 +494,20 @@ impl InterruptHandleImpl for WindowsInterruptHandle {
468494
}
469495

470496
fn set_dropped(&self) {
471-
// Release ordering to ensure all VM cleanup operations are visible
472-
// to any thread that checks dropped() via Acquire
473-
self.dropped.store(true, Ordering::Release);
497+
// Take write lock to:
498+
// 1. Wait for any in-flight kill() calls (holding read locks) to complete
499+
// 2. Block new kill() calls from starting while we hold the write lock
500+
// 3. Set dropped=true so no future kill() calls will use the handle
501+
// After this returns, no WHvCancelRunVirtualProcessor calls are in progress
502+
// or will ever be made, so WHvDeletePartition can safely be called.
503+
match self.partition_state.write() {
504+
Ok(mut guard) => {
505+
guard.dropped = true;
506+
}
507+
Err(e) => {
508+
log::error!("Failed to acquire partition_state write lock: {}", e);
509+
}
510+
}
474511
}
475512
}
476513

@@ -487,7 +524,15 @@ impl InterruptHandle for WindowsInterruptHandle {
487524
// This ensures we see the running state set by the vcpu thread
488525
let state = self.state.load(Ordering::Acquire);
489526
if state & Self::RUNNING_BIT != 0 {
490-
unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() }
527+
// Take read lock to prevent race with WHvDeletePartition in set_dropped().
528+
// Multiple kill() calls can proceed concurrently (read locks don't block each other),
529+
// but set_dropped() will wait for all kill() calls to complete before proceeding.
530+
let guard = self.partition_state.read().unwrap();
531+
if !guard.dropped {
532+
unsafe { WHvCancelRunVirtualProcessor(guard.handle, 0, 0).is_ok() }
533+
} else {
534+
false
535+
}
491536
} else {
492537
false
493538
}
@@ -501,16 +546,22 @@ impl InterruptHandle for WindowsInterruptHandle {
501546
// Acquire ordering to synchronize with the Release in set_running()
502547
let state = self.state.load(Ordering::Acquire);
503548
if state & Self::RUNNING_BIT != 0 {
504-
unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() }
549+
// Take read lock to prevent race with WHvDeletePartition in set_dropped()
550+
let guard = self.partition_state.read().unwrap();
551+
if !guard.dropped {
552+
unsafe { WHvCancelRunVirtualProcessor(guard.handle, 0, 0).is_ok() }
553+
} else {
554+
false
555+
}
505556
} else {
506557
false
507558
}
508559
}
509560

510561
fn dropped(&self) -> bool {
511-
// Acquire ordering to synchronize with the Release in set_dropped()
512-
// This ensures we see all VM cleanup operations that happened before drop
513-
self.dropped.load(Ordering::Acquire)
562+
// Take read lock to check dropped state consistently
563+
let guard = self.partition_state.read().unwrap();
564+
guard.dropped
514565
}
515566
}
516567

src/hyperlight_host/tests/integration_test.rs

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1664,3 +1664,58 @@ fn exception_handler_installation_and_validation() {
16641664
let count: i32 = sandbox.call("GetExceptionHandlerCallCount", ()).unwrap();
16651665
assert_eq!(count, 2, "Handler should have been called twice");
16661666
}
1667+
1668+
/// This test is "likely" to catch a race condition where WHvCancelRunVirtualProcessor runs halfway, then the partition is deleted (by drop calling WHvDeletePartition),
1669+
/// and WHvCancelRunVirtualProcessor continues, and tries to access freed memory.
1670+
///
1671+
/// Based on local observations, "likely" means that if the bug exist, running this test 5 times will catch it at least once.
1672+
#[test]
1673+
#[cfg(target_os = "windows")]
1674+
fn interrupt_cancel_delete_race() {
1675+
const NUM_THREADS: usize = 8;
1676+
const NUM_KILL_THREADS: usize = 4;
1677+
const ITERATIONS_PER_THREAD: usize = 1000;
1678+
1679+
let mut handles = vec![];
1680+
1681+
for _ in 0..NUM_THREADS {
1682+
handles.push(thread::spawn(move || {
1683+
for _ in 0..ITERATIONS_PER_THREAD {
1684+
let mut sandbox: MultiUseSandbox = new_uninit().unwrap().evolve().unwrap();
1685+
let interrupt_handle = sandbox.interrupt_handle();
1686+
1687+
let stop_flag = Arc::new(AtomicBool::new(false));
1688+
1689+
let kill_handles: Vec<_> = (0..NUM_KILL_THREADS)
1690+
.map(|_| {
1691+
let handle = interrupt_handle.clone();
1692+
let stop = stop_flag.clone();
1693+
thread::spawn(move || {
1694+
while !stop.load(Ordering::Relaxed) {
1695+
handle.kill();
1696+
}
1697+
})
1698+
})
1699+
.collect();
1700+
1701+
// Makes sure RUNNING_BIT is set when kill() is called
1702+
let _ = sandbox.call::<String>("Echo", "test".to_string());
1703+
1704+
// Drop the sandbox while kill threads are spamming
1705+
drop(sandbox);
1706+
1707+
// Signal kill threads to stop
1708+
stop_flag.store(true, Ordering::Relaxed);
1709+
1710+
// Wait for kill threads
1711+
for kill_handle in kill_handles {
1712+
kill_handle.join().expect("Kill thread panicked!");
1713+
}
1714+
}
1715+
}));
1716+
}
1717+
1718+
for handle in handles {
1719+
handle.join().unwrap();
1720+
}
1721+
}

0 commit comments

Comments
 (0)