Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support DDA on VA-backed OHCL VM: Configurable Bounce Buffer for DMA #275

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1065,6 +1065,7 @@ dependencies = [
"async-trait",
"futures",
"guestmem",
"hvdef",
"inspect",
"scsi_buffers",
"stackfuture",
Expand Down Expand Up @@ -2352,6 +2353,7 @@ name = "guestmem"
version = "0.0.0"
dependencies = [
"inspect",
"memory_range",
"pal_event",
"sparse_mmap",
"thiserror 2.0.0",
Expand Down Expand Up @@ -4193,6 +4195,7 @@ dependencies = [
"futures",
"guestmem",
"guid",
"hvdef",
"inspect",
"inspect_counters",
"mesh",
Expand All @@ -4207,8 +4210,10 @@ dependencies = [
"task_control",
"test_with_tracing",
"thiserror 2.0.0",
"tracelimit",
"tracing",
"user_driver",
"virt_mshv_vtl",
"vmcore",
"zerocopy",
]
Expand Down
32 changes: 24 additions & 8 deletions openhcl/underhill_core/src/nvme_manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ use thiserror::Error;
use tracing::Instrument;
use user_driver::vfio::VfioDevice;
use user_driver::vfio::VfioDmaBuffer;
use virt_mshv_vtl::UhPartition;
use vm_resource::kind::DiskHandleKind;
use vm_resource::AsyncResolveResource;
use vm_resource::ResourceId;
Expand Down Expand Up @@ -83,6 +84,9 @@ impl NvmeManager {
driver_source: &VmTaskDriverSource,
vp_count: u32,
dma_buffer: Arc<dyn VfioDmaBuffer>,
dma_bounce_buffer_pages_per_queue: u64,
dma_bounce_buffer_pages_per_io_threshold: Option<u32>,
partition: Option<Arc<UhPartition>>,
) -> Self {
let (send, recv) = mesh::channel();
let driver = driver_source.simple();
Expand All @@ -91,6 +95,9 @@ impl NvmeManager {
devices: HashMap::new(),
vp_count,
dma_buffer,
dma_bounce_buffer_pages_per_queue,
dma_bounce_buffer_pages_per_io_threshold,
partition,
};
let task = driver.spawn("nvme-manager", async move { worker.run(recv).await });
Self {
Expand Down Expand Up @@ -167,6 +174,9 @@ struct NvmeManagerWorker {
#[inspect(skip)]
dma_buffer: Arc<dyn VfioDmaBuffer>,
vp_count: u32,
dma_bounce_buffer_pages_per_queue: u64,
dma_bounce_buffer_pages_per_io_threshold: Option<u32>,
partition: Option<Arc<UhPartition>>,
}

impl NvmeManagerWorker {
Expand Down Expand Up @@ -238,14 +248,20 @@ impl NvmeManagerWorker {
.await
.map_err(InnerError::Vfio)?;

let driver =
nvme_driver::NvmeDriver::new(&self.driver_source, self.vp_count, device)
.instrument(tracing::info_span!(
"nvme_driver_init",
pci_id = entry.key()
))
.await
.map_err(InnerError::DeviceInitFailed)?;
let driver = nvme_driver::NvmeDriver::new(
&self.driver_source,
self.vp_count,
device,
self.dma_bounce_buffer_pages_per_queue,
self.dma_bounce_buffer_pages_per_io_threshold,
self.partition.clone(),
)
.instrument(tracing::info_span!(
"nvme_driver_init",
pci_id = entry.key()
))
.await
.map_err(InnerError::DeviceInitFailed)?;

entry.insert(driver)
}
Expand Down
123 changes: 82 additions & 41 deletions openhcl/underhill_core/src/worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1207,12 +1207,89 @@ async fn new_underhill_vm(

let boot_info = runtime_params.parsed_openhcl_boot();

// Determine if x2apic is supported so that the topology matches
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Are these changes indeed part of your commit? Or stale from a fork?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have to move these codes up so we can calculate device_dma using processor_topology.vp_count() below.

// reality.
//
// We don't know if x2apic is forced on, but currently it doesn't really
// matter because the topology's initial x2apic state is not currently
// used in Underhill.
//
// FUTURE: consider having Underhill decide whether x2apic is enabled at
// boot rather than allowing the host to make that decision. This would
// just require Underhill setting the apicbase register on the VPs
// before start.
//
// TODO: centralize cpuid querying logic.
#[cfg(guest_arch = "x86_64")]
let x2apic = if isolation.is_hardware_isolated() {
// For hardware CVMs, always enable x2apic support at boot.
vm_topology::processor::x86::X2ApicState::Enabled
} else if safe_x86_intrinsics::cpuid(x86defs::cpuid::CpuidFunction::VersionAndFeatures.0, 0).ecx
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fyi, this will conflict with Eric's recent change that refactors this crate.

& (1 << 21)
!= 0
{
vm_topology::processor::x86::X2ApicState::Supported
} else {
vm_topology::processor::x86::X2ApicState::Unsupported
};

#[cfg(guest_arch = "x86_64")]
let processor_topology = new_x86_topology(&boot_info.cpus, x2apic)
.context("failed to construct the processor topology")?;

#[cfg(guest_arch = "aarch64")]
let processor_topology = new_aarch64_topology(
boot_info
.gic
.context("did not get gic state from bootloader")?,
&boot_info.cpus,
)
.context("failed to construct the processor topology")?;

// The amount of memory required by the GET igvm_attest request
let attestation = get_protocol::IGVM_ATTEST_MSG_SHARED_GPA as u64 * hvdef::HV_PAGE_SIZE;

// TODO: determine actual memory usage by NVME/MANA. hardcode as 10MB
let device_dma = 10 * 1024 * 1024;
const MIN_PER_QUEUE_PAGES: u64 = (128 * 1024 + hvdef::HV_PAGE_SIZE) / hvdef::HV_PAGE_SIZE;
const DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE: u64 = 128;
#[allow(clippy::assertions_on_constants)]
const _: () = assert!(
DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE >= MIN_PER_QUEUE_PAGES,
"not enough room for an ATAPI IO plus a PRP list"
);

const DEFAULT_NVME_DRIVERS: u32 = 8;
let (max_nvme_drivers, dma_bounce_buffer_pages_per_queue, dma_bounce_buffer_pages_per_io_threshold) = dps.general.vtl2_settings.as_ref().map_or(
(DEFAULT_NVME_DRIVERS, DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE, None),
|vtl2_settings| {
let original_dma_bounce_buffer_pages_per_queue = vtl2_settings
.fixed
.dma_bounce_buffer_pages_per_queue
.unwrap_or(DEFAULT_DMA_BOUNCE_BUFFER_PAGES_PER_QUEUE);

let dma_bounce_buffer_pages_per_queue = if original_dma_bounce_buffer_pages_per_queue < MIN_PER_QUEUE_PAGES {
tracing::warn!(
"the value of dma_bounce_buffer_pages_per_queue ({}) is less than MIN_PER_QUEUE_PAGES ({})",
original_dma_bounce_buffer_pages_per_queue, MIN_PER_QUEUE_PAGES
);
MIN_PER_QUEUE_PAGES
} else {
original_dma_bounce_buffer_pages_per_queue
};

(
vtl2_settings.fixed.max_nvme_drivers.unwrap_or(DEFAULT_NVME_DRIVERS),
dma_bounce_buffer_pages_per_queue,
vtl2_settings.fixed.dma_bounce_buffer_pages_per_io_threshold,
)
},
);

// TODO: determine actual memory usage by NVME/MANA. hardcode as 10MB
let device_dma = 10 * 1024 * 1024
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you give a rough estimate of how much memory this will use?

I suspect that John's comments obviate this discussion, but it does seem like this is inefficient: we're allocating (what I assume is) a lot of memory for devices that may not be using them.

+ max_nvme_drivers as u64
* processor_topology.vp_count() as u64
* dma_bounce_buffer_pages_per_queue
* hvdef::HV_PAGE_SIZE;
// Determine the amount of shared memory to reserve from VTL0.
let shared_pool_size = match isolation {
#[cfg(guest_arch = "x86_64")]
Expand Down Expand Up @@ -1275,45 +1352,6 @@ async fn new_underhill_vm(
physical_address_size,
)?;

// Determine if x2apic is supported so that the topology matches
// reality.
//
// We don't know if x2apic is forced on, but currently it doesn't really
// matter because the topology's initial x2apic state is not currently
// used in Underhill.
//
// FUTURE: consider having Underhill decide whether x2apic is enabled at
// boot rather than allowing the host to make that decision. This would
// just require Underhill setting the apicbase register on the VPs
// before start.
//
// TODO: centralize cpuid querying logic.
#[cfg(guest_arch = "x86_64")]
let x2apic = if isolation.is_hardware_isolated() {
// For hardware CVMs, always enable x2apic support at boot.
vm_topology::processor::x86::X2ApicState::Enabled
} else if safe_x86_intrinsics::cpuid(x86defs::cpuid::CpuidFunction::VersionAndFeatures.0, 0).ecx
& (1 << 21)
!= 0
{
vm_topology::processor::x86::X2ApicState::Supported
} else {
vm_topology::processor::x86::X2ApicState::Unsupported
};

#[cfg(guest_arch = "x86_64")]
let processor_topology = new_x86_topology(&boot_info.cpus, x2apic)
.context("failed to construct the processor topology")?;

#[cfg(guest_arch = "aarch64")]
let processor_topology = new_aarch64_topology(
boot_info
.gic
.context("did not get gic state from bootloader")?,
&boot_info.cpus,
)
.context("failed to construct the processor topology")?;

let mut with_vmbus: bool = false;
let mut with_vmbus_relay = false;
if dps.general.vmbus_redirection_enabled {
Expand Down Expand Up @@ -1737,6 +1775,9 @@ async fn new_underhill_vm(
&driver_source,
processor_topology.vp_count(),
vfio_dma_buffer(&shared_vis_pages_pool),
dma_bounce_buffer_pages_per_queue,
dma_bounce_buffer_pages_per_io_threshold,
Some(partition.clone()),
);

resolver.add_async_resolver::<DiskHandleKind, _, NvmeDiskConfig, _>(NvmeDiskResolver::new(
Expand Down
21 changes: 21 additions & 0 deletions openhcl/virt_mshv_vtl/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1863,6 +1863,27 @@ impl UhPartition {
);
}
}

/// Pins the specified guest physical address ranges in the hypervisor.
/// The memory ranges passed to this function must be VA backed memory.
/// If a partial failure occurs (i.e., some but not all the ranges were successfully pinned),
/// the function will automatically attempt to unpin any successfully pinned ranges.
/// This "rollback" behavior ensures that no partially pinned state remains, which
/// could otherwise lead to inconsistencies.
///
pub fn pin_gpa_ranges(&self, ranges: &[MemoryRange]) -> Result<(), HvError> {
self.inner.hcl.pin_gpa_ranges(ranges)
}

/// Unpins the specified guest physical address ranges in the hypervisor.
/// The memory ranges passed to this function must be VA backed memory.
/// If a partial failure occurs (i.e., some but not all the ranges were successfully unpinned),
/// the function will automatically attempt to pin any successfully unpinned ranges. This "rollback"
/// behavior ensures that no partially unpinned state remains, which could otherwise lead to inconsistencies.
///
pub fn unpin_gpa_ranges(&self, ranges: &[MemoryRange]) -> Result<(), HvError> {
self.inner.hcl.unpin_gpa_ranges(ranges)
}
}

#[cfg(guest_arch = "x86_64")]
Expand Down
8 changes: 7 additions & 1 deletion vm/devices/get/underhill_config/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,14 @@ pub struct Vtl2SettingsFixed {
pub scsi_sub_channels: u16,
/// size of the io-uring submission queues
pub io_ring_size: u32,
/// Max bounce buffer pages active per cpu
/// Max bounce buffer pages active per cpu for unaligned IOs
pub max_bounce_buffer_pages: Option<u32>,
// DMA bounce buffer pages per queue
pub dma_bounce_buffer_pages_per_queue: Option<u64>,
// Threshold of io size in pages to use bounce buffer
pub dma_bounce_buffer_pages_per_io_threshold: Option<u32>,
// Max nvme drivers
pub max_nvme_drivers: Option<u32>,
}

#[derive(Debug, Clone, MeshPayload, Inspect)]
Expand Down
3 changes: 3 additions & 0 deletions vm/devices/get/underhill_config/src/schema/v1.rs
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,9 @@ impl ParseSchema<crate::Vtl2SettingsFixed> for Vtl2SettingsFixed {
scsi_sub_channels: self.scsi_sub_channels.map_or(0, |x| x as u16),
io_ring_size: self.io_ring_size.unwrap_or(256),
max_bounce_buffer_pages: self.max_bounce_buffer_pages,
dma_bounce_buffer_pages_per_queue: self.dma_bounce_buffer_pages_per_queue,
dma_bounce_buffer_pages_per_io_threshold: self.dma_bounce_buffer_pages_per_io_threshold,
max_nvme_drivers: self.max_nvme_drivers,
})
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@ message Vtl2SettingsFixed {
optional uint32 io_ring_size = 2;
// Specify the maximum number of bounce buffer pages allowed per cpu
optional uint32 max_bounce_buffer_pages = 3;
optional uint64 dma_bounce_buffer_pages_per_queue = 4;
optional uint32 dma_bounce_buffer_pages_per_io_threshold = 5;
optional uint32 max_nvme_drivers = 6;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Again a moot point, but I want to get in the habit of asking: we need a test that ensures that not passing in these values results in acceptable behavior. I see the code handling it, but since this is our API surface: we need to be certain we aren't regressing it. Both JSON (until we delete ...) and PB.

}

message Vtl2SettingsDynamic {
Expand Down
1 change: 1 addition & 0 deletions vm/devices/storage/disk_backend/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ rust-version.workspace = true

[dependencies]
scsi_buffers.workspace = true
hvdef.workspace = true

guestmem.workspace = true
vm_resource.workspace = true
Expand Down
3 changes: 3 additions & 0 deletions vm/devices/storage/disk_backend/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ pub mod sync_wrapper;
pub mod zerodisk;

use guestmem::AccessError;
use hvdef::HvError;
use inspect::Inspect;
use scsi_buffers::RequestBuffers;
use stackfuture::StackFuture;
Expand Down Expand Up @@ -50,6 +51,8 @@ pub enum DiskError {
ReservationConflict,
#[error("unsupported eject")]
UnsupportedEject,
#[error("failed to pin/unpin guest memory {0}")]
Hv(HvError),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not just "PinFailure" instead of Hv?

}

/// Io error details
Expand Down
3 changes: 3 additions & 0 deletions vm/devices/storage/disk_nvme/nvme_driver/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ task_control.workspace = true
user_driver.workspace = true
guestmem.workspace = true
vmcore.workspace = true
virt_mshv_vtl.workspace = true
hvdef.workspace = true

anyhow.workspace = true
event-listener.workspace = true
Expand All @@ -25,6 +27,7 @@ safeatomic.workspace = true
slab.workspace = true
thiserror.workspace = true
tracing.workspace = true
tracelimit.workspace = true
zerocopy.workspace = true

[dev-dependencies]
Expand Down
Loading
Loading