diff --git a/Cargo.lock b/Cargo.lock index ecdac628b09..fe38a68439d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1294,8 +1294,7 @@ checksum = "a156c684c91ea7d62626509bce3cb4e1d9ed5c4d978f7b4352658f96a4c26b4a" [[package]] name = "userfaultfd" version = "0.7.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6a38c7e24af201e66f02659492f86ccd5efa9fd62f2c851fff3961a60e81966" +source = "git+https://github.com/codesandbox/userfaultfd-rs.git?rev=27b2ff3bc71774b79338afca051d49cb07146d90#27b2ff3bc71774b79338afca051d49cb07146d90" dependencies = [ "bitflags 2.3.3", "cfg-if", @@ -1308,8 +1307,7 @@ dependencies = [ [[package]] name = "userfaultfd-sys" version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d75595d2a62b7db16bd47f5a1ce14e1fe05ccbe27d6c96721a958e0a027cad41" +source = "git+https://github.com/codesandbox/userfaultfd-rs.git?rev=27b2ff3bc71774b79338afca051d49cb07146d90#27b2ff3bc71774b79338afca051d49cb07146d90" dependencies = [ "bindgen 0.68.1", "cc", diff --git a/build.rs b/build.rs new file mode 100644 index 00000000000..e69de29bb2d diff --git a/resources/seccomp/aarch64-unknown-linux-musl.json b/resources/seccomp/aarch64-unknown-linux-musl.json index a8bb544f40e..eef1be4c5a7 100644 --- a/resources/seccomp/aarch64-unknown-linux-musl.json +++ b/resources/seccomp/aarch64-unknown-linux-musl.json @@ -629,6 +629,19 @@ } ] }, + { + "syscall": "msync", + "comment": "Used to sync memory from mmap to disk", + "args": [ + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 4, + "comment": "MS_SYNC" + } + ] + }, { "syscall": "rt_sigaction", "comment": "rt_sigaction is used by libc::abort during a panic to install the default handler for SIGABRT", diff --git a/resources/seccomp/x86_64-unknown-linux-musl.json b/resources/seccomp/x86_64-unknown-linux-musl.json index bcdf00edd4c..d58f671b03e 100644 --- a/resources/seccomp/x86_64-unknown-linux-musl.json +++ 
b/resources/seccomp/x86_64-unknown-linux-musl.json @@ -260,6 +260,31 @@ } ] }, + { + "syscall": "msync", + "comment": "Used to sync memory from mmap to disk", + "args": [ + { + "index": 2, + "type": "dword", + "op": "eq", + "val": 4, + "comment": "MS_SYNC" + } + ] + }, + { + "syscall": "memfd_create", + "comment": "Used to create a memory backed file descriptor that can be used to save memory to" + }, + { + "syscall": "nanosleep", + "comment": "Debugging sleep" + }, + { + "syscall": "copy_file_range", + "comment": "debugging" + }, { "syscall": "rt_sigaction", "comment": "rt_sigaction is used by libc::abort during a panic to install the default handler for SIGABRT", diff --git a/src/api_server/src/parsed_request.rs b/src/api_server/src/parsed_request.rs index aba39843a32..5edf6bad1ba 100644 --- a/src/api_server/src/parsed_request.rs +++ b/src/api_server/src/parsed_request.rs @@ -21,6 +21,7 @@ use crate::request::logger::parse_put_logger; use crate::request::machine_configuration::{ parse_get_machine_config, parse_patch_machine_config, parse_put_machine_config, }; +use crate::request::memory_backend::parse_put_memory_backend; use crate::request::metrics::parse_put_metrics; use crate::request::mmds::{parse_get_mmds, parse_patch_mmds, parse_put_mmds}; use crate::request::net::{parse_patch_net, parse_put_net}; @@ -91,6 +92,7 @@ impl TryFrom<&Request> for ParsedRequest { (Method::Put, "drives", Some(body)) => parse_put_drive(body, path_tokens.next()), (Method::Put, "logger", Some(body)) => parse_put_logger(body), (Method::Put, "machine-config", Some(body)) => parse_put_machine_config(body), + (Method::Put, "memory-backend", Some(body)) => parse_put_memory_backend(body), (Method::Put, "metrics", Some(body)) => parse_put_metrics(body), (Method::Put, "mmds", Some(body)) => parse_put_mmds(body, path_tokens.next()), (Method::Put, "network-interfaces", Some(body)) => { diff --git a/src/api_server/src/request/memory_backend.rs b/src/api_server/src/request/memory_backend.rs new 
file mode 100644 index 00000000000..a99b5afeb1e --- /dev/null +++ b/src/api_server/src/request/memory_backend.rs @@ -0,0 +1,46 @@ +// Copyright 2018 Amazon.com, Inc. or its affiliates. All Rights Reserved. +// SPDX-License-Identifier: Apache-2.0 + +use super::super::VmmAction; +use crate::parsed_request::{Error, ParsedRequest}; +use crate::request::Body; +use vmm::logger::{IncMetric, METRICS}; +use vmm::vmm_config::snapshot::MemBackendConfig; + +pub(crate) fn parse_put_memory_backend(body: &Body) -> Result { + METRICS.put_api_requests.memory_backend_cfg_count.inc(); + Ok(ParsedRequest::new_sync(VmmAction::SetMemoryBackend( + serde_json::from_slice::(body.raw()).map_err(|e| { + METRICS.put_api_requests.memory_backend_cfg_fails.inc(); + Error::SerdeJson(e) + })?, + ))) +} + +#[cfg(test)] +mod tests { + use std::path::PathBuf; + + use vmm::vmm_config::snapshot::MemBackendType; + + use super::*; + + #[test] + fn test_parse_memory_backing_file() { + assert!(parse_put_memory_backend(&Body::new("invalid_payload")).is_err()); + + let body = r#"{ + "backend_type": "File", + "backend_path": "./memory.snap" + }"#; + let same_body = MemBackendConfig { + backend_type: MemBackendType::File, + backend_path: PathBuf::from("./memory.snap"), + }; + let result = parse_put_memory_backend(&Body::new(body)); + assert!(result.is_ok()); + let parsed_req = result.unwrap_or_else(|_e| panic!("Failed test.")); + + assert!(parsed_req == ParsedRequest::new_sync(VmmAction::SetMemoryBackend(same_body))); + } +} diff --git a/src/api_server/src/request/mod.rs b/src/api_server/src/request/mod.rs index b573ad9aeeb..cb8677c4e6b 100644 --- a/src/api_server/src/request/mod.rs +++ b/src/api_server/src/request/mod.rs @@ -10,6 +10,7 @@ pub mod entropy; pub mod instance_info; pub mod logger; pub mod machine_configuration; +pub mod memory_backend; pub mod metrics; pub mod mmds; pub mod net; diff --git a/src/api_server/swagger/firecracker.yaml b/src/api_server/swagger/firecracker.yaml index 
8b515a12c69..f4502988953 100644 --- a/src/api_server/swagger/firecracker.yaml +++ b/src/api_server/swagger/firecracker.yaml @@ -376,6 +376,29 @@ paths: description: Internal server error schema: $ref: "#/definitions/Error" + + /memory-backend: + put: + summary: Configures a memory backend to sync the memory changes from during the runtime of the vm + operationId: putMemoryBackend + parameters: + - name: body + in: body + description: The memory backend to use + required: true + schema: + $ref: "#/definitions/MemoryBackend" + responses: + 204: + description: Memory backend configured + 400: + description: Memory backend failed + schema: + $ref: "#/definitions/Error" + default: + description: Internal server error. + schema: + $ref: "#/definitions/Error" /metrics: put: diff --git a/src/firecracker/Cargo.toml b/src/firecracker/Cargo.toml index 415964fa986..70f1a10bf91 100644 --- a/src/firecracker/Cargo.toml +++ b/src/firecracker/Cargo.toml @@ -28,11 +28,16 @@ vmm = { path = "../vmm" } [dev-dependencies] cargo_toml = "0.16.0" -regex = { version = "1.9.5", default-features = false, features = ["std", "unicode-perl"] } +regex = { version = "1.9.5", default-features = false, features = [ + "std", + "unicode-perl", +] } # Dev-Dependencies for uffd examples serde = { version = "1.0.188", features = ["derive"] } -userfaultfd = "0.7.0" +userfaultfd = { git = "https://github.com/codesandbox/userfaultfd-rs.git", rev = "27b2ff3bc71774b79338afca051d49cb07146d90", features = [ + "linux5_7", +] } [build-dependencies] bincode = "1.2.1" diff --git a/src/jailer/src/env.rs b/src/jailer/src/env.rs index a2e587d387a..21fdaa4cc66 100644 --- a/src/jailer/src/env.rs +++ b/src/jailer/src/env.rs @@ -467,7 +467,7 @@ impl Env { // section), this latter part is not desirable in Firecracker's // threat model. Copying prevents 2 Firecracker processes from // sharing memory. 
- fs::copy(&self.exec_file_path, &self.chroot_dir).map_err(|err| { + fs::hard_link(&self.exec_file_path, &self.chroot_dir).map_err(|err| { JailerError::Copy(self.exec_file_path.clone(), self.chroot_dir.clone(), err) })?; diff --git a/src/snapshot/src/lib.rs b/src/snapshot/src/lib.rs index 8b394c4a4ac..5dbfed2595b 100644 --- a/src/snapshot/src/lib.rs +++ b/src/snapshot/src/lib.rs @@ -218,9 +218,10 @@ impl Snapshot { object .serialize(&mut writer, &self.version_map, self.target_version) .map_err(Error::Versionize)?; - writer - .flush() - .map_err(|ref err| Error::Io(err.raw_os_error().unwrap_or(libc::EINVAL))) + // writer + // .flush() + // .map_err(|ref err| Error::Io(err.raw_os_error().unwrap_or(libc::EINVAL))) + Ok(()) } // Returns the current snapshot format version. diff --git a/src/utils/src/vm_memory.rs b/src/utils/src/vm_memory.rs index 5b019e6640d..a7b9c59fe20 100644 --- a/src/utils/src/vm_memory.rs +++ b/src/utils/src/vm_memory.rs @@ -120,7 +120,7 @@ pub fn create_guest_memory( for region in regions { let flags = match region.0 { None => libc::MAP_NORESERVE | libc::MAP_PRIVATE | libc::MAP_ANONYMOUS, - Some(_) => libc::MAP_NORESERVE | libc::MAP_PRIVATE, + Some(_) => libc::MAP_NORESERVE | libc::MAP_SHARED, }; let mmap_region = diff --git a/src/vmm/Cargo.toml b/src/vmm/Cargo.toml index 65b0f07d1c6..e889ca64a34 100644 --- a/src/vmm/Cargo.toml +++ b/src/vmm/Cargo.toml @@ -11,7 +11,10 @@ bench = false [dependencies] aws-lc-rs = "1.0.2" bitflags = "2.0.2" -derive_more = { version = "0.99.17", default-features = false, features = ["from", "display"] } +derive_more = { version = "0.99.17", default-features = false, features = [ + "from", + "display", +] } event-manager = "0.3.0" kvm-bindings = { version = "0.6.0", features = ["fam-wrappers"] } kvm-ioctls = "0.15.0" @@ -24,21 +27,23 @@ serde_json = "1.0.78" timerfd = "1.5.0" thiserror = "1.0.32" displaydoc = "0.2.4" -userfaultfd = "0.7.0" +userfaultfd = { git = "https://github.com/codesandbox/userfaultfd-rs.git", rev 
= "27b2ff3bc71774b79338afca051d49cb07146d90", features = [ + "linux5_7", +] } versionize = "0.1.10" versionize_derive = "0.1.5" vm-allocator = "0.1.0" vm-fdt = "0.2.0" vm-superio = "0.7.0" log = { version = "0.4.17", features = ["std", "serde"] } -aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] } +aes-gcm = { version = "0.10.1", default-features = false, features = ["aes"] } base64 = "0.13.0" bincode = "1.2.1" micro_http = { git = "https://github.com/firecracker-microvm/micro-http" } net_gen = { path = "../net_gen" } seccompiler = { path = "../seccompiler" } -snapshot = { path = "../snapshot"} +snapshot = { path = "../snapshot" } utils = { path = "../utils" } virtio_gen = { path = "../virtio_gen" } diff --git a/src/vmm/src/builder.rs b/src/vmm/src/builder.rs index 21318d4a565..4cd0f239f01 100644 --- a/src/vmm/src/builder.rs +++ b/src/vmm/src/builder.rs @@ -6,8 +6,15 @@ #[cfg(target_arch = "x86_64")] use std::convert::TryFrom; use std::fmt::Debug; -use std::io::{self, Seek, SeekFrom}; +use std::fs::{File, OpenOptions}; +use std::io::{self, Read, Seek, SeekFrom}; +use std::os::fd::AsRawFd; +use std::os::unix::net::UnixStream; +use std::path::Path; use std::sync::{Arc, Mutex}; +use userfaultfd::{FeatureFlags, Uffd, UffdBuilder}; +use utils::sock_ctrl_msg::ScmSocket; +use vm_memory::{FileOffset, GuestMemory}; use event_manager::{MutEventSubscriber, SubscriberOps}; use libc::EFD_NONBLOCK; @@ -20,15 +27,14 @@ use linux_loader::loader::KernelLoader; use log::error; use seccompiler::BpfThreadMap; use snapshot::Persist; -use userfaultfd::Uffd; use utils::eventfd::EventFd; use utils::time::TimestampUs; -use utils::vm_memory::{GuestAddress, GuestMemory, GuestMemoryMmap, ReadVolatile}; +use utils::vm_memory::{self, GuestAddress, GuestMemoryMmap, ReadVolatile}; #[cfg(target_arch = "aarch64")] use vm_superio::Rtc; use vm_superio::Serial; -use crate::arch::InitrdConfig; +use crate::arch::{self, InitrdConfig}; #[cfg(target_arch = "aarch64")] use 
crate::construct_kvm_mpidrs; use crate::cpu_config::templates::{ @@ -49,7 +55,7 @@ use crate::devices::virtio::{ use crate::devices::BusDevice; #[cfg(target_arch = "aarch64")] use crate::logger; -use crate::persist::{MicrovmState, MicrovmStateError}; +use crate::persist::{GuestRegionUffdMapping, MemoryDescriptor, MicrovmState, MicrovmStateError}; use crate::resources::VmResources; use crate::vmm_config::boot_source::BootConfig; use crate::vmm_config::instance_info::InstanceInfo; @@ -64,6 +70,9 @@ pub enum StartMicrovmError { /// Unable to attach block device to Vmm. #[error("Unable to attach block device to Vmm: {0}")] AttachBlockDevice(io::Error), + /// Unable to create/open the memory backing file. + #[error("Unable to create/open the memory backing file: {0}")] + BackingMemoryFile(io::Error), /// This error is thrown by the minimal boot loader implementation. #[error("System configuration error: {0:?}")] ConfigureSystem(crate::arch::ConfigurationError), @@ -135,6 +144,25 @@ pub enum StartMicrovmError { /// Unable to set VmResources. #[error("Cannot set vm resources: {0}")] SetVmResources(VmConfigError), + /// Failed to create an UFFD Builder. + #[error("Cannot create uffd socket: {0:?}")] + CreateUffdBuilder(userfaultfd::Error), + /// Unable to connect to UDS in order to send information regarding + /// handling guest memory page-fault events. + #[error("Cannot connect to uffd socket: {0}")] + UdsConnection(io::Error), + /// Failed to register guest memory regions to UFFD. + #[error("Cannot register uffd memory regions: {0}")] + UffdMemoryRegionsRegister(userfaultfd::Error), + /// Failed to send guest memory layout and path to user fault FD used to handle + /// guest memory page faults. This information is sent to a UDS where a custom + /// page-fault handler process is listening. 
+ #[error("Cannot send to uffd: {0}")] + UffdSend(kvm_ioctls::Error), + + /// Failed to get the memfd from the uffd socket + #[error("No memfd received from uffd.")] + NoMemFdReceived, /// Failed to create an Entropy device #[error("Cannot create the entropy device: {0}")] CreateEntropyDevice(crate::devices::virtio::rng::EntropyError), @@ -153,7 +181,7 @@ fn create_vmm_and_vcpus( instance_info: &InstanceInfo, event_manager: &mut EventManager, guest_memory: GuestMemoryMmap, - uffd: Option, + memory_descriptor: Option, track_dirty_pages: bool, vcpu_count: u8, kvm_capabilities: Vec, @@ -231,7 +259,7 @@ fn create_vmm_and_vcpus( shutdown_exit_code: None, vm, guest_memory, - uffd, + memory_descriptor, vcpus_handles: Vec::new(), vcpus_exit_evt, mmio_device_manager, @@ -263,7 +291,53 @@ pub fn build_microvm_for_boot( .ok_or(MissingKernelConfig)?; let track_dirty_pages = vm_resources.track_dirty_pages(); - let guest_memory = create_guest_memory(vm_resources.vm_config.mem_size_mib, track_dirty_pages)?; + + let (guest_memory, memory_descriptor, _file) = + if let Some(ref backend_config) = vm_resources.memory_backend { + match backend_config.backend_type { + crate::vmm_config::snapshot::MemBackendType::File => { + let file = OpenOptions::new() + .read(true) + .write(true) + .create(true) + .open(&backend_config.backend_path) + .map_err(BackingMemoryFile)?; + file.set_len((vm_resources.vm_config.mem_size_mib * 1024 * 1024) as u64) + .map_err(|e| { + error!("Failed to set backing memory file size: {}", e); + StartMicrovmError::BackingMemoryFile(e) + })?; + + let file = Arc::new(file); + + ( + create_guest_memory( + vm_resources.vm_config.mem_size_mib, + Some(file.clone()), + track_dirty_pages, + )?, + Some(MemoryDescriptor::File(file)), + None, + ) + } + crate::vmm_config::snapshot::MemBackendType::Uffd => { + let (mem, uffd, file) = create_uffd_guest_memory( + vm_resources.vm_config.mem_size_mib, + backend_config.backend_path.as_path(), + track_dirty_pages, + )?; + + (mem, 
Some(MemoryDescriptor::Uffd(uffd)), Some(file)) + } + } + } else { + ( + create_guest_memory(vm_resources.vm_config.mem_size_mib, None, track_dirty_pages)?, + None, + None, + ) + }; + let entry_addr = load_kernel(boot_config, &guest_memory)?; let initrd = load_initrd_from_config(boot_config, &guest_memory)?; // Clone the command-line so that a failed boot doesn't pollute the original. @@ -276,7 +350,7 @@ pub fn build_microvm_for_boot( instance_info, event_manager, guest_memory, - None, + memory_descriptor, track_dirty_pages, vm_resources.vm_config.vcpu_count, cpu_template.kvm_capabilities.clone(), @@ -427,7 +501,7 @@ pub fn build_microvm_from_snapshot( event_manager: &mut EventManager, microvm_state: MicrovmState, guest_memory: GuestMemoryMmap, - uffd: Option, + memory_descriptor: Option, track_dirty_pages: bool, seccomp_filters: &BpfThreadMap, vm_resources: &mut VmResources, @@ -441,7 +515,7 @@ pub fn build_microvm_from_snapshot( instance_info, event_manager, guest_memory.clone(), - uffd, + memory_descriptor, track_dirty_pages, vcpu_count, microvm_state.vm_state.kvm_cap_modifiers.clone(), @@ -541,21 +615,144 @@ pub fn build_microvm_from_snapshot( /// Creates GuestMemory of `mem_size_mib` MiB in size. pub fn create_guest_memory( mem_size_mib: usize, + backing_memory_file: Option>, track_dirty_pages: bool, ) -> Result { let mem_size = mem_size_mib << 20; let arch_mem_regions = crate::arch::arch_memory_regions(mem_size); + let mut offset = 0_u64; utils::vm_memory::create_guest_memory( &arch_mem_regions .iter() - .map(|(addr, size)| (None, *addr, *size)) + .map(|(addr, size)| { + let file_offset = backing_memory_file + .clone() + .map(|file| FileOffset::from_arc(file, offset)); + offset += *size as u64; + + (file_offset, *addr, *size) + }) .collect::>()[..], track_dirty_pages, ) .map_err(StartMicrovmError::GuestMemoryMmap) } +/// Creates GuestMemory of `mem_size_mib` MiB in size. 
+pub fn create_uffd_guest_memory( + mem_size_mib: usize, + uds_socket_path: &Path, + track_dirty_pages: bool, +) -> std::result::Result<(GuestMemoryMmap, Uffd, Arc), StartMicrovmError> { + use StartMicrovmError::{CreateUffdBuilder, NoMemFdReceived, UdsConnection, UffdSend}; + + let mut socket = UnixStream::connect(uds_socket_path).map_err(UdsConnection)?; + + let mut buf = [0u8; 8]; + let (_, memfd) = socket.recv_with_fd(&mut buf).map_err(UffdSend)?; + + if memfd.is_none() { + return Err(NoMemFdReceived); + } + + let mem_size = mem_size_mib << 20; + let arch_mem_regions = arch::arch_memory_regions(mem_size); + let backing_memory_file = Arc::new(memfd.unwrap()); + + let mut offset = 0_u64; + let guest_memory = vm_memory::create_guest_memory( + &arch_mem_regions + .iter() + .map(|(addr, size)| { + let file_offset = Some(FileOffset::from_arc(backing_memory_file.clone(), offset)); + offset += *size as u64; + + (file_offset, *addr, *size) + }) + .collect::>()[..], + track_dirty_pages, + ) + .map_err(StartMicrovmError::GuestMemoryMmap)?; + + let uffd = UffdBuilder::new() + .require_features( + FeatureFlags::EVENT_REMOVE + | FeatureFlags::EVENT_REMAP + | FeatureFlags::EVENT_FORK + | FeatureFlags::EVENT_UNMAP + | FeatureFlags::MISSING_SHMEM + | FeatureFlags::MINOR_SHMEM + | FeatureFlags::PAGEFAULT_FLAG_WP, + ) + .user_mode_only(false) + .non_blocking(true) + .create() + .map_err(CreateUffdBuilder)?; + + let mut backend_mappings = Vec::with_capacity(guest_memory.num_regions()); + let mut offset = 0; + for mem_region in guest_memory.iter() { + let host_base_addr = mem_region.as_ptr(); + let size = mem_region.size(); + + backend_mappings.push(GuestRegionUffdMapping { + base_host_virt_addr: host_base_addr as u64, + size, + offset, + }); + offset += size as u64; + } + + // This is safe to unwrap() because we control the contents of the vector + // (i.e GuestRegionUffdMapping entries). 
+ let backend_mappings = serde_json::to_string(&backend_mappings).unwrap(); + + socket + .send_with_fd( + backend_mappings.as_bytes(), + // In the happy case we can close the fd since the other process has it open and is + // using it to serve us pages. + // + // The problem is that if other process crashes/exits, firecracker guest memory + // will simply revert to anon-mem behavior which would lead to silent errors and + // undefined behavior. + // + // To tackle this scenario, the page fault handler can notify Firecracker of any + // crashes/exits. There is no need for Firecracker to explicitly send its process ID. + // The external process can obtain Firecracker's PID by calling `getsockopt` with + // `libc::SO_PEERCRED` option like so: + // + // let mut val = libc::ucred { pid: 0, gid: 0, uid: 0 }; + // let mut ucred_size: u32 = mem::size_of::() as u32; + // libc::getsockopt( + // socket.as_raw_fd(), + // libc::SOL_SOCKET, + // libc::SO_PEERCRED, + // &mut val as *mut _ as *mut _, + // &mut ucred_size as *mut libc::socklen_t, + // ); + // + // Per this linux man page: https://man7.org/linux/man-pages/man7/unix.7.html, + // `SO_PEERCRED` returns the credentials (PID, UID and GID) of the peer process + // connected to this socket. The returned credentials are those that were in effect + // at the time of the `connect` call. + // + // Moreover, Firecracker holds a copy of the UFFD fd as well, so that even if the + // page fault handler process does not tear down Firecracker when necessary, the + // uffd will still be alive but with no one to serve faults, leading to guest freeze. + uffd.as_raw_fd(), + ) + .map_err(UffdSend)?; + + // Wait for UFFD to be ready. + // TODO: maybe add a timeout? 
+ let mut buf = [0; 2]; + socket.read_exact(&mut buf).map_err(UdsConnection)?; + + Ok((guest_memory, uffd, backing_memory_file)) +} + fn load_kernel( boot_config: &BootConfig, guest_memory: &GuestMemoryMmap, @@ -1050,7 +1247,7 @@ pub mod tests { } pub(crate) fn default_vmm() -> Vmm { - let guest_memory = create_guest_memory(128, false).unwrap(); + let guest_memory = create_guest_memory(128, None, false).unwrap(); let vcpus_exit_evt = EventFd::new(libc::EFD_NONBLOCK) .map_err(VmmError::EventFd) @@ -1092,12 +1289,12 @@ pub mod tests { shutdown_exit_code: None, vm, guest_memory, - uffd: None, vcpus_handles: Vec::new(), vcpus_exit_evt, mmio_device_manager, #[cfg(target_arch = "x86_64")] pio_device_manager, + memory_descriptor: None, } } @@ -1303,13 +1500,13 @@ pub mod tests { // Case 1: create guest memory without dirty page tracking { - let guest_memory = create_guest_memory(mem_size, false).unwrap(); + let guest_memory = create_guest_memory(mem_size, None, false).unwrap(); assert!(!is_dirty_tracking_enabled(&guest_memory)); } // Case 2: create guest memory with dirty page tracking { - let guest_memory = create_guest_memory(mem_size, true).unwrap(); + let guest_memory = create_guest_memory(mem_size, None, true).unwrap(); assert!(is_dirty_tracking_enabled(&guest_memory)); } } @@ -1317,7 +1514,7 @@ pub mod tests { #[test] fn test_create_vcpus() { let vcpu_count = 2; - let guest_memory = create_guest_memory(128, false).unwrap(); + let guest_memory = create_guest_memory(128, None, false).unwrap(); #[allow(unused_mut)] let mut vm = Vm::new(vec![]).unwrap(); diff --git a/src/vmm/src/cpu_config/x86_64/cpuid/normalize.rs b/src/vmm/src/cpu_config/x86_64/cpuid/normalize.rs index cebd34f319e..0380d9e7e24 100644 --- a/src/vmm/src/cpu_config/x86_64/cpuid/normalize.rs +++ b/src/vmm/src/cpu_config/x86_64/cpuid/normalize.rs @@ -23,6 +23,8 @@ pub enum NormalizeCpuidError { ExtendedCacheFeatures(#[from] ExtendedCacheFeaturesError), /// Failed to set vendor ID in leaf 0x0: {0} 
VendorId(#[from] VendorIdError), + /// Failed to disable async pf + DisableAsyncPf, } /// Error type for setting leaf 0 section. @@ -186,6 +188,7 @@ impl super::Cpuid { self.update_feature_info_entry(cpu_index, cpu_count)?; self.update_extended_topology_entry(cpu_index, cpu_count, cpu_bits, cpus_per_core)?; self.update_extended_cache_features()?; + // self.disable_kvm_feature_async_pf()?; // Apply manufacturer specific modifications. match self { @@ -300,6 +303,25 @@ impl super::Cpuid { Ok(()) } + /// Disable async pf, as it hangs the VM from time to time + /// + /// # Errors + /// + /// Returns `NormalizeCpuidError::DisableAsyncPf` if the leaf is not found + pub fn disable_kvm_feature_async_pf(&mut self) -> Result<(), NormalizeCpuidError> { + // KVM feature bits + #[cfg(target_arch = "x86_64")] + const KVM_FEATURE_ASYNC_PF_INT_BIT: u8 = 14; + + let leaf: &mut CpuidEntry = self + .get_mut(&CpuidKey::leaf(0x4000_0001)) + .ok_or(NormalizeCpuidError::DisableAsyncPf)?; + + set_bit(&mut leaf.result.eax, KVM_FEATURE_ASYNC_PF_INT_BIT, false); + + Ok(()) + } + /// Update extended topology entry fn update_extended_topology_entry( &mut self, diff --git a/src/vmm/src/devices/virtio/balloon/device.rs b/src/vmm/src/devices/virtio/balloon/device.rs index cc474c2a7db..7ebc917a6ae 100644 --- a/src/vmm/src/devices/virtio/balloon/device.rs +++ b/src/vmm/src/devices/virtio/balloon/device.rs @@ -17,17 +17,17 @@ use virtio_gen::virtio_blk::VIRTIO_F_VERSION_1; use super::super::{ActivateError, DeviceState, Queue, VirtioDevice, TYPE_BALLOON}; use super::util::{compact_page_frame_numbers, remove_range}; use super::{ - BALLOON_DEV_ID, BALLOON_NUM_QUEUES, BALLOON_QUEUE_SIZES, DEFLATE_INDEX, INFLATE_INDEX, - MAX_PAGES_IN_DESC, MAX_PAGE_COMPACT_BUFFER, MIB_TO_4K_PAGES, STATS_INDEX, - VIRTIO_BALLOON_F_DEFLATE_ON_OOM, VIRTIO_BALLOON_F_STATS_VQ, VIRTIO_BALLOON_PFN_SHIFT, - VIRTIO_BALLOON_S_AVAIL, VIRTIO_BALLOON_S_CACHES, VIRTIO_BALLOON_S_HTLB_PGALLOC, - VIRTIO_BALLOON_S_HTLB_PGFAIL, 
VIRTIO_BALLOON_S_MAJFLT, VIRTIO_BALLOON_S_MEMFREE, - VIRTIO_BALLOON_S_MEMTOT, VIRTIO_BALLOON_S_MINFLT, VIRTIO_BALLOON_S_SWAP_IN, - VIRTIO_BALLOON_S_SWAP_OUT, + BALLOON_DEV_ID, BALLOON_NUM_QUEUES, BALLOON_QUEUE_SIZES, DEFLATE_INDEX, + FREE_PAGE_REPORTING_INDEX, INFLATE_INDEX, MAX_PAGES_IN_DESC, MAX_PAGE_COMPACT_BUFFER, + MIB_TO_4K_PAGES, STATS_INDEX, VIRTIO_BALLOON_F_DEFLATE_ON_OOM, VIRTIO_BALLOON_F_REPORTING, + VIRTIO_BALLOON_F_STATS_VQ, VIRTIO_BALLOON_PFN_SHIFT, VIRTIO_BALLOON_S_AVAIL, + VIRTIO_BALLOON_S_CACHES, VIRTIO_BALLOON_S_HTLB_PGALLOC, VIRTIO_BALLOON_S_HTLB_PGFAIL, + VIRTIO_BALLOON_S_MAJFLT, VIRTIO_BALLOON_S_MEMFREE, VIRTIO_BALLOON_S_MEMTOT, + VIRTIO_BALLOON_S_MINFLT, VIRTIO_BALLOON_S_SWAP_IN, VIRTIO_BALLOON_S_SWAP_OUT, }; use crate::devices::virtio::balloon::BalloonError; use crate::devices::virtio::{IrqTrigger, IrqType}; -use crate::logger::{IncMetric, METRICS}; +use crate::logger::{self, IncMetric, METRICS}; const SIZE_OF_U32: usize = std::mem::size_of::(); const SIZE_OF_STAT: usize = std::mem::size_of::(); @@ -215,10 +215,15 @@ impl Balloon { avail_features |= 1u64 << VIRTIO_BALLOON_F_STATS_VQ; } + avail_features |= 1u64 << VIRTIO_BALLOON_F_REPORTING; + + logger::debug!("balloon: registering balloon device"); + let queue_evts = [ EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, + EventFd::new(libc::EFD_NONBLOCK).map_err(BalloonError::EventFd)?, ]; let mut queues: Vec = BALLOON_QUEUE_SIZES.iter().map(|&s| Queue::new(s)).collect(); @@ -279,6 +284,14 @@ impl Balloon { self.trigger_stats_update() } + pub(crate) fn process_free_page_report_event(&mut self) -> Result<(), BalloonError> { + logger::debug!("balloon: received free page report event"); + self.queue_evts[FREE_PAGE_REPORTING_INDEX] + .read() + .map_err(BalloonError::EventFd)?; + self.process_free_page_reporting_queue() + } + pub(crate) fn 
process_inflate(&mut self) -> Result<(), BalloonError> { // This is safe since we checked in the event handler that the device is activated. let mem = self.device_state.mem().unwrap(); @@ -430,6 +443,50 @@ impl Balloon { Ok(()) } + pub(crate) fn process_free_page_reporting_queue( + &mut self, + ) -> std::result::Result<(), BalloonError> { + logger::debug!("balloon: processing free page reporting queue"); + let mem = self.device_state.mem().unwrap(); + + let mut total_removed = 0; + let queue = &mut self.queues[FREE_PAGE_REPORTING_INDEX]; + let mut needs_interrupt = false; + + while let Some(head) = queue.pop(mem) { + let head_index = head.index; + let head_mem = head.mem; + + let mut last_desc = Some(head); + while let Some(desc) = last_desc { + total_removed += desc.len; + if let Err(err) = + remove_range(desc.mem, (desc.addr, desc.len as u64), self.restored) + { + error!("balloon: failed to remove range: {:?}", err); + }; + last_desc = desc.next_descriptor(); + } + + // Acknowledge the receipt of the descriptor. + queue + .add_used(head_mem, head_index, 0) + .map_err(BalloonError::Queue)?; + + logger::debug!("balloon: adding to the queue"); + + needs_interrupt = true; + } + + logger::debug!("balloon: total removed: {}MiB", total_removed >> 20); + + if needs_interrupt { + self.signal_used_queue()?; + } + + Ok(()) + } + pub(crate) fn signal_used_queue(&self) -> Result<(), BalloonError> { self.irq_trigger.trigger_irq(IrqType::Vring).map_err(|err| { METRICS.balloon.event_fails.inc(); @@ -441,6 +498,7 @@ impl Balloon { pub fn process_virtio_queues(&mut self) { let _ = self.process_inflate(); let _ = self.process_deflate_queue(); + let _ = self.process_free_page_reporting_queue(); } /// Provides the ID of this balloon device. 
@@ -612,8 +670,8 @@ impl VirtioDevice for Balloon { let end = start.and_then(|s| s.checked_add(data.len())); let Some(dst) = start .zip(end) - .and_then(|(start, end)| config_space_bytes.get_mut(start..end)) else - { + .and_then(|(start, end)| config_space_bytes.get_mut(start..end)) + else { error!("Failed to write config space"); return; }; diff --git a/src/vmm/src/devices/virtio/balloon/event_handler.rs b/src/vmm/src/devices/virtio/balloon/event_handler.rs index 869cf0eb657..e8be2dfaed6 100644 --- a/src/vmm/src/devices/virtio/balloon/event_handler.rs +++ b/src/vmm/src/devices/virtio/balloon/event_handler.rs @@ -9,7 +9,9 @@ use utils::epoll::EventSet; use crate::devices::report_balloon_event_fail; use crate::devices::virtio::balloon::device::Balloon; -use crate::devices::virtio::{VirtioDevice, DEFLATE_INDEX, INFLATE_INDEX, STATS_INDEX}; +use crate::devices::virtio::{ + VirtioDevice, DEFLATE_INDEX, FREE_PAGE_REPORTING_INDEX, INFLATE_INDEX, STATS_INDEX, +}; impl Balloon { fn register_runtime_events(&self, ops: &mut EventOps) { @@ -27,6 +29,15 @@ impl Balloon { error!("Failed to register stats timerfd event: {}", err); } } + if let Err(err) = ops.add(Events::new( + &self.queue_evts[FREE_PAGE_REPORTING_INDEX], + EventSet::IN, + )) { + error!( + "Failed to register free page reporting queue event: {}", + err + ); + } } fn register_activate_event(&self, ops: &mut EventOps) { @@ -65,6 +76,7 @@ impl MutEventSubscriber for Balloon { let virtq_inflate_ev_fd = self.queue_evts[INFLATE_INDEX].as_raw_fd(); let virtq_deflate_ev_fd = self.queue_evts[DEFLATE_INDEX].as_raw_fd(); let virtq_stats_ev_fd = self.queue_evts[STATS_INDEX].as_raw_fd(); + let free_page_report_ev_fd = self.queue_evts[FREE_PAGE_REPORTING_INDEX].as_raw_fd(); let stats_timer_fd = self.stats_timer.as_raw_fd(); let activate_fd = self.activate_evt.as_raw_fd(); @@ -82,6 +94,9 @@ impl MutEventSubscriber for Balloon { _ if source == stats_timer_fd => self .process_stats_timer_event() 
.unwrap_or_else(report_balloon_event_fail), + _ if source == free_page_report_ev_fd => self + .process_free_page_report_event() + .unwrap_or_else(report_balloon_event_fail), _ if activate_fd == source => self.process_activate_event(ops), _ => { warn!("Balloon: Spurious event received: {:?}", source); diff --git a/src/vmm/src/devices/virtio/balloon/mod.rs b/src/vmm/src/devices/virtio/balloon/mod.rs index 82202f276da..77e55a9ccd3 100644 --- a/src/vmm/src/devices/virtio/balloon/mod.rs +++ b/src/vmm/src/devices/virtio/balloon/mod.rs @@ -19,11 +19,11 @@ use crate::devices::virtio::FIRECRACKER_MAX_QUEUE_SIZE; pub const BALLOON_DEV_ID: &str = "balloon"; /// The size of the config space. pub const BALLOON_CONFIG_SPACE_SIZE: usize = 8; +pub const BALLOON_REPORTING_QUEUE_SIZE: u16 = 32; /// Number of virtio queues. -pub const BALLOON_NUM_QUEUES: usize = 3; -/// Virtio queue sizes, in number of descriptor chain heads. -// There are 3 queues for a virtio device (in this order): RX, TX, Event -pub const BALLOON_QUEUE_SIZES: [u16; BALLOON_NUM_QUEUES] = [ +pub const BALLOON_NUM_QUEUES: usize = 4; +pub const BALLOON_QUEUE_SIZES: &[u16] = &[ + FIRECRACKER_MAX_QUEUE_SIZE, FIRECRACKER_MAX_QUEUE_SIZE, FIRECRACKER_MAX_QUEUE_SIZE, FIRECRACKER_MAX_QUEUE_SIZE, @@ -43,10 +43,13 @@ pub const INFLATE_INDEX: usize = 0; pub const DEFLATE_INDEX: usize = 1; /// The index of the deflate queue from Balloon device queues/queues_evts vector. pub const STATS_INDEX: usize = 2; +// The index of the free page reporting from Balloon device queues/queues_evts vector. +pub const FREE_PAGE_REPORTING_INDEX: usize = 3; // The feature bitmap for virtio balloon. const VIRTIO_BALLOON_F_STATS_VQ: u32 = 1; // Enable statistics. const VIRTIO_BALLOON_F_DEFLATE_ON_OOM: u32 = 2; // Deflate balloon on OOM. +const VIRTIO_BALLOON_F_REPORTING: u32 = 5; // Page reporting virtqueue // The statistics tags. 
const VIRTIO_BALLOON_S_SWAP_IN: u16 = 0; diff --git a/src/vmm/src/devices/virtio/balloon/util.rs b/src/vmm/src/devices/virtio/balloon/util.rs index 490c9960af8..a62c5a1bd8a 100644 --- a/src/vmm/src/devices/virtio/balloon/util.rs +++ b/src/vmm/src/devices/virtio/balloon/util.rs @@ -67,7 +67,7 @@ pub(crate) fn compact_page_frame_numbers(v: &mut [u32]) -> Vec<(u32, u32)> { pub(crate) fn remove_range( guest_memory: &GuestMemoryMmap, range: (GuestAddress, u64), - restored: bool, + _restored: bool, ) -> Result<(), RemoveRegionError> { let (guest_address, range_len) = range; @@ -79,25 +79,26 @@ pub(crate) fn remove_range( .get_host_address(guest_address) .map_err(|_| RemoveRegionError::AddressTranslation)?; - // Mmap a new anonymous region over the present one in order to create a hole. - // This workaround is (only) needed after resuming from a snapshot because the guest memory - // is mmaped from file as private and there is no `madvise` flag that works for this case. - if restored { - // SAFETY: The address and length are known to be valid. - let ret = unsafe { - libc::mmap( - phys_address.cast(), - range_len as usize, - libc::PROT_READ | libc::PROT_WRITE, - libc::MAP_FIXED | libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, - -1, - 0, - ) - }; - if ret == libc::MAP_FAILED { - return Err(RemoveRegionError::MmapFail(io::Error::last_os_error())); - } - }; + // CodeSandbox: since we use UFFD handler, this is not needed for us. In fact, it breaks the UFFD handler + // if this happens right now, as it unregisters the UFFD handler for the given range. + // // Mmap a new anonymous region over the present one in order to create a hole. + // // This workaround is (only) needed after resuming from a snapshot because the guest memory + // // is mmaped from file as private and there is no `madvise` flag that works for this case. 
+ // if restored { + // let ret = unsafe { + // libc::mmap( + // phys_address as *mut _, + // range_len as usize, + // libc::PROT_READ | libc::PROT_WRITE, + // libc::MAP_FIXED | libc::MAP_ANONYMOUS | libc::MAP_PRIVATE, + // -1, + // 0, + // ) + // }; + // if ret == libc::MAP_FAILED { + // return Err(RemoveRegionError::MmapFail(io::Error::last_os_error())); + // } + // }; // Madvise the region in order to mark it as not used. // SAFETY: The address and length are known to be valid. diff --git a/src/vmm/src/lib.rs b/src/vmm/src/lib.rs index cc7ffee1351..eb0a7616c62 100644 --- a/src/vmm/src/lib.rs +++ b/src/vmm/src/lib.rs @@ -115,9 +115,9 @@ use std::sync::{Arc, Barrier, Mutex}; use std::time::Duration; use event_manager::{EventManager as BaseEventManager, EventOps, Events, MutEventSubscriber}; +use persist::MemoryDescriptor; use seccompiler::BpfProgram; use snapshot::Persist; -use userfaultfd::Uffd; use utils::epoll::EventSet; use utils::eventfd::EventFd; use utils::terminal::Terminal; @@ -310,10 +310,6 @@ pub struct Vmm { // Guest VM core resources. vm: Vm, guest_memory: GuestMemoryMmap, - // Save UFFD in order to keep it open in the Firecracker process, as well. - // Since this field is never read again, we need to allow `dead_code`. - #[allow(dead_code)] - uffd: Option, vcpus_handles: Vec, // Used by Vcpus and devices to initiate teardown; Vmm should never write here. vcpus_exit_evt: EventFd, @@ -322,6 +318,11 @@ pub struct Vmm { mmio_device_manager: MMIODeviceManager, #[cfg(target_arch = "x86_64")] pio_device_manager: PortIODeviceManager, + + // The mem file that should be mmaped. 
We need to keep a reference of the UFFD in the + // process so we allow dead_code + #[allow(dead_code)] + memory_descriptor: Option, } impl Vmm { diff --git a/src/vmm/src/logger/metrics.rs b/src/vmm/src/logger/metrics.rs index eb4bb8522ec..134782d42db 100644 --- a/src/vmm/src/logger/metrics.rs +++ b/src/vmm/src/logger/metrics.rs @@ -414,6 +414,10 @@ pub struct PutRequestsMetrics { pub machine_cfg_count: SharedIncMetric, /// Number of failures in configuring the machine. pub machine_cfg_fails: SharedIncMetric, + /// Number of PUTs for setting memory backing file. + pub memory_backend_cfg_count: SharedIncMetric, + /// Number of failures in configuring the machine. + pub memory_backend_cfg_fails: SharedIncMetric, /// Number of PUTs for configuring a guest's vCPUs. pub cpu_cfg_count: SharedIncMetric, /// Number of failures in configuring a guest's vCPUs. @@ -449,6 +453,8 @@ impl PutRequestsMetrics { logger_fails: SharedIncMetric::new(), machine_cfg_count: SharedIncMetric::new(), machine_cfg_fails: SharedIncMetric::new(), + memory_backend_cfg_count: SharedIncMetric::new(), + memory_backend_cfg_fails: SharedIncMetric::new(), cpu_cfg_count: SharedIncMetric::new(), cpu_cfg_fails: SharedIncMetric::new(), metrics_count: SharedIncMetric::new(), diff --git a/src/vmm/src/memory_snapshot.rs b/src/vmm/src/memory_snapshot.rs index 28e7f5d4590..4ca6ba4281a 100644 --- a/src/vmm/src/memory_snapshot.rs +++ b/src/vmm/src/memory_snapshot.rs @@ -5,7 +5,9 @@ use std::fs::File; use std::io::SeekFrom; +use std::time::Instant; +use libc::{MAP_SHARED, PROT_WRITE}; use utils::vm_memory::{ Bitmap, FileOffset, GuestAddress, GuestMemory, GuestMemoryError, GuestMemoryMmap, GuestMemoryRegion, MemoryRegionAddress, WriteVolatile, @@ -110,6 +112,9 @@ impl SnapshotMemory for GuestMemoryMmap { let mut writer_offset = 0; let page_size = get_page_size()?; + let start = Instant::now(); + let mut total_written = 0; + self.iter() .enumerate() .try_for_each(|(slot, region)| { @@ -141,7 +146,6 @@ impl 
SnapshotMemory for GuestMemoryMmap { write_size, )?, )?; - write_size = 0; } } @@ -151,6 +155,12 @@ impl SnapshotMemory for GuestMemoryMmap { writer.write_all_volatile( ®ion.get_slice(MemoryRegionAddress(dirty_batch_start), write_size)?, )?; + total_written += write_size; + eprintln!( + "writing {}B took {}ms", + write_size, + start.elapsed().as_millis() + ); } writer_offset += region.len(); if let Some(bitmap) = firecracker_bitmap { @@ -184,6 +194,117 @@ impl SnapshotMemory for GuestMemoryMmap { } } +/// Dumps all pages of GuestMemoryMmap present in `dirty_bitmap` to a writer. +pub fn mem_dump_dirty( + mem_map: &GuestMemoryMmap, + fd: i32, + len: usize, + dirty_bitmap: &DirtyBitmap, +) -> std::result::Result<(), SnapshotMemoryError> { + let mut writer_offset = 0_u64; + let page_size = get_page_size()?; + + let start = Instant::now(); + let mut total_written = 0; + + let source_map = + unsafe { libc::mmap(std::ptr::null_mut(), len, PROT_WRITE, MAP_SHARED, fd, 0) }; + + let res = mem_map + .iter() + .enumerate() + .try_for_each(|(slot, region)| { + let kvm_bitmap = dirty_bitmap.get(&slot).unwrap(); + let firecracker_bitmap = region.bitmap(); + let mut write_size = 0; + let mut dirty_batch_start: u64 = 0; + + let mmap_base = region.get_host_address(MemoryRegionAddress(0)).unwrap(); + for (i, v) in kvm_bitmap.iter().enumerate() { + for j in 0..64 { + let is_kvm_page_dirty = ((v >> j) & 1u64) != 0u64; + let page_offset = ((i * 64) + j) * page_size; + let is_firecracker_page_dirty = firecracker_bitmap.dirty_at(page_offset); + if is_kvm_page_dirty || is_firecracker_page_dirty { + // We are at the start of a new batch of dirty pages. + if write_size == 0 { + // Seek forward over the unmodified pages. 
+ dirty_batch_start = page_offset as u64; + } + write_size += page_size; + } else if write_size > 0 { + let start = Instant::now(); + + eprintln!( + "starting write of {}B (source {}, dest {})", + write_size, + dirty_batch_start, + writer_offset + dirty_batch_start + ); + unsafe { + std::ptr::copy_nonoverlapping( + mmap_base.offset((dirty_batch_start) as isize), + source_map.offset((writer_offset + dirty_batch_start) as isize) + as *mut u8, + write_size, + ); + } + + eprintln!( + "writing {}B took {}ms", + write_size, + start.elapsed().as_millis() + ); + total_written += write_size; + write_size = 0; + } + } + } + + if write_size > 0 { + let start = Instant::now(); + + eprintln!( + "starting final write of {}B (source {}, dest {}) (total_size: {})", + write_size, + dirty_batch_start, + writer_offset + dirty_batch_start, + len + ); + unsafe { + std::ptr::copy_nonoverlapping( + mmap_base.offset((dirty_batch_start) as isize), + source_map.offset((writer_offset + dirty_batch_start) as isize) as *mut u8, + write_size, + ); + } + total_written += write_size; + eprintln!( + "writing {}B took {}ms", + write_size, + start.elapsed().as_millis() + ); + } + writer_offset += region.len(); + if let Some(bitmap) = firecracker_bitmap { + bitmap.reset(); + } + + Ok(()) + }) + .map_err(SnapshotMemoryError::WriteMemory); + + eprintln!( + "total write time: {}ms, total written: {}B", + start.elapsed().as_millis(), + total_written + ); + + eprintln!("memfd {}, len {}", fd, len); + + res +} + #[cfg(test)] mod tests { use std::collections::HashMap; diff --git a/src/vmm/src/mmds/mod.rs b/src/vmm/src/mmds/mod.rs index 4f9e3f68a3d..8bd1565361d 100644 --- a/src/vmm/src/mmds/mod.rs +++ b/src/vmm/src/mmds/mod.rs @@ -696,7 +696,7 @@ mod tests { assert_eq!( Error::NoTtlProvided.to_string(), "Token time to live value not found. Use `X-metadata-token-ttl-seconds` header to \ - specify the token's lifetime." + specify the token's lifetime." 
); assert_eq!( diff --git a/src/vmm/src/persist.rs b/src/vmm/src/persist.rs index 7907bb5d7e6..d248f6843fc 100644 --- a/src/vmm/src/persist.rs +++ b/src/vmm/src/persist.rs @@ -5,7 +5,7 @@ use std::fmt::Debug; use std::fs::{File, OpenOptions}; -use std::io::{self, Write}; +use std::io::{self, Read}; use std::os::unix::io::AsRawFd; use std::os::unix::net::UnixStream; use std::path::Path; @@ -33,7 +33,9 @@ use crate::cpu_config::x86_64::cpuid::common::get_vendor_id_from_host; use crate::cpu_config::x86_64::cpuid::CpuidTrait; use crate::device_manager::persist::{DevicePersistError, DeviceStates}; use crate::devices::virtio::TYPE_NET; -use crate::memory_snapshot::{GuestMemoryState, SnapshotMemory}; +use crate::memory_snapshot::{ + mem_dump_dirty, GuestMemoryState, SnapshotMemory, SnapshotMemoryError, +}; use crate::resources::VmResources; #[cfg(target_arch = "x86_64")] use crate::version_map::FC_V0_23_SNAP_VERSION; @@ -231,7 +233,9 @@ pub fn create_snapshot( version_map, )?; - snapshot_memory_to_file(vmm, ¶ms.mem_file_path, ¶ms.snapshot_type)?; + if params.snapshot_type == SnapshotType::Full { + snapshot_memory_to_file(vmm, ¶ms.mem_file_path, ¶ms.snapshot_type)?; + } Ok(()) } @@ -254,12 +258,14 @@ fn snapshot_state_to_file( snapshot .save(&mut snapshot_file, microvm_state) .map_err(SerializeMicrovmState)?; - snapshot_file - .flush() - .map_err(|err| SnapshotBackingFile("flush", err))?; - snapshot_file - .sync_all() - .map_err(|err| SnapshotBackingFile("sync_all", err)) + // Disable the following lines as we're seeing some performance issues with btrfs on these operations + // snapshot_file + // .flush() + // .map_err(|err| SnapshotBackingFile("flush", err))?; + // snapshot_file + // .sync_all() + // .map_err(|err| SnapshotBackingFile("sync_all", err)) + Ok(()) } fn snapshot_memory_to_file( @@ -268,6 +274,7 @@ fn snapshot_memory_to_file( snapshot_type: &SnapshotType, ) -> Result<(), CreateSnapshotError> { use self::CreateSnapshotError::*; + let mut file = 
OpenOptions::new() .write(true) .create(true) @@ -283,16 +290,19 @@ match snapshot_type { SnapshotType::Diff => { let dirty_bitmap = vmm.get_dirty_bitmap().map_err(DirtyBitmap)?; - vmm.guest_memory() - .dump_dirty(&mut file, &dirty_bitmap) - .map_err(Memory) + + mem_dump_dirty( + vmm.guest_memory(), + file.as_raw_fd(), + (mem_size_mib * 1024 * 1024) as usize, + &dirty_bitmap, + ) + .map_err(Memory) } SnapshotType::Full => vmm.guest_memory().dump(&mut file).map_err(Memory), }?; - file.flush() - .map_err(|err| MemoryBackingFile("flush", err))?; - file.sync_all() - .map_err(|err| MemoryBackingFile("sync_all", err)) + + Ok(()) } /// Validate the microVM version and translate it to its corresponding snapshot data format. @@ -458,6 +468,16 @@ pub fn snapshot_state_sanity_check( Ok(()) } +/// Describes a descriptor that connects to the memory used by the VM. This could either be a file descriptor +/// or a UFFD descriptor. +#[derive(Debug)] +pub enum MemoryDescriptor { + /// A file descriptor that connects to the user fault process. + Uffd(Uffd), + /// A file descriptor of the backing memory file. + File(Arc), +} + /// Error type for [`restore_from_snapshot`].
#[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum RestoreFromSnapshotError { @@ -497,29 +517,27 @@ pub fn restore_from_snapshot( let mem_backend_path = ¶ms.mem_backend.backend_path; let mem_state = µvm_state.memory_state; let track_dirty_pages = params.enable_diff_snapshots; + let (guest_memory, memory_descriptor) = match params.mem_backend.backend_type { + MemBackendType::File => { + let (guest_memory, file) = + guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages) + .map_err(RestoreFromSnapshotGuestMemoryError::File)?; + (guest_memory, Some(MemoryDescriptor::File(Arc::new(file)))) + } + MemBackendType::Uffd => { + let (guest_memory, uffd) = + guest_memory_from_uffd(mem_backend_path, mem_state, track_dirty_pages) + .map_err(RestoreFromSnapshotGuestMemoryError::Uffd)?; - let (guest_memory, uffd) = match params.mem_backend.backend_type { - MemBackendType::File => ( - guest_memory_from_file(mem_backend_path, mem_state, track_dirty_pages) - .map_err(RestoreFromSnapshotGuestMemoryError::File)?, - None, - ), - MemBackendType::Uffd => guest_memory_from_uffd( - mem_backend_path, - mem_state, - track_dirty_pages, - // We enable the UFFD_FEATURE_EVENT_REMOVE feature only if a balloon device - // is present in the microVM state. 
- microvm_state.device_states.balloon_device.is_some(), - ) - .map_err(RestoreFromSnapshotGuestMemoryError::Uffd)?, + (guest_memory, uffd.map(MemoryDescriptor::Uffd)) + } }; builder::build_microvm_from_snapshot( instance_info, event_manager, microvm_state, guest_memory, - uffd, + memory_descriptor, track_dirty_pages, seccomp_filters, vm_resources, @@ -560,51 +578,77 @@ pub enum GuestMemoryFromFileError { Restore(#[from] crate::memory_snapshot::SnapshotMemoryError), } -fn guest_memory_from_file( - mem_file_path: &Path, - mem_state: &GuestMemoryState, - track_dirty_pages: bool, -) -> Result { - let mem_file = File::open(mem_file_path)?; - let guest_mem = GuestMemoryMmap::restore(Some(&mem_file), mem_state, track_dirty_pages)?; - Ok(guest_mem) -} - -/// Error type for [`guest_memory_from_uffd`] +/// Error type for [`guest_memory_from_uffd`]. #[derive(Debug, thiserror::Error, displaydoc::Display)] pub enum GuestMemoryFromUffdError { + /// Failed to load guest memory: {0} + File(#[from] std::io::Error), /// Failed to restore guest memory: {0} Restore(#[from] crate::memory_snapshot::SnapshotMemoryError), - /// Failed to UFFD object: {0} + /// Failed to connect to UFFD Handler: {0} + UdsConnection(std::io::Error), + /// Failed to receive memfd: {0} + ReceiveMemfdError(kvm_ioctls::Error), + /// No memfd received + NoMemFdReceived, + /// Failed to deserialize memory: {0} + DeserializeMemory(SnapshotMemoryError), + /// Failed to create UFFD: {0} Create(userfaultfd::Error), - /// Failed to register memory address range with the userfaultfd object: {0} - Register(userfaultfd::Error), - /// Failed to connect to UDS Unix stream: {0} - Connect(#[from] std::io::Error), - /// Failed to sends file descriptor: {0} - Send(#[from] utils::errno::Error), + /// Failed to send UFFD: {0} + UffdSend(kvm_ioctls::Error), +} + +fn guest_memory_from_file( - mem_file_path: &Path, - mem_state: &GuestMemoryState, - track_dirty_pages: bool, -) -> std::result::Result<(GuestMemoryMmap, File),
GuestMemoryFromFileError> { + let mem_file = OpenOptions::new() + .write(true) + .read(true) + .open(mem_file_path)?; + + Ok(( + GuestMemoryMmap::restore(Some(&mem_file), mem_state, track_dirty_pages)?, + mem_file, + )) } -fn guest_memory_from_uffd( +pub(crate) fn guest_memory_from_uffd( mem_uds_path: &Path, mem_state: &GuestMemoryState, track_dirty_pages: bool, - enable_balloon: bool, -) -> Result<(GuestMemoryMmap, Option), GuestMemoryFromUffdError> { - let guest_memory = GuestMemoryMmap::restore(None, mem_state, track_dirty_pages)?; +) -> std::result::Result<(GuestMemoryMmap, Option), GuestMemoryFromUffdError> { + let mut socket = + UnixStream::connect(mem_uds_path).map_err(GuestMemoryFromUffdError::UdsConnection)?; - let mut uffd_builder = UffdBuilder::new(); + let mut buf = [0u8; 8]; + let (_, memfd) = socket + .recv_with_fd(&mut buf) + .map_err(GuestMemoryFromUffdError::ReceiveMemfdError)?; - if enable_balloon { - // We enable this so that the page fault handler can add logic - // for treating madvise(MADV_DONTNEED) events triggerd by balloon inflation. 
- uffd_builder.require_features(FeatureFlags::EVENT_REMOVE); + if memfd.is_none() { + return Err(GuestMemoryFromUffdError::NoMemFdReceived); } - let uffd = uffd_builder - .close_on_exec(true) - .non_blocking(true) + let memfd = memfd.unwrap(); + + let guest_memory = GuestMemoryMmap::restore(Some(&memfd), mem_state, track_dirty_pages) + .map_err(GuestMemoryFromUffdError::DeserializeMemory)?; + + let uffd = UffdBuilder::new() + .require_features( + FeatureFlags::EVENT_REMOVE + | FeatureFlags::EVENT_REMAP + | FeatureFlags::EVENT_FORK + | FeatureFlags::EVENT_UNMAP + | FeatureFlags::MISSING_SHMEM + | FeatureFlags::MINOR_SHMEM + | FeatureFlags::PAGEFAULT_FLAG_WP, + ) .user_mode_only(false) + .non_blocking(true) .create() .map_err(GuestMemoryFromUffdError::Create)?; @@ -613,8 +657,6 @@ fn guest_memory_from_uffd( let host_base_addr = mem_region.as_ptr(); let size = mem_region.size(); - uffd.register(host_base_addr.cast(), size as _) - .map_err(GuestMemoryFromUffdError::Register)?; backend_mappings.push(GuestRegionUffdMapping { base_host_virt_addr: host_base_addr as u64, size, @@ -626,41 +668,49 @@ fn guest_memory_from_uffd( // (i.e GuestRegionUffdMapping entries). let backend_mappings = serde_json::to_string(&backend_mappings).unwrap(); - let socket = UnixStream::connect(mem_uds_path)?; - socket.send_with_fd( - backend_mappings.as_bytes(), - // In the happy case we can close the fd since the other process has it open and is - // using it to serve us pages. - // - // The problem is that if other process crashes/exits, firecracker guest memory - // will simply revert to anon-mem behavior which would lead to silent errors and - // undefined behavior. - // - // To tackle this scenario, the page fault handler can notify Firecracker of any - // crashes/exits. There is no need for Firecracker to explicitly send its process ID. 
- // The external process can obtain Firecracker's PID by calling `getsockopt` with - // `libc::SO_PEERCRED` option like so: - // - // let mut val = libc::ucred { pid: 0, gid: 0, uid: 0 }; - // let mut ucred_size: u32 = mem::size_of::() as u32; - // libc::getsockopt( - // socket.as_raw_fd(), - // libc::SOL_SOCKET, - // libc::SO_PEERCRED, - // &mut val as *mut _ as *mut _, - // &mut ucred_size as *mut libc::socklen_t, - // ); - // - // Per this linux man page: https://man7.org/linux/man-pages/man7/unix.7.html, - // `SO_PEERCRED` returns the credentials (PID, UID and GID) of the peer process - // connected to this socket. The returned credentials are those that were in effect - // at the time of the `connect` call. - // - // Moreover, Firecracker holds a copy of the UFFD fd as well, so that even if the - // page fault handler process does not tear down Firecracker when necessary, the - // uffd will still be alive but with no one to serve faults, leading to guest freeze. - uffd.as_raw_fd(), - )?; + socket + .send_with_fd( + backend_mappings.as_bytes(), + // In the happy case we can close the fd since the other process has it open and is + // using it to serve us pages. + // + // The problem is that if other process crashes/exits, firecracker guest memory + // will simply revert to anon-mem behavior which would lead to silent errors and + // undefined behavior. + // + // To tackle this scenario, the page fault handler can notify Firecracker of any + // crashes/exits. There is no need for Firecracker to explicitly send its process ID. 
+ // The external process can obtain Firecracker's PID by calling `getsockopt` with + // `libc::SO_PEERCRED` option like so: + // + // let mut val = libc::ucred { pid: 0, gid: 0, uid: 0 }; + // let mut ucred_size: u32 = mem::size_of::() as u32; + // libc::getsockopt( + // socket.as_raw_fd(), + // libc::SOL_SOCKET, + // libc::SO_PEERCRED, + // &mut val as *mut _ as *mut _, + // &mut ucred_size as *mut libc::socklen_t, + // ); + // + // Per this linux man page: https://man7.org/linux/man-pages/man7/unix.7.html, + // `SO_PEERCRED` returns the credentials (PID, UID and GID) of the peer process + // connected to this socket. The returned credentials are those that were in effect + // at the time of the `connect` call. + // + // Moreover, Firecracker holds a copy of the UFFD fd as well, so that even if the + // page fault handler process does not tear down Firecracker when necessary, the + // uffd will still be alive but with no one to serve faults, leading to guest freeze. + uffd.as_raw_fd(), + ) + .map_err(GuestMemoryFromUffdError::UffdSend)?; + + // Wait for UFFD to be ready. + // TODO: maybe add a timeout? + let mut buf = [0; 2]; + socket + .read_exact(&mut buf) + .map_err(GuestMemoryFromUffdError::UdsConnection)?; Ok((guest_memory, Some(uffd))) } diff --git a/src/vmm/src/resources.rs b/src/vmm/src/resources.rs index e4ed88a504c..c6c9fbcbf6e 100644 --- a/src/vmm/src/resources.rs +++ b/src/vmm/src/resources.rs @@ -27,6 +27,7 @@ use crate::vmm_config::machine_config::{ use crate::vmm_config::metrics::{init_metrics, MetricsConfig, MetricsConfigError}; use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::*; +use crate::vmm_config::snapshot::MemBackendConfig; use crate::vmm_config::vsock::*; /// Errors encountered when configuring microVM resources. @@ -113,6 +114,8 @@ pub struct VmResources { pub mmds_size_limit: usize, /// Whether or not to load boot timer device. 
pub boot_timer: bool, + /// When backed by a memory on boot, this should be set + pub memory_backend: Option, } impl VmResources { @@ -235,6 +238,16 @@ impl VmResources { self.vm_config.track_dirty_pages = dirty_page_tracking; } + /// Returns the config for the backing memory file + pub fn memory_backend(&self) -> Option { + self.memory_backend.clone() + } + + /// Sets the backing memory file + pub fn set_memory_backend(&mut self, backing_mem_file: MemBackendConfig) { + self.memory_backend.get_or_insert(backing_mem_file); + } + /// Add a custom CPU template to the VM resources /// to configure vCPUs. pub fn set_custom_cpu_template(&mut self, cpu_template: CustomCpuTemplate) { @@ -562,6 +575,7 @@ mod tests { mmds: None, boot_timer: false, mmds_size_limit: HTTP_MAX_PAYLOAD_SIZE, + memory_backend: None, entropy: Default::default(), } } diff --git a/src/vmm/src/rpc_interface.rs b/src/vmm/src/rpc_interface.rs index 5054cd85609..3c81de42a04 100644 --- a/src/vmm/src/rpc_interface.rs +++ b/src/vmm/src/rpc_interface.rs @@ -41,7 +41,9 @@ use crate::vmm_config::mmds::{MmdsConfig, MmdsConfigError}; use crate::vmm_config::net::{ NetworkInterfaceConfig, NetworkInterfaceError, NetworkInterfaceUpdateConfig, }; -use crate::vmm_config::snapshot::{CreateSnapshotParams, LoadSnapshotParams, SnapshotType}; +use crate::vmm_config::snapshot::{ + CreateSnapshotParams, LoadSnapshotParams, MemBackendConfig, SnapshotType, +}; use crate::vmm_config::vsock::{VsockConfigError, VsockDeviceConfig}; use crate::vmm_config::{self, RateLimiterUpdate}; use crate::{EventManager, FcExitCode}; @@ -103,6 +105,9 @@ pub enum VmmAction { /// `BalloonDeviceConfig` as input. This action can only be called before the microVM /// has booted. SetBalloonDevice(BalloonDeviceConfig), + /// Set the memory backend for the VM. The VM will use this backend to handle its + /// memory. This action can only be called before the microVM has booted. + SetMemoryBackend(MemBackendConfig), /// Set the MMDS configuration. 
SetMmdsConfiguration(MmdsConfig), /// Set the vsock device or update the one that already exists using the @@ -434,6 +439,7 @@ impl<'a> PrebootApiController<'a> { SetBalloonDevice(config) => self.set_balloon_device(config), SetVsockDevice(config) => self.set_vsock_device(config), SetMmdsConfiguration(config) => self.set_mmds_config(config), + SetMemoryBackend(config) => self.set_memory_backend(config), StartMicroVm => self.start_microvm(), UpdateVmConfiguration(config) => self.update_vm_config(config), SetEntropyDevice(config) => self.set_entropy_device(config), @@ -460,6 +466,13 @@ impl<'a> PrebootApiController<'a> { .map_err(VmmActionError::BalloonConfig) } + fn set_memory_backend(&mut self, cfg: MemBackendConfig) -> Result { + self.boot_path = true; + self.vm_resources.memory_backend = Some(cfg); + + Ok(VmmData::Empty) + } + fn insert_block_device(&mut self, cfg: BlockDeviceConfig) -> Result { self.boot_path = true; self.vm_resources @@ -690,6 +703,7 @@ impl RuntimeApiController { | LoadSnapshot(_) | PutCpuConfiguration(_) | SetBalloonDevice(_) + | SetMemoryBackend(_) | SetVsockDevice(_) | SetMmdsConfiguration(_) | SetEntropyDevice(_) @@ -760,14 +774,14 @@ impl RuntimeApiController { ) -> Result { log_dev_preview_warning("Virtual machine snapshots", None); - if create_params.snapshot_type == SnapshotType::Diff - && !self.vm_resources.track_dirty_pages() - { - return Err(VmmActionError::NotSupported( - "Diff snapshots are not allowed on uVMs with dirty page tracking disabled." - .to_string(), - )); - } + // if create_params.snapshot_type == SnapshotType::Diff + // && !self.vm_resources.track_dirty_pages() + // { + // return Err(VmmActionError::NotSupported( + // "Diff snapshots are not allowed on uVMs with dirty page tracking disabled." 
+ // .to_string(), + // )); + // } let mut locked_vmm = self.vmm.lock().unwrap(); let vm_info = VmInfo::from(&self.vm_resources); @@ -920,6 +934,7 @@ mod tests { pub boot_timer: bool, // when `true`, all self methods are forced to fail pub force_errors: bool, + pub memory_backend: Option, } impl MockVmRes { diff --git a/src/vmm/src/vmm_config/snapshot.rs b/src/vmm/src/vmm_config/snapshot.rs index 28b111b9f7d..80f4c0f4fcf 100644 --- a/src/vmm/src/vmm_config/snapshot.rs +++ b/src/vmm/src/vmm_config/snapshot.rs @@ -25,7 +25,7 @@ pub enum SnapshotType { /// 1) A file that contains the guest memory to be loaded, /// 2) An UDS where a custom page-fault handler process is listening for the UFFD set up by /// Firecracker to handle its guest memory page faults. -#[derive(Debug, PartialEq, Eq, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Deserialize)] pub enum MemBackendType { /// Guest memory contents will be loaded from a file. File, @@ -88,7 +88,7 @@ pub struct LoadSnapshotConfig { } /// Stores the configuration used for managing snapshot memory. -#[derive(Debug, PartialEq, Eq, Deserialize)] +#[derive(Clone, Debug, PartialEq, Eq, Deserialize)] #[serde(deny_unknown_fields)] pub struct MemBackendConfig { /// Path to the backend used to handle the guest memory. diff --git a/tools/devtool b/tools/devtool index 9c724a346bf..b4389679b9d 100755 --- a/tools/devtool +++ b/tools/devtool @@ -310,6 +310,7 @@ run_devctr() { --volume /dev:/dev \ --volume "$FC_ROOT_DIR:$CTR_FC_ROOT_DIR:z" \ --tmpfs /srv:exec,dev,size=32G \ + --mount type=bind,source=/usr/include/linux/userfaultfd.h,target=/usr/include/linux/userfaultfd.h \ -v /boot:/boot \ --env PYTHONDONTWRITEBYTECODE=1 \ "$DEVCTR_IMAGE" "${ctr_args[@]}"