diff --git a/.pipelines/templates/stages/testing_rollback/testing-template.yml b/.pipelines/templates/stages/testing_rollback/testing-template.yml index ba4e552d9..e58d3b059 100644 --- a/.pipelines/templates/stages/testing_rollback/testing-template.yml +++ b/.pipelines/templates/stages/testing_rollback/testing-template.yml @@ -43,7 +43,7 @@ parameters: - name: skipManualRollbackTesting displayName: "Skip manual rollback testing" type: string - default: "true" + default: "false" jobs: - job: RollbackTesting_${{ replace(parameters.flavor, '-', '_') }} @@ -139,7 +139,7 @@ jobs: SKIP_FLAGS="" if [ "${{ parameters.skipManualRollbackTesting }}" == "true" ]; then - # TODO: enable manual rollback testing when it is implemented + # skip enable manual rollback testing SKIP_FLAGS="$SKIP_FLAGS --skip-manual-rollbacks" fi diff --git a/crates/osutils/src/efivar.rs b/crates/osutils/src/efivar.rs index 2c44976ed..8a4a539f9 100644 --- a/crates/osutils/src/efivar.rs +++ b/crates/osutils/src/efivar.rs @@ -15,6 +15,7 @@ const SECURE_BOOT: &str = "SecureBoot"; const LOADER_ENTRY_ONESHOT: &str = "LoaderEntryOneShot"; const LOADER_ENTRY_DEFAULT: &str = "LoaderEntryDefault"; pub const LOADER_ENTRY_SELECTED: &str = "LoaderEntrySelected"; +const LOADER_ENTRIES_DEFAULT: &str = "LoaderEntries"; /// Converts a UTF‑8 Rust string to a UTF-16LE byte array. pub fn encode_utf16le(data: &str) -> Vec { @@ -42,6 +43,52 @@ fn decode_utf16le(mut data: &[u8]) -> String { String::from_utf16_lossy(&utf16_data) } +/// Converts a UTF-16LE byte array to a UTF‑8 Rust string. 
/// Decodes a UTF-16LE buffer that holds a list of NUL-terminated strings.
///
/// The bytes are read as little-endian 16-bit code units and split on every
/// NUL (`0x0000`) unit. Empty segments (leading, consecutive or trailing NULs,
/// such as an extra list terminator) are skipped. Invalid UTF-16 is replaced
/// with U+FFFD, a dangling odd byte at the end of `data` is ignored, and a
/// final entry that lacks its NUL terminator is still returned instead of
/// being dropped.
fn decode_utf16le_to_strings(data: &[u8]) -> Vec<String> {
    // `chunks_exact` guarantees two bytes per chunk, so indexing cannot panic.
    let units: Vec<u16> = data
        .chunks_exact(2)
        .map(|pair| u16::from_le_bytes([pair[0], pair[1]]))
        .collect();

    units
        .split(|&unit| unit == 0)
        .filter(|entry| !entry.is_empty())
        .map(String::from_utf16_lossy)
        .collect()
}
"0100" for BootNext=0001 (little-endian) @@ -149,6 +196,32 @@ pub fn set_default_to_current() -> Result<(), TridentError> { ) } +/// Sets the LoaderEntryDefault EFI variable to the previous boot entry +pub fn set_default_to_previous() -> Result<(), TridentError> { + let current = read_efi_variable(BOOTLOADER_INTERFACE_GUID, LOADER_ENTRY_SELECTED)?; + let current_decoded = decode_utf16le(¤t); + let boot_entries = read_efi_variable(BOOTLOADER_INTERFACE_GUID, LOADER_ENTRIES_DEFAULT)?; + let boot_entries_decoded = decode_utf16le_to_strings(&boot_entries); + if boot_entries_decoded.len() < 2 { + return Err(TridentError::new(ServicingError::SetEfiVariable { + name: LOADER_ENTRIES_DEFAULT.to_string(), + })) + .message("Not enough boot entries to determine previous entry"); + } + if boot_entries_decoded[0] != current_decoded { + return Err(TridentError::new(ServicingError::SetEfiVariable { + name: LOADER_ENTRIES_DEFAULT.to_string(), + })) + .message("Current boot entry does not match first entry in boot entries list"); + } + let previous = &boot_entries_decoded[1]; + + set_efi_variable( + &format!("{BOOTLOADER_INTERFACE_GUID}-{LOADER_ENTRY_DEFAULT}"), + &encode_utf16le(previous), + ) +} + #[cfg(test)] mod tests { use super::*; diff --git a/crates/trident/src/cli.rs b/crates/trident/src/cli.rs index 1bbbe4c88..04229279f 100644 --- a/crates/trident/src/cli.rs +++ b/crates/trident/src/cli.rs @@ -39,6 +39,14 @@ pub fn to_operations(allowed_operations: &[AllowedOperation]) -> Operations { ops } +/// The operations that Trident is allowed to perform +#[derive(clap::ValueEnum, Copy, Clone, Debug, Eq, PartialEq)] +pub enum RollbackShowOperation { + Validation, + Target, + Chain, +} + #[derive(Subcommand, Debug)] pub enum Commands { /// Initiate an install of Azure Linux @@ -176,6 +184,33 @@ pub enum Commands { history_path: Option, }, + /// Manually rollback to previous state + Rollback { + /// Declare expectation that rollback undoes a runtime update + #[arg(long, conflicts_with 
= "ab")] + runtime: bool, + + /// Declare expectation that rollback undoes an A/B update + #[arg(long, conflicts_with = "runtime")] + ab: bool, + + /// Comma-separated list of operations that Trident will be allowed to perform + #[clap(long, value_delimiter = ',', num_args = 0.., default_value = "stage,finalize")] + allowed_operations: Vec, + + /// Show available rollback points + #[clap(long)] + show: Option, + + /// Path to save the resulting Host Status + #[clap(short, long)] + status: Option, + + /// Path to save an eventual fatal error + #[clap(short, long)] + error: Option, + }, + #[cfg(feature = "dangerous-options")] StreamImage { /// URL of the image to stream @@ -212,6 +247,7 @@ impl Commands { Commands::OfflineInitialize { .. } => "offline-initialize", #[cfg(feature = "dangerous-options")] Commands::StreamImage { .. } => "stream-image", + Commands::Rollback { .. } => "rollback", } } } diff --git a/crates/trident/src/datastore.rs b/crates/trident/src/datastore.rs index fc4fcfd23..d99b8d521 100644 --- a/crates/trident/src/datastore.rs +++ b/crates/trident/src/datastore.rs @@ -2,6 +2,7 @@ use std::{fs, path::Path}; use log::debug; +use sqlite::State; use trident_api::{ error::{ DatastoreError, InternalError, ReportError, ServicingError, TridentError, TridentResultExt, @@ -75,6 +76,37 @@ impl DataStore { }) } + pub(crate) fn get_host_statuses(&self) -> Result, TridentError> { + let mut all_rows_data: Vec = Vec::new(); + + // Read all HostStatus entries from the datastore, parse them into + // HostStatus structs, and return a slice of them. 
+ let mut query_statement = self + .db + .as_ref() + .unwrap() + .prepare("SELECT contents FROM hoststatus ORDER BY id DESC") + .structured(ServicingError::Datastore { + inner: DatastoreError::InitializeDatastore, + })?; + + while let Ok(State::Row) = query_statement.next() { + let host_status_yaml = + query_statement + .read::(0) + .structured(ServicingError::Datastore { + inner: DatastoreError::InitializeDatastore, + })?; + let host_status = + serde_yaml::from_str(&host_status_yaml).structured(ServicingError::Datastore { + inner: DatastoreError::InitializeDatastore, + })?; + all_rows_data.insert(0, host_status); + } + + Ok(all_rows_data) + } + pub(crate) fn is_persistent(&self) -> bool { !self.temporary } diff --git a/crates/trident/src/engine/bootentries.rs b/crates/trident/src/engine/bootentries.rs index a7ad14c0f..6533a935b 100644 --- a/crates/trident/src/engine/bootentries.rs +++ b/crates/trident/src/engine/bootentries.rs @@ -59,6 +59,11 @@ pub fn create_and_update_boot_variables( // Get the label and path for the EFI bootloader of the inactive A/B update volume. let (entry_label_new, bootloader_path_new) = get_label_and_path(ctx, BOOT_EFI).structured(ServicingError::GetLabelAndPath)?; + debug!( + "Creating boot entry with label '{}' and bootloader path '{:?}'", + entry_label_new.as_str(), + bootloader_path_new, + ); // Check if the boot entry already exists, if so, delete the entry and // remove it from the `BootOrder`. @@ -166,8 +171,12 @@ pub fn set_boot_next_and_update_boot_order( // have boot entries start disappearing again. update_boot_order(entry_numbers, &BootOrderPosition::Last) .structured(ServicingError::UpdateBootOrder)?; - } else if ctx.servicing_type == ServicingType::CleanInstall && !use_virtdeploy_workaround { - // During clean install, immediately set the bootorder to use the new entry. 
+ } else if matches!( + ctx.servicing_type, + ServicingType::CleanInstall | ServicingType::ManualRollback + ) && !use_virtdeploy_workaround + { + // During clean install or manual rollback, immediately set the bootorder to use the new entry. update_boot_order(entry_numbers, &BootOrderPosition::First) .structured(ServicingError::UpdateBootOrder)?; } diff --git a/crates/trident/src/engine/context/mod.rs b/crates/trident/src/engine/context/mod.rs index f24b5983d..3634c69f5 100644 --- a/crates/trident/src/engine/context/mod.rs +++ b/crates/trident/src/engine/context/mod.rs @@ -85,14 +85,19 @@ impl EngineContext { ServicingType::NoActiveServicing => None, // If host is executing a runtime update, active and update volumes are the same. ServicingType::RuntimeUpdate => self.ab_active_volume, + + // If host is executing a manual rollback and this is executed, an + // A/B update is being undone. + ServicingType::ManualRollback // If host is executing an A/B update, update volume is the opposite of active volume. - ServicingType::AbUpdate => { + | ServicingType::AbUpdate => { if self.ab_active_volume == Some(AbVolumeSelection::VolumeA) { Some(AbVolumeSelection::VolumeB) } else { Some(AbVolumeSelection::VolumeA) } } + // If host is executing a clean install, update volume is always A. ServicingType::CleanInstall => Some(AbVolumeSelection::VolumeA), } diff --git a/crates/trident/src/engine/mod.rs b/crates/trident/src/engine/mod.rs index 89e7ce099..f4722deb9 100644 --- a/crates/trident/src/engine/mod.rs +++ b/crates/trident/src/engine/mod.rs @@ -167,7 +167,7 @@ lazy_static::lazy_static! { /// /// In case of clean install, the files are persisted to the datastore path in the new root, so /// newroot_path is provided. 
-fn persist_background_log_and_metrics( +pub fn persist_background_log_and_metrics( datastore_path: &Path, newroot_path: Option<&Path>, servicing_state: ServicingState, diff --git a/crates/trident/src/engine/rollback.rs b/crates/trident/src/engine/rollback.rs index f46593de8..f892bd8a5 100644 --- a/crates/trident/src/engine/rollback.rs +++ b/crates/trident/src/engine/rollback.rs @@ -48,9 +48,9 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result { - datastore.host_status().ab_active_volume - } + ServicingState::AbUpdateFinalized + | ServicingState::CleanInstallFinalized + | ServicingState::ManualRollbackFinalized => datastore.host_status().ab_active_volume, // For AbUpdateHealthCheckFailed, use the opposite active volume of the one // set in Host Status ServicingState::AbUpdateHealthCheckFailed => { @@ -73,6 +73,7 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result ServicingType::CleanInstall, + ServicingState::ManualRollbackFinalized => ServicingType::ManualRollback, _ => ServicingType::NoActiveServicing, }; @@ -117,6 +118,17 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result { + datastore.with_host_status(|host_status| { + host_status.servicing_state = ServicingState::Provisioned; + host_status.spec_old = Default::default(); + host_status.ab_active_volume = match host_status.ab_active_volume { + None | Some(AbVolumeSelection::VolumeB) => Some(AbVolumeSelection::VolumeA), + Some(AbVolumeSelection::VolumeA) => Some(AbVolumeSelection::VolumeB), + }; + })?; + return Ok(BootValidationResult::ValidBootProvisioned); + } // // Every case below will return an error. 
// diff --git a/crates/trident/src/engine/storage/rebuild.rs b/crates/trident/src/engine/storage/rebuild.rs index 89d492a64..df7fe176b 100755 --- a/crates/trident/src/engine/storage/rebuild.rs +++ b/crates/trident/src/engine/storage/rebuild.rs @@ -176,7 +176,11 @@ pub(crate) fn validate_rebuild_raid( match host_status.servicing_state { ServicingState::NotProvisioned | ServicingState::CleanInstallStaged - | ServicingState::CleanInstallFinalized => { + | ServicingState::CleanInstallFinalized + | ServicingState::AbUpdateHealthCheckFailed + | ServicingState::ManualRollbackStaged + | ServicingState::ManualRollbackFinalized + | ServicingState::RuntimeUpdateStaged => { bail!( "rebuild-raid command is not allowed when servicing state is {:?}", host_status.servicing_state @@ -184,8 +188,7 @@ pub(crate) fn validate_rebuild_raid( } ServicingState::Provisioned | ServicingState::AbUpdateStaged - | ServicingState::AbUpdateFinalized - | ServicingState::AbUpdateHealthCheckFailed => {} + | ServicingState::AbUpdateFinalized => {} } validate_raid_recovery(host_config, disks_to_rebuild) diff --git a/crates/trident/src/engine/update.rs b/crates/trident/src/engine/update.rs index c4d73af32..bbaeba76d 100644 --- a/crates/trident/src/engine/update.rs +++ b/crates/trident/src/engine/update.rs @@ -169,6 +169,9 @@ pub(crate) fn update( ServicingType::CleanInstall => Err(TridentError::new( InvalidInputError::CleanInstallOnProvisionedHost, )), + ServicingType::ManualRollback => Err(TridentError::internal( + "Cannot update during manual rollback", + )), ServicingType::NoActiveServicing => Err(TridentError::internal("No active servicing type")), } } diff --git a/crates/trident/src/lib.rs b/crates/trident/src/lib.rs index 12b37c18a..f569e5aab 100644 --- a/crates/trident/src/lib.rs +++ b/crates/trident/src/lib.rs @@ -32,6 +32,7 @@ mod engine; mod health; mod io_utils; mod logging; +pub mod manual_rollback; mod monitor_metrics; pub mod offline_init; mod orchestrate; @@ -53,7 +54,7 @@ pub use 
logging::{ }; pub use orchestrate::OrchestratorConnection; -use crate::osimage::OsImage; +use crate::{cli::RollbackShowOperation, osimage::OsImage}; /// Trident version as provided by environment variables at build time pub const TRIDENT_VERSION: &str = match option_env!("TRIDENT_VERSION") { @@ -335,7 +336,7 @@ impl Trident { } } - fn get_cosi_image(host_config: &mut HostConfiguration) -> Result { + pub fn get_cosi_image(host_config: &mut HostConfiguration) -> Result { let cosi_timeout = match host_config .internal_params .get_u64(HTTP_CONNECTION_TIMEOUT_SECONDS) @@ -632,15 +633,27 @@ impl Trident { ServicingState::CleanInstallFinalized | ServicingState::AbUpdateFinalized | ServicingState::AbUpdateHealthCheckFailed + | ServicingState::ManualRollbackFinalized ) { - info!("No servicing in progress, skipping commit"); + info!( + "No servicing in progress ({:?}), skipping commit", + datastore.host_status().servicing_state + ); return Ok(ExitKind::Done); } let rollback_result = self.execute_and_record_error(datastore, |datastore| { - rollback::validate_boot(datastore).message( + let result = rollback::validate_boot(datastore).message( "Failed to validate that firmware correctly booted from updated target OS image", - ) + ); + // Persist the Trident background log and metrics file. + engine::persist_background_log_and_metrics( + &datastore.host_status().spec.trident.datastore_path, + None, + datastore.host_status().servicing_state, + ); + + result }); if rollback_result.is_ok() { @@ -698,4 +711,52 @@ impl Trident { Ok(()) } + + pub fn rollback( + &mut self, + datastore: &mut DataStore, + expected_runtime_rollback: bool, + expected_ab_rollback: bool, + allowed_operations: Operations, + show_operation: Option, + ) -> Result { + // If host's servicing state is *Finalized or *HealthCheckFailed, need to + // re-evaluate the current state of the host. 
+ if !matches!( + datastore.host_status().servicing_state, + ServicingState::Provisioned + | ServicingState::ManualRollbackStaged + | ServicingState::ManualRollbackFinalized + ) { + info!("Not in Provisioned or ManualRollbackStaged state, cannot rollback"); + return Ok(ExitKind::Done); + } + + if let Some(show_op) = show_operation { + let result = manual_rollback::print_show(datastore, show_op) + .message("Failed to query for --show")?; + return Ok(result); + } + + let rollback_result = self.execute_and_record_error(datastore, |datastore| { + manual_rollback::execute_rollback( + datastore, + expected_runtime_rollback, + expected_ab_rollback, + &allowed_operations, + ) + .message("Failed to rollback") + }); + + if rollback_result.is_ok() { + if let Some(ref orchestrator) = self.orchestrator { + orchestrator.report_success(Some( + serde_yaml::to_string(&datastore.host_status()) + .unwrap_or("Failed to serialize Host Status".into()), + )) + } + } + + rollback_result + } } diff --git a/crates/trident/src/main.rs b/crates/trident/src/main.rs index 5477f350f..6ce48c04a 100644 --- a/crates/trident/src/main.rs +++ b/crates/trident/src/main.rs @@ -139,7 +139,8 @@ fn run_trident( | Commands::Update { status, error, .. } | Commands::Commit { status, error } | Commands::Listen { status, error } - | Commands::RebuildRaid { status, error, .. } => { + | Commands::RebuildRaid { status, error, .. } + | Commands::Rollback { status, error, .. } => { let config_path = match &args.command { Commands::Update { config, .. } | Commands::Install { config, .. } => { Some(config.clone()) @@ -204,6 +205,19 @@ fn run_trident( &mut None, ), Commands::Commit { .. } => trident.commit(&mut datastore), + Commands::Rollback { + runtime, + ab, + ref allowed_operations, + show, + .. + } => trident.rollback( + &mut datastore, + runtime, + ab, + cli::to_operations(allowed_operations), + show, + ), Commands::Listen { .. 
} => { trident.listen(&mut datastore).map(|()| ExitKind::Done) } diff --git a/crates/trident/src/manual_rollback/mod.rs b/crates/trident/src/manual_rollback/mod.rs new file mode 100644 index 000000000..b5a2e51dd --- /dev/null +++ b/crates/trident/src/manual_rollback/mod.rs @@ -0,0 +1,863 @@ +#![allow(unused)] + +use std::{ + collections::{HashMap, HashSet}, + fs, + path::{Path, PathBuf}, + str::FromStr, +}; + +use anyhow::{bail, Context, Error}; +use log::{debug, info, trace}; +use serde::{Deserialize, Serialize}; + +use maplit::hashmap; +use osutils::{efivar, lsblk}; + +use trident_api::{ + config::{ + AbUpdate, AbVolumePair, Disk, FileSystem, FileSystemSource, HostConfiguration, + MountOptions, MountPoint, Operations, Partition, PartitionSize, PartitionTableType, + PartitionType, VerityCorruptionOption, VerityDevice, + }, + constants::{ + internal_params::ENABLE_UKI_SUPPORT, EFI_DEFAULT_BIN_RELATIVE_PATH, ESP_EFI_DIRECTORY, + ESP_RELATIVE_MOUNT_POINT_PATH, ROOT_MOUNT_POINT_PATH, + }, + error::{InvalidInputError, ReportError, ServicingError, TridentError, TridentResultExt}, + status::{decode_host_status, AbVolumeSelection, HostStatus, ServicingState, ServicingType}, + BlockDeviceId, +}; +use uuid::Uuid; + +use crate::{ + cli::RollbackShowOperation, + container, + datastore::{self, DataStore}, + engine::{ + self, + boot::{self, uki, ESP_EXTRACTION_DIRECTORY}, + bootentries, rollback, EngineContext, REQUIRES_REBOOT, + }, + subsystems::esp, + ExitKind, OsImage, +}; + +/// Print whether the next manual rollback requires a reboot. +pub fn print_requires_reboot(datastore: &mut DataStore) -> Result { + // Get all HostStatus entries from the datastore. + let host_statuses = datastore + .get_host_statuses() + .message("Failed to get datastore HostStatus entries")?; + // Create ManualRollback context from HostStatus entries. 
+ let context = ManualRollbackContext::new(&host_statuses) + .message("Failed to create manual rollback context")?; + + let requires_reboot_output = context + .get_requires_reboot_output() + .structured(ServicingError::ManualRollback) + .message("Failed to query for --requires-reboot")?; + println!("{}", requires_reboot_output); + Ok(ExitKind::Done) +} + +pub fn print_show( + datastore: &mut DataStore, + show_operation: RollbackShowOperation, +) -> Result { + // Get all HostStatus entries from the datastore. + let host_statuses = datastore + .get_host_statuses() + .message("Failed to get datastore HostStatus entries")?; + // Create ManualRollback context from HostStatus entries. + let context = ManualRollbackContext::new(&host_statuses) + .message("Failed to create manual rollback context")?; + let rollback_chain = context + .get_rollback_chain() + .structured(ServicingError::ManualRollback) + .message("Failed to get available rollbacks")?; + + match show_operation { + RollbackShowOperation::Validation => { + if let Some(first_rollback_host_status) = rollback_chain.first() { + if first_rollback_host_status.requires_reboot { + info!("Next available rollback is A/B update rollback requiring reboot"); + println!("ab"); + } else { + info!( + "Next available rollback is runtime update rollback not requiring reboot" + ); + println!("runtime"); + } + } else { + info!("No available rollbacks to show validation for"); + println!("none"); + } + } + RollbackShowOperation::Target => { + if let Some(first_rollback_host_status) = rollback_chain.first() { + let target_output = + serde_json::to_string(&first_rollback_host_status.host_status.spec) + .structured(ServicingError::ManualRollback) + .message("Failed to serialize first rollback HostStatus spec")?; + println!("{}", target_output); + } else { + info!("No available rollbacks to show target for"); + println!("{{}}"); + } + } + RollbackShowOperation::Chain => { + let available_rollbacks_output = context + 
.get_rollback_chain_json() + .structured(ServicingError::ManualRollback) + .message("Failed to query for --show=chain")?; + println!("{}", available_rollbacks_output); + } + } + Ok(ExitKind::Done) +} + +/// Handle manual rollback operations. +pub fn execute_rollback( + datastore: &mut DataStore, + expected_runtime_rollback: bool, + expected_ab_rollback: bool, + allowed_operations: &Operations, +) -> Result { + let current_servicing_state = datastore.host_status().servicing_state; + + // Get all HostStatus entries from the datastore. + let host_statuses = datastore + .get_host_statuses() + .message("Failed to get datastore HostStatus entries")?; + // Create ManualRollback context from HostStatus entries. + let rollback_context = ManualRollbackContext::new(&host_statuses) + .message("Failed to create manual rollback context")?; + + let available_rollbacks = rollback_context + .get_rollback_chain() + .structured(ServicingError::ManualRollback) + .message("Failed to get available rollbacks")?; + if available_rollbacks.is_empty() { + info!("No available rollbacks to perform"); + return Ok(ExitKind::Done); + } + + let first_rollback = &available_rollbacks[0]; + if expected_runtime_rollback && first_rollback.requires_reboot { + return Err(TridentError::new( + InvalidInputError::InvalidRollbackExpectation { + reason: "expected to undo a runtime update but rollback will undo an A/B update" + .to_string(), + }, + )); + } + if expected_ab_rollback && !first_rollback.requires_reboot { + return Err(TridentError::new( + InvalidInputError::InvalidRollbackExpectation { + reason: "expected to undo an A/B update but rollback will undo a runtime update" + .to_string(), + }, + )); + } + + let mut first_rollback_host_config = first_rollback.host_status.spec.clone(); + let mut skip_finalize_state_check = false; + + let mut engine_context = EngineContext { + spec: first_rollback.host_status.spec.clone(), + spec_old: datastore.host_status().spec.clone(), + servicing_type: 
ServicingType::ManualRollback, + partition_paths: datastore.host_status().partition_paths.clone(), + ab_active_volume: datastore.host_status().ab_active_volume, + disk_uuids: datastore.host_status().disk_uuids.clone(), + install_index: datastore.host_status().install_index, + is_uki: Some(efivar::current_var_is_uki()), + image: None, + storage_graph: engine::build_storage_graph(&datastore.host_status().spec.storage)?, // Build storage graph + filesystems: Vec::new(), // Will be populated after dynamic validation + }; + // Perform staging if operation is allowed + if allowed_operations.has_stage() { + match current_servicing_state { + ServicingState::Provisioned => { + if datastore.host_status().last_error.is_some() { + return Err(TridentError::new(InvalidInputError::InvalidRollbackState { + reason: "in Provisioned state but has a last error set".to_string(), + })); + } + // OK to proceed + } + state => { + return Err(TridentError::new(InvalidInputError::InvalidRollbackState { + reason: format!("in unexpected state: {:?}", state), + })); + } + } + + stage_rollback(datastore, &engine_context, first_rollback.requires_reboot) + .message("Failed to stage manual rollback")?; + + if !allowed_operations.has_finalize() { + // Persist the Trident background log and metrics file. Otherwise, the + // staging logs would be lost. 
+ engine::persist_background_log_and_metrics( + &datastore.host_status().spec.trident.datastore_path, + None, + datastore.host_status().servicing_state, + ); + } + // If only staging, skip finalize state check + skip_finalize_state_check = true; + } + // Perform finalize if operation is allowed + if allowed_operations.has_finalize() { + if !skip_finalize_state_check { + match current_servicing_state { + ServicingState::ManualRollbackStaged | ServicingState::ManualRollbackFinalized => { + // OK to proceed + } + state => { + return Err(TridentError::new(InvalidInputError::InvalidRollbackState { + reason: format!("in unexpected state: {:?}", state), + })); + } + } + } + let finalize_result = + finalize_rollback(datastore, &engine_context, first_rollback.requires_reboot) + .message("Failed to stage manual rollback"); + // Persist the Trident background log and metrics file. Otherwise, the + // staging logs would be lost. + engine::persist_background_log_and_metrics( + &datastore.host_status().spec.trident.datastore_path, + None, + datastore.host_status().servicing_state, + ); + + return finalize_result; + } + Ok(ExitKind::Done) +} + +fn stage_rollback( + datastore: &mut DataStore, + engine_context: &EngineContext, + requires_reboot: bool, +) -> Result<(), TridentError> { + if requires_reboot { + info!("Staging rollback that requires reboot"); + } else { + info!("Staging rollback that does not require reboot"); + } + + datastore.with_host_status(|host_status| { + host_status.spec = engine_context.spec.clone(); + host_status.servicing_state = ServicingState::ManualRollbackStaged; + })?; + Ok(()) +} + +fn finalize_rollback( + datastore: &mut DataStore, + engine_context: &EngineContext, + requires_reboot: bool, +) -> Result { + if !requires_reboot { + trace!("Manual rollback does not require reboot"); + // TODO: implement runtime update rollback + + datastore.with_host_status(|host_status| { + host_status.spec = engine_context.spec.clone(); + host_status.servicing_state = 
ServicingState::Provisioned; + })?; + return Ok(ExitKind::Done); + } + + trace!("Manual rollback requires reboot"); + + let root_path = if container::is_running_in_container() + .message("Failed to check if Trident is running in a container")? + { + container::get_host_root_path().message("Failed to get host root path")? + } else { + PathBuf::from(ROOT_MOUNT_POINT_PATH) + }; + let esp_path = Path::join(&root_path, ESP_RELATIVE_MOUNT_POINT_PATH); + + // In UKI, use the LoaderEntries variable to get the previous boot entry and set it as current + if engine_context.is_uki()? { + efivar::set_default_to_previous() + .message("Failed to set default boot entry to previous")?; + } + // Reconfigure UEFI boot-order to point at inactive volume + bootentries::create_and_update_boot_variables(engine_context, &esp_path)?; + // Analogous to how UEFI variables are configured. + esp::set_uefi_fallback_contents( + engine_context, + ServicingState::ManualRollbackStaged, + &root_path, + ) + .structured(ServicingError::SetUpUefiFallback)?; + + if let Some(ref encryption) = engine_context.spec.storage.encryption { + // TODO: Handle any pcr-lock encryption related changes needed + } + + datastore.with_host_status(|host_status| { + host_status.spec = engine_context.spec.clone(); + host_status.servicing_state = ServicingState::ManualRollbackFinalized; + })?; + + Ok(ExitKind::NeedsReboot) +} + +#[derive(Clone, Debug, Deserialize, Serialize)] +#[serde(rename_all = "camelCase")] +struct RollbackDetail { + requires_reboot: bool, + host_status: HostStatus, + #[serde(skip)] + host_status_index: i32, +} +struct ManualRollbackContext { + volume_a_available_rollbacks: Vec, + volume_b_available_rollbacks: Vec, + active_volume: Option, + rollback_action: Option, + rollback_volume: Option, +} +impl ManualRollbackContext { + fn new(host_statuses: &[HostStatus]) -> Result { + // Initialize context from HostStatus entries. 
+ let mut instance = ManualRollbackContext { + volume_a_available_rollbacks: Vec::new(), + volume_b_available_rollbacks: Vec::new(), + active_volume: None, + rollback_action: None, + rollback_volume: None, + }; + + // Create special handling for offline-initialize initial state + // where there are multiple (annecdotally: 3) consecutive Provisioned + // host statuses. + let mut last_initial_consecutive_provisioned_state = -1; + for (i, hs) in host_statuses.iter().enumerate() { + if hs.servicing_state != ServicingState::Provisioned { + break; + } + last_initial_consecutive_provisioned_state = i as i32; + } + + let mut rollback = false; + let mut needs_reboot = false; + let mut active_index = -1; + + for (i, hs) in host_statuses.iter().enumerate() { + trace!( + "Processing HostStatus at index {}: servicing_state={:?}, ab_active_volume={:?}", + i, + hs.servicing_state, + hs.ab_active_volume + ); + // If the inactive volume is overwritten by + // ab-update-staged, clear the available + // rollbacks for it + if hs.servicing_state == ServicingState::AbUpdateStaged { + trace!("AbUpdateStaged detected at index {}: clearing available rollbacks for inactive volume {:?}: a:[{:?}] b:[{:?}]", + i, + hs.ab_active_volume, + instance.volume_a_available_rollbacks.len(), + instance.volume_b_available_rollbacks.len() + ); + match hs.ab_active_volume { + Some(AbVolumeSelection::VolumeA) => { + instance.volume_b_available_rollbacks = Vec::new(); + } + Some(AbVolumeSelection::VolumeB) => { + instance.volume_a_available_rollbacks = Vec::new(); + } + None => {} + } + } + + // Update rollback context for each HostStatus.ServicingState == Provisioned + if hs.servicing_state == ServicingState::Provisioned { + trace!( + "Processing Provisioned state at index {} for active volume {:?}", + i, + hs.ab_active_volume + ); + // If we entered a Provisioned state from a Provisioned state (so + // ignoring the first Provisioned state, where there can be no rollback), + // update the available 
rollbacks depending on whether the last action + // was a rollback or not + if active_index != -1 { + let host_status_context = RollbackDetail { + host_status: host_statuses[active_index as usize].clone(), + host_status_index: active_index, + requires_reboot: needs_reboot, + }; + if rollback { + if let Some((first_rollback, first_rollback_volume)) = + instance.get_first_rollback() + { + trace!( + "Rollback detected at index {} for active volume {:?}", + i, + instance.active_volume + ); + match first_rollback_volume { + AbVolumeSelection::VolumeA => { + if !instance.volume_a_available_rollbacks.is_empty() { + instance.volume_a_available_rollbacks.remove(0); + } + } + AbVolumeSelection::VolumeB => { + if !instance.volume_b_available_rollbacks.is_empty() { + instance.volume_b_available_rollbacks.remove(0); + } + } + } + } + } else if host_status_context.host_status_index + >= last_initial_consecutive_provisioned_state + { + trace!( + "New Provisioned state detected at index {} for active volume {:?}", + i, + instance.active_volume + ); + // Prepend the last Provisioned index to the previously active volume's available + // rollbacks. 
+ match instance.active_volume { + Some(AbVolumeSelection::VolumeA) => { + instance + .volume_a_available_rollbacks + .insert(0, host_status_context); + } + Some(AbVolumeSelection::VolumeB) => { + instance + .volume_b_available_rollbacks + .insert(0, host_status_context); + } + None => {} + } + } + } + // Update the context's active volume and index + instance.active_volume = hs.ab_active_volume; + active_index = i as i32; + needs_reboot = false; + // Reset the loop's rollback tracking + rollback = false + } else { + // Check each non-Provisioned state to see if it represents a rollback action + rollback = matches!( + hs.servicing_state, + ServicingState::ManualRollbackStaged | ServicingState::ManualRollbackFinalized + ); + needs_reboot = matches!( + hs.servicing_state, + ServicingState::AbUpdateFinalized + | ServicingState::AbUpdateFinalized + | ServicingState::AbUpdateHealthCheckFailed + ); + trace!( + "Detected servicing state {:?} at index {}: rollback={}, needs_reboot={}", + hs.servicing_state, + i, + rollback, + needs_reboot + ) + } + } + + if let Some((first_rollback, rollback_volume)) = instance.get_first_rollback() { + trace!( + "First available rollback at index {} for volume {:?}", + first_rollback, + rollback_volume + ); + instance.rollback_volume = Some(rollback_volume); + + instance.rollback_action = None; + if first_rollback != -1 { + let rollback_next_state = + host_statuses[first_rollback as usize + 1].servicing_state; + if matches!( + rollback_next_state, + ServicingState::AbUpdateStaged | ServicingState::AbUpdateFinalized + ) { + instance.rollback_action = Some(ServicingType::AbUpdate) + } else if matches!(rollback_next_state, ServicingState::RuntimeUpdateStaged) { + instance.rollback_action = Some(ServicingType::RuntimeUpdate) + } + } + } + + Ok(instance) + } + + fn get_first_rollback_host_status(&self) -> Result, Error> { + self.get_rollback_chain() + .context("Failed to get available rollbacks")? 
+ .into_iter() + .next() + .map_or_else(|| Ok(None), |detail| Ok(Some(detail.host_status.clone()))) + } + + fn get_first_rollback(&self) -> Option<(i32, AbVolumeSelection)> { + let mut rollback_a = -1; + let mut rollback_b = -1; + trace!( + "Checking for first available rollback: A=[{:?}] B:[{:?}]", + self.volume_a_available_rollbacks.len(), + self.volume_b_available_rollbacks.len() + ); + if !self.volume_a_available_rollbacks.is_empty() { + rollback_a = self.volume_a_available_rollbacks[0].host_status_index; + } + if !self.volume_b_available_rollbacks.is_empty() { + rollback_b = self.volume_b_available_rollbacks[0].host_status_index; + } + if rollback_a > rollback_b { + trace!("First rollback is on Volume A at index {}", rollback_a); + return Some((rollback_a, AbVolumeSelection::VolumeA)); + } + if rollback_b != -1 { + trace!("First rollback is on Volume B at index {}", rollback_b); + return Some((rollback_b, AbVolumeSelection::VolumeB)); + } + trace!(" No available rollbacks detected"); + None + } + + fn get_requires_reboot(&self) -> Result { + Ok(matches!( + self.rollback_action, + Some(ServicingType::AbUpdate) + )) + } + + fn get_requires_reboot_output(&self) -> Result { + let requires_reboot = self.get_requires_reboot()?; + info!("Rollback requires reboot: {}", requires_reboot); + Ok(requires_reboot.to_string()) + } + + fn get_rollback_chain(&self) -> Result, Error> { + let mut contexts = self + .volume_a_available_rollbacks + .clone() + .into_iter() + .chain(self.volume_b_available_rollbacks.clone()) + .collect::>(); + contexts.sort_by(|a, b| b.host_status_index.cmp(&a.host_status_index)); + info!("Available rollback count: {}", contexts.len()); + Ok(contexts) + } + + fn get_rollback_chain_json(&self) -> Result { + let contexts = self.get_rollback_chain()?; + let full_json = + serde_json::to_string(&contexts).context("Failed to serialize rollback contexts")?; + info!("Available rollbacks:\n{}", full_json); + Ok(full_json) + } +} + +#[cfg(test)] +mod tests { + 
use osutils::mdadm::create; + + use super::*; + + struct HostStatusTest { + host_status: HostStatus, + expected_requires_reboot: bool, + expected_available_rollbacks: Vec, + } + fn host_status( + active_volume: Option, + servicing_state: ServicingState, + ) -> HostStatus { + HostStatus { + ab_active_volume: active_volume, + servicing_state, + ..Default::default() + } + } + fn prov( + active_volume: Option, + expected_requires_reboot: bool, + expected_available_rollbacks: Vec, + ) -> HostStatusTest { + HostStatusTest { + host_status: host_status(active_volume, ServicingState::Provisioned), + expected_requires_reboot, + expected_available_rollbacks, + } + } + fn inter( + active_volume: Option, + servicing_state: ServicingState, + ) -> HostStatusTest { + HostStatusTest { + host_status: host_status(active_volume, servicing_state), + expected_requires_reboot: false, + expected_available_rollbacks: vec![], + } + } + + #[test] + fn test_rollback_context() { + let volume_a = Some(AbVolumeSelection::VolumeA); + let volume_b = Some(AbVolumeSelection::VolumeB); + let host_status_list = vec![ + inter(None, ServicingState::CleanInstallFinalized), + inter(None, ServicingState::CleanInstallFinalized), + prov(volume_a, false, vec![]), + inter(volume_a, ServicingState::RuntimeUpdateStaged), + prov(volume_a, false, vec![2]), + inter(volume_a, ServicingState::RuntimeUpdateStaged), + prov(volume_a, false, vec![4, 2]), + inter(volume_a, ServicingState::AbUpdateStaged), + inter(volume_a, ServicingState::AbUpdateFinalized), + prov(volume_b, true, vec![6, 4, 2]), + inter(volume_b, ServicingState::AbUpdateStaged), + inter(volume_b, ServicingState::AbUpdateFinalized), + prov(volume_a, true, vec![9]), + inter(volume_a, ServicingState::ManualRollbackStaged), + inter(volume_a, ServicingState::ManualRollbackFinalized), + prov(volume_b, false, vec![]), + ]; + for (i, hs) in host_status_list.iter().enumerate() { + if hs.host_status.servicing_state != ServicingState::Provisioned { + continue; + } 
+ let host_status_list = host_status_list + .iter() + .take(i + 1) + .map(|hst| hst.host_status.clone()) + .collect::>(); + let context = ManualRollbackContext::new(&host_status_list).unwrap(); + trace!( + "HS: {:?}, expected_requires_reboot: {}, expected_available_rollbacks: {:?}", + hs.host_status.servicing_state, + hs.expected_requires_reboot, + hs.expected_available_rollbacks + ); + assert_eq!( + context.get_requires_reboot_output().unwrap(), + hs.expected_requires_reboot.to_string() + ); + let serialized_output = serde_yaml::from_str::>( + &context.get_rollback_chain_json().unwrap(), + ) + .unwrap(); + assert_eq!( + serialized_output.len(), + hs.expected_available_rollbacks.len() + ) + } + } + + #[test] + fn test_runtime_rollback_context_mid_rollback() { + let volume_a = Some(AbVolumeSelection::VolumeA); + let volume_b = Some(AbVolumeSelection::VolumeB); + let host_status_list = vec![ + inter(None, ServicingState::CleanInstallFinalized), + inter(None, ServicingState::CleanInstallFinalized), + prov(volume_a, false, vec![]), + inter(volume_a, ServicingState::RuntimeUpdateStaged), + prov(volume_a, false, vec![2]), + inter(volume_a, ServicingState::RuntimeUpdateStaged), + prov(volume_a, false, vec![4, 2]), + inter(volume_a, ServicingState::RuntimeUpdateStaged), + prov(volume_a, false, vec![6, 4, 2]), + inter(volume_a, ServicingState::ManualRollbackStaged), + inter(volume_a, ServicingState::ManualRollbackFinalized), + ]; + let host_status_list = host_status_list + .iter() + .take(host_status_list.len() + 1) + .map(|hst| hst.host_status.clone()) + .collect::>(); + let context = ManualRollbackContext::new(&host_status_list).unwrap(); + trace!( + "Validate runtime update rollback, create context in manual-rollback-finalized state" + ); + // Manual rollback undoing a runtime update does not require reboot + assert_eq!( + context.get_requires_reboot_output().unwrap(), + false.to_string() + ); + let serialized_output = serde_yaml::from_str::>( + 
&context.get_rollback_chain_json().unwrap(), + ) + .unwrap(); + // Pre manual rollback, there were 3 runtime updates to rollback + assert_eq!(serialized_output.len(), 3) + } + + #[test] + fn test_ab_rollback_context_mid_rollback() { + let volume_a = Some(AbVolumeSelection::VolumeA); + let volume_b = Some(AbVolumeSelection::VolumeB); + let host_status_list = vec![ + inter(None, ServicingState::CleanInstallFinalized), + inter(None, ServicingState::CleanInstallFinalized), + prov(volume_a, false, vec![]), + inter(volume_a, ServicingState::AbUpdateStaged), + inter(volume_a, ServicingState::AbUpdateFinalized), + prov(volume_b, true, vec![2]), + inter(volume_b, ServicingState::AbUpdateStaged), + inter(volume_b, ServicingState::AbUpdateFinalized), + prov(volume_a, true, vec![5]), + inter(volume_a, ServicingState::AbUpdateStaged), + inter(volume_a, ServicingState::AbUpdateFinalized), + prov(volume_b, true, vec![8]), + inter(volume_a, ServicingState::ManualRollbackStaged), + inter(volume_a, ServicingState::ManualRollbackFinalized), + ]; + let host_status_list = host_status_list + .iter() + .take(host_status_list.len() + 1) + .map(|hst| hst.host_status.clone()) + .collect::>(); + let context = ManualRollbackContext::new(&host_status_list).unwrap(); + trace!( + "Validate runtime update rollback, create context in manual-rollback-finalized state" + ); + // Manual rollback undoing a runtime update does not require reboot + assert_eq!( + context.get_requires_reboot_output().unwrap(), + true.to_string() + ); + let serialized_output = serde_yaml::from_str::>( + &context.get_rollback_chain_json().unwrap(), + ) + .unwrap(); + // Pre a/b rollback, there was 1 runtime update to rollback + assert_eq!(serialized_output.len(), 1) + } + + #[test] + fn test_offline_init_context() { + let volume_a = Some(AbVolumeSelection::VolumeA); + let volume_b = Some(AbVolumeSelection::VolumeB); + let host_status_list = vec![ + prov(volume_a, false, vec![]), + prov(volume_a, false, vec![]), + 
prov(volume_a, false, vec![]), + ]; + let host_status_list = host_status_list + .iter() + .take(host_status_list.len() + 1) + .map(|hst| hst.host_status.clone()) + .collect::>(); + let context = ManualRollbackContext::new(&host_status_list).unwrap(); + trace!("Validate create context for offline-init initial state"); + // There should be NO available rollbacks, as there hasn't been any updates yet + assert_eq!( + context.get_requires_reboot_output().unwrap(), + false.to_string() + ); + let serialized_output = serde_yaml::from_str::>( + &context.get_rollback_chain_json().unwrap(), + ) + .unwrap(); + // Only offline-init has run, so there should be 0 updates to rollback + assert_eq!(serialized_output.len(), 0) + } + + #[test] + fn test_offline_init_and_ab_update_context() { + let volume_a = Some(AbVolumeSelection::VolumeA); + let volume_b = Some(AbVolumeSelection::VolumeB); + let host_status_list = vec![ + prov(volume_a, false, vec![]), + prov(volume_a, false, vec![]), + prov(volume_a, false, vec![]), + inter(volume_a, ServicingState::AbUpdateStaged), + inter(volume_a, ServicingState::AbUpdateFinalized), + prov(volume_b, true, vec![2]), + ]; + let host_status_list = host_status_list + .iter() + .take(host_status_list.len() + 1) + .map(|hst| hst.host_status.clone()) + .collect::>(); + let context = ManualRollbackContext::new(&host_status_list).unwrap(); + trace!("Validate create context for offline-init initial state"); + // There should be 1 available rollback + assert_eq!( + context.get_requires_reboot_output().unwrap(), + true.to_string() + ); + let serialized_output = serde_yaml::from_str::>( + &context.get_rollback_chain_json().unwrap(), + ) + .unwrap(); + assert_eq!(serialized_output.len(), 1) + } + + #[test] + fn test_clean_install_context() { + let volume_a = Some(AbVolumeSelection::VolumeA); + let volume_b = Some(AbVolumeSelection::VolumeB); + let host_status_list = vec![ + inter(None, ServicingState::CleanInstallFinalized), + inter(None, 
ServicingState::CleanInstallFinalized), + prov(volume_a, false, vec![]), + ]; + let host_status_list = host_status_list + .iter() + .take(host_status_list.len() + 1) + .map(|hst| hst.host_status.clone()) + .collect::>(); + let context = ManualRollbackContext::new(&host_status_list).unwrap(); + // There should be 0 available rollbacks + assert_eq!( + context.get_requires_reboot_output().unwrap(), + false.to_string() + ); + let serialized_output = serde_yaml::from_str::>( + &context.get_rollback_chain_json().unwrap(), + ) + .unwrap(); + assert_eq!(serialized_output.len(), 0) + } + + #[test] + fn test_clean_install_and_ab_update_context() { + let volume_a = Some(AbVolumeSelection::VolumeA); + let volume_b = Some(AbVolumeSelection::VolumeB); + let host_status_list = vec![ + inter(None, ServicingState::CleanInstallFinalized), + inter(None, ServicingState::CleanInstallFinalized), + prov(volume_a, false, vec![]), + inter(volume_a, ServicingState::AbUpdateStaged), + inter(volume_a, ServicingState::AbUpdateFinalized), + prov(volume_b, true, vec![2]), + ]; + let host_status_list = host_status_list + .iter() + .take(host_status_list.len() + 1) + .map(|hst| hst.host_status.clone()) + .collect::>(); + let context = ManualRollbackContext::new(&host_status_list).unwrap(); + // There should be 1 available rollbacks for an ab update + assert_eq!( + context.get_requires_reboot_output().unwrap(), + true.to_string() + ); + let serialized_output = serde_yaml::from_str::>( + &context.get_rollback_chain_json().unwrap(), + ) + .unwrap(); + assert_eq!(serialized_output.len(), 1) + } +} diff --git a/crates/trident/src/offline_init/mod.rs b/crates/trident/src/offline_init/mod.rs index c33ca5852..27fe2a7fe 100644 --- a/crates/trident/src/offline_init/mod.rs +++ b/crates/trident/src/offline_init/mod.rs @@ -18,7 +18,10 @@ use trident_api::{ MountOptions, MountPoint, Partition, PartitionSize, PartitionTableType, PartitionType, VerityCorruptionOption, VerityDevice, }, - 
constants::internal_params::ENABLE_UKI_SUPPORT, + constants::{ + internal_params::ENABLE_UKI_SUPPORT, EFI_DEFAULT_BIN_RELATIVE_PATH, ESP_EFI_DIRECTORY, + ESP_RELATIVE_MOUNT_POINT_PATH, ROOT_MOUNT_POINT_PATH, + }, error::{ ExecutionEnvironmentMisconfigurationError, InitializationError, InvalidInputError, ReportError, TridentError, TridentResultExt, @@ -28,7 +31,7 @@ use trident_api::{ }; use uuid::Uuid; -use crate::datastore::DataStore; +use crate::{datastore::DataStore, subsystems::esp}; #[derive(Clone, Debug, serde::Deserialize)] struct PrismPartition { @@ -480,6 +483,21 @@ pub fn execute( .map_err(Into::into) .message("The provided Host Status has an invalid Host Configuration")?; + // Ensure AZLA/AZLB esp scheme is present by copying boot files from fallback location if needed. + let esp_path = PathBuf::from(ROOT_MOUNT_POINT_PATH).join(ESP_RELATIVE_MOUNT_POINT_PATH); + let azla_esp_path = esp_path.join(ESP_EFI_DIRECTORY).join("AZLA"); + trace!("Checking for AZLA volume ESP path at {:?}", &azla_esp_path); + if !azla_esp_path.exists() { + trace!( + "AZLA volume ESP path {:?} does not exist, attempting to copy from fallback location", + azla_esp_path + ); + let boot_esp_path = esp_path.join(EFI_DEFAULT_BIN_RELATIVE_PATH); + esp::replace_boot_files(&boot_esp_path, &azla_esp_path) + .structured(InvalidInputError::InvalidBootConfiguration) + .message("Failed to copy boot files to AZLA ESP path")?; + } + let datastore_path = host_status.spec.trident.datastore_path.clone(); let mut datastore = diff --git a/crates/trident/src/subsystems/esp.rs b/crates/trident/src/subsystems/esp.rs index 501c6825f..3bec5e3bf 100644 --- a/crates/trident/src/subsystems/esp.rs +++ b/crates/trident/src/subsystems/esp.rs @@ -263,6 +263,8 @@ fn copy_file_artifacts( /// * For A/B update /// - 'optimistic': use the opposite of the active volume /// - 'conservative': use the active volume (this may be a redundant copy) +/// * For manual rollback (this should only be called during manual rollback 
of an a/b update) +/// - use the opposite of the active volume /// 2. During commit, after the target OS boot has been verified, the target OS boot files /// are copied to the UEFI fallback folder. /// * For clean install, no copy is needed as it was done during finalize @@ -312,6 +314,18 @@ fn find_uefi_fallback_source_dir_name( )), _ => None, }, + ServicingState::ManualRollbackStaged => match ctx.spec.os.uefi_fallback { + UefiFallbackMode::Conservative | UefiFallbackMode::Optimistic => { + Some(boot::make_esp_dir_name( + ctx.install_index, + match ctx.ab_active_volume { + None | Some(AbVolumeSelection::VolumeB) => AbVolumeSelection::VolumeA, + Some(AbVolumeSelection::VolumeA) => AbVolumeSelection::VolumeB, + }, + )) + } + _ => None, + }, _ => None, } } @@ -374,7 +388,7 @@ fn copy_boot_files_for_uefi_fallback( source_esp_dir_path.display(), uefi_fallback_path.display() ); - simple_copy_boot_files(&source_esp_dir_path, &uefi_fallback_path).context(format!( + replace_boot_files(&source_esp_dir_path, &uefi_fallback_path).context(format!( "Failed to copy boot files from directory '{}' to directory '{}'", source_esp_dir_path.display(), uefi_fallback_path.display() @@ -383,7 +397,7 @@ fn copy_boot_files_for_uefi_fallback( } /// Copies boot files from one folder to another. -fn simple_copy_boot_files(from_dir: &Path, to_dir: &Path) -> Result<(), Error> { +pub fn replace_boot_files(from_dir: &Path, to_dir: &Path) -> Result<(), Error> { trace!( "Copying boot files from '{}' to '{}'", from_dir.display(), @@ -414,24 +428,50 @@ fn simple_copy_boot_files(from_dir: &Path, to_dir: &Path) -> Result<(), Error> { }) .context("Failed to copy files")?; + // Rename everything all pre-existing files from to_dir/ to to_dir/.old + fs::read_dir(to_dir)? + .collect::, _>>()? 
+ .iter() + .try_for_each(|orig_path| { + let orig_file_name = orig_path.file_name(); + let orig_file_name_string = orig_file_name.to_string_lossy(); + // Skip files that end with .new + if !orig_file_name_string.ends_with(".new") { + let new_file_name = format!("{}.old", orig_file_name_string); + let to_path = to_dir.join(new_file_name); + fs::rename(orig_path.path(), &to_path).context(format!( + "Failed to rename pre-existing file '{}' to '{}'", + orig_path.path().display(), + to_path.display() + ))?; + trace!( + "Renamed pre-existing file '{}' to '{}'", + orig_path.path().display(), + to_path.display() + ); + } + Ok::<(), Error>(()) + }) + .context("Failed to rename pre-existing files")?; + // Rename all copied files from to_dir/.new to to_dir/ fs::read_dir(to_dir)? .collect::, _>>()? .iter() .try_for_each(|orig_path| { let orig_file_name = orig_path.file_name(); + let orig_file_name_string = orig_file_name.to_string_lossy(); // Skip files that do not end with .new - if orig_file_name.to_string_lossy().ends_with(".new") { - let orig_file_name_string = orig_file_name.to_string_lossy(); + if orig_file_name_string.ends_with(".new") { let new_file_name = orig_file_name_string.trim_end_matches(".new"); let to_path = to_dir.join(new_file_name); fs::rename(orig_path.path(), &to_path).context(format!( - "Failed to rename file '{}' to '{}'", + "Failed to rename copied file '{}' to '{}'", orig_path.path().display(), to_path.display() ))?; trace!( - "Renamed file '{}' to '{}'", + "Renamed copied file '{}' to '{}'", orig_path.path().display(), to_path.display() ); @@ -439,6 +479,24 @@ fn simple_copy_boot_files(from_dir: &Path, to_dir: &Path) -> Result<(), Error> { Ok::<(), Error>(()) }) .context("Failed to rename copied files")?; + + // Remove all preexisting files .old + fs::read_dir(to_dir)? + .collect::, _>>()? 
+ .iter() + .try_for_each(|orig_path| { + let orig_file_name = orig_path.file_name(); + // Skip files that do not end with .old + if orig_file_name.to_string_lossy().ends_with(".old") { + fs::remove_file(orig_path.path()).context(format!( + "Failed to remove pre-existing file '{}'", + orig_path.path().display() + ))?; + trace!("Removed pre-existing file '{}'", orig_path.path().display()); + } + Ok::<(), Error>(()) + }) + .context("Failed to remove pre-existing files")?; Ok(()) } @@ -924,6 +982,30 @@ mod tests { None::, // with Disabled, we do not copy anything "Validate AbUpdateFinalized + Disabled + active volume A ==> None", ), + ( + ServicingState::ManualRollbackStaged, + UefiFallbackMode::Conservative, + Some(AbVolumeSelection::VolumeA), + ServicingType::ManualRollback, + Some("AZLB".to_string()), // in ManualRollbackStaged, with 'conservative', copy from inactive volume + "Validate ManualRollbackStaged + Conservative + active volume A ==> AZLB", + ), + ( + ServicingState::ManualRollbackStaged, + UefiFallbackMode::Optimistic, + Some(AbVolumeSelection::VolumeA), + ServicingType::ManualRollback, + Some("AZLB".to_string()), // in ManualRollback staged, with 'optimistic', copy from inactive volume + "Validate ManualRollbackStaged + Optimistic + active volume A ==> AZLB", + ), + ( + ServicingState::ManualRollbackStaged, + UefiFallbackMode::Disabled, + Some(AbVolumeSelection::VolumeA), + ServicingType::ManualRollback, + None::, // with Disabled, we do not copy anything + "Validate ManualRollbackStaged + Disabled + active volume A ==> None", + ), ]; for test_case in test_cases { ctx.spec.os.uefi_fallback = test_case.1; @@ -1065,7 +1147,7 @@ mod tests { } #[test] - fn test_simple_copy_boot_files() { + fn test_replace_boot_files() { let from_dir = TempDir::new().unwrap(); let to_dir = TempDir::new().unwrap(); @@ -1095,7 +1177,7 @@ mod tests { } // Call the function to copy files - simple_copy_boot_files(from_dir.path(), to_dir.path()).unwrap(); + 
replace_boot_files(from_dir.path(), to_dir.path()).unwrap(); // Verify that files have been copied and renamed correctly for (file_name, _content) in &file_infos { @@ -1110,19 +1192,10 @@ mod tests { ); } - // Verify that existing files that were not in from_dir are unchanged - for (file_name, content) in &existing_file_infos { + // Verify that existing files that were not in from_dir are removed + for (file_name, _) in &existing_file_infos { if !file_infos.iter().any(|(f, _)| f == file_name) { - let mut file_content = String::new(); - File::open(to_dir.path().join(file_name)) - .unwrap() - .read_to_string(&mut file_content) - .unwrap(); - assert_eq!( - file_content.trim(), - *content, - "Content of existing file {file_name} does not match" - ); + assert!(!to_dir.path().join(file_name).exists()); } } } diff --git a/crates/trident/src/subsystems/hooks.rs b/crates/trident/src/subsystems/hooks.rs index de5ef3365..ac8b82a9b 100644 --- a/crates/trident/src/subsystems/hooks.rs +++ b/crates/trident/src/subsystems/hooks.rs @@ -372,6 +372,7 @@ fn match_servicing_type_env_var(servicing_type: &ServicingType) -> &OsStr { ServicingType::AbUpdate => OsStr::new("ab_update"), ServicingType::CleanInstall => OsStr::new("clean_install"), ServicingType::NoActiveServicing => OsStr::new("none"), + ServicingType::ManualRollback => OsStr::new("manual_rollback"), } } diff --git a/crates/trident_api/src/error.rs b/crates/trident_api/src/error.rs index 551d4ccb0..443de41a3 100644 --- a/crates/trident_api/src/error.rs +++ b/crates/trident_api/src/error.rs @@ -180,6 +180,9 @@ pub enum InvalidInputError { #[error("Image contains invalid agent configuration")] ImageBadAgentConfiguration, + #[error("Invalid boot configuration")] + InvalidBootConfiguration, + #[error("Host Configuration failed dynamic validation: {inner}")] InvalidHostConfigurationDynamic { #[from] @@ -198,6 +201,12 @@ pub enum InvalidInputError { #[error("Invalid --lazy-partitions provided")] InvalidLazyPartition, + 
#[error("Invalid rollback expectation: '{reason}'")] + InvalidRollbackExpectation { reason: String }, + + #[error("Invalid state for rollback: '{reason}'")] + InvalidRollbackState { reason: String }, + #[error("Failed to load COSI file from '{url}'")] LoadCosi { url: Url }, @@ -565,6 +574,9 @@ pub enum ServicingError { #[error("Failed to remove the pre-existing pcrlock policy")] RemovePcrlockPolicy, + #[error("Failed to execute rollback")] + ManualRollback, + #[error( "Failed to match current root device path '{root_device_path}' to either root volume A \ path '{root_volume_a_path}' or B path '{root_volume_b_path}'" diff --git a/crates/trident_api/src/status.rs b/crates/trident_api/src/status.rs index 2fcb8a9b1..a716902c2 100644 --- a/crates/trident_api/src/status.rs +++ b/crates/trident_api/src/status.rs @@ -73,6 +73,8 @@ pub enum ServicingType { AbUpdate = 2, /// Clean install of the target OS image when the host is booted from the provisioning OS. CleanInstall = 3, + /// Manual Rollback of the target OS image to a previously deployed state. + ManualRollback = 4, } /// Servicing state describes the progress of the servicing that the Trident agent is executing on @@ -89,12 +91,18 @@ pub enum ServicingState { CleanInstallStaged, /// A/B update has been staged. The new target OS images have been deployed onto block devices. AbUpdateStaged, + /// Manual rollback has been staged. + ManualRollbackStaged, + /// Runtime update has been staged. + RuntimeUpdateStaged, /// Clean install has been finalized, i.e., UEFI variables have been set, so that firmware boots /// from the target OS image after reboot. CleanInstallFinalized, /// A/B update has been finalized. For the next boot, the firmware will boot from the updated /// target OS image. AbUpdateFinalized, + /// Manual rollback has been finalized. + ManualRollbackFinalized, /// Servicing has been completed, and the host successfully booted from the updated target OS /// image. Trident is ready to begin a new servicing. 
Provisioned, diff --git a/tools/storm/rollback/tests/rollback.go b/tools/storm/rollback/tests/rollback.go index b6a315b7c..53b1d50b8 100644 --- a/tools/storm/rollback/tests/rollback.go +++ b/tools/storm/rollback/tests/rollback.go @@ -136,7 +136,7 @@ func RollbackTest(testConfig stormrollbackconfig.TestConfig, vmConfig stormvmcon } hostConfig["os"] = sysextConfig } - // Perform runtime update and do + // Perform runtime update and do validation expectedAvailableRollbacks = 2 err = doUpdateTest(testConfig, vmConfig, vmIP, hostConfig, extensionVersion, expectedVolume, expectedAvailableRollbacks, false) if err != nil { @@ -238,52 +238,21 @@ func getOtherVolume(volume string) string { return "volume-a" } -func validateOs( +func validateRollbacksAvailable( testConfig stormrollbackconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig, vmIP string, - extensionVersion int, - expectedVolume string, expectedAvailableRollbacks int, expectedFirstRollbackNeedsReboot bool, ) error { - // Verify active volume is as expected - logrus.Tracef("Checking active volume, expecting '%s'", expectedVolume) - checkActiveVolumeErr := stormtridentactivevolume.CheckActiveVolume(vmConfig.VMConfig, vmIP, expectedVolume) - if checkActiveVolumeErr != nil { - return fmt.Errorf("failed to validate active volume: %w", checkActiveVolumeErr) - } - if !testConfig.SkipExtensionTesting { - if extensionVersion > 0 { - logrus.Tracef("Checking extension version, expected: '%d'", extensionVersion) - extensionTestCommand := "test-extension.sh" - extensionTestOutput, err := stormssh.SshCommand(vmConfig.VMConfig, vmIP, extensionTestCommand) - if err != nil { - return fmt.Errorf("failed to check extension on VM (%w):\n%s", err, extensionTestOutput) - } - extensionTestOutput = strings.TrimSpace(extensionTestOutput) - if extensionTestOutput != fmt.Sprintf("%d", extensionVersion) { - return fmt.Errorf("extension version mismatch: expected %d, got %s", extensionVersion, extensionTestOutput) - } - 
logrus.Tracef("Extension version confirmed, found: '%d'", extensionVersion) - } else { - logrus.Tracef("Checking that extension is not present") - extensionTestCommand := "test-extension.sh" - extensionTestOutput, err := stormssh.SshCommand(vmConfig.VMConfig, vmIP, extensionTestCommand) - if err == nil { - return fmt.Errorf("extension is unexpectedly still available (%w):\n%s", err, extensionTestOutput) - } - } - } if !testConfig.SkipManualRollbacks { - // TODO: Verify that there is 1 available rollback logrus.Tracef("Checking number of available rollbacks, expecting '%d'", expectedAvailableRollbacks) - availableRollbacksOutput, err := stormssh.SshCommand(vmConfig.VMConfig, vmIP, "sudo trident rollback --show-available") + availableRollbacksOutput, err := stormssh.SshCommand(vmConfig.VMConfig, vmIP, "sudo trident rollback --show chain") if err != nil { - return fmt.Errorf("failed to get available rollbacks from VM: %v", err) + return fmt.Errorf("'rollback --show chain' failed to from VM: %v", err) } - logrus.Tracef("Reported available rollbacks:\n%s", availableRollbacksOutput) + logrus.Tracef("Reported 'rollback --show chain':\n%s", availableRollbacksOutput) var availableRollbacks []map[string]interface{} err = json.Unmarshal([]byte(strings.TrimSpace(availableRollbacksOutput)), &availableRollbacks) @@ -306,10 +275,100 @@ func validateOs( return fmt.Errorf("first available rollback requiresReboot mismatch: expected %v, got %v", expectedFirstRollbackNeedsReboot, needsReboot) } } + + rollbackShowValidationOutput, err := stormssh.SshCommand(vmConfig.VMConfig, vmIP, "sudo trident rollback --show validation") + if err != nil { + return fmt.Errorf("'rollback --show validation' failed to from VM: %v", err) + } + logrus.Tracef("Reported 'rollback --show validation':\n%s", rollbackShowValidationOutput) + if expectedAvailableRollbacks > 0 { + if expectedFirstRollbackNeedsReboot { + if strings.TrimSpace(rollbackShowValidationOutput) != "ab" { + return fmt.Errorf("expected 'ab' 
from 'rollback --show validation', got: %s", rollbackShowValidationOutput) + } + } else { + if strings.TrimSpace(rollbackShowValidationOutput) != "runtime" { + return fmt.Errorf("expected 'runtime' from 'rollback --show validation', got: %s", rollbackShowValidationOutput) + } + } + } else { + if strings.TrimSpace(rollbackShowValidationOutput) != "none" { + return fmt.Errorf("expected 'none' from 'rollback --show validation', got: %s", rollbackShowValidationOutput) + } + } + + rollbackShowTargetOutput, err := stormssh.SshCommand(vmConfig.VMConfig, vmIP, "sudo trident rollback --show target") + if err != nil { + return fmt.Errorf("'rollback --show target' failed to from VM: %v", err) + } + logrus.Tracef("Reported 'rollback --show target':\n%s", rollbackShowTargetOutput) + if expectedAvailableRollbacks > 0 { + if expectedFirstRollbackNeedsReboot { + if strings.TrimSpace(rollbackShowTargetOutput) == "{}" { + return fmt.Errorf("expected Host Configuration from 'rollback --show target', got: %s", rollbackShowTargetOutput) + } + } + } else { + if strings.TrimSpace(rollbackShowTargetOutput) != "{}" { + return fmt.Errorf("expected '{}' from 'rollback --show target', got: %s", rollbackShowTargetOutput) + } + } } return nil } +func validateExtension( + testConfig stormrollbackconfig.TestConfig, + vmConfig stormvmconfig.AllVMConfig, + vmIP string, + extensionVersion int, +) error { + if !testConfig.SkipExtensionTesting { + if extensionVersion > 0 { + logrus.Tracef("Checking extension version, expected: '%d'", extensionVersion) + extensionTestCommand := "test-extension.sh" + extensionTestOutput, err := stormssh.SshCommand(vmConfig.VMConfig, vmIP, extensionTestCommand) + if err != nil { + return fmt.Errorf("failed to check extension on VM (%w):\n%s", err, extensionTestOutput) + } + extensionTestOutput = strings.TrimSpace(extensionTestOutput) + if extensionTestOutput != fmt.Sprintf("%d", extensionVersion) { + return fmt.Errorf("extension version mismatch: expected %d, got %s", 
extensionVersion, extensionTestOutput) + } + logrus.Tracef("Extension version confirmed, found: '%d'", extensionVersion) + } else { + logrus.Tracef("Checking that extension is not present") + extensionTestCommand := "test-extension.sh" + extensionTestOutput, err := stormssh.SshCommand(vmConfig.VMConfig, vmIP, extensionTestCommand) + if err == nil { + return fmt.Errorf("extension is unexpectedly still available (%w):\n%s", err, extensionTestOutput) + } + } + } + return nil +} + +func validateOs( + testConfig stormrollbackconfig.TestConfig, + vmConfig stormvmconfig.AllVMConfig, + vmIP string, + extensionVersion int, + expectedVolume string, + expectedAvailableRollbacks int, + expectedFirstRollbackNeedsReboot bool, +) error { + // Verify active volume is as expected + logrus.Tracef("Checking active volume, expecting '%s'", expectedVolume) + checkActiveVolumeErr := stormtridentactivevolume.CheckActiveVolume(vmConfig.VMConfig, vmIP, expectedVolume) + if checkActiveVolumeErr != nil { + return fmt.Errorf("failed to validate active volume: %w", checkActiveVolumeErr) + } + if err := validateExtension(testConfig, vmConfig, vmIP, extensionVersion); err != nil { + return fmt.Errorf("failed to validate extension: %w", err) + } + return validateRollbacksAvailable(testConfig, vmConfig, vmIP, expectedAvailableRollbacks, expectedFirstRollbackNeedsReboot) +} + func doUpdateTest( testConfig stormrollbackconfig.TestConfig, vmConfig stormvmconfig.AllVMConfig, diff --git a/tools/storm/scripts/build_extension_images/build_extension_images.go b/tools/storm/scripts/build_extension_images/build_extension_images.go index 94c02ed35..91d7d2ea8 100644 --- a/tools/storm/scripts/build_extension_images/build_extension_images.go +++ b/tools/storm/scripts/build_extension_images/build_extension_images.go @@ -98,7 +98,7 @@ func buildImage(extType string, numClones int) error { 0777, ) if err != nil { - return fmt.Errorf("failed to write %s extension-release file %s: %w", extType, extensionReleaseFile, 
err) + return fmt.Errorf("failed to write %s extension script file %s: %w", extType, extensionReleaseFile, err) } }