diff --git a/crates/trident/src/engine/rollback.rs b/crates/trident/src/engine/rollback.rs index f46593de8..ca2c3553e 100644 --- a/crates/trident/src/engine/rollback.rs +++ b/crates/trident/src/engine/rollback.rs @@ -1,4 +1,7 @@ -use std::{fs, path::PathBuf}; +use std::{ + fs, + path::{Path, PathBuf}, +}; use anyhow::{Context, Error}; use chrono::Utc; @@ -47,10 +50,10 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result { - datastore.host_status().ab_active_volume - } + // For *Finalized and AbUpdateRollbackFailed, use the active volume set in Host Status + ServicingState::AbUpdateRollbackFailed + | ServicingState::AbUpdateFinalized + | ServicingState::CleanInstallFinalized => datastore.host_status().ab_active_volume, // For AbUpdateHealthCheckFailed, use the opposite active volume of the one // set in Host Status ServicingState::AbUpdateHealthCheckFailed => { @@ -73,7 +76,12 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result ServicingType::CleanInstall, - _ => ServicingType::NoActiveServicing, + // For any other state, this function should not have been called + state => { + return Err(TridentError::new(InternalError::UnexpectedServicingState { + state, + })); + } }; // Create an EngineContext based on the Host Status @@ -104,18 +112,82 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result { + return Ok(BootValidationResult::ValidBootHealthCheckFailed(err)); + } + BootValidationResult::ValidBootProvisioned => { + // continue + } + } + + // Complete the commit, update the boot and encryption configurations + match commit_finalized_on_expected_root(&ctx, datastore, current_servicing_state) { + Ok(result) => { + return Ok(result); + } + Err(_err) => { + if servicing_type == ServicingType::AbUpdate { + // Failed to update UEFI boot order or encryption configuration after + // successful boot to expected root for A/B Update, re-set host status + // state to AbUpdateRollbackFailed + error!( + "Re-set host status from {current_servicing_state:?} 
to AbUpdateRollbackFailed" + ); + datastore.with_host_status(|host_status| { + host_status.servicing_state = ServicingState::AbUpdateRollbackFailed; + })?; + + return Err(TridentError::new(ServicingError::AbUpdateRebootCheck { + root_device_path: current_root_path.to_string_lossy().to_string(), + expected_device_path: expected_root_path.to_string_lossy().to_string(), + })); + } else if servicing_type == ServicingType::CleanInstall { + // For Clean Install, when not booting from expected root, re-set + // host status state to NotProvisioned + error!("Re-set host status from {current_servicing_state:?} to NotProvisioned"); + datastore.with_host_status(|host_status| { + host_status.spec = Default::default(); + host_status.servicing_state = ServicingState::NotProvisioned; + })?; + + return Err(TridentError::new(ServicingError::CleanInstallRebootCheck { + root_device_path: current_root_path.to_string_lossy().to_string(), + expected_device_path: expected_root_path.to_string_lossy().to_string(), + })); + } + } + }; + Ok(BootValidationResult::ValidBootProvisioned) +} + +fn handle_boot_validation_errors( + datastore: &mut DataStore, + booted_to_expected_root: bool, + current_servicing_state: ServicingState, + current_root_path: &Path, + expected_root_path: &Path, +) -> Result<(), TridentError> { match (booted_to_expected_root, current_servicing_state) { + // Success case, nothing to handle here (true, ServicingState::CleanInstallFinalized) | (true, ServicingState::AbUpdateFinalized) => { - // For *Finalized states, when booting from the expected - // root, finish the commit process - info!("Host successfully booted from updated target OS image"); - return commit_finalized_on_expected_root( - &ctx, - datastore, - current_servicing_state, - servicing_type, - ); + // no-op + Ok(()) } // // Every case below will return an error. 
@@ -125,10 +197,15 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result { // For Clean Install, when not booting from expected root, re-set @@ -139,10 +216,10 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result { // AbUpdateHealthCheckFailed, when booting from expected root (the servicing OS), mark host @@ -154,11 +231,11 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result { // AbUpdateFinalize, when booting from incorrect root (the servicing OS), mark host status @@ -170,26 +247,23 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result { // No other states should happen, return error error!("Unexpected status: {current_servicing_state:?}"); - return Err(TridentError::new(InternalError::UnexpectedServicingState { + Err(TridentError::new(InternalError::UnexpectedServicingState { state, - })); + })) } } } -/// Completes the commit for AbUpdateFinalized and CleanInstallFinalized states when -/// the host has booted from the expected root device. This includes running health -/// checks, updating boot order, updating the encryption pcrlock policy if needed, and -/// updating the Host Status. -fn commit_finalized_on_expected_root( +/// Runs health checks on target OS +fn check_health( ctx: &EngineContext, datastore: &mut DataStore, current_servicing_state: ServicingState, @@ -206,7 +280,17 @@ fn commit_finalized_on_expected_root( return Err(err); } } + Ok(BootValidationResult::ValidBootProvisioned) +} +/// Completes the commit for AbUpdateFinalized and CleanInstallFinalized states when +/// the host has booted from the expected root device. This includes updating boot order, +/// updating the encryption pcrlock policy if needed, and updating the Host Status. 
+fn commit_finalized_on_expected_root( + ctx: &EngineContext, + datastore: &mut DataStore, + current_servicing_state: ServicingState, +) -> Result { // If it's virtdeploy, after confirming that we have booted into the correct image, we need // to update the `BootOrder` to boot from the correct image next time. let use_virtdeploy_workaround = virt::is_virtdeploy() diff --git a/crates/trident/src/engine/storage/rebuild.rs b/crates/trident/src/engine/storage/rebuild.rs index 89d492a64..716e09ee6 100755 --- a/crates/trident/src/engine/storage/rebuild.rs +++ b/crates/trident/src/engine/storage/rebuild.rs @@ -185,6 +185,7 @@ pub(crate) fn validate_rebuild_raid( ServicingState::Provisioned | ServicingState::AbUpdateStaged | ServicingState::AbUpdateFinalized + | ServicingState::AbUpdateRollbackFailed | ServicingState::AbUpdateHealthCheckFailed => {} } diff --git a/crates/trident_api/src/status.rs b/crates/trident_api/src/status.rs index ade396e64..3cd3fece0 100644 --- a/crates/trident_api/src/status.rs +++ b/crates/trident_api/src/status.rs @@ -99,6 +99,9 @@ pub enum ServicingState { /// A/B update has been finalized. For the next boot, the firmware will boot from the updated /// target OS image. AbUpdateFinalized, + /// A/B update has failed, and Trident initiated an auto-rollback, but the host failed to + /// successfully rollback to the servicing OS. + AbUpdateRollbackFailed, /// Servicing has been completed, and the host successfully booted from the updated target OS /// image. Trident is ready to begin a new servicing. Provisioned, diff --git a/docs/Explanation/Health-Checks.md b/docs/Explanation/Health-Checks.md index 2c66ab1c3..de758e397 100644 --- a/docs/Explanation/Health-Checks.md +++ b/docs/Explanation/Health-Checks.md @@ -75,65 +75,62 @@ Health checks are run during `trident commit` after a `trident install` or `trident update` have staged and finalized. 
You can see how `health checks` fit into the overall servicing flow in these diagrams: +### Clean Install with Health Checks + +```mermaid +--- +config: + theme: redux +--- +flowchart TD + A(["Clean Install (to A)"]) + style A color:#085 + A --> B["CleanInstallStaged"] + B --> C["CleanInstallFinalized
(reboot)"] + C --> D{"Commit
(unknown OS)"} + D --booted in A--> XX("in target OS (A)") + style XX color:#085 + XX --health checks
succeeded--> F["Provisioned (A)
no errors"] + style F color:#085 + D --did NOT boot in A--> YY("in unexpected OS") + style YY color:#822 + YY --> E["NotProvisioned with last_error set"] + style E color:#822 + XX --health check
failed--> E + XX --commit failure--> E +``` + +### A/B Update with Health Checks + ```mermaid --- config: - theme: redux - layout: dagre + theme: redux --- flowchart TD - A["NotProvisioned"] ==> B{"trident install"} - B ==> C["CleanInstallStaged"] - C ==> D["CleanInstallFinalized"] - D === G(["Finalize Reboot"]) - G ==> E{"trident commit **A**"} - E == Commit succeeded ==> F["Provisioned **A**"] - E -.- Z(["Health Check Failure"]) - Z -.-> A - AA["Provisioned **A**"] ==> BB{"trident update"} - BB ==> CC["AbUpdateStaged"] - CC ==> DD["AbUpdateFinalized"] - DD === JJ(["Finalize Reboot"]) - JJ ==> EE{"trident commit **B**"} - EE == Commit succeeded ==> FF["Provisioned **B**"] - EE -.- ZZ(["Health Check failure"]) - ZZ -.-> HH["AbUpdateHealthCheckFailed"] - HH -.- KK(["Rollback Reboot"]) - KK -.-> II{"trident commit **A**"} - II -. Commit succeeded .-> AA - style A fill:#FFF9C4 - style C fill:#FFF9C4 - style D fill:#FFF9C4 - style G fill:#BBDEFB - style F fill:#00C853 - style Z fill:#FFCDD2 - style AA fill:#FFF9C4 - style CC fill:#FFF9C4 - style DD fill:#FFF9C4 - style JJ fill:#BBDEFB - style FF fill:#00C853 - style ZZ fill:#FFCDD2 - style HH fill:#FFF9C4 - style KK fill:#BBDEFB - linkStyle 0 stroke:#00C853,fill:none - linkStyle 1 stroke:#00C853,fill:none - linkStyle 2 stroke:#00C853,fill:none - linkStyle 3 stroke:#00C853,fill:none - linkStyle 4 stroke:#00C853,fill:none - linkStyle 5 stroke:#00C853,fill:none - linkStyle 6 stroke:#D50000,fill:none - linkStyle 7 stroke:#D50000,fill:none - linkStyle 8 stroke:#00C853,fill:none - linkStyle 9 stroke:#00C853,fill:none - linkStyle 10 stroke:#00C853,fill:none - linkStyle 11 stroke:#00C853,fill:none - linkStyle 12 stroke:#00C853,fill:none - linkStyle 13 stroke:#00C853,fill:none - linkStyle 14 stroke:#D50000,fill:none - linkStyle 15 stroke:#D50000,fill:none - linkStyle 16 stroke:#D50000,fill:none - linkStyle 17 stroke:#D50000,fill:none - linkStyle 18 stroke:#D50000,fill:none + AA["Provisioned (A)"] + style AA color:#085 + AA --> 
A(["A/B Update
from servicing OS A
to target OS B"]) + style A color:#085 + A --> B["AbUpdateStaged"] + B --> C["AbUpdateFinalized
(reboot)"] + C --> D{"Commit
(unknown OS)"} + D --booted in B--> XX("in target OS (B)") + style XX color:#085 + XX --health checks
succeeded--> F["Provisioned (B)
no errors"] + style F color:#085 + XX --commit infra failure
last_error set --> Z["AbUpdateRollbackFailed (B)"] + style Z color:#822 + XX --health checks
failed--> G["AbUpdateHealthCheckFailed"] + style G color:#822 + D --booted in A--> YY("in servicing OS (A)") + style YY color:#822 + G --> GG["Auto-rollback
(reboot)"] + GG --> H{"Commit
(unknown OS)"} + H --failed to rollback
in target OS (B)--> Z + H --rolled back
servicing OS (A)--> YY + YY --> J["Provisioned (A)
with last_error set"] + style J color:#822 ``` ## Health Check failures diff --git a/docs/Explanation/Trident-States.md b/docs/Explanation/Trident-States.md new file mode 100644 index 000000000..840acc949 --- /dev/null +++ b/docs/Explanation/Trident-States.md @@ -0,0 +1,169 @@ +# Trident State Machine + +## Servicing Type and Servicing State + +To track the progress of clean install or A/B upgrade and enable decoupling of +`stage` from `finalize`, Trident uses **TWO** objects: + +1. **Servicing type**: `ServicingType` describes the type of changes required +based on Host Status and Host Configuration. This object has the following values: + + - `AbUpdate`: Update that requires switching to a different root partition + and rebooting. + - `CleanInstall`: Clean install of the target OS image when the host is + booted from the servicing OS. + - `NoActiveServicing`: No servicing is currently in progress. + +2. **Servicing state**: `ServicingState` describes the current state of the +servicing done by Trident. The host will transition through a different +sequence of servicing states, depending on the servicing type that Trident is +executing. This object has the following values: + + - `NotProvisioned`: The host is running from the servicing OS and has + not yet been provisioned by Trident. + - `CleanInstallStaged`: Clean install has been staged, i.e., the initial + target OS images have been deployed onto block devices. + - `AbUpdateStaged`: A/B update has been staged. The target OS images + have been deployed onto block devices. + - `CleanInstallFinalized`: Clean install has been finalized, i.e., UEFI + variables have been set, so that firmware boots from the target OS image + after reboot. + - `AbUpdateFinalized`: A/B update has been finalized. For the next boot, the + firmware will boot from the updated target OS image. + - `AbUpdateHealthCheckFailed`: After A/B update has booted into the target OS, + user-specified health check(s) are run. 
Should any of them fail, the machine + will enter this state and will boot into the servicing OS. + - `AbUpdateRollbackFailed`: If A/B update fails, the machine should boot + from the servicing OS. If Trident is unable to successfully rollback to the + servicing OS, it will enter this state. + - `Provisioned`: Servicing has been completed, and the host successfully + booted from the updated target OS image. Trident is ready to begin a new + servicing. + +## State Diagrams + +The state diagrams below illustrate how `servicingState` of the host will +change in Host Status, depending on Host Configuration and the value(s) +provided in the `--allowed-operations` option: + +### Clean Install State Diagram + +```mermaid +--- +config: + theme: redux +--- +graph TD + A[not-provisioned] --> |'stage'
Valid HC received|B[not-provisioned] + B --> |Staging failed|A + B --> |Staging succeeded|C[clean-install-staged] + C --> |'finalize'
Finalizing succeeded|E[clean-install-finalized] + C --> |'finalize'
Finalizing failed|A + C --> |'stage'
Updated HC received|B + E --> |Successfully booted from
target OS image
and health checks succeeded|G[provisioned] + E --> |Successfully booted from
target OS image
but health checks failed|A + E --> |Failed to boot from
target OS image|A + + %% Adjust node styles dynamically for content fitting + style A white-space:normal,overflow-wrap:break-word,padding:10px + style B white-space:normal,overflow-wrap:break-word,padding:10px + style C white-space:normal,overflow-wrap:break-word,padding:10px + style E white-space:normal,overflow-wrap:break-word,padding:10px + style G white-space:normal,overflow-wrap:break-word,padding:10px + + %% Adjust edge text wrapping and size + linkStyle 0 max-width:500px,white-space:normal,overflow-wrap:break-word + linkStyle 1 max-width:300px,white-space:normal,overflow-wrap:break-word + linkStyle 2 max-width:300px,white-space:normal,overflow-wrap:break-word + linkStyle 3 max-width:300px,white-space:normal,overflow-wrap:break-word + linkStyle 4 max-width:300px,white-space:normal,overflow-wrap:break-word + linkStyle 5 max-width:300px,white-space:normal,overflow-wrap:break-word + linkStyle 6 max-width:500px,white-space:normal,overflow-wrap:break-word + linkStyle 7 max-width:500px,white-space:normal,overflow-wrap:break-word +``` + +### A/B Update State Diagram + +```mermaid +--- +config: + theme: redux +--- +graph TD + A[provisioned] --> |'stage'
Valid HC received|B[provisioned] + B --> |Staging failed|A + B --> |Staging succeeded|C[ab-update-staged] + C --> |'finalize'
Finalizing succeeded|E[ab-update-finalized] + C --> |'finalize'
Finalizing failed|A + C --> |'stage'
Updated HC received|B + E --> |Successfully booted from
updated target OS
and health checks succeeded|A + E --> |Successfully booted from
updated target OS
but health checks failed
and performed a rollback|A + E --> |Failed to boot from
updated target OS
and performed a rollback|A + E --> |Rollback did not succeed|F[ab-update-rollback-failed] + + + style A white-space:normal,overflow-wrap:break-word,padding:10px + style B white-space:normal,overflow-wrap:break-word,padding:10px + style C white-space:normal,overflow-wrap:break-word,padding:10px + style E white-space:normal,overflow-wrap:break-word,padding:10px + style F white-space:normal,overflow-wrap:break-word,padding:10px + + %% Adjust edge text wrapping and size + linkStyle 0 max-width:500px,white-space:normal,overflow-wrap:break-word + linkStyle 1 max-width:300px,white-space:normal,overflow-wrap:break-word + linkStyle 2 max-width:300px,white-space:normal,overflow-wrap:break-word + linkStyle 3 max-width:300px,white-space:normal,overflow-wrap:break-word + linkStyle 4 max-width:300px,white-space:normal,overflow-wrap:break-word + linkStyle 5 max-width:300px,white-space:normal,overflow-wrap:break-word + linkStyle 6 max-width:500px,white-space:normal,overflow-wrap:break-word + linkStyle 7 max-width:300px,white-space:normal,overflow-wrap:break-word + linkStyle 8 max-width:500px,white-space:normal,overflow-wrap:break-word + linkStyle 9 max-width:500px,white-space:normal,overflow-wrap:break-word +``` + +## Troubleshooting with Servicing State + +When troubleshooting Trident servicing issues, it is important to check both +the `ServicingState` and `LastError` of the host in the Host Status. The +Host Status can be viewed using the `trident get status` command. + +- If the `ServicingState` is `NotProvisioned`, it indicates that the host has + not yet been successfully provisioned by Trident. If `trident install` was run, + the `LastError` field may provide additional information about any issues + encountered during the install. Use this information to modify your Host + Configuration prior to running `trident install` from a servicing OS + (e.g., a live ISO). 
+ +- If the `ServicingState` is `Provisioned` and there is no `LastError`, it indicates + that the host has been successfully provisioned and is running the target OS + image without any issues. If the `LastError` field contains an error message, it + indicates that there were issues encountered during the servicing process and + the host is booting from an unexpected OS. This could be due to either: + + - The host failed to boot into the target OS after `AbUpdateFinalized`, reflecting + a failure to set the UEFI BootNext variable to the target OS or a failure of UEFI + to recognize/use the BootNext variable. In either case, the host is not booting from + the target OS as expected. Options at this point are: + + - Use `efibootmgr` to ensure that the target OS boot entry is present and + configured for BootNext and boot into the target OS to run `trident commit`. + + - Adjust the Host Configuration based on `LastError` and run `trident update`. + + - The machine booted into the target OS, a health check failed, and the host rolled + back into the servicing OS. In this case, `LastError` should have details regarding + the failed health check(s). Adjust the Host Configuration and run `trident update`. + +- If the `ServicingState` is `AbUpdateRollbackFailed`, it indicates that Trident failed + to successfully handle rollback during an A/B update. This could be due to either: + + - The host booted into the target OS, but failed to prioritize the target OS boot entry + in the UEFI `BootOrder` variable or failed to update the Trident datastore. Use + `LastError` to help understand what to address, whether it be using `efibootmgr` to + configure `BootOrder` or verifying/ensuring that the Trident datastore is accessible. + Once addressed, run `trident commit`. + + - The machine booted into the target OS and a health check failed, but the host failed + to roll back into the servicing OS. 
Use `efibootmgr` to ensure that the servicing OS + boot entry is present and configured as the first entry in the `BootOrder` variable, + boot into the servicing OS, and run `trident commit`.