Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
144 changes: 114 additions & 30 deletions crates/trident/src/engine/rollback.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
use std::{fs, path::PathBuf};
use std::{
fs,
path::{Path, PathBuf},
};

use anyhow::{Context, Error};
use chrono::Utc;
Expand Down Expand Up @@ -47,10 +50,10 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result<BootValidationResult,

let current_servicing_state = datastore.host_status().servicing_state;
let ab_active_volume = match current_servicing_state {
// For *Finalized, use the active volume set in Host Status
ServicingState::AbUpdateFinalized | ServicingState::CleanInstallFinalized => {
datastore.host_status().ab_active_volume
}
// For *Finalized and AbUpdateRollbackFailed, use the active volume set in Host Status
ServicingState::AbUpdateRollbackFailed
| ServicingState::AbUpdateFinalized
| ServicingState::CleanInstallFinalized => datastore.host_status().ab_active_volume,
// For AbUpdateHealthCheckFailed, use the opposite active volume of the one
// set in Host Status
ServicingState::AbUpdateHealthCheckFailed => {
Expand All @@ -73,7 +76,12 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result<BootValidationResult,
ServicingType::AbUpdate
}
ServicingState::CleanInstallFinalized => ServicingType::CleanInstall,
_ => ServicingType::NoActiveServicing,
// For any other state, this function should not have been called
state => {
return Err(TridentError::new(InternalError::UnexpectedServicingState {
state,
}));
}
};

// Create an EngineContext based on the Host Status
Expand Down Expand Up @@ -104,18 +112,82 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result<BootValidationResult,
compare_root_device_paths(current_root_path.clone(), expected_root_path.clone())
.message("Host failed to boot from expected root device")?;

// Check for cases where we are not booting from the expected root. Failures
// here will return early and have updated the host status appropriately.
handle_boot_validation_errors(
datastore,
booted_to_expected_root,
current_servicing_state,
&current_root_path,
&expected_root_path,
)?;

info!("Host successfully booted in updated target OS image");

// Check health of updated OS, failures here will return early and
// ValidBootHealthCheckFailed will invoke a reboot.
match check_health(&ctx, datastore, current_servicing_state, servicing_type)? {
BootValidationResult::ValidBootHealthCheckFailed(err) => {
return Ok(BootValidationResult::ValidBootHealthCheckFailed(err));
}
BootValidationResult::ValidBootProvisioned => {
// continue
}
}

// Complete the commit, update the boot and encryption configurations
match commit_finalized_on_expected_root(&ctx, datastore, current_servicing_state) {
Ok(result) => {
return Ok(result);
}
Err(_err) => {
if servicing_type == ServicingType::AbUpdate {
// Failed to update UEFI boot order or encryption configuration after
// successful boot to expected root for A/B Update, re-set host status
// state to AbUpdateRollbackFailed
error!(
"Re-set host status from {current_servicing_state:?} to AbUpdateRollbackFailed"
);
datastore.with_host_status(|host_status| {
host_status.servicing_state = ServicingState::AbUpdateRollbackFailed;
})?;

return Err(TridentError::new(ServicingError::AbUpdateRebootCheck {
root_device_path: current_root_path.to_string_lossy().to_string(),
expected_device_path: expected_root_path.to_string_lossy().to_string(),
}));
} else if servicing_type == ServicingType::CleanInstall {
// For Clean Install, when the commit fails after booting from the expected
// root, re-set host status state to NotProvisioned
error!("Re-set host status from {current_servicing_state:?} to NotProvisioned");
datastore.with_host_status(|host_status| {
host_status.spec = Default::default();
host_status.servicing_state = ServicingState::NotProvisioned;
})?;

return Err(TridentError::new(ServicingError::CleanInstallRebootCheck {
root_device_path: current_root_path.to_string_lossy().to_string(),
expected_device_path: expected_root_path.to_string_lossy().to_string(),
}));
}
}
};
Ok(BootValidationResult::ValidBootProvisioned)
}

fn handle_boot_validation_errors(
datastore: &mut DataStore,
booted_to_expected_root: bool,
current_servicing_state: ServicingState,
current_root_path: &Path,
expected_root_path: &Path,
) -> Result<(), TridentError> {
match (booted_to_expected_root, current_servicing_state) {
// Success case, nothing to handle here
(true, ServicingState::CleanInstallFinalized)
| (true, ServicingState::AbUpdateFinalized) => {
// For *Finalized states, when booting from the expected
// root, finish the commit process
info!("Host successfully booted from updated target OS image");
return commit_finalized_on_expected_root(
&ctx,
datastore,
current_servicing_state,
servicing_type,
);
// no-op
Ok(())
}
//
// Every case below will return an error.
Expand All @@ -125,10 +197,15 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result<BootValidationResult,
// failed to roll back, i.e. boot from the servicing OS, report error
// and set host status to AbUpdateRollbackFailed
error!("Host failed to rollback into the servicing OS");
return Err(TridentError::new(ServicingError::AbUpdateRebootCheck {
info!("Re-set host status from {current_servicing_state:?} to AbUpdateRollbackFailed");
datastore.with_host_status(|host_status| {
host_status.servicing_state = ServicingState::AbUpdateRollbackFailed;
})?;

Err(TridentError::new(ServicingError::AbUpdateRebootCheck {
root_device_path: current_root_path.to_string_lossy().to_string(),
expected_device_path: expected_root_path.to_string_lossy().to_string(),
}));
}))
}
(false, ServicingState::CleanInstallFinalized) => {
// For Clean Install, when not booting from expected root, re-set
Expand All @@ -139,10 +216,10 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result<BootValidationResult,
host_status.servicing_state = ServicingState::NotProvisioned;
})?;

return Err(TridentError::new(ServicingError::CleanInstallRebootCheck {
Err(TridentError::new(ServicingError::CleanInstallRebootCheck {
root_device_path: current_root_path.to_string_lossy().to_string(),
expected_device_path: expected_root_path.to_string_lossy().to_string(),
}));
}))
}
(true, ServicingState::AbUpdateHealthCheckFailed) => {
// AbUpdateHealthCheckFailed, when booting from expected root (the servicing OS), mark host
Expand All @@ -154,11 +231,11 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result<BootValidationResult,
host_status.servicing_state = ServicingState::Provisioned;
})?;

return Err(TridentError::new(
Err(TridentError::new(
HealthChecksError::AbUpdateHealthCheckCommitCheck {
expected_device_path: current_root_path.to_string_lossy().to_string(),
},
));
))
}
(false, ServicingState::AbUpdateFinalized) => {
// AbUpdateFinalized, when booting from incorrect root (the servicing OS), mark host status
Expand All @@ -170,26 +247,23 @@ pub fn validate_boot(datastore: &mut DataStore) -> Result<BootValidationResult,
host_status.servicing_state = ServicingState::Provisioned;
})?;

return Err(TridentError::new(ServicingError::AbUpdateRebootCheck {
Err(TridentError::new(ServicingError::AbUpdateRebootCheck {
root_device_path: current_root_path.to_string_lossy().to_string(),
expected_device_path: expected_root_path.to_string_lossy().to_string(),
}));
}))
}
(_, state) => {
// No other states should happen, return error
error!("Unexpected status: {current_servicing_state:?}");
return Err(TridentError::new(InternalError::UnexpectedServicingState {
Err(TridentError::new(InternalError::UnexpectedServicingState {
state,
}));
}))
}
}
}

/// Completes the commit for AbUpdateFinalized and CleanInstallFinalized states when
/// the host has booted from the expected root device. This includes running health
/// checks, updating boot order, updating the encryption pcrlock policy if needed, and
/// updating the Host Status.
fn commit_finalized_on_expected_root(
/// Runs health checks on target OS
fn check_health(
ctx: &EngineContext,
datastore: &mut DataStore,
current_servicing_state: ServicingState,
Expand All @@ -206,7 +280,17 @@ fn commit_finalized_on_expected_root(
return Err(err);
}
}
Ok(BootValidationResult::ValidBootProvisioned)
}

/// Completes the commit for AbUpdateFinalized and CleanInstallFinalized states when
/// the host has booted from the expected root device. This includes updating boot order,
/// updating the encryption pcrlock policy if needed, and updating the Host Status.
fn commit_finalized_on_expected_root(
ctx: &EngineContext,
datastore: &mut DataStore,
current_servicing_state: ServicingState,
) -> Result<BootValidationResult, TridentError> {
// If it's virtdeploy, after confirming that we have booted into the correct image, we need
// to update the `BootOrder` to boot from the correct image next time.
let use_virtdeploy_workaround = virt::is_virtdeploy()
Expand Down
1 change: 1 addition & 0 deletions crates/trident/src/engine/storage/rebuild.rs
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ pub(crate) fn validate_rebuild_raid(
ServicingState::Provisioned
| ServicingState::AbUpdateStaged
| ServicingState::AbUpdateFinalized
| ServicingState::AbUpdateRollbackFailed
| ServicingState::AbUpdateHealthCheckFailed => {}
}

Expand Down
3 changes: 3 additions & 0 deletions crates/trident_api/src/status.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,9 @@ pub enum ServicingState {
/// A/B update has been finalized. For the next boot, the firmware will boot from the updated
/// target OS image.
AbUpdateFinalized,
/// A/B update has failed, and Trident initiated an auto-rollback, but the host failed to
/// roll back to the servicing OS.
AbUpdateRollbackFailed,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note: this will become an official API name, it should be reviewed/approved

/// Servicing has been completed, and the host successfully booted from the updated target OS
/// image. Trident is ready to begin a new servicing.
Provisioned,
Expand Down
105 changes: 51 additions & 54 deletions docs/Explanation/Health-Checks.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,65 +75,62 @@ Health checks are run during `trident commit` after a `trident install` or
`trident update` has been staged and finalized. You can see how `health checks`
fit into the overall servicing flow in these diagrams:

### Clean Install with Health Checks

```mermaid
---
config:
theme: redux
---
flowchart TD
A(["Clean Install (to A)"])
style A color:#085
A --> B["CleanInstallStaged"]
B --> C["CleanInstallFinalized<br/>(reboot)"]
C --> D{"Commit<br/>(unknown OS)"}
D --booted in A--> XX("in target OS (A)")
style XX color:#085
XX --health checks<br/>succeeded--> F["Provisioned (A)<br/>no errors"]
style F color:#085
D --did NOT boot in A--> YY("in unexpected OS")
style YY color:#822
YY --> E["NotProvisioned with last_error set"]
style E color:#822
XX --health check<br/>failed--> E
XX --commit failure--> E
```

### A/B Update with Health Checks

```mermaid
---
config:
theme: redux
layout: dagre
theme: redux
---
flowchart TD
A["NotProvisioned"] ==> B{"trident install"}
B ==> C["CleanInstallStaged"]
C ==> D["CleanInstallFinalized"]
D === G(["Finalize Reboot"])
G ==> E{"trident commit **A**"}
E == Commit succeeded ==> F["Provisioned **A**"]
E -.- Z(["Health Check Failure"])
Z -.-> A
AA["Provisioned **A**"] ==> BB{"trident update"}
BB ==> CC["AbUpdateStaged"]
CC ==> DD["AbUpdateFinalized"]
DD === JJ(["Finalize Reboot"])
JJ ==> EE{"trident commit **B**"}
EE == Commit succeeded ==> FF["Provisioned **B**"]
EE -.- ZZ(["Health Check failure"])
ZZ -.-> HH["AbUpdateHealthCheckFailed"]
HH -.- KK(["Rollback Reboot"])
KK -.-> II{"trident commit **A**"}
II -. Commit succeeded .-> AA
style A fill:#FFF9C4
style C fill:#FFF9C4
style D fill:#FFF9C4
style G fill:#BBDEFB
style F fill:#00C853
style Z fill:#FFCDD2
style AA fill:#FFF9C4
style CC fill:#FFF9C4
style DD fill:#FFF9C4
style JJ fill:#BBDEFB
style FF fill:#00C853
style ZZ fill:#FFCDD2
style HH fill:#FFF9C4
style KK fill:#BBDEFB
linkStyle 0 stroke:#00C853,fill:none
linkStyle 1 stroke:#00C853,fill:none
linkStyle 2 stroke:#00C853,fill:none
linkStyle 3 stroke:#00C853,fill:none
linkStyle 4 stroke:#00C853,fill:none
linkStyle 5 stroke:#00C853,fill:none
linkStyle 6 stroke:#D50000,fill:none
linkStyle 7 stroke:#D50000,fill:none
linkStyle 8 stroke:#00C853,fill:none
linkStyle 9 stroke:#00C853,fill:none
linkStyle 10 stroke:#00C853,fill:none
linkStyle 11 stroke:#00C853,fill:none
linkStyle 12 stroke:#00C853,fill:none
linkStyle 13 stroke:#00C853,fill:none
linkStyle 14 stroke:#D50000,fill:none
linkStyle 15 stroke:#D50000,fill:none
linkStyle 16 stroke:#D50000,fill:none
linkStyle 17 stroke:#D50000,fill:none
linkStyle 18 stroke:#D50000,fill:none
AA["Provisioned (A)"]
style AA color:#085
AA --> A(["A/B Update<br/>from servicing OS A<br/>to target OS B"])
style A color:#085
A --> B["AbUpdateStaged"]
B --> C["AbUpdateFinalized<br/>(reboot)"]
C --> D{"Commit<br/>(unknown OS)"}
D --booted in B--> XX("in target OS (B)")
style XX color:#085
XX --health checks<br/>succeeded--> F["Provisioned (B)<br/>no errors"]
style F color:#085
XX --commit infra failure<br/>last_error set --> Z["AbUpdateRollbackFailed (B)"]
style Z color:#822
XX --health checks<br/>failed--> G["AbUpdateHealthCheckFailed"]
style G color:#822
D --booted in A--> YY("in servicing OS (A)")
style YY color:#822
G --> GG["Auto-rollback<br/>(reboot)"]
GG --> H{"Commit<br/>(unknown OS)"}
H --failed to rollback<br/>in target OS (B)--> Z
H --rolled back<br/>servicing OS (A)--> YY
YY --> J["Provisioned (A)<br/>with last_error set"]
style J color:#822
```

## Health Check failures
Expand Down
Loading
Loading