Skip to content

[34/n] sled-agent logic to clear mupdate overrides #8572

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 15 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions dev-tools/reconfigurator-cli/tests/output/cmds-example-stdout
Original file line number Diff line number Diff line change
Expand Up @@ -1189,6 +1189,7 @@ LEDGERED SLED CONFIG
slot A details UNAVAILABLE: constructed via debug_assume_success()
slot B details UNAVAILABLE: constructed via debug_assume_success()
last reconciled config: matches ledgered config
no mupdate override to clear
no orphaned datasets
all disks reconciled successfully
all datasets reconciled successfully
Expand Down Expand Up @@ -1296,6 +1297,7 @@ LEDGERED SLED CONFIG
slot A details UNAVAILABLE: constructed via debug_assume_success()
slot B details UNAVAILABLE: constructed via debug_assume_success()
last reconciled config: matches ledgered config
no mupdate override to clear
no orphaned datasets
all disks reconciled successfully
all datasets reconciled successfully
Expand Down Expand Up @@ -1496,6 +1498,7 @@ LEDGERED SLED CONFIG
slot A details UNAVAILABLE: constructed via debug_assume_success()
slot B details UNAVAILABLE: constructed via debug_assume_success()
last reconciled config: matches ledgered config
no mupdate override to clear
no orphaned datasets
all disks reconciled successfully
all datasets reconciled successfully
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ LEDGERED SLED CONFIG
slot A details UNAVAILABLE: constructed via debug_assume_success()
slot B details UNAVAILABLE: constructed via debug_assume_success()
last reconciled config: matches ledgered config
error reading mupdate override, so sled agent didn't attempt to clear it
no orphaned datasets
all disks reconciled successfully
all datasets reconciled successfully
Expand Down Expand Up @@ -288,6 +289,7 @@ LEDGERED SLED CONFIG
slot A details UNAVAILABLE: constructed via debug_assume_success()
slot B details UNAVAILABLE: constructed via debug_assume_success()
last reconciled config: matches ledgered config
mupdate override present, but sled agent was not instructed to clear it
no orphaned datasets
all disks reconciled successfully
all datasets reconciled successfully
Expand Down Expand Up @@ -383,6 +385,7 @@ LEDGERED SLED CONFIG
slot A details UNAVAILABLE: constructed via debug_assume_success()
slot B details UNAVAILABLE: constructed via debug_assume_success()
last reconciled config: matches ledgered config
mupdate override present, but sled agent was not instructed to clear it
no orphaned datasets
all disks reconciled successfully
all datasets reconciled successfully
Expand Down
47 changes: 47 additions & 0 deletions nexus-sled-agent-shared/src/inventory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,10 @@ pub struct ConfigReconcilerInventory {
pub orphaned_datasets: IdOrdMap<OrphanedDataset>,
pub zones: BTreeMap<OmicronZoneUuid, ConfigReconcilerInventoryResult>,
pub boot_partitions: BootPartitionContents,
/// The result of clearing the mupdate override field.
///
/// `None` if `remove_mupdate_override` was not provided in the sled config.
pub clear_mupdate_override: Option<ClearMupdateOverrideInventory>,
}

impl ConfigReconcilerInventory {
Expand Down Expand Up @@ -200,6 +204,17 @@ impl ConfigReconcilerInventory {
.iter()
.map(|z| (z.id, ConfigReconcilerInventoryResult::Ok))
.collect();
let clear_mupdate_override = config.remove_mupdate_override.map(|_| {
ClearMupdateOverrideInventory {
boot_disk_result: Ok(
ClearMupdateOverrideBootSuccessInventory::Cleared,
),
non_boot_message: "mupdate override successfully cleared \
on non-boot disks"
.to_owned(),
}
});

Self {
last_reconciled_config: config,
external_disks,
Expand All @@ -216,6 +231,7 @@ impl ConfigReconcilerInventory {
slot_b: Err(err),
}
},
clear_mupdate_override,
}
}
}
Expand Down Expand Up @@ -277,6 +293,37 @@ impl IdOrdItem for OrphanedDataset {
id_upcast!();
}

/// Status of clearing the mupdate override in the inventory.
///
/// The boot disk outcome is reported as a structured result, while the
/// outcome on non-boot disks is reported only as a free-form string.
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)]
pub struct ClearMupdateOverrideInventory {
/// The result of clearing the mupdate override on the boot disk.
///
/// `Ok` carries which successful outcome occurred; `Err` carries the
/// failure as a string.
#[serde(with = "snake_case_result")]
#[schemars(
schema_with = "SnakeCaseResult::<ClearMupdateOverrideBootSuccessInventory, String>::json_schema"
)]
pub boot_disk_result:
Result<ClearMupdateOverrideBootSuccessInventory, String>,

/// What happened on non-boot disks.
///
/// This is deliberately not modeled in more detail: we plan to stop trying
/// to keep ledgered data in sync across both disks in the future.
pub non_boot_message: String,
}

/// Successful outcomes of clearing the mupdate override on the boot disk.
///
/// This is the `Ok` side of
/// [`ClearMupdateOverrideInventory::boot_disk_result`]. Variants are
/// serialized in snake_case, per the `rename_all` attribute below.
#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum ClearMupdateOverrideBootSuccessInventory {
/// The mupdate override was successfully cleared.
Cleared,

/// No mupdate override was found.
///
/// This is considered a success for idempotency reasons.
NoOverride,
}

#[derive(Clone, Debug, PartialEq, Eq, Deserialize, JsonSchema, Serialize)]
#[serde(tag = "result", rename_all = "snake_case")]
pub enum ConfigReconcilerInventoryResult {
Expand Down
104 changes: 104 additions & 0 deletions nexus/db-model/src/inventory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,8 @@ use nexus_db_schema::schema::{
};
use nexus_sled_agent_shared::inventory::BootImageHeader;
use nexus_sled_agent_shared::inventory::BootPartitionDetails;
use nexus_sled_agent_shared::inventory::ClearMupdateOverrideBootSuccessInventory;
use nexus_sled_agent_shared::inventory::ClearMupdateOverrideInventory;
use nexus_sled_agent_shared::inventory::ConfigReconcilerInventoryStatus;
use nexus_sled_agent_shared::inventory::HostPhase2DesiredContents;
use nexus_sled_agent_shared::inventory::HostPhase2DesiredSlots;
Expand Down Expand Up @@ -998,6 +1000,8 @@ pub struct InvSledConfigReconciler {
boot_disk_error: Option<String>,
pub boot_partition_a_error: Option<String>,
pub boot_partition_b_error: Option<String>,
#[diesel(embed)]
pub clear_mupdate_override: InvClearMupdateOverride,
}

impl InvSledConfigReconciler {
Expand All @@ -1008,6 +1012,7 @@ impl InvSledConfigReconciler {
boot_disk: Result<M2Slot, String>,
boot_partition_a_error: Option<String>,
boot_partition_b_error: Option<String>,
clear_mupdate_override: InvClearMupdateOverride,
) -> Self {
// TODO-cleanup We should use `HwM2Slot` instead of integers for this
// column: https://github.com/oxidecomputer/omicron/issues/8642
Expand All @@ -1025,6 +1030,7 @@ impl InvSledConfigReconciler {
boot_disk_error,
boot_partition_a_error,
boot_partition_b_error,
clear_mupdate_override,
}
}

Expand Down Expand Up @@ -1064,6 +1070,104 @@ impl InvSledConfigReconciler {
}
}

// See [`nexus_sled_agent_shared::inventory::ClearMupdateOverrideBootSuccessInventory`].
//
// NOTE(review): the byte strings below are presumably the labels of the SQL
// enum behind `ClearMupdateOverrideBootSuccessEnum`; note that `no-override`
// is hyphenated, unlike the snake_case serde form of the inventory type.
// Confirm it matches the schema migration exactly.
impl_enum_type!(
ClearMupdateOverrideBootSuccessEnum:

#[derive(Copy, Clone, Debug, AsExpression, FromSqlRow, PartialEq)]
pub enum DbClearMupdateOverrideBootSuccess;

// Enum values
Cleared => b"cleared"
NoOverride => b"no-override"
);

impl From<ClearMupdateOverrideBootSuccessInventory>
for DbClearMupdateOverrideBootSuccess
{
fn from(value: ClearMupdateOverrideBootSuccessInventory) -> Self {
match value {
ClearMupdateOverrideBootSuccessInventory::Cleared => Self::Cleared,
ClearMupdateOverrideBootSuccessInventory::NoOverride => {
Self::NoOverride
}
}
}
}

impl From<DbClearMupdateOverrideBootSuccess>
for ClearMupdateOverrideBootSuccessInventory
{
fn from(value: DbClearMupdateOverrideBootSuccess) -> Self {
match value {
DbClearMupdateOverrideBootSuccess::Cleared => Self::Cleared,
DbClearMupdateOverrideBootSuccess::NoOverride => Self::NoOverride,
}
}
}

/// See [`nexus_sled_agent_shared::inventory::ClearMupdateOverrideInventory`].
///
/// Stored as three nullable columns of `inv_sled_config_reconciler`.
/// [`Self::into_inventory`] accepts only these combinations:
///
/// * all three `None`: no clear result was recorded;
/// * `boot_success` and `non_boot_message` set, `boot_error` unset: the boot
///   disk result was `Ok`;
/// * `boot_error` and `non_boot_message` set, `boot_success` unset: the boot
///   disk result was `Err`.
#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
#[diesel(table_name = inv_sled_config_reconciler)]
pub struct InvClearMupdateOverride {
/// The successful boot disk outcome, if the boot disk result was `Ok`.
#[diesel(column_name = clear_mupdate_override_boot_success)]
pub boot_success: Option<DbClearMupdateOverrideBootSuccess>,

/// The boot disk error string, if the boot disk result was `Err`.
#[diesel(column_name = clear_mupdate_override_boot_error)]
pub boot_error: Option<String>,

/// The non-boot disk message; set whenever a clear result was recorded.
#[diesel(column_name = clear_mupdate_override_non_boot_message)]
pub non_boot_message: Option<String>,
}

impl InvClearMupdateOverride {
    /// Builds the database columns from an optional inventory entry.
    ///
    /// With `None`, every column is left unset. Otherwise exactly one of
    /// `boot_success` / `boot_error` is filled in from `boot_disk_result`,
    /// and `non_boot_message` is always filled in.
    pub fn new(
        clear_mupdate_override: Option<&ClearMupdateOverrideInventory>,
    ) -> Self {
        match clear_mupdate_override {
            None => Self {
                boot_success: None,
                boot_error: None,
                non_boot_message: None,
            },
            Some(inventory) => {
                // The boot disk result is either a success or an error, so
                // only one of the two columns can be populated.
                let (boot_success, boot_error) =
                    match &inventory.boot_disk_result {
                        Ok(success) => (Some(success.clone().into()), None),
                        Err(message) => (None, Some(message.clone())),
                    };
                Self {
                    boot_success,
                    boot_error,
                    non_boot_message: Some(
                        inventory.non_boot_message.clone(),
                    ),
                }
            }
        }
    }

    /// Converts the database columns back into the inventory type.
    ///
    /// Returns `Ok(None)` when all three columns are unset, and
    /// `Ok(Some(_))` when `non_boot_message` is set together with exactly
    /// one of `boot_success` / `boot_error`.
    ///
    /// # Errors
    ///
    /// Returns an error for any other combination of columns.
    pub fn into_inventory(
        self,
    ) -> anyhow::Result<Option<ClearMupdateOverrideInventory>> {
        let Self { boot_success, boot_error, non_boot_message } = self;

        match (boot_success, boot_error, non_boot_message) {
            (None, None, None) => Ok(None),
            (Some(success), None, Some(non_boot_message)) => {
                Ok(Some(ClearMupdateOverrideInventory {
                    boot_disk_result: Ok(success.into()),
                    non_boot_message,
                }))
            }
            (None, Some(boot_error), Some(non_boot_message)) => {
                Ok(Some(ClearMupdateOverrideInventory {
                    boot_disk_result: Err(boot_error),
                    non_boot_message,
                }))
            }
            (boot_success, boot_error, non_boot_message) => {
                // Reassemble the row so the error reports it in full.
                let this =
                    Self { boot_success, boot_error, non_boot_message };
                Err(anyhow!(
                    "inv_sled_config_reconciler CHECK constraint violated: \
                     clear mupdate override columns are not consistent: {this:?}"
                ))
            }
        }
    }
}

/// See [`nexus_sled_agent_shared::inventory::BootPartitionDetails`].
#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
#[diesel(table_name = inv_sled_boot_partition)]
Expand Down
3 changes: 2 additions & 1 deletion nexus/db-model/src/schema_versions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
///
/// This must be updated when you change the database schema. Refer to
/// schema/crdb/README.adoc in the root of this repository for details.
pub const SCHEMA_VERSION: Version = Version::new(168, 0, 0);
pub const SCHEMA_VERSION: Version = Version::new(169, 0, 0);

/// List of all past database schema versions, in *reverse* order
///
Expand All @@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
// | leaving the first copy as an example for the next person.
// v
// KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
KnownVersion::new(169, "inv-clear-mupdate-override"),
KnownVersion::new(168, "add-inv-host-phase-1-flash-hash"),
KnownVersion::new(167, "add-pending-mgs-updates-rot"),
KnownVersion::new(166, "bundle-user-comment"),
Expand Down
28 changes: 26 additions & 2 deletions nexus/db-queries/src/db/datastore/inventory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ use nexus_db_errors::public_error_from_diesel;
use nexus_db_errors::public_error_from_diesel_lookup;
use nexus_db_model::ArtifactHash;
use nexus_db_model::HwM2Slot;
use nexus_db_model::InvCaboose;
use nexus_db_model::InvClickhouseKeeperMembership;
use nexus_db_model::InvCockroachStatus;
use nexus_db_model::InvCollection;
Expand Down Expand Up @@ -72,6 +71,7 @@ use nexus_db_model::{
};
use nexus_db_model::{HwPowerState, InvZoneManifestNonBoot};
use nexus_db_model::{HwRotSlot, InvMupdateOverrideNonBoot};
use nexus_db_model::{InvCaboose, InvClearMupdateOverride};
use nexus_db_schema::enums::HwM2SlotEnum;
use nexus_db_schema::enums::HwRotSlotEnum;
use nexus_db_schema::enums::RotImageErrorEnum;
Expand Down Expand Up @@ -3683,6 +3683,13 @@ impl DataStore {
BootPartitionContents { boot_disk, slot_a, slot_b }
};

let clear_mupdate_override = reconciler
.clear_mupdate_override
.into_inventory()
.map_err(|err| {
Error::internal_error(&format!("{err:#}"))
})?;

Ok::<_, Error>(ConfigReconcilerInventory {
last_reconciled_config,
external_disks: last_reconciliation_disk_results
Expand All @@ -3699,6 +3706,7 @@ impl DataStore {
.remove(&sled_id)
.unwrap_or_default(),
boot_partitions,
clear_mupdate_override,
})
})
.transpose()?;
Expand Down Expand Up @@ -3920,6 +3928,9 @@ impl ConfigReconcilerRows {
)?
};
last_reconciliation_config_id = Some(last_reconciled_config);
let clear_mupdate_override = InvClearMupdateOverride::new(
last_reconciliation.clear_mupdate_override.as_ref(),
);

self.config_reconcilers.push(InvSledConfigReconciler::new(
collection_id,
Expand All @@ -3938,6 +3949,7 @@ impl ConfigReconcilerRows {
.as_ref()
.err()
.cloned(),
clear_mupdate_override,
));

// Boot partition _errors_ are kept in `InvSledConfigReconciler`
Expand Down Expand Up @@ -4186,10 +4198,13 @@ mod test {
use nexus_inventory::examples::Representative;
use nexus_inventory::examples::representative;
use nexus_inventory::now_db_precision;
use nexus_sled_agent_shared::inventory::BootImageHeader;
use nexus_sled_agent_shared::inventory::BootPartitionContents;
use nexus_sled_agent_shared::inventory::BootPartitionDetails;
use nexus_sled_agent_shared::inventory::OrphanedDataset;
use nexus_sled_agent_shared::inventory::{
BootImageHeader, ClearMupdateOverrideBootSuccessInventory,
ClearMupdateOverrideInventory,
};
use nexus_sled_agent_shared::inventory::{
ConfigReconcilerInventory, ConfigReconcilerInventoryResult,
ConfigReconcilerInventoryStatus, OmicronZoneImageSource,
Expand Down Expand Up @@ -5051,6 +5066,15 @@ mod test {
artifact_size: 456789,
}),
},
clear_mupdate_override: Some(
ClearMupdateOverrideInventory {
boot_disk_result: Ok(
ClearMupdateOverrideBootSuccessInventory::Cleared,
),
non_boot_message: "simulated non-boot message"
.to_owned(),
},
),
}
});

Expand Down
1 change: 1 addition & 0 deletions nexus/db-schema/src/enums.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ define_enums! {
BpZoneDispositionEnum => "bp_zone_disposition",
BpZoneImageSourceEnum => "bp_zone_image_source",
CabooseWhichEnum => "caboose_which",
ClearMupdateOverrideBootSuccessEnum => "clear_mupdate_override_boot_success",
ClickhouseModeEnum => "clickhouse_mode",
DatasetKindEnum => "dataset_kind",
DnsGroupEnum => "dns_group",
Expand Down
4 changes: 4 additions & 0 deletions nexus/db-schema/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1639,6 +1639,10 @@ table! {

boot_partition_a_error -> Nullable<Text>,
boot_partition_b_error -> Nullable<Text>,

clear_mupdate_override_boot_success -> Nullable<crate::enums::ClearMupdateOverrideBootSuccessEnum>,
clear_mupdate_override_boot_error -> Nullable<Text>,
clear_mupdate_override_non_boot_message -> Nullable<Text>,
}
}

Expand Down
Loading
Loading