Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/buildomat/jobs/deploy.sh
Original file line number Diff line number Diff line change
Expand Up @@ -445,6 +445,13 @@ do
ACTUAL_ZPOOL_COUNT=$(pfexec zlogin oxz_switch /opt/oxide/omdb/bin/omdb db zpool list -i | wc -l)
done

# Confirm we can use `omdb` in the switch zone to fetch the `omdb` that's
# shipped in the Nexus zone, and that we can use that fetched `omdb`
# successfully.
pfexec zlogin oxz_switch /opt/oxide/omdb/bin/omdb nexus fetch-omdb /tmp/fetched-omdb
pfexec zlogin oxz_switch /tmp/fetched-omdb db inventory collections list
echo "Confirmed switch zone omdb can fetch a usable omdb from Nexus"

# The bootstrap command creates a disk, so before that: adjust the control plane
# storage buffer to 0 as the virtual hardware only creates 20G pools

Expand Down
54 changes: 54 additions & 0 deletions dev-tools/omdb/src/bin/omdb/nexus.rs
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,15 @@ use slog_error_chain::InlineErrorChain;
use std::collections::BTreeMap;
use std::collections::BTreeSet;
use std::fs::OpenOptions;
use std::os::unix::fs::PermissionsExt;
use std::str::FromStr;
use std::sync::Arc;
use support_bundle_viewer::LocalFileAccess;
use support_bundle_viewer::SupportBundleAccessor;
use tabled::Tabled;
use tabled::settings::Padding;
use tabled::settings::object::Columns;
use tokio::io::AsyncWriteExt;
use tokio::sync::OnceCell;
use update_engine::EventBuffer;
use update_engine::ExecutionStatus;
Expand Down Expand Up @@ -138,6 +140,8 @@ enum NexusCommands {
Blueprints(BlueprintsArgs),
/// interact with clickhouse policy
ClickhousePolicy(ClickhousePolicyArgs),
/// fetch an omdb binary associated with an active Nexus
FetchOmdb(FetchOmdbArgs),
/// print information about pending MGS updates
MgsUpdates,
/// interact with oximeter read policy
Expand Down Expand Up @@ -415,6 +419,12 @@ enum ClickhousePolicyMode {
Both,
}

// Command-line arguments for `omdb nexus fetch-omdb`.
//
// These are deliberately plain `//` comments: the `///` comment on the field
// below is user-facing help text (the subcommand's own `///` comment shows up
// verbatim in the expected usage output), so rewording or adding `///`
// comments here would presumably change what `--help` prints.
//
// `output` is created with `create_new` by `cmd_nexus_fetch_omdb`, so it must
// name a path that does not exist yet.
#[derive(Debug, Args)]
struct FetchOmdbArgs {
/// output path to write the fetched omdb
output: Utf8PathBuf,
}

#[derive(Debug, Args)]
struct OximeterReadPolicyArgs {
#[command(subcommand)]
Expand Down Expand Up @@ -731,6 +741,10 @@ impl NexusArgs {
}
},

NexusCommands::FetchOmdb(args) => {
cmd_nexus_fetch_omdb(&client, args).await
}

NexusCommands::MgsUpdates => cmd_nexus_mgs_updates(&client).await,

NexusCommands::OximeterReadPolicy(OximeterReadPolicyArgs {
Expand Down Expand Up @@ -3639,6 +3653,46 @@ async fn cmd_nexus_clickhouse_policy_get(
Ok(())
}

async fn cmd_nexus_fetch_omdb(
client: &nexus_lockstep_client::Client,
args: &FetchOmdbArgs,
) -> Result<(), anyhow::Error> {
// Create the output file.
let out = tokio::fs::File::create_new(&args.output)
.await
.with_context(|| format!("could not create `{}`", args.output))?;

// Stream the binary from Nexus.
let mut out = tokio::io::BufWriter::new(out);
let body = client.fetch_omdb().await?;
let mut stream = body.into_inner().into_inner();
while let Some(maybe_chunk) = stream.next().await {
let chunk = maybe_chunk.context("failed reading chunk from Nexus")?;
tokio::io::copy(&mut std::io::Cursor::new(chunk), &mut out)
.await
.with_context(|| format!("failed writing to `{}`", args.output))?;
}
out.flush().await.with_context(|| {
format!("failed flushing data written to `{}`", args.output)
})?;

// Make it executable.
let out = out.into_inner();
let mut perms = out
.metadata()
.await
.with_context(|| {
format!("failed to read metadata of new file `{}`", args.output)
})?
.permissions();
perms.set_mode(0o0700);
out.set_permissions(perms).await.with_context(|| {
format!("failed to change permissions of new file `{}`", args.output)
})?;

Ok(())
}

async fn cmd_nexus_mgs_updates(
client: &nexus_lockstep_client::Client,
) -> Result<(), anyhow::Error> {
Expand Down
1 change: 1 addition & 0 deletions dev-tools/omdb/tests/usage_errors.out
Original file line number Diff line number Diff line change
Expand Up @@ -850,6 +850,7 @@ Commands:
background-tasks print information about background tasks
blueprints interact with blueprints
clickhouse-policy interact with clickhouse policy
fetch-omdb fetch an omdb binary associated with an active Nexus
mgs-updates print information about pending MGS updates
oximeter-read-policy interact with oximeter read policy
quiesce view or modify the quiesce status
Expand Down
34 changes: 34 additions & 0 deletions nexus-config/src/nexus_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -880,6 +880,11 @@ pub struct PackageConfig {
/// Authentication-related configuration
pub authn: AuthnConfig,
/// Nexus-side support for `omdb`-based debugging.
///
/// This is only meaningful on real, multi-sled systems where `omdb` is in
/// use from the switch zone.
pub omdb: OmdbConfig,
/// Timeseries database configuration.
#[serde(default)]
pub timeseries_db: TimeseriesDbConfig,
/// Describes how to handle and perform schema changes.
Expand Down Expand Up @@ -1167,6 +1172,8 @@ mod test {
[default_region_allocation_strategy]
type = "random"
seed = 0
[omdb]
bin_path = "/nonexistent/path/to/omdb"
"##,
)
.unwrap();
Expand Down Expand Up @@ -1236,6 +1243,9 @@ mod test {
0,
))),
},
omdb: OmdbConfig {
bin_path: "/nonexistent/path/to/omdb".into(),
},
schema: None,
tunables: Tunables {
max_vpc_ipv4_subnet_prefix: 27,
Expand Down Expand Up @@ -1504,6 +1514,9 @@ mod test {

[default_region_allocation_strategy]
type = "random"

[omdb]
bin_path = "/nonexistent/path/to/omdb"
"##,
)
.unwrap();
Expand Down Expand Up @@ -1551,6 +1564,8 @@ mod test {
subnet.net = "::/56"
[deployment.database]
type = "from_dns"
[omdb]
bin_path = "/nonexistent/path/to/omdb"
"##,
)
.expect_err("expected failure");
Expand Down Expand Up @@ -1608,6 +1623,8 @@ mod test {
subnet.net = "::/56"
[deployment.database]
type = "from_dns"
[omdb]
bin_path = "/nonexistent/path/to/omdb"
"##,
)
.expect_err("Expected failure");
Expand Down Expand Up @@ -1717,3 +1734,20 @@ pub enum RegionAllocationStrategy {
/// Like Random, but ensures that each region is allocated on its own sled.
RandomWithDistinctSleds { seed: Option<u64> },
}

/// Configuration details relevant to supporting `omdb`.
///
/// This is read from the `[omdb]` table of the Nexus configuration file.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct OmdbConfig {
/// Path to the `omdb` binary that is packaged alongside Nexus.
///
/// `omdb` is not typically used from within a Nexus zone, but we ship it
/// alongside Nexus to ensure we always have a version of `omdb` on the
/// system that matches the active version of Nexus.
///
/// This matters during an upgrade: the `omdb` shipped in the switch zone
/// is updated much earlier in the process than the running Nexus zones, so
/// there is a period where the switch zone `omdb` expects the systems it
/// pokes to be running already-updated software. This is particularly
/// problematic for `omdb db ...` when the schema migration to the new
/// version hasn't been applied yet.
///
/// Parsing the configuration does not require this path to exist; the
/// test and example configurations deliberately point at a nonexistent
/// file.
pub bin_path: Utf8PathBuf,
}
5 changes: 5 additions & 0 deletions nexus/examples/config-second.toml
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,11 @@ type = "random_with_distinct_sleds"
# same shuffling order for every region allocation.
# seed = 0

[omdb]
# In production omdb is shipped alongside Nexus in its zone; this doesn't happen
# (and isn't needed) in simulated environments.
bin_path = "/cannot/fetch/omdb/in/simulated/environments"

################################################################################
# INSTRUCTIONS: To run Nexus against an existing stack started with #
# `omicron-dev run-all`, you should only have to modify values in this #
Expand Down
5 changes: 5 additions & 0 deletions nexus/examples/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -165,3 +165,8 @@ type = "random"
# setting `seed` to a fixed value will make dataset selection ordering use the
# same shuffling order for every region allocation.
# seed = 0

[omdb]
# In production omdb is shipped alongside Nexus in its zone; this doesn't happen
# (and isn't needed) in simulated environments.
bin_path = "/cannot/fetch/omdb/in/simulated/environments"
17 changes: 17 additions & 0 deletions nexus/lockstep-api/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,23 @@ pub trait NexusLockstepApi {
Ok(HttpResponseOk(Ping { status: PingStatus::Ok }))
}

// NOTE: the doc comment below is reproduced verbatim as this operation's
// `summary` and `description` in `openapi/nexus-lockstep.json`, so any change
// to its wording needs a matching change to that document.
//
// The endpoint returns a raw `Response<Body>` rather than a typed response;
// the OpenAPI document accordingly describes the response as a schemaless
// `*/*` body.
/// Fetch an `omdb` binary that lives in the zone alongside this Nexus.
///
/// This is only useful as a support tool, and is accessible via `omdb nexus
/// fetch-omdb ...`. During a Reconfigurator-driven upgrade, the `omdb`
/// binary in the switch zone is updated much earlier in the process than
/// Nexus or the database schema, meaning it is often unable to communicate
/// with Nexus or the DB due to mismatched expectation. It can be used to
/// fetch an _older_ `omdb` that matches the running Nexuses and DB schema
/// via this endpoint.
#[endpoint {
method = GET,
path = "/debug/fetch-omdb-binary",
}]
async fn fetch_omdb(
_rqctx: RequestContext<Self::Context>,
) -> Result<Response<Body>, HttpError>;

#[endpoint {
method = POST,
path = "/instances/{instance_id}/migrate",
Expand Down
4 changes: 4 additions & 0 deletions nexus/src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use authn::external::token::HttpAuthnToken;
use camino::Utf8PathBuf;
use chrono::Duration;
use nexus_config::NexusConfig;
use nexus_config::OmdbConfig;
use nexus_config::SchemeName;
use nexus_db_lookup::LookupPath;
use nexus_db_queries::authn::ConsoleSessionWithSiloId;
Expand Down Expand Up @@ -107,6 +108,8 @@ pub struct ServerContext {
pub(crate) external_tls_enabled: bool,
/// tunable settings needed for the console at runtime
pub(crate) console_config: ConsoleConfig,
/// config supporting `omdb` system introspection
pub(crate) omdb_config: OmdbConfig,
}

pub(crate) struct ConsoleConfig {
Expand Down Expand Up @@ -324,6 +327,7 @@ impl ServerContext {
),
static_dir,
},
omdb_config: config.pkg.omdb.clone(),
}))
}
}
Expand Down
34 changes: 34 additions & 0 deletions nexus/src/lockstep_api/http_entrypoints.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use std::collections::BTreeMap;

use dropshot::ApiDescription;
use dropshot::Body;
use dropshot::ErrorStatusCode;
use dropshot::Header;
use dropshot::HttpError;
use dropshot::HttpResponseCreated;
Expand All @@ -21,6 +22,7 @@ use dropshot::RequestContext;
use dropshot::ResultsPage;
use dropshot::TypedBody;
use http::Response;
use http::StatusCode;
use nexus_lockstep_api::*;
use nexus_types::deployment::Blueprint;
use nexus_types::deployment::BlueprintMetadata;
Expand Down Expand Up @@ -57,6 +59,7 @@ use omicron_common::api::external::http_pagination::ScanParams;
use omicron_common::api::external::http_pagination::data_page_params_for;
use omicron_uuid_kinds::*;
use range_requests::PotentialRange;
use slog_error_chain::InlineErrorChain;

use crate::app::support_bundles::SupportBundleQueryType;
use crate::context::ApiContext;
Expand All @@ -74,6 +77,37 @@ enum NexusLockstepApiImpl {}
impl NexusLockstepApi for NexusLockstepApiImpl {
type Context = ApiContext;

/// Serve the `omdb` binary found at the configured `omdb.bin_path` as an
/// `application/octet-stream` response body.
///
/// If the file cannot be opened, responds with a 500 whose message names the
/// path and the underlying I/O error.
async fn fetch_omdb(
    rqctx: RequestContext<Self::Context>,
) -> Result<Response<Body>, HttpError> {
    let path = &rqctx.context().context.omdb_config.bin_path;

    let file = match tokio::fs::File::open(path).await {
        Ok(file) => file,
        Err(io_err) => {
            let message = format!(
                "could not open {path}: {}",
                InlineErrorChain::new(&io_err)
            );
            // `HttpError::for_internal_error()` would hide the details
            // behind a generic client-facing message. The caller here is a
            // support tool that benefits from knowing exactly what failed,
            // so the same text is used for both the external and internal
            // messages.
            return Err(HttpError {
                status_code: ErrorStatusCode::INTERNAL_SERVER_ERROR,
                error_code: None,
                external_message: message.clone(),
                internal_message: message,
                headers: None,
            });
        }
    };

    // Adapt the open file into a byte stream usable as a response body.
    let stream = hyper_staticfile::util::FileBytesStream::new(
        hyper_staticfile::vfs::TokioFileAccess::new(file),
    );

    let response = Response::builder()
        .status(StatusCode::OK)
        .header(http::header::CONTENT_TYPE, "application/octet-stream")
        .body(Body::wrap(hyper_staticfile::Body::Full(stream)))?;
    Ok(response)
}

async fn instance_migrate(
rqctx: RequestContext<Self::Context>,
path_params: Path<InstancePathParam>,
Expand Down
5 changes: 5 additions & 0 deletions nexus/tests/config.test.toml
Original file line number Diff line number Diff line change
Expand Up @@ -197,3 +197,8 @@ sp_ereport_ingester.period_secs = 30
# we only have one sled in the test environment, so we need to use the
# `Random` strategy, instead of `RandomWithDistinctSleds`
type = "random"

[omdb]
# In production omdb is shipped alongside Nexus in its zone; this doesn't happen
# (and isn't needed) in tests.
bin_path = "/cannot/fetch/omdb/in/test/suite"
17 changes: 17 additions & 0 deletions openapi/nexus-lockstep.json
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,23 @@
}
}
},
"/debug/fetch-omdb-binary": {
"get": {
"summary": "Fetch an `omdb` binary that lives in the zone alongside this Nexus.",
"description": "This is only useful as a support tool, and is accessible via `omdb nexus fetch-omdb ...`. During a Reconfigurator-driven upgrade, the `omdb` binary in the switch zone is updated much earlier in the process than Nexus or the database schema, meaning it is often unable to communicate with Nexus or the DB due to mismatched expectation. It can be used to fetch an _older_ `omdb` that matches the running Nexuses and DB schema via this endpoint.",
"operationId": "fetch_omdb",
"responses": {
"default": {
"description": "",
"content": {
"*/*": {
"schema": {}
}
}
}
}
}
},
"/demo-saga": {
"post": {
"summary": "Kick off an instance of the \"demo\" saga",
Expand Down
1 change: 1 addition & 0 deletions package-manifest.toml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,7 @@ only_for_targets.image = "standard"
source.type = "composite"
source.packages = [
"omicron-nexus.tar.gz",
"omicron-omdb.tar.gz",
"zone-setup.tar.gz",
"zone-network-install.tar.gz",
"opte-interface-setup.tar.gz",
Expand Down
4 changes: 4 additions & 0 deletions smf/nexus/multi-sled/config-partial.toml
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,7 @@ sp_ereport_ingester.disable = true
# by default, allocate across 3 distinct sleds
# seed is omitted so a new seed will be chosen with every allocation.
type = "random_with_distinct_sleds"

[omdb]
# Path to the omdb binary shipped in the Nexus zone.
bin_path = "/opt/oxide/omdb/bin/omdb"
4 changes: 4 additions & 0 deletions smf/nexus/single-sled/config-partial.toml
Original file line number Diff line number Diff line change
Expand Up @@ -103,3 +103,7 @@ sp_ereport_ingester.disable = true
# by default, allocate without requirement for distinct sleds.
# seed is omitted so a new seed will be chosen with every allocation.
type = "random"

[omdb]
# Path to the omdb binary shipped in the Nexus zone.
bin_path = "/opt/oxide/omdb/bin/omdb"
Loading