diff --git a/Cargo.lock b/Cargo.lock
index f742c5c3da6..a5fce37b091 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -6238,6 +6238,7 @@ dependencies = [
  "nexus-sled-agent-shared",
  "nexus-types",
  "omicron-certificates",
+ "omicron-cockroach-metrics",
  "omicron-common",
  "omicron-passwords",
  "omicron-rpaths",
@@ -6307,6 +6308,7 @@ dependencies = [
  "nexus-sled-agent-shared",
  "nexus-test-utils",
  "nexus-types",
+ "omicron-cockroach-metrics",
  "omicron-common",
  "omicron-passwords",
  "omicron-rpaths",
@@ -6422,10 +6424,12 @@ dependencies = [
  "gateway-messages",
  "gateway-test-utils",
  "gateway-types",
+ "httpmock",
  "id-map",
  "iddqd",
  "nexus-sled-agent-shared",
  "nexus-types",
+ "omicron-cockroach-metrics",
  "omicron-common",
  "omicron-sled-agent",
  "omicron-uuid-kinds",
@@ -6902,6 +6906,7 @@ dependencies = [
  "newtype-uuid",
  "newtype_derive",
  "nexus-sled-agent-shared",
+ "omicron-cockroach-metrics",
  "omicron-common",
  "omicron-passwords",
  "omicron-uuid-kinds",
@@ -7347,6 +7352,7 @@ dependencies = [
  "futures",
  "omicron-workspace-hack",
  "proptest",
+ "reqwest",
  "serde",
  "serde_json",
  "slog",
@@ -7652,6 +7658,7 @@ dependencies = [
  "nexus-test-utils-macros",
  "nexus-types",
  "num-integer",
+ "omicron-cockroach-metrics",
  "omicron-common",
  "omicron-passwords",
  "omicron-rpaths",
@@ -11828,9 +11835,9 @@ checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64"
 
 [[package]]
 name = "signal-hook"
-version = "0.3.18"
+version = "0.3.17"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d881a16cf4426aa584979d30bd82cb33429027e42122b169753d6ef1085ed6e2"
+checksum = "8621587d4798caf8eb44879d42e56b9a93ea5dcd315a6487c357130095b62801"
 dependencies = [
  "libc",
  "signal-hook-registry",
@@ -12435,9 +12442,9 @@ dependencies = [
 
 [[package]]
 name = "socket2"
-version = "0.5.10"
+version = "0.5.9"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "e22376abed350d73dd1cd119b57ffccad95b4e585a7cda43e286245ce23c0678"
+checksum = "4f5fd57c80058a56cf5c777ab8a126398ece8e442983605d280a44ce79d0edef"
 dependencies = [
  "libc",
  "windows-sys 0.52.0",
@@ -14689,9 +14696,9 @@ dependencies = [
 
 [[package]]
 name = "webpki-roots"
-version = "1.0.0"
+version = "1.0.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "2853738d1cc4f2da3a225c18ec6c3721abb31961096e9dbf5ab35fa88b19cfdb"
+checksum = "8782dd5a41a24eed3a4f40b606249b3e236ca61adf1f25ea4d45c73de122b502"
 dependencies = [
  "rustls-pki-types",
 ]
@@ -14710,9 +14717,9 @@ dependencies = [
 
 [[package]]
 name = "whoami"
-version = "1.6.0"
+version = "1.5.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "6994d13118ab492c3c80c1f81928718159254c53c472bf9ce36f8dae4add02a7"
+checksum = "372d5b87f58ec45c384ba03563b03544dc5fadc3983e434b286913f5b4a9bb6d"
 dependencies = [
  "redox_syscall 0.5.7",
  "wasite",
diff --git a/cockroach-metrics/Cargo.toml b/cockroach-metrics/Cargo.toml
index 7932933005c..c75d07264cc 100644
--- a/cockroach-metrics/Cargo.toml
+++ b/cockroach-metrics/Cargo.toml
@@ -9,6 +9,7 @@ anyhow.workspace = true
 chrono.workspace = true
 cockroach-admin-client.workspace = true
 futures.workspace = true
+reqwest.workspace = true
 serde_json.workspace = true
 slog.workspace = true
 serde.workspace = true
diff --git a/cockroach-metrics/src/lib.rs b/cockroach-metrics/src/lib.rs
index 7e2dcd524f0..ec16ed4c172 100644
--- a/cockroach-metrics/src/lib.rs
+++ b/cockroach-metrics/src/lib.rs
@@ -30,7 +30,22 @@ struct CockroachAdminClient {
 
 impl CockroachAdminClient {
     /// Create a new CockroachDB HTTP client
     fn new(log: Logger, address: SocketAddr) -> Self {
-        let client = Client::new(&format!("http://{address}"), log);
+        // It's important that we have *some* timeout here - currently,
+        // inventory collection will query all nodes to confirm they're
+        // responding. However, it's very possible that one node is down,
+        // and that should not block collection indefinitely.
+        let timeout_duration = std::time::Duration::from_secs(15);
+        let reqwest_client = reqwest::ClientBuilder::new()
+            .connect_timeout(timeout_duration)
+            .timeout(timeout_duration)
+            .build()
+            .expect("Failed to build HTTP client");
+
+        let client = Client::new_with_client(
+            &format!("http://{address}"),
+            reqwest_client,
+            log,
+        );
         Self { client }
     }
@@ -171,7 +186,9 @@ impl CockroachClusterAdminClient {
     }
 
     /// Fetch Prometheus metrics from all backends concurrently, returning the first successful result
-    pub async fn fetch_prometheus_metrics(&self) -> Result<PrometheusMetrics, anyhow::Error> {
+    pub async fn fetch_prometheus_metrics_from_any_node(
+        &self,
+    ) -> Result<PrometheusMetrics, anyhow::Error> {
         let clients = self.clients.read().await;
 
         if clients.is_empty() {
@@ -215,7 +232,9 @@
     }
 
     /// Fetch node status from all backends concurrently, returning the first successful result
-    pub async fn fetch_node_status(&self) -> Result<NodesResponse, anyhow::Error> {
+    pub async fn fetch_node_status_from_any_node(
+        &self,
+    ) -> Result<NodesResponse, anyhow::Error> {
         let clients = self.clients.read().await;
 
         if clients.is_empty() {
@@ -263,6 +282,93 @@
         let clients = self.clients.read().await;
         clients.keys().copied().collect()
     }
+
+    /// Fetch Prometheus metrics from all backends, returning all successful results
+    pub async fn fetch_prometheus_metrics_from_all_nodes(
+        &self,
+    ) -> Vec<(SocketAddr, PrometheusMetrics)> {
+        let clients = self.clients.read().await;
+
+        if clients.is_empty() {
+            return Vec::new();
+        }
+
+        // Create futures for all requests
+        let mut futures = FuturesUnordered::new();
+        for (&addr, client) in clients.iter() {
+            let future =
+                async move { (addr, client.fetch_prometheus_metrics().await) };
+            futures.push(future);
+        }
+
+        let mut successful_results = Vec::new();
+
+        // Collect all successful results
+        while let Some((addr, result)) = futures.next().await {
+            match result {
+                Ok(metrics) => {
+                    debug!(
+                        self.log,
+                        "Successfully fetched metrics from CockroachDB node";
+                        "address" => %addr
+                    );
+                    successful_results.push((addr, metrics));
+                }
+                Err(e) => {
+                    // Log the error but continue trying other backends
+                    warn!(
+                        self.log,
+                        "Failed to fetch metrics from CockroachDB node";
+                        "address" => %addr,
+                        "error" => %e
+                    );
+                }
+            }
+        }
+
+        successful_results
+    }
+
+    /// Fetch node status from all backends, returning all successful results
+    pub async fn fetch_node_status_from_all_nodes(
+        &self,
+    ) -> Vec<(SocketAddr, NodesResponse)> {
+        let clients = self.clients.read().await;
+
+        if clients.is_empty() {
+            return Vec::new();
+        }
+
+        // Create futures for all requests
+        let mut futures = FuturesUnordered::new();
+        for (&addr, client) in clients.iter() {
+            let future =
+                async move { (addr, client.fetch_node_status().await) };
+            futures.push(future);
+        }
+
+        let mut successful_results = Vec::new();
+
+        // Collect all successful results
+        while let Some((addr, result)) = futures.next().await {
+            match result {
+                Ok(status) => {
+                    successful_results.push((addr, status));
+                }
+                Err(e) => {
+                    // Log the error but continue trying other backends
+                    warn!(
+                        self.log,
+                        "Failed to fetch node status from CockroachDB node";
+                        "address" => %addr,
+                        "error" => %e
+                    );
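+                    // Nodes that fail to respond are simply omitted from
+                    // the result; callers must be prepared to act on a
+                    // partial (possibly empty) set of responses.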
+                }
+            }
+        }
+
+        successful_results
+    }
 }
 
 /// A single metric value, which can be a counter, gauge, etc.
@@ -580,14 +686,14 @@ impl PrometheusMetrics {
     Deserialize,
 )]
 #[serde(transparent)]
-pub struct NodeId(pub u32);
+pub struct NodeId(pub i32);
 
 impl NodeId {
-    pub fn new(id: u32) -> Self {
+    pub fn new(id: i32) -> Self {
         Self(id)
     }
 
-    pub fn as_u32(&self) -> u32 {
+    pub fn as_i32(&self) -> i32 {
         self.0
     }
 }
@@ -989,8 +1095,10 @@ sql_exec_latency_bucket{le="0.01"} 25
         assert_eq!(cluster.get_cached_addresses().await.len(), 0);
 
         // Fetch should fail with no backends configured
-        assert!(cluster.fetch_prometheus_metrics().await.is_err());
-        assert!(cluster.fetch_node_status().await.is_err());
+        assert!(
+            cluster.fetch_prometheus_metrics_from_any_node().await.is_err()
+        );
+        assert!(cluster.fetch_node_status_from_any_node().await.is_err());
 
         // Add some backends
         let addr1: SocketAddr = "127.0.0.1:8080".parse().unwrap();
@@ -1016,8 +1124,10 @@ sql_exec_latency_bucket{le="0.01"} 25
         assert_eq!(cluster.get_cached_addresses().await.len(), 0);
 
         // Fetch should fail again with no backends configured
-        assert!(cluster.fetch_prometheus_metrics().await.is_err());
-        assert!(cluster.fetch_node_status().await.is_err());
+        assert!(
+            cluster.fetch_prometheus_metrics_from_any_node().await.is_err()
+        );
+        assert!(cluster.fetch_node_status_from_any_node().await.is_err());
     }
 
     #[test]
diff --git a/dev-tools/ls-apis/tests/api_dependencies.out b/dev-tools/ls-apis/tests/api_dependencies.out
index 72835eb7ae5..0b76d0671cf 100644
--- a/dev-tools/ls-apis/tests/api_dependencies.out
+++ b/dev-tools/ls-apis/tests/api_dependencies.out
@@ -12,7 +12,7 @@ Clickhouse Single-Node Cluster Admin (client: clickhouse-admin-single-client)
     consumed by: omicron-nexus (omicron/nexus) via 2 paths
 
 CockroachDB Cluster Admin (client: cockroach-admin-client)
-    consumed by: omicron-nexus (omicron/nexus) via 2 paths
+    consumed by: omicron-nexus (omicron/nexus) via 3 paths
     consumed by: omicron-sled-agent (omicron/sled-agent) via 1 path
 
 Crucible Agent (client: crucible-agent-client)
diff --git a/nexus/Cargo.toml b/nexus/Cargo.toml
index acb32749550..674f3117003 100644
--- a/nexus/Cargo.toml
+++ b/nexus/Cargo.toml
@@ -65,6 +65,7 @@ nexus-networking.workspace = true
 nexus-saga-recovery.workspace = true
 nexus-test-interface.workspace = true
 num-integer.workspace = true
+omicron-cockroach-metrics.workspace = true
 openssl.workspace = true
 oximeter-client.workspace = true
 oximeter-db = { workspace = true, default-features = false, features = [ "oxql" ] }
diff --git a/nexus/db-model/Cargo.toml b/nexus/db-model/Cargo.toml
index be488605049..ea2338eb594 100644
--- a/nexus/db-model/Cargo.toml
+++ b/nexus/db-model/Cargo.toml
@@ -22,6 +22,7 @@ iddqd.workspace = true
 ipnetwork.workspace = true
 macaddr.workspace = true
 newtype_derive.workspace = true
+omicron-cockroach-metrics.workspace = true
 omicron-uuid-kinds.workspace = true
 oxnet.workspace = true
 parse-display.workspace = true
diff --git a/nexus/db-model/src/inventory.rs b/nexus/db-model/src/inventory.rs
index 1851c13010f..4854d75f546 100644
--- a/nexus/db-model/src/inventory.rs
+++ b/nexus/db-model/src/inventory.rs
@@ -29,7 +29,7 @@ use nexus_db_schema::schema::inv_zone_manifest_non_boot;
 use nexus_db_schema::schema::inv_zone_manifest_zone;
 use nexus_db_schema::schema::{
     hw_baseboard_id, inv_caboose, inv_clickhouse_keeper_membership,
-    inv_collection, inv_collection_error, inv_dataset,
+    inv_cockroachdb_status, inv_collection, inv_collection_error, inv_dataset,
     inv_last_reconciliation_dataset_result,
     inv_last_reconciliation_disk_result,
     inv_last_reconciliation_orphaned_dataset,
@@ -56,8 +56,8 @@ use nexus_sled_agent_shared::inventory::{
     OmicronZoneDataset, OmicronZoneImageSource, OmicronZoneType,
 };
 use nexus_types::inventory::{
-    BaseboardId, Caboose, Collection, NvmeFirmware, PowerState, RotPage,
-    RotSlot,
+    BaseboardId, Caboose, CockroachStatus, Collection, NvmeFirmware,
+    PowerState, RotPage, RotSlot,
 };
 use omicron_common::api::external;
 use omicron_common::api::internal::shared::NetworkInterface;
@@ -2526,6 +2526,67 @@ impl InvClickhouseKeeperMembership {
     }
 }
 
+#[derive(Queryable, Clone, Debug, Selectable, Insertable)]
+#[diesel(table_name = inv_cockroachdb_status)]
+pub struct InvCockroachStatus {
+    pub inv_collection_id: DbTypedUuid<CollectionKind>,
+    pub node_id: i32,
+    pub ranges_underreplicated: Option<i64>,
+    pub liveness_live_nodes: Option<i64>,
+}
+
+impl InvCockroachStatus {
+    pub fn new(
+        inv_collection_id: CollectionUuid,
+        node_id: omicron_cockroach_metrics::NodeId,
+        status: &CockroachStatus,
+    ) -> Result<Self, anyhow::Error> {
+        Ok(Self {
+            inv_collection_id: inv_collection_id.into(),
+            node_id: node_id.as_i32(),
+            ranges_underreplicated: status
+                .ranges_underreplicated
+                .map(|n| i64::try_from(n))
+                .transpose()
+                .with_context(
+                    || "Converting ranges_underreplicated from u64 to i64",
+                )?,
+            liveness_live_nodes: status
+                .liveness_live_nodes
+                .map(|n| i64::try_from(n))
+                .transpose()
+                .with_context(
+                    || "Converting liveness_live_nodes from u64 to i64",
+                )?,
+        })
+    }
+}
+
+impl TryFrom<InvCockroachStatus> for CockroachStatus {
+    type Error = anyhow::Error;
+
+    fn try_from(value: InvCockroachStatus) -> anyhow::Result<Self> {
+        Ok(Self {
+            ranges_underreplicated: value
+                .ranges_underreplicated
+                .map(|n| {
+                    u64::try_from(n).with_context(|| {
+                        format!("Failed to convert ranges_underreplicated ({n}) to u64")
+                    })
+                })
+                .transpose()?,
+            liveness_live_nodes: value
+                .liveness_live_nodes
+                .map(|n| {
+                    u64::try_from(n).with_context(|| {
+                        format!("Failed to convert liveness_live_nodes ({n}) to u64")
+                    })
+                })
+                .transpose()?,
+        })
+    }
+}
+
 #[cfg(test)]
 mod test {
     use nexus_types::inventory::NvmeFirmware;
diff --git a/nexus/db-model/src/schema_versions.rs b/nexus/db-model/src/schema_versions.rs
index e2f6ed0088f..49e9915727b 100644
--- a/nexus/db-model/src/schema_versions.rs
+++ b/nexus/db-model/src/schema_versions.rs
@@ -16,7 +16,7 @@ use std::{collections::BTreeMap, sync::LazyLock};
 ///
 /// This must be updated when you change the database schema. Refer to
 /// schema/crdb/README.adoc in the root of this repository for details.
-pub const SCHEMA_VERSION: Version = Version::new(155, 0, 0);
+pub const SCHEMA_VERSION: Version = Version::new(156, 0, 0);
 
 /// List of all past database schema versions, in *reverse* order
 ///
@@ -28,6 +28,7 @@ static KNOWN_VERSIONS: LazyLock<Vec<KnownVersion>> = LazyLock::new(|| {
     // |  leaving the first copy as an example for the next person.
     // v
     // KnownVersion::new(next_int, "unique-dirname-with-the-sql-files"),
+    KnownVersion::new(156, "inv_cockroachdb_status"),
     KnownVersion::new(155, "vpc-firewall-icmp"),
     KnownVersion::new(154, "add-pending-mgs-updates"),
     KnownVersion::new(153, "chicken-switches"),
diff --git a/nexus/db-queries/Cargo.toml b/nexus/db-queries/Cargo.toml
index 50b8b1789c3..471ab2351b4 100644
--- a/nexus/db-queries/Cargo.toml
+++ b/nexus/db-queries/Cargo.toml
@@ -67,6 +67,7 @@ nexus-db-lookup.workspace = true
 nexus-db-schema.workspace = true
 nexus-sled-agent-shared.workspace = true
 nexus-types.workspace = true
+omicron-cockroach-metrics.workspace = true
 omicron-common.workspace = true
 omicron-passwords.workspace = true
 omicron-uuid-kinds.workspace = true
diff --git a/nexus/db-queries/src/db/datastore/inventory.rs b/nexus/db-queries/src/db/datastore/inventory.rs
index 5dc8d98d384..3308b3245d6 100644
--- a/nexus/db-queries/src/db/datastore/inventory.rs
+++ b/nexus/db-queries/src/db/datastore/inventory.rs
@@ -32,6 +32,7 @@ use nexus_db_errors::public_error_from_diesel;
 use nexus_db_errors::public_error_from_diesel_lookup;
 use nexus_db_model::InvCaboose;
 use nexus_db_model::InvClickhouseKeeperMembership;
+use nexus_db_model::InvCockroachStatus;
 use nexus_db_model::InvCollection;
 use nexus_db_model::InvCollectionError;
 use nexus_db_model::InvConfigReconcilerStatus;
@@ -84,9 +85,11 @@ use nexus_sled_agent_shared::inventory::OrphanedDataset;
 use nexus_sled_agent_shared::inventory::ZoneArtifactInventory;
 use nexus_sled_agent_shared::inventory::ZoneManifestNonBootInventory;
 use nexus_types::inventory::BaseboardId;
+use nexus_types::inventory::CockroachStatus;
 use nexus_types::inventory::Collection;
 use nexus_types::inventory::PhysicalDiskFirmware;
 use nexus_types::inventory::SledAgent;
+use omicron_cockroach_metrics::NodeId as CockroachNodeId;
 use omicron_common::api::external::Error;
 use omicron_common::api::external::InternalContext;
 use omicron_common::api::external::LookupType;
@@ -368,6 +371,15 @@
             );
         }
 
+        let inv_cockroach_status_records: Vec<InvCockroachStatus> = collection
+            .cockroach_status
+            .iter()
+            .map(|(node_id, status)| {
+                InvCockroachStatus::new(collection_id, *node_id, status)
+            })
+            .collect::<Result<Vec<_>, _>>()
+            .map_err(|e| Error::internal_error(&e.to_string()))?;
+
         // This implementation inserts all records associated with the
         // collection in one transaction. This is primarily for simplicity. It
         // means we don't have to worry about other readers seeing a
@@ -1361,6 +1373,16 @@
                     .await?;
             }
 
+            // Insert the cockroach status information we've observed
+            if !inv_cockroach_status_records.is_empty() {
+                use nexus_db_schema::schema::inv_cockroachdb_status::dsl;
+                diesel::insert_into(dsl::inv_cockroachdb_status)
+                    .values(inv_cockroach_status_records)
+                    .execute_async(&conn)
+                    .await?;
+            }
+
             // Finally, insert the list of errors.
             {
                 use nexus_db_schema::schema::inv_collection_error::dsl as errors_dsl;
@@ -1647,6 +1669,7 @@
             nzpools: usize,
             nerrors: usize,
             nclickhouse_keeper_membership: usize,
+            ncockroach_status: usize,
         }
 
         let NumRowsDeleted {
            ncollections,
@@ -1674,6 +1697,7 @@
             nzpools,
             nerrors,
             nclickhouse_keeper_membership,
+            ncockroach_status,
         } =
            self.transaction_retry_wrapper("inventory_delete_collection")
                .transaction(&conn, |conn| async move {
@@ -1901,6 +1925,17 @@
                        .execute_async(&conn)
                        .await?
                };
+                // Remove rows for cockroach status
+                let ncockroach_status = {
+                    use nexus_db_schema::schema::inv_cockroachdb_status::dsl;
+                    diesel::delete(
+                        dsl::inv_cockroachdb_status.filter(
+                            dsl::inv_collection_id.eq(db_collection_id),
+                        ),
+                    )
+                    .execute_async(&conn)
+                    .await?
+                };
 
                 Ok(NumRowsDeleted {
                     ncollections,
@@ -1927,6 +1962,7 @@
                     nzpools,
                     nerrors,
                     nclickhouse_keeper_membership,
+                    ncockroach_status,
                 })
             })
             .await
@@ -1963,7 +1999,8 @@
             "nomicron_sled_config_zone_nics" => nomicron_sled_config_zone_nics,
             "nzpools" => nzpools,
             "nerrors" => nerrors,
-            "nclickhouse_keeper_membership" => nclickhouse_keeper_membership
+            "nclickhouse_keeper_membership" => nclickhouse_keeper_membership,
+            "ncockroach_status" => ncockroach_status,
         );
 
         Ok(())
@@ -3226,6 +3263,31 @@
             memberships
         };
 
+        // Load the cockroach status records for all nodes.
+        let cockroach_status: BTreeMap<CockroachNodeId, CockroachStatus> = {
+            use nexus_db_schema::schema::inv_cockroachdb_status::dsl;
+
+            let status_records: Vec<InvCockroachStatus> =
+                dsl::inv_cockroachdb_status
+                    .filter(dsl::inv_collection_id.eq(db_id))
+                    .select(InvCockroachStatus::as_select())
+                    .load_async(&*conn)
+                    .await
+                    .map_err(|e| {
+                        public_error_from_diesel(e, ErrorHandler::Server)
+                    })?;
+
+            let mut result = BTreeMap::new();
+            for record in status_records {
+                let node_id = CockroachNodeId::new(record.node_id);
+                let status: nexus_types::inventory::CockroachStatus = record
+                    .try_into()
+                    .map_err(|e| Error::internal_error(&format!("{e:#}")))?;
+                result.insert(node_id, status);
+            }
+            result
+        };
+
         // Finally, build up the sled-agent map using the sled agent and
         // omicron zone rows. A for loop is easier to understand than into_iter
         // + filter_map + return Result + collect.
@@ -3405,6 +3467,7 @@
             rot_pages_found,
             sled_agents,
             clickhouse_keeper_cluster_membership,
+            cockroach_status,
         })
     }
 }
diff --git a/nexus/db-schema/src/schema.rs b/nexus/db-schema/src/schema.rs
index f5599acb2a0..93b22f5869a 100644
--- a/nexus/db-schema/src/schema.rs
+++ b/nexus/db-schema/src/schema.rs
@@ -1854,6 +1854,15 @@
     }
 }
 
+table! {
+    inv_cockroachdb_status (inv_collection_id, node_id) {
+        inv_collection_id -> Uuid,
+        node_id -> Int4,
+        ranges_underreplicated -> Nullable<Int8>,
+        liveness_live_nodes -> Nullable<Int8>,
+    }
+}
+
 /* blueprints */
 
 table! {
diff --git a/nexus/inventory/Cargo.toml b/nexus/inventory/Cargo.toml
index 420c4b8e54d..62c9c8d4b76 100644
--- a/nexus/inventory/Cargo.toml
+++ b/nexus/inventory/Cargo.toml
@@ -36,11 +36,13 @@ thiserror.workspace = true
 tufaceous-artifact.workspace = true
 typed-rng.workspace = true
 uuid.workspace = true
+omicron-cockroach-metrics.workspace = true
 omicron-workspace-hack.workspace = true
 
 [dev-dependencies]
 expectorate.workspace = true
 gateway-test-utils.workspace = true
+httpmock.workspace = true
 omicron-sled-agent.workspace = true
 regex.workspace = true
 tokio.workspace = true
diff --git a/nexus/inventory/src/builder.rs b/nexus/inventory/src/builder.rs
index 5b5a577bdb0..4f9df20e65a 100644
--- a/nexus/inventory/src/builder.rs
+++ b/nexus/inventory/src/builder.rs
@@ -23,6 +23,7 @@ use nexus_types::inventory::BaseboardId;
 use nexus_types::inventory::Caboose;
 use nexus_types::inventory::CabooseFound;
 use nexus_types::inventory::CabooseWhich;
+use nexus_types::inventory::CockroachStatus;
 use nexus_types::inventory::Collection;
 use nexus_types::inventory::RotPage;
 use nexus_types::inventory::RotPageFound;
@@ -31,6 +32,9 @@ use nexus_types::inventory::RotState;
 use nexus_types::inventory::ServiceProcessor;
 use nexus_types::inventory::SledAgent;
 use nexus_types::inventory::Zpool;
+use omicron_cockroach_metrics::CockroachMetric;
+use omicron_cockroach_metrics::NodeId;
+use omicron_cockroach_metrics::PrometheusMetrics;
 use omicron_uuid_kinds::CollectionKind;
 use std::collections::BTreeMap;
 use std::collections::BTreeSet;
@@ -115,6 +119,7 @@ pub struct CollectionBuilder {
     sleds: IdOrdMap<SledAgent>,
     clickhouse_keeper_cluster_membership:
         BTreeSet<ClickhouseKeeperClusterMembership>,
+    cockroach_status: BTreeMap<NodeId, CockroachStatus>,
     // CollectionBuilderRng is taken by value, rather than passed in as a
     // mutable ref, to encourage a tree-like structure where each RNG is
     // generally independent.
@@ -144,6 +149,7 @@
             rot_pages_found: BTreeMap::new(),
             sleds: IdOrdMap::new(),
             clickhouse_keeper_cluster_membership: BTreeSet::new(),
+            cockroach_status: BTreeMap::new(),
             rng: CollectionBuilderRng::from_entropy(),
         }
     }
@@ -166,6 +172,7 @@
             sled_agents: self.sleds,
             clickhouse_keeper_cluster_membership: self
                 .clickhouse_keeper_cluster_membership,
+            cockroach_status: self.cockroach_status,
         }
     }
@@ -559,6 +566,20 @@
     ) {
         self.clickhouse_keeper_cluster_membership.insert(membership);
     }
+
+    /// Record metrics from a CockroachDB node
+    pub fn found_cockroach_metrics(
+        &mut self,
+        node_id: NodeId,
+        metrics: PrometheusMetrics,
+    ) {
+        let mut status = CockroachStatus::default();
+        status.ranges_underreplicated =
+            metrics.get_metric_unsigned(CockroachMetric::RangesUnderreplicated);
+        status.liveness_live_nodes =
+            metrics.get_metric_unsigned(CockroachMetric::LivenessLiveNodes);
+        self.cockroach_status.insert(node_id, status);
+    }
 }
 
 /// Returns the current time, truncated to the previous microsecond.
diff --git a/nexus/inventory/src/collector.rs b/nexus/inventory/src/collector.rs
index 36aff0f2c8a..f2a2744107e 100644
--- a/nexus/inventory/src/collector.rs
+++ b/nexus/inventory/src/collector.rs
@@ -15,6 +15,7 @@ use nexus_types::inventory::CabooseWhich;
 use nexus_types::inventory::Collection;
 use nexus_types::inventory::RotPage;
 use nexus_types::inventory::RotPageWhich;
+use omicron_cockroach_metrics::CockroachClusterAdminClient;
 use slog::Logger;
 use slog::o;
 use slog::{debug, error};
@@ -29,6 +30,7 @@ pub struct Collector<'a> {
     log: slog::Logger,
     mgs_clients: Vec<gateway_client::Client>,
     keeper_admin_clients: Vec<clickhouse_admin_keeper_client::Client>,
+    cockroach_admin_client: &'a CockroachClusterAdminClient,
     sled_agent_lister: &'a (dyn SledAgentEnumerator + Send + Sync),
     in_progress: CollectionBuilder,
 }
@@ -38,6 +40,7 @@ impl<'a> Collector<'a> {
         creator: &str,
         mgs_clients: Vec<gateway_client::Client>,
         keeper_admin_clients: Vec<clickhouse_admin_keeper_client::Client>,
+        cockroach_admin_client: &'a CockroachClusterAdminClient,
         sled_agent_lister: &'a (dyn SledAgentEnumerator + Send + Sync),
         log: slog::Logger,
     ) -> Self {
@@ -45,6 +48,7 @@
             log,
             mgs_clients,
             keeper_admin_clients,
+            cockroach_admin_client,
             sled_agent_lister,
             in_progress: CollectionBuilder::new(creator),
         }
@@ -70,6 +74,7 @@
         self.collect_all_mgs().await;
         self.collect_all_sled_agents().await;
         self.collect_all_keepers().await;
+        self.collect_all_cockroach().await;
 
         debug!(&self.log, "finished collection");
 
@@ -401,6 +406,78 @@
             }
         }
     }
+
+    /// Collect inventory from CockroachDB nodes
+    async fn collect_all_cockroach(&mut self) {
+        debug!(&self.log, "begin collection from CockroachDB nodes");
+
+        // First, try to get node status to determine actual node IDs
+        let node_status_results = self
+            .cockroach_admin_client
+            .fetch_node_status_from_all_nodes()
+            .await;
+
+        // When we receive these responses, they return:
+        //
+        // - A Vec of "Nodes", each of which includes a "node ID" and
+        //   addresses of the HTTP and SQL servers.
+        // - Additionally, the response of "fetch_node_status_from_all_nodes"
+        //   returns the SocketAddr we queried.
+        //
+        // However, we're querying the "cockroach admin" server, not the
+        // cockroach HTTP server directly. We would ideally like to know:
+        // "for each response, what node ID returned this data"?
+        //
+        // To access this data, we:
+        //
+        // 1. Make the assumption that the IP address of the Cockroach Admin
+        //    server is the same as the Cockroach SQL server.
+        // 2. Create the mapping of "IP -> Response"
+        //
+        // If we find any responses that are in disagreement with each other,
+        // flag an error and stop the collection.
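+        //
+        // (Assumption (1) holds today: the cockroach-admin server runs in
+        // the same zone - and therefore on the same IP - as the cockroach
+        // process it manages.)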
+
+        let mut ip_to_node_id = std::collections::HashMap::new();
+        for (_addr, nodes_response) in node_status_results {
+            for node in nodes_response.nodes {
+                let ip = node.desc.sql_address.address_field.ip();
+                let id = node.desc.node_id;
+                if let Some(old_id) = ip_to_node_id.insert(ip, id) {
+                    // Each response lists every node in the cluster, so
+                    // seeing the same IP more than once is expected; it is
+                    // only an error if the reported node IDs disagree.
+                    if old_id != id {
+                        self.in_progress.found_error(InventoryError::from(
+                            anyhow::anyhow!("Found conflicting node IDs ({old_id} vs {id}) for {ip}")
+                        ));
+                        return;
+                    }
+                }
+            }
+        }
+
+        // Fetch metrics from all nodes
+        let metrics_results = self
+            .cockroach_admin_client
+            .fetch_prometheus_metrics_from_all_nodes()
+            .await;
+
+        if metrics_results.is_empty() {
+            self.in_progress.found_error(InventoryError::from(
+                anyhow::anyhow!("No CockroachDB nodes returned metrics"),
+            ));
+            return;
+        }
+
+        // Store results for each successful node using observed node IDs
+        for (addr, metrics) in metrics_results {
+            let Some(node_id) = ip_to_node_id.get(&addr.ip()) else {
+                self.in_progress.found_error(InventoryError::from(
+                    anyhow::anyhow!(
+                        "Could not determine CockroachDB node ID for address: {}",
+                        addr
+                    )
+                ));
+                continue;
+            };
+            self.in_progress.found_cockroach_metrics(*node_id, metrics);
+        }
+    }
 }
 
 #[cfg(test)]
@@ -415,6 +492,7 @@ mod test {
     use nexus_sled_agent_shared::inventory::OmicronZoneImageSource;
     use nexus_sled_agent_shared::inventory::OmicronZoneType;
     use nexus_types::inventory::Collection;
+    use omicron_cockroach_metrics::CockroachClusterAdminClient;
     use omicron_common::api::external::Generation;
     use omicron_common::zpool_name::ZpoolName;
     use omicron_sled_agent::sim;
@@ -694,6 +772,60 @@
         agent
     }
 
+    // Set up httpmock server for CockroachDB admin endpoints
+    fn mock_crdb_admin_server() -> httpmock::MockServer {
+        let mock_server = httpmock::MockServer::start();
+        mock_server.mock(|when, then| {
+            when.method(httpmock::Method::GET).path("/proxy/status/vars");
+            then.status(200)
+                .header("content-type", "application/json")
+                .body("\"# Basic CockroachDB metrics\\nliveness_livenodes 1\\nranges_underreplicated 0\\n\"");
+        });
+        mock_server.mock(|when, then| {
+            when.method(httpmock::Method::GET).path("/proxy/status/nodes");
+            then.status(200).header("content-type", "application/json").body(
+                serde_json::to_string(
+                    &serde_json::json!({
+                        "nodes": [{
+                            "desc": {
+                                "nodeId": 1,
+                                "address": {
+                                    "networkField": "tcp",
+                                    "addressField": "127.0.0.1:26257"
+                                },
+                                "sqlAddress": {
+                                    "networkField": "tcp",
+                                    "addressField": "127.0.0.1:26257"
+                                },
+                                "httpAddress": {
+                                    "networkField": "tcp",
+                                    "addressField": "127.0.0.1:8080"
+                                },
+                                "buildTag": "v21.1.0",
+                                "startedAt": "1640995200000000000",
+                                "clusterName": "test-cluster"
+                            },
+                            "buildInfo": {
+                                "goVersion": "go1.17",
+                                "tag": "v21.1.0"
+                            },
+                            "startedAt": "1640995200000000000",
+                            "updatedAt": "1640995200000000000",
+                            "totalSystemMemory": "8589934592",
+                            "numCpus": 4
+                        }],
+                        "livenessByNodeId": {
+                            "1": 3
+                        }
+                    })
+                    .to_string(),
+                )
+                .unwrap(),
+            );
+        });
+        mock_server
+    }
+
     #[tokio::test]
     async fn test_basic() {
         // Set up the stock MGS test setup (which includes a couple of fake SPs)
@@ -732,10 +864,15 @@
         // We don't have any mocks for this, and it's unclear how much value
         // there would be in providing them at this juncture.
         let keeper_clients = Vec::new();
+        // Configure the mock server as a backend for the CockroachDB client
+        let crdb_cluster = CockroachClusterAdminClient::new(log.clone());
+        let crdb_admin_server = mock_crdb_admin_server();
+        crdb_cluster.update_backends(&[*crdb_admin_server.address()]).await;
         let collector = Collector::new(
             "test-suite",
             vec![mgs_client],
             keeper_clients,
+            &crdb_cluster,
             &sled_enum,
             log.clone(),
         );
         let collection = collector
             .collect_all()
             .await
             .expect("failed to carry out collection");
-        assert!(collection.errors.is_empty());
+        assert!(
+            collection.errors.is_empty(),
+            "Collection errors: {:#?}",
+            collection.errors
+        );
         assert_eq!(collection.collector, "test-suite");
 
         let s = dump_collection(&collection);
@@ -804,10 +945,14 @@
         // We don't have any mocks for this, and it's unclear how much value
         // there would be in providing them at this juncture.
         let keeper_clients = Vec::new();
+        let crdb_cluster = CockroachClusterAdminClient::new(log.clone());
+        let crdb_admin_server = mock_crdb_admin_server();
+        crdb_cluster.update_backends(&[*crdb_admin_server.address()]).await;
         let collector = Collector::new(
             "test-suite",
             mgs_clients,
             keeper_clients,
+            &crdb_cluster,
             &sled_enum,
             log.clone(),
         );
@@ -851,10 +996,14 @@
         // We don't have any mocks for this, and it's unclear how much value
         // there would be in providing them at this juncture.
         let keeper_clients = Vec::new();
+        let crdb_cluster = CockroachClusterAdminClient::new(log.clone());
+        let crdb_admin_server = mock_crdb_admin_server();
+        crdb_cluster.update_backends(&[*crdb_admin_server.address()]).await;
         let collector = Collector::new(
             "test-suite",
             mgs_clients,
             keeper_clients,
+            &crdb_cluster,
             &sled_enum,
             log.clone(),
         );
@@ -903,10 +1052,14 @@
         // We don't have any mocks for this, and it's unclear how much value
         // there would be in providing them at this juncture.
         let keeper_clients = Vec::new();
+        let crdb_cluster = CockroachClusterAdminClient::new(log.clone());
+        let crdb_admin_server = mock_crdb_admin_server();
+        crdb_cluster.update_backends(&[*crdb_admin_server.address()]).await;
         let collector = Collector::new(
             "test-suite",
             vec![mgs_client],
             keeper_clients,
+            &crdb_cluster,
             &sled_enum,
             log.clone(),
         );
diff --git a/nexus/inventory/src/examples.rs b/nexus/inventory/src/examples.rs
index e1351443c20..9f797927657 100644
--- a/nexus/inventory/src/examples.rs
+++ b/nexus/inventory/src/examples.rs
@@ -33,6 +33,8 @@ use nexus_types::inventory::CabooseWhich;
 use nexus_types::inventory::RotPage;
 use nexus_types::inventory::RotPageWhich;
 use nexus_types::inventory::ZpoolName;
+use omicron_cockroach_metrics::MetricValue;
+use omicron_cockroach_metrics::PrometheusMetrics;
 use omicron_common::api::external::ByteCount;
 use omicron_common::disk::DatasetConfig;
 use omicron_common::disk::DatasetKind;
@@ -64,6 +66,7 @@ use sled_agent_zone_images_examples::NON_BOOT_PATHS;
 use sled_agent_zone_images_examples::NON_BOOT_UUID;
 use sled_agent_zone_images_examples::WriteInstallDatasetContext;
 use sled_agent_zone_images_examples::dataset_missing_error;
+use std::collections::BTreeMap;
 use std::sync::Arc;
 use std::time::Duration;
 use strum::IntoEnumIterator;
@@ -602,6 +605,16 @@
         },
     );
 
+    builder.found_cockroach_metrics(
+        omicron_cockroach_metrics::NodeId::new(1),
+        PrometheusMetrics {
+            metrics: BTreeMap::from([(
+                "ranges_underreplicated".to_string(),
+                MetricValue::Unsigned(0),
+            )]),
+        },
+    );
+
     Representative {
         builder,
         sleds: [sled1_bb, sled2_bb, sled3_bb, sled4_bb],
diff --git a/nexus/reconfigurator/execution/src/omicron_zones.rs b/nexus/reconfigurator/execution/src/omicron_zones.rs
index 9e08c8920da..28c981fd90e 100644
--- a/nexus/reconfigurator/execution/src/omicron_zones.rs
+++ b/nexus/reconfigurator/execution/src/omicron_zones.rs
@@ -168,6 +168,9 @@ async fn oximeter_cleanup(
     Ok(())
 }
 
+// TODO(https://github.com/oxidecomputer/omicron/issues/8496): If this service
+// were fully in DNS, this would not be necessary.
+//
 // Helper trait that is implemented by `Resolver`, but allows unit tests to
 // inject a fake resolver that points to a mock server when calling
 // `decommission_cockroachdb_node()`.
diff --git a/nexus/src/app/background/init.rs b/nexus/src/app/background/init.rs
index d4860b0b1c5..86911263d6f 100644
--- a/nexus/src/app/background/init.rs
+++ b/nexus/src/app/background/init.rs
@@ -457,6 +457,7 @@
         // blueprint executor runs.
         let inventory_watcher = {
             let collector = inventory_collection::InventoryCollector::new(
+                &opctx,
                 datastore.clone(),
                 resolver.clone(),
                 &nexus_id.to_string(),
diff --git a/nexus/src/app/background/tasks/blueprint_planner.rs b/nexus/src/app/background/tasks/blueprint_planner.rs
index cff311899b1..38688ac9f15 100644
--- a/nexus/src/app/background/tasks/blueprint_planner.rs
+++ b/nexus/src/app/background/tasks/blueprint_planner.rs
@@ -291,6 +291,7 @@ mod test {
             )
             .unwrap();
         let mut collector = InventoryCollector::new(
+            &opctx,
             datastore.clone(),
             resolver.clone(),
             "test_planner",
diff --git a/nexus/src/app/background/tasks/crdb_node_id_collector.rs b/nexus/src/app/background/tasks/crdb_node_id_collector.rs
index 8a3f4ab6358..0f42021eda0 100644
--- a/nexus/src/app/background/tasks/crdb_node_id_collector.rs
+++ b/nexus/src/app/background/tasks/crdb_node_id_collector.rs
@@ -128,6 +128,9 @@ impl CockroachNodeIdCollector {
 // This trait exists so we can inject addresses in our unit tests below that
 // aren't required to have admin servers listening on the fixed
 // `COCKROACH_ADMIN_PORT`.
+//
+// TODO(https://github.com/oxidecomputer/omicron/issues/8496): Add the admin
+// service to DNS, remove this?
 trait CockroachAdminFromBlueprint {
     fn cockroach_admin_addrs<'a>(
         &'a self,
diff --git a/nexus/src/app/background/tasks/inventory_collection.rs b/nexus/src/app/background/tasks/inventory_collection.rs
index f3033513f18..e644d8d1989 100644
--- a/nexus/src/app/background/tasks/inventory_collection.rs
+++ b/nexus/src/app/background/tasks/inventory_collection.rs
@@ -16,8 +16,11 @@ use nexus_db_queries::db::DataStore;
 use nexus_inventory::InventoryError;
 use nexus_types::deployment::SledFilter;
 use nexus_types::inventory::Collection;
+use omicron_cockroach_metrics::CockroachClusterAdminClient;
 use omicron_uuid_kinds::CollectionUuid;
 use serde_json::json;
+use slog::{debug, o, warn};
+use std::net::SocketAddr;
 use std::sync::Arc;
 use tokio::sync::watch;
 
@@ -29,10 +32,12 @@ pub struct InventoryCollector {
     nkeep: u32,
     disable: bool,
     tx: watch::Sender<Option<CollectionUuid>>,
+    cockroach_admin_client: CockroachClusterAdminClient,
 }
 
 impl InventoryCollector {
     pub fn new(
+        opctx: &OpContext,
         datastore: Arc<DataStore>,
         resolver: internal_dns_resolver::Resolver,
         creator: &str,
@@ -40,6 +45,11 @@
         disable: bool,
     ) -> InventoryCollector {
         let (tx, _) = watch::channel(None);
+        let cockroach_admin_client = CockroachClusterAdminClient::new(
+            opctx
+                .log
+                .new(slog::o!("component" => "inventory_cockroach_client")),
+        );
         InventoryCollector {
             datastore,
             resolver,
@@ -47,6 +57,7 @@
             nkeep,
             disable,
             tx,
+            cockroach_admin_client,
         }
     }
 
@@ -68,6 +79,7 @@ impl BackgroundTask for InventoryCollector {
                 &self.creator,
                 self.nkeep,
                 self.disable,
+                &self.cockroach_admin_client,
             )
             .await
             .context("failed to collect inventory")
@@ -104,6 +116,7 @@ async fn inventory_activate(
     creator: &str,
     nkeep: u32,
     disabled: bool,
+    cockroach_admin_client: &CockroachClusterAdminClient,
 ) -> Result<Collection, anyhow::Error> {
     // If we're disabled, don't do anything. (This switch is only intended for
     // unforeseen production emergencies.)
@@ -180,6 +193,25 @@
         },
     };
 
+    // Update CockroachDB cluster backends.
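+    //
+    // We re-resolve and replace the backend list on every activation so the
+    // client tracks CockroachDB nodes as they come and go.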
+    let cockroach_addresses = resolver
+        .lookup_all_socket_v6(ServiceName::Cockroach)
+        .await
+        .context("looking up cockroach addresses")?;
+
+    // TODO(https://github.com/oxidecomputer/omicron/issues/8496): If
+    // we could look up the admin service, instead of hard-coding it as
+    // "same as cockroach, but different port", that would be preferable.
+    let admin_addresses: Vec<_> = cockroach_addresses
+        .into_iter()
+        .map(|mut addr| {
+            addr.set_port(omicron_common::address::COCKROACH_ADMIN_PORT);
+            SocketAddr::V6(addr)
+        })
+        .collect();
+
+    cockroach_admin_client.update_backends(admin_addresses.as_slice()).await;
+
     // Create an enumerator to find sled agents.
     let sled_enum = DbSledAgentEnumerator { opctx, datastore };
 
@@ -188,6 +220,7 @@
         creator,
         mgs_clients,
         keeper_admin_clients,
+        cockroach_admin_client,
         &sled_enum,
         opctx.log.clone(),
    );
@@ -295,6 +328,7 @@ mod test {
         // allow a backlog to accumulate.
         let nkeep = 3;
         let mut task = InventoryCollector::new(
+            &opctx,
             datastore.clone(),
             resolver.clone(),
             "me",
@@ -360,6 +394,7 @@
 
         // Create a disabled task and make sure that does nothing.
         let mut task = InventoryCollector::new(
+            &opctx,
             datastore.clone(),
             resolver,
             "disabled",
diff --git a/nexus/tests/integration_tests/cockroach.rs b/nexus/tests/integration_tests/cockroach.rs
index 341efafabc4..50f270c9820 100644
--- a/nexus/tests/integration_tests/cockroach.rs
+++ b/nexus/tests/integration_tests/cockroach.rs
@@ -6,6 +6,10 @@
 
 use cockroach_admin_client::Client as CockroachAdminClient;
 use nexus_test_utils_macros::nexus_test;
+use omicron_cockroach_metrics::CockroachClusterAdminClient;
+use omicron_cockroach_metrics::CockroachMetric;
+use omicron_cockroach_metrics::MetricValue;
+use omicron_cockroach_metrics::NodeLiveness;
 
 type ControlPlaneTestContext =
     nexus_test_utils::ControlPlaneTestContext<omicron_nexus::Server>;
@@ -53,3 +57,173 @@ async fn test_cockroach_admin_server(cptestctx: &ControlPlaneTestContext) {
         "vars and nodes should return different data"
     );
 }
+
+// Test querying the CockroachDB HTTP interface for Prometheus metrics
+#[nexus_test]
+async fn test_cockroach_http_prometheus_metrics(
+    cptestctx: &ControlPlaneTestContext,
+) {
+    let admin_addr = cptestctx.database_admin.local_addr();
+
+    let client = CockroachClusterAdminClient::new(cptestctx.logctx.log.clone());
+    client.update_backends(&[admin_addr]).await;
+
+    let metrics = client
+        .fetch_prometheus_metrics_from_any_node()
+        .await
+        .expect("Should be able to fetch Prometheus metrics from CockroachDB");
+
+    // Verify we got some expected CockroachDB metrics
+    assert!(!metrics.metrics.is_empty(), "Should have received some metrics");
+
+    // Test strongly-typed metric access using the CockroachMetric enum
+    use strum::IntoEnumIterator;
+
+    println!("\nTesting strongly-typed CockroachDB metrics:");
+
+    let mut found_metrics = 0;
+    let mut missing_metrics = Vec::new();
+
+    for metric in CockroachMetric::iter() {
+        if let Some(value) = metrics.get_metric(metric) {
+            match value {
+                MetricValue::Float(val) => {
+                    println!("  {} = {} (float)", metric.metric_name(), val);
+                    found_metrics += 1;
+                }
+                MetricValue::Unsigned(val) => {
+                    println!("  {} = {} (unsigned)", metric.metric_name(), val);
+                    found_metrics += 1;
+                }
+                MetricValue::Histogram(buckets) => {
+                    println!(
+                        "  {} = {} buckets (histogram)",
+                        metric.metric_name(),
+                        buckets.len()
+                    );
+                    found_metrics += 1;
+                }
+                MetricValue::String(s) => {
+                    println!("  {} = {} (string)", metric.metric_name(), s);
+                    found_metrics += 1;
+                }
+            }
+        } else {
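+            // Remember metrics the server did not report; they are printed
+            // (and failed on) below.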
+            missing_metrics.push(metric);
+        }
+    }
+
+    println!(
+        "Found {} out of {} strongly-typed metrics",
+        found_metrics,
+        CockroachMetric::iter().count()
+    );
+
+    // We expect to find at least some of our key metrics in a running CockroachDB instance
+    assert!(
+        found_metrics > 0,
+        "Should find at least some of the strongly-typed CockroachDB metrics"
+    );
+
+    // Print any missing metrics for debugging
+    if !missing_metrics.is_empty() {
+        println!("Missing metrics:");
+        for metric in missing_metrics {
+            println!("  {} - {}", metric.metric_name(), metric.description());
+        }
+        panic!("Missing metrics; failing test");
+    }
+
+    // NOTE: I've transiently observed a "non-zero" value for ranges
+    // under-replicated here, even in a single node deployment.
+    //
+    // Just check that the metric exists, without checking the value.
+    let _ = metrics
+        .get_metric_unsigned(CockroachMetric::RangesUnderreplicated)
+        .expect("Missing 'ranges underreplicated' metric");
+    assert_eq!(
+        metrics
+            .get_metric_unsigned(CockroachMetric::LivenessLiveNodes)
+            .expect("Missing 'live nodes' metric"),
+        1
+    );
+
+    println!("\nTesting SQL execution latency histogram:");
+    if let Some(histogram) =
+        metrics.get_metric_histogram(CockroachMetric::SqlExecLatency)
+    {
+        println!(
+            "Found SQL execution latency histogram with {} buckets",
+            histogram.len()
+        );
+
+        // Buckets are automatically sorted by le value
+        println!("  Histogram buckets:");
+        for bucket in histogram {
+            if bucket.le == f64::INFINITY {
+                println!("    le=+Inf: {}", bucket.count);
+            } else {
+                println!("    le={}: {}", bucket.le, bucket.count);
+            }
+        }
+
+        // Verify histogram properties
+        assert!(!histogram.is_empty(), "Histogram should have buckets");
+
+        // Check that buckets are cumulative and sorted (each bucket count >= previous bucket count)
+        for i in 1..histogram.len() {
+            assert!(
+                histogram[i].count >= histogram[i - 1].count,
+                "Histogram buckets should be cumulative: bucket {} (le={}) has count {} < previous bucket count {}",
+                i,
+                histogram[i].le,
+                histogram[i].count,
+                histogram[i - 1].count
+            );
+            assert!(
+                histogram[i].le >= histogram[i - 1].le,
+                "Histogram buckets should be sorted: bucket {} (le={}) < previous bucket (le={})",
+                i,
+                histogram[i].le,
+                histogram[i - 1].le
+            );
+        }
+        println!("Histogram structure is valid (cumulative buckets)");
+    } else {
+        panic!(" SQL execution latency histogram not found");
+    }
+}
+
+// Test fetching CockroachDB node status information
+#[nexus_test]
+async fn test_cockroach_http_node_status(cptestctx: &ControlPlaneTestContext) {
+    let admin_addr = cptestctx.database_admin.local_addr();
+
+    let client = CockroachClusterAdminClient::new(cptestctx.logctx.log.clone());
+    client.update_backends(&[admin_addr]).await;
+
+    let nodes_response = client
+        .fetch_node_status_from_any_node()
+        .await
+        .expect("Should be able to fetch nodes status from CockroachDB");
+
+    println!("Response: {nodes_response:#?}");
+
+    // Verify we saw one node, and that it's alive.
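+    // (A single-node test cluster should always report its one node as
+    // NodeLiveness::Live.)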
+    assert_eq!(nodes_response.nodes.len(), 1, "Should have one node");
+    let first_node = &nodes_response.nodes[0];
+
+    assert_eq!(
+        nodes_response
+            .liveness_by_node_id
+            .get(&first_node.desc.node_id)
+            .unwrap(),
+        &NodeLiveness::Live
+    );
+
+    assert_eq!(first_node.desc.node_id.as_i32(), 1);
+    assert!(
+        !first_node.build_info.tag.is_empty(),
+        "Build tag should not be empty"
+    );
+}
diff --git a/nexus/types/Cargo.toml b/nexus/types/Cargo.toml
index 5aabe2a45c1..c4ed6cbf149 100644
--- a/nexus/types/Cargo.toml
+++ b/nexus/types/Cargo.toml
@@ -27,6 +27,7 @@ id-map.workspace = true
 illumos-utils.workspace = true
 ipnetwork.workspace = true
 newtype_derive.workspace = true
+omicron-cockroach-metrics.workspace = true
 omicron-uuid-kinds.workspace = true
 openssl.workspace = true
 oxql-types.workspace = true
diff --git a/nexus/types/src/inventory.rs b/nexus/types/src/inventory.rs
index e6d5c34c903..3aa81096a97 100644
--- a/nexus/types/src/inventory.rs
+++ b/nexus/types/src/inventory.rs
@@ -148,6 +148,10 @@ pub struct Collection {
     /// mappings and guarantee unique pairs.
     pub clickhouse_keeper_cluster_membership:
         BTreeSet<ClickhouseKeeperClusterMembership>,
+
+    /// The status of our cockroachdb cluster, keyed by node identifier
+    pub cockroach_status:
+        BTreeMap<omicron_cockroach_metrics::NodeId, CockroachStatus>,
 }
 
 impl Collection {
@@ -579,3 +583,9 @@ impl IdOrdItem for SledAgent {
     }
     id_upcast!();
 }
+
+#[derive(Clone, Default, Debug, PartialEq, Eq, Deserialize, Serialize)]
+pub struct CockroachStatus {
+    pub ranges_underreplicated: Option<u64>,
+    pub liveness_live_nodes: Option<u64>,
+}
diff --git a/schema/crdb/dbinit.sql b/schema/crdb/dbinit.sql
index 8806c09c695..3eb03304eb0 100644
--- a/schema/crdb/dbinit.sql
+++ b/schema/crdb/dbinit.sql
@@ -4211,6 +4211,15 @@
     PRIMARY KEY (inv_collection_id, queried_keeper_id)
 );
 
+CREATE TABLE IF NOT EXISTS omicron.public.inv_cockroachdb_status (
+    inv_collection_id UUID NOT NULL,
+    node_id INT4 NOT NULL,
+    ranges_underreplicated INT8,
+    liveness_live_nodes INT8,
+
+    PRIMARY KEY (inv_collection_id, node_id)
+);
+
 /*
  * Various runtime configuration switches for reconfigurator
  *
@@ -6110,7 +6118,7 @@ INSERT INTO omicron.public.db_metadata (
     version,
     target_version
 ) VALUES
-    (TRUE, NOW(), NOW(), '155.0.0', NULL)
+    (TRUE, NOW(), NOW(), '156.0.0', NULL)
 ON CONFLICT DO NOTHING;
 
 COMMIT;
diff --git a/schema/crdb/inv_cockroachdb_status/up01.sql b/schema/crdb/inv_cockroachdb_status/up01.sql
new file mode 100644
index 00000000000..3315fd63e13
--- /dev/null
+++ b/schema/crdb/inv_cockroachdb_status/up01.sql
@@ -0,0 +1,9 @@
+CREATE TABLE IF NOT EXISTS omicron.public.inv_cockroachdb_status (
+    inv_collection_id UUID NOT NULL,
+    node_id INT4 NOT NULL,
+    ranges_underreplicated INT8,
+    liveness_live_nodes INT8,
+
+    PRIMARY KEY (inv_collection_id, node_id)
+);