Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM keppel.eu-de-1.cloud.sap/ccloud-dockerhub-mirror/library/ubuntu:latest
FROM ubuntu:latest

RUN export DEBIAN_FRONTEND=noninteractive \
&& apt-get update \
Expand Down
338 changes: 293 additions & 45 deletions collector.py

Large diffs are not rendered by default.

50 changes: 50 additions & 0 deletions collectors/bus_collector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from prometheus_client.core import GaugeMetricFamily, InfoMetricFamily
from collectors.module_collector import ModuleCollector
from collectors.utils import _extract_kv_metrics, get_leaf_name

class BusCollector:
    """Collect metrics for each bus in a Redfish ``Busses`` collection.

    For every member of the collection the bus document is fetched, its
    metrics are extracted, and, when the document links to ``IOModules``,
    a ``ModuleCollector`` is run for that bus.

    Args:
        host: Host identifier, passed through to child collectors.
        target: Target identifier, passed through to child collectors.
        labels: Mapping of label names to values inherited from the parent
            collector.
        urls: Mapping that may contain the ``"Busses"`` collection URL.
        connect_fn: Callable taking a URL/path and returning the decoded
            response (a falsy value is treated as "nothing to collect").
    """

    def __init__(self, host, target, labels, urls, connect_fn):
        self.host = host
        self.target = target
        self.labels = labels
        self.urls = urls
        self.connect_server = connect_fn

    def collect(self):
        """Return the list of metrics for all reachable busses.

        Returns:
            A list of metric objects; empty when no ``Busses`` URL is
            configured or the collection cannot be fetched.
        """
        busses_url = self.urls.get("Busses")
        if not busses_url:
            return []

        data = self.connect_server(busses_url)
        if not data:
            # A failed request must not abort the whole scrape.
            return []

        metrics = []
        # ``or []`` also covers an explicit JSON null for "Members".
        for member in data.get("Members") or []:
            member_path = member.get("@odata.id")
            if not member_path:
                continue

            member_data = self.connect_server(member_path)
            if not member_data:
                # Skip busses whose document could not be retrieved.
                continue

            clean_labels = {
                "host": self.labels.get("host", ""),
                "server_manufacturer": self.labels.get("server_manufacturer", ""),
                "server_model": self.labels.get("server_model", ""),
                "server_serial": self.labels.get("server_serial", ""),
                "system": get_leaf_name(self.labels.get("system", "")),
                "dcn": get_leaf_name(self.labels.get("dcn", "")),
                "bus": get_leaf_name(member_path),
            }

            metrics += self._extract_metrics(member_data, clean_labels)

            # "IOModules" may be absent or null; both mean "no modules".
            mod_link = (member_data.get("IOModules") or {}).get("@odata.id")
            if mod_link:
                mod_collector = ModuleCollector(
                    self.host,
                    self.target,
                    clean_labels,
                    {"IOModules": mod_link},
                    self.connect_server,
                )
                metrics += mod_collector.collect()

        return metrics

    def _extract_metrics(self, data, labels):
        """Delegate to ``_extract_kv_metrics`` using the ``"bus"`` prefix."""
        return _extract_kv_metrics("bus", data, labels)
47 changes: 47 additions & 0 deletions collectors/channel_collector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
from prometheus_client.core import GaugeMetricFamily
from collectors.utils import _extract_kv_metrics, get_leaf_name

class ChannelCollector:
    """Collect metrics for each channel in a Redfish ``IOChannels`` collection.

    Args:
        host: Host identifier; used as the ``host`` label value.
        target: Target identifier (stored, not used by this class).
        labels: Mapping of label names to values inherited from the parent
            collector (system, dcn, bus, module, server_* entries).
        urls: Mapping that may contain the ``"IOChannels"`` collection URL.
        connect_fn: Callable taking a URL/path and returning the decoded
            response (a falsy value is treated as "nothing to collect").
    """

    def __init__(self, host, target, labels, urls, connect_fn):
        self.host = host
        self.target = target
        self.labels = labels
        self.urls = urls
        self.connect_server = connect_fn

    def collect(self):
        """Return the list of metrics for all reachable channels.

        Returns:
            A list of metric objects; empty when no ``IOChannels`` URL is
            configured or the collection cannot be fetched.
        """
        ch_url = self.urls.get("IOChannels")
        if not ch_url:
            return []

        data = self.connect_server(ch_url)
        if not data:
            # A failed request must not abort the whole scrape.
            return []

        metrics = []
        # ``or []`` also covers an explicit JSON null for "Members".
        for ch in data.get("Members") or []:
            ch_path = ch.get("@odata.id")
            if not ch_path:
                continue

            ch_data = self.connect_server(ch_path)
            if not ch_data:
                # Skip channels whose document could not be retrieved.
                continue

            # Extract the clean names for labels
            scoped_labels = {
                "host": self.host,
                "server_manufacturer": self.labels.get("server_manufacturer", ""),
                "server_model": self.labels.get("server_model", ""),
                "server_serial": self.labels.get("server_serial", ""),
                "system": get_leaf_name(self.labels.get("system", "")),
                "dcn": get_leaf_name(self.labels.get("dcn", "")),
                "bus": get_leaf_name(self.labels.get("bus", "")),
                "module": get_leaf_name(self.labels.get("module", "")),
                "channel": get_leaf_name(ch_data.get("Id", "channel")),
            }

            metrics += self._extract_metrics(ch_data, scoped_labels)

        return metrics

    def _extract_metrics(self, data, labels):
        """Delegate to ``_extract_kv_metrics`` using the ``"channel"`` prefix."""
        return _extract_kv_metrics("channel", data, labels)
45 changes: 45 additions & 0 deletions collectors/dcn_collector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
from prometheus_client.core import GaugeMetricFamily, InfoMetricFamily
from collectors.bus_collector import BusCollector
from collectors.utils import _extract_kv_metrics, get_leaf_name


class DistributedControlNodeCollector:
    """Collect metrics for a Redfish ``DistributedControlNode`` resource.

    The node document is fetched, its metrics are extracted, and, when the
    document links to a ``Busses`` collection, a ``BusCollector`` is run
    for that node.

    Args:
        host: Host identifier, passed through to child collectors.
        target: Target identifier, passed through to child collectors.
        labels: Mapping of label names to values inherited from the parent
            collector.
        urls: Mapping that may contain the ``"DistributedControlNode"`` URL.
        connect_fn: Callable taking a URL/path and returning the decoded
            response (a falsy value is treated as "nothing to collect").
    """

    def __init__(self, host, target, labels, urls, connect_fn):
        self.host = host
        self.target = target
        self.labels = labels
        self.urls = urls
        self.connect_server = connect_fn

    def collect(self):
        """Return the list of metrics for the node and its busses.

        Returns:
            A list of metric objects; empty when no node URL is configured
            or the node document cannot be fetched.
        """
        dcn_url = self.urls.get("DistributedControlNode")
        if not dcn_url:
            return []

        dcn_data = self.connect_server(dcn_url)
        if not dcn_data:
            # A failed request must not abort the whole scrape.
            return []

        clean_labels = {
            "host": self.labels.get("host", ""),
            "server_manufacturer": self.labels.get("server_manufacturer", ""),
            "server_model": self.labels.get("server_model", ""),
            "server_serial": self.labels.get("server_serial", ""),
            "system": get_leaf_name(self.labels.get("system", "")),
            "dcn": get_leaf_name(dcn_data.get("Id", "dcn")),
        }

        metrics = self._extract_metrics(dcn_data, clean_labels)

        # "Busses" may be absent or null; both mean "no busses".
        busses_link = (dcn_data.get("Busses") or {}).get("@odata.id")
        if busses_link:
            bus_collector = BusCollector(
                self.host,
                self.target,
                clean_labels,
                {"Busses": busses_link},
                self.connect_server,
            )
            metrics += bus_collector.collect()

        return metrics

    def _extract_metrics(self, data, labels):
        """Delegate to ``_extract_kv_metrics`` using the ``"dcn"`` prefix."""
        return _extract_kv_metrics("dcn", data, labels)
112 changes: 112 additions & 0 deletions collectors/ethernet_collector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import logging
from prometheus_client.core import GaugeMetricFamily

class EthernetCollector:
    """Collect Ethernet interface metrics from a Redfish service.

    Four gauge families are created at construction time and filled by
    :meth:`collect`: an info-style interface gauge, link status, full
    duplex, and DHCPv4-enabled.

    Args:
        host: Host identifier (stored, not used by this class).
        target: Target identifier, used in log messages.
        labels: Mapping of label names to values added to every sample.
        urls: Mapping that may contain ``"NetworkInterfaces"`` or
            ``"EthernetInterfaces"``.
        connect_fn: Callable taking a URL/path and returning the decoded
            response (falsy on failure).
    """

    def __init__(self, host, target, labels, urls, connect_fn):
        self.host = host
        self.target = target
        self.labels = labels
        self.urls = urls
        self.connect_server = connect_fn  # Redfish API connection method

        inherited = list(self.labels.keys())

        self.ethernet_metrics = GaugeMetricFamily(
            "redfish_ethernet_interface",
            "Redfish Server Monitoring Ethernet Interface Status",
            labels=["interface_name", "mac_address", "ipv4", "ipv6", "speed_mbps"] + inherited
        )

        self.link_status_metric = GaugeMetricFamily(
            "redfish_ethernet_link_status",
            "Link status of the Ethernet interface (1=Up, 0=Down/Unknown)",
            labels=["interface_name"] + inherited
        )

        self.duplex_metric = GaugeMetricFamily(
            "redfish_ethernet_full_duplex",
            "Whether the Ethernet interface is in full duplex mode",
            labels=["interface_name"] + inherited
        )

        self.dhcp_metric = GaugeMetricFamily(
            "redfish_ethernet_dhcp_enabled",
            "Whether DHCPv4 is enabled on the Ethernet interface",
            labels=["interface_name"] + inherited
        )

    @staticmethod
    def _value_or(data, key, default):
        """Return ``data[key]``, or ``default`` when missing or null."""
        value = data.get(key)
        return default if value is None else value

    @staticmethod
    def _first_address(entries):
        """Return the ``Address`` of the first entry, or ``"unknown"``."""
        if isinstance(entries, list) and entries and isinstance(entries[0], dict):
            address = entries[0].get("Address")
            if address is not None:
                return address
        return "unknown"

    def collect(self):
        """Populate and return the four Ethernet metric families.

        Returns:
            A list with the interface, link-status, duplex and DHCP metric
            families, or an empty list when no interface collection is
            available (same "nothing collected" value the sibling
            collectors use, so callers can always iterate the result).
        """
        eth_url = self.urls.get("NetworkInterfaces") or self.urls.get("EthernetInterfaces")
        if not eth_url:
            logging.warning("Target %s: No Ethernet interface URL found.", self.target)
            return []

        iface_list = self.connect_server(eth_url)
        if not iface_list or "Members" not in iface_list:
            logging.warning("Target %s: EthernetInterfaces returned no members.", self.target)
            return []

        # ``or []`` covers an explicit JSON null for "Members".
        for iface in iface_list["Members"] or []:
            iface_url = iface.get("@odata.id")
            if not iface_url:
                continue

            iface_data = self.connect_server(iface_url)
            if not iface_data:
                continue

            # Properties can be present but null, so a plain
            # ``dict.get(key, default)`` is not enough to keep label values
            # usable.
            interface_name = self._value_or(iface_data, "Name", "unknown")
            interface_labels = {"interface_name": interface_name, **self.labels}

            metric_labels = {
                "interface_name": interface_name,
                "mac_address": self._value_or(iface_data, "MACAddress", ""),
                "ipv4": self._first_address(iface_data.get("IPv4Addresses")),
                "ipv6": self._first_address(iface_data.get("IPv6Addresses")),
                "speed_mbps": str(self._value_or(iface_data, "SpeedMbps", 0)),
            }
            metric_labels.update(self.labels)

            self.ethernet_metrics.add_sample(
                "redfish_ethernet_interface", value=1, labels=metric_labels
            )

            # LinkStatus (null is treated like "Unknown", i.e. 0)
            link_status = str(self._value_or(iface_data, "LinkStatus", "Unknown"))
            self.link_status_metric.add_sample(
                "redfish_ethernet_link_status",
                value=1 if link_status.lower() == "up" else 0,
                labels=dict(interface_labels)
            )

            # FullDuplex
            self.duplex_metric.add_sample(
                "redfish_ethernet_full_duplex",
                value=1 if iface_data.get("FullDuplex", False) else 0,
                labels=dict(interface_labels)
            )

            # DHCPv4 Enabled ("DHCPv4" may be absent or null)
            dhcp_settings = iface_data.get("DHCPv4") or {}
            self.dhcp_metric.add_sample(
                "redfish_ethernet_dhcp_enabled",
                value=1 if dhcp_settings.get("DHCPEnabled", False) else 0,
                labels=dict(interface_labels)
            )

        return [
            self.ethernet_metrics,
            self.link_status_metric,
            self.duplex_metric,
            self.dhcp_metric
        ]
49 changes: 45 additions & 4 deletions collectors/health_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,8 +206,12 @@ def get_power_health(self):
power_data = self.col.connect_server(self.col.urls["Power"])
if not power_data:
return

for psu in power_data["PowerSupplies"]:
power_supplies = power_data.get("PowerSupplies", [])
if not power_supplies:
logging.warning("Target %s: No PowerSupplies found in Power data!", self.col.target)
return
for psu in power_supplies:
#for psu in power_data["PowerSupplies"]:
psu_name = psu["Name"] if "Name" in psu and psu["Name"] is not None else "unknown"
psu_model = psu["Model"] if "Model" in psu and psu["Model"] is not None else "unknown"

Expand All @@ -224,15 +228,52 @@ def get_power_health(self):
"Health",
current_labels
)
def get_ethernet_health(self):
    """Get Ethernet Interface data from the Redfish API.

    Fetches the ``NetworkInterfaces`` collection, then each member, and
    records one ``redfish_health`` sample per interface via
    ``add_metric_sample``. Members without a link, or whose document cannot
    be fetched, are skipped. Properties that are missing or null are
    reported as ``"unknown"`` so label values are never ``None``.
    """
    logging.debug("Target %s: Get the Ethernet Interface health data.", self.col.target)

    def known(data, key):
        # Treat missing and null alike, as get_power_health does for PSUs.
        value = data.get(key)
        return "unknown" if value is None else value

    eth_url = self.col.urls.get("NetworkInterfaces")
    eth_collection = self.col.connect_server(eth_url) if eth_url else None
    if not eth_collection or "Members" not in eth_collection:
        logging.warning("Target %s: No Ethernet Interfaces found.", self.col.target)
        return

    # ``or []`` covers an explicit JSON null for "Members".
    for iface in eth_collection["Members"] or []:
        iface_url = iface.get("@odata.id")
        if not iface_url:
            continue

        iface_data = self.col.connect_server(iface_url)
        if not iface_data:
            continue

        eth_status = self.extract_health_status(
            iface_data, "EthernetInterface", known(iface_data, "Id")
        )
        current_labels = {
            "device_type": "ethernet",
            "device_name": known(iface_data, "Name"),
            "mac_address": known(iface_data, "MACAddress"),
            "speed_mbps": str(known(iface_data, "SpeedMbps")),
            "interface_enabled": str(known(iface_data, "InterfaceEnabled")),
        }
        current_labels.update(self.col.labels)

        self.add_metric_sample(
            "redfish_health",
            {"Health": eth_status},
            "Health",
            current_labels
        )


def get_thermal_health(self):
"""Get the Thermal data from the Redfish API."""
logging.debug("Target %s: Get the thermal health data.", self.col.target)
thermal_data = self.col.connect_server(self.col.urls["Thermal"])
if not thermal_data:
return
fans = thermal_data.get("Fans",[])
if not fans:
logging.warning("Target %s: No PowerSupplies found in Power Data!", self.col.target)
return

for fan in thermal_data["Fans"]:
for fan in fans:
fan_name = fan.get("Name", "unknown")
current_labels = {
"device_type": "fan",
Expand Down Expand Up @@ -370,7 +411,7 @@ def collect(self):
current_labels
)

for url_key in ["Processors", "Storage", "Chassis", "Power", "Thermal", "Memory"]:
for url_key in ["Processors", "Storage", "Chassis", "Power", "Thermal", "Memory", "NetworkInterfaces"]:
self.collect_health_data(url_key)

def __exit__(self, exc_type, exc_val, exc_tb):
Expand Down
55 changes: 55 additions & 0 deletions collectors/module_collector.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
from prometheus_client.core import GaugeMetricFamily, InfoMetricFamily
from collectors.channel_collector import ChannelCollector
from collectors.utils import _extract_kv_metrics, get_leaf_name

class ModuleCollector:
    """Collect metrics for each module in a Redfish ``IOModules`` collection.

    For every member of the collection the module document is fetched, its
    metrics are extracted, and, when the document links to ``IOChannels``,
    a ``ChannelCollector`` is run for that module.

    Args:
        host: Host identifier, passed through to child collectors.
        target: Target identifier, passed through to child collectors.
        labels: Mapping of label names to values inherited from the parent
            collector.
        urls: Mapping that may contain the ``"IOModules"`` collection URL.
        connect_fn: Callable taking a URL/path and returning the decoded
            response (a falsy value is treated as "nothing to collect").
    """

    def __init__(self, host, target, labels, urls, connect_fn):
        self.host = host
        self.target = target
        self.labels = labels
        self.urls = urls
        self.connect_server = connect_fn

    def collect(self):
        """Return the list of metrics for all reachable modules.

        Returns:
            A list of metric objects; empty when no ``IOModules`` URL is
            configured or the collection cannot be fetched.
        """
        mod_url = self.urls.get("IOModules")
        if not mod_url:
            return []

        data = self.connect_server(mod_url)
        if not data:
            # A failed request must not abort the whole scrape.
            return []

        metrics = []
        # ``or []`` also covers an explicit JSON null for "Members".
        for mod in data.get("Members") or []:
            mod_path = mod.get("@odata.id")
            if not mod_path:
                continue

            mod_data = self.connect_server(mod_path)
            if not mod_data:
                # Skip modules whose document could not be retrieved.
                continue

            # Clean all labels here
            clean_labels = {
                "host": self.labels.get("host", ""),
                "server_manufacturer": self.labels.get("server_manufacturer", ""),
                "server_model": self.labels.get("server_model", ""),
                "server_serial": self.labels.get("server_serial", ""),
                "system": get_leaf_name(self.labels.get("system", "")),
                "dcn": get_leaf_name(self.labels.get("dcn", "")),
                "bus": get_leaf_name(self.labels.get("bus", "")),
                "module": get_leaf_name(mod_path),
            }

            metrics += self._extract_metrics(mod_data, clean_labels)

            # "IOChannels" may be absent or null; both mean "no channels".
            ch_link = (mod_data.get("IOChannels") or {}).get("@odata.id")
            if ch_link:
                ch_collector = ChannelCollector(
                    self.host,
                    self.target,
                    clean_labels,
                    {"IOChannels": ch_link},
                    self.connect_server,
                )
                metrics += ch_collector.collect()

        return metrics

    def _extract_metrics(self, data, labels):
        """Delegate to ``_extract_kv_metrics`` using the ``"module"`` prefix."""
        return _extract_kv_metrics("module", data, labels)
Loading