diff --git a/Dockerfile b/Dockerfile index 59f9380..f7f2d59 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,4 +1,4 @@ -FROM keppel.eu-de-1.cloud.sap/ccloud-dockerhub-mirror/library/ubuntu:latest +FROM ubuntu:latest RUN export DEBIAN_FRONTEND=noninteractive \ && apt-get update \ diff --git a/collector.py b/collector.py index 78699b2..62a5c9f 100644 --- a/collector.py +++ b/collector.py @@ -5,11 +5,24 @@ import sys import re import requests +import redfish +import math from prometheus_client.core import GaugeMetricFamily from collectors.performance_collector import PerformanceCollector from collectors.firmware_collector import FirmwareCollector from collectors.health_collector import HealthCollector from collectors.certificate_collector import CertificateCollector +from collectors.ethernet_collector import EthernetCollector +from collectors.operating_system_collector import OperatingSystemCollector +from collectors.recursive_collector import RecursiveCollector +from collectors.system_collector import SystemCollector +from collectors.dcn_collector import DistributedControlNodeCollector +from collectors.bus_collector import BusCollector +from collectors.module_collector import ModuleCollector +from collectors.channel_collector import ChannelCollector +from collectors.utils import _extract_kv_metrics + + class RedfishMetricsCollector: """Class for collecting Redfish metrics.""" @@ -19,6 +32,7 @@ def __enter__(self): def __init__(self, config, target, host, usr, pwd, metrics_type): self.target = target self.host = host + self.config = config self._username = usr self._password = pwd @@ -44,7 +58,7 @@ def __init__(self, config, target, host, usr, pwd, metrics_type): "Thermal": "", "PowerSubsystem": "", "ThermalSubsystem": "", - "NetworkInterfaces": "", + "EthernetInterfaces": "", } self.server_health = 0 @@ -69,6 +83,11 @@ def __init__(self, config, target, host, usr, pwd, metrics_type): self._basic_auth = False self._session = "" self.redfish_version = "not available" + 
self.health_summary_metrics = GaugeMetricFamily( + "redfish_health_summary", + "Redfish Server Monitoring Summary Metrics (CPU, Memory, etc.)", + labels=["host", "server_manufacturer", "server_model", "server_serial", "device_type", "cpu_model", "cpu_count", "total_system_memory_gb"] + ) def get_session(self): """Get the url for the server info and messure the response time""" @@ -164,12 +183,37 @@ def get_session(self): ) self._basic_auth = True - if result: - if result.status_code in [200, 201]: - self._auth_token = result.headers['X-Auth-Token'] - self._session_url = result.json()['@odata.id'] - logging.info("Target %s: Got an auth token from server %s!", self.target, self.host) - self._redfish_up = 1 + if result and result.status_code in [200, 201]: + print("Status code :", result.status_code) + print("Response Text:", result.text) + # print("Response Headers are :", result.headers) + # logging.info(f"Response Headers: {result.headers}") + logging.info(f"Response Headers: {result.headers}") + logging.info(f"The status code is {result.status_code} and the response is {result}") + self._auth_token = result.headers.get('X-Auth-Token') + session_url = result.headers.get('Location') + if not self._auth_token: + logging.warning("Target %s: No X-Auth-Token in headers", self.target) + self._redfish_up = 0 + return + + if not session_url: + try: + json_body = result.json() + session_url = json_body.get('@odata.id') + logging.debug("Session URL from JSON: %s", session_url) + except (ValueError, requests.exceptions.JSONDecodeError) as e: + logging.warning("Invalid or empty JSON body. 
Exception: %s", e) + + if not session_url: + logging.warning("Session URL not found in either JSON body or Location header.") + self._redfish_up = 0 + return + + self._session_url = session_url + # self._session_url = result.json()['@odata.id'] + logging.info("Target %s: Got an auth token from server %s!", self.target, self.host) + self._redfish_up = 1 def connect_server(self, command, noauth=False, basic_auth=False): """Connect to the server and get the data.""" @@ -282,60 +326,198 @@ def connect_server(self, command, noauth=False, basic_auth=False): request_duration = round(time.time() - request_start, 2) logging.debug("Target %s: Request duration: %s", self.target, request_duration) return server_response - def get_base_labels(self): - """Get the basic labels for the metrics.""" + """Get base labels and populate Redfish component URLs.""" systems = self.connect_server(self.urls['Systems']) - if not systems: + logging.error("Target %s: No response from /Systems", self.target) return power_states = {"off": 0, "on": 1} - # Get the server info for the labels - server_info = {} - for member in systems['Members']: - self._systems_url = member['@odata.id'] - info = self.connect_server(self._systems_url) - if info: - server_info.update(info) + members = systems.get("Members", []) + if not members: + logging.error("Target %s: No system members found under /Systems", self.target) + return + + self._systems_url = members[0].get("@odata.id") + if not self._systems_url: + logging.error("Target %s: No @odata.id in first system member", self.target) + return + + server_info = self.connect_server(self._systems_url) if not server_info: + logging.error("Target %s: Could not fetch system info at %s", self.target, self._systems_url) return - self.manufacturer = server_info.get('Manufacturer') - self.model = server_info.get('Model') + self.urls["EthernetInterfaces"] = server_info.get("EthernetInterfaces", {}).get("@odata.id", "") + logging.debug("EthernetInterfaces URL: %s", 
self.urls["EthernetInterfaces"]) + + + # Extract labels + self.manufacturer = server_info.get("Manufacturer", "Custom") + self.model = server_info.get("Model", "unknown") + self.serial = server_info.get("SerialNumber", "") + if not self.manufacturer or not self.model: logging.error("Target %s: No manufacturer or model found on server %s!", self.target, self.host) + logging.debug("Target %s: Full server_info payload: %s", self.target, server_info) return - self.powerstate = power_states[server_info['PowerState'].lower()] - # Dell has the Serial# in the SKU field, others in the SerialNumber field. - if "SKU" in server_info and re.match(r'^[Dd]ell.*', server_info['Manufacturer']): - self.serial = server_info['SKU'] - else: - self.serial = server_info['SerialNumber'] - - self.labels.update( - { - "host": self.host, - "server_manufacturer": self.manufacturer, - "server_model": self.model, - "server_serial": self.serial + + power_state_raw = server_info.get("PowerState", "off").lower() + self.powerstate = power_states.get(power_state_raw, 0) + + self.labels.update({ + "host": self.host, + "server_manufacturer": self.manufacturer, + "server_model": self.model, + "server_serial": self.serial + }) + + # Overall health + status_obj = server_info.get("Status", {}) + self.server_health = self.status.get(status_obj.get("Health", "").lower(), 0) + # Store processor summary + processor_summary = server_info.get("ProcessorSummary", {}) + if processor_summary: + labels = { + "device_type": "processor_summary", + "cpu_model": processor_summary.get("Model", "unknown"), + "cpu_count": str(processor_summary.get("Count", "unknown")) } - ) + labels.update(self.labels) + self.health_summary_metrics.add_sample( + "redfish_health_summary", + value=self.status.get(processor_summary.get("Status", {}).get("Health", "").lower(), math.nan), + labels=labels + ) + + # Store memory summary + memory_summary = server_info.get("MemorySummary", {}) + if memory_summary: + labels = { + "device_type": 
"memory_summary", + "total_system_memory_gb": str(memory_summary.get("TotalSystemMemory", "unknown")) + } + labels.update(self.labels) + self.health_summary_metrics.add_sample( + "redfish_health_summary", + value=self.status.get(memory_summary.get("Status", {}).get("Health", "").lower(), math.nan), + labels=labels + ) - self.server_health = self.status[server_info['Status']['Health'].lower()] - # get the links of the parts for later - for url in self.urls: - if url in server_info: - self.urls[url] = server_info[url]['@odata.id'] + # Set component URLs + keys_direct = ["Processors", "Memory", "Storage", "Power", "Thermal", "EthernetInterfaces"] + for key in keys_direct: + self.urls[key] = server_info.get(key, {}).get("@odata.id", "") + + links = server_info.get("Links", {}) + chassis_list = links.get("Chassis", []) + if chassis_list: + chassis_ref = chassis_list[0] + self.urls["Chassis"] = chassis_ref["@odata.id"] if isinstance(chassis_ref, dict) else chassis_ref + + manager_list = links.get("ManagedBy", []) + if manager_list: + manager_ref = manager_list[0] + self.urls["ManagedBy"] = manager_ref["@odata.id"] if isinstance(manager_ref, dict) else manager_ref + + logging.debug("Target %s: Parsed Redfish component URLs: %s", self.target, self.urls) + + # Now try to discover thermal/power subsystems + self.get_chassis_urls() + + + # def get_base_labels(self): + # """Get the basic labels for the metrics.""" + # systems = self.connect_server(self.urls['Systems']) + + # if not systems: + # return + + # power_states = {"off": 0, "on": 1} + # # Get the server info for the labels + # # server_info = {} + # members = systems.get("Members", []) + # if not members: + # logging.error("Target %s: No system members found under /Systems", self.target) + # return + # # Always take the first system + # self._systems_url = members[0].get("@odata.id") + # if not self._systems_url: + # logging.error("Target %s: No @odata.id in first system member", self.target) + # return + # 
server_info = self.connect_server(self._systems_url) + # if not server_info: + # logging.error("Target %s: Could not fetch system info at %s", self.target, self._systems_url) + # return + # # for member in systems['Members']: + # # self._systems_url = member['@odata.id'] + # # info = self.connect_server(self._systems_url) + # # if info: + # # server_info.update(info) + + # # if not server_info: + # # return + # self.manufacturer = server_info.get('Manufacturer') + # self.model = server_info.get('Model') + # if not self.manufacturer or not self.model: + # logging.error("Target %s: No manufacturer or model found on server %s!", self.target, self.host) + # return + # self.powerstate = power_states[server_info['PowerState'].lower()] + # # Dell has the Serial# in the SKU field, others in the SerialNumber field. + # if "SKU" in server_info and re.match(r'^[Dd]ell.*', server_info['Manufacturer']): + # self.serial = server_info['SKU'] + # else: + # self.serial = server_info['SerialNumber'] + + # self.labels.update( + # { + # "host": self.host, + # "server_manufacturer": self.manufacturer, + # "server_model": self.model, + # "server_serial": self.serial + # } + # ) + + # self.server_health = self.status[server_info['Status']['Health'].lower()] + + # # get the links of the parts for later + # # for url in self.urls: + # # if url in server_info: + # # self.urls[url] = server_info[url]['@odata.id'] + + # # # standard is a list but there are exceptions + # # if isinstance(server_info['Links']['Chassis'][0], str): + # # self.urls['Chassis'] = server_info['Links']['Chassis'][0] + # # self.urls['ManagedBy'] = server_info['Links']['ManagedBy'][0] + # # else: + # # self.urls['Chassis'] = server_info['Links']['Chassis'][0]['@odata.id'] + # # self.urls['ManagedBy'] = server_info['Links']['ManagedBy'][0]['@odata.id'] + # # Extract direct component paths + direct_keys = ["Processors", "Memory", "Storage", "Power", "Thermal", "EthernetInterfaces"] + for key in direct_keys: + 
self.urls[key] = server_info.get(key, {}).get("@odata.id", "") + + # Handle nested Chassis and Manager links + chassis_links = server_info.get("Links", {}).get("Chassis", []) + if chassis_links: + chassis_ref = chassis_links[0] + if isinstance(chassis_ref, dict): + self.urls["Chassis"] = chassis_ref.get("@odata.id", "") + elif isinstance(chassis_ref, str): + self.urls["Chassis"] = chassis_ref + + manager_links = server_info.get("Links", {}).get("ManagedBy", []) + if manager_links: + manager_ref = manager_links[0] + if isinstance(manager_ref, dict): + self.urls["ManagedBy"] = manager_ref.get("@odata.id", "") + elif isinstance(manager_ref, str): + self.urls["ManagedBy"] = manager_ref + + logging.debug("Target %s: Parsed component URLs: %s", self.target, self.urls) - # standard is a list but there are exceptions - if isinstance(server_info['Links']['Chassis'][0], str): - self.urls['Chassis'] = server_info['Links']['Chassis'][0] - self.urls['ManagedBy'] = server_info['Links']['ManagedBy'][0] - else: - self.urls['Chassis'] = server_info['Links']['Chassis'][0]['@odata.id'] - self.urls['ManagedBy'] = server_info['Links']['ManagedBy'][0]['@odata.id'] self.get_chassis_urls() @@ -426,6 +608,7 @@ def collect(self): yield metrics.mem_metrics_uncorrectable yield metrics.health_metrics + # Get the firmware information if self.metrics_type == 'firmware': metrics = FirmwareCollector(self) @@ -459,6 +642,71 @@ def collect(self): value = duration, labels = self.labels, ) + ether_collector = EthernetCollector( + self.host, + self.target, + self.labels, + self.urls, + self.connect_server + ) + for metric in ether_collector.collect(): + yield metric + # SYSTEM collector + system_collector = SystemCollector(self.host, self.target, self.labels, self.urls, self.connect_server) + for metric in system_collector.collect(): + yield metric + + # DCN collector + dcn_collector = DistributedControlNodeCollector(self.host, self.target, self.labels, self.urls, self.connect_server) + for metric in 
dcn_collector.collect(): + yield metric + + # BUS collector + bus_collector = BusCollector(self.host, self.target, self.labels, self.urls, self.connect_server) + for metric in bus_collector.collect(): + yield metric + + # MODULE collector + module_collector = ModuleCollector(self.host, self.target, self.labels, self.urls, self.connect_server) + for metric in module_collector.collect(): + yield metric + + # CHANNEL collector + channel_collector = ChannelCollector(self.host, self.target, self.labels, self.urls, self.connect_server) + for metric in channel_collector.collect(): + yield metric + + os_collector = OperatingSystemCollector( + self.host, + self.target, + self.labels, + self.urls, + self.connect_server + ) + os_collector = OperatingSystemCollector(self.host, self.target, self.labels, self.urls, self.connect_server) + for metric in os_collector.collect(): + yield metric + # ether_collector.collect() + # yield ether_collector.ethernet_metrics + # eth_metrics = EthernetCollector(self.host, self.target, self.labels, self.urls) + # eth_metrics.collect() + # yield eth_metrics.ethernet_health_metrics + # recursive_collector = RecursiveCollector( + # host=self.host, + # target=self.target, + # labels=self.labels, + # # start_path="/redfish/v1/Systems/AXCF3152/DistributedControlNode/Busses/Axioline/IOModules", + # start_path="/redfish/v1/Systems", + # connect_fn=self.connect_server, + # # max_depth=8 # Adjust as needed + # config=self.config + # ) + # for metric in recursive_collector.collect(): + # yield metric + + + if hasattr(self, "health_summary_metrics"): + yield self.health_summary_metrics yield scrape_metrics def __exit__(self, exc_type, exc_val, exc_tb): diff --git a/collectors/bus_collector.py b/collectors/bus_collector.py new file mode 100644 index 0000000..028d1c9 --- /dev/null +++ b/collectors/bus_collector.py @@ -0,0 +1,50 @@ +from prometheus_client.core import GaugeMetricFamily, InfoMetricFamily +from collectors.module_collector import ModuleCollector 
+from collectors.utils import _extract_kv_metrics, get_leaf_name + +class BusCollector: + def __init__(self, host, target, labels, urls, connect_fn): + self.host = host + self.target = target + self.labels = labels + self.urls = urls + self.connect_server = connect_fn + + def collect(self): + busses_url = self.urls.get("Busses") + if not busses_url: + return [] + + data = self.connect_server(busses_url) + metrics = [] + + for member in data.get("Members", []): + member_path = member["@odata.id"] + bus_id = get_leaf_name(member_path) + + clean_labels = { + "host": self.labels.get("host", ""), + "server_manufacturer": self.labels.get("server_manufacturer", ""), + "server_model": self.labels.get("server_model", ""), + "server_serial": self.labels.get("server_serial", ""), + "system": get_leaf_name(self.labels.get("system", "")), + "dcn": get_leaf_name(self.labels.get("dcn", "")), + "bus": bus_id + } + + member_data = self.connect_server(member_path) + metrics += self._extract_metrics(member_data, clean_labels) + + mod_link = member_data.get("IOModules", {}).get("@odata.id") + if mod_link: + mod_collector = ModuleCollector( + self.host, self.target, clean_labels, + {"IOModules": mod_link}, + self.connect_server + ) + metrics += mod_collector.collect() + + return metrics + + def _extract_metrics(self, data, labels): + return _extract_kv_metrics("bus", data, labels) diff --git a/collectors/channel_collector.py b/collectors/channel_collector.py new file mode 100644 index 0000000..ebf3f8f --- /dev/null +++ b/collectors/channel_collector.py @@ -0,0 +1,47 @@ +from prometheus_client.core import GaugeMetricFamily +from collectors.utils import _extract_kv_metrics, get_leaf_name + +class ChannelCollector: + def __init__(self, host, target, labels, urls, connect_fn): + self.host = host + self.target = target + self.labels = labels + self.urls = urls + self.connect_server = connect_fn + + def collect(self): + ch_url = self.urls.get("IOChannels") + if not ch_url: + return [] + + 
data = self.connect_server(ch_url) + metrics = [] + + for ch in data.get("Members", []): + ch_data = self.connect_server(ch["@odata.id"]) + + # Extract the clean names for labels + system = get_leaf_name(self.labels.get("system", "")) + dcn = get_leaf_name(self.labels.get("dcn", "")) + bus = get_leaf_name(self.labels.get("bus", "")) + module = get_leaf_name(self.labels.get("module", "")) + channel = get_leaf_name(ch_data.get("Id", "channel")) + + scoped_labels = { + "host": self.host, + "server_manufacturer": self.labels.get("server_manufacturer", ""), + "server_model": self.labels.get("server_model", ""), + "server_serial": self.labels.get("server_serial", ""), + "system": system, + "dcn": dcn, + "bus": bus, + "module": module, + "channel": channel + } + + metrics += self._extract_metrics(ch_data, scoped_labels) + + return metrics + + def _extract_metrics(self, data, labels): + return _extract_kv_metrics("channel", data, labels) diff --git a/collectors/dcn_collector.py b/collectors/dcn_collector.py new file mode 100644 index 0000000..88e89ff --- /dev/null +++ b/collectors/dcn_collector.py @@ -0,0 +1,45 @@ +from prometheus_client.core import GaugeMetricFamily, InfoMetricFamily +from collectors.bus_collector import BusCollector +from collectors.utils import _extract_kv_metrics, get_leaf_name + + +class DistributedControlNodeCollector: + def __init__(self, host, target, labels, urls, connect_fn): + self.host = host + self.target = target + self.labels = labels + self.urls = urls + self.connect_server = connect_fn + + def collect(self): + dcn_url = self.urls.get("DistributedControlNode") + if not dcn_url: + return [] + + dcn_data = self.connect_server(dcn_url) + dcn_id = get_leaf_name(dcn_data.get("Id", "dcn")) + + clean_labels = { + "host": self.labels.get("host", ""), + "server_manufacturer": self.labels.get("server_manufacturer", ""), + "server_model": self.labels.get("server_model", ""), + "server_serial": self.labels.get("server_serial", ""), + "system": 
get_leaf_name(self.labels.get("system", "")), + "dcn": dcn_id + } + + metrics = self._extract_metrics(dcn_data, clean_labels) + + busses_link = dcn_data.get("Busses", {}).get("@odata.id") + if busses_link: + bus_collector = BusCollector( + self.host, self.target, clean_labels, + {"Busses": busses_link}, + self.connect_server + ) + metrics += bus_collector.collect() + + return metrics + + def _extract_metrics(self, data, labels): + return _extract_kv_metrics("dcn", data, labels) diff --git a/collectors/ethernet_collector.py b/collectors/ethernet_collector.py new file mode 100644 index 0000000..ca9078d --- /dev/null +++ b/collectors/ethernet_collector.py @@ -0,0 +1,112 @@ +import logging +from prometheus_client.core import GaugeMetricFamily + +class EthernetCollector: + def __init__(self, host, target, labels, urls, connect_fn): + self.host = host + self.target = target + self.labels = labels + self.urls = urls + self.connect_server = connect_fn # Redfish API connection method + + self.ethernet_metrics = GaugeMetricFamily( + "redfish_ethernet_interface", + "Redfish Server Monitoring Ethernet Interface Status", + labels=["interface_name", "mac_address", "ipv4", "ipv6", "speed_mbps"] + list(self.labels.keys()) + ) + + self.link_status_metric = GaugeMetricFamily( + "redfish_ethernet_link_status", + "Link status of the Ethernet interface (1=Up, 0=Down/Unknown)", + labels=["interface_name"] + list(self.labels.keys()) + ) + + self.duplex_metric = GaugeMetricFamily( + "redfish_ethernet_full_duplex", + "Whether the Ethernet interface is in full duplex mode", + labels=["interface_name"] + list(self.labels.keys()) + ) + + self.dhcp_metric = GaugeMetricFamily( + "redfish_ethernet_dhcp_enabled", + "Whether DHCPv4 is enabled on the Ethernet interface", + labels=["interface_name"] + list(self.labels.keys()) + ) + + def collect(self): + eth_url = self.urls.get("NetworkInterfaces") or self.urls.get("EthernetInterfaces") + if not eth_url: + logging.warning("Target %s: No Ethernet 
interface URL found.", self.target) + return + + iface_list = self.connect_server(eth_url) + if not iface_list or "Members" not in iface_list: + logging.warning("Target %s: EthernetInterfaces returned no members.", self.target) + return + + for iface in iface_list["Members"]: + iface_url = iface.get("@odata.id") + if not iface_url: + continue + + iface_data = self.connect_server(iface_url) + if not iface_data: + continue + + interface_name = iface_data.get("Name", "unknown") + mac = iface_data.get("MACAddress", "") + speed = str(iface_data.get("SpeedMbps", 0)) + ipv4 = "unknown" + ipv6 = "unknown" + + ipv4_list = iface_data.get("IPv4Addresses", []) + if ipv4_list and isinstance(ipv4_list, list) and "Address" in ipv4_list[0]: + ipv4 = ipv4_list[0]["Address"] + + ipv6_list = iface_data.get("IPv6Addresses", []) + if ipv6_list and isinstance(ipv6_list, list) and "Address" in ipv6_list[0]: + ipv6 = ipv6_list[0]["Address"] + + metric_labels = { + "interface_name": interface_name, + "mac_address": mac, + "ipv4": ipv4, + "ipv6": ipv6, + "speed_mbps": speed + } + metric_labels.update(self.labels) + + self.ethernet_metrics.add_sample( + "redfish_ethernet_interface", value=1, labels=metric_labels + ) + + # LinkStatus + link_status = iface_data.get("LinkStatus", "Unknown") + self.link_status_metric.add_sample( + "redfish_ethernet_link_status", + value=1 if link_status.lower() == "up" else 0, + labels={"interface_name": interface_name, **self.labels} + ) + + # FullDuplex + full_duplex = iface_data.get("FullDuplex", False) + self.duplex_metric.add_sample( + "redfish_ethernet_full_duplex", + value=1 if full_duplex else 0, + labels={"interface_name": interface_name, **self.labels} + ) + + # DHCPv4 Enabled + dhcp_enabled = iface_data.get("DHCPv4", {}).get("DHCPEnabled", False) + self.dhcp_metric.add_sample( + "redfish_ethernet_dhcp_enabled", + value=1 if dhcp_enabled else 0, + labels={"interface_name": interface_name, **self.labels} + ) + + return [ + self.ethernet_metrics, + 
self.link_status_metric, + self.duplex_metric, + self.dhcp_metric + ] diff --git a/collectors/health_collector.py b/collectors/health_collector.py index 846c5d8..15931a1 100644 --- a/collectors/health_collector.py +++ b/collectors/health_collector.py @@ -206,8 +206,12 @@ def get_power_health(self): power_data = self.col.connect_server(self.col.urls["Power"]) if not power_data: return - - for psu in power_data["PowerSupplies"]: + power_supplies = power_data.get("PowerSupplies", []) + if not power_supplies: + logging.warning("Target %s: No PowerSupplies found in Power data!", self.col.target) + return + for psu in power_supplies: + #for psu in power_data["PowerSupplies"]: psu_name = psu["Name"] if "Name" in psu and psu["Name"] is not None else "unknown" psu_model = psu["Model"] if "Model" in psu and psu["Model"] is not None else "unknown" @@ -224,6 +228,39 @@ def get_power_health(self): "Health", current_labels ) + def get_ethernet_health(self): + """Get Ethernet Interface data from the Redfish API.""" + logging.debug("Target %s: Get the Ethernet Interface health data.", self.col.target) + + eth_collection = self.col.connect_server(self.col.urls["NetworkInterfaces"]) + if not eth_collection or "Members" not in eth_collection: + logging.warning("Target %s: No Ethernet Interfaces found.", self.col.target) + return + + for iface in eth_collection["Members"]: + iface_data = self.col.connect_server(iface["@odata.id"]) + if not iface_data: + continue + + eth_status = self.extract_health_status( + iface_data, "EthernetInterface", iface_data.get("Id", "unknown") + ) + current_labels = { + "device_type": "ethernet", + "device_name": iface_data.get("Name", "unknown"), + "mac_address": iface_data.get("MACAddress", "unknown"), + "speed_mbps": str(iface_data.get("SpeedMbps", "unknown")), + "interface_enabled": str(iface_data.get("InterfaceEnabled", "unknown")), + } + current_labels.update(self.col.labels) + + self.add_metric_sample( + "redfish_health", + {"Health": eth_status}, + 
"Health", + current_labels + ) + def get_thermal_health(self): """Get the Thermal data from the Redfish API.""" @@ -231,8 +268,12 @@ def get_thermal_health(self): thermal_data = self.col.connect_server(self.col.urls["Thermal"]) if not thermal_data: return + fans = thermal_data.get("Fans",[]) + if not fans: + logging.warning("Target %s: No PowerSupplies found in Power Data!", self.col.target) + return - for fan in thermal_data["Fans"]: + for fan in fans: fan_name = fan.get("Name", "unknown") current_labels = { "device_type": "fan", @@ -370,7 +411,7 @@ def collect(self): current_labels ) - for url_key in ["Processors", "Storage", "Chassis", "Power", "Thermal", "Memory"]: + for url_key in ["Processors", "Storage", "Chassis", "Power", "Thermal", "Memory", "NetworkInterfaces"]: self.collect_health_data(url_key) def __exit__(self, exc_type, exc_val, exc_tb): diff --git a/collectors/module_collector.py b/collectors/module_collector.py new file mode 100644 index 0000000..c340f30 --- /dev/null +++ b/collectors/module_collector.py @@ -0,0 +1,55 @@ +from prometheus_client.core import GaugeMetricFamily, InfoMetricFamily +from collectors.channel_collector import ChannelCollector +from collectors.utils import _extract_kv_metrics, get_leaf_name + +class ModuleCollector: + def __init__(self, host, target, labels, urls, connect_fn): + self.host = host + self.target = target + self.labels = labels + self.urls = urls + self.connect_server = connect_fn + + def collect(self): + mod_url = self.urls.get("IOModules") + if not mod_url: + return [] + + data = self.connect_server(mod_url) + metrics = [] + + for mod in data.get("Members", []): + mod_path = mod["@odata.id"] + mod_id = get_leaf_name(mod_path) + + # Clean all labels here + clean_labels = { + "host": self.labels.get("host", ""), + "server_manufacturer": self.labels.get("server_manufacturer", ""), + "server_model": self.labels.get("server_model", ""), + "server_serial": self.labels.get("server_serial", ""), + "system": 
get_leaf_name(self.labels.get("system", "")), + "dcn": get_leaf_name(self.labels.get("dcn", "")), + "bus": get_leaf_name(self.labels.get("bus", "")), + "module": mod_id + } + + mod_data = self.connect_server(mod_path) + + metrics += self._extract_metrics(mod_data, clean_labels) + + ch_link = mod_data.get("IOChannels", {}).get("@odata.id") + if ch_link: + ch_collector = ChannelCollector( + self.host, + self.target, + clean_labels, + {"IOChannels": ch_link}, + self.connect_server + ) + metrics += ch_collector.collect() + + return metrics + + def _extract_metrics(self, data, labels): + return _extract_kv_metrics("module", data, labels) diff --git a/collectors/operating_system_collector.py b/collectors/operating_system_collector.py new file mode 100644 index 0000000..73ac6b4 --- /dev/null +++ b/collectors/operating_system_collector.py @@ -0,0 +1,68 @@ +import logging +from prometheus_client.core import GaugeMetricFamily + +class OperatingSystemCollector: + def __init__(self, host, target, labels, urls, connect_fn): + self.host = host + self.target = target + self.labels = labels + self.urls = urls + self.connect_server = connect_fn + + def collect(self): + os_url = self.urls.get("OperatingSystem") + if not os_url: + logging.warning("OperatingSystem URL not found in provided URLs") + return [] + + os_info = self.connect_server(os_url) + if not os_info: + logging.warning("No data returned from OperatingSystem endpoint") + return [] + + # Collect basic OS info as a 1-valued metric + os_info_metric = GaugeMetricFamily( + "redfish_operating_system_info", + "Basic information about the operating system", + labels=["os_id", "os_name", "kernel_name", "hostname", "processor_type"] + list(self.labels.keys()) + ) + os_info_metric.add_sample( + "redfish_operating_system_info", + value=1, + labels={ + "os_id": os_info.get("Id", "unknown"), + "os_name": os_info.get("OperatingSystemName", "unknown"), + "kernel_name": os_info.get("KernelName", "unknown"), + "hostname": 
class RecursiveCollector:
    """Crawl a Redfish resource tree and export every scalar field as a metric.

    Starting at ``start_path`` each resource is fetched through ``connect_fn``.
    Numeric fields become gauges, string fields become info metrics, and every
    ``@odata.id`` link (direct children first, then ``Members`` entries) is
    followed recursively, at most ``max_depth`` levels deep.

    Args:
        host: Value written to the ``host`` label of every sample.
        target: Target identifier; stored on the instance, not used here.
        labels: Base labels copied into every sample.
        start_path: Redfish path at which the crawl begins.
        connect_fn: Callable taking a path and returning the decoded resource.
            Any return value that is not a dict ends that branch of the crawl.
        config: Optional mapping. ``config["recursive"]["include_logs"]``
            (default ``False``) enables crawling of log entry collections.
    """

    # Characters allowed in the key part of a generated metric name.
    _METRIC_KEY_CHARS = frozenset("abcdefghijklmnopqrstuvwxyz0123456789_")
    # Sanitized keys that only carry Redfish bookkeeping; never exported.
    # sanitize_key() removes "@", so "@odata.id" arrives here as "odata_id".
    _IGNORED_KEYS = frozenset({"odata_id", "odata_context", "odata_type", "id"})
    # String values that carry no information and are not exported.
    _PLACEHOLDER_STRINGS = frozenset({"", "value", "id", "none", "unknown", "null"})
    # Path segment (lower-cased) -> label that receives the following segment.
    _PATH_LABELS = {
        "systems": "system",
        "distributedcontrolnode": "dcn",
        "busses": "bus",
        "iomodules": "module",
        "iochannels": "channel",
    }

    def __init__(self, host, target, labels, start_path, connect_fn, config=None):
        self.host = host
        self.target = target
        self.labels = labels
        self.start_path = start_path
        self.connect_server = connect_fn
        self.metrics = {}
        self.status_map = {"OK": 1.0, "Warning": 0.5, "Critical": 0.0}
        self.max_depth = 20
        # Paths already fetched during the current crawl (cycle protection).
        self._visited = set()

        # ``recursive:`` with no children parses to None in YAML, hence ``or {}``.
        recursive_config = (config.get("recursive") or {}) if config else {}
        include_logs = recursive_config.get("include_logs", False)
        self.skip_paths = [] if include_logs else ["/LogServices/Log/Entries"]

    def collect(self):
        """Run a fresh crawl from ``start_path`` and return the metric families."""
        self.metrics = {}
        self._visited = set()
        self.walk_and_collect(self.start_path, self.labels.copy(), 0)
        return list(self.metrics.values())

    def walk_and_collect(self, path, inherited_labels, depth):
        """Fetch ``path``, export its fields and recurse into linked resources.

        Each path is fetched at most once between two ``collect()`` calls.
        Redfish resources routinely link back to their parents and siblings,
        so without this a crawl would refetch the same resources until
        ``max_depth`` is exhausted.
        """
        if depth > self.max_depth:
            return
        if not isinstance(path, str) or not path:
            return
        if any(skip in path for skip in self.skip_paths):
            return

        # Treat "/x" and "/x/" as the same resource.
        visit_key = path.rstrip("/") or path
        if visit_key in self._visited:
            return
        self._visited.add(visit_key)

        data = self.connect_server(path)
        if not isinstance(data, dict):
            return

        context_labels = inherited_labels.copy()
        context_labels["host"] = self.host
        context_labels["odata_id"] = path
        context_labels.update(self.extract_labels_from_path(path))

        self.extract_fields_as_metrics(data, context_labels)

        for next_path in self._child_links(data):
            self.walk_and_collect(next_path, context_labels, depth + 1)

    @staticmethod
    def _child_links(data):
        """Yield linked paths: direct ``@odata.id`` children, then ``Members``."""
        for val in data.values():
            if isinstance(val, dict) and "@odata.id" in val:
                yield val["@odata.id"]
        members = data.get("Members")
        if isinstance(members, list):
            for member in members:
                if isinstance(member, dict) and "@odata.id" in member:
                    yield member["@odata.id"]

    def extract_fields_as_metrics(self, data, labels):
        """Export the scalar fields of one resource.

        Numbers (including booleans, which are ``int`` in Python) become
        gauges; non-placeholder strings become info metrics.  ``Status`` and a
        few common keys get dedicated metrics; identical info samples that the
        generic pass already produced are not emitted a second time.
        """
        emitted_info = set()

        def emit_info(key, value):
            # The flattened pass and the dedicated sections overlap (for
            # example "Status_Health" vs. Status["Health"]); emit each once.
            if (key, value) in emitted_info:
                return
            emitted_info.add((key, value))
            self.add_info_metric(key, value, labels)

        for raw_key, raw_val in self.flatten_dict(data).items():
            key = self.sanitize_key(raw_key)
            if key in self._IGNORED_KEYS:
                continue
            if isinstance(raw_val, (int, float)):
                self.add_gauge_metric(key, float(raw_val), labels)
            elif isinstance(raw_val, str):
                text = raw_val.strip()
                if text.lower() in self._PLACEHOLDER_STRINGS:
                    continue
                emit_info(key, text)

        # Dedicated status metrics, plus a numeric score for known values.
        status = data.get("Status")
        if isinstance(status, dict):
            for skey in ("Health", "HealthRollup", "State"):
                val = status.get(skey)
                if not isinstance(val, str):
                    # Absent or null: nothing meaningful to export.
                    continue
                metric_key = f"status_{skey.lower()}"
                emit_info(metric_key, val)
                if val in self.status_map:
                    self.add_gauge_metric(
                        metric_key + "_score", self.status_map[val], labels
                    )

        # Common keys are exported as info even when their value is numeric.
        for key in ("ErrorCode", "ErrorText", "Name"):
            if key in data:
                val = str(data[key]).strip()
                if val.lower() not in {"null", "none", "unknown", ""}:
                    emit_info(key.lower(), val)

    def flatten_dict(self, d, parent_key="", sep="_"):
        """Flatten nested dicts into ``{"Outer_Inner": value}``.

        Link-only children (``{"@odata.id": ...}``) and lists are dropped.
        """
        items = {}
        for k, v in d.items():
            new_key = f"{parent_key}{sep}{k}" if parent_key else k
            if isinstance(v, dict):
                if "@odata.id" in v and len(v) == 1:
                    continue  # pure link, followed by the crawler instead
                items.update(self.flatten_dict(v, new_key, sep=sep))
            elif isinstance(v, list):
                continue
            else:
                items[new_key] = v
        return items

    def extract_labels_from_path(self, path):
        """Derive ``system``/``dcn``/``bus``/``module``/``channel`` labels from a path.

        The segment following a known collection name becomes the label value.
        A trailing ``DistributedControlNode`` yields an empty ``dcn`` label; the
        other collections yield no label when nothing follows them.
        """
        parts = path.strip("/").split("/")
        labels = {}
        for i, part in enumerate(parts):
            name = self._PATH_LABELS.get(part.lower())
            if name is None:
                continue
            if i + 1 < len(parts):
                labels[name] = parts[i + 1]
            elif name == "dcn":
                labels[name] = ""
        return labels

    def sanitize_key(self, key):
        """Turn a flattened Redfish key into a valid metric-name fragment.

        Lower-cases the key, maps space, ``-``, ``.`` and ``/`` to ``_``, drops
        ``@``, and finally replaces every remaining character outside
        ``[a-z0-9_]`` (such as the ``#`` in ``#ComputerSystem.Reset``) with
        ``_`` so that the resulting metric name only contains valid characters.
        """
        normalized = (
            key.lower()
            .strip()
            .replace(" ", "_")
            .replace("-", "_")
            .replace(".", "_")
            .replace("/", "_")
            .replace("@", "")
        )
        return "".join(
            ch if ch in self._METRIC_KEY_CHARS else "_" for ch in normalized
        )

    def add_gauge_metric(self, key, value, labels):
        """Append one sample to the gauge family ``redfish_<key>``."""
        metric_name = f"redfish_{key}"
        if metric_name not in self.metrics:
            self.metrics[metric_name] = GaugeMetricFamily(
                metric_name,
                f"Redfish numeric metric for '{key}'",
                labels=list(labels.keys()),
            )
        self.metrics[metric_name].add_sample(metric_name, value=value, labels=labels)

    def add_info_metric(self, key, value, labels):
        """Append one sample to the info family ``redfish_<key>_info``.

        The labels are passed as the info *value* dict.  The first argument of
        ``InfoMetricFamily.add_metric`` is a sequence of label values; handing
        it a dict makes the library iterate the dict keys, which sets every
        label value to its own name.
        """
        # NOTE(review): the library appears to append "_info" to the family
        # name when exposing info samples, which would double the suffix here.
        # Confirm against the exposition output before renaming.
        metric_name = f"redfish_{key}_info"
        if metric_name not in self.metrics:
            self.metrics[metric_name] = InfoMetricFamily(
                metric_name,
                f"Redfish info for '{key}'",
                labels=list(labels.keys()) + ["value"],
            )
        self.metrics[metric_name].add_metric([], {**labels, "value": str(value)})
class SystemCollector:
    """Collect status and numeric metrics for every member of the Systems collection.

    Args:
        host: Value written to the ``host`` label.
        target: Target identifier, forwarded to the DCN collector.
        labels: Base labels copied into every sample.
        urls: Mapping of Redfish collection names to paths; ``"Systems"`` is
            the only key read here.
        connect_fn: Callable taking a path and returning the decoded resource.
    """

    def __init__(self, host, target, labels, urls, connect_fn):
        self.host = host
        self.target = target
        self.labels = labels
        self.urls = urls
        self.connect_server = connect_fn
        self.logger = logging.getLogger(__name__)
        self.metrics = {}

    def collect(self):
        """Return the metric families for all systems, or ``[]`` if none are reachable."""
        system_url = self.urls.get("Systems")
        if not system_url:
            return []

        systems_data = self.connect_server(system_url)
        if not systems_data or not isinstance(systems_data, dict):
            return []

        members = systems_data.get("Members")
        if not isinstance(members, list):
            return []

        all_metrics = []
        for member in members:
            # Skip malformed entries instead of requesting a ``None`` URL.
            sys_url = member.get("@odata.id") if isinstance(member, dict) else None
            if not sys_url:
                continue

            sys_data = self.connect_server(sys_url)
            if not sys_data or not isinstance(sys_data, dict):
                continue

            sys_id = str(sys_data.get("Id", "unknown"))
            labels = {**self.labels, "host": self.host, "system": sys_id}
            all_metrics += self._extract_metrics(sys_data, labels)

            # Hand over to the DCN collector when the system links to one.
            dcn_ref = sys_data.get("DistributedControlNode")
            dcn_link = dcn_ref.get("@odata.id") if isinstance(dcn_ref, dict) else None
            if dcn_link:
                dcn_collector = DistributedControlNodeCollector(
                    self.host, self.target, labels,
                    {"DistributedControlNode": dcn_link},
                    self.connect_server,
                )
                all_metrics += dcn_collector.collect()

        return all_metrics

    @staticmethod
    def _metric_suffix(key):
        """Lower-case ``key`` and replace characters outside ``[a-z0-9_]`` with ``_``."""
        return "".join(
            ch if (ch.isascii() and ch.isalnum()) or ch == "_" else "_"
            for ch in str(key).lower()
        )

    def _extract_metrics(self, data, labels):
        """Build one family per string ``Status`` field and per top-level number.

        Booleans count as numbers here because ``bool`` is a subclass of ``int``.
        """
        metrics = []
        label_names = list(labels.keys())
        label_values = list(labels.values())

        status = data.get("Status")
        if isinstance(status, dict):
            for key, value in status.items():
                if not isinstance(value, str):
                    continue
                family = InfoMetricFamily(
                    f"redfish_status_{self._metric_suffix(key)}_info",
                    "System status field",
                    labels=label_names + ["value"],
                )
                # add_metric expects label *values* in declaration order; a
                # dict here would be iterated by key and mislabel the sample.
                family.add_metric(label_values + [value], {})
                metrics.append(family)

        for key, value in data.items():
            if isinstance(value, (int, float)):
                family = GaugeMetricFamily(
                    f"redfish_system_{self._metric_suffix(key)}",
                    f"System metric {key}",
                    labels=label_names,
                )
                family.add_metric(label_values, value)
                metrics.append(family)

        return metrics
metrics diff --git a/collectors/utils.py b/collectors/utils.py new file mode 100644 index 0000000..95fb8ef --- /dev/null +++ b/collectors/utils.py @@ -0,0 +1,60 @@ +from prometheus_client.core import GaugeMetricFamily +import re + +def _sanitize_metric_name(name): + return re.sub(r'[^a-zA-Z0-9_]', '_', name).lower() + +def get_leaf_name(path): + if isinstance(path, str) and '/' in path: + return path.rstrip('/').split('/')[-1] + return path + +def _extract_kv_metrics(prefix, data, labels): + metrics = [] + excluded_keys = ["@odata.id", "@odata.type", "Id"] + + for key, value in data.items(): + if key in excluded_keys: + continue + + metric_key = _sanitize_metric_name(f"{prefix}_{key}") + label_copy = labels.copy() + + if isinstance(value, dict): + # Handle nested objects like Status + if key.lower() == "status": + for sub_key, sub_value in value.items(): + metric_name = _sanitize_metric_name(f"{prefix}_{key}_{sub_key}_info") + labels_with_status = label_copy.copy() + labels_with_status[sub_key] = str(sub_value) + metric = GaugeMetricFamily(metric_name, f"{prefix} {key} info for {sub_key}", labels=list(labels_with_status.keys())) + metric.add_metric(list(labels_with_status.values()), 1.0) + metrics.append(metric) + else: + metrics.extend(_extract_kv_metrics(f"{prefix}_{key}", value, label_copy)) + + elif isinstance(value, list): + continue # We don't need lists as metrics directly + + else: + if isinstance(value, str) and value.startswith('/redfish/v1'): + clean_value = get_leaf_name(value) + else: + clean_value = str(value) + + if key.lower() == "id": + # If key is Id, clean it directly + label_copy[key] = get_leaf_name(value) + else: + label_copy[key] = clean_value + + if isinstance(value, (int, float)): + metric = GaugeMetricFamily(metric_key, f"{prefix} metric for {key}", labels=list(label_copy.keys())) + metric.add_metric(list(label_copy.values()), float(value)) + metrics.append(metric) + else: + metric = GaugeMetricFamily(f"{metric_key}_info", f"{prefix} info 
for {key}", labels=list(label_copy.keys())) + metric.add_metric(list(label_copy.values()), 1.0) + metrics.append(metric) + + return metrics diff --git a/config.yml b/config.yml index d2a8692..b66a724 100644 --- a/config.yml +++ b/config.yml @@ -1,5 +1,7 @@ listen_port: 9220 timeout: 10 job: 'redfish-myjob' -username: admin -password: admin +username: root +password: f641e576 +recursive: + include_logs: false