Skip to content

Commit 5caac57

Browse files
committed
added ethernet & os collectors
1 parent 19d919b commit 5caac57

4 files changed

Lines changed: 407 additions & 43 deletions

File tree

collector.py

Lines changed: 213 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,14 @@
66
import re
77
import requests
88
import redfish
9+
import math
910
from prometheus_client.core import GaugeMetricFamily
1011
from collectors.performance_collector import PerformanceCollector
1112
from collectors.firmware_collector import FirmwareCollector
1213
from collectors.health_collector import HealthCollector
1314
from collectors.certificate_collector import CertificateCollector
15+
from collectors.ethernet_collector import EthernetCollector
16+
from collectors.operating_system_collector import OperatingSystemCollector
1417

1518
class RedfishMetricsCollector:
1619
"""Class for collecting Redfish metrics."""
@@ -45,7 +48,7 @@ def __init__(self, config, target, host, usr, pwd, metrics_type):
4548
"Thermal": "",
4649
"PowerSubsystem": "",
4750
"ThermalSubsystem": "",
48-
"NetworkInterfaces": "",
51+
"EthernetInterfaces": "",
4952
}
5053

5154
self.server_health = 0
@@ -70,6 +73,11 @@ def __init__(self, config, target, host, usr, pwd, metrics_type):
7073
self._basic_auth = False
7174
self._session = ""
7275
self.redfish_version = "not available"
76+
self.health_summary_metrics = GaugeMetricFamily(
77+
"redfish_health_summary",
78+
"Redfish Server Monitoring Summary Metrics (CPU, Memory, etc.)",
79+
labels=["host", "server_manufacturer", "server_model", "server_serial", "device_type", "cpu_model", "cpu_count", "total_system_memory_gb"]
80+
)
7381

7482
def get_session(self):
7583
"""Get the url for the server info and measure the response time"""
@@ -308,60 +316,198 @@ def connect_server(self, command, noauth=False, basic_auth=False):
308316
request_duration = round(time.time() - request_start, 2)
309317
logging.debug("Target %s: Request duration: %s", self.target, request_duration)
310318
return server_response
311-
312319
def get_base_labels(self):
313-
"""Get the basic labels for the metrics."""
320+
"""Get base labels and populate Redfish component URLs."""
314321
systems = self.connect_server(self.urls['Systems'])
315-
316322
if not systems:
323+
logging.error("Target %s: No response from /Systems", self.target)
317324
return
318325

319326
power_states = {"off": 0, "on": 1}
320-
# Get the server info for the labels
321-
server_info = {}
322-
for member in systems['Members']:
323-
self._systems_url = member['@odata.id']
324-
info = self.connect_server(self._systems_url)
325-
if info:
326-
server_info.update(info)
327327

328+
members = systems.get("Members", [])
329+
if not members:
330+
logging.error("Target %s: No system members found under /Systems", self.target)
331+
return
332+
333+
self._systems_url = members[0].get("@odata.id")
334+
if not self._systems_url:
335+
logging.error("Target %s: No @odata.id in first system member", self.target)
336+
return
337+
338+
server_info = self.connect_server(self._systems_url)
328339
if not server_info:
340+
logging.error("Target %s: Could not fetch system info at %s", self.target, self._systems_url)
329341
return
330-
self.manufacturer = server_info.get('Manufacturer')
331-
self.model = server_info.get('Model')
342+
self.urls["EthernetInterfaces"] = server_info.get("EthernetInterfaces", {}).get("@odata.id", "")
343+
logging.debug("EthernetInterfaces URL: %s", self.urls["EthernetInterfaces"])
344+
345+
346+
# Extract labels
347+
self.manufacturer = server_info.get("Manufacturer", "Custom")
348+
self.model = server_info.get("Model", "unknown")
349+
self.serial = server_info.get("SerialNumber", "")
350+
332351
if not self.manufacturer or not self.model:
333352
logging.error("Target %s: No manufacturer or model found on server %s!", self.target, self.host)
353+
logging.debug("Target %s: Full server_info payload: %s", self.target, server_info)
334354
return
335-
self.powerstate = power_states[server_info['PowerState'].lower()]
336-
# Dell has the Serial# in the SKU field, others in the SerialNumber field.
337-
if "SKU" in server_info and re.match(r'^[Dd]ell.*', server_info['Manufacturer']):
338-
self.serial = server_info['SKU']
339-
else:
340-
self.serial = server_info['SerialNumber']
341-
342-
self.labels.update(
343-
{
344-
"host": self.host,
345-
"server_manufacturer": self.manufacturer,
346-
"server_model": self.model,
347-
"server_serial": self.serial
355+
356+
power_state_raw = server_info.get("PowerState", "off").lower()
357+
self.powerstate = power_states.get(power_state_raw, 0)
358+
359+
self.labels.update({
360+
"host": self.host,
361+
"server_manufacturer": self.manufacturer,
362+
"server_model": self.model,
363+
"server_serial": self.serial
364+
})
365+
366+
# Overall health
367+
status_obj = server_info.get("Status", {})
368+
self.server_health = self.status.get(status_obj.get("Health", "").lower(), 0)
369+
# Store processor summary
370+
processor_summary = server_info.get("ProcessorSummary", {})
371+
if processor_summary:
372+
labels = {
373+
"device_type": "processor_summary",
374+
"cpu_model": processor_summary.get("Model", "unknown"),
375+
"cpu_count": str(processor_summary.get("Count", "unknown"))
348376
}
349-
)
377+
labels.update(self.labels)
378+
self.health_summary_metrics.add_sample(
379+
"redfish_health_summary",
380+
value=self.status.get(processor_summary.get("Status", {}).get("Health", "").lower(), math.nan),
381+
labels=labels
382+
)
350383

351-
self.server_health = self.status[server_info['Status']['Health'].lower()]
384+
# Store memory summary
385+
memory_summary = server_info.get("MemorySummary", {})
386+
if memory_summary:
387+
labels = {
388+
"device_type": "memory_summary",
389+
"total_system_memory_gb": str(memory_summary.get("TotalSystemMemory", "unknown"))
390+
}
391+
labels.update(self.labels)
392+
self.health_summary_metrics.add_sample(
393+
"redfish_health_summary",
394+
value=self.status.get(memory_summary.get("Status", {}).get("Health", "").lower(), math.nan),
395+
labels=labels
396+
)
352397

353-
# get the links of the parts for later
354-
for url in self.urls:
355-
if url in server_info:
356-
self.urls[url] = server_info[url]['@odata.id']
357398

358-
# standard is a list but there are exceptions
359-
if isinstance(server_info['Links']['Chassis'][0], str):
360-
self.urls['Chassis'] = server_info['Links']['Chassis'][0]
361-
self.urls['ManagedBy'] = server_info['Links']['ManagedBy'][0]
362-
else:
363-
self.urls['Chassis'] = server_info['Links']['Chassis'][0]['@odata.id']
364-
self.urls['ManagedBy'] = server_info['Links']['ManagedBy'][0]['@odata.id']
399+
# Set component URLs
400+
keys_direct = ["Processors", "Memory", "Storage", "Power", "Thermal", "EthernetInterfaces"]
401+
for key in keys_direct:
402+
self.urls[key] = server_info.get(key, {}).get("@odata.id", "")
403+
404+
links = server_info.get("Links", {})
405+
chassis_list = links.get("Chassis", [])
406+
if chassis_list:
407+
chassis_ref = chassis_list[0]
408+
self.urls["Chassis"] = chassis_ref["@odata.id"] if isinstance(chassis_ref, dict) else chassis_ref
409+
410+
manager_list = links.get("ManagedBy", [])
411+
if manager_list:
412+
manager_ref = manager_list[0]
413+
self.urls["ManagedBy"] = manager_ref["@odata.id"] if isinstance(manager_ref, dict) else manager_ref
414+
415+
logging.debug("Target %s: Parsed Redfish component URLs: %s", self.target, self.urls)
416+
417+
# Now try to discover thermal/power subsystems
418+
self.get_chassis_urls()
419+
420+
421+
# def get_base_labels(self):
422+
# """Get the basic labels for the metrics."""
423+
# systems = self.connect_server(self.urls['Systems'])
424+
425+
# if not systems:
426+
# return
427+
428+
# power_states = {"off": 0, "on": 1}
429+
# # Get the server info for the labels
430+
# # server_info = {}
431+
# members = systems.get("Members", [])
432+
# if not members:
433+
# logging.error("Target %s: No system members found under /Systems", self.target)
434+
# return
435+
# # Always take the first system
436+
# self._systems_url = members[0].get("@odata.id")
437+
# if not self._systems_url:
438+
# logging.error("Target %s: No @odata.id in first system member", self.target)
439+
# return
440+
# server_info = self.connect_server(self._systems_url)
441+
# if not server_info:
442+
# logging.error("Target %s: Could not fetch system info at %s", self.target, self._systems_url)
443+
# return
444+
# # for member in systems['Members']:
445+
# # self._systems_url = member['@odata.id']
446+
# # info = self.connect_server(self._systems_url)
447+
# # if info:
448+
# # server_info.update(info)
449+
450+
# # if not server_info:
451+
# # return
452+
# self.manufacturer = server_info.get('Manufacturer')
453+
# self.model = server_info.get('Model')
454+
# if not self.manufacturer or not self.model:
455+
# logging.error("Target %s: No manufacturer or model found on server %s!", self.target, self.host)
456+
# return
457+
# self.powerstate = power_states[server_info['PowerState'].lower()]
458+
# # Dell has the Serial# in the SKU field, others in the SerialNumber field.
459+
# if "SKU" in server_info and re.match(r'^[Dd]ell.*', server_info['Manufacturer']):
460+
# self.serial = server_info['SKU']
461+
# else:
462+
# self.serial = server_info['SerialNumber']
463+
464+
# self.labels.update(
465+
# {
466+
# "host": self.host,
467+
# "server_manufacturer": self.manufacturer,
468+
# "server_model": self.model,
469+
# "server_serial": self.serial
470+
# }
471+
# )
472+
473+
# self.server_health = self.status[server_info['Status']['Health'].lower()]
474+
475+
# # get the links of the parts for later
476+
# # for url in self.urls:
477+
# # if url in server_info:
478+
# # self.urls[url] = server_info[url]['@odata.id']
479+
480+
# # # standard is a list but there are exceptions
481+
# # if isinstance(server_info['Links']['Chassis'][0], str):
482+
# # self.urls['Chassis'] = server_info['Links']['Chassis'][0]
483+
# # self.urls['ManagedBy'] = server_info['Links']['ManagedBy'][0]
484+
# # else:
485+
# # self.urls['Chassis'] = server_info['Links']['Chassis'][0]['@odata.id']
486+
# # self.urls['ManagedBy'] = server_info['Links']['ManagedBy'][0]['@odata.id']
487+
# # Extract direct component paths
488+
direct_keys = ["Processors", "Memory", "Storage", "Power", "Thermal", "EthernetInterfaces"]
489+
for key in direct_keys:
490+
self.urls[key] = server_info.get(key, {}).get("@odata.id", "")
491+
492+
# Handle nested Chassis and Manager links
493+
chassis_links = server_info.get("Links", {}).get("Chassis", [])
494+
if chassis_links:
495+
chassis_ref = chassis_links[0]
496+
if isinstance(chassis_ref, dict):
497+
self.urls["Chassis"] = chassis_ref.get("@odata.id", "")
498+
elif isinstance(chassis_ref, str):
499+
self.urls["Chassis"] = chassis_ref
500+
501+
manager_links = server_info.get("Links", {}).get("ManagedBy", [])
502+
if manager_links:
503+
manager_ref = manager_links[0]
504+
if isinstance(manager_ref, dict):
505+
self.urls["ManagedBy"] = manager_ref.get("@odata.id", "")
506+
elif isinstance(manager_ref, str):
507+
self.urls["ManagedBy"] = manager_ref
508+
509+
logging.debug("Target %s: Parsed component URLs: %s", self.target, self.urls)
510+
365511

366512
self.get_chassis_urls()
367513

@@ -452,6 +598,7 @@ def collect(self):
452598
yield metrics.mem_metrics_uncorrectable
453599
yield metrics.health_metrics
454600

601+
455602
# Get the firmware information
456603
if self.metrics_type == 'firmware':
457604
metrics = FirmwareCollector(self)
@@ -485,6 +632,33 @@ def collect(self):
485632
value = duration,
486633
labels = self.labels,
487634
)
635+
ether_collector = EthernetCollector(
636+
self.host,
637+
self.target,
638+
self.labels,
639+
self.urls,
640+
self.connect_server
641+
)
642+
for metric in ether_collector.collect():
643+
yield metric
644+
os_collector = OperatingSystemCollector(
645+
self.host,
646+
self.target,
647+
self.labels,
648+
self.urls,
649+
self.connect_server
650+
)
651+
os_collector = OperatingSystemCollector(self.host, self.target, self.labels, self.urls, self.connect_server)
652+
for metric in os_collector.collect():
653+
yield metric
654+
# ether_collector.collect()
655+
# yield ether_collector.ethernet_metrics
656+
# eth_metrics = EthernetCollector(self.host, self.target, self.labels, self.urls)
657+
# eth_metrics.collect()
658+
# yield eth_metrics.ethernet_health_metrics
659+
660+
if hasattr(self, "health_summary_metrics"):
661+
yield self.health_summary_metrics
488662
yield scrape_metrics
489663

490664
def __exit__(self, exc_type, exc_val, exc_tb):

0 commit comments

Comments
 (0)