Skip to content

Commit

Permalink
Fetch alarm dictionaries from cluster server (#531)
Browse files Browse the repository at this point in the history
The alarm server uses the alarm dictionary endpoints
of the cluster server to fetch the alarm dictionaries.
These dictionaries are cached in the same way as the
nodeClusters and nodeClusterTypes objects.

There are two new methods in the infrastructure client that
return the nodeClusterTypeID and the alarmDefinitionID.
These methods return the IDs if they are present in the cache;
otherwise they try to fetch them from the cluster server.

There is also a resync mechanism that runs every hour.

Signed-off-by: Marcelo Guerrero <[email protected]>
  • Loading branch information
mlguerrero12 authored Jan 31, 2025
1 parent 72163e7 commit 744dece
Show file tree
Hide file tree
Showing 35 changed files with 1,871 additions and 1,732 deletions.
17 changes: 17 additions & 0 deletions bundle/manifests/oran-o2ims.clusterserviceversion.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,33 @@ rules:
verbs:
- create
- post
- nonResourceURLs:
- /o2ims-infrastructureCluster/v1/alarmDictionaries
verbs:
- get
- list
- nonResourceURLs:
- /o2ims-infrastructureCluster/v1/alarmDictionaries/*
verbs:
- get
- nonResourceURLs:
- /o2ims-infrastructureCluster/v1/nodeClusterTypes
verbs:
- get
- list
- nonResourceURLs:
- /o2ims-infrastructureCluster/v1/nodeClusterTypes/*
verbs:
- get
- nonResourceURLs:
- /o2ims-infrastructureCluster/v1/nodeClusters
verbs:
- get
- list
- nonResourceURLs:
- /o2ims-infrastructureCluster/v1/nodeClusters/*
verbs:
- get
- apiGroups:
- ""
resources:
Expand Down
15 changes: 15 additions & 0 deletions internal/controllers/inventory_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ import (
//+kubebuilder:rbac:urls="/internal/v1/caas-alerts/alertmanager",verbs=create;post
//+kubebuilder:rbac:urls="/o2ims-infrastructureCluster/v1/nodeClusterTypes",verbs=get;list
//+kubebuilder:rbac:urls="/o2ims-infrastructureCluster/v1/nodeClusters",verbs=get;list
//+kubebuilder:rbac:urls="/o2ims-infrastructureCluster/v1/alarmDictionaries",verbs=get;list
//+kubebuilder:rbac:urls="/o2ims-infrastructureCluster/v1/nodeClusterTypes/*",verbs=get
//+kubebuilder:rbac:urls="/o2ims-infrastructureCluster/v1/nodeClusters/*",verbs=get
//+kubebuilder:rbac:urls="/o2ims-infrastructureCluster/v1/alarmDictionaries/*",verbs=get
//+kubebuilder:rbac:urls="/hardware-manager/inventory/*",verbs=get;list
//+kubebuilder:rbac:groups="batch",resources=cronjobs,verbs=get;list;watch;create;update;patch;delete

Expand Down Expand Up @@ -1248,12 +1252,23 @@ func (t *reconcilerTask) createAlarmServerClusterRole(ctx context.Context) error
NonResourceURLs: []string{
"/o2ims-infrastructureCluster/v1/nodeClusterTypes",
"/o2ims-infrastructureCluster/v1/nodeClusters",
"/o2ims-infrastructureCluster/v1/alarmDictionaries",
},
Verbs: []string{
"get",
"list",
},
},
{
NonResourceURLs: []string{
"/o2ims-infrastructureCluster/v1/nodeClusterTypes/*",
"/o2ims-infrastructureCluster/v1/nodeClusters/*",
"/o2ims-infrastructureCluster/v1/alarmDictionaries/*",
},
Verbs: []string{
"get",
},
},
},
}

Expand Down
3 changes: 3 additions & 0 deletions internal/controllers/utils/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -289,6 +289,9 @@ const (
ClusterTemplateArtifactsLabel = "clustertemplates.o2ims.provisioning.oran.org/templateId"
)

// AlarmDefinitionSeverityField names the severity field within the additional fields of an alarm definition.
const AlarmDefinitionSeverityField = "severity"

// Alertmanager values
const (
AlertmanagerObjectName = "alertmanager"
Expand Down
69 changes: 15 additions & 54 deletions internal/service/alarms/api/server.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@ import (
"sync"
"time"

"github.com/openshift-kni/oran-o2ims/internal/service/alarms/internal/serviceconfig"

"github.com/google/uuid"
"github.com/jackc/pgerrcode"
"github.com/jackc/pgx/v5/pgconn"
Expand All @@ -20,7 +18,7 @@ import (
"github.com/openshift-kni/oran-o2ims/internal/service/alarms/internal/db/models"
"github.com/openshift-kni/oran-o2ims/internal/service/alarms/internal/db/repo"
"github.com/openshift-kni/oran-o2ims/internal/service/alarms/internal/infrastructure"
"github.com/openshift-kni/oran-o2ims/internal/service/alarms/internal/infrastructure/clusterserver"
"github.com/openshift-kni/oran-o2ims/internal/service/alarms/internal/serviceconfig"
api2 "github.com/openshift-kni/oran-o2ims/internal/service/common/api"
common "github.com/openshift-kni/oran-o2ims/internal/service/common/api/generated"
"github.com/openshift-kni/oran-o2ims/internal/service/common/notifier"
Expand Down Expand Up @@ -308,44 +306,16 @@ func (a *AlarmsServer) PatchAlarm(ctx context.Context, request api.PatchAlarmReq
}), nil
}

// Check if associated alarm definition has clearing type "manual". If not, return 409.
alarmDefinition, err := a.AlarmsRepository.GetAlarmDefinition(ctx, *record.AlarmDefinitionID)
if errors.Is(err, utils.ErrNotFound) {
return api.PatchAlarm404ApplicationProblemPlusJSONResponse(common.ProblemDetails{
AdditionalAttributes: &map[string]string{
"alarmEventRecordId": request.AlarmEventRecordId.String(),
},
Detail: "associated Alarm Definition not found",
Status: http.StatusNotFound,
}), nil
}

if alarmDefinition.ClearingType != string(common.MANUAL) {
return api.PatchAlarm409ApplicationProblemPlusJSONResponse(common.ProblemDetails{
AdditionalAttributes: &map[string]string{
"alarmEventRecordId": request.AlarmEventRecordId.String(),
},
Detail: "cannot clear an alarm with clearing type other than MANUAL",
Status: http.StatusConflict,
}), nil
}
// All our alarms have AUTOMATIC clearing type
// TODO: support clearing type MANUAL alarms

// Check if the Alarm Event Record has already been cleared
if record.PerceivedSeverity == perceivedSeverity {
// Nothing to patch
return api.PatchAlarm409ApplicationProblemPlusJSONResponse(common.ProblemDetails{
AdditionalAttributes: &map[string]string{
"alarmEventRecordId": request.AlarmEventRecordId.String(),
},
Detail: "Alarm record is already cleared",
Status: http.StatusConflict,
}), nil
}

// Patch the Alarm Event Record
record.PerceivedSeverity = perceivedSeverity
currentTime := time.Now()
record.AlarmClearedTime = &currentTime
return api.PatchAlarm409ApplicationProblemPlusJSONResponse(common.ProblemDetails{
AdditionalAttributes: &map[string]string{
"alarmEventRecordId": request.AlarmEventRecordId.String(),
},
Detail: "cannot clear an alarm with clearing type other than MANUAL",
Status: http.StatusConflict,
}), nil
}

// Patch alarmAcknowledged
Expand Down Expand Up @@ -515,25 +485,16 @@ func (a *AlarmsServer) AmNotification(ctx context.Context, request api.AmNotific
return nil, fmt.Errorf("%s: %w", msg, err)
}

// Get NodeCluster NodeClusterType mapping
var clusterIDToNodeClusterTypeID map[uuid.UUID]uuid.UUID
// Get cached cluster server data
var clusterServer infrastructure.Client
for i := range a.Infrastructure.Clients {
if a.Infrastructure.Clients[i].Name() == clusterserver.Name {
clusterIDToNodeClusterTypeID = a.Infrastructure.Clients[i].(*clusterserver.ClusterServer).GetClusterIDToResourceTypeID()
break
if a.Infrastructure.Clients[i].Name() == infrastructure.Name {
clusterServer = a.Infrastructure.Clients[i]
}
}

// Get the definition data based on current set of Alert names and managed cluster ID
alarmDefinitions, err := a.AlarmsRepository.GetAlarmDefinitions(ctx, request.Body, clusterIDToNodeClusterTypeID)
if err != nil {
msg := "failed to get AlarmDefinitions"
slog.Error(msg, "error", err)
return nil, fmt.Errorf("%s: %w", msg, err)
}

// Combine possible definitions with events
aerModels := alertmanager.ConvertAmToAlarmEventRecordModels(request.Body, alarmDefinitions, clusterIDToNodeClusterTypeID)
aerModels := alertmanager.ConvertAmToAlarmEventRecordModels(request.Body, clusterServer)

// Insert and update AlarmEventRecord
if err := a.AlarmsRepository.UpsertAlarmEventRecord(ctx, aerModels); err != nil {
Expand Down
42 changes: 24 additions & 18 deletions internal/service/alarms/internal/alertmanager/alertmanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,13 +11,13 @@ import (
"time"

"github.com/google/uuid"
api "github.com/openshift-kni/oran-o2ims/internal/service/alarms/api/generated"
"github.com/openshift-kni/oran-o2ims/internal/service/alarms/internal/db/models"

corev1 "k8s.io/api/core/v1"
"sigs.k8s.io/controller-runtime/pkg/client"

"github.com/openshift-kni/oran-o2ims/internal/controllers/utils"
api "github.com/openshift-kni/oran-o2ims/internal/service/alarms/api/generated"
"github.com/openshift-kni/oran-o2ims/internal/service/alarms/internal/db/models"
"github.com/openshift-kni/oran-o2ims/internal/service/alarms/internal/infrastructure"
"github.com/openshift-kni/oran-o2ims/internal/service/common/clients/k8s"
)

Expand Down Expand Up @@ -79,7 +79,7 @@ func Setup(ctx context.Context) error {
}

// ConvertAmToAlarmEventRecordModels builds alarmEventRecords from the alertmanager notification, looking up the object type ID and alarm definition ID of each alert through the infrastructure client.
func ConvertAmToAlarmEventRecordModels(am *api.AlertmanagerNotification, aDefinitionRecords []models.AlarmDefinition, clusterIDToObjectTypeID map[uuid.UUID]uuid.UUID) []models.AlarmEventRecord {
func ConvertAmToAlarmEventRecordModels(am *api.AlertmanagerNotification, infrastructureClient infrastructure.Client) []models.AlarmEventRecord {
records := make([]models.AlarmEventRecord, 0, len(am.Alerts))
for _, alert := range am.Alerts {
record := models.AlarmEventRecord{
Expand All @@ -89,16 +89,6 @@ func ConvertAmToAlarmEventRecordModels(am *api.AlertmanagerNotification, aDefini
Fingerprint: *alert.Fingerprint,
}

// for caas alerts object is the cluster ID
record.ObjectID = GetClusterID(*alert.Labels)

// derive ObjectTypeID from ObjectID
if id := record.ObjectID; id != nil {
if typeID, exists := clusterIDToObjectTypeID[*id]; exists {
record.ObjectTypeID = &typeID
}
}

// Make sure the current payload has the right severity
if *alert.Status == api.Resolved {
record.PerceivedSeverity = severityToPerceivedSeverity("cleared")
Expand All @@ -110,11 +100,27 @@ func ConvertAmToAlarmEventRecordModels(am *api.AlertmanagerNotification, aDefini
// Update Extensions with things we didn't really process
record.Extensions = getExtensions(*alert.Labels, *alert.Annotations)

// for caas alerts object is the cluster ID
record.ObjectID = GetClusterID(*alert.Labels)

// derive ObjectTypeID from ObjectID
if record.ObjectID != nil {
objectTypeID, err := infrastructureClient.GetObjectTypeID(*record.ObjectID)
if err != nil {
slog.Warn("Could not get object type ID", "objectID", record.ObjectID, "err", err.Error())
} else {
record.ObjectTypeID = &objectTypeID
}
}

// See if possible to pick up additional info from its definition
for _, def := range aDefinitionRecords {
if def.AlarmName == GetAlertName(*alert.Labels) && def.ObjectTypeID == *record.ObjectTypeID && severityToPerceivedSeverity(def.Severity) == record.PerceivedSeverity {
record.AlarmDefinitionID = &def.AlarmDefinitionID
record.ProbableCauseID = &def.ProbableCauseID
if record.ObjectTypeID != nil {
_, severity := GetPerceivedSeverity(*alert.Labels)
alarmDefinitionID, err := infrastructureClient.GetAlarmDefinitionID(*record.ObjectTypeID, GetAlertName(*alert.Labels), severity)
if err != nil {
slog.Warn("Could not get alarm definition ID", "objectTypeID", *record.ObjectTypeID, "name", GetAlertName(*alert.Labels), "severity", severity, "err", err.Error())
} else {
record.AlarmDefinitionID = &alarmDefinitionID
}
}

Expand Down

This file was deleted.

This file was deleted.

This file was deleted.

Loading

0 comments on commit 744dece

Please sign in to comment.