Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions internal/common/errormatch/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Package errormatch provides types and functions for matching job failure
// signals: exit codes, termination messages, and Kubernetes conditions.
//
// [ExitCodeMatcher] supports In/NotIn set membership against container exit
// codes. Exit code 0 never matches. [RegexMatcher] holds a pattern string;
// callers compile it once at construction time and pass the compiled
// expression to [MatchPattern].
//
// The condition constants ([ConditionOOMKilled], [ConditionEvicted],
// [ConditionDeadlineExceeded]) and [KnownConditions] map are provided for
// config validation.
package errormatch
33 changes: 33 additions & 0 deletions internal/common/errormatch/match.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package errormatch

import "regexp"

// MatchExitCode reports whether exitCode satisfies matcher.
//
// A nil matcher and exit code 0 never match (a container that exited
// successfully is not a failure). For the In operator the result is true when
// exitCode is one of matcher.Values; for NotIn it is true when exitCode is
// absent from matcher.Values. Any other operator yields false.
func MatchExitCode(matcher *ExitCodeMatcher, exitCode int32) bool {
	if exitCode == 0 || matcher == nil {
		return false
	}

	// Determine set membership once; the operator only decides how to read it.
	listed := false
	for _, candidate := range matcher.Values {
		if candidate == exitCode {
			listed = true
			break
		}
	}

	switch matcher.Operator {
	case ExitCodeOperatorIn:
		return listed
	case ExitCodeOperatorNotIn:
		return !listed
	default:
		return false
	}
}

// MatchPattern reports whether value matches the compiled regex re.
//
// Empty values never match. A nil re never matches either: it is treated as
// "no pattern configured" rather than dereferenced, mirroring how
// MatchExitCode treats a nil matcher.
func MatchPattern(re *regexp.Regexp, value string) bool {
	if re == nil || value == "" {
		return false
	}
	return re.MatchString(value)
}
89 changes: 89 additions & 0 deletions internal/common/errormatch/match_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
package errormatch

import (
"regexp"
"testing"

"github.com/stretchr/testify/assert"
)

// TestMatchExitCode checks MatchExitCode against In and NotIn matchers,
// the exit-code-0 special case, and a nil matcher.
func TestMatchExitCode(t *testing.T) {
	type scenario struct {
		matcher  *ExitCodeMatcher
		exitCode int32
		expected bool
	}

	scenarios := map[string]scenario{
		"In matches": {
			matcher:  &ExitCodeMatcher{Operator: ExitCodeOperatorIn, Values: []int32{74, 75}},
			exitCode: 74,
			expected: true,
		},
		"In does not match": {
			matcher:  &ExitCodeMatcher{Operator: ExitCodeOperatorIn, Values: []int32{74, 75}},
			exitCode: 1,
			expected: false,
		},
		"NotIn matches when code absent": {
			matcher:  &ExitCodeMatcher{Operator: ExitCodeOperatorNotIn, Values: []int32{1, 2}},
			exitCode: 42,
			expected: true,
		},
		"NotIn does not match when code present": {
			matcher:  &ExitCodeMatcher{Operator: ExitCodeOperatorNotIn, Values: []int32{1, 2}},
			exitCode: 1,
			expected: false,
		},
		"exit code 0 never matches In": {
			matcher:  &ExitCodeMatcher{Operator: ExitCodeOperatorIn, Values: []int32{0}},
			exitCode: 0,
			expected: false,
		},
		"exit code 0 never matches NotIn": {
			matcher:  &ExitCodeMatcher{Operator: ExitCodeOperatorNotIn, Values: []int32{1}},
			exitCode: 0,
			expected: false,
		},
		"nil matcher returns false": {
			matcher:  nil,
			exitCode: 1,
			expected: false,
		},
	}

	for label, sc := range scenarios {
		t.Run(label, func(t *testing.T) {
			got := MatchExitCode(sc.matcher, sc.exitCode)
			assert.Equal(t, sc.expected, got)
		})
	}
}

func TestMatchPattern(t *testing.T) {
tests := map[string]struct {
pattern string
value string
expected bool
}{
"matches": {
pattern: "(?i)cuda.*error",
value: "CUDA memory error on device 0",
expected: true,
},
"does not match": {
pattern: "(?i)cuda.*error",
value: "segfault",
expected: false,
},
"empty value never matches": {
pattern: ".*",
value: "",
expected: false,
},
}

for name, tc := range tests {
t.Run(name, func(t *testing.T) {
re := regexp.MustCompile(tc.pattern)
assert.Equal(t, tc.expected, MatchPattern(re, tc.value))
})
}
}
33 changes: 33 additions & 0 deletions internal/common/errormatch/types.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package errormatch

// ExitCodeOperator is a set membership operator: In or NotIn.
// It selects how MatchExitCode interprets an ExitCodeMatcher's Values.
type ExitCodeOperator string

// Supported ExitCodeOperator values. MatchExitCode returns false for any
// operator other than these two.
const (
// ExitCodeOperatorIn matches when the exit code is one of the listed values.
ExitCodeOperatorIn ExitCodeOperator = "In"
// ExitCodeOperatorNotIn matches when the exit code is none of the listed
// values. Exit code 0 still never matches.
ExitCodeOperatorNotIn ExitCodeOperator = "NotIn"
)

// ExitCodeMatcher specifies an operator and a set of exit code values.
// It is evaluated by MatchExitCode; both fields carry YAML tags.
type ExitCodeMatcher struct {
// Operator chooses In or NotIn membership semantics.
Operator ExitCodeOperator `yaml:"operator"`
// Values is the set of exit codes the operator is applied to.
Values []int32 `yaml:"values"`
}

// RegexMatcher specifies a regex pattern as a string.
// The pattern is stored uncompiled; MatchPattern takes a compiled
// *regexp.Regexp, so callers are expected to compile Pattern themselves.
type RegexMatcher struct {
// Pattern is the regular expression source text.
Pattern string `yaml:"pattern"`
}

// Condition name strings recognised by this package. They are the keys of
// KnownConditions.
// NOTE(review): the names suggest Kubernetes termination/failure reasons;
// confirm the exact source of each value against the code that consumes them.
const (
// ConditionOOMKilled is the "OOMKilled" condition name.
ConditionOOMKilled = "OOMKilled"
// ConditionEvicted is the "Evicted" condition name.
ConditionEvicted = "Evicted"
// ConditionDeadlineExceeded is the "DeadlineExceeded" condition name.
ConditionDeadlineExceeded = "DeadlineExceeded"
)

// KnownConditions is the set of valid condition strings for config validation.
// Every listed condition maps to true, so KnownConditions[name] is false for
// any name that is not a known condition.
var KnownConditions = map[string]bool{
ConditionOOMKilled: true,
ConditionEvicted: true,
ConditionDeadlineExceeded: true,
}
11 changes: 11 additions & 0 deletions internal/executor/application.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ import (
common_metrics "github.com/armadaproject/armada/internal/common/metrics"
"github.com/armadaproject/armada/internal/common/task"
"github.com/armadaproject/armada/internal/common/util"
"github.com/armadaproject/armada/internal/executor/categorizer"
"github.com/armadaproject/armada/internal/executor/configuration"
executor_context "github.com/armadaproject/armada/internal/executor/context"
"github.com/armadaproject/armada/internal/executor/job"
Expand Down Expand Up @@ -202,6 +203,14 @@ func setupExecutorApiComponents(
ctx.Fatalf("Config error in failed pod checks: %s", err)
}

var classifier *categorizer.Classifier
if len(config.Application.ErrorCategories) > 0 {
classifier, err = categorizer.NewClassifier(config.Application.ErrorCategories)
if err != nil {
ctx.Fatalf("Config error in error categories: %s", err)
}
}

eventReporter, stopReporter := reporter.NewJobEventReporter(eventSender, clock.RealClock{}, 200)

submitter := job.NewSubmitter(
Expand Down Expand Up @@ -240,6 +249,7 @@ func setupExecutorApiComponents(
pendingPodChecker,
failedPodChecker,
config.Kubernetes.StuckTerminatingPodExpiry,
classifier,
)
if err != nil {
ctx.Fatalf("Failed to create pod issue service: %s", err)
Expand All @@ -249,6 +259,7 @@ func setupExecutorApiComponents(
clusterContext,
eventReporter,
podIssueService,
classifier,
)
if err != nil {
ctx.Fatalf("Failed to create job state reporter: %s", err)
Expand Down
Loading
Loading