Skip to content

Commit

Permalink
Restrict false positive warning
Browse files Browse the repository at this point in the history
"Environment stopped unexpectedly" that got also triggered on migrations.
  • Loading branch information
mpass99 committed Sep 24, 2024
1 parent b966aac commit 0d175c2
Showing 1 changed file with 27 additions and 1 deletion.
28 changes: 27 additions & 1 deletion internal/runner/nomad_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ import (
"time"

nomadApi "github.com/hashicorp/nomad/api"
"github.com/hashicorp/nomad/nomad/structs"
influxdb2 "github.com/influxdata/influxdb-client-go/v2"
"github.com/openHPI/poseidon/internal/config"
"github.com/openHPI/poseidon/internal/nomad"
Expand All @@ -20,6 +21,8 @@ import (
"github.com/openHPI/poseidon/pkg/util"
)

// environmentMigrationDelay is the time to wait after an environment template job's
// allocation stopped before re-checking whether the job is running again.
// It is passed as the delay to checkForMigratingEnvironmentJob.
const environmentMigrationDelay = time.Minute

var (
log = logging.GetLogger("runner")
ErrUnknownExecutionEnvironment = errors.New("execution environment not found")
Expand Down Expand Up @@ -330,6 +333,29 @@ func monitorAllocationStartupDuration(startup time.Duration, runnerID string, en
monitoring.WriteInfluxPoint(p)
}

// checkForMigratingEnvironmentJob checks if the Nomad environment job is still running after the delay.
//
// It blocks for the given delay, reloads the environment template jobs and only
// warns with "Environment stopped unexpectedly" if no job with the passed jobID
// is reported with the status running. This avoids warning about allocations
// that were merely stopped temporarily (e.g. during a migration).
//
// If the template jobs cannot be loaded, the state of the environment is unknown.
// In that case only the load failure is reported and no "stopped unexpectedly"
// warning is emitted, since it could not be verified.
//
// The function blocks for at least delay, so callers that must not wait should
// run it in a separate goroutine.
func (m *NomadRunnerManager) checkForMigratingEnvironmentJob(jobID string, delay time.Duration) {
	log.WithField(dto.KeyEnvironmentID, jobID).Debug("Environment stopped unexpectedly. Checking again..")
	<-time.After(delay)

	templateJobs, err := m.apiClient.LoadEnvironmentJobs()
	if err != nil {
		// Without the job list we cannot tell whether the environment recovered;
		// falling through would always produce the (possibly false) warning below.
		log.WithError(err).WithField(dto.KeyEnvironmentID, jobID).Warn("couldn't load template jobs")
		return
	}

	for _, job := range templateJobs {
		// ID and Status are pointers; skip incomplete entries instead of panicking.
		if job.ID == nil || job.Status == nil {
			continue
		}
		if jobID == *job.ID && *job.Status == structs.JobStatusRunning {
			// The environment job is running (again); nothing to report.
			return
		}
	}

	log.WithField(dto.KeyEnvironmentID, jobID).Warn("Environment stopped unexpectedly")
}

// onAllocationStopped is the callback for when Nomad stopped an allocation.
func (m *NomadRunnerManager) onAllocationStopped(ctx context.Context, runnerID string, reason error) (alreadyRemoved bool) {
log.WithField(dto.KeyRunnerID, runnerID).Debug("Runner stopped")
Expand All @@ -343,7 +369,7 @@ func (m *NomadRunnerManager) onAllocationStopped(ctx context.Context, runnerID s
}
_, ok := m.environments.Get(environmentID.ToString())
if ok {
log.WithField(dto.KeyEnvironmentID, environmentID).Warn("Environment stopped unexpectedly")
go m.checkForMigratingEnvironmentJob(runnerID, environmentMigrationDelay)
}
return !ok
}
Expand Down

0 comments on commit 0d175c2

Please sign in to comment.