Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 4 additions & 2 deletions teuthology/kill.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,8 @@ def kill_run(run_name, archive_base=None, owner=None, machine_type=None,
targets = find_targets(run_name)
names = list(targets.keys())
lock_ops.unlock_safe(names, owner, run_name)
report.try_mark_run_dead(run_name)
# Provide a reason so individual jobs get a failure_reason when marked dead
report.try_mark_run_dead(run_name, reason="killed by user")


def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False):
Expand All @@ -93,7 +94,8 @@ def kill_job(run_name, job_id, archive_base=None, owner=None, skip_unlock=False)
owner = job_info['owner']
if kill_processes(run_name, [job_info.get('pid')]):
return
report.try_push_job_info(job_info, dict(status="dead"))
# Indicate this was a manual kill so the results server can show a reason
report.try_push_job_info(job_info, dict(status="dead", failure_reason="killed by user"))
if 'machine_type' in job_info:
teuthology.exporter.JobResults().record(
machine_type=job_info["machine_type"],
Expand Down
23 changes: 17 additions & 6 deletions teuthology/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -566,7 +566,7 @@ def try_delete_job(job_id):
try_delete_job(job_id)


def try_mark_run_dead(run_name):
def try_mark_run_dead(run_name, reason=None):
"""
Using the same error checking and retry mechanism as try_push_job_info(),
mark any unfinished runs as dead.
Expand All @@ -578,18 +578,29 @@ def try_mark_run_dead(run_name):
if not reporter.base_uri:
return

log.debug("Marking run as dead: {name}".format(name=run_name))
log.debug("Marking run as dead: {name} reason={reason}".format(name=run_name, reason=reason))
jobs = reporter.get_jobs(run_name, fields=['status'])
for job in jobs:
if job['status'] not in ['pass', 'fail', 'dead']:
job_id = job['job_id']
try:
log.info("Marking job {job_id} as dead".format(job_id=job_id))
reporter.report_job(run_name, job['job_id'], dead=True)
if "machine_type" in job:
# Load existing job_info from the archive, merge in our
# extra fields so the results server gets a useful
# failure_reason when a run is marked dead manually.
job_info = reporter.serializer.job_info(run_name, job_id)
# Ensure status is set to dead and include a reason if given
job_info.update({'status': 'dead'})
if reason:
# Use the common 'failure_reason' field elsewhere in the
# codebase so tooling can pick it up.
job_info['failure_reason'] = reason

reporter.report_job(run_name, job_id, job_info=job_info)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you still need dead=True here.
Otherwise it'll be False:
def report_job(self, run_name, job_id, job_info=None, dead=False)
and once you do that job status is already updated in report_job():

 if dead and get_status(job_info) is None:
            set_status(job_info, 'dead')

I think we can stick to just updating the failure reason here

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, got it. I’ll make the suggested changes accordingly.

if "machine_type" in job_info:
teuthology.exporter.JobResults().record(
machine_type=job["machine_type"],
status=job["status"],
machine_type=job_info["machine_type"],
status=job_info["status"],
)
except report_exceptions:
log.exception("Could not mark job as dead: {job_id}".format(
Expand Down
4 changes: 3 additions & 1 deletion teuthology/task/internal/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,9 @@ def check_packages(ctx, config):
# set the failure message and update paddles with the status
ctx.summary["failure_reason"] = msg
set_status(ctx.summary, "dead")
report.try_push_job_info(ctx.config, dict(status='dead'))
# Include the failure_reason so the results server records why
# this job was marked dead (e.g. missing packages)
report.try_push_job_info(ctx.config, dict(status='dead', failure_reason=msg))
raise VersionNotFoundError(package.base_url)
else:
log.info(
Expand Down
36 changes: 36 additions & 0 deletions teuthology/test/test_report_dead_reason.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
import pytest
from unittest.mock import patch, MagicMock

import teuthology.report as report


@patch('teuthology.report.ResultsReporter')
def test_try_mark_run_dead_includes_reason(mock_reporter_cls):
# Set up a fake reporter with serializer.job_info and report_job
mock_reporter = MagicMock()
mock_reporter_cls.return_value = mock_reporter

# Simulate one job returned by get_jobs
mock_reporter.get_jobs.return_value = [
{'job_id': '1', 'status': 'running'}
]

# serializer.job_info should return a dict representing archived job info
mock_reporter.serializer.job_info.return_value = {
'job_id': '1',
'machine_type': 'smithi',
}

# Call the function under test
report.try_mark_run_dead('fake-run', reason='killed by user')

# Ensure report_job was called with job_info that contains failure_reason
assert mock_reporter.report_job.called
called_args, called_kwargs = mock_reporter.report_job.call_args
# call signature: report_job(run_name, job_id, job_info=...)
assert called_args[0] == 'fake-run'
assert called_args[1] == '1'

job_info = called_kwargs.get('job_info') if 'job_info' in called_kwargs else called_args[2]
assert job_info['status'] == 'dead'
assert job_info['failure_reason'] == 'killed by user'
Loading