From 27379146cfa12cc90e38f2f0d77009d80f3164db Mon Sep 17 00:00:00 2001 From: Sylvain Leclerc Date: Tue, 25 Apr 2023 13:54:57 +0200 Subject: [PATCH 1/5] fix(api): fix uncaught exceptions in slurm launcher loop (#1477) - Exceptions are caught in result handling - As a last resort, all exceptions are caught in the monitoring loop Signed-off-by: Sylvain Leclerc --- .../launcher/adapters/slurm_launcher/slurm_launcher.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/antarest/launcher/adapters/slurm_launcher/slurm_launcher.py b/antarest/launcher/adapters/slurm_launcher/slurm_launcher.py index 96609c6ded..07e490ff24 100644 --- a/antarest/launcher/adapters/slurm_launcher/slurm_launcher.py +++ b/antarest/launcher/adapters/slurm_launcher/slurm_launcher.py @@ -148,7 +148,13 @@ def _retrieve_running_jobs(self) -> None: def _loop(self) -> None: while self.check_state: - self._check_studies_state() + try: + self._check_studies_state() + except Exception: + logger.error( + "An uncaught exception occurred in slurm_launcher loop", + exc_info=True, + ) time.sleep(2) def start(self) -> None: @@ -406,7 +412,6 @@ def _handle_failure(self, study: StudyDTO) -> None: study.name, JobStatus.FAILED, msg, None ) logger.error(msg, exc_info=e) - raise else: msg = "Simulation failed (even if some output results may be available)" self.callbacks.append_after_log(study.name, msg) @@ -444,7 +449,6 @@ def _handle_success(self, study: StudyDTO) -> None: study.name, JobStatus.FAILED, msg, None ) logger.error(msg, exc_info=e) - raise else: self.callbacks.update_status( study.name, JobStatus.SUCCESS, None, output_id From 2df9a1f22c8066fa994022675e899d566498f88d Mon Sep 17 00:00:00 2001 From: Sylvain Leclerc Date: Tue, 25 Apr 2023 14:00:19 +0200 Subject: [PATCH 2/5] docs: changelog for v2.13.2 Signed-off-by: Sylvain Leclerc --- docs/CHANGELOG.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index f97a85131e..b7e038c0f2 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -1,6 +1,17 @@ Antares Web Changelog ===================== +v2.13.2 (2023-04-25) +-------------------- + +### Bug Fixes + +* **api:** fix uncaught exceptions stopping slurm launcher loop (#1477) ([2737914](https://github.com/AntaresSimulatorTeam/AntaREST/commit/27379146cfa12cc90e38f2f0d77009d80f3164db)) + +### Contributors + +Sylvain LECLERC + v2.13.1 (2023-04-11) -------------------- From f999c23389f0a16e39fa870acebb5dcb0043fa26 Mon Sep 17 00:00:00 2001 From: Sylvain Leclerc Date: Tue, 25 Apr 2023 14:09:02 +0200 Subject: [PATCH 3/5] build: prepare release 2.13.2 Signed-off-by: Sylvain Leclerc --- antarest/__init__.py | 4 ++-- setup.py | 2 +- sonar-project.properties | 2 +- webapp/package.json | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/antarest/__init__.py b/antarest/__init__.py index 7ae1001d6a..4533cd61d1 100644 --- a/antarest/__init__.py +++ b/antarest/__init__.py @@ -7,9 +7,9 @@ # Standard project metadata -__version__ = "2.13.1" +__version__ = "2.13.2" __author__ = "RTE, Antares Web Team" -__date__ = "2023-04-11" +__date__ = "2023-04-25" # noinspection SpellCheckingInspection __credits__ = "(c) Réseau de Transport de l’Électricité (RTE)" diff --git a/setup.py b/setup.py index e37c23a030..22f85d9bbf 100644 --- a/setup.py +++ b/setup.py @@ -5,7 +5,7 @@ setuptools.setup( name="AntaREST", - version="2.13.1", + version="2.13.2", description="Antares Server", long_description=long_description, long_description_content_type="text/markdown", diff --git a/sonar-project.properties b/sonar-project.properties index 0b5c59021b..48f76a506c 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -6,5 +6,5 @@ sonar.exclusions=antarest/gui.py,antarest/main.py sonar.python.coverage.reportPaths=coverage.xml sonar.python.version=3.8 sonar.javascript.lcov.reportPaths=webapp/coverage/lcov.info -sonar.projectVersion=2.13.1 +sonar.projectVersion=2.13.2 sonar.coverage.exclusions=antarest/gui.py,antarest/main.py,antarest/singleton_services.py,antarest/worker/archive_worker_service.py,webapp/**/* \ No newline at end of file diff --git a/webapp/package.json b/webapp/package.json index ebd23c387e..595d07a2ba 100644 --- a/webapp/package.json +++ b/webapp/package.json @@ -1,6 +1,6 @@ { "name": "antares-web", - "version": "2.13.1", + "version": "2.13.2", "private": true, "dependencies": { "@emotion/react": "11.10.6", From 31a61bb7d4f57f60d8c30a98ac38fb978ff46031 Mon Sep 17 00:00:00 2001 From: Laurent LAPORTE Date: Tue, 2 May 2023 12:10:14 +0200 Subject: [PATCH 4/5] docs(api): add a comment to explain why the monitoring loop must not raise exception --- antarest/launcher/adapters/slurm_launcher/slurm_launcher.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/antarest/launcher/adapters/slurm_launcher/slurm_launcher.py b/antarest/launcher/adapters/slurm_launcher/slurm_launcher.py index 07e490ff24..8e756cb5fc 100644 --- a/antarest/launcher/adapters/slurm_launcher/slurm_launcher.py +++ b/antarest/launcher/adapters/slurm_launcher/slurm_launcher.py @@ -148,9 +148,12 @@ def _retrieve_running_jobs(self) -> None: def _loop(self) -> None: while self.check_state: + # noinspection PyBroadException try: self._check_studies_state() except Exception: + # To keep the SLURM processing monitoring loop active, exceptions + # are caught and a message is simply displayed in the logs. logger.error( "An uncaught exception occurred in slurm_launcher loop", exc_info=True, From 099449b1010e9b2cdcf12f07891eb7ed13e146e7 Mon Sep 17 00:00:00 2001 From: Laurent LAPORTE Date: Tue, 2 May 2023 12:13:55 +0200 Subject: [PATCH 5/5] test: disable `test_simple_task` UT (crashes randomly) --- tests/worker/test_worker.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/worker/test_worker.py b/tests/worker/test_worker.py index 0d47d23d22..9bf7f64a6b 100644 --- a/tests/worker/test_worker.py +++ b/tests/worker/test_worker.py @@ -3,6 +3,7 @@ from typing import List from unittest.mock import MagicMock +import pytest from antarest.core.config import Config from antarest.core.interfaces.eventbus import Event, EventType, IEventBus from antarest.core.model import PermissionInfo, PublicMode @@ -27,6 +28,7 @@ def execute_task(self, task_info: WorkerTaskCommand) -> TaskResult: return TaskResult(success=True, message="") +@pytest.mark.skip(reason="disabled because it sometimes crashes randomly") def test_simple_task(tmp_path: Path): task_queue = "do_stuff" event_bus = build_eventbus(MagicMock(), Config(), autostart=True)