From 101dd8c2a149c5112669d557d0851a9b1659d683 Mon Sep 17 00:00:00 2001 From: Laurent LAPORTE <43534797+laurent-laporte-pro@users.noreply.github.com> Date: Wed, 22 Mar 2023 14:44:17 +0100 Subject: [PATCH] fix(launcher): improved reliability of task state retrieval sent to SLURM (#1417) --- .../adapters/slurm_launcher/slurm_launcher.py | 47 +++++++++---------- requirements.txt | 2 +- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/antarest/launcher/adapters/slurm_launcher/slurm_launcher.py b/antarest/launcher/adapters/slurm_launcher/slurm_launcher.py index ae247009f8..96609c6ded 100644 --- a/antarest/launcher/adapters/slurm_launcher/slurm_launcher.py +++ b/antarest/launcher/adapters/slurm_launcher/slurm_launcher.py @@ -334,35 +334,34 @@ def _import_xpansion_result(self, job_id: str, xpansion_mode: str) -> None: logger.warning("Output path in xpansion result not found") def _check_studies_state(self) -> None: - try: - with self.antares_launcher_lock: + with self.antares_launcher_lock: + try: self._call_launcher( arguments=self.launcher_args, parameters=self.launcher_params, ) - except Exception as e: - logger.info("Could not get data on remote server", exc_info=e) - - study_list = self.data_repo_tinydb.get_list_of_studies() - for study in study_list: - log_path = SlurmLauncher._get_log_path(study) - if study.with_error: - self.log_tail_manager.stop_tracking(log_path) - self._handle_failure(study) - elif study.done: - self.log_tail_manager.stop_tracking(log_path) - self._handle_success(study) - else: - # study.started => still running - # study.finished => waiting for ZIP + logs retrieval (or failure) - self.log_tail_manager.track( - log_path, self.create_update_log(study.name) - ) + except Exception as e: + logger.info("Could not get data on remote server", exc_info=e) + + study_list = self.data_repo_tinydb.get_list_of_studies() + for study in study_list: + log_path = SlurmLauncher._get_log_path(study) + if study.with_error: + 
self.log_tail_manager.stop_tracking(log_path) + self._handle_failure(study) + elif study.done: + self.log_tail_manager.stop_tracking(log_path) + self._handle_success(study) + else: + # study.started => still running + # study.finished => waiting for ZIP + logs retrieval (or failure) + self.log_tail_manager.track( + log_path, self.create_update_log(study.name) + ) - # Re-fetching the study list is necessary as new studies may have been added - # during the `import_output` process. Afterward, we clean up the list to ensure - # that any removed studies are removed from the database. - with self.antares_launcher_lock: + # Re-fetching the study list is necessary as new studies may have been added + # during the `import_output` process. Afterward, we clean up the list to ensure + # that any removed studies are removed from the database. # fmt: off cleanup_list = [s for s in study_list if s.with_error or s.done] for study in cleanup_list: diff --git a/requirements.txt b/requirements.txt index 06f220802c..500878c52b 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,4 @@ -Antares-Launcher~=1.2.2 +Antares-Launcher~=1.2.4 aiofiles~=0.8.0 alembic~=1.7.5