Skip to content

Commit

Permalink
fix(launcher): improved reliability of task state retrieval sent to SLURM (#1417)
Browse files Browse the repository at this point in the history
  • Loading branch information
laurent-laporte-pro authored Mar 22, 2023
1 parent 6979e87 commit 101dd8c
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 25 deletions.
47 changes: 23 additions & 24 deletions antarest/launcher/adapters/slurm_launcher/slurm_launcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,35 +334,34 @@ def _import_xpansion_result(self, job_id: str, xpansion_mode: str) -> None:
logger.warning("Output path in xpansion result not found")

def _check_studies_state(self) -> None:
try:
with self.antares_launcher_lock:
with self.antares_launcher_lock:
try:
self._call_launcher(
arguments=self.launcher_args,
parameters=self.launcher_params,
)
except Exception as e:
logger.info("Could not get data on remote server", exc_info=e)

study_list = self.data_repo_tinydb.get_list_of_studies()
for study in study_list:
log_path = SlurmLauncher._get_log_path(study)
if study.with_error:
self.log_tail_manager.stop_tracking(log_path)
self._handle_failure(study)
elif study.done:
self.log_tail_manager.stop_tracking(log_path)
self._handle_success(study)
else:
# study.started => still running
# study.finished => waiting for ZIP + logs retrieval (or failure)
self.log_tail_manager.track(
log_path, self.create_update_log(study.name)
)
except Exception as e:
logger.info("Could not get data on remote server", exc_info=e)

study_list = self.data_repo_tinydb.get_list_of_studies()
for study in study_list:
log_path = SlurmLauncher._get_log_path(study)
if study.with_error:
self.log_tail_manager.stop_tracking(log_path)
self._handle_failure(study)
elif study.done:
self.log_tail_manager.stop_tracking(log_path)
self._handle_success(study)
else:
# study.started => still running
# study.finished => waiting for ZIP + logs retrieval (or failure)
self.log_tail_manager.track(
log_path, self.create_update_log(study.name)
)

# Re-fetching the study list is necessary as new studies may have been added
# during the `import_output` process. Afterward, we clean up the list to ensure
# that any removed studies are removed from the database.
with self.antares_launcher_lock:
# Re-fetching the study list is necessary as new studies may have been added
# during the `import_output` process. Afterward, we clean up the list to ensure
# that any removed studies are removed from the database.
# fmt: off
cleanup_list = [s for s in study_list if s.with_error or s.done]
for study in cleanup_list:
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
Antares-Launcher~=1.2.2
Antares-Launcher~=1.2.4

aiofiles~=0.8.0
alembic~=1.7.5
Expand Down

0 comments on commit 101dd8c

Please sign in to comment.