From ce2154af298d993dc4aaf6f770656def80492728 Mon Sep 17 00:00:00 2001 From: mathiasg Date: Tue, 30 Oct 2018 15:45:09 -0400 Subject: [PATCH 1/4] enh: account for timeouts during job status checks --- nipype/pipeline/plugins/slurm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nipype/pipeline/plugins/slurm.py b/nipype/pipeline/plugins/slurm.py index e27c05be04..e3619fd014 100644 --- a/nipype/pipeline/plugins/slurm.py +++ b/nipype/pipeline/plugins/slurm.py @@ -70,6 +70,10 @@ def _is_pending(self, taskid): terminal_output='allatonce').run() return res.runtime.stdout.find(str(taskid)) > -1 except RuntimeError as e: + if any(ss in str(e) for ss + in ['Socket timed out', 'not available at the moment']): + # do not raise error and allow recheck + return True if 'Invalid job id' not in str(e): raise(e) return False From e7c8cb7b8648e1ec750fb873f8d0cdbb370b245b Mon Sep 17 00:00:00 2001 From: mathiasg Date: Tue, 30 Oct 2018 17:03:41 -0400 Subject: [PATCH 2/4] enh: add log to alert when SLURM db is overloaded --- nipype/pipeline/plugins/slurm.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/nipype/pipeline/plugins/slurm.py b/nipype/pipeline/plugins/slurm.py index e3619fd014..6340cba4cc 100644 --- a/nipype/pipeline/plugins/slurm.py +++ b/nipype/pipeline/plugins/slurm.py @@ -17,7 +17,7 @@ from ...interfaces.base import CommandLine from .base import SGELikeBatchManagerBase, logger -iflogger = logging.getLogger('nipype.interface') +iflogger = logging.getLogger('nipype.workflow') class SLURMPlugin(SGELikeBatchManagerBase): @@ -73,6 +73,10 @@ def _is_pending(self, taskid): if any(ss in str(e) for ss in ['Socket timed out', 'not available at the moment']): # do not raise error and allow recheck + logger.warning( + "SLURM timeout encountered while checking job status, + "treating job %d as pending", taskid + ) return True if 'Invalid job id' not in str(e): raise(e) From 2bd8f926bad39beab14c30ab3dfc4fc040cb836c Mon Sep 17 00:00:00 2001 From: mathiasg Date: Tue, 30 Oct 2018 17:06:37 -0400 Subject: [PATCH 3/4] fix: keep old logger --- nipype/pipeline/plugins/slurm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nipype/pipeline/plugins/slurm.py b/nipype/pipeline/plugins/slurm.py index 6340cba4cc..bbd5f4f9bf 100644 --- a/nipype/pipeline/plugins/slurm.py +++ b/nipype/pipeline/plugins/slurm.py @@ -17,7 +17,7 @@ from ...interfaces.base import CommandLine from .base import SGELikeBatchManagerBase, logger -iflogger = logging.getLogger('nipype.workflow') +iflogger = logging.getLogger('nipype.interface') class SLURMPlugin(SGELikeBatchManagerBase): From 912dde8e114de2d47466520eb53db5648f0b97c6 Mon Sep 17 00:00:00 2001 From: mathiasg Date: Thu, 1 Nov 2018 13:23:26 -0400 Subject: [PATCH 4/4] fix: log message --- nipype/pipeline/plugins/slurm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/nipype/pipeline/plugins/slurm.py b/nipype/pipeline/plugins/slurm.py index bbd5f4f9bf..4645e52fba 100644 --- a/nipype/pipeline/plugins/slurm.py +++ b/nipype/pipeline/plugins/slurm.py @@ -74,8 +74,8 @@ def _is_pending(self, taskid): in ['Socket timed out', 'not available at the moment']): # do not raise error and allow recheck logger.warning( - "SLURM timeout encountered while checking job status, - "treating job %d as pending", taskid + "SLURM timeout encountered while checking job status," + " treating job %d as pending", taskid ) return True if 'Invalid job id' not in str(e):