Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added maximum queue limit warnings for SLURM and PBS #111

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions devtools/install_all.sh
Original file line number Diff line number Diff line change
Expand Up @@ -83,8 +83,8 @@ fi

#Compile RMG-Py
make
#Update pyjulia to the latest version
$COMMAND_PKG update pyjulia -c conda_forge -y
#Update pyjulia to the latest version - not needed at the moment, since 0.6.1 is the latest version on RMG-Py
#$COMMAND_PKG update pyjulia -c conda_forge -y

#Ensure that added paths etc. are set and then reactivate rmg_env
. ~/.bashrc
Expand Down
25 changes: 25 additions & 0 deletions t3/runners/rmg_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,23 @@ def submit_job(project_directory: str,
if not len(stdout):
time.sleep(10)
stdout, stderr = execute_command(cmd)
if stderr:
if cluster_soft.lower() == 'slurm' and any('AssocMaxSubmitJobLimit' in err_line for err_line in stderr):
logger.warning('Max number of submitted jobs was reached, sleeping...')
time.sleep(5 * 60)
submit_job(project_directory=project_directory,
logger=logger,
cluster_soft=cluster_soft,
memory=memory
)
if cluster_soft.lower() == 'pbs' and any('qsub: would exceed' in err_line for err_line in stderr):
logger.warning('Max number of submitted jobs was reached, sleeping...')
time.sleep(5 * 60)
submit_job(project_directory=project_directory,
logger=logger,
cluster_soft=cluster_soft,
memory=memory
)
if not len(stdout):
return None, None
if len(stderr) > 0 or len(stdout) == 0:
Expand Down Expand Up @@ -186,6 +203,14 @@ def run_rmg_incore(rmg_input_file_path: str,
stdout, stderr = execute_command(commands, shell=True, no_fail=True, executable='/bin/bash')
if 'RMG threw an exception and did not converge.\n' in stderr:
return True
# Check for an error file (err.txt) that can be generated by Julia
# TODO: Improve the error checking. This situation arose from a TypeError in Julia introduced by a recent commit.
if os.path.isfile(os.path.join(project_directory, 'err.txt')):
with open(os.path.join(project_directory, 'err.txt'), 'r') as f:
lines = f.readlines()
if 'Traceback (most recent call last):\n' in lines:
# Raise an error that will inform the user that Julia has failed
raise RuntimeError('Julia has raised an error with RMS and therefore RMG cannot complete. Please check your Julia installation\n')
return False


Expand Down
1 change: 0 additions & 1 deletion tests/test_rmg_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,6 @@ def test_rmg_job_converged(self):
assert converged
assert error is None


def teardown_module():
"""teardown any state that was previously setup with a setup_module method."""
file_paths = [os.path.join(EXAMPLES_BASE_PATH, 'minimal', 'submit.sh')]
Expand Down