Skip to content

Commit 19314a0

Browse files
committed
compute slurm job timeout based on task timeout, total num tasks, num of concurrent tasks
1 parent 81d0ea5 commit 19314a0

File tree

1 file changed

+51
-0
lines changed
  • vcell-server/src/main/java/cbit/vcell/message/server/htc/slurm

1 file changed

+51
-0
lines changed

vcell-server/src/main/java/cbit/vcell/message/server/htc/slurm/SlurmProxy.java

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -448,6 +448,45 @@ private static String extractUser(ExecutableCommand.Container commandSet) {
448448
throw new RuntimeException("Could not extract user from command set: "+commandSet);
449449
}
450450

451+
452+
/**
453+
* Compute SBATCH --time
454+
* as HH:MM:00 (seconds fixed to "00").
455+
* or as D-HH:MM:00 (if over 99 hours).
456+
*
457+
* Uses integer math only and adds a 10% cushion, rounded up to whole minutes.
458+
*
459+
* totalNumberOfJobs: total simulation tasks
460+
* numberOfConcurrentTasks: parallel slots (SLURM_NTASKS)
461+
* timeoutSeconds: per-task timeout in seconds
462+
*/
463+
public static String computeSlurmTimeLimit(int totalNumberOfJobs,
464+
int numberOfConcurrentTasks,
465+
int timeoutSeconds) {
466+
if (totalNumberOfJobs < 0) throw new IllegalArgumentException("totalNumberOfJobs >= 0 required");
467+
if (numberOfConcurrentTasks <= 0) throw new IllegalArgumentException("numberOfConcurrentTasks > 0 required");
468+
if (timeoutSeconds <= 0) throw new IllegalArgumentException("timeoutSeconds > 0 required");
469+
470+
int perTaskMinutes = (timeoutSeconds + 59) / 60; // ceiling(timeoutSeconds/60)
471+
int batches = (totalNumberOfJobs + numberOfConcurrentTasks - 1) / numberOfConcurrentTasks;
472+
long workMinutes = (long) batches * perTaskMinutes;
473+
long extraMinutes = 3L * perTaskMinutes;
474+
long totalMinutes = workMinutes + extraMinutes;
475+
long cushionedMinutes = (long) Math.ceil(totalMinutes * 1.10);
476+
477+
long totalHours = cushionedMinutes / 60;
478+
long minutes = cushionedMinutes % 60;
479+
480+
if (totalHours < 100) {
481+
return String.format("%02d:%02d:00", totalHours, minutes);
482+
} else {
483+
long days = totalHours / 24;
484+
long hours = totalHours % 24;
485+
return String.format("%d-%02d:%02d:00", days, hours, minutes);
486+
}
487+
}
488+
489+
451490
String generateLangevinBatchScript(String jobName, ExecutableCommand.Container commandSet, double memSizeMB,
452491
Collection<PortableCommand> postProcessingCommands, SimulationTask simTask) {
453492

@@ -461,11 +500,23 @@ String generateLangevinBatchScript(String jobName, ExecutableCommand.Container
461500
SolverDescription solverDescription = std.getSolverDescription();
462501
// MemLimitResults memoryMBAllowed = HtcProxy.getMemoryLimit(vcellUserid, simID, solverDescription, memSizeMB, simTask.isPowerUser());
463502

503+
int timeoutPerTaskSeconds = 300;
504+
String slurmJobTimeout = computeSlurmTimeLimit(totalNumberOfJobs, numberOfConcurrentTasks, timeoutPerTaskSeconds);
505+
464506
int memPerTask = 2048; // in MB
465507
LineStringBuilder slurmCommands = new LineStringBuilder();
466508
slurmBatchScriptInit(jobName, simTask.isPowerUser(), memPerTask, numberOfConcurrentTasks, slurmCommands);
467509
System.out.println(slurmCommands.sb.toString());
468510

511+
String user = extractUser(commandSet);
512+
System.out.println("USERID="+user);
513+
System.out.println("SIM_ID="+simID);
514+
System.out.println("TOTAL_JOBS="+totalNumberOfJobs);
515+
System.out.println("TIMEOUT_SECONDS="+user);
516+
System.out.println("USERID="+user);
517+
518+
519+
469520
int javaMemXmx = getRoundedMemoryLimit(memPerTask);
470521
LineStringBuilder singularityCommands = buildSingularitySlurmSection(simTask, javaMemXmx);
471522
System.out.println(singularityCommands.sb.toString());

0 commit comments

Comments
 (0)