Skip to content

Commit 70cacfe

Browse files
committed
performance tuning
1 parent 2ccb342 commit 70cacfe

File tree

3 files changed

+15
-10
lines changed

3 files changed

+15
-10
lines changed

vcell-client/src/main/java/cbit/vcell/client/ClientSimManager.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -277,7 +277,7 @@ public void run(Hashtable<String, Object> hashTable) throws Exception {
277277
if(failure) { // just open some dialog for now; eventually we'll have some unobtrusive visual notification
278278
PopupGenerator.showErrorDialog(ClientSimManager.this.getDocumentWindowManager(), "PostProcessing failed");
279279
} else {
280-
PopupGenerator.showInfoDialog(ClientSimManager.this.getDocumentWindowManager(), "PostProcessing successful");
280+
// PopupGenerator.showInfoDialog(ClientSimManager.this.getDocumentWindowManager(), "PostProcessing successful");
281281
}
282282
}
283283
};

vcell-server/src/main/java/cbit/vcell/message/server/htc/slurm/SlurmProxy.java

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -419,10 +419,10 @@ public String getPostProcessCommands() {
419419

420420
}
421421

422-
private static int getRoundedMemoryLimit(long memPerTaskMB) {
423-
int rawLimit = (int)(memPerTaskMB * 0.9);
424-
// Round down to nearest 100 MB
425-
return (rawLimit / 100) * 100;
422+
/**
 * Rounds a memory size up to a whole number of allocation blocks.
 *
 * @param memPerTaskMB memory size to round; the round-up arithmetic below assumes a
 *                     non-negative value, since Java integer division truncates toward zero
 * @param blockSizeMB  block size to round up to; a value of 0 makes the division below
 *                     throw ArithmeticException (no validation is performed here)
 * @return the smallest multiple of blockSizeMB that is &gt;= memPerTaskMB, narrowed to int
 */
private static int roundUpToBlock(long memPerTaskMB, int blockSizeMB) {
423+
long block = blockSizeMB; // widen to long so the arithmetic below is carried out in 64 bits
424+
// Ceiling division: adding (block - 1) before the truncating divide bumps any
// non-multiple up to the next block boundary; exact multiples are left unchanged.
long rounded = ((memPerTaskMB + block - 1) / block) * block;
425+
// NOTE(review): unchecked narrowing cast - a result above Integer.MAX_VALUE would wrap silently.
return (int) rounded;
426426
}
427427
private static String extractUser(ExecutableCommand.Container commandSet) {
428428
for (ExecutableCommand ec: commandSet.getExecCommands()) {
@@ -661,9 +661,14 @@ String generateLangevinBatchScript(String jobName, ExecutableCommand.Container
661661
SolverDescription solverDescription = std.getSolverDescription();
662662
MemLimitResults memoryMBAllowed = HtcProxy.getMemoryLimit(vcellUserid, simID, solverDescription, memSizeMB, simTask.isPowerUser());
663663

664-
int timeoutPerTaskSeconds = 28800; // 8 hours TODO: do we hardcode this? Should it be part of LangevinSimulationOptions?
664+
// TODO: should these values be hardcoded? They could be part of LangevinSimulationOptions or, even better, configurable properties.
665+
int timeoutPerTaskSeconds = 86400; // seconds. 24 hours
666+
long hardbBtchMemoryLimitPerTask = 1024; // MB. we hard limit mem to 1G for langevin batch jobs
667+
int blockSizeMB = 256; // MB. SLURM memory allocation granularity
665668
String slurmJobTimeout = computeSlurmTimeLimit(totalNumberOfJobs, numberOfConcurrentTasks, timeoutPerTaskSeconds);
666-
int javaMemXmx = getRoundedMemoryLimit(memoryMBAllowed.getMemLimit());
669+
long batchMemoryLimitPerTask = memoryMBAllowed.getMemLimit();
670+
batchMemoryLimitPerTask = Math.min(batchMemoryLimitPerTask, hardbBtchMemoryLimitPerTask);
671+
int javaMemXmx = roundUpToBlock(batchMemoryLimitPerTask, blockSizeMB) + blockSizeMB; // add extra block for overhead
667672

668673
// -------------------------------------------------------------
669674

vcell-server/src/test/resources/slurm_fixtures/langevin/V_TEST2_999999999_0_0.slurm.sub

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
#SBATCH --cpus-per-task=1
1010
#SBATCH --mem-per-cpu=4096M
1111
#SBATCH --nodes=1
12-
#SBATCH --time=52:49:00 # timeout for the entire job
12+
#SBATCH --time=6-14:24:00 # timeout for the entire job
1313
#SBATCH --no-kill
1414
#SBATCH --no-requeue
1515

@@ -22,7 +22,7 @@ set +e
2222
USERID=danv
2323
SIM_ID=999999999
2424
TOTAL_JOBS=8 # to be set by generator to lso.getTotalNumberOfJobs()
25-
JOB_TIMEOUT_SECONDS=28800 # per-job timeout (seconds), adjust per generator
25+
JOB_TIMEOUT_SECONDS=86400 # per-job timeout (seconds), adjust per generator
2626
LOG_FILE="/share/apps/vcell3/htclogs/V_TEST2_999999999_0_.submit.log"
2727
MESSAGING_CONFIG_FILE="/share/apps/vcell3/users/danv/SimID_999999999_0_.langevinMessagingConfig"
2828

@@ -94,7 +94,7 @@ container_bindings+="--bind /share/apps/vcell12/users:/share/apps/vcell12/users
9494
container_bindings+="--bind /share/apps/vcell3/htclogs:/htclogs "
9595
container_bindings+="--bind /scratch/vcell:/solvertmp "
9696

97-
container_env="--env java_mem_Xmx=3600M "
97+
container_env="--env java_mem_Xmx=1280M "
9898
container_env+="--env jmshost_sim_internal=k8s-wn-01.cam.uchc.edu "
9999
container_env+="--env jmsport_sim_internal=31618 "
100100
container_env+="--env jmsrestport_sim_internal=30163 "

0 commit comments

Comments
 (0)