Skip to content

Commit 8d4a247

Browse files
committed
fix(cache): improve NFS/lock error handling in DefaultCacheStore
Three improvements to how DefaultCacheStore locates and opens the LevelDB cache:

1. Log the root cause: when openDb() fails, the underlying LevelDB exception is now logged at DEBUG level so that it appears in .nextflow.log, aiding diagnosis.

2. Add NXF_CACHE_DIR env-var support: the new resolveCacheBaseDir() helper checks for the NXF_CACHE_DIR environment variable and, when no explicit home directory is passed to the constructor, uses it as the base directory for the LevelDB cache. This lets users on NFS / Lustre / GPFS clusters redirect only the cache DB to a local path (e.g. /tmp) without having to move the entire pipeline launch directory.

3. Improve the error message: the previous message was easy to misread as saying that '-w' (which controls the work directory, not the cache directory) relocates the cache. The new message names the likely root cause and documents both remedies: running Nextflow from a local directory while pointing the work directory at the shared filesystem with -w, or setting NXF_CACHE_DIR.

Fixes: DefaultCacheStore caught a generic Exception from LevelDB when the launch directory was on a filesystem without POSIX file-lock support (NFS, Lustre, GPFS, BeeGFS, …) and emitted a misleading error message.

Signed-off-by: Matthias De Smet <11850640+matthdsm@users.noreply.github.com>
1 parent f39afea commit 8d4a247

File tree

1 file changed

+33
-4
lines changed

1 file changed

+33
-4
lines changed

modules/nextflow/src/main/groovy/nextflow/cache/DefaultCacheStore.groovy

Lines changed: 33 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import java.nio.file.Path
2020

2121
import com.google.common.hash.HashCode
2222
import groovy.transform.CompileStatic
23+
import groovy.util.logging.Slf4j
2324
import nextflow.Const
2425
import nextflow.exception.AbortOperationException
2526
import nextflow.util.CacheHelper
@@ -32,6 +33,7 @@ import org.iq80.leveldb.impl.Iq80DBFactory
3233
*
3334
* @author Paolo Di Tommaso <paolo.ditommaso@gmail.com>
3435
*/
36+
@Slf4j
3537
@CompileStatic
3638
class DefaultCacheStore implements CacheStore {
3739

@@ -63,11 +65,31 @@ class DefaultCacheStore implements CacheStore {
6365
this.KEY_SIZE = CacheHelper.hasher('x').hash().asBytes().size()
6466
this.uniqueId = uniqueId
6567
this.runName = runName
66-
this.baseDir = home ?: Const.appCacheDir.toAbsolutePath()
68+
this.baseDir = home ?: resolveCacheBaseDir()
6769
this.dataDir = baseDir.resolve("cache/$uniqueId")
6870
this.indexFile = dataDir.resolve("index.$runName")
6971
}
7072

73+
/**
74+
* Resolve the base directory for the cache DB.
75+
*
76+
* The {@code NXF_CACHE_DIR} environment variable can be used to redirect
77+
* the LevelDB cache to a local filesystem that supports file locking
78+
* (e.g. {@code export NXF_CACHE_DIR=/tmp/nxf-cache}) when the pipeline
79+
* launch directory resides on a network filesystem such as NFS or Lustre.
80+
*
81+
* When {@code NXF_CACHE_DIR} is not set, the default ({@code .nextflow/}
82+
* relative to the launch directory) is used.
83+
*/
84+
private static Path resolveCacheBaseDir() {
    // Groovy truth treats both an unset (null) and an empty variable as "not provided",
    // in which case the default cache location is used.
    final String customDir = System.getenv('NXF_CACHE_DIR')
    if( !customDir )
        return Const.appCacheDir.toAbsolutePath()

    // Record the override so the effective cache location can be traced in the log.
    log.debug "Using NXF_CACHE_DIR for cache base directory: $customDir"
    // A relative value is resolved against the current working directory.
    return Path.of(customDir).toAbsolutePath()
}
92+
7193
private void openDb() {
7294
// make sure the db path exists
7395
dataDir.mkdirs()
@@ -90,11 +112,18 @@ class DefaultCacheStore implements CacheStore {
90112
throw new IOException(msg)
91113
}
92114
else {
115+
// Log the underlying cause so it is visible in .nextflow.log for diagnosis.
116+
log.debug "Failed to open LevelDB cache at path: $file -- cause: ${e.message}", e
93117
msg = "Can't open cache DB: $file"
94118
msg += '\n\n'
95-
msg += "Nextflow needs to be executed in a shared file system that supports file locks.\n"
96-
msg += "Alternatively, you can run it in a local directory and specify the shared work\n"
97-
msg += "directory by using the `-w` command line option."
119+
msg += "The Nextflow cache DB is located on a filesystem that does not support file locking.\n"
120+
msg += "This is common when the pipeline is launched from a network filesystem (NFS, Lustre, GPFS, …).\n"
121+
msg += "\nTo fix this, choose one of the following options:\n"
122+
msg += " 1. Run Nextflow from a local directory and point the work directory to your shared\n"
123+
msg += " filesystem using the `-w` command line option.\n"
124+
msg += " 2. Set the NXF_CACHE_DIR environment variable to a local path so that only the\n"
125+
msg += " cache DB is redirected to a lock-capable filesystem, e.g.:\n"
126+
msg += " export NXF_CACHE_DIR=/tmp/nxf-cache-\$USER"
98127
throw new IOException(msg, e)
99128
}
100129
}

0 commit comments

Comments
 (0)