diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml
index 0540c37..595eff1 100644
--- a/.github/workflows/docker-build.yml
+++ b/.github/workflows/docker-build.yml
@@ -21,7 +21,7 @@ jobs:
       - name: Checkout
         uses: actions/checkout@v3
-      - name: Build and push R container
+      - name: Build and push R container to Docker Hub
         uses: mr-smithers-excellent/docker-build-push@v6.4
         with:
           image: fishbotherer/r-tools
@@ -31,5 +31,4 @@ jobs:
           dockerfile: containers/r/Dockerfile
           directory: containers/r
           username: ${{ secrets.DOCKER_LOGIN }}
-          password: ${{ secrets.DOCKER_TOKEN }}
-
+          password: ${{ secrets.DOCKER_TOKEN }}
\ No newline at end of file
diff --git a/conf/base.config b/conf/base.config
index 80854f0..ba04208 100644
--- a/conf/base.config
+++ b/conf/base.config
@@ -1,31 +1,23 @@
-int cores = (int)get_p(params,'maxCpus')
-nextflow.util.MemoryUnit mem = get_p(params,'maxMemory') as nextflow.util.MemoryUnit
-nextflow.util.Duration d = get_p(params,'maxTime') as nextflow.util.Duration
-insect = get_p(params,"insect")
-blast = get_p(params,"blast")
-demux = get_p(params,"demultiplexedBy") == "index"
-split = get_p(params,"split")
-
-
 /*
  * general process options (mainly which labels go with which docker/singularity images)
  * but also the base resource usage settings
  */
 process {
-    withLabel: 'fastqc' { container = 'biocontainers/fastqc:v0.11.9_cv7' }
-    withLabel: 'adapterRemoval' { container = 'biocontainers/adapterremoval:v2.2.0-1-deb_cv1' }
-    withLabel: 'obitools' { container = 'biocontainers/obitools:v1.2.12dfsg-2-deb_cv1' }
-    withLabel: 'r' { container = 'fishbotherer/r-tools:latest' }
-    withLabel: 'shell' { container = 'nextflow/bash:latest' }
-    withLabel: 'multiqc' { container = 'multiqc/multiqc:latest'}
+    withName: 'fastqc' { container = 'quay.io/biocontainers/fastqc:0.12.1--hdfd78af_0' }
+    withName: 'filter_merge' { container = 'quay.io/biocontainers/adapterremoval:2.3.4--pl5321haf24da9_1' }
+    withName: 'multiqc' { container = 'quay.io/biocontainers/multiqc:1.27.1--pyhdfd78af_0' }
+    withLabel: 'shell' { container = 'quay.io/nextflow/bash:latest' }
+    withLabel: 'obitools' { container = 'quay.io/biocontainers/obitools:1.2.13--py27heb79e2c_3' }
+    withLabel: 'blast' { container = 'quay.io/biocontainers/blast:2.17.0--h66d330f_0' }
+    withLabel: 'r' { container = 'fishbotherer/r-tools:latest' }
 
     withLabel: 'denoiser' {
         if (params.execDenoiser) container = ''
         else if (params.denoiser == "vsearch") {
-            container = 'biocontainers/vsearch:v2.10.4-1-deb_cv1'
+            container = 'quay.io/biocontainers/vsearch:2.30.0--hd6d6fdc_0'
         } else if (params.denoiser in ['usearch','usearch32']) {
-            container = 'sunqiangkun/usearch:v1'
+            container = 'quay.io/biocontainers/usearch:12.0_beta--h9ee0642_1'
         }
     }
@@ -68,34 +60,10 @@ process {
         errorStrategy = 'retry'
         maxRetries = 2
     }
-
-//     // allocate cpus to AdapterRemoval
-//     withLabel: 'demux_cpus' {
-//         if (get_p(params,'demultiplexedBy') == 'barcode' ) {
-//             cpus = cores > 1 ? cores-1 : cores
-//         } else {
-//             cpus = 4 * task.attempt
-//         }
-//     }
-
-    // allow all cpus
-    withLabel: 'all_cpus' { cpus = { cores } }
-
-    // allocate blast cpus
-    withLabel: 'blast' {
-        // cpus = { params.insect ? cores / 2 : cores }
-        // memory = { params.insect ? mem / 2 : mem }
-        container = 'ncbi/blast:latest'
-    }
-
-    // alloacte insect cpus
-    withLabel: 'insect' {
-        // cpus = { params.blast ? cores / 2 : cores }
-        // memory = { params.blast ? mem / 2 : mem }
-        container = 'fishbotherer/r-tools:latest'
-    }
+    withLabel: 'all_cpus' { cpus = { (int)params.maxCpus } }
 
-  cache = 'lenient'
+    cache = 'lenient'
 
     // set default cpus and memory
     cpus = { check_max( 1 * task.attempt, 'cpus' ) }
diff --git a/containers/r/Dockerfile b/containers/r/Dockerfile
index 44d13ed..31f8cf7 100644
--- a/containers/r/Dockerfile
+++ b/containers/r/Dockerfile
@@ -6,6 +6,6 @@ MAINTAINER "Mykle Hoban" <mykle.hoban@hawaii.edu>
 RUN /rocker_scripts/install_tidyverse.sh
 
 # install packages
-RUN Rscript -e 'install.packages(c("insect","optparse","phangorn","furrr","worrms","rentrez"))'
+RUN Rscript -e 'install.packages(c("insect","optparse","phangorn"))'
 RUN Rscript -e 'devtools::install_github(c("tobiasgf/lulu","GuillemSalazar/EcolUtils"))'
 RUN Rscript -e 'BiocManager::install(c("phyloseq","dada2","DECIPHER","decontam"))'
\ No newline at end of file
diff --git a/nextflow.config b/nextflow.config
index 2dfe8f3..18f8228 100644
--- a/nextflow.config
+++ b/nextflow.config
@@ -6,7 +6,7 @@ manifest {
     mainScript = 'rainbow_bridge.nf'
     defaultBranch = 'main'
     version = '1.33.8'
-    nextflowVersion = '!>=22.03'
+    nextflowVersion = '!>=25.04.7'
 }
 
 
@@ -73,6 +73,7 @@ try {
 }
 
 /* general script options */
+params.saveConfig = false
 params.preDir = 'preprocess'
 params.outDir = 'output'
 params.publishMode = "symlink"
@@ -164,53 +165,33 @@ params.maxTime = 240.h
 // ourselves if we want consistency. what we'll do is give the
 // kebab-case version priority
 
-// convert string to camelCase
-def to_camel(s) {
-  return s.replaceAll(/(?<a>[a-z])-(?<b>[a-z])/) {
-    m,a,b -> a+b.toUpperCase()
-  }
-}
-
-// convert string to kebab-case
-def to_kebab(s) {
-  return s.replaceAll(/([a-z])([A-Z])/,'$1-$2').toLowerCase()
-}
-
-def get_p(p,k) {
-  if (p.containsKey(to_kebab(k))) {
-    return p.get(to_kebab(k))
-  } else {
-    return p.get(k)
-  }
-}
-
 // Function to ensure that resource requirements don't go beyond a maximum limit
 def check_max(obj, type) {
   if (type == 'memory') {
     try {
-      if (obj.compareTo(get_p(params,'maxMemory') as nextflow.util.MemoryUnit) == 1)
-        return get_p(params,'maxMemory') as nextflow.util.MemoryUnit
+      if (obj.compareTo(params.maxMemory as nextflow.util.MemoryUnit) == 1)
+        return params.maxMemory as nextflow.util.MemoryUnit
       else
        return obj
     } catch (all) {
-      println " ### ERROR ### Max memory '${get_p(params,'maxMemory')}' is not valid! Using default value: $obj"
+      println " ### ERROR ### Max memory '${params.maxMemory}' is not valid! Using default value: $obj"
      return obj
    }
  } else if (type == 'time') {
    try {
-      if (obj.compareTo(get_p(params,'maxTime') as nextflow.util.Duration) == 1)
-        return get_p(params,'maxTime') as nextflow.util.Duration
+      if (obj.compareTo(params.maxTime as nextflow.util.Duration) == 1)
+        return params.maxTime as nextflow.util.Duration
      else
        return obj
    } catch (all) {
-      println " ### ERROR ### Max time '${get_p(params,'maxTime')}' is not valid! Using default value: $obj"
+      println " ### ERROR ### Max time '${params.maxTime}' is not valid! Using default value: $obj"
      return obj
    }
  } else if (type == 'cpus') {
    try {
-      return Math.min( obj, get_p(params,'maxCpus') as int )
+      return Math.min( obj, params.maxCpus as int )
    } catch (all) {
-      println " ### ERROR ### Max cpus '${get_p(params,'maxCpus')}' is not valid! Using default value: $obj"
+      println " ### ERROR ### Max cpus '${params.maxCpus}' is not valid! Using default value: $obj"
      return obj
    }
  }
 }
@@ -229,25 +210,22 @@ profiles {
    // make default executor local
    // and limit max cpus to param value
    executor.name = 'local'
-    executor.cpus = (int)get_p(params,'maxCpus')
-    executor.memory = get_p(params,'maxMemory')
+    executor.cpus = (int)params.maxCpus
+    executor.memory = params.maxMemory
 
    singularity {
      /* enable singularity and have it do automounts */
      enabled = true
      autoMounts = true
 
-      String bd = get_p(params,'bindDir')
-      String sc = get_p(params,'singularityCache')
-
      // construct options for singularity bind directories
-      if (bd && bd != '') {
-        runOptions = "-B " + bd.split().join(" -B ")
+      if (params.bindDir && params.bindDir != '') {
+        runOptions = "-B " + params.bindDir.split().join(" -B ")
      }
 
      // set singularity cache directory if specified
-      if (sc && sc != "") {
-        cacheDir = sc
+      if (params.singularityCache && params.singularityCache != "") {
+        cacheDir = params.singularityCache
      }
    }
  }
diff --git a/rainbow_bridge.nf b/rainbow_bridge.nf
index a2eb260..e6b739d 100755
--- a/rainbow_bridge.nf
+++ b/rainbow_bridge.nf
@@ -196,7 +196,6 @@ def check_params() {
 
 // trim and (where relevant) merge paired-end reads
 process filter_merge {
-  label 'adapterRemoval'
   label 'process_medium'
 
   publishDir "${params.preDir}/trim_merge", mode: params.publishMode
@@ -364,7 +363,6 @@ process relabel {
    tuple val(key), path(fastq, name: 'input-????.fastq')
 
  output:
    path('*_relabeled.fasta'), optional: true, emit: result
-    path 'settings.txt'
 
  script:
@@ -372,10 +370,9 @@ process relabel {
    if (params.denoiser == "vsearch") {
      def combined = "<(cat input-*.fastq)"
      """
-      echo "denoiser: vsearch" > settings.txt
      # this may or may not be necessary anymore, but it seems like a good sanity check
      # since this will fail on empty files
-      vsearch --threads ${task.cpus} --fastq_qmax ${params.maxQuality} --fastx_filter ${combined} --relabel "${key}." --fastaout - | \
+      vsearch --threads ${task.cpus} --fastq_qmax ${params.maxQuality} --fastx_filter ${combined} --relabel "${key}." --label_suffix ";sample=${key}" --fastaout - | \
        awk '/^>/ {print;} !/^>/ {print(toupper(\$0))}' > "${key}_relabeled.fasta"
      """
    } else {
@@ -383,7 +380,6 @@ process relabel {
      def combined = "combined.fastq"
      def denoiser = params.execDenoiser ? params.denoiser : 'usearch'
      """
-      echo "denoiser: ${params.denoiser}" > settings.txt
      cat input-*.fastq > ${combined}
      # we have to convert everything to uppercase because obisplit --uppercase is broken
      ${denoiser} -fastx_relabel ${combined} -prefix "${key}." -fastaout /dev/stdout | \
@@ -426,16 +422,11 @@ process dereplicate {
  output:
    tuple val(id), path("${id}_unique.fasta"), path("${id}_zotus.fasta"), path("zotu_table.tsv"), emit: result
-    path 'settings.txt'
    path 'zotu_map.tsv'
 
  script:
    if (params.denoiser == "vsearch") {
      """
-      echo "denoiser: vsearch" > settings.txt
-      echo "minimum sequence abundance: ${params.minAbundance}" >> settings.txt
-      echo "alpha: ${params.alpha}" >> settings.txt
-      echo "fractional identity: ${params.zotuIdentity}" >> settings.txt
      # steps:
      # 1. get unique sequence variants
      # 2. run denoising algorithm
      # 3. remove chimeras
@@ -443,19 +434,19 @@ process dereplicate {
      # 4. match original sequences to zotus by 97% identity
      if [ -s "${relabeled_merged}" ]; then
        vsearch \
-          --threads ${task.cpus} --fastq_qmax ${params.maxQuality} \
+          --threads ${task.cpus} \
          --derep_fulllength ${relabeled_merged} --sizeout \
          --output "${id}_unique.fasta"
        vsearch \
-          --threads ${task.cpus} --fastq_qmax ${params.maxQuality} \
+          --threads ${task.cpus} \
          --cluster_unoise "${id}_unique.fasta" --centroids "${id}_centroids.fasta" \
          --minsize ${params.minAbundance} --unoise_alpha ${params.alpha}
        vsearch \
-          --threads ${task.cpus} --fastq_qmax ${params.maxQuality} \
+          --threads ${task.cpus} \
          --uchime3_denovo "${id}_centroids.fasta" --nonchimeras "${id}_zotus.fasta" \
          --relabel Zotu
        vsearch \
-          --threads ${task.cpus} --fastq_qmax ${params.maxQuality} \
+          --threads ${task.cpus} \
          --usearch_global ${relabeled_merged} --db "${id}_zotus.fasta" \
          --id ${params.zotuIdentity} --otutabout zotu_table.tsv \
          --userout zotu_map.tsv --userfields "query+target" \
@@ -468,10 +459,6 @@ process dereplicate {
    } else {
      def denoiser = params.execDenoiser ? params.denoiser : 'usearch'
      """
-      echo "denoiser: ${denoiser}" > settings.txt
-      echo "minimum sequence abundance: ${params.minAbundance}" >> settings.txt
-      echo "alpha: ${params.alpha}" >> settings.txt
-      echo "fractional identity: ${params.zotuIdentity}" >> settings.txt
      # steps:
      # 1. get unique sequences
      # 2. run denoising & chimera removal
@@ -509,7 +496,7 @@ process blast {
 
  output:
    path("blast_result.tsv"), emit: result
-    path 'blast_settings.txt'
+    path 'settings.yml'
 
  script:
@@ -530,8 +517,8 @@ process blast {
    // get extra blast options passed on the command line as --blastn-*
    def extra_options = params
-      .findAll { it.key =~ /^blastn-/ }
-      .collectEntries { k, v -> [k.tokenize('-')[1],v] }
+      .findAll { it.key =~ /^blastn[A-Z].+/ }
+      .collectEntries { k, v -> [k.replaceAll(/^blastn/,'').toLowerCase(),v] }
 
    // merge blast options with any extra options
    blast_options = blast_options << extra_options
 
@@ -541,10 +528,10 @@ process blast {
      .join(" ")
    """
    # record blast settings
-    echo "Percent identity: ${pid}" > blast_settings.txt
-    echo "e-value: ${evalue}" >> blast_settings.txt
-    echo "Query qoverage: ${qcov}" >> blast_settings.txt
-    echo "Max. target sequences: ${params.maxQueryResults}" >> blast_settings.txt
+    echo "percent_identity: ${pid}" > settings.yml
+    echo "e_value: ${evalue}" >> settings.yml
+    echo "query_coverage: ${qcov}" >> settings.yml
+    echo "max_sequences: ${params.maxQueryResults}" >> settings.yml
 
    # set BLASTDB to local working directory
    export BLASTDB=.
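The blast hunks above also change how pass-through blastn options are collected: instead of matching the kebab-case parameter keys (--blastn-*), the process now matches the camelCase form that Nextflow registers alongside them (a flag like --blastn-task appears in params as both 'blastn-task' and 'blastnTask'). A minimal plain-Groovy sketch of the new key-to-flag mapping; the sample map is hypothetical:

    // stand-in for Nextflow's params map
    def cli = [blastnTask: 'megablast', blastnPenalty: -3, maxCpus: 8]

    // keep camelCase keys that extend the 'blastn' prefix, then strip the
    // prefix and lowercase the remainder to recover the blastn flag name
    def extra = cli
        .findAll { it.key =~ /^blastn[A-Z].+/ }
        .collectEntries { k, v -> [k.replaceAll(/^blastn/, '').toLowerCase(), v] }

    // joined the same way the process builds its option string
    assert extra.collect { k, v -> "-${k} ${v}" }.join(' ') == '-task megablast -penalty -3'

Note that the lowercasing round-trips only single-word blastn flags (-task, -penalty, -gapopen, and so on); a two-word flag such as -word_size would come out as 'wordsize'.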
@@ -595,14 +582,9 @@ process lulu {
 
  output:
    tuple path("lulu_zotu_table.tsv"), path("lulu_zotu_map.tsv"), path("lulu_result_object.rds"), emit: result
-    path 'settings.txt'
 
  script:
    """
-    echo "minimum ratio: ${params.luluMinRatio}" > settings.txt
-    echo "minimum ratio type: ${params.luluMinRatioType}" >> settings.txt
-    echo "minimum match: ${params.luluMinMatch}" >> settings.txt
-    echo "minimum RC: ${params.luluMinRc}" >> settings.txt
    lulu.R \
      -m ${params.luluMinRatio} \
      -t ${params.luluMinRatioType} \
@@ -629,7 +611,7 @@ process collapse_taxonomy {
 
  output:
    path("lca_taxonomy.tsv"), emit: taxonomy
    path("lca_intermediate.tsv")
-    path 'lca_settings.txt'
+    path 'settings.yml'
 
  script:
@@ -638,12 +620,12 @@ process collapse_taxonomy {
    params.lcaCaseInsensitive && pf << "--case-insensitive"
    """
    # save settings
-    echo "Minimum query coverage %: ${params.lcaQcov}" > lca_settings.txt
-    echo "Minimum percent identity: ${params.lcaPid}" >> lca_settings.txt
-    echo "Minium percent identity difference: ${params.lcaDiff}" >> lca_settings.txt
-    echo "Filter to maximum query coverage: ${params.lcaFilterMaxQcov ? 'yes' : 'no'}" >> lca_settings.txt
-    echo "Filter taxa by regex: ${params.lcaTaxonFilter}" >> lca_settings.txt
-    echo "Taxon filter case sensitive: ${!params.lcaCaseInsensitive ? 'yes' : 'no'}" >> lca_settings.txt
+    echo "min_query_coverage: ${params.lcaQcov}" > settings.yml
+    echo "min_percent_identity: ${params.lcaPid}" >> settings.yml
+    echo "pid_diff: ${params.lcaDiff}" >> settings.yml
+    echo "filter_max_query_coverage: ${params.lcaFilterMaxQcov ? 'yes' : 'no'}" >> settings.yml
+    echo "taxon_filter: ${params.lcaTaxonFilter}" >> settings.yml
+    echo "taxon_filter_case_sensitive: ${!params.lcaCaseInsensitive ? 'yes' : 'no'}" >> settings.yml
 
    collapse_taxonomy.R \
      --qcov ${params.lcaQcov} \
@@ -663,7 +645,7 @@ process collapse_taxonomy {
 
 // run insect classifier model
 process insect {
-  label 'insect'
+  label 'r'
  label 'all_cpus'
 
  publishDir {
@@ -680,7 +662,7 @@ process insect {
  output:
    path('insect_taxonomy.tsv'), emit: taxonomy
    path('insect_model.rds')
-    path('insect_settings.txt')
+    path('settings.yml')
 
  script:
    def offs = String.format("%d",(Integer)num(params.insectOffset))
@@ -690,10 +672,10 @@ process insect {
    """
    # record insect settings
-    echo "Offset: ${offs}" > insect_settings.txt
-    echo "Threshold: ${thresh}" >> insect_settings.txt
-    echo "Minimum count: ${minc}" >> insect_settings.txt
-    echo "Ping: ${ping}" >> insect_settings.txt
+    echo "offset: ${offs}" > settings.yml
+    echo "threshold: ${thresh}" >> settings.yml
+    echo "minimum_count: ${minc}" >> settings.yml
+    echo "ping: ${ping}" >> settings.yml
 
    if [ "${classifier}" != "insect_model.rds" ]; then
      mv ${classifier} insect_model.rds
@@ -1074,9 +1056,8 @@ workflow {
      // (notably vsearch) will cut on hyphens as a delimiter and potentially cause havoc as a result.
      Channel.fromFilePairs(pattern, checkIfExists: true) |
-        // make sure we have a key value and replace dashes with underscores
-        // (although this may be a problem sometimes! see #138)
-        map { key,reads -> [ (key ?: params.project).replaceAll(/-/,'_'), reads ] } |
+        // make sure we have a key value (project ID)
+        map { key,reads -> [ key ?: params.project, reads ] } |
        ifEmpty {
          // bail if we didn't find anything
          exit(1,"No paired reads matched by pattern '${pattern}'. Check command-line options.")
@@ -1596,4 +1577,27 @@ workflow {
      }
    }
  }
+
+  // save the config file in yaml format
+  // TODO: saving the config file to a new file seems to break caching for the blast process?
+  // but downstream things that rely on blast are still cached. it makes no sense
+  if (params.saveConfig) {
+    // default to 'options.yml' in the launch directory
+    def config_file = launchDir / "options.yml"
+    // if it's a string, use that as the location
+    if (params.saveConfig instanceof String) {
+      config_file = params.saveConfig
+    }
+
+    // make sure the yaml is dumped in block format
+    def opts = new org.yaml.snakeyaml.DumperOptions()
+    opts.setDefaultFlowStyle(org.yaml.snakeyaml.DumperOptions.FlowStyle.BLOCK)
+    opts.setPrettyFlow(true)
+    def y = new org.yaml.snakeyaml.Yaml(opts)
+
+    // dump the yaml file
+    new File(config_file.toString()).withWriter { w ->
+      y.dump(new LinkedHashMap(params).collectEntries { [it.key.toString(), it.value instanceof nextflow.util.Duration ? it.value.toString() : it.value] }, w)
+    }
+  }
 }
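With the new --saveConfig option the resolved parameters are written out as block-style YAML, either to options.yml in the launch directory or to a path given as the option's value (e.g. nextflow run rainbow_bridge.nf --saveConfig run1.yml). A standalone Groovy sketch of the SnakeYAML usage in the block above; the config map is a hypothetical stand-in for Nextflow's params:

    @Grab('org.yaml:snakeyaml:2.2')
    import org.yaml.snakeyaml.DumperOptions
    import org.yaml.snakeyaml.Yaml

    // BLOCK flow style emits one 'key: value' per line rather than inline {...}
    def opts = new DumperOptions()
    opts.setDefaultFlowStyle(DumperOptions.FlowStyle.BLOCK)
    opts.setPrettyFlow(true)

    // stand-in for params; Duration values are stringified first because
    // SnakeYAML has no representer for nextflow.util.Duration
    def config = [project: 'demo', maxCpus: 8, maxTime: '240h']

    new File('options.yml').withWriter { w -> new Yaml(opts).dump(config, w) }
    // options.yml now reads:
    //   project: demo
    //   maxCpus: 8
    //   maxTime: 240h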