#!/usr/bin/env nextflow
nextflow.enable.dsl=2
// pull in the helper class
import helper
import colors
// quick check if variable is numeric
def is_num(x) {
return x instanceof Number
}
// coerce a value to a number (parse a string, pass a number through unchanged)
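// illustrative example: num("0.97") parses the string to 0.97, while num(5) is returned as-is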
def num(x) {
if (!is_num(x)) {
return Float.parseFloat(x)
} else {
return x
}
}
// sanity check to make sure command-line parameters are correct and valid
def check_params() {
// show help message and bail
if (params.help) {
helper.usage(params)
if (params.debug) {
println("\n\n\n")
println(params)
}
exit(0)
}
// give example of what a demultiplexed FASTA file looks like
if (params.demuxedExample) {
helper.demuxed_example()
exit(0)
}
if (params.split && params.demultiplexedBy == "index") {
println(colors.red("Parameters") + colors.bred(" --split ") + colors.red("and") + colors.bred(" index-based demultiplexing ") + colors.red("are mutually exclusive"))
exit(1)
}
// make sure exactly one of --single or --paired is passed (unless input is already demultiplexed or we're running standalone taxonomy)
if (!helper.file_exists(params.demuxedFasta) && !params.standaloneTaxonomy && params.single == params.paired) {
if (!params.single) {
println(colors.red("One of either ") + colors.bred("--single") + colors.red(" or ") + colors.bred("--paired") + colors.red(" MUST be passed"))
} else {
println(colors.red("Only one of either ") + colors.bred("--single") + colors.red(" or ") + colors.bred("--paired") + colors.red(" may be passed"))
}
exit(1)
}
// check phyloseq params
if (params.phyloseq) {
if (!helper.file_exists(params.metadata)) {
println(colors.yellow("The metadata file you passed to use with phyloseq ('${params.metadata}') does not exist"))
/* exit(1) */
}
switch(params.taxonomy) {
case 'lca':
if (!params.collapseTaxonomy) {
println(colors.yellow("You passed --phyloseq with 'lca' as the taxonomy option, but LCA has not been run."))
println(colors.yellow("Did you forget the --collapse-taxonomy option?"))
}
break
case 'insect':
if (!params.insect) {
println(colors.yellow("You passed --phyloseq with 'insect' as the taxonomy option, but insect has not been run."))
println(colors.yellow("Did you forget the --insect option?"))
}
break
case 'combined':
if (!params.collapseTaxonomy && !params.insect) {
println(colors.yellow("Note: phyloseq generation requires one of --insect, --collapse-taxonomy, or a custom taxonomy table."))
}
break
default:
if (!helper.file_exists(params.taxonomy)) {
println(colors.yellow("You passed --phyloseq with a user-supplied taxonomy table, but the file '${params.taxonomy}' does not exist"))
/* exit(1) */
}
break
}
}
// check to make sure standalone taxonomy will work
if (params.standaloneTaxonomy) {
if (!helper.file_exists(params.blastFile)) {
println(colors.red("The supplied blast result table \"${params.blastFile}\" does not exist"))
exit(1)
}
}
if (params.sampleMap != "" && !helper.file_exists(params.sampleMap)) {
println(colors.red("The supplied sample map file ${params.sampleMap} does not exist"))
exit(1)
}
// the --vsearch param is no longer supported; catch it in case someone still passes it
if (params.containsKey('vsearch')) {
println(colors.yellow("FYI: vsearch is now used as the default denoiser and the ") + colors.byellow("--vsearch") + colors.yellow(" option is ignored.\nIf you want to use usearch, pass --denoiser usearch."))
}
// if denoiser is an executable, treat it as such
// otherwise check to make sure it's a valid input
if (!params.execDenoiser) {
if (!(params.denoiser in ['usearch','usearch32','vsearch'])) {
println(colors.bred("--denoiser") + colors.red(" must be either 'usearch', 'usearch32', 'vsearch', or a path to an executable (e.g., /opt/sw/bin/usearch64)"))
exit(1)
}
}
// sanity check, blast database
if (params.blast) {
// if --blast-taxdb is passed, check that it's a directory and that files exist
if (params.blastTaxdb != "nofile-blast-taxdb") {
def ok = false
// check for directory
if (helper.is_dir(params.blastTaxdb)) {
// make sure we have two files matching our search pattern
if( file(params.blastTaxdb).list().findAll{it =~ /taxdb\.bt[id]/}.size() == 2) {
ok = true
}
}
// otherwise give an error message and bail
if (!ok) {
println(colors.bred("--blast-taxdb") + colors.red(" must be a directory containing the NCBI BLAST taxonomy files (taxdb.btd, taxdb.bti)"))
exit(1)
}
}
// get blast environment variable
def bdb = helper.get_env("FLOW_BLAST")
// make --blast-db param into a list, if it isn't
def blasts = params.blastDb
if (!helper.is_list(blasts))
blasts = [blasts]
// if $FLOW_BLAST was set and we're not ignoring it, add it to the front of the list
if (bdb != "" && !params.ignoreBlastEnv)
blasts = blasts.plus(0,bdb)
// get unique vals
blasts = blasts.unique(false)
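// illustrative example (hypothetical paths): with FLOW_BLAST=/db/nt and
// --blast-db /db/custom, the list becomes ['/db/nt', '/db/custom'] after
// prepending the environment value and deduplicating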
// make sure we've got at least one db
if (!blasts.size()) {
println(colors.red("You must pass at least one value to --blast-db or FLOW_BLAST must point to a valid BLAST database"))
exit(1)
} else {
// make sure all dbs exist
blasts.each {
if (!file("${it}.ndb").exists()) {
println(colors.red("Could not find BLAST database '${it}'. Please provide the path to an existing blast database."))
if (it =~ /~/) {
println(colors.yellow("The BLAST database '${it}' contains a '~' that was not expanded by the shell. Try entering an absolute path."))
}
exit(1)
}
}
}
} else {
if (params.collapseTaxonomy) {
println(colors.red("--collapse-taxonomy requires the --blast option."))
exit(1)
}
}
// check LCA options
if (params.collapseTaxonomy) {
if (params.lcaDiff <= 0) {
println(colors.red("--lca-diff argument must be a number greater than zero."))
exit(1)
}
}
// make sure insect parameter is valid: either a file or one of the pretrained models
if (params.insect) {
if (!helper.insect_classifiers.containsKey(params.insect.toLowerCase())) {
if (!helper.file_exists(params.insect)) {
println(colors.red("Value passed to ") + colors.bred("--insect") + colors.red(" must be one of the supported builtins or an RDS file"))
println(colors.red("containing a trained insect classifier model."))
println(colors.red("See rainbow_bridge.nf ") + colors.bred("--help") + colors.red(" for supported builtin models"))
exit(1)
}
}
}
}
// trim and (where relevant) merge paired-end reads
process filter_merge {
label 'adapterRemoval'
label 'process_medium'
publishDir "${params.preDir}/trim_merge", mode: params.publishMode
input:
tuple val(key), path(reads)
output:
tuple val(key), path('*_trimmed_merged.fastq')
script:
if( params.single ) {
// single end
"""
AdapterRemoval --threads ${task.cpus} --file1 ${reads} \
--trimns --trimqualities \
--minquality ${params.minQuality} \
--qualitymax ${params.maxQuality} \
--mate-separator ${params.mateSeparator} \
--basename ${key}
mv ${key}.truncated ${key}_trimmed_merged.fastq
"""
} else if ( params.paired ) {
// if reads are paired-end then merge
"""
AdapterRemoval --threads ${task.cpus} --file1 ${reads[0]} --file2 ${reads[1]} \
--collapse --trimns --trimqualities \
--minquality $params.minQuality \
--qualitymax ${params.maxQuality} \
--minalignmentlength ${params.minAlignLen} \
--mate-separator ${params.mateSeparator} \
--basename ${key}
mv ${key}.collapsed ${key}_trimmed_merged.fastq
"""
}
}
process filter_ambiguous_indices {
label 'obitools'
label 'process_single'
publishDir "${params.preDir}/index_filtered", mode: params.publishMode
input:
tuple val(key), path(reads)
output:
tuple val(key), path("*_valid_index.fastq")
script:
"""
obigrep --uppercase -D ':[ACGT]+\\+[ACGT]+\$' ${reads} > "${key}_valid_index.fastq"
"""
}
// replace I's with N's in barcode file primer sequences
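// ('I' denotes inosine, which pairs with any base but is not an IUPAC
// nucleotide code, so downstream tools expect it recoded as the wildcard 'N')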
process fix_barcodes {
label 'shell'
label 'process_single'
input:
path(barcodes)
output:
path("${barcodes.baseName}_fixed.${barcodes.extension}")
script:
"""
if [[ -e "${barcodes}" ]]; then
fix_barcode.awk "${barcodes}" > "${barcodes.baseName}_fixed.${barcodes.extension}"
else
touch "${barcodes.baseName}_fixed.${barcodes.extension}"
fi
"""
}
// split the barcode file (and modify it accordingly) when samples are pooled
process split_barcodes {
label 'shell'
label 'process_single'
input:
path(barcodes)
output:
path('bc/*.tsv')
script:
"""
mkdir bc
split_barcode.awk -v parent="${barcodes.baseName}" ${barcodes}
"""
}
// primer mismatch & sample assignment
// multiple different barcode files are possible
process ngsfilter {
label 'obitools'
label 'process_single'
label 'process_more_memory'
publishDir "${params.preDir}/ngsfilter", mode: params.publishMode
input:
tuple val(key), path(read), path(barcode)
output:
tuple val(key), path("*_annotated.fastq"), val("${barcode.baseName}")
script:
"""
ngsfilter --uppercase -t ${barcode} -e ${params.primerMismatch} -u "${key}_filter_orphans.fastq" ${read} > "${key}_${barcode.baseName}_annotated.fastq"
"""
}
// combine outputs from (possible) multiple barcode files, filter by length
process filter_length {
label 'obitools'
label 'process_single'
publishDir "${params.preDir}/length_filtered", mode: params.publishMode
input:
tuple val(key), path(fastq), val(barcode)
output:
tuple val(key), path('*_length_filtered.fastq'), val(barcode)
script:
"""
obigrep --uppercase -l ${params.minLen} "${fastq}" > "${key}_length_filtered.fastq"
"""
}
// for non-demultiplexed runs, split the annotated reads file by samples
process split_samples {
label 'obitools'
label 'process_single'
publishDir "${params.preDir}/split_samples", mode: params.publishMode
input:
tuple val(key), path(fastq), val(barcode)
output:
path("__split__*.fastq"), optional: true
script:
"""
obisplit --uppercase -p "__split__" -t sample -u "${key}_split_orphans.fastq" ${fastq}
"""
}
// relabel files for dereplication
process relabel {
label 'denoiser'
label 'process_low'
publishDir "${params.preDir}/relabeled", mode: params.publishMode
input:
tuple val(key), path(fastq, name: 'input-????.fastq')
output:
path('*_relabeled.fasta'), optional: true, emit: result
path 'settings.txt'
script:
if (params.denoiser == "vsearch") {
def combined = "<(cat input-*.fastq)"
"""
echo "denoiser: vsearch" > settings.txt
# this may or may not be necessary anymore, but it seems like a good sanity check
# since this will fail on empty files
vsearch --threads ${task.cpus} --fastq_qmax ${params.maxQuality} --fastx_filter ${combined} --relabel "${key}." --fastaout - | \
awk '/^>/ {print;} !/^>/ {print(toupper(\$0))}' > "${key}_relabeled.fasta"
"""
} else {
// combined = fastq.collect{ it.baseName }.join('_') + "_combined.fastq"
def combined = "combined.fastq"
def denoiser = params.execDenoiser ? params.denoiser : 'usearch'
"""
echo "denoiser: ${params.denoiser}" > settings.txt
cat input-*.fastq > ${combined}
# we have to convert everything to uppercase because obisplit --uppercase is broken
${denoiser} -fastx_relabel ${combined} -prefix "${key}." -fastaout /dev/stdout | \
tail -n+7 | \
awk '/^>/ {print;} !/^>/ {print(toupper(\$0))}' > "${key}"_relabeled.fasta
"""
}
}
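// note: relabeled read headers take the form "<sample>.<n>" (e.g. ">site1.1"),
// which downstream zotu-table generation uses to group reads by sample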
// concatenate all relabeled files. we do this in a process rather than with
// collectFile so that the step shows up in the process list, and because this
// process's output is cached while collectFile output may not be
process merge_relabeled {
label 'shell'
label 'process_single'
publishDir "${params.preDir}/merged"
input:
path('input-fastq?????.fastq')
output:
path("${params.project}_relabeled_merged.fasta")
script:
"""
cat input-*.fastq > "${params.project}_relabeled_merged.fasta"
"""
}
// dereplication, chimera removal, zOTU table generation
process dereplicate {
label 'denoiser'
label 'process_high'
publishDir "${params.outDir}/zotus", mode: params.publishMode
input:
tuple val(id), path(relabeled_merged)
output:
tuple val(id), path("${id}_unique.fasta"), path("${id}_zotus.fasta"), path("zotu_table.tsv"), emit: result
path 'settings.txt'
path 'zotu_map.tsv'
script:
if (params.denoiser == "vsearch") {
"""
echo "denoiser: vsearch" > settings.txt
echo "minimum sequence abundance: ${params.minAbundance}" >> settings.txt
echo "alpha: ${params.alpha}" >> settings.txt
echo "fractional identity: ${params.zotuIdentity}" >> settings.txt
# steps:
# 1. get unique sequence variants
# 2. run denoising algorithm
# 3. get rid of chimeras
# 4. match original sequences to zotus at the --zotu-identity threshold
if [ -s "${relabeled_merged}" ]; then
vsearch \
--threads ${task.cpus} --fastq_qmax ${params.maxQuality} \
--derep_fulllength ${relabeled_merged} --sizeout \
--output "${id}_unique.fasta"
vsearch \
--threads ${task.cpus} --fastq_qmax ${params.maxQuality} \
--cluster_unoise "${id}_unique.fasta" --centroids "${id}_centroids.fasta" \
--minsize ${params.minAbundance} --unoise_alpha ${params.alpha}
vsearch \
--threads ${task.cpus} --fastq_qmax ${params.maxQuality} \
--uchime3_denovo "${id}_centroids.fasta" --nonchimeras "${id}_zotus.fasta" \
--relabel Zotu
vsearch \
--threads ${task.cpus} --fastq_qmax ${params.maxQuality} \
--usearch_global ${relabeled_merged} --db "${id}_zotus.fasta" \
--id ${params.zotuIdentity} --otutabout zotu_table.tsv \
--userout zotu_map.tsv --userfields "query+target" \
--top_hits_only
else
>&2 echo "Merged FASTA is empty. Did your PCR primers match anything?"
exit 1
fi
"""
} else {
def denoiser = params.execDenoiser ? params.denoiser : 'usearch'
"""
echo "denoiser: ${denoiser}" > settings.txt
echo "minimum sequence abundance: ${params.minAbundance}" >> settings.txt
echo "alpha: ${params.alpha}" >> settings.txt
echo "fractional identity: ${params.zotuIdentity}" >> settings.txt
# steps:
# 1. get unique sequences
# 2. run denoising & chimera removal
# 3. generate zotu table
if [ -s "${relabeled_merged}" ]; then
${denoiser} -fastx_uniques ${relabeled_merged} \
-sizeout -fastaout "${id}_unique.fasta"
${denoiser} -unoise3 "${id}_unique.fasta" -zotus "${id}_zotus.fasta" \
-tabbedout "${id}_unique_unoise3.txt" -minsize ${params.minAbundance} \
-unoise_alpha ${params.alpha}
${denoiser} -otutab ${relabeled_merged} -id ${params.zotuIdentity} \
-zotus ${id}_zotus.fasta -otutabout zotu_table.tsv -mapout zotu_map.tsv
else
>&2 echo "Merged FASTA is empty. Did your PCR primers match anything?"
exit 1
fi
"""
}
}
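// note: zotu_table.tsv is a tab-separated matrix with one row per zOTU and one
// column of read counts per sample, as produced by --otutabout / -otutabout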
// run blast query
process blast {
label 'blast'
label 'all_cpus'
publishDir {
def pid = String.format("%d",(Integer)num(params.percentIdentity))
def evalue = String.format("%.3f",num(params.evalue))
def qcov = String.format("%d",(Integer)num(params.qcov))
return "${params.outDir}/blast/pid${pid}_eval${evalue}_qcov${qcov}_max${params.maxQueryResults}/${db_name}"
}, mode: params.publishMode
input:
tuple path(zotus_fasta), val(db_name), path(db_files), path(taxdb)
output:
path("blast_result.tsv"), emit: result
path 'blast_settings.txt'
script:
// format settings values
def pid = String.format("%d",(Integer)num(params.percentIdentity))
def evalue = String.format("%.3f",num(params.evalue))
def qcov = String.format("%d",(Integer)num(params.qcov))
// setup and populate the "basic" blast options
def blast_options = [:]
blast_options['task'] = "blastn"
blast_options['perc_identity'] = params.percentIdentity
blast_options['evalue'] = params.evalue
blast_options['qcov_hsp_perc'] = params.qcov
blast_options['max_target_seqs'] = params.maxQueryResults
blast_options['best_hit_score_edge'] = 0.05
blast_options['best_hit_overhang'] = 0.25
// get extra blast options passed on the command line as --blastn-*
def extra_options = params
.findAll { it.key =~ /^blastn-/ }
.collectEntries { k, v -> [k.tokenize('-')[1],v] }
// merge blast options with any extra options
blast_options = blast_options << extra_options
// collapse them into a single string
def blast_opt_str = blast_options
.collect { k, v -> v == true ? "-${k}" : "-${k} ${v}" }
.join(" ")
"""
# record blast settings
echo "Percent identity: ${pid}" > blast_settings.txt
echo "e-value: ${evalue}" >> blast_settings.txt
echo "Query qoverage: ${qcov}" >> blast_settings.txt
echo "Max. target sequences: ${params.maxQueryResults}" >> blast_settings.txt
# set BLASTDB to local working directory
export BLASTDB=.
# blast our zotus
blastn \
-db "${db_name}" \
-outfmt "6 qseqid sseqid staxid ssciname scomname sskingdom pident length qlen slen mismatch gapopen gaps qstart qend sstart send stitle evalue bitscore qcovs qcovhsp" \
${blast_opt_str} \
-query ${zotus_fasta} -num_threads ${task.cpus} \
> blast_result.tsv
"""
}
// make custom blast database for LULU curation
process lulu_blast {
label 'blast'
label 'process_medium'
input:
tuple val(key), path(zotus_fasta), path(zotu_table)
output:
tuple val(key), path('match_list.txt'), path(zotu_table)
script:
"""
# blast zotus against themselves to create the match list LULU needs
makeblastdb -in ${zotus_fasta} -parse_seqids -dbtype nucl -out ${key}_zotus
blastn -db ${key}_zotus \
-outfmt "6 qseqid sseqid pident" \
-out match_list.txt -qcov_hsp_perc 80 \
-perc_identity 84 -query ${zotus_fasta} \
-num_threads ${task.cpus}
"""
}
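// note: match_list.txt holds the three-column (query zotu, target zotu, %identity)
// table that LULU expects as its match list input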
// LULU curation
process lulu {
label 'r'
label 'process_single'
label 'process_more_memory'
publishDir "${params.outDir}/lulu", mode: params.publishMode
input:
tuple val(key), path(match_list), path(zotu_table)
output:
tuple path("lulu_zotu_table.tsv"), path("lulu_zotu_map.tsv"), path("lulu_result_object.rds"), emit: result
path 'settings.txt'
script:
"""
echo "minimum ratio: ${params.luluMinRatio}" > settings.txt
echo "minimum ratio type: ${params.luluMinRatioType}" >> settings.txt
echo "minimum match: ${params.luluMinMatch}" >> settings.txt
echo "minimum RC: ${params.luluMinRc}" >> settings.txt
lulu.R \
-m ${params.luluMinRatio} \
-t ${params.luluMinRatioType} \
-a ${params.luluMinMatch} \
-r ${params.luluMinRc} \
${zotu_table} ${match_list} "lulu_zotu_table.tsv" "lulu_zotu_map.tsv" "lulu_result_object.rds"
"""
}
// assign/collapse taxonomy using the R port of the original python script
process collapse_taxonomy {
label 'r'
label 'process_single'
label 'process_more_memory'
publishDir {
def td = params.standaloneTaxonomy ? 'standalone_taxonomy' : 'taxonomy'
"${params.outDir}/${td}/lca/qcov${params.lcaQcov}_pid${params.lcaPid}_diff${params.lcaDiff}"
}, mode: params.publishMode
input:
tuple path(blast_result), path(lineage), path('*')
output:
path("lca_taxonomy.tsv"), emit: taxonomy
path("lca_intermediate.tsv")
path 'lca_settings.txt'
script:
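// build optional CLI flags; the Groovy idiom `cond && list << flag` appends the
// flag only when the parameter is truthy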
def pf = []
params.lcaFilterMaxQcov && pf << "--filter-max-qcov"
params.lcaCaseInsensitive && pf << "--case-insensitive"
"""
# save settings
echo "Minimum query coverage %: ${params.lcaQcov}" > lca_settings.txt
echo "Minimum percent identity: ${params.lcaPid}" >> lca_settings.txt
echo "Minium percent identity difference: ${params.lcaDiff}" >> lca_settings.txt
echo "Filter to maximum query coverage: ${params.lcaFilterMaxQcov ? 'yes' : 'no'}" >> lca_settings.txt
echo "Filter taxa by regex: ${params.lcaTaxonFilter}" >> lca_settings.txt
echo "Taxon filter case sensitive: ${!params.lcaCaseInsensitive ? 'yes' : 'no'}" >> lca_settings.txt
collapse_taxonomy.R \
--qcov ${params.lcaQcov} \
--pid ${params.lcaPid} \
--diff ${params.lcaDiff} \
--merged merged.dmp \
--nodes nodes.dmp \
--taxid-lineage taxidlineage.dmp \
--output lca_taxonomy.tsv \
--dropped "${params.dropped}" \
--intermediate "lca_intermediate.tsv" \
${pf.join(" ")} \
--taxon-filter "${params.lcaTaxonFilter}" \
${blast_result} ${lineage}
"""
}
// run insect classifier model
process insect {
label 'insect'
label 'all_cpus'
publishDir {
def offs = String.format("%d",(Integer)num(params.insectOffset))
def thresh = String.format("%.2f",num(params.insectThreshold))
def minc = String.format("%d",(Integer)num(params.insectMinCount))
def ping = String.format("%.2f",num(params.insectPing))
return "${params.outDir}/taxonomy/insect/thresh${thresh}_offset${offs}_mincount${minc}_ping${ping}"
}, mode: params.publishMode
input:
tuple path(classifier), path('*'), path(zotus)
output:
path('insect_taxonomy.tsv'), emit: taxonomy
path('insect_model.rds')
path('insect_settings.txt')
script:
def offs = String.format("%d",(Integer)num(params.insectOffset))
def thresh = String.format("%.2f",num(params.insectThreshold))
def minc = String.format("%d",(Integer)num(params.insectMinCount))
def ping = String.format("%.2f",num(params.insectPing))
"""
# record insect settings
echo "Offset: ${offs}" > insect_settings.txt
echo "Threshold: ${thresh}" >> insect_settings.txt
echo "Minimum count: ${minc}" >> insect_settings.txt
echo "Ping: ${ping}" >> insect_settings.txt
if [ "${classifier}" != "insect_model.rds" ]; then
mv ${classifier} insect_model.rds
fi
insect.R \
--cores ${task.cpus} \
--threshold ${params.insectThreshold} \
--offset ${params.insectOffset} \
--min-count ${params.insectMinCount} \
--ping ${params.insectPing} \
--lineage rankedlineage.dmp \
--merged merged.dmp \
${zotus} insect_model.rds "insect_taxonomy.tsv"
"""
}
// dummy pass-through process: the script body is a no-op, but declaring the
// taxdump as both input and output lets publishDir link it into the output directory
process save_taxdump {
label 'shell'
label 'process_single'
publishDir 'output/taxonomy/ncbi_taxdump'
input:
path(taxdump)
output:
path(taxdump)
script:
"""
echo "linking taxdump to publish dir"
"""
}
// produce a phyloseq object from pipeline output
process phyloseq {
label 'r'
label 'process_single'
label 'process_more_memory'
publishDir "${params.outDir}/phyloseq", mode: params.publishMode
input:
path(zotu_table)
path(taxonomy)
path(metadata)
path(sequences)
output:
path("phyloseq.rds")
script:
def opt = []
params.tree && opt << "--tree"
params.tree && opt << "--sequences \"${sequences}\""
params.optimizeTree && opt << "--optimize"
// if (method == "insect") {
// otu = "representative"
// c = "--tax-columns \"kingdom,phylum,class,order,family,genus,species,taxon,NNtaxon,NNrank\""
// }
"""
phyloseq.R \
--out phyloseq.rds \
${opt.join(" ")} \
"${zotu_table}" "${taxonomy}" "${metadata}"
"""
}
process finalize {
label 'r'
label 'process_single'
label 'process_more_memory'
publishDir {
def td = params.standaloneTaxonomy ? 'standalone_taxonomy/final' : 'final'
"${params.outDir}/${td}"
}, mode: params.publishMode
input:
tuple path(zotu_table), path(curated_zotu_table), path(lca_taxonomy), path(insect_taxonomy)
output:
path("zotu_table_raw.tsv")
path("taxonomy.tsv"), emit: taxonomy
path("zotu_table_final*.tsv")
path("zotu_table_lca.tsv"), optional: true
path("zotu_table_insect.tsv"), optional: true
script:
def opt = []
params.abundanceFilter && opt << "--abundance-filter"
params.rarefy && opt << "--rarefy"
params.filterMinimum && opt << "--filter-min"
params.lcaTable && opt << "--lca-table"
params.insectTable && opt << "--insect-table"
"""
finalize.R \
--filter "${params.taxonFilter}" \
--remap "${params.taxonRemap}" \
--insect "${insect_taxonomy}" \
--lca "${lca_taxonomy}" \
--dropped "${params.dropped}" \
--controls "${params.controls}" \
--control-action "${params.controlAction}" \
--control-threshold "${params.controlThreshold}" \
--decontam-method "${params.decontamMethod}" \
--concentration "${params.dnaConcentration}" \
--abundance-threshold "${params.abundanceThreshold}" \
--rarefaction-method "${params.rarefactionMethod}" \
--permutations "${params.permutations}" \
--taxon-priority "${params.taxonPriority}" \
--curated "${curated_zotu_table}" \
${opt.join(" ")} \
${zotu_table}
"""
}
// we reuse fastqc/multiqc processes at different steps so they're
// included from an external module
include { fastqc as first_fastqc } from './modules/modules.nf'
include { fastqc as second_fastqc } from './modules/modules.nf'
include { multiqc as first_multiqc } from './modules/modules.nf'
include { multiqc as second_multiqc } from './modules/modules.nf'
include { get_web as get_model } from './modules/modules.nf'
include { get_web as get_taxdb } from './modules/modules.nf'
include { get_web as get_taxdump } from './modules/modules.nf'
include { extract_zip as extract_lineage } from './modules/modules.nf'
include { extract_zip as extract_ncbi } from './modules/modules.nf'
include { extract_targz as extract_taxdb } from './modules/modules.nf'
workflow {
// make sure our arguments are all in order
check_params()
def directions = []
// do standalone taxonomy assignment
if (params.standaloneTaxonomy) {
// build input channels and get appropriate process
blast_result = Channel.fromPath(params.blastFile, checkIfExists: true)
// load lineage and (optionally) other taxonomy dump files
if (!helper.file_exists(params.lcaLineage)) {
if (!helper.file_exists(params.taxdump)) {
if (params.taxdump != "") {
println(colors.yellow("Taxonomy dump archive '${params.taxdump}' does not exist and will be downloaded"))
}
Channel.of('https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/new_taxdump/new_taxdump.zip') |
combine(Channel.fromPath('new_taxdump.zip')) |
get_taxdump |
set { taxdump }
taxdump |
combine(Channel.of('rankedlineage.dmp')) |
extract_lineage | collect |
set { lineage }
taxdump |
combine(Channel.of('merged.dmp','nodes.dmp','taxidlineage.dmp')) |
extract_ncbi | collect | toList |
set { ncbi_dumps }
taxdump |
save_taxdump
} else {
zip = Channel.fromPath(params.taxdump)
zip |
combine(Channel.of('rankedlineage.dmp')) |
extract_lineage | collect |
set { lineage }
zip |
combine(Channel.of('merged.dmp','nodes.dmp','taxidlineage.dmp')) |
extract_ncbi | collect | toList |
set { ncbi_dumps }
}
} else {
// if user has passed a custom ranked lineage dump, use it instead
lineage = Channel.fromPath(params.lcaLineage)
ncbi_dumps = Channel.fromPath('nodumps')
}
// run taxonomy process
blast_result |
combine(lineage) |
combine(ncbi_dumps) |
collapse_taxonomy
// pull out lca table
collapse_taxonomy.out.taxonomy |
set { lca_taxonomy }
// do this part if the zotu table exists
if (helper.file_exists(params.zotuTable)) {
zotu_table = Channel.fromPath(params.zotuTable, checkIfExists: true)
curated_zotu_table = Channel.fromPath("nofile-curated-zotu-table", checkIfExists: false)
insect_taxonomy = Channel.fromPath("nofile-insect-taxonomy", checkIfExists: false)
// run it through finalize
zotu_table |
combine(curated_zotu_table) |
combine(lca_taxonomy) |
combine(insect_taxonomy) |
finalize
}
} else {
if (!helper.file_exists(params.demuxedFasta)) {
if (params.single) {
// if params.reads is a directory, make it a glob
def reads_files = params.reads
if (helper.is_dir(reads_files)) {
reads_files = file(reads_files) / '*.f*q*'
}
// here we load whatever was passed as the --reads option
// if it's a glob, we get a list of files. if it's just one, we get just one
// if it's a directory, it's made into a glob to find reads in that directory
// and we use the basename of the file as a sample ID.
Channel.fromPath(reads_files, checkIfExists: true) |
map { [ it.baseName, [it] ] } |
set { reads }
} else if (params.paired) {
// if fwd and rev point to files that exists, just load them directly
if ( helper.file_exists(params.fwd) && helper.file_exists(params.rev) ) {
directions = [params.fwd,params.rev]
Channel.of(params.project) |
combine(Channel.fromPath(params.fwd,checkIfExists: true)) |
combine(Channel.fromPath(params.rev,checkIfExists: true)) |
map { a,b,c -> [a,[b,c]] } |
set { reads }
} else {
// figure out how the reads are to be found and find them
def pattern = ""
// if --fwd and --rev are both globs
if ( params.fwd != "" && params.rev != "" && helper.is_list(file(params.fwd)) && helper.is_list(file(params.rev)) ) {
// get directory part of fwd and rev globs.
// file() resolves the glob's fully qualified path
// and .Parent gets the directory part
// file() will also get all matches to the glob, so we call unique()
// to collapse directories, hoping there is only one
def fwd_path = file(params.fwd).Parent.unique()
def rev_path = file(params.rev).Parent.unique()
// make sure globs actually matched something
if (fwd_path.size() == 0) {
exit(1,"No files matched by --fwd glob.")
}
// make sure globs actually matched something
if (rev_path.size() == 0) {
exit(1,"No files matched by --rev glob.")
}
// make sure globs only matched a single directory
if (fwd_path.size() > 1 || rev_path.size() > 1) {
exit(1,"Files matched by --fwd/--rev globs must reside in single directories.")
}
// reduce to first element and convert to string
fwd_path = fwd_path[0].toString()
rev_path = rev_path[0].toString()
// make sure the directory part ends in '/'
if (fwd_path[-1] != '/') fwd_path += '/'
if (rev_path[-1] != '/') rev_path += '/'
// CEB: The strategy here is to identify identical and non-identical text in the --fwd and --rev globs to generate
// a single glob that is compatible with Channel.fromFilePairs
// Extract the glob part from the provided paths
// new File() is used because it parses but does not resolve globs
def fwd_file_pattern = new File(params.fwd).Name
def rev_file_pattern = new File(params.rev).Name
// get common prefix
// for some weird reason, nextflow doesn't like having def and the assignment on the same line
// when assigning the results of a call to a helper class method, so define it first
def common_path_prefix = ""
common_path_prefix = helper.common(fwd_path,rev_path)
// CEB: Adjust common_prefix to remove the last character if it's where they start to differ
// MH: I'm not sure why this is needed but it doesn't seem to harm anything so I'm leaving it in
if (common_path_prefix.size() > 0 && fwd_path.charAt(common_path_prefix.size() - 1) != rev_path.charAt(common_path_prefix.size() - 1)) {
common_path_prefix = common_path_prefix[0..-2]
}
// CEB: Only run the following block if the fwd_path and rev_path are different
def fwd_path_diff = ""
def rev_path_diff = ""
def common_path_suffix = ""
if (fwd_path != rev_path) {
// Extract the common path suffix by comparing the strings in reverse
def fwd_path_reversed = fwd_path.reverse()
def rev_path_reversed = rev_path.reverse()
common_path_suffix = helper.common(fwd_path_reversed,rev_path_reversed).reverse()
// Extract the differing middle part of the path
fwd_path_diff = fwd_path.substring(common_path_prefix.size(), fwd_path.size() - common_path_suffix.size())
rev_path_diff = rev_path.substring(common_path_prefix.size(), rev_path.size() - common_path_suffix.size())
}
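// worked example (hypothetical paths): with --fwd '/seq/run1/fwd/*_R1.fastq.gz' and
// --rev '/seq/run1/rev/*_R2.fastq.gz', the directory parts differ only in 'fwd' vs
// 'rev', so common_path_prefix = '/seq/run1/', common_path_suffix = '/',
// fwd_path_diff = 'fwd', and rev_path_diff = 'rev'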
// CEB: Extract the common file prefix
// for some weird reason, nextflow doesn't like having def and the assignment on the same line
// when assigning the results of a call to a helper class method, so define it first
def common_file_prefix = ""