Merge pull request #140 from MitraDarja/ins

[FIX] Fix input pathways for inser and delete.
seqan · Aug 24, 2023 · 538c9f4 · 538c9f4
2 parents 439eeab + f244e64
commit 538c9f4
Show file tree

Hide file tree

Showing 16 changed files with 1,861 additions and 8 deletions.
diff --git a/src/estimate.cpp b/src/estimate.cpp
@@ -32,7 +32,7 @@
 template <class IBFType, bool last_exp, bool normalization, typename exp_t>
 void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint16_t> & estimations_i,
                seqan3::dna4_vector const seq, std::vector<uint32_t> & prev_counts,
-               exp_t const & expressions, uint16_t const k, std::vector<double> const fprs, std::vector<int> deleted)
+               exp_t const & expressions, uint16_t const k, std::vector<double> const fprs, std::vector<int> & deleted)
 {
     // Check, if one expression threshold for all or individual thresholds
     static constexpr bool multiple_expressions = std::same_as<exp_t, std::vector<std::vector<uint16_t>>>;

diff --git a/src/ibf.cpp b/src/ibf.cpp
@@ -1340,6 +1340,7 @@ std::vector<uint16_t> insert(std::vector<std::filesystem::path> const & sequence
     else
         insert_helper<false, false>(sequence_files,ibf_args, path_in, cutoffs, expression_by_genome_file, minimiser_args);
 
+    store_args(ibf_args, std::string{ibf_args.path_out} + "IBF_Data");
     return ibf_args.expression_thresholds;
 }
 
@@ -1349,6 +1350,7 @@ std::vector<uint16_t> insert(std::vector<std::filesystem::path> const & minimise
                              std::filesystem::path const expression_by_genome_file, std::filesystem::path path_in, bool samplewise)
 {
     std::vector<uint8_t> cutoffs{};
+    load_args(ibf_args, std::string{path_in} + "IBF_Data");
     if (samplewise)
         insert_helper<true>(minimiser_files, ibf_args, path_in, cutoffs, expression_by_genome_file);
     else
@@ -1379,15 +1381,20 @@ void delete_bin(std::vector<uint64_t> const & delete_files,
     {
         std::filesystem::path filename;
         if (samplewise)
-            filename = ibf_args.path_out.string() + "IBF_Level_" + std::to_string(i);
+            filename = path_in.string() + "IBF_Level_" + std::to_string(i);
         else
-            filename = ibf_args.path_out.string() + "IBF_" + std::to_string(ibf_args.expression_thresholds[i]);
+            filename = path_in.string() + "IBF_" + std::to_string(ibf_args.expression_thresholds[i]);
 
         seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed> ibf;
         load_ibf(ibf, filename);
         ibf.clear(bins_to_delete);
 
-        if (ibf_args.compressed)
+        if (samplewise)
+            filename = ibf_args.path_out.string() + "IBF_Level_" + std::to_string(i);
+        else
+            filename = ibf_args.path_out.string() + "IBF_" + std::to_string(ibf_args.expression_thresholds[i]);
+
+	if (ibf_args.compressed)
         {
             seqan3::interleaved_bloom_filter<seqan3::data_layout::compressed> ibf{ibf};
             store_ibf(ibf, filename);

diff --git a/src/main.cpp b/src/main.cpp
@@ -104,7 +104,7 @@ int run_needle_count(seqan3::argument_parser & parser)
                                     "for all sequences in the genome file based on the exact minimiser occurrences of "
                                     "the given sequence files. Please run genome beforehand to create the genome file.";
     parser.add_positional_option(sequence_files, "Please provide at least one sequence file.");
-    parser.add_option(include_file, '\0', "include", "Please provide one sequence file with transcripts.");
+    parser.add_option(include_file, '\0', "include", "Please provide one sequence file with transcripts.",seqan3::option_spec::required);
     parser.add_option(genome_file, '\0', "genome", "Please provide one *.genome file created with the genome command.");
     parser.add_flag(paired, 'p', "paired", "If set, experiments are paired. Default: Not paired.");
 

diff --git a/test/cli/CMakeLists.txt b/test/cli/CMakeLists.txt
@@ -12,3 +12,4 @@ target_use_datasources (estimate_options_test FILES IBF_1 mini_gen.fasta)
 add_cli_test (count_options_test.cpp)
 target_use_datasources (count_options_test FILES mini_example.fasta mini_gen.fasta mini_gen.genome)
 add_cli_test (delete_options_test.cpp)
+target_use_datasources (delete_options_test FILES exp_01.fasta)
diff --git a/test/cli/count_options_test.cpp b/test/cli/count_options_test.cpp
@@ -28,7 +28,7 @@ TEST_F(count_options_test, no_options)
 
 TEST_F(count_options_test, fail_no_argument)
 {
-    cli_test_result result = execute_app("needle count", "--seed 0");
+    cli_test_result result = execute_app("needle count", "--seed 0 --genome", data("mini_gen.genome")," --include", data("mini_gen.fasta"));
     std::string expected
     {
         "Error. Incorrect command line input for count. Not enough positional arguments provided "

diff --git a/test/cli/delete_options_test.cpp b/test/cli/delete_options_test.cpp
@@ -20,3 +20,20 @@ TEST_F(delete_options_test, delete_no_options)
     EXPECT_EQ(result.out, expected);
     EXPECT_EQ(result.err, std::string{});
 }
+
+TEST_F(delete_options_test, with_argument)
+{
+    estimate_ibf_arguments ibf_args{};
+    minimiser_arguments minimiser_args{};
+    ibf_args.expression_thresholds = {1, 2};
+    std::vector<double> fpr = {0.05};
+    std::vector<std::filesystem::path> sequence_files = {data("exp_01.fasta"),data("exp_01.fasta")};
+    ibf_args.path_out = "Test_";
+    std::vector<uint8_t> cutoffs{};
+    ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs);
+
+    cli_test_result result = execute_app("needle delete -i ", "Test_ ", "0");
+    EXPECT_EQ(result.exit_code, 0);
+    EXPECT_EQ(result.out, "");
+    EXPECT_EQ(result.err, std::string{});
+}
diff --git a/utils/README.md b/utils/README.md
@@ -1,5 +1,9 @@
 # Evaluation of Needle count
 
+## Comparison with kallisto and salmon
+All scripts for the comparison with kallisto and salmon can be found in this [repo](https://github.com/MitraDarja/analysis_needle/tree/main), please use the provided scripts `run_simulated.sh` and `run_seqc.sh`.
+
+
 ## Differential Expression
 Download the sequencing experiments listed in accession.lst and the human transcripts from gencode as a fasta file.
 Then create a Needle-genome file via:
@@ -29,7 +33,7 @@ python3 unify_results.py deseq_in_23.csv 19_23_infiles.lst accession.lst
 python3 unify_results.py deseq_in_39.csv 19_39_infiles.lst accession.lst
 ```
 
-Now, you can run the RScript "breastcancer.R" to analyze the differential expressed genes and generate the heatmap for the 67 
+Now, you can run the RScript "breastcancer.R" to analyze the differential expressed genes and generate the heatmap for the 67
 differential expressed genes from https://doi.org/10.1016/j.dib.2018.03.079.
 
 
@@ -44,6 +48,42 @@ needle minimiser -k 21 -w 25 -t X SRR1313229.fastq.gz SRR1313228.fastq.gz  SRR13
 ```
 For the ram friendly version a `--ram` was added to the previous command.
 
+## Insertion and Deletion
+
+If Needle indices were built according to the scripts [here](https://github.com/MitraDarja/analysis_needle/tree/main). Then insertion and deletion were tested in the following way:
+
+```
+# Testing insertion
+/usr/bin/time -v ../needle/build_thesis/bin/needle insertmin -o w_21/Insert_ -i w_21/SRR_ w_21/SRR1313229.fastq.minimiser w_21/SRR1313228.fastq.minimiser w_21/SRR1313227.fastq.minimiser w_21/SRR1313226.fastq.minimiser
+/usr/bin/time -v ../needle/build_thesis/bin/needle insertmin -o w_25/Insert_ -i w_25/SRR_ w_25/SRR1313229.fastq.minimiser w_25/SRR1313228.fastq.minimiser w_25/SRR1313227.fastq.minimiser w_25/SRR1313226.fastq.minimiser
+/usr/bin/time -v ../needle/build_thesis/bin/needle insertmin -o w_41/Insert_ -i w_41/SRR_ w_41/SRR1313229.fastq.minimiser w_41/SRR1313228.fastq.minimiser w_41/SRR1313227.fastq.minimiser w_41/SRR1313226.fastq.minimiser
+
+# Testing Deletion
+/usr/bin/time -v ../needle/build_thesis/bin/needle delete -i w_21/SRR_ -o w_21/DeletedIBF_ 614 615 616 617
+/usr/bin/time -v ../needle/build_thesis/bin/needle delete -i w_25/SRR_ -o w_25/DeletedIBF_ 614 615 616 617
+/usr/bin/time -v ../needle/build_thesis/bin/needle delete -i w_41/SRR_ -o w_41/DeletedIBF_ 614 615 616 617
+```
+
+## Comparison with Reindeer and MetaGraph in regards of space and speed
+The scripts for the comparison with Reindeer can be found in this [repo](https://github.com/MitraDarja/analysis_needle/tree/main).
+There you can find what data to download and how Reindeer and Needle are run, here the commands for MetaGraph are given. Please, check out
+the scripts before using them, as the paths to the executable need to be added and sometimes an input file created first.
+
+
+```
+# Preprocessing
+bash run_kmc.sh
+bash run_build.sh
+# From here on each step was considered for the comparison
+/usr/bin/time -v bash run_clean.sh
+/usr/bin/time -v bash run_clean_smooth.sh
+bash metagraph.sh
+```
+
+### Preprocessing
+The comparison of the preprocessing steps was based on the same files as the parallelization analysis. Use the script `compare_preprocess.sh` to obtain the numbers.
+
+
 ## Differential Expression
 Download the sequencing experiments of the GEO experiment with the accession number GSE58135 (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE58135).
 Then create a Needle index in the following way, assuming all sequencing files are stored in a folder named GSE58135:
@@ -82,4 +122,3 @@ RScrip tissue_type.R
 ```
 
 The differential expressed genes can then by obtained via ShinyGo using Ontology.Jensen.TISSUES as a database.
-
diff --git a/utils/compare_preprocess.sh b/utils/compare_preprocess.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+needle="Set path to needle executable"
+input_dir="Set to directory that contains the sequence files"
+
+# Comparison for 4 Threads, change thread number in each step to obtain numbers for 4 threads
+/usr/bin/time -v -o needle_21_21_preprocess4.time $needle minimiser -k 21 -w 21 -t 4 --cutoff 49 $input_dir/SRR1313229.fastq.gz $input_dir/SRR1313228.fastq.gz  $input_dir/SRR1313227.fastq.gz $input_dir/SRR1313226.fastq.gz
+/usr/bin/time -v -o needle_25_21_preprocess4.time $needle minimiser -k 21 -w 25 -t 4 --cutoff 49 $input_dir/SRR1313229.fastq.gz $input_dir/SRR1313228.fastq.gz  $input_dir/SRR1313227.fastq.gz $input_dir/SRR1313226.fastq.gz
+/usr/bin/time -v -o needle_21_41_preprocess4.time $needle minimiser -k 21 -w 41 -t 4 --cutoff 49 $input_dir/SRR1313229.fastq.gz $input_dir/SRR1313228.fastq.gz  $input_dir/SRR1313227.fastq.gz $input_dir/SRR1313226.fastq.gz
+
+# For both bcalm and kmc a file named "files.lst" with the path to the four files needs to be created.
+/usr/bin/time -v -o bcalm_preprocess4.time bash run_bcalm4.sh
+/usr/bin/time -v -o kmc_preprocess4.time bash run_kmc4.sh
diff --git a/utils/metagraph.sh b/utils/metagraph.sh
@@ -0,0 +1,53 @@
+metagraph="Set path to metagraph executable"
+out_dir="Set out directory"
+
+mkdir $out_dir/clean_anno
+/usr/bin/time -v -o jointbuild.time  $metagraph build -p 4 -k 21 --count-kmers --count-width 32 --outfile-base $out_dir/graph single_dbgs/clean/SRR*.fasta.gz &> jointbuilt.log
+/usr/bin/time -v -o annotate_joint.time  $metagraph annotate --separately -p 4 -i $out_dir/graph.dbg --anno-filename --count-kmers --count-width 32 -o $out_dir/clean_anno single_dbgs/clean/SRR*.fasta.gz &> anno.log
+
+mkdir $out_dir/rd0
+mkdir $out_dir/rd1
+mkdir $out_dir/rd2
+/usr/bin/time -v -o rd0.time  $metagraph transform_anno -v --anno-type row_diff --count-kmers --row-diff-stage 0 --mem-cap-gb 500 -o $out_dir/rd0/rd -i $out_dir/graph.dbg -p 4 --disk-swap "" $out_dir/clean_anno/*.column.annodbg &>rd0.log
+/usr/bin/time -v -o rd1.time  $metagraph transform_anno -v --anno-type row_diff --count-kmers --row-diff-stage 1 --mem-cap-gb 500 -o $out_dir/rd1/rd -i $out_dir/graph.dbg -p 4 --disk-swap "" $out_dir/clean_anno/*.column.annodbg &> rd1.log
+/usr/bin/time -v -o rd2.time  $metagraph transform_anno -v --anno-type row_diff --count-kmers --row-diff-stage 2 --mem-cap-gb 500 -o $out_dir/rd2/rd -i $out_dir/graph.dbg -p 4 --disk-swap "" $out_dir/clean_anno/*.column.annodbg &> rd2.log
+
+/usr/bin/time -v -o anno.time $metagraph transform_anno --anno-type row_diff_int_brwt --greedy --fast --subsample 1000000 -i $out_dir/graph.dbg -o $out_dir/annotation_final $out_dir/rd2/*.column.annodbg -p 4 --parallel-nodes 10 &> last_anno.log
+/usr/bin/time -v -o relax.time $metagraph relax_brwt -v --relax-arity 32 -p 4 -o $out_dir/annotation_final_relaxed $out_dir/annotation_final.row_diff_int_brwt.annodbg &> relax.log
+
+# Note data/*.fa are the files from here: https://github.com/MitraDarja/analysis_needle/tree/main
+/usr/bin/time -v -o query_1.time  $metagraph query -p 4 --query-counts -i $out_dir/graph.dbg -a $out_dir/annotation_final_relaxed.row_diff_int_brwt.annodbg data/query_1.fa
+/usr/bin/time -v -o query_100.time  $metagraph query -p 4  --query-counts -i $out_dir/graph.dbg -a $out_dir/annotation_final_relaxed.row_diff_int_brwt.annodbg data/query_100.fa
+/usr/bin/time -v -o query_1000.time  $metagraph query -p 4 --query-counts -i $out_dir/graph.dbg -a $out_dir/annotation_final_relaxed.row_diff_int_brwt.annodbg data/query_1000.fa
+
+# Query with one thead
+/usr/bin/time -v -o query_1_1.time  $metagraph query -p 1 --query-counts -i $out_dir/graph.dbg -a $out_dir/annotation_final_relaxed.row_diff_int_brwt.annodbg data/query_1.fa
+/usr/bin/time -v -o query_100_1.time  $metagraph query -p 1  --query-counts -i $out_dir/graph.dbg -a $out_dir/annotation_final_relaxed.row_diff_int_brwt.annodbg data/query_100.fa
+/usr/bin/time -v -o query_1000_1.time  $metagraph query -p 1 --query-counts -i $out_dir/graph.dbg -a $out_dir/annotation_final_relaxed.row_diff_int_brwt.annodbg data/query_1000.fa
+
+
+# Construction for smooth option
+
+mkdir $out_dir/clean_anno_smooth
+/usr/bin/time -v -o jointbuild_smooth.time  $metagraph build -p 4 -k 21 --count-kmers --count-width 32 --outfile-base $out_dir/graph_smooth single_dbgs/clean_smooth/SRR*.fasta.gz &> jointbuilt.log
+/usr/bin/time -v -o annotate_joint_smooth.time  $metagraph annotate --separately -p 4 -i $out_dir/graph.dbg --anno-filename --count-kmers --count-width 32 -o $out_dir/clean_anno_smooth single_dbgs/clean_smooth/SRR*.fasta.gz &> anno.log
+
+mkdir $out_dir/smooth_rd0
+mkdir $out_dir/smooth_rd1
+mkdir $out_dir/smooth_rd2
+/usr/bin/time -v -o rd0_smooth.time  $metagraph transform_anno -v --anno-type row_diff --count-kmers --row-diff-stage 0 --mem-cap-gb 500 -o $out_dir/smooth_rd0/rd -i $out_dir/graph_smooth.dbg -p 4 --disk-swap "" $out_dir/clean_anno_smooth/*.column.annodbg &>rd0.log
+/usr/bin/time -v -o rd1_smooth.time  $metagraph transform_anno -v --anno-type row_diff --count-kmers --row-diff-stage 1 --mem-cap-gb 500 -o $out_dir/smooth_rd1/rd -i $out_dir/graph_smooth.dbg -p 4 --disk-swap "" $out_dir/clean_anno_smooth/*.column.annodbg &> rd1.log
+/usr/bin/time -v -o rd2_smooth.time  $metagraph transform_anno -v --anno-type row_diff --count-kmers --row-diff-stage 2 --mem-cap-gb 500 -o $out_dir/smooth_rd2/rd -i $out_dir/graph_smooth.dbg -p 4 --disk-swap "" $out_dir/clean_anno_smooth/*.column.annodbg &> rd2.log
+
+/usr/bin/time -v -o anno_smooth.time $metagraph transform_anno --anno-type row_diff_int_brwt --greedy --fast --subsample 1000000 -i $out_dir/graph_smooth.dbg -o $out_dir/annotation_final_smooth $out_dir/rd2/*.column.annodbg -p 4 --parallel-nodes 10 &> last_anno.log
+/usr/bin/time -v -o relax_smooth.time $metagraph relax_brwt -v --relax-arity 32 -p 4 -o $out_dir/annotation_final_relaxed_smooth $out_dir/annotation_final_smooth.row_diff_int_brwt.annodbg &> relax.log
+
+# Note data/*.fa are the files from here: https://github.com/MitraDarja/analysis_needle/tree/main
+/usr/bin/time -v -o query_1_smooth.time  $metagraph query -p 4 --query-counts -i $out_dir/graph_smooth.dbg -a $out_dir/annotation_final_relaxed_smooth.row_diff_int_brwt.annodbg data/query_1.fa
+/usr/bin/time -v -o query_100_smooth.time  $metagraph query -p 4 --query-counts -i $out_dir/graph_smooth.dbg -a $out_dir/annotation_final_relaxed_smooth.row_diff_int_brwt.annodbg data/query_100.fa
+/usr/bin/time -v -o query_1000_smooth.time  $metagraph query -p 4 --query-counts -i $out_dir/graph_smooth.dbg -a $out_dir/annotation_final_relaxed_smooth.row_diff_int_brwt.annodbg data/query_1000.fa
+
+# Query with one thead
+/usr/bin/time -v -o query_1_smooth_1.time  $metagraph query -p 1 --query-counts -i $out_dir/graph_smooth.dbg -a $out_dir/annotation_final_relaxed_smooth.row_diff_int_brwt.annodbg data/query_1.fa
+/usr/bin/time -v -o query_100_smooth_1.time  $metagraph query -p 1 --query-counts -i $out_dir/graph_smooth.dbg -a $out_dir/annotation_final_relaxed_smooth.row_diff_int_brwt.annodbg data/query_100.fa
+/usr/bin/time -v -o query_1000_smooth_1.time  $metagraph query -p 1 --query-counts -i $out_dir/graph_smooth.dbg -a $out_dir/annotation_final_relaxed_smooth.row_diff_int_brwt.annodbg data/query_1000.fa
diff --git a/utils/run_bcalm4.sh b/utils/run_bcalm4.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+# Based on https://github.com/kamimrcht/REINDEER/blob/master/reproduce_manuscript_results/bcalm_2585.sh
+
+bcalm="Set path to bcalm2 executable"
+
+#get fastq.gz and launch bcalm on each file
+while read -r filename; do
+    $bcalm -in $filename -kmer-size 21 -abundance-min 50 -nb-cores 2 -max-memory 500000 -out-tmp bcalm2 -out-dir bcalm2
+done < files.lst
diff --git a/utils/run_build.sh b/utils/run_build.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -eu
+
+metagraph="Set path to metagraph executable"
+# Note: You need to have a kmc_files.lst, which is a file where in each line there is a path to the kmc_files with the ending ".kmc_suf"
+
+mkdir single_dbgs/
+while read -r filename; do
+	$metagraph build --state fast --mode canonical --parallel 16 --count-kmers --count-width 32 -k 21 --mem-cap-gb 8 -o single_dbgs/$(basename ${filename})   $filename
+	echo $filename
+done  < kmc_files.lst
diff --git a/utils/run_clean.sh b/utils/run_clean.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -eu
+
+metagraph="Set path to metagraph executable"
+# Note: You need to have a dbg_files.txt, which is a file where in each line there is a path to the dbg files with the ending ".dbg" (created with run_build.sh)
+
+mkdir single_dbgs/clean/
+
+while read -r filename; do
+	$metagraph clean -p4 --to-fasta --primary-kmers --smoothing-window 1 -o single_dbgs/clean/$(basename $filename)  --count-kmers --count-width 32  $filename
+done  < dbg_files.txt
diff --git a/utils/run_clean_smooth.sh b/utils/run_clean_smooth.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -eu
+
+metagraph="Set path to metagraph executable"
+# Note: You need to have a dbg_files.txt, which is a file where in each line there is a path to the dbg files with the ending ".dbg" (created with run_build.sh)
+
+mkdir single_dbgs/clean_smooth/
+
+while read -r filename; do
+	$metagraph clean -p 4 --to-fasta --primary-kmers --smoothing-window 1000000000 -o single_dbgs/clean_smooth/$(basename $filename)  --count-kmers --count-width 32  $filename
+done  < dbg_files.txt
diff --git a/utils/run_kmc.sh b/utils/run_kmc.sh
@@ -0,0 +1,11 @@
+#!/bin/bash
+set -eu
+
+kmc="Path to kmc executable"
+
+mkdir kmc_files
+mkdir kmc_tmp
+while read -r filename threshold; do
+	$kmc -t64 -r -k21 -ci$threshold -cs65535 -hp $filename kmc_files/$(basename $filename) kmc_tmp/
+	echo $filename
+done  < samples.in
diff --git a/utils/run_kmc4.sh b/utils/run_kmc4.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+
+kmc="Set path to kmc executable"
+
+#get fastq.gz and launch bcalm on each file
+while read -r filename; do
+    $kmc -t4 -r -k21 -ci50 -cs65535 -hp -m500 $filename  $(basename $filename .res) kmc_tmp/
+done < files.lst