Skip to content

Commit

Permalink
Merge pull request #140 from MitraDarja/ins
Browse files Browse the repository at this point in the history
[FIX] Fix input pathways for inser and delete.
  • Loading branch information
MitraDarja authored Aug 24, 2023
2 parents 439eeab + f244e64 commit 538c9f4
Show file tree
Hide file tree
Showing 16 changed files with 1,861 additions and 8 deletions.
2 changes: 1 addition & 1 deletion src/estimate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
template <class IBFType, bool last_exp, bool normalization, typename exp_t>
void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint16_t> & estimations_i,
seqan3::dna4_vector const seq, std::vector<uint32_t> & prev_counts,
exp_t const & expressions, uint16_t const k, std::vector<double> const fprs, std::vector<int> deleted)
exp_t const & expressions, uint16_t const k, std::vector<double> const fprs, std::vector<int> & deleted)
{
// Check, if one expression threshold for all or individual thresholds
static constexpr bool multiple_expressions = std::same_as<exp_t, std::vector<std::vector<uint16_t>>>;
Expand Down
13 changes: 10 additions & 3 deletions src/ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1340,6 +1340,7 @@ std::vector<uint16_t> insert(std::vector<std::filesystem::path> const & sequence
else
insert_helper<false, false>(sequence_files,ibf_args, path_in, cutoffs, expression_by_genome_file, minimiser_args);

store_args(ibf_args, std::string{ibf_args.path_out} + "IBF_Data");
return ibf_args.expression_thresholds;
}

Expand All @@ -1349,6 +1350,7 @@ std::vector<uint16_t> insert(std::vector<std::filesystem::path> const & minimise
std::filesystem::path const expression_by_genome_file, std::filesystem::path path_in, bool samplewise)
{
std::vector<uint8_t> cutoffs{};
load_args(ibf_args, std::string{path_in} + "IBF_Data");
if (samplewise)
insert_helper<true>(minimiser_files, ibf_args, path_in, cutoffs, expression_by_genome_file);
else
Expand Down Expand Up @@ -1379,15 +1381,20 @@ void delete_bin(std::vector<uint64_t> const & delete_files,
{
std::filesystem::path filename;
if (samplewise)
filename = ibf_args.path_out.string() + "IBF_Level_" + std::to_string(i);
filename = path_in.string() + "IBF_Level_" + std::to_string(i);
else
filename = ibf_args.path_out.string() + "IBF_" + std::to_string(ibf_args.expression_thresholds[i]);
filename = path_in.string() + "IBF_" + std::to_string(ibf_args.expression_thresholds[i]);

seqan3::interleaved_bloom_filter<seqan3::data_layout::uncompressed> ibf;
load_ibf(ibf, filename);
ibf.clear(bins_to_delete);

if (ibf_args.compressed)
if (samplewise)
filename = ibf_args.path_out.string() + "IBF_Level_" + std::to_string(i);
else
filename = ibf_args.path_out.string() + "IBF_" + std::to_string(ibf_args.expression_thresholds[i]);

if (ibf_args.compressed)
{
seqan3::interleaved_bloom_filter<seqan3::data_layout::compressed> ibf{ibf};
store_ibf(ibf, filename);
Expand Down
2 changes: 1 addition & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ int run_needle_count(seqan3::argument_parser & parser)
"for all sequences in the genome file based on the exact minimiser occurrences of "
"the given sequence files. Please run genome beforehand to create the genome file.";
parser.add_positional_option(sequence_files, "Please provide at least one sequence file.");
parser.add_option(include_file, '\0', "include", "Please provide one sequence file with transcripts.");
parser.add_option(include_file, '\0', "include", "Please provide one sequence file with transcripts.",seqan3::option_spec::required);
parser.add_option(genome_file, '\0', "genome", "Please provide one *.genome file created with the genome command.");
parser.add_flag(paired, 'p', "paired", "If set, experiments are paired. Default: Not paired.");

Expand Down
1 change: 1 addition & 0 deletions test/cli/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,4 @@ target_use_datasources (estimate_options_test FILES IBF_1 mini_gen.fasta)
add_cli_test (count_options_test.cpp)
target_use_datasources (count_options_test FILES mini_example.fasta mini_gen.fasta mini_gen.genome)
add_cli_test (delete_options_test.cpp)
target_use_datasources (delete_options_test FILES exp_01.fasta)
2 changes: 1 addition & 1 deletion test/cli/count_options_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ TEST_F(count_options_test, no_options)

TEST_F(count_options_test, fail_no_argument)
{
cli_test_result result = execute_app("needle count", "--seed 0");
cli_test_result result = execute_app("needle count", "--seed 0 --genome", data("mini_gen.genome")," --include", data("mini_gen.fasta"));
std::string expected
{
"Error. Incorrect command line input for count. Not enough positional arguments provided "
Expand Down
17 changes: 17 additions & 0 deletions test/cli/delete_options_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,3 +20,20 @@ TEST_F(delete_options_test, delete_no_options)
EXPECT_EQ(result.out, expected);
EXPECT_EQ(result.err, std::string{});
}

TEST_F(delete_options_test, with_argument)
{
estimate_ibf_arguments ibf_args{};
minimiser_arguments minimiser_args{};
ibf_args.expression_thresholds = {1, 2};
std::vector<double> fpr = {0.05};
std::vector<std::filesystem::path> sequence_files = {data("exp_01.fasta"),data("exp_01.fasta")};
ibf_args.path_out = "Test_";
std::vector<uint8_t> cutoffs{};
ibf(sequence_files, ibf_args, minimiser_args, fpr, cutoffs);

cli_test_result result = execute_app("needle delete -i ", "Test_ ", "0");
EXPECT_EQ(result.exit_code, 0);
EXPECT_EQ(result.out, "");
EXPECT_EQ(result.err, std::string{});
}
43 changes: 41 additions & 2 deletions utils/README.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Evaluation of Needle count

## Comparison with kallisto and salmon
All scripts for the comparison with kallisto and salmon can be found in this [repo](https://github.com/MitraDarja/analysis_needle/tree/main), please use the provided scripts `run_simulated.sh` and `run_seqc.sh`.


## Differential Expression
Download the sequencing experiments listed in accession.lst and the human transcripts from gencode as a fasta file.
Then create a Needle-genome file via:
Expand Down Expand Up @@ -29,7 +33,7 @@ python3 unify_results.py deseq_in_23.csv 19_23_infiles.lst accession.lst
python3 unify_results.py deseq_in_39.csv 19_39_infiles.lst accession.lst
```

Now, you can run the RScript "breastcancer.R" to analyze the differential expressed genes and generate the heatmap for the 67
Now, you can run the RScript "breastcancer.R" to analyze the differential expressed genes and generate the heatmap for the 67
differential expressed genes from https://doi.org/10.1016/j.dib.2018.03.079.


Expand All @@ -44,6 +48,42 @@ needle minimiser -k 21 -w 25 -t X SRR1313229.fastq.gz SRR1313228.fastq.gz SRR13
```
For the ram friendly version a `--ram` was added to the previous command.

## Insertion and Deletion

If Needle indices were built according to the scripts [here](https://github.com/MitraDarja/analysis_needle/tree/main). Then insertion and deletion were tested in the following way:

```
# Testing insertion
/usr/bin/time -v ../needle/build_thesis/bin/needle insertmin -o w_21/Insert_ -i w_21/SRR_ w_21/SRR1313229.fastq.minimiser w_21/SRR1313228.fastq.minimiser w_21/SRR1313227.fastq.minimiser w_21/SRR1313226.fastq.minimiser
/usr/bin/time -v ../needle/build_thesis/bin/needle insertmin -o w_25/Insert_ -i w_25/SRR_ w_25/SRR1313229.fastq.minimiser w_25/SRR1313228.fastq.minimiser w_25/SRR1313227.fastq.minimiser w_25/SRR1313226.fastq.minimiser
/usr/bin/time -v ../needle/build_thesis/bin/needle insertmin -o w_41/Insert_ -i w_41/SRR_ w_41/SRR1313229.fastq.minimiser w_41/SRR1313228.fastq.minimiser w_41/SRR1313227.fastq.minimiser w_41/SRR1313226.fastq.minimiser
# Testing Deletion
/usr/bin/time -v ../needle/build_thesis/bin/needle delete -i w_21/SRR_ -o w_21/DeletedIBF_ 614 615 616 617
/usr/bin/time -v ../needle/build_thesis/bin/needle delete -i w_25/SRR_ -o w_25/DeletedIBF_ 614 615 616 617
/usr/bin/time -v ../needle/build_thesis/bin/needle delete -i w_41/SRR_ -o w_41/DeletedIBF_ 614 615 616 617
```

## Comparison with Reindeer and MetaGraph in regards of space and speed
The scripts for the comparison with Reindeer can be found in this [repo](https://github.com/MitraDarja/analysis_needle/tree/main).
There you can find what data to download and how Reindeer and Needle are run, here the commands for MetaGraph are given. Please, check out
the scripts before using them, as the paths to the executable need to be added and sometimes an input file created first.


```
# Preprocessing
bash run_kmc.sh
bash run_build.sh
# From here on each step was considered for the comparison
/usr/bin/time -v bash run_clean.sh
/usr/bin/time -v bash run_clean_smooth.sh
bash metagraph.sh
```

### Preprocessing
The comparison of the preprocessing steps was based on the same files as the parallelization analysis. Use the script `compare_preprocess.sh` to obtain the numbers.


## Differential Expression
Download the sequencing experiments of the GEO experiment with the accession number GSE58135 (https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE58135).
Then create a Needle index in the following way, assuming all sequencing files are stored in a folder named GSE58135:
Expand Down Expand Up @@ -82,4 +122,3 @@ RScrip tissue_type.R
```

The differential expressed genes can then by obtained via ShinyGo using Ontology.Jensen.TISSUES as a database.

13 changes: 13 additions & 0 deletions utils/compare_preprocess.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/bin/bash

needle="Set path to needle executable"
input_dir="Set to directory that contains the sequence files"

# Comparison for 4 Threads, change thread number in each step to obtain numbers for 4 threads
/usr/bin/time -v -o needle_21_21_preprocess4.time $needle minimiser -k 21 -w 21 -t 4 --cutoff 49 $input_dir/SRR1313229.fastq.gz $input_dir/SRR1313228.fastq.gz $input_dir/SRR1313227.fastq.gz $input_dir/SRR1313226.fastq.gz
/usr/bin/time -v -o needle_25_21_preprocess4.time $needle minimiser -k 21 -w 25 -t 4 --cutoff 49 $input_dir/SRR1313229.fastq.gz $input_dir/SRR1313228.fastq.gz $input_dir/SRR1313227.fastq.gz $input_dir/SRR1313226.fastq.gz
/usr/bin/time -v -o needle_21_41_preprocess4.time $needle minimiser -k 21 -w 41 -t 4 --cutoff 49 $input_dir/SRR1313229.fastq.gz $input_dir/SRR1313228.fastq.gz $input_dir/SRR1313227.fastq.gz $input_dir/SRR1313226.fastq.gz

# For both bcalm and kmc a file named "files.lst" with the path to the four files needs to be created.
/usr/bin/time -v -o bcalm_preprocess4.time bash run_bcalm4.sh
/usr/bin/time -v -o kmc_preprocess4.time bash run_kmc4.sh
53 changes: 53 additions & 0 deletions utils/metagraph.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
metagraph="Set path to metagraph executable"
out_dir="Set out directory"

mkdir $out_dir/clean_anno
/usr/bin/time -v -o jointbuild.time $metagraph build -p 4 -k 21 --count-kmers --count-width 32 --outfile-base $out_dir/graph single_dbgs/clean/SRR*.fasta.gz &> jointbuilt.log
/usr/bin/time -v -o annotate_joint.time $metagraph annotate --separately -p 4 -i $out_dir/graph.dbg --anno-filename --count-kmers --count-width 32 -o $out_dir/clean_anno single_dbgs/clean/SRR*.fasta.gz &> anno.log

mkdir $out_dir/rd0
mkdir $out_dir/rd1
mkdir $out_dir/rd2
/usr/bin/time -v -o rd0.time $metagraph transform_anno -v --anno-type row_diff --count-kmers --row-diff-stage 0 --mem-cap-gb 500 -o $out_dir/rd0/rd -i $out_dir/graph.dbg -p 4 --disk-swap "" $out_dir/clean_anno/*.column.annodbg &>rd0.log
/usr/bin/time -v -o rd1.time $metagraph transform_anno -v --anno-type row_diff --count-kmers --row-diff-stage 1 --mem-cap-gb 500 -o $out_dir/rd1/rd -i $out_dir/graph.dbg -p 4 --disk-swap "" $out_dir/clean_anno/*.column.annodbg &> rd1.log
/usr/bin/time -v -o rd2.time $metagraph transform_anno -v --anno-type row_diff --count-kmers --row-diff-stage 2 --mem-cap-gb 500 -o $out_dir/rd2/rd -i $out_dir/graph.dbg -p 4 --disk-swap "" $out_dir/clean_anno/*.column.annodbg &> rd2.log

/usr/bin/time -v -o anno.time $metagraph transform_anno --anno-type row_diff_int_brwt --greedy --fast --subsample 1000000 -i $out_dir/graph.dbg -o $out_dir/annotation_final $out_dir/rd2/*.column.annodbg -p 4 --parallel-nodes 10 &> last_anno.log
/usr/bin/time -v -o relax.time $metagraph relax_brwt -v --relax-arity 32 -p 4 -o $out_dir/annotation_final_relaxed $out_dir/annotation_final.row_diff_int_brwt.annodbg &> relax.log

# Note data/*.fa are the files from here: https://github.com/MitraDarja/analysis_needle/tree/main
/usr/bin/time -v -o query_1.time $metagraph query -p 4 --query-counts -i $out_dir/graph.dbg -a $out_dir/annotation_final_relaxed.row_diff_int_brwt.annodbg data/query_1.fa
/usr/bin/time -v -o query_100.time $metagraph query -p 4 --query-counts -i $out_dir/graph.dbg -a $out_dir/annotation_final_relaxed.row_diff_int_brwt.annodbg data/query_100.fa
/usr/bin/time -v -o query_1000.time $metagraph query -p 4 --query-counts -i $out_dir/graph.dbg -a $out_dir/annotation_final_relaxed.row_diff_int_brwt.annodbg data/query_1000.fa

# Query with one thead
/usr/bin/time -v -o query_1_1.time $metagraph query -p 1 --query-counts -i $out_dir/graph.dbg -a $out_dir/annotation_final_relaxed.row_diff_int_brwt.annodbg data/query_1.fa
/usr/bin/time -v -o query_100_1.time $metagraph query -p 1 --query-counts -i $out_dir/graph.dbg -a $out_dir/annotation_final_relaxed.row_diff_int_brwt.annodbg data/query_100.fa
/usr/bin/time -v -o query_1000_1.time $metagraph query -p 1 --query-counts -i $out_dir/graph.dbg -a $out_dir/annotation_final_relaxed.row_diff_int_brwt.annodbg data/query_1000.fa


# Construction for smooth option

mkdir $out_dir/clean_anno_smooth
/usr/bin/time -v -o jointbuild_smooth.time $metagraph build -p 4 -k 21 --count-kmers --count-width 32 --outfile-base $out_dir/graph_smooth single_dbgs/clean_smooth/SRR*.fasta.gz &> jointbuilt.log
/usr/bin/time -v -o annotate_joint_smooth.time $metagraph annotate --separately -p 4 -i $out_dir/graph.dbg --anno-filename --count-kmers --count-width 32 -o $out_dir/clean_anno_smooth single_dbgs/clean_smooth/SRR*.fasta.gz &> anno.log

mkdir $out_dir/smooth_rd0
mkdir $out_dir/smooth_rd1
mkdir $out_dir/smooth_rd2
/usr/bin/time -v -o rd0_smooth.time $metagraph transform_anno -v --anno-type row_diff --count-kmers --row-diff-stage 0 --mem-cap-gb 500 -o $out_dir/smooth_rd0/rd -i $out_dir/graph_smooth.dbg -p 4 --disk-swap "" $out_dir/clean_anno_smooth/*.column.annodbg &>rd0.log
/usr/bin/time -v -o rd1_smooth.time $metagraph transform_anno -v --anno-type row_diff --count-kmers --row-diff-stage 1 --mem-cap-gb 500 -o $out_dir/smooth_rd1/rd -i $out_dir/graph_smooth.dbg -p 4 --disk-swap "" $out_dir/clean_anno_smooth/*.column.annodbg &> rd1.log
/usr/bin/time -v -o rd2_smooth.time $metagraph transform_anno -v --anno-type row_diff --count-kmers --row-diff-stage 2 --mem-cap-gb 500 -o $out_dir/smooth_rd2/rd -i $out_dir/graph_smooth.dbg -p 4 --disk-swap "" $out_dir/clean_anno_smooth/*.column.annodbg &> rd2.log

/usr/bin/time -v -o anno_smooth.time $metagraph transform_anno --anno-type row_diff_int_brwt --greedy --fast --subsample 1000000 -i $out_dir/graph_smooth.dbg -o $out_dir/annotation_final_smooth $out_dir/rd2/*.column.annodbg -p 4 --parallel-nodes 10 &> last_anno.log
/usr/bin/time -v -o relax_smooth.time $metagraph relax_brwt -v --relax-arity 32 -p 4 -o $out_dir/annotation_final_relaxed_smooth $out_dir/annotation_final_smooth.row_diff_int_brwt.annodbg &> relax.log

# Note data/*.fa are the files from here: https://github.com/MitraDarja/analysis_needle/tree/main
/usr/bin/time -v -o query_1_smooth.time $metagraph query -p 4 --query-counts -i $out_dir/graph_smooth.dbg -a $out_dir/annotation_final_relaxed_smooth.row_diff_int_brwt.annodbg data/query_1.fa
/usr/bin/time -v -o query_100_smooth.time $metagraph query -p 4 --query-counts -i $out_dir/graph_smooth.dbg -a $out_dir/annotation_final_relaxed_smooth.row_diff_int_brwt.annodbg data/query_100.fa
/usr/bin/time -v -o query_1000_smooth.time $metagraph query -p 4 --query-counts -i $out_dir/graph_smooth.dbg -a $out_dir/annotation_final_relaxed_smooth.row_diff_int_brwt.annodbg data/query_1000.fa

# Query with one thead
/usr/bin/time -v -o query_1_smooth_1.time $metagraph query -p 1 --query-counts -i $out_dir/graph_smooth.dbg -a $out_dir/annotation_final_relaxed_smooth.row_diff_int_brwt.annodbg data/query_1.fa
/usr/bin/time -v -o query_100_smooth_1.time $metagraph query -p 1 --query-counts -i $out_dir/graph_smooth.dbg -a $out_dir/annotation_final_relaxed_smooth.row_diff_int_brwt.annodbg data/query_100.fa
/usr/bin/time -v -o query_1000_smooth_1.time $metagraph query -p 1 --query-counts -i $out_dir/graph_smooth.dbg -a $out_dir/annotation_final_relaxed_smooth.row_diff_int_brwt.annodbg data/query_1000.fa
9 changes: 9 additions & 0 deletions utils/run_bcalm4.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Based on https://github.com/kamimrcht/REINDEER/blob/master/reproduce_manuscript_results/bcalm_2585.sh

bcalm="Set path to bcalm2 executable"

#get fastq.gz and launch bcalm on each file
while read -r filename; do
$bcalm -in $filename -kmer-size 21 -abundance-min 50 -nb-cores 2 -max-memory 500000 -out-tmp bcalm2 -out-dir bcalm2
done < files.lst
11 changes: 11 additions & 0 deletions utils/run_build.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
set -eu

metagraph="Set path to metagraph executable"
# Note: You need to have a kmc_files.lst, which is a file where in each line there is a path to the kmc_files with the ending ".kmc_suf"

mkdir single_dbgs/
while read -r filename; do
$metagraph build --state fast --mode canonical --parallel 16 --count-kmers --count-width 32 -k 21 --mem-cap-gb 8 -o single_dbgs/$(basename ${filename}) $filename
echo $filename
done < kmc_files.lst
11 changes: 11 additions & 0 deletions utils/run_clean.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
set -eu

metagraph="Set path to metagraph executable"
# Note: You need to have a dbg_files.txt, which is a file where in each line there is a path to the dbg files with the ending ".dbg" (created with run_build.sh)

mkdir single_dbgs/clean/

while read -r filename; do
$metagraph clean -p4 --to-fasta --primary-kmers --smoothing-window 1 -o single_dbgs/clean/$(basename $filename) --count-kmers --count-width 32 $filename
done < dbg_files.txt
11 changes: 11 additions & 0 deletions utils/run_clean_smooth.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
set -eu

metagraph="Set path to metagraph executable"
# Note: You need to have a dbg_files.txt, which is a file where in each line there is a path to the dbg files with the ending ".dbg" (created with run_build.sh)

mkdir single_dbgs/clean_smooth/

while read -r filename; do
$metagraph clean -p 4 --to-fasta --primary-kmers --smoothing-window 1000000000 -o single_dbgs/clean_smooth/$(basename $filename) --count-kmers --count-width 32 $filename
done < dbg_files.txt
11 changes: 11 additions & 0 deletions utils/run_kmc.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
#!/bin/bash
set -eu

kmc="Path to kmc executable"

mkdir kmc_files
mkdir kmc_tmp
while read -r filename threshold; do
$kmc -t64 -r -k21 -ci$threshold -cs65535 -hp $filename kmc_files/$(basename $filename) kmc_tmp/
echo $filename
done < samples.in
8 changes: 8 additions & 0 deletions utils/run_kmc4.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash

kmc="Set path to kmc executable"

#get fastq.gz and launch bcalm on each file
while read -r filename; do
$kmc -t4 -r -k21 -ci50 -cs65535 -hp -m500 $filename $(basename $filename .res) kmc_tmp/
done < files.lst
Loading

0 comments on commit 538c9f4

Please sign in to comment.