From a8e45c1683af1a246c5d8b29232fc9172ca9c270 Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Fri, 17 Sep 2021 17:43:23 +0200 Subject: [PATCH 1/7] [DOC] Update Readme. --- README.md | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 1a13a37..94d6288 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,11 @@ ## Needle -Needle provides a space-efficient data structure to index a large amount of NGS data and allows fast searches through these indices. -Due to the space-efficiency of one index, it is affordable to create multiple indices with different expression rates. Therefore, a semi-quantitative analysis of the data becomes possible. Needle is based on Interleaved Bloom Filters, which is a compact and efficient structure to store multiple Bloom Filters. Furthermore, Needle uses a windowing scheme (also called minimisers) to reduce the amount of data to store. -## Build +[![Build Status](https://github.com/seqan/app-template/workflows/App%20CI/badge.svg)](https://github.com/seqan/needle/actions?query=branch%3Amaster+workflow%3A%22App+CI%22) + +Needle is a tool for semi-quantitative analysis of very large collections of nucleotide sequences. +Needle stores its data in multiple interleaved Bloom filters, a fast and space-efficient probabilistic data structure, and uses a windowing scheme (also called minimisers) to reduce the amount of data to store. How many interleaved Bloom filters are used is defined by the user. Each interleaved Bloom filter has a so-called expression threshold and stores minimisers with an occurrence greater than or equal to its own expression threshold and smaller than the next biggest expression threshold (if there is no bigger expression threshold, all greater than or equal to the threshold are stored). These expression thresholds are then used during the query (called estimate) to approximate the expression values of given transcripts. 
+ + ## Install Needle can be built by following these commands: @@ -21,26 +24,21 @@ make test If you are interested in building the documentation, just use the command: `make doc` -## Create an IBF -In order to create an IBF a number of sequence files have to be given. All sequence file formats from seqan3 are accepted as an input (fasta, fastq, embl,... and their compressed forms). With the parameter m can be defined, which of these sequence files belong together, either because they are the result of paired-end sequencing or they are multiple replicates of the same experiment. If no specification with m is given, every sequence file is seen as one experiment. For paired-end experiments one can use the flag '--paired' to indicate this, so two consecutive sequence files are seen as belonging together. (This is equivalent to using -m 2 for all experiments.) -Besides, the false positive rate of the IBF has to be specified with parameter f. -Use -h/--help for more information and to see further parameters. +## Build +In order to build a Needle index a number of sequence files have to be given. All sequence file formats supported by seqan3 are accepted as an input (fasta, fastq, embl,... and their compressed forms). The flag `--paired` in the example below indicates that the given sequence files are paired-end experiments. Furthermore, the false positive rate has to be specified with the parameter `f`. +Use -h/--help for more information and to see further parameters. The flag `-c` can be used to build a compressed Needle index. -The following example creates an IBF for two experiments for the expression levels 4 and 32. Both experiments had two replicates, therefore m is used to specify this. With c a compressed IBF is created. +The following example creates a compressed Needle index for two paired-end experiments for the expression thresholds 16 and 32. 
``` -./bin/needle ibf ../needle/test/data/exp_*.fasta --samples 2 --samples 2 -e 4 -e 32 -f 0.3 -c -o example - -// Or with flag paired ./bin/needle ibf ../needle/test/data/exp_*.fasta --paired -e 16 -e 32 -f 0.3 -c -o example ``` -## Calculate Minimisers -In case one is only interested in the minimisers or wants to preprocess the data first before creating an IBF, the function minimiser can be used. It calculates the minimisers of given experiments and stores their hash values and their occurrences in a binary file named ".minimiser". Furthermore, a txt file is created where all used arguments are stored (like used k-mer size or window size), the used expression levels and the minimiser counts per expression level. +Although this works, it is recommended to calculate the minimisers beforehand by using the subcommand `minimiser`. It calculates the minimisers of given experiments and stores their hash values and their occurrences in a binary file named ".minimiser". The following command calculates the minimisers in the two experiments. ``` -./bin/needle minimiser ../needle/test/data/exp_*.fasta -samples 2 -samples 2 +./bin/needle minimiser ../needle/test/data/exp_*.fasta --paired ``` A minimiser file is a binary file containing the following data: @@ -52,13 +50,13 @@ A minimiser file is a binary file containing the following data: - shape (uint64_t), if flag is false - all minimiser hashes (uint64_t) with their occurrences (uint16_t) -Based on a minimiser file the ibfs can be computed by using the following command: +Based on the minimiser files the Needle index can be computed by using the following command: ``` ./bin/needle ibfmin exp*.minimiser -e 16 -e 32 -f 0.3 -c -o example ``` ## Estimate -To estimate the expression value of one transcript a sequence file has to be given. Use the parameter "-i" to define where the IBFs can be found (should be equal with "-o" in the previous commands). 
+To estimate the expression value of one transcript a sequence file has to be given. Use the parameter "-i" to define where the Needle index can be found (should be equal with "-o" in the previous commands). Use -h/--help for more information and to see further parameters. The following example searches for one gene, which is expressed in the first experiment with expression 6 and in the second with expression 37. Therefore, it should be found only in the second experiment but not the first when using expression levels of 16 and 32. From e540d97eb6ad61160f5f2e5af797af732802d3b1 Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Fri, 17 Sep 2021 18:07:21 +0200 Subject: [PATCH 2/7] [DOC] Better description of parameters. --- src/main.cpp | 48 +++++++++++++++++------------- test/cli/estimate_options_test.cpp | 4 +-- test/cli/ibf_options_test.cpp | 8 ++--- 3 files changed, 34 insertions(+), 26 deletions(-) diff --git a/src/main.cpp b/src/main.cpp index 9951c3f..fb00fb2 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -12,26 +12,28 @@ uint64_t se; void initialise_min_arguments(seqan3::argument_parser & parser, min_arguments & args) { - parser.add_option(args.k, 'k', "kmer", "Define kmer size."); - parser.add_option(args.path_out, 'o', "out", "Directory, where output files should be saved."); - parser.add_option(w_size, 'w', "window", "Define window size. Default: 60."); - parser.add_option(shape, '\0', "shape", "Define a shape by the decimal of a bitvector, where 0 symbolizes a " + parser.add_option(args.k, 'k', "kmer", "Define k-mer size for the minimisers. Default: 20."); + parser.add_option(w_size, 'w', "window", "Define window size for the minimisers. Default: 60."); + parser.add_option(shape, '\0', "shape", "Define a shape for the minimisers by the decimal of a bitvector, where 0 symbolizes a " "position to be ignored, 1 a position considered. 
Default: ungapped."); - parser.add_option(se, '\0', "seed", "Define seed."); + parser.add_option(se, '\0', "seed", "Define seed for the minimisers."); + parser.add_option(args.path_out, 'o', "out", "Directory, where output files should be saved."); parser.add_option(args.threads, 't', "threads", "Number of threads to use. Default: 1."); } void initialise_arguments_ibf(seqan3::argument_parser & parser, estimate_ibf_arguments & ibf_args, size_t & num_hash, std::vector & fpr) { - parser.add_flag(ibf_args.compressed, 'c', "compressed", "If c is set, ibf is compressed. Default: Not compressed."); + parser.add_flag(ibf_args.compressed, 'c', "compressed", "If c is set, the IBFS are compressed. Default: Not compressed."); parser.add_option(fpr, 'f', "fpr", "List of bin false positive rate per expression level. If only one is given" ", then that fpr is used for all expression levels."); - parser.add_option(ibf_args.expression_levels, 'e', "expression_levels", "Which expression thresholds should be used for" - " constructing the IBFs."); - parser.add_option(ibf_args.number_expression_levels, 'l', "number_expression_levels", "Number of expression levels."); + parser.add_option(ibf_args.expression_levels, 'e', "expression_thresholds", "Which expression thresholds should be used for" + " constructing the IBFs."); + parser.add_option(ibf_args.number_expression_levels, 'l', "number_expression_thresholds", "Number of expression thresholds. " + "Can be set alternatively to expression_thresholds, then " + "the expression thresholds are determined automatically."); parser.add_option(num_hash, 'n', "hash", "Number of hash functions that should be used when constructing " - "one IBF."); + "one IBF."); } void parsing(seqan3::argument_parser & parser, min_arguments & args) @@ -60,8 +62,9 @@ void initialise_arguments_minimiser(seqan3::argument_parser & parser, minimiser_ parser.add_flag(minimiser_args.paired, 'p', "paired", "If set, experiments are paired. 
Default: Not paired."); parser.add_option(minimiser_args.cutoffs, '\0', "cutoff", "Define for each sample, what number of found minimisers " "should be considered the result of a sequencing error " - "and therefore be ignored. Default: Every sample has a" - "cutoff of zero."); + "and therefore be ignored. Default: Every sample has an" + "automatically genereated cutoff, which is based on the " + "file size."); } @@ -75,7 +78,10 @@ int run_needle_count(seqan3::argument_parser & parser) std::filesystem::path out_path = "./"; bool paired = false; - parser.info.short_description = "Get expression value depending on minimizers."; + parser.info.short_description = "Get expression value depending on minimizers. This function is only used to test " + "the validity of Needle's estimation approach. It estimates the expression value " + "for all sequences in the genome file based on the exact minimiser occurrences of " + "the given sequence files."; parser.add_positional_option(sequence_files, "Please provide at least one sequence file."); parser.add_option(genome_file, 'g', "genome", "Please provide one sequence file with transcripts."); parser.add_option(exclude_file, '\0', "exclude", "Please provide one sequence file with minimizers to ignore."); @@ -107,7 +113,7 @@ int run_needle_estimate(seqan3::argument_parser & parser) { estimate_ibf_arguments args{}; estimate_arguments estimate_args{}; - parser.info.short_description = "Estimate expression value of transcript based on IBFs."; + parser.info.short_description = "Estimate expression value of transcript based on the Needle index."; parser.info.version = "1.0.0"; parser.info.author = "Mitra Darvish"; @@ -118,8 +124,10 @@ int run_needle_estimate(seqan3::argument_parser & parser) parser.add_option(args.path_out, 'o', "out", "Directory, where output files should be saved."); parser.add_flag(estimate_args.normalization_method, 'm', "normalization-mode", "Set, if normalization is wanted. 
Normalization is achieved by" - "dividing the expression value with the expression threshold of the first ibf." - "Only make sense if every bin has its own expression values." + "dividing the expression value with the expression threshold of the first" + " ibf. Only make sense if every bin has its own expression " + "thresholds (which is the case if expression thresholds " + "were generated automatically)." "Default: False."); try @@ -159,11 +167,11 @@ int run_needle_ibf(seqan3::argument_parser & parser) initialise_arguments_ibf(parser, ibf_args, num_hash, fpr); initialise_arguments_minimiser(parser, minimiser_args); - parser.info.short_description = "Constructs an IBF."; + parser.info.short_description = "Constructs the Needle index."; parser.add_positional_option(sequence_files, "Please provide at least one sequence file."); parser.add_option(minimiser_args.experiment_names, '\0', "experiment-names", "If set, names of the experiments are stored" - " in a txt file."); + " in a txt file."); parser.add_option(expression_by_genome_file, '\0', "levels-by-genome", "Sequence file containing minimizers, only " "those minimizers will be considered for " "determining the expression thresholds."); @@ -199,7 +207,7 @@ int run_needle_ibf_min(seqan3::argument_parser & parser) std::filesystem::path expression_by_genome_file = ""; std::vector fpr{}; // The fpr of one IBF, can be different for different expression levels - parser.info.short_description = "Constructs an IBF from the minimiser and header files created by needle minimiser."; + parser.info.short_description = "Constructs the Needle index from the minimiser files created by needle minimiser."; parser.add_positional_option(minimiser_files, "Please provide at least one minimiser file. It is assumed that the " "header file exits in the same directory."); @@ -208,7 +216,7 @@ int run_needle_ibf_min(seqan3::argument_parser & parser) parser.add_option(ibf_args.threads, 't', "threads", "Number of threads to use. 
Default: 1."); parser.add_option(expression_by_genome_file, '\0', "levels-by-genome", "Sequence file containing minimizers, only " "those minimizers will be considered for " - "determining the expression levels."); + "determining the expression thresholds."); initialise_arguments_ibf(parser, ibf_args, num_hash, fpr); diff --git a/test/cli/estimate_options_test.cpp b/test/cli/estimate_options_test.cpp index aa4c0be..509b9c7 100644 --- a/test/cli/estimate_options_test.cpp +++ b/test/cli/estimate_options_test.cpp @@ -12,8 +12,8 @@ TEST_F(estimate_options_test, no_options) cli_test_result result = execute_app("needle estimate"); std::string expected { - "needle-estimate - Estimate expression value of transcript based on IBFs.\n" - "========================================================================\n" + "needle-estimate - Estimate expression value of transcript based on the Needle index.\n" + "====================================================================================\n" " Try -h or --help for more information.\n" }; EXPECT_EQ(result.exit_code, 0); diff --git a/test/cli/ibf_options_test.cpp b/test/cli/ibf_options_test.cpp index e6d861f..a5c3289 100644 --- a/test/cli/ibf_options_test.cpp +++ b/test/cli/ibf_options_test.cpp @@ -9,8 +9,8 @@ TEST_F(ibf_options_test, ibf_no_options) cli_test_result result = execute_app("needle ibf"); std::string expected { - "needle-ibf - Constructs an IBF.\n" - "===============================\n" + "needle-ibf - Constructs the Needle index.\n" + "=========================================\n" " Try -h or --help for more information.\n" }; EXPECT_EQ(result.exit_code, 0); @@ -79,8 +79,8 @@ TEST_F(ibf_options_test, ibfmin_no_options) cli_test_result result = execute_app("needle ibfmin"); std::string expected { - "needle-ibfmin - Constructs an IBF from the minimiser and header files created by needle minimiser.\n" - "==================================================================================================\n" + 
"needle-ibfmin - Constructs the Needle index from the minimiser files created by needle minimiser.\n" + "=================================================================================================\n" " Try -h or --help for more information.\n" }; EXPECT_EQ(result.exit_code, 0); From ec9120e94efbce47221c26470218ee633456c2f7 Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Fri, 17 Sep 2021 18:07:48 +0200 Subject: [PATCH 3/7] [DOC] Update License. --- LICENSE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/LICENSE.md b/LICENSE.md index 4c62f44..9a6ac6e 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,6 +1,6 @@ BSD 3-Clause License -Copyright (c) 2020, Mitra Darvish +Copyright (c) 2021, Mitra Darvish All rights reserved. Redistribution and use in source and binary forms, with or without From c896b614f370f148290e632756dd3f8efed1e14d Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Fri, 17 Sep 2021 18:10:18 +0200 Subject: [PATCH 4/7] [MISC] Update gitignore. --- .gitignore | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.gitignore b/.gitignore index 824cba9..40557fb 100644 --- a/.gitignore +++ b/.gitignore @@ -35,10 +35,6 @@ test/data/IBF_0.000000 test/data/IBF_0.500000 test/data/IBF_2.000000 -test/data/Genome_mean/ -test/data/Genome_median/ -test/data/mean/ -test/data/median/ test/data/IBF_Level_0 test/data/IBF_Level_1 test/data/IBF_Level_2 From c4f04c77c5421b65c74ef3e01f9c4e0182c35623 Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Fri, 17 Sep 2021 18:29:29 +0200 Subject: [PATCH 5/7] [DOC] Better description of functions. 
--- include/ibf.h | 60 ++++++++++-------------------------------------- src/estimate.cpp | 3 +++ src/ibf.cpp | 10 ++++++-- 3 files changed, 23 insertions(+), 50 deletions(-) diff --git a/include/ibf.h b/include/ibf.h index 250e6c3..b21e13c 100644 --- a/include/ibf.h +++ b/include/ibf.h @@ -13,8 +13,8 @@ struct minimiser_arguments { - std::filesystem::path include_file; // Needs to be defined when only minimizers appearing in this file should be stored - std::filesystem::path exclude_file; // Needs to be defined when minimizers appearing in this file should NOT be stored + std::filesystem::path include_file; // Needs to be defined when only minimisers appearing in this file should be stored + std::filesystem::path exclude_file; // Needs to be defined when minimisers appearing in this file should NOT be stored std::vector samples{}; // Can be used to indicate that sequence files belong to the same experiment bool paired = false; // If true, than experiments are seen as paired-end experiments std::vector cutoffs{}; @@ -33,42 +33,6 @@ struct RandomGenerator { } }; -/*!\brief Calculate best bin size based on number of elements maximal inserted, false positive rate and number of - * hash functions. See: https://hur.st/bloomfilter/ - * \param count The number of elements to be stored. - * \param fpr The false positive rate to use. - * \param num_hash The number of hash functions to use. - * \returns bin_size - */ -inline uint64_t get_bin_size(uint64_t count, float fpr, size_t num_hash) -{ - return std::ceil((count * std::log(fpr)) / std::log(1 / std::pow(2, std::log(2)))); -} - -/*!\brief Gets all sequences from a specified number of sequence files. - * \param sequence_files A vector of paths to the sequence files. - * \param sequences The data strucuture, where the sequenecs should be stored in. - * \param min_len The minimum length a sequence should have to be stored. - * \param first The first position of in the sequence file vector, which should be considered. 
Default: 0. - * \param num_exp The number of sequence files that should be considered. Default: 1. - */ -void get_sequences(std::vector const & sequence_files, - seqan3::concatenated_sequences & sequences, uint16_t min_len, - unsigned first = 0, unsigned num_exp = 1); - -/*!\brief Gets all minimisers from all sequences in a given vector. -* \param args The minimiser arguments to use (seed, shape, window size). -* \param sequences The data strucuture, where the sequenecs are stored. -* \param hash_table The hash table, where minimisers should be stored. -* \param genome_set_table The minimisers found in a genome mask. -* \param genome_file The file to the genome mask. Default: "". -* \param only_genome True, if only minimisers found in the genome mask should be stored. Default: False. -*/ -void get_minimisers(min_arguments const & args, seqan3::concatenated_sequences const & sequences, - robin_hood::unordered_node_map & hash_table, - robin_hood::unordered_set const & genome_set_table, - std::filesystem::path const & genome_file = "", bool only_genome = false); - /*!\brief Get the concrete expression values (= median of all counts of one transcript) for given experiments. * This function can be used to estimate how good the median approach can be, if all count values are available. * \param args The minimiser arguments to use (seed, shape, window size). @@ -95,31 +59,31 @@ void read_binary(std::filesystem::path filename, robin_hood::unordered_node_map< */ void read_binary_start(min_arguments & args, std::filesystem::path filename, uint64_t & num_of_minimisers); -/*! \brief Create IBF. +/*! \brief Creates IBFs. * \param sequence_files A vector of sequence file paths. * \param ibf_args The IBF specific arguments to use (bin size, number of hash functions, ...). See * struct ibf_arguments. * \param minimiser_args The minimiser specific arguments to use. * \param fpr The average false positive rate that should be used. 
- * \param expression_by_genome_file File that contains the only minimisers that should be comnsidered for the - * determination of the expression_levels. + * \param expression_by_genome_file File that contains the only minimisers that should be considered for the + * determination of the expression thresholds. * \param num_hash The number of hash functions to use. - * \returns The normalized expression values per experiment. + * \returns The expression thresholds per experiment. */ std::vector ibf(std::vector const & sequence_files, estimate_ibf_arguments & ibf_args, minimiser_arguments & minimiser_args, std::vector & fpr, std::filesystem::path const expression_by_genome_file = "", size_t num_hash = 1); -/*! \brief Create IBF based on the minimiser and header files - * \param minimiser_files A vector of minimiser file paths. - * \param ibf_args The IBF specific arguments to use (bin size, number of hash functions, ...). See - * struct ibf_arguments. +/*! \brief Creates IBFs based on the minimiser files + * \param minimiser_files A vector of minimiser file paths. + * \param ibf_args The IBF specific arguments to use (bin size, number of hash functions, ...). See + * struct ibf_arguments. * \param fpr The average false positive rate that should be used. * \param expression_by_genome_file File that contains the only minimisers that should be comnsidered for the * determination of the expression_levels. * \param num_hash The number of hash functions to use. - * \returns The normalized expression values per experiment. + * \returns The expression thresholds per experiment. */ std::vector ibf(std::vector const & minimiser_files, estimate_ibf_arguments & ibf_args, std::vector & fpr, @@ -128,7 +92,7 @@ std::vector ibf(std::vector const & minimiser_f /*! \brief Create minimiser and header files. * \param sequence_files A vector of sequence file paths. -* \param args The minimiser arguments to use (seed, shape, window size). 
+* \param args The minimiser arguments to use (seed, shape, window size). * \param minimiser_args The minimiser specific arguments to use. */ void minimiser(std::vector const & sequence_files, min_arguments const & args, diff --git a/src/estimate.cpp b/src/estimate.cpp index bf42de7..e1392ec 100644 --- a/src/estimate.cpp +++ b/src/estimate.cpp @@ -21,6 +21,8 @@ #include #include "estimate.h" + +// Actual estimation template void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector & estimations_i, seqan3::dna4_vector const seq, std::vector & prev_counts, @@ -235,6 +237,7 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat } +// Calls the correct form of estimate void call_estimate(estimate_ibf_arguments & args, estimate_arguments & estimate_args) { load_args(args, std::string{estimate_args.path_in} + "IBF_Data"); diff --git a/src/ibf.cpp b/src/ibf.cpp index b2b49a8..d918ed1 100644 --- a/src/ibf.cpp +++ b/src/ibf.cpp @@ -65,6 +65,7 @@ inline bool check_for_fasta_format(std::vector const & valid_extens return std::ranges::find_if(valid_extensions, case_insensitive_ends_with) != valid_extensions.end(); } +// Determine cutoff for one experiment uint8_t calculate_cutoff(std::filesystem::path sequence_file, int samples) { // Cutoff according to Mantis paper, divided by two because we store expression levels and @@ -94,7 +95,7 @@ uint8_t calculate_cutoff(std::filesystem::path sequence_file, int samples) return cutoff; } -// Fill hash table with minimisers with cutoff. +// Fill hash table with minimisers greater than the cutoff. void fill_hash_table(min_arguments const & args, seqan3::sequence_file_input> & fin, robin_hood::unordered_node_map & hash_table, @@ -299,6 +300,7 @@ void check_cutoffs_samples(std::vector const & sequence_f throw std::invalid_argument{"Error. 
Incorrect command line input for multiple-samples."}; } +// Check input of fpr void check_fpr(uint8_t const number_expression_levels, std::vector & fprs) { // If no bin size is given or not the right amount, throw error. @@ -366,6 +368,7 @@ void get_expression_levels(uint8_t const number_expression_levels, counts.clear(); } +// Estimate the file size for every expression level, necessary when samplewise=false void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t const number_expression_levels, std::vector const & expression_levels, std::vector & sizes, robin_hood::unordered_set const & genome, bool all = true) @@ -406,6 +409,7 @@ void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t co fin.close(); } +// Actual ibf construction template void ibf_helper(std::vector const & minimiser_files, std::vector const & fprs, @@ -648,6 +652,7 @@ void ibf_helper(std::vector const & minimiser_files, } } +// Create ibfs std::vector ibf(std::vector const & sequence_files, estimate_ibf_arguments & ibf_args, minimiser_arguments & minimiser_args, std::vector & fpr, @@ -688,7 +693,7 @@ std::vector ibf(std::vector const & sequence_fi return ibf_args.expression_levels; } -// Create ibf based on the minimiser file +// Create ibfs based on the minimiser file std::vector ibf(std::vector const & minimiser_files, estimate_ibf_arguments & ibf_args, std::vector & fpr, std::filesystem::path const expression_by_genome_file, @@ -709,6 +714,7 @@ std::vector ibf(std::vector const & minimiser_f return ibf_args.expression_levels; } +// Actual minimiser calculation void calculate_minimiser(std::vector const & sequence_files, robin_hood::unordered_set const & include_set_table, robin_hood::unordered_set const & exclude_set_table, From 71ae8c24f5dd8b2f474e5e94bd3cfcccd357714a Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Fri, 17 Sep 2021 18:36:12 +0200 Subject: [PATCH 6/7] [MISC] Rename expression_levels expression_thresholds. 
--- include/ibf.h | 2 +- include/shared.h | 12 ++-- src/estimate.cpp | 28 ++++---- src/ibf.cpp | 104 ++++++++++++++--------------- src/main.cpp | 4 +- test/api/estimate_test.cpp | 26 ++++---- test/api/ibf_test.cpp | 29 ++++---- test/api/ibfmin_test.cpp | 20 +++--- test/api/minimiser_test.cpp | 18 ++--- test/cli/estimate_options_test.cpp | 6 +- test/cli/ibf_options_test.cpp | 2 +- 11 files changed, 128 insertions(+), 123 deletions(-) diff --git a/include/ibf.h b/include/ibf.h index b21e13c..7e3d64e 100644 --- a/include/ibf.h +++ b/include/ibf.h @@ -81,7 +81,7 @@ std::vector ibf(std::vector const & sequence_fi * struct ibf_arguments. * \param fpr The average false positive rate that should be used. * \param expression_by_genome_file File that contains the only minimisers that should be comnsidered for the - * determination of the expression_levels. + * determination of the expression_thresholds. * \param num_hash The number of hash functions to use. * \returns The expression thresholds per experiment. */ diff --git a/include/shared.h b/include/shared.h index f2b342f..5567119 100644 --- a/include/shared.h +++ b/include/shared.h @@ -34,8 +34,8 @@ struct min_arguments : all_arguments struct estimate_ibf_arguments : min_arguments { bool compressed = false; - std::vector expression_levels{}; // Expression levels which should be created - uint8_t number_expression_levels{}; // If set, the expression levels are determined by the program. + std::vector expression_thresholds{}; // Expression levels which should be created + uint8_t number_expression_thresholds{}; // If set, the expression levels are determined by the program. 
bool samplewise{false}; template @@ -46,8 +46,8 @@ struct estimate_ibf_arguments : min_arguments archive(s.get()); archive(shape); archive(compressed); - archive(number_expression_levels); - archive(expression_levels); + archive(number_expression_thresholds); + archive(expression_thresholds); archive(samplewise); } @@ -59,8 +59,8 @@ struct estimate_ibf_arguments : min_arguments archive(s.get()); archive(shape); archive(compressed); - archive(number_expression_levels); - archive(expression_levels); + archive(number_expression_thresholds); + archive(expression_thresholds); archive(samplewise); } }; diff --git a/src/estimate.cpp b/src/estimate.cpp index e1392ec..c49010e 100644 --- a/src/estimate.cpp +++ b/src/estimate.cpp @@ -158,13 +158,13 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat read_levels(fprs, estimate_args.path_in.string() + "IBF_FPRs.fprs"); // Make sure expression levels are sorted. - sort(args.expression_levels.begin(), args.expression_levels.end()); + sort(args.expression_thresholds.begin(), args.expression_thresholds.end()); // Initialse last expression if constexpr (samplewise) - load_ibf(ibf, estimate_args.path_in.string() + "IBF_Level_" + std::to_string(args.number_expression_levels-1)); + load_ibf(ibf, estimate_args.path_in.string() + "IBF_Level_" + std::to_string(args.number_expression_thresholds-1)); else - load_ibf(ibf, estimate_args.path_in.string() + "IBF_" + std::to_string(args.expression_levels[args.expression_levels.size()-1])); + load_ibf(ibf, estimate_args.path_in.string() + "IBF_" + std::to_string(args.expression_thresholds[args.expression_thresholds.size()-1])); counter.assign(ibf.bin_count(), 0); counter_est.assign(ibf.bin_count(), 0); @@ -182,27 +182,27 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat { if constexpr (samplewise & normalization_method) check_ibf(args, ibf, estimations[i], seqs[i], prev_counts[i], - expressions,args.number_expression_levels - 1, - 
fprs[args.number_expression_levels - 1]); + expressions,args.number_expression_thresholds - 1, + fprs[args.number_expression_thresholds - 1]); else if constexpr (samplewise) check_ibf(args, ibf, estimations[i], seqs[i], prev_counts[i], - expressions, args.number_expression_levels - 1, - fprs[args.number_expression_levels - 1]); + expressions, args.number_expression_thresholds - 1, + fprs[args.number_expression_thresholds - 1]); else check_ibf(args, ibf, estimations[i], seqs[i], prev_counts[i], - args.expression_levels[args.expression_levels.size() - 1], prev_expression, - fprs[args.expression_levels.size() - 1]); + args.expression_thresholds[args.expression_thresholds.size() - 1], prev_expression, + fprs[args.expression_thresholds.size() - 1]); } if constexpr (!samplewise) - prev_expression = args.expression_levels[args.expression_levels.size() - 1]; + prev_expression = args.expression_thresholds[args.expression_thresholds.size() - 1]; - for (int j = args.number_expression_levels - 2; j >= 0; j--) + for (int j = args.number_expression_thresholds - 2; j >= 0; j--) { if constexpr (samplewise) load_ibf(ibf, estimate_args.path_in.string() + "IBF_Level_" + std::to_string(j)); else - load_ibf(ibf, estimate_args.path_in.string() + "IBF_" + std::to_string(args.expression_levels[j])); + load_ibf(ibf, estimate_args.path_in.string() + "IBF_" + std::to_string(args.expression_thresholds[j])); // Go over the sequences #pragma omp parallel for @@ -216,11 +216,11 @@ void estimate(estimate_ibf_arguments & args, IBFType & ibf, std::filesystem::pat expressions, j, fprs[j]); else check_ibf(args, ibf, estimations[i], seqs[i], prev_counts[i], - args.expression_levels[j], prev_expression, fprs[j]); + args.expression_thresholds[j], prev_expression, fprs[j]); } if (!samplewise) - prev_expression = args.expression_levels[j]; + prev_expression = args.expression_thresholds[j]; } std::ofstream outfile; diff --git a/src/ibf.cpp b/src/ibf.cpp index d918ed1..e55b06f 100644 --- a/src/ibf.cpp +++ 
b/src/ibf.cpp @@ -255,28 +255,28 @@ void read_binary_start(min_arguments & args, } // Check number of expression levels, sort expression levels -void check_expression(std::vector & expression_levels, uint8_t & number_expression_levels, +void check_expression(std::vector & expression_thresholds, uint8_t & number_expression_thresholds, std::filesystem::path const expression_by_genome_file) { // Sort given expression rates - sort(expression_levels.begin(), expression_levels.end()); + sort(expression_thresholds.begin(), expression_thresholds.end()); // If no expression levels are given and the no number of expression levels is specified, throw. - if ((number_expression_levels == 0) & (expression_levels.size() == 0)) + if ((number_expression_thresholds == 0) & (expression_thresholds.size() == 0)) { throw std::invalid_argument{"Error. Please set the expression levels OR give the number of expression levels."}; } - else if ((expression_by_genome_file != "") & (expression_levels.size() > 0)) + else if ((expression_by_genome_file != "") & (expression_thresholds.size() > 0)) { throw std::invalid_argument{"Error. The determination of expression levels can not be used with individual levels" " already given. Please set the expression levels without the option " "--level-by-genome OR use the number of expression levels with that option."}; } - else if (number_expression_levels == 0) + else if (number_expression_thresholds == 0) { - number_expression_levels = expression_levels.size(); + number_expression_thresholds = expression_thresholds.size(); } - else if ((number_expression_levels != expression_levels.size()) & (expression_levels.size() > 0)) + else if ((number_expression_thresholds != expression_thresholds.size()) & (expression_thresholds.size() > 0)) { throw std::invalid_argument{"Error. 
Please set the expression levels OR give the number of expression levels."}; } @@ -301,7 +301,7 @@ void check_cutoffs_samples(std::vector const & sequence_f } // Check input of fpr -void check_fpr(uint8_t const number_expression_levels, std::vector & fprs) +void check_fpr(uint8_t const number_expression_thresholds, std::vector & fprs) { // If no bin size is given or not the right amount, throw error. if (fprs.empty()) @@ -311,9 +311,9 @@ void check_fpr(uint8_t const number_expression_levels, std::vector & fpr // If only one ibf size is given, set it for all levels. if (fprs.size() == 1) { - fprs.assign(number_expression_levels, fprs[0]); + fprs.assign(number_expression_thresholds, fprs[0]); } - else if (fprs.size() != number_expression_levels) + else if (fprs.size() != number_expression_thresholds) { throw std::invalid_argument{"Error. Length of false positive rates for IBFs is not equal to length of expression " "levels."}; @@ -321,9 +321,9 @@ void check_fpr(uint8_t const number_expression_levels, std::vector & fpr } // Calculate expression levels and sizes -void get_expression_levels(uint8_t const number_expression_levels, +void get_expression_thresholds(uint8_t const number_expression_thresholds, robin_hood::unordered_node_map const & hash_table, - std::vector & expression_levels, std::vector & sizes, + std::vector & expression_thresholds, std::vector & sizes, robin_hood::unordered_set const & genome, bool all = true) { // Calculate expression levels by taking median recursively @@ -344,10 +344,10 @@ void get_expression_levels(uint8_t const number_expression_levels, exp = counts[prev_pos + counts.size()/dev]; prev_pos = prev_pos + counts.size()/dev; dev = dev*2; - expression_levels.push_back(exp); + expression_thresholds.push_back(exp); sizes.push_back(prev_pos); - while((expression_levels.size() < number_expression_levels) & (prev_exp < max_elem) & (dev < counts.size())) + while((expression_thresholds.size() < number_expression_thresholds) & (prev_exp < 
max_elem) & (dev < counts.size())) { std::nth_element(counts.begin() + prev_pos, counts.begin() + prev_pos + counts.size()/dev, counts.end()); exp = counts[prev_pos + counts.size()/dev]; @@ -356,21 +356,21 @@ void get_expression_levels(uint8_t const number_expression_levels, if ((exp - prev_exp) > 1) { - expression_levels.push_back(exp); + expression_thresholds.push_back(exp); sizes.push_back(prev_pos); } prev_exp = exp; } - while(expression_levels.size() < number_expression_levels) - expression_levels.push_back(max_elem + 1); + while(expression_thresholds.size() < number_expression_thresholds) + expression_thresholds.push_back(max_elem + 1); counts.clear(); } // Estimate the file size for every expression level, necessary when samplewise=false -void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t const number_expression_levels, - std::vector const & expression_levels, std::vector & sizes, +void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t const number_expression_thresholds, + std::vector const & expression_thresholds, std::vector & sizes, robin_hood::unordered_set const & genome, bool all = true) { std::ifstream fin; @@ -391,18 +391,18 @@ void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t co uint64_t minimiser; uint16_t minimiser_count; - sizes.assign(number_expression_levels, 0); + sizes.assign(number_expression_thresholds, 0); while(fin.read((char*)&minimiser, sizeof(minimiser))) { fin.read((char*)&minimiser_count, sizeof(minimiser_count)); if (all | genome.contains(minimiser)) { - auto p = std::upper_bound(expression_levels.begin(), expression_levels.end(), minimiser_count); - if(p != expression_levels.begin()) - sizes[(p-expression_levels.begin())-1]++; - else if (minimiser_count>=expression_levels[number_expression_levels-1]) - sizes[number_expression_levels-1]++; + auto p = std::upper_bound(expression_thresholds.begin(), expression_thresholds.end(), minimiser_count); + if(p != 
expression_thresholds.begin()) + sizes[(p-expression_thresholds.begin())-1]++; + else if (minimiser_count>=expression_thresholds[number_expression_thresholds-1]) + sizes[number_expression_thresholds-1]++; } } @@ -434,7 +434,7 @@ void ibf_helper(std::vector const & minimiser_files, robin_hood::unordered_set exclude_set_table; // Storage for minimisers in exclude file if constexpr(samplewise) { - std::vector zero_vector(ibf_args.number_expression_levels); + std::vector zero_vector(ibf_args.number_expression_thresholds); for (unsigned j = 0; j < num_files; j++) expressions.push_back(zero_vector); @@ -451,7 +451,7 @@ void ibf_helper(std::vector const & minimiser_files, size_t const chunk_size = std::clamp(std::bit_ceil(num_files / ibf_args.threads), 8u, 64u); - // If expression_levels should only be depending on minimsers in a certain genome file, genome is created. + // If expression_thresholds should only be depending on minimsers in a certain genome file, genome is created. robin_hood::unordered_set genome{}; if (expression_by_genome_file != "") get_include_set_table(ibf_args, expression_by_genome_file, genome); @@ -485,12 +485,12 @@ void ibf_helper(std::vector const & minimiser_files, else filesize = filesize/((minimiser_args.cutoffs[i] + 1) * (is_fasta ? 1 : 2)); } - // If set_expression_levels_samplewise is not set the expressions as determined by the first file are used for + // If set_expression_thresholds_samplewise is not set the expressions as determined by the first file are used for // all files. 
if constexpr (samplewise) { uint64_t diff{2}; - for (std::size_t c = 0; c < ibf_args.number_expression_levels - 1; c++) + for (std::size_t c = 0; c < ibf_args.number_expression_thresholds - 1; c++) { diff = diff * 2; sizes[i].push_back(filesize/diff); @@ -499,15 +499,15 @@ void ibf_helper(std::vector const & minimiser_files, } else if constexpr (minimiser_files_given) { - get_filsize_per_expression_level(minimiser_files[i], ibf_args.number_expression_levels, ibf_args.expression_levels, sizes[i], + get_filsize_per_expression_level(minimiser_files[i], ibf_args.number_expression_thresholds, ibf_args.expression_thresholds, sizes[i], genome, expression_by_genome); } else { float diff{1}; - for (std::size_t c = 0; c < ibf_args.number_expression_levels - 1; c++) + for (std::size_t c = 0; c < ibf_args.number_expression_thresholds - 1; c++) { - diff = ibf_args.expression_levels[c+1]/ibf_args.expression_levels[c]; + diff = ibf_args.expression_thresholds[c+1]/ibf_args.expression_thresholds[c]; sizes[i].push_back(filesize/diff); } sizes[i].push_back(filesize/diff); @@ -518,7 +518,7 @@ void ibf_helper(std::vector const & minimiser_files, outfile_fpr.open(std::string{ibf_args.path_out} + "IBF_FPRs.fprs"); // Create IBFs std::vector> ibfs; - for (unsigned j = 0; j < ibf_args.number_expression_levels; j++) + for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) { uint64_t size{0}; for (unsigned i = 0; i < num_files; i++) @@ -529,7 +529,7 @@ void ibf_helper(std::vector const & minimiser_files, seqan3::debug_stream << "[Error]. The choosen expression threshold is not well picked. If you use the automatic " << "expression threshold determination, please decrease the number of levels. 
If you use " << "your own expression thresholds, decrease the thresholds from level " - << ibf_args.expression_levels[j] << " on.\n"; + << ibf_args.expression_thresholds[j] << " on.\n"; } // m = -hn/ln(1-p^(1/h)) size = static_cast((-1.0*num_hash*((1.0*size)/num_files))/(std::log(1.0-std::pow(fprs[j], 1.0/num_hash)))); @@ -555,7 +555,7 @@ void ibf_helper(std::vector const & minimiser_files, // Create a smaller cutoff table to save RAM, this cutoff table is only used for constructing the hash table // and afterwards discarded. robin_hood::unordered_node_map cutoff_table; - std::vector expression_levels; + std::vector expression_thresholds; // Fill hash table with minimisers. if constexpr (minimiser_files_given) @@ -578,23 +578,23 @@ void ibf_helper(std::vector const & minimiser_files, cutoff_table.clear(); } - // If set_expression_levels_samplewise is not set the expressions as determined by the first file are used for + // If set_expression_thresholds_samplewise is not set the expressions as determined by the first file are used for // all files. 
if constexpr (samplewise) { - get_expression_levels(ibf_args.number_expression_levels, + get_expression_thresholds(ibf_args.number_expression_thresholds, hash_table, - expression_levels, + expression_thresholds, sizes[i], genome, expression_by_genome); - expressions[i] = expression_levels; + expressions[i] = expression_thresholds; } // Every minimiser is stored in IBF, if it occurence is greater than or equal to the expression level for (auto && elem : hash_table) { - for (int j = ibf_args.number_expression_levels - 1; j >= 0 ; --j) + for (int j = ibf_args.number_expression_thresholds - 1; j >= 0 ; --j) { if constexpr (samplewise) { @@ -606,7 +606,7 @@ void ibf_helper(std::vector const & minimiser_files, } else { - if (elem.second >= ibf_args.expression_levels[j]) + if (elem.second >= ibf_args.expression_thresholds[j]) { ibfs[j].emplace(elem.first, seqan3::bin_index{i}); break; @@ -617,13 +617,13 @@ void ibf_helper(std::vector const & minimiser_files, } // Store IBFs - for (unsigned i = 0; i < ibf_args.number_expression_levels; i++) + for (unsigned i = 0; i < ibf_args.number_expression_thresholds; i++) { std::filesystem::path filename; if constexpr(samplewise) filename = ibf_args.path_out.string() + "IBF_Level_" + std::to_string(i); else - filename = ibf_args.path_out.string() + "IBF_" + std::to_string(ibf_args.expression_levels[i]); + filename = ibf_args.path_out.string() + "IBF_" + std::to_string(ibf_args.expression_thresholds[i]); if (ibf_args.compressed) { @@ -641,7 +641,7 @@ void ibf_helper(std::vector const & minimiser_files, { std::ofstream outfile; outfile.open(std::string{ibf_args.path_out} + "IBF_Levels.levels"); - for (unsigned j = 0; j < ibf_args.number_expression_levels; j++) + for (unsigned j = 0; j < ibf_args.number_expression_thresholds; j++) { for (unsigned i = 0; i < num_files; i++) outfile << expressions[i][j] << " "; @@ -665,10 +665,10 @@ std::vector ibf(std::vector const & sequence_fi check_cutoffs_samples(sequence_files, minimiser_args.paired, 
minimiser_args.samples, minimiser_args.cutoffs); - check_expression(ibf_args.expression_levels, ibf_args.number_expression_levels, expression_by_genome_file); - check_fpr(ibf_args.number_expression_levels, fpr); + check_expression(ibf_args.expression_thresholds, ibf_args.number_expression_thresholds, expression_by_genome_file); + check_fpr(ibf_args.number_expression_thresholds, fpr); - ibf_args.samplewise = (ibf_args.expression_levels.size() == 0); + ibf_args.samplewise = (ibf_args.expression_thresholds.size() == 0); // Store experiment names if (minimiser_args.experiment_names) @@ -690,7 +690,7 @@ std::vector ibf(std::vector const & sequence_fi store_args(ibf_args, std::string{ibf_args.path_out} + "IBF_Data"); - return ibf_args.expression_levels; + return ibf_args.expression_thresholds; } // Create ibfs based on the minimiser file @@ -699,10 +699,10 @@ std::vector ibf(std::vector const & minimiser_f std::filesystem::path const expression_by_genome_file, size_t num_hash) { - check_expression(ibf_args.expression_levels, ibf_args.number_expression_levels, expression_by_genome_file); - check_fpr(ibf_args.number_expression_levels, fpr); + check_expression(ibf_args.expression_thresholds, ibf_args.number_expression_thresholds, expression_by_genome_file); + check_fpr(ibf_args.number_expression_thresholds, fpr); - ibf_args.samplewise = (ibf_args.expression_levels.size() == 0); + ibf_args.samplewise = (ibf_args.expression_thresholds.size() == 0); if (ibf_args.samplewise) ibf_helper(minimiser_files, fpr, ibf_args, num_hash, expression_by_genome_file); @@ -711,7 +711,7 @@ std::vector ibf(std::vector const & minimiser_f store_args(ibf_args, std::string{ibf_args.path_out} + "IBF_Data"); - return ibf_args.expression_levels; + return ibf_args.expression_thresholds; } // Actuall minimiser calculation diff --git a/src/main.cpp b/src/main.cpp index fb00fb2..880fc6a 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -27,9 +27,9 @@ void initialise_arguments_ibf(seqan3::argument_parser & 
parser, estimate_ibf_arg parser.add_flag(ibf_args.compressed, 'c', "compressed", "If c is set, the IBFS are compressed. Default: Not compressed."); parser.add_option(fpr, 'f', "fpr", "List of bin false positive rate per expression level. If only one is given" ", then that fpr is used for all expression levels."); - parser.add_option(ibf_args.expression_levels, 'e', "expression_thresholds", "Which expression thresholds should be used for" + parser.add_option(ibf_args.expression_thresholds, 'e', "expression_thresholds", "Which expression thresholds should be used for" " constructing the IBFs."); - parser.add_option(ibf_args.number_expression_levels, 'l', "number_expression_thresholds", "Number of expression thresholds. " + parser.add_option(ibf_args.number_expression_thresholds, 'l', "number_expression_thresholds", "Number of expression thresholds. " "Can be set alternatively to expression_thresholds, then " "the expression thresholds are determined automatically."); parser.add_option(num_hash, 'n', "hash", "Number of hash functions that should be used when constructing " diff --git a/test/api/estimate_test.cpp b/test/api/estimate_test.cpp index b2f2f52..fcfd9e7 100644 --- a/test/api/estimate_test.cpp +++ b/test/api/estimate_test.cpp @@ -30,7 +30,7 @@ TEST(estimate, small_example) estimate_arguments estimate_args{}; initialization_args(ibf_args); ibf_args.path_out = tmp_dir/"Estimate_Test_"; - ibf_args.expression_levels = {1, 2, 4}; + ibf_args.expression_thresholds = {1, 2, 4}; std::vector fpr = {0.05}; std::vector sequence_files = {std::string(DATA_INPUT_DIR) + "mini_example.fasta"}; estimate_args.search_file = std::string(DATA_INPUT_DIR) + "mini_gen.fasta"; @@ -66,7 +66,7 @@ TEST(estimate, small_example_uncompressed) initialization_args(ibf_args); ibf_args.path_out = tmp_dir/"Estimate_Test_"; ibf_args.compressed = false; - ibf_args.expression_levels = {1, 2, 4}; + ibf_args.expression_thresholds = {1, 2, 4}; std::vector fpr = {0.05}; std::vector sequence_files = 
{std::string(DATA_INPUT_DIR) + "mini_example.fasta"}; estimate_args.search_file = std::string(DATA_INPUT_DIR) + "mini_gen.fasta"; @@ -101,7 +101,7 @@ TEST(estimate, small_example_gene_not_found) estimate_arguments estimate_args{}; initialization_args(ibf_args); ibf_args.path_out = tmp_dir/"Estimate_Test_"; - ibf_args.expression_levels = {2, 4}; + ibf_args.expression_thresholds = {2, 4}; estimate_args.search_file = std::string(DATA_INPUT_DIR) + "mini_gen2.fasta"; estimate_args.path_in = ibf_args.path_out; std::vector fpr = {0.05}; @@ -137,16 +137,16 @@ TEST(estimate, small_example_different_expressions_per_level_normalization_1) estimate_args.normalization_method = 1; initialization_args(ibf_args); ibf_args.path_out = tmp_dir/"Estimate_Test_"; - ibf_args.number_expression_levels = 2; + ibf_args.number_expression_thresholds = 2; std::vector fpr = {0.05}; std::vector sequence_files = {std::string(DATA_INPUT_DIR) + "mini_example.fasta"}; minimiser(sequence_files, ibf_args, minimiser_args); std::vector minimiser_files{tmp_dir/"Estimate_Test_mini_example.minimiser"}; - ibf_args.expression_levels = {}; + ibf_args.expression_thresholds= {}; ibf(minimiser_files, ibf_args, fpr); - ibf_args.expression_levels = {0, 1, 2}; + ibf_args.expression_thresholds = {0, 1, 2}; estimate_args.search_file = std::string(DATA_INPUT_DIR) + "mini_gen.fasta"; estimate_args.path_in = ibf_args.path_out; ibf_args.path_out = tmp_dir/"expression.out"; @@ -181,7 +181,7 @@ TEST(estimate, example) std::vector sequence_files = {std::string(DATA_INPUT_DIR) + "exp_01.fasta", std::string(DATA_INPUT_DIR) + "exp_02.fasta", std::string(DATA_INPUT_DIR) + "exp_11.fasta", std::string(DATA_INPUT_DIR) + "exp_12.fasta"}; minimiser_args.samples = {2, 2}; - ibf_args.expression_levels = {4, 32}; + ibf_args.expression_thresholds = {4, 32}; ibf_args.path_out = tmp_dir/"Estimate_Test_Single_"; ibf_args.compressed = false; ibf(sequence_files, ibf_args, minimiser_args, fpr); @@ -218,7 +218,7 @@ TEST(estimate, 
example_multiple_threads) std::vector sequence_files = {std::string(DATA_INPUT_DIR) + "exp_01.fasta", std::string(DATA_INPUT_DIR) + "exp_02.fasta", std::string(DATA_INPUT_DIR) + "exp_11.fasta", std::string(DATA_INPUT_DIR) + "exp_12.fasta"}; minimiser_args.samples = {2,2}; - ibf_args.expression_levels = {4, 32}; + ibf_args.expression_thresholds = {4, 32}; std::vector fpr = {0.05}; ibf_args.path_out = tmp_dir/"Estimate_Test_Multiple_"; ibf_args.compressed = false; @@ -258,7 +258,7 @@ TEST(estimate, example_different_expressions_per_level) std::string(DATA_INPUT_DIR) + "exp_11.fasta", std::string(DATA_INPUT_DIR) + "exp_12.fasta"}; minimiser_args.cutoffs = {0, 0}; minimiser_args.samples = {2,2}; - ibf_args.number_expression_levels = 4; + ibf_args.number_expression_thresholds = 4; std::vector fpr = {0.05}; ibf_args.path_out = tmp_dir/"Estimate_Test_"; ibf_args.compressed = false; @@ -266,7 +266,7 @@ TEST(estimate, example_different_expressions_per_level) std::vector minimiser_files{tmp_dir/"Estimate_Test_exp_01.minimiser", tmp_dir/"Estimate_Test_exp_11.minimiser"}; ibf(minimiser_files, ibf_args, fpr); - ibf_args.expression_levels = {0, 1, 2}; + ibf_args.expression_thresholds = {0, 1, 2}; estimate_args.search_file = std::string(DATA_INPUT_DIR) + "gene.fasta"; estimate_args.path_in = ibf_args.path_out; ibf_args.path_out = tmp_dir/"expression.out"; @@ -303,17 +303,17 @@ TEST(estimate, example_different_expressions_per_level_multiple_threads) std::string(DATA_INPUT_DIR) + "exp_11.fasta", std::string(DATA_INPUT_DIR) + "exp_12.fasta"}; minimiser_args.cutoffs = {0, 0}; minimiser_args.samples = {2,2}; - ibf_args.number_expression_levels = 4; + ibf_args.number_expression_thresholds = 4; std::vector fpr = {0.05}; ibf_args.path_out = tmp_dir/"Estimate_Test_"; ibf_args.compressed = false; minimiser(sequence_files, ibf_args, minimiser_args); std::vector minimiser_files{tmp_dir/"Estimate_Test_exp_01.minimiser", tmp_dir/"Estimate_Test_exp_11.minimiser"}; - ibf_args.expression_levels = 
{}; + ibf_args.expression_thresholds= {}; ibf(minimiser_files, ibf_args, fpr); ibf_args.threads = 2; - ibf_args.expression_levels = {0, 1, 2}; + ibf_args.expression_thresholds = {0, 1, 2}; estimate_args.search_file = std::string(DATA_INPUT_DIR) + "gene4.fasta"; estimate_args.path_in = ibf_args.path_out; ibf_args.path_out = tmp_dir/"expression.out"; diff --git a/test/api/ibf_test.cpp b/test/api/ibf_test.cpp index c4ee6a2..2413d85 100644 --- a/test/api/ibf_test.cpp +++ b/test/api/ibf_test.cpp @@ -22,14 +22,14 @@ void initialization_args(estimate_ibf_arguments & args) args.s = seqan3::seed{0}; } -TEST(ibf, given_expression_levels) +TEST(ibf, given_expression_thresholds) { std::filesystem::path tmp_dir = std::filesystem::temp_directory_path(); // get the temp directory estimate_ibf_arguments ibf_args{}; minimiser_arguments minimiser_args{}; initialization_args(ibf_args); ibf_args.path_out = tmp_dir/"IBF_Test_Exp_"; - ibf_args.expression_levels = {1, 2}; + ibf_args.expression_thresholds = {1, 2}; std::vector sequence_files = {std::string(DATA_INPUT_DIR) + "mini_example.fasta"}; std::vector fpr = {0.05}; @@ -64,21 +64,21 @@ TEST(ibf, given_expression_levels) EXPECT_EQ(0, args.s.get()); EXPECT_EQ(15, args.shape.to_ulong()); EXPECT_EQ(true, args.compressed); - EXPECT_EQ(ibf_args.number_expression_levels, args.number_expression_levels); - EXPECT_RANGE_EQ(ibf_args.expression_levels, args.expression_levels); + EXPECT_EQ(ibf_args.number_expression_thresholds, args.number_expression_thresholds); + EXPECT_RANGE_EQ(ibf_args.expression_thresholds, args.expression_thresholds); EXPECT_EQ(ibf_args.samplewise, args.samplewise); } std::filesystem::remove(tmp_dir/"IBF_Test_Exp_IBF_Data"); } -TEST(ibf, given_expression_levels_include_file) +TEST(ibf, given_expression_thresholds_include_file) { std::filesystem::path tmp_dir = std::filesystem::temp_directory_path(); // get the temp directory estimate_ibf_arguments ibf_args{}; minimiser_arguments minimiser_args{}; 
initialization_args(ibf_args); ibf_args.path_out = tmp_dir/"IBF_Test_Include_"; - ibf_args.expression_levels = {1, 2}; + ibf_args.expression_thresholds = {1, 2}; minimiser_args.include_file = std::string(DATA_INPUT_DIR) + "mini_example.fasta"; std::vector sequence_files = {std::string(DATA_INPUT_DIR) + "mini_example.fasta"}; std::vector fpr = {0.05}; @@ -107,14 +107,14 @@ TEST(ibf, given_expression_levels_include_file) std::filesystem::remove(tmp_dir/"IBF_Test_Include_IBF_Data"); } -TEST(ibf, given_expression_levels_exclude_file) +TEST(ibf, given_expression_thresholds_exclude_file) { std::filesystem::path tmp_dir = std::filesystem::temp_directory_path(); // get the temp directory estimate_ibf_arguments ibf_args{}; minimiser_arguments minimiser_args{}; initialization_args(ibf_args); ibf_args.path_out = tmp_dir/"IBF_Test_Exclude_"; - ibf_args.expression_levels = {1, 2}; + ibf_args.expression_thresholds = {1, 2}; minimiser_args.exclude_file = std::string(DATA_INPUT_DIR) + "mini_gen.fasta"; std::vector sequence_files = {std::string(DATA_INPUT_DIR) + "mini_example.fasta"}; std::vector fpr = {0.05}; @@ -143,14 +143,14 @@ TEST(ibf, given_expression_levels_exclude_file) std::filesystem::remove(tmp_dir/"IBF_Test_Exclude_IBF_Data"); } -TEST(ibf, no_given_expression_levels) +TEST(ibf, no_given_expression_thresholds) { std::filesystem::path tmp_dir = std::filesystem::temp_directory_path(); // get the temp directory estimate_ibf_arguments ibf_args{}; minimiser_arguments minimiser_args{}; initialization_args(ibf_args); ibf_args.path_out = tmp_dir/"IBF_Test_"; - ibf_args.number_expression_levels = 2; + ibf_args.number_expression_thresholds = 2; std::vector sequence_files = {std::string(DATA_INPUT_DIR) + "mini_example.fasta"}; std::vector fpr = {0.05}; @@ -180,14 +180,19 @@ TEST(ibf, no_given_expression_levels) std::filesystem::remove(tmp_dir/"IBF_Test_IBF_Data"); } -TEST(ibf, expression_levels_by_genome) +TEST(ibf, expression_thresholds_by_genome) { std::filesystem::path tmp_dir 
= std::filesystem::temp_directory_path(); // get the temp directory estimate_ibf_arguments ibf_args{}; minimiser_arguments minimiser_args{}; initialization_args(ibf_args); +<<<<<<< HEAD ibf_args.path_out = tmp_dir/"IBF_Test_"; ibf_args.number_expression_levels = 1; +======= + ibf_args.path_out = tmp_dir/"Test_"; + ibf_args.number_expression_thresholds = 1; +>>>>>>> [MISC] Rename expression_levels expression_thresholds. std::vector sequence_files = {std::string(DATA_INPUT_DIR) + "mini_example.fasta"}; std::vector fpr = {0.05}; @@ -229,7 +234,7 @@ TEST(ibf, throws) EXPECT_THROW(ibf(sequence_files, ibf_args, minimiser_args, fpr), std::invalid_argument); - ibf_args.number_expression_levels = 0; + ibf_args.number_expression_thresholds = 0; fpr = {}; EXPECT_THROW(ibf(sequence_files, ibf_args, minimiser_args, fpr), std::invalid_argument); diff --git a/test/api/ibfmin_test.cpp b/test/api/ibfmin_test.cpp index c690c2f..810a5d4 100644 --- a/test/api/ibfmin_test.cpp +++ b/test/api/ibfmin_test.cpp @@ -23,11 +23,11 @@ void initialization_args(estimate_ibf_arguments & args) args.path_out = tmp_dir/"IBFMIN_Test_"; } -TEST(ibfmin, given_expression_levels) +TEST(ibfmin, given_expression_thresholds) { estimate_ibf_arguments ibf_args{}; initialization_args(ibf_args); - ibf_args.expression_levels = {1, 2}; + ibf_args.expression_thresholds = {1, 2}; std::vector fpr = {0.05, 0.05}; ibf_args.path_out = tmp_dir/"IBFMIN_Test_Given_"; std::vector minimiser_file = {std::string(DATA_INPUT_DIR) + "mini_example.minimiser"}; @@ -57,11 +57,11 @@ TEST(ibfmin, given_expression_levels) } #if defined(__GNUC__) && ((__GNUC___ == 10 && __cplusplus == 201703L) || (__GNUC__ <10)) -TEST(ibfmin, given_expression_levels_multiple_threads) +TEST(ibfmin, given_expression_thresholds_multiple_threads) { estimate_ibf_arguments ibf_args{}; initialization_args(ibf_args); - ibf_args.expression_levels = {1, 2}; + ibf_args.expression_thresholds = {1, 2}; std::vector fpr = {0.05, 0.05}; ibf_args.threads = 2; 
ibf_args.path_out = tmp_dir/"IBFMIN_Test_Multiple_"; @@ -89,11 +89,11 @@ TEST(ibfmin, given_expression_levels_multiple_threads) } #endif -TEST(ibfmin, no_given_expression_levels) +TEST(ibfmin, no_given_expression_thresholds) { estimate_ibf_arguments ibf_args{}; initialization_args(ibf_args); - ibf_args.number_expression_levels = 2; + ibf_args.number_expression_thresholds = 2; std::vector fpr = {0.0025, 0.0025}; std::vector minimiser_file = {std::string(DATA_INPUT_DIR) + "mini_example.minimiser"}; @@ -122,11 +122,11 @@ TEST(ibfmin, no_given_expression_levels) std::filesystem::remove(tmp_dir/"IBFMIN_Test_IBF_Levels.levels"); } -TEST(ibfmin, expression_levels_by_genome) +TEST(ibfmin, expression_thresholds_by_genome) { estimate_ibf_arguments ibf_args{}; initialization_args(ibf_args); - ibf_args.number_expression_levels = 1; + ibf_args.number_expression_thresholds = 1; std::vector fpr = {0.05}; std::vector minimiser_file = {std::string(DATA_INPUT_DIR) + "mini_example.minimiser"}; @@ -155,11 +155,11 @@ TEST(ibfmin, expression_levels_by_genome) } #if defined(__GNUC__) && ((__GNUC___ == 10 && __cplusplus == 201703L) || (__GNUC__ <10)) -TEST(ibfmin, no_given_expression_levels_multiple_threads) +TEST(ibfmin, no_given_expression_thresholds_multiple_threads) { estimate_ibf_arguments ibf_args{}; initialization_args(ibf_args); - ibf_args.number_expression_levels = 2; + ibf_args.number_expression_thresholds = 2; std::vector fpr = {0.0025, 0.0025}; ibf_args.threads = 2; ibf_args.path_out = tmp_dir/"IBFMIN_Test_Multiple_"; diff --git a/test/api/minimiser_test.cpp b/test/api/minimiser_test.cpp index 768c481..cba6bff 100644 --- a/test/api/minimiser_test.cpp +++ b/test/api/minimiser_test.cpp @@ -57,7 +57,7 @@ TEST(minimiser, small_example) minimiser_arguments minimiser_args{}; initialization_args(args); minimiser_args.cutoffs = {0, 0}; - args.expression_levels = {0}; + args.expression_thresholds = {0}; std::vector fpr = {0.05}; std::vector sequence_files = {std::string(DATA_INPUT_DIR) 
+ "mini_example.fasta", std::string(DATA_INPUT_DIR) + "mini_example2.fasta"}; @@ -90,7 +90,7 @@ TEST(minimiser, small_example) result_hash_table.clear(); } - EXPECT_EQ(args.expression_levels, ibf(minimiser_files, args, fpr)); + EXPECT_EQ(args.expression_thresholds, ibf(minimiser_files, args, fpr)); seqan3::interleaved_bloom_filter ibf; load_ibf(ibf, tmp_dir/"Minimiser_Test_IBF_0"); auto agent = ibf.membership_agent(); @@ -147,7 +147,7 @@ TEST(minimiser, small_example_samplewise) initialization_args(args); minimiser_args.cutoffs = {0, 0}; - args.number_expression_levels = 1; + args.number_expression_thresholds = 1; std::vector fpr = {0.05}; std::vector sequence_files = {std::string(DATA_INPUT_DIR) + "mini_example.fasta", std::string(DATA_INPUT_DIR) + "mini_example2.fasta"}; @@ -164,7 +164,7 @@ TEST(minimiser, small_example_samplewise) for (int i = 0; i < sequence_files.size(); ++i) { // Test Header file - args.expression_levels = {}; + args.expression_thresholds = {}; read_binary_start(args, tmp_dir/("Minimiser_Test_" + std::string{sequence_files[i].stem()} + ".minimiser"), num_of_minimisers); EXPECT_EQ(4, args.k); EXPECT_EQ(4, args.w_size.get()); @@ -182,7 +182,7 @@ TEST(minimiser, small_example_samplewise) result_hash_table.clear(); } - args.expression_levels = {}; + args.expression_thresholds = {}; EXPECT_EQ(expected_levels, ibf(minimiser_files, args, fpr)); seqan3::interleaved_bloom_filter ibf; @@ -209,7 +209,7 @@ TEST(minimiser, cutoff_by_filesize) estimate_ibf_arguments args{}; minimiser_arguments minimiser_args{}; initialization_args(args); - args.expression_levels = {0}; + args.expression_thresholds = {0}; std::vector fpr = {0.05}; std::vector sequence_files = {std::string(DATA_INPUT_DIR) + "mini_example.fasta", std::string(DATA_INPUT_DIR) + "mini_example2.fasta"}; @@ -233,7 +233,7 @@ TEST(minimiser, cutoff_by_filesize) minimiser_files.push_back(tmp_dir/("Minimiser_Test_" + std::string{sequence_files[i].stem()} + ".minimiser")); } - 
EXPECT_EQ(args.expression_levels, ibf(minimiser_files, args, fpr)); + EXPECT_EQ(args.expression_thresholds, ibf(minimiser_files, args, fpr)); seqan3::interleaved_bloom_filter ibf; load_ibf(ibf, tmp_dir/"Minimiser_Test_IBF_0"); @@ -262,7 +262,7 @@ TEST(minimiser, small_example_two_threads) initialization_args(args); args.threads = 2; minimiser_args.cutoffs = {0, 0}; - args.expression_levels = {0}; + args.expression_thresholds = {0}; std::vector fpr = {0.05}; std::vector sequence_files = {std::string(DATA_INPUT_DIR) + "mini_example.fasta", std::string(DATA_INPUT_DIR) + "mini_example2.fasta"}; @@ -296,7 +296,7 @@ TEST(minimiser, small_example_two_threads) result_hash_table.clear(); } - EXPECT_EQ(args.expression_levels, ibf(minimiser_files, args, fpr)); + EXPECT_EQ(args.expression_thresholds, ibf(minimiser_files, args, fpr)); seqan3::interleaved_bloom_filter ibf; load_ibf(ibf, tmp_dir/"Minimiser_Test_IBF_0"); diff --git a/test/cli/estimate_options_test.cpp b/test/cli/estimate_options_test.cpp index 509b9c7..d89f8d3 100644 --- a/test/cli/estimate_options_test.cpp +++ b/test/cli/estimate_options_test.cpp @@ -38,7 +38,7 @@ TEST_F(estimate_options_test, with_argument) { estimate_ibf_arguments ibf_args{}; minimiser_arguments minimiser_args{}; - ibf_args.expression_levels = {1, 2}; + ibf_args.expression_thresholds = {1, 2}; std::vector fpr = {0.05}; std::vector sequence_files = {data("exp_01.fasta")}; ibf_args.path_out = "Test_"; @@ -54,7 +54,7 @@ TEST_F(estimate_options_test, with_argument_normalization_method) { estimate_ibf_arguments ibf_args{}; minimiser_arguments minimiser_args{}; - ibf_args.expression_levels = {1, 2}; + ibf_args.expression_thresholds = {1, 2}; std::vector fpr = {0.05}; std::vector sequence_files = {data("exp_01.fasta")}; ibf_args.path_out = "Test_"; @@ -70,7 +70,7 @@ TEST_F(estimate_options_test, with_argument_out) { estimate_ibf_arguments ibf_args{}; minimiser_arguments minimiser_args{}; - ibf_args.expression_levels = {1, 2}; + 
ibf_args.expression_thresholds = {1, 2}; std::vector fpr = {0.05}; std::vector sequence_files = {data("exp_01.fasta")}; ibf_args.path_out = "Test_"; diff --git a/test/cli/ibf_options_test.cpp b/test/cli/ibf_options_test.cpp index a5c3289..1628b49 100644 --- a/test/cli/ibf_options_test.cpp +++ b/test/cli/ibf_options_test.cpp @@ -152,7 +152,7 @@ TEST_F(ibf_options_test, more_hash_functions) EXPECT_EQ(result.err, std::string{}); } -TEST_F(ibf_options_test, expression_levels) +TEST_F(ibf_options_test, expression_thresholds) { cli_test_result result = execute_app("needle ibfmin -f 0.05 -e 2 -e 4", data("mini_example.minimiser")); EXPECT_EQ(result.exit_code, 0); From e0694b157d73584cd6021f94951a6bca5393ea7d Mon Sep 17 00:00:00 2001 From: Mitra Darja Darvish Date: Fri, 17 Sep 2021 19:01:35 +0200 Subject: [PATCH 7/7] [MISC] Delete stuff from merge. --- test/api/ibf_test.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/test/api/ibf_test.cpp b/test/api/ibf_test.cpp index 2413d85..878827b 100644 --- a/test/api/ibf_test.cpp +++ b/test/api/ibf_test.cpp @@ -186,13 +186,8 @@ TEST(ibf, expression_thresholds_by_genome) estimate_ibf_arguments ibf_args{}; minimiser_arguments minimiser_args{}; initialization_args(ibf_args); -<<<<<<< HEAD ibf_args.path_out = tmp_dir/"IBF_Test_"; - ibf_args.number_expression_levels = 1; -======= - ibf_args.path_out = tmp_dir/"Test_"; ibf_args.number_expression_thresholds = 1; ->>>>>>> [MISC] Rename expression_levels expression_thresholds. std::vector sequence_files = {std::string(DATA_INPUT_DIR) + "mini_example.fasta"}; std::vector fpr = {0.05};