Skip to content

Commit

Permalink
Merge pull request #120 from MitraDarja/level_0
Browse files Browse the repository at this point in the history
[MISC] cutoffs as variable.
  • Loading branch information
MitraDarja authored Oct 14, 2021
2 parents d8adf45 + 2428d14 commit 2048bf0
Show file tree
Hide file tree
Showing 11 changed files with 157 additions and 111 deletions.
11 changes: 6 additions & 5 deletions include/ibf.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@ struct minimiser_arguments
std::filesystem::path exclude_file; // Needs to be defined when minimisers appearing in this file should NOT be stored
std::vector<int> samples{}; // Can be used to indicate that sequence files belong to the same experiment
bool paired = false; // If true, than experiments are seen as paired-end experiments
std::vector<uint8_t> cutoffs{};
bool experiment_names = false; // Flag, if names of experiment should be stored in a txt file
};

Expand Down Expand Up @@ -62,23 +61,24 @@ void read_binary(std::filesystem::path filename, robin_hood::unordered_node_map<
* \param args Min arguments.
* \param filename The filename of the binary file.
* \param num_of_minimisers Variable in which the number of minimisers should be stored.
* \param cutoff Variable in which the cutoff value read from the binary file should be stored.
*/
void read_binary_start(min_arguments & args, std::filesystem::path filename, uint64_t & num_of_minimisers);
void read_binary_start(min_arguments & args, std::filesystem::path filename, uint64_t & num_of_minimisers, uint8_t & cutoff);

/*! \brief Creates IBFs.
* \param sequence_files A vector of sequence file paths.
* \param ibf_args The IBF specific arguments to use (bin size, number of hash functions, ...). See
* struct ibf_arguments.
* \param minimiser_args The minimiser specific arguments to use.
* \param fpr The average false positive rate that should be used.
* \param cutoffs List of cutoffs.
* \param expression_by_genome_file File that contains the only minimisers that should be considered for the
* determination of the expression thresholds.
* \param num_hash The number of hash functions to use.
* \returns The expression thresholds per experiment.
*/
std::vector<uint16_t> ibf(std::vector<std::filesystem::path> const & sequence_files, estimate_ibf_arguments & ibf_args,
minimiser_arguments & minimiser_args, std::vector<double> & fpr,
minimiser_arguments & minimiser_args, std::vector<double> & fpr, std::vector<uint8_t> & cutoffs,
std::filesystem::path const expression_by_genome_file = "",
size_t num_hash = 1);

Expand All @@ -101,6 +101,7 @@ std::vector<uint16_t> ibf(std::vector<std::filesystem::path> const & minimiser_f
* \param sequence_files A vector of sequence file paths.
* \param args The minimiser arguments to use (seed, shape, window size).
* \param minimiser_args The minimiser specific arguments to use.
* \param cutoffs List of cutoffs.
*/
void minimiser(std::vector<std::filesystem::path> const & sequence_files, min_arguments const & args,
minimiser_arguments & minimiser_args);
minimiser_arguments & minimiser_args, std::vector<uint8_t> & cutoffs);
2 changes: 1 addition & 1 deletion src/estimate.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ void check_ibf(min_arguments const & args, IBFType const & ibf, std::vector<uint

// Perform normalization by dividing by the threshold of the first level. Only works if multiple expressions were used.
if constexpr (normalization & multiple_expressions)
estimations_i[j] = estimations_i[j]/expressions[0][j];
estimations_i[j] = estimations_i[j]/expressions[1][j];
}
else
{
Expand Down
75 changes: 41 additions & 34 deletions src/ibf.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -75,11 +75,10 @@ inline bool check_for_fasta_format(std::vector<std::string> const & valid_extens
// Determine cutoff for one experiment
uint8_t calculate_cutoff(std::filesystem::path sequence_file, int samples)
{
// Cutoff according to Mantis paper, divided by two because we store expression thresholds and
// -1 because we use "<" and not "<="
uint16_t const default_cutoff{24};
// Cutoff according to the Mantis paper, minus 1 because we use "<" and not "<="
uint16_t const default_cutoff{49};
uint8_t cutoff{default_cutoff};
std::array<uint16_t, 4> const cutoffs{0, 1, 4, 9};
std::array<uint16_t, 4> const cutoffs{0, 2, 9, 19};
std::array<uint64_t, 4> const cutoff_bounds{314'572'800, 524'288'000, 1'073'741'824, 3'221'225'472};
cutoff = default_cutoff;

Expand Down Expand Up @@ -208,6 +207,7 @@ void read_binary(std::filesystem::path filename, robin_hood::unordered_node_map<
fin.open(filename, std::ios::binary);
fin.read((char*)&buffer, sizeof(buffer));
fin.read((char*)&small_buffer, sizeof(small_buffer));
fin.read((char*)&small_buffer, sizeof(small_buffer));
fin.read((char*)&window, sizeof(window));
fin.read((char*)&buffer, sizeof(buffer));
bool ungapped;
Expand All @@ -231,16 +231,19 @@ void read_binary(std::filesystem::path filename, robin_hood::unordered_node_map<

void read_binary_start(min_arguments & args,
std::filesystem::path filename,
uint64_t & num_of_minimisers)
uint64_t & num_of_minimisers, uint8_t & cutoff)
{
std::ifstream fin;

uint32_t window;
uint64_t buffer;
uint8_t small_buffer;
fin.open(filename, std::ios::binary);
fin.read((char*)&buffer, sizeof(buffer));
num_of_minimisers = buffer;

fin.read((char*)&small_buffer, sizeof(small_buffer));
cutoff = small_buffer;
fin.read((char*)&args.k, sizeof(args.k));
fin.read((char*)&window, sizeof(window));
args.w_size = seqan3::window_size{window};
Expand Down Expand Up @@ -332,7 +335,7 @@ void check_fpr(uint8_t const number_expression_thresholds, std::vector<double> &
void get_expression_thresholds(uint8_t const number_expression_thresholds,
robin_hood::unordered_node_map<uint64_t, uint16_t> const & hash_table,
std::vector<uint16_t> & expression_thresholds, std::vector<uint64_t> & sizes,
robin_hood::unordered_set<uint64_t> const & genome, bool all = true)
robin_hood::unordered_set<uint64_t> const & genome, uint8_t cutoff, bool all = true)
{
// Calculate expression thresholds by taking median recursively
std::vector<uint16_t> counts;
Expand All @@ -347,6 +350,8 @@ void get_expression_thresholds(uint8_t const number_expression_thresholds,
auto prev_exp{0};
auto exp{0};
auto max_elem = *std::max_element(counts.begin(), counts.end());
// Zero Level = cutoff + 1
expression_thresholds.push_back(cutoff + 1);
// First Level
std::nth_element(counts.begin() + prev_pos, counts.begin() + prev_pos + counts.size()/dev, counts.end());
exp = counts[prev_pos + counts.size()/dev];
Expand All @@ -371,6 +376,7 @@ void get_expression_thresholds(uint8_t const number_expression_thresholds,

prev_exp = exp;
}
sizes.push_back(prev_pos);
// In case not all levels have a threshold, give the last levels a maximal threshold, which can not be met by any minimiser.
while(expression_thresholds.size() < number_expression_thresholds)
expression_thresholds.push_back(max_elem + 1);
Expand All @@ -390,6 +396,7 @@ void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t co
fin.open(filename, std::ios::binary);
fin.read((char*)&buffer, sizeof(buffer));
fin.read((char*)&small_buffer, sizeof(small_buffer));
fin.read((char*)&small_buffer, sizeof(small_buffer));
fin.read((char*)&window, sizeof(window));
fin.read((char*)&buffer, sizeof(buffer));
bool ungapped;
Expand Down Expand Up @@ -423,7 +430,8 @@ void get_filsize_per_expression_level(std::filesystem::path filename, uint8_t co
template<bool samplewise, bool minimiser_files_given = true>
void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
std::vector<double> const & fprs,
estimate_ibf_arguments & ibf_args, size_t num_hash = 1, std::filesystem::path expression_by_genome_file = "",
estimate_ibf_arguments & ibf_args, std::vector<uint8_t> & cutoffs = {},
size_t num_hash = 1, std::filesystem::path expression_by_genome_file = "",
minimiser_arguments const & minimiser_args = {})
{

Expand All @@ -437,8 +445,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
std::vector<std::vector<uint64_t>> sizes{};
sizes.assign(num_files, {});

bool const calculate_cutoffs = minimiser_args.cutoffs.empty();
std::vector<uint8_t> file_cutoffs{};
bool const calculate_cutoffs = cutoffs.empty();

robin_hood::unordered_set<uint64_t> include_set_table; // Storage for minimisers in include file
robin_hood::unordered_set<uint64_t> exclude_set_table; // Storage for minimisers in exclude file
Expand Down Expand Up @@ -474,7 +481,9 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,

if constexpr(minimiser_files_given)
{
read_binary_start(ibf_args, minimiser_files[i], filesize);
uint8_t cutoff;
read_binary_start(ibf_args, minimiser_files[i], filesize, cutoff);
cutoffs.push_back(cutoff);
}
else
{
Expand All @@ -484,22 +493,19 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,

// Determine cutoffs
if (calculate_cutoffs)
file_cutoffs.push_back(calculate_cutoff(minimiser_files[file_iterator], minimiser_args.samples[i]));
cutoffs.push_back(calculate_cutoff(minimiser_files[file_iterator], minimiser_args.samples[i]));

bool const is_compressed = minimiser_files[file_iterator].extension() == ".gz" || minimiser_files[file_iterator].extension() == ".bgzf" || minimiser_files[file_iterator].extension() == ".bz2";
bool const is_fasta = is_compressed ? check_for_fasta_format(seqan3::format_fasta::file_extensions,minimiser_files[file_iterator].stem())
: check_for_fasta_format(seqan3::format_fasta::file_extensions, minimiser_files[file_iterator].extension());
filesize = std::filesystem::file_size(minimiser_files[file_iterator]) * minimiser_args.samples[i] * (is_fasta ? 2 : 1) / (is_compressed ? 1 : 3);
if (calculate_cutoffs)
filesize = filesize/((file_cutoffs[i] + 1) * (is_fasta ? 1 : 2));
else
filesize = filesize/((minimiser_args.cutoffs[i] + 1) * (is_fasta ? 1 : 2));
filesize = filesize/((cutoffs[i] + 1) * (is_fasta ? 1 : 2));
}
// If set_expression_thresholds_samplewise is not set the expressions as determined by the first file are used for
// all files.
if constexpr (samplewise)
{
uint64_t diff{2};
uint64_t diff{1};
for (std::size_t c = 0; c < ibf_args.number_expression_thresholds - 1; c++)
{
diff = diff * 2;
Expand Down Expand Up @@ -579,12 +585,8 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
for (unsigned f = 0; f < minimiser_args.samples[i]; f++)
{
seqan3::sequence_file_input<my_traits, seqan3::fields<seqan3::field::seq>> fin{minimiser_files[file_iterator+f]};
if (calculate_cutoffs)
fill_hash_table(ibf_args, fin, hash_table, cutoff_table, include_set_table, exclude_set_table,
(minimiser_args.include_file != ""), file_cutoffs[i]);
else
fill_hash_table(ibf_args, fin, hash_table, cutoff_table, include_set_table, exclude_set_table,
(minimiser_args.include_file != ""), minimiser_args.cutoffs[i]);
fill_hash_table(ibf_args, fin, hash_table, cutoff_table, include_set_table, exclude_set_table,
(minimiser_args.include_file != ""), cutoffs[i]);
}
cutoff_table.clear();
}
Expand All @@ -598,6 +600,7 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
expression_thresholds,
sizes[i],
genome,
cutoffs[i],
expression_by_genome);
expressions[i] = expression_thresholds;
}
Expand Down Expand Up @@ -667,14 +670,14 @@ void ibf_helper(std::vector<std::filesystem::path> const & minimiser_files,
// Create ibfs
std::vector<uint16_t> ibf(std::vector<std::filesystem::path> const & sequence_files,
estimate_ibf_arguments & ibf_args, minimiser_arguments & minimiser_args,
std::vector<double> & fpr,
std::vector<double> & fpr, std::vector<uint8_t> & cutoffs,
std::filesystem::path const expression_by_genome_file, size_t num_hash)
{
// Declarations
robin_hood::unordered_node_map<uint64_t, uint16_t> hash_table{}; // Storage for minimisers
seqan3::concatenated_sequences<seqan3::dna4_vector> sequences; // Storage for sequences in experiment files

check_cutoffs_samples(sequence_files, minimiser_args.paired, minimiser_args.samples, minimiser_args.cutoffs);
check_cutoffs_samples(sequence_files, minimiser_args.paired, minimiser_args.samples, cutoffs);


check_expression(ibf_args.expression_thresholds, ibf_args.number_expression_thresholds, expression_by_genome_file);
Expand All @@ -696,9 +699,9 @@ std::vector<uint16_t> ibf(std::vector<std::filesystem::path> const & sequence_fi
}

if (ibf_args.samplewise)
ibf_helper<true, false>(sequence_files, fpr, ibf_args, num_hash, expression_by_genome_file, minimiser_args);
ibf_helper<true, false>(sequence_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file, minimiser_args);
else
ibf_helper<false, false>(sequence_files, fpr, ibf_args, num_hash, expression_by_genome_file, minimiser_args);
ibf_helper<false, false>(sequence_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file, minimiser_args);

store_args(ibf_args, std::string{ibf_args.path_out} + "IBF_Data");

Expand All @@ -716,10 +719,11 @@ std::vector<uint16_t> ibf(std::vector<std::filesystem::path> const & minimiser_f

ibf_args.samplewise = (ibf_args.expression_thresholds.size() == 0);

std::vector<uint8_t> cutoffs{};
if (ibf_args.samplewise)
ibf_helper<true>(minimiser_files, fpr, ibf_args, num_hash, expression_by_genome_file);
ibf_helper<true>(minimiser_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file);
else
ibf_helper<false>(minimiser_files, fpr, ibf_args, num_hash, expression_by_genome_file);
ibf_helper<false>(minimiser_files, fpr, ibf_args, cutoffs, num_hash, expression_by_genome_file);

store_args(ibf_args, std::string{ibf_args.path_out} + "IBF_Data");

Expand All @@ -732,7 +736,8 @@ void calculate_minimiser(std::vector<std::filesystem::path> const & sequence_fil
robin_hood::unordered_set<uint64_t> const & exclude_set_table,
min_arguments const & args,
minimiser_arguments const & minimiser_args,
unsigned const i)
unsigned const i,
std::vector<uint8_t> & cutoffs)
{
robin_hood::unordered_node_map<uint64_t, uint16_t> hash_table{}; // Storage for minimisers
uint16_t count{0};
Expand All @@ -744,12 +749,12 @@ void calculate_minimiser(std::vector<std::filesystem::path> const & sequence_fil
std::ofstream outfile;
unsigned file_iterator = std::accumulate(minimiser_args.samples.begin(), minimiser_args.samples.begin() + i, 0);

bool const calculate_cutoffs = minimiser_args.cutoffs.empty();
bool const calculate_cutoffs = cutoffs.empty();

if (calculate_cutoffs)
cutoff = calculate_cutoff(sequence_files[file_iterator], minimiser_args.samples[i]);
else
cutoff = minimiser_args.cutoffs[i];
cutoff = cutoffs[i];

// Fill hash_table with minimisers.
for (unsigned f = 0; f < minimiser_args.samples[i]; f++)
Expand All @@ -764,6 +769,7 @@ void calculate_minimiser(std::vector<std::filesystem::path> const & sequence_fil
+ ".minimiser", std::ios::binary);
auto hash_size = hash_table.size();
outfile.write(reinterpret_cast<const char*>(&hash_size), sizeof(hash_size));
outfile.write(reinterpret_cast<const char*>(&cutoff), sizeof(cutoff));
outfile.write(reinterpret_cast<const char*>(&args.k), sizeof(args.k));
outfile.write(reinterpret_cast<const char*>(&args.w_size.get()), sizeof(args.w_size.get()));
outfile.write(reinterpret_cast<const char*>(&args.s.get()), sizeof(args.s.get()));
Expand All @@ -784,13 +790,14 @@ void calculate_minimiser(std::vector<std::filesystem::path> const & sequence_fil
outfile.close();
}

void minimiser(std::vector<std::filesystem::path> const & sequence_files, min_arguments const & args, minimiser_arguments & minimiser_args)
void minimiser(std::vector<std::filesystem::path> const & sequence_files, min_arguments const & args,
minimiser_arguments & minimiser_args, std::vector<uint8_t> & cutoffs)
{
// Declarations
robin_hood::unordered_set<uint64_t> include_set_table{}; // Storage for minimisers in include file
robin_hood::unordered_set<uint64_t> exclude_set_table{}; // Storage for minimisers in exclude file

check_cutoffs_samples(sequence_files, minimiser_args.paired, minimiser_args.samples, minimiser_args.cutoffs);
check_cutoffs_samples(sequence_files, minimiser_args.paired, minimiser_args.samples, cutoffs);

if (minimiser_args.include_file != "")
get_include_set_table(args, minimiser_args.include_file, include_set_table);
Expand All @@ -805,6 +812,6 @@ void minimiser(std::vector<std::filesystem::path> const & sequence_files, min_ar
#pragma omp parallel for schedule(dynamic, chunk_size)
for(unsigned i = 0; i < minimiser_args.samples.size(); i++)
{
calculate_minimiser(sequence_files, include_set_table, exclude_set_table, args, minimiser_args, i);
calculate_minimiser(sequence_files, include_set_table, exclude_set_table, args, minimiser_args, i, cutoffs);
}
}
Loading

0 comments on commit 2048bf0

Please sign in to comment.