diff --git a/.clang-format b/.clang-format new file mode 100644 index 0000000..d961b5c --- /dev/null +++ b/.clang-format @@ -0,0 +1,34 @@ +# MIT License +# +# Copyright (c) 2024 Andrew Smith +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +BasedOnStyle: LLVM +ColumnLimit: 80 +IndentWidth: 2 +AlwaysBreakAfterReturnType: All +ContinuationIndentWidth: 2 +ConstructorInitializerIndentWidth: 2 +BraceWrapping: + BeforeElse: true + BeforeCatch: true +BreakBeforeBraces: Custom +BreakConstructorInitializers: AfterColon +SpacesBeforeTrailingComments: 2 diff --git a/.gitignore b/.gitignore index 7c588eb..259148f 100644 --- a/.gitignore +++ b/.gitignore @@ -1,13 +1,32 @@ -* -!.gitignore -!docs/* -!src -!src/*pp -!m4 -!src/smithlab_cpp/*pp -!Makefile -!Makefile.am -!configure.ac -!README.md -!LICENSE -!autogen.sh +# Prerequisites +*.d + +# Compiled Object files +*.slo +*.lo +*.o +*.obj + +# Precompiled Headers +*.gch +*.pch + +# Compiled Dynamic libraries +*.so +*.dylib +*.dll + +# Fortran module files +*.mod +*.smod + +# Compiled Static libraries +*.lai +*.la +*.a +*.lib + +# Executables +*.exe +*.out +*.app diff --git a/src/AbismalAlign.hpp b/src/AbismalAlign.hpp index c0dbfc6..3390c58 100644 --- a/src/AbismalAlign.hpp +++ b/src/AbismalAlign.hpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2019-2023 Andrew D. Smith and Guilherme Sena +/* Copyright (C) 2019-2025 Andrew D. Smith and Guilherme Sena * * Authors: Andrew D. Smith and Guilherme Sena * diff --git a/src/AbismalIndex.cpp b/src/AbismalIndex.cpp index 5925e55..058b292 100644 --- a/src/AbismalIndex.cpp +++ b/src/AbismalIndex.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2018-2023 Andrew D. Smith and Guilherme Sena +/* Copyright (C) 2018-2025 Andrew D. Smith and Guilherme Sena * * Authors: Andrew D. Smith and Guilherme Sena * @@ -34,8 +34,11 @@ #include "smithlab_os.hpp" #include "smithlab_utils.hpp" -using std::clog; +using std::begin; +using std::cbegin; +using std::cend; using std::cerr; +using std::clog; using std::endl; using std::ifstream; using std::inclusive_scan; @@ -47,16 +50,12 @@ using std::sort; using std::string; using std::to_string; using std::vector; -using std::begin; -using std::runtime_error; -using std::cbegin; -using std::cend; using std::chrono::duration; using std::chrono::steady_clock; using std::chrono::time_point; -template using num_lim = std::numeric_limits; +template using num_lim = std::numeric_limits; bool AbismalIndex::VERBOSE = false; const string AbismalIndex::internal_identifier = "AbismalIndex"; @@ -81,17 +80,17 @@ struct g_interval { g_interval(const string &c, size_t s, size_t e) : chrom{c}, start_pos{s}, end_pos{e} {} g_interval() {} - bool operator<(const g_interval &rhs) const { + bool + operator<(const g_interval &rhs) const { const int x = chrom.compare(rhs.chrom); - return (x < 0 || - (x == 0 && - (start_pos < rhs.start_pos || - (start_pos == rhs.start_pos && - (end_pos < rhs.end_pos))))); + return (x < 0 || (x == 0 && (start_pos < rhs.start_pos || + (start_pos == rhs.start_pos && + (end_pos < rhs.end_pos))))); } }; -template T& +template +T & operator<<(T &out, const g_interval &g) { return out << g.chrom << '\t' << g.start_pos << '\t' << g.end_pos; } @@ -100,7 +99,8 @@ auto load_target_regions(const string &target_filename) -> vector { constexpr auto msg = "failed parsing target region"; ifstream in(target_filename); - if (!in) throw runtime_error("failed reading target file"); + if (!in) + throw runtime_error("failed reading target file"); vector targets{}; string line; @@ -109,9 +109,12 @@ load_target_regions(const string &target_filename) -> vector { while (getline(in, line)) { std::istringstream iss(line); - if (!(iss >> chrom)) throw runtime_error(msg); - if (!(iss >> start_pos)) throw runtime_error(msg); - if (!(iss >> end_pos)) throw runtime_error(msg); + if (!(iss >> chrom)) + throw runtime_error(msg); + if (!(iss >> start_pos)) + throw runtime_error(msg); + if (!(iss >> end_pos)) + throw runtime_error(msg); targets.emplace_back(chrom, start_pos, end_pos); } @@ -150,11 +153,13 @@ contiguous_n(const vector &genome) -> vector> { inside_block = false; } } - if (inside_block) r.emplace_back(distance(g_beg, g_left), genome.size()); + if (inside_block) + r.emplace_back(distance(g_beg, g_left), genome.size()); return r; } -template static inline auto +template +static inline auto get_exclude_itrs(const G &genome, const vector> &exclude) -> vector> { vector> exclude_itrs; @@ -164,12 +169,15 @@ get_exclude_itrs(const G &genome, const vector> &exclude) return exclude_itrs; } -template static inline void +template +static inline void replace_included_n(const vector> &exclude, G &genome) { size_t j = 0; for (size_t i = 0; i < genome.size(); ++i) { - if (genome[i] == 'N' && i < exclude[j].first) genome[i] = random_base(); - if (exclude[j].second <= i) ++j; + if (genome[i] == 'N' && i < exclude[j].first) + genome[i] = random_base(); + if (exclude[j].second <= i) + ++j; } } @@ -180,8 +188,8 @@ get_compressed_size(const size_t original_size) { } static auto -sort_by_chrom(const vector &names, const vector &u) - -> vector { +sort_by_chrom(const vector &names, + const vector &u) -> vector { // This function will exclude any target regions that are not // corresponding to chromosomes in the given reference genome. vector t(u.size()); @@ -208,19 +216,24 @@ AbismalIndex::create_index(const string &targets_file, const string &genome_file) { auto s_time = steady_clock::now(); - if (VERBOSE) clog << "[loading genome]"; + if (VERBOSE) + clog << "[loading genome]"; vector orig_genome; load_genome(genome_file, orig_genome, cl); - if (VERBOSE) clog << delta_seconds(s_time) << endl; + if (VERBOSE) + clog << delta_seconds(s_time) << endl; s_time = steady_clock::now(); - if (VERBOSE) clog << "[loading target regions]"; + if (VERBOSE) + clog << "[loading target regions]"; vector orig_targets = load_target_regions(targets_file); orig_targets = sort_by_chrom(cl.names, orig_targets); - if (VERBOSE) clog << delta_seconds(s_time) << endl; + if (VERBOSE) + clog << delta_seconds(s_time) << endl; s_time = steady_clock::now(); - if (VERBOSE) clog << "[cleaning reference genome]"; + if (VERBOSE) + clog << "[cleaning reference genome]"; vector> targets; for (auto &i : orig_targets) @@ -238,15 +251,18 @@ AbismalIndex::create_index(const string &targets_file, exclude.erase(rm_itr, cend(exclude)); replace_included_n(exclude, orig_genome); - if (VERBOSE) clog << delta_seconds(s_time) << endl; + if (VERBOSE) + clog << delta_seconds(s_time) << endl; s_time = steady_clock::now(); - if (VERBOSE) clog << "[encoding genome]"; + if (VERBOSE) + clog << "[encoding genome]"; genome.resize(get_compressed_size(orig_genome.size())); encode_dna_four_bit(begin(orig_genome), end(orig_genome), begin(genome)); vector().swap(orig_genome); - if (VERBOSE) clog << delta_seconds(s_time) << endl; + if (VERBOSE) + clog << delta_seconds(s_time) << endl; exclude_itr = get_exclude_itrs(genome, exclude); @@ -266,7 +282,8 @@ void AbismalIndex::create_index(const string &genome_file) { auto s_time = steady_clock::now(); - if (VERBOSE) clog << "[loading genome]"; + if (VERBOSE) + clog << "[loading genome]"; vector orig_genome; load_genome(genome_file, orig_genome, cl); if (VERBOSE) @@ -274,7 +291,8 @@ AbismalIndex::create_index(const string &genome_file) { << endl; s_time = steady_clock::now(); - if (VERBOSE) clog << "[cleaning reference genome]"; + if (VERBOSE) + clog << "[cleaning reference genome]"; exclude = contiguous_n(orig_genome); auto rm_itr = std::remove_if(begin(exclude), end(exclude), @@ -284,15 +302,18 @@ AbismalIndex::create_index(const string &genome_file) { exclude.erase(rm_itr, cend(exclude)); replace_included_n(exclude, orig_genome); - if (VERBOSE) clog << delta_seconds(s_time) << endl; + if (VERBOSE) + clog << delta_seconds(s_time) << endl; s_time = steady_clock::now(); - if (VERBOSE) clog << "[encoding genome]"; + if (VERBOSE) + clog << "[encoding genome]"; genome.resize(get_compressed_size(orig_genome.size())); encode_dna_four_bit(begin(orig_genome), end(orig_genome), begin(genome)); vector().swap(orig_genome); - if (VERBOSE) clog << delta_seconds(s_time) << endl; + if (VERBOSE) + clog << delta_seconds(s_time) << endl; exclude_itr = get_exclude_itrs(genome, exclude); @@ -308,7 +329,8 @@ AbismalIndex::create_index(const string &genome_file) { sort_buckets(); } -template void +template +void AbismalIndex::get_bucket_sizes_two() { counter_size = (1ull << seed::key_weight); @@ -320,7 +342,8 @@ AbismalIndex::get_bucket_sizes_two() { // start spooling the hash key const auto gi_lim = gi + (seed::key_weight - 1); - while (gi != gi_lim) shift_hash_key(*gi++, hash_key); + while (gi != gi_lim) + shift_hash_key(*gi++, hash_key); auto itl_itr = cbegin(is_two_let); auto keep_itr = cbegin(keep); @@ -334,13 +357,15 @@ AbismalIndex::get_bucket_sizes_two() { assert(nidx_itr != cend(exclude)); if (i < nidx_itr->first && *keep_itr) counter[hash_key] += (!use_mask || *itl_itr); - if (nidx_itr->second <= i) ++nidx_itr; + if (nidx_itr->second <= i) + ++nidx_itr; ++keep_itr; ++itl_itr; } } -template void +template +void AbismalIndex::get_bucket_sizes_three() { counter_size_three = seed::hash_mask_three; @@ -360,7 +385,8 @@ AbismalIndex::get_bucket_sizes_three() { // start building up the hash key const auto gi_lim = gi + (seed::key_weight_three - 1); - while (gi != gi_lim) shift_three_key(*gi++, hash_key); + while (gi != gi_lim) + shift_three_key(*gi++, hash_key); auto itl_itr = cbegin(is_two_let); auto keep_itr = cbegin(keep); @@ -377,7 +403,8 @@ AbismalIndex::get_bucket_sizes_three() { else counter_a[hash_key] += (!use_mask || !(*itl_itr)); } - if (nidx_itr->second <= i) ++nidx_itr; + if (nidx_itr->second <= i) + ++nidx_itr; ++keep_itr; ++itl_itr; } @@ -393,10 +420,12 @@ three_letter_cost(const uint32_t count_t, const uint32_t count_a) { return (count_t + count_a) >> 1; } -template void +template +void AbismalIndex::initialize_bucket_sizes() { auto s_time = steady_clock::now(); - if (VERBOSE) clog << "[computing bucket sizes]"; + if (VERBOSE) + clog << "[computing bucket sizes]"; #pragma omp parallel sections { #pragma omp section @@ -406,14 +435,14 @@ AbismalIndex::initialize_bucket_sizes() { #pragma omp section get_bucket_sizes_three(); } - if (VERBOSE) clog << delta_seconds(s_time) << endl; + if (VERBOSE) + clog << delta_seconds(s_time) << endl; } static inline auto -get_block_bounds(const size_t start_pos, const size_t step_size, - const size_t end_pos, - const vector> &exclude) - -> vector> { +get_block_bounds( + const size_t start_pos, const size_t step_size, const size_t end_pos, + const vector> &exclude) -> vector> { vector> blocks; auto block_start = start_pos; @@ -425,7 +454,8 @@ get_block_bounds(const size_t start_pos, const size_t step_size, block_start += step_size; // this might move block start backward - if (block_start >= i->second) block_start = (*i++).second; + if (block_start >= i->second) + block_start = (*i++).second; } else block_start = (*i++).second; @@ -443,7 +473,8 @@ void AbismalIndex::select_two_letter_positions() { constexpr size_t block_size = 1000000ul; auto s_time = steady_clock::now(); - if (VERBOSE) clog << "[selecting two-letter positions]"; + if (VERBOSE) + clog << "[selecting two-letter positions]"; // choose which have lower count under two-letters is_two_let.resize(cl.get_genome_size(), false); @@ -464,7 +495,8 @@ AbismalIndex::select_two_letter_positions() { // spool the two letter hash const auto gi_lim_two = gi_two + (seed::key_weight - 1); - while (gi_two != gi_lim_two) shift_hash_key(*gi_two++, hash_two); + while (gi_two != gi_lim_two) + shift_hash_key(*gi_two++, hash_two); uint32_t hash_t = 0, hash_a = 0; auto gi_three = g_beg + block.first; @@ -491,22 +523,24 @@ AbismalIndex::select_two_letter_positions() { three_letter_cost(counter_t[hash_t], counter_a[hash_a]); } } - if (VERBOSE) clog << delta_seconds(s_time) << endl; + if (VERBOSE) + clog << delta_seconds(s_time) << endl; } void AbismalIndex::hash_genome() { // count k-mers under each encoding with masking auto s_time = steady_clock::now(); - if (VERBOSE) clog << "[initializing index structures]"; + if (VERBOSE) + clog << "[initializing index structures]"; #pragma omp parallel sections { #pragma omp section - inclusive_scan(begin(counter), end(counter), begin(counter)); + inclusive_scan(begin(counter), end(counter), begin(counter)); #pragma omp section - inclusive_scan(begin(counter_t), end(counter_t), begin(counter_t)); + inclusive_scan(begin(counter_t), end(counter_t), begin(counter_t)); #pragma omp section - inclusive_scan(begin(counter_a), end(counter_a), begin(counter_a)); + inclusive_scan(begin(counter_a), end(counter_a), begin(counter_a)); } index_size = counter[counter_size]; @@ -521,7 +555,8 @@ AbismalIndex::hash_genome() { << "three-letter=" << index_size_three << "]" << endl; s_time = steady_clock::now(); - if (VERBOSE) clog << "[counting k-mers]"; + if (VERBOSE) + clog << "[counting k-mers]"; const auto lim = cl.get_genome_size() - seed::key_weight + 1; #pragma omp parallel sections @@ -531,7 +566,8 @@ AbismalIndex::hash_genome() { auto gi_two = genome_iterator(cbegin(genome)); const auto gi_lim = gi_two + (seed::key_weight - 1); uint32_t hash_two = 0; - while (gi_two != gi_lim) shift_hash_key(*gi_two++, hash_two); + while (gi_two != gi_lim) + shift_hash_key(*gi_two++, hash_two); auto nidx_itr = cbegin(exclude); for (size_t i = 0; i < lim; ++i) { shift_hash_key(*gi_two++, hash_two); @@ -540,7 +576,8 @@ AbismalIndex::hash_genome() { assert(counter[hash_two] > 0); index[--counter[hash_two]] = i; } - if (nidx_itr->second <= i) ++nidx_itr; + if (nidx_itr->second <= i) + ++nidx_itr; } } #pragma omp section @@ -548,7 +585,8 @@ AbismalIndex::hash_genome() { auto gi_three = genome_iterator(cbegin(genome)); const auto gi_lim = gi_three + (seed::key_weight_three - 1); uint32_t hash_t = 0; - while (gi_three != gi_lim) shift_three_key(*gi_three++, hash_t); + while (gi_three != gi_lim) + shift_three_key(*gi_three++, hash_t); auto nidx_itr = cbegin(exclude); for (size_t i = 0; i < lim; ++i) { shift_three_key(*gi_three++, hash_t); @@ -557,7 +595,8 @@ AbismalIndex::hash_genome() { assert(counter_t[hash_t] > 0); index_t[--counter_t[hash_t]] = i; } - if (nidx_itr->second <= i) ++nidx_itr; + if (nidx_itr->second <= i) + ++nidx_itr; } } #pragma omp section @@ -565,7 +604,8 @@ AbismalIndex::hash_genome() { auto gi_three = genome_iterator(begin(genome)); const auto gi_lim = gi_three + (seed::key_weight_three - 1); uint32_t hash_a = 0; - while (gi_three != gi_lim) shift_three_key(*gi_three++, hash_a); + while (gi_three != gi_lim) + shift_three_key(*gi_three++, hash_a); auto nidx_itr = cbegin(exclude); for (size_t i = 0; i < lim; ++i) { shift_three_key(*gi_three++, hash_a); @@ -574,11 +614,13 @@ AbismalIndex::hash_genome() { assert(counter_a[hash_a] > 0); index_a[--counter_a[hash_a]] = i; } - if (nidx_itr->second <= i) ++nidx_itr; + if (nidx_itr->second <= i) + ++nidx_itr; } } } - if (VERBOSE) clog << delta_seconds(s_time) << endl; + if (VERBOSE) + clog << delta_seconds(s_time) << endl; } struct dp_sol { @@ -592,33 +634,46 @@ struct dp_sol { // tell the difference using `f` and `b` of whether the deque is empty // or full. We need to ensure that the size of this deque ring buffer // is at least (window_size + 2). -template struct fixed_ring_buffer { - constexpr static size_t qsz = 32ul; // must be >= (seed::window_size + 2) +template struct fixed_ring_buffer { + constexpr static size_t qsz = 32ul; // must be >= (seed::window_size + 2) constexpr static size_t qsz_msk = 31ul; // mask to keep positions in range std::array h; - auto size() const -> size_t { - return (b == f) ? 0 : (b > f) ? b - f : qsz - (f - b); + auto + size() const -> size_t { + return (b == f) ? 0 : (b > f) ? b - f : qsz - (f - b); } - auto empty() const -> bool { return f == b; } + auto + empty() const -> bool { + return f == b; + } - auto back() -> T & { return h[(b - 1) & qsz_msk]; } + auto + back() -> T & { + return h[(b - 1) & qsz_msk]; + } - auto front() -> T & { return h[f]; } + auto + front() -> T & { + return h[f]; + } - auto pop_back() -> void { + auto + pop_back() -> void { --b; b &= qsz_msk; } - auto pop_front() -> void { + auto + pop_front() -> void { ++f; f &= qsz_msk; } - auto push_back(T &&x) -> void { + auto + push_back(T &&x) -> void { h[b] = std::move(x); ++b; b &= qsz_msk; @@ -632,11 +687,13 @@ using deque = fixed_ring_buffer; static inline void add_sol(deque &helper, const uint64_t prev, const uint64_t cost) { - while (!helper.empty() && helper.back().cost > cost) helper.pop_back(); + while (!helper.empty() && helper.back().cost > cost) + helper.pop_back(); helper.push_back({cost, prev}); - while (helper.front().prev + seed::window_size <= prev) helper.pop_front(); + while (helper.front().prev + seed::window_size <= prev) + helper.pop_front(); assert(!helper.empty() && helper.size() <= seed::window_size); } @@ -652,7 +709,8 @@ void AbismalIndex::compress_dp() { constexpr auto block_size = 1000000ul; auto s_time = steady_clock::now(); - if (VERBOSE) clog << "[dynamic programming to optimize seed selection]"; + if (VERBOSE) + clog << "[dynamic programming to optimize seed selection]"; // default is no position will be indexed std::fill(begin(keep), end(keep), false); @@ -667,7 +725,8 @@ AbismalIndex::compress_dp() { const size_t block_start = block.first; const uint32_t current_block_size = block.second - block.first; - if (current_block_size < seed::window_size) continue; + if (current_block_size < seed::window_size) + continue; uint32_t hash_two = 0; @@ -675,7 +734,8 @@ AbismalIndex::compress_dp() { auto gi_two = genome_iterator(begin(genome)) + block_start; const auto gi_lim_two = gi_two + min(current_block_size, seed::key_weight - 1); - while (gi_two != gi_lim_two) shift_hash_key(*gi_two++, hash_two); + while (gi_two != gi_lim_two) + shift_hash_key(*gi_two++, hash_two); uint32_t hash_t = 0, hash_a = 0; @@ -757,20 +817,23 @@ AbismalIndex::compress_dp() { } max_candidates = 100u; // GS: this is a heuristic - if (VERBOSE) clog << delta_seconds(s_time) << endl; + if (VERBOSE) + clog << delta_seconds(s_time) << endl; } struct BucketLess { - BucketLess(const Genome &g): g_start(begin(g)) {} + BucketLess(const Genome &g) : g_start(begin(g)) {} - bool operator()(const uint32_t a, const uint32_t b) const { + bool + operator()(const uint32_t a, const uint32_t b) const { auto idx1(g_start + a + seed::key_weight); const auto lim1(g_start + a + seed::n_sorting_positions); auto idx2(g_start + b + seed::key_weight); while (idx1 != lim1) { const uint8_t c1 = get_bit(*idx1++); const uint8_t c2 = get_bit(*idx2++); - if (c1 != c2) return c1 < c2; + if (c1 != c2) + return c1 < c2; } return false; } @@ -778,7 +841,7 @@ struct BucketLess { const genome_iterator g_start; }; -template +template static inline three_letter_t three_letter_num_srt(const uint8_t nt) { // C=T=0, A=1, G=4 @@ -786,17 +849,19 @@ three_letter_num_srt(const uint8_t nt) { return the_conv == c_to_t ? nt & 5 : nt & 10; } -template struct BucketLessThree { - BucketLessThree(const Genome &g): g_start(begin(g)) {} +template struct BucketLessThree { + BucketLessThree(const Genome &g) : g_start(begin(g)) {} - bool operator()(const uint32_t a, const uint32_t b) const { + bool + operator()(const uint32_t a, const uint32_t b) const { auto idx1(g_start + a + seed::key_weight_three); const auto lim1(g_start + a + seed::n_sorting_positions); auto idx2(g_start + b + seed::key_weight_three); while (idx1 != lim1) { const uint8_t c1 = three_letter_num_srt(*idx1++); const uint8_t c2 = three_letter_num_srt(*idx2++); - if (c1 != c2) return c1 < c2; + if (c1 != c2) + return c1 < c2; } return false; } @@ -808,36 +873,42 @@ void AbismalIndex::sort_buckets() { { auto s_time = steady_clock::now(); - if (VERBOSE) clog << "[sorting two-letter buckets]"; + if (VERBOSE) + clog << "[sorting two-letter buckets]"; const auto b = begin(index); const BucketLess bucket_less(genome); #pragma omp parallel for for (size_t i = 0; i < counter_size; ++i) if (counter[i + 1] > counter[i] + 1) sort(b + counter[i], b + counter[i + 1], bucket_less); - if (VERBOSE) clog << delta_seconds(s_time) << endl; + if (VERBOSE) + clog << delta_seconds(s_time) << endl; } { auto s_time = steady_clock::now(); - if (VERBOSE) clog << "[sorting three-letter T buckets]"; + if (VERBOSE) + clog << "[sorting three-letter T buckets]"; const auto b = begin(index_t); const BucketLessThree bucket_less(genome); #pragma omp parallel for for (size_t i = 0; i < counter_size_three; ++i) if (counter_t[i + 1] > counter_t[i] + 1) sort(b + counter_t[i], b + counter_t[i + 1], bucket_less); - if (VERBOSE) clog << delta_seconds(s_time) << endl; + if (VERBOSE) + clog << delta_seconds(s_time) << endl; } { auto s_time = steady_clock::now(); - if (VERBOSE) clog << "[sorting three-letter A buckets]"; + if (VERBOSE) + clog << "[sorting three-letter A buckets]"; const auto b = begin(index_a); const BucketLessThree bucket_less(genome); #pragma omp parallel for for (size_t i = 0; i < counter_size_three; ++i) if (counter_a[i + 1] > counter_a[i] + 1) sort(b + counter_a[i], b + counter_a[i + 1], bucket_less); - if (VERBOSE) clog << delta_seconds(s_time) << endl; + if (VERBOSE) + clog << delta_seconds(s_time) << endl; } } @@ -877,7 +948,8 @@ seed::read(FILE *in) { // n_sorting_positions uint32_t n_sorting_positions_from_file = 0; - if (fread((char *)&n_sorting_positions_from_file, sizeof(uint32_t), 1, in) != 1) + if (fread((char *)&n_sorting_positions_from_file, sizeof(uint32_t), 1, in) != + 1) throw runtime_error(error_msg); if (n_sorting_positions_from_file != n_sorting_positions) { @@ -899,7 +971,8 @@ seed::write(FILE *out) { void AbismalIndex::write(const string &index_file) const { FILE *out = fopen(index_file.c_str(), "wb"); - if (!out) throw runtime_error("cannot open output file " + index_file); + if (!out) + throw runtime_error("cannot open output file " + index_file); write_internal_identifier(out); seed::write(out); @@ -945,7 +1018,8 @@ AbismalIndex::read(const string &index_file) { static const string error_msg("failed loading index file"); FILE *in = fopen(index_file.c_str(), "rb"); - if (!in) throw runtime_error("cannot open input file " + index_file); + if (!in) + throw runtime_error("cannot open input file " + index_file); if (!check_internal_identifier(in)) throw runtime_error("index file format problem: " + index_file); @@ -1033,7 +1107,8 @@ ChromLookup::write(FILE *out) const { void ChromLookup::write(const string &outfile) const { std::ofstream out(outfile, std::ios::binary); - if (!out) throw runtime_error("cannot open output file " + outfile); + if (!out) + throw runtime_error("cannot open output file " + outfile); write(out); } @@ -1100,7 +1175,8 @@ ChromLookup::read(FILE *in) { void ChromLookup::read(const std::string &infile) { std::ifstream in(infile, std::ios::binary); - if (!in) throw runtime_error("cannot open input file " + infile); + if (!in) + throw runtime_error("cannot open input file " + infile); read(in); } @@ -1113,10 +1189,8 @@ string ChromLookup::tostring() const { std::ostringstream iss; for (auto i = 0u; i < names.size(); ++i) - iss << i << '\t' - << names[i] << '\t' - << starts[i] << '\t' - << starts[i + 1] << '\n'; + iss << i << '\t' << names[i] << '\t' << starts[i] << '\t' << starts[i + 1] + << '\n'; return iss.str(); } @@ -1148,7 +1222,8 @@ ChromLookup::get_chrom_idx_and_offset(const uint32_t pos, uint32_t &offset) const { auto idx = upper_bound(cbegin(starts), cend(starts), pos); - if (idx == cbegin(starts)) return false; // read is before any chrom + if (idx == cbegin(starts)) + return false; // read is before any chrom --idx; @@ -1157,16 +1232,18 @@ ChromLookup::get_chrom_idx_and_offset(const uint32_t pos, return (pos + readlen <= starts[chrom_idx + 1]); } -template void +template +void load_genome_impl(const string &genome_file, G &genome, ChromLookup &cl) { const auto file_size = std::filesystem::file_size(genome_file); bamxx::bgzf_file in(genome_file, "r"); - if (!in) throw runtime_error("failed to open genome file: " + genome_file); + if (!in) + throw runtime_error("failed to open genome file: " + genome_file); genome.clear(); // reserve space for padding at both ends - genome.reserve(file_size + 2*seed::padding_size); + genome.reserve(file_size + 2 * seed::padding_size); auto g_ins = std::back_inserter(genome); // pad the start of the concatenated sequence diff --git a/src/AbismalIndex.hpp b/src/AbismalIndex.hpp index 044d67f..87ea324 100644 --- a/src/AbismalIndex.hpp +++ b/src/AbismalIndex.hpp @@ -1,18 +1,18 @@ -/* Copyright (C) 2018-2023 Andrew D. Smith and Guilherme Sena +/* Copyright (C) 2018-2025 Andrew D. Smith and Guilherme Sena * - * Authors: Andrew D. Smith and Guilherme Sena + * Authors: Andrew D. Smith and Guilherme Sena * - * This file is part of ABISMAL. + * This file is part of ABISMAL. * - * ABISMAL is free software: you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by - * the Free Software Foundation, either version 3 of the License, or - * (at your option) any later version. + * ABISMAL is free software: you can redistribute it and/or modify it under + * the terms of the GNU General Public License as published by the Free + * Software Foundation, either version 3 of the License, or (at your option) + * any later version. * - * ABISMAL is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. + * ABISMAL is distributed in the hope that it will be useful, but WITHOUT ANY + * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more + * details. */ #ifndef ABISMAL_INDEX_HPP @@ -279,4 +279,4 @@ get_base_3_hash(T r, uint32_t &k) { } } -#endif +#endif // ABISMAL_INDEX_HPP diff --git a/src/abismal.cpp b/src/abismal.cpp index 1beb0f3..b965ebe 100644 --- a/src/abismal.cpp +++ b/src/abismal.cpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2018-2023 Andrew D. Smith and Guilherme Sena +/* Copyright (C) 2018-2025 Andrew D. Smith and Guilherme Sena * * Authors: Andrew D. Smith and Guilherme Sena * @@ -98,15 +98,25 @@ get_strand_code(const char strand, const conversion_type conv) { struct ReadLoader { ReadLoader(const string &fn) : cur_line{0}, filename{fn}, in{fn, "r"} {} - bool good() const { return in; } + bool + good() const { + return in; + } operator bool() const { return in; } - size_t get_current_read() const { return cur_line / 4; } + size_t + get_current_read() const { + return cur_line / 4; + } - size_t get_current_byte() const { return in.tellg(); } + size_t + get_current_byte() const { + return in.tellg(); + } - void load_reads(vector &names, vector &reads) { + void + load_reads(vector &names, vector &reads) { reads.clear(); names.clear(); @@ -176,39 +186,60 @@ struct se_element { // size = 8 se_element(const score_t d, const flags_t f, const uint32_t p) : diffs(d), flags(f), pos(p) {} - bool operator==(const se_element &rhs) const { + bool + operator==(const se_element &rhs) const { return pos == rhs.pos && flags == rhs.flags; } - bool operator!=(const se_element &rhs) const { + bool + operator!=(const se_element &rhs) const { return pos != rhs.pos || flags != rhs.flags; } // this is used to keep PE candidates sorted in the max heap - bool operator<(const se_element &rhs) const { return diffs < rhs.diffs; } + bool + operator<(const se_element &rhs) const { + return diffs < rhs.diffs; + } - inline bool rc() const { return samflags::check(flags, samflags::read_rc); } + inline bool + rc() const { + return samflags::check(flags, samflags::read_rc); + } - inline bool elem_is_a_rich() const { + inline bool + elem_is_a_rich() const { return samflags::check(flags, bsflags::read_is_a_rich); } - inline bool ambig() const { + inline bool + ambig() const { return samflags::check(flags, samflags::secondary_aln); } - inline void set_ambig() { samflags::set(flags, samflags::secondary_aln); } + inline void + set_ambig() { + samflags::set(flags, samflags::secondary_aln); + } - inline bool empty() const { return pos == 0; } + inline bool + empty() const { + return pos == 0; + } - inline bool sure_ambig() const { return ambig() && diffs == 0; } + inline bool + sure_ambig() const { + return ambig() && diffs == 0; + } - inline void reset() { + inline void + reset() { pos = 0; diffs = MAX_DIFFS; } - inline void reset(const uint32_t readlen) { + inline void + reset(const uint32_t readlen) { reset(); diffs = static_cast(invalid_hit_frac * readlen); } @@ -260,11 +291,18 @@ struct se_candidates { se_candidates() : sz(1), best(se_element()), v(vector(max_size)) {} - inline bool full() const { return sz == max_size; }; + inline bool + full() const { + return sz == max_size; + }; - inline bool has_exact_match() const { return !best.empty(); }; + inline bool + has_exact_match() const { + return !best.empty(); + }; - void update_exact_match(const flags_t s, const uint32_t p) { + void + update_exact_match(const flags_t s, const uint32_t p) { const se_element cand(0, s, p); if (best.empty()) best = cand; // cand has ambig flag set to false @@ -273,17 +311,33 @@ struct se_candidates { best.set_ambig(); } - bool enough_good_hits() const { return full() && good_diff(cutoff); } + bool + enough_good_hits() const { + return full() && good_diff(cutoff); + } - bool good_diff(const score_t d) const { return (d <= good_cutoff); } + bool + good_diff(const score_t d) const { + return (d <= good_cutoff); + } - bool should_do_sensitive() const { return (!full() || !good_diff(cutoff)); } + bool + should_do_sensitive() const { + return (!full() || !good_diff(cutoff)); + } - inline void set_specific() { cutoff = good_cutoff; } + inline void + set_specific() { + cutoff = good_cutoff; + } - inline void set_sensitive() { cutoff = v.front().diffs; } + inline void + set_sensitive() { + cutoff = v.front().diffs; + } - void update_cand(const score_t d, const flags_t s, const uint32_t p) { + void + update_cand(const score_t d, const flags_t s, const uint32_t p) { if (full()) { pop_heap(begin(v), begin(v) + sz); v[sz - 1] = se_element(d, s, p); @@ -294,8 +348,9 @@ struct se_candidates { push_heap(begin(v), begin(v) + sz); } - void update(const bool specific, const score_t d, const flags_t s, - const uint32_t p) { + void + update(const bool specific, const score_t d, const flags_t s, + const uint32_t p) { if (d == 0) update_exact_match(s, p); else @@ -305,7 +360,8 @@ struct se_candidates { cutoff = (specific ? min16(cutoff, v.front().diffs) : v.front().diffs); } - void reset() { + void + reset() { best.reset(); v.front().reset(); @@ -315,7 +371,8 @@ struct se_candidates { sz = 1; } - void reset(const uint32_t readlen) { + void + reset(const uint32_t readlen) { best.reset(readlen); v.front().reset(readlen); cutoff = v.front().diffs; @@ -326,7 +383,8 @@ struct se_candidates { } // in SE reads, we sort to exclude duplicates - void prepare_for_alignments() { + void + prepare_for_alignments() { sort(begin(v), begin(v) + sz, // no sort_heap here as heapify used "diffs" [](const se_element &a, const se_element &b) { return (a.pos < b.pos) || (a.pos == b.pos && a.flags < b.flags); @@ -429,22 +487,28 @@ format_se(const bool allow_ambig, const se_element &res, const ChromLookup &cl, struct pe_element { pe_element() : aln_score(0), r1(se_element()), r2(se_element()) {} - score_t diffs() const { return r1.diffs + r2.diffs; } + score_t + diffs() const { + return r1.diffs + r2.diffs; + } - void reset(const uint32_t readlen1, const uint32_t readlen2) { + void + reset(const uint32_t readlen1, const uint32_t readlen2) { aln_score = 0; r1.reset(readlen1); r2.reset(readlen2); max_aln_score = simple_aln::best_pair_score(readlen1, readlen2); } - void reset() { + void + reset() { aln_score = 0; r1.reset(); r2.reset(); } - bool update(const score_t scr, const se_element &s1, const se_element &s2) { + bool + update(const score_t scr, const se_element &s1, const se_element &s2) { const auto rd = r1.diffs + r2.diffs; const auto sd = s1.diffs + s2.diffs; if (scr > aln_score || (scr == aln_score && sd < rd)) { @@ -463,15 +527,23 @@ struct pe_element { // GS: this is used to decide whether ends should be // mapped as SE independently - inline bool should_report(const bool allow_ambig) const { + inline bool + should_report(const bool allow_ambig) const { return !empty() && (allow_ambig || !ambig()); } - inline bool ambig() const { return r1.ambig(); } + inline bool + ambig() const { + return r1.ambig(); + } - inline bool empty() const { return r1.empty(); } + inline bool + empty() const { + return r1.empty(); + } - inline bool sure_ambig() const { + inline bool + sure_ambig() const { return ambig() && (aln_score == max_aln_score); } @@ -621,7 +693,8 @@ format_pe(const bool allow_ambig, const pe_element &p, const ChromLookup &cl, struct pe_candidates { pe_candidates() : v(vector(max_size_large)) {} - inline void reset(const uint32_t readlen) { + inline void + reset(const uint32_t readlen) { v.front().reset(readlen); sure_ambig = false; cutoff = v.front().diffs; @@ -630,24 +703,44 @@ struct pe_candidates { capacity = max_size_small; } - inline void set_specific() { cutoff = good_cutoff; } + inline void + set_specific() { + cutoff = good_cutoff; + } - inline void set_sensitive() { cutoff = v.front().diffs; } + inline void + set_sensitive() { + cutoff = v.front().diffs; + } - inline bool should_align() { return (sz != max_size_large || cutoff != 0); } + inline bool + should_align() { + return (sz != max_size_large || cutoff != 0); + } - inline bool full() const { return sz == capacity; } + inline bool + full() const { + return sz == capacity; + } - inline bool good_diff(const score_t d) const { return (d <= good_cutoff); } + inline bool + good_diff(const score_t d) const { + return (d <= good_cutoff); + } - inline bool enough_good_hits() const { return full() && good_diff(cutoff); } + inline bool + enough_good_hits() const { + return full() && good_diff(cutoff); + } - inline bool should_do_sensitive() const { + inline bool + should_do_sensitive() const { return (capacity == max_size_small || !good_diff(cutoff)); } - void update(const bool specific, const score_t d, const flags_t s, - const uint32_t p) { + void + update(const bool specific, const score_t d, const flags_t s, + const uint32_t p) { if (full()) { // doubles capacity if heap is filled with good matches if (specific && capacity != max_size_large && good_diff(d)) { @@ -665,7 +758,8 @@ struct pe_candidates { sure_ambig = (full() && cutoff == 0); } - void prepare_for_mating() { + void + prepare_for_mating() { sort( begin(v), begin(v) + sz, // no sort_heap here as heapify used "diffs" [](const se_element &a, const se_element &b) { return a.pos < b.pos; }); @@ -748,8 +842,9 @@ struct se_map_stats { // total_bases. double edit_distance_mean{}; - void update(const bool allow_ambig, const string &read, - const bam_cigar_t &cigar, const se_element s) { + void + update(const bool allow_ambig, const string &read, const bam_cigar_t &cigar, + const se_element s) { ++total_reads; const bool valid = !s.empty(); const bool ambig = s.ambig(); @@ -762,12 +857,14 @@ struct se_map_stats { update_error_rate(s.diffs, cigar); } - void update_error_rate(const score_t diffs, const bam_cigar_t &cigar) { + void + update_error_rate(const score_t diffs, const bam_cigar_t &cigar) { edit_distance += diffs; total_bases += cigar_rseq_ops(cigar); } - void assign_values() { + void + assign_values() { constexpr auto pct = [](const double a, const double b) { return ((b == 0) ? 0.0 : 100.0 * a / b); }; @@ -783,7 +880,8 @@ struct se_map_stats { percent_skipped = pct(skipped_reads, total_reads); } - string tostring(const size_t n_tabs = 0) { + string + tostring(const size_t n_tabs = 0) { static constexpr auto tab = " "; assign_values(); @@ -887,10 +985,10 @@ struct pe_map_stats { se_map_stats end1_stats{}; se_map_stats end2_stats{}; - void update(const bool allow_ambig, const string &reads1, - const string &reads2, const bam_cigar_t &cig1, - const bam_cigar_t &cig2, const pe_element &p, const se_element s1, - const se_element s2) { + void + update(const bool allow_ambig, const string &reads1, const string &reads2, + const bam_cigar_t &cig1, const bam_cigar_t &cig2, const pe_element &p, + const se_element s1, const se_element s2) { const bool valid = !p.empty(); const bool ambig = p.ambig(); total_read_pairs++; @@ -908,13 +1006,15 @@ struct pe_map_stats { } } - void update_error_rate(const score_t d1, const score_t d2, - const bam_cigar_t &cig1, const bam_cigar_t &cig2) { + void + update_error_rate(const score_t d1, const score_t d2, const bam_cigar_t &cig1, + const bam_cigar_t &cig2) { edit_distance_pairs += d1 + d2; total_bases_pairs += cigar_rseq_ops(cig1) + cigar_rseq_ops(cig2); } - void assign_values() { + void + assign_values() { constexpr auto pct = [](const double a, const double b) { return ((b == 0) ? 0.0 : 100.0 * a / b); }; @@ -932,7 +1032,8 @@ struct pe_map_stats { percent_skipped_pairs = pct(read_pairs_skipped, total_read_pairs_tmp); } - string tostring(const bool allow_ambig) { + string + tostring(const bool allow_ambig) { static string t = " "; assign_values(); @@ -1050,7 +1151,8 @@ check_hits(const uint32_t offset, const PackedRead::const_iterator read_st, struct compare_bases { compare_bases(const genome_iterator g_) : g(g_) {} - bool operator()(const uint32_t mid, const two_letter_t chr) const { + bool + operator()(const uint32_t mid, const two_letter_t chr) const { return get_bit(*(g + mid)) < chr; } @@ -1100,7 +1202,8 @@ get_three_letter_num_fast(const uint8_t nt) { template struct compare_bases_three { compare_bases_three(const genome_iterator g_) : g(g_) {} - bool operator()(const uint32_t mid, const three_letter_t chr) const { + bool + operator()(const uint32_t mid, const three_letter_t chr) const { return get_three_letter_num_fast(*(g + mid)) < chr; } diff --git a/src/abismal_cigar_utils.hpp b/src/abismal_cigar_utils.hpp index aa8e03f..927ffa3 100644 --- a/src/abismal_cigar_utils.hpp +++ b/src/abismal_cigar_utils.hpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2023 Andrew D. Smith and Guil Sena +/* Copyright (C) 2023-2025 Andrew D. Smith and Guil Sena * * Authors: Andrew D. Smith and Guil Sena * @@ -37,17 +37,19 @@ static const uint32_t ABISMAL_BAM_CIGAR_SHIFT = 4; inline uint32_t abismal_bam_cigar_gen(const uint32_t l, const int8_t o) { // ADS: "o" can have -1 for invalid cigar ops - return (l << ABISMAL_BAM_CIGAR_SHIFT| static_cast(o)); + return (l << ABISMAL_BAM_CIGAR_SHIFT | static_cast(o)); } static const uint8_t ABISMAL_BAM_CIGAR_MASK = 0xf; inline uint8_t -abismal_bam_cigar_op(const uint32_t c) { return c & ABISMAL_BAM_CIGAR_MASK; } +abismal_bam_cigar_op(const uint32_t c) { + return c & ABISMAL_BAM_CIGAR_MASK; +} inline uint8_t abismal_bam_cigar_oplen(const uint32_t c) { return c >> ABISMAL_BAM_CIGAR_SHIFT; } -#endif +#endif // ABISMAL_CIGAR_UTILS_HPP diff --git a/src/abismal_main.cpp b/src/abismal_main.cpp index 8368bbd..29e32eb 100644 --- a/src/abismal_main.cpp +++ b/src/abismal_main.cpp @@ -1,5 +1,23 @@ +/* Copyright (C) 2018-2023 Andrew D. Smith and Guilherme Sena + * + * Authors: Andrew D. Smith and Guilherme Sena + * + * This file is part of ABISMAL. + * + * ABISMAL is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ABISMAL is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + #include "abismal.hpp" -int main(int argc, const char **argv) { +int +main(int argc, const char **argv) { return abismal(argc, argv); } diff --git a/src/abismalidx.hpp b/src/abismalidx.hpp index cb91b51..94252ed 100644 --- a/src/abismalidx.hpp +++ b/src/abismalidx.hpp @@ -1,4 +1,24 @@ -#ifndef _ABISMALIDX_HPP -#define _ABISMALIDX_HPP -int abismalidx(int argc, const char **argv); -#endif +/* Copyright (C) 2018-2025 Andrew D. Smith and Guilherme Sena + * + * Authors: Andrew D. Smith and Guilherme Sena + * + * This file is part of ABISMAL. + * + * ABISMAL is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ABISMAL is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef ABISMALIDX_HPP +#define ABISMALIDX_HPP + +int +abismalidx(int argc, const char **argv); + +#endif // ABISMALIDX_HPP diff --git a/src/abismalidx_main.cpp b/src/abismalidx_main.cpp index 23ac6ff..ab8db30 100644 --- a/src/abismalidx_main.cpp +++ b/src/abismalidx_main.cpp @@ -1,5 +1,23 @@ +/* Copyright (C) 2018-2025 Andrew D. Smith and Guilherme Sena + * + * Authors: Andrew D. Smith and Guilherme Sena + * + * This file is part of ABISMAL. + * + * ABISMAL is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ABISMAL is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + #include "abismalidx.hpp" -int main(int argc, const char **argv) { +int +main(int argc, const char **argv) { return abismalidx(argc, argv); } diff --git a/src/dna_four_bit_bisulfite.hpp b/src/dna_four_bit_bisulfite.hpp index f9fc3e1..4e8d043 100644 --- a/src/dna_four_bit_bisulfite.hpp +++ b/src/dna_four_bit_bisulfite.hpp @@ -1,4 +1,4 @@ -/* Copyright (C) 2018-2020 Andrew D. Smith +/* Copyright (C) 2018-2025 Andrew D. Smith * * Authors: Andrew D. Smith * @@ -15,10 +15,11 @@ * General Public License for more details. */ -#ifndef _DNA_FOUR_BIT_BISULFITE -#define _DNA_FOUR_BIT_BISULFITE -#include // for the int8_t and friends +#ifndef DNA_FOUR_BIT_BISULFITE_HPP +#define DNA_FOUR_BIT_BISULFITE_HPP +#include // for the int8_t and friends +// clang-format off /* encoding of ASCII characters into T-rich bases, used * in encoding reads. * A: 0001 = 1 @@ -67,4 +68,7 @@ static const uint8_t encode_base_a_rich[256] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; -#endif + +// clang-format on + +#endif // DNA_FOUR_BIT_BISULFITE_HPP diff --git a/src/simreads.cpp b/src/simreads.cpp index a743edc..ac8f0bf 100644 --- a/src/simreads.cpp +++ b/src/simreads.cpp @@ -1,5 +1,4 @@ -/* - * Copyright (C) 2018-2024 Andrew D. Smith +/* Copyright (C) 2018-2025 Andrew D. Smith * * Author: Andrew D. Smith * @@ -119,8 +118,12 @@ format_fasta_record(const string &name, const string &read) { } struct FragInfo { - void set_sequential_name() { name = "read" + to_string(frag_count++); } - string read1() const { + void + set_sequential_name() { + name = "read" + to_string(frag_count++); + } + string + read1() const { assert(!name.empty()); string read = seq.substr(0, read_length); for (size_t i = 0; i < read_length - size(read); ++i) @@ -128,7 +131,8 @@ struct FragInfo { return fasta_format ? format_fasta_record(name + ".1", read) : format_fastq_record(name + ".1", read); } - string read2() const { + string + read2() const { assert(!name.empty()); string read(seq); revcomp_inplace(read); @@ -138,7 +142,8 @@ struct FragInfo { return fasta_format ? format_fasta_record(name + ".2", read) : format_fastq_record(name + ".2", read); } - void erase_info_through_insert() { + void + erase_info_through_insert() { const size_t orig_ref_len = end_pos - start_pos; if (2 * read_length < size(seq)) { string cigar2(cigar); @@ -152,11 +157,13 @@ struct FragInfo { seq.substr(size(seq) - read_length, read_length); } } - void remove_cigar_match_symbols() { + void + remove_cigar_match_symbols() { replace(begin(cigar), end(cigar), '=', 'M'); merge_equal_neighbor_cigar_ops(cigar); } - void bisulfite_conversion(const bool random_pbat, const double bs_conv) { + void + bisulfite_conversion(const bool random_pbat, const double bs_conv) { if (pbat || (random_pbat && simreads_random::rand_double() < 0.5)) { for (auto it(begin(seq)); it != end(seq); ++it) { if (*it == 'G' && (simreads_random::rand_double() < bs_conv)) @@ -171,7 +178,10 @@ struct FragInfo { } } - bool rc() const { return strand == '-'; } + bool + rc() const { + return strand == '-'; + } string chrom; size_t start_pos{}; @@ -290,7 +300,8 @@ struct FragSampler { const bool require_valid) : genome(g), cl(c), strand_code(sc), min_length(milen), max_length(malen), require_valid(require_valid) {} - void sample_fragment(FragInfo &the_info) const { + void + sample_fragment(FragInfo &the_info) const { const size_t frag_len = sim_frag_length(min_length, max_length); sim_frag_position(genome, frag_len, the_info.seq, the_info.start_pos, require_valid); @@ -308,12 +319,17 @@ struct FragSampler { revcomp_inplace(the_info.seq); the_info.cigar = to_string(frag_len) + "M"; // default, no muts } - char sim_strand() const { + char + sim_strand() const { switch (strand_code) { - case 'f': return '+'; - case 'r': return '-'; - case 'b': return (simreads_random::rand() & 1) ? '+' : '-'; - default: std::abort(); + case 'f': + return '+'; + case 'r': + return '-'; + case 'b': + return (simreads_random::rand() & 1) ? '+' : '-'; + default: + std::abort(); } } const string &genome; @@ -337,7 +353,8 @@ struct FragMutator { insertion_rate += substitution_rate; deletion_rate += insertion_rate; } - void mutate(FragInfo &the_info) const { + void + mutate(FragInfo &the_info) const { string seq, cigar; size_t i = 0; the_info.score = 0; @@ -370,7 +387,8 @@ struct FragMutator { compress_cigar(cbegin(cigar), cend(cigar), the_info.cigar); swap(seq, the_info.seq); } - char sample_mutation() const { + char + sample_mutation() const { const double x = simreads_random::rand_double(); if (x > mutation_rate) return '='; @@ -384,7 +402,8 @@ struct FragMutator { return 'D'; } } - string tostring() const { + string + tostring() const { ostringstream oss; oss << "mutation_rate=" << mutation_rate << endl << "substitution_rate=" << substitution_rate << endl diff --git a/src/simreads.hpp b/src/simreads.hpp index 1703d2f..0fc88be 100644 --- a/src/simreads.hpp +++ b/src/simreads.hpp @@ -1,4 +1,24 @@ -#ifndef _SIMREADS_HPP -#define _SIMREADS_HPP -int simreads(int argc, const char **argv); -#endif +/* Copyright (C) 2018-2025 Andrew D. Smith + * + * Author: Andrew D. Smith + * + * This file is part of ABISMAL. + * + * ABISMAL is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ABISMAL is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef SIMREADS_HPP +#define SIMREADS_HPP + +int +simreads(int argc, const char **argv); + +#endif // SIMREADS_HPP diff --git a/src/simreads_main.cpp b/src/simreads_main.cpp index 7246fb1..76f278b 100644 --- a/src/simreads_main.cpp +++ b/src/simreads_main.cpp @@ -1,5 +1,23 @@ +/* Copyright (C) 2018-2025 Andrew D. Smith + * + * Author: Andrew D. Smith + * + * This file is part of ABISMAL. + * + * ABISMAL is free software: you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * ABISMAL is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + #include "simreads.hpp" -int main(int argc, const char **argv) { +int +main(int argc, const char **argv) { return simreads(argc, argv); }