From 45bab7fb28cb4463e34b1ca54d385a76f5651ed6 Mon Sep 17 00:00:00 2001 From: sebastiandeorowicz Date: Wed, 9 Oct 2024 12:46:26 +0200 Subject: [PATCH] Feature/better compression (#5) Better internal compression of examined sequences for lower RAM usage --- README.md | 1 - src/defs.h | 4 +- src/lz_matcher.cpp | 6 +-- src/lz_matcher.h | 3 +- src/params.h | 7 +++ src/seq_reservoir.cpp | 42 +++++++++++++++ src/seq_reservoir.h | 121 +++++++++++++++++++++++++++++++++++------- 7 files changed, 158 insertions(+), 26 deletions(-) diff --git a/README.md b/README.md index 6fbd88d..949c51f 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,5 @@ # LZ-ANI -![version](https://img.shields.io/badge/version-1.1.0-blue.svg) [![GitHub Actions CI](../../workflows/GitHub%20Actions%20CI/badge.svg)](../../actions/workflows/main.yml) [![License: GPL v3](https://img.shields.io/badge/License-GPLv3-blue.svg)](https://www.gnu.org/licenses/gpl-3.0) diff --git a/src/defs.h b/src/defs.h index 6615817..c555f52 100644 --- a/src/defs.h +++ b/src/defs.h @@ -15,8 +15,8 @@ #include #include "params.h" -const std::string LZ_ANI_VER = "lz-ani 1.1.1"; -const std::string LZ_ANI_DATE = "2024-09-12"; +const std::string LZ_ANI_VER = "lz-ani 1.2.0"; +const std::string LZ_ANI_DATE = "2024-10-09"; const std::string LZ_ANI_AUTHORS = "Sebastian Deorowicz, Adam Gudys"; const std::string LZ_ANI_INFO = LZ_ANI_VER + " (" + LZ_ANI_DATE + ") by " + LZ_ANI_AUTHORS; diff --git a/src/lz_matcher.cpp b/src/lz_matcher.cpp index 735048d..d4974db 100644 --- a/src/lz_matcher.cpp +++ b/src/lz_matcher.cpp @@ -187,7 +187,7 @@ void CLZMatcher::do_matching() // Prepare reference auto sr_iter = seq_reservoir.get_sequence(local_task_no); - parser.prepare_reference(seq_view(sr_iter->data, sr_iter->len), sr_iter->no_parts); + parser.prepare_reference(seq_view(sr_iter->data, sr_iter->len, params.internal_packing), sr_iter->no_parts); uint64_t to_add = filter.is_empty() ? local_task_no : (uint64_t)filter.get_row(local_task_no).size(); auto to_print = global_no_pairs.fetch_add(to_add); @@ -204,7 +204,7 @@ void CLZMatcher::do_matching() continue; auto sr_iter = seq_reservoir.get_sequence(id); - parser.prepare_data(seq_view(sr_iter->data, sr_iter->len), sr_iter->no_parts); + parser.prepare_data(seq_view(sr_iter->data, sr_iter->len, params.internal_packing), sr_iter->no_parts); parser.parse(); @@ -221,7 +221,7 @@ void CLZMatcher::do_matching() for (auto& id : filter.get_row(local_task_no)) { auto sr_iter = seq_reservoir.get_sequence(id); - parser.prepare_data(seq_view(sr_iter->data, sr_iter->len), sr_iter->no_parts); + parser.prepare_data(seq_view(sr_iter->data, sr_iter->len, params.internal_packing), sr_iter->no_parts); parser.parse(); diff --git a/src/lz_matcher.h b/src/lz_matcher.h index 0115c7f..71ed3f4 100644 --- a/src/lz_matcher.h +++ b/src/lz_matcher.h @@ -51,7 +51,8 @@ class CLZMatcher public: CLZMatcher(CParams& params) : - params(params) + params(params), + seq_reservoir(params.internal_packing) { if (!params.output_alignment_file_name.empty()) { diff --git a/src/params.h b/src/params.h index 11666bb..02baec6 100644 --- a/src/params.h +++ b/src/params.h @@ -21,6 +21,9 @@ using namespace std; +enum class internal_packing_t { none, two_in_byte, three_in_byte }; +enum class alphabet_t { dna, aminoacid }; + enum class output_type_t {single_txt, two_tsv}; //enum class output_component_t {seq_id1,seq_id2,seq_idx1,seq_idx2,len1,len2,total_ani,global_ani,local_ani,cov,len_ratio,nt_match,nt_mismatch,no_reg}; enum class output_component_t {query,reference,qidx,ridx,qlen,rlen,tani,gani,ani,qcov,rcov,len_ratio,nt_match,nt_mismatch,num_alns}; @@ -44,6 +47,10 @@ struct CParams double filter_thr = 0.0; bool output_in_percent = false; + alphabet_t alphabet = alphabet_t::dna; + internal_packing_t internal_packing = internal_packing_t::three_in_byte; +// internal_packing_t internal_packing = internal_packing_t::two_in_byte; + output_type_t output_type = output_type_t::two_tsv; vector input_file_names; diff --git a/src/seq_reservoir.cpp b/src/seq_reservoir.cpp index 94da0bf..0259d0b 100644 --- a/src/seq_reservoir.cpp +++ b/src/seq_reservoir.cpp @@ -16,6 +16,47 @@ // **************************************************************************** void CSeqReservoir::append(const string& name, const string& seq) { + uint8_t* ptr_seq = nullptr; + + switch (internal_packing) + { + case internal_packing_t::none: + ptr_seq = (uint8_t*)mma_seq.allocate(seq.length()); + + for (size_t i = 0; i < seq.length(); ++i) + ptr_seq[i] = dna_code[seq[i]]; + break; + case internal_packing_t::two_in_byte: + ptr_seq = (uint8_t*)mma_seq.allocate((seq.length() + 1) / 2); + + for (size_t i = 0; i < seq.length() / 2; ++i) + ptr_seq[i] = (uint8_t)((dna_code[seq[2 * i]] << 4) + dna_code[seq[2 * i + 1]]); + + if (seq.length() & 1) + ptr_seq[seq.length() / 2] = (uint8_t)(dna_code[seq[seq.length() - 1]] << 4); + break; + case internal_packing_t::three_in_byte: + ptr_seq = (uint8_t*)mma_seq.allocate((seq.length() + 2) / 3); + + for (size_t i = 0; i < seq.length() / 3; ++i) + ptr_seq[i] = (uint8_t)(36 * dna_code[seq[3 * i]] + 6 * dna_code[seq[3 * i + 1]] + dna_code[seq[3 * i + 2]]); + + switch (seq.length() % 3) + { + case 2: + ptr_seq[seq.length() / 3] = (uint8_t)(36 * dna_code[seq[seq.length() - 2]] + 6 * dna_code[seq.back()]); + break; + case 1: + ptr_seq[seq.length() / 3] = (uint8_t)(36 * dna_code[seq.back()]); + break; + case 0: + // Nothing + break; + } + break; + } + +/* #ifdef USE_PACKED_SEQS uint8_t* ptr_seq = (uint8_t*) mma_seq.allocate((seq.length() + 1) / 2); @@ -30,6 +71,7 @@ void CSeqReservoir::append(const string& name, const string& seq) for (size_t i = 0; i < seq.length(); ++i) ptr_seq[i] = dna_code[seq[i]]; #endif +*/ auto p = find(name.begin(), name.end(), ' '); diff --git a/src/seq_reservoir.h b/src/seq_reservoir.h index ea994fa..2f88c01 100644 --- a/src/seq_reservoir.h +++ b/src/seq_reservoir.h @@ -25,30 +25,57 @@ using namespace std; -#define USE_PACKED_SEQS +// #define USE_PACKED_SEQS class seq_view { const uint8_t* data; - uint32_t len; + const uint32_t len; + const internal_packing_t internal_packing; public: - seq_view(const uint8_t *data = 0, const uint32_t len = 0) : - data(data), len(len) + seq_view(const uint8_t *data = 0, const uint32_t len = 0, internal_packing_t internal_packing = internal_packing_t::none) : + data(data), + len(len), + internal_packing(internal_packing) {} - void assign(uint8_t* _data, uint32_t _len) +/* void assign(uint8_t* _data, uint32_t _len, internal_packing_t _internal_packing = internal_packing_t::none) { data = _data; - len = len; - } + len = _len; + internal_packing = _internal_packing; + }*/ uint8_t operator[](const uint32_t pos) const { - if (pos & 1) - return data[pos / 2] & 0xf; - else - return data[pos / 2] >> 4; + switch (internal_packing) + { + case internal_packing_t::none: + return data[pos]; + break; + case internal_packing_t::two_in_byte: + if (pos & 1) + return data[pos / 2] & 0xf; + else + return data[pos / 2] >> 4; + break; + case internal_packing_t::three_in_byte: + uint32_t pos_div_3 = pos / 3; + + switch (pos - 3 * pos_div_3) + { + case 0: + return data[pos_div_3] / 36; + case 1: + return (data[pos_div_3] / 6) % 6; + case 2: + return data[pos_div_3] % 6; + } + break; + } + + return 0; // Never should be here } uint32_t size() const @@ -56,7 +83,7 @@ class seq_view return len; } - static void pack(uint8_t* dest, uint8_t* src, uint32_t len) +/* static void pack2(uint8_t* dest, uint8_t* src, uint32_t len) { for (uint32_t i = 0; i < len / 2; ++i) dest[i] = (src[2 * i] << 4) + src[2 * i + 1]; @@ -65,7 +92,7 @@ class seq_view dest[len / 2] = src[len - 1] << 4; } - static void unpack(uint8_t * dest, uint8_t * src, uint32_t len) + static void unpack2(uint8_t* dest, uint8_t* src, uint32_t len) { for (uint32_t i = 0; i < len / 2; ++i) { @@ -76,6 +103,50 @@ class seq_view if (len & 1) dest[len - 1] = src[len / 2] >> 4; } + + static void pack3(uint8_t* dest, uint8_t* src, uint32_t len) + { + for (uint32_t i = 0; i < len / 3; ++i) + dest[i] = 36 * src[3 * i] + 6 * src[3 * i + 1] + src[3 * i + 2]; + + switch (len % 3) + { + case 2: + dest[len / 3] = 6 * src[len - 2] + src[len - 1]; + break; + case 1: + dest[len / 3] = src[len - 1]; + break; + case 0: + // Nothing + break; + } + } + + static void unpack3(uint8_t * dest, uint8_t * src, uint32_t len) + { + uint32_t len_div_3 = len / 3; + + for (uint32_t i = 0; i < len_div_3; ++i) + { + dest[3 * i] = src[i] / 36; + dest[3 * i + 1] = src[i] / 6 - 6 * dest[3 * i]; + dest[3 * i + 2] = src[i] - 36 * dest[3 * i] - 6 * dest[3 * i + 1]; + } + + switch (len % 3) + { + case 2: + dest[len - 2] = src[len_div_3] / 6; + dest[len - 1] = src[len_div_3] - 6 * dest[len - 2]; + break; + case 1: + dest[len - 1] = src[len_div_3]; + case 0: + // Nothing + break; + } + }*/ }; class CSeqReservoir @@ -116,21 +187,33 @@ class CSeqReservoir refresh::memory_monotonic_unsafe mma_seq; refresh::memory_monotonic_unsafe mma_name; + internal_packing_t internal_packing = internal_packing_t::none; + alphabet_t alphabet = alphabet_t::dna; + array dna_code; void append(const string& name, const string& seq); public: - CSeqReservoir() : + CSeqReservoir(internal_packing_t internal_packing = internal_packing_t::none /*, alphabet_t alphabet = alphabet_t::dna*/) : mma_seq(32 << 20, 16), mma_name(1 << 20, 16), + internal_packing(internal_packing), +// alphabet(alphabet), dna_code{} { - fill(dna_code.begin(), dna_code.end(), code_N_seq); - dna_code['A'] = dna_code['a'] = code_A; - dna_code['C'] = dna_code['c'] = code_C; - dna_code['G'] = dna_code['g'] = code_G; - dna_code['T'] = dna_code['t'] = code_T; + if (alphabet == alphabet_t::dna) + { + fill(dna_code.begin(), dna_code.end(), code_N_seq); + dna_code['A'] = dna_code['a'] = code_A; + dna_code['C'] = dna_code['c'] = code_C; + dna_code['G'] = dna_code['g'] = code_G; + dna_code['T'] = dna_code['t'] = code_T; + } + else if (alphabet == alphabet_t::aminoacid) + { + assert(0); + } } bool load_fasta(const vector& fasta_files, uint32_t sep_len, uint32_t verbosity_level);