From 8a0b48dcfded0a0888ea8a78c980be8608dec4c7 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 20 Mar 2023 13:21:40 -0400 Subject: [PATCH 01/21] init rgfa commit --- src/gfa.cpp | 339 ++++++++++++++++++++++++++++++++++ src/gfa.hpp | 42 +++++ src/subcommand/paths_main.cpp | 90 ++++++--- 3 files changed, 448 insertions(+), 23 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 435fd29dcd3..0a21908983b 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -5,6 +5,8 @@ #include +#define debug + namespace vg { using namespace std; @@ -247,4 +249,341 @@ void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle_t path out << "\n"; } +int rgfa_rank(const string& path_name) { + int rank = -1; + size_t pos = path_name.rfind(":SR:i:"); + if (pos != string::npos && path_name.length() - pos >= 6) { + pos += 6; + size_t pos2 = path_name.find(":", pos); + size_t len = pos2 == string::npos ? pos2 : pos2 - pos; + string rank_string = path_name.substr(pos, len); + rank = parse(rank_string); + } + return rank; +} + +string set_rgfa_rank(const string& path_name, int rgfa_rank) { + string new_name; + // check if we have an existing rank. if so, we srap it. + size_t pos = path_name.rfind(":SR:i:"); + if (pos != string::npos && path_name.length() - pos >= 6) { + size_t pos2 = path_name.find(":", pos + 6); + new_name = path_name.substr(0, pos); + if (pos2 != string::npos) { + new_name += path_name.substr(pos2); + } + } else { + new_name = path_name; + } + + // now append the rank + new_name += ":SR:i:" + std::to_string(rgfa_rank); + return new_name; +} + +void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, + SnarlManager* snarl_manager, + const unordered_set& reference_paths, + int64_t minimum_length, + const unordered_map>>& preferred_intervals){ + + // we use the path traversal finder for everything + // (even gbwt haplotypes, as we're using the path handle interface) + PathTraversalFinder path_trav_finder(*graph, *snarl_manager); + + // we collect the rgfa cover in parallel as a list of path fragments + size_t thread_count = get_thread_count(); + vector>>> thread_covers(thread_count); + + // we process top-level snarl_manager in parallel + snarl_manager->for_each_top_level_snarl_parallel([&](const Snarl* snarl) { + // index nodes in rgfa cover to prevent overlaps + unordered_set cover_nodes; + // per-thread output + vector>>& cover_fragments = thread_covers.at(omp_get_thread_num()); + + // we keep track of ranks. rgfa ranks are relative to the reference path, not the snarl tree + // so we begin at -1 which means unknown. it will turn to 0 at first reference anchored snarl found + // -1 rank snarls cannot be covered by this algorithm + vector> queue; // rank,snarl + + queue.push_back(make_pair(-1, snarl)); + + while(!queue.empty()) { + pair rank_snarl = queue.back(); + queue.pop_back(); + + // get the snarl cover, writing to cover_nodes and cover_fragments + // note that we are single-threaded per top-level snarl, at least for now + // this is because parent snarls and child snarls can potentially cover the + // sname things + int64_t rgfa_rank = rgfa_snarl_cover(graph, + *snarl, + path_trav_finder, + reference_paths, + minimum_length, + -1, + cover_nodes, + cover_fragments, + preferred_intervals); + + // we don't even attempt to cover rank -1 snarls, instead just + // recurse until we find a reference path. + // this does put a reference path / snarl decomposition alignment + // requirement on this code + int64_t child_rank = rgfa_rank < 0 ? 
rgfa_rank : rgfa_rank + 1; + + // recurse on the children + const vector& children = snarl_manager->children_of(snarl); + for (const Snarl* child_snarl : children) { + queue.push_back(make_pair(child_rank, child_snarl)); + } + } + }); + + // merge up the thread covers + vector>> rgfa_fragments = std::move(thread_covers.at(0)); + for (size_t t = 1; t < thread_count; ++t) { + rgfa_fragments.reserve(rgfa_fragments.size() + thread_covers.at(t).size()); + std::move(thread_covers.at(t).begin(), thread_covers.at(t).end(), std::back_inserter(rgfa_fragments)); + } + thread_covers.clear(); + + // we don't have a path position interface, and even if we did we probably wouldn't have it on every path + // so to keep running time linear, we need to index the fragments so their offsets can be computed in one scan + // begin by sorting by path + unordered_map> path_to_fragments; + for (size_t i = 0; i get_path_handle_of_step(rgfa_fragment.second.front()); + path_to_fragments[path_handle].push_back(i); + } + + for (const auto& path_fragments : path_to_fragments) { + const path_handle_t& path_handle = path_fragments.first; + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(graph->get_path_name(path_handle), path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + const vector& fragments = path_fragments.second; + + // for each path, start by finding the positional offset of all relevant steps in the path by brute-force scan + unordered_map step_to_pos; + for (const int64_t& frag_idx : fragments) { + const vector& rgfa_fragment = rgfa_fragments.at(frag_idx).second; + step_to_pos[rgfa_fragment.front()] = -1; + // todo: need to figure out where exactly we handle all the different orientation cases + if (rgfa_fragment.size() > 1) { + step_to_pos[rgfa_fragment.back()] = -1; + } + } + size_t pos_count = 0; + size_t pos = 0; + graph->for_each_step_in_path(path_handle, [&](const step_handle_t& step_handle) { + if (step_to_pos.count(step_handle)) { + step_to_pos[step_handle] = pos; + ++pos_count; + if (pos_count == step_to_pos.size()) { + return false; + } + } + handle_t handle = graph->get_handle_of_step(step_handle); + pos += graph->get_length(handle); + return true; + }); + assert(pos_count == step_to_pos.size()); + + // second pass to make the path fragments, now that we know the positional offsets of their endpoints + for (const int64_t frag_idx : fragments) { + int64_t rgfa_rank = rgfa_fragments.at(frag_idx).first; + const vector& rgfa_fragment = rgfa_fragments.at(frag_idx).second; + + size_t rgfa_frag_pos = step_to_pos[rgfa_fragment.front()]; + size_t rgfa_frag_length = 0; + for (const step_handle_t& step : rgfa_fragment) { + rgfa_frag_length += graph->get_length(graph->get_handle_of_step(step)); + } + subrange_t rgfa_frag_subrange; + rgfa_frag_subrange.first = rgfa_frag_pos + (path_subrange != PathMetadata::NO_SUBRANGE ? path_subrange.first : 0); + rgfa_frag_subrange.second = rgfa_frag_subrange.first + rgfa_frag_length; + + string rgfa_frag_name = PathMetadata::create_path_name(path_sense, path_sample, path_locus, path_haplotype, + path_phase_block, rgfa_frag_subrange); + + rgfa_frag_name = set_rgfa_rank(rgfa_frag_name, rgfa_rank); + +#ifdef debug +#pragma omp critical(cerr) + cerr << "making new rgfa fragment with name " << rgfa_frag_name << " and " << rgfa_fragment.size() << " steps. 
subrange " + << rgfa_frag_subrange.first << "," << rgfa_frag_subrange.second << endl; +#endif + path_handle_t rgfa_fragment_handle = graph->create_path_handle(rgfa_frag_name); + for (const step_handle_t& step : rgfa_fragment) { + graph->append_step(rgfa_fragment_handle, graph->get_handle_of_step(step)); + } + } + } +} + +int64_t rgfa_snarl_cover(const PathHandleGraph* graph, + const Snarl& snarl, + PathTraversalFinder& path_trav_finder, + const unordered_set& reference_paths, + int64_t minimum_length, + int64_t rgfa_rank, + unordered_set& cover_nodes, + vector>>& cover_fragments, + const unordered_map>>& preferred_intervals) { + +#ifdef debug +#pragma omp critical(cerr) + cerr << "calling rgfa_snarl_cover with rank=" << rgfa_rank << " on " << pb2json(snarl) << endl; +#endif + + // // start by finding the path traversals through the snarl + vector> travs; + { + pair, vector > > path_travs = path_trav_finder.find_path_traversals(snarl); + travs.reserve(path_travs.first.size()); + + // reduce protobuf usage by going back to vector of steps instead of keeping SnarlTraversals around + for (int64_t i = 0; i < path_travs.first.size(); ++i) { + bool reversed = false; + if (graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].first)) != snarl.start().backward()) { + reversed == true; + } + assert((graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].second)) != snarl.end().backward()) == reversed); + vector trav; + trav.reserve(path_travs.first[i].visit_size()); + bool done = false; + function visit_next_step = [&graph,&reversed](step_handle_t step_handle) { + return reversed ? graph->get_previous_step(step_handle) : graph->get_next_step(step_handle); + }; + for (step_handle_t step_handle = path_travs.second[i].first; !done; step_handle = visit_next_step(step_handle)) { + trav.push_back(step_handle); + if (step_handle == path_travs.second[i].second) { + done = true; + } + } + travs.push_back(trav); + } + } + + // find all reference paths through the snarl + map ref_paths; + for (int64_t i = 0; i < travs.size(); ++i) { + path_handle_t trav_path = graph->get_path_handle_of_step(travs[i].front()); + if (reference_paths.count(trav_path)) { + ref_paths[graph->get_path_name(trav_path)] = i; + } + } + + if (ref_paths.empty() && rgfa_rank <= 0) { + // we're not nested in a reference snarl, and we have no reference path + // by the current logic, there's nothing to be done. + cerr << "[rgfa] warning: No referene path through snarl " + << pb2json(snarl) << ": unable to process for rGFA cover" << endl; + return -1; + } + + if (ref_paths.size() > 1) { + // And we could probably cope with this... 
but don't for now + cerr << "[rgfa] error: Mutiple reference path traversals found through snarl " + << pb2json(snarl) << endl; + } + + if (!ref_paths.empty()) { + // reference paths are trivially covered outside the snarl decomposition + // all we do here is make sure the relevant nodes are flagged in the map + vector& ref_trav = travs.at(ref_paths.begin()->second); + for (step_handle_t ref_step_handle : ref_trav) { + cover_nodes.insert(graph->get_id(graph->get_handle_of_step(ref_step_handle))); + } + // this is the rank going forward for all the coers we add + // (note: we're not adding rank-0 intervals in this function -- that's done in a separate pass above) + rgfa_rank = 1; + } + +#ifdef debug +#pragma omp critical(cerr) + cerr << "found " << travs.size() << " traversals including " << ref_paths.size() << " reference traversals" << endl; +#endif + + + // find all intervals within a snarl traversal that are completely uncovered. + // the returned intervals are open-ended. + function>(const vector&)> get_uncovered_intervals = [&](const vector& trav) { + vector> intervals; + int64_t start = -1; + for (size_t i = 0; i < trav.size(); ++i) { + bool covered = cover_nodes.count(graph->get_id(graph->get_handle_of_step(trav[i]))); + if (covered) { + if (start != -1) { + intervals.push_back(make_pair(start, i)); + } + start = -1; + } else { + if (start == -1) { + start = i; + } + } + } + if (start != -1) { + intervals.push_back(make_pair(start, trav.size())); + } + return intervals; + }; + + // now we try to find candidate rgfa intervals in the other traversals + // there's lots of room here for smarter heuristics, but right now we just + // do first come first served. + for (int64_t trav_idx = 0; trav_idx < travs.size(); ++trav_idx) { + // todo: this map seems backwards? note really a big deal since + // we're only allowing one element + bool is_ref = false; + for (const auto& ref_trav : ref_paths) { + if (ref_trav.second == trav_idx) { + is_ref = true; + break; + } + } + if (is_ref) { + continue; + } + const vector& trav = travs.at(trav_idx); + vector> uncovered_intervals = get_uncovered_intervals(trav); + +#ifdef debug +#pragma omp critical(cerr) + cerr << "found " << uncovered_intervals.size() << "uncovered intervals in traversal " << trav_idx << endl; +#endif + + for (const auto& uncovered_interval : uncovered_intervals) { + int64_t interval_length = 0; + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + interval_length += graph->get_length(graph->get_handle_of_step(trav[i])); + } + if (interval_length >= minimum_length) { + // update the cover + vector interval; + interval.reserve(uncovered_interval.second - uncovered_interval.first); + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + interval.push_back(trav[i]); + cover_nodes.insert(graph->get_id(graph->get_handle_of_step(trav[i]))); + } + cover_fragments.push_back(make_pair(rgfa_rank, std::move(interval))); + } + } + } + + return rgfa_rank; +} + + + } diff --git a/src/gfa.hpp b/src/gfa.hpp index 3e914c21499..63f2c95c05c 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -12,6 +12,8 @@ */ #include "handle.hpp" +#include "snarls.hpp" +#include "traversal_finder.hpp" namespace vg { @@ -26,6 +28,46 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, bool rgfa_pline = false, bool use_w_lines = true); + +/// Prototype code to tag paths as rGFA paths. Either needs to be completely scrapped +/// or adapted into libhandlegraph at some point, ideally. 
+/// It works by adding :SR:i: to the end of the name + +/// Returns the RGFA rank (SR) of a path. This will be 0 for the reference +/// backbone, and higher the further number of (nested) bubbles away it is. +/// If the path is not an RGFA path, then return -1 +int rgfa_rank(const string& path_name); + +/// Add the RGFA rank tag to a pathname +string set_rgfa_rank(const string& path_name, int rgfa_rank); + +/// Compute the rGFA path cover +/// graph: the graph +/// snarl_manager: the snarls (todo: should use distance index) +/// reference_paths: rank-0 paths +/// minimum_length: the minimum length of a path to create (alleles shorter than this can be uncovered) +/// preferred_intervals: set of ranges (ex from minigraph) to use as possible for rGFA paths +void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, + SnarlManager* snarl_manager, + const unordered_set& reference_paths, + int64_t minimum_length, + const unordered_map>>& preferred_intervals = {}); + +int64_t rgfa_snarl_cover(const PathHandleGraph* graph, + const Snarl& snarl, + PathTraversalFinder& path_trav_finder, + const unordered_set& reference_paths, + int64_t minimum_length, + int64_t rgfa_rank, + unordered_set& cover_nodes, + vector>>& cover_fragments, + const unordered_map>>& preferred_intervals); + + +/// Extract rGFA tags from minigraph GFA in order to pass in as hints above +unordered_map>> extract_rgfa_intervals(const string& rgfa_path); + + } #endif diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index dd8589947a1..6ffc2515aea 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -16,6 +16,8 @@ #include "../vg.hpp" #include "../xg.hpp" #include "../gbwt_helper.hpp" +#include "../integrated_snarl_finder.hpp" +#include "../gfa.hpp" #include #include #include @@ -37,6 +39,10 @@ void help_paths(char** argv) { << " -V, --extract-vg output a path-only graph covering the selected paths" << endl << " -d, --drop-paths output a graph with the selected paths removed" << endl << " -r, --retain-paths output a graph with only the selected paths retained" << endl + << " rGFA cover" << endl + << " -R, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl + << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." 
<< endl + << " -t, --threads N use up to N threads when computing rGFA covoer (default: all available)" << endl << " output path data:" << endl << " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl << " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl @@ -99,6 +105,8 @@ int main_paths(int argc, char** argv) { bool extract_as_fasta = false; bool drop_paths = false; bool retain_paths = false; + int64_t rgfa_min_len = -1; + string snarl_filename; string graph_file; string gbwt_file; string path_prefix; @@ -131,6 +139,7 @@ int main_paths(int argc, char** argv) { {"extract-vg", no_argument, 0, 'V'}, {"drop-paths", no_argument, 0, 'd'}, {"retain-paths", no_argument, 0, 'r'}, + {"rgfa-cover", required_argument, 0, 'R'}, {"extract-gam", no_argument, 0, 'X'}, {"extract-gaf", no_argument, 0, 'A'}, {"list", no_argument, 0, 'L'}, @@ -144,16 +153,15 @@ int main_paths(int argc, char** argv) { {"variant-paths", no_argument, 0, 'a'}, {"generic-paths", no_argument, 0, 'G'}, {"coverage", no_argument, 0, 'c'}, - + {"threads", required_argument, 0, 't'}, // Hidden options for backward compatibility. - {"threads", no_argument, 0, 'T'}, {"threads-by", required_argument, 0, 'q'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:draGp:c", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:aGp:ct:", long_options, &option_index); // Detect the end of the options. @@ -182,12 +190,17 @@ int main_paths(int argc, char** argv) { case 'd': drop_paths = true; output_formats++; - break; + break; case 'r': retain_paths = true; output_formats++; break; + + case 'R': + rgfa_min_len = parse(optarg); + output_formats++; + break; case 'X': extract_as_gam = true; @@ -260,9 +273,16 @@ int main_paths(int argc, char** argv) { output_formats++; break; - case 'T': - std::cerr << "warning: [vg paths] option --threads is obsolete and unnecessary" << std::endl; + case 't': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg call] Thread count (-t) set to " << num_threads << ", must set to a positive integer." 
<< endl; + exit(1); + } + omp_set_num_threads(num_threads); break; + } case 'q': std::cerr << "warning: [vg paths] option --threads-by is deprecated; please use --paths-by" << std::endl; @@ -299,7 +319,7 @@ int main_paths(int argc, char** argv) { } } if (output_formats != 1) { - std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -L, -F, -E, -C or -c) must be specified" << std::endl; + std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C or -c) must be specified" << std::endl; std::exit(EXIT_FAILURE); } if (selection_criteria > 1) { @@ -350,9 +370,23 @@ int main_paths(int argc, char** argv) { exit(1); } } - - - + + // Load or compute the snarls + unique_ptr snarl_manager; + if (rgfa_min_len >= 0) { + if (!snarl_filename.empty()) { + ifstream snarl_file(snarl_filename.c_str()); + if (!snarl_file) { + cerr << "Error [vg paths]: Unable to load snarls file: " << snarl_filename << endl; + return 1; + } + snarl_manager = vg::io::VPKG::load_one(snarl_file); + } else { + IntegratedSnarlFinder finder(*graph); + snarl_manager = unique_ptr(new SnarlManager(std::move(finder.find_snarls_parallel()))); + } + } + set path_names; if (!path_file.empty()) { ifstream path_stream(path_file); @@ -545,7 +579,7 @@ int main_paths(int argc, char** argv) { }; - if (drop_paths || retain_paths) { + if (drop_paths || retain_paths || rgfa_min_len >= 0) { MutablePathMutableHandleGraph* mutable_graph = dynamic_cast(graph.get()); if (!mutable_graph) { std::cerr << "error[vg paths]: graph cannot be modified" << std::endl; @@ -557,20 +591,30 @@ int main_paths(int argc, char** argv) { exit(1); } - vector to_destroy; - if (drop_paths) { - for_each_selected_path([&](const path_handle_t& path_handle) { - string name = graph->get_path_name(path_handle); - to_destroy.push_back(name); - }); + if (drop_paths || retain_paths) { + vector to_destroy; + if (drop_paths) { + for_each_selected_path([&](const path_handle_t& path_handle) { + string name = graph->get_path_name(path_handle); + to_destroy.push_back(name); + }); + } else { + for_each_unselected_path([&](const path_handle_t& path_handle) { + string name = graph->get_path_name(path_handle); + to_destroy.push_back(name); + }); + } + for (string& path_name : to_destroy) { + mutable_graph->destroy_path(graph->get_path_handle(path_name)); + } } else { - for_each_unselected_path([&](const path_handle_t& path_handle) { - string name = graph->get_path_name(path_handle); - to_destroy.push_back(name); + assert(rgfa_min_len >= 0); + unordered_set reference_paths; + for_each_selected_path([&](const path_handle_t& path_handle) { + reference_paths.insert(path_handle); }); - } - for (string& path_name : to_destroy) { - mutable_graph->destroy_path(graph->get_path_handle(path_name)); + + rgfa_graph_cover(mutable_graph, snarl_manager.get(), reference_paths, rgfa_min_len); } // output the graph From 5ebad42ef9dace97ea3156421136bfb6015a4f90 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 20 Mar 2023 21:54:27 -0400 Subject: [PATCH 02/21] clean up recursion a bit --- src/gfa.cpp | 117 +++++++++++++++++++++++++++++----------------------- src/gfa.hpp | 19 ++++----- 2 files changed, 74 insertions(+), 62 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 0a21908983b..30bc0299b44 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -249,7 +249,7 @@ void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle_t path out << "\n"; } -int rgfa_rank(const string& path_name) { +int get_rgfa_rank(const string& path_name) { int rank = -1; 
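    // illustrative behaviour (names taken from the tests below): get_rgfa_rank("y[8-24]:SR:i:1") == 1,
    // while a path name with no ":SR:i:" suffix returns -1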
size_t pos = path_name.rfind(":SR:i:"); if (pos != string::npos && path_name.length() - pos >= 6) { @@ -297,46 +297,37 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, // we process top-level snarl_manager in parallel snarl_manager->for_each_top_level_snarl_parallel([&](const Snarl* snarl) { - // index nodes in rgfa cover to prevent overlaps - unordered_set cover_nodes; // per-thread output + // each fragment is a rank and vector of steps, the cove is a list of fragments + // TODO: we can store just a first step and count instead of every fragment vector>>& cover_fragments = thread_covers.at(omp_get_thread_num()); + // we also index the fragments by their node ids for fast lookups of what's covered by what + // the value here is an index in the above vector + unordered_map cover_node_to_fragment; - // we keep track of ranks. rgfa ranks are relative to the reference path, not the snarl tree - // so we begin at -1 which means unknown. it will turn to 0 at first reference anchored snarl found - // -1 rank snarls cannot be covered by this algorithm - vector> queue; // rank,snarl - - queue.push_back(make_pair(-1, snarl)); + vector queue = {snarl}; while(!queue.empty()) { - pair rank_snarl = queue.back(); + const Snarl* cur_snarl = queue.back(); queue.pop_back(); // get the snarl cover, writing to cover_nodes and cover_fragments // note that we are single-threaded per top-level snarl, at least for now // this is because parent snarls and child snarls can potentially cover the // sname things - int64_t rgfa_rank = rgfa_snarl_cover(graph, - *snarl, - path_trav_finder, - reference_paths, - minimum_length, - -1, - cover_nodes, - cover_fragments, - preferred_intervals); - - // we don't even attempt to cover rank -1 snarls, instead just - // recurse until we find a reference path. - // this does put a reference path / snarl decomposition alignment - // requirement on this code - int64_t child_rank = rgfa_rank < 0 ? 
rgfa_rank : rgfa_rank + 1; + rgfa_snarl_cover(graph, + *cur_snarl, + path_trav_finder, + reference_paths, + minimum_length, + cover_fragments, + cover_node_to_fragment, + preferred_intervals); // recurse on the children - const vector& children = snarl_manager->children_of(snarl); + const vector& children = snarl_manager->children_of(cur_snarl); for (const Snarl* child_snarl : children) { - queue.push_back(make_pair(child_rank, child_snarl)); + queue.push_back(child_snarl); } } }); @@ -430,19 +421,18 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, } } -int64_t rgfa_snarl_cover(const PathHandleGraph* graph, - const Snarl& snarl, - PathTraversalFinder& path_trav_finder, - const unordered_set& reference_paths, - int64_t minimum_length, - int64_t rgfa_rank, - unordered_set& cover_nodes, - vector>>& cover_fragments, - const unordered_map>>& preferred_intervals) { +void rgfa_snarl_cover(const PathHandleGraph* graph, + const Snarl& snarl, + PathTraversalFinder& path_trav_finder, + const unordered_set& reference_paths, + int64_t minimum_length, + vector>>& cover_fragments, + unordered_map& cover_node_to_fragment, + const unordered_map>>& preferred_intervals) { #ifdef debug #pragma omp critical(cerr) - cerr << "calling rgfa_snarl_cover with rank=" << rgfa_rank << " on " << pb2json(snarl) << endl; + cerr << "calling rgfa_snarl_cover on " << pb2json(snarl) << endl; #endif // // start by finding the path traversals through the snarl @@ -453,10 +443,17 @@ int64_t rgfa_snarl_cover(const PathHandleGraph* graph, // reduce protobuf usage by going back to vector of steps instead of keeping SnarlTraversals around for (int64_t i = 0; i < path_travs.first.size(); ++i) { + string trav_path_name = graph->get_path_name(graph->get_path_handle_of_step(path_travs.second[i].first)); + if (get_rgfa_rank(trav_path_name) > 0) { + // we ignore existing (off-reference) rGFA paths + // todo: shoulgd there be better error handling? + cerr << "Warning : skipping existing rgfa traversal " << trav_path_name << endl; + continue; + } bool reversed = false; if (graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].first)) != snarl.start().backward()) { - reversed == true; - } + reversed = true; + } assert((graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].second)) != snarl.end().backward()) == reversed); vector trav; trav.reserve(path_travs.first[i].visit_size()); @@ -483,12 +480,12 @@ int64_t rgfa_snarl_cover(const PathHandleGraph* graph, } } - if (ref_paths.empty() && rgfa_rank <= 0) { + if (ref_paths.empty() && !cover_node_to_fragment.count(snarl.start().node_id())) { // we're not nested in a reference snarl, and we have no reference path // by the current logic, there's nothing to be done. 
cerr << "[rgfa] warning: No referene path through snarl " << pb2json(snarl) << ": unable to process for rGFA cover" << endl; - return -1; + return; } if (ref_paths.size() > 1) { @@ -502,11 +499,10 @@ int64_t rgfa_snarl_cover(const PathHandleGraph* graph, // all we do here is make sure the relevant nodes are flagged in the map vector& ref_trav = travs.at(ref_paths.begin()->second); for (step_handle_t ref_step_handle : ref_trav) { - cover_nodes.insert(graph->get_id(graph->get_handle_of_step(ref_step_handle))); + nid_t node_id = graph->get_id(graph->get_handle_of_step(ref_step_handle)); + // using -1 as special signifier of the reference path + cover_node_to_fragment[node_id] = -1; } - // this is the rank going forward for all the coers we add - // (note: we're not adding rank-0 intervals in this function -- that's done in a separate pass above) - rgfa_rank = 1; } #ifdef debug @@ -521,7 +517,7 @@ int64_t rgfa_snarl_cover(const PathHandleGraph* graph, vector> intervals; int64_t start = -1; for (size_t i = 0; i < trav.size(); ++i) { - bool covered = cover_nodes.count(graph->get_id(graph->get_handle_of_step(trav[i]))); + bool covered = cover_node_to_fragment.count(graph->get_id(graph->get_handle_of_step(trav[i]))); if (covered) { if (start != -1) { intervals.push_back(make_pair(start, i)); @@ -560,11 +556,30 @@ int64_t rgfa_snarl_cover(const PathHandleGraph* graph, #ifdef debug #pragma omp critical(cerr) - cerr << "found " << uncovered_intervals.size() << "uncovered intervals in traversal " << trav_idx << endl; + cerr << "found " << uncovered_intervals.size() << " uncovered intervals in traversal " << trav_idx << endl; #endif for (const auto& uncovered_interval : uncovered_intervals) { - int64_t interval_length = 0; + // we check the "backbone" interval that this interval is coming off + // since we've already covered the reference, then we know that this interval + // doesn't span the whole snarl including endpoints, so we can always afford + // to look back and ahead one + cerr << "uncovered interval " << uncovered_interval.first << ", " << uncovered_interval.second << " vs trav size " << trav.size() << endl; + assert(uncovered_interval.first > 0 && uncovered_interval.second < trav.size()); + int64_t prev_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.first - 1]))); + int64_t next_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.second]))); + if (prev_frag_idx != next_frag_idx) { + // todo: figure out policy for these + cerr << "[rgfa] warning: skipping fragment that is connected to different fragmengs" << endl; + continue; + } + int64_t fragment_rank; + if (prev_frag_idx == -1) { + fragment_rank = 1; + } else { + fragment_rank = cover_fragments.at(prev_frag_idx).first + 1; + } + int64_t interval_length = 0; for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { interval_length += graph->get_length(graph->get_handle_of_step(trav[i])); } @@ -574,14 +589,12 @@ int64_t rgfa_snarl_cover(const PathHandleGraph* graph, interval.reserve(uncovered_interval.second - uncovered_interval.first); for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { interval.push_back(trav[i]); - cover_nodes.insert(graph->get_id(graph->get_handle_of_step(trav[i]))); + cover_node_to_fragment[graph->get_id(graph->get_handle_of_step(trav[i]))] = cover_fragments.size(); } - cover_fragments.push_back(make_pair(rgfa_rank, std::move(interval))); + 
cover_fragments.push_back(make_pair(fragment_rank, std::move(interval))); } } } - - return rgfa_rank; } diff --git a/src/gfa.hpp b/src/gfa.hpp index 63f2c95c05c..ebfaf3df312 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -36,7 +36,7 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, /// Returns the RGFA rank (SR) of a path. This will be 0 for the reference /// backbone, and higher the further number of (nested) bubbles away it is. /// If the path is not an RGFA path, then return -1 -int rgfa_rank(const string& path_name); +int get_rgfa_rank(const string& path_name); /// Add the RGFA rank tag to a pathname string set_rgfa_rank(const string& path_name, int rgfa_rank); @@ -53,15 +53,14 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, int64_t minimum_length, const unordered_map>>& preferred_intervals = {}); -int64_t rgfa_snarl_cover(const PathHandleGraph* graph, - const Snarl& snarl, - PathTraversalFinder& path_trav_finder, - const unordered_set& reference_paths, - int64_t minimum_length, - int64_t rgfa_rank, - unordered_set& cover_nodes, - vector>>& cover_fragments, - const unordered_map>>& preferred_intervals); +void rgfa_snarl_cover(const PathHandleGraph* graph, + const Snarl& snarl, + PathTraversalFinder& path_trav_finder, + const unordered_set& reference_paths, + int64_t minimum_length, + vector>>& cover_fragments, + unordered_map& cover_node_to_fragment, + const unordered_map>>& preferred_intervals); /// Extract rGFA tags from minigraph GFA in order to pass in as hints above From 24d713d88855f75f081e9cdbfbc9dc06bdef2ba4 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 21 Mar 2023 13:57:40 -0400 Subject: [PATCH 03/21] implement greedy rgfa optimizer --- src/gfa.cpp | 193 ++++++++++++++++++++++++++++++++++++++++++---------- src/gfa.hpp | 12 +++- 2 files changed, 168 insertions(+), 37 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 30bc0299b44..064a58a9e28 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -2,6 +2,7 @@ #include "utility.hpp" #include "path.hpp" #include +#include #include @@ -228,7 +229,7 @@ void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle_t path if (phase_block_end_cursor != 0) { if (start_offset != 0) { // TODO: Work out a way to support phase blocks and subranges at the same time. - cerr << "[gfa] error: cannot write multiple phase blocks on a sample, haplotyope, and contig in GFA format" + cerr << "[gfa] error: cannot write multiple phase blocks on a sample, haplotype, and contig in GFA format" << " when paths already have subranges. Fix path " << graph->get_path_name(path_handle) << endl; exit(1); } @@ -360,6 +361,7 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, subrange_t path_subrange; PathMetadata::parse_path_name(graph->get_path_name(path_handle), path_sense, path_sample, path_locus, path_haplotype, path_phase_block, path_subrange); + PathSense out_path_sense = path_sense == PathSense::HAPLOTYPE ? PathSense::GENERIC : path_sense; const vector& fragments = path_fragments.second; @@ -403,7 +405,7 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, rgfa_frag_subrange.first = rgfa_frag_pos + (path_subrange != PathMetadata::NO_SUBRANGE ? 
path_subrange.first : 0); rgfa_frag_subrange.second = rgfa_frag_subrange.first + rgfa_frag_length; - string rgfa_frag_name = PathMetadata::create_path_name(path_sense, path_sample, path_locus, path_haplotype, + string rgfa_frag_name = PathMetadata::create_path_name(out_path_sense, path_sample, path_locus, path_haplotype, path_phase_block, rgfa_frag_subrange); rgfa_frag_name = set_rgfa_rank(rgfa_frag_name, rgfa_rank); @@ -535,9 +537,8 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, return intervals; }; - // now we try to find candidate rgfa intervals in the other traversals - // there's lots of room here for smarter heuristics, but right now we just - // do first come first served. + // build an initial ranked list of candidate traversal fragments + vector, pair>>> ranked_trav_fragments; for (int64_t trav_idx = 0; trav_idx < travs.size(); ++trav_idx) { // todo: this map seems backwards? note really a big deal since // we're only allowing one element @@ -554,47 +555,167 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, const vector& trav = travs.at(trav_idx); vector> uncovered_intervals = get_uncovered_intervals(trav); -#ifdef debug -#pragma omp critical(cerr) - cerr << "found " << uncovered_intervals.size() << " uncovered intervals in traversal " << trav_idx << endl; -#endif - for (const auto& uncovered_interval : uncovered_intervals) { - // we check the "backbone" interval that this interval is coming off - // since we've already covered the reference, then we know that this interval - // doesn't span the whole snarl including endpoints, so we can always afford - // to look back and ahead one - cerr << "uncovered interval " << uncovered_interval.first << ", " << uncovered_interval.second << " vs trav size " << trav.size() << endl; - assert(uncovered_interval.first > 0 && uncovered_interval.second < trav.size()); - int64_t prev_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.first - 1]))); - int64_t next_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.second]))); - if (prev_frag_idx != next_frag_idx) { - // todo: figure out policy for these - cerr << "[rgfa] warning: skipping fragment that is connected to different fragmengs" << endl; - continue; - } - int64_t fragment_rank; - if (prev_frag_idx == -1) { - fragment_rank = 1; - } else { - fragment_rank = cover_fragments.at(prev_frag_idx).first + 1; - } int64_t interval_length = 0; for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { interval_length += graph->get_length(graph->get_handle_of_step(trav[i])); } if (interval_length >= minimum_length) { - // update the cover - vector interval; - interval.reserve(uncovered_interval.second - uncovered_interval.first); - for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { - interval.push_back(trav[i]); - cover_node_to_fragment[graph->get_id(graph->get_handle_of_step(trav[i]))] = cover_fragments.size(); + auto trav_stats = rgfa_traversal_stats(graph, trav, uncovered_interval); + ranked_trav_fragments.push_back(make_pair(trav_stats, make_pair(trav_idx, uncovered_interval))); + } + } + } + + // todo: typedef! 
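    // one possible typedef for the heap elements below: a (support, switches, dupes) stats tuple
    // paired with a (traversal index, interval) pair, i.e.
    // using ranked_fragment_t = pair<tuple<int64_t, int64_t, int64_t>, pair<int64_t, pair<int64_t, int64_t>>>;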
+ function, pair>>& s1, + const pair, pair>>& s2)> heap_comp = + [](const pair, pair>>& s1, + const pair, pair>>& s2) { + return s1.first < s2.first; + }; + + // put the fragments into a max heap + std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + + // now greedily pull out traversal intervals from the ranked list until none are left + while (!ranked_trav_fragments.empty()) { + + // get the best scoring (max) fragment from heap + auto& best_stats_fragment = ranked_trav_fragments.front(); + std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + ranked_trav_fragments.pop_back(); + + const vector& trav = travs.at(best_stats_fragment.second.first); + const pair& uncovered_interval = best_stats_fragment.second.second; + + // our traversal may have been partially covered by a different iteration, if so, we need to break it up + // and continue + vector> chopped_intervals; + int64_t cur_start = -1; + bool chopped = false; + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + bool covered = cover_node_to_fragment.count(graph->get_id(graph->get_handle_of_step(trav[i]))); + if (!covered && cur_start == -1) { + cur_start = i; + } else if (covered) { + chopped = true; + if (cur_start != -1) { + chopped_intervals.push_back(make_pair(cur_start, i)); + cur_start = -1; + } + } + } + if (cur_start != -1) { + chopped_intervals.push_back(make_pair(cur_start, uncovered_interval.second)); + } + if (chopped) { + for (const pair& chopped_interval : chopped_intervals) { + int64_t chopped_trav_length = 0; + for (int64_t i = chopped_interval.first; i < chopped_interval.second; ++i) { + chopped_trav_length += graph->get_length(graph->get_handle_of_step(trav[i])); + } + if (chopped_trav_length >= minimum_length) { + auto chopped_stats = rgfa_traversal_stats(graph, trav, chopped_interval); + ranked_trav_fragments.push_back(make_pair(chopped_stats, make_pair(best_stats_fragment.second.first, chopped_interval))); + std::push_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + cerr << "pushing interval " << endl; } - cover_fragments.push_back(make_pair(fragment_rank, std::move(interval))); } + continue; } + + // we check the "backbone" interval that this interval is coming off + // since we've already covered the reference, then we know that this interval + // doesn't span the whole snarl including endpoints, so we can always afford + // to look back and ahead one + cerr << "uncovered interval " << uncovered_interval.first << ", " << uncovered_interval.second << " vs trav size " << trav.size() << endl; + assert(uncovered_interval.first > 0 && uncovered_interval.second < trav.size()); + int64_t prev_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.first - 1]))); + int64_t next_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.second]))); + if (prev_frag_idx != next_frag_idx) { + // todo: figure out policy for these + cerr << "[rgfa] warning: skipping fragment that is connected to different fragmengs" << endl; + continue; + } + int64_t fragment_rank; + if (prev_frag_idx == -1) { + fragment_rank = 1; + } else { + fragment_rank = cover_fragments.at(prev_frag_idx).first + 1; + } + + // update the cover + vector interval; + interval.reserve(uncovered_interval.second - uncovered_interval.first); + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + 
interval.push_back(trav[i]); + cover_node_to_fragment[graph->get_id(graph->get_handle_of_step(trav[i]))] = cover_fragments.size(); + } + cover_fragments.push_back(make_pair(fragment_rank, std::move(interval))); + } +} + +tuple rgfa_traversal_stats(const PathHandleGraph* graph, + const vector& trav, + const pair& trav_fragment) { + path_handle_t path_handle = graph->get_path_handle_of_step(trav.front()); + int64_t support = 0; + int64_t switches = 0; + int64_t dupes = 0; + handle_t prev_handle; + + for (int64_t i = trav_fragment.first; i < trav_fragment.second; ++i) { + const step_handle_t& step = trav[i]; + handle_t handle = graph->get_handle_of_step(step); + vector all_steps = graph->steps_of_handle(handle); + int64_t length = graph->get_length(handle); + support += length; + int64_t self_count = 0; + for (const step_handle_t& other_step : all_steps) { + path_handle_t step_path_handle = graph->get_path_handle_of_step(other_step); + if (step_path_handle == path_handle) { + ++self_count; + } else { + support += length; + } + } + if (self_count > 1) { + dupes += length * (self_count - 1); + } + if (i > 0 && graph->get_is_reverse(handle) != graph->get_is_reverse(prev_handle)) { + ++switches; + } + prev_handle = handle; } + + return std::make_tuple(support, switches, dupes); +} + +bool rgfa_traversal_stats_less(const tuple& s1, const tuple& s2) { + // duplicates are a deal breaker, if one traversal has no duplicates and the other does, the former wins + if (get<2>(s1) > 0 && get<2>(s2) == 0) { + return true; + } else if (get<2>(s1) == 0 && get<2>(s2) > 0) { + return false; + } + + // then support + if (get<0>(s1) < get<0>(s2)) { + return true; + } else if (get<0>(s1) > get<0>(s2)) { + return false; + } + + // then duplicates (by value) + if (get<2>(s1) > get<2>(s2)) { + return true; + } else if (get<2>(s1) < get<2>(s2)) { + return false; + } + + // then switches + return get<1>(s1) > get<1>(s2); } diff --git a/src/gfa.hpp b/src/gfa.hpp index ebfaf3df312..56e941fde1b 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -62,7 +62,17 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, unordered_map& cover_node_to_fragment, const unordered_map>>& preferred_intervals); - +/// Get some statistics from a traversal fragment that we use for ranking in the greedy algorithm +/// 1. Coverage : total step length across the traversal for all paths +/// 2. Switches : the number of nodes that would need to be flipped to forwardize the traversal +/// 3. 
Duplicated bases : the number of duplicated bases in the traversal path +tuple rgfa_traversal_stats(const PathHandleGraph* graph, + const vector& trav, + const pair& trav_fragment); + +/// Comparison of the above stats for the purposes of greedily selecting (highest better) traversals +bool rgfa_traversal_stats_less(const tuple& s1, const tuple& s2); + /// Extract rGFA tags from minigraph GFA in order to pass in as hints above unordered_map>> extract_rgfa_intervals(const string& rgfa_path); From f9aa01f3f97afec9dfdcbe967a5c1c763fd8277d Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 21 Mar 2023 14:40:45 -0400 Subject: [PATCH 04/21] rgfa forwardization --- src/gfa.cpp | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/gfa.hpp | 4 +++ 2 files changed, 79 insertions(+) diff --git a/src/gfa.cpp b/src/gfa.cpp index 064a58a9e28..169e49b3fd9 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -421,6 +421,9 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, } } } + + // forwardize the graph + rgfa_forwardize_paths(graph, reference_paths); } void rgfa_snarl_cover(const PathHandleGraph* graph, @@ -718,6 +721,78 @@ bool rgfa_traversal_stats_less(const tuple& s1, const return get<1>(s1) > get<1>(s2); } +// copied pretty much verbatem from +// https://github.com/ComparativeGenomicsToolkit/hal2vg/blob/v1.1.2/clip-vg.cpp#L809-L880 +void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph, + const unordered_set& reference_paths) { + + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + if (reference_paths.count(path_handle) || get_rgfa_rank(path_name) >= 0) { + size_t fw_count = 0; + size_t total_steps = 0; + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = graph->get_handle_of_step(step_handle); + if (graph->get_is_reverse(handle)) { + handle_t flipped_handle = graph->create_handle(graph->get_sequence(handle)); + graph->follow_edges(handle, true, [&](handle_t prev_handle) { + if (graph->get_id(prev_handle) != graph->get_id(handle)) { + graph->create_edge(prev_handle, flipped_handle); + } + }); + graph->follow_edges(handle, false, [&](handle_t next_handle) { + if (graph->get_id(handle) != graph->get_id(next_handle)) { + graph->create_edge(flipped_handle, next_handle); + } + }); + // self-loop cases we punted on above: + if (graph->has_edge(handle, handle)) { + graph->create_edge(flipped_handle, flipped_handle); + } + if (graph->has_edge(handle, graph->flip(handle))) { + graph->create_edge(flipped_handle, graph->flip(flipped_handle)); + } + if (graph->has_edge(graph->flip(handle), handle)) { + graph->create_edge(graph->flip(flipped_handle), flipped_handle); + } + vector steps = graph->steps_of_handle(handle); + size_t ref_count = 0; + for (step_handle_t step : steps) { + if (graph->get_path_handle_of_step(step) == path_handle) { + ++ref_count; + } + step_handle_t next_step = graph->get_next_step(step); + handle_t new_handle = graph->get_is_reverse(graph->get_handle_of_step(step)) ? 
flipped_handle : + graph->flip(flipped_handle); + graph->rewrite_segment(step, next_step, {new_handle}); + } + if (ref_count > 1) { + cerr << "[rGFA] error: Cycle detected in rGFA path " << path_name << " at node " << graph->get_id(handle) << endl; + exit(1); + } + ++fw_count; + assert(graph->steps_of_handle(handle).empty()); + dynamic_cast(graph)->destroy_handle(handle); + } + ++total_steps; + }); + } + }); + + // do a check just to be sure + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + if (reference_paths.count(path_handle) || get_rgfa_rank(path_name) >= 0) { + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = graph->get_handle_of_step(step_handle); + if (graph->get_is_reverse(handle)) { + cerr << "[rGFA] error: Failed to fowardize node " << graph->get_id(handle) << " in path " << path_name << endl; + exit(1); + } + }); + } + }); +} } diff --git a/src/gfa.hpp b/src/gfa.hpp index 56e941fde1b..d5650a1985c 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -72,6 +72,10 @@ tuple rgfa_traversal_stats(const PathHandleGraph* gra /// Comparison of the above stats for the purposes of greedily selecting (highest better) traversals bool rgfa_traversal_stats_less(const tuple& s1, const tuple& s2); + +/// Make sure all rgfa paths are forwardized +void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph, + const unordered_set& reference_paths); /// Extract rGFA tags from minigraph GFA in order to pass in as hints above unordered_map>> extract_rgfa_intervals(const string& rgfa_path); From e4762534199ca20d22fd2f83fa163ef199f880f0 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 21 Mar 2023 17:10:51 -0400 Subject: [PATCH 05/21] start rgfa tests --- src/gfa.cpp | 13 +++++++++++-- test/rgfa/rgfa_ins.gfa | 15 +++++++++++++++ test/rgfa/rgfa_ins2.gfa | 18 ++++++++++++++++++ test/rgfa/rgfa_tiny.gfa | 38 ++++++++++++++++++++++++++++++++++++++ test/t/11_vg_paths.t | 35 ++++++++++++++++++++++++++++++++++- 5 files changed, 116 insertions(+), 3 deletions(-) create mode 100644 test/rgfa/rgfa_ins.gfa create mode 100644 test/rgfa/rgfa_ins2.gfa create mode 100644 test/rgfa/rgfa_tiny.gfa diff --git a/src/gfa.cpp b/src/gfa.cpp index 169e49b3fd9..536062a264b 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -566,6 +566,7 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, if (interval_length >= minimum_length) { auto trav_stats = rgfa_traversal_stats(graph, trav, uncovered_interval); ranked_trav_fragments.push_back(make_pair(trav_stats, make_pair(trav_idx, uncovered_interval))); + cerr << "pushing trav " << trav_idx << " with stats " << get<0>(trav_stats) <<"," << get<1>(trav_stats) << "," << get<2>(trav_stats) << endl; } } } @@ -575,19 +576,27 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, const pair, pair>>& s2)> heap_comp = [](const pair, pair>>& s1, const pair, pair>>& s2) { - return s1.first < s2.first; + cerr << "COMP" << s1.first << " VS " << s2.first << " is " << rgfa_traversal_stats_less(s1.first, s2.first) << endl; + return rgfa_traversal_stats_less(s1.first, s2.first); }; // put the fragments into a max heap std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + for (auto xxx : ranked_trav_fragments) { + cerr << " heep " << xxx.first <<"->trav " << xxx.second.first << endl; + cerr << " front is " << ranked_trav_fragments.front().first << "->trav " << ranked_trav_fragments.front().second.first << endl; + } + // now greedily pull out traversal 
intervals from the ranked list until none are left while (!ranked_trav_fragments.empty()) { // get the best scoring (max) fragment from heap - auto& best_stats_fragment = ranked_trav_fragments.front(); + auto best_stats_fragment = ranked_trav_fragments.front(); std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); ranked_trav_fragments.pop_back(); + cerr << "popping trav " << best_stats_fragment.second.first << " with stats " + << get<0>(best_stats_fragment.first) << "," << get<1>(best_stats_fragment.first) << "," << get<2>(best_stats_fragment.first) << endl; const vector& trav = travs.at(best_stats_fragment.second.first); const pair& uncovered_interval = best_stats_fragment.second.second; diff --git a/test/rgfa/rgfa_ins.gfa b/test/rgfa/rgfa_ins.gfa new file mode 100644 index 00000000000..9751b52ba47 --- /dev/null +++ b/test/rgfa/rgfa_ins.gfa @@ -0,0 +1,15 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +P x 1+,5+ * +P y 1+,2+,4+,5+ * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M diff --git a/test/rgfa/rgfa_ins2.gfa b/test/rgfa/rgfa_ins2.gfa new file mode 100644 index 00000000000..ef6f1f3055c --- /dev/null +++ b/test/rgfa/rgfa_ins2.gfa @@ -0,0 +1,18 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +S 6 TTTTTTTTTT +P x 1+,5+ * +P y 1+,2+,6+,4+,5+ * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M +L 2 + 6 + 0M +L 6 + 4 + 0M diff --git a/test/rgfa/rgfa_tiny.gfa b/test/rgfa/rgfa_tiny.gfa new file mode 100644 index 00000000000..498d2e0a6dc --- /dev/null +++ b/test/rgfa/rgfa_tiny.gfa @@ -0,0 +1,38 @@ +H VN:Z:1.0 +S 5 C +S 7 A +S 12 ATAT +S 8 G +S 1 CAAATAAG +S 4 T +S 6 TTG +S 15 CCAACTCTCTG +S 2 A +S 10 A +S 9 AAATTTTCTGGAGTTCTAT +S 11 T +S 13 A +S 14 T +S 3 G +P x 1+,3+,5+,6+,8+,9+,11+,12+,14+,15+ * +P y 1+,2+,4+,6+,8+,9+,10+,12+,13+,15+ * +L 5 + 6 + 0M +L 7 + 9 + 0M +L 12 + 13 + 0M +L 12 + 14 + 0M +L 8 + 9 + 0M +L 1 + 2 + 0M +L 1 + 3 + 0M +L 4 + 6 + 0M +L 6 + 7 + 0M +L 6 + 8 + 0M +L 2 + 4 + 0M +L 2 + 5 + 0M +L 10 + 12 + 0M +L 9 + 10 + 0M +L 9 + 11 + 0M +L 11 + 12 + 0M +L 13 + 15 + 0M +L 14 + 15 + 0M +L 3 + 4 + 0M +L 3 + 5 + 0M diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 58591697ef5..04108df0fc1 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 18 +plan tests 22 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg @@ -59,4 +59,37 @@ is $? 0 "vg path coverage reports correct lengths in first column" rm -f q.vg q.cov.len q.len +vg paths -v rgfa/rgfa_tiny.gfa -R 1 -Q x | vg view - > rgfa_tiny.rgfa +printf "P y[33-34]:SR:i:1 10+ * +P y[38-39]:SR:i:1 13+ * +P y[8-10]:SR:i:1 2+,4+ *\n" > rgfa_tiny_expected_fragments.rgfa +grep ^P rgfa_tiny.rgfa | grep SR | sort > rgfa_tiny_fragments.rgfa +diff rgfa_tiny_fragments.rgfa rgfa_tiny_expected_fragments.rgfa +is $? 0 "Found the expected rgfa SNP cover of tiny graph" +rm -f rgfa_tiny.rgfa rgfa_tiny_expected_fragments.rgfa rgfa_tiny_fragments.rgfa + +vg paths -v rgfa/rgfa_ins.gfa -R 5 -Q x | vg view - > rgfa_ins.rgfa +printf "P z[8-17]:SR:i:1 2+,3+,4+ *\n" > rgfa_ins_expected_fragments.rgfa +grep ^P rgfa_ins.rgfa | grep SR | sort > rgfa_ins_fragments.rgfa +diff rgfa_ins_fragments.rgfa rgfa_ins_expected_fragments.rgfa +is $? 
0 "Found the expected rgfa cover for simple nested insertion" + +rm -f rgfa_ins.rgfa rgfa_ins_expected_fragments.rgfa rgfa_ins_fragments.rgfa + +vg paths -v rgfa/rgfa_ins2.gfa -R 3 -Q x | vg view - > rgfa_ins2.rgfa +printf "P y[8-24]:SR:i:1 2+,6+,4+ * +P z[11-14]:SR:i:2 3+ *\n" > rgfa_ins2_expected_fragments.rgfa +grep ^P rgfa_ins2.rgfa | grep SR | sort > rgfa_ins2_fragments.rgfa +diff rgfa_ins2_fragments.rgfa rgfa_ins2_expected_fragments.rgfa +is $? 0 "Found the expected rgfa cover for simple nested insertion that requires two fragments" + +rm -f rgfa_ins2.rgfa rgfa_ins2_expected_fragments.rgfa rgfa_ins2_fragments.rgfa + +vg paths -v rgfa/rgfa_ins2.gfa -R 5 -Q x | vg view - > rgfa_ins2R5.rgfa +printf "P y[8-24]:SR:i:1 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa +grep ^P rgfa_ins2R5.rgfa | grep SR | sort > rgfa_ins2R5_fragments.rgfa +diff rgfa_ins2R5_fragments.rgfa rgfa_ins2R5_expected_fragments.rgfa +is $? 0 "rgfa Minimum fragment length filters out small fragment" + +rm -f rgfa_ins2R5.rgfa rgfa_ins2R5_expected_fragments.rgfa rgfa_ins2R5_fragments.rgfa From cd908a9d7de2aca366ab21dc27830e519e779a2a Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 21 Mar 2023 20:38:13 -0400 Subject: [PATCH 06/21] ../src/gfa.cpp rgfa strand swtiching test --- test/t/11_vg_paths.t | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 04108df0fc1..cd28a8fefa5 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 22 +plan tests 23 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg @@ -93,3 +93,16 @@ diff rgfa_ins2R5_fragments.rgfa rgfa_ins2R5_expected_fragments.rgfa is $? 0 "rgfa Minimum fragment length filters out small fragment" rm -f rgfa_ins2R5.rgfa rgfa_ins2R5_expected_fragments.rgfa rgfa_ins2R5_fragments.rgfa + +vg paths -v rgfa/rgfa_ins3.gfa -R 3 -Q x | vg view - > rgfa_ins3.rgfa +printf "P x 1+,5+ * +P y[3-19]:SR:i:1 4+,6+,2+ * +P y 5-,4+,6+,2+,1- * +P z[11-14]:SR:i:2 3+ * +P z 1+,2-,3+,4-,5+ *\n" | sort > rgfa_ins3_expected_fragments.rgfa +grep ^P rgfa_ins3.rgfa | sort > rgfa_ins3_fragments.rgfa +diff rgfa_ins3_fragments.rgfa rgfa_ins3_expected_fragments.rgfa +is $? 0 "Found the expected rgfa cover for simple nested insertion that has a reversed path that needs forwardization" + +rm -f rgfa_ins3.rgfa rgfa_ins3_expected_fragments.rgfa rgfa_ins3_fragments.rgfa + From 5895007cae05d21364302fec6dbcd131aa78e44f Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 21 Mar 2023 20:52:23 -0400 Subject: [PATCH 07/21] fix strand bug --- src/gfa.cpp | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 536062a264b..8efbff83338 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -361,7 +361,9 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, subrange_t path_subrange; PathMetadata::parse_path_name(graph->get_path_name(path_handle), path_sense, path_sample, path_locus, path_haplotype, path_phase_block, path_subrange); - PathSense out_path_sense = path_sense == PathSense::HAPLOTYPE ? PathSense::GENERIC : path_sense; + // todo: we need a policy for this + //PathSense out_path_sense = path_sense == PathSense::HAPLOTYPE ? 
PathSense::GENERIC : path_sense; + PathSense out_path_sense = path_sense; const vector& fragments = path_fragments.second; @@ -472,6 +474,9 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, done = true; } } + if (reversed) { + std::reverse(trav.begin(), trav.end()); + } travs.push_back(trav); } } @@ -673,9 +678,8 @@ tuple rgfa_traversal_stats(const PathHandleGraph* gra const pair& trav_fragment) { path_handle_t path_handle = graph->get_path_handle_of_step(trav.front()); int64_t support = 0; - int64_t switches = 0; + int64_t reversed_steps = 0; int64_t dupes = 0; - handle_t prev_handle; for (int64_t i = trav_fragment.first; i < trav_fragment.second; ++i) { const step_handle_t& step = trav[i]; @@ -695,13 +699,12 @@ tuple rgfa_traversal_stats(const PathHandleGraph* gra if (self_count > 1) { dupes += length * (self_count - 1); } - if (i > 0 && graph->get_is_reverse(handle) != graph->get_is_reverse(prev_handle)) { - ++switches; + if (i > 0 && graph->get_is_reverse(handle)) { + ++reversed_steps; } - prev_handle = handle; } - return std::make_tuple(support, switches, dupes); + return std::make_tuple(support, reversed_steps, dupes); } bool rgfa_traversal_stats_less(const tuple& s1, const tuple& s2) { @@ -734,7 +737,8 @@ bool rgfa_traversal_stats_less(const tuple& s1, const // https://github.com/ComparativeGenomicsToolkit/hal2vg/blob/v1.1.2/clip-vg.cpp#L809-L880 void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph, const unordered_set& reference_paths) { - + + unordered_map id_map; graph->for_each_path_handle([&](path_handle_t path_handle) { string path_name = graph->get_path_name(path_handle); if (reference_paths.count(path_handle) || get_rgfa_rank(path_name) >= 0) { @@ -744,6 +748,7 @@ void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph, handle_t handle = graph->get_handle_of_step(step_handle); if (graph->get_is_reverse(handle)) { handle_t flipped_handle = graph->create_handle(graph->get_sequence(handle)); + id_map[graph->get_id(flipped_handle)] = graph->get_id(handle); graph->follow_edges(handle, true, [&](handle_t prev_handle) { if (graph->get_id(prev_handle) != graph->get_id(handle)) { graph->create_edge(prev_handle, flipped_handle); @@ -788,6 +793,11 @@ void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph, } }); + // rename all the ids back to what they were (so nodes keep their ids, just get flipped around) + graph->reassign_node_ids([&id_map](nid_t new_id) { + return id_map.count(new_id) ? 
id_map[new_id] : new_id; + }); + // do a check just to be sure graph->for_each_path_handle([&](path_handle_t path_handle) { string path_name = graph->get_path_name(path_handle); From d13bafc918db8f4c27cce546d08fc0997ecbb9a2 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 21 Mar 2023 20:53:13 -0400 Subject: [PATCH 08/21] use better serilization call in paths (that supports gfa in/out) --- src/subcommand/paths_main.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 6ffc2515aea..9468b9c764c 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -18,6 +18,7 @@ #include "../gbwt_helper.hpp" #include "../integrated_snarl_finder.hpp" #include "../gfa.hpp" +#include "../io/save_handle_graph.hpp" #include #include #include @@ -618,7 +619,7 @@ int main_paths(int argc, char** argv) { } // output the graph - serializable_graph->serialize(cout); + vg::io::save_handle_graph(graph.get(), std::cout); } else if (coverage) { // for every node, count the number of unique paths. then add the coverage count to each one From 99baade5acd46630434d847f708c6f3009d51248 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 22 Mar 2023 09:55:53 -0400 Subject: [PATCH 09/21] add missing test file --- test/rgfa/rgfa_ins3.gfa | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 test/rgfa/rgfa_ins3.gfa diff --git a/test/rgfa/rgfa_ins3.gfa b/test/rgfa/rgfa_ins3.gfa new file mode 100644 index 00000000000..861ac58d3b5 --- /dev/null +++ b/test/rgfa/rgfa_ins3.gfa @@ -0,0 +1,18 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +S 6 TTTTTTTTTT +P x 1+,5+ * +P y 5-,4-,6-,2-,1- * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M +L 2 + 6 + 0M +L 6 + 4 + 0M From baf7cd585a5f04f262d53cfd70965548701e678e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 23 Mar 2023 16:10:19 -0400 Subject: [PATCH 10/21] some ui tweaks for rgfa --- src/gfa.cpp | 102 +++++++++++++++++++++----------- src/gfa.hpp | 3 + src/subcommand/convert_main.cpp | 8 ++- src/subcommand/paths_main.cpp | 4 +- 4 files changed, 82 insertions(+), 35 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 8efbff83338..8e40d444538 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -6,7 +6,7 @@ #include -#define debug +//#define debug namespace vg { @@ -43,8 +43,8 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& } out << "\n"; - //Compute the rGFA tags of given paths (todo: support non-zero ranks) - unordered_map> node_offsets; + //Compute the rGFA tags of given paths + unordered_map> node_offsets_and_ranks; for (const string& path_name : rgfa_paths) { path_handle_t path_handle = graph->get_path_handle(path_name); size_t offset = 0; @@ -57,11 +57,12 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& << node_id << " is traversed on its reverse strand. rGFA only supports the forward strand." 
<< endl; throw runtime_error(ss.str()); } - if (node_offsets.count(node_id)) { + if (node_offsets_and_ranks.count(node_id)) { cerr << "warning [gfa]: multiple selected rgfa paths found on node " << node_id << ": keeping tags for " - << graph->get_path_name(node_offsets[node_id].first) << " and ignoring those for " << path_name << endl; + << graph->get_path_name(get<0>(node_offsets_and_ranks[node_id])) << " and ignoring those for " << path_name << endl; } else { - node_offsets[node_id] = make_pair(path_handle, offset); + int64_t rgfa_rank = get_rgfa_rank(path_name); + node_offsets_and_ranks[node_id] = make_tuple(path_handle, offset, rgfa_rank <= 0 ? 0 : rgfa_rank); } offset += graph->get_length(handle); }); @@ -73,12 +74,23 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& nid_t node_id = graph->get_id(h); out << node_id << "\t"; out << graph->get_sequence(h); - auto it = node_offsets.find(node_id); - if (it != node_offsets.end()) { - // add rGFA tags - out << "\t" << "SN:Z:" << graph->get_path_name(it->second.first) - << "\t" << "SO:i:" << it->second.second - << "\t" << "SR:i:0"; // todo: support non-zero ranks? + auto it = node_offsets_and_ranks.find(node_id); + if (it != node_offsets_and_ranks.end()) { + // hack off the rgfa tag + string base_name = strip_rgfa_rank(graph->get_path_name(get<0>(it->second))); + // hack off the subrange offset (and add it to SO) + PathSense sense; + string sample, locus; + size_t haplotype, phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(base_name, sense, sample, locus, haplotype, phase_block, subrange); + int64_t base_offset = subrange == PathMetadata::NO_SUBRANGE ? 0 : subrange.first; + base_name = PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, + PathMetadata::NO_SUBRANGE); + // add rGFA tags + out << "\t" << "SN:Z:" << base_name + << "\t" << "SO:i:" << (base_offset + get<1>(it->second)) + << "\t" << "SR:i:" << get<2>(it->second); } out << "\n"; // Writing `std::endl` would flush the buffer. return true; @@ -106,9 +118,19 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& vector w_line_paths; + bool warned_about_tags_as_paths = false; // Paths as P-lines for (const path_handle_t& h : path_handles) { auto path_name = graph->get_path_name(h); + if (get_rgfa_rank(path_name) > 0) { + if (!rgfa_paths.empty()) { + // the path was put into tags, no reason to deal with it here + continue; + } else if (!warned_about_tags_as_paths) { + cerr << "warning [gfa]: outputing rGFA cover (rank>=1) path(s) as a P-line(s) and not tags because no reference (rank==0) selected" << endl; + warned_about_tags_as_paths = true; + } + } if (rgfa_pline || !rgfa_paths.count(path_name)) { if (graph->get_sense(h) != PathSense::REFERENCE && reference_samples.count(graph->get_sample_name(h))) { // We have a mix of reference and non-reference paths on the same sample which GFA can't handle. 
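// For orientation, a node that lands in the rGFA cover comes out of this writer with three
// extra tags on its S-line; the concrete values below are illustrative only, not taken from
// a real graph:
//
//   S	7	GATTACA	SN:Z:y	SO:i:8	SR:i:1
//
// SN names the path the node was assigned to, SO is its 0-based offset along that path (the
// path's own subrange start plus the running offset accumulated while walking the path above),
// and SR is the rGFA rank: 0 for the selected reference backbone, higher for each level of
// nesting away from it.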
@@ -144,6 +166,16 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& { unordered_map, size_t> last_phase_block_end; for (const path_handle_t& h : w_line_paths) { + string path_name = graph->get_path_name(h); + if (get_rgfa_rank(path_name) > 0) { + if (!rgfa_paths.empty()) { + // the path was put into tags, no reason to deal with it here + continue; + } else if (!warned_about_tags_as_paths) { + cerr << "warning [gfa]: outputing rGFA cover (rank>=1) path(s) as a W-line(s) and not tags because no reference (rank==0) selected" << endl; + warned_about_tags_as_paths = true; + } + } write_w_line(graph, out, h, last_phase_block_end); } } @@ -264,6 +296,13 @@ int get_rgfa_rank(const string& path_name) { } string set_rgfa_rank(const string& path_name, int rgfa_rank) { + string new_name = strip_rgfa_rank(path_name); + // now append the rank + new_name += ":SR:i:" + std::to_string(rgfa_rank); + return new_name; +} + +string strip_rgfa_rank(const string& path_name) { string new_name; // check if we have an existing rank. if so, we srap it. size_t pos = path_name.rfind(":SR:i:"); @@ -276,9 +315,6 @@ string set_rgfa_rank(const string& path_name, int rgfa_rank) { } else { new_name = path_name; } - - // now append the rank - new_name += ":SR:i:" + std::to_string(rgfa_rank); return new_name; } @@ -288,6 +324,19 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, int64_t minimum_length, const unordered_map>>& preferred_intervals){ + // for sanity's sake, we don't want to ever support multiple rgfa covers, so start by + // deleting all existing rgfa fragments (except for rank 0 reference paths, of course) + vector existing_cover; + graph->for_each_path_handle([&](path_handle_t path_handle) { + if (get_rgfa_rank(graph->get_path_name(path_handle)) > 0) { + existing_cover.push_back(path_handle); + assert(!reference_paths.count(path_handle)); + } + }); + for (path_handle_t path_handle : existing_cover) { + graph->destroy_path(path_handle); + } + // we use the path traversal finder for everything // (even gbwt haplotypes, as we're using the path handle interface) PathTraversalFinder path_trav_finder(*graph, *snarl_manager); @@ -571,7 +620,6 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, if (interval_length >= minimum_length) { auto trav_stats = rgfa_traversal_stats(graph, trav, uncovered_interval); ranked_trav_fragments.push_back(make_pair(trav_stats, make_pair(trav_idx, uncovered_interval))); - cerr << "pushing trav " << trav_idx << " with stats " << get<0>(trav_stats) <<"," << get<1>(trav_stats) << "," << get<2>(trav_stats) << endl; } } } @@ -581,18 +629,12 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, const pair, pair>>& s2)> heap_comp = [](const pair, pair>>& s1, const pair, pair>>& s2) { - cerr << "COMP" << s1.first << " VS " << s2.first << " is " << rgfa_traversal_stats_less(s1.first, s2.first) << endl; return rgfa_traversal_stats_less(s1.first, s2.first); }; // put the fragments into a max heap std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); - for (auto xxx : ranked_trav_fragments) { - cerr << " heep " << xxx.first <<"->trav " << xxx.second.first << endl; - cerr << " front is " << ranked_trav_fragments.front().first << "->trav " << ranked_trav_fragments.front().second.first << endl; - } - // now greedily pull out traversal intervals from the ranked list until none are left while (!ranked_trav_fragments.empty()) { @@ -600,8 +642,6 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, auto best_stats_fragment = 
ranked_trav_fragments.front(); std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); ranked_trav_fragments.pop_back(); - cerr << "popping trav " << best_stats_fragment.second.first << " with stats " - << get<0>(best_stats_fragment.first) << "," << get<1>(best_stats_fragment.first) << "," << get<2>(best_stats_fragment.first) << endl; const vector& trav = travs.at(best_stats_fragment.second.first); const pair& uncovered_interval = best_stats_fragment.second.second; @@ -636,7 +676,6 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, auto chopped_stats = rgfa_traversal_stats(graph, trav, chopped_interval); ranked_trav_fragments.push_back(make_pair(chopped_stats, make_pair(best_stats_fragment.second.first, chopped_interval))); std::push_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); - cerr << "pushing interval " << endl; } } continue; @@ -646,20 +685,17 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, // since we've already covered the reference, then we know that this interval // doesn't span the whole snarl including endpoints, so we can always afford // to look back and ahead one - cerr << "uncovered interval " << uncovered_interval.first << ", " << uncovered_interval.second << " vs trav size " << trav.size() << endl; assert(uncovered_interval.first > 0 && uncovered_interval.second < trav.size()); int64_t prev_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.first - 1]))); int64_t next_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.second]))); - if (prev_frag_idx != next_frag_idx) { - // todo: figure out policy for these - cerr << "[rgfa] warning: skipping fragment that is connected to different fragmengs" << endl; - continue; - } + // todo: i'm not sure if/how minigraph treats these cases, where the anchors connect to different ranks + // also, can these be avoided entirely? 
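// The rank chosen below nests the new fragment one level deeper than whichever anchoring
// fragment has the smaller index: reference nodes are flagged with -1 in cover_node_to_fragment,
// so std::min prefers the rank-0 backbone whenever either anchor touches it (the fragment then
// gets rank 1); otherwise the fragment nests under the earlier-created, lower-index fragment.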
+ int64_t min_frag_idx = std::min(prev_frag_idx, next_frag_idx); int64_t fragment_rank; - if (prev_frag_idx == -1) { + if (min_frag_idx == -1) { fragment_rank = 1; } else { - fragment_rank = cover_fragments.at(prev_frag_idx).first + 1; + fragment_rank = cover_fragments.at(min_frag_idx).first + 1; } // update the cover diff --git a/src/gfa.hpp b/src/gfa.hpp index d5650a1985c..9c57f8c2ff5 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -41,6 +41,9 @@ int get_rgfa_rank(const string& path_name); /// Add the RGFA rank tag to a pathname string set_rgfa_rank(const string& path_name, int rgfa_rank); +/// Get the name without the rank +string strip_rgfa_rank(const string& path_name); + /// Compute the rGFA path cover /// graph: the graph /// snarl_manager: the snarls (todo: should use distance index) diff --git a/src/subcommand/convert_main.cpp b/src/subcommand/convert_main.cpp index 2f7fd1bc78e..518789c140a 100644 --- a/src/subcommand/convert_main.cpp +++ b/src/subcommand/convert_main.cpp @@ -432,7 +432,13 @@ int main_convert(int argc, char** argv) { continue; } } + // Tagged (rank>0) rgfa paths get written if a base path specified + if (get_rgfa_rank(path_name) > 0) { + assert(!rgfa_paths.count(path_name)); + rgfa_paths.insert(path_name); + } }); + } graph_to_gfa(graph_to_write, std::cout, rgfa_paths, rgfa_pline, wline); } else { @@ -468,7 +474,7 @@ void help_convert(char** argv) { << "gfa output options (use with -f):" << endl << " -P, --rgfa-path STR write given path as rGFA tags instead of lines (multiple allowed, only rank-0 supported)" << endl << " -Q, --rgfa-prefix STR write paths with given prefix as rGFA tags instead of lines (multiple allowed, only rank-0 supported)" << endl - << " -B, --rgfa-pline paths written as rGFA tags also written as lines" << endl + << " -B, --rgfa-pline paths written as rank 0 rGFA tags also written as lines" << endl << " -W, --no-wline write all paths as GFA P-lines instead of W-lines. Allows handling multiple phase blocks and subranges used together." << endl << " --gbwtgraph-algorithm Always use the GBWTGraph library GFA algorithm. Not compatible with other GBWT output options or non-GBWT graphs." << endl << " --vg-algorithm Always use the VG GFA algorithm. Works with all options and graph types, but can't preserve original GFA coordinates." 
<< endl diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 9468b9c764c..73f75f4691d 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -612,7 +612,9 @@ int main_paths(int argc, char** argv) { assert(rgfa_min_len >= 0); unordered_set reference_paths; for_each_selected_path([&](const path_handle_t& path_handle) { - reference_paths.insert(path_handle); + if (get_rgfa_rank(graph->get_path_name(path_handle)) <= 0) { + reference_paths.insert(path_handle); + } }); rgfa_graph_cover(mutable_graph, snarl_manager.get(), reference_paths, rgfa_min_len); From 2c61cb4404a29c4a11857ce6f7d097c0d344e54b Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 24 Mar 2023 13:30:10 -0400 Subject: [PATCH 11/21] fix assertion fails that happen in full hprc graph --- src/gfa.cpp | 19 +++++++++++++++---- src/subcommand/paths_main.cpp | 11 ++++++++--- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 8e40d444538..c537b1c00e2 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -539,7 +539,9 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, } } - if (ref_paths.empty() && !cover_node_to_fragment.count(snarl.start().node_id())) { + // note: checking both snarl endpoint here is actually necessary: if the reference path doesn't end in a tip, + // you can end up with a trivial snarl at its end which will crash on this test. + if (ref_paths.empty() && (!cover_node_to_fragment.count(snarl.start().node_id()) || !cover_node_to_fragment.count(snarl.end().node_id()))) { // we're not nested in a reference snarl, and we have no reference path // by the current logic, there's nothing to be done. cerr << "[rgfa] warning: No referene path through snarl " @@ -613,11 +615,20 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, vector> uncovered_intervals = get_uncovered_intervals(trav); for (const auto& uncovered_interval : uncovered_intervals) { + unordered_set cycle_check; + bool cyclic = false; int64_t interval_length = 0; - for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { - interval_length += graph->get_length(graph->get_handle_of_step(trav[i])); + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second && !cyclic; ++i) { + handle_t handle = graph->get_handle_of_step(trav[i]); + interval_length += graph->get_length(handle); + nid_t node_id = graph->get_id(handle); + if (cycle_check.count(node_id)) { + cyclic = true; + } else { + cycle_check.insert(node_id); + } } - if (interval_length >= minimum_length) { + if (!cyclic && interval_length >= minimum_length) { auto trav_stats = rgfa_traversal_stats(graph, trav, uncovered_interval); ranked_trav_fragments.push_back(make_pair(trav_stats, make_pair(trav_idx, uncovered_interval))); } diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 73f75f4691d..6d4d14f36e4 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -140,7 +140,8 @@ int main_paths(int argc, char** argv) { {"extract-vg", no_argument, 0, 'V'}, {"drop-paths", no_argument, 0, 'd'}, {"retain-paths", no_argument, 0, 'r'}, - {"rgfa-cover", required_argument, 0, 'R'}, + {"rgfa-cover", required_argument, 0, 'R'}, + {"snarls", required_argument, 0, 's'}, {"extract-gam", no_argument, 0, 'X'}, {"extract-gaf", no_argument, 0, 'A'}, {"list", no_argument, 0, 'L'}, @@ -162,7 +163,7 @@ int main_paths(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:aGp:ct:", + c = getopt_long 
(argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:s:aGp:ct:", long_options, &option_index); // Detect the end of the options. @@ -202,7 +203,11 @@ int main_paths(int argc, char** argv) { rgfa_min_len = parse(optarg); output_formats++; break; - + + case 's': + snarl_filename = optarg; + break; + case 'X': extract_as_gam = true; output_formats++; From 42234d67fb5809995e03613db34c2667fa548ccf Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 28 Mar 2023 20:27:19 -0400 Subject: [PATCH 12/21] pivot to store rgfa info in locus --- src/gfa.cpp | 122 ++++++++++++++++++++++++++++++++++------------------ src/gfa.hpp | 25 ++++++++--- 2 files changed, 100 insertions(+), 47 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index c537b1c00e2..3675f9497b5 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -77,7 +77,7 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& auto it = node_offsets_and_ranks.find(node_id); if (it != node_offsets_and_ranks.end()) { // hack off the rgfa tag - string base_name = strip_rgfa_rank(graph->get_path_name(get<0>(it->second))); + string base_name = strip_rgfa_path_name(graph->get_path_name(get<0>(it->second))); // hack off the subrange offset (and add it to SO) PathSense sense; string sample, locus; @@ -282,42 +282,96 @@ void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle_t path out << "\n"; } -int get_rgfa_rank(const string& path_name) { +int get_rgfa_rank(const string& path_name, const string& rgfa_sample) { int rank = -1; - size_t pos = path_name.rfind(":SR:i:"); - if (pos != string::npos && path_name.length() - pos >= 6) { + + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + size_t pos = path_locus.rfind(":SR:i:"); + if (pos != string::npos && path_locus.length() - pos >= 6) { + assert(path_sample == rgfa_sample); pos += 6; - size_t pos2 = path_name.find(":", pos); + size_t pos2 = path_locus.find(":", pos); size_t len = pos2 == string::npos ? 
pos2 : pos2 - pos; - string rank_string = path_name.substr(pos, len); + string rank_string = path_locus.substr(pos, len); rank = parse(rank_string); } return rank; } -string set_rgfa_rank(const string& path_name, int rgfa_rank) { - string new_name = strip_rgfa_rank(path_name); - // now append the rank - new_name += ":SR:i:" + std::to_string(rgfa_rank); - return new_name; +string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& rgfa_subrange, + const string& rgfa_sample) { + + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + // we're going to replace the existing sample name; so move it out into locus behind SN:Z: + assert(path_locus != PathMetadata::NO_LOCUS_NAME); + string base_name; + if (path_sample != PathMetadata::NO_SAMPLE_NAME) { + base_name = path_sample + ":"; + } + base_name += "SN:Z:" + path_locus; + + // and we also load in our rGFA tag + base_name += ":SR:i:" + std::to_string(rgfa_rank); + + // and return the final path, with sample/locus/rgfa-rank embedded in locus + // (as it's a reference path, we alsos strip out the phase block) + return PathMetadata::create_path_name(PathSense::REFERENCE, rgfa_sample, base_name, path_haplotype, + PathMetadata::NO_PHASE_BLOCK, rgfa_subrange); } -string strip_rgfa_rank(const string& path_name) { - string new_name; - // check if we have an existing rank. if so, we srap it. - size_t pos = path_name.rfind(":SR:i:"); - if (pos != string::npos && path_name.length() - pos >= 6) { - size_t pos2 = path_name.find(":", pos + 6); - new_name = path_name.substr(0, pos); - if (pos2 != string::npos) { - new_name += path_name.substr(pos2); +string strip_rgfa_path_name(const string& path_name, const string& rgfa_sample) { + + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + assert(path_locus != PathMetadata::NO_LOCUS_NAME); + + size_t sr_pos = path_locus.rfind(":SR:i:"); + if (sr_pos != string::npos && path_locus.length() - sr_pos >= 6) { + size_t sn_pos = path_locus.rfind("SN:Z:", sr_pos - 1); + assert(sn_pos != string::npos); + assert(path_sample == rgfa_sample); + + string orig_sample; + if (sn_pos > 0) { + orig_sample = path_locus.substr(0, sn_pos - 1); + } + string orig_locus = path_locus.substr(sn_pos + 5, sr_pos - sn_pos - 5); + + // todo: recover path sense / haploblock? 
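// Worked example of the naming round trip, using the default rGFA sample "_rGFA_" (the same
// form the test expectations use): a rank-1 cover fragment taken from the generic path "y" at
// offsets 8-24 is created as "_rGFA_#0#SN:Z:y:SR:i:1[8-24]", and feeding that name back through
// this function recovers the plain subpath name "y[8-24]".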
+ if (orig_sample.empty()) { + path_sense = PathSense::GENERIC; + path_haplotype = PathMetadata::NO_HAPLOTYPE; } - } else { - new_name = path_name; + return PathMetadata::create_path_name(path_sense, orig_sample, orig_locus, + path_haplotype, path_phase_block, path_subrange); } - return new_name; + return path_name; } + void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, SnarlManager* snarl_manager, const unordered_set& reference_paths, @@ -402,17 +456,7 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, for (const auto& path_fragments : path_to_fragments) { const path_handle_t& path_handle = path_fragments.first; - PathSense path_sense; - string path_sample; - string path_locus; - size_t path_haplotype; - size_t path_phase_block; - subrange_t path_subrange; - PathMetadata::parse_path_name(graph->get_path_name(path_handle), path_sense, path_sample, path_locus, - path_haplotype, path_phase_block, path_subrange); - // todo: we need a policy for this - //PathSense out_path_sense = path_sense == PathSense::HAPLOTYPE ? PathSense::GENERIC : path_sense; - PathSense out_path_sense = path_sense; + string path_name = graph->get_path_name(path_handle); const vector& fragments = path_fragments.second; @@ -452,14 +496,10 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, for (const step_handle_t& step : rgfa_fragment) { rgfa_frag_length += graph->get_length(graph->get_handle_of_step(step)); } - subrange_t rgfa_frag_subrange; - rgfa_frag_subrange.first = rgfa_frag_pos + (path_subrange != PathMetadata::NO_SUBRANGE ? path_subrange.first : 0); + subrange_t rgfa_frag_subrange = graph->get_subrange(path_handle); + rgfa_frag_subrange.first = rgfa_frag_pos + (rgfa_frag_subrange != PathMetadata::NO_SUBRANGE ? rgfa_frag_subrange.first : 0); rgfa_frag_subrange.second = rgfa_frag_subrange.first + rgfa_frag_length; - - string rgfa_frag_name = PathMetadata::create_path_name(out_path_sense, path_sample, path_locus, path_haplotype, - path_phase_block, rgfa_frag_subrange); - - rgfa_frag_name = set_rgfa_rank(rgfa_frag_name, rgfa_rank); + string rgfa_frag_name = create_rgfa_path_name(path_name, rgfa_rank, rgfa_frag_subrange); #ifdef debug #pragma omp critical(cerr) diff --git a/src/gfa.hpp b/src/gfa.hpp index 9c57f8c2ff5..ba59a292519 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -31,18 +31,31 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, /// Prototype code to tag paths as rGFA paths. Either needs to be completely scrapped /// or adapted into libhandlegraph at some point, ideally. -/// It works by adding :SR:i: to the end of the name + +/// It works by using a special sample name (default=_rGFA_) for rGFA contigs. +/// Any real sample name gets pushed into the locus field behind its rGFA tag SN:Z: +/// The rGFA rank also goes in the locus field behind SR:i: + +/// In GFA, these paths live in rGFA tags on S elements +/// In the graph, they are reference paths with SN/SR fields in their locus names. +/// As it stands, they will come out as W-lines in GFA with vg view or vg convert (without -Q/-P) + +/// Note that rank-0 rGFA fragments (aka normal reference paths) do *not* get the rGFA +/// sample, and are treated as normal reference paths all the way through (but can get rGFA tags) +/// when specified with -Q/-P in convert -f. /// Returns the RGFA rank (SR) of a path. This will be 0 for the reference /// backbone, and higher the further number of (nested) bubbles away it is. 
/// If the path is not an RGFA path, then return -1 -int get_rgfa_rank(const string& path_name); +int get_rgfa_rank(const string& path_name, const string& rgfa_sample="_rGFA_"); -/// Add the RGFA rank tag to a pathname -string set_rgfa_rank(const string& path_name, int rgfa_rank); +/// Add the rgfa rank to a pathname, also setting its sample to the special rgfa sample and +/// moving its old sample into the locus field +string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& subrange, + const string& rgfa_sample="_rGFA_"); -/// Get the name without the rank -string strip_rgfa_rank(const string& path_name); +/// Remove the rGFA information from a path name, effectively undoing set_rgfa_rank +string strip_rgfa_path_name(const string& path_name, const string& rgfa_sample="_rGFA_"); /// Compute the rGFA path cover /// graph: the graph From 2b38d5a5efe9e76b8a7078552fbbd88d324f370b Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 10 May 2023 12:09:23 -0400 Subject: [PATCH 13/21] parameterize rgfa sample name --- src/gfa.cpp | 8 ++++---- src/gfa.hpp | 8 +++++--- src/subcommand/paths_main.cpp | 13 ++++++++++--- 3 files changed, 19 insertions(+), 10 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 3675f9497b5..2848e202a6b 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -20,7 +20,7 @@ static bool should_write_as_w_line(const PathHandleGraph* graph, path_handle_t p static void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle_t path_handle, unordered_map, size_t>& last_phase_block_end); void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& rgfa_paths, - bool rgfa_pline, bool use_w_lines) { + bool rgfa_pline, bool use_w_lines, const string& rgfa_sample_name) { // TODO: Support sorting nodes, paths, and/or edges for canonical output // TODO: Use a NamedNodeBackTranslation (or forward translation?) to properly round-trip GFA that has had to be chopped. @@ -335,7 +335,7 @@ string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subra PathMetadata::NO_PHASE_BLOCK, rgfa_subrange); } -string strip_rgfa_path_name(const string& path_name, const string& rgfa_sample) { +string strip_rgfa_path_name(const string& path_name) { PathSense path_sense; string path_sample; @@ -352,7 +352,6 @@ string strip_rgfa_path_name(const string& path_name, const string& rgfa_sample) if (sr_pos != string::npos && path_locus.length() - sr_pos >= 6) { size_t sn_pos = path_locus.rfind("SN:Z:", sr_pos - 1); assert(sn_pos != string::npos); - assert(path_sample == rgfa_sample); string orig_sample; if (sn_pos > 0) { @@ -376,6 +375,7 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, SnarlManager* snarl_manager, const unordered_set& reference_paths, int64_t minimum_length, + const string& rgfa_sample_name, const unordered_map>>& preferred_intervals){ // for sanity's sake, we don't want to ever support multiple rgfa covers, so start by @@ -499,7 +499,7 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, subrange_t rgfa_frag_subrange = graph->get_subrange(path_handle); rgfa_frag_subrange.first = rgfa_frag_pos + (rgfa_frag_subrange != PathMetadata::NO_SUBRANGE ? 
rgfa_frag_subrange.first : 0); rgfa_frag_subrange.second = rgfa_frag_subrange.first + rgfa_frag_length; - string rgfa_frag_name = create_rgfa_path_name(path_name, rgfa_rank, rgfa_frag_subrange); + string rgfa_frag_name = create_rgfa_path_name(path_name, rgfa_rank, rgfa_frag_subrange, rgfa_sample_name); #ifdef debug #pragma omp critical(cerr) diff --git a/src/gfa.hpp b/src/gfa.hpp index ba59a292519..3603224cf1a 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -26,7 +26,8 @@ using namespace std; void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& rgfa_paths = {}, bool rgfa_pline = false, - bool use_w_lines = true); + bool use_w_lines = true, + const string& rgfa_sample_name = ""); /// Prototype code to tag paths as rGFA paths. Either needs to be completely scrapped @@ -52,10 +53,10 @@ int get_rgfa_rank(const string& path_name, const string& rgfa_sample="_rGFA_"); /// Add the rgfa rank to a pathname, also setting its sample to the special rgfa sample and /// moving its old sample into the locus field string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& subrange, - const string& rgfa_sample="_rGFA_"); + const string& rgfa_sample); /// Remove the rGFA information from a path name, effectively undoing set_rgfa_rank -string strip_rgfa_path_name(const string& path_name, const string& rgfa_sample="_rGFA_"); +string strip_rgfa_path_name(const string& path_name); /// Compute the rGFA path cover /// graph: the graph @@ -67,6 +68,7 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, SnarlManager* snarl_manager, const unordered_set& reference_paths, int64_t minimum_length, + const string& rgfa_sample_name, const unordered_map>>& preferred_intervals = {}); void rgfa_snarl_cover(const PathHandleGraph* graph, diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 6d4d14f36e4..0e62b6de87a 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -42,8 +42,9 @@ void help_paths(char** argv) { << " -r, --retain-paths output a graph with only the selected paths retained" << endl << " rGFA cover" << endl << " -R, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl + << " -N, --rgfa-sample STR give all rGFA cover fragments sample name (path prefix) STR (default: _rGFA_)." << endl << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." 
<< endl - << " -t, --threads N use up to N threads when computing rGFA covoer (default: all available)" << endl + << " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl << " output path data:" << endl << " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl << " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl @@ -107,6 +108,7 @@ int main_paths(int argc, char** argv) { bool drop_paths = false; bool retain_paths = false; int64_t rgfa_min_len = -1; + string rgfa_sample_name = "_rGFA_"; string snarl_filename; string graph_file; string gbwt_file; @@ -141,6 +143,7 @@ int main_paths(int argc, char** argv) { {"drop-paths", no_argument, 0, 'd'}, {"retain-paths", no_argument, 0, 'r'}, {"rgfa-cover", required_argument, 0, 'R'}, + {"rgfa-sample", required_argument, 0, 'N'}, {"snarls", required_argument, 0, 's'}, {"extract-gam", no_argument, 0, 'X'}, {"extract-gaf", no_argument, 0, 'A'}, @@ -163,7 +166,7 @@ int main_paths(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:s:aGp:ct:", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:N:s:aGp:ct:", long_options, &option_index); // Detect the end of the options. @@ -204,6 +207,10 @@ int main_paths(int argc, char** argv) { output_formats++; break; + case 'N': + rgfa_sample_name = optarg; + break; + case 's': snarl_filename = optarg; break; @@ -622,7 +629,7 @@ int main_paths(int argc, char** argv) { } }); - rgfa_graph_cover(mutable_graph, snarl_manager.get(), reference_paths, rgfa_min_len); + rgfa_graph_cover(mutable_graph, snarl_manager.get(), reference_paths, rgfa_min_len, rgfa_sample_name); } // output the graph From 5ee7304afe83bc077fe7db4c314cd625a0ba956e Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 11 May 2023 11:07:08 -0400 Subject: [PATCH 14/21] adapt rgfa tests to latest conventions --- src/gfa.cpp | 18 +++++------------- src/gfa.hpp | 5 ++--- test/t/11_vg_paths.t | 28 ++++++++++++++-------------- 3 files changed, 21 insertions(+), 30 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 2848e202a6b..45cf99ca1be 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -20,8 +20,8 @@ static bool should_write_as_w_line(const PathHandleGraph* graph, path_handle_t p static void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle_t path_handle, unordered_map, size_t>& last_phase_block_end); void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& rgfa_paths, - bool rgfa_pline, bool use_w_lines, const string& rgfa_sample_name) { - + bool rgfa_pline, bool use_w_lines) { + // TODO: Support sorting nodes, paths, and/or edges for canonical output // TODO: Use a NamedNodeBackTranslation (or forward translation?) to properly round-trip GFA that has had to be chopped. 
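// A minimal sketch of how the pieces added in this series fit together. This is an
// illustration, not code from the patch: the helper name make_rgfa_gfa, the 8bp minimum
// fragment length and the single-path backbone selection are all assumptions, and a
// SnarlManager is presumed to have been built elsewhere (e.g. from vg snarls output).
#include "gfa.hpp"
#include "snarls.hpp"
#include <ostream>
#include <set>
#include <string>
#include <unordered_set>

namespace vg {

void make_rgfa_gfa(MutablePathMutableHandleGraph* graph, SnarlManager* snarl_manager,
                   const std::string& backbone_name, std::ostream& out) {
    // the one selected path becomes the rank-0 backbone of the cover
    std::unordered_set<path_handle_t> reference_paths;
    reference_paths.insert(graph->get_path_handle(backbone_name));

    // add rank >= 1 cover fragments to the graph as new embedded paths,
    // skipping any fragment shorter than 8bp
    rgfa_graph_cover(graph, snarl_manager, reference_paths, 8, "_rGFA_");

    // emit the backbone and the freshly created cover fragments as S-line tags,
    // mirroring what vg convert does when a base path is selected
    std::set<std::string> rgfa_paths = {backbone_name};
    graph->for_each_path_handle([&](path_handle_t path_handle) {
        std::string name = graph->get_path_name(path_handle);
        if (get_rgfa_rank(name) > 0) {
            rgfa_paths.insert(name);
        }
    });
    graph_to_gfa(graph, out, rgfa_paths, /*rgfa_pline=*/false, /*use_w_lines=*/true);
}

}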
@@ -118,7 +118,6 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& vector w_line_paths; - bool warned_about_tags_as_paths = false; // Paths as P-lines for (const path_handle_t& h : path_handles) { auto path_name = graph->get_path_name(h); @@ -126,9 +125,6 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& if (!rgfa_paths.empty()) { // the path was put into tags, no reason to deal with it here continue; - } else if (!warned_about_tags_as_paths) { - cerr << "warning [gfa]: outputing rGFA cover (rank>=1) path(s) as a P-line(s) and not tags because no reference (rank==0) selected" << endl; - warned_about_tags_as_paths = true; } } if (rgfa_pline || !rgfa_paths.count(path_name)) { @@ -171,9 +167,6 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& if (!rgfa_paths.empty()) { // the path was put into tags, no reason to deal with it here continue; - } else if (!warned_about_tags_as_paths) { - cerr << "warning [gfa]: outputing rGFA cover (rank>=1) path(s) as a W-line(s) and not tags because no reference (rank==0) selected" << endl; - warned_about_tags_as_paths = true; } } write_w_line(graph, out, h, last_phase_block_end); @@ -282,7 +275,7 @@ void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle_t path out << "\n"; } -int get_rgfa_rank(const string& path_name, const string& rgfa_sample) { +int get_rgfa_rank(const string& path_name) { int rank = -1; PathSense path_sense; @@ -296,7 +289,6 @@ int get_rgfa_rank(const string& path_name, const string& rgfa_sample) { size_t pos = path_locus.rfind(":SR:i:"); if (pos != string::npos && path_locus.length() - pos >= 6) { - assert(path_sample == rgfa_sample); pos += 6; size_t pos2 = path_locus.find(":", pos); size_t len = pos2 == string::npos ? pos2 : pos2 - pos; @@ -399,10 +391,10 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, size_t thread_count = get_thread_count(); vector>>> thread_covers(thread_count); - // we process top-level snarl_manager in parallel + // we process top-level snarls in parallel snarl_manager->for_each_top_level_snarl_parallel([&](const Snarl* snarl) { // per-thread output - // each fragment is a rank and vector of steps, the cove is a list of fragments + // each fragment is a rank and vector of steps, the cover is a list of fragments // TODO: we can store just a first step and count instead of every fragment vector>>& cover_fragments = thread_covers.at(omp_get_thread_num()); // we also index the fragments by their node ids for fast lookups of what's covered by what diff --git a/src/gfa.hpp b/src/gfa.hpp index 3603224cf1a..e66d91aeb72 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -26,8 +26,7 @@ using namespace std; void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& rgfa_paths = {}, bool rgfa_pline = false, - bool use_w_lines = true, - const string& rgfa_sample_name = ""); + bool use_w_lines = true); /// Prototype code to tag paths as rGFA paths. Either needs to be completely scrapped @@ -48,7 +47,7 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, /// Returns the RGFA rank (SR) of a path. This will be 0 for the reference /// backbone, and higher the further number of (nested) bubbles away it is. 
/// If the path is not an RGFA path, then return -1 -int get_rgfa_rank(const string& path_name, const string& rgfa_sample="_rGFA_"); +int get_rgfa_rank(const string& path_name); /// Add the rgfa rank to a pathname, also setting its sample to the special rgfa sample and /// moving its old sample into the locus field diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index cd28a8fefa5..29d92ba36e4 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -59,46 +59,46 @@ is $? 0 "vg path coverage reports correct lengths in first column" rm -f q.vg q.cov.len q.len -vg paths -v rgfa/rgfa_tiny.gfa -R 1 -Q x | vg view - > rgfa_tiny.rgfa -printf "P y[33-34]:SR:i:1 10+ * -P y[38-39]:SR:i:1 13+ * -P y[8-10]:SR:i:1 2+,4+ *\n" > rgfa_tiny_expected_fragments.rgfa +vg paths -v rgfa/rgfa_tiny.gfa -R 1 -Q x -N c | vg convert - -fW > rgfa_tiny.rgfa +printf "P c#0#SN:Z:y:SR:i:1[33-34] 10+ * +P c#0#SN:Z:y:SR:i:1[38-39] 13+ * +P c#0#SN:Z:y:SR:i:1[8-10] 2+,4+ *\n" > rgfa_tiny_expected_fragments.rgfa grep ^P rgfa_tiny.rgfa | grep SR | sort > rgfa_tiny_fragments.rgfa diff rgfa_tiny_fragments.rgfa rgfa_tiny_expected_fragments.rgfa is $? 0 "Found the expected rgfa SNP cover of tiny graph" rm -f rgfa_tiny.rgfa rgfa_tiny_expected_fragments.rgfa rgfa_tiny_fragments.rgfa -vg paths -v rgfa/rgfa_ins.gfa -R 5 -Q x | vg view - > rgfa_ins.rgfa -printf "P z[8-17]:SR:i:1 2+,3+,4+ *\n" > rgfa_ins_expected_fragments.rgfa +vg paths -v rgfa/rgfa_ins.gfa -R 5 -Q x -N c | vg convert - -fW > rgfa_ins.rgfa +printf "P c#0#SN:Z:z:SR:i:1[8-17] 2+,3+,4+ *\n" > rgfa_ins_expected_fragments.rgfa grep ^P rgfa_ins.rgfa | grep SR | sort > rgfa_ins_fragments.rgfa diff rgfa_ins_fragments.rgfa rgfa_ins_expected_fragments.rgfa is $? 0 "Found the expected rgfa cover for simple nested insertion" rm -f rgfa_ins.rgfa rgfa_ins_expected_fragments.rgfa rgfa_ins_fragments.rgfa -vg paths -v rgfa/rgfa_ins2.gfa -R 3 -Q x | vg view - > rgfa_ins2.rgfa -printf "P y[8-24]:SR:i:1 2+,6+,4+ * -P z[11-14]:SR:i:2 3+ *\n" > rgfa_ins2_expected_fragments.rgfa +vg paths -v rgfa/rgfa_ins2.gfa -R 3 -Q x | vg convert - -fW > rgfa_ins2.rgfa +printf "P _rGFA_#0#SN:Z:y:SR:i:1[8-24] 2+,6+,4+ * +P _rGFA_#0#SN:Z:z:SR:i:2[11-14] 3+ *\n" > rgfa_ins2_expected_fragments.rgfa grep ^P rgfa_ins2.rgfa | grep SR | sort > rgfa_ins2_fragments.rgfa diff rgfa_ins2_fragments.rgfa rgfa_ins2_expected_fragments.rgfa is $? 0 "Found the expected rgfa cover for simple nested insertion that requires two fragments" rm -f rgfa_ins2.rgfa rgfa_ins2_expected_fragments.rgfa rgfa_ins2_fragments.rgfa -vg paths -v rgfa/rgfa_ins2.gfa -R 5 -Q x | vg view - > rgfa_ins2R5.rgfa -printf "P y[8-24]:SR:i:1 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa +vg paths -v rgfa/rgfa_ins2.gfa -R 5 -Q x -N c | vg convert - -fW > rgfa_ins2R5.rgfa +printf "P c#0#SN:Z:y:SR:i:1[8-24] 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa grep ^P rgfa_ins2R5.rgfa | grep SR | sort > rgfa_ins2R5_fragments.rgfa diff rgfa_ins2R5_fragments.rgfa rgfa_ins2R5_expected_fragments.rgfa is $? 
0 "rgfa Minimum fragment length filters out small fragment" rm -f rgfa_ins2R5.rgfa rgfa_ins2R5_expected_fragments.rgfa rgfa_ins2R5_fragments.rgfa -vg paths -v rgfa/rgfa_ins3.gfa -R 3 -Q x | vg view - > rgfa_ins3.rgfa +vg paths -v rgfa/rgfa_ins3.gfa -R 3 -Q x -N c | vg convert - -fW > rgfa_ins3.rgfa printf "P x 1+,5+ * -P y[3-19]:SR:i:1 4+,6+,2+ * +P c#0#SN:Z:y:SR:i:1[3-19] 4+,6+,2+ * P y 5-,4+,6+,2+,1- * -P z[11-14]:SR:i:2 3+ * +P c#0#SN:Z:z:SR:i:2[11-14] 3+ * P z 1+,2-,3+,4-,5+ *\n" | sort > rgfa_ins3_expected_fragments.rgfa grep ^P rgfa_ins3.rgfa | sort > rgfa_ins3_fragments.rgfa diff rgfa_ins3_fragments.rgfa rgfa_ins3_expected_fragments.rgfa From 13825c38abfc424f717dd38783be439b6940fe6f Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 17 May 2023 11:19:06 -0400 Subject: [PATCH 15/21] leave subpath *in* rgfa names in surject (for now) --- src/hts_alignment_emitter.cpp | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/src/hts_alignment_emitter.cpp b/src/hts_alignment_emitter.cpp index 2df5a1fac51..63b93b1c903 100644 --- a/src/hts_alignment_emitter.cpp +++ b/src/hts_alignment_emitter.cpp @@ -12,6 +12,7 @@ #include "algorithms/find_translation.hpp" #include #include +#include "gfa.hpp" #include @@ -178,7 +179,10 @@ pair>, unordered_map> extract_path auto& path = get<0>(path_info); auto& own_length = get<1>(path_info); auto& base_length = get<2>(path_info); - string base_path_name = subpath_support ? get_path_base_name(graph, path) : graph.get_path_name(path); + string base_path_name = graph.get_path_name(path); + if (subpath_support && get_rgfa_rank(base_path_name) < 0) { + base_path_name = get_path_base_name(graph, path); + } if (!base_path_set.count(base_path_name)) { path_names_and_lengths.push_back(make_pair(base_path_name, base_length)); base_path_set.insert(base_path_name); @@ -355,9 +359,14 @@ vector> get_sequence_dictionary(const strin // When we find a path or subpath we want, we will keep it. auto keep_path_or_subpath = [&](const path_handle_t& path) { - string base_name = get_path_base_name(graph, path); + string base_name = graph.get_path_name(path); + bool is_rgfa = get_rgfa_rank(base_name) >= 0; + if (!is_rgfa) { + // only process subpaths if not rgfa + base_name = get_path_base_name(graph, path); + } int64_t base_length = -1; - if (graph.get_subrange(path) == PathMetadata::NO_SUBRANGE) { + if (is_rgfa || graph.get_subrange(path) == PathMetadata::NO_SUBRANGE) { // This is a full path so we can determine base length now. 
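// (rGFA cover fragments are deliberately treated as full paths here: their bracketed subrange
// stays part of the target name, per the convention above, so the fragment's own length is
// what goes into the dictionary.)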
base_length = graph.get_path_length(path); } @@ -691,11 +700,13 @@ void HTSAlignmentEmitter::convert_alignment(const Alignment& aln, vector Date: Fri, 19 May 2023 21:26:40 -0400 Subject: [PATCH 16/21] write all context metadata inside rgfa pathname --- src/gfa.cpp | 216 +++++++++++++++++++++++++++++++++++++++++++--------- src/gfa.hpp | 23 +++++- 2 files changed, 202 insertions(+), 37 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 45cf99ca1be..6ed4169dc62 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -299,8 +299,15 @@ int get_rgfa_rank(const string& path_name) { } string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& rgfa_subrange, - const string& rgfa_sample) { - + const string& rgfa_sample, + const string& start_parent_rgfa_path_name, + int64_t start_parent_offset, + int64_t start_parent_node_id, + bool start_parent_node_reversed, + const string& end_parent_rgfa_path_name, + int64_t end_parent_offset, + int64_t end_parent_node_id, + bool end_parent_node_reversed) { PathSense path_sense; string path_sample; string path_locus; @@ -321,6 +328,19 @@ string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subra // and we also load in our rGFA tag base_name += ":SR:i:" + std::to_string(rgfa_rank); + // start parent information + if (!start_parent_rgfa_path_name.empty()) { + base_name += ":SPP:Z:" + start_parent_rgfa_path_name + + ":SPO:i:" + std::to_string(start_parent_offset) + + ":SPH:Z:" + (start_parent_node_reversed ? "<" : ">") + std::to_string(start_parent_node_id); + } + // end parent information + if (!end_parent_rgfa_path_name.empty()) { + base_name += ":EPP:Z:" + end_parent_rgfa_path_name + + ":EPO:i:" + std::to_string(end_parent_offset) + + ":EPH:Z:" + (end_parent_node_reversed ? "<" : ">") + std::to_string(end_parent_node_id); + } + // and return the final path, with sample/locus/rgfa-rank embedded in locus // (as it's a reference path, we alsos strip out the phase block) return PathMetadata::create_path_name(PathSense::REFERENCE, rgfa_sample, base_name, path_haplotype, @@ -389,14 +409,15 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, // we collect the rgfa cover in parallel as a list of path fragments size_t thread_count = get_thread_count(); - vector>>> thread_covers(thread_count); + vector> thread_covers(thread_count); // we process top-level snarls in parallel snarl_manager->for_each_top_level_snarl_parallel([&](const Snarl* snarl) { // per-thread output // each fragment is a rank and vector of steps, the cover is a list of fragments // TODO: we can store just a first step and count instead of every fragment - vector>>& cover_fragments = thread_covers.at(omp_get_thread_num()); + // The last two numbers are the indexes of the start and end parent fragments (ie values in cover_not_to_fragment) + vector& cover_fragments = thread_covers.at(omp_get_thread_num()); // we also index the fragments by their node ids for fast lookups of what's covered by what // the value here is an index in the above vector unordered_map cover_node_to_fragment; @@ -429,12 +450,18 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, }); // merge up the thread covers - vector>> rgfa_fragments = std::move(thread_covers.at(0)); + vector rgfa_fragments = std::move(thread_covers.at(0)); for (size_t t = 1; t < thread_count; ++t) { rgfa_fragments.reserve(rgfa_fragments.size() + thread_covers.at(t).size()); + // adjust the offsets into each vector + for (auto& other_frag : thread_covers.at(t)) { + other_frag.start_parent_idx 
+= rgfa_fragments.size(); + other_frag.end_parent_idx += rgfa_fragments.size(); + } std::move(thread_covers.at(t).begin(), thread_covers.at(t).end(), std::back_inserter(rgfa_fragments)); } thread_covers.clear(); + // we don't have a path position interface, and even if we did we probably wouldn't have it on every path // so to keep running time linear, we need to index the fragments so their offsets can be computed in one scan @@ -442,64 +469,127 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, unordered_map> path_to_fragments; for (size_t i = 0; i get_path_handle_of_step(rgfa_fragment.second.front()); + path_handle_t path_handle = graph->get_path_handle_of_step(rgfa_fragment.steps.front()); path_to_fragments[path_handle].push_back(i); } + unordered_map step_to_pos; + for (const auto& path_fragments : path_to_fragments) { const path_handle_t& path_handle = path_fragments.first; string path_name = graph->get_path_name(path_handle); - - const vector& fragments = path_fragments.second; - // for each path, start by finding the positional offset of all relevant steps in the path by brute-force scan - unordered_map step_to_pos; + const vector& fragments = get<1>(path_fragments); + + // for each path, start by finding the positional offset of all relevant steps in the path by brute-force scann + int64_t set_count = 0; for (const int64_t& frag_idx : fragments) { - const vector& rgfa_fragment = rgfa_fragments.at(frag_idx).second; - step_to_pos[rgfa_fragment.front()] = -1; - // todo: need to figure out where exactly we handle all the different orientation cases - if (rgfa_fragment.size() > 1) { - step_to_pos[rgfa_fragment.back()] = -1; + const vector& rgfa_fragment = rgfa_fragments.at(frag_idx).steps; + for (const step_handle_t& step: rgfa_fragment) { + step_to_pos[step] = -1; + cerr << "setting " << graph->get_id(graph->get_handle_of_step(step)) << " to -1" << endl; + assert(graph->get_path_handle_of_step(step) == path_handle); + ++set_count; } } - size_t pos_count = 0; size_t pos = 0; graph->for_each_step_in_path(path_handle, [&](const step_handle_t& step_handle) { if (step_to_pos.count(step_handle)) { step_to_pos[step_handle] = pos; - ++pos_count; - if (pos_count == step_to_pos.size()) { - return false; - } + cerr << "resetting " << graph->get_id(graph->get_handle_of_step(step_handle)) << " to " << pos << endl; + --set_count; } handle_t handle = graph->get_handle_of_step(step_handle); pos += graph->get_length(handle); return true; }); - assert(pos_count == step_to_pos.size()); + //assert(set_count == 0); + } + + for (const auto& path_fragments : path_to_fragments) { + const path_handle_t& path_handle = path_fragments.first; + string path_name = graph->get_path_name(path_handle); + + const vector& fragments = get<1>(path_fragments); // second pass to make the path fragments, now that we know the positional offsets of their endpoints for (const int64_t frag_idx : fragments) { - int64_t rgfa_rank = rgfa_fragments.at(frag_idx).first; - const vector& rgfa_fragment = rgfa_fragments.at(frag_idx).second; + const RGFAFragment& rgfa_fragment = rgfa_fragments.at(frag_idx); + // don't do anything for rank-0, which we only kept to help with metadata for other fragments + // todo: can we upstream this check? 
+ if (rgfa_fragment.rank == 0) { + continue; + } - size_t rgfa_frag_pos = step_to_pos[rgfa_fragment.front()]; + size_t rgfa_frag_pos = step_to_pos[rgfa_fragment.steps.front()]; size_t rgfa_frag_length = 0; - for (const step_handle_t& step : rgfa_fragment) { + for (const step_handle_t& step : rgfa_fragment.steps) { rgfa_frag_length += graph->get_length(graph->get_handle_of_step(step)); } subrange_t rgfa_frag_subrange = graph->get_subrange(path_handle); rgfa_frag_subrange.first = rgfa_frag_pos + (rgfa_frag_subrange != PathMetadata::NO_SUBRANGE ? rgfa_frag_subrange.first : 0); rgfa_frag_subrange.second = rgfa_frag_subrange.first + rgfa_frag_length; - string rgfa_frag_name = create_rgfa_path_name(path_name, rgfa_rank, rgfa_frag_subrange, rgfa_sample_name); + + + string start_parent_rgfa_path_name = ""; + int64_t start_parent_offset = -1; + int64_t start_parent_node_id = -1; + bool start_parent_node_reversed = false; + + // get information about the start parent + if (rgfa_fragment.start_parent_idx >= 0) { + const RGFAFragment& start_parent_fragment = rgfa_fragments.at(rgfa_fragment.start_parent_idx); + path_handle_t start_parent_path_handle = graph->get_path_handle_of_step(start_parent_fragment.steps.front()); + start_parent_rgfa_path_name = graph->get_path_name(start_parent_path_handle); + /* + cerr << "assert is gonna check " << graph->get_path_name(graph->get_path_handle_of_step(rgfa_fragment.start_parent_step)) << " vs " << graph->get_path_name(start_parent_path_handle) << endl; + assert(graph->get_path_handle_of_step(rgfa_fragment.start_parent_step) == start_parent_path_handle); + cerr << "checking step " << start_parent_rgfa_path_name << graph->get_id(graph->get_handle_of_step(rgfa_fragment.start_parent_step)) << ":" << graph->get_is_reverse(graph->get_handle_of_step(rgfa_fragment.start_parent_step)) << endl; + cerr << "via " << path_name << " " << graph->get_id(graph->get_handle_of_step(rgfa_fragment.steps.front())) + << ":" << graph->get_is_reverse(graph->get_handle_of_step(rgfa_fragment.steps.front())) << endl; + */ + start_parent_offset = step_to_pos.at(rgfa_fragment.start_parent_step); + subrange_t start_parent_subrange = graph->get_subrange(start_parent_path_handle); + if (start_parent_subrange != PathMetadata::NO_SUBRANGE) { + start_parent_offset += start_parent_subrange.first; + } + start_parent_node_id = graph->get_id(graph->get_handle_of_step(start_parent_fragment.start_parent_step)); + start_parent_node_reversed = graph->get_is_reverse(graph->get_handle_of_step(start_parent_fragment.start_parent_step)); + } + + string end_parent_rgfa_path_name = ""; + int64_t end_parent_offset = -1; + int64_t end_parent_node_id = -1; + bool end_parent_node_reversed = false; + + // get information about the end parent + if (rgfa_fragment.end_parent_idx >= 0) { + const RGFAFragment& end_parent_fragment = rgfa_fragments.at(rgfa_fragment.end_parent_idx); + path_handle_t end_parent_path_handle = graph->get_path_handle_of_step(end_parent_fragment.steps.front()); + end_parent_rgfa_path_name = graph->get_path_name(end_parent_path_handle); + assert(graph->get_path_handle_of_step(rgfa_fragment.end_parent_step) == end_parent_path_handle); + end_parent_offset = step_to_pos.at(rgfa_fragment.end_parent_step); + subrange_t end_parent_subrange = graph->get_subrange(end_parent_path_handle); + if (end_parent_subrange != PathMetadata::NO_SUBRANGE) { + end_parent_offset += end_parent_subrange.first; + } + end_parent_node_id = graph->get_id(graph->get_handle_of_step(end_parent_fragment.end_parent_step)); + 
end_parent_node_reversed = graph->get_is_reverse(graph->get_handle_of_step(end_parent_fragment.end_parent_step)); + } + + string rgfa_frag_name = create_rgfa_path_name(path_name, rgfa_fragment.rank, rgfa_frag_subrange, rgfa_sample_name, + start_parent_rgfa_path_name, start_parent_offset, + start_parent_node_id, start_parent_node_reversed, + end_parent_rgfa_path_name, end_parent_offset, + end_parent_node_id, end_parent_node_reversed); #ifdef debug #pragma omp critical(cerr) - cerr << "making new rgfa fragment with name " << rgfa_frag_name << " and " << rgfa_fragment.size() << " steps. subrange " + cerr << "making new rgfa fragment with name " << rgfa_frag_name << " and " << rgfa_fragment.steps.size() << " steps. subrange " << rgfa_frag_subrange.first << "," << rgfa_frag_subrange.second << endl; #endif path_handle_t rgfa_fragment_handle = graph->create_path_handle(rgfa_frag_name); - for (const step_handle_t& step : rgfa_fragment) { + for (const step_handle_t& step : rgfa_fragment.steps) { graph->append_step(rgfa_fragment_handle, graph->get_handle_of_step(step)); } } @@ -514,7 +604,7 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, PathTraversalFinder& path_trav_finder, const unordered_set& reference_paths, int64_t minimum_length, - vector>>& cover_fragments, + vector& cover_fragments, unordered_map& cover_node_to_fragment, const unordered_map>>& preferred_intervals) { @@ -588,14 +678,21 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, } if (!ref_paths.empty()) { - // reference paths are trivially covered outside the snarl decomposition - // all we do here is make sure the relevant nodes are flagged in the map + // update the cover: note that we won't actually make a path out of this + // since we leave rank-0 paths the way they are, but it's handy to have + // a consistent representation for the cover while linking up the + // various nesting metadata. vector& ref_trav = travs.at(ref_paths.begin()->second); for (step_handle_t ref_step_handle : ref_trav) { nid_t node_id = graph->get_id(graph->get_handle_of_step(ref_step_handle)); - // using -1 as special signifier of the reference path - cover_node_to_fragment[node_id] = -1; + cover_node_to_fragment[node_id] = cover_fragments.size(); } + + // todo: check endpoints (and remove from ref_trav??) + RGFAFragment frag = { + 0, ref_trav, -1, -1, ref_trav.front(), ref_trav.back() + }; + cover_fragments.push_back(frag); } #ifdef debug @@ -738,8 +835,39 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, if (min_frag_idx == -1) { fragment_rank = 1; } else { - fragment_rank = cover_fragments.at(min_frag_idx).first + 1; + fragment_rank = cover_fragments.at(min_frag_idx).rank + 1; + } + + // now we need to find the steps on the parent path, in order to link back to its position + // todo: can we avoid this enumeration by keeping track of the parent directly from the beginning? 
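// The lookup below finds, on the start parent fragment's own path, the unique step for the node
// just before the uncovered interval; a similar lookup further down handles the node just after
// it. Those steps are stashed as start_parent_step / end_parent_step so the fragment's name can
// later record the offsets at which it attaches to its parents, and the asserts insist that the
// anchoring node occurs exactly once on the path being searched.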
+ handle_t start_handle = graph->get_handle_of_step(trav[uncovered_interval.first - 1]); + vector start_handle_steps = graph->steps_of_handle(start_handle); + path_handle_t start_parent_path = graph->get_path_handle_of_step(cover_fragments.at(prev_frag_idx).steps.front()); + vector start_parent_step_indexes; + for (int64_t i = 0; i < start_handle_steps.size(); ++i) { + if (graph->get_path_handle_of_step(start_handle_steps[i]) == start_parent_path) { + start_parent_step_indexes.push_back(i); + } + } + assert(start_parent_step_indexes.size() == 1); + step_handle_t start_parent_step = start_handle_steps[start_parent_step_indexes[0]]; + +#ifdef debug +#pragma omp critical(cerr) + cerr << "adding step " << graph->get_path_name(start_parent_path) << " " << graph->get_id(graph->get_handle_of_step(start_parent_step)) << " as start parent for " << graph->get_path_name(graph->get_path_handle_of_step(trav.front())) << " starting at " << graph->get_id(graph->get_handle_of_step(trav.at(uncovered_interval.first))) << endl; +#endif + + handle_t end_handle = graph->get_handle_of_step(trav[uncovered_interval.second]); + vector end_handle_steps = graph->steps_of_handle(end_handle); + path_handle_t end_parent_path = graph->get_path_handle_of_step(cover_fragments.at(prev_frag_idx).steps.front()); + vector end_parent_step_indexes; + for (int64_t i = 0; i < end_handle_steps.size(); ++i) { + if (graph->get_path_handle_of_step(end_handle_steps[i]) == end_parent_path) { + end_parent_step_indexes.push_back(i); + } } + assert(end_parent_step_indexes.size() == 1); + step_handle_t end_parent_step = end_handle_steps[end_parent_step_indexes[0]]; // update the cover vector interval; @@ -748,7 +876,25 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, interval.push_back(trav[i]); cover_node_to_fragment[graph->get_id(graph->get_handle_of_step(trav[i]))] = cover_fragments.size(); } - cover_fragments.push_back(make_pair(fragment_rank, std::move(interval))); + RGFAFragment frag = { + fragment_rank, std::move(interval), + prev_frag_idx, next_frag_idx, + start_parent_step, end_parent_step + }; + +#ifdef debug +#pragma omp critical(cerr) +{ + cerr << "adding fragment path=" << graph->get_path_name(graph->get_path_handle_of_step(frag.steps.front())) << " rank=" << fragment_rank << " steps = "; + for (auto xx : frag.steps) { + cerr << graph->get_id(graph->get_handle_of_step(xx)) <<":" << graph->get_is_reverse(graph->get_handle_of_step(xx)) <<","; + } + cerr << " sps " << graph->get_id(graph->get_handle_of_step(start_parent_step)) <<":" << graph->get_is_reverse(graph->get_handle_of_step(start_parent_step)); + cerr << " eps " << graph->get_id(graph->get_handle_of_step(end_parent_step)) <<":" << graph->get_is_reverse(graph->get_handle_of_step(end_parent_step)); + cerr << endl; +} +#endif + cover_fragments.push_back(frag); } } diff --git a/src/gfa.hpp b/src/gfa.hpp index e66d91aeb72..c281fb58863 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -52,7 +52,16 @@ int get_rgfa_rank(const string& path_name); /// Add the rgfa rank to a pathname, also setting its sample to the special rgfa sample and /// moving its old sample into the locus field string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& subrange, - const string& rgfa_sample); + const string& rgfa_sample, + const string& start_parent_rgfa_path_name = "", + int64_t start_parent_offset = -1, + int64_t start_parent_node_id = -1, + bool start_parent_node_reversed = false, + const string& end_parent_rgfa_path_name = "", + int64_t end_parent_offset = 
-1, + int64_t end_parent_node_id = -1, + bool end_parent_node_reversed = false); + /// Remove the rGFA information from a path name, effectively undoing set_rgfa_rank string strip_rgfa_path_name(const string& path_name); @@ -70,12 +79,22 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, const string& rgfa_sample_name, const unordered_map>>& preferred_intervals = {}); +/// Some information about an RGFA fragment we pass around and index below +struct RGFAFragment { + int64_t rank; + vector steps; + int64_t start_parent_idx; + int64_t end_parent_idx; + step_handle_t start_parent_step; + step_handle_t end_parent_step; +}; + void rgfa_snarl_cover(const PathHandleGraph* graph, const Snarl& snarl, PathTraversalFinder& path_trav_finder, const unordered_set& reference_paths, int64_t minimum_length, - vector>>& cover_fragments, + vector& cover_fragments, unordered_map& cover_node_to_fragment, const unordered_map>>& preferred_intervals); From 739eff8eddea02462e99d219917eb1bfdac8006c Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 25 May 2023 12:05:20 -0400 Subject: [PATCH 17/21] fix some rgfa naming details --- src/gfa.cpp | 299 ++++++++++++++++++++++++++++++++++++++++++++++------ src/gfa.hpp | 28 +++-- 2 files changed, 289 insertions(+), 38 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 6ed4169dc62..232b0e8ec22 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -76,8 +76,17 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& out << graph->get_sequence(h); auto it = node_offsets_and_ranks.find(node_id); if (it != node_offsets_and_ranks.end()) { - // hack off the rgfa tag - string base_name = strip_rgfa_path_name(graph->get_path_name(get<0>(it->second))); + // hack off the rgfa stuff from the path, and move into tags + string path_name = graph->get_path_name(get<0>(it->second)); + string rgfa_tags; + string base_name; + if (get_rgfa_rank(path_name) >= 0) { + base_name = parse_rgfa_name_into_tags(graph->get_path_name(get<0>(it->second)), rgfa_tags); + } else { + // no tags, must be the rank-0 reference + base_name = path_name; + rgfa_tags = "SR:i:0"; + } // hack off the subrange offset (and add it to SO) PathSense sense; string sample, locus; @@ -90,7 +99,7 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& // add rGFA tags out << "\t" << "SN:Z:" << base_name << "\t" << "SO:i:" << (base_offset + get<1>(it->second)) - << "\t" << "SR:i:" << get<2>(it->second); + << "\t" << rgfa_tags; } out << "\n"; // Writing `std::endl` would flush the buffer. 
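For orientation, the S-line emitted by the block above now carries the stripped base path name in SN, the node's offset along that path in SO, and whatever nesting tags parse_rgfa_name_into_tags produced. A minimal stand-alone sketch of that layout follows; every id, sequence and offset in it is made-up illustration, not output of this patch:

#include <iostream>
#include <string>

// Illustrative only: mimic the tag layout written above for a rank-1 cover
// fragment whose start parent is the rank-0 reference path (hypothetical values).
int main() {
    std::string base_name = "GRCh38#0#chr1";                          // SN: stripped base path name
    long so = 10437;                                                  // SO: offset of the node on that path
    std::string rgfa_tags = "SR:i:1\tSPP:Z:GRCh38#0#chr1\tSPN:Z:>17"; // as built by parse_rgfa_name_into_tags
    std::cout << "S\t42\tACGT\tSN:Z:" << base_name
              << "\tSO:i:" << so << "\t" << rgfa_tags << "\n";
    return 0;
}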
return true; @@ -300,12 +309,10 @@ int get_rgfa_rank(const string& path_name) { string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& rgfa_subrange, const string& rgfa_sample, - const string& start_parent_rgfa_path_name, - int64_t start_parent_offset, + const string& start_parent_path_name, int64_t start_parent_node_id, bool start_parent_node_reversed, - const string& end_parent_rgfa_path_name, - int64_t end_parent_offset, + const string& end_parent_path_name, int64_t end_parent_node_id, bool end_parent_node_reversed) { PathSense path_sense; @@ -317,29 +324,76 @@ string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subra PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, path_haplotype, path_phase_block, path_subrange); - // we're going to replace the existing sample name; so move it out into locus behind SN:Z: + // we're going to store absolutely everything in the locus name, including the sample name (behind SN:Z:) assert(path_locus != PathMetadata::NO_LOCUS_NAME); string base_name; if (path_sample != PathMetadata::NO_SAMPLE_NAME) { - base_name = path_sample + ":"; + base_name = "SN:Z:" + path_sample; } - base_name += "SN:Z:" + path_locus; + // the contig name will be behind SC... + base_name += ":SC:Z:" + path_locus; - // and we also load in our rGFA tag + // and we also load in our rGFA rank base_name += ":SR:i:" + std::to_string(rgfa_rank); - // start parent information - if (!start_parent_rgfa_path_name.empty()) { - base_name += ":SPP:Z:" + start_parent_rgfa_path_name + - ":SPO:i:" + std::to_string(start_parent_offset) + - ":SPH:Z:" + (start_parent_node_reversed ? "<" : ">") + std::to_string(start_parent_node_id); + // start parent parent path is embedded as tags (so its own #'s don't muck up the structure) + // todo: this is way too verbose and should be fixed. + if (!start_parent_path_name.empty()) { + PathSense start_parent_path_sense; + string start_parent_path_sample; + string start_parent_path_locus; + size_t start_parent_path_haplotype; + size_t start_parent_path_phase_block; + subrange_t start_parent_path_subrange; + PathMetadata::parse_path_name(start_parent_path_name, start_parent_path_sense, start_parent_path_sample, + start_parent_path_locus, start_parent_path_haplotype, + start_parent_path_phase_block, start_parent_path_subrange); + if (start_parent_path_sample != PathMetadata::NO_SAMPLE_NAME) { + base_name += ":SPS:Z:" + start_parent_path_sample; + } + if (start_parent_path_locus != PathMetadata::NO_LOCUS_NAME) { + base_name += ":SPC:Z:" + start_parent_path_locus; + } + if (start_parent_path_haplotype != PathMetadata::NO_HAPLOTYPE) { + base_name += ":SPH:i:" + to_string(start_parent_path_haplotype); + } + if (start_parent_path_phase_block != PathMetadata::NO_PHASE_BLOCK) { + base_name += ":SPB:i:" + to_string(start_parent_path_phase_block); + } + if (start_parent_path_subrange != PathMetadata::NO_SUBRANGE) { + base_name += ":SPO:i:" + to_string(start_parent_path_subrange.first); + } + base_name += ":SPN:Z:" + string(start_parent_node_reversed ? 
"<" : ">") + to_string(start_parent_node_id); + } + + // same for the end parent + if (!end_parent_path_name.empty()) { + PathSense end_parent_path_sense; + string end_parent_path_sample; + string end_parent_path_locus; + size_t end_parent_path_haplotype; + size_t end_parent_path_phase_block; + subrange_t end_parent_path_subrange; + PathMetadata::parse_path_name(end_parent_path_name, end_parent_path_sense, end_parent_path_sample, + end_parent_path_locus, end_parent_path_haplotype, + end_parent_path_phase_block, end_parent_path_subrange); + if (end_parent_path_sample != PathMetadata::NO_SAMPLE_NAME) { + base_name += ":EPS:Z:" + end_parent_path_sample; + } + if (end_parent_path_locus != PathMetadata::NO_LOCUS_NAME) { + base_name += ":EPC:Z:" + end_parent_path_locus; + } + if (end_parent_path_haplotype != PathMetadata::NO_HAPLOTYPE) { + base_name += ":EPH:i:" + to_string(end_parent_path_haplotype); + } + if (end_parent_path_phase_block != PathMetadata::NO_PHASE_BLOCK) { + base_name += ":EPB:i:" + to_string(end_parent_path_phase_block); + } + if (end_parent_path_subrange != PathMetadata::NO_SUBRANGE) { + base_name += ":EPO:i:" + to_string(end_parent_path_subrange.first); + } + base_name += ":EPN:Z:" + string(end_parent_node_reversed ? "<" : ">") + to_string(end_parent_node_id); } - // end parent information - if (!end_parent_rgfa_path_name.empty()) { - base_name += ":EPP:Z:" + end_parent_rgfa_path_name + - ":EPO:i:" + std::to_string(end_parent_offset) + - ":EPH:Z:" + (end_parent_node_reversed ? "<" : ">") + std::to_string(end_parent_node_id); - } // and return the final path, with sample/locus/rgfa-rank embedded in locus // (as it's a reference path, we alsos strip out the phase block) @@ -347,6 +401,175 @@ string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subra PathMetadata::NO_PHASE_BLOCK, rgfa_subrange); } +string parse_rgfa_path_name(const string& path_name, int* rgfa_rank, + string* rgfa_sample, + string* start_parent_path_name, + int64_t* start_parent_node_id, + bool* start_parent_node_reversed, + string* end_parent_path_name, + int64_t* end_parent_node_id, + bool* end_parent_node_reversed) { + + + // begin by parsing the # structure of the path + // note that all the rgfa stuff we're pulling out here will be embedded in the "locus" + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + if (rgfa_sample) { + *rgfa_sample = path_sample; + } + + string sample_name; + string contig_name; + string start_parent_sample = PathMetadata::NO_SAMPLE_NAME; + string start_parent_contig = PathMetadata::NO_LOCUS_NAME; + int64_t start_parent_haplotype = PathMetadata::NO_HAPLOTYPE; + int64_t start_parent_phase_block = PathMetadata::NO_PHASE_BLOCK; + subrange_t start_parent_subrange = PathMetadata::NO_SUBRANGE; + string end_parent_sample = PathMetadata::NO_SAMPLE_NAME; + string end_parent_contig = PathMetadata::NO_LOCUS_NAME; + int64_t end_parent_haplotype = PathMetadata::NO_HAPLOTYPE; + int64_t end_parent_phase_block = PathMetadata::NO_PHASE_BLOCK; + subrange_t end_parent_subrange = PathMetadata::NO_SUBRANGE; + + // now we parse the locus which is just a list of tags sepearted by : + // todo (again), this is too wordy and should be compacted + vector toks = split_delims(path_locus, ":"); + assert(toks.size() % 3 == 0); + for (int64_t i = 0; i < toks.size(); 
i+=3) { + if (toks[i] == "SN") { + assert(toks[i+1] == "Z"); + sample_name = toks[i+2]; + } else if (toks[i] == "SC") { + assert(toks[i+1] == "Z"); + contig_name = toks[i+2]; + } else if (toks[i] == "SR" && rgfa_rank) { + assert(toks[i+1] == "i"); + *rgfa_rank = parse(toks[i+2]); + } else if (toks[i] == "SPS") { + assert(toks[i+1] == "Z"); + start_parent_sample = toks[i+2]; + } else if (toks[i] == "SPC") { + assert(toks[i+1] == "Z"); + start_parent_contig = toks[i+2]; + } else if (toks[i] == "SPH") { + assert(toks[i+1] == "i"); + start_parent_haplotype = parse(toks[i+2]); + } else if (toks[i] == "SPB") { + assert(toks[i+1] == "i"); + start_parent_phase_block = parse(toks[i+2]); + } else if (toks[i] == "SPO") { + assert(toks[i+1] == "i"); + start_parent_subrange.first = parse(toks[i+2]); + } else if (toks[i] == "SPN") { + assert(toks[i+1] == "Z"); + if (start_parent_node_id) { + *start_parent_node_id = parse(toks[i+2].substr(1)); + } + if (start_parent_node_reversed) { + *start_parent_node_reversed = toks[i+2][0] == '<'; + } + } else if (toks[i] == "EPS") { + assert(toks[i+1] == "Z"); + end_parent_sample = toks[i+2]; + } else if (toks[i] == "EPC") { + assert(toks[i+1] == "Z"); + end_parent_contig = toks[i+2]; + } else if (toks[i] == "EPH") { + assert(toks[i+1] == "i"); + end_parent_haplotype = parse(toks[i+2]); + } else if (toks[i] == "EPB") { + assert(toks[i+1] == "i"); + end_parent_phase_block = parse(toks[i+2]); + } else if (toks[i] == "EPO") { + assert(toks[i+1] == "i"); + end_parent_subrange.first = parse(toks[i+2]); + } else if (toks[i] == "EPN") { + assert(toks[i+1] == "Z"); + if (end_parent_node_id) { + *end_parent_node_id = parse(toks[i+2].substr(1)); + } + if (end_parent_node_reversed) { + *end_parent_node_reversed = toks[i+2][0] == '<'; + } + } else { + assert(false); + } + } + + // reconstruct the vg path + string orig_path_name = PathMetadata::create_path_name(path_sense, + sample_name, + contig_name, + path_haplotype, + path_phase_block, + path_subrange); + + if (start_parent_path_name) { + // reconstruct the start path name + PathSense start_parent_path_sense = start_parent_haplotype == PathMetadata::NO_HAPLOTYPE ? PathSense::REFERENCE : PathSense::HAPLOTYPE; + *start_parent_path_name = PathMetadata::create_path_name(start_parent_path_sense, + start_parent_sample, + start_parent_contig, + start_parent_haplotype, + start_parent_phase_block, + start_parent_subrange); + } + + if (end_parent_path_name) { + // reconstruc the end path name + PathSense end_parent_path_sense = end_parent_haplotype == PathMetadata::NO_HAPLOTYPE ? 
PathSense::REFERENCE : PathSense::HAPLOTYPE; + *end_parent_path_name = PathMetadata::create_path_name(end_parent_path_sense, + end_parent_sample, + end_parent_contig, + end_parent_haplotype, + end_parent_phase_block, + end_parent_subrange); + } + + return orig_path_name; +} + +string parse_rgfa_name_into_tags(const string& path_name, string& rgfa_tags) { + int rgfa_rank; + string rgfa_sample; + string start_parent_rgfa_path_name; + int64_t start_parent_node_id; + bool start_parent_node_reversed; + string end_parent_rgfa_path_name; + int64_t end_parent_node_id; + bool end_parent_node_reversed; + string vg_path_name = parse_rgfa_path_name(path_name, &rgfa_rank, + &rgfa_sample, + &start_parent_rgfa_path_name, + &start_parent_node_id, + &start_parent_node_reversed, + &end_parent_rgfa_path_name, + &end_parent_node_id, + &end_parent_node_reversed); + + rgfa_tags += "SR:i:" + to_string(rgfa_rank); + if (!start_parent_rgfa_path_name.empty()) { + rgfa_tags += "\tSPP:Z:" + start_parent_rgfa_path_name + + "\tSPN:Z:" + (start_parent_node_reversed ? "<" : ">") + to_string(start_parent_node_id); + } + if (!end_parent_rgfa_path_name.empty()) { + rgfa_tags += "\tEPP:Z:" + end_parent_rgfa_path_name + + "\tEPN:Z:" + (end_parent_node_reversed ? "<" : ">") + to_string(end_parent_node_id); + } + + return vg_path_name; +} + + + string strip_rgfa_path_name(const string& path_name) { PathSense path_sense; @@ -382,6 +605,24 @@ string strip_rgfa_path_name(const string& path_name) { return path_name; } +void increment_subrange_start(string& path_name, int64_t offset) { + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + if (path_subrange == PathMetadata::NO_SUBRANGE) { + path_subrange.first = offset; + } else { + path_subrange.first += offset; + } + path_name = PathMetadata::create_path_name(path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); +} void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, SnarlManager* snarl_manager, @@ -549,10 +790,7 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, << ":" << graph->get_is_reverse(graph->get_handle_of_step(rgfa_fragment.steps.front())) << endl; */ start_parent_offset = step_to_pos.at(rgfa_fragment.start_parent_step); - subrange_t start_parent_subrange = graph->get_subrange(start_parent_path_handle); - if (start_parent_subrange != PathMetadata::NO_SUBRANGE) { - start_parent_offset += start_parent_subrange.first; - } + increment_subrange_start(start_parent_rgfa_path_name, start_parent_offset); start_parent_node_id = graph->get_id(graph->get_handle_of_step(start_parent_fragment.start_parent_step)); start_parent_node_reversed = graph->get_is_reverse(graph->get_handle_of_step(start_parent_fragment.start_parent_step)); } @@ -569,18 +807,15 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, end_parent_rgfa_path_name = graph->get_path_name(end_parent_path_handle); assert(graph->get_path_handle_of_step(rgfa_fragment.end_parent_step) == end_parent_path_handle); end_parent_offset = step_to_pos.at(rgfa_fragment.end_parent_step); - subrange_t end_parent_subrange = graph->get_subrange(end_parent_path_handle); - if (end_parent_subrange != PathMetadata::NO_SUBRANGE) { - end_parent_offset += end_parent_subrange.first; - } + increment_subrange_start(end_parent_rgfa_path_name, end_parent_offset); 
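Aside: the subrange arithmetic that increment_subrange_start performs on the parsed path metadata reduces to the following stand-alone sketch; subrange_t and NO_SUBRANGE here are stand-ins for the vg/PathMetadata types, not the real ones.

#include <cassert>
#include <cstdint>
#include <utility>

// Stand-ins for the vg types, for illustration only.
using subrange_t = std::pair<int64_t, int64_t>;
static const subrange_t NO_SUBRANGE{-1, -1};

// Shift the start of an existing subrange by `offset`, or start one at `offset`
// if the path had none -- the same rule as increment_subrange_start above.
subrange_t bump_start(subrange_t sr, int64_t offset) {
    if (sr == NO_SUBRANGE) {
        sr.first = offset;
    } else {
        sr.first += offset;
    }
    return sr;
}

int main() {
    assert(bump_start(NO_SUBRANGE, 100).first == 100);
    assert(bump_start({50, 200}, 25) == (subrange_t{75, 200}));
    return 0;
}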
end_parent_node_id = graph->get_id(graph->get_handle_of_step(end_parent_fragment.end_parent_step)); end_parent_node_reversed = graph->get_is_reverse(graph->get_handle_of_step(end_parent_fragment.end_parent_step)); } string rgfa_frag_name = create_rgfa_path_name(path_name, rgfa_fragment.rank, rgfa_frag_subrange, rgfa_sample_name, - start_parent_rgfa_path_name, start_parent_offset, + start_parent_rgfa_path_name, start_parent_node_id, start_parent_node_reversed, - end_parent_rgfa_path_name, end_parent_offset, + end_parent_rgfa_path_name, end_parent_node_id, end_parent_node_reversed); #ifdef debug diff --git a/src/gfa.hpp b/src/gfa.hpp index c281fb58863..313b9e7c72d 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -49,22 +49,38 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, /// If the path is not an RGFA path, then return -1 int get_rgfa_rank(const string& path_name); +/// Remove the rGFA information from a path name, effectively undoing set_rgfa_rank +string strip_rgfa_path_name(const string& path_name); + +/// Edit the subrange of a path name +void increment_subrange_start(string& path_name, int64_t offset); + /// Add the rgfa rank to a pathname, also setting its sample to the special rgfa sample and /// moving its old sample into the locus field string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& subrange, const string& rgfa_sample, - const string& start_parent_rgfa_path_name = "", - int64_t start_parent_offset = -1, + const string& start_parent_path_name = "", int64_t start_parent_node_id = -1, bool start_parent_node_reversed = false, - const string& end_parent_rgfa_path_name = "", - int64_t end_parent_offset = -1, + const string& end_parent_path_name = "", int64_t end_parent_node_id = -1, bool end_parent_node_reversed = false); -/// Remove the rGFA information from a path name, effectively undoing set_rgfa_rank -string strip_rgfa_path_name(const string& path_name); +/// Parse (and remove) all rgfa-specific information, copying it into the various input paramters +/// The rgfa-information-free vg path name is returned (ie the path_name input to carete_rgfa_path_name) +string parse_rgfa_path_name(const string& path_name, int* rgfa_rank = nullptr, + string* rgfa_sample = nullptr, + string* start_parent_rgfa_path_name = nullptr, + int64_t* start_parent_node_id = nullptr, + bool* start_parent_node_reversed = nullptr, + string* end_parent_rgfa_path_name = nullptr, + int64_t* end_parent_node_id = nullptr, + bool* end_parent_node_reversed = nullptr); + +/// Like above, but put the rgfa stuff into tags +string parse_rgfa_name_into_tags(const string& path_name, string& rgfa_tags); + /// Compute the rGFA path cover /// graph: the graph From 7626f4e121cfb5f1203c5592f666075996cd50d1 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 26 May 2023 15:01:48 -0400 Subject: [PATCH 18/21] yet another parent path layout --- src/gfa.cpp | 261 +++++++++++++++++++++++----------------------------- src/gfa.hpp | 9 +- 2 files changed, 121 insertions(+), 149 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 232b0e8ec22..5182aba20af 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -44,7 +44,7 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& out << "\n"; //Compute the rGFA tags of given paths - unordered_map> node_offsets_and_ranks; + unordered_map> node_offsets; for (const string& path_name : rgfa_paths) { path_handle_t path_handle = graph->get_path_handle(path_name); size_t offset = 0; @@ -57,12 +57,11 @@ void graph_to_gfa(const 
PathHandleGraph* graph, ostream& out, const set& << node_id << " is traversed on its reverse strand. rGFA only supports the forward strand." << endl; throw runtime_error(ss.str()); } - if (node_offsets_and_ranks.count(node_id)) { + if (node_offsets.count(node_id)) { cerr << "warning [gfa]: multiple selected rgfa paths found on node " << node_id << ": keeping tags for " - << graph->get_path_name(get<0>(node_offsets_and_ranks[node_id])) << " and ignoring those for " << path_name << endl; + << graph->get_path_name(node_offsets[node_id].first) << " and ignoring those for " << path_name << endl; } else { - int64_t rgfa_rank = get_rgfa_rank(path_name); - node_offsets_and_ranks[node_id] = make_tuple(path_handle, offset, rgfa_rank <= 0 ? 0 : rgfa_rank); + node_offsets[node_id] = make_pair(path_handle, offset); } offset += graph->get_length(handle); }); @@ -74,14 +73,14 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& nid_t node_id = graph->get_id(h); out << node_id << "\t"; out << graph->get_sequence(h); - auto it = node_offsets_and_ranks.find(node_id); - if (it != node_offsets_and_ranks.end()) { + auto it = node_offsets.find(node_id); + if (it != node_offsets.end()) { // hack off the rgfa stuff from the path, and move into tags string path_name = graph->get_path_name(get<0>(it->second)); string rgfa_tags; string base_name; if (get_rgfa_rank(path_name) >= 0) { - base_name = parse_rgfa_name_into_tags(graph->get_path_name(get<0>(it->second)), rgfa_tags); + base_name = parse_rgfa_name_into_tags(graph->get_path_name(it->second.first), rgfa_tags); } else { // no tags, must be the rank-0 reference base_name = path_name; @@ -98,8 +97,13 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& PathMetadata::NO_SUBRANGE); // add rGFA tags out << "\t" << "SN:Z:" << base_name - << "\t" << "SO:i:" << (base_offset + get<1>(it->second)) - << "\t" << rgfa_tags; + << "\t" << "SO:i:" << (base_offset + it->second.second); + if (subrange != PathMetadata::NO_SUBRANGE) { + // todo : assert this doesn't happen (at least off-reference) + assert(subrange.second > subrange.first); + out << "\t" << "SL:i" << to_string(subrange.second - subrange.first); + } + out << "\t" << rgfa_tags; } out << "\n"; // Writing `std::endl` would flush the buffer. return true; @@ -310,9 +314,11 @@ int get_rgfa_rank(const string& path_name) { string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& rgfa_subrange, const string& rgfa_sample, const string& start_parent_path_name, + int64_t start_parent_offset, int64_t start_parent_node_id, bool start_parent_node_reversed, const string& end_parent_path_name, + int64_t end_parent_offset, int64_t end_parent_node_id, bool end_parent_node_reversed) { PathSense path_sense; @@ -336,64 +342,32 @@ string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subra // and we also load in our rGFA rank base_name += ":SR:i:" + std::to_string(rgfa_rank); - // start parent parent path is embedded as tags (so its own #'s don't muck up the structure) - // todo: this is way too verbose and should be fixed. 
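The hunks below drop the verbose per-field parent tags and instead embed the parent path name directly, made safe by a reversible character swap ('#' to '%', '[' to '(', ']' to ')'). A stand-alone sketch of that round trip; the function names are illustrative, not from the patch:

#include <algorithm>
#include <cassert>
#include <string>

// Encode a parent path name so its '#' and '[]' don't break the tag structure.
std::string encode_parent(std::string s) {
    std::replace(s.begin(), s.end(), '#', '%');
    std::replace(s.begin(), s.end(), '[', '(');
    std::replace(s.begin(), s.end(), ']', ')');
    return s;
}

// Invert the swap when parsing the name back out.
std::string decode_parent(std::string s) {
    std::replace(s.begin(), s.end(), '%', '#');
    std::replace(s.begin(), s.end(), '(', '[');
    std::replace(s.begin(), s.end(), ')', ']');
    return s;
}

int main() {
    std::string name = "GRCh38#0#chr1[1000-2000]";
    assert(decode_parent(encode_parent(name)) == name);
    // Caveat: the swap is only reversible if the original name never uses '%', '(' or ')'.
    return 0;
}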
+ // start parent parent path (swap # for % as hack to not much up parent paths formatting) + // turns out we can't embed []'s either, so swap those too if (!start_parent_path_name.empty()) { - PathSense start_parent_path_sense; - string start_parent_path_sample; - string start_parent_path_locus; - size_t start_parent_path_haplotype; - size_t start_parent_path_phase_block; - subrange_t start_parent_path_subrange; - PathMetadata::parse_path_name(start_parent_path_name, start_parent_path_sense, start_parent_path_sample, - start_parent_path_locus, start_parent_path_haplotype, - start_parent_path_phase_block, start_parent_path_subrange); - if (start_parent_path_sample != PathMetadata::NO_SAMPLE_NAME) { - base_name += ":SPS:Z:" + start_parent_path_sample; - } - if (start_parent_path_locus != PathMetadata::NO_LOCUS_NAME) { - base_name += ":SPC:Z:" + start_parent_path_locus; - } - if (start_parent_path_haplotype != PathMetadata::NO_HAPLOTYPE) { - base_name += ":SPH:i:" + to_string(start_parent_path_haplotype); - } - if (start_parent_path_phase_block != PathMetadata::NO_PHASE_BLOCK) { - base_name += ":SPB:i:" + to_string(start_parent_path_phase_block); - } - if (start_parent_path_subrange != PathMetadata::NO_SUBRANGE) { - base_name += ":SPO:i:" + to_string(start_parent_path_subrange.first); + string spn = start_parent_path_name; + std::replace(spn.begin(), spn.end(), '#', '%'); + std::replace(spn.begin(), spn.end(), '[', '('); + std::replace(spn.begin(), spn.end(), ']', ')'); + base_name += ":SPP:Z:" + spn; + if (start_parent_offset >= 0) { + base_name += ":SPO:i:" + to_string(start_parent_offset); } base_name += ":SPN:Z:" + string(start_parent_node_reversed ? "<" : ">") + to_string(start_parent_node_id); } - - // same for the end parent + // end parent parent path (swap # for % as hack to not much up parent paths formatting) + // turns out we can't embed []'s either, so swap those too if (!end_parent_path_name.empty()) { - PathSense end_parent_path_sense; - string end_parent_path_sample; - string end_parent_path_locus; - size_t end_parent_path_haplotype; - size_t end_parent_path_phase_block; - subrange_t end_parent_path_subrange; - PathMetadata::parse_path_name(end_parent_path_name, end_parent_path_sense, end_parent_path_sample, - end_parent_path_locus, end_parent_path_haplotype, - end_parent_path_phase_block, end_parent_path_subrange); - if (end_parent_path_sample != PathMetadata::NO_SAMPLE_NAME) { - base_name += ":EPS:Z:" + end_parent_path_sample; - } - if (end_parent_path_locus != PathMetadata::NO_LOCUS_NAME) { - base_name += ":EPC:Z:" + end_parent_path_locus; - } - if (end_parent_path_haplotype != PathMetadata::NO_HAPLOTYPE) { - base_name += ":EPH:i:" + to_string(end_parent_path_haplotype); + string spn = end_parent_path_name; + std::replace(spn.begin(), spn.end(), '#', '%'); + std::replace(spn.begin(), spn.end(), '[', '('); + std::replace(spn.begin(), spn.end(), ']', ')'); + base_name += ":EPP:Z:" + spn; + if (end_parent_offset >= 0) { + base_name += ":EPO:i:" + to_string(end_parent_offset); } - if (end_parent_path_phase_block != PathMetadata::NO_PHASE_BLOCK) { - base_name += ":EPB:i:" + to_string(end_parent_path_phase_block); - } - if (end_parent_path_subrange != PathMetadata::NO_SUBRANGE) { - base_name += ":EPO:i:" + to_string(end_parent_path_subrange.first); - } - base_name += ":EPN:Z:" + string(end_parent_node_reversed ? "<" : ">") + to_string(end_parent_node_id); - } + base_name += ":EPN:Z:" + string(end_parent_node_reversed ? 
"<" : ">") + to_string(end_parent_node_id); + } // and return the final path, with sample/locus/rgfa-rank embedded in locus // (as it's a reference path, we alsos strip out the phase block) @@ -404,9 +378,11 @@ string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subra string parse_rgfa_path_name(const string& path_name, int* rgfa_rank, string* rgfa_sample, string* start_parent_path_name, + int64_t* start_parent_offset, int64_t* start_parent_node_id, bool* start_parent_node_reversed, string* end_parent_path_name, + int64_t* end_parent_offset, int64_t* end_parent_node_id, bool* end_parent_node_reversed) { @@ -428,16 +404,6 @@ string parse_rgfa_path_name(const string& path_name, int* rgfa_rank, string sample_name; string contig_name; - string start_parent_sample = PathMetadata::NO_SAMPLE_NAME; - string start_parent_contig = PathMetadata::NO_LOCUS_NAME; - int64_t start_parent_haplotype = PathMetadata::NO_HAPLOTYPE; - int64_t start_parent_phase_block = PathMetadata::NO_PHASE_BLOCK; - subrange_t start_parent_subrange = PathMetadata::NO_SUBRANGE; - string end_parent_sample = PathMetadata::NO_SAMPLE_NAME; - string end_parent_contig = PathMetadata::NO_LOCUS_NAME; - int64_t end_parent_haplotype = PathMetadata::NO_HAPLOTYPE; - int64_t end_parent_phase_block = PathMetadata::NO_PHASE_BLOCK; - subrange_t end_parent_subrange = PathMetadata::NO_SUBRANGE; // now we parse the locus which is just a list of tags sepearted by : // todo (again), this is too wordy and should be compacted @@ -450,24 +416,24 @@ string parse_rgfa_path_name(const string& path_name, int* rgfa_rank, } else if (toks[i] == "SC") { assert(toks[i+1] == "Z"); contig_name = toks[i+2]; - } else if (toks[i] == "SR" && rgfa_rank) { + } else if (toks[i] == "SR") { assert(toks[i+1] == "i"); - *rgfa_rank = parse(toks[i+2]); - } else if (toks[i] == "SPS") { - assert(toks[i+1] == "Z"); - start_parent_sample = toks[i+2]; - } else if (toks[i] == "SPC") { + if (rgfa_rank) { + *rgfa_rank = parse(toks[i+2]); + } + } else if (toks[i] == "SPP") { assert(toks[i+1] == "Z"); - start_parent_contig = toks[i+2]; - } else if (toks[i] == "SPH") { - assert(toks[i+1] == "i"); - start_parent_haplotype = parse(toks[i+2]); - } else if (toks[i] == "SPB") { - assert(toks[i+1] == "i"); - start_parent_phase_block = parse(toks[i+2]); + if (start_parent_path_name) { + *start_parent_path_name = toks[i+2]; + std::replace(start_parent_path_name->begin(), start_parent_path_name->end(), '%', '#'); + std::replace(start_parent_path_name->begin(), start_parent_path_name->end(), '(', '['); + std::replace(start_parent_path_name->begin(), start_parent_path_name->end(), ')', ']'); + } } else if (toks[i] == "SPO") { assert(toks[i+1] == "i"); - start_parent_subrange.first = parse(toks[i+2]); + if (start_parent_offset) { + *start_parent_offset = parse(toks[i+2]); + } } else if (toks[i] == "SPN") { assert(toks[i+1] == "Z"); if (start_parent_node_id) { @@ -476,21 +442,19 @@ string parse_rgfa_path_name(const string& path_name, int* rgfa_rank, if (start_parent_node_reversed) { *start_parent_node_reversed = toks[i+2][0] == '<'; } - } else if (toks[i] == "EPS") { + } else if (toks[i] == "EPP") { assert(toks[i+1] == "Z"); - end_parent_sample = toks[i+2]; - } else if (toks[i] == "EPC") { - assert(toks[i+1] == "Z"); - end_parent_contig = toks[i+2]; - } else if (toks[i] == "EPH") { - assert(toks[i+1] == "i"); - end_parent_haplotype = parse(toks[i+2]); - } else if (toks[i] == "EPB") { - assert(toks[i+1] == "i"); - end_parent_phase_block = parse(toks[i+2]); + if 
(end_parent_path_name) { + *end_parent_path_name = toks[i+2]; + std::replace(end_parent_path_name->begin(), end_parent_path_name->end(), '%', '#'); + std::replace(end_parent_path_name->begin(), end_parent_path_name->end(), '(', '['); + std::replace(end_parent_path_name->begin(), end_parent_path_name->end(), ')', ']'); + } } else if (toks[i] == "EPO") { assert(toks[i+1] == "i"); - end_parent_subrange.first = parse(toks[i+2]); + if (end_parent_offset) { + *end_parent_offset = parse(toks[i+2]); + } } else if (toks[i] == "EPN") { assert(toks[i+1] == "Z"); if (end_parent_node_id) { @@ -498,70 +462,59 @@ string parse_rgfa_path_name(const string& path_name, int* rgfa_rank, } if (end_parent_node_reversed) { *end_parent_node_reversed = toks[i+2][0] == '<'; - } + } } else { assert(false); } } - - // reconstruct the vg path - string orig_path_name = PathMetadata::create_path_name(path_sense, - sample_name, - contig_name, - path_haplotype, - path_phase_block, - path_subrange); - - if (start_parent_path_name) { - // reconstruct the start path name - PathSense start_parent_path_sense = start_parent_haplotype == PathMetadata::NO_HAPLOTYPE ? PathSense::REFERENCE : PathSense::HAPLOTYPE; - *start_parent_path_name = PathMetadata::create_path_name(start_parent_path_sense, - start_parent_sample, - start_parent_contig, - start_parent_haplotype, - start_parent_phase_block, - start_parent_subrange); - } - if (end_parent_path_name) { - // reconstruc the end path name - PathSense end_parent_path_sense = end_parent_haplotype == PathMetadata::NO_HAPLOTYPE ? PathSense::REFERENCE : PathSense::HAPLOTYPE; - *end_parent_path_name = PathMetadata::create_path_name(end_parent_path_sense, - end_parent_sample, - end_parent_contig, - end_parent_haplotype, - end_parent_phase_block, - end_parent_subrange); + if (path_sense == PathSense::REFERENCE && sample_name.empty()) { + // the embedded path never actually had a sample, so we need to flip back to generic + path_sense = PathSense::GENERIC; + path_haplotype = PathMetadata::NO_HAPLOTYPE; + path_phase_block = PathMetadata::NO_PHASE_BLOCK; } + + // reconstruct the vg path with rgfa stuf stripped from locus + return PathMetadata::create_path_name(path_sense, + sample_name, + contig_name, + path_haplotype, + path_phase_block, + path_subrange); - return orig_path_name; } string parse_rgfa_name_into_tags(const string& path_name, string& rgfa_tags) { int rgfa_rank; string rgfa_sample; string start_parent_rgfa_path_name; + int64_t start_parent_offset; int64_t start_parent_node_id; bool start_parent_node_reversed; string end_parent_rgfa_path_name; + int64_t end_parent_offset; int64_t end_parent_node_id; bool end_parent_node_reversed; string vg_path_name = parse_rgfa_path_name(path_name, &rgfa_rank, &rgfa_sample, &start_parent_rgfa_path_name, + &start_parent_offset, &start_parent_node_id, &start_parent_node_reversed, &end_parent_rgfa_path_name, + &end_parent_offset, &end_parent_node_id, - &end_parent_node_reversed); - + &end_parent_node_reversed); rgfa_tags += "SR:i:" + to_string(rgfa_rank); if (!start_parent_rgfa_path_name.empty()) { rgfa_tags += "\tSPP:Z:" + start_parent_rgfa_path_name + + "\tSPO:i:" + to_string(start_parent_offset) + "\tSPN:Z:" + (start_parent_node_reversed ? "<" : ">") + to_string(start_parent_node_id); } if (!end_parent_rgfa_path_name.empty()) { rgfa_tags += "\tEPP:Z:" + end_parent_rgfa_path_name + + "\tEPO:i:" + to_string(end_parent_offset) + "\tEPN:Z:" + (end_parent_node_reversed ? 
"<" : ">") + to_string(end_parent_node_id); } @@ -605,7 +558,7 @@ string strip_rgfa_path_name(const string& path_name) { return path_name; } -void increment_subrange_start(string& path_name, int64_t offset) { +void clamp_path_subrange(string& path_name, int64_t start, int64_t end) { PathSense path_sense; string path_sample; string path_locus; @@ -616,9 +569,12 @@ void increment_subrange_start(string& path_name, int64_t offset) { path_haplotype, path_phase_block, path_subrange); if (path_subrange == PathMetadata::NO_SUBRANGE) { - path_subrange.first = offset; + path_subrange.first = start; + path_subrange.second = end; } else { - path_subrange.first += offset; + assert(end <= path_subrange.second - path_subrange.first && end > start); + path_subrange.first += start; + path_subrange.second = path_subrange.first + (end - start); } path_name = PathMetadata::create_path_name(path_sense, path_sample, path_locus, path_haplotype, path_phase_block, path_subrange); @@ -728,7 +684,6 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, const vector& rgfa_fragment = rgfa_fragments.at(frag_idx).steps; for (const step_handle_t& step: rgfa_fragment) { step_to_pos[step] = -1; - cerr << "setting " << graph->get_id(graph->get_handle_of_step(step)) << " to -1" << endl; assert(graph->get_path_handle_of_step(step) == path_handle); ++set_count; } @@ -737,7 +692,6 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, graph->for_each_step_in_path(path_handle, [&](const step_handle_t& step_handle) { if (step_to_pos.count(step_handle)) { step_to_pos[step_handle] = pos; - cerr << "resetting " << graph->get_id(graph->get_handle_of_step(step_handle)) << " to " << pos << endl; --set_count; } handle_t handle = graph->get_handle_of_step(step_handle); @@ -782,15 +736,15 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, const RGFAFragment& start_parent_fragment = rgfa_fragments.at(rgfa_fragment.start_parent_idx); path_handle_t start_parent_path_handle = graph->get_path_handle_of_step(start_parent_fragment.steps.front()); start_parent_rgfa_path_name = graph->get_path_name(start_parent_path_handle); - /* - cerr << "assert is gonna check " << graph->get_path_name(graph->get_path_handle_of_step(rgfa_fragment.start_parent_step)) << " vs " << graph->get_path_name(start_parent_path_handle) << endl; - assert(graph->get_path_handle_of_step(rgfa_fragment.start_parent_step) == start_parent_path_handle); - cerr << "checking step " << start_parent_rgfa_path_name << graph->get_id(graph->get_handle_of_step(rgfa_fragment.start_parent_step)) << ":" << graph->get_is_reverse(graph->get_handle_of_step(rgfa_fragment.start_parent_step)) << endl; - cerr << "via " << path_name << " " << graph->get_id(graph->get_handle_of_step(rgfa_fragment.steps.front())) - << ":" << graph->get_is_reverse(graph->get_handle_of_step(rgfa_fragment.steps.front())) << endl; - */ start_parent_offset = step_to_pos.at(rgfa_fragment.start_parent_step); - increment_subrange_start(start_parent_rgfa_path_name, start_parent_offset); + subrange_t start_parent_subrange = PathMetadata::parse_subrange(start_parent_rgfa_path_name); + if (start_parent_subrange != PathMetadata::NO_SUBRANGE) { + // now we subset the path to just the fragment interval (which I think it more useful) + int64_t start_parent_range_start = step_to_pos.at(start_parent_fragment.steps.front()); + int64_t start_parent_range_end = start_parent_range_start + start_parent_fragment.length; + clamp_path_subrange(start_parent_rgfa_path_name, start_parent_range_start, 
start_parent_range_end); + start_parent_offset -= start_parent_range_start; + } start_parent_node_id = graph->get_id(graph->get_handle_of_step(start_parent_fragment.start_parent_step)); start_parent_node_reversed = graph->get_is_reverse(graph->get_handle_of_step(start_parent_fragment.start_parent_step)); } @@ -807,15 +761,24 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, end_parent_rgfa_path_name = graph->get_path_name(end_parent_path_handle); assert(graph->get_path_handle_of_step(rgfa_fragment.end_parent_step) == end_parent_path_handle); end_parent_offset = step_to_pos.at(rgfa_fragment.end_parent_step); - increment_subrange_start(end_parent_rgfa_path_name, end_parent_offset); + subrange_t end_parent_subrange = PathMetadata::parse_subrange(end_parent_rgfa_path_name); + if (end_parent_subrange != PathMetadata::NO_SUBRANGE) { + // now we subset the path to just the fragment interval (which I think it more useful) + int64_t end_parent_range_start = step_to_pos.at(end_parent_fragment.steps.front()); + int64_t end_parent_range_end = end_parent_range_start + end_parent_fragment.length; + clamp_path_subrange(end_parent_rgfa_path_name, end_parent_range_start, end_parent_range_end); + end_parent_offset -= end_parent_range_start; + } end_parent_node_id = graph->get_id(graph->get_handle_of_step(end_parent_fragment.end_parent_step)); end_parent_node_reversed = graph->get_is_reverse(graph->get_handle_of_step(end_parent_fragment.end_parent_step)); } string rgfa_frag_name = create_rgfa_path_name(path_name, rgfa_fragment.rank, rgfa_frag_subrange, rgfa_sample_name, start_parent_rgfa_path_name, + start_parent_offset, start_parent_node_id, start_parent_node_reversed, end_parent_rgfa_path_name, + end_parent_offset, end_parent_node_id, end_parent_node_reversed); #ifdef debug @@ -917,15 +880,17 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, // since we leave rank-0 paths the way they are, but it's handy to have // a consistent representation for the cover while linking up the // various nesting metadata. + int64_t ref_trav_length = 0; vector& ref_trav = travs.at(ref_paths.begin()->second); for (step_handle_t ref_step_handle : ref_trav) { nid_t node_id = graph->get_id(graph->get_handle_of_step(ref_step_handle)); cover_node_to_fragment[node_id] = cover_fragments.size(); + ref_trav_length += graph->get_length(graph->get_handle_of_step(ref_step_handle)); } // todo: check endpoints (and remove from ref_trav??) 
RGFAFragment frag = { - 0, ref_trav, -1, -1, ref_trav.front(), ref_trav.back() + 0, ref_trav, ref_trav_length, -1, -1, ref_trav.front(), ref_trav.back() }; cover_fragments.push_back(frag); } @@ -1106,13 +1071,15 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, // update the cover vector interval; + int64_t interval_length = 0; interval.reserve(uncovered_interval.second - uncovered_interval.first); for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { interval.push_back(trav[i]); + interval_length += graph->get_length(graph->get_handle_of_step(trav[i])); cover_node_to_fragment[graph->get_id(graph->get_handle_of_step(trav[i]))] = cover_fragments.size(); - } + } RGFAFragment frag = { - fragment_rank, std::move(interval), + fragment_rank, std::move(interval), interval_length, prev_frag_idx, next_frag_idx, start_parent_step, end_parent_step }; diff --git a/src/gfa.hpp b/src/gfa.hpp index 313b9e7c72d..01cfb41e379 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -52,17 +52,19 @@ int get_rgfa_rank(const string& path_name); /// Remove the rGFA information from a path name, effectively undoing set_rgfa_rank string strip_rgfa_path_name(const string& path_name); -/// Edit the subrange of a path name -void increment_subrange_start(string& path_name, int64_t offset); +/// Clamp a path to given subrange (taking into account an existing subrange) +void clamp_path_subrange(string& path_name, int64_t start, int64_t end); /// Add the rgfa rank to a pathname, also setting its sample to the special rgfa sample and /// moving its old sample into the locus field string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& subrange, const string& rgfa_sample, const string& start_parent_path_name = "", + int64_t start_parent_offset = -1, int64_t start_parent_node_id = -1, bool start_parent_node_reversed = false, const string& end_parent_path_name = "", + int64_t end_parent_offset = -1, int64_t end_parent_node_id = -1, bool end_parent_node_reversed = false); @@ -72,9 +74,11 @@ string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subra string parse_rgfa_path_name(const string& path_name, int* rgfa_rank = nullptr, string* rgfa_sample = nullptr, string* start_parent_rgfa_path_name = nullptr, + int64_t* start_parent_offset = nullptr, int64_t* start_parent_node_id = nullptr, bool* start_parent_node_reversed = nullptr, string* end_parent_rgfa_path_name = nullptr, + int64_t* end_parent_offset = nullptr, int64_t* end_parent_node_id = nullptr, bool* end_parent_node_reversed = nullptr); @@ -99,6 +103,7 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, struct RGFAFragment { int64_t rank; vector steps; + int64_t length; int64_t start_parent_idx; int64_t end_parent_idx; step_handle_t start_parent_step; From ebf4fe7615ba3cf57319a5fa863a8b28769297f1 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 29 May 2023 12:30:02 -0400 Subject: [PATCH 19/21] fix typo --- src/gfa.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 5182aba20af..39861c4404a 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -1059,7 +1059,7 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, handle_t end_handle = graph->get_handle_of_step(trav[uncovered_interval.second]); vector end_handle_steps = graph->steps_of_handle(end_handle); - path_handle_t end_parent_path = graph->get_path_handle_of_step(cover_fragments.at(prev_frag_idx).steps.front()); + path_handle_t end_parent_path = 
graph->get_path_handle_of_step(cover_fragments.at(next_frag_idx).steps.front()); vector end_parent_step_indexes; for (int64_t i = 0; i < end_handle_steps.size(); ++i) { if (graph->get_path_handle_of_step(end_handle_steps[i]) == end_parent_path) { From cd1f4fddbc68166072763fa92e337abb5d91d085 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 29 May 2023 13:24:21 -0400 Subject: [PATCH 20/21] handle duplications when annotating parent paths --- src/gfa.cpp | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/gfa.cpp b/src/gfa.cpp index 39861c4404a..bdaade4b637 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -1049,6 +1049,18 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, start_parent_step_indexes.push_back(i); } } + if (start_parent_step_indexes.size() > 1) { + // fall back to brute force scan of the entire parent path + start_parent_step_indexes.clear(); + start_handle_steps.clear(); + for (const auto& parent_step : cover_fragments.at(prev_frag_idx).steps) { + if (graph->get_handle_of_step(parent_step) == start_handle) { + start_parent_step_indexes.push_back(start_handle_steps.size()); + start_handle_steps.push_back(parent_step); + // todo: break (but leave out for now to do below assertion) + } + } + } assert(start_parent_step_indexes.size() == 1); step_handle_t start_parent_step = start_handle_steps[start_parent_step_indexes[0]]; @@ -1066,6 +1078,18 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, end_parent_step_indexes.push_back(i); } } + if (end_parent_step_indexes.size() > 1) { + // fall back to brute force scan of the entire parent path + end_parent_step_indexes.clear(); + end_handle_steps.clear(); + for (const auto& parent_step : cover_fragments.at(next_frag_idx).steps) { + if (graph->get_handle_of_step(parent_step) == end_handle) { + end_parent_step_indexes.push_back(end_handle_steps.size()); + end_handle_steps.push_back(parent_step); + // todo: break (but leave out for now to do below assertion) + } + } + } assert(end_parent_step_indexes.size() == 1); step_handle_t end_parent_step = end_handle_steps[end_parent_step_indexes[0]]; From f26bf2db3ba8d580717e07c19e33026790a53e85 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 29 May 2023 16:19:13 -0400 Subject: [PATCH 21/21] more cycle edge cases --- src/gfa.cpp | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index bdaade4b637..873e313be12 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -906,13 +906,18 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, function>(const vector&)> get_uncovered_intervals = [&](const vector& trav) { vector> intervals; int64_t start = -1; + unordered_set dupe_check; for (size_t i = 0; i < trav.size(); ++i) { - bool covered = cover_node_to_fragment.count(graph->get_id(graph->get_handle_of_step(trav[i]))); - if (covered) { + nid_t node_id = graph->get_id(graph->get_handle_of_step(trav[i])); + bool covered = cover_node_to_fragment.count(node_id); + // we break at dupes even if uncovered -- never want same id twice in an interval + bool dupe = !covered && dupe_check.count(node_id); + dupe_check.insert(node_id); + if (covered || dupe) { if (start != -1) { intervals.push_back(make_pair(start, i)); } - start = -1; + start = dupe ? 
i : -1; } else { if (start == -1) { start = i; @@ -1054,7 +1059,7 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, start_parent_step_indexes.clear(); start_handle_steps.clear(); for (const auto& parent_step : cover_fragments.at(prev_frag_idx).steps) { - if (graph->get_handle_of_step(parent_step) == start_handle) { + if (graph->get_id(graph->get_handle_of_step(parent_step)) == graph->get_id(start_handle)) { start_parent_step_indexes.push_back(start_handle_steps.size()); start_handle_steps.push_back(parent_step); // todo: break (but leave out for now to do below assertion) @@ -1083,7 +1088,7 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, end_parent_step_indexes.clear(); end_handle_steps.clear(); for (const auto& parent_step : cover_fragments.at(next_frag_idx).steps) { - if (graph->get_handle_of_step(parent_step) == end_handle) { + if (graph->get_id(graph->get_handle_of_step(parent_step)) == graph->get_id(end_handle)) { end_parent_step_indexes.push_back(end_handle_steps.size()); end_handle_steps.push_back(parent_step); // todo: break (but leave out for now to do below assertion)
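End of the excerpted series. As a compact reference for the cycle handling added in patch 21, the uncovered-interval scan reduces to the following stand-alone sketch, written over node ids instead of steps; the closing flush after the loop is assumed from the unchanged remainder of the function.

#include <cassert>
#include <cstdint>
#include <unordered_set>
#include <utility>
#include <vector>

// Split a traversal (given here as node ids) into maximal runs of uncovered nodes,
// closing a run at any covered node and also at a repeated id, which restarts a
// new run at the repeat -- the rule introduced for cycles in patch 21.
std::vector<std::pair<int64_t, int64_t>> uncovered_intervals(
        const std::vector<int64_t>& node_ids,
        const std::unordered_set<int64_t>& covered) {
    std::vector<std::pair<int64_t, int64_t>> intervals;
    int64_t start = -1;
    std::unordered_set<int64_t> seen;
    for (int64_t i = 0; i < (int64_t)node_ids.size(); ++i) {
        bool is_covered = covered.count(node_ids[i]);
        bool dupe = !is_covered && seen.count(node_ids[i]);
        seen.insert(node_ids[i]);
        if (is_covered || dupe) {
            if (start != -1) {
                intervals.push_back({start, i});
            }
            start = dupe ? i : -1;
        } else if (start == -1) {
            start = i;
        }
    }
    if (start != -1) {
        intervals.push_back({start, (int64_t)node_ids.size()});
    }
    return intervals;
}

int main() {
    // node 2 repeats (a cycle); node 9 is already covered by the reference fragment
    auto ivs = uncovered_intervals({1, 2, 3, 2, 9, 5}, {9});
    assert(ivs.size() == 3);
    assert(ivs[0] == (std::pair<int64_t, int64_t>{0, 3}));  // 1,2,3
    assert(ivs[1] == (std::pair<int64_t, int64_t>{3, 4}));  // the repeated 2
    assert(ivs[2] == (std::pair<int64_t, int64_t>{5, 6}));  // 5, after skipping covered 9
    return 0;
}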