From 56afa41522b92b145e40896b51733953675fac4a Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 20 Mar 2023 13:21:40 -0400 Subject: [PATCH 01/12] init rgfa commit --- src/gfa.cpp | 339 ++++++++++++++++++++++++++++++++++ src/gfa.hpp | 42 +++++ src/subcommand/paths_main.cpp | 90 ++++++--- 3 files changed, 448 insertions(+), 23 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 435fd29dcd3..0a21908983b 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -5,6 +5,8 @@ #include +#define debug + namespace vg { using namespace std; @@ -247,4 +249,341 @@ void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle_t path out << "\n"; } +int rgfa_rank(const string& path_name) { + int rank = -1; + size_t pos = path_name.rfind(":SR:i:"); + if (pos != string::npos && path_name.length() - pos >= 6) { + pos += 6; + size_t pos2 = path_name.find(":", pos); + size_t len = pos2 == string::npos ? pos2 : pos2 - pos; + string rank_string = path_name.substr(pos, len); + rank = parse(rank_string); + } + return rank; +} + +string set_rgfa_rank(const string& path_name, int rgfa_rank) { + string new_name; + // check if we have an existing rank. if so, we srap it. 
+ size_t pos = path_name.rfind(":SR:i:"); + if (pos != string::npos && path_name.length() - pos >= 6) { + size_t pos2 = path_name.find(":", pos + 6); + new_name = path_name.substr(0, pos); + if (pos2 != string::npos) { + new_name += path_name.substr(pos2); + } + } else { + new_name = path_name; + } + + // now append the rank + new_name += ":SR:i:" + std::to_string(rgfa_rank); + return new_name; +} + +void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, + SnarlManager* snarl_manager, + const unordered_set& reference_paths, + int64_t minimum_length, + const unordered_map>>& preferred_intervals){ + + // we use the path traversal finder for everything + // (even gbwt haplotypes, as we're using the path handle interface) + PathTraversalFinder path_trav_finder(*graph, *snarl_manager); + + // we collect the rgfa cover in parallel as a list of path fragments + size_t thread_count = get_thread_count(); + vector>>> thread_covers(thread_count); + + // we process top-level snarl_manager in parallel + snarl_manager->for_each_top_level_snarl_parallel([&](const Snarl* snarl) { + // index nodes in rgfa cover to prevent overlaps + unordered_set cover_nodes; + // per-thread output + vector>>& cover_fragments = thread_covers.at(omp_get_thread_num()); + + // we keep track of ranks. rgfa ranks are relative to the reference path, not the snarl tree + // so we begin at -1 which means unknown. 
it will turn to 0 at first reference anchored snarl found + // -1 rank snarls cannot be covered by this algorithm + vector> queue; // rank,snarl + + queue.push_back(make_pair(-1, snarl)); + + while(!queue.empty()) { + pair rank_snarl = queue.back(); + queue.pop_back(); + + // get the snarl cover, writing to cover_nodes and cover_fragments + // note that we are single-threaded per top-level snarl, at least for now + // this is because parent snarls and child snarls can potentially cover the + // sname things + int64_t rgfa_rank = rgfa_snarl_cover(graph, + *snarl, + path_trav_finder, + reference_paths, + minimum_length, + -1, + cover_nodes, + cover_fragments, + preferred_intervals); + + // we don't even attempt to cover rank -1 snarls, instead just + // recurse until we find a reference path. + // this does put a reference path / snarl decomposition alignment + // requirement on this code + int64_t child_rank = rgfa_rank < 0 ? rgfa_rank : rgfa_rank + 1; + + // recurse on the children + const vector& children = snarl_manager->children_of(snarl); + for (const Snarl* child_snarl : children) { + queue.push_back(make_pair(child_rank, child_snarl)); + } + } + }); + + // merge up the thread covers + vector>> rgfa_fragments = std::move(thread_covers.at(0)); + for (size_t t = 1; t < thread_count; ++t) { + rgfa_fragments.reserve(rgfa_fragments.size() + thread_covers.at(t).size()); + std::move(thread_covers.at(t).begin(), thread_covers.at(t).end(), std::back_inserter(rgfa_fragments)); + } + thread_covers.clear(); + + // we don't have a path position interface, and even if we did we probably wouldn't have it on every path + // so to keep running time linear, we need to index the fragments so their offsets can be computed in one scan + // begin by sorting by path + unordered_map> path_to_fragments; + for (size_t i = 0; i get_path_handle_of_step(rgfa_fragment.second.front()); + path_to_fragments[path_handle].push_back(i); + } + + for (const auto& path_fragments : 
path_to_fragments) { + const path_handle_t& path_handle = path_fragments.first; + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(graph->get_path_name(path_handle), path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + const vector& fragments = path_fragments.second; + + // for each path, start by finding the positional offset of all relevant steps in the path by brute-force scan + unordered_map step_to_pos; + for (const int64_t& frag_idx : fragments) { + const vector& rgfa_fragment = rgfa_fragments.at(frag_idx).second; + step_to_pos[rgfa_fragment.front()] = -1; + // todo: need to figure out where exactly we handle all the different orientation cases + if (rgfa_fragment.size() > 1) { + step_to_pos[rgfa_fragment.back()] = -1; + } + } + size_t pos_count = 0; + size_t pos = 0; + graph->for_each_step_in_path(path_handle, [&](const step_handle_t& step_handle) { + if (step_to_pos.count(step_handle)) { + step_to_pos[step_handle] = pos; + ++pos_count; + if (pos_count == step_to_pos.size()) { + return false; + } + } + handle_t handle = graph->get_handle_of_step(step_handle); + pos += graph->get_length(handle); + return true; + }); + assert(pos_count == step_to_pos.size()); + + // second pass to make the path fragments, now that we know the positional offsets of their endpoints + for (const int64_t frag_idx : fragments) { + int64_t rgfa_rank = rgfa_fragments.at(frag_idx).first; + const vector& rgfa_fragment = rgfa_fragments.at(frag_idx).second; + + size_t rgfa_frag_pos = step_to_pos[rgfa_fragment.front()]; + size_t rgfa_frag_length = 0; + for (const step_handle_t& step : rgfa_fragment) { + rgfa_frag_length += graph->get_length(graph->get_handle_of_step(step)); + } + subrange_t rgfa_frag_subrange; + rgfa_frag_subrange.first = rgfa_frag_pos + (path_subrange != PathMetadata::NO_SUBRANGE ? 
path_subrange.first : 0); + rgfa_frag_subrange.second = rgfa_frag_subrange.first + rgfa_frag_length; + + string rgfa_frag_name = PathMetadata::create_path_name(path_sense, path_sample, path_locus, path_haplotype, + path_phase_block, rgfa_frag_subrange); + + rgfa_frag_name = set_rgfa_rank(rgfa_frag_name, rgfa_rank); + +#ifdef debug +#pragma omp critical(cerr) + cerr << "making new rgfa fragment with name " << rgfa_frag_name << " and " << rgfa_fragment.size() << " steps. subrange " + << rgfa_frag_subrange.first << "," << rgfa_frag_subrange.second << endl; +#endif + path_handle_t rgfa_fragment_handle = graph->create_path_handle(rgfa_frag_name); + for (const step_handle_t& step : rgfa_fragment) { + graph->append_step(rgfa_fragment_handle, graph->get_handle_of_step(step)); + } + } + } +} + +int64_t rgfa_snarl_cover(const PathHandleGraph* graph, + const Snarl& snarl, + PathTraversalFinder& path_trav_finder, + const unordered_set& reference_paths, + int64_t minimum_length, + int64_t rgfa_rank, + unordered_set& cover_nodes, + vector>>& cover_fragments, + const unordered_map>>& preferred_intervals) { + +#ifdef debug +#pragma omp critical(cerr) + cerr << "calling rgfa_snarl_cover with rank=" << rgfa_rank << " on " << pb2json(snarl) << endl; +#endif + + // // start by finding the path traversals through the snarl + vector> travs; + { + pair, vector > > path_travs = path_trav_finder.find_path_traversals(snarl); + travs.reserve(path_travs.first.size()); + + // reduce protobuf usage by going back to vector of steps instead of keeping SnarlTraversals around + for (int64_t i = 0; i < path_travs.first.size(); ++i) { + bool reversed = false; + if (graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].first)) != snarl.start().backward()) { + reversed == true; + } + assert((graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].second)) != snarl.end().backward()) == reversed); + vector trav; + trav.reserve(path_travs.first[i].visit_size()); + bool done = 
false; + function visit_next_step = [&graph,&reversed](step_handle_t step_handle) { + return reversed ? graph->get_previous_step(step_handle) : graph->get_next_step(step_handle); + }; + for (step_handle_t step_handle = path_travs.second[i].first; !done; step_handle = visit_next_step(step_handle)) { + trav.push_back(step_handle); + if (step_handle == path_travs.second[i].second) { + done = true; + } + } + travs.push_back(trav); + } + } + + // find all reference paths through the snarl + map ref_paths; + for (int64_t i = 0; i < travs.size(); ++i) { + path_handle_t trav_path = graph->get_path_handle_of_step(travs[i].front()); + if (reference_paths.count(trav_path)) { + ref_paths[graph->get_path_name(trav_path)] = i; + } + } + + if (ref_paths.empty() && rgfa_rank <= 0) { + // we're not nested in a reference snarl, and we have no reference path + // by the current logic, there's nothing to be done. + cerr << "[rgfa] warning: No referene path through snarl " + << pb2json(snarl) << ": unable to process for rGFA cover" << endl; + return -1; + } + + if (ref_paths.size() > 1) { + // And we could probably cope with this... 
but don't for now + cerr << "[rgfa] error: Mutiple reference path traversals found through snarl " + << pb2json(snarl) << endl; + } + + if (!ref_paths.empty()) { + // reference paths are trivially covered outside the snarl decomposition + // all we do here is make sure the relevant nodes are flagged in the map + vector& ref_trav = travs.at(ref_paths.begin()->second); + for (step_handle_t ref_step_handle : ref_trav) { + cover_nodes.insert(graph->get_id(graph->get_handle_of_step(ref_step_handle))); + } + // this is the rank going forward for all the coers we add + // (note: we're not adding rank-0 intervals in this function -- that's done in a separate pass above) + rgfa_rank = 1; + } + +#ifdef debug +#pragma omp critical(cerr) + cerr << "found " << travs.size() << " traversals including " << ref_paths.size() << " reference traversals" << endl; +#endif + + + // find all intervals within a snarl traversal that are completely uncovered. + // the returned intervals are open-ended. + function>(const vector&)> get_uncovered_intervals = [&](const vector& trav) { + vector> intervals; + int64_t start = -1; + for (size_t i = 0; i < trav.size(); ++i) { + bool covered = cover_nodes.count(graph->get_id(graph->get_handle_of_step(trav[i]))); + if (covered) { + if (start != -1) { + intervals.push_back(make_pair(start, i)); + } + start = -1; + } else { + if (start == -1) { + start = i; + } + } + } + if (start != -1) { + intervals.push_back(make_pair(start, trav.size())); + } + return intervals; + }; + + // now we try to find candidate rgfa intervals in the other traversals + // there's lots of room here for smarter heuristics, but right now we just + // do first come first served. + for (int64_t trav_idx = 0; trav_idx < travs.size(); ++trav_idx) { + // todo: this map seems backwards? 
note really a big deal since + // we're only allowing one element + bool is_ref = false; + for (const auto& ref_trav : ref_paths) { + if (ref_trav.second == trav_idx) { + is_ref = true; + break; + } + } + if (is_ref) { + continue; + } + const vector& trav = travs.at(trav_idx); + vector> uncovered_intervals = get_uncovered_intervals(trav); + +#ifdef debug +#pragma omp critical(cerr) + cerr << "found " << uncovered_intervals.size() << "uncovered intervals in traversal " << trav_idx << endl; +#endif + + for (const auto& uncovered_interval : uncovered_intervals) { + int64_t interval_length = 0; + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + interval_length += graph->get_length(graph->get_handle_of_step(trav[i])); + } + if (interval_length >= minimum_length) { + // update the cover + vector interval; + interval.reserve(uncovered_interval.second - uncovered_interval.first); + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + interval.push_back(trav[i]); + cover_nodes.insert(graph->get_id(graph->get_handle_of_step(trav[i]))); + } + cover_fragments.push_back(make_pair(rgfa_rank, std::move(interval))); + } + } + } + + return rgfa_rank; +} + + + } diff --git a/src/gfa.hpp b/src/gfa.hpp index 3e914c21499..63f2c95c05c 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -12,6 +12,8 @@ */ #include "handle.hpp" +#include "snarls.hpp" +#include "traversal_finder.hpp" namespace vg { @@ -26,6 +28,46 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, bool rgfa_pline = false, bool use_w_lines = true); + +/// Prototype code to tag paths as rGFA paths. Either needs to be completely scrapped +/// or adapted into libhandlegraph at some point, ideally. +/// It works by adding :SR:i: to the end of the name + +/// Returns the RGFA rank (SR) of a path. This will be 0 for the reference +/// backbone, and higher the further number of (nested) bubbles away it is. 
+/// If the path is not an RGFA path, then return -1 +int rgfa_rank(const string& path_name); + +/// Add the RGFA rank tag to a pathname +string set_rgfa_rank(const string& path_name, int rgfa_rank); + +/// Compute the rGFA path cover +/// graph: the graph +/// snarl_manager: the snarls (todo: should use distance index) +/// reference_paths: rank-0 paths +/// minimum_length: the minimum length of a path to create (alleles shorter than this can be uncovered) +/// preferred_intervals: set of ranges (ex from minigraph) to use as possible for rGFA paths +void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, + SnarlManager* snarl_manager, + const unordered_set& reference_paths, + int64_t minimum_length, + const unordered_map>>& preferred_intervals = {}); + +int64_t rgfa_snarl_cover(const PathHandleGraph* graph, + const Snarl& snarl, + PathTraversalFinder& path_trav_finder, + const unordered_set& reference_paths, + int64_t minimum_length, + int64_t rgfa_rank, + unordered_set& cover_nodes, + vector>>& cover_fragments, + const unordered_map>>& preferred_intervals); + + +/// Extract rGFA tags from minigraph GFA in order to pass in as hints above +unordered_map>> extract_rgfa_intervals(const string& rgfa_path); + + } #endif diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index dd8589947a1..6ffc2515aea 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -16,6 +16,8 @@ #include "../vg.hpp" #include "../xg.hpp" #include "../gbwt_helper.hpp" +#include "../integrated_snarl_finder.hpp" +#include "../gfa.hpp" #include #include #include @@ -37,6 +39,10 @@ void help_paths(char** argv) { << " -V, --extract-vg output a path-only graph covering the selected paths" << endl << " -d, --drop-paths output a graph with the selected paths removed" << endl << " -r, --retain-paths output a graph with only the selected paths retained" << endl + << " rGFA cover" << endl + << " -R, --rgfa-min-length N add rGFA cover to graph, using 
seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl + << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." << endl + << " -t, --threads N use up to N threads when computing rGFA covoer (default: all available)" << endl << " output path data:" << endl << " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl << " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl @@ -99,6 +105,8 @@ int main_paths(int argc, char** argv) { bool extract_as_fasta = false; bool drop_paths = false; bool retain_paths = false; + int64_t rgfa_min_len = -1; + string snarl_filename; string graph_file; string gbwt_file; string path_prefix; @@ -131,6 +139,7 @@ int main_paths(int argc, char** argv) { {"extract-vg", no_argument, 0, 'V'}, {"drop-paths", no_argument, 0, 'd'}, {"retain-paths", no_argument, 0, 'r'}, + {"rgfa-cover", required_argument, 0, 'R'}, {"extract-gam", no_argument, 0, 'X'}, {"extract-gaf", no_argument, 0, 'A'}, {"list", no_argument, 0, 'L'}, @@ -144,16 +153,15 @@ int main_paths(int argc, char** argv) { {"variant-paths", no_argument, 0, 'a'}, {"generic-paths", no_argument, 0, 'G'}, {"coverage", no_argument, 0, 'c'}, - + {"threads", required_argument, 0, 't'}, // Hidden options for backward compatibility. - {"threads", no_argument, 0, 'T'}, {"threads-by", required_argument, 0, 'q'}, {0, 0, 0, 0} }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:draGp:c", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:aGp:ct:", long_options, &option_index); // Detect the end of the options. 
@@ -182,12 +190,17 @@ int main_paths(int argc, char** argv) { case 'd': drop_paths = true; output_formats++; - break; + break; case 'r': retain_paths = true; output_formats++; break; + + case 'R': + rgfa_min_len = parse(optarg); + output_formats++; + break; case 'X': extract_as_gam = true; @@ -260,9 +273,16 @@ int main_paths(int argc, char** argv) { output_formats++; break; - case 'T': - std::cerr << "warning: [vg paths] option --threads is obsolete and unnecessary" << std::endl; + case 't': + { + int num_threads = parse(optarg); + if (num_threads <= 0) { + cerr << "error:[vg call] Thread count (-t) set to " << num_threads << ", must set to a positive integer." << endl; + exit(1); + } + omp_set_num_threads(num_threads); break; + } case 'q': std::cerr << "warning: [vg paths] option --threads-by is deprecated; please use --paths-by" << std::endl; @@ -299,7 +319,7 @@ int main_paths(int argc, char** argv) { } } if (output_formats != 1) { - std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -L, -F, -E, -C or -c) must be specified" << std::endl; + std::cerr << "error: [vg paths] one output format (-X, -A, -V, -d, -r, -R, -L, -F, -E, -C or -c) must be specified" << std::endl; std::exit(EXIT_FAILURE); } if (selection_criteria > 1) { @@ -350,9 +370,23 @@ int main_paths(int argc, char** argv) { exit(1); } } - - - + + // Load or compute the snarls + unique_ptr snarl_manager; + if (rgfa_min_len >= 0) { + if (!snarl_filename.empty()) { + ifstream snarl_file(snarl_filename.c_str()); + if (!snarl_file) { + cerr << "Error [vg paths]: Unable to load snarls file: " << snarl_filename << endl; + return 1; + } + snarl_manager = vg::io::VPKG::load_one(snarl_file); + } else { + IntegratedSnarlFinder finder(*graph); + snarl_manager = unique_ptr(new SnarlManager(std::move(finder.find_snarls_parallel()))); + } + } + set path_names; if (!path_file.empty()) { ifstream path_stream(path_file); @@ -545,7 +579,7 @@ int main_paths(int argc, char** argv) { }; - if (drop_paths || 
retain_paths) { + if (drop_paths || retain_paths || rgfa_min_len >= 0) { MutablePathMutableHandleGraph* mutable_graph = dynamic_cast(graph.get()); if (!mutable_graph) { std::cerr << "error[vg paths]: graph cannot be modified" << std::endl; @@ -557,20 +591,30 @@ int main_paths(int argc, char** argv) { exit(1); } - vector to_destroy; - if (drop_paths) { - for_each_selected_path([&](const path_handle_t& path_handle) { - string name = graph->get_path_name(path_handle); - to_destroy.push_back(name); - }); + if (drop_paths || retain_paths) { + vector to_destroy; + if (drop_paths) { + for_each_selected_path([&](const path_handle_t& path_handle) { + string name = graph->get_path_name(path_handle); + to_destroy.push_back(name); + }); + } else { + for_each_unselected_path([&](const path_handle_t& path_handle) { + string name = graph->get_path_name(path_handle); + to_destroy.push_back(name); + }); + } + for (string& path_name : to_destroy) { + mutable_graph->destroy_path(graph->get_path_handle(path_name)); + } } else { - for_each_unselected_path([&](const path_handle_t& path_handle) { - string name = graph->get_path_name(path_handle); - to_destroy.push_back(name); + assert(rgfa_min_len >= 0); + unordered_set reference_paths; + for_each_selected_path([&](const path_handle_t& path_handle) { + reference_paths.insert(path_handle); }); - } - for (string& path_name : to_destroy) { - mutable_graph->destroy_path(graph->get_path_handle(path_name)); + + rgfa_graph_cover(mutable_graph, snarl_manager.get(), reference_paths, rgfa_min_len); } // output the graph From 9983dc73448549133ae4b6680a50cc5fb6f10e2a Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Mon, 20 Mar 2023 21:54:27 -0400 Subject: [PATCH 02/12] clean up recursion a bit --- src/gfa.cpp | 117 +++++++++++++++++++++++++++++----------------------- src/gfa.hpp | 19 ++++----- 2 files changed, 74 insertions(+), 62 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 0a21908983b..30bc0299b44 100644 --- a/src/gfa.cpp +++ 
b/src/gfa.cpp @@ -249,7 +249,7 @@ void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle_t path out << "\n"; } -int rgfa_rank(const string& path_name) { +int get_rgfa_rank(const string& path_name) { int rank = -1; size_t pos = path_name.rfind(":SR:i:"); if (pos != string::npos && path_name.length() - pos >= 6) { @@ -297,46 +297,37 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, // we process top-level snarl_manager in parallel snarl_manager->for_each_top_level_snarl_parallel([&](const Snarl* snarl) { - // index nodes in rgfa cover to prevent overlaps - unordered_set cover_nodes; // per-thread output + // each fragment is a rank and vector of steps, the cove is a list of fragments + // TODO: we can store just a first step and count instead of every fragment vector>>& cover_fragments = thread_covers.at(omp_get_thread_num()); + // we also index the fragments by their node ids for fast lookups of what's covered by what + // the value here is an index in the above vector + unordered_map cover_node_to_fragment; - // we keep track of ranks. rgfa ranks are relative to the reference path, not the snarl tree - // so we begin at -1 which means unknown. 
it will turn to 0 at first reference anchored snarl found - // -1 rank snarls cannot be covered by this algorithm - vector> queue; // rank,snarl - - queue.push_back(make_pair(-1, snarl)); + vector queue = {snarl}; while(!queue.empty()) { - pair rank_snarl = queue.back(); + const Snarl* cur_snarl = queue.back(); queue.pop_back(); // get the snarl cover, writing to cover_nodes and cover_fragments // note that we are single-threaded per top-level snarl, at least for now // this is because parent snarls and child snarls can potentially cover the // sname things - int64_t rgfa_rank = rgfa_snarl_cover(graph, - *snarl, - path_trav_finder, - reference_paths, - minimum_length, - -1, - cover_nodes, - cover_fragments, - preferred_intervals); - - // we don't even attempt to cover rank -1 snarls, instead just - // recurse until we find a reference path. - // this does put a reference path / snarl decomposition alignment - // requirement on this code - int64_t child_rank = rgfa_rank < 0 ? rgfa_rank : rgfa_rank + 1; + rgfa_snarl_cover(graph, + *cur_snarl, + path_trav_finder, + reference_paths, + minimum_length, + cover_fragments, + cover_node_to_fragment, + preferred_intervals); // recurse on the children - const vector& children = snarl_manager->children_of(snarl); + const vector& children = snarl_manager->children_of(cur_snarl); for (const Snarl* child_snarl : children) { - queue.push_back(make_pair(child_rank, child_snarl)); + queue.push_back(child_snarl); } } }); @@ -430,19 +421,18 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, } } -int64_t rgfa_snarl_cover(const PathHandleGraph* graph, - const Snarl& snarl, - PathTraversalFinder& path_trav_finder, - const unordered_set& reference_paths, - int64_t minimum_length, - int64_t rgfa_rank, - unordered_set& cover_nodes, - vector>>& cover_fragments, - const unordered_map>>& preferred_intervals) { +void rgfa_snarl_cover(const PathHandleGraph* graph, + const Snarl& snarl, + PathTraversalFinder& path_trav_finder, + const 
unordered_set& reference_paths, + int64_t minimum_length, + vector>>& cover_fragments, + unordered_map& cover_node_to_fragment, + const unordered_map>>& preferred_intervals) { #ifdef debug #pragma omp critical(cerr) - cerr << "calling rgfa_snarl_cover with rank=" << rgfa_rank << " on " << pb2json(snarl) << endl; + cerr << "calling rgfa_snarl_cover on " << pb2json(snarl) << endl; #endif // // start by finding the path traversals through the snarl @@ -453,10 +443,17 @@ int64_t rgfa_snarl_cover(const PathHandleGraph* graph, // reduce protobuf usage by going back to vector of steps instead of keeping SnarlTraversals around for (int64_t i = 0; i < path_travs.first.size(); ++i) { + string trav_path_name = graph->get_path_name(graph->get_path_handle_of_step(path_travs.second[i].first)); + if (get_rgfa_rank(trav_path_name) > 0) { + // we ignore existing (off-reference) rGFA paths + // todo: shoulgd there be better error handling? + cerr << "Warning : skipping existing rgfa traversal " << trav_path_name << endl; + continue; + } bool reversed = false; if (graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].first)) != snarl.start().backward()) { - reversed == true; - } + reversed = true; + } assert((graph->get_is_reverse(graph->get_handle_of_step(path_travs.second[i].second)) != snarl.end().backward()) == reversed); vector trav; trav.reserve(path_travs.first[i].visit_size()); @@ -483,12 +480,12 @@ int64_t rgfa_snarl_cover(const PathHandleGraph* graph, } } - if (ref_paths.empty() && rgfa_rank <= 0) { + if (ref_paths.empty() && !cover_node_to_fragment.count(snarl.start().node_id())) { // we're not nested in a reference snarl, and we have no reference path // by the current logic, there's nothing to be done. 
cerr << "[rgfa] warning: No referene path through snarl " << pb2json(snarl) << ": unable to process for rGFA cover" << endl; - return -1; + return; } if (ref_paths.size() > 1) { @@ -502,11 +499,10 @@ int64_t rgfa_snarl_cover(const PathHandleGraph* graph, // all we do here is make sure the relevant nodes are flagged in the map vector& ref_trav = travs.at(ref_paths.begin()->second); for (step_handle_t ref_step_handle : ref_trav) { - cover_nodes.insert(graph->get_id(graph->get_handle_of_step(ref_step_handle))); + nid_t node_id = graph->get_id(graph->get_handle_of_step(ref_step_handle)); + // using -1 as special signifier of the reference path + cover_node_to_fragment[node_id] = -1; } - // this is the rank going forward for all the coers we add - // (note: we're not adding rank-0 intervals in this function -- that's done in a separate pass above) - rgfa_rank = 1; } #ifdef debug @@ -521,7 +517,7 @@ int64_t rgfa_snarl_cover(const PathHandleGraph* graph, vector> intervals; int64_t start = -1; for (size_t i = 0; i < trav.size(); ++i) { - bool covered = cover_nodes.count(graph->get_id(graph->get_handle_of_step(trav[i]))); + bool covered = cover_node_to_fragment.count(graph->get_id(graph->get_handle_of_step(trav[i]))); if (covered) { if (start != -1) { intervals.push_back(make_pair(start, i)); @@ -560,11 +556,30 @@ int64_t rgfa_snarl_cover(const PathHandleGraph* graph, #ifdef debug #pragma omp critical(cerr) - cerr << "found " << uncovered_intervals.size() << "uncovered intervals in traversal " << trav_idx << endl; + cerr << "found " << uncovered_intervals.size() << " uncovered intervals in traversal " << trav_idx << endl; #endif for (const auto& uncovered_interval : uncovered_intervals) { - int64_t interval_length = 0; + // we check the "backbone" interval that this interval is coming off + // since we've already covered the reference, then we know that this interval + // doesn't span the whole snarl including endpoints, so we can always afford + // to look back and ahead 
one + cerr << "uncovered interval " << uncovered_interval.first << ", " << uncovered_interval.second << " vs trav size " << trav.size() << endl; + assert(uncovered_interval.first > 0 && uncovered_interval.second < trav.size()); + int64_t prev_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.first - 1]))); + int64_t next_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.second]))); + if (prev_frag_idx != next_frag_idx) { + // todo: figure out policy for these + cerr << "[rgfa] warning: skipping fragment that is connected to different fragmengs" << endl; + continue; + } + int64_t fragment_rank; + if (prev_frag_idx == -1) { + fragment_rank = 1; + } else { + fragment_rank = cover_fragments.at(prev_frag_idx).first + 1; + } + int64_t interval_length = 0; for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { interval_length += graph->get_length(graph->get_handle_of_step(trav[i])); } @@ -574,14 +589,12 @@ int64_t rgfa_snarl_cover(const PathHandleGraph* graph, interval.reserve(uncovered_interval.second - uncovered_interval.first); for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { interval.push_back(trav[i]); - cover_nodes.insert(graph->get_id(graph->get_handle_of_step(trav[i]))); + cover_node_to_fragment[graph->get_id(graph->get_handle_of_step(trav[i]))] = cover_fragments.size(); } - cover_fragments.push_back(make_pair(rgfa_rank, std::move(interval))); + cover_fragments.push_back(make_pair(fragment_rank, std::move(interval))); } } } - - return rgfa_rank; } diff --git a/src/gfa.hpp b/src/gfa.hpp index 63f2c95c05c..ebfaf3df312 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -36,7 +36,7 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, /// Returns the RGFA rank (SR) of a path. This will be 0 for the reference /// backbone, and higher the further number of (nested) bubbles away it is. 
/// If the path is not an RGFA path, then return -1 -int rgfa_rank(const string& path_name); +int get_rgfa_rank(const string& path_name); /// Add the RGFA rank tag to a pathname string set_rgfa_rank(const string& path_name, int rgfa_rank); @@ -53,15 +53,14 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, int64_t minimum_length, const unordered_map>>& preferred_intervals = {}); -int64_t rgfa_snarl_cover(const PathHandleGraph* graph, - const Snarl& snarl, - PathTraversalFinder& path_trav_finder, - const unordered_set& reference_paths, - int64_t minimum_length, - int64_t rgfa_rank, - unordered_set& cover_nodes, - vector>>& cover_fragments, - const unordered_map>>& preferred_intervals); +void rgfa_snarl_cover(const PathHandleGraph* graph, + const Snarl& snarl, + PathTraversalFinder& path_trav_finder, + const unordered_set& reference_paths, + int64_t minimum_length, + vector>>& cover_fragments, + unordered_map& cover_node_to_fragment, + const unordered_map>>& preferred_intervals); /// Extract rGFA tags from minigraph GFA in order to pass in as hints above From 178c1f349b4add484cb13baf3afdeb2cbbaf1091 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 21 Mar 2023 13:57:40 -0400 Subject: [PATCH 03/12] implement greedy rgfa optimizer --- src/gfa.cpp | 193 ++++++++++++++++++++++++++++++++++++++++++---------- src/gfa.hpp | 12 +++- 2 files changed, 168 insertions(+), 37 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 30bc0299b44..064a58a9e28 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -2,6 +2,7 @@ #include "utility.hpp" #include "path.hpp" #include +#include #include @@ -228,7 +229,7 @@ void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle_t path if (phase_block_end_cursor != 0) { if (start_offset != 0) { // TODO: Work out a way to support phase blocks and subranges at the same time. 
- cerr << "[gfa] error: cannot write multiple phase blocks on a sample, haplotyope, and contig in GFA format" + cerr << "[gfa] error: cannot write multiple phase blocks on a sample, haplotype, and contig in GFA format" << " when paths already have subranges. Fix path " << graph->get_path_name(path_handle) << endl; exit(1); } @@ -360,6 +361,7 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, subrange_t path_subrange; PathMetadata::parse_path_name(graph->get_path_name(path_handle), path_sense, path_sample, path_locus, path_haplotype, path_phase_block, path_subrange); + PathSense out_path_sense = path_sense == PathSense::HAPLOTYPE ? PathSense::GENERIC : path_sense; const vector& fragments = path_fragments.second; @@ -403,7 +405,7 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, rgfa_frag_subrange.first = rgfa_frag_pos + (path_subrange != PathMetadata::NO_SUBRANGE ? path_subrange.first : 0); rgfa_frag_subrange.second = rgfa_frag_subrange.first + rgfa_frag_length; - string rgfa_frag_name = PathMetadata::create_path_name(path_sense, path_sample, path_locus, path_haplotype, + string rgfa_frag_name = PathMetadata::create_path_name(out_path_sense, path_sample, path_locus, path_haplotype, path_phase_block, rgfa_frag_subrange); rgfa_frag_name = set_rgfa_rank(rgfa_frag_name, rgfa_rank); @@ -535,9 +537,8 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, return intervals; }; - // now we try to find candidate rgfa intervals in the other traversals - // there's lots of room here for smarter heuristics, but right now we just - // do first come first served. + // build an initial ranked list of candidate traversal fragments + vector, pair>>> ranked_trav_fragments; for (int64_t trav_idx = 0; trav_idx < travs.size(); ++trav_idx) { // todo: this map seems backwards? 
note really a big deal since // we're only allowing one element @@ -554,47 +555,167 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, const vector& trav = travs.at(trav_idx); vector> uncovered_intervals = get_uncovered_intervals(trav); -#ifdef debug -#pragma omp critical(cerr) - cerr << "found " << uncovered_intervals.size() << " uncovered intervals in traversal " << trav_idx << endl; -#endif - for (const auto& uncovered_interval : uncovered_intervals) { - // we check the "backbone" interval that this interval is coming off - // since we've already covered the reference, then we know that this interval - // doesn't span the whole snarl including endpoints, so we can always afford - // to look back and ahead one - cerr << "uncovered interval " << uncovered_interval.first << ", " << uncovered_interval.second << " vs trav size " << trav.size() << endl; - assert(uncovered_interval.first > 0 && uncovered_interval.second < trav.size()); - int64_t prev_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.first - 1]))); - int64_t next_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.second]))); - if (prev_frag_idx != next_frag_idx) { - // todo: figure out policy for these - cerr << "[rgfa] warning: skipping fragment that is connected to different fragmengs" << endl; - continue; - } - int64_t fragment_rank; - if (prev_frag_idx == -1) { - fragment_rank = 1; - } else { - fragment_rank = cover_fragments.at(prev_frag_idx).first + 1; - } int64_t interval_length = 0; for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { interval_length += graph->get_length(graph->get_handle_of_step(trav[i])); } if (interval_length >= minimum_length) { - // update the cover - vector interval; - interval.reserve(uncovered_interval.second - uncovered_interval.first); - for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { - 
interval.push_back(trav[i]); - cover_node_to_fragment[graph->get_id(graph->get_handle_of_step(trav[i]))] = cover_fragments.size(); + auto trav_stats = rgfa_traversal_stats(graph, trav, uncovered_interval); + ranked_trav_fragments.push_back(make_pair(trav_stats, make_pair(trav_idx, uncovered_interval))); + } + } + } + + // todo: typedef! + function, pair>>& s1, + const pair, pair>>& s2)> heap_comp = + [](const pair, pair>>& s1, + const pair, pair>>& s2) { + return s1.first < s2.first; + }; + + // put the fragments into a max heap + std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + + // now greedily pull out traversal intervals from the ranked list until none are left + while (!ranked_trav_fragments.empty()) { + + // get the best scoring (max) fragment from heap + auto& best_stats_fragment = ranked_trav_fragments.front(); + std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + ranked_trav_fragments.pop_back(); + + const vector& trav = travs.at(best_stats_fragment.second.first); + const pair& uncovered_interval = best_stats_fragment.second.second; + + // our traversal may have been partially covered by a different iteration, if so, we need to break it up + // and continue + vector> chopped_intervals; + int64_t cur_start = -1; + bool chopped = false; + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + bool covered = cover_node_to_fragment.count(graph->get_id(graph->get_handle_of_step(trav[i]))); + if (!covered && cur_start == -1) { + cur_start = i; + } else if (covered) { + chopped = true; + if (cur_start != -1) { + chopped_intervals.push_back(make_pair(cur_start, i)); + cur_start = -1; + } + } + } + if (cur_start != -1) { + chopped_intervals.push_back(make_pair(cur_start, uncovered_interval.second)); + } + if (chopped) { + for (const pair& chopped_interval : chopped_intervals) { + int64_t chopped_trav_length = 0; + for (int64_t i = chopped_interval.first; i < 
chopped_interval.second; ++i) { + chopped_trav_length += graph->get_length(graph->get_handle_of_step(trav[i])); + } + if (chopped_trav_length >= minimum_length) { + auto chopped_stats = rgfa_traversal_stats(graph, trav, chopped_interval); + ranked_trav_fragments.push_back(make_pair(chopped_stats, make_pair(best_stats_fragment.second.first, chopped_interval))); + std::push_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + cerr << "pushing interval " << endl; } - cover_fragments.push_back(make_pair(fragment_rank, std::move(interval))); } + continue; } + + // we check the "backbone" interval that this interval is coming off + // since we've already covered the reference, then we know that this interval + // doesn't span the whole snarl including endpoints, so we can always afford + // to look back and ahead one + cerr << "uncovered interval " << uncovered_interval.first << ", " << uncovered_interval.second << " vs trav size " << trav.size() << endl; + assert(uncovered_interval.first > 0 && uncovered_interval.second < trav.size()); + int64_t prev_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.first - 1]))); + int64_t next_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.second]))); + if (prev_frag_idx != next_frag_idx) { + // todo: figure out policy for these + cerr << "[rgfa] warning: skipping fragment that is connected to different fragmengs" << endl; + continue; + } + int64_t fragment_rank; + if (prev_frag_idx == -1) { + fragment_rank = 1; + } else { + fragment_rank = cover_fragments.at(prev_frag_idx).first + 1; + } + + // update the cover + vector interval; + interval.reserve(uncovered_interval.second - uncovered_interval.first); + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { + interval.push_back(trav[i]); + cover_node_to_fragment[graph->get_id(graph->get_handle_of_step(trav[i]))] = 
cover_fragments.size(); + } + cover_fragments.push_back(make_pair(fragment_rank, std::move(interval))); + } +} + +tuple rgfa_traversal_stats(const PathHandleGraph* graph, + const vector& trav, + const pair& trav_fragment) { + path_handle_t path_handle = graph->get_path_handle_of_step(trav.front()); + int64_t support = 0; + int64_t switches = 0; + int64_t dupes = 0; + handle_t prev_handle; + + for (int64_t i = trav_fragment.first; i < trav_fragment.second; ++i) { + const step_handle_t& step = trav[i]; + handle_t handle = graph->get_handle_of_step(step); + vector all_steps = graph->steps_of_handle(handle); + int64_t length = graph->get_length(handle); + support += length; + int64_t self_count = 0; + for (const step_handle_t& other_step : all_steps) { + path_handle_t step_path_handle = graph->get_path_handle_of_step(other_step); + if (step_path_handle == path_handle) { + ++self_count; + } else { + support += length; + } + } + if (self_count > 1) { + dupes += length * (self_count - 1); + } + if (i > 0 && graph->get_is_reverse(handle) != graph->get_is_reverse(prev_handle)) { + ++switches; + } + prev_handle = handle; } + + return std::make_tuple(support, switches, dupes); +} + +bool rgfa_traversal_stats_less(const tuple& s1, const tuple& s2) { + // duplicates are a deal breaker, if one traversal has no duplicates and the other does, the former wins + if (get<2>(s1) > 0 && get<2>(s2) == 0) { + return true; + } else if (get<2>(s1) == 0 && get<2>(s2) > 0) { + return false; + } + + // then support + if (get<0>(s1) < get<0>(s2)) { + return true; + } else if (get<0>(s1) > get<0>(s2)) { + return false; + } + + // then duplicates (by value) + if (get<2>(s1) > get<2>(s2)) { + return true; + } else if (get<2>(s1) < get<2>(s2)) { + return false; + } + + // then switches + return get<1>(s1) > get<1>(s2); } diff --git a/src/gfa.hpp b/src/gfa.hpp index ebfaf3df312..56e941fde1b 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -62,7 +62,17 @@ void rgfa_snarl_cover(const PathHandleGraph* 
graph, unordered_map& cover_node_to_fragment, const unordered_map>>& preferred_intervals); - +/// Get some statistics from a traversal fragment that we use for ranking in the greedy algorithm +/// 1. Coverage : total step length across the traversal for all paths +/// 2. Switches : the number of nodes that would need to be flipped to forwardize the traversal +/// 3. Duplicated bases : the number of duplicated bases in the traversal path +tuple rgfa_traversal_stats(const PathHandleGraph* graph, + const vector& trav, + const pair& trav_fragment); + +/// Comparison of the above stats for the purposes of greedily selecting (highest better) traversals +bool rgfa_traversal_stats_less(const tuple& s1, const tuple& s2); + /// Extract rGFA tags from minigraph GFA in order to pass in as hints above unordered_map>> extract_rgfa_intervals(const string& rgfa_path); From b8914df135bf5fc8db88520880cf921a40a6a735 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 21 Mar 2023 14:40:45 -0400 Subject: [PATCH 04/12] rgfa forwardization --- src/gfa.cpp | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++++ src/gfa.hpp | 4 +++ 2 files changed, 79 insertions(+) diff --git a/src/gfa.cpp b/src/gfa.cpp index 064a58a9e28..169e49b3fd9 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -421,6 +421,9 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, } } } + + // forwardize the graph + rgfa_forwardize_paths(graph, reference_paths); } void rgfa_snarl_cover(const PathHandleGraph* graph, @@ -718,6 +721,78 @@ bool rgfa_traversal_stats_less(const tuple& s1, const return get<1>(s1) > get<1>(s2); } +// copied pretty much verbatem from +// https://github.com/ComparativeGenomicsToolkit/hal2vg/blob/v1.1.2/clip-vg.cpp#L809-L880 +void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph, + const unordered_set& reference_paths) { + + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + if 
(reference_paths.count(path_handle) || get_rgfa_rank(path_name) >= 0) { + size_t fw_count = 0; + size_t total_steps = 0; + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = graph->get_handle_of_step(step_handle); + if (graph->get_is_reverse(handle)) { + handle_t flipped_handle = graph->create_handle(graph->get_sequence(handle)); + graph->follow_edges(handle, true, [&](handle_t prev_handle) { + if (graph->get_id(prev_handle) != graph->get_id(handle)) { + graph->create_edge(prev_handle, flipped_handle); + } + }); + graph->follow_edges(handle, false, [&](handle_t next_handle) { + if (graph->get_id(handle) != graph->get_id(next_handle)) { + graph->create_edge(flipped_handle, next_handle); + } + }); + // self-loop cases we punted on above: + if (graph->has_edge(handle, handle)) { + graph->create_edge(flipped_handle, flipped_handle); + } + if (graph->has_edge(handle, graph->flip(handle))) { + graph->create_edge(flipped_handle, graph->flip(flipped_handle)); + } + if (graph->has_edge(graph->flip(handle), handle)) { + graph->create_edge(graph->flip(flipped_handle), flipped_handle); + } + vector steps = graph->steps_of_handle(handle); + size_t ref_count = 0; + for (step_handle_t step : steps) { + if (graph->get_path_handle_of_step(step) == path_handle) { + ++ref_count; + } + step_handle_t next_step = graph->get_next_step(step); + handle_t new_handle = graph->get_is_reverse(graph->get_handle_of_step(step)) ? 
flipped_handle : + graph->flip(flipped_handle); + graph->rewrite_segment(step, next_step, {new_handle}); + } + if (ref_count > 1) { + cerr << "[rGFA] error: Cycle detected in rGFA path " << path_name << " at node " << graph->get_id(handle) << endl; + exit(1); + } + ++fw_count; + assert(graph->steps_of_handle(handle).empty()); + dynamic_cast(graph)->destroy_handle(handle); + } + ++total_steps; + }); + } + }); + + // do a check just to be sure + graph->for_each_path_handle([&](path_handle_t path_handle) { + string path_name = graph->get_path_name(path_handle); + if (reference_paths.count(path_handle) || get_rgfa_rank(path_name) >= 0) { + graph->for_each_step_in_path(path_handle, [&](step_handle_t step_handle) { + handle_t handle = graph->get_handle_of_step(step_handle); + if (graph->get_is_reverse(handle)) { + cerr << "[rGFA] error: Failed to fowardize node " << graph->get_id(handle) << " in path " << path_name << endl; + exit(1); + } + }); + } + }); +} } diff --git a/src/gfa.hpp b/src/gfa.hpp index 56e941fde1b..d5650a1985c 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -72,6 +72,10 @@ tuple rgfa_traversal_stats(const PathHandleGraph* gra /// Comparison of the above stats for the purposes of greedily selecting (highest better) traversals bool rgfa_traversal_stats_less(const tuple& s1, const tuple& s2); + +/// Make sure all rgfa paths are forwardized +void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph, + const unordered_set& reference_paths); /// Extract rGFA tags from minigraph GFA in order to pass in as hints above unordered_map>> extract_rgfa_intervals(const string& rgfa_path); From 2b32425f43043786d19bd3395095f50b4bf5c6b6 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 21 Mar 2023 17:10:51 -0400 Subject: [PATCH 05/12] start rgfa tests --- src/gfa.cpp | 13 +++++++++++-- test/rgfa/rgfa_ins.gfa | 15 +++++++++++++++ test/rgfa/rgfa_ins2.gfa | 18 ++++++++++++++++++ test/rgfa/rgfa_tiny.gfa | 38 ++++++++++++++++++++++++++++++++++++++ 
test/t/11_vg_paths.t | 35 ++++++++++++++++++++++++++++++++++- 5 files changed, 116 insertions(+), 3 deletions(-) create mode 100644 test/rgfa/rgfa_ins.gfa create mode 100644 test/rgfa/rgfa_ins2.gfa create mode 100644 test/rgfa/rgfa_tiny.gfa diff --git a/src/gfa.cpp b/src/gfa.cpp index 169e49b3fd9..536062a264b 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -566,6 +566,7 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, if (interval_length >= minimum_length) { auto trav_stats = rgfa_traversal_stats(graph, trav, uncovered_interval); ranked_trav_fragments.push_back(make_pair(trav_stats, make_pair(trav_idx, uncovered_interval))); + cerr << "pushing trav " << trav_idx << " with stats " << get<0>(trav_stats) <<"," << get<1>(trav_stats) << "," << get<2>(trav_stats) << endl; } } } @@ -575,19 +576,27 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, const pair, pair>>& s2)> heap_comp = [](const pair, pair>>& s1, const pair, pair>>& s2) { - return s1.first < s2.first; + cerr << "COMP" << s1.first << " VS " << s2.first << " is " << rgfa_traversal_stats_less(s1.first, s2.first) << endl; + return rgfa_traversal_stats_less(s1.first, s2.first); }; // put the fragments into a max heap std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); + for (auto xxx : ranked_trav_fragments) { + cerr << " heep " << xxx.first <<"->trav " << xxx.second.first << endl; + cerr << " front is " << ranked_trav_fragments.front().first << "->trav " << ranked_trav_fragments.front().second.first << endl; + } + // now greedily pull out traversal intervals from the ranked list until none are left while (!ranked_trav_fragments.empty()) { // get the best scoring (max) fragment from heap - auto& best_stats_fragment = ranked_trav_fragments.front(); + auto best_stats_fragment = ranked_trav_fragments.front(); std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); ranked_trav_fragments.pop_back(); + cerr << "popping trav " << 
best_stats_fragment.second.first << " with stats " + << get<0>(best_stats_fragment.first) << "," << get<1>(best_stats_fragment.first) << "," << get<2>(best_stats_fragment.first) << endl; const vector& trav = travs.at(best_stats_fragment.second.first); const pair& uncovered_interval = best_stats_fragment.second.second; diff --git a/test/rgfa/rgfa_ins.gfa b/test/rgfa/rgfa_ins.gfa new file mode 100644 index 00000000000..9751b52ba47 --- /dev/null +++ b/test/rgfa/rgfa_ins.gfa @@ -0,0 +1,15 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +P x 1+,5+ * +P y 1+,2+,4+,5+ * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M diff --git a/test/rgfa/rgfa_ins2.gfa b/test/rgfa/rgfa_ins2.gfa new file mode 100644 index 00000000000..ef6f1f3055c --- /dev/null +++ b/test/rgfa/rgfa_ins2.gfa @@ -0,0 +1,18 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +S 6 TTTTTTTTTT +P x 1+,5+ * +P y 1+,2+,6+,4+,5+ * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M +L 2 + 6 + 0M +L 6 + 4 + 0M diff --git a/test/rgfa/rgfa_tiny.gfa b/test/rgfa/rgfa_tiny.gfa new file mode 100644 index 00000000000..498d2e0a6dc --- /dev/null +++ b/test/rgfa/rgfa_tiny.gfa @@ -0,0 +1,38 @@ +H VN:Z:1.0 +S 5 C +S 7 A +S 12 ATAT +S 8 G +S 1 CAAATAAG +S 4 T +S 6 TTG +S 15 CCAACTCTCTG +S 2 A +S 10 A +S 9 AAATTTTCTGGAGTTCTAT +S 11 T +S 13 A +S 14 T +S 3 G +P x 1+,3+,5+,6+,8+,9+,11+,12+,14+,15+ * +P y 1+,2+,4+,6+,8+,9+,10+,12+,13+,15+ * +L 5 + 6 + 0M +L 7 + 9 + 0M +L 12 + 13 + 0M +L 12 + 14 + 0M +L 8 + 9 + 0M +L 1 + 2 + 0M +L 1 + 3 + 0M +L 4 + 6 + 0M +L 6 + 7 + 0M +L 6 + 8 + 0M +L 2 + 4 + 0M +L 2 + 5 + 0M +L 10 + 12 + 0M +L 9 + 10 + 0M +L 9 + 11 + 0M +L 11 + 12 + 0M +L 13 + 15 + 0M +L 14 + 15 + 0M +L 3 + 4 + 0M +L 3 + 5 + 0M diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 58591697ef5..04108df0fc1 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -7,7 +7,7 @@ 
PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 18 +plan tests 22 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg @@ -59,4 +59,37 @@ is $? 0 "vg path coverage reports correct lengths in first column" rm -f q.vg q.cov.len q.len +vg paths -v rgfa/rgfa_tiny.gfa -R 1 -Q x | vg view - > rgfa_tiny.rgfa +printf "P y[33-34]:SR:i:1 10+ * +P y[38-39]:SR:i:1 13+ * +P y[8-10]:SR:i:1 2+,4+ *\n" > rgfa_tiny_expected_fragments.rgfa +grep ^P rgfa_tiny.rgfa | grep SR | sort > rgfa_tiny_fragments.rgfa +diff rgfa_tiny_fragments.rgfa rgfa_tiny_expected_fragments.rgfa +is $? 0 "Found the expected rgfa SNP cover of tiny graph" +rm -f rgfa_tiny.rgfa rgfa_tiny_expected_fragments.rgfa rgfa_tiny_fragments.rgfa + +vg paths -v rgfa/rgfa_ins.gfa -R 5 -Q x | vg view - > rgfa_ins.rgfa +printf "P z[8-17]:SR:i:1 2+,3+,4+ *\n" > rgfa_ins_expected_fragments.rgfa +grep ^P rgfa_ins.rgfa | grep SR | sort > rgfa_ins_fragments.rgfa +diff rgfa_ins_fragments.rgfa rgfa_ins_expected_fragments.rgfa +is $? 0 "Found the expected rgfa cover for simple nested insertion" + +rm -f rgfa_ins.rgfa rgfa_ins_expected_fragments.rgfa rgfa_ins_fragments.rgfa + +vg paths -v rgfa/rgfa_ins2.gfa -R 3 -Q x | vg view - > rgfa_ins2.rgfa +printf "P y[8-24]:SR:i:1 2+,6+,4+ * +P z[11-14]:SR:i:2 3+ *\n" > rgfa_ins2_expected_fragments.rgfa +grep ^P rgfa_ins2.rgfa | grep SR | sort > rgfa_ins2_fragments.rgfa +diff rgfa_ins2_fragments.rgfa rgfa_ins2_expected_fragments.rgfa +is $? 
0 "Found the expected rgfa cover for simple nested insertion that requires two fragments" + +rm -f rgfa_ins2.rgfa rgfa_ins2_expected_fragments.rgfa rgfa_ins2_fragments.rgfa + +vg paths -v rgfa/rgfa_ins2.gfa -R 5 -Q x | vg view - > rgfa_ins2R5.rgfa +printf "P y[8-24]:SR:i:1 2+,6+,4+ *\n" > rgfa_ins2R5_expected_fragments.rgfa +grep ^P rgfa_ins2R5.rgfa | grep SR | sort > rgfa_ins2R5_fragments.rgfa +diff rgfa_ins2R5_fragments.rgfa rgfa_ins2R5_expected_fragments.rgfa +is $? 0 "rgfa Minimum fragment length filters out small fragment" + +rm -f rgfa_ins2R5.rgfa rgfa_ins2R5_expected_fragments.rgfa rgfa_ins2R5_fragments.rgfa From 17e04e01d82675f2ed8182c60e06464955a553d1 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 21 Mar 2023 20:38:13 -0400 Subject: [PATCH 06/12] ../src/gfa.cpp rgfa strand swtiching test --- test/t/11_vg_paths.t | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/test/t/11_vg_paths.t b/test/t/11_vg_paths.t index 04108df0fc1..cd28a8fefa5 100644 --- a/test/t/11_vg_paths.t +++ b/test/t/11_vg_paths.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 22 +plan tests 23 vg construct -r small/x.fa -v small/x.vcf.gz -a > x.vg vg construct -r small/x.fa -v small/x.vcf.gz > x2.vg @@ -93,3 +93,16 @@ diff rgfa_ins2R5_fragments.rgfa rgfa_ins2R5_expected_fragments.rgfa is $? 0 "rgfa Minimum fragment length filters out small fragment" rm -f rgfa_ins2R5.rgfa rgfa_ins2R5_expected_fragments.rgfa rgfa_ins2R5_fragments.rgfa + +vg paths -v rgfa/rgfa_ins3.gfa -R 3 -Q x | vg view - > rgfa_ins3.rgfa +printf "P x 1+,5+ * +P y[3-19]:SR:i:1 4+,6+,2+ * +P y 5-,4+,6+,2+,1- * +P z[11-14]:SR:i:2 3+ * +P z 1+,2-,3+,4-,5+ *\n" | sort > rgfa_ins3_expected_fragments.rgfa +grep ^P rgfa_ins3.rgfa | sort > rgfa_ins3_fragments.rgfa +diff rgfa_ins3_fragments.rgfa rgfa_ins3_expected_fragments.rgfa +is $? 
0 "Found the expected rgfa cover for simple nested insertion that has a reversed path that needs forwardization" + +rm -f rgfa_ins3.rgfa rgfa_ins3_expected_fragments.rgfa rgfa_ins3_fragments.rgfa + From 03d156ea38ee2df9d68e71aee3aee2893640f315 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 21 Mar 2023 20:52:23 -0400 Subject: [PATCH 07/12] fix strand bug --- src/gfa.cpp | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 536062a264b..8efbff83338 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -361,7 +361,9 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, subrange_t path_subrange; PathMetadata::parse_path_name(graph->get_path_name(path_handle), path_sense, path_sample, path_locus, path_haplotype, path_phase_block, path_subrange); - PathSense out_path_sense = path_sense == PathSense::HAPLOTYPE ? PathSense::GENERIC : path_sense; + // todo: we need a policy for this + //PathSense out_path_sense = path_sense == PathSense::HAPLOTYPE ? 
PathSense::GENERIC : path_sense; + PathSense out_path_sense = path_sense; const vector& fragments = path_fragments.second; @@ -472,6 +474,9 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, done = true; } } + if (reversed) { + std::reverse(trav.begin(), trav.end()); + } travs.push_back(trav); } } @@ -673,9 +678,8 @@ tuple rgfa_traversal_stats(const PathHandleGraph* gra const pair& trav_fragment) { path_handle_t path_handle = graph->get_path_handle_of_step(trav.front()); int64_t support = 0; - int64_t switches = 0; + int64_t reversed_steps = 0; int64_t dupes = 0; - handle_t prev_handle; for (int64_t i = trav_fragment.first; i < trav_fragment.second; ++i) { const step_handle_t& step = trav[i]; @@ -695,13 +699,12 @@ tuple rgfa_traversal_stats(const PathHandleGraph* gra if (self_count > 1) { dupes += length * (self_count - 1); } - if (i > 0 && graph->get_is_reverse(handle) != graph->get_is_reverse(prev_handle)) { - ++switches; + if (i > 0 && graph->get_is_reverse(handle)) { + ++reversed_steps; } - prev_handle = handle; } - return std::make_tuple(support, switches, dupes); + return std::make_tuple(support, reversed_steps, dupes); } bool rgfa_traversal_stats_less(const tuple& s1, const tuple& s2) { @@ -734,7 +737,8 @@ bool rgfa_traversal_stats_less(const tuple& s1, const // https://github.com/ComparativeGenomicsToolkit/hal2vg/blob/v1.1.2/clip-vg.cpp#L809-L880 void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph, const unordered_set& reference_paths) { - + + unordered_map id_map; graph->for_each_path_handle([&](path_handle_t path_handle) { string path_name = graph->get_path_name(path_handle); if (reference_paths.count(path_handle) || get_rgfa_rank(path_name) >= 0) { @@ -744,6 +748,7 @@ void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph, handle_t handle = graph->get_handle_of_step(step_handle); if (graph->get_is_reverse(handle)) { handle_t flipped_handle = graph->create_handle(graph->get_sequence(handle)); + 
id_map[graph->get_id(flipped_handle)] = graph->get_id(handle); graph->follow_edges(handle, true, [&](handle_t prev_handle) { if (graph->get_id(prev_handle) != graph->get_id(handle)) { graph->create_edge(prev_handle, flipped_handle); @@ -788,6 +793,11 @@ void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph, } }); + // rename all the ids back to what they were (so nodes keep their ids, just get flipped around) + graph->reassign_node_ids([&id_map](nid_t new_id) { + return id_map.count(new_id) ? id_map[new_id] : new_id; + }); + // do a check just to be sure graph->for_each_path_handle([&](path_handle_t path_handle) { string path_name = graph->get_path_name(path_handle); From adbcdb0723d6dbb86109d23a3ca70756a497c8b7 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 21 Mar 2023 20:53:13 -0400 Subject: [PATCH 08/12] use better serilization call in paths (that supports gfa in/out) --- src/subcommand/paths_main.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 6ffc2515aea..9468b9c764c 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -18,6 +18,7 @@ #include "../gbwt_helper.hpp" #include "../integrated_snarl_finder.hpp" #include "../gfa.hpp" +#include "../io/save_handle_graph.hpp" #include #include #include @@ -618,7 +619,7 @@ int main_paths(int argc, char** argv) { } // output the graph - serializable_graph->serialize(cout); + vg::io::save_handle_graph(graph.get(), std::cout); } else if (coverage) { // for every node, count the number of unique paths. 
then add the coverage count to each one From 0776c018fd95f9461af9d7331d6703bc95b671e7 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Wed, 22 Mar 2023 09:55:53 -0400 Subject: [PATCH 09/12] add missing test file --- test/rgfa/rgfa_ins3.gfa | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 test/rgfa/rgfa_ins3.gfa diff --git a/test/rgfa/rgfa_ins3.gfa b/test/rgfa/rgfa_ins3.gfa new file mode 100644 index 00000000000..861ac58d3b5 --- /dev/null +++ b/test/rgfa/rgfa_ins3.gfa @@ -0,0 +1,18 @@ +H VN:Z:1.0 +S 1 CAAATAAG +S 2 TTT +S 3 TTT +S 4 TTT +S 5 TTT +S 6 TTTTTTTTTT +P x 1+,5+ * +P y 5-,4-,6-,2-,1- * +P z 1+,2+,3+,4+,5+ * +L 1 + 2 + 0M +L 2 + 3 + 0M +L 2 + 4 + 0M +L 3 + 4 + 0M +L 4 + 5 + 0M +L 1 + 5 + 0M +L 2 + 6 + 0M +L 6 + 4 + 0M From fa9367485072a4f0f9f96b2163b7a2423567ef9b Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 23 Mar 2023 16:10:19 -0400 Subject: [PATCH 10/12] some ui tweaks for rgfa --- src/gfa.cpp | 102 +++++++++++++++++++++----------- src/gfa.hpp | 3 + src/subcommand/convert_main.cpp | 8 ++- src/subcommand/paths_main.cpp | 4 +- 4 files changed, 82 insertions(+), 35 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 8efbff83338..8e40d444538 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -6,7 +6,7 @@ #include -#define debug +//#define debug namespace vg { @@ -43,8 +43,8 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& } out << "\n"; - //Compute the rGFA tags of given paths (todo: support non-zero ranks) - unordered_map> node_offsets; + //Compute the rGFA tags of given paths + unordered_map> node_offsets_and_ranks; for (const string& path_name : rgfa_paths) { path_handle_t path_handle = graph->get_path_handle(path_name); size_t offset = 0; @@ -57,11 +57,12 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& << node_id << " is traversed on its reverse strand. rGFA only supports the forward strand." 
<< endl; throw runtime_error(ss.str()); } - if (node_offsets.count(node_id)) { + if (node_offsets_and_ranks.count(node_id)) { cerr << "warning [gfa]: multiple selected rgfa paths found on node " << node_id << ": keeping tags for " - << graph->get_path_name(node_offsets[node_id].first) << " and ignoring those for " << path_name << endl; + << graph->get_path_name(get<0>(node_offsets_and_ranks[node_id])) << " and ignoring those for " << path_name << endl; } else { - node_offsets[node_id] = make_pair(path_handle, offset); + int64_t rgfa_rank = get_rgfa_rank(path_name); + node_offsets_and_ranks[node_id] = make_tuple(path_handle, offset, rgfa_rank <= 0 ? 0 : rgfa_rank); } offset += graph->get_length(handle); }); @@ -73,12 +74,23 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& nid_t node_id = graph->get_id(h); out << node_id << "\t"; out << graph->get_sequence(h); - auto it = node_offsets.find(node_id); - if (it != node_offsets.end()) { - // add rGFA tags - out << "\t" << "SN:Z:" << graph->get_path_name(it->second.first) - << "\t" << "SO:i:" << it->second.second - << "\t" << "SR:i:0"; // todo: support non-zero ranks? + auto it = node_offsets_and_ranks.find(node_id); + if (it != node_offsets_and_ranks.end()) { + // hack off the rgfa tag + string base_name = strip_rgfa_rank(graph->get_path_name(get<0>(it->second))); + // hack off the subrange offset (and add it to SO) + PathSense sense; + string sample, locus; + size_t haplotype, phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(base_name, sense, sample, locus, haplotype, phase_block, subrange); + int64_t base_offset = subrange == PathMetadata::NO_SUBRANGE ? 
0 : subrange.first; + base_name = PathMetadata::create_path_name(sense, sample, locus, haplotype, phase_block, + PathMetadata::NO_SUBRANGE); + // add rGFA tags + out << "\t" << "SN:Z:" << base_name + << "\t" << "SO:i:" << (base_offset + get<1>(it->second)) + << "\t" << "SR:i:" << get<2>(it->second); } out << "\n"; // Writing `std::endl` would flush the buffer. return true; @@ -106,9 +118,19 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& vector w_line_paths; + bool warned_about_tags_as_paths = false; // Paths as P-lines for (const path_handle_t& h : path_handles) { auto path_name = graph->get_path_name(h); + if (get_rgfa_rank(path_name) > 0) { + if (!rgfa_paths.empty()) { + // the path was put into tags, no reason to deal with it here + continue; + } else if (!warned_about_tags_as_paths) { + cerr << "warning [gfa]: outputing rGFA cover (rank>=1) path(s) as a P-line(s) and not tags because no reference (rank==0) selected" << endl; + warned_about_tags_as_paths = true; + } + } if (rgfa_pline || !rgfa_paths.count(path_name)) { if (graph->get_sense(h) != PathSense::REFERENCE && reference_samples.count(graph->get_sample_name(h))) { // We have a mix of reference and non-reference paths on the same sample which GFA can't handle. 
@@ -144,6 +166,16 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& { unordered_map, size_t> last_phase_block_end; for (const path_handle_t& h : w_line_paths) { + string path_name = graph->get_path_name(h); + if (get_rgfa_rank(path_name) > 0) { + if (!rgfa_paths.empty()) { + // the path was put into tags, no reason to deal with it here + continue; + } else if (!warned_about_tags_as_paths) { + cerr << "warning [gfa]: outputing rGFA cover (rank>=1) path(s) as a W-line(s) and not tags because no reference (rank==0) selected" << endl; + warned_about_tags_as_paths = true; + } + } write_w_line(graph, out, h, last_phase_block_end); } } @@ -264,6 +296,13 @@ int get_rgfa_rank(const string& path_name) { } string set_rgfa_rank(const string& path_name, int rgfa_rank) { + string new_name = strip_rgfa_rank(path_name); + // now append the rank + new_name += ":SR:i:" + std::to_string(rgfa_rank); + return new_name; +} + +string strip_rgfa_rank(const string& path_name) { string new_name; // check if we have an existing rank. if so, we srap it. 
size_t pos = path_name.rfind(":SR:i:"); @@ -276,9 +315,6 @@ string set_rgfa_rank(const string& path_name, int rgfa_rank) { } else { new_name = path_name; } - - // now append the rank - new_name += ":SR:i:" + std::to_string(rgfa_rank); return new_name; } @@ -288,6 +324,19 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, int64_t minimum_length, const unordered_map>>& preferred_intervals){ + // for sanity's sake, we don't want to ever support multiple rgfa covers, so start by + // deleting all existing rgfa fragments (except for rank 0 reference paths, of course) + vector existing_cover; + graph->for_each_path_handle([&](path_handle_t path_handle) { + if (get_rgfa_rank(graph->get_path_name(path_handle)) > 0) { + existing_cover.push_back(path_handle); + assert(!reference_paths.count(path_handle)); + } + }); + for (path_handle_t path_handle : existing_cover) { + graph->destroy_path(path_handle); + } + // we use the path traversal finder for everything // (even gbwt haplotypes, as we're using the path handle interface) PathTraversalFinder path_trav_finder(*graph, *snarl_manager); @@ -571,7 +620,6 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, if (interval_length >= minimum_length) { auto trav_stats = rgfa_traversal_stats(graph, trav, uncovered_interval); ranked_trav_fragments.push_back(make_pair(trav_stats, make_pair(trav_idx, uncovered_interval))); - cerr << "pushing trav " << trav_idx << " with stats " << get<0>(trav_stats) <<"," << get<1>(trav_stats) << "," << get<2>(trav_stats) << endl; } } } @@ -581,18 +629,12 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, const pair, pair>>& s2)> heap_comp = [](const pair, pair>>& s1, const pair, pair>>& s2) { - cerr << "COMP" << s1.first << " VS " << s2.first << " is " << rgfa_traversal_stats_less(s1.first, s2.first) << endl; return rgfa_traversal_stats_less(s1.first, s2.first); }; // put the fragments into a max heap std::make_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), 
heap_comp); - for (auto xxx : ranked_trav_fragments) { - cerr << " heep " << xxx.first <<"->trav " << xxx.second.first << endl; - cerr << " front is " << ranked_trav_fragments.front().first << "->trav " << ranked_trav_fragments.front().second.first << endl; - } - // now greedily pull out traversal intervals from the ranked list until none are left while (!ranked_trav_fragments.empty()) { @@ -600,8 +642,6 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, auto best_stats_fragment = ranked_trav_fragments.front(); std::pop_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); ranked_trav_fragments.pop_back(); - cerr << "popping trav " << best_stats_fragment.second.first << " with stats " - << get<0>(best_stats_fragment.first) << "," << get<1>(best_stats_fragment.first) << "," << get<2>(best_stats_fragment.first) << endl; const vector& trav = travs.at(best_stats_fragment.second.first); const pair& uncovered_interval = best_stats_fragment.second.second; @@ -636,7 +676,6 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, auto chopped_stats = rgfa_traversal_stats(graph, trav, chopped_interval); ranked_trav_fragments.push_back(make_pair(chopped_stats, make_pair(best_stats_fragment.second.first, chopped_interval))); std::push_heap(ranked_trav_fragments.begin(), ranked_trav_fragments.end(), heap_comp); - cerr << "pushing interval " << endl; } } continue; @@ -646,20 +685,17 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, // since we've already covered the reference, then we know that this interval // doesn't span the whole snarl including endpoints, so we can always afford // to look back and ahead one - cerr << "uncovered interval " << uncovered_interval.first << ", " << uncovered_interval.second << " vs trav size " << trav.size() << endl; assert(uncovered_interval.first > 0 && uncovered_interval.second < trav.size()); int64_t prev_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.first 
- 1]))); int64_t next_frag_idx = cover_node_to_fragment.at(graph->get_id(graph->get_handle_of_step(trav[uncovered_interval.second]))); - if (prev_frag_idx != next_frag_idx) { - // todo: figure out policy for these - cerr << "[rgfa] warning: skipping fragment that is connected to different fragmengs" << endl; - continue; - } + // todo: i'm not sure if/how minigraph treats these cases, where the anchors connect to different ranks + // also, can these be avoided entirely? + int64_t min_frag_idx = std::min(prev_frag_idx, next_frag_idx); int64_t fragment_rank; - if (prev_frag_idx == -1) { + if (min_frag_idx == -1) { fragment_rank = 1; } else { - fragment_rank = cover_fragments.at(prev_frag_idx).first + 1; + fragment_rank = cover_fragments.at(min_frag_idx).first + 1; } // update the cover diff --git a/src/gfa.hpp b/src/gfa.hpp index d5650a1985c..9c57f8c2ff5 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -41,6 +41,9 @@ int get_rgfa_rank(const string& path_name); /// Add the RGFA rank tag to a pathname string set_rgfa_rank(const string& path_name, int rgfa_rank); +/// Get the name without the rank +string strip_rgfa_rank(const string& path_name); + /// Compute the rGFA path cover /// graph: the graph /// snarl_manager: the snarls (todo: should use distance index) diff --git a/src/subcommand/convert_main.cpp b/src/subcommand/convert_main.cpp index 2f7fd1bc78e..518789c140a 100644 --- a/src/subcommand/convert_main.cpp +++ b/src/subcommand/convert_main.cpp @@ -432,7 +432,13 @@ int main_convert(int argc, char** argv) { continue; } } + // Tagged (rank>0) rgfa paths get written if a base path specified + if (get_rgfa_rank(path_name) > 0) { + assert(!rgfa_paths.count(path_name)); + rgfa_paths.insert(path_name); + } }); + } graph_to_gfa(graph_to_write, std::cout, rgfa_paths, rgfa_pline, wline); } else { @@ -468,7 +474,7 @@ void help_convert(char** argv) { << "gfa output options (use with -f):" << endl << " -P, --rgfa-path STR write given path as rGFA tags instead of lines (multiple 
allowed, only rank-0 supported)" << endl << " -Q, --rgfa-prefix STR write paths with given prefix as rGFA tags instead of lines (multiple allowed, only rank-0 supported)" << endl - << " -B, --rgfa-pline paths written as rGFA tags also written as lines" << endl + << " -B, --rgfa-pline paths written as rank 0 rGFA tags also written as lines" << endl << " -W, --no-wline write all paths as GFA P-lines instead of W-lines. Allows handling multiple phase blocks and subranges used together." << endl << " --gbwtgraph-algorithm Always use the GBWTGraph library GFA algorithm. Not compatible with other GBWT output options or non-GBWT graphs." << endl << " --vg-algorithm Always use the VG GFA algorithm. Works with all options and graph types, but can't preserve original GFA coordinates." << endl diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 9468b9c764c..73f75f4691d 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -612,7 +612,9 @@ int main_paths(int argc, char** argv) { assert(rgfa_min_len >= 0); unordered_set reference_paths; for_each_selected_path([&](const path_handle_t& path_handle) { - reference_paths.insert(path_handle); + if (get_rgfa_rank(graph->get_path_name(path_handle)) <= 0) { + reference_paths.insert(path_handle); + } }); rgfa_graph_cover(mutable_graph, snarl_manager.get(), reference_paths, rgfa_min_len); From 563a2eaca866b93e01d5ac6ee974b75dd3c59c82 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Fri, 24 Mar 2023 13:30:10 -0400 Subject: [PATCH 11/12] fix assertion fails that happen in full hprc graph --- src/gfa.cpp | 19 +++++++++++++++---- src/subcommand/paths_main.cpp | 11 ++++++++--- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index 8e40d444538..c537b1c00e2 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -539,7 +539,9 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, } } - if (ref_paths.empty() && 
!cover_node_to_fragment.count(snarl.start().node_id())) { + // note: checking both snarl endpoints here is actually necessary: if the reference path doesn't end in a tip, + // you can end up with a trivial snarl at its end which will crash on this test. + if (ref_paths.empty() && (!cover_node_to_fragment.count(snarl.start().node_id()) || !cover_node_to_fragment.count(snarl.end().node_id()))) { // we're not nested in a reference snarl, and we have no reference path // by the current logic, there's nothing to be done. cerr << "[rgfa] warning: No referene path through snarl " @@ -613,11 +615,20 @@ void rgfa_snarl_cover(const PathHandleGraph* graph, vector> uncovered_intervals = get_uncovered_intervals(trav); for (const auto& uncovered_interval : uncovered_intervals) { + unordered_set cycle_check; + bool cyclic = false; int64_t interval_length = 0; - for (int64_t i = uncovered_interval.first; i < uncovered_interval.second; ++i) { - interval_length += graph->get_length(graph->get_handle_of_step(trav[i])); + for (int64_t i = uncovered_interval.first; i < uncovered_interval.second && !cyclic; ++i) { + handle_t handle = graph->get_handle_of_step(trav[i]); + interval_length += graph->get_length(handle); + nid_t node_id = graph->get_id(handle); + if (cycle_check.count(node_id)) { + cyclic = true; + } else { + cycle_check.insert(node_id); + } } - if (interval_length >= minimum_length) { + if (!cyclic && interval_length >= minimum_length) { auto trav_stats = rgfa_traversal_stats(graph, trav, uncovered_interval); ranked_trav_fragments.push_back(make_pair(trav_stats, make_pair(trav_idx, uncovered_interval))); } diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 73f75f4691d..6d4d14f36e4 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -140,7 +140,8 @@ int main_paths(int argc, char** argv) { {"extract-vg", no_argument, 0, 'V'}, {"drop-paths", no_argument, 0, 'd'}, {"retain-paths", no_argument, 0, 'r'}, - {"rgfa-cover",
required_argument, 0, 'R'}, + {"rgfa-cover", required_argument, 0, 'R'}, + {"snarls", required_argument, 0, 's'}, {"extract-gam", no_argument, 0, 'X'}, {"extract-gaf", no_argument, 0, 'A'}, {"list", no_argument, 0, 'L'}, @@ -162,7 +163,7 @@ int main_paths(int argc, char** argv) { }; int option_index = 0; - c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:aGp:ct:", + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:s:aGp:ct:", long_options, &option_index); // Detect the end of the options. @@ -202,7 +203,11 @@ int main_paths(int argc, char** argv) { rgfa_min_len = parse(optarg); output_formats++; break; - + + case 's': + snarl_filename = optarg; + break; + case 'X': extract_as_gam = true; output_formats++; From 86df1470a7f82bf1690b950ec4e9696d09e0863a Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Tue, 28 Mar 2023 20:27:19 -0400 Subject: [PATCH 12/12] pivot to store rgfa info in locus --- src/gfa.cpp | 122 ++++++++++++++++++++++++++++++++++------------------ src/gfa.hpp | 25 ++++++++--- 2 files changed, 100 insertions(+), 47 deletions(-) diff --git a/src/gfa.cpp b/src/gfa.cpp index c537b1c00e2..3675f9497b5 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -77,7 +77,7 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, const set& auto it = node_offsets_and_ranks.find(node_id); if (it != node_offsets_and_ranks.end()) { // hack off the rgfa tag - string base_name = strip_rgfa_rank(graph->get_path_name(get<0>(it->second))); + string base_name = strip_rgfa_path_name(graph->get_path_name(get<0>(it->second))); // hack off the subrange offset (and add it to SO) PathSense sense; string sample, locus; @@ -282,42 +282,96 @@ void write_w_line(const PathHandleGraph* graph, ostream& out, path_handle_t path out << "\n"; } -int get_rgfa_rank(const string& path_name) { +int get_rgfa_rank(const string& path_name, const string& rgfa_sample) { int rank = -1; - size_t pos = path_name.rfind(":SR:i:"); - if (pos != string::npos && path_name.length() - pos >= 6) { 
+ + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + size_t pos = path_locus.rfind(":SR:i:"); + if (pos != string::npos && path_locus.length() - pos >= 6) { + assert(path_sample == rgfa_sample); pos += 6; - size_t pos2 = path_name.find(":", pos); + size_t pos2 = path_locus.find(":", pos); size_t len = pos2 == string::npos ? pos2 : pos2 - pos; - string rank_string = path_name.substr(pos, len); + string rank_string = path_locus.substr(pos, len); rank = parse(rank_string); } return rank; } -string set_rgfa_rank(const string& path_name, int rgfa_rank) { - string new_name = strip_rgfa_rank(path_name); - // now append the rank - new_name += ":SR:i:" + std::to_string(rgfa_rank); - return new_name; +string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& rgfa_subrange, + const string& rgfa_sample) { + + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + // we're going to replace the existing sample name; so move it out into locus behind SN:Z: + assert(path_locus != PathMetadata::NO_LOCUS_NAME); + string base_name; + if (path_sample != PathMetadata::NO_SAMPLE_NAME) { + base_name = path_sample + ":"; + } + base_name += "SN:Z:" + path_locus; + + // and we also load in our rGFA tag + base_name += ":SR:i:" + std::to_string(rgfa_rank); + + // and return the final path, with sample/locus/rgfa-rank embedded in locus + // (as it's a reference path, we also strip out the phase block) + return PathMetadata::create_path_name(PathSense::REFERENCE, rgfa_sample, base_name, path_haplotype, +
PathMetadata::NO_PHASE_BLOCK, rgfa_subrange); } -string strip_rgfa_rank(const string& path_name) { - string new_name; - // check if we have an existing rank. if so, we srap it. - size_t pos = path_name.rfind(":SR:i:"); - if (pos != string::npos && path_name.length() - pos >= 6) { - size_t pos2 = path_name.find(":", pos + 6); - new_name = path_name.substr(0, pos); - if (pos2 != string::npos) { - new_name += path_name.substr(pos2); +string strip_rgfa_path_name(const string& path_name, const string& rgfa_sample) { + + PathSense path_sense; + string path_sample; + string path_locus; + size_t path_haplotype; + size_t path_phase_block; + subrange_t path_subrange; + PathMetadata::parse_path_name(path_name, path_sense, path_sample, path_locus, + path_haplotype, path_phase_block, path_subrange); + + assert(path_locus != PathMetadata::NO_LOCUS_NAME); + + size_t sr_pos = path_locus.rfind(":SR:i:"); + if (sr_pos != string::npos && path_locus.length() - sr_pos >= 6) { + size_t sn_pos = path_locus.rfind("SN:Z:", sr_pos - 1); + assert(sn_pos != string::npos); + assert(path_sample == rgfa_sample); + + string orig_sample; + if (sn_pos > 0) { + orig_sample = path_locus.substr(0, sn_pos - 1); + } + string orig_locus = path_locus.substr(sn_pos + 5, sr_pos - sn_pos - 5); + + // todo: recover path sense / haploblock? 
+ if (orig_sample.empty()) { + path_sense = PathSense::GENERIC; + path_haplotype = PathMetadata::NO_HAPLOTYPE; } - } else { - new_name = path_name; + return PathMetadata::create_path_name(path_sense, orig_sample, orig_locus, + path_haplotype, path_phase_block, path_subrange); } - return new_name; + return path_name; } + void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, SnarlManager* snarl_manager, const unordered_set& reference_paths, @@ -402,17 +456,7 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, for (const auto& path_fragments : path_to_fragments) { const path_handle_t& path_handle = path_fragments.first; - PathSense path_sense; - string path_sample; - string path_locus; - size_t path_haplotype; - size_t path_phase_block; - subrange_t path_subrange; - PathMetadata::parse_path_name(graph->get_path_name(path_handle), path_sense, path_sample, path_locus, - path_haplotype, path_phase_block, path_subrange); - // todo: we need a policy for this - //PathSense out_path_sense = path_sense == PathSense::HAPLOTYPE ? PathSense::GENERIC : path_sense; - PathSense out_path_sense = path_sense; + string path_name = graph->get_path_name(path_handle); const vector& fragments = path_fragments.second; @@ -452,14 +496,10 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, for (const step_handle_t& step : rgfa_fragment) { rgfa_frag_length += graph->get_length(graph->get_handle_of_step(step)); } - subrange_t rgfa_frag_subrange; - rgfa_frag_subrange.first = rgfa_frag_pos + (path_subrange != PathMetadata::NO_SUBRANGE ? path_subrange.first : 0); + subrange_t rgfa_frag_subrange = graph->get_subrange(path_handle); + rgfa_frag_subrange.first = rgfa_frag_pos + (rgfa_frag_subrange != PathMetadata::NO_SUBRANGE ? 
rgfa_frag_subrange.first : 0); rgfa_frag_subrange.second = rgfa_frag_subrange.first + rgfa_frag_length; - - string rgfa_frag_name = PathMetadata::create_path_name(out_path_sense, path_sample, path_locus, path_haplotype, - path_phase_block, rgfa_frag_subrange); - - rgfa_frag_name = set_rgfa_rank(rgfa_frag_name, rgfa_rank); + string rgfa_frag_name = create_rgfa_path_name(path_name, rgfa_rank, rgfa_frag_subrange); #ifdef debug #pragma omp critical(cerr) diff --git a/src/gfa.hpp b/src/gfa.hpp index 9c57f8c2ff5..ba59a292519 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -31,18 +31,31 @@ void graph_to_gfa(const PathHandleGraph* graph, ostream& out, /// Prototype code to tag paths as rGFA paths. Either needs to be completely scrapped /// or adapted into libhandlegraph at some point, ideally. -/// It works by adding :SR:i: to the end of the name + +/// It works by using a special sample name (default=_rGFA_) for rGFA contigs. +/// Any real sample name gets pushed into the locus field behind its rGFA tag SN:Z: +/// The rGFA rank also goes in the locus field behind SR:i: + +/// In GFA, these paths live in rGFA tags on S elements +/// In the graph, they are reference paths with SN/SR fields in their locus names. +/// As it stands, they will come out as W-lines in GFA with vg view or vg convert (without -Q/-P) + +/// Note that rank-0 rGFA fragments (aka normal reference paths) do *not* get the rGFA +/// sample, and are treated as normal reference paths all the way through (but can get rGFA tags) +/// when specified with -Q/-P in convert -f. /// Returns the RGFA rank (SR) of a path. This will be 0 for the reference /// backbone, and higher the further number of (nested) bubbles away it is. 
/// If the path is not an RGFA path, then return -1 -int get_rgfa_rank(const string& path_name); +int get_rgfa_rank(const string& path_name, const string& rgfa_sample="_rGFA_"); -/// Add the RGFA rank tag to a pathname -string set_rgfa_rank(const string& path_name, int rgfa_rank); +/// Add the rgfa rank to a pathname, also setting its sample to the special rgfa sample and +/// moving its old sample into the locus field +string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& subrange, + const string& rgfa_sample="_rGFA_"); -/// Get the name without the rank -string strip_rgfa_rank(const string& path_name); +/// Remove the rGFA information from a path name, effectively undoing set_rgfa_rank +string strip_rgfa_path_name(const string& path_name, const string& rgfa_sample="_rGFA_"); /// Compute the rGFA path cover /// graph: the graph