diff --git a/src/gfa.cpp b/src/gfa.cpp index 2848e202a6b..2ed88995bc1 100644 --- a/src/gfa.cpp +++ b/src/gfa.cpp @@ -375,7 +375,10 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, SnarlManager* snarl_manager, const unordered_set& reference_paths, int64_t minimum_length, +<<<<<<< HEAD const string& rgfa_sample_name, +======= +>>>>>>> origin/rgfa const unordered_map>>& preferred_intervals){ // for sanity's sake, we don't want to ever support multiple rgfa covers, so start by @@ -499,7 +502,11 @@ void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, subrange_t rgfa_frag_subrange = graph->get_subrange(path_handle); rgfa_frag_subrange.first = rgfa_frag_pos + (rgfa_frag_subrange != PathMetadata::NO_SUBRANGE ? rgfa_frag_subrange.first : 0); rgfa_frag_subrange.second = rgfa_frag_subrange.first + rgfa_frag_length; +<<<<<<< HEAD string rgfa_frag_name = create_rgfa_path_name(path_name, rgfa_rank, rgfa_frag_subrange, rgfa_sample_name); +======= + string rgfa_frag_name = create_rgfa_path_name(path_name, rgfa_rank, rgfa_frag_subrange); +>>>>>>> origin/rgfa #ifdef debug #pragma omp critical(cerr) diff --git a/src/gfa.hpp b/src/gfa.hpp index 3603224cf1a..3634da49009 100644 --- a/src/gfa.hpp +++ b/src/gfa.hpp @@ -99,6 +99,75 @@ void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph, unordered_map>> extract_rgfa_intervals(const string& rgfa_path); + +/// Prototype code to tag paths as rGFA paths. Either needs to be completely scrapped +/// or adapted into libhandlegraph at some point, ideally. + +/// It works by using a special sample name (default=_rGFA_) for rGFA contigs. +/// Any real sample name gets pushed into the locus field behind its rGFA tag SN:Z: +/// The rGFA rank also goes in the locus field behind SR:i: + +/// In GFA, these paths live in rGFA tags on S elements +/// In the graph, they are reference paths with SN/SR fields in their locus names. +/// As it stands, they will come out as W-lines in GFA with vg view or vg convert (without -Q/-P) + +/// Note that rank-0 rGFA fragments (aka normal reference paths) do *not* get the rGFA +/// sample, and are treated as normal reference paths all the way through (but can get rGFA tags) +/// when specified with -Q/-P in convert -f. + +/// Returns the RGFA rank (SR) of a path. This will be 0 for the reference +/// backbone, and higher the further number of (nested) bubbles away it is. +/// If the path is not an RGFA path, then return -1 +int get_rgfa_rank(const string& path_name, const string& rgfa_sample="_rGFA_"); + +/// Add the rgfa rank to a pathname, also setting its sample to the special rgfa sample and +/// moving its old sample into the locus field +string create_rgfa_path_name(const string& path_name, int rgfa_rank, const subrange_t& subrange, + const string& rgfa_sample="_rGFA_"); + +/// Remove the rGFA information from a path name, effectively undoing set_rgfa_rank +string strip_rgfa_path_name(const string& path_name, const string& rgfa_sample="_rGFA_"); + +/// Compute the rGFA path cover +/// graph: the graph +/// snarl_manager: the snarls (todo: should use distance index) +/// reference_paths: rank-0 paths +/// minimum_length: the minimum length of a path to create (alleles shorter than this can be uncovered) +/// preferred_intervals: set of ranges (ex from minigraph) to use as possible for rGFA paths +void rgfa_graph_cover(MutablePathMutableHandleGraph* graph, + SnarlManager* snarl_manager, + const unordered_set& reference_paths, + int64_t minimum_length, + const unordered_map>>& preferred_intervals = {}); + +void rgfa_snarl_cover(const PathHandleGraph* graph, + const Snarl& snarl, + PathTraversalFinder& path_trav_finder, + const unordered_set& reference_paths, + int64_t minimum_length, + vector>>& cover_fragments, + unordered_map& cover_node_to_fragment, + const unordered_map>>& preferred_intervals); + +/// Get some statistics from a traversal fragment that we use for ranking in the greedy algorithm +/// 1. Coverage : total step length across the traversal for all paths +/// 2. Switches : the number of nodes that would need to be flipped to forwardize the traversal +/// 3. Duplicated bases : the number of duplicated bases in the traversal path +tuple rgfa_traversal_stats(const PathHandleGraph* graph, + const vector& trav, + const pair& trav_fragment); + +/// Comparison of the above stats for the purposes of greedily selecting (highest better) traversals +bool rgfa_traversal_stats_less(const tuple& s1, const tuple& s2); + +/// Make sure all rgfa paths are forwardized +void rgfa_forwardize_paths(MutablePathMutableHandleGraph* graph, + const unordered_set& reference_paths); + +/// Extract rGFA tags from minigraph GFA in order to pass in as hints above +unordered_map>> extract_rgfa_intervals(const string& rgfa_path); + + } #endif diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 0e62b6de87a..b5c309f95dd 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -42,9 +42,14 @@ void help_paths(char** argv) { << " -r, --retain-paths output a graph with only the selected paths retained" << endl << " rGFA cover" << endl << " -R, --rgfa-min-length N add rGFA cover to graph, using seleciton from -Q/-S as rank-0 backbone, only adding fragments >= Nbp (default:-1=disabled)" << endl +<<<<<<< HEAD << " -N, --rgfa-sample STR give all rGFA cover fragments sample name (path prefix) STR (default: _rGFA_)." << endl << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." << endl << " -t, --threads N use up to N threads when computing rGFA cover (default: all available)" << endl +======= + << " -s, --snarls FILE snarls (from vg snarls) to avoid recomputing. snarls only used for rgfa cover (-R)." << endl + << " -t, --threads N use up to N threads when computing rGFA covoer (default: all available)" << endl +>>>>>>> origin/rgfa << " output path data:" << endl << " -X, --extract-gam print (as GAM alignments) the stored paths in the graph" << endl << " -A, --extract-gaf print (as GAF alignments) the stored paths in the graph" << endl @@ -108,7 +113,10 @@ int main_paths(int argc, char** argv) { bool drop_paths = false; bool retain_paths = false; int64_t rgfa_min_len = -1; +<<<<<<< HEAD string rgfa_sample_name = "_rGFA_"; +======= +>>>>>>> origin/rgfa string snarl_filename; string graph_file; string gbwt_file; @@ -143,7 +151,10 @@ int main_paths(int argc, char** argv) { {"drop-paths", no_argument, 0, 'd'}, {"retain-paths", no_argument, 0, 'r'}, {"rgfa-cover", required_argument, 0, 'R'}, +<<<<<<< HEAD {"rgfa-sample", required_argument, 0, 'N'}, +======= +>>>>>>> origin/rgfa {"snarls", required_argument, 0, 's'}, {"extract-gam", no_argument, 0, 'X'}, {"extract-gaf", no_argument, 0, 'A'}, @@ -166,7 +177,11 @@ int main_paths(int argc, char** argv) { }; int option_index = 0; +<<<<<<< HEAD c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:N:s:aGp:ct:", +======= + c = getopt_long (argc, argv, "hLXv:x:g:Q:VEMCFAS:Tq:drR:s:aGp:ct:", +>>>>>>> origin/rgfa long_options, &option_index); // Detect the end of the options. @@ -207,10 +222,13 @@ int main_paths(int argc, char** argv) { output_formats++; break; +<<<<<<< HEAD case 'N': rgfa_sample_name = optarg; break; +======= +>>>>>>> origin/rgfa case 's': snarl_filename = optarg; break;