diff --git a/deps/libbdsg b/deps/libbdsg index 5d12710fcd..6cff67667e 160000 --- a/deps/libbdsg +++ b/deps/libbdsg @@ -1 +1 @@ -Subproject commit 5d12710fcdff1090c16479210a1d25d244d6b129 +Subproject commit 6cff67667eefaf1a573cd3b64777d2d78ad2a572 diff --git a/src/snarl_distance_index.cpp b/src/snarl_distance_index.cpp index 5fc4dcd022..5ef5e1ab22 100644 --- a/src/snarl_distance_index.cpp +++ b/src/snarl_distance_index.cpp @@ -20,11 +20,12 @@ size_t maximum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, po get_id(pos2), get_is_rev(pos2), get_offset(pos2)); } -void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool silence_warnings) { +void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool only_top_level_chain_distances, bool silence_warnings) { distance_index->set_snarl_size_limit(size_limit); + distance_index->set_only_top_level_chain_distances(only_top_level_chain_distances); //Build the temporary distance index from the graph - SnarlDistanceIndex::TemporaryDistanceIndex temp_index = make_temporary_distance_index(graph, snarl_finder, size_limit); + SnarlDistanceIndex::TemporaryDistanceIndex temp_index = make_temporary_distance_index(graph, snarl_finder, size_limit, only_top_level_chain_distances); if (!silence_warnings && temp_index.use_oversized_snarls) { cerr << "warning: distance index uses oversized snarls, which may make mapping slow" << endl; @@ -37,7 +38,7 @@ void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGrap distance_index->get_snarl_tree_records(indexes, graph); } SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( - const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit) { + const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool only_top_level_chain_distances) { #ifdef debug_distance_indexing cerr << "Creating new distance index for nodes between " << graph->min_node_id() << " and " << graph->max_node_id() << endl; @@ -210,9 +211,12 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( temp_chain_record.end_node_id = node_id; temp_chain_record.end_node_rev = graph->get_is_reverse(chain_end_handle); temp_chain_record.end_node_length = graph->get_length(chain_end_handle); + + bool is_root_chain = false; if (stack.empty()) { //If this was the last thing on the stack, then this was a root + is_root_chain = true; //Check to see if there is anything connected to the ends of the chain vector reachable_nodes; @@ -279,7 +283,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( parent_snarl_record.children.emplace_back(chain_index); } - temp_index.max_index_size += temp_chain_record.get_max_record_length(); + temp_index.max_index_size += temp_chain_record.get_max_record_length(!only_top_level_chain_distances || is_root_chain ? true : false ); #ifdef debug_distance_indexing cerr << " Ending new " << (temp_chain_record.is_trivial ? "trivial " : "") << "chain " << temp_index.structure_start_end_as_string(chain_index) << endl << " that is a child of " << temp_index.structure_start_end_as_string(temp_chain_record.parent) << endl; @@ -484,7 +488,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( temp_index.temp_snarl_records.at(chain_child_index.second); //Fill in this snarl's distances - populate_snarl_index(temp_index, chain_child_index, size_limit, graph); + populate_snarl_index(temp_index, chain_child_index, size_limit, only_top_level_chain_distances, graph); bool new_component = temp_snarl_record.min_length == std::numeric_limits::max(); if (new_component){ @@ -733,7 +737,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( for (pair& component_index : temp_index.components) { if (component_index.first == SnarlDistanceIndex::TEMP_SNARL) { SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(component_index.second); - populate_snarl_index(temp_index, component_index, size_limit, graph); + populate_snarl_index(temp_index, component_index, size_limit, only_top_level_chain_distances, graph); temp_snarl_record.min_length = std::numeric_limits::max(); } } @@ -756,7 +760,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index( void populate_snarl_index( SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, pair snarl_index, size_t size_limit, - const HandleGraph* graph) { + bool only_top_level_chain_distances, const HandleGraph* graph) { #ifdef debug_distance_indexing cerr << "Getting the distances for snarl " << temp_index.structure_start_end_as_string(snarl_index) << endl; assert(snarl_index.first == SnarlDistanceIndex::TEMP_SNARL); @@ -1063,10 +1067,15 @@ void populate_snarl_index( */ - //Reserve enough space to store all possible distances - temp_snarl_record.distances.reserve( (temp_snarl_record.node_count > size_limit || size_limit == 0) - ? temp_snarl_record.node_count * 2 - : temp_snarl_record.node_count * temp_snarl_record.node_count); + if (size_limit != 0 && !only_top_level_chain_distances) { + //If we are saving distances + //Reserve enough space to store all possible distances + temp_snarl_record.distances.reserve( temp_snarl_record.node_count > size_limit + ? temp_snarl_record.node_count * 2 + : temp_snarl_record.node_count * temp_snarl_record.node_count); + } else { + temp_snarl_record.include_distances = false; + } if (size_limit != 0 && temp_snarl_record.node_count > size_limit) { temp_index.use_oversized_snarls = true; @@ -1137,7 +1146,7 @@ void populate_snarl_index( // assert(start_rank != 0 && start_rank != 1); //} - if ( (temp_snarl_record.node_count > size_limit || size_limit == 0) && (temp_snarl_record.is_root_snarl || (!start_is_tip && + if ( (temp_snarl_record.node_count > size_limit || size_limit == 0 || only_top_level_chain_distances) && (temp_snarl_record.is_root_snarl || (!start_is_tip && start_rank != 0 && start_rank != 1))) { //If we don't care about internal distances, and we also are not at a boundary or tip //TODO: Why do we care about tips specifically? diff --git a/src/snarl_distance_index.hpp b/src/snarl_distance_index.hpp index 13e8777301..33764d5eb8 100644 --- a/src/snarl_distance_index.hpp +++ b/src/snarl_distance_index.hpp @@ -22,13 +22,13 @@ size_t maximum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, po //Fill in the index //size_limit is a limit on the number of nodes in a snarl, after which the index won't store pairwise distances -void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit = 50000, bool silence_warnings=true); +void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit = 50000, bool only_top_level_chain_distances = false, bool silence_warnings=true); //Fill in the temporary snarl record with distances void populate_snarl_index(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index, - pair snarl_index, size_t size_limit, const HandleGraph* graph) ; + pair snarl_index, size_t size_limit, bool only_top_level_chain_distances, const HandleGraph* graph) ; -SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index(const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit); +SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index(const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool only_top_level_chain_distances); //Define wang_hash for net_handle_t's so that we can use a hash_map template<> struct wang_hash { diff --git a/src/subcommand/index_main.cpp b/src/subcommand/index_main.cpp index 731fdcff26..2bc331395a 100644 --- a/src/subcommand/index_main.cpp +++ b/src/subcommand/index_main.cpp @@ -38,28 +38,29 @@ void help_index(char** argv) { << "Creates an index on the specified graph or graphs. All graphs indexed must " << endl << "already be in a joint ID space." << endl << "general options:" << endl - << " -b, --temp-dir DIR use DIR for temporary files" << endl - << " -t, --threads N number of threads to use" << endl - << " -p, --progress show progress" << endl + << " -b, --temp-dir DIR use DIR for temporary files" << endl + << " -t, --threads N number of threads to use" << endl + << " -p, --progress show progress" << endl << "xg options:" << endl - << " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing" << endl - << " -L, --xg-alts include alt paths in xg" << endl + << " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing" << endl + << " -L, --xg-alts include alt paths in xg" << endl << "gcsa options:" << endl - << " -g, --gcsa-out FILE output a GCSA2 index to the given file" << endl - //<< " -i, --dbg-in FILE use kmers from FILE instead of input VG (may repeat)" << endl - << " -f, --mapping FILE use this node mapping in GCSA2 construction" << endl - << " -k, --kmer-size N index kmers of size N in the graph (default " << gcsa::Key::MAX_LENGTH << ")" << endl - << " -X, --doubling-steps N use this number of doubling steps for GCSA2 construction (default " << gcsa::ConstructionParameters::DOUBLING_STEPS << ")" << endl - << " -Z, --size-limit N limit temporary disk space usage to N gigabytes (default " << gcsa::ConstructionParameters::SIZE_LIMIT << ")" << endl - << " -V, --verify-index validate the GCSA2 index using the input kmers (important for testing)" << endl + << " -g, --gcsa-out FILE output a GCSA2 index to the given file" << endl + //<< " -i, --dbg-in FILE use kmers from FILE instead of input VG (may repeat)" << endl + << " -f, --mapping FILE use this node mapping in GCSA2 construction" << endl + << " -k, --kmer-size N index kmers of size N in the graph (default " << gcsa::Key::MAX_LENGTH << ")" << endl + << " -X, --doubling-steps N use this number of doubling steps for GCSA2 construction (default " << gcsa::ConstructionParameters::DOUBLING_STEPS << ")" << endl + << " -Z, --size-limit N limit temporary disk space usage to N gigabytes (default " << gcsa::ConstructionParameters::SIZE_LIMIT << ")" << endl + << " -V, --verify-index validate the GCSA2 index using the input kmers (important for testing)" << endl << "gam indexing options:" << endl - << " -l, --index-sorted-gam input is sorted .gam format alignments, store a GAI index of the sorted GAM in INPUT.gam.gai" << endl + << " -l, --index-sorted-gam input is sorted .gam format alignments, store a GAI index of the sorted GAM in INPUT.gam.gai" << endl << "vg in-place indexing options:" << endl - << " --index-sorted-vg input is ID-sorted .vg format graph chunks, store a VGI index of the sorted vg in INPUT.vg.vgi" << endl + << " --index-sorted-vg input is ID-sorted .vg format graph chunks, store a VGI index of the sorted vg in INPUT.vg.vgi" << endl << "snarl distance index options" << endl - << " -j --dist-name FILE use this file to store a snarl-based distance index" << endl - << " --snarl-limit N don't store snarl distances for snarls with more than N nodes (default 10000)" << endl - << " if N is 0 then don't store distances, only the snarl tree" << endl; + << " -j --dist-name FILE use this file to store a snarl-based distance index" << endl + << " --snarl-limit N don't store snarl distances for snarls with more than N nodes (default 10000)" << endl + << " if N is 0 then don't store distances, only the snarl tree" << endl + << " --no-nested-distance only store distances along the top-level chain" << endl; } int main_index(int argc, char** argv) { @@ -72,6 +73,7 @@ int main_index(int argc, char** argv) { #define OPT_BUILD_VGI_INDEX 1000 #define OPT_RENAME_VARIANTS 1001 #define OPT_DISTANCE_SNARL_LIMIT 1002 + #define OPT_DISTANCE_NESTING 1003 // Which indexes to build. bool build_xg = false, build_gcsa = false, build_dist = false; @@ -103,6 +105,8 @@ int main_index(int argc, char** argv) { //Distance index size_t snarl_limit = 50000; + bool only_top_level_chain_distances = false; + int c; optind = 2; // force optind past command positional argument while (true) { @@ -155,6 +159,7 @@ int main_index(int argc, char** argv) { //Snarl distance index {"snarl-limit", required_argument, 0, OPT_DISTANCE_SNARL_LIMIT}, {"dist-name", required_argument, 0, 'j'}, + {"no-nested-distance", no_argument, 0, OPT_DISTANCE_NESTING}, {0, 0, 0, 0} }; @@ -254,6 +259,10 @@ int main_index(int argc, char** argv) { snarl_limit = parse(optarg); break; + case OPT_DISTANCE_NESTING: + only_top_level_chain_distances = true; + break; + case 'h': case '?': help_index(argv); @@ -531,7 +540,7 @@ int main_index(int argc, char** argv) { SnarlDistanceIndex distance_index; //Fill it in - fill_in_distance_index(&distance_index, xg.get(), &snarl_finder, snarl_limit, false); + fill_in_distance_index(&distance_index, xg.get(), &snarl_finder, snarl_limit, only_top_level_chain_distances, false); // Save it distance_index.serialize(dist_name); } else { @@ -547,7 +556,7 @@ int main_index(int argc, char** argv) { //Make a distance index and fill it in SnarlDistanceIndex distance_index; - fill_in_distance_index(&distance_index, &(gbz->graph), &snarl_finder, snarl_limit); + fill_in_distance_index(&distance_index, &(gbz->graph), &snarl_finder, snarl_limit, only_top_level_chain_distances, false); // Save it distance_index.serialize(dist_name); } else if (get<1>(options)) { @@ -559,7 +568,7 @@ int main_index(int argc, char** argv) { //Make a distance index and fill it in SnarlDistanceIndex distance_index; - fill_in_distance_index(&distance_index, graph.get(), &snarl_finder, snarl_limit); + fill_in_distance_index(&distance_index, graph.get(), &snarl_finder, snarl_limit, only_top_level_chain_distances, false); // Save it distance_index.serialize(dist_name); } else { diff --git a/src/unittest/snarl_distance_index.cpp b/src/unittest/snarl_distance_index.cpp index 9fbbe322cd..ee0b854b74 100644 --- a/src/unittest/snarl_distance_index.cpp +++ b/src/unittest/snarl_distance_index.cpp @@ -150,7 +150,7 @@ namespace vg { REQUIRE(std::get<2>(traceback.second.back()) == -5); } } - TEST_CASE( "Nested chain with loop", "[snarl_distance]" ) { + TEST_CASE( "Nested chain with loop", "[snarl_distance][bug]" ) { VG graph; @@ -188,6 +188,7 @@ namespace vg { Edge* e17 = graph.create_edge(n11, n12); Edge* e18 = graph.create_edge(n12, n13); + graph.serialize_to_file("test_graph.vg"); //get the snarls IntegratedSnarlFinder snarl_finder(graph); SECTION("Traversal of chain") { @@ -276,6 +277,40 @@ namespace vg { } } } + SECTION("Top-level distances only index") { + SnarlDistanceIndex distance_index; + fill_in_distance_index(&distance_index, &graph, &snarl_finder, 500, true); + + //Nodes on the top-level chain have distances + for (auto& id : {n1->id(), n2->id(), n4->id(), n5->id(), n12->id(), n13->id()}) { + net_handle_t n = distance_index.get_node_net_handle(id); + net_handle_t parent = distance_index.get_parent(n); + assert(distance_index.is_root(distance_index.get_parent(parent))); + distance_index.minimum_length(n); + distance_index.minimum_length(parent); + REQUIRE(true); + } + + //Nested nodes don't have distances + for (auto& id : {n3->id(), n6->id(), n7->id(), n8->id(), n9->id(), n10->id(), n11->id()}) { + net_handle_t n = distance_index.get_node_net_handle(id); + net_handle_t parent = distance_index.get_parent(n); + assert(!distance_index.is_root(distance_index.get_parent(parent))); + try { + distance_index.minimum_length(n); + distance_index.minimum_length(parent); + REQUIRE(false); + } catch (const std::runtime_error& err) { + REQUIRE(true); + } + } + + SECTION("Find minimum distances along the top-level chain") { + REQUIRE(distance_index.minimum_distance(n2->id(),true, 0, n2->id(), false, 0) == 1); + REQUIRE(distance_index.minimum_distance(n4->id(),true, 0, n5->id(), true, 0) == 19); + REQUIRE(distance_index.minimum_distance(n5->id(),false, 0, n5->id(), true, 0) == 9); + } + } } TEST_CASE( "Snarl decomposition can deal with multiple connected components", "[snarl_distance]" ) {