Skip to content

Commit

Permalink
Merge pull request #4466 from vgteam/distanceless-index
Browse files Browse the repository at this point in the history
Distanceless index
  • Loading branch information
xchang1 authored Dec 9, 2024
2 parents 205e43a + ae92b91 commit cd35552
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 37 deletions.
33 changes: 21 additions & 12 deletions src/snarl_distance_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@ size_t maximum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, po
get_id(pos2), get_is_rev(pos2), get_offset(pos2));
}

void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool silence_warnings) {
void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool only_top_level_chain_distances, bool silence_warnings) {
distance_index->set_snarl_size_limit(size_limit);
distance_index->set_only_top_level_chain_distances(only_top_level_chain_distances);

//Build the temporary distance index from the graph
SnarlDistanceIndex::TemporaryDistanceIndex temp_index = make_temporary_distance_index(graph, snarl_finder, size_limit);
SnarlDistanceIndex::TemporaryDistanceIndex temp_index = make_temporary_distance_index(graph, snarl_finder, size_limit, only_top_level_chain_distances);

if (!silence_warnings && temp_index.use_oversized_snarls) {
cerr << "warning: distance index uses oversized snarls, which may make mapping slow" << endl;
Expand All @@ -37,7 +38,7 @@ void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGrap
distance_index->get_snarl_tree_records(indexes, graph);
}
SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index(
const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit) {
const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool only_top_level_chain_distances) {

#ifdef debug_distance_indexing
cerr << "Creating new distance index for nodes between " << graph->min_node_id() << " and " << graph->max_node_id() << endl;
Expand Down Expand Up @@ -210,9 +211,12 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index(
temp_chain_record.end_node_id = node_id;
temp_chain_record.end_node_rev = graph->get_is_reverse(chain_end_handle);
temp_chain_record.end_node_length = graph->get_length(chain_end_handle);

bool is_root_chain = false;

if (stack.empty()) {
//If this was the last thing on the stack, then this was a root
is_root_chain = true;

//Check to see if there is anything connected to the ends of the chain
vector<nid_t> reachable_nodes;
Expand Down Expand Up @@ -279,7 +283,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index(
parent_snarl_record.children.emplace_back(chain_index);
}

temp_index.max_index_size += temp_chain_record.get_max_record_length();
temp_index.max_index_size += temp_chain_record.get_max_record_length(!only_top_level_chain_distances || is_root_chain ? true : false );
#ifdef debug_distance_indexing
cerr << " Ending new " << (temp_chain_record.is_trivial ? "trivial " : "") << "chain " << temp_index.structure_start_end_as_string(chain_index)
<< endl << " that is a child of " << temp_index.structure_start_end_as_string(temp_chain_record.parent) << endl;
Expand Down Expand Up @@ -484,7 +488,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index(
temp_index.temp_snarl_records.at(chain_child_index.second);

//Fill in this snarl's distances
populate_snarl_index(temp_index, chain_child_index, size_limit, graph);
populate_snarl_index(temp_index, chain_child_index, size_limit, only_top_level_chain_distances, graph);

bool new_component = temp_snarl_record.min_length == std::numeric_limits<size_t>::max();
if (new_component){
Expand Down Expand Up @@ -733,7 +737,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index(
for (pair<SnarlDistanceIndex::temp_record_t, size_t>& component_index : temp_index.components) {
if (component_index.first == SnarlDistanceIndex::TEMP_SNARL) {
SnarlDistanceIndex::TemporaryDistanceIndex::TemporarySnarlRecord& temp_snarl_record = temp_index.temp_snarl_records.at(component_index.second);
populate_snarl_index(temp_index, component_index, size_limit, graph);
populate_snarl_index(temp_index, component_index, size_limit, only_top_level_chain_distances, graph);
temp_snarl_record.min_length = std::numeric_limits<size_t>::max();
}
}
Expand All @@ -756,7 +760,7 @@ SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index(
void populate_snarl_index(
SnarlDistanceIndex::TemporaryDistanceIndex& temp_index,
pair<SnarlDistanceIndex::temp_record_t, size_t> snarl_index, size_t size_limit,
const HandleGraph* graph) {
bool only_top_level_chain_distances, const HandleGraph* graph) {
#ifdef debug_distance_indexing
cerr << "Getting the distances for snarl " << temp_index.structure_start_end_as_string(snarl_index) << endl;
assert(snarl_index.first == SnarlDistanceIndex::TEMP_SNARL);
Expand Down Expand Up @@ -1063,10 +1067,15 @@ void populate_snarl_index(
*/


//Reserve enough space to store all possible distances
temp_snarl_record.distances.reserve( (temp_snarl_record.node_count > size_limit || size_limit == 0)
? temp_snarl_record.node_count * 2
: temp_snarl_record.node_count * temp_snarl_record.node_count);
if (size_limit != 0 && !only_top_level_chain_distances) {
//If we are saving distances
//Reserve enough space to store all possible distances
temp_snarl_record.distances.reserve( temp_snarl_record.node_count > size_limit
? temp_snarl_record.node_count * 2
: temp_snarl_record.node_count * temp_snarl_record.node_count);
} else {
temp_snarl_record.include_distances = false;
}

if (size_limit != 0 && temp_snarl_record.node_count > size_limit) {
temp_index.use_oversized_snarls = true;
Expand Down Expand Up @@ -1137,7 +1146,7 @@ void populate_snarl_index(
// assert(start_rank != 0 && start_rank != 1);
//}

if ( (temp_snarl_record.node_count > size_limit || size_limit == 0) && (temp_snarl_record.is_root_snarl || (!start_is_tip &&
if ( (temp_snarl_record.node_count > size_limit || size_limit == 0 || only_top_level_chain_distances) && (temp_snarl_record.is_root_snarl || (!start_is_tip &&
start_rank != 0 && start_rank != 1))) {
//If we don't care about internal distances, and we also are not at a boundary or tip
//TODO: Why do we care about tips specifically?
Expand Down
6 changes: 3 additions & 3 deletions src/snarl_distance_index.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ size_t maximum_distance(const SnarlDistanceIndex& distance_index, pos_t pos1, po

//Fill in the index
//size_limit is a limit on the number of nodes in a snarl, after which the index won't store pairwise distances
void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit = 50000, bool silence_warnings=true);
void fill_in_distance_index(SnarlDistanceIndex* distance_index, const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit = 50000, bool only_top_level_chain_distances = false, bool silence_warnings=true);

//Fill in the temporary snarl record with distances
void populate_snarl_index(SnarlDistanceIndex::TemporaryDistanceIndex& temp_index,
pair<SnarlDistanceIndex::temp_record_t, size_t> snarl_index, size_t size_limit, const HandleGraph* graph) ;
pair<SnarlDistanceIndex::temp_record_t, size_t> snarl_index, size_t size_limit, bool only_top_level_chain_distances, const HandleGraph* graph) ;

SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index(const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit);
SnarlDistanceIndex::TemporaryDistanceIndex make_temporary_distance_index(const HandleGraph* graph, const HandleGraphSnarlFinder* snarl_finder, size_t size_limit, bool only_top_level_chain_distances);

//Define wang_hash for net_handle_t's so that we can use a hash_map
template<> struct wang_hash<handlegraph::net_handle_t> {
Expand Down
49 changes: 29 additions & 20 deletions src/subcommand/index_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,28 +38,29 @@ void help_index(char** argv) {
<< "Creates an index on the specified graph or graphs. All graphs indexed must " << endl
<< "already be in a joint ID space." << endl
<< "general options:" << endl
<< " -b, --temp-dir DIR use DIR for temporary files" << endl
<< " -t, --threads N number of threads to use" << endl
<< " -p, --progress show progress" << endl
<< " -b, --temp-dir DIR use DIR for temporary files" << endl
<< " -t, --threads N number of threads to use" << endl
<< " -p, --progress show progress" << endl
<< "xg options:" << endl
<< " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing" << endl
<< " -L, --xg-alts include alt paths in xg" << endl
<< " -x, --xg-name FILE use this file to store a succinct, queryable version of the graph(s), or read for GCSA or distance indexing" << endl
<< " -L, --xg-alts include alt paths in xg" << endl
<< "gcsa options:" << endl
<< " -g, --gcsa-out FILE output a GCSA2 index to the given file" << endl
//<< " -i, --dbg-in FILE use kmers from FILE instead of input VG (may repeat)" << endl
<< " -f, --mapping FILE use this node mapping in GCSA2 construction" << endl
<< " -k, --kmer-size N index kmers of size N in the graph (default " << gcsa::Key::MAX_LENGTH << ")" << endl
<< " -X, --doubling-steps N use this number of doubling steps for GCSA2 construction (default " << gcsa::ConstructionParameters::DOUBLING_STEPS << ")" << endl
<< " -Z, --size-limit N limit temporary disk space usage to N gigabytes (default " << gcsa::ConstructionParameters::SIZE_LIMIT << ")" << endl
<< " -V, --verify-index validate the GCSA2 index using the input kmers (important for testing)" << endl
<< " -g, --gcsa-out FILE output a GCSA2 index to the given file" << endl
//<< " -i, --dbg-in FILE use kmers from FILE instead of input VG (may repeat)" << endl
<< " -f, --mapping FILE use this node mapping in GCSA2 construction" << endl
<< " -k, --kmer-size N index kmers of size N in the graph (default " << gcsa::Key::MAX_LENGTH << ")" << endl
<< " -X, --doubling-steps N use this number of doubling steps for GCSA2 construction (default " << gcsa::ConstructionParameters::DOUBLING_STEPS << ")" << endl
<< " -Z, --size-limit N limit temporary disk space usage to N gigabytes (default " << gcsa::ConstructionParameters::SIZE_LIMIT << ")" << endl
<< " -V, --verify-index validate the GCSA2 index using the input kmers (important for testing)" << endl
<< "gam indexing options:" << endl
<< " -l, --index-sorted-gam input is sorted .gam format alignments, store a GAI index of the sorted GAM in INPUT.gam.gai" << endl
<< " -l, --index-sorted-gam input is sorted .gam format alignments, store a GAI index of the sorted GAM in INPUT.gam.gai" << endl
<< "vg in-place indexing options:" << endl
<< " --index-sorted-vg input is ID-sorted .vg format graph chunks, store a VGI index of the sorted vg in INPUT.vg.vgi" << endl
<< " --index-sorted-vg input is ID-sorted .vg format graph chunks, store a VGI index of the sorted vg in INPUT.vg.vgi" << endl
<< "snarl distance index options" << endl
<< " -j --dist-name FILE use this file to store a snarl-based distance index" << endl
<< " --snarl-limit N don't store snarl distances for snarls with more than N nodes (default 10000)" << endl
<< " if N is 0 then don't store distances, only the snarl tree" << endl;
<< " -j --dist-name FILE use this file to store a snarl-based distance index" << endl
<< " --snarl-limit N don't store snarl distances for snarls with more than N nodes (default 10000)" << endl
<< " if N is 0 then don't store distances, only the snarl tree" << endl
<< " --no-nested-distance only store distances along the top-level chain" << endl;
}

int main_index(int argc, char** argv) {
Expand All @@ -72,6 +73,7 @@ int main_index(int argc, char** argv) {
#define OPT_BUILD_VGI_INDEX 1000
#define OPT_RENAME_VARIANTS 1001
#define OPT_DISTANCE_SNARL_LIMIT 1002
#define OPT_DISTANCE_NESTING 1003

// Which indexes to build.
bool build_xg = false, build_gcsa = false, build_dist = false;
Expand Down Expand Up @@ -103,6 +105,8 @@ int main_index(int argc, char** argv) {
//Distance index
size_t snarl_limit = 50000;

bool only_top_level_chain_distances = false;

int c;
optind = 2; // force optind past command positional argument
while (true) {
Expand Down Expand Up @@ -155,6 +159,7 @@ int main_index(int argc, char** argv) {
//Snarl distance index
{"snarl-limit", required_argument, 0, OPT_DISTANCE_SNARL_LIMIT},
{"dist-name", required_argument, 0, 'j'},
{"no-nested-distance", no_argument, 0, OPT_DISTANCE_NESTING},
{0, 0, 0, 0}
};

Expand Down Expand Up @@ -254,6 +259,10 @@ int main_index(int argc, char** argv) {
snarl_limit = parse<int>(optarg);
break;

case OPT_DISTANCE_NESTING:
only_top_level_chain_distances = true;
break;

case 'h':
case '?':
help_index(argv);
Expand Down Expand Up @@ -531,7 +540,7 @@ int main_index(int argc, char** argv) {
SnarlDistanceIndex distance_index;

//Fill it in
fill_in_distance_index(&distance_index, xg.get(), &snarl_finder, snarl_limit, false);
fill_in_distance_index(&distance_index, xg.get(), &snarl_finder, snarl_limit, only_top_level_chain_distances, false);
// Save it
distance_index.serialize(dist_name);
} else {
Expand All @@ -547,7 +556,7 @@ int main_index(int argc, char** argv) {

//Make a distance index and fill it in
SnarlDistanceIndex distance_index;
fill_in_distance_index(&distance_index, &(gbz->graph), &snarl_finder, snarl_limit);
fill_in_distance_index(&distance_index, &(gbz->graph), &snarl_finder, snarl_limit, only_top_level_chain_distances, false);
// Save it
distance_index.serialize(dist_name);
} else if (get<1>(options)) {
Expand All @@ -559,7 +568,7 @@ int main_index(int argc, char** argv) {

//Make a distance index and fill it in
SnarlDistanceIndex distance_index;
fill_in_distance_index(&distance_index, graph.get(), &snarl_finder, snarl_limit);
fill_in_distance_index(&distance_index, graph.get(), &snarl_finder, snarl_limit, only_top_level_chain_distances, false);
// Save it
distance_index.serialize(dist_name);
} else {
Expand Down
37 changes: 36 additions & 1 deletion src/unittest/snarl_distance_index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -150,7 +150,7 @@ namespace vg {
REQUIRE(std::get<2>(traceback.second.back()) == -5);
}
}
TEST_CASE( "Nested chain with loop", "[snarl_distance]" ) {
TEST_CASE( "Nested chain with loop", "[snarl_distance][bug]" ) {

VG graph;

Expand Down Expand Up @@ -188,6 +188,7 @@ namespace vg {
Edge* e17 = graph.create_edge(n11, n12);
Edge* e18 = graph.create_edge(n12, n13);

graph.serialize_to_file("test_graph.vg");
//get the snarls
IntegratedSnarlFinder snarl_finder(graph);
SECTION("Traversal of chain") {
Expand Down Expand Up @@ -276,6 +277,40 @@ namespace vg {
}
}
}
SECTION("Top-level distances only index") {
SnarlDistanceIndex distance_index;
fill_in_distance_index(&distance_index, &graph, &snarl_finder, 500, true);

//Nodes on the top-level chain have distances
for (auto& id : {n1->id(), n2->id(), n4->id(), n5->id(), n12->id(), n13->id()}) {
net_handle_t n = distance_index.get_node_net_handle(id);
net_handle_t parent = distance_index.get_parent(n);
assert(distance_index.is_root(distance_index.get_parent(parent)));
distance_index.minimum_length(n);
distance_index.minimum_length(parent);
REQUIRE(true);
}

//Nested nodes don't have distances
for (auto& id : {n3->id(), n6->id(), n7->id(), n8->id(), n9->id(), n10->id(), n11->id()}) {
net_handle_t n = distance_index.get_node_net_handle(id);
net_handle_t parent = distance_index.get_parent(n);
assert(!distance_index.is_root(distance_index.get_parent(parent)));
try {
distance_index.minimum_length(n);
distance_index.minimum_length(parent);
REQUIRE(false);
} catch (const std::runtime_error& err) {
REQUIRE(true);
}
}

SECTION("Find minimum distances along the top-level chain") {
REQUIRE(distance_index.minimum_distance(n2->id(),true, 0, n2->id(), false, 0) == 1);
REQUIRE(distance_index.minimum_distance(n4->id(),true, 0, n5->id(), true, 0) == 19);
REQUIRE(distance_index.minimum_distance(n5->id(),false, 0, n5->id(), true, 0) == 9);
}
}
}
TEST_CASE( "Snarl decomposition can deal with multiple connected components",
"[snarl_distance]" ) {
Expand Down

1 comment on commit cd35552

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

15 tests passed, 1 tests failed and 0 tests skipped in 17380 seconds

Failed tests:

  • test_sim_chr21_snp1kg_trained (2553 seconds)

Please sign in to comment.