Skip to content

Commit

Permalink
Merge pull request #4481 from jeizenga/mask-seqs
Browse files Browse the repository at this point in the history
Add subcommand to mask sequences
  • Loading branch information
jeizenga authored Dec 19, 2024
2 parents 230872c + ff7ea9d commit 8494a52
Show file tree
Hide file tree
Showing 7 changed files with 1,042 additions and 2 deletions.
2 changes: 1 addition & 1 deletion deps/libbdsg
Submodule libbdsg updated 38 files
+9 −4 CMakeLists.txt
+2 −2 bdsg/cmake_bindings/bdsg.cpp
+1 −1 bdsg/cmake_bindings/bdsg.sources
+77 −75 bdsg/cmake_bindings/bdsg/graph_proxy.cpp
+70 −68 bdsg/cmake_bindings/bdsg/graph_proxy_1.cpp
+76 −74 bdsg/cmake_bindings/bdsg/internal/base_packed_graph.cpp
+1 −1 bdsg/cmake_bindings/bdsg/internal/eades_algorithm.cpp
+26 −5 bdsg/cmake_bindings/bdsg/internal/hash_map.cpp
+1 −1 bdsg/cmake_bindings/bdsg/internal/is_single_stranded.cpp
+21 −21 bdsg/cmake_bindings/bdsg/internal/mapped_structs_1.cpp
+3 −3 bdsg/cmake_bindings/bdsg/overlays/packed_path_position_overlay.cpp
+6 −7 bdsg/cmake_bindings/bdsg/overlays/packed_path_position_overlay_1.cpp
+14 −3 bdsg/cmake_bindings/bdsg/overlays/path_position_overlays.cpp
+1 −3 bdsg/cmake_bindings/bdsg/overlays/path_subgraph_overlay.cpp
+1,121 −0 bdsg/cmake_bindings/bdsg/overlays/reference_path_overlay.cpp
+1 −1 bdsg/cmake_bindings/bdsg/overlays/strand_split_overlay.cpp
+516 −116 bdsg/cmake_bindings/bdsg/overlays/vectorizable_overlays.cpp
+0 −817 bdsg/cmake_bindings/bdsg/overlays/vectorizable_overlays_1.cpp
+59 −7 bdsg/cmake_bindings/bdsg/packed_graph.cpp
+42 −35 bdsg/cmake_bindings/bdsg/snarl_distance_index.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/expanding_overlay_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/handle_graph.cpp
+15 −1 bdsg/cmake_bindings/handlegraph/mutable_path_metadata.cpp
+28 −2 bdsg/cmake_bindings/handlegraph/mutable_path_mutable_handle_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/path_handle_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/path_metadata.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/path_position_handle_graph.cpp
+1 −1 bdsg/cmake_bindings/handlegraph/trivially_serializable.cpp
+4 −0 bdsg/cmake_bindings/std/bdsg/internal/binder_hook_bind.cpp
+1 −1 bdsg/deps/libhandlegraph
+6 −1 bdsg/include/bdsg/hash_graph.hpp
+36 −0 bdsg/include/bdsg/internal/base_packed_graph.hpp
+8 −0 bdsg/include/bdsg/internal/binder_hook_bind.hpp
+8 −0 bdsg/include/bdsg/internal/graph_proxy_mutable_path_deletable_handle_graph_fragment.classfragment
+70 −20 bdsg/include/bdsg/snarl_distance_index.hpp
+7 −0 bdsg/src/hash_graph.cpp
+8 −1 bdsg/src/test_libbdsg.cpp
+38 −7 make_and_run_binder.py
395 changes: 395 additions & 0 deletions src/masker.cpp

Large diffs are not rendered by default.

57 changes: 57 additions & 0 deletions src/masker.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#ifndef VG_MASKER_HPP_INCLUDED
#define VG_MASKER_HPP_INCLUDED

#include <memory>

#include <gbwtgraph/gbwtgraph.h>

#include "handle.hpp"
#include "snarls.hpp"
#include "region.hpp"

namespace vg {

/*
* Class to mask out graph regions with N's
*/
class Masker {

public:

// construct own snarl decomposition
// use external snarl decomposition
Masker(gbwtgraph::GBWTGraph& graph, const SnarlManager* snarl_manager = nullptr);
Masker(MutablePathDeletableHandleGraph& graph, const SnarlManager* snarl_manager = nullptr);
Masker() = default;
~Masker() = default;

// move
Masker(Masker&& other) = default;
Masker& operator=(Masker&& other) = default;

// replace regions with N's, with regions given as tuples of (path name, region begin, region end).
// regions indexes are assumed to be 0-based and end-exclusive
void mask_sequences(const std::vector<std::tuple<std::string, size_t, size_t>>& regions) const;

private:

// make our own snarl manager
void init_snarls();

// takes tuples of (first step, offset in first step, last step, end in last step)
// uses the lambda function provided to apply mask sequences to the graph
void apply_mask_sequences(const std::vector<std::tuple<step_handle_t, size_t, step_handle_t, size_t>>& mask_intervals,
const std::function<void(handle_t, const std::string&)>& apply_mask) const;

PathHandleGraph* graph = nullptr;
const SnarlManager* snarl_manager = nullptr;

// if not given a snarl manager, we construct one and make the snarl_manager
// point to it
std::unique_ptr<SnarlManager> own_snarl_manager;

};

}

#endif
177 changes: 177 additions & 0 deletions src/subcommand/mask_main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
#include "subcommand.hpp"
#include "../io/save_handle_graph.hpp"
#include "../handle.hpp"
#include "../xg.hpp"
#include "../utility.hpp"
#include "../snarls.hpp"
#include "../masker.hpp"
#include <vg/io/stream.hpp>
#include <vg/io/vpkg.hpp>
#include <gbwtgraph/gbz.h>

#include <unistd.h>
#include <getopt.h>
#include <sstream>

using namespace vg;
using namespace vg::subcommand;
using namespace vg::io;
using namespace std;

vector<tuple<string, size_t, size_t>> parse_bed(istream& in){

vector<tuple<string, size_t, size_t>> regions;

string line_buffer;
string token_buffer;
while (getline(in, line_buffer)) {

if (line_buffer.empty() || line_buffer.front() == '#' || line_buffer.substr(0, 5) == "track" || line_buffer.substr(0, 7) == "browser") {
// header line
continue;
}
istringstream line_strm(line_buffer);

regions.emplace_back();

getline(line_strm, token_buffer, '\t');
get<0>(regions.back()) = token_buffer;
getline(line_strm, token_buffer, '\t');
get<1>(regions.back()) = parse<int64_t>(token_buffer);
getline(line_strm, token_buffer, '\t');
get<2>(regions.back()) = parse<int64_t>(token_buffer);
}

return regions;
}

void help_mask(char** argv) {
cerr << "usage: " << argv[0] << " [options] <graph>" << endl
<< "Mask out specified regions of a graph with N's" << endl
<< endl
<< "input options: " << endl
<< " -b, --bed FILE BED regions corresponding to path intervals of the graph to target (required)" << endl
<< " -g, --gbz-input Input graph is in GBZ format" << endl
<< " -s, --snarls FILE Snarls from vg snarls (computed directly if not provided)" << endl

<< endl;
}

int main_mask(int argc, char** argv) {

string bed_filepath;
string snarls_filepath;
bool gbz_input = false;

int c;
optind = 2; // force optind past command positional argument
while (true) {
static struct option long_options[] =
{
{"help", no_argument, 0, 'h'},
{"bed", required_argument, 0, 'b'},
{"gbz-input", no_argument, 0, 'g'},
{"snarls", required_argument, 0, 's'},
{0, 0, 0, 0}

};
int option_index = 0;
c = getopt_long (argc, argv, "hb:gs:", long_options, &option_index);

// Detect the end of the options.
if (c == -1)
break;

switch (c)
{

case '?':
case 'h':
help_mask(argv);
return 0;
case 'b':
bed_filepath = optarg;
break;
case 'g':
gbz_input = true;
break;
case 's':
snarls_filepath = optarg;
break;
default:
help_mask(argv);
return 1;
}
}


if (argc - optind != 1) {
cerr << "error: vg mask requires exactly 1 positional argument" << endl;
help_mask(argv);
return 1;
}

if (bed_filepath.empty()) {
cerr << "error: vg mask requires an input BED file from -b / --bed" << endl;
return 1;
}
if (bed_filepath != "-") {
if (!ifstream(bed_filepath)) {
cerr << "error: could not open BED file from " << bed_filepath << endl;
return 1;
}
}
if (!snarls_filepath.empty() && snarls_filepath != "-") {
if (!ifstream(snarls_filepath)) {
cerr << "error: could not open snarls file from " << snarls_filepath << endl;
return 1;
}
}

// load the graph
string graph_path = get_input_file_name(optind, argc, argv);
unique_ptr<MutablePathDeletableHandleGraph> graph;
unique_ptr<gbwtgraph::GBZ> gbz;
if (gbz_input) {
gbz = vg::io::VPKG::load_one<gbwtgraph::GBZ>(graph_path);;
}
else {
graph = vg::io::VPKG::load_one<MutablePathDeletableHandleGraph>(graph_path);
}

// load the BED
vector<tuple<string, size_t, size_t>> regions;
get_input_file(bed_filepath, [&](istream& in) {
regions = std::move(parse_bed(in));
});

// load the snarls
unique_ptr<SnarlManager> snarl_manager;
if (!snarls_filepath.empty()) {
snarl_manager = vg::io::VPKG::load_one<SnarlManager>(snarls_filepath);
}

Masker masker;
if (gbz.get()) {
masker = std::move(Masker(gbz->graph, snarl_manager.get()));
}
else {
masker = std::move(Masker(*graph, snarl_manager.get()));
}

masker.mask_sequences(regions);

// write the graph
if (gbz.get()) {
gbz->simple_sds_serialize(cout);
}
else {
vg::io::save_handle_graph(graph.get(), cout);
}

return 0;
}


// Register subcommand
static Subcommand vg_mask("mask", "Mask out sequences in a graph with N's", main_mask);
Loading

1 comment on commit 8494a52

@adamnovak
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

vg CI tests complete for merge to master. View the full report here.

16 tests passed, 0 tests failed and 0 tests skipped in 17702 seconds

Please sign in to comment.