-
Notifications
You must be signed in to change notification settings - Fork 195
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4481 from jeizenga/mask-seqs
Add subcommand to mask sequences
- Loading branch information
Showing
7 changed files
with
1,042 additions
and
2 deletions.
There are no files selected for viewing
Submodule libbdsg
updated
38 files
Submodule libhandlegraph
updated
5 files
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
#ifndef VG_MASKER_HPP_INCLUDED | ||
#define VG_MASKER_HPP_INCLUDED | ||
|
||
#include <memory> | ||
|
||
#include <gbwtgraph/gbwtgraph.h> | ||
|
||
#include "handle.hpp" | ||
#include "snarls.hpp" | ||
#include "region.hpp" | ||
|
||
namespace vg { | ||
|
||
/* | ||
* Class to mask out graph regions with N's | ||
*/ | ||
class Masker { | ||
|
||
public: | ||
|
||
// construct own snarl decomposition | ||
// use external snarl decomposition | ||
Masker(gbwtgraph::GBWTGraph& graph, const SnarlManager* snarl_manager = nullptr); | ||
Masker(MutablePathDeletableHandleGraph& graph, const SnarlManager* snarl_manager = nullptr); | ||
Masker() = default; | ||
~Masker() = default; | ||
|
||
// move | ||
Masker(Masker&& other) = default; | ||
Masker& operator=(Masker&& other) = default; | ||
|
||
// replace regions with N's, with regions given as tuples of (path name, region begin, region end). | ||
// regions indexes are assumed to be 0-based and end-exclusive | ||
void mask_sequences(const std::vector<std::tuple<std::string, size_t, size_t>>& regions) const; | ||
|
||
private: | ||
|
||
// make our own snarl manager | ||
void init_snarls(); | ||
|
||
// takes tuples of (first step, offset in first step, last step, end in last step) | ||
// uses the lambda function provided to apply mask sequences to the graph | ||
void apply_mask_sequences(const std::vector<std::tuple<step_handle_t, size_t, step_handle_t, size_t>>& mask_intervals, | ||
const std::function<void(handle_t, const std::string&)>& apply_mask) const; | ||
|
||
PathHandleGraph* graph = nullptr; | ||
const SnarlManager* snarl_manager = nullptr; | ||
|
||
// if not given a snarl manager, we construct one and make the snarl_manager | ||
// point to it | ||
std::unique_ptr<SnarlManager> own_snarl_manager; | ||
|
||
}; | ||
|
||
} | ||
|
||
#endif |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,177 @@ | ||
#include "subcommand.hpp" | ||
#include "../io/save_handle_graph.hpp" | ||
#include "../handle.hpp" | ||
#include "../xg.hpp" | ||
#include "../utility.hpp" | ||
#include "../snarls.hpp" | ||
#include "../masker.hpp" | ||
#include <vg/io/stream.hpp> | ||
#include <vg/io/vpkg.hpp> | ||
#include <gbwtgraph/gbz.h> | ||
|
||
#include <unistd.h> | ||
#include <getopt.h> | ||
#include <sstream> | ||
|
||
using namespace vg; | ||
using namespace vg::subcommand; | ||
using namespace vg::io; | ||
using namespace std; | ||
|
||
vector<tuple<string, size_t, size_t>> parse_bed(istream& in){ | ||
|
||
vector<tuple<string, size_t, size_t>> regions; | ||
|
||
string line_buffer; | ||
string token_buffer; | ||
while (getline(in, line_buffer)) { | ||
|
||
if (line_buffer.empty() || line_buffer.front() == '#' || line_buffer.substr(0, 5) == "track" || line_buffer.substr(0, 7) == "browser") { | ||
// header line | ||
continue; | ||
} | ||
istringstream line_strm(line_buffer); | ||
|
||
regions.emplace_back(); | ||
|
||
getline(line_strm, token_buffer, '\t'); | ||
get<0>(regions.back()) = token_buffer; | ||
getline(line_strm, token_buffer, '\t'); | ||
get<1>(regions.back()) = parse<int64_t>(token_buffer); | ||
getline(line_strm, token_buffer, '\t'); | ||
get<2>(regions.back()) = parse<int64_t>(token_buffer); | ||
} | ||
|
||
return regions; | ||
} | ||
|
||
void help_mask(char** argv) { | ||
cerr << "usage: " << argv[0] << " [options] <graph>" << endl | ||
<< "Mask out specified regions of a graph with N's" << endl | ||
<< endl | ||
<< "input options: " << endl | ||
<< " -b, --bed FILE BED regions corresponding to path intervals of the graph to target (required)" << endl | ||
<< " -g, --gbz-input Input graph is in GBZ format" << endl | ||
<< " -s, --snarls FILE Snarls from vg snarls (computed directly if not provided)" << endl | ||
|
||
<< endl; | ||
} | ||
|
||
int main_mask(int argc, char** argv) { | ||
|
||
string bed_filepath; | ||
string snarls_filepath; | ||
bool gbz_input = false; | ||
|
||
int c; | ||
optind = 2; // force optind past command positional argument | ||
while (true) { | ||
static struct option long_options[] = | ||
{ | ||
{"help", no_argument, 0, 'h'}, | ||
{"bed", required_argument, 0, 'b'}, | ||
{"gbz-input", no_argument, 0, 'g'}, | ||
{"snarls", required_argument, 0, 's'}, | ||
{0, 0, 0, 0} | ||
|
||
}; | ||
int option_index = 0; | ||
c = getopt_long (argc, argv, "hb:gs:", long_options, &option_index); | ||
|
||
// Detect the end of the options. | ||
if (c == -1) | ||
break; | ||
|
||
switch (c) | ||
{ | ||
|
||
case '?': | ||
case 'h': | ||
help_mask(argv); | ||
return 0; | ||
case 'b': | ||
bed_filepath = optarg; | ||
break; | ||
case 'g': | ||
gbz_input = true; | ||
break; | ||
case 's': | ||
snarls_filepath = optarg; | ||
break; | ||
default: | ||
help_mask(argv); | ||
return 1; | ||
} | ||
} | ||
|
||
|
||
if (argc - optind != 1) { | ||
cerr << "error: vg mask requires exactly 1 positional argument" << endl; | ||
help_mask(argv); | ||
return 1; | ||
} | ||
|
||
if (bed_filepath.empty()) { | ||
cerr << "error: vg mask requires an input BED file from -b / --bed" << endl; | ||
return 1; | ||
} | ||
if (bed_filepath != "-") { | ||
if (!ifstream(bed_filepath)) { | ||
cerr << "error: could not open BED file from " << bed_filepath << endl; | ||
return 1; | ||
} | ||
} | ||
if (!snarls_filepath.empty() && snarls_filepath != "-") { | ||
if (!ifstream(snarls_filepath)) { | ||
cerr << "error: could not open snarls file from " << snarls_filepath << endl; | ||
return 1; | ||
} | ||
} | ||
|
||
// load the graph | ||
string graph_path = get_input_file_name(optind, argc, argv); | ||
unique_ptr<MutablePathDeletableHandleGraph> graph; | ||
unique_ptr<gbwtgraph::GBZ> gbz; | ||
if (gbz_input) { | ||
gbz = vg::io::VPKG::load_one<gbwtgraph::GBZ>(graph_path);; | ||
} | ||
else { | ||
graph = vg::io::VPKG::load_one<MutablePathDeletableHandleGraph>(graph_path); | ||
} | ||
|
||
// load the BED | ||
vector<tuple<string, size_t, size_t>> regions; | ||
get_input_file(bed_filepath, [&](istream& in) { | ||
regions = std::move(parse_bed(in)); | ||
}); | ||
|
||
// load the snarls | ||
unique_ptr<SnarlManager> snarl_manager; | ||
if (!snarls_filepath.empty()) { | ||
snarl_manager = vg::io::VPKG::load_one<SnarlManager>(snarls_filepath); | ||
} | ||
|
||
Masker masker; | ||
if (gbz.get()) { | ||
masker = std::move(Masker(gbz->graph, snarl_manager.get())); | ||
} | ||
else { | ||
masker = std::move(Masker(*graph, snarl_manager.get())); | ||
} | ||
|
||
masker.mask_sequences(regions); | ||
|
||
// write the graph | ||
if (gbz.get()) { | ||
gbz->simple_sds_serialize(cout); | ||
} | ||
else { | ||
vg::io::save_handle_graph(graph.get(), cout); | ||
} | ||
|
||
return 0; | ||
} | ||
|
||
|
||
// Register subcommand | ||
static Subcommand vg_mask("mask", "Mask out sequences in a graph with N's", main_mask); |
Oops, something went wrong.
8494a52
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
vg CI tests complete for merge to master. View the full report here.
16 tests passed, 0 tests failed and 0 tests skipped in 17702 seconds