Skip to content

Commit

Permalink
Merge pull request #68 from OndrejSladky/revert-67-revert-59-lowerbound
Browse files Browse the repository at this point in the history
KmerCamel can compute lower bounds on superstring lengths.
  • Loading branch information
OndrejSladky authored Aug 22, 2024
2 parents 13c2127 + d8705d6 commit 8738872
Show file tree
Hide file tree
Showing 10 changed files with 132 additions and 26 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ kmercamel
kmercameltest
kmercamel-large
kmercameltest-large
kmercameltest-extra-large
src/version.h
🐫
bin/
Expand Down
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ CXX= g++
CXXFLAGS= -g -Wall -Wno-unused-function -std=c++17 -O2 -I/opt/homebrew/include
LDFLAGS= -lz -lglpk -L/opt/homebrew/lib
LARGEFLAGS= -DLARGE_KMERS
ELARGEFLAGS= -DEXTRA_LARGE_KMERS
SRC= src
TESTS= tests
GTEST= $(TESTS)/googletest/googletest
Expand All @@ -22,9 +23,10 @@ quick-verify: verify.py kmercamel
./verify.py --quick $(DATA)/spneumoniae.fa
./verify.py --k 13 --superstring_path $(DATA)/global-k13c.fa $(DATA)/spneumoniae.fa

cpptest: kmercameltest kmercameltest-large
cpptest: kmercameltest kmercameltest-large kmercameltest-extra-large
./kmercameltest
./kmercameltest-large
./kmercameltest-extra-large

converttest: convert_superstring_unittest.py
./convert_superstring_unittest.py
Expand All @@ -40,6 +42,9 @@ kmercameltest: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.h
kmercameltest-large: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp)
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS) $(LARGEFLAGS)

kmercameltest-extra-large: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp)
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS) $(ELARGEFLAGS)

gtest-all.o: $(GTEST)/src/gtest-all.cc $(wildcard *.cpp *.h *.hpp)
$(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include -I $(GTEST) -DGTEST_CREATE_SHARED_LIBRARY=1 -c -pthread $(GTEST)/src/gtest-all.cc -o $@

Expand Down
12 changes: 8 additions & 4 deletions src/global.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,10 @@ void PartialPreSort(std::vector<kmer_t> &vals, int k) {
/// If complements are provided, treat k-mer and its complement as identical.
/// If this is the case, k-mers are expected to contain only one k-mer from a complement pair.
/// Moreover, if so, the resulting Hamiltonian path contains two superstrings which are reverse complements of one another.
/// If lower_bound is set to true, return a shortest cycle cover instead.
template <typename kmer_t, typename kh_wrapper_t>
overlapPath OverlapHamiltonianPath (kh_wrapper_t wrapper, std::vector<kmer_t> &kMers, int k, bool complements) {
overlapPath OverlapHamiltonianPath (kh_wrapper_t wrapper, std::vector<kmer_t> &kMers, int k, bool complements,
bool lower_bound = false) {
size_t n = kMers.size();
size_t kMersCount = n * (1 + complements);
size_t batchSize = kMersCount / MEMORY_REDUCTION_FACTOR + 1;
Expand Down Expand Up @@ -106,10 +108,12 @@ overlapPath OverlapHamiltonianPath (kh_wrapper_t wrapper, std::vector<kmer_t> &k
if (suffix_key == kh_end(prefixes)) continue;
size_t previous, j;
previous = j = kh_val(prefixes, suffix_key);
// If the path forms a cycle, or is between k-mer and its reverse complement, or the k-mers complement was already selected skip this path.
while (j != size_t(-1) && \
(accessFirstLast(first, last, i, n) % n == j % n \
|| accessFirstLast(first, last, i, n) % n == accessFirstLast(last, first, j, n) % n \
// k-mers are complementary
((i + n) % (2 * n) == j \
// forms a cycle
|| (!lower_bound && accessFirstLast(first, last, i, n) == j) \
// k-mer is already used
|| prefixForbidden[j])) {
size_t new_j = next[j - from];
// If the k-mer is forbidden, remove it to keep the complexity linear.
Expand Down
15 changes: 15 additions & 0 deletions src/lower_bound.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
#pragma once

#include "global.h"
#include "kmers.h"

/// Return the length of the cycle cover which lower bounds the superstring length.
template <typename kmer_t, typename kh_wrapper_t>
size_t LowerBoundLength(kh_wrapper_t wrapper, std::vector<kmer_t> kMers, int k, bool complements) {
auto cycle_cover = OverlapHamiltonianPath(wrapper, kMers, k, complements, true);
size_t res = 0;
for (auto &overlap : cycle_cover.second) {
res += size_t(k) - size_t(overlap);
}
return res / (1 + complements);
}
32 changes: 24 additions & 8 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,13 @@
#include "ac/parser_ac.h"
#include "ac/streaming.h"
#include "khash_utils.h"

#include <iostream>
#include <string>
#include "unistd.h"
#include "version.h"
#include "masks.h"
#include "lower_bound.h"

int Help() {
std::cerr << "KmerCamel version " << VERSION << std::endl;
Expand All @@ -23,6 +29,7 @@ int Help() {
std::cerr << " -d d_value - integer value for d_max; default 5" << std::endl;
std::cerr << " -c - treat k-mer and its reverse complement as equal" << std::endl;
std::cerr << " -m - turn off the memory optimizations for global" << std::endl;
std::cerr << " -l - compute the cycle cover lower bound instead of masked superstring" << std::endl;
std::cerr << " -h - print help" << std::endl;
std::cerr << " -v - print version" << std::endl;
std::cerr << "Example usage: ./kmercamel -p path_to_fasta -k 31 -d 5 -a local -c" << std::endl;
Expand All @@ -48,8 +55,9 @@ void Version() {

// This macro cannot be avoided without bloating the code too much.
#define INIT_KMERCAMEL(type) \
int kmercamel##type(std::string path, int k, int d_max, std::ostream *of, bool complements, bool masks, std::string algorithm, bool optimize_memory) { \
kmer_dict##type##_t wrapper; \
int kmercamel##type(std::string path, int k, int d_max, std::ostream *of, bool complements, bool masks, \
std::string algorithm, bool optimize_memory, bool lower_bound) { \
kmer_dict##type##_t wrapper; \
kmer##type##_t kmer_type = 0; \
if (masks) { \
int ret = Optimize(wrapper, kmer_type, algorithm, path, *of, k, complements); \
Expand All @@ -71,14 +79,15 @@ int kmercamel##type(std::string path, int k, int d_max, std::ostream *of, bool c
return Help(); \
} \
d_max = std::min(k - 1, d_max); \
WriteName(k, *of); \
if (!lower_bound) WriteName(k, *of); \
if (algorithm == "global") { \
auto kMerVec = kMersToVec(kMers, kmer_type); \
kh_destroy_S##type(kMers); \
/* Turn off the memory optimizations if optimize_memory is set to false. */ \
if(optimize_memory) PartialPreSort(kMerVec, k); \
else MEMORY_REDUCTION_FACTOR = 1; \
Global(wrapper, kMerVec, *of, k, complements); \
if (lower_bound) std::cout << LowerBoundLength(wrapper, kMerVec, k, complements); \
else Global(wrapper, kMerVec, *of, k, complements); \
} \
else Local(kMers, wrapper, kmer_type, *of, k, d_max, complements); \
} else { \
Expand Down Expand Up @@ -127,9 +136,10 @@ int main(int argc, char **argv) {
bool complements = false;
bool optimize_memory = true;
bool d_set = false;
bool lower_bound = false;
int opt;
try {
while ((opt = getopt(argc, argv, "p:k:d:a:o:hcvm")) != -1) {
while ((opt = getopt(argc, argv, "p:k:d:a:o:hcvml")) != -1) {
switch(opt) {
case 'p':
path = optarg;
Expand Down Expand Up @@ -159,6 +169,9 @@ int main(int argc, char **argv) {
case 'm':
optimize_memory = false;
break;
case 'l':
lower_bound = true;
break;
case 'v':
Version();
return 0;
Expand Down Expand Up @@ -196,12 +209,15 @@ int main(int argc, char **argv) {
} else if (masks && (d_set || !optimize_memory)) {
std::cerr << "Not supported flags for optimize." << std::endl;
return Help();
} else if (lower_bound && algorithm != "global") {
std::cerr << "Lower bound computation supported only for hash table global." << std::endl;
return Help();
}
if (k < 32) {
return kmercamel64(path, k, d_max, of, complements, masks, algorithm, optimize_memory);
return kmercamel64(path, k, d_max, of, complements, masks, algorithm, optimize_memory, lower_bound);
} else if (k < 64) {
return kmercamel128(path, k, d_max, of, complements, masks, algorithm, optimize_memory);
return kmercamel128(path, k, d_max, of, complements, masks, algorithm, optimize_memory, lower_bound);
} else {
return kmercamel256(path, k, d_max, of, complements, masks, algorithm, optimize_memory);
return kmercamel256(path, k, d_max, of, complements, masks, algorithm, optimize_memory, lower_bound);
}
}
1 change: 1 addition & 0 deletions src/uint256_t/uint256_t.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#pragma once
#ifndef __LITTLE_ENDIAN__
#ifndef __BIG_ENDIAN__
#define __LITTLE_ENDIAN__ 1
Expand Down
20 changes: 19 additions & 1 deletion tests/global_unittest.h
Original file line number Diff line number Diff line change
Expand Up @@ -80,30 +80,48 @@ namespace {
overlapPath wantResult;
int k;
bool complements;
bool lower_bound;
};
std::vector<TestCase> tests = {
{
{KMerToNumber({"AT"})},
{{(size_t)-1, (size_t)-1}, {(byte)-1, (byte)-1}},
2,
true,
false,
},
{
{KMerToNumber({"ACG"}), KMerToNumber({"TAC"}), KMerToNumber({"GGC"})},
{{2, 0, (size_t)-1}, {1, 2, (byte)-1}},
3,
false,
false,
},
{
{KMerToNumber({"ACAA"}), KMerToNumber({"ATTT"}), KMerToNumber({"AACA"})},
{{4, 3, 0, 5, (size_t)-1, (size_t)-1}, {2, 2, 3, 3, (byte)-1, (byte)-1}},
4,
true,
false,
},
{
{KMerToNumber({"ACG"}), KMerToNumber({"CGT"}), KMerToNumber({"TAA"})},
{{1, 2, 0}, {2, 1, 1}},
3,
false,
true,
},
{
{KMerToNumber({"ACC"}), KMerToNumber({"CGG"})},
{{3, 2, 1, 0}, {2, 2, 0, 0}},
3,
true,
true,
},
};

for (auto t : tests) {
overlapPath got = OverlapHamiltonianPath(wrapper, t.kMers, t.k, t.complements);
overlapPath got = OverlapHamiltonianPath(wrapper, t.kMers, t.k, t.complements, t.lower_bound);
EXPECT_EQ(t.wantResult.first, got.first);
EXPECT_EQ(t.wantResult.second, got.second);
}
Expand Down
31 changes: 19 additions & 12 deletions tests/kmer_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,26 @@

#include <cstdint>
#include "../src/khash_utils.h"
#include "../src/uint256_t/uint256_t.h"


#ifdef LARGE_KMERS
typedef __uint128_t kmer_t;
typedef kmer_dict128_t kh_wrapper;
typedef kh_S128_t kh_S_t;
typedef kh_P128_t kh_P_t;
#else // LARGE_KMERS
typedef uint64_t kmer_t;
typedef kmer_dict64_t kh_wrapper;
typedef kh_S64_t kh_S_t;
typedef kh_P64_t kh_P_t;
#endif // LARGE_KMERS
#ifdef EXTRA_LARGE_KMERS
typedef uint256_t kmer_t;
typedef kmer_dict256_t kh_wrapper;
typedef kh_S256_t kh_S_t;
typedef kh_P256_t kh_P_t;
#else // EXTRA_LARGE_KMERS
#ifdef LARGE_KMERS
typedef __uint128_t kmer_t;
typedef kmer_dict128_t kh_wrapper;
typedef kh_S128_t kh_S_t;
typedef kh_P128_t kh_P_t;
#else // LARGE_KMERS
typedef uint64_t kmer_t;
typedef kmer_dict64_t kh_wrapper;
typedef kh_S64_t kh_S_t;
typedef kh_P64_t kh_P_t;
#endif // LARGE_KMERS
#endif // EXTRA_LARGE_KMERS

// To be passed to functions.
kh_wrapper wrapper;
38 changes: 38 additions & 0 deletions tests/lower_bound_unittest.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#pragma once

#include "kmer_types.h"

#include "../src/lower_bound.h"

#include "gtest/gtest.h"

namespace {
TEST(LowerBound, LowerBoundLength) {
struct TestCase {
std::vector<kmer_t> kMers;
int k;
bool complements;
size_t wantResult;
};
std::vector<TestCase> tests = {
{
{KMerToNumber({"ACG"}), KMerToNumber({"CGT"}), KMerToNumber({"TAA"})},
3, false, 5
},
{
{KMerToNumber({"AC"}), KMerToNumber({"GG"})},
2, true, 3
},
{
{KMerToNumber({"AAACCC"}), KMerToNumber({"CCCAAA"}), KMerToNumber({"TGGGGT"})},
6, false, 11
},
};

for (auto &t : tests) {
auto gotResult = LowerBoundLength(wrapper, t.kMers, t.k, t.complements);

ASSERT_EQ(t.wantResult, gotResult);
}
}
}
1 change: 1 addition & 0 deletions tests/unittest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "local_ac_unittest.h"
#include "streaming_unittest.h"
#include "ac_automaton_unittest.h"
#include "lower_bound_unittest.h"
#include "masks_unittest.h"

#include "gtest/gtest.h"
Expand Down

0 comments on commit 8738872

Please sign in to comment.