diff --git a/.gitignore b/.gitignore index 6c15039..ca070bf 100644 --- a/.gitignore +++ b/.gitignore @@ -6,6 +6,7 @@ kmercamel kmercameltest kmercamel-large kmercameltest-large +kmercameltest-extra-large src/version.h 🐫 bin/ diff --git a/Makefile b/Makefile index df6d615..3a74bf3 100644 --- a/Makefile +++ b/Makefile @@ -4,6 +4,7 @@ CXX= g++ CXXFLAGS= -g -Wall -Wno-unused-function -std=c++17 -O2 -I/opt/homebrew/include LDFLAGS= -lz -lglpk -L/opt/homebrew/lib LARGEFLAGS= -DLARGE_KMERS +ELARGEFLAGS= -DEXTRA_LARGE_KMERS SRC= src TESTS= tests GTEST= $(TESTS)/googletest/googletest @@ -22,9 +23,10 @@ quick-verify: verify.py kmercamel ./verify.py --quick $(DATA)/spneumoniae.fa ./verify.py --k 13 --superstring_path $(DATA)/global-k13c.fa $(DATA)/spneumoniae.fa -cpptest: kmercameltest kmercameltest-large +cpptest: kmercameltest kmercameltest-large kmercameltest-extra-large ./kmercameltest ./kmercameltest-large + ./kmercameltest-extra-large converttest: convert_superstring_unittest.py ./convert_superstring_unittest.py @@ -40,6 +42,9 @@ kmercameltest: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.h kmercameltest-large: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp) $(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS) $(LARGEFLAGS) +kmercameltest-extra-large: $(TESTS)/unittest.cpp gtest-all.o $(SRC)/$(wildcard *.cpp *.h *.hpp) $(TESTS)/$(wildcard *.cpp *.h *.hpp) + $(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include $(TESTS)/unittest.cpp gtest-all.o -pthread -o $@ $(LDFLAGS) $(ELARGEFLAGS) + gtest-all.o: $(GTEST)/src/gtest-all.cc $(wildcard *.cpp *.h *.hpp) $(CXX) $(CXXFLAGS) -isystem $(GTEST)/include -I $(GTEST)/include -I $(GTEST) -DGTEST_CREATE_SHARED_LIBRARY=1 -c -pthread $(GTEST)/src/gtest-all.cc -o $@ diff --git a/src/global.h b/src/global.h index 15d782a..192e2bf 100644 --- a/src/global.h +++ b/src/global.h @@ -57,8 +57,10 @@ void PartialPreSort(std::vector &vals, int k) { /// If complements are provided, treat k-mer and its complement as identical. /// If this is the case, k-mers are expected to contain only one k-mer from a complement pair. /// Moreover, if so, the resulting Hamiltonian path contains two superstrings which are reverse complements of one another. +/// If lower_bound is set to true, return a shortest cycle cover instead. template -overlapPath OverlapHamiltonianPath (kh_wrapper_t wrapper, std::vector &kMers, int k, bool complements) { +overlapPath OverlapHamiltonianPath (kh_wrapper_t wrapper, std::vector &kMers, int k, bool complements, + bool lower_bound = false) { size_t n = kMers.size(); size_t kMersCount = n * (1 + complements); size_t batchSize = kMersCount / MEMORY_REDUCTION_FACTOR + 1; @@ -106,10 +108,12 @@ overlapPath OverlapHamiltonianPath (kh_wrapper_t wrapper, std::vector &k if (suffix_key == kh_end(prefixes)) continue; size_t previous, j; previous = j = kh_val(prefixes, suffix_key); - // If the path forms a cycle, or is between k-mer and its reverse complement, or the k-mers complement was already selected skip this path. while (j != size_t(-1) && \ - (accessFirstLast(first, last, i, n) % n == j % n \ - || accessFirstLast(first, last, i, n) % n == accessFirstLast(last, first, j, n) % n \ + // k-mers are complementary + ((i + n) % (2 * n) == j \ + // forms a cycle + || (!lower_bound && accessFirstLast(first, last, i, n) == j) \ + // k-mer is already used || prefixForbidden[j])) { size_t new_j = next[j - from]; // If the k-mer is forbidden, remove it to keep the complexity linear. diff --git a/src/lower_bound.h b/src/lower_bound.h new file mode 100644 index 0000000..92093bc --- /dev/null +++ b/src/lower_bound.h @@ -0,0 +1,15 @@ +#pragma once + +#include "global.h" +#include "kmers.h" + +/// Return the length of the cycle cover which lower bounds the superstring length. +template +size_t LowerBoundLength(kh_wrapper_t wrapper, std::vector kMers, int k, bool complements) { + auto cycle_cover = OverlapHamiltonianPath(wrapper, kMers, k, complements, true); + size_t res = 0; + for (auto &overlap : cycle_cover.second) { + res += size_t(k) - size_t(overlap); + } + return res / (1 + complements); +} \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 53f49f5..05d442d 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -11,7 +11,13 @@ #include "ac/parser_ac.h" #include "ac/streaming.h" #include "khash_utils.h" + +#include +#include +#include "unistd.h" +#include "version.h" #include "masks.h" +#include "lower_bound.h" int Help() { std::cerr << "KmerCamel version " << VERSION << std::endl; @@ -23,6 +29,7 @@ int Help() { std::cerr << " -d d_value - integer value for d_max; default 5" << std::endl; std::cerr << " -c - treat k-mer and its reverse complement as equal" << std::endl; std::cerr << " -m - turn off the memory optimizations for global" << std::endl; + std::cerr << " -l - compute the cycle cover lower bound instead of masked superstring" << std::endl; std::cerr << " -h - print help" << std::endl; std::cerr << " -v - print version" << std::endl; std::cerr << "Example usage: ./kmercamel -p path_to_fasta -k 31 -d 5 -a local -c" << std::endl; @@ -48,8 +55,9 @@ void Version() { // This macro cannot be avoided without bloating the code too much. #define INIT_KMERCAMEL(type) \ -int kmercamel##type(std::string path, int k, int d_max, std::ostream *of, bool complements, bool masks, std::string algorithm, bool optimize_memory) { \ - kmer_dict##type##_t wrapper; \ +int kmercamel##type(std::string path, int k, int d_max, std::ostream *of, bool complements, bool masks, \ + std::string algorithm, bool optimize_memory, bool lower_bound) { \ + kmer_dict##type##_t wrapper; \ kmer##type##_t kmer_type = 0; \ if (masks) { \ int ret = Optimize(wrapper, kmer_type, algorithm, path, *of, k, complements); \ @@ -71,14 +79,15 @@ int kmercamel##type(std::string path, int k, int d_max, std::ostream *of, bool c return Help(); \ } \ d_max = std::min(k - 1, d_max); \ - WriteName(k, *of); \ + if (!lower_bound) WriteName(k, *of); \ if (algorithm == "global") { \ auto kMerVec = kMersToVec(kMers, kmer_type); \ kh_destroy_S##type(kMers); \ /* Turn off the memory optimizations if optimize_memory is set to false. */ \ if(optimize_memory) PartialPreSort(kMerVec, k); \ else MEMORY_REDUCTION_FACTOR = 1; \ - Global(wrapper, kMerVec, *of, k, complements); \ + if (lower_bound) std::cout << LowerBoundLength(wrapper, kMerVec, k, complements); \ + else Global(wrapper, kMerVec, *of, k, complements); \ } \ else Local(kMers, wrapper, kmer_type, *of, k, d_max, complements); \ } else { \ @@ -127,9 +136,10 @@ int main(int argc, char **argv) { bool complements = false; bool optimize_memory = true; bool d_set = false; + bool lower_bound = false; int opt; try { - while ((opt = getopt(argc, argv, "p:k:d:a:o:hcvm")) != -1) { + while ((opt = getopt(argc, argv, "p:k:d:a:o:hcvml")) != -1) { switch(opt) { case 'p': path = optarg; @@ -159,6 +169,9 @@ int main(int argc, char **argv) { case 'm': optimize_memory = false; break; + case 'l': + lower_bound = true; + break; case 'v': Version(); return 0; @@ -196,12 +209,15 @@ int main(int argc, char **argv) { } else if (masks && (d_set || !optimize_memory)) { std::cerr << "Not supported flags for optimize." << std::endl; return Help(); + } else if (lower_bound && algorithm != "global") { + std::cerr << "Lower bound computation supported only for hash table global." << std::endl; + return Help(); } if (k < 32) { - return kmercamel64(path, k, d_max, of, complements, masks, algorithm, optimize_memory); + return kmercamel64(path, k, d_max, of, complements, masks, algorithm, optimize_memory, lower_bound); } else if (k < 64) { - return kmercamel128(path, k, d_max, of, complements, masks, algorithm, optimize_memory); + return kmercamel128(path, k, d_max, of, complements, masks, algorithm, optimize_memory, lower_bound); } else { - return kmercamel256(path, k, d_max, of, complements, masks, algorithm, optimize_memory); + return kmercamel256(path, k, d_max, of, complements, masks, algorithm, optimize_memory, lower_bound); } } diff --git a/src/uint256_t/uint256_t.h b/src/uint256_t/uint256_t.h index 1f3ddae..79baab7 100644 --- a/src/uint256_t/uint256_t.h +++ b/src/uint256_t/uint256_t.h @@ -1,3 +1,4 @@ +#pragma once #ifndef __LITTLE_ENDIAN__ #ifndef __BIG_ENDIAN__ #define __LITTLE_ENDIAN__ 1 diff --git a/tests/global_unittest.h b/tests/global_unittest.h index 640d5a7..169d53f 100644 --- a/tests/global_unittest.h +++ b/tests/global_unittest.h @@ -80,6 +80,7 @@ namespace { overlapPath wantResult; int k; bool complements; + bool lower_bound; }; std::vector tests = { { @@ -87,23 +88,40 @@ namespace { {{(size_t)-1, (size_t)-1}, {(byte)-1, (byte)-1}}, 2, true, + false, }, { {KMerToNumber({"ACG"}), KMerToNumber({"TAC"}), KMerToNumber({"GGC"})}, {{2, 0, (size_t)-1}, {1, 2, (byte)-1}}, 3, false, + false, }, { {KMerToNumber({"ACAA"}), KMerToNumber({"ATTT"}), KMerToNumber({"AACA"})}, {{4, 3, 0, 5, (size_t)-1, (size_t)-1}, {2, 2, 3, 3, (byte)-1, (byte)-1}}, 4, true, + false, + }, + { + {KMerToNumber({"ACG"}), KMerToNumber({"CGT"}), KMerToNumber({"TAA"})}, + {{1, 2, 0}, {2, 1, 1}}, + 3, + false, + true, + }, + { + {KMerToNumber({"ACC"}), KMerToNumber({"CGG"})}, + {{3, 2, 1, 0}, {2, 2, 0, 0}}, + 3, + true, + true, }, }; for (auto t : tests) { - overlapPath got = OverlapHamiltonianPath(wrapper, t.kMers, t.k, t.complements); + overlapPath got = OverlapHamiltonianPath(wrapper, t.kMers, t.k, t.complements, t.lower_bound); EXPECT_EQ(t.wantResult.first, got.first); EXPECT_EQ(t.wantResult.second, got.second); } diff --git a/tests/kmer_types.h b/tests/kmer_types.h index b699d4f..1f0c2fc 100644 --- a/tests/kmer_types.h +++ b/tests/kmer_types.h @@ -2,19 +2,26 @@ #include #include "../src/khash_utils.h" +#include "../src/uint256_t/uint256_t.h" - -#ifdef LARGE_KMERS - typedef __uint128_t kmer_t; - typedef kmer_dict128_t kh_wrapper; - typedef kh_S128_t kh_S_t; - typedef kh_P128_t kh_P_t; -#else // LARGE_KMERS - typedef uint64_t kmer_t; - typedef kmer_dict64_t kh_wrapper; - typedef kh_S64_t kh_S_t; - typedef kh_P64_t kh_P_t; -#endif // LARGE_KMERS +#ifdef EXTRA_LARGE_KMERS + typedef uint256_t kmer_t; + typedef kmer_dict256_t kh_wrapper; + typedef kh_S256_t kh_S_t; + typedef kh_P256_t kh_P_t; +#else // EXTRA_LARGE_KMERS + #ifdef LARGE_KMERS + typedef __uint128_t kmer_t; + typedef kmer_dict128_t kh_wrapper; + typedef kh_S128_t kh_S_t; + typedef kh_P128_t kh_P_t; + #else // LARGE_KMERS + typedef uint64_t kmer_t; + typedef kmer_dict64_t kh_wrapper; + typedef kh_S64_t kh_S_t; + typedef kh_P64_t kh_P_t; + #endif // LARGE_KMERS +#endif // EXTRA_LARGE_KMERS // To be passed to functions. kh_wrapper wrapper; \ No newline at end of file diff --git a/tests/lower_bound_unittest.h b/tests/lower_bound_unittest.h new file mode 100644 index 0000000..b9b7469 --- /dev/null +++ b/tests/lower_bound_unittest.h @@ -0,0 +1,38 @@ +#pragma once + +#include "kmer_types.h" + +#include "../src/lower_bound.h" + +#include "gtest/gtest.h" + +namespace { + TEST(LowerBound, LowerBoundLength) { + struct TestCase { + std::vector kMers; + int k; + bool complements; + size_t wantResult; + }; + std::vector tests = { + { + {KMerToNumber({"ACG"}), KMerToNumber({"CGT"}), KMerToNumber({"TAA"})}, + 3, false, 5 + }, + { + {KMerToNumber({"AC"}), KMerToNumber({"GG"})}, + 2, true, 3 + }, + { + {KMerToNumber({"AAACCC"}), KMerToNumber({"CCCAAA"}), KMerToNumber({"TGGGGT"})}, + 6, false, 11 + }, + }; + + for (auto &t : tests) { + auto gotResult = LowerBoundLength(wrapper, t.kMers, t.k, t.complements); + + ASSERT_EQ(t.wantResult, gotResult); + } + } +} \ No newline at end of file diff --git a/tests/unittest.cpp b/tests/unittest.cpp index ce2c882..8737e59 100644 --- a/tests/unittest.cpp +++ b/tests/unittest.cpp @@ -6,6 +6,7 @@ #include "local_ac_unittest.h" #include "streaming_unittest.h" #include "ac_automaton_unittest.h" +#include "lower_bound_unittest.h" #include "masks_unittest.h" #include "gtest/gtest.h"