From 11350b191ce1e11485b77e118c3fd303233926b7 Mon Sep 17 00:00:00 2001 From: OndrejSladky Date: Thu, 22 Aug 2024 23:36:16 +0200 Subject: [PATCH 1/2] Improved README quality. --- README.md | 150 ++++++++++++++++++++++++++++---------------------- src/README.md | 75 +++++++++++++++++++++++++ src/main.cpp | 6 +- src/version | 2 +- 4 files changed, 165 insertions(+), 68 deletions(-) create mode 100644 src/README.md diff --git a/README.md b/README.md index 9ced772..c78f024 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,26 @@ # KmerCamel🐫 -KmerCamel🐫 provides implementations of algorithms for efficiently representing a set of k-mers as a [masked superstring](https://doi.org/10.1101/2023.02.01.526717), based on the following paper: +[![KmerCamel test](https://github.com/OndrejSladky/kmercamel/actions/workflows/ci.yml/badge.svg)](https://github.com/OndrejSladky/kmercamel/actions/) + + + +* [Introduction](#introduction) +* [Prerequisites](#prerequisites) +* [Getting started](#getting-started) +* [How to use](#how-to-use) +* [How it works](#how-it-works) +* [How to test](#how-to-test) +* [Issues](#issues) +* [Changelog](#changelog) +* [Licence](#licence) +* [Contact](#contact) + + + +## Introduction + +KmerCamel🐫 is a tool for efficiently representing a set of k-mers a [masked superstring](https://doi.org/10.1101/2023.02.01.526717). + +It is based on the following paper: > Ondřej Sladký, Pavel Veselý, and Karel Břinda: Masked superstrings as a unified framework for textual *k*-mer set representations. *bioRxiv* 2023.02.01.526717, 2023. [https://doi.org/10.1101/2023.02.01.526717](https://doi.org/10.1101/2023.02.01.526717) @@ -7,57 +28,67 @@ KmerCamel🐫 provides implementations of algorithms for efficiently representin See [supplementary materials](https://github.com/karel-brinda/masked-superstrings-supplement) of the aforementioned paper for experimental results with KmerCamel🐫. The computation of masked superstring using KmerCamel🐫 is done in two steps - -first a superstring is computed and then its mask can be optimized. - -The implemented superstring algorithms are the following: -- Local Greedy algorithm - constructs the superstring by locally finding and appending an unused k-mer with the largest overlap. -- Global Greedy algorithm - constructs the superstring by merging two k-mers with the largest overlap - -They come in two different implementations (their results may differ due to the differences in used data structures): -- Encoding the k-mers as integers and using fast prefix/suffix equality checks. -- Using the Aho-Corasick automaton. +first a superstring is computed with its default mask and then its mask can be optimized. -Note that at this point only the implementations with hash table are optimized and that the Aho-Corasick automaton -based versions of the algorithms are only experimental. - -The hashing based implementations of KmerCamel🐫 support $k$-mer with $k$ at most 127, +The computation of the masked superstring works as follows. KmerCamel🐫 reads an input FASTA file (optionally `gzip`ed), retrieves the associated k-mers (with supported $k$ up to 127), and outputs +a fasta file with a single record - a masked-cased superstring, which is in the nucleotide alphabet with case of the letters determining the mask symbols. +KmerCamel🐫 implements two different algorithms for computing the superstring: +global greedy and local greedy. Global greedy produces more compact superstrings and therefore is the default option, +but local greedy requires less memory and hence can be more suitable in use cases where memory is the main limitation. All algorithms can be used to either work in the unidirectional model or in the bidirectional model (i.e. treat $k$-mer and its reverse complement as the same; in this case either of them appears in the result). -The implemented mask optimization algorithms are the following: +Additionally, KmerCamel🐫 can optimize the mask of the superstring via the `optimize`subcommand. The implemented mask optimization algorithms are the following: - Minimize the number of 1s in the mask. - Maximize the number of 1s in the mask. - Minimize the number of runs of 1s in the mask. -## How to install +## Prerequisites + +* GCC +* Zlib +* GLPK (can be installed via `apt-get install libglpk-dev` on Ubuntu or `brew install glpk` on macOS) -First clone the repo and its dependency: +## Getting started + +Download and compile KmerCamel🐫 by running the following commands: ``` git clone --recursive https://github.com/OndrejSladky/kmercamel +cd kmercamel && make ``` -Compile the program by running `make`. - - -### Dependencies - -The program requires the GLPK library. This is usually available by default. If not it can be installed via: +## How to use +Computing masked superstrings: ``` -sudo apt-get install libglpk-dev +./kmercamel -p ./spneumoniae.fa -k 31 -c # From a fasta file +./kmercamel -p - -k 31 -c # Read from stdin +./kmercamel -p ./spneumoniae.fa.gz -k 31 -c # From a gzipped fasta file +./kmercamel -p ./spneumoniae.fa -k 127 -c # Largest supported k +./kmercamel -p ./spneumoniae.fa -k 31 -a local -d 5 -c # Use local greedy +./kmercamel -p ./spneumoniae.fa -k 31 -c -o out.fa # Redirect output to a file +./🐫 -p ./spneumoniae.fa -k 31 -c # An alternative if your OS supports it ``` -on Ubuntu or +Optimizing masks: +``` +./kmercamel optimize -p ./masked-superstring.fa -k 31 -a runs -c # Minimize the number of runs of 1s +./kmercamel optimize -p ./masked-superstring.fa -k 31 -a ones -c # Maximize the number of 1s +./kmercamel optimize -p ./masked-superstring.fa -k 31 -a zeros -c # Maximize the number of 0s +./kmercamel optimize -p ./masked-superstring.fa -k 31 -a runapprox -c # Approximately minimize the number of runs of 1s +``` +Compute lower bound on the minimum possible superstring length of a k-mer set: ``` -brew install glpk +./kmercamel -l -p ./spneumoniae.fa -k 31 ``` -on macOS. +Additionally, KmerCamel🐫 experimentally implements both algorithms in their Aho-Corasick automaton versions. To use them, add `AC` to the algorithm name. +Note that they are slower than the original versions, but they can handle arbitrarily large *k*s. -## How to run +### Arguments The program has the following arguments: @@ -68,25 +99,11 @@ The versions with AC use Aho-Corasick automaton. Default `global`. - `-o output_path` - the path to output file. If not specified, output is printed to stdout. - `-d value_of_d` - d_max used in Local Greedy. Default 5. Increasing `d` beyond `k` has no effect. - `-c` - treat k-mer and its reverse complement as equal. +- `-l` - compute lower bound on the superstring length instead of the superstring. - `-m` - turn off memory optimizations for `global`. - `-h` - print help. - `-h` - print version. - -The output contains the resulting superstring - capital letters indicate that at given position, a k-mer starts. - -For example: - -``` -./kmercamel -p ./spneumoniae.fa -a local -k 31 -d 5 -c -``` - -runs the Local Greedy in the bidirectional model on the streptococcus fasta file with `k=31` and `d=5`. - -Alternatively, if your operating system supports it, you can run `./🐫` instead of `./kmercamel`. - -### Mask optimization - For mask optimization, run the subcommand `optimize` with the following arguments: - `p path_to_fasta` - the path to fasta file (can be `gzip`ed). This is a required argument. @@ -97,45 +114,46 @@ For mask optimization, run the subcommand `optimize` with the following argument - `h` - print help. - `v` - print version. -For example: -``` -./kmercamel optimize -p ./global-spneumoniae.fa -k 31 -a runs -c -``` +### Converting k-mer set superstring representation to the (r)SPSS representations -minimizes the number of runs of 1s in the mask of the superstring computed by Global Greedy in the bidirectional model on the streptococcus fasta file with `k=12`. +We provide a Python script for converting any masked superstring to a (r)SPSS representation. +Run `./convert_superstring.py < input.fa`. This runs a Python script which inputs a fasta file with masked-cased superstring and outputs the (r)SPSS representation. -### Turn off memory optimizations for Global +## How it works -In order to reduce the memory footprint of hash-table based Global Greedy, -it uses several optimizations that reduce memory but increase the running time. -If memory is not the bottleneck, turning off the memory optimizations might be desirable. -This in practice about halves the running time. +For details about the algorithms and their implementation, see the [Code README](./src/README.md). +## How to test -## Converting k-mer set superstring representation to the (r)SPSS representations +To ensure correctness of the results, KmerCamel🐫 has two levels of tests - unit tests and file-specific integration tests. -Run `./convert_superstring.py < input.fa`. This runs a Python script which inputs the superstring masked representation and outputs the (r)SPSS representation. +For integration tests install [jellyfish (v2)](https://github.com/gmarcais/Jellyfish) +and add it to PATH. -## How to test +You can verify all the algorithms for `1 < k < 128` on a *S. pneumoniae* by running `make verify`. +To run it on another dataset, see the [verification script](./verify.py). +You can run the C++ unittests by `make cpptest`. -For integration tests you'll have to install [jellyfish (v2)](https://github.com/gmarcais/Jellyfish) -and add it to PATH. +To run all the test, simply run `make test`. -You can verify all the algorithms for `1 < k < 32` on a given fasta file by running: +## Issues -``` -make verify -``` +Please use [Github issues](https://github.com/OndrejSladky/kmercamel/issues). -You can run the c++ unittests by `make cpptest`. -Similarly testing the convert script can be done via `make converttest`. +## Changelog + +See [Releases](https://github.com/OndrejSladky/kmercamel/releases). + + +## Licence + +[MIT](https://github.com/OndrejSladky/kmercamel/blob/master/LICENSE.txt) -To run all the test, simply run `make test`. ## Contact -You can contact the developer at [sladky@iuuk.mff.cuni.cz](mailto:sladky@iuuk.mff.cuni.cz). +[Ondrej Sladky](https://iuuk.mff.cuni.cz/~sladky/) \\ diff --git a/src/README.md b/src/README.md new file mode 100644 index 0000000..57be1f2 --- /dev/null +++ b/src/README.md @@ -0,0 +1,75 @@ +# KmerCamel🐫 internals + +## Parsing FASTA files and storing k-mers + +To parse FASTA files, we use the `kseq.h` library. To support large sequences, we use the version from [seqtk](https://github.com/lh3/seqtk/blob/master/kseq.h). +We represent *k*-mers as integers, where the size of the integer is selected depending on *k* without any need to recompile. +We use 64bit integers, 128bit integers from GCC and 256bit integers implemented in `uint256_t` folder. +To achieve this while keeping high performance, we use C++ templates and, where needed, C macros. +Efficient operations on *k*-mers are implemented in the `kmer.h` file. +*k*-mers are stored in a `khash.h` hash table. We modified the original version to internally use 64bit integers to support very large *k*-mer sets and also use Wang hash instead of the default one. +We implement wrapper operations over `khash.h` in `khash_utils.h` and the *k*-mer parser in `parser.h`. + +## Global greedy + +The global greedy algorithm in its core works in the unidirectional model as follows: +```python +def global_greedy(K): + while len(K) > 1: + a, b = two_most_overlapping(K) + K = K + {merge(a, b)} - {a, b} + return K.first() +``` + +In the bidirectional model, we additionally always merge the reverse complements of `a` and `b`, while never merging two reverse complementary strings. + +To achieve high performance, the problem is split into two parts. In the first, we for each *k*-mer compute its successor +without actually merging them, and we obtain the masked superstring in the second part. +To quickly find the two most overlapping *k*-mers, we, starting from the largest overlap length, +create a map of prefixes to *k*-mers and then iterate over the suffixes. +Since this map is quite memory demanding, we store at each time only a part of the *k*-mers and repeat the process that many times (which can be turned off). +In order for this not to be as time-consuming, we first partially sort the *k*-mers using bucket sort. + +The global greedy is implemented in the `global.h` file. + +## Local greedy + +The local greedy in its core works as follows: +```python +def local_greedy(K): + result = "" + while len(K) > 0: + current = K.first() + K = K - {current} + while largest_overlap(K, current) >= k - d_max: + a = most_overlapping(K, current) + current = merge(a, current) + K = K - {a} + result = result + current + return result +``` + +The *k*-mers with largest overlap are found simply by iterating over all possible extensions of length up to `d_max`. + +The local greedy is implemented in the `local.h` file. + +## Mask optimization + +We implement three different mask optimization algorithms: +- Minimizing the number of 1s in the mask. +- Maximizing the number of 1s in the mask. +- Minimizing the number of runs of 1s in the mask. + +Minimization/maximization of 1s is implemented by a simple two pass algorithm, where in the first pass *k*-mers +are loaded and in the second, the mask is masked at all positions or at the first position respectively. + +Minimization of runs of 1s is done in three steps. First, the intervals of consecutive *k*-mers which can be masked to 1 are obtained. +Second, we set intervals with *k*-mers not appearing elsewhere to 1 and after all intervals with resolved *k*-mers to 0. +This step is important as it significantly reduces the problem size. +Finally, e pass the rest to a GLPK ILP solver to minimize the number of 1s in the mask. + +The mask optimization is implemented in the `masks.h` file. + +## Aho-Corasick-based versions and other experimental algorithms + +All the experimental algorithms are in the folder `ac`. \ No newline at end of file diff --git a/src/main.cpp b/src/main.cpp index 05d442d..6d4da2f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -120,7 +120,7 @@ INIT_KMERCAMEL(128) INIT_KMERCAMEL(256) int main(int argc, char **argv) { - std::string path; + std::string path = ""; int k = 0; int d_max = 5; std::ofstream output; @@ -142,6 +142,10 @@ int main(int argc, char **argv) { while ((opt = getopt(argc, argv, "p:k:d:a:o:hcvml")) != -1) { switch(opt) { case 'p': + if (path != "") { + std::cerr << "Error: parameter p set twice." << std::endl; + return Help(); + } path = optarg; break; case 'o': diff --git a/src/version b/src/version index 085135e..6b3126c 100644 --- a/src/version +++ b/src/version @@ -1 +1 @@ -v0.1 +v1.0 From 0380d588463b8b64c06e089f8e0ef8384af5f694 Mon Sep 17 00:00:00 2001 From: OndrejSladky Date: Thu, 22 Aug 2024 23:43:36 +0200 Subject: [PATCH 2/2] Removed unnecessary macro. --- src/khash_utils.h | 3 ++ src/main.cpp | 132 ++++++++++++++++++++++------------------------ 2 files changed, 66 insertions(+), 69 deletions(-) diff --git a/src/khash_utils.h b/src/khash_utils.h index 0f0e1f9..b4cb124 100644 --- a/src/khash_utils.h +++ b/src/khash_utils.h @@ -48,6 +48,9 @@ KHASH_MAP_INIT_INT64(P64, size_t) inline void kh_del_from_set(kh_S##type##_t *set, khint_t key) { \ kh_del_S##type(set, key); \ } \ + inline void kh_destroy_set(kh_S##type##_t *set) { \ + kh_destroy_S##type(set); \ + } \ inline kh_P##type##_t *kh_init_map() { \ return kh_init_P##type(); \ } \ diff --git a/src/main.cpp b/src/main.cpp index 6d4da2f..7cf3be0 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -53,74 +53,68 @@ void Version() { std::cerr << VERSION << std::endl; } -// This macro cannot be avoided without bloating the code too much. -#define INIT_KMERCAMEL(type) \ -int kmercamel##type(std::string path, int k, int d_max, std::ostream *of, bool complements, bool masks, \ - std::string algorithm, bool optimize_memory, bool lower_bound) { \ - kmer_dict##type##_t wrapper; \ - kmer##type##_t kmer_type = 0; \ - if (masks) { \ - int ret = Optimize(wrapper, kmer_type, algorithm, path, *of, k, complements); \ - if (ret) Help(); \ - return ret; \ - } \ - \ - /* Handle streaming algorithm separately. */ \ - if (algorithm == "streaming") { \ - WriteName(k, *of); \ - Streaming(path, *of, k , complements); \ - } \ - /* Handle hash table based algorithms separately so that they consume less memory. */ \ - else if (algorithm == "global" || algorithm == "local") { \ - auto *kMers = kh_init_S##type(); \ - ReadKMers(kMers, wrapper, kmer_type, path, k, complements); \ - if (!kh_size(kMers)) { \ - std::cerr << "Path '" << path << "' contains no k-mers." << std::endl; \ - return Help(); \ - } \ - d_max = std::min(k - 1, d_max); \ - if (!lower_bound) WriteName(k, *of); \ - if (algorithm == "global") { \ - auto kMerVec = kMersToVec(kMers, kmer_type); \ - kh_destroy_S##type(kMers); \ - /* Turn off the memory optimizations if optimize_memory is set to false. */ \ - if(optimize_memory) PartialPreSort(kMerVec, k); \ - else MEMORY_REDUCTION_FACTOR = 1; \ - if (lower_bound) std::cout << LowerBoundLength(wrapper, kMerVec, k, complements); \ - else Global(wrapper, kMerVec, *of, k, complements); \ - } \ - else Local(kMers, wrapper, kmer_type, *of, k, d_max, complements); \ - } else { \ - auto data = ReadFasta(path); \ - if (data.empty()) { \ - std::cerr << "Path '" << path << "' not to a fasta file." << std::endl; \ - return Help(); \ - } \ - d_max = std::min(k - 1, d_max); \ - \ - auto kMers = ConstructKMers(data, k, complements); \ - WriteName(k, *of); \ - if (algorithm == "globalAC") { \ - GlobalAC(kMers, *of, complements); \ - } \ - else if (algorithm == "localAC") { \ - LocalAC(kMers, *of, k, d_max, complements); \ - } \ - else { \ - std::cerr << "Algorithm '" << algorithm << "' not supported." << std::endl; \ - return Help(); \ - } \ - } \ - *of << std::endl; \ - return 0; \ -} +/// Run KmerCamel with the given parameters. +template +int kmercamel(kh_wrapper_t wrapper, kmer_t kmer_type, std::string path, int k, int d_max, std::ostream *of, bool complements, bool masks, + std::string algorithm, bool optimize_memory, bool lower_bound) { + if (masks) { + int ret = Optimize(wrapper, kmer_type, algorithm, path, *of, k, complements); + if (ret) Help(); + return ret; + } -INIT_KMERCAMEL(64) -INIT_KMERCAMEL(128) -INIT_KMERCAMEL(256) + /* Handle streaming algorithm separately. */ + if (algorithm == "streaming") { + WriteName(k, *of); + Streaming(path, *of, k , complements); + } + /* Handle hash table based algorithms separately so that they consume less memory. */ + else if (algorithm == "global" || algorithm == "local") { + auto *kMers = wrapper.kh_init_set(); + ReadKMers(kMers, wrapper, kmer_type, path, k, complements); + if (!kh_size(kMers)) { + std::cerr << "Path '" << path << "' contains no k-mers." << std::endl; + return Help(); + } + d_max = std::min(k - 1, d_max); + if (!lower_bound) WriteName(k, *of); + if (algorithm == "global") { + auto kMerVec = kMersToVec(kMers, kmer_type); + wrapper.kh_destroy_set(kMers); + /* Turn off the memory optimizations if optimize_memory is set to false. */ + if(optimize_memory) PartialPreSort(kMerVec, k); + else MEMORY_REDUCTION_FACTOR = 1; + if (lower_bound) std::cout << LowerBoundLength(wrapper, kMerVec, k, complements); + else Global(wrapper, kMerVec, *of, k, complements); + } + else Local(kMers, wrapper, kmer_type, *of, k, d_max, complements); + } else { + auto data = ReadFasta(path); + if (data.empty()) { + std::cerr << "Path '" << path << "' not to a fasta file." << std::endl; + return Help(); + } + d_max = std::min(k - 1, d_max); + + auto kMers = ConstructKMers(data, k, complements); + WriteName(k, *of); + if (algorithm == "globalAC") { + GlobalAC(kMers, *of, complements); + } + else if (algorithm == "localAC") { + LocalAC(kMers, *of, k, d_max, complements); + } + else { + std::cerr << "Algorithm '" << algorithm << "' not supported." << std::endl; + return Help(); + } + } + *of << std::endl; + return 0; +} int main(int argc, char **argv) { - std::string path = ""; + std::string path; int k = 0; int d_max = 5; std::ofstream output; @@ -142,7 +136,7 @@ int main(int argc, char **argv) { while ((opt = getopt(argc, argv, "p:k:d:a:o:hcvml")) != -1) { switch(opt) { case 'p': - if (path != "") { + if (!path.empty()) { std::cerr << "Error: parameter p set twice." << std::endl; return Help(); } @@ -218,10 +212,10 @@ int main(int argc, char **argv) { return Help(); } if (k < 32) { - return kmercamel64(path, k, d_max, of, complements, masks, algorithm, optimize_memory, lower_bound); + return kmercamel(kmer_dict64_t(), kmer64_t(0), path, k, d_max, of, complements, masks, algorithm, optimize_memory, lower_bound); } else if (k < 64) { - return kmercamel128(path, k, d_max, of, complements, masks, algorithm, optimize_memory, lower_bound); + return kmercamel(kmer_dict128_t(), kmer128_t(0), path, k, d_max, of, complements, masks, algorithm, optimize_memory, lower_bound); } else { - return kmercamel256(path, k, d_max, of, complements, masks, algorithm, optimize_memory, lower_bound); + return kmercamel(kmer_dict256_t(), kmer256_t(0), path, k, d_max, of, complements, masks, algorithm, optimize_memory, lower_bound); } }