-
Notifications
You must be signed in to change notification settings - Fork 1
/
tandem.cpp
executable file
·150 lines (124 loc) · 4.67 KB
/
tandem.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
/* tandem.cpp
Copyright (C) 2017-2018 University of Oxford.
Author: Daniel Cooke <[email protected]>
Use of this source code is governed by the MIT license that can be found in the LICENSE file. */
#include "tandem.hpp"
#include <stack>
namespace tandem {
std::vector<std::uint32_t> make_rank_array(const std::vector<std::uint32_t>& suffix_array)
{
std::vector<std::uint32_t> result(suffix_array.size());
for (std::uint32_t i {0}; i < suffix_array.size(); ++i) {
result[suffix_array[i]] = i;
}
return result;
}
std::vector<std::uint32_t> make_rank_array(const std::vector<std::uint32_t>& suffix_array,
const std::size_t extra_capacity)
{
std::vector<std::uint32_t> result(suffix_array.size() - extra_capacity);
for (std::uint32_t i {0}; i < (suffix_array.size() - extra_capacity); ++i) {
result[suffix_array[i]] = i;
}
return result;
}
// Implementation of algorithm found in Crochemore et al. (2008)
std::vector<std::uint32_t> make_lpf_array(std::vector<std::uint32_t> sa, std::vector<std::uint32_t> lcp)
{
const auto n = sa.size();
std::vector<std::uint32_t> result(n, 0);
static constexpr auto sentinal = std::numeric_limits<std::uint32_t>::max();
sa.push_back(sentinal);
lcp.push_back(0);
std::stack<std::uint32_t> stack {};
stack.push(0);
for (std::uint32_t i {1}; i <= n; ++i) {
while (!stack.empty()
&& (sa[i] == sentinal || sa[i] < sa[stack.top()]
|| (sa[i] > sa[stack.top()] && lcp[i] <= lcp[stack.top()]))) {
if (sa[i] < sa[stack.top()]) {
std::tie(lcp[i], result[sa[stack.top()]]) = std::minmax(lcp[stack.top()], std::uint32_t {lcp[i]});
} else {
result[sa[stack.top()]] = lcp[stack.top()];
}
stack.pop();
}
if (i < n) {
stack.push(i);
}
}
return result;
}
// Implementation of algorithm found in Crochemore et al. (2008)
std::pair<std::vector<std::uint32_t>, std::vector<std::uint32_t>>
make_lpf_and_prev_occ_arrays(std::vector<std::uint32_t> sa, std::vector<std::uint32_t> lcp)
{
const auto n = sa.size();
std::vector<std::uint32_t> lpf(n, 0), prev_occ(n, 0);
static constexpr auto sentinal = std::numeric_limits<std::uint32_t>::max();
sa.push_back(sentinal);
lcp.push_back(0);
std::stack<std::pair<std::uint32_t, std::uint32_t>> stack {}; // default std::stack uses std::deque
stack.emplace(0, sa[0]);
for (std::uint32_t i {1}; i <= n; ++i) {
auto u = lcp[i];
while (!stack.empty() && (sa[i] == sentinal || sa[i] < stack.top().second)) {
std::tie(u, lpf[stack.top().second]) = std::minmax(stack.top().first, std::uint32_t {u});
const auto v = stack.top();
stack.pop();
if (lpf[v.second] == 0) {
prev_occ[v.second] = sentinal;
} else if (v.first > u) {
prev_occ[v.second] = stack.top().second;
} else {
prev_occ[v.second] = sa[i];
}
}
if (i < n) {
stack.emplace(u, sa[i]);
}
}
return std::make_pair(std::move(lpf), std::move(prev_occ));
}
namespace detail {
std::vector<std::vector<Repeat>>
get_init_buckets(const std::size_t n, const LMRVector& lmrs)
{
std::vector<std::uint32_t> counts(n, 0);
for (const auto& run : lmrs) {
++counts[run.pos + run.length - 1];
}
std::vector<std::vector<Repeat>> result(n, std::vector<Repeat> {});
for (std::size_t i {0}; i < n; ++i) {
result[i].reserve(counts[i]);
}
return result;
}
std::vector<std::vector<Repeat>>
get_init_buckets(const std::size_t n, const std::vector<std::vector<Repeat>>& end_buckets)
{
std::vector<std::uint32_t> counts(n, 0);
for (const auto& bucket : end_buckets) {
for (const auto& run : bucket) {
++counts[run.pos];
}
}
std::vector<std::vector<Repeat>> result(n, std::vector<Repeat> {});
for (std::size_t i {0}; i < n; ++i) {
result[i].reserve(counts[i]);
}
return result;
}
} // namespace detail
void rebase(std::vector<tandem::Repeat>& runs, const std::map<std::size_t, std::size_t>& shift_map)
{
if (shift_map.empty()) return;
auto shift_map_it = std::cbegin(shift_map);
for (auto& run : runs) {
while (std::next(shift_map_it) != std::cend(shift_map) && std::next(shift_map_it)->first <= run.pos) {
++shift_map_it;
}
run.pos += static_cast<decltype(run.pos)>(shift_map_it->second);
}
}
} // namespace tandem