From a335429503055975e1be8d38b416797fe7649c2e Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 21 Oct 2024 16:49:30 -0400 Subject: [PATCH 1/4] Make padded window size parity match anchor parity for orientation independence --- src/surjector.cpp | 48 ++++++++++++++++++++++++++++++++++++++++------- src/surjector.hpp | 4 ++++ 2 files changed, 45 insertions(+), 7 deletions(-) diff --git a/src/surjector.cpp b/src/surjector.cpp index be0a2e067d..9c510bde9a 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -4038,7 +4038,7 @@ using namespace std; for (int i = 0; i < path_chunks.size(); ++i) { auto& chunk = path_chunks[i]; // Mark anchors that are themselves suspicious as not to be kept. - if (chunk.first.first == path_chunks.front().first.first && chunk.first.second == path_chunks.back().first.second + if (chunk.first.first == path_chunks.front().first.first && chunk.first.second == path_chunks.back().first.second // TODO: shouldn't this be || if we want to match *either* tail? && (anchor_lengths[i] <= max_tail_anchor_prune || chunk.first.second - chunk.first.first <= max_tail_anchor_prune)) { #ifdef debug_anchored_surject cerr << "anchor " << i << " (read interval " << (chunk.first.first - sequence.begin()) << " : " << (chunk.first.second - sequence.begin()) << ") pruned for being a short tail" << endl; @@ -4050,19 +4050,53 @@ using namespace std; if ((anchor_lengths[i] <= max_low_complexity_anchor_prune || chunk.first.second - chunk.first.first <= max_low_complexity_anchor_prune)) { SeqComplexity<6> chunk_complexity(chunk.first.first, chunk.first.second); if (chunk.first.second - chunk.first.first < pad_suspicious_anchors_to_length) { - auto read_context_begin = max(sequence.begin(), chunk.first.first - (pad_suspicious_anchors_to_length - (chunk.first.second - chunk.first.first)) / 2); - auto read_context_end = min(sequence.end(), read_context_begin + pad_suspicious_anchors_to_length); - if (read_context_end == sequence.end()) { - // try to ensure enough bases if we're near the end of the read - read_context_begin = max(sequence.begin(), read_context_end - pad_suspicious_anchors_to_length); + // We need to fetch out a sequence at least pad_suspicious_anchors_to_length bp long (unless the whole sequence is shorter) and analyze that. + + // There's no way to symmetrically (and thus + // independently of orientation) get an even-length + // window around an odd-length window, or visa versa. + // So express everything as per-side padding, and round it up. + size_t chunk_length = chunk.first.second - chunk.first.first; + size_t padding_per_side = (pad_suspicious_anchors_to_length - chunk_length + 1) / 2; + + // Pad separately on each side to avoid read bounds + size_t left_padding = padding_per_side; + size_t right_padding = padding_per_side; + + // Shift the padded window right if we hit the start + size_t start_offset = chunk.first.first - sequence.begin(); + if (start_offset < left_padding) { + right_padding += (left_padding - start_offset); + left_padding = start_offset; } + + // Shift the padded window left if we hit the end + size_t remaining_until_end = sequence.end() - chunk.first.second; + if (remaining_until_end < right_padding) { + left_padding += (right_padding - remaining_until_end); + right_padding = remaining_until_end; + } + + // If we hit the start again, clip since the whole window doesn't fit. + left_padding = min(left_padding, start_offset); + + // TODO: Is there a more closed-form way to budge the padding? + + // Now expand the iterator range without ever constructing pre-begin iterators + auto read_context_begin = chunk.first.first - left_padding; + auto read_context_end = chunk.first.second + right_padding; + +#ifdef debug_anchored_surject + std::cerr << "For read interval " << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << " length " << (chunk.first.second - chunk.first.first) << " use context " << (read_context_begin - sequence.begin()) << ":" << (read_context_end - sequence.begin()) << " length " << (read_context_end - read_context_begin) << std::endl; +#endif + SeqComplexity<6> context_complexity(read_context_begin, read_context_end); // TODO: repetitive for (int order = 1, max_order = 6; order <= max_order; ++order) { //cerr << "padded anchor " << i << " (read[" << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << "]), seq " << string(read_context_begin, read_context_end) << ", order " << order << " with p-value " << context_complexity.p_value(order) << ", repetitive fraction " << chunk_complexity.repetitiveness(order) << endl; if (context_complexity.p_value(order) < low_complexity_p_value) { #ifdef debug_anchored_surject - cerr << "anchor " << i << " (read[" << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << "]) pruned being for having context with low complexity at order " << order << ", p-value " << context_complexity.p_value(order) << " and anchor repetitive fraction " << chunk_complexity.repetitiveness(order) << endl; + cerr << "anchor " << i << " (read[" << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << "]) pruned for having context with low complexity at order " << order << ", p-value " << context_complexity.p_value(order) << " and anchor repetitive fraction " << chunk_complexity.repetitiveness(order) << endl; #endif // the sequences is repetitive at this order keep[i] = false; diff --git a/src/surjector.hpp b/src/surjector.hpp index a582df6412..84bacead64 100644 --- a/src/surjector.hpp +++ b/src/surjector.hpp @@ -132,6 +132,10 @@ using namespace std; double low_complexity_p_value = .0075; int64_t max_low_complexity_anchor_prune = 40; int64_t max_low_complexity_anchor_trim = 65; + /// When examining anchors for suspiciousness, try and make them at + /// least this long. To ensure orientation symmetry, we will make + /// anchors with the oppsite parity (even if this is odd, or odd if + /// this is even) 1bp longer. int64_t pad_suspicious_anchors_to_length = 12; // A function for computing band padding From cabfe9fc66f31fed06eae727b04c3cb4ea16b61b Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 21 Oct 2024 16:50:01 -0400 Subject: [PATCH 2/4] Remove extra debugging --- src/surjector.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/surjector.cpp b/src/surjector.cpp index 9c510bde9a..0c2ff88df3 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -4086,10 +4086,6 @@ using namespace std; auto read_context_begin = chunk.first.first - left_padding; auto read_context_end = chunk.first.second + right_padding; -#ifdef debug_anchored_surject - std::cerr << "For read interval " << (chunk.first.first - sequence.begin()) << ":" << (chunk.first.second - sequence.begin()) << " length " << (chunk.first.second - chunk.first.first) << " use context " << (read_context_begin - sequence.begin()) << ":" << (read_context_end - sequence.begin()) << " length " << (read_context_end - read_context_begin) << std::endl; -#endif - SeqComplexity<6> context_complexity(read_context_begin, read_context_end); // TODO: repetitive for (int order = 1, max_order = 6; order <= max_order; ++order) { From 68fc4284eaf31de6fcbb62f3aacfa26354e79112 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 21 Oct 2024 17:05:54 -0400 Subject: [PATCH 3/4] Change short tail pruning to only require being one tail and not both --- src/surjector.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/surjector.cpp b/src/surjector.cpp index 0c2ff88df3..954adb0b27 100644 --- a/src/surjector.cpp +++ b/src/surjector.cpp @@ -4038,8 +4038,8 @@ using namespace std; for (int i = 0; i < path_chunks.size(); ++i) { auto& chunk = path_chunks[i]; // Mark anchors that are themselves suspicious as not to be kept. - if (chunk.first.first == path_chunks.front().first.first && chunk.first.second == path_chunks.back().first.second // TODO: shouldn't this be || if we want to match *either* tail? - && (anchor_lengths[i] <= max_tail_anchor_prune || chunk.first.second - chunk.first.first <= max_tail_anchor_prune)) { + if ((chunk.first.first == path_chunks.front().first.first || chunk.first.second == path_chunks.back().first.second) // Is at either tail + && (anchor_lengths[i] <= max_tail_anchor_prune || chunk.first.second - chunk.first.first <= max_tail_anchor_prune)) { // And is too short #ifdef debug_anchored_surject cerr << "anchor " << i << " (read interval " << (chunk.first.first - sequence.begin()) << " : " << (chunk.first.second - sequence.begin()) << ") pruned for being a short tail" << endl; #endif From 0c826dc84bed3697294a5fda31102b9cd9520a5f Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Mon, 21 Oct 2024 17:15:54 -0400 Subject: [PATCH 4/4] Add a test for surjection orientation independence --- test/surject/opposite_strands.gaf | 2 ++ test/surject/opposite_strands.gfa | 43 +++++++++++++++++++++++++++++++ test/t/15_vg_surject.t | 4 ++- 3 files changed, 48 insertions(+), 1 deletion(-) create mode 100644 test/surject/opposite_strands.gaf create mode 100644 test/surject/opposite_strands.gfa diff --git a/test/surject/opposite_strands.gaf b/test/surject/opposite_strands.gaf new file mode 100644 index 0000000000..73e07638fe --- /dev/null +++ b/test/surject/opposite_strands.gaf @@ -0,0 +1,2 @@ +forward 35 0 35 + >56629809>56629810>56629811>56629814>56629815>56629819>56629820>56629822 132 97 132 35 35 0 cs:Z::35 +reverse 35 0 35 + <56629822<56629820<56629819<56629815<56629814<56629811<56629810<56629809 132 0 35 35 35 0 cs:Z::35 diff --git a/test/surject/opposite_strands.gfa b/test/surject/opposite_strands.gfa new file mode 100644 index 0000000000..8177180d24 --- /dev/null +++ b/test/surject/opposite_strands.gfa @@ -0,0 +1,43 @@ +H VN:Z:1.1 RS:Z:GRCh38 CHM13 +S 56629807 G +S 56629808 T +S 56629809 TTTCTGATTATAAATATTGCACATGTATTGATTACATAAATCCATATACTATAAAACTGATATTTAAGAGAATAAAAGTCCCAACCTCAGAATTAACTACTG +S 56629810 C +S 56629811 A +S 56629812 A +S 56629813 C +S 56629814 CCCCC +S 56629815 C +S 56629816 G +S 56629817 T +S 56629818 T +S 56629819 T +S 56629820 TTTTTTTTTTT +S 56629821 T +S 56629822 GATGGAGTCT +S 56629823 C +S 56629824 G +W CHM13 0 chr8 81560724 81560860 >56629807>56629809>56629810>56629813>56629814>56629817>56629818>56629819>56629820>56629821>56629822>56629824 +W GRCh38 0 chr8 35183939 35184073 >56629807>56629809>56629810>56629811>56629814>56629815>56629819>56629820>56629822>56629823 +L 56629807 + 56629809 + 0M +L 56629808 + 56629809 + 0M +L 56629809 + 56629812 - 0M +L 56629809 + 56629810 + 0M +L 56629810 + 56629813 + 0M +L 56629810 + 56629811 + 0M +L 56629811 + 56629814 + 0M +L 56629812 - 56629813 + 0M +L 56629813 + 56629814 + 0M +L 56629814 + 56629817 + 0M +L 56629814 + 56629815 + 0M +L 56629815 + 56629819 + 0M +L 56629815 + 56629816 - 0M +L 56629816 - 56629819 + 0M +L 56629817 + 56629818 + 0M +L 56629818 + 56629819 + 0M +L 56629819 + 56629820 + 0M +L 56629820 + 56629822 + 0M +L 56629820 + 56629821 + 0M +L 56629821 + 56629822 + 0M +L 56629822 + 56629824 + 0M +L 56629822 + 56629823 + 0M diff --git a/test/t/15_vg_surject.t b/test/t/15_vg_surject.t index 6e0ddff37e..3704ec823c 100644 --- a/test/t/15_vg_surject.t +++ b/test/t/15_vg_surject.t @@ -6,7 +6,7 @@ BASH_TAP_ROOT=../deps/bash-tap PATH=../bin:$PATH # for vg -plan tests 47 +plan tests 48 vg construct -r small/x.fa >j.vg vg index -x j.xg j.vg @@ -198,3 +198,5 @@ is "$(vg surject -x x.xg -M -m -s -i -t 1 mapped.gamp | grep -v '@' | wc -l)" 80 rm x.vg x.pathdup.vg x.xg x.gcsa x.gcsa.lcp x.gam mapped.gam mapped.gamp +is "$(vg surject -p CHM13#0#chr8 -x surject/opposite_strands.gfa --prune-low-cplx --sam-output --gaf-input surject/opposite_strands.gaf | grep -v "^@" | cut -f3-12 | sort | uniq | wc -l)" "1" "vg surject low compelxity pruning gets the same alignment regardless of read orientation" +