Fixed bug for thread_dna function when using a ClipKIT log file. Input protein alignment must be the untrimmed alignment.
JLSteenwyk · Aug 13, 2024 · 0427f33 · 0427f33
1 parent 2bc1d48
commit 0427f33
Show file tree

Hide file tree

Showing 11 changed files with 510 additions and 582 deletions.
diff --git a/Makefile b/Makefile
@@ -118,7 +118,7 @@ test.fast:
 	python -m pytest -m "not (integration or slow)"
 	rm -rf output/
 	mkdir output/
-	python -m pytest --basetemp=output -m "integration and not slow"
+	python -m pytest --basetemp=output -m "integration and not slow" -vv
 	rm test.fa test.occupancy test.partition
 
 # used by GitHub actions during CI workflow

diff --git a/change_log.txt b/change_log.txt
@@ -1,5 +1,8 @@
 Major changes to PhyKIT are summarized here.
 
+1.20.0
+  - Fixed bug for thread_dna function when using a ClipKIT log file. Input protein alignment must be the untrimmed alignment.
+
 1.19.4
   - Saturation function forces y-intercept to be zero when calculating slope
 

diff --git a/docs/change_log/index.rst b/docs/change_log/index.rst
@@ -8,6 +8,9 @@ Change log
 
 Major changes to PhyKIT are summarized here.
 
+**1.20.0**:
+Fixed bug for thread_dna function when using a ClipKIT log file. Input protein alignment must be the untrimmed alignment.
+
 **1.19.9**:
 Saturation function now also reports the absolute value of 1-saturation. Lower values are indicative of less saturation.
 

diff --git a/docs/usage/index.rst b/docs/usage/index.rst
@@ -500,9 +500,20 @@ Thread DNA sequence onto a protein alignment to create a
 codon-based alignment. 
 
 This function requires input alignments are in fasta format.
-Codon alignments are then printed to stdout. Note, sequences
-are assumed to occur in the same order in the protein and 
-nucleotide alignment.
+Codon alignments are then printed to stdout. Note, paired
+sequences are assumed to have the same name between the 
+protein and nucleotide file. The order does not matter.
+
+To thread nucleotide sequences over a trimmed amino acid
+alignment, provide PhyKIT with a log file specifying which
+sites have been trimmed and which have been kept. The log
+file must be formatted the same as the log files outputted
+by the alignment trimming toolkit ClipKIT (see -l in ClipKIT
+documentation.) Details about ClipKIT can be seen here:
+https://github.com/JLSteenwyk/ClipKIT.
+
+If using a ClipKIT log file, the untrimmed protein alignment
+should be provided in the -p/--protein argument.
 
 .. code-block:: shell
 

diff --git a/phykit/phykit.py b/phykit/phykit.py
@@ -2606,9 +2606,9 @@ def thread_dna(argv):
                 codon-based alignment. 
                 
                 This function requires input alignments are in fasta format.
-                Codon alignments are then printed to stdout. Note, sequences
-                are assumed to occur in the same order in the protein and 
-                nucleotide alignment.
+                Codon alignments are then printed to stdout. Note, paired
+                sequences are assumed to have the same name between the 
+                protein and nucleotide file. The order does not matter.
 
                 To thread nucleotide sequences over a trimmed amino acid
                 alignment, provide PhyKIT with a log file specifying which
@@ -2618,6 +2618,9 @@ def thread_dna(argv):
                 documentation.) Details about ClipKIT can be seen here:
                 https://github.com/JLSteenwyk/ClipKIT.
 
+                If using a ClipKIT log file, the untrimmed protein alignment
+                should be provided in the -p/--protein argument.
+
                 Aliases:
                   thread_dna, pal2nal, p2n
                 Command line interfaces:

diff --git a/phykit/services/alignment/dna_threader.py b/phykit/services/alignment/dna_threader.py
@@ -61,16 +61,18 @@ def create_mask(self, length):
         return keep_mask
 
     def normalize_p_seq(self, p_seq, mask):
-        if self.clipkit_log_data:
-            untrimmed = []
-            offset = 0
-            for idx, value in enumerate(mask[::3]):
-                if value is True:
-                    untrimmed.append(p_seq[idx - offset])
-                else:
-                    offset += 1
-                    untrimmed.append("#")
-            p_seq = "".join(untrimmed)
+        #TODO: write MP
+        #TODO: update tests
+        # if self.clipkit_log_data:
+        #     untrimmed = []
+        #     offset = 0
+        #     for idx, value in enumerate(mask[::3]):
+        #         if value is True:
+        #             untrimmed.append(p_seq[idx - offset])
+        #         else:
+        #             offset += 1
+        #             untrimmed.append("#")
+        #     p_seq = "".join(untrimmed)
         return "".join([c * 3 for c in p_seq])
 
     def normalize_n_seq(self, n_seq, normalized_p_seq):

diff --git a/phykit/version.py b/phykit/version.py
@@ -1 +1 @@
-__version__ = "1.19.9"
+__version__ = "1.20.0"