Merge pull request #7 from RIVM-bioinformatics/update_cli
Update cli and juno-lib
boasvdp authored Jul 25, 2024
2 parents 2e431d5 + 7b96f39 commit 720fe1a
Showing 5 changed files with 68 additions and 62 deletions.
2 changes: 1 addition & 1 deletion envs/juno_variant_typing.yaml
@@ -12,4 +12,4 @@ dependencies:
- pip=23.*
- python=3.11.*
- pip:
- "--editable=git+https://github.com/RIVM-bioinformatics/juno-library.git@v2.1.2#egg=juno_library"
- "--editable=git+https://github.com/RIVM-bioinformatics/juno-library.git@v2.2.0#egg=juno_library"
47 changes: 18 additions & 29 deletions juno_variant_typing.py
@@ -8,13 +8,11 @@
"""

from pathlib import Path
import pathlib
import yaml
import argparse
import sys
from dataclasses import dataclass, field
from juno_library import Pipeline
from typing import Optional, Union, Callable
from typing import Optional, Union, Callable, Tuple
from version import __package_name__, __version__, __description__


@@ -60,7 +58,8 @@ def generated_func_check_range(value: str) -> str:
class JunoVariantTyping(Pipeline):
pipeline_name: str = __package_name__
pipeline_version: str = __version__
input_type: str = "bam_and_vcf"
input_type: Tuple[str, ...] = ("bam", "vcf")
species_options = ["mycobacterium_tuberculosis"]

def _add_args_to_parser(self) -> None:
super()._add_args_to_parser()
@@ -80,22 +79,21 @@ def _add_args_to_parser(self) -> None:
"provided, it must contain at least one column named 'sample' "
"with the name of the sample (same than file name but removing "
"the suffix _R1.fastq.gz), a column called "
"'genus' and a column called 'species'. The genus and species "
"provided will be used to choose the serotyper and the MLST schema(s)."
"If a metadata file is provided, it will overwrite the --species "
"argument for the samples present in the metadata file.",
"'species'. The species provided will be used to choose the "
"typing schemes. If a metadata file is provided, it will "
"overwrite the --species argument for the samples present in "
"the metadata file.",
)
self.add_argument(
"-s",
"--species",
type=lambda s: s.strip().lower(),
nargs=2,
default=["mycobacterium", "tuberculosis"],
type=str.lower,
default="mycobacterium_tuberculosis",
required=False,
metavar=("GENUS", "SPECIES"),
metavar="SPECIES",
choices=self.species_options,
help="Species name (any species in the metadata file will overwrite"
" this argument). It should be given as two words (e.g. --species "
"Mycobacterium tuberculosis)",
" this argument). Choose from: {self.species_options}",
)
self.add_argument(
"-d",
@@ -123,9 +121,7 @@ def _parse_args(self) -> argparse.Namespace:
# Optional arguments are loaded into self here
self.db_dir: Path = args.db_dir.resolve()

self.genus: Optional[str]
self.species: Optional[str]
self.genus, self.species = args.species
self.species: Optional[str] = args.species
self.metadata_file: Path = args.metadata
self.presets_path: Optional[Path] = args.presets_path
# self.single_copy_bed: Optional[Path] = args.single_copy_bed
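The net effect of the CLI change above: `--species` is now a single lowercased token validated against `species_options`, rather than two words (genus, species) joined later. A standalone sketch of that parsing behaviour, for illustration only (not the pipeline's full parser):

```python
# Illustration of the new --species handling (not the full pipeline parser).
import argparse

species_options = ["mycobacterium_tuberculosis"]

parser = argparse.ArgumentParser()
parser.add_argument(
    "-s",
    "--species",
    type=str.lower,                        # normalise case before validation
    default="mycobacterium_tuberculosis",
    choices=species_options,               # reject unsupported species
    metavar="SPECIES",
)

args = parser.parse_args(["--species", "Mycobacterium_tuberculosis"])
print(args.species)  # -> "mycobacterium_tuberculosis"
```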
@@ -171,12 +167,11 @@ def setup(self) -> None:
def update_sample_dict_with_metadata(self) -> None:
self.get_metadata_from_csv_file(
filepath=self.metadata_file,
expected_colnames=["sample", "genus", "species"],
expected_colnames=["sample", "species"],
)
# Add metadata
for sample in self.sample_dict:
if self.genus is not None and self.species is not None:
self.sample_dict[sample]["genus"] = self.genus
if self.species is not None:
self.sample_dict[sample]["species"] = self.species
else:
try:
@@ -188,9 +183,6 @@ def update_sample_dict_with_metadata(self) -> None:
"samples are present in the metadata file or provide "
"a --species argument."
)
self.sample_dict[sample]["genus"] = (
self.sample_dict[sample]["genus"].strip().lower()
)
self.sample_dict[sample]["species"] = (
self.sample_dict[sample]["species"].strip().lower()
)
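With the genus column dropped, a metadata file only needs `sample` and `species` columns, with the species given as a single token. A toy sketch of the resulting per-sample assignment (not juno_library's `get_metadata_from_csv_file`; precedence mirrors the hunk above, where an explicit species value is applied to every sample and the metadata file is the fallback):

```python
# Toy sketch of the updated metadata handling (not the juno_library helper).
import csv
import io

metadata_csv = "sample,species\nsample1,Mycobacterium_tuberculosis\n"
metadata = {
    row["sample"]: row["species"] for row in csv.DictReader(io.StringIO(metadata_csv))
}

sample_dict = {"sample1": {}}
cli_species = None  # as if no --species value were set, for this toy example

for sample, info in sample_dict.items():
    if cli_species is not None:
        info["species"] = cli_species
    else:
        info["species"] = metadata[sample].strip().lower()

print(sample_dict)  # {'sample1': {'species': 'mycobacterium_tuberculosis'}}
```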
@@ -203,12 +195,9 @@ def set_presets(self) -> None:
presets_dict = yaml.safe_load(f)

for sample in self.sample_dict:
complete_species_name = "_".join(
[self.sample_dict[sample]["genus"], self.sample_dict[sample]["species"]]
)

if complete_species_name in presets_dict.keys():
for key, value in presets_dict[complete_species_name].items():
species_name = self.sample_dict[sample]["species"]
if species_name in presets_dict.keys():
for key, value in presets_dict[species_name].items():
self.sample_dict[sample][key] = value
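set_presets now keys the presets file directly on the species string instead of joining genus and species. A minimal sketch of that lookup, with an illustrative presets YAML (the keys and threshold values below are assumptions, not the pipeline's real presets):

```python
# Minimal sketch of the simplified presets lookup.
# The presets content below is illustrative, not the real presets file.
import yaml

presets_yaml = """
mycobacterium_tuberculosis:
  consensus_AF_threshold: 0.8
  consensus_masking_depth: 10
"""

presets_dict = yaml.safe_load(presets_yaml)
sample_dict = {"sample1": {"species": "mycobacterium_tuberculosis"}}

for sample in sample_dict:
    species_name = sample_dict[sample]["species"]
    # copy every preset key/value pair onto the sample entry
    for key, value in presets_dict.get(species_name, {}).items():
        sample_dict[sample][key] = value

print(sample_dict["sample1"]["consensus_AF_threshold"])  # -> 0.8
```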


4 changes: 1 addition & 3 deletions workflow/rules/choose_species.smk
@@ -1,7 +1,5 @@
def choose_species(wildcards):
if (SAMPLES[wildcards.sample]["genus"] == "mycobacterium") & (
SAMPLES[wildcards.sample]["species"] == "tuberculosis"
):
if SAMPLES[wildcards.sample]["species"] == "mycobacterium_tuberculosis":
return [
OUT + "/mtb_typing/lineage_call/{sample}.tsv",
OUT + "/mtb_typing/contamination_check/coll_positions/{sample}.tsv",
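The input function now branches on the single `species` field. A quick standalone check of that dispatch logic outside Snakemake, using toy stand-ins for `SAMPLES` and `wildcards` (the non-MTB branch and the full list of returned paths are not visible in this diff, so the empty-list fallback here is an assumption):

```python
# Standalone check of the choose_species dispatch (toy objects, not Snakemake).
from types import SimpleNamespace

OUT = "output"
SAMPLES = {
    "sample1": {"species": "mycobacterium_tuberculosis"},
    "sample2": {"species": "some_other_species"},
}

def choose_species(wildcards):
    if SAMPLES[wildcards.sample]["species"] == "mycobacterium_tuberculosis":
        # path formatted explicitly for this standalone check
        return [OUT + "/mtb_typing/lineage_call/{sample}.tsv".format(sample=wildcards.sample)]
    return []  # assumption: the non-MTB branch is not shown in this diff

print(choose_species(SimpleNamespace(sample="sample1")))
print(choose_species(SimpleNamespace(sample="sample2")))  # -> []
```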
15 changes: 10 additions & 5 deletions workflow/rules/consensus.smk
@@ -23,7 +23,8 @@ rule mark_variants_by_proximity:
log:
"",
params:
proximity_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_proximity_threshold"
proximity_threshold=lambda wildcards: SAMPLES[wildcards.sample][
"consensus_proximity_threshold"
],
message:
"Marking regions with variants too close together for {wildcards.sample}"
@@ -50,7 +51,8 @@ rule subset_fixed_snps_from_vcf:
log:
"",
params:
AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_AF_threshold"
AF_threshold=lambda wildcards: SAMPLES[wildcards.sample][
"consensus_AF_threshold"
],
message:
"Subsetting fixed snps from vcf for {wildcards.sample}"
@@ -79,9 +81,11 @@ rule subset_low_confidence_variants_from_vcf:
log:
"",
params:
AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_AF_threshold"
AF_threshold=lambda wildcards: SAMPLES[wildcards.sample][
"consensus_AF_threshold"
],
tlod_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_tlod_threshold"
tlod_threshold=lambda wildcards: SAMPLES[wildcards.sample][
"consensus_tlod_threshold"
],
message:
"Subsetting unfixed snps from vcf for {wildcards.sample}"
@@ -167,7 +171,8 @@ rule mask_fasta_on_depth_from_bam:
message:
"Masking low depth regions of {wildcards.sample} consensus genome"
params:
masking_depth=lambda wildcards: SAMPLES[wildcards.sample]["consensus_masking_depth"
masking_depth=lambda wildcards: SAMPLES[wildcards.sample][
"consensus_masking_depth"
],
threads: config["threads"]["bedtools"]
resources:
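Every hunk in this file applies the same change: the per-sample threshold lookup `lambda wildcards: SAMPLES[wildcards.sample][...]` is rewrapped so the key sits on its own line. A small standalone illustration of that params pattern (toy value; the real thresholds come from the presets):

```python
# Standalone illustration of the per-sample params lookup (toy value).
from types import SimpleNamespace

SAMPLES = {"sample1": {"consensus_AF_threshold": 0.8}}

# same shape as the params entries in the rules above
AF_threshold = lambda wildcards: SAMPLES[wildcards.sample][
    "consensus_AF_threshold"
]

print(AF_threshold(SimpleNamespace(sample="sample1")))  # -> 0.8
```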
62 changes: 38 additions & 24 deletions workflow/scripts/postprocess_deletion_table.py
@@ -13,12 +13,12 @@ def check_deletions_in_bed(deletions, bed):
Deletions table, should contain the columns 'CHROM', 'POS', 'END' and 'interval'
bed : pd.DataFrame
Bed file, should contain the columns 'chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs' and 'interval'
Returns
-------
pd.DataFrame
Annotated deletions table, containing the columns 'chrom', 'start', 'end', 'locus_tag', 'gene' and 'drugs'
Notes
-----
Compares genomic ranges stored as pd.Interval objects, with closed bounds
@@ -27,48 +27,62 @@
deletions_annotated = []
for index, row in deletions.iterrows():
for index2, row2 in bed.iterrows():
if row['CHROM'] == row2['chrom']:
if row['interval'].overlaps(row2['interval']):
if row["CHROM"] == row2["chrom"]:
if row["interval"].overlaps(row2["interval"]):
dictionary_overlap = {
'chrom': row['CHROM'],
'start': row['POS'],
'end': row['END'],
'locus_tag': row2['locus_tag'],
'gene': row2['gene'],
'drugs': row2['drugs']
"chrom": row["CHROM"],
"start": row["POS"],
"end": row["END"],
"locus_tag": row2["locus_tag"],
"gene": row2["gene"],
"drugs": row2["drugs"],
}
deletions_annotated.append(dictionary_overlap)
return pd.DataFrame(deletions_annotated)

return pd.DataFrame(deletions_annotated)


def main(args):
# Read the deletion table
deletions = pd.read_csv(args.input, sep='\t', header=0)
deletions = pd.read_csv(args.input, sep="\t", header=0)

if deletions.shape[0] == 0:
with open(args.output, 'w') as f:
f.write('chrom\tstart\tend\tlocus_tag\tgene\tdrugs\n')
with open(args.output, "w") as f:
f.write("chrom\tstart\tend\tlocus_tag\tgene\tdrugs\n")

else:
deletions['interval'] = deletions.apply(lambda x: pd.Interval(x['POS'], x['END'], closed='both'), axis=1)
deletions["interval"] = deletions.apply(
lambda x: pd.Interval(x["POS"], x["END"], closed="both"), axis=1
)

# Read the reference bed file
bed = pd.read_csv(args.bed, sep='\t', header=None, names=['chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs'])
bed['interval'] = bed.apply(lambda x: pd.Interval(x['start'], x['end'], closed='both'), axis=1)
bed = pd.read_csv(
args.bed,
sep="\t",
header=None,
names=["chrom", "start", "end", "locus_tag", "gene", "drugs"],
)
bed["interval"] = bed.apply(
lambda x: pd.Interval(x["start"], x["end"], closed="both"), axis=1
)

annotated_deletions = check_deletions_in_bed(deletions, bed)

# Write the annotated deletion table
annotated_deletions.to_csv(args.output, sep='\t', index=False)
annotated_deletions.to_csv(args.output, sep="\t", index=False)


if __name__ == '__main__':
if __name__ == "__main__":
import argparse

parser = argparse.ArgumentParser(description='Filter deletions that are not in the reference genome')
parser.add_argument('--input', help='Deletion table', required=True)
parser.add_argument('--bed', help='Filtered deletion table', required=True)
parser.add_argument('--output', help='Output file', default='deletions_annotated.tsv')
parser = argparse.ArgumentParser(
description="Filter deletions that are not in the reference genome"
)
parser.add_argument("--input", help="Deletion table", required=True)
parser.add_argument("--bed", help="Filtered deletion table", required=True)
parser.add_argument(
"--output", help="Output file", default="deletions_annotated.tsv"
)
args = parser.parse_args()

main(args)
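As a quick sanity check of the reformatted script, `check_deletions_in_bed` can be exercised directly with toy data once the function above is available in the session (coordinates, locus tags and drug names below are made up):

```python
# Toy sanity check for check_deletions_in_bed (made-up coordinates and names).
import pandas as pd

deletions = pd.DataFrame({"CHROM": ["chr1"], "POS": [100], "END": [250]})
deletions["interval"] = deletions.apply(
    lambda x: pd.Interval(x["POS"], x["END"], closed="both"), axis=1
)

bed = pd.DataFrame(
    {
        "chrom": ["chr1"],
        "start": [200],
        "end": [300],
        "locus_tag": ["locus_1"],
        "gene": ["geneA"],
        "drugs": ["drugX"],
    }
)
bed["interval"] = bed.apply(
    lambda x: pd.Interval(x["start"], x["end"], closed="both"), axis=1
)

# the deletion (100-250) overlaps the bed feature (200-300), so one row comes back
print(check_deletions_in_bed(deletions, bed))
# one annotated row: chr1, 100, 250, locus_1, geneA, drugX
```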
