From 97bb7adbd6726e5e209922fbb7b4070dc9cdf44c Mon Sep 17 00:00:00 2001 From: boasvdp Date: Thu, 25 Jul 2024 15:42:05 +0200 Subject: [PATCH 1/4] deps: update juno library --- envs/juno_variant_typing.yaml | 2 +- juno_variant_typing.py | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/envs/juno_variant_typing.yaml b/envs/juno_variant_typing.yaml index c46ebbc..4477df6 100644 --- a/envs/juno_variant_typing.yaml +++ b/envs/juno_variant_typing.yaml @@ -12,4 +12,4 @@ dependencies: - pip=23.* - python=3.11.* - pip: - - "--editable=git+https://github.com/RIVM-bioinformatics/juno-library.git@v2.1.2#egg=juno_library" + - "--editable=git+https://github.com/RIVM-bioinformatics/juno-library.git@v2.2.0#egg=juno_library" diff --git a/juno_variant_typing.py b/juno_variant_typing.py index 5f1d3ad..97fb5d6 100644 --- a/juno_variant_typing.py +++ b/juno_variant_typing.py @@ -8,13 +8,11 @@ """ from pathlib import Path -import pathlib import yaml import argparse -import sys from dataclasses import dataclass, field from juno_library import Pipeline -from typing import Optional, Union, Callable +from typing import Optional, Union, Callable, Tuple from version import __package_name__, __version__, __description__ @@ -60,7 +58,7 @@ def generated_func_check_range(value: str) -> str: class JunoVariantTyping(Pipeline): pipeline_name: str = __package_name__ pipeline_version: str = __version__ - input_type: str = "bam_and_vcf" + input_type: Tuple[str, ...] = ("bam", "vcf") def _add_args_to_parser(self) -> None: super()._add_args_to_parser() From 57454c375f678d758964893944c85c418308e69d Mon Sep 17 00:00:00 2001 From: boasvdp Date: Thu, 25 Jul 2024 15:42:24 +0200 Subject: [PATCH 2/4] fix: update cli --- juno_variant_typing.py | 41 ++++++++++++++++------------------------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/juno_variant_typing.py b/juno_variant_typing.py index 97fb5d6..c688a1b 100644 --- a/juno_variant_typing.py +++ b/juno_variant_typing.py @@ -59,6 +59,7 @@ class JunoVariantTyping(Pipeline): pipeline_name: str = __package_name__ pipeline_version: str = __version__ input_type: Tuple[str, ...] = ("bam", "vcf") + species_options = ["mycobacterium_tuberculosis"] def _add_args_to_parser(self) -> None: super()._add_args_to_parser() @@ -78,22 +79,21 @@ def _add_args_to_parser(self) -> None: "provided, it must contain at least one column named 'sample' " "with the name of the sample (same than file name but removing " "the suffix _R1.fastq.gz), a column called " - "'genus' and a column called 'species'. The genus and species " - "provided will be used to choose the serotyper and the MLST schema(s)." - "If a metadata file is provided, it will overwrite the --species " - "argument for the samples present in the metadata file.", + "'species'. The species provided will be used to choose the " + "typing schemes. If a metadata file is provided, it will " + "overwrite the --species argument for the samples present in " + "the metadata file.", ) self.add_argument( "-s", "--species", - type=lambda s: s.strip().lower(), - nargs=2, - default=["mycobacterium", "tuberculosis"], + type=str.lower, + default="mycobacterium_tuberculosis", required=False, - metavar=("GENUS", "SPECIES"), + metavar="SPECIES", + choices=self.species_options, help="Species name (any species in the metadata file will overwrite" - " this argument). It should be given as two words (e.g. --species " - "Mycobacterium tuberculosis)", + " this argument). Choose from: {self.species_options}", ) self.add_argument( "-d", @@ -121,9 +121,7 @@ def _parse_args(self) -> argparse.Namespace: # Optional arguments are loaded into self here self.db_dir: Path = args.db_dir.resolve() - self.genus: Optional[str] - self.species: Optional[str] - self.genus, self.species = args.species + self.species: Optional[str] = args.species self.metadata_file: Path = args.metadata self.presets_path: Optional[Path] = args.presets_path # self.single_copy_bed: Optional[Path] = args.single_copy_bed @@ -169,12 +167,11 @@ def setup(self) -> None: def update_sample_dict_with_metadata(self) -> None: self.get_metadata_from_csv_file( filepath=self.metadata_file, - expected_colnames=["sample", "genus", "species"], + expected_colnames=["sample", "species"], ) # Add metadata for sample in self.sample_dict: - if self.genus is not None and self.species is not None: - self.sample_dict[sample]["genus"] = self.genus + if self.species is not None: self.sample_dict[sample]["species"] = self.species else: try: @@ -186,9 +183,6 @@ def update_sample_dict_with_metadata(self) -> None: "samples are present in the metadata file or provide " "a --species argument." ) - self.sample_dict[sample]["genus"] = ( - self.sample_dict[sample]["genus"].strip().lower() - ) self.sample_dict[sample]["species"] = ( self.sample_dict[sample]["species"].strip().lower() ) @@ -201,12 +195,9 @@ def set_presets(self) -> None: presets_dict = yaml.safe_load(f) for sample in self.sample_dict: - complete_species_name = "_".join( - [self.sample_dict[sample]["genus"], self.sample_dict[sample]["species"]] - ) - - if complete_species_name in presets_dict.keys(): - for key, value in presets_dict[complete_species_name].items(): + species_name = self.sample_dict[sample]["species"] + if species_name in presets_dict.keys(): + for key, value in presets_dict[species_name].items(): self.sample_dict[sample][key] = value From 0de09d00d738929b61781f4bb8e38e23cafe981b Mon Sep 17 00:00:00 2001 From: boasvdp Date: Thu, 25 Jul 2024 15:42:34 +0200 Subject: [PATCH 3/4] fix: update cli --- workflow/rules/choose_species.smk | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/workflow/rules/choose_species.smk b/workflow/rules/choose_species.smk index 72b371f..c48c38f 100644 --- a/workflow/rules/choose_species.smk +++ b/workflow/rules/choose_species.smk @@ -1,7 +1,5 @@ def choose_species(wildcards): - if (SAMPLES[wildcards.sample]["genus"] == "mycobacterium") & ( - SAMPLES[wildcards.sample]["species"] == "tuberculosis" - ): + if SAMPLES[wildcards.sample]["species"] == "mycobacterium_tuberculosis": return [ OUT + "/mtb_typing/lineage_call/{sample}.tsv", OUT + "/mtb_typing/contamination_check/coll_positions/{sample}.tsv", From 7b96f39d81ec7ff9cc10b63530c8e1285c8cef4e Mon Sep 17 00:00:00 2001 From: boasvdp Date: Thu, 25 Jul 2024 15:42:45 +0200 Subject: [PATCH 4/4] style: formatting --- workflow/rules/consensus.smk | 15 +++-- .../scripts/postprocess_deletion_table.py | 62 ++++++++++++------- 2 files changed, 48 insertions(+), 29 deletions(-) diff --git a/workflow/rules/consensus.smk b/workflow/rules/consensus.smk index fcd7055..07c2ca7 100644 --- a/workflow/rules/consensus.smk +++ b/workflow/rules/consensus.smk @@ -23,7 +23,8 @@ rule mark_variants_by_proximity: log: "", params: - proximity_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_proximity_threshold" + proximity_threshold=lambda wildcards: SAMPLES[wildcards.sample][ + "consensus_proximity_threshold" ], message: "Marking regions with variants too close together for {wildcards.sample}" @@ -50,7 +51,8 @@ rule subset_fixed_snps_from_vcf: log: "", params: - AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_AF_threshold" + AF_threshold=lambda wildcards: SAMPLES[wildcards.sample][ + "consensus_AF_threshold" ], message: "Subsetting fixed snps from vcf for {wildcards.sample}" @@ -79,9 +81,11 @@ rule subset_low_confidence_variants_from_vcf: log: "", params: - AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_AF_threshold" + AF_threshold=lambda wildcards: SAMPLES[wildcards.sample][ + "consensus_AF_threshold" ], - tlod_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_tlod_threshold" + tlod_threshold=lambda wildcards: SAMPLES[wildcards.sample][ + "consensus_tlod_threshold" ], message: "Subsetting unfixed snps from vcf for {wildcards.sample}" @@ -167,7 +171,8 @@ rule mask_fasta_on_depth_from_bam: message: "Masking low depth regions of {wildcards.sample} consensus genome" params: - masking_depth=lambda wildcards: SAMPLES[wildcards.sample]["consensus_masking_depth" + masking_depth=lambda wildcards: SAMPLES[wildcards.sample][ + "consensus_masking_depth" ], threads: config["threads"]["bedtools"] resources: diff --git a/workflow/scripts/postprocess_deletion_table.py b/workflow/scripts/postprocess_deletion_table.py index a550ead..a18c740 100644 --- a/workflow/scripts/postprocess_deletion_table.py +++ b/workflow/scripts/postprocess_deletion_table.py @@ -13,12 +13,12 @@ def check_deletions_in_bed(deletions, bed): Deletions table, should contain the columns 'CHROM', 'POS', 'END' and 'interval' bed : pd.DataFrame Bed file, should contain the columns 'chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs' and 'interval' - + Returns ------- pd.DataFrame Annotated deletions table, containing the columns 'chrom', 'start', 'end', 'locus_tag', 'gene' and 'drugs' - + Notes ----- Compares genomic ranges stored as pd.Interval objects, with closed bounds @@ -27,48 +27,62 @@ def check_deletions_in_bed(deletions, bed): deletions_annotated = [] for index, row in deletions.iterrows(): for index2, row2 in bed.iterrows(): - if row['CHROM'] == row2['chrom']: - if row['interval'].overlaps(row2['interval']): + if row["CHROM"] == row2["chrom"]: + if row["interval"].overlaps(row2["interval"]): dictionary_overlap = { - 'chrom': row['CHROM'], - 'start': row['POS'], - 'end': row['END'], - 'locus_tag': row2['locus_tag'], - 'gene': row2['gene'], - 'drugs': row2['drugs'] + "chrom": row["CHROM"], + "start": row["POS"], + "end": row["END"], + "locus_tag": row2["locus_tag"], + "gene": row2["gene"], + "drugs": row2["drugs"], } deletions_annotated.append(dictionary_overlap) - - return pd.DataFrame(deletions_annotated) + + return pd.DataFrame(deletions_annotated) def main(args): # Read the deletion table - deletions = pd.read_csv(args.input, sep='\t', header=0) + deletions = pd.read_csv(args.input, sep="\t", header=0) if deletions.shape[0] == 0: - with open(args.output, 'w') as f: - f.write('chrom\tstart\tend\tlocus_tag\tgene\tdrugs\n') + with open(args.output, "w") as f: + f.write("chrom\tstart\tend\tlocus_tag\tgene\tdrugs\n") else: - deletions['interval'] = deletions.apply(lambda x: pd.Interval(x['POS'], x['END'], closed='both'), axis=1) + deletions["interval"] = deletions.apply( + lambda x: pd.Interval(x["POS"], x["END"], closed="both"), axis=1 + ) # Read the reference bed file - bed = pd.read_csv(args.bed, sep='\t', header=None, names=['chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs']) - bed['interval'] = bed.apply(lambda x: pd.Interval(x['start'], x['end'], closed='both'), axis=1) + bed = pd.read_csv( + args.bed, + sep="\t", + header=None, + names=["chrom", "start", "end", "locus_tag", "gene", "drugs"], + ) + bed["interval"] = bed.apply( + lambda x: pd.Interval(x["start"], x["end"], closed="both"), axis=1 + ) annotated_deletions = check_deletions_in_bed(deletions, bed) # Write the annotatetd deletion table - annotated_deletions.to_csv(args.output, sep='\t', index=False) + annotated_deletions.to_csv(args.output, sep="\t", index=False) + -if __name__ == '__main__': +if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser(description='Filter deletions that are not in the reference genome') - parser.add_argument('--input', help='Deletion table', required=True) - parser.add_argument('--bed', help='Filtered deletion table', required=True) - parser.add_argument('--output', help='Output file', default='deletions_annotated.tsv') + parser = argparse.ArgumentParser( + description="Filter deletions that are not in the reference genome" + ) + parser.add_argument("--input", help="Deletion table", required=True) + parser.add_argument("--bed", help="Filtered deletion table", required=True) + parser.add_argument( + "--output", help="Output file", default="deletions_annotated.tsv" + ) args = parser.parse_args() main(args)