Merge pull request #7 from RIVM-bioinformatics/update_cli
Update cli and juno-lib
boasvdp authored Jul 25, 2024
2 parents 2e431d5 + 7b96f39 commit 720fe1a
Showing 5 changed files with 68 additions and 62 deletions.
2 changes: 1 addition & 1 deletion envs/juno_variant_typing.yaml
@@ -12,4 +12,4 @@ dependencies:
- pip=23.*
- python=3.11.*
- pip:
- "--editable=git+https://github.com/RIVM-bioinformatics/juno-library.git@v2.1.2#egg=juno_library"
- "--editable=git+https://github.com/RIVM-bioinformatics/juno-library.git@v2.2.0#egg=juno_library"
47 changes: 18 additions & 29 deletions juno_variant_typing.py
@@ -8,13 +8,11 @@
"""

from pathlib import Path
import pathlib
import yaml
import argparse
import sys
from dataclasses import dataclass, field
from juno_library import Pipeline
from typing import Optional, Union, Callable
from typing import Optional, Union, Callable, Tuple
from version import __package_name__, __version__, __description__


@@ -60,7 +58,8 @@ def generated_func_check_range(value: str) -> str:
class JunoVariantTyping(Pipeline):
pipeline_name: str = __package_name__
pipeline_version: str = __version__
input_type: str = "bam_and_vcf"
input_type: Tuple[str, ...] = ("bam", "vcf")
species_options = ["mycobacterium_tuberculosis"]

def _add_args_to_parser(self) -> None:
super()._add_args_to_parser()
@@ -80,22 +79,21 @@ def _add_args_to_parser(self) -> None:
"provided, it must contain at least one column named 'sample' "
"with the name of the sample (same than file name but removing "
"the suffix _R1.fastq.gz), a column called "
"'genus' and a column called 'species'. The genus and species "
"provided will be used to choose the serotyper and the MLST schema(s)."
"If a metadata file is provided, it will overwrite the --species "
"argument for the samples present in the metadata file.",
"'species'. The species provided will be used to choose the "
"typing schemes. If a metadata file is provided, it will "
"overwrite the --species argument for the samples present in "
"the metadata file.",
)
self.add_argument(
"-s",
"--species",
type=lambda s: s.strip().lower(),
nargs=2,
default=["mycobacterium", "tuberculosis"],
type=str.lower,
default="mycobacterium_tuberculosis",
required=False,
metavar=("GENUS", "SPECIES"),
metavar="SPECIES",
choices=self.species_options,
help="Species name (any species in the metadata file will overwrite"
" this argument). It should be given as two words (e.g. --species "
"Mycobacterium tuberculosis)",
" this argument). Choose from: {self.species_options}",
)
self.add_argument(
"-d",
@@ -123,9 +121,7 @@ def _parse_args(self) -> argparse.Namespace:
# Optional arguments are loaded into self here
self.db_dir: Path = args.db_dir.resolve()

self.genus: Optional[str]
self.species: Optional[str]
self.genus, self.species = args.species
self.species: Optional[str] = args.species
self.metadata_file: Path = args.metadata
self.presets_path: Optional[Path] = args.presets_path
# self.single_copy_bed: Optional[Path] = args.single_copy_bed
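The net effect of the CLI change above: `--species` is now a single lowercased token validated against `species_options`, rather than two words (genus, species) joined later. A standalone sketch of that parsing behaviour, for illustration only (not the pipeline's full parser):

```python
# Illustration of the new --species handling (not the full pipeline parser).
import argparse

species_options = ["mycobacterium_tuberculosis"]

parser = argparse.ArgumentParser()
parser.add_argument(
    "-s",
    "--species",
    type=str.lower,                        # normalise case before validation
    default="mycobacterium_tuberculosis",
    choices=species_options,               # reject unsupported species
    metavar="SPECIES",
)

args = parser.parse_args(["--species", "Mycobacterium_tuberculosis"])
print(args.species)  # -> "mycobacterium_tuberculosis"
```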
@@ -171,12 +167,11 @@ def setup(self) -> None:
def update_sample_dict_with_metadata(self) -> None:
self.get_metadata_from_csv_file(
filepath=self.metadata_file,
expected_colnames=["sample", "genus", "species"],
expected_colnames=["sample", "species"],
)
# Add metadata
for sample in self.sample_dict:
if self.genus is not None and self.species is not None:
self.sample_dict[sample]["genus"] = self.genus
if self.species is not None:
self.sample_dict[sample]["species"] = self.species
else:
try:
@@ -188,9 +183,6 @@ def update_sample_dict_with_metadata(self) -> None:
"samples are present in the metadata file or provide "
"a --species argument."
)
self.sample_dict[sample]["genus"] = (
self.sample_dict[sample]["genus"].strip().lower()
)
self.sample_dict[sample]["species"] = (
self.sample_dict[sample]["species"].strip().lower()
)
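With the genus column dropped, a metadata file only needs `sample` and `species` columns, with the species given as a single token. A toy sketch of the resulting per-sample assignment (not juno_library's `get_metadata_from_csv_file`; precedence mirrors the hunk above, where an explicit species value is applied to every sample and the metadata file is the fallback):

```python
# Toy sketch of the updated metadata handling (not the juno_library helper).
import csv
import io

metadata_csv = "sample,species\nsample1,Mycobacterium_tuberculosis\n"
metadata = {
    row["sample"]: row["species"] for row in csv.DictReader(io.StringIO(metadata_csv))
}

sample_dict = {"sample1": {}}
cli_species = None  # as if no --species value were set, for this toy example

for sample, info in sample_dict.items():
    if cli_species is not None:
        info["species"] = cli_species
    else:
        info["species"] = metadata[sample].strip().lower()

print(sample_dict)  # {'sample1': {'species': 'mycobacterium_tuberculosis'}}
```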
@@ -203,12 +195,9 @@ def set_presets(self) -> None:
presets_dict = yaml.safe_load(f)

for sample in self.sample_dict:
complete_species_name = "_".join(
[self.sample_dict[sample]["genus"], self.sample_dict[sample]["species"]]
)

if complete_species_name in presets_dict.keys():
for key, value in presets_dict[complete_species_name].items():
species_name = self.sample_dict[sample]["species"]
if species_name in presets_dict.keys():
for key, value in presets_dict[species_name].items():
self.sample_dict[sample][key] = value
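set_presets now keys the presets file directly on the species string instead of joining genus and species. A minimal sketch of that lookup, with an illustrative presets YAML (the keys and threshold values below are assumptions, not the pipeline's real presets):

```python
# Minimal sketch of the simplified presets lookup.
# The presets content below is illustrative, not the real presets file.
import yaml

presets_yaml = """
mycobacterium_tuberculosis:
  consensus_AF_threshold: 0.8
  consensus_masking_depth: 10
"""

presets_dict = yaml.safe_load(presets_yaml)
sample_dict = {"sample1": {"species": "mycobacterium_tuberculosis"}}

for sample in sample_dict:
    species_name = sample_dict[sample]["species"]
    # copy every preset key/value pair onto the sample entry
    for key, value in presets_dict.get(species_name, {}).items():
        sample_dict[sample][key] = value

print(sample_dict["sample1"]["consensus_AF_threshold"])  # -> 0.8
```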


4 changes: 1 addition & 3 deletions workflow/rules/choose_species.smk
@@ -1,7 +1,5 @@
def choose_species(wildcards):
if (SAMPLES[wildcards.sample]["genus"] == "mycobacterium") & (
SAMPLES[wildcards.sample]["species"] == "tuberculosis"
):
if SAMPLES[wildcards.sample]["species"] == "mycobacterium_tuberculosis":
return [
OUT + "/mtb_typing/lineage_call/{sample}.tsv",
OUT + "/mtb_typing/contamination_check/coll_positions/{sample}.tsv",
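The input function now branches on the single `species` field. A quick standalone check of that dispatch logic outside Snakemake, using toy stand-ins for `SAMPLES` and `wildcards` (the non-MTB branch and the full list of returned paths are not visible in this diff, so the empty-list fallback here is an assumption):

```python
# Standalone check of the choose_species dispatch (toy objects, not Snakemake).
from types import SimpleNamespace

OUT = "output"
SAMPLES = {
    "sample1": {"species": "mycobacterium_tuberculosis"},
    "sample2": {"species": "some_other_species"},
}

def choose_species(wildcards):
    if SAMPLES[wildcards.sample]["species"] == "mycobacterium_tuberculosis":
        # path formatted explicitly for this standalone check
        return [OUT + "/mtb_typing/lineage_call/{sample}.tsv".format(sample=wildcards.sample)]
    return []  # assumption: the non-MTB branch is not shown in this diff

print(choose_species(SimpleNamespace(sample="sample1")))
print(choose_species(SimpleNamespace(sample="sample2")))  # -> []
```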
15 changes: 10 additions & 5 deletions workflow/rules/consensus.smk
@@ -23,7 +23,8 @@ rule mark_variants_by_proximity:
log:
"",
params:
proximity_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_proximity_threshold"
proximity_threshold=lambda wildcards: SAMPLES[wildcards.sample][
"consensus_proximity_threshold"
],
message:
"Marking regions with variants too close together for {wildcards.sample}"
@@ -50,7 +51,8 @@ rule subset_fixed_snps_from_vcf:
log:
"",
params:
AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_AF_threshold"
AF_threshold=lambda wildcards: SAMPLES[wildcards.sample][
"consensus_AF_threshold"
],
message:
"Subsetting fixed snps from vcf for {wildcards.sample}"
@@ -79,9 +81,11 @@ rule subset_low_confidence_variants_from_vcf:
log:
"",
params:
AF_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_AF_threshold"
AF_threshold=lambda wildcards: SAMPLES[wildcards.sample][
"consensus_AF_threshold"
],
tlod_threshold=lambda wildcards: SAMPLES[wildcards.sample]["consensus_tlod_threshold"
tlod_threshold=lambda wildcards: SAMPLES[wildcards.sample][
"consensus_tlod_threshold"
],
message:
"Subsetting unfixed snps from vcf for {wildcards.sample}"
@@ -167,7 +171,8 @@ rule mask_fasta_on_depth_from_bam:
message:
"Masking low depth regions of {wildcards.sample} consensus genome"
params:
masking_depth=lambda wildcards: SAMPLES[wildcards.sample]["consensus_masking_depth"
masking_depth=lambda wildcards: SAMPLES[wildcards.sample][
"consensus_masking_depth"
],
threads: config["threads"]["bedtools"]
resources:
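Every hunk in this file applies the same change: the per-sample threshold lookup `lambda wildcards: SAMPLES[wildcards.sample][...]` is rewrapped so the key sits on its own line. A small standalone illustration of that params pattern (toy value; the real thresholds come from the presets):

```python
# Standalone illustration of the per-sample params lookup (toy value).
from types import SimpleNamespace

SAMPLES = {"sample1": {"consensus_AF_threshold": 0.8}}

# same shape as the params entries in the rules above
AF_threshold = lambda wildcards: SAMPLES[wildcards.sample][
    "consensus_AF_threshold"
]

print(AF_threshold(SimpleNamespace(sample="sample1")))  # -> 0.8
```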
62 changes: 38 additions & 24 deletions workflow/scripts/postprocess_deletion_table.py
@@ -13,12 +13,12 @@ def check_deletions_in_bed(deletions, bed):
Deletions table, should contain the columns 'CHROM', 'POS', 'END' and 'interval'
bed : pd.DataFrame
Bed file, should contain the columns 'chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs' and 'interval'
Returns
-------
pd.DataFrame
Annotated deletions table, containing the columns 'chrom', 'start', 'end', 'locus_tag', 'gene' and 'drugs'
Notes
-----
Compares genomic ranges stored as pd.Interval objects, with closed bounds
@@ -27,48 +27,62 @@
deletions_annotated = []
for index, row in deletions.iterrows():
for index2, row2 in bed.iterrows():
if row['CHROM'] == row2['chrom']:
if row['interval'].overlaps(row2['interval']):
if row["CHROM"] == row2["chrom"]:
if row["interval"].overlaps(row2["interval"]):
dictionary_overlap = {
'chrom': row['CHROM'],
'start': row['POS'],
'end': row['END'],
'locus_tag': row2['locus_tag'],
'gene': row2['gene'],
'drugs': row2['drugs']
"chrom": row["CHROM"],
"start": row["POS"],
"end": row["END"],
"locus_tag": row2["locus_tag"],
"gene": row2["gene"],
"drugs": row2["drugs"],
}
deletions_annotated.append(dictionary_overlap)
return pd.DataFrame(deletions_annotated)

return pd.DataFrame(deletions_annotated)


def main(args):
# Read the deletion table
deletions = pd.read_csv(args.input, sep='\t', header=0)
deletions = pd.read_csv(args.input, sep="\t", header=0)

if deletions.shape[0] == 0:
with open(args.output, 'w') as f:
f.write('chrom\tstart\tend\tlocus_tag\tgene\tdrugs\n')
with open(args.output, "w") as f:
f.write("chrom\tstart\tend\tlocus_tag\tgene\tdrugs\n")

else:
deletions['interval'] = deletions.apply(lambda x: pd.Interval(x['POS'], x['END'], closed='both'), axis=1)
deletions["interval"] = deletions.apply(
lambda x: pd.Interval(x["POS"], x["END"], closed="both"), axis=1
)

# Read the reference bed file
bed = pd.read_csv(args.bed, sep='\t', header=None, names=['chrom', 'start', 'end', 'locus_tag', 'gene', 'drugs'])
bed['interval'] = bed.apply(lambda x: pd.Interval(x['start'], x['end'], closed='both'), axis=1)
bed = pd.read_csv(
args.bed,
sep="\t",
header=None,
names=["chrom", "start", "end", "locus_tag", "gene", "drugs"],
)
bed["interval"] = bed.apply(
lambda x: pd.Interval(x["start"], x["end"], closed="both"), axis=1
)

annotated_deletions = check_deletions_in_bed(deletions, bed)

# Write the annotated deletion table
annotated_deletions.to_csv(args.output, sep='\t', index=False)
annotated_deletions.to_csv(args.output, sep="\t", index=False)


if __name__ == '__main__':
if __name__ == "__main__":
import argparse

parser = argparse.ArgumentParser(description='Filter deletions that are not in the reference genome')
parser.add_argument('--input', help='Deletion table', required=True)
parser.add_argument('--bed', help='Filtered deletion table', required=True)
parser.add_argument('--output', help='Output file', default='deletions_annotated.tsv')
parser = argparse.ArgumentParser(
description="Filter deletions that are not in the reference genome"
)
parser.add_argument("--input", help="Deletion table", required=True)
parser.add_argument("--bed", help="Filtered deletion table", required=True)
parser.add_argument(
"--output", help="Output file", default="deletions_annotated.tsv"
)
args = parser.parse_args()

main(args)
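As a quick sanity check of the reformatted script, `check_deletions_in_bed` can be exercised directly with toy data once the function above is available in the session (coordinates, locus tags and drug names below are made up):

```python
# Toy sanity check for check_deletions_in_bed (made-up coordinates and names).
import pandas as pd

deletions = pd.DataFrame({"CHROM": ["chr1"], "POS": [100], "END": [250]})
deletions["interval"] = deletions.apply(
    lambda x: pd.Interval(x["POS"], x["END"], closed="both"), axis=1
)

bed = pd.DataFrame(
    {
        "chrom": ["chr1"],
        "start": [200],
        "end": [300],
        "locus_tag": ["locus_1"],
        "gene": ["geneA"],
        "drugs": ["drugX"],
    }
)
bed["interval"] = bed.apply(
    lambda x: pd.Interval(x["start"], x["end"], closed="both"), axis=1
)

# the deletion (100-250) overlaps the bed feature (200-300), so one row comes back
print(check_deletions_in_bed(deletions, bed))
# one annotated row: chr1, 100, 250, locus_1, geneA, drugX
```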
