diff --git a/ingest/Snakefile b/ingest/Snakefile
index 4e829ea4..6675ffe0 100644
--- a/ingest/Snakefile
+++ b/ingest/Snakefile
@@ -70,7 +70,6 @@ rule all:
 
 include: "workflow/snakemake_rules/fetch_sequences.smk"
 include: "workflow/snakemake_rules/transform.smk"
-include: "workflow/snakemake_rules/nextclade.smk"
 
 
 if config.get("upload", False):
diff --git a/ingest/bin/join-metadata-and-clades.py b/ingest/bin/join-metadata-and-clades.py
deleted file mode 100755
index 3a0e919e..00000000
--- a/ingest/bin/join-metadata-and-clades.py
+++ /dev/null
@@ -1,77 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import re
-import sys
-import pandas as pd
-
-NEXTCLADE_JOIN_COLUMN_NAME = 'seqName'
-VALUE_MISSING_DATA = '?'
-
-column_map = {
-    "clade": "clade",
-    "outbreak": "outbreak",
-    "lineage": "lineage",
-    "coverage": "coverage",
-    "totalMissing": "missing_data",
-    "totalSubstitutions": "divergence",
-    "totalNonACGTNs": "nonACGTN",
-    "qc.missingData.status": "QC_missing_data",
-    "qc.mixedSites.status": "QC_mixed_sites",
-    "qc.privateMutations.status": "QC_rare_mutations",
-    "qc.frameShifts.status": "QC_frame_shifts",
-    "qc.stopCodons.status": "QC_stop_codons",
-    "frameShifts": "frame_shifts",
-    "isReverseComplement": "is_reverse_complement",
-#    "deletions": "deletions",
-#    "insertions": "insertions"
-#    "substitutions": "substitutions",
-#    "aaSubstitutions": "aaSubstitutions"
-}
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="Joins metadata file with Nextclade clade output",
-    )
-    parser.add_argument("--metadata")
-    parser.add_argument("--nextclade")
-    parser.add_argument("--id-field")
-    parser.add_argument("-o", default=sys.stdout)
-    return parser.parse_args()
-
-def main():
-    args = parse_args()
-
-    metadata = pd.read_csv(args.metadata, index_col=args.id_field,
-                           sep='\t', low_memory=False, na_filter = False)
-
-    # Read and rename clade column to be more descriptive
-    clades = pd.read_csv(args.nextclade, index_col=NEXTCLADE_JOIN_COLUMN_NAME,
-                         sep='\t', low_memory=False, na_filter = False) \
-            .rename(columns=column_map)
-
-    clades.index = clades.index.map(lambda x: re.sub(" \|.*", "", x))
-
-    # Select columns in column map
-    clades = clades[list(column_map.values())]
-
-    # Separate long from short columns
-    short_metadata = metadata.iloc[:,:-2].copy()
-    long_metadata = metadata.iloc[:,-2:].copy()
-
-    # Concatenate on columns
-    result = pd.merge(
-        short_metadata, clades,
-        left_index=True,
-        right_index=True,
-        how='left'
-    )
-
-    # Add long columns to back
-    result = pd.concat([result, long_metadata], axis=1)
-
-    result.to_csv(args.o, index_label=args.id_field, sep='\t')
-
-
-if __name__ == '__main__':
-    main()
diff --git a/ingest/workflow/snakemake_rules/nextclade.smk b/ingest/workflow/snakemake_rules/nextclade.smk
deleted file mode 100644
index 28da0a98..00000000
--- a/ingest/workflow/snakemake_rules/nextclade.smk
+++ /dev/null
@@ -1,70 +0,0 @@
-
-rule nextclade_dataset:
-    output:
-        temp("mpxv.zip"),
-    shell:
-        """
-        nextclade dataset get --name MPXV --output-zip {output}
-        """
-
-
-rule nextclade_dataset_hMPXV:
-    output:
-        temp("hmpxv.zip"),
-    shell:
-        """
-        nextclade dataset get --name hMPXV --output-zip {output}
-        """
-
-
-rule align:
-    input:
-        sequences="data/sequences.fasta",
-        dataset="hmpxv.zip",
-    output:
-        alignment="data/alignment.fasta",
-        insertions="data/insertions.csv",
-        translations="data/translations.zip",
-    params:
-        # The lambda is used to deactivate automatic wildcard expansion.
-        # https://github.com/snakemake/snakemake/blob/384d0066c512b0429719085f2cf886fdb97fd80a/snakemake/rules.py#L997-L1000
-        translations=lambda w: "data/translations/{gene}.fasta",
-    threads: 4
-    shell:
-        """
-        nextclade run -D {input.dataset} -j {threads} --retry-reverse-complement \
-            --output-fasta {output.alignment} --output-translations {params.translations} \
-            --output-insertions {output.insertions} {input.sequences}
-        zip -rj {output.translations} data/translations
-        """
-
-
-rule nextclade:
-    input:
-        sequences="data/sequences.fasta",
-        dataset="mpxv.zip",
-    output:
-        "data/nextclade.tsv",
-    threads: 4
-    shell:
-        """
-        nextclade run -D {input.dataset} -j {threads} --output-tsv {output} {input.sequences} --retry-reverse-complement
-        """
-
-
-rule join_metadata_clades:
-    input:
-        nextclade="data/nextclade.tsv",
-        metadata="data/metadata_raw.tsv",
-    output:
-        "data/metadata.tsv",
-    params:
-        id_field=config["transform"]["id_field"],
-    shell:
-        """
-        python3 bin/join-metadata-and-clades.py \
-            --id-field {params.id_field} \
-            --metadata {input.metadata} \
-            --nextclade {input.nextclade} \
-            -o {output}
-        """
diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk
index a39fb735..1c0cea5a 100644
--- a/ingest/workflow/snakemake_rules/transform.smk
+++ b/ingest/workflow/snakemake_rules/transform.smk
@@ -42,7 +42,7 @@ rule transform:
         all_geolocation_rules="data/all-geolocation-rules.tsv",
         annotations=config["transform"]["annotations"],
     output:
-        metadata="data/metadata_raw.tsv",
+        metadata="data/metadata.tsv",
         sequences="data/sequences.fasta",
     log:
         "logs/transform.txt",