diff --git a/ingest/bin/calculate-gene-converage-from-nextclade-translation.py b/ingest/bin/calculate-gene-converage-from-nextclade-translation.py new file mode 100644 index 00000000..4ab7c199 --- /dev/null +++ b/ingest/bin/calculate-gene-converage-from-nextclade-translation.py @@ -0,0 +1,54 @@ +#! /usr/bin/env python3 + +import argparse +from sys import stderr + +from Bio import SeqIO +import re + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Calculate gene coverage from amino acid sequence. The output will be in TSV format to stdout.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--fasta", + type=str, + help="FASTA file of CDS translations from Nextclade.", + ) + parser.add_argument( + "--out-col", + type=str, + default="gene_coverage", + help="Output column name.", + ) + return parser.parse_args() + + +def calculate_gene_coverage_from_nextclade_cds(fasta, out_col): + """ + Calculate gene coverage from amino acid sequence in gene translation FASTA file from Nextclade. + """ + print(f"genbank_accession\t{out_col}") + # Iterate over the sequences in the FASTA file + for record in SeqIO.parse(fasta, "fasta"): + sequence_id = record.id + sequence = str(record.seq) + + # Calculate gene coverage + results = re.findall(r"([ACDEFGHIKLMNPQRSTVWY])", sequence.upper()) + gene_coverage = round(len(results) / len(sequence), 3) + + # Print the results + print(f"{sequence_id}\t{gene_coverage}") + + +def main(): + args = parse_args() + + calculate_gene_coverage_from_nextclade_cds(args.fasta, args.out_col) + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 6a748aa3..4213d023 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -109,4 +109,12 @@ curate: nextclade: min_length: 1000 # E gene length is approximately 1400nt min_seed_cover: 0.1 - nextclade_field: 'nextclade_subtype' \ No newline at end of file + gene: ['E','NS1'] + # Nextclade Fields to rename to metadata field names. + field_map: + seqName: genbank_accession # ID field used to merge annotations + clade: nextclade_subtype + alignmentStart: alignmentStart + alignmentEnd: alignmentEnd + coverage: genome_coverage + failedCdses: failedCdses \ No newline at end of file diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 88e1800f..82034140 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -19,16 +19,23 @@ SEROTYPE_CONSTRAINTS = '|'.join(SUPPORTED_NEXTCLADE_SEROTYPES) rule nextclade_denvX: """ For each type, classify into the appropriate subtype + 1. Capture the alignment + 2. Capture the translations of gene(s) of interest + + Note: If using --cds-selection, only the thoese genes are reported in the failedCdses column """ input: sequences="results/sequences_{serotype}.fasta", dataset="../nextclade_data/{serotype}", output: nextclade_denvX="data/nextclade_results/nextclade_{serotype}.tsv", + nextclade_alignment="results/aligned_{serotype}.fasta", + nextclade_translations=expand("data/translations/{{serotype}}/{gene}/seqs.gene.fasta", gene=config["nextclade"]["gene"]), threads: 4 params: min_length=config["nextclade"]["min_length"], min_seed_cover=config["nextclade"]["min_seed_cover"], + output_translations = lambda wildcards: f"data/translations/{wildcards.serotype}/{{cds}}/seqs.gene.fasta", wildcard_constraints: serotype=SEROTYPE_CONSTRAINTS shell: @@ -40,6 +47,8 @@ rule nextclade_denvX: --min-length {params.min_length} \ --min-seed-cover {params.min_seed_cover} \ --silent \ + --output-fasta {output.nextclade_alignment} \ + --output-translations {params.output_translations} \ {input.sequences} """ @@ -48,19 +57,19 @@ rule concat_nextclade_subtype_results: Concatenate all the nextclade results for dengue subtype classification """ input: - expand("data/nextclade_results/nextclade_{serotype}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES), + nextclade_results_files = expand("data/nextclade_results/nextclade_{serotype}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES), output: nextclade_subtypes="results/nextclade_subtypes.tsv", params: - id_field=config["curate"]["id_field"], - nextclade_field=config["nextclade"]["nextclade_field"], + input_nextclade_fields=",".join([f'{key}' for key, value in config["nextclade"]["field_map"].items()]), + output_nextclade_fields=",".join([f'{value}' for key, value in config["nextclade"]["field_map"].items()]), shell: """ - echo "{params.id_field},{params.nextclade_field}" \ + echo "{params.output_nextclade_fields}" \ | tr ',' '\t' \ > {output.nextclade_subtypes} - tsv-select -H -f "seqName,clade" {input} \ + tsv-select -H -f "{params.input_nextclade_fields}" {input.nextclade_results_files} \ | awk 'NR>1 {{print}}' \ >> {output.nextclade_subtypes} """ @@ -73,21 +82,78 @@ rule append_nextclade_columns: metadata="data/metadata_all.tsv", nextclade_subtypes="results/nextclade_subtypes.tsv", output: - metadata_all="results/metadata_all.tsv", + metadata_all="data/metadata_nextclade.tsv", params: - id_field=config["curate"]["id_field"], - nextclade_field=config["nextclade"]["nextclade_field"], + id_field=list(config["nextclade"]["field_map"].values())[0], + output_nextclade_fields=",".join([f'{value}' for key, value in config["nextclade"]["field_map"].items()][1:]), shell: """ tsv-join -H \ --filter-file {input.nextclade_subtypes} \ --key-fields {params.id_field} \ - --append-fields {params.nextclade_field} \ + --append-fields {params.output_nextclade_fields} \ --write-all ? \ {input.metadata} \ > {output.metadata_all} """ +rule calculate_gene_coverage: + """ + Calculate the coverage of the gene of interest + """ + input: + nextclade_translation="data/translations/{serotype}/{gene}/seqs.gene.fasta", + output: + gene_coverage="data/translations/{serotype}/{gene}/gene_coverage.tsv", + wildcard_constraints: + serotype=SEROTYPE_CONSTRAINTS, + shell: + """ + python bin/calculate-gene-converage-from-nextclade-translation.py \ + --fasta {input.nextclade_translation} \ + --out-col {wildcards.gene}_coverage \ + > {output.gene_coverage} + """ + +rule aggregate_gene_coverage_by_gene: + """ + Aggregate the gene coverage results by gene + """ + input: + gene_coverage=expand("data/translations/{serotype}/{{gene}}/gene_coverage.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES), + output: + gene_coverage_all="results/{gene}/gene_coverage_all.tsv", + shell: + """ + tsv-append -H {input.gene_coverage} > {output.gene_coverage_all} + """ + +rule append_gene_coverage_columns: + """ + Append the gene coverage results to the metadata + """ + input: + metadata="data/metadata_nextclade.tsv", + gene_coverage=expand("results/{gene}/gene_coverage_all.tsv", gene=config["nextclade"]["gene"]) + output: + metadata_all="results/metadata_all.tsv", + params: + id_field=config["curate"]["id_field"], + shell: + """ + cp {input.metadata} {output.metadata_all} + for FILE in {input.gene_coverage}; do + tsv-join -H \ + --filter-file $FILE \ + --key-fields {params.id_field} \ + --append-fields '*_coverage' \ + --write-all ? \ + {output.metadata_all} \ + > results/temp_aggregate_gene_coverage.tsv + mv results/temp_aggregate_gene_coverage.tsv {output.metadata_all} + done + """ + rule split_metadata_by_serotype: """ Split the metadata by serotype