From 9839bd17e9450a35c5280fa464d5181044948511 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 19 Mar 2024 15:26:10 -0700 Subject: [PATCH 01/15] Add alignmentStart,alignmentEnd from nextclade results --- ingest/rules/nextclade.smk | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 88e1800f..85140915 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -56,11 +56,11 @@ rule concat_nextclade_subtype_results: nextclade_field=config["nextclade"]["nextclade_field"], shell: """ - echo "{params.id_field},{params.nextclade_field}" \ + echo "{params.id_field},{params.nextclade_field},alignmentStart,alignmentEnd" \ | tr ',' '\t' \ > {output.nextclade_subtypes} - tsv-select -H -f "seqName,clade" {input} \ + tsv-select -H -f "seqName,clade,alignmentStart,alignmentEnd" {input} \ | awk 'NR>1 {{print}}' \ >> {output.nextclade_subtypes} """ @@ -82,7 +82,7 @@ rule append_nextclade_columns: tsv-join -H \ --filter-file {input.nextclade_subtypes} \ --key-fields {params.id_field} \ - --append-fields {params.nextclade_field} \ + --append-fields {params.nextclade_field},alignmentStart,alignmentEnd \ --write-all ? \ {input.metadata} \ > {output.metadata_all} From 2a9eee43eb1c572a7676c57b8c64daa9f1e398bf Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 19 Mar 2024 16:54:51 -0700 Subject: [PATCH 02/15] Add genome_coverage and indicator (True/blank) variable for E_coverage This is using the Nextclade "coverage" as "genome_coverage" and the Nextclade "failedCdses" to check if E_coverage is present or not. fixup: use 1 instead of true --- ingest/rules/nextclade.smk | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 85140915..6a13852b 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -56,12 +56,13 @@ rule concat_nextclade_subtype_results: nextclade_field=config["nextclade"]["nextclade_field"], shell: """ - echo "{params.id_field},{params.nextclade_field},alignmentStart,alignmentEnd" \ + echo "{params.id_field},{params.nextclade_field},alignmentStart,alignmentEnd,genome_coverage,failedCdses,E_coverage" \ | tr ',' '\t' \ > {output.nextclade_subtypes} - tsv-select -H -f "seqName,clade,alignmentStart,alignmentEnd" {input} \ + tsv-select -H -f "seqName,clade,alignmentStart,alignmentEnd,coverage,failedCdses" {input} \ | awk 'NR>1 {{print}}' \ + | awk -F'\t' '$2 && !($6 ~ /E/) {{print $0"\t1"; next}} {{print $0"\t"}}' \ >> {output.nextclade_subtypes} """ @@ -82,7 +83,7 @@ rule append_nextclade_columns: tsv-join -H \ --filter-file {input.nextclade_subtypes} \ --key-fields {params.id_field} \ - --append-fields {params.nextclade_field},alignmentStart,alignmentEnd \ + --append-fields {params.nextclade_field},alignmentStart,alignmentEnd,genome_coverage,failedCdses,E_coverage \ --write-all ? \ {input.metadata} \ > {output.metadata_all} From 12947fa3f7e79dae32948e70f1eafcad94bf2729 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Tue, 19 Mar 2024 17:03:09 -0700 Subject: [PATCH 03/15] fixup: silence slack notifications for now From d22016fc9db88878a63e7b92398bbf1c1f265d8f Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 22 Mar 2024 07:29:18 -0700 Subject: [PATCH 04/15] Add gene configs to allow for calculating gene coverage --- ingest/defaults/config.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 6a748aa3..d65f8d5a 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -109,4 +109,5 @@ curate: nextclade: min_length: 1000 # E gene length is approximately 1400nt min_seed_cover: 0.1 - nextclade_field: 'nextclade_subtype' \ No newline at end of file + nextclade_field: 'nextclade_subtype' + gene: ['E','NS1'] \ No newline at end of file From ddf0fb3dc1df8304c91bf70a547937f0529508f1 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 20 Mar 2024 14:52:44 -0700 Subject: [PATCH 05/15] Output Nextclade gene translations to a fasta files This can be one gene or a set of genes, can then be used to calculate gene_coverage columns. --- ingest/rules/nextclade.smk | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 6a13852b..d0e9f7f6 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -19,16 +19,22 @@ SEROTYPE_CONSTRAINTS = '|'.join(SUPPORTED_NEXTCLADE_SEROTYPES) rule nextclade_denvX: """ For each type, classify into the appropriate subtype + 1. Capture the alignment + 2. Capture the translations of gene(s) of interest """ input: sequences="results/sequences_{serotype}.fasta", dataset="../nextclade_data/{serotype}", output: nextclade_denvX="data/nextclade_results/nextclade_{serotype}.tsv", + nextclade_alignment="results/nextclade_aligned_sequences_{serotype}.fasta", + nextclade_translations=expand("results/translations/seqs_{{serotype}}.gene.{gene}.fasta", gene=config["nextclade"]["gene"]), threads: 4 params: min_length=config["nextclade"]["min_length"], min_seed_cover=config["nextclade"]["min_seed_cover"], + gene=config["nextclade"]["gene"], + output_translations = lambda wildcards: f"results/translations/seqs_{wildcards.serotype}.gene.{{cds}}.fasta", wildcard_constraints: serotype=SEROTYPE_CONSTRAINTS shell: @@ -40,6 +46,9 @@ rule nextclade_denvX: --min-length {params.min_length} \ --min-seed-cover {params.min_seed_cover} \ --silent \ + --output-fasta {output.nextclade_alignment} \ + --cds-selection {params.gene} \ + --output-translations {params.output_translations} \ {input.sequences} """ From 1a6d1ef447c31287344047844d43d44a9c70de9a Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 22 Mar 2024 07:23:37 -0700 Subject: [PATCH 06/15] Only have final files in the "results" directory Move intermediate files to the "data" folder --- ingest/rules/nextclade.smk | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index d0e9f7f6..227e6063 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -21,20 +21,21 @@ rule nextclade_denvX: For each type, classify into the appropriate subtype 1. Capture the alignment 2. Capture the translations of gene(s) of interest + + Note: If using --cds-selection, only the thoese genes are reported in the failedCdses column """ input: sequences="results/sequences_{serotype}.fasta", dataset="../nextclade_data/{serotype}", output: nextclade_denvX="data/nextclade_results/nextclade_{serotype}.tsv", - nextclade_alignment="results/nextclade_aligned_sequences_{serotype}.fasta", - nextclade_translations=expand("results/translations/seqs_{{serotype}}.gene.{gene}.fasta", gene=config["nextclade"]["gene"]), + nextclade_alignment="results/aligned_{serotype}.fasta", + nextclade_translations=expand("data/translations/seqs_{{serotype}}.gene.{gene}.fasta", gene=config["nextclade"]["gene"]), threads: 4 params: min_length=config["nextclade"]["min_length"], min_seed_cover=config["nextclade"]["min_seed_cover"], - gene=config["nextclade"]["gene"], - output_translations = lambda wildcards: f"results/translations/seqs_{wildcards.serotype}.gene.{{cds}}.fasta", + output_translations = lambda wildcards: f"data/translations/seqs_{wildcards.serotype}.gene.{{cds}}.fasta", wildcard_constraints: serotype=SEROTYPE_CONSTRAINTS shell: @@ -47,7 +48,6 @@ rule nextclade_denvX: --min-seed-cover {params.min_seed_cover} \ --silent \ --output-fasta {output.nextclade_alignment} \ - --cds-selection {params.gene} \ --output-translations {params.output_translations} \ {input.sequences} """ @@ -57,7 +57,7 @@ rule concat_nextclade_subtype_results: Concatenate all the nextclade results for dengue subtype classification """ input: - expand("data/nextclade_results/nextclade_{serotype}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES), + nextclade_results_files = expand("data/nextclade_results/nextclade_{serotype}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES), output: nextclade_subtypes="results/nextclade_subtypes.tsv", params: @@ -65,11 +65,11 @@ rule concat_nextclade_subtype_results: nextclade_field=config["nextclade"]["nextclade_field"], shell: """ - echo "{params.id_field},{params.nextclade_field},alignmentStart,alignmentEnd,genome_coverage,failedCdses,E_coverage" \ + echo "{params.id_field},{params.nextclade_field},alignmentStart,alignmentEnd,genome_coverage,failedCdses,E_indicator" \ | tr ',' '\t' \ > {output.nextclade_subtypes} - tsv-select -H -f "seqName,clade,alignmentStart,alignmentEnd,coverage,failedCdses" {input} \ + tsv-select -H -f "seqName,clade,alignmentStart,alignmentEnd,coverage,failedCdses" {input.nextclade_results_files} \ | awk 'NR>1 {{print}}' \ | awk -F'\t' '$2 && !($6 ~ /E/) {{print $0"\t1"; next}} {{print $0"\t"}}' \ >> {output.nextclade_subtypes} @@ -83,7 +83,7 @@ rule append_nextclade_columns: metadata="data/metadata_all.tsv", nextclade_subtypes="results/nextclade_subtypes.tsv", output: - metadata_all="results/metadata_all.tsv", + metadata_all="data/metadata_nextclade.tsv", params: id_field=config["curate"]["id_field"], nextclade_field=config["nextclade"]["nextclade_field"], @@ -92,7 +92,7 @@ rule append_nextclade_columns: tsv-join -H \ --filter-file {input.nextclade_subtypes} \ --key-fields {params.id_field} \ - --append-fields {params.nextclade_field},alignmentStart,alignmentEnd,genome_coverage,failedCdses,E_coverage \ + --append-fields {params.nextclade_field},alignmentStart,alignmentEnd,genome_coverage,failedCdses,E_indicator \ --write-all ? \ {input.metadata} \ > {output.metadata_all} From 35bbb83934888e5791e120525e6d08f0f5d56300 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 22 Mar 2024 07:20:34 -0700 Subject: [PATCH 07/15] Add script to calculate gene coverage from Nextclade translated amino acid FASTA file --- ...ne-converage-from-nextclade-translation.py | 54 +++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 ingest/bin/calculate-gene-converage-from-nextclade-translation.py diff --git a/ingest/bin/calculate-gene-converage-from-nextclade-translation.py b/ingest/bin/calculate-gene-converage-from-nextclade-translation.py new file mode 100644 index 00000000..6b7a9f0f --- /dev/null +++ b/ingest/bin/calculate-gene-converage-from-nextclade-translation.py @@ -0,0 +1,54 @@ +#! /usr/bin/env python3 + +import argparse +from sys import stderr + +from Bio import SeqIO +import re + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Calculate gene coverage from amino acid sequence", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + parser.add_argument( + "--fasta", + type=str, + help="FASTA file of CDS translations from Nextclade.", + ) + parser.add_argument( + "--out-col", + type=str, + default="gene_coverage", + help="Output column name.", + ) + return parser.parse_args() + + +def calculate_gene_coverage_from_nextclade_cds(fasta, out_col): + """ + Calculate gene coverage from amino acid sequence in gene translation FASTA file from Nextclade. + """ + print(f"genbank_accession\t{out_col}") + # Iterate over the sequences in the FASTA file + for record in SeqIO.parse(fasta, "fasta"): + sequence_id = record.id + sequence = str(record.seq) + + # Calculate gene coverage + results = re.findall(r"([ACDEFGHIKLMNPQRSTVWY])", sequence.upper()) + gene_coverage = round(len(results) / len(sequence), 3) + + # Print the results + print(f"{sequence_id}\t{gene_coverage}") + + +def main(): + args = parse_args() + + calculate_gene_coverage_from_nextclade_cds(args.fasta, args.out_col) + + +if __name__ == "__main__": + main() \ No newline at end of file From e0d2a7790d18bddaaa56659a2135f55e3c282cb7 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 22 Mar 2024 07:28:44 -0700 Subject: [PATCH 08/15] Add rules for gene_coverage Adds the following rules for gene coverage * calculate_gene_coverage: calls a python script which takes a Nextclade CDS translation FASTA and calculates (valid AA)/(total length). The percentage is rounded to 3 significant figures. * aggregate_gene_coverage_by_gene: combines the gene_coverage files by gene (e.g. ["E", "NS1"] ) across all serotypes (e.g. denv1-4) * appends_gene_coverage_columns: Add each gene_coverage column (e.g. "E_coverage", "NS1_coverage") to the the final metadata --- ingest/rules/nextclade.smk | 63 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 227e6063..948115fd 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -98,6 +98,69 @@ rule append_nextclade_columns: > {output.metadata_all} """ +rule calculate_gene_coverage: + """ + Calculate the coverage of the gene of interest + """ + input: + nextclade_translation="data/translations/seqs_{serotype}.gene.{gene}.fasta", + output: + gene_coverage="data/translations/gene_coverage_{serotype}_{gene}.tsv", + wildcard_constraints: + serotype=SEROTYPE_CONSTRAINTS, + shell: + """ + python bin/calculate-gene-converage-from-nextclade-translation.py \ + --fasta {input.nextclade_translation} \ + --out-col {wildcards.gene}_coverage \ + > {output.gene_coverage} + """ + +rule aggregate_gene_coverage_by_gene: + """ + Aggregate the gene coverage results by gene + """ + input: + gene_coverage=expand("data/translations/gene_coverage_{serotype}_{{gene}}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES), + output: + gene_coverage_all="results/gene_coverage_all_{gene}.tsv", + shell: + """ + # Capture header for tsv + head -n1 {input.gene_coverage[0]} > {output.gene_coverage_all} + # Capture rest of tsv lines from all files + for FILE in {input.gene_coverage}; do + awk 'NR>1' $FILE >> {output.gene_coverage_all} + done + """ + +rule append_gene_coverage_columns: + """ + Append the gene coverage results to the metadata + """ + input: + metadata="data/metadata_nextclade.tsv", + gene_coverage=expand("results/gene_coverage_all_{gene}.tsv", gene=config["nextclade"]["gene"]) + output: + metadata_all="results/metadata_all.tsv", + params: + id_field=config["curate"]["id_field"], + shell: + """ + cp {input.metadata} {output.metadata_all} + for FILE in {input.gene_coverage}; do + export append_col=`awk 'NR==1 {{print $2}}' $FILE` + tsv-join -H \ + --filter-file $FILE \ + --key-fields {params.id_field} \ + --append-fields $append_col \ + --write-all ? \ + {output.metadata_all} \ + > results/temp_aggregate_gene_coverage.tsv + mv results/temp_aggregate_gene_coverage.tsv {output.metadata_all} + done + """ + rule split_metadata_by_serotype: """ Split the metadata by serotype From 685e218b5cea525ee2a52906a241de4f8579a5be Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 27 Mar 2024 10:02:32 +0000 Subject: [PATCH 09/15] fixup: Use tsv-append instead Co-authored-by: Jover Lee --- ingest/rules/nextclade.smk | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 948115fd..c99927a8 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -126,12 +126,7 @@ rule aggregate_gene_coverage_by_gene: gene_coverage_all="results/gene_coverage_all_{gene}.tsv", shell: """ - # Capture header for tsv - head -n1 {input.gene_coverage[0]} > {output.gene_coverage_all} - # Capture rest of tsv lines from all files - for FILE in {input.gene_coverage}; do - awk 'NR>1' $FILE >> {output.gene_coverage_all} - done + tsv-append -H {input.gene_coverage} > {output.gene_coverage_all} """ rule append_gene_coverage_columns: From c861942f66a7ab47c61165cfccc986e52413e217 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 27 Mar 2024 20:45:37 +0000 Subject: [PATCH 10/15] fixup: drop the E_indicator column https://github.com/nextstrain/dengue/pull/36#discussion_r1538211520 Since we are not using the E_indicator column, drop it. We have separate steps to calculate the E_coverage column. --- ingest/rules/nextclade.smk | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index c99927a8..e38addc4 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -65,13 +65,12 @@ rule concat_nextclade_subtype_results: nextclade_field=config["nextclade"]["nextclade_field"], shell: """ - echo "{params.id_field},{params.nextclade_field},alignmentStart,alignmentEnd,genome_coverage,failedCdses,E_indicator" \ + echo "{params.id_field},{params.nextclade_field},alignmentStart,alignmentEnd,genome_coverage,failedCdses" \ | tr ',' '\t' \ > {output.nextclade_subtypes} tsv-select -H -f "seqName,clade,alignmentStart,alignmentEnd,coverage,failedCdses" {input.nextclade_results_files} \ | awk 'NR>1 {{print}}' \ - | awk -F'\t' '$2 && !($6 ~ /E/) {{print $0"\t1"; next}} {{print $0"\t"}}' \ >> {output.nextclade_subtypes} """ @@ -92,7 +91,7 @@ rule append_nextclade_columns: tsv-join -H \ --filter-file {input.nextclade_subtypes} \ --key-fields {params.id_field} \ - --append-fields {params.nextclade_field},alignmentStart,alignmentEnd,genome_coverage,failedCdses,E_indicator \ + --append-fields {params.nextclade_field},alignmentStart,alignmentEnd,genome_coverage,failedCdses \ --write-all ? \ {input.metadata} \ > {output.metadata_all} From e722c76e0e5f8061e306996cb4598fca84c31c10 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 5 Apr 2024 09:24:31 -0700 Subject: [PATCH 11/15] fixup: gene coverage script docs --- .../bin/calculate-gene-converage-from-nextclade-translation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/bin/calculate-gene-converage-from-nextclade-translation.py b/ingest/bin/calculate-gene-converage-from-nextclade-translation.py index 6b7a9f0f..4ab7c199 100644 --- a/ingest/bin/calculate-gene-converage-from-nextclade-translation.py +++ b/ingest/bin/calculate-gene-converage-from-nextclade-translation.py @@ -9,7 +9,7 @@ def parse_args(): parser = argparse.ArgumentParser( - description="Calculate gene coverage from amino acid sequence", + description="Calculate gene coverage from amino acid sequence. The output will be in TSV format to stdout.", formatter_class=argparse.ArgumentDefaultsHelpFormatter, ) parser.add_argument( From 7b94670a5a19eff5c4d720b54f7c5e73b0397122 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 5 Apr 2024 09:39:46 -0700 Subject: [PATCH 12/15] fixup: avoid awk column number selection --- ingest/rules/nextclade.smk | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index e38addc4..e153c418 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -143,11 +143,10 @@ rule append_gene_coverage_columns: """ cp {input.metadata} {output.metadata_all} for FILE in {input.gene_coverage}; do - export append_col=`awk 'NR==1 {{print $2}}' $FILE` tsv-join -H \ --filter-file $FILE \ --key-fields {params.id_field} \ - --append-fields $append_col \ + --append-fields '*_coverage' \ --write-all ? \ {output.metadata_all} \ > results/temp_aggregate_gene_coverage.tsv From 1e7cde81d1562beaf28963e50b4c4ad8ff16e52d Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 5 Apr 2024 11:21:54 -0700 Subject: [PATCH 13/15] fixup: move hard-coded columns to a shared workflow variable or config params so they don't get out of sync between rules --- ingest/defaults/config.yaml | 4 +++- ingest/rules/nextclade.smk | 9 ++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index d65f8d5a..4afafebc 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -110,4 +110,6 @@ nextclade: min_length: 1000 # E gene length is approximately 1400nt min_seed_cover: 0.1 nextclade_field: 'nextclade_subtype' - gene: ['E','NS1'] \ No newline at end of file + gene: ['E','NS1'] + input_nextclade_fields: "seqName,clade,alignmentStart,alignmentEnd,coverage,failedCdses" + output_nextclade_fields: "alignmentStart,alignmentEnd,genome_coverage,failedCdses" \ No newline at end of file diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index e153c418..32a60a7d 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -63,13 +63,15 @@ rule concat_nextclade_subtype_results: params: id_field=config["curate"]["id_field"], nextclade_field=config["nextclade"]["nextclade_field"], + input_nextclade_fields=config["nextclade"]["input_nextclade_fields"], + output_nextclade_fields=config["nextclade"]["output_nextclade_fields"], shell: """ - echo "{params.id_field},{params.nextclade_field},alignmentStart,alignmentEnd,genome_coverage,failedCdses" \ + echo "{params.id_field},{params.nextclade_field},{params.output_nextclade_fields}" \ | tr ',' '\t' \ > {output.nextclade_subtypes} - tsv-select -H -f "seqName,clade,alignmentStart,alignmentEnd,coverage,failedCdses" {input.nextclade_results_files} \ + tsv-select -H -f "{params.input_nextclade_fields}" {input.nextclade_results_files} \ | awk 'NR>1 {{print}}' \ >> {output.nextclade_subtypes} """ @@ -86,12 +88,13 @@ rule append_nextclade_columns: params: id_field=config["curate"]["id_field"], nextclade_field=config["nextclade"]["nextclade_field"], + output_nextclade_fields=config["nextclade"]["output_nextclade_fields"], shell: """ tsv-join -H \ --filter-file {input.nextclade_subtypes} \ --key-fields {params.id_field} \ - --append-fields {params.nextclade_field},alignmentStart,alignmentEnd,genome_coverage,failedCdses \ + --append-fields {params.nextclade_field},{params.output_nextclade_fields} \ --write-all ? \ {input.metadata} \ > {output.metadata_all} From f6a620ddd7ba4174f7230a07d227d3c799e9c347 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 5 Apr 2024 12:59:07 -0700 Subject: [PATCH 14/15] Use serotype/gene/files in directory structure Encode serotype and gene as part of the directory structure where possible. --- ingest/rules/nextclade.smk | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index 32a60a7d..acd944f5 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -30,12 +30,12 @@ rule nextclade_denvX: output: nextclade_denvX="data/nextclade_results/nextclade_{serotype}.tsv", nextclade_alignment="results/aligned_{serotype}.fasta", - nextclade_translations=expand("data/translations/seqs_{{serotype}}.gene.{gene}.fasta", gene=config["nextclade"]["gene"]), + nextclade_translations=expand("data/translations/{{serotype}}/{gene}/seqs.gene.fasta", gene=config["nextclade"]["gene"]), threads: 4 params: min_length=config["nextclade"]["min_length"], min_seed_cover=config["nextclade"]["min_seed_cover"], - output_translations = lambda wildcards: f"data/translations/seqs_{wildcards.serotype}.gene.{{cds}}.fasta", + output_translations = lambda wildcards: f"data/translations/{wildcards.serotype}/{{cds}}/seqs.gene.fasta", wildcard_constraints: serotype=SEROTYPE_CONSTRAINTS shell: @@ -105,9 +105,9 @@ rule calculate_gene_coverage: Calculate the coverage of the gene of interest """ input: - nextclade_translation="data/translations/seqs_{serotype}.gene.{gene}.fasta", + nextclade_translation="data/translations/{serotype}/{gene}/seqs.gene.fasta", output: - gene_coverage="data/translations/gene_coverage_{serotype}_{gene}.tsv", + gene_coverage="data/translations/{serotype}/{gene}/gene_coverage.tsv", wildcard_constraints: serotype=SEROTYPE_CONSTRAINTS, shell: @@ -123,9 +123,9 @@ rule aggregate_gene_coverage_by_gene: Aggregate the gene coverage results by gene """ input: - gene_coverage=expand("data/translations/gene_coverage_{serotype}_{{gene}}.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES), + gene_coverage=expand("data/translations/{serotype}/{{gene}}/gene_coverage.tsv", serotype=SUPPORTED_NEXTCLADE_SEROTYPES), output: - gene_coverage_all="results/gene_coverage_all_{gene}.tsv", + gene_coverage_all="results/{gene}/gene_coverage_all.tsv", shell: """ tsv-append -H {input.gene_coverage} > {output.gene_coverage_all} @@ -137,7 +137,7 @@ rule append_gene_coverage_columns: """ input: metadata="data/metadata_nextclade.tsv", - gene_coverage=expand("results/gene_coverage_all_{gene}.tsv", gene=config["nextclade"]["gene"]) + gene_coverage=expand("results/{gene}/gene_coverage_all.tsv", gene=config["nextclade"]["gene"]) output: metadata_all="results/metadata_all.tsv", params: From db300a5eeedb15e50c3eb6f0098647b140171350 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 19 Apr 2024 15:16:49 -0700 Subject: [PATCH 15/15] Use a one-to-one mapping of Nextclade input to output columns As suggested by https://github.com/nextstrain/dengue/pull/36#discussion_r1560020118 Merge ID should be the first item in the map --- ingest/defaults/config.yaml | 11 ++++++++--- ingest/rules/nextclade.smk | 15 ++++++--------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 4afafebc..4213d023 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -109,7 +109,12 @@ curate: nextclade: min_length: 1000 # E gene length is approximately 1400nt min_seed_cover: 0.1 - nextclade_field: 'nextclade_subtype' gene: ['E','NS1'] - input_nextclade_fields: "seqName,clade,alignmentStart,alignmentEnd,coverage,failedCdses" - output_nextclade_fields: "alignmentStart,alignmentEnd,genome_coverage,failedCdses" \ No newline at end of file + # Nextclade Fields to rename to metadata field names. + field_map: + seqName: genbank_accession # ID field used to merge annotations + clade: nextclade_subtype + alignmentStart: alignmentStart + alignmentEnd: alignmentEnd + coverage: genome_coverage + failedCdses: failedCdses \ No newline at end of file diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk index acd944f5..82034140 100644 --- a/ingest/rules/nextclade.smk +++ b/ingest/rules/nextclade.smk @@ -61,13 +61,11 @@ rule concat_nextclade_subtype_results: output: nextclade_subtypes="results/nextclade_subtypes.tsv", params: - id_field=config["curate"]["id_field"], - nextclade_field=config["nextclade"]["nextclade_field"], - input_nextclade_fields=config["nextclade"]["input_nextclade_fields"], - output_nextclade_fields=config["nextclade"]["output_nextclade_fields"], + input_nextclade_fields=",".join([f'{key}' for key, value in config["nextclade"]["field_map"].items()]), + output_nextclade_fields=",".join([f'{value}' for key, value in config["nextclade"]["field_map"].items()]), shell: """ - echo "{params.id_field},{params.nextclade_field},{params.output_nextclade_fields}" \ + echo "{params.output_nextclade_fields}" \ | tr ',' '\t' \ > {output.nextclade_subtypes} @@ -86,15 +84,14 @@ rule append_nextclade_columns: output: metadata_all="data/metadata_nextclade.tsv", params: - id_field=config["curate"]["id_field"], - nextclade_field=config["nextclade"]["nextclade_field"], - output_nextclade_fields=config["nextclade"]["output_nextclade_fields"], + id_field=list(config["nextclade"]["field_map"].values())[0], + output_nextclade_fields=",".join([f'{value}' for key, value in config["nextclade"]["field_map"].items()][1:]), shell: """ tsv-join -H \ --filter-file {input.nextclade_subtypes} \ --key-fields {params.id_field} \ - --append-fields {params.nextclade_field},{params.output_nextclade_fields} \ + --append-fields {params.output_nextclade_fields} \ --write-all ? \ {input.metadata} \ > {output.metadata_all}