From 2239f4a6fe9b109274be5e6c6c30ec37d20297c4 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Fri, 20 Dec 2024 12:09:27 -0800 Subject: [PATCH 1/4] Move nextclade field map into config [#21] --- ingest/defaults/config.yaml | 25 +++++++++++++++++++++- ingest/defaults/nextclade_field_map.tsv | 28 ------------------------- 2 files changed, 24 insertions(+), 29 deletions(-) delete mode 100644 ingest/defaults/nextclade_field_map.tsv diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml index 584b827..ba616f5 100644 --- a/ingest/defaults/config.yaml +++ b/ingest/defaults/config.yaml @@ -131,5 +131,28 @@ curate: - url nextclade: dataset_name: "nextstrain/yellow-fever/prM-E" - field_map: "defaults/nextclade_field_map.tsv" + field_map: + seqName: "seqName" + clade: "clade" + coverage: "coverage" + totalMissing: "missing_data" + totalSubstitutions: "divergence" + totalNonACGTNs: "nonACGTN" + qc.overallStatus: "QC_overall" + qc.missingData.status: "QC_missing_data" + qc.mixedSites.status: "QC_mixed_sites" + qc.privateMutations.status: "QC_rare_mutations" + qc.snpClusters.status: "QC_snp_clusters" + qc.frameShifts.status: "QC_frame_shifts" + qc.stopCodons.status: "QC_stop_codons" + frameShifts: "frame_shifts" + privateNucMutations.reversionSubstitutions: "private_reversion_substitutions" + privateNucMutations.labeledSubstitutions: "private_labeled_substitutions" + privateNucMutations.unlabeledSubstitutions: "private_unlabeled_substitutions" + privateNucMutations.totalReversionSubstitutions: "private_total_reversion_substitutions" + privateNucMutations.totalLabeledSubstitutions: "private_total_labeled_substitutions" + privateNucMutations.totalUnlabeledSubstitutions: "private_total_unlabeled_substitutions" + privateNucMutations.totalPrivateSubstitutions: "private_total_private_substitutions" + qc.snpClusters.clusteredSNPs: "private_snp_clusters" + qc.snpClusters.totalSNPs: "private_total_snp_clusters" id_field: "seqName" diff --git 
a/ingest/defaults/nextclade_field_map.tsv b/ingest/defaults/nextclade_field_map.tsv deleted file mode 100644 index e23c864..0000000 --- a/ingest/defaults/nextclade_field_map.tsv +++ /dev/null @@ -1,28 +0,0 @@ -# TSV file that is a mapping of column names for Nextclade output TSV -# The first column should be the original column name of the Nextclade TSV -# The second column should be the new column name to use in the final metadata TSV -# Nextclade can have pathogen specific output columns so make sure to check which -# columns would be useful for your downstream phylogenetic analysis. -seqName seqName -clade clade -coverage coverage -totalMissing missing_data -totalSubstitutions divergence -totalNonACGTNs nonACGTN -qc.overallStatus QC_overall -qc.missingData.status QC_missing_data -qc.mixedSites.status QC_mixed_sites -qc.privateMutations.status QC_rare_mutations -qc.snpClusters.status QC_snp_clusters -qc.frameShifts.status QC_frame_shifts -qc.stopCodons.status QC_stop_codons -frameShifts frame_shifts -privateNucMutations.reversionSubstitutions private_reversion_substitutions -privateNucMutations.labeledSubstitutions private_labeled_substitutions -privateNucMutations.unlabeledSubstitutions private_unlabeled_substitutions -privateNucMutations.totalReversionSubstitutions private_total_reversion_substitutions -privateNucMutations.totalLabeledSubstitutions private_total_labeled_substitutions -privateNucMutations.totalUnlabeledSubstitutions private_total_unlabeled_substitutions -privateNucMutations.totalPrivateSubstitutions private_total_private_substitutions -qc.snpClusters.clusteredSNPs private_snp_clusters -qc.snpClusters.totalSNPs private_total_snp_clusters From 0d6acd2aac406dcd4c83fde6920c8f8eccde8a63 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Fri, 20 Dec 2024 12:10:20 -0800 Subject: [PATCH 2/4] Use augur curate to generate intermediate Nextclade metadata file [#21] --- ingest/rules/nextclade.smk | 25 +++++++++++++++++++++++++ 1 file changed, 25 
insertions(+)

diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
index 19c1ff6..da4dd75 100644
--- a/ingest/rules/nextclade.smk
+++ b/ingest/rules/nextclade.smk
@@ -45,6 +45,31 @@ rule run_nextclade:
         """
 
 
+rule nextclade_metadata:  # rename Nextclade TSV columns per config field_map, then keep only the renamed columns
+    input:
+        nextclade="results/nextclade.tsv",
+    output:
+        nextclade_metadata=temp("results/nextclade_metadata.tsv"),
+    params:
+        nextclade_id_field=config["nextclade"]["id_field"],
+        nextclade_field_map=[f"{old}={new}" for old, new in config["nextclade"]["field_map"].items()],  # "old=new" pairs passed to --field-map
+        nextclade_fields=",".join(config["nextclade"]["field_map"].values()),  # comma-separated new column names passed to csvtk cut --fields
+    log:
+        "logs/nextclade_metadata.txt",
+    benchmark:
+        "benchmarks/nextclade_metadata.tsv",
+    shell:  # pipeline runs in a subshell so stderr from BOTH augur and csvtk is written to the log
+        r"""
+        (augur curate rename \
+            --metadata {input.nextclade:q} \
+            --id-column {params.nextclade_id_field:q} \
+            --field-map {params.nextclade_field_map:q} \
+            --output-metadata - \
+        | csvtk cut --tabs --fields {params.nextclade_fields:q} \
+        > {output.nextclade_metadata:q}) 2> {log:q}
+        """
+
+
 rule join_metadata_and_nextclade:
     input:
         nextclade="results/nextclade.tsv",

From 3dddc124ddb5fe84ce9d648085717a8f794ae76e Mon Sep 17 00:00:00 2001
From: John SJ Anderson
Date: Fri, 20 Dec 2024 12:10:53 -0800
Subject: [PATCH 3/4] Use augur merge to unify nextclade and NCBI metadata [#21]

---
 ingest/rules/nextclade.smk | 33 +++++++++++----------------------
 1 file changed, 11 insertions(+), 22 deletions(-)

diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
index da4dd75..d35939d 100644
--- a/ingest/rules/nextclade.smk
+++ b/ingest/rules/nextclade.smk
@@ -74,7 +74,7 @@ rule join_metadata_and_nextclade:
     input:
         nextclade="results/nextclade.tsv",
         metadata="data/subset_metadata.tsv",
-        nextclade_field_map=config["nextclade"]["field_map"],
+        nextclade_metadata="results/nextclade_metadata.tsv",
     output:
         metadata="results/metadata.tsv",
     params:
@@ -86,25 +86,14 @@ rule join_metadata_and_nextclade:
         "benchmarks/join_metadata_and_nextclade.txt",
     shell:
         r"""
-        (
-            export
SUBSET_FIELDS=`grep -v '^#' {input.nextclade_field_map} | awk '{{print $1}}' | tr '\n' ',' | sed 's/,$//g'` - - csvtk -t cut -f $SUBSET_FIELDS \ - {input.nextclade} \ - | csvtk -t rename2 \ - -F \ - -f '*' \ - -p '(.+)' \ - -r '{{kv}}' \ - -k {input.nextclade_field_map} \ - | tsv-join -H \ - --filter-file - \ - --key-fields {params.nextclade_id_field} \ - --data-fields {params.metadata_id_field} \ - --append-fields '*' \ - --write-all ? \ - {input.metadata} \ - | tsv-select -H --exclude {params.nextclade_id_field} \ - > {output.metadata} - ) 2>{log:q} + augur merge \ + --metadata \ + metadata={input.metadata:q} \ + nextclade={input.nextclade_metadata:q} \ + --metadata-id-columns \ + metadata={params.metadata_id_field:q} \ + nextclade={params.nextclade_id_field:q} \ + --output-metadata {output.metadata:q} \ + --no-source-columns \ + &> {log:q} """ From c1c19ac2fd688c72e6cdbe2e6941415211cf3085 Mon Sep 17 00:00:00 2001 From: John SJ Anderson Date: Fri, 20 Dec 2024 12:11:27 -0800 Subject: [PATCH 4/4] Remove an extraneous temp() that Snakemake just started complaining about [#21] --- ingest/rules/curate.smk | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk index 308b368..f7effcc 100644 --- a/ingest/rules/curate.smk +++ b/ingest/rules/curate.smk @@ -118,7 +118,7 @@ rule curate: rule add_genbank_url: input: - metadata=temp("data/all_metadata_intermediate.tsv"), + metadata="data/all_metadata_intermediate.tsv", output: metadata="data/all_metadata.tsv", log: