From d9e565403c79066ef1dcb5aab590039c56a64892 Mon Sep 17 00:00:00 2001
From: Jennifer Chang <jennifer.chang.bioinform@gmail.com>
Date: Wed, 8 Nov 2023 15:53:54 -0800
Subject: [PATCH] NCBI Dataset field name transformations

Originally the field map was created to keep mpox NDJSON backward compatible
with field names used from NCBI Virus. However, this constraint is not
applicable to dengue.

This commit organizes field renaming into two parts.

1. Rename the NCBI output columns to match the NCBI mnemonics
   (see `source-data/ncbi-dataset-field-map.tsv`)
2. Where necessary, rename the NCBI mnemonics to match Nextstrain expected column names
   (see "transform: field_map:" in `config/config.yaml`)

For context and discussion, see https://github.com/nextstrain/dengue/pull/13#discussion_r1374892802
---
 ingest/config/config.yaml                     | 28 +++++++++++++----
 ingest/source-data/ncbi-dataset-field-map.tsv | 30 +++++++++----------
 .../snakemake_rules/fetch_sequences.smk       |  9 +++---
 3 files changed, 41 insertions(+), 26 deletions(-)

diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml
index f2dc1abf..d1fc2896 100644
--- a/ingest/config/config.yaml
+++ b/ingest/config/config.yaml
@@ -10,14 +10,28 @@ transform:
   # Fields to rename.
   # This is the first step in the pipeline, so any references to field names
   # in the configs below should use the new field names
-  field_map: ['collected=date', 'released=date_released', 'genbank_accession=accession', 'submitting_organization=institution']
+  field_map: [
+    'accession=genbank_accession',
+    'accession-rev=genbank_accession_rev',
+    'isolate-lineage=strain',
+    'sourcedb=database',  # necessary for applying geo location rules
+    'geo-region=region',
+    'geo-location=location',
+    'host-name=host',
+    'isolate-collection-date=date',
+    'release-date=release_date',
+    'update-date=update_date',
+    'sra-accs=sra_accessions',
+    'submitter-names=authors',
+    'submitter-affiliation=institution',
+  ]
   # Standardized strain name regex
   # Currently accepts any characters because we do not have a clear standard for strain names
   strain_regex: '^.+$'
   # Back up strain name field if 'strain' doesn't match regex above
-  strain_backup_fields: ['accession']
+  strain_backup_fields: ['genbank_accession']
   # List of date fields to standardize
-  date_fields: ['date', 'date_released']
+  date_fields: ['date', 'release_date', 'update_date']
   # Expected date formats present in date fields
   # These date formats should use directives expected by datetime
   # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
@@ -54,17 +68,19 @@ transform:
   sequence_field: 'sequence'
   # Final output columns for the metadata TSV
   metadata_columns: [
-    'accession',
-    'genbank_accession_rev',
     'strain',
+    'genbank_accession',
+    'genbank_accession_rev',
     'date',
     'region',
     'country',
     'division',
     'location',
+    'length',
     'host',
-    'date_released',
-    'sra_accession',
+    'release_date',
+    'update_date',
+    'sra_accessions',
     'abbr_authors',
     'authors',
     'institution'
diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv
index 00c9db88..57b4f8c5 100644
--- a/ingest/source-data/ncbi-dataset-field-map.tsv
+++ b/ingest/source-data/ncbi-dataset-field-map.tsv
@@ -1,17 +1,17 @@
+# Maps the NCBI output TSV column names back to the NCBI mnemonics.
+# This list should match the list in
+# ingest/workflow/snakemake_rules/fetch_sequences.smk _get_ncbi_dataset_field_mnemonics
 key	value
-Accession	genbank_accession_rev
-Source database	database
-Isolate Lineage	strain
-Geographic Region	region
-Geographic Location	location
-Isolate Collection date	collected
-Release date	released
-Update date	updated
+Accession	accession-rev
+Source database	sourcedb
+Isolate Lineage	isolate-lineage
+Geographic Region	geo-region
+Geographic Location	geo-location
+Isolate Collection date	isolate-collection-date
+Release date	release-date
+Update date	update-date
 Length	length
-Host Name	host
-Isolate Lineage source	isolation_source
-BioProjects	bioproject_accession
-BioSample accession	biosample_accession
-SRA Accessions	sra_accession
-Submitter Names	authors
-Submitter Affiliation	submitting_organization
+Host Name	host-name
+SRA Accessions	sra-accs
+Submitter Names	submitter-names
+Submitter Affiliation	submitter-affiliation
diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk
index 3f32f9b4..8d271930 100644
--- a/ingest/workflow/snakemake_rules/fetch_sequences.smk
+++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk
@@ -76,8 +76,7 @@ def _get_ncbi_dataset_field_mnemonics(wildcards) -> str:
 
 
 rule format_ncbi_dataset_report:
-    # Formats the headers to be the same as before we used NCBI Datasets
-    # The only fields we do not have equivalents for are "title" and "publications"
+    # Formats the headers to match the NCBI mnemonic names
     input:
         dataset_package="data/ncbi_dataset.zip",
         ncbi_field_map=config["ncbi_field_map"],
@@ -93,8 +92,8 @@ rule format_ncbi_dataset_report:
             --package {input.dataset_package} \
             --fields {params.fields_to_include:q} \
             | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \
-            | csvtk -tl mutate -f genbank_accession_rev -n genbank_accession -p "^(.+?)\." \
-            | tsv-select -H -f genbank_accession --rest last \
+            | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." \
+            | tsv-select -H -f accession --rest last \
             > {output.ncbi_dataset_tsv}
         """
 
@@ -114,7 +113,7 @@ rule format_ncbi_datasets_ndjson:
         augur curate passthru \
             --metadata {input.ncbi_dataset_tsv} \
             --fasta {input.ncbi_dataset_sequences} \
-            --seq-id-column genbank_accession_rev \
+            --seq-id-column accession-rev \
             --seq-field sequence \
             --unmatched-reporting warn \
             --duplicate-reporting warn \