From d9e565403c79066ef1dcb5aab590039c56a64892 Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Wed, 8 Nov 2023 15:53:54 -0800 Subject: [PATCH] NCBI Dataset field name transformations Originally the field map was created to keep mpox NDJSON backward compatible with field names used from NCBI Virus. However, this constraint is not applicable to dengue. This commit organizes field renaming into two parts. 1. Rename the NCBI output columns to match the NCBI mnemonics (see `source-data/ncbi-dataset-field-map.tsv`) 2. Where necessary, rename the NCBI mnemonics to match Nextstrain expected column names (see "transform: field_map:" in `config/config.yaml`) For context and discussion, see https://github.com/nextstrain/dengue/pull/13#discussion_r1374892802 --- ingest/config/config.yaml | 28 +++++++++++++---- ingest/source-data/ncbi-dataset-field-map.tsv | 30 +++++++++---------- .../snakemake_rules/fetch_sequences.smk | 9 +++--- 3 files changed, 41 insertions(+), 26 deletions(-) diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index f2dc1abf..d1fc2896 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -10,14 +10,28 @@ transform: # Fields to rename. 
# This is the first step in the pipeline, so any references to field names # in the configs below should use the new field names - field_map: ['collected=date', 'released=date_released', 'genbank_accession=accession', 'submitting_organization=institution'] + field_map: [ + 'accession=genbank_accession', + 'accession-rev=genbank_accession_rev', + 'isolate-lineage=strain', + 'sourcedb=database', # necessary for applying geo location rules + 'geo-region=region', + 'geo-location=location', + 'host-name=host', + 'isolate-collection-date=date', + 'release-date=release_date', + 'update-date=update_date', + 'sra-accs=sra_accessions', + 'submitter-names=authors', + 'submitter-affiliation=institution', + ] # Standardized strain name regex # Currently accepts any characters because we do not have a clear standard for strain names strain_regex: '^.+$' # Back up strain name field if 'strain' doesn't match regex above strain_backup_fields: ['accession'] # List of date fields to standardize - date_fields: ['date', 'date_released'] + date_fields: ['date', 'release_date', 'update_date'] # Expected date formats present in date fields # These date formats should use directives expected by datetime # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes @@ -54,17 +68,19 @@ transform: sequence_field: 'sequence' # Final output columns for the metadata TSV metadata_columns: [ - 'accession', - 'genbank_accession_rev', 'strain', + 'genbank_accession', + 'genbank_accession_rev', 'date', 'region', 'country', 'division', 'location', + 'length', 'host', - 'date_released', - 'sra_accession', + 'release_date', + 'update_date', + 'sra_accessions', 'abbr_authors', 'authors', 'institution' diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv index 00c9db88..57b4f8c5 100644 --- a/ingest/source-data/ncbi-dataset-field-map.tsv +++ b/ingest/source-data/ncbi-dataset-field-map.tsv @@ -1,17 +1,17 @@ +# Maps the NCBI 
output TSV column names back to the NCBI mnemonics. +# This list should match the list in +# ingest/workflow/snakemake_rules/fetch_sequences.smk _get_ncbi_dataset_field_mnemonics key value -Accession genbank_accession_rev -Source database database -Isolate Lineage strain -Geographic Region region -Geographic Location location -Isolate Collection date collected -Release date released -Update date updated +Accession accession-rev +Source database sourcedb +Isolate Lineage isolate-lineage +Geographic Region geo-region +Geographic Location geo-location +Isolate Collection date isolate-collection-date +Release date release-date +Update date update-date Length length -Host Name host -Isolate Lineage source isolation_source -BioProjects bioproject_accession -BioSample accession biosample_accession -SRA Accessions sra_accession -Submitter Names authors -Submitter Affiliation submitting_organization +Host Name host-name +SRA Accessions sra-accs +Submitter Names submitter-names +Submitter Affiliation submitter-affiliation diff --git a/ingest/workflow/snakemake_rules/fetch_sequences.smk b/ingest/workflow/snakemake_rules/fetch_sequences.smk index 3f32f9b4..8d271930 100644 --- a/ingest/workflow/snakemake_rules/fetch_sequences.smk +++ b/ingest/workflow/snakemake_rules/fetch_sequences.smk @@ -76,8 +76,7 @@ def _get_ncbi_dataset_field_mnemonics(wildcards) -> str: rule format_ncbi_dataset_report: - # Formats the headers to be the same as before we used NCBI Datasets - # The only fields we do not have equivalents for are "title" and "publications" + # Formats the headers to match the NCBI mnemonic names input: dataset_package="data/ncbi_dataset.zip", ncbi_field_map=config["ncbi_field_map"], @@ -93,8 +92,8 @@ rule format_ncbi_dataset_report: --package {input.dataset_package} \ --fields {params.fields_to_include:q} \ | csvtk -tl rename2 -F -f '*' -p '(.+)' -r '{{kv}}' -k {input.ncbi_field_map} \ - | csvtk -tl mutate -f genbank_accession_rev -n genbank_accession -p "^(.+?)\." 
\ - | tsv-select -H -f genbank_accession --rest last \ + | csvtk -tl mutate -f accession-rev -n accession -p "^(.+?)\." \ + | tsv-select -H -f accession --rest last \ > {output.ncbi_dataset_tsv} """ @@ -114,7 +113,7 @@ rule format_ncbi_datasets_ndjson: augur curate passthru \ --metadata {input.ncbi_dataset_tsv} \ --fasta {input.ncbi_dataset_sequences} \ - --seq-id-column genbank_accession_rev \ + --seq-id-column accession-rev \ --seq-field sequence \ --unmatched-reporting warn \ --duplicate-reporting warn \