From c78d665eb5c1d8f115672404663e03d55fbb2ffc Mon Sep 17 00:00:00 2001 From: Jennifer Chang Date: Fri, 27 Oct 2023 10:46:18 -0700 Subject: [PATCH] Attempt to eradicate confounding of release and submission Co-authored-by: Cornelius Roemer --- ingest/config/config.yaml | 6 +++--- ingest/source-data/ncbi-dataset-field-map.tsv | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index a8e26f70..a2a4d8f6 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -6,14 +6,14 @@ transform: # Fields to rename. # This is the first step in the pipeline, so any references to field names # in the configs below should use the new field names - field_map: ['collected=date', 'submitted=date_submitted', 'genbank_accession=accession', 'submitting_organization=institution'] + field_map: ['collected=date', 'genbank_accession=accession', 'submitting_organization=institution'] # Standardized strain name regex # Currently accepts any characters because we do not have a clear standard for strain names strain_regex: '^.+$' # Back up strain name field if 'strain' doesn't match regex above strain_backup_fields: ['accession'] # List of date fields to standardize - date_fields: ['date', 'date_submitted'] + date_fields: ['date', 'date_released'] # Expected date formats present in date fields # These date formats should use directives expected by datetime # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes @@ -59,7 +59,7 @@ transform: 'division', 'location', 'host', - 'date_submitted', + 'date_released', 'sra_accession', 'abbr_authors', 'reverse', diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv index eb794184..aedb9abf 100644 --- a/ingest/source-data/ncbi-dataset-field-map.tsv +++ b/ingest/source-data/ncbi-dataset-field-map.tsv @@ -5,7 +5,7 @@ Isolate Lineage strain Geographic Region region Geographic Location location Isolate Collection date collected -Release date submitted +Release date date_released Update date updated Length length Host Name host