Add genotype metadata

This commit adds genotype metadata that is reported on NCBI to the ingest output, and allows visualization of that information on the genome tree and the N450 tree. Many samples have genotype information reported within the `virus-name` field of the NDJSON record from NCBI. This commit uses a custom script to parse the genotypes from that field and outputs the parsed genotypes to a column named `genotype_ncbi`. Genotypes are not available on NCBI for the vaccine strains or WHO genotype reference strains, and so this commit manually adds the genotypes for those strains. Information for WHO genotype reference strains is from https://www.who.int/publications/i/item/WER8709
nextstrain · Apr 25, 2024 · cce8b3c · cce8b3c
1 parent 70248db
commit cce8b3c
Show file tree

Hide file tree

Showing 7 changed files with 133 additions and 1 deletion.
diff --git a/ingest/bin/parse-measles-genotype-names.py b/ingest/bin/parse-measles-genotype-names.py
@@ -0,0 +1,60 @@
+#! /usr/bin/env python3
+"""
+Reads NDJSON records from stdin and parses the genotype out of the field named by --genotype-field (by default 'virus_name', GenBank's virus name) into a new 'genotype_ncbi' field.
+
+Outputs the modified record to stdout.
+"""
+
+import argparse
+import json
+from sys import stdin, stdout, stderr
+
+import re
+
+EXPECTED_GENOTYPES = ['A', 'B1', 'B2', 'B3', 'C1', 'C2', 'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11', 'E', 'F', 'G1', 'G2', 'G3', 'H1', 'H2']  # genotypes main() accepts; any other parsed value is blanked with a warning
+
+def parse_args():
+    """Build the CLI parser and return the parsed arguments (--genotype-field)."""
+    cli = argparse.ArgumentParser(
+        description="Modify measles virus-name attribute to extract genotypes to 'genotype_ncbi'.")
+    cli.add_argument(
+        "--genotype-field", default='virus_name',
+        help="Field from the records to use as the genotype to be parsed.")
+    return cli.parse_args()
+
+def _set_genotype_name(record):
+    """Return the genotype parsed from the virus name in record['genotype_ncbi'].
+
+    Boilerplate and strain names are removed, text inside trailing square
+    brackets is kept, and sub-genotypes collapse to their parent genotype.
+    The result is not validated here and may be '' or an unrecognised string.
+    """
+    genotype_name = record["genotype_ncbi"].strip()  # tolerate stray whitespace
+    rules = (  # regex substitutions, applied strictly in this order
+        (r'Measles virus genotype ', r''), (r'Measles morbillivirus.*$', r''),
+        (r'.*?\[(.*)\]$', r'\1'),  # keep only the text inside trailing brackets
+        (r'Measles virus MVs.*$', r''), (r'Measles virus MVi.*$', r''),
+        (r'Measles virus strain MVi.*$', r''), (r'Measles virus strain ', r''),
+        (r'Measles virus.*$', r''), (r'A-vaccine.*$', r'A'),
+        (r'B3\.1', r'B3'), (r'B3\.2', r'B3'),  # escaped: '.' must be a literal dot
+        (r'D4a', r'D4'), (r'D4b', r'D4'), (r'H1a', r'H1'), (r'H1b', r'H1'),
+    )
+
+    for pattern, replacement in rules:
+        genotype_name = re.sub(pattern, replacement, genotype_name)
+    return genotype_name.strip()
+
+def main():
+    """Copy NDJSON records from stdin to stdout, adding a 'genotype_ncbi' field."""
+    args = parse_args()
+    for index, record in enumerate(map(json.loads, stdin)):
+        record['genotype_ncbi'] = record[args.genotype_field]
+        record['genotype_ncbi'] = _set_genotype_name(record)
+        recognised = record['genotype_ncbi'] in EXPECTED_GENOTYPES
+        if not recognised:  # unrecognised values are reported, then blanked
+            print(f"WARNING: unexpected NCBI genotype {record['genotype_ncbi']} parsed from record {index} will be excluded.", file=stderr)
+            record['genotype_ncbi'] = ''
+        stdout.write(json.dumps(record) + "\n")
+
+if __name__ == "__main__":
+    main()
diff --git a/ingest/defaults/annotations.tsv b/ingest/defaults/annotations.tsv
@@ -14,16 +14,22 @@ AF266288	region	North America
 AF266288	country	USA
 AF266288	division	Massachusetts
 AF266288	location	Boston
+AF266288	genotype_ncbi	A
 AF266287	strain	Measles vaccine strain Moraten
 AF266287	date	1954
+AF266287	genotype_ncbi	A
 AF266290	strain	Measles vaccine strain Zagreb
 AF266290	date	1954
+AF266290	genotype_ncbi	A
 AF266289	strain	Measles vaccine strain Rubeovax
 AF266289	date	1954
+AF266289	genotype_ncbi	A
 AF266291	strain	Measles vaccine strain Schwarz
 AF266291	date	1954
+AF266291	genotype_ncbi	A
 AF266286	strain	Measles vaccine strain AIK-C
 AF266286	date	1954
+AF266286	genotype_ncbi	A
 #
 # WHO genotype reference strains
 # Information from https://www.who.int/publications/i/item/WER8709
@@ -56,6 +62,34 @@ U64582	is_reference	TRUE
 X84865	is_reference	TRUE
 X84872	is_reference	TRUE
 X84879	is_reference	TRUE
+AF045212	genotype_ncbi	H1
+AF045217	genotype_ncbi	H2
+AF079555	genotype_ncbi	D5
+AF171232	genotype_ncbi	G2
+AF243450	genotype_ncbi	D7
+AF280803	genotype_ncbi	D8
+AF481485	genotype_ncbi	D9
+AJ232203	genotype_ncbi	B3
+AY037020	genotype_ncbi	D7
+AY043459	genotype_ncbi	C1
+AY184217	genotype_ncbi	G3
+AY923185	genotype_ncbi	D10
+D01005	genotype_ncbi	D1
+GU440571	genotype_ncbi	D11
+L46750	genotype_ncbi	D6
+L46753	genotype_ncbi	B3
+L46758	genotype_ncbi	D5
+M89921	genotype_ncbi	C2
+U01974	genotype_ncbi	G1
+U01976	genotype_ncbi	D4
+U01977	genotype_ncbi	D3
+U01987	genotype_ncbi	A
+U01994	genotype_ncbi	B2
+U01998	genotype_ncbi	B1
+U64582	genotype_ncbi	D2
+X84865	genotype_ncbi	F
+X84872	genotype_ncbi	C2
+X84879	genotype_ncbi	E
 AF045212	strain	MVi/Hunan.CHN/0.93/7
 AF045217	strain	MVi/Beijing.CHN/0.94/1
 AF079555	strain	MVi/Bangkok.THA/0.93/1

diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
@@ -118,6 +118,7 @@ curate:
     'authors',
     'abbr_authors',
     'institution',
+    'genotype_ncbi',
     'is_reference'
   ]
-
+  genotype_field: "virus_name"
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
@@ -82,6 +82,7 @@ rule curate:
         annotations_id=config["curate"]["annotations_id"],
         id_field=config["curate"]["output_id_field"],
         sequence_field=config["curate"]["output_sequence_field"],
+        genotype_field=config["curate"]["genotype_field"],
     shell:
         """
         (cat {input.sequences_ndjson} \
@@ -105,6 +106,7 @@ rule curate:
                 --abbr-authors-field {params.abbr_authors_field} \
             | ./vendored/apply-geolocation-rules \
                 --geolocation-rules {input.all_geolocation_rules} \
+            | ./bin/parse-measles-genotype-names.py --genotype-field {params.genotype_field} \
             | ./vendored/merge-user-metadata \
                 --annotations {input.annotations} \
                 --id-field {params.annotations_id} \

diff --git a/phylogenetic/defaults/auspice_config.json b/phylogenetic/defaults/auspice_config.json
@@ -25,6 +25,11 @@
       "key": "region",
       "title": "Region",
       "type": "categorical"
+    },
+    {
+      "key": "genotype_ncbi",
+      "title": "Genotype (NCBI)",
+      "type": "categorical"
     }
   ],
   "geo_resolutions": [

diff --git a/phylogenetic/defaults/auspice_config_N450.json b/phylogenetic/defaults/auspice_config_N450.json
@@ -26,6 +26,11 @@
       "title": "Region",
       "type": "categorical"
     },
+    {
+      "key": "genotype_ncbi",
+      "title": "Genotype (NCBI)",
+      "type": "categorical"
+    },
     {
       "key": "is_reference",
       "title": "WHO reference",

diff --git a/phylogenetic/defaults/colors.tsv b/phylogenetic/defaults/colors.tsv
@@ -4,3 +4,28 @@ region	Africa	#8ABB6A
 region	Europe	#BEBB48
 region	South America	#E29E39
 region	North America	#E2562B
+
+genotype_ncbi	A	#5E1D9D
+genotype_ncbi	B1	#4B26B1
+genotype_ncbi	B2	#4138C3
+genotype_ncbi	B3	#3F4FCC
+genotype_ncbi	C1	#4065CF
+genotype_ncbi	C2	#447ACD
+genotype_ncbi	D1	#4A8BC3
+genotype_ncbi	D2	#529AB6
+genotype_ncbi	D3	#5BA6A6
+genotype_ncbi	D4	#66AE95
+genotype_ncbi	D5	#73B583
+genotype_ncbi	D6	#81B973
+genotype_ncbi	D7	#91BC64
+genotype_ncbi	D8	#A1BE58
+genotype_ncbi	D9	#B1BD4E
+genotype_ncbi	D10	#C0BA47
+genotype_ncbi	D11	#CEB541
+genotype_ncbi	E	#DAAD3D
+genotype_ncbi	F	#E19F3A
+genotype_ncbi	G1	#E68E36
+genotype_ncbi	G2	#E67832
+genotype_ncbi	G3	#E35F2D
+genotype_ncbi	H1	#DF4328
+genotype_ncbi	H2	#DB2823