Skip to content

Commit

Permalink
Add genotype metadata
Browse files Browse the repository at this point in the history
This commit adds genotype metadata that is reported on NCBI to the ingest output, and allows visualization of that information on the genome tree and the N450 tree.

Many samples have genotype information reported within the `virus-name` field of the NDJSON record from NCBI. This commit uses a custom script to parse the genotypes from that field and outputs the parsed genotypes to a column named `genotype_ncbi`.

Genotypes are not available on NCBI for the vaccine strains or WHO genotype reference strains, and so this commit manually adds the genotypes for those strains. Information for WHO genotype reference strains is from https://www.who.int/publications/i/item/WER8709
  • Loading branch information
kimandrews committed Apr 25, 2024
1 parent 70248db commit cce8b3c
Show file tree
Hide file tree
Showing 7 changed files with 133 additions and 1 deletion.
60 changes: 60 additions & 0 deletions ingest/bin/parse-measles-genotype-names.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#! /usr/bin/env python3
"""
Reads NDJSON records from stdin, parses the genotype from GenBank's 'virus-name'
field of each record into a new 'genotype_ncbi' field, and writes the modified
records to stdout.
"""

import argparse
import json
from sys import stdin, stdout, stderr

import re

# Genotype labels accepted as valid parser output. main() reports any parsed
# value that is not in this list on stderr and blanks it in the output record.
EXPECTED_GENOTYPES = [
    'A',
    'B1', 'B2', 'B3',
    'C1', 'C2',
    'D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'D10', 'D11',
    'E',
    'F',
    'G1', 'G2', 'G3',
    'H1', 'H2',
]

def parse_args():
    """
    Build the command-line interface and parse the process arguments.

    Returns the argparse namespace; its `genotype_field` attribute names the
    record field that holds the raw text to parse (default: 'virus_name').
    """
    cli = argparse.ArgumentParser(
        description="Modify measles virus-name attribute to extract genotypes to 'genotype_ncbi'.",
    )
    cli.add_argument(
        "--genotype-field",
        default='virus_name',
        help="Field from the records to use as the genotype to be parsed.",
    )
    parsed = cli.parse_args()
    return parsed

def _set_genotype_name(record):
genotype_name = record["genotype_ncbi"]

genotype_name = genotype_name.replace('Measles virus genotype ', '')
genotype_name = re.sub(r'Measles morbillivirus.*$', r'', genotype_name)
genotype_name = re.sub(r'.*?\[(.*)\]$', r'\1', genotype_name) # If square brackets present at end of string, keep only the text inside the brackets
genotype_name = re.sub(r'Measles virus MVs.*$', r'', genotype_name)
genotype_name = re.sub(r'Measles virus MVi.*$', r'', genotype_name)
genotype_name = re.sub(r'Measles virus strain MVi.*$', r'', genotype_name)
genotype_name = genotype_name.replace('Measles virus strain ', '')
genotype_name = re.sub(r'Measles virus.*$', r'', genotype_name)
genotype_name = re.sub(r'A-vaccine.*$', r'A', genotype_name)
genotype_name = re.sub(r'B3.1', r'B3', genotype_name)
genotype_name = re.sub(r'B3.2', r'B3', genotype_name)
genotype_name = re.sub(r'D4a', r'D4', genotype_name)
genotype_name = re.sub(r'D4b', r'D4', genotype_name)
genotype_name = re.sub(r'H1a', r'H1', genotype_name)
genotype_name = re.sub(r'H1b', r'H1', genotype_name)

return (
genotype_name)

def main():
    """
    Stream NDJSON records from stdin to stdout, adding a 'genotype_ncbi' field.

    For every input line the configured source field is copied into
    'genotype_ncbi' and reduced to a genotype label by _set_genotype_name().
    Labels that are not in EXPECTED_GENOTYPES are reported on stderr and
    replaced with an empty string before the record is written out.
    """
    source_field = parse_args().genotype_field

    for index, line in enumerate(stdin):
        record = json.loads(line)

        # Seed the output field with the raw text, then parse it in place.
        record['genotype_ncbi'] = record[source_field]
        record['genotype_ncbi'] = _set_genotype_name(record)

        recognised = record['genotype_ncbi'] in EXPECTED_GENOTYPES
        if not recognised:
            print(f"WARNING: unexpected NCBI genotype {record['genotype_ncbi']} parsed from record {index} will be excluded.", file=stderr)
            record['genotype_ncbi'] = ''

        serialised = json.dumps(record)
        stdout.write(serialised + "\n")


# Run the filter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
34 changes: 34 additions & 0 deletions ingest/defaults/annotations.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -14,16 +14,22 @@ AF266288 region North America
AF266288 country USA
AF266288 division Massachusetts
AF266288 location Boston
AF266288 genotype_ncbi A
AF266287 strain Measles vaccine strain Moraten
AF266287 date 1954
AF266287 genotype_ncbi A
AF266290 strain Measles vaccine strain Zagreb
AF266290 date 1954
AF266290 genotype_ncbi A
AF266289 strain Measles vaccine strain Rubeovax
AF266289 date 1954
AF266289 genotype_ncbi A
AF266291 strain Measles vaccine strain Schwarz
AF266291 date 1954
AF266291 genotype_ncbi A
AF266286 strain Measles vaccine strain AIK-C
AF266286 date 1954
AF266286 genotype_ncbi A
#
# WHO genotype reference strains
# Information from https://www.who.int/publications/i/item/WER8709
Expand Down Expand Up @@ -56,6 +62,34 @@ U64582 is_reference TRUE
X84865 is_reference TRUE
X84872 is_reference TRUE
X84879 is_reference TRUE
AF045212 genotype_ncbi H1
AF045217 genotype_ncbi H2
AF079555 genotype_ncbi D5
AF171232 genotype_ncbi G2
AF243450 genotype_ncbi D7
AF280803 genotype_ncbi D8
AF481485 genotype_ncbi D9
AJ232203 genotype_ncbi B3
AY037020 genotype_ncbi D7
AY043459 genotype_ncbi C1
AY184217 genotype_ncbi G3
AY923185 genotype_ncbi D10
D01005 genotype_ncbi D1
GU440571 genotype_ncbi D11
L46750 genotype_ncbi D6
L46753 genotype_ncbi B3
L46758 genotype_ncbi D5
M89921 genotype_ncbi C2
U01974 genotype_ncbi G1
U01976 genotype_ncbi D4
U01977 genotype_ncbi D3
U01987 genotype_ncbi A
U01994 genotype_ncbi B2
U01998 genotype_ncbi B1
U64582 genotype_ncbi D2
X84865 genotype_ncbi F
X84872 genotype_ncbi C2
X84879 genotype_ncbi E
AF045212 strain MVi/Hunan.CHN/0.93/7
AF045217 strain MVi/Beijing.CHN/0.94/1
AF079555 strain MVi/Bangkok.THA/0.93/1
Expand Down
3 changes: 2 additions & 1 deletion ingest/defaults/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ curate:
'authors',
'abbr_authors',
'institution',
'genotype_ncbi',
'is_reference'
]

genotype_field: "virus_name"
2 changes: 2 additions & 0 deletions ingest/rules/curate.smk
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,7 @@ rule curate:
annotations_id=config["curate"]["annotations_id"],
id_field=config["curate"]["output_id_field"],
sequence_field=config["curate"]["output_sequence_field"],
genotype_field=config["curate"]["genotype_field"],
shell:
"""
(cat {input.sequences_ndjson} \
Expand All @@ -105,6 +106,7 @@ rule curate:
--abbr-authors-field {params.abbr_authors_field} \
| ./vendored/apply-geolocation-rules \
--geolocation-rules {input.all_geolocation_rules} \
| ./bin/parse-measles-genotype-names.py --genotype-field {params.genotype_field} \
| ./vendored/merge-user-metadata \
--annotations {input.annotations} \
--id-field {params.annotations_id} \
Expand Down
5 changes: 5 additions & 0 deletions phylogenetic/defaults/auspice_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,11 @@
"key": "region",
"title": "Region",
"type": "categorical"
},
{
"key": "genotype_ncbi",
"title": "Genotype (NCBI)",
"type": "categorical"
}
],
"geo_resolutions": [
Expand Down
5 changes: 5 additions & 0 deletions phylogenetic/defaults/auspice_config_N450.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,11 @@
"title": "Region",
"type": "categorical"
},
{
"key": "genotype_ncbi",
"title": "Genotype (NCBI)",
"type": "categorical"
},
{
"key": "is_reference",
"title": "WHO reference",
Expand Down
25 changes: 25 additions & 0 deletions phylogenetic/defaults/colors.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,28 @@ region Africa #8ABB6A
region Europe #BEBB48
region South America #E29E39
region North America #E2562B

genotype_ncbi A #5E1D9D
genotype_ncbi B1 #4B26B1
genotype_ncbi B2 #4138C3
genotype_ncbi B3 #3F4FCC
genotype_ncbi C1 #4065CF
genotype_ncbi C2 #447ACD
genotype_ncbi D1 #4A8BC3
genotype_ncbi D2 #529AB6
genotype_ncbi D3 #5BA6A6
genotype_ncbi D4 #66AE95
genotype_ncbi D5 #73B583
genotype_ncbi D6 #81B973
genotype_ncbi D7 #91BC64
genotype_ncbi D8 #A1BE58
genotype_ncbi D9 #B1BD4E
genotype_ncbi D10 #C0BA47
genotype_ncbi D11 #CEB541
genotype_ncbi E #DAAD3D
genotype_ncbi F #E19F3A
genotype_ncbi G1 #E68E36
genotype_ncbi G2 #E67832
genotype_ncbi G3 #E35F2D
genotype_ncbi H1 #DF4328
genotype_ncbi H2 #DB2823

0 comments on commit cce8b3c

Please sign in to comment.