Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/parse nextclade annotation #1263

Open
wants to merge 16 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

## __NEXT__

* ancestral: add functionality to reconstruct ancestral amino acid sequences and add inferred mutations to the `node_data_json` with output equivalent to `augur translate`. `ancestral` now takes an annotation (`--annotation`), a list of genes (`--genes`), and a file name pattern for amino acid alignments (`--translations`). Mutations for each of these genes will be inferred and added to the output json to each node as a list at `['aa_muts'][gene]`. The annotations will be added to the `annotation` field in the output json.
* ancestral: add the ability to report mutations relative to a sequence other than the inferred root of the tree. This sequence can be specified via `--root-sequence` and difference between this sequence and the inferred root of the tree will be added as mutations to the root node for nucleotides and amino acids. All differences between the This was previously already possible for `vcf` input via `--vcf-reference`.

## 22.1.0 (10 July 2023)

Expand Down
172 changes: 122 additions & 50 deletions augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@
from collections import defaultdict

def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
marginal=False, fill_overhangs=True, infer_tips=False):
marginal=False, fill_overhangs=True, infer_tips=False,
alphabet='nuc'):
"""infer ancestral sequences using TreeTime

Parameters
Expand All @@ -46,17 +47,20 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
This is only relevant when tip-state are not exactly specified, e.g. via
characters that signify ambiguous states. To replace those with the
most-likely state, set infer_tips=True
alphabet : str
alphabet to use for ancestral sequence inference. Default is the nucleotide
alphabet that included a gap character 'nuc'. Alternative is `aa` for amino
acids.

Returns
-------
treetime.TreeAnc
treetime.TreeAnc instance
"""

from treetime import TreeAnc, version as treetime_version
print(f"augur ancestral is using TreeTime version {treetime_version}")
from treetime import TreeAnc

tt = TreeAnc(tree=tree, aln=aln, ref=ref, gtr='JC69',
tt = TreeAnc(tree=tree, aln=aln, ref=ref, gtr='JC69', alphabet=alphabet,
fill_overhangs=fill_overhangs, verbose=1)

# convert marginal (from args.inference) from 'joint' or 'marginal' to True or False
Expand All @@ -66,13 +70,9 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
tt.infer_ancestral_sequences(infer_gtr=infer_gtr, marginal=bool_marginal,
reconstruct_tip_states=infer_tips)

print("\nInferred ancestral sequence states using TreeTime:"
"\n\tSagulenko et al. TreeTime: Maximum-likelihood phylodynamic analysis"
"\n\tVirus Evolution, vol 4, https://academic.oup.com/ve/article/4/1/vex042/4794731\n")

return tt

def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False, character_map=None, mask_ambiguous=True):
def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False, character_map=None, is_vcf=False):
"""iterates of the tree and produces dictionaries with
mutations and sequences for each node.

Expand Down Expand Up @@ -103,20 +103,19 @@ def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False,
data[n.name]['muts'] = [a+str(int(pos)+inc)+cm(d)
for a,pos,d in n.mutations]

mask=None
if full_sequences:
if mask_ambiguous:
# Identify sites for which every terminal sequence is ambiguous.
# These sites will be masked to prevent rounding errors in the
# maximum likelihood inference from assigning an arbitrary
# nucleotide to sites at internal nodes.
ambiguous_count = np.zeros(tt.sequence_length, dtype=int)
for n in tt.tree.get_terminals():
ambiguous_count += np.array(tt.sequence(n,reconstructed=False, as_string=False)==tt.gtr.ambiguous, dtype=int)
mask = ambiguous_count==len(tt.tree.get_terminals())
else:
mask = np.zeros(tt.sequence_length, dtype=bool)
if is_vcf:
mask = np.zeros(tt.sequence_length, dtype=bool)
else:
# Identify sites for which every terminal sequence is ambiguous.
# These sites will be masked to prevent rounding errors in the
# maximum likelihood inference from assigning an arbitrary
# nucleotide to sites at internal nodes.
ambiguous_count = np.zeros(tt.sequence_length, dtype=int)
for n in tt.tree.get_terminals():
ambiguous_count += np.array(tt.sequence(n,reconstructed=False, as_string=False)==tt.gtr.ambiguous, dtype=int)
mask = ambiguous_count==len(tt.tree.get_terminals())

if full_sequences:
for n in tt.tree.find_clades():
try:
tmp = tt.sequence(n,reconstructed=infer_tips, as_string=False)
Expand All @@ -127,16 +126,59 @@ def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False,

return {"nodes": data, "mask": mask}

def run_ancestral(T, aln, root_sequence=None, is_vcf=False, full_sequences=False, fill_overhangs=False,
infer_ambiguous=False, marginal=False, alphabet='nuc'):
tt = ancestral_sequence_inference(tree=T, aln=aln, ref=root_sequence if is_vcf else None, marginal=marginal,
fill_overhangs = fill_overhangs, alphabet=alphabet,
infer_tips = infer_ambiguous)

character_map = {}
for x in tt.gtr.profile_map:
if tt.gtr.profile_map[x].sum()==tt.gtr.n_states:
# TreeTime treats all characters that are not valid IUPAC nucleotide chars as fully ambiguous
# To clean up auspice output, we map all those to 'N'
character_map[x] = 'N'
else:
character_map[x] = x
# add reference sequence to json structure. This is the sequence with
# respect to which mutations on the tree are defined.
if root_sequence:
root_seq = root_sequence
else:
root_seq = tt.sequence(T.root, as_string=True)

mutations = collect_mutations_and_sequences(tt, full_sequences=full_sequences,
infer_tips=infer_ambiguous, character_map=character_map, is_vcf=is_vcf)
if root_sequence:
for pos, (root_state, tree_state) in enumerate(zip(root_sequence, tt.sequence(tt.tree.root, reconstructed=infer_ambiguous, as_string=True))):
if root_state != tree_state:
mutations['nodes'][tt.tree.root.name]['muts'].append(f"{root_state}{pos+1}{tree_state}")

return {'tt': tt,
'root_seq': root_seq,
'mutations': mutations}


def register_parser(parent_subparsers):
parser = parent_subparsers.add_parser("ancestral", help=__doc__)
parser.add_argument('--tree', '-t', required=True, help="prebuilt Newick")
parser.add_argument('--alignment', '-a', help="alignment in fasta or VCF format")
# FIXME: these three arguments should either be all there or none
parser.add_argument('--annotation',
help='GenBank, GFF, nextclade annotation json file containing the annotation')
parser.add_argument('--genes', nargs='+', help="genes to translate (list or file containing list)")
parser.add_argument('--translations', type=str, help="translated alignments for each CDS/Gene. "
"Currently only supported for fasta-input. Specify the file name via a "
"template like 'my_alignment_%%GENE.fasta' where %%GENE will be replaced "
"by the gene name.")
###
parser.add_argument('--output-node-data', type=str, help='name of JSON file to save mutations and ancestral sequences to')
parser.add_argument('--output-sequences', type=str, help='name of FASTA file to save ancestral sequences to (FASTA alignments only)')
parser.add_argument('--inference', default='joint', choices=["joint", "marginal"],
help="calculate joint or marginal maximum likelihood ancestral sequence states")
parser.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to (only used if a VCF is provided as the alignment)')
parser.add_argument('--root-sequence', type=str, help='fasta/genbank file of the sequence that is used as root for mutation calling.'
' Differences between this sequence and the inferred root will be reported as mutations on the root branch.')
parser.add_argument('--output-vcf', type=str, help='name of output VCF file which will include ancestral seqs')
ambiguous = parser.add_mutually_exclusive_group()
ambiguous.add_argument('--keep-ambiguous', action="store_true",
Expand All @@ -151,7 +193,6 @@ def run(args):
# check alignment type, set flags, read in if VCF
is_vcf = any([args.alignment.lower().endswith(x) for x in ['.vcf', '.vcf.gz']])
ref = None
anc_seqs = {}

try:
T = read_tree(args.tree)
Expand Down Expand Up @@ -179,10 +220,26 @@ def run(args):
ref = compress_seq['reference']
else:
aln = args.alignment
ref = None
if args.root_sequence:
for fmt in ['fasta', 'genbank']:
try:
ref = SeqIO.read(args.root_sequence, fmt)
break
except:
pass
if ref is None:
print(f"ERROR: could not read root sequence from {args.root_sequence}", file=sys.stderr)
return 1

# Enforce treetime 0.7 or later
from distutils.version import StrictVersion
import treetime
print("\nInferred ancestral sequence states using TreeTime:"
"\n\tSagulenko et al. TreeTime: Maximum-likelihood phylodynamic analysis"
"\n\tVirus Evolution, vol 4, https://academic.oup.com/ve/article/4/1/vex042/4794731\n")

print(f"augur ancestral is using TreeTime version {treetime.version}")
if StrictVersion(treetime.version) < StrictVersion('0.7.0'):
print("ERROR: this version of augur requires TreeTime 0.7 or later.", file=sys.stderr)
return 1
Expand All @@ -191,36 +248,51 @@ def run(args):
# explicitly or by default) and the user has not explicitly requested that
# we keep them.
infer_ambiguous = args.infer_ambiguous and not args.keep_ambiguous

tt = ancestral_sequence_inference(tree=T, aln=aln, ref=ref, marginal=args.inference,
fill_overhangs = not(args.keep_overhangs),
infer_tips = infer_ambiguous)

character_map = {}
for x in tt.gtr.profile_map:
if tt.gtr.profile_map[x].sum()==tt.gtr.n_states:
# TreeTime treats all characters that are not valid IUPAC nucleotide chars as fully ambiguous
# To clean up auspice output, we map all those to 'N'
character_map[x] = 'N'
else:
character_map[x] = x

anc_seqs.update(collect_mutations_and_sequences(tt, full_sequences=not is_vcf,
infer_tips=infer_ambiguous, character_map=character_map))
# add reference sequence to json structure. This is the sequence with
# respect to which mutations on the tree are defined.
if is_vcf:
anc_seqs['reference'] = {"nuc":compress_seq['reference']}
else:
root_seq = tt.sequence(T.root, as_string=False)
if anc_seqs.get("mask") is not None:
root_seq[anc_seqs['mask']] = tt.gtr.ambiguous
anc_seqs['reference'] = {"nuc": ''.join(root_seq)}
full_sequences = not is_vcf
nuc_result = run_ancestral(T, aln, root_sequence=str(ref.seq) if ref else None, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
full_sequences=full_sequences, marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='nuc')
anc_seqs = nuc_result['mutations']
anc_seqs['reference'] = {'nuc': nuc_result['root_seq']}

if anc_seqs.get("mask") is not None:
anc_seqs["mask"] = "".join(['1' if x else '0' for x in anc_seqs["mask"]])

anc_seqs['annotations'] = {'nuc': {'start': 1, 'end': len(anc_seqs['reference']['nuc']), 'strand': '+'}}
anc_seqs['annotations'] = {'nuc': {'start': 1, 'end': len(anc_seqs['reference']['nuc']),
'strand': '+', 'type': 'source'}}

if not is_vcf and args.genes:
from .utils import load_features
## load features; only requested features if genes given
features = load_features(args.annotation, args.genes)
if features is None:
print("ERROR: could not read features of reference sequence file")
return 1
print("Read in {} features from reference sequence file".format(len(features)))
for gene in args.genes:
print(f"Processing gene: {gene}")
fname = args.translations.replace("%GENE", gene)
feat = features[gene]
root_seq = str(feat.extract(ref).translate().seq) if ref else None

aa_result = run_ancestral(T, fname, root_sequence=root_seq, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='aa')
if aa_result['tt'].data.full_length*3 != len(feat):
print(f"ERROR: length of translated alignment for {gene} does not match length of reference feature."
" Please make sure that the annotation matches the translations.")
return 1

for key, node in anc_seqs['nodes'].items():
if 'aa_muts' not in node: node['aa_muts'] = {}
node['aa_muts'][gene] = aa_result['mutations']['nodes'][key]['muts']
anc_seqs['reference'][gene] = aa_result['root_seq']
# FIXME: Note that this is calculating the end of the CDS as 3*length of translation
# this is equivalent to the annotation for single segment CDS, but not for cds
# with splicing and slippage. But auspice can't handle the latter at the moment.
anc_seqs['annotations'][gene] = {'seqid':args.annotation,
'type':feat.type,
'start': int(feat.location.start)+1,
'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]),
'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}

out_name = get_json_name(args, '.'.join(args.alignment.split('.')[:-1]) + '_mutations.json')
write_json(anc_seqs, out_name)
Expand All @@ -243,7 +315,7 @@ def run(args):
vcf_fname = args.output_vcf
else:
vcf_fname = '.'.join(args.alignment.split('.')[:-1]) + '.vcf'
write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
write_vcf(nuc_result['tt'].get_tree_dict(keep_var_ambigs=True), vcf_fname)
print("ancestral sequences as vcf-file written to",vcf_fname, file=sys.stdout)

return 0
46 changes: 43 additions & 3 deletions augur/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,8 +148,8 @@ def default(self, obj):


def load_features(reference, feature_names=None):
#read in appropriately whether GFF or Genbank
#checks explicitly for GFF otherwise assumes Genbank
#read in appropriately whether GFF, nextclade-annotation-json, or Genbank
#checks explicitly for GFF or json otherwise assumes Genbank
if not os.path.isfile(reference):
print("ERROR: reference sequence not found. looking for", reference)
return None
Expand Down Expand Up @@ -193,7 +193,8 @@ def load_features(reference, feature_names=None):
for fe in feature_names:
if fe not in features:
print("Couldn't find gene {} in GFF or GenBank file".format(fe))

elif 'json' in reference.lower():
features = parse_nextclade_annotation_json(reference)
else:
from Bio import SeqIO
for feat in SeqIO.read(reference, 'genbank').features:
Expand All @@ -211,6 +212,45 @@ def load_features(reference, feature_names=None):

return features

def parse_nextclade_annotation_json(fname):
from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
with open(fname) as fh:
nextclade = json.load(fh)

landmarks = {}
features = {}
for gene in nextclade['genes'].values():
for cds in gene['cdses']:
feat = SeqFeature(id=cds['id'], type='CDS')
strands = set()
location = []
for segment in cds['segments']:
strands.add(segment['strand'])
location.append(FeatureLocation(segment['range']['begin'], segment['range']['end']))
if segment['landmark'] and segment['landmark']['id'] not in landmarks:
landmarks[segment['landmark']['id']] = segment['landmark']
if len(location) > 1:
feat.location = CompoundLocation(location)
else:
feat.location = location[0]

if len(strands) > 1:
ValueError("CDS {} has multiple strands: {}".format(cds['id'], strands))
else:
feat.strand = {'+':1, '-':-1, '0':0}.get(strands.pop(), None)
feat.name = cds['name']
feat.qualifiers = cds['attributes']
features[cds['name']] = feat

if len(landmarks)>1:
ValueError("More than one landmark in nextclade annotation: {}".format(landmarks.keys()))
elif len(landmarks)==1:
lm_id, lm = landmarks.popitem()
features['nuc'] = SeqFeature(id=lm['id'], location=FeatureLocation(lm['range']['begin'], lm['range']['end']), type='source')

return features


def read_config(fname):
if not (fname and os.path.isfile(fname)):
print("ERROR: config file %s not found."%fname)
Expand Down
3 changes: 2 additions & 1 deletion tests/builds/zika/results/nt_muts.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@
"nuc": {
"end": 10769,
"start": 1,
"strand": "+"
"strand": "+",
"type": "source"
}
},
"generated_by": {
Expand Down
Loading