nextstrain · rneher · Jul 22, 2023 · Jul 22, 2023 · Jul 22, 2023 · Jul 22, 2023
diff --git a/CHANGES.md b/CHANGES.md
@@ -2,6 +2,8 @@
 
 ## __NEXT__
 
+* ancestral: add functionality to reconstruct ancestral amino acid sequences and add inferred mutations to the `node_data_json` with output equivalent to `augur translate`. `ancestral` now takes an annotation (`--annotation`), a list of genes (`--genes`), and a file name pattern for amino acid alignments (`--translations`). Mutations for each of these genes will be inferred and added to the output json to each node as a list at `['aa_muts'][gene]`. The annotations will be added to the `annotation` field in the output json.
+* ancestral: add the ability to report mutations relative to a sequence other than the inferred root of the tree. This sequence can be specified via `--root-sequence` and difference between this sequence and the inferred root of the tree will be added as mutations to the root node for nucleotides and amino acids. All differences between the  This was previously already possible for `vcf` input via `--vcf-reference`.
 
 ## 22.1.0 (10 July 2023)
 

diff --git a/augur/ancestral.py b/augur/ancestral.py
@@ -23,7 +23,8 @@
 from collections import defaultdict
 
 def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
-                                 marginal=False, fill_overhangs=True, infer_tips=False):
+                                 marginal=False, fill_overhangs=True, infer_tips=False,
+                                 alphabet='nuc'):
     """infer ancestral sequences using TreeTime
 
     Parameters
@@ -46,17 +47,20 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
         This is only relevant when tip-state are not exactly specified, e.g. via
         characters that signify ambiguous states. To replace those with the
         most-likely state, set infer_tips=True
+    alphabet : str
+        alphabet to use for ancestral sequence inference. Default is the nucleotide
+        alphabet that included a gap character 'nuc'. Alternative is `aa` for amino
+        acids.
 
     Returns
     -------
     treetime.TreeAnc
         treetime.TreeAnc instance
     """
 
-    from treetime import TreeAnc, version as treetime_version
-    print(f"augur ancestral is using TreeTime version {treetime_version}")
+    from treetime import TreeAnc
 
-    tt = TreeAnc(tree=tree, aln=aln, ref=ref, gtr='JC69',
+    tt = TreeAnc(tree=tree, aln=aln, ref=ref, gtr='JC69', alphabet=alphabet,
                  fill_overhangs=fill_overhangs, verbose=1)
 
     # convert marginal (from args.inference) from 'joint' or 'marginal' to True or False
@@ -66,13 +70,9 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
     tt.infer_ancestral_sequences(infer_gtr=infer_gtr, marginal=bool_marginal,
                                  reconstruct_tip_states=infer_tips)
 
-    print("\nInferred ancestral sequence states using TreeTime:"
-          "\n\tSagulenko et al. TreeTime: Maximum-likelihood phylodynamic analysis"
-          "\n\tVirus Evolution, vol 4, https://academic.oup.com/ve/article/4/1/vex042/4794731\n")
-
     return tt
 
-def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False, character_map=None, mask_ambiguous=True):
+def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False, character_map=None, is_vcf=False):
     """iterates of the tree and produces dictionaries with
     mutations and sequences for each node.
 
@@ -103,20 +103,19 @@ def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False,
         data[n.name]['muts'] = [a+str(int(pos)+inc)+cm(d)
                                 for a,pos,d in n.mutations]
 
-    mask=None
-    if full_sequences:
-        if mask_ambiguous:
-            # Identify sites for which every terminal sequence is ambiguous.
-            # These sites will be masked to prevent rounding errors in the
-            # maximum likelihood inference from assigning an arbitrary
-            # nucleotide to sites at internal nodes.
-            ambiguous_count = np.zeros(tt.sequence_length, dtype=int)
-            for n in tt.tree.get_terminals():
-                ambiguous_count += np.array(tt.sequence(n,reconstructed=False, as_string=False)==tt.gtr.ambiguous, dtype=int)
-            mask = ambiguous_count==len(tt.tree.get_terminals())
-        else:
-            mask = np.zeros(tt.sequence_length, dtype=bool)
+    if is_vcf:
+        mask = np.zeros(tt.sequence_length, dtype=bool)
+    else:
+        # Identify sites for which every terminal sequence is ambiguous.
+        # These sites will be masked to prevent rounding errors in the
+        # maximum likelihood inference from assigning an arbitrary
+        # nucleotide to sites at internal nodes.
+        ambiguous_count = np.zeros(tt.sequence_length, dtype=int)
+        for n in tt.tree.get_terminals():
+            ambiguous_count += np.array(tt.sequence(n,reconstructed=False, as_string=False)==tt.gtr.ambiguous, dtype=int)
+        mask = ambiguous_count==len(tt.tree.get_terminals())
 
+    if full_sequences:
         for n in tt.tree.find_clades():
             try:
                 tmp = tt.sequence(n,reconstructed=infer_tips, as_string=False)
@@ -127,16 +126,59 @@ def collect_mutations_and_sequences(tt, infer_tips=False, full_sequences=False,
 
     return {"nodes": data, "mask": mask}
 
+def run_ancestral(T, aln, root_sequence=None, is_vcf=False, full_sequences=False, fill_overhangs=False,
+                  infer_ambiguous=False, marginal=False, alphabet='nuc'):
+    tt = ancestral_sequence_inference(tree=T, aln=aln, ref=root_sequence if is_vcf else None, marginal=marginal,
+                                      fill_overhangs = fill_overhangs, alphabet=alphabet,
+                                      infer_tips = infer_ambiguous)
+
+    character_map = {}
+    for x in tt.gtr.profile_map:
+        if tt.gtr.profile_map[x].sum()==tt.gtr.n_states:
+            # TreeTime treats all characters that are not valid IUPAC nucleotide chars as fully ambiguous
+            # To clean up auspice output, we map all those to 'N'
+            character_map[x] = 'N'
+        else:
+            character_map[x] = x
+    # add reference sequence to json structure. This is the sequence with
+    # respect to which mutations on the tree are defined.
+    if root_sequence:
+        root_seq = root_sequence
+    else:
+        root_seq = tt.sequence(T.root, as_string=True)
+
+    mutations = collect_mutations_and_sequences(tt, full_sequences=full_sequences,
+                          infer_tips=infer_ambiguous, character_map=character_map, is_vcf=is_vcf)
+    if root_sequence:
+        for pos, (root_state, tree_state) in enumerate(zip(root_sequence, tt.sequence(tt.tree.root, reconstructed=infer_ambiguous, as_string=True))):
+            if root_state != tree_state:
+                mutations['nodes'][tt.tree.root.name]['muts'].append(f"{root_state}{pos+1}{tree_state}")
+
+    return {'tt': tt,
+            'root_seq': root_seq,
+            'mutations': mutations}
+
 
 def register_parser(parent_subparsers):
     parser = parent_subparsers.add_parser("ancestral", help=__doc__)
     parser.add_argument('--tree', '-t', required=True, help="prebuilt Newick")
     parser.add_argument('--alignment', '-a', help="alignment in fasta or VCF format")
+    # FIXME: these three arguments should either be all there or none
+    parser.add_argument('--annotation',
+                        help='GenBank, GFF, nextclade annotation json file containing the annotation')
+    parser.add_argument('--genes', nargs='+', help="genes to translate (list or file containing list)")
+    parser.add_argument('--translations', type=str, help="translated alignments for each CDS/Gene. "
+                           "Currently only supported for fasta-input. Specify the file name via a "
+                           "template like 'my_alignment_%%GENE.fasta' where %%GENE will be replaced "
+                           "by the gene name.")
+    ###
     parser.add_argument('--output-node-data', type=str, help='name of JSON file to save mutations and ancestral sequences to')
     parser.add_argument('--output-sequences', type=str, help='name of FASTA file to save ancestral sequences to (FASTA alignments only)')
     parser.add_argument('--inference', default='joint', choices=["joint", "marginal"],
                                     help="calculate joint or marginal maximum likelihood ancestral sequence states")
     parser.add_argument('--vcf-reference', type=str, help='fasta file of the sequence the VCF was mapped to (only used if a VCF is provided as the alignment)')
+    parser.add_argument('--root-sequence', type=str, help='fasta/genbank file of the sequence that is used as root for mutation calling.'
+                        ' Differences between this sequence and the inferred root will be reported as mutations on the root branch.')
     parser.add_argument('--output-vcf', type=str, help='name of output VCF file which will include ancestral seqs')
     ambiguous = parser.add_mutually_exclusive_group()
     ambiguous.add_argument('--keep-ambiguous', action="store_true",
@@ -151,7 +193,6 @@ def run(args):
     # check alignment type, set flags, read in if VCF
     is_vcf = any([args.alignment.lower().endswith(x) for x in ['.vcf', '.vcf.gz']])
     ref = None
-    anc_seqs = {}
 
     try:
         T = read_tree(args.tree)
@@ -179,10 +220,26 @@ def run(args):
         ref = compress_seq['reference']
     else:
         aln = args.alignment
+        ref = None
+        if args.root_sequence:
+            for fmt in ['fasta', 'genbank']:
+                try:
+                    ref = SeqIO.read(args.root_sequence, fmt)
+                    break
+                except:
+                    pass
+            if ref is None:
+                print(f"ERROR: could not read root sequence from {args.root_sequence}", file=sys.stderr)
+                return 1
 
     # Enforce treetime 0.7 or later
     from distutils.version import StrictVersion
     import treetime
+    print("\nInferred ancestral sequence states using TreeTime:"
+          "\n\tSagulenko et al. TreeTime: Maximum-likelihood phylodynamic analysis"
+          "\n\tVirus Evolution, vol 4, https://academic.oup.com/ve/article/4/1/vex042/4794731\n")
+
+    print(f"augur ancestral is using TreeTime version {treetime.version}")
     if StrictVersion(treetime.version) < StrictVersion('0.7.0'):
         print("ERROR: this version of augur requires TreeTime 0.7 or later.", file=sys.stderr)
         return 1
@@ -191,36 +248,51 @@ def run(args):
     # explicitly or by default) and the user has not explicitly requested that
     # we keep them.
     infer_ambiguous = args.infer_ambiguous and not args.keep_ambiguous
-
-    tt = ancestral_sequence_inference(tree=T, aln=aln, ref=ref, marginal=args.inference,
-                                      fill_overhangs = not(args.keep_overhangs),
-                                      infer_tips = infer_ambiguous)
-
-    character_map = {}
-    for x in tt.gtr.profile_map:
-        if tt.gtr.profile_map[x].sum()==tt.gtr.n_states:
-            # TreeTime treats all characters that are not valid IUPAC nucleotide chars as fully ambiguous
-            # To clean up auspice output, we map all those to 'N'
-            character_map[x] = 'N'
-        else:
-            character_map[x] = x
-
-    anc_seqs.update(collect_mutations_and_sequences(tt, full_sequences=not is_vcf,
-                          infer_tips=infer_ambiguous, character_map=character_map))
-    # add reference sequence to json structure. This is the sequence with
-    # respect to which mutations on the tree are defined.
-    if is_vcf:
-        anc_seqs['reference'] = {"nuc":compress_seq['reference']}
-    else:
-        root_seq = tt.sequence(T.root, as_string=False)
-        if anc_seqs.get("mask") is not None:
-            root_seq[anc_seqs['mask']] = tt.gtr.ambiguous
-        anc_seqs['reference'] = {"nuc": ''.join(root_seq)}
+    full_sequences = not is_vcf
+    nuc_result = run_ancestral(T, aln, root_sequence=str(ref.seq) if ref else None, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
+                               full_sequences=full_sequences, marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='nuc')
+    anc_seqs = nuc_result['mutations']
+    anc_seqs['reference'] = {'nuc': nuc_result['root_seq']}
 
     if anc_seqs.get("mask") is not None:
         anc_seqs["mask"] = "".join(['1' if x else '0' for x in anc_seqs["mask"]])
 
-    anc_seqs['annotations'] = {'nuc': {'start': 1, 'end': len(anc_seqs['reference']['nuc']), 'strand': '+'}}
+    anc_seqs['annotations'] = {'nuc': {'start': 1, 'end': len(anc_seqs['reference']['nuc']),
+                                       'strand': '+', 'type': 'source'}}
+
+    if not is_vcf and args.genes:
+        from .utils import load_features
+        ## load features; only requested features if genes given
+        features = load_features(args.annotation, args.genes)
+        if features is None:
+            print("ERROR: could not read features of reference sequence file")
+            return 1
+        print("Read in {} features from reference sequence file".format(len(features)))
+        for gene in args.genes:
+            print(f"Processing gene: {gene}")
+            fname = args.translations.replace("%GENE", gene)
+            feat = features[gene]
+            root_seq = str(feat.extract(ref).translate().seq) if ref else None
+
+            aa_result = run_ancestral(T, fname, root_sequence=root_seq, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
+                                        marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='aa')
+            if aa_result['tt'].data.full_length*3 != len(feat):
+                print(f"ERROR: length of translated alignment for {gene} does not match length of reference feature."
+                       " Please make sure that the annotation matches the translations.")
+                return 1
+
+            for key, node in anc_seqs['nodes'].items():
+                if 'aa_muts' not in node: node['aa_muts'] = {}
+                node['aa_muts'][gene] = aa_result['mutations']['nodes'][key]['muts']
+            anc_seqs['reference'][gene] = aa_result['root_seq']
+            # FIXME: Note that this is calculating the end of the CDS as 3*length of translation
+            # this is equivalent to the annotation for single segment CDS, but not for cds
+            # with splicing and slippage. But auspice can't handle the latter at the moment.
+            anc_seqs['annotations'][gene] = {'seqid':args.annotation,
+                                             'type':feat.type,
+                                             'start': int(feat.location.start)+1,
+                                             'end': int(feat.location.start) + 3*len(anc_seqs['reference'][gene]),
+                                             'strand': {+1:'+', -1:'-', 0:'?', None:None}[feat.location.strand]}
 
     out_name = get_json_name(args, '.'.join(args.alignment.split('.')[:-1]) + '_mutations.json')
     write_json(anc_seqs, out_name)
@@ -243,7 +315,7 @@ def run(args):
             vcf_fname = args.output_vcf
         else:
             vcf_fname = '.'.join(args.alignment.split('.')[:-1]) + '.vcf'
-        write_vcf(tt.get_tree_dict(keep_var_ambigs=True), vcf_fname)
+        write_vcf(nuc_result['tt'].get_tree_dict(keep_var_ambigs=True), vcf_fname)
         print("ancestral sequences as vcf-file written to",vcf_fname, file=sys.stdout)
 
     return 0
diff --git a/augur/utils.py b/augur/utils.py
@@ -148,8 +148,8 @@ def default(self, obj):
 
 
 def load_features(reference, feature_names=None):
-    #read in appropriately whether GFF or Genbank
-    #checks explicitly for GFF otherwise assumes Genbank
+    #read in appropriately whether GFF, nextclade-annotation-json, or Genbank
+    #checks explicitly for GFF or json otherwise assumes Genbank
     if not os.path.isfile(reference):
         print("ERROR: reference sequence not found. looking for", reference)
         return None
@@ -193,7 +193,8 @@ def load_features(reference, feature_names=None):
                 for fe in feature_names:
                     if fe not in features:
                         print("Couldn't find gene {} in GFF or GenBank file".format(fe))
-
+    elif 'json' in reference.lower():
+        features = parse_nextclade_annotation_json(reference)
     else:
         from Bio import SeqIO
         for feat in SeqIO.read(reference, 'genbank').features:
@@ -211,6 +212,45 @@ def load_features(reference, feature_names=None):
 
     return features
 
+def parse_nextclade_annotation_json(fname):
+    from Bio.SeqFeature import SeqFeature, FeatureLocation, CompoundLocation
+    with open(fname) as fh:
+        nextclade = json.load(fh)
+
+    landmarks = {}
+    features = {}
+    for gene in nextclade['genes'].values():
+        for cds in gene['cdses']:
+            feat = SeqFeature(id=cds['id'], type='CDS')
+            strands = set()
+            location = []
+            for segment in cds['segments']:
+                strands.add(segment['strand'])
+                location.append(FeatureLocation(segment['range']['begin'], segment['range']['end']))
+                if segment['landmark'] and segment['landmark']['id'] not in landmarks:
+                    landmarks[segment['landmark']['id']] = segment['landmark']
+            if len(location) > 1:
+                feat.location = CompoundLocation(location)
+            else:
+                feat.location = location[0]
+
+            if len(strands) > 1:
+                ValueError("CDS {} has multiple strands: {}".format(cds['id'], strands))
+            else:
+                feat.strand = {'+':1, '-':-1, '0':0}.get(strands.pop(), None)
+            feat.name = cds['name']
+            feat.qualifiers = cds['attributes']
+            features[cds['name']] = feat
+
+    if len(landmarks)>1:
+        ValueError("More than one landmark in nextclade annotation: {}".format(landmarks.keys()))
+    elif len(landmarks)==1:
+        lm_id, lm = landmarks.popitem()
+        features['nuc'] = SeqFeature(id=lm['id'], location=FeatureLocation(lm['range']['begin'], lm['range']['end']), type='source')
+
+    return features
+
+
 def read_config(fname):
     if not (fname and os.path.isfile(fname)):
         print("ERROR: config file %s not found."%fname)

diff --git a/tests/builds/zika/results/nt_muts.json b/tests/builds/zika/results/nt_muts.json
@@ -3,7 +3,8 @@
     "nuc": {
       "end": 10769,
       "start": 1,
-      "strand": "+"
+      "strand": "+",
+      "type": "source"
     }
   },
   "generated_by": {