Merge pull request #1690 from nextstrain/fix/root-state-in-sequence-r…

…econstruction fix: explicitly specify how the root and ambiguous states are handled…
nextstrain · Dec 26, 2024 · a57d50c · a57d50c
2 parents 77ae31e + d618f53
commit a57d50c
Show file tree

Hide file tree

Showing 24 changed files with 177 additions and 55 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -2,11 +2,17 @@
 
 ## __NEXT__
 
+### Features
+
+* ancestral: Add `--seed` argument to enable deterministic inference of root states by TreeTime. [#1690][] (@huddlej)
+
 ### Bug Fixes
 
+* ancestral, refine: Explicitly specify how the root and ambiguous states are handled during sequence reconstruction and mutation counting. [#1690][] (@rneher)
 * titers: Fix type errors in code associated with cross-validation of models. [#1688][] (@huddlej)
 
 [#1688]: https://github.com/nextstrain/augur/pull/1688
+[#1690]: https://github.com/nextstrain/augur/pull/1690
 
 ## 27.0.0 (9 December 2024)
 
@@ -29,8 +35,8 @@
 
 ### Features
 
-- This is the first version to officially support Python 3.12 and Pandas v2. [#1671] [#1678] (@corneliusroemer, @victorlin)
-- curate: change output metadata to [RFC 4180 CSV-like TSVs][] to match the TSV format output by other Augur subcommands and the Nextstrain ecosystem as discussed in [#1566][]. [#1565][] (@joverlee521)
+* This is the first version to officially support Python 3.12 and Pandas v2. [#1671] [#1678] (@corneliusroemer, @victorlin)
+* curate: change output metadata to [RFC 4180 CSV-like TSVs][] to match the TSV format output by other Augur subcommands and the Nextstrain ecosystem as discussed in [#1566][]. [#1565][] (@joverlee521)
 
 [#1565]: https://github.com/nextstrain/augur/pull/1565
 [#1566]: https://github.com/nextstrain/augur/issues/1566

diff --git a/augur/ancestral.py b/augur/ancestral.py
@@ -39,7 +39,7 @@
 
 def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
                                  marginal=False, fill_overhangs=True, infer_tips=False,
-                                 alphabet='nuc'):
+                                 alphabet='nuc', rng_seed=None):
     """infer ancestral sequences using TreeTime
 
     Parameters
@@ -68,6 +68,8 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
         alphabet to use for ancestral sequence inference. Default is the nucleotide
         alphabet that included a gap character 'nuc'. Alternative is `aa` for amino
         acids.
+    rng_seed : int, optional
+        random seed value to use for inference
 
     Returns
     -------
@@ -78,14 +80,14 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
     from treetime import TreeAnc
 
     tt = TreeAnc(tree=tree, aln=aln, ref=ref, gtr='JC69', alphabet=alphabet,
-                 fill_overhangs=fill_overhangs, verbose=1)
+                 fill_overhangs=fill_overhangs, verbose=1, rng_seed=rng_seed)
 
     # convert marginal (from args.inference) from 'joint' or 'marginal' to True or False
     bool_marginal = (marginal == "marginal")
 
     # only infer ancestral sequences, leave branch length untouched
     tt.infer_ancestral_sequences(infer_gtr=infer_gtr, marginal=bool_marginal,
-                                 reconstruct_tip_states=infer_tips)
+                                 reconstruct_tip_states=infer_tips, sample_from_profile='root')
 
     return tt
 
@@ -104,7 +106,7 @@ def create_mask(is_vcf, tt, reference_sequence, aln):
         only used if is_vcf
     aln : dict
         describes variation (relative to reference) per sample. Only used if is_vcf.
-        
+
     Returns
     -------
     numpy.ndarray(bool)
@@ -122,7 +124,7 @@ def create_mask(is_vcf, tt, reference_sequence, aln):
                 if alt=='N':
                     ambig_positions.append(pos)
             # ambig_positions = [pos for pos, alt in sample_data.items() if alt=='N']
-            np.add.at(ambiguous_count, ambig_positions, 1)       
+            np.add.at(ambiguous_count, ambig_positions, 1)
         # Secondly, if the VCF defines no mutations but the ref is "N":
         no_info_sites = np.array(list(reference_sequence)) == 'N'
         no_info_sites[list(variable_sites)] = False
@@ -171,7 +173,7 @@ def collect_mutations(tt, mask, character_map=None, reference_sequence=None, inf
 
     # Note that for mutations reported across the tree we don't have to consider
     # the mask, because while sites which are all Ns may have an inferred base,
-    # there will be no variablity and thus no mutations.  
+    # there will be no variablity and thus no mutations.
     for n in tt.tree.find_clades():
         data[n.name] = [a+str(int(pos)+inc)+cm(d)
                         for a,pos,d in n.mutations]
@@ -185,7 +187,7 @@ def collect_mutations(tt, mask, character_map=None, reference_sequence=None, inf
                 data[tt.tree.root.name].append(f"{root_state}{pos+1}{tree_state}")
 
     return data
-        
+
 def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False):
     """
     Create a full sequence for every node on the tree. Masked positions will
@@ -198,7 +200,7 @@ def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False):
         instance of treetime with valid ancestral reconstruction
     mask : numpy.ndarray(bool)
         Mask these positions by changing them to the ambiguous nucleotide
-    reference_sequence : str or None 
+    reference_sequence : str or None
     infer_ambiguous : bool, optional
         if true, request the reconstructed sequences from treetime, otherwise retain input ambiguities
 
@@ -227,14 +229,14 @@ def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False):
     return sequences
 
 def run_ancestral(T, aln, reference_sequence=None, is_vcf=False, full_sequences=False, fill_overhangs=False,
-                  infer_ambiguous=False, marginal=False, alphabet='nuc'):
+                  infer_ambiguous=False, marginal=False, alphabet='nuc', rng_seed=None):
     """
     ancestral nucleotide reconstruction using TreeTime
     """
 
     tt = ancestral_sequence_inference(tree=T, aln=aln, ref=reference_sequence if is_vcf else None, marginal=marginal,
                                       fill_overhangs = fill_overhangs, alphabet=alphabet,
-                                      infer_tips = infer_ambiguous)
+                                      infer_tips = infer_ambiguous, rng_seed=rng_seed)
 
     character_map = {}
     for x in tt.gtr.profile_map:
@@ -295,6 +297,7 @@ def register_parser(parent_subparsers):
     )
     global_options_group.add_argument('--inference', default='joint', choices=["joint", "marginal"],
                                     help="calculate joint or marginal maximum likelihood ancestral sequence states")
+    global_options_group.add_argument('--seed', type=int, help="seed for random number generation")
 
     nucleotide_options_group = parser.add_argument_group(
         "nucleotide options",
@@ -416,7 +419,8 @@ def run(args):
     infer_ambiguous = args.infer_ambiguous and not args.keep_ambiguous
     full_sequences = not is_vcf
     nuc_result = run_ancestral(T, aln, reference_sequence=ref if ref else None, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
-                               full_sequences=full_sequences, marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='nuc')
+                               full_sequences=full_sequences, marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='nuc',
+                               rng_seed=args.seed)
     anc_seqs = nuc_result['mutations']
     anc_seqs['reference'] = {'nuc': nuc_result['root_seq']}
 
@@ -437,7 +441,7 @@ def run(args):
             features['nuc'].location.end != anc_seqs['annotations']['nuc']['end']):
             raise AugurError(f"The 'nuc' annotation coordinates parsed from {args.annotation!r} ({features['nuc'].location.start+1}..{features['nuc'].location.end})"
                 f" don't match the provided sequence data coordinates ({anc_seqs['annotations']['nuc']['start']}..{anc_seqs['annotations']['nuc']['end']}).")
-        
+
         print("Read in {} features from reference sequence file".format(len(features)))
         for gene in genes:
             print(f"Processing gene: {gene}")
@@ -446,7 +450,7 @@ def run(args):
             reference_sequence = str(feat.extract(Seq(ref)).translate()) if ref else None
 
             aa_result = run_ancestral(T, fname, reference_sequence=reference_sequence, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
-                                        marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='aa')
+                                        marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='aa', rng_seed=args.seed)
             len_translated_alignment = aa_result['tt'].data.full_length*3
             if len_translated_alignment != len(feat):
                 raise AugurError(f"length of translated alignment for {gene} ({len_translated_alignment})"

diff --git a/augur/refine.py b/augur/refine.py
@@ -313,7 +313,10 @@ def run(args):
         pass
     elif args.divergence_units=='mutations':
         if not args.timetree:
-            tt.infer_ancestral_sequences()
+            # infer ancestral sequence for the purpose of counting mutations
+            # sample mutations from the root profile, otherwise use most likely state.
+            # Reconstruct tip states to avoid mutations to N or W etc be counted
+            tt.infer_ancestral_sequences(marginal=True, reconstruct_tip_states=True, sample_from_profile='root')
         nuc_map = profile_maps['nuc']
 
         def are_sequence_states_different(nuc1, nuc2):

diff --git a/tests/functional/ancestral/cram/ambiguous-positions.t b/tests/functional/ancestral/cram/ambiguous-positions.t
@@ -19,6 +19,7 @@ Setup
   >  --tree "$DATA/tree.nwk" \
   >  --alignment snps.vcf \
   >  --vcf-reference reference.fasta \
+  >  --seed 314159 \
   >  --output-node-data nt_muts.json \
   >  --output-vcf nt_muts.vcf > /dev/null
 
@@ -44,4 +45,3 @@ Setup
   >   --vcf nt_muts.vcf \
   >   --json nt_muts.json \
   >   --ref reference.fasta > /dev/null
-
diff --git a/tests/functional/ancestral/cram/case-sensitive.t b/tests/functional/ancestral/cram/case-sensitive.t
@@ -11,6 +11,7 @@ Change the _reference_ to lowercase
   >  --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \
   >  --root-sequence ref.lower.fasta \
   >  --output-node-data "nt_muts.ref-seq.json" \
+  >  --seed 314159 \
   >  --inference marginal > /dev/null
 
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
@@ -33,6 +34,7 @@ be lowecase which will be compared against the uppercase reference
   >  --alignment sequences.lower.fasta \
   >  --root-sequence "$TESTDIR/../data/simple-genome/reference.fasta" \
   >  --output-node-data "nt_muts.ref-seq.json" \
+  >  --seed 314159 \
   >  --inference marginal > /dev/null
 
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \

diff --git a/tests/functional/ancestral/cram/default-nucleotide-reconstruction.t b/tests/functional/ancestral/cram/default-nucleotide-reconstruction.t
@@ -8,6 +8,7 @@ The default is to infer ambiguous bases, so there should not be N bases in the i
   $ ${AUGUR} ancestral \
   >  --tree $TESTDIR/../data/tree.nwk \
   >  --alignment $TESTDIR/../data/aligned.fasta \
+  >  --seed 314159 \
   >  --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
   >  --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" > /dev/null
 

diff --git a/tests/functional/ancestral/cram/general.t b/tests/functional/ancestral/cram/general.t
@@ -14,6 +14,7 @@ node-data JSON we diff against.
   >  --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \
   >  --root-sequence "$TESTDIR/../data/simple-genome/reference.fasta" \
   >  --output-node-data "nt_muts.ref-seq.json" \
+  >  --seed 314159 \
   >  --inference marginal > /dev/null
 
 
@@ -34,6 +35,7 @@ mutations (as there's nothing to compare the root node to)
   >  --tree "$TESTDIR/../data/simple-genome/tree.nwk" \
   >  --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \
   >  --output-node-data "nt_muts.no-ref-seq.json" \
+  >  --seed 314159 \
   >  --inference marginal > /dev/null
 
 

diff --git a/tests/functional/ancestral/cram/infer-ambiguous-nucleotides.t b/tests/functional/ancestral/cram/infer-ambiguous-nucleotides.t
@@ -9,6 +9,7 @@ There should not be N bases in the inferred output sequences.
   >  --tree $TESTDIR/../data/tree.nwk \
   >  --alignment $TESTDIR/../data/aligned.fasta \
   >  --infer-ambiguous \
+  >  --seed 314159 \
   >  --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
   >  --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" > /dev/null
 

diff --git a/tests/functional/ancestral/cram/infer-amino-acid-sequences-genes-file.t b/tests/functional/ancestral/cram/infer-amino-acid-sequences-genes-file.t
@@ -10,6 +10,7 @@ Infer ancestral nucleotide and amino acid sequences, using a genes file.
   >  --annotation $TESTDIR/../data/zika_outgroup.gb \
   >  --genes $TESTDIR/../data/genes.txt \
   >  --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \
+  >  --seed 314159 \
   >  --output-node-data ancestral_mutations.json \
   >  --output-sequences ancestral_sequences.fasta \
   >  --output-translations ancestral_aa_sequences_%GENE.fasta > /dev/null

diff --git a/tests/functional/ancestral/cram/infer-amino-acid-sequences-with-root-sequence.t b/tests/functional/ancestral/cram/infer-amino-acid-sequences-with-root-sequence.t
@@ -15,6 +15,7 @@ ancestor).
   >  --root-sequence $TESTDIR/../data/zika_outgroup.gb \
   >  --genes ENV PRO \
   >  --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \
+  >  --seed 314159 \
   >  --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" > /dev/null
 
 Check that the reference length was correctly exported as the nuc annotation

diff --git a/tests/functional/ancestral/cram/infer-amino-acid-sequences.t b/tests/functional/ancestral/cram/infer-amino-acid-sequences.t
@@ -10,6 +10,7 @@ Infer ancestral nucleotide and amino acid sequences.
   >  --annotation $TESTDIR/../data/zika_outgroup.gb \
   >  --genes ENV PRO \
   >  --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \
+  >  --seed 314159 \
   >  --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
   >  --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" \
   >  --output-translations "$CRAMTMP/$TESTFILE/ancestral_aa_sequences_%GENE.fasta" > /dev/null

diff --git a/tests/functional/ancestral/cram/invalid-args.t b/tests/functional/ancestral/cram/invalid-args.t
@@ -7,6 +7,7 @@ Input FASTA + VCF output is not possible
   $ ${AUGUR} ancestral \
   >  --tree $TESTDIR/../data/tree.nwk \
   >  --alignment $TESTDIR/../data/aligned.fasta \
+  >  --seed 314159 \
   >  --output-vcf "output.vcf" > /dev/null
   ERROR: VCF output has been requested but the input alignment is not VCF.
   [2]
@@ -16,6 +17,7 @@ Input VCF + FASTA output is not possible (Note that the input file doesn't exist
   $ ${AUGUR} ancestral \
   >  --tree $TESTDIR/../data/tree.nwk \
   >  --alignment $TESTDIR/../data/snps.vcf \
+  >  --seed 314159 \
   >  --output-sequences "output.fasta" > /dev/null
   ERROR: Sequence (fasta) output has been requested but the input alignment is VCF.
   [2]
@@ -25,6 +27,7 @@ Output FASTA _and_ VCF is not possible
   $ ${AUGUR} ancestral \
   >  --tree $TESTDIR/../data/tree.nwk \
   >  --alignment $TESTDIR/../data/aligned.fasta \
+  >  --seed 314159 \
   >  --output-vcf "output.vcf" \
   >  --output-sequences "output.fasta" > /dev/null
   ERROR: Both sequence (fasta) and VCF output have been requested, but these are incompatible.
@@ -39,28 +42,31 @@ This should fail.
   >  --alignment $TESTDIR/../data/aligned.fasta \
   >  --annotation $TESTDIR/../data/zika_outgroup.gb \
   >  --genes ENV PRO \
+  >  --seed 314159 \
   >  --output-node-data "ancestral_mutations.json" > /dev/null
   ERROR: For amino acid sequence reconstruction, you must provide an annotation file, a list of genes, and a template path to amino acid sequences.
   [2]
 
-Missing tree file 
+Missing tree file
 
   $ ${AUGUR} ancestral \
   >  --tree $TESTDIR/../data/tree-doesnt-exist.nwk \
   >  --alignment $TESTDIR/../data/aligned.fasta \
+  >  --seed 314159 \
   >  --output-sequences "output.fasta" > /dev/null
   ERROR: The provided tree file .* doesn't exist (re)
   [2]
 
 
 Attempting to use FASTA-input reference and VCF-input reference args
-(The files here don't exist, but we exit before they're checked) 
+(The files here don't exist, but we exit before they're checked)
 
   $ ${AUGUR} ancestral \
   >  --tree $TESTDIR/../data/tree-doesnt-exist.nwk \
   >  --alignment $TESTDIR/../data/aligned.fasta \
   >  --root-sequence $TESTDIR/../data/reference.fasta \
   >  --vcf-reference $TESTDIR/../data/reference.fasta \
+  >  --seed 314159 \
   >  --output-sequences "output.fasta" > /dev/null 2>"err-args.txt"
   [2]
 

diff --git a/tests/functional/ancestral/cram/keep-ambiguous-nucleotides.t b/tests/functional/ancestral/cram/keep-ambiguous-nucleotides.t
@@ -9,6 +9,7 @@ There should be N bases in the inferred output sequences.
   >  --tree $TESTDIR/../data/tree.nwk \
   >  --alignment $TESTDIR/../data/aligned.fasta \
   >  --keep-ambiguous \
+  >  --seed 314159 \
   >  --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
   >  --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" > /dev/null
 

diff --git a/tests/functional/ancestral/cram/vcf-multi-allele.t b/tests/functional/ancestral/cram/vcf-multi-allele.t
@@ -4,36 +4,34 @@ Setup
 
   $ export DATA="$TESTDIR/../data/simple-genome"
 
-We take the same `snps.vcf` file used in `general.t` but add another
+We used a modified `snps.vcf` file used in `vcf.t` but with an additional
 allele at site 30 - sample_B has a "G". Since the root is A and this
 is the only sample with G it's 'A30G'.
 See <https://github.com/nextstrain/augur/issues/1380> for the bug this is testing.
 
-  $ sed '11s/^/1\t30\t.\tA\tG,N\t.\t.\t.\tGT\t0\t1\t2\n/' \
-  > "$DATA/snps.vcf" > snps2.vcf
-
   $ ${AUGUR} ancestral \
   >  --tree $DATA/tree.nwk \
-  >  --alignment snps2.vcf \
+  >  --alignment $DATA/snps_with_multiple_alleles.vcf \
   >  --vcf-reference $DATA/reference.fasta \
   >  --output-node-data nt_muts.json \
   >  --output-vcf nt_muts.vcf \
+  >  --seed 314159 \
   >  --inference marginal > /dev/null
 
 
   $ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
   >   "$DATA/nt_muts.ref-seq.json" \
   >   nt_muts.json \
   >   --exclude-regex-paths "root\['nodes'\]\['.+'\]\['sequence'\]" "root\['generated_by'\]"
-  {'iterable_item_added': {"root['nodes']['sample_B']['muts'][0]": 'A30G'}}
+  {'iterable_item_added': {"root['nodes']['node_root']['muts'][3]": 'A30G'}}
 
   $ cat > expected.vcf <<EOF
   > #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	node_root	sample_C	node_AB	sample_B	sample_A
   > 1	5	.	A	C	.	PASS	.	GT	1	1	1	1	1
   > 1	7	.	A	G	.	PASS	.	GT	0	0	1	1	1
-  > 1	14	.	C	T	.	PASS	.	GT	0	0	1	1	1
+  > 1	14	.	C	T	.	PASS	.	GT	1	0	1	1	1
   > 1	18	.	C	T	.	PASS	.	GT	1	1	0	0	0
-  > 1	30	.	A	G	.	PASS	.	GT	0	0	0	1	0
+  > 1	30	.	A	G	.	PASS	.	GT	1	1	1	1	1
   > 1	33	.	A	C,G	.	PASS	.	GT	0	0	2	2	1
   > 1	39	.	C	T	.	PASS	.	GT	0	0	0	0	1
   > 1	42	.	G	A	.	PASS	.	GT	0	0	0	1	0

diff --git a/tests/functional/ancestral/cram/vcf.t b/tests/functional/ancestral/cram/vcf.t
@@ -15,6 +15,7 @@ but it will have the reference sequence attached.
   >  --vcf-reference $DATA/reference.fasta \
   >  --output-node-data "nt_muts.vcf-input.ref-seq.json" \
   >  --output-vcf nt_muts.vcf \
+  >  --seed 314159 \
   >  --inference marginal > /dev/null
 
 
@@ -33,7 +34,7 @@ here as it's not relevant to what I'm trying to test.
   > #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	node_root	sample_C	node_AB	sample_B	sample_A
   > 1	5	.	A	C	.	PASS	.	GT	1	1	1	1	1
   > 1	7	.	A	G	.	PASS	.	GT	0	0	1	1	1
-  > 1	14	.	C	T	.	PASS	.	GT	0	0	1	1	1
+  > 1	14	.	C	T	.	PASS	.	GT	1	0	1	1	1
   > 1	18	.	C	T	.	PASS	.	GT	1	1	0	0	0
   > 1	33	.	A	C,G	.	PASS	.	GT	0	0	2	2	1
   > 1	39	.	C	T	.	PASS	.	GT	0	0	0	0	1