Skip to content

Commit

Permalink
Merge pull request #1690 from nextstrain/fix/root-state-in-sequence-r…
Browse files Browse the repository at this point in the history
…econstruction

fix: explicitly specify how the root and ambiguous states are handled…
  • Loading branch information
rneher authored Dec 26, 2024
2 parents 77ae31e + d618f53 commit a57d50c
Show file tree
Hide file tree
Showing 24 changed files with 177 additions and 55 deletions.
10 changes: 8 additions & 2 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,17 @@

## __NEXT__

### Features

* ancestral: Add `--seed` argument to enable deterministic inference of root states by TreeTime. [#1690][] (@huddlej)

### Bug Fixes

* ancestral, refine: Explicitly specify how the root and ambiguous states are handled during sequence reconstruction and mutation counting. [#1690][] (@rneher)
* titers: Fix type errors in code associated with cross-validation of models. [#1688][] (@huddlej)

[#1688]: https://github.com/nextstrain/augur/pull/1688
[#1690]: https://github.com/nextstrain/augur/pull/1690

## 27.0.0 (9 December 2024)

Expand All @@ -29,8 +35,8 @@

### Features

- This is the first version to officially support Python 3.12 and Pandas v2. [#1671] [#1678] (@corneliusroemer, @victorlin)
- curate: change output metadata to [RFC 4180 CSV-like TSVs][] to match the TSV format output by other Augur subcommands and the Nextstrain ecosystem as discussed in [#1566][]. [#1565][] (@joverlee521)
* This is the first version to officially support Python 3.12 and Pandas v2. [#1671] [#1678] (@corneliusroemer, @victorlin)
* curate: change output metadata to [RFC 4180 CSV-like TSVs][] to match the TSV format output by other Augur subcommands and the Nextstrain ecosystem as discussed in [#1566][]. [#1565][] (@joverlee521)

[#1565]: https://github.com/nextstrain/augur/pull/1565
[#1566]: https://github.com/nextstrain/augur/issues/1566
Expand Down
30 changes: 17 additions & 13 deletions augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@

def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
marginal=False, fill_overhangs=True, infer_tips=False,
alphabet='nuc'):
alphabet='nuc', rng_seed=None):
"""infer ancestral sequences using TreeTime
Parameters
Expand Down Expand Up @@ -68,6 +68,8 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
alphabet to use for ancestral sequence inference. Default is the nucleotide
alphabet that included a gap character 'nuc'. Alternative is `aa` for amino
acids.
rng_seed : int, optional
random seed value to use for inference
Returns
-------
Expand All @@ -78,14 +80,14 @@ def ancestral_sequence_inference(tree=None, aln=None, ref=None, infer_gtr=True,
from treetime import TreeAnc

tt = TreeAnc(tree=tree, aln=aln, ref=ref, gtr='JC69', alphabet=alphabet,
fill_overhangs=fill_overhangs, verbose=1)
fill_overhangs=fill_overhangs, verbose=1, rng_seed=rng_seed)

# convert marginal (from args.inference) from 'joint' or 'marginal' to True or False
bool_marginal = (marginal == "marginal")

# only infer ancestral sequences, leave branch length untouched
tt.infer_ancestral_sequences(infer_gtr=infer_gtr, marginal=bool_marginal,
reconstruct_tip_states=infer_tips)
reconstruct_tip_states=infer_tips, sample_from_profile='root')

return tt

Expand All @@ -104,7 +106,7 @@ def create_mask(is_vcf, tt, reference_sequence, aln):
only used if is_vcf
aln : dict
describes variation (relative to reference) per sample. Only used if is_vcf.
Returns
-------
numpy.ndarray(bool)
Expand All @@ -122,7 +124,7 @@ def create_mask(is_vcf, tt, reference_sequence, aln):
if alt=='N':
ambig_positions.append(pos)
# ambig_positions = [pos for pos, alt in sample_data.items() if alt=='N']
np.add.at(ambiguous_count, ambig_positions, 1)
np.add.at(ambiguous_count, ambig_positions, 1)
# Secondly, if the VCF defines no mutations but the ref is "N":
no_info_sites = np.array(list(reference_sequence)) == 'N'
no_info_sites[list(variable_sites)] = False
Expand Down Expand Up @@ -171,7 +173,7 @@ def collect_mutations(tt, mask, character_map=None, reference_sequence=None, inf

# Note that for mutations reported across the tree we don't have to consider
# the mask, because while sites which are all Ns may have an inferred base,
# there will be no variablity and thus no mutations.
# there will be no variablity and thus no mutations.
for n in tt.tree.find_clades():
data[n.name] = [a+str(int(pos)+inc)+cm(d)
for a,pos,d in n.mutations]
Expand All @@ -185,7 +187,7 @@ def collect_mutations(tt, mask, character_map=None, reference_sequence=None, inf
data[tt.tree.root.name].append(f"{root_state}{pos+1}{tree_state}")

return data

def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False):
"""
Create a full sequence for every node on the tree. Masked positions will
Expand All @@ -198,7 +200,7 @@ def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False):
instance of treetime with valid ancestral reconstruction
mask : numpy.ndarray(bool)
Mask these positions by changing them to the ambiguous nucleotide
reference_sequence : str or None
reference_sequence : str or None
infer_ambiguous : bool, optional
if true, request the reconstructed sequences from treetime, otherwise retain input ambiguities
Expand Down Expand Up @@ -227,14 +229,14 @@ def collect_sequences(tt, mask, reference_sequence=None, infer_ambiguous=False):
return sequences

def run_ancestral(T, aln, reference_sequence=None, is_vcf=False, full_sequences=False, fill_overhangs=False,
infer_ambiguous=False, marginal=False, alphabet='nuc'):
infer_ambiguous=False, marginal=False, alphabet='nuc', rng_seed=None):
"""
ancestral nucleotide reconstruction using TreeTime
"""

tt = ancestral_sequence_inference(tree=T, aln=aln, ref=reference_sequence if is_vcf else None, marginal=marginal,
fill_overhangs = fill_overhangs, alphabet=alphabet,
infer_tips = infer_ambiguous)
infer_tips = infer_ambiguous, rng_seed=rng_seed)

character_map = {}
for x in tt.gtr.profile_map:
Expand Down Expand Up @@ -295,6 +297,7 @@ def register_parser(parent_subparsers):
)
global_options_group.add_argument('--inference', default='joint', choices=["joint", "marginal"],
help="calculate joint or marginal maximum likelihood ancestral sequence states")
global_options_group.add_argument('--seed', type=int, help="seed for random number generation")

nucleotide_options_group = parser.add_argument_group(
"nucleotide options",
Expand Down Expand Up @@ -416,7 +419,8 @@ def run(args):
infer_ambiguous = args.infer_ambiguous and not args.keep_ambiguous
full_sequences = not is_vcf
nuc_result = run_ancestral(T, aln, reference_sequence=ref if ref else None, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
full_sequences=full_sequences, marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='nuc')
full_sequences=full_sequences, marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='nuc',
rng_seed=args.seed)
anc_seqs = nuc_result['mutations']
anc_seqs['reference'] = {'nuc': nuc_result['root_seq']}

Expand All @@ -437,7 +441,7 @@ def run(args):
features['nuc'].location.end != anc_seqs['annotations']['nuc']['end']):
raise AugurError(f"The 'nuc' annotation coordinates parsed from {args.annotation!r} ({features['nuc'].location.start+1}..{features['nuc'].location.end})"
f" don't match the provided sequence data coordinates ({anc_seqs['annotations']['nuc']['start']}..{anc_seqs['annotations']['nuc']['end']}).")

print("Read in {} features from reference sequence file".format(len(features)))
for gene in genes:
print(f"Processing gene: {gene}")
Expand All @@ -446,7 +450,7 @@ def run(args):
reference_sequence = str(feat.extract(Seq(ref)).translate()) if ref else None

aa_result = run_ancestral(T, fname, reference_sequence=reference_sequence, is_vcf=is_vcf, fill_overhangs=not args.keep_overhangs,
marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='aa')
marginal=args.inference, infer_ambiguous=infer_ambiguous, alphabet='aa', rng_seed=args.seed)
len_translated_alignment = aa_result['tt'].data.full_length*3
if len_translated_alignment != len(feat):
raise AugurError(f"length of translated alignment for {gene} ({len_translated_alignment})"
Expand Down
5 changes: 4 additions & 1 deletion augur/refine.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,7 +313,10 @@ def run(args):
pass
elif args.divergence_units=='mutations':
if not args.timetree:
tt.infer_ancestral_sequences()
# infer ancestral sequence for the purpose of counting mutations
# sample mutations from the root profile, otherwise use most likely state.
# Reconstruct tip states to avoid mutations to N or W etc be counted
tt.infer_ancestral_sequences(marginal=True, reconstruct_tip_states=True, sample_from_profile='root')
nuc_map = profile_maps['nuc']

def are_sequence_states_different(nuc1, nuc2):
Expand Down
2 changes: 1 addition & 1 deletion tests/functional/ancestral/cram/ambiguous-positions.t
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Setup
> --tree "$DATA/tree.nwk" \
> --alignment snps.vcf \
> --vcf-reference reference.fasta \
> --seed 314159 \
> --output-node-data nt_muts.json \
> --output-vcf nt_muts.vcf > /dev/null

Expand All @@ -44,4 +45,3 @@ Setup
> --vcf nt_muts.vcf \
> --json nt_muts.json \
> --ref reference.fasta > /dev/null

2 changes: 2 additions & 0 deletions tests/functional/ancestral/cram/case-sensitive.t
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ Change the _reference_ to lowercase
> --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \
> --root-sequence ref.lower.fasta \
> --output-node-data "nt_muts.ref-seq.json" \
> --seed 314159 \
> --inference marginal > /dev/null

$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
Expand All @@ -33,6 +34,7 @@ be lowecase which will be compared against the uppercase reference
> --alignment sequences.lower.fasta \
> --root-sequence "$TESTDIR/../data/simple-genome/reference.fasta" \
> --output-node-data "nt_muts.ref-seq.json" \
> --seed 314159 \
> --inference marginal > /dev/null

$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ The default is to infer ambiguous bases, so there should not be N bases in the i
$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --seed 314159 \
> --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
> --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" > /dev/null

Expand Down
2 changes: 2 additions & 0 deletions tests/functional/ancestral/cram/general.t
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ node-data JSON we diff against.
> --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \
> --root-sequence "$TESTDIR/../data/simple-genome/reference.fasta" \
> --output-node-data "nt_muts.ref-seq.json" \
> --seed 314159 \
> --inference marginal > /dev/null


Expand All @@ -34,6 +35,7 @@ mutations (as there's nothing to compare the root node to)
> --tree "$TESTDIR/../data/simple-genome/tree.nwk" \
> --alignment "$TESTDIR/../data/simple-genome/sequences.fasta" \
> --output-node-data "nt_muts.no-ref-seq.json" \
> --seed 314159 \
> --inference marginal > /dev/null


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ There should not be N bases in the inferred output sequences.
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --infer-ambiguous \
> --seed 314159 \
> --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
> --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" > /dev/null

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Infer ancestral nucleotide and amino acid sequences, using a genes file.
> --annotation $TESTDIR/../data/zika_outgroup.gb \
> --genes $TESTDIR/../data/genes.txt \
> --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \
> --seed 314159 \
> --output-node-data ancestral_mutations.json \
> --output-sequences ancestral_sequences.fasta \
> --output-translations ancestral_aa_sequences_%GENE.fasta > /dev/null
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ ancestor).
> --root-sequence $TESTDIR/../data/zika_outgroup.gb \
> --genes ENV PRO \
> --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \
> --seed 314159 \
> --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" > /dev/null

Check that the reference length was correctly exported as the nuc annotation
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Infer ancestral nucleotide and amino acid sequences.
> --annotation $TESTDIR/../data/zika_outgroup.gb \
> --genes ENV PRO \
> --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \
> --seed 314159 \
> --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
> --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" \
> --output-translations "$CRAMTMP/$TESTFILE/ancestral_aa_sequences_%GENE.fasta" > /dev/null
Expand Down
10 changes: 8 additions & 2 deletions tests/functional/ancestral/cram/invalid-args.t
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Input FASTA + VCF output is not possible
$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --seed 314159 \
> --output-vcf "output.vcf" > /dev/null
ERROR: VCF output has been requested but the input alignment is not VCF.
[2]
Expand All @@ -16,6 +17,7 @@ Input VCF + FASTA output is not possible (Note that the input file doesn't exist
$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/snps.vcf \
> --seed 314159 \
> --output-sequences "output.fasta" > /dev/null
ERROR: Sequence (fasta) output has been requested but the input alignment is VCF.
[2]
Expand All @@ -25,6 +27,7 @@ Output FASTA _and_ VCF is not possible
$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --seed 314159 \
> --output-vcf "output.vcf" \
> --output-sequences "output.fasta" > /dev/null
ERROR: Both sequence (fasta) and VCF output have been requested, but these are incompatible.
Expand All @@ -39,28 +42,31 @@ This should fail.
> --alignment $TESTDIR/../data/aligned.fasta \
> --annotation $TESTDIR/../data/zika_outgroup.gb \
> --genes ENV PRO \
> --seed 314159 \
> --output-node-data "ancestral_mutations.json" > /dev/null
ERROR: For amino acid sequence reconstruction, you must provide an annotation file, a list of genes, and a template path to amino acid sequences.
[2]

Missing tree file
Missing tree file

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree-doesnt-exist.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --seed 314159 \
> --output-sequences "output.fasta" > /dev/null
ERROR: The provided tree file .* doesn't exist (re)
[2]


Attempting to use FASTA-input reference and VCF-input reference args
(The files here don't exist, but we exit before they're checked)
(The files here don't exist, but we exit before they're checked)

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree-doesnt-exist.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --root-sequence $TESTDIR/../data/reference.fasta \
> --vcf-reference $TESTDIR/../data/reference.fasta \
> --seed 314159 \
> --output-sequences "output.fasta" > /dev/null 2>"err-args.txt"
[2]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ There should be N bases in the inferred output sequences.
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --keep-ambiguous \
> --seed 314159 \
> --output-node-data "$CRAMTMP/$TESTFILE/ancestral_mutations.json" \
> --output-sequences "$CRAMTMP/$TESTFILE/ancestral_sequences.fasta" > /dev/null

Expand Down
14 changes: 6 additions & 8 deletions tests/functional/ancestral/cram/vcf-multi-allele.t
Original file line number Diff line number Diff line change
Expand Up @@ -4,36 +4,34 @@ Setup

$ export DATA="$TESTDIR/../data/simple-genome"

We take the same `snps.vcf` file used in `general.t` but add another
We used a modified `snps.vcf` file used in `vcf.t` but with an additional
allele at site 30 - sample_B has a "G". Since the root is A and this
is the only sample with G it's 'A30G'.
See <https://github.com/nextstrain/augur/issues/1380> for the bug this is testing.

$ sed '11s/^/1\t30\t.\tA\tG,N\t.\t.\t.\tGT\t0\t1\t2\n/' \
> "$DATA/snps.vcf" > snps2.vcf

$ ${AUGUR} ancestral \
> --tree $DATA/tree.nwk \
> --alignment snps2.vcf \
> --alignment $DATA/snps_with_multiple_alleles.vcf \
> --vcf-reference $DATA/reference.fasta \
> --output-node-data nt_muts.json \
> --output-vcf nt_muts.vcf \
> --seed 314159 \
> --inference marginal > /dev/null


$ python3 "$TESTDIR/../../../../scripts/diff_jsons.py" \
> "$DATA/nt_muts.ref-seq.json" \
> nt_muts.json \
> --exclude-regex-paths "root\['nodes'\]\['.+'\]\['sequence'\]" "root\['generated_by'\]"
{'iterable_item_added': {"root['nodes']['sample_B']['muts'][0]": 'A30G'}}
{'iterable_item_added': {"root['nodes']['node_root']['muts'][3]": 'A30G'}}

$ cat > expected.vcf <<EOF
> #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT node_root sample_C node_AB sample_B sample_A
> 1 5 . A C . PASS . GT 1 1 1 1 1
> 1 7 . A G . PASS . GT 0 0 1 1 1
> 1 14 . C T . PASS . GT 0 0 1 1 1
> 1 14 . C T . PASS . GT 1 0 1 1 1
> 1 18 . C T . PASS . GT 1 1 0 0 0
> 1 30 . A G . PASS . GT 0 0 0 1 0
> 1 30 . A G . PASS . GT 1 1 1 1 1
> 1 33 . A C,G . PASS . GT 0 0 2 2 1
> 1 39 . C T . PASS . GT 0 0 0 0 1
> 1 42 . G A . PASS . GT 0 0 0 1 0
Expand Down
3 changes: 2 additions & 1 deletion tests/functional/ancestral/cram/vcf.t
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ but it will have the reference sequence attached.
> --vcf-reference $DATA/reference.fasta \
> --output-node-data "nt_muts.vcf-input.ref-seq.json" \
> --output-vcf nt_muts.vcf \
> --seed 314159 \
> --inference marginal > /dev/null


Expand All @@ -33,7 +34,7 @@ here as it's not relevant to what I'm trying to test.
> #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT node_root sample_C node_AB sample_B sample_A
> 1 5 . A C . PASS . GT 1 1 1 1 1
> 1 7 . A G . PASS . GT 0 0 1 1 1
> 1 14 . C T . PASS . GT 0 0 1 1 1
> 1 14 . C T . PASS . GT 1 0 1 1 1
> 1 18 . C T . PASS . GT 1 1 0 0 0
> 1 33 . A C,G . PASS . GT 0 0 2 2 1
> 1 39 . C T . PASS . GT 0 0 0 0 1
Expand Down
Loading

0 comments on commit a57d50c

Please sign in to comment.