Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ancestral: Accept files to --genes #1353

Merged
merged 7 commits into from
Dec 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
* If `TreeTimeError` is encountered Augur now exits with code 2 rather than 0. (This restores the original behaviour.) [#1367][] (@jameshadfield)
* ancestral, translate: Avoid incompatibilities with Biopython 1.82. [#1374][] (@victorlin)
* ancestral, translate: Address Biopython deprecation warnings. [#1379][] (@victorlin)
* ancestral: Previously, the help text for `--genes` falsely claimed that it could accept a file. Now, it can truly claim that. [#1353][] (@victorlin)
* Deprecate `read_strains` from `augur.utils` and add it to the public API under `augur.io`. [#1353][] (@victorlin)


[#1344]: https://github.com/nextstrain/augur/pull/1344
Expand All @@ -43,6 +45,7 @@
[#1374]: https://github.com/nextstrain/augur/pull/1374
[#1379]: https://github.com/nextstrain/augur/pull/1379
[#1352]: https://github.com/nextstrain/augur/pull/1352
[#1353]: https://github.com/nextstrain/augur/pull/1353

## 23.1.1 (7 November 2023)

Expand Down
10 changes: 10 additions & 0 deletions DEPRECATED.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,16 @@ These features are deprecated, which means they are no longer maintained and
will go away in a future major version of Augur. They are currently still
available for backwards compatibility, but should not be used in new code.

## `augur.utils.read_strains`

*Deprecated in December 2023. Planned for removal March 2024 or after.*

This is part of a [larger effort](https://github.com/nextstrain/augur/issues/1011)
to formalize Augur's Python API.

We recognize the existing usage of this function, so it has been moved to
`augur.io.read_strains`.

## `augur export v1`

*Deprecated in version 22.2.0 (July 2023). Planned for [removal](https://github.com/nextstrain/augur/issues/1266)
Expand Down
8 changes: 5 additions & 3 deletions augur/ancestral.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from .utils import read_tree, InvalidTreeError, write_json, get_json_name
from .utils import parse_genes_argument, read_tree, InvalidTreeError, write_json, get_json_name
from treetime.vcf_utils import read_vcf, write_vcf
from collections import defaultdict

Expand Down Expand Up @@ -320,17 +320,19 @@ def run(args):
'strand': '+', 'type': 'source'}}

if not is_vcf and args.genes:
genes = parse_genes_argument(args.genes)

from .utils import load_features
## load features; only requested features if genes given
features = load_features(args.annotation, args.genes)
features = load_features(args.annotation, genes)
# Ensure the already-created nuc annotation coordinates match those parsed from the reference file
if (features['nuc'].location.start+1 != anc_seqs['annotations']['nuc']['start'] or
features['nuc'].location.end != anc_seqs['annotations']['nuc']['end']):
raise AugurError(f"The 'nuc' annotation coordinates parsed from {args.annotation!r} ({features['nuc'].location.start+1}..{features['nuc'].location.end})"
f" don't match the provided sequence data coordinates ({anc_seqs['annotations']['nuc']['start']}..{anc_seqs['annotations']['nuc']['end']}).")

print("Read in {} features from reference sequence file".format(len(features)))
for gene in args.genes:
for gene in genes:
print(f"Processing gene: {gene}")
fname = args.translations.replace("%GENE", gene)
feat = features[gene]
Expand Down
2 changes: 1 addition & 1 deletion augur/filter/include_exclude_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
from augur.errors import AugurError
from augur.io.metadata import METADATA_DATE_COLUMN
from augur.io.print import print_err
from augur.io.strains import read_strains
from augur.io.vcf import is_vcf as filename_is_vcf
from augur.utils import read_strains
from . import constants

try:
Expand Down
1 change: 1 addition & 0 deletions augur/io/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,4 @@
from .file import open_file # noqa: F401
from .metadata import read_metadata # noqa: F401
from .sequences import read_sequences, write_sequences # noqa: F401
from .strains import read_strains # noqa: F401
27 changes: 27 additions & 0 deletions augur/io/strains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from augur.utils import read_entries


def read_strains(*files, comment_char="#"):
"""Reads strain names from one or more plain text files and returns the
set of distinct strains.

Strain names can be commented with full-line or inline comments. For
example, the following is a valid strain names file::

# this is a comment at the top of the file
strain1 # exclude strain1 because it isn't sequenced properly
strain2
# this is an empty line that will be ignored.

Parameters
----------
files : iterable of str
one or more names of text files with one strain name per line

Returns
-------
set :
strain names from the given input files

"""
return set(read_entries(*files, comment_char=comment_char))
28 changes: 3 additions & 25 deletions augur/translate.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@
The mutation positions in the node-data JSON are one-based.
"""

import os, sys
import sys
import numpy as np
from Bio import SeqIO, Seq, SeqRecord, Phylo
from .io.vcf import write_VCF_translation
from .utils import read_node_data, load_features, write_json, get_json_name
from .utils import parse_genes_argument, read_node_data, load_features, write_json, get_json_name
from treetime.vcf_utils import read_vcf
from augur.errors import AugurError
from textwrap import dedent
Expand Down Expand Up @@ -303,24 +303,6 @@ def assign_aa_fasta(tree, translations):

return aa_muts

def get_genes_from_file(fname):
genes = []
if os.path.isfile(fname):
with open(fname, encoding='utf-8') as ifile:
for line in ifile:
fields = line.strip().split('#')
if fields[0].strip():
genes.append(fields[0].strip())
else:
print("File with genes not found. Looking for", fname)

unique_genes = np.unique(np.array(genes))
if len(unique_genes) != len(genes):
print("You have duplicates in your genes file. They are being ignored.")
print("Read in {} specified genes to translate.".format(len(unique_genes)))

return unique_genes

def sequences_vcf(reference_fasta, vcf):
"""
Extract the nucleotide variation in the VCF
Expand Down Expand Up @@ -395,11 +377,7 @@ def run(args):
is_vcf = any([args.ancestral_sequences.lower().endswith(x) for x in ['.vcf', '.vcf.gz']])
check_arg_combinations(args, is_vcf)

# If genes is a file, read in the genes to translate
if args.genes and len(args.genes) == 1 and os.path.isfile(args.genes[0]):
genes = get_genes_from_file(args.genes[0])
else:
genes = args.genes
genes = parse_genes_argument(args.genes)

## load features; only requested features if genes given
features = load_features(args.reference_sequence, genes)
Expand Down
57 changes: 46 additions & 11 deletions augur/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,12 @@
import os, json, sys
import pandas as pd
from collections import defaultdict, OrderedDict
from textwrap import dedent
from .__version__ import __version__

from augur.data import as_file
from augur.io.file import open_file
from augur.io.print import print_err

from augur.types import ValidationMode
from augur.errors import AugurError
Expand Down Expand Up @@ -749,11 +751,17 @@ def load_mask_sites(mask_file):


def read_strains(*files, comment_char="#"):
"""Reads strain names from one or more plain text files and returns the
set of distinct strains.
print_err(dedent("""
DEPRECATION WARNING: augur.utils.read_strains is no longer maintained and will be removed in the future.
Please use augur.io.read_strains instead."""))
return set(read_entries(*files, comment_char=comment_char))

Strain names can be commented with full-line or inline comments. For
example, the following is a valid strain names file::

def read_entries(*files, comment_char="#"):
"""Reads entries (one per line) from one or more plain text files.

Entries can be commented with full-line or inline comments. For example, the
following is a valid file::

# this is a comment at the top of the file
strain1 # exclude strain1 because it isn't sequenced properly
Expand All @@ -763,21 +771,48 @@ def read_strains(*files, comment_char="#"):
Parameters
----------
files : iterable of str
one or more names of text files with one strain name per line
one or more names of text files with one entry per line

Returns
-------
set :
strain names from the given input files
lines from the given input files

"""
strains = set()
entries = list()
for input_file in files:
with open_file(input_file, 'r') as ifile:
for line in ifile:
# Allow comments anywhere in a given line.
strain_name = line.split(comment_char)[0].strip()
if len(strain_name) > 0:
strains.add(strain_name)
entry = line.split(comment_char)[0].strip()
if len(entry) > 0:
entries.append(entry)

return entries


def parse_genes_argument(input):
if input is None:
return None

# If input is a file, read in the genes to translate
if len(input) == 1 and os.path.isfile(input[0]):
return _get_genes_from_file(input[0])

# Otherwise, the input itself is assumed to be a list of genes
return input


def _get_genes_from_file(fname):
if os.path.isfile(fname):
genes = read_entries(fname)
else:
print("File with genes not found. Looking for", fname)
genes = []

unique_genes = np.unique(np.array(genes))
if len(unique_genes) != len(genes):
print("You have duplicates in your genes file. They are being ignored.")
print("Read in {} specified genes to translate.".format(len(unique_genes)))

return strains
return unique_genes
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
Setup

$ source "$TESTDIR"/_setup.sh

Infer ancestral nucleotide and amino acid sequences, using a genes file.

$ ${AUGUR} ancestral \
> --tree $TESTDIR/../data/tree.nwk \
> --alignment $TESTDIR/../data/aligned.fasta \
> --annotation $TESTDIR/../data/zika_outgroup.gb \
> --genes $TESTDIR/../data/genes.txt \
> --translations $TESTDIR/../data/aa_sequences_%GENE.fasta \
> --output-node-data ancestral_mutations.json \
> --output-sequences ancestral_sequences.fasta \
> --output-translations ancestral_aa_sequences_%GENE.fasta > /dev/null

Check that the reference length was correctly exported as the nuc annotation

$ grep -E "\"(ENV|PRO|nuc)\": {" ancestral_mutations.json
"ENV": {
"PRO": {
"nuc": {
2 changes: 2 additions & 0 deletions tests/functional/ancestral/data/genes.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
ENV
PRO
19 changes: 19 additions & 0 deletions tests/io/test_strains.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from pathlib import Path

from augur.io.strains import read_strains


def test_read_strains(tmpdir):
# Write one list of filenames with some unnecessary whitespace.
strains1 = Path(tmpdir) / Path("strains1.txt")
with open(strains1, "w") as oh:
oh.write("strain1 # this is an inline comment about strain 1\nstrain2\n # this is a comment preceded by whitespace.\n")

# Write another list of filenames with a comment.
strains2 = Path(tmpdir) / Path("strains2.txt")
with open(strains2, "w") as oh:
oh.write("# this is a comment. ignore this.\nstrain2\nstrain3\n")

strains = read_strains(strains1, strains2)
assert len(strains) == 3
assert "strain1" in strains
15 changes: 0 additions & 15 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,21 +79,6 @@ def test_read_mask_file_drm_file(self, tmpdir):
fh.write("\n".join(drm_lines))
assert utils.read_mask_file(drm_file) == expected_sites

def test_read_strains(self, tmpdir):
# Write one list of filenames with some unnecessary whitespace.
strains1 = Path(tmpdir) / Path("strains1.txt")
with open(strains1, "w") as oh:
oh.write("strain1 # this is an inline comment about strain 1\nstrain2\n # this is a comment preceded by whitespace.\n")

# Write another list of filenames with a comment.
strains2 = Path(tmpdir) / Path("strains2.txt")
with open(strains2, "w") as oh:
oh.write("# this is a comment. ignore this.\nstrain2\nstrain3\n")

strains = utils.read_strains(strains1, strains2)
assert len(strains) == 3
assert "strain1" in strains

def test_write_json_data_types(self, tmpdir):
"""write_json should be able to serialize various data types."""
data = {
Expand Down
Loading