Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

export multiple trees #1450

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
73 changes: 48 additions & 25 deletions augur/export_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,24 @@ def order_nodes(node):
return od


def read_trees(filenames):
trees = [Phylo.read(fname, 'newick') for fname in filenames]
# augur export requires unique node names (both terminal and external) as these
# are used to associate metadata/node-data with nodes. Any duplication is fatal.
# The exception to this is unlabelled node names, which auspice will handle but
# won't be associated with any metadata within export.
node_names = [clade.name for tree in trees for clade in tree.root.find_clades()]
if None in node_names:
raise AugurError(f"Tree contains unnamed nodes. If these are internal nodes you may wish to run "+
"`augur refine --tree <newick> --output-tree <newick>` to name them.")
if len(set(node_names))!=len(node_names):
from collections import Counter
dups = [name for name, count in Counter(node_names).items() if count>1]
raise AugurError(f"{len(dups)} node names occur multiple times in the tree: " +
", ".join([f"{v!r}" for v in dups[0:5]]) + ("..." if len(dups)>5 else ""))
return (trees, node_names)


def node_div(T, node_attrs):
"""
Scans the provided tree & metadata to see if divergence is defined, and if so returns
Expand Down Expand Up @@ -455,7 +473,7 @@ def _get_colorings():
data_json['meta']["colorings"] = colorings


def set_geo_resolutions(data_json, config, command_line_traits, lat_long_mapping, node_attrs):
def set_geo_resolutions(trees, meta_json, config, command_line_traits, lat_long_mapping, node_attrs):
"""
appropriately combine provided geo resolutions from command line & config files
and associate with lat/longs.
Expand Down Expand Up @@ -517,12 +535,11 @@ def _transfer_geo_data(node):
else:
warn("Geo resolution \"{}\" had no demes with supplied lat/longs and will be excluded from the exported \"geo_resolutions\".".format(trait_info["key"]))

#
_transfer_geo_data(data_json['tree'])

for subtree in trees:
_transfer_geo_data(subtree)

if geo_resolutions:
data_json['meta']["geo_resolutions"] = geo_resolutions
meta_json["geo_resolutions"] = geo_resolutions


def set_annotations(data_json, node_data):
Expand Down Expand Up @@ -722,9 +739,9 @@ def node_to_author_tuple(data):

return node_author_info

def set_branch_attrs_on_tree(data_json, branch_attrs):
def set_branch_attrs_on_tree(trees, branch_attrs):
"""
Shifts the provided `branch_attrs` onto the (auspice) `data_json`.
Shifts the provided `branch_attrs` onto the provided tree
Currently all data is transferred, there is no way for (e.g.) the set of exported
labels to be restricted by the user in a config.
"""
Expand All @@ -733,10 +750,12 @@ def _recursively_set_data(node):
node['branch_attrs'] = branch_attrs[node['name']]
for child in node.get("children", []):
_recursively_set_data(child)
_recursively_set_data(data_json["tree"])

for subtree in trees:
_recursively_set_data(subtree)

def set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns):

def set_node_attrs_on_tree(trees, meta_json, node_attrs, additional_metadata_columns):
'''
Assign desired colorings, metadata etc to the `node_attrs` of nodes in the tree

Expand Down Expand Up @@ -786,10 +805,10 @@ def _transfer_url_accession(node, raw_data):

def _transfer_colorings_filters(node, raw_data):
trait_keys = set() # order we add to the node_attrs is not important for auspice
if "colorings" in data_json["meta"]:
trait_keys = trait_keys.union([t["key"] for t in data_json["meta"]["colorings"]])
if "filters" in data_json["meta"]:
trait_keys = trait_keys.union(data_json["meta"]["filters"])
if "colorings" in meta_json:
trait_keys = trait_keys.union([t["key"] for t in meta_json["colorings"]])
if "filters" in meta_json:
trait_keys = trait_keys.union(meta_json["filters"])
exclude_list = ["gt", "num_date", "author"] # exclude special cases already taken care of
trait_keys = trait_keys.difference(exclude_list)
for key in trait_keys:
Expand Down Expand Up @@ -822,7 +841,8 @@ def _recursively_set_data(node):
for child in node.get("children", []):
_recursively_set_data(child)

_recursively_set_data(data_json["tree"])
for subtree in trees:
_recursively_set_data(subtree)

def node_data_prop_is_normal_trait(name):
# those traits / keys / attrs which are not "special" and can be exported
Expand Down Expand Up @@ -876,7 +896,7 @@ def register_parser(parent_subparsers):
required = parser.add_argument_group(
title="REQUIRED"
)
required.add_argument('--tree','-t', metavar="newick", required=True, help="Phylogenetic tree, usually output from `augur refine`")
required.add_argument('--tree','-t', metavar="newick", nargs='+', action='extend', required=True, help="Phylogenetic tree(s), usually output from `augur refine`")
required.add_argument('--output', metavar="JSON", required=True, help="Output file (typically for visualisation in auspice)")

config = parser.add_argument_group(
Expand Down Expand Up @@ -1060,12 +1080,12 @@ def create_branch_labels(branch_attrs, node_data, branch_data):
continue
branch_attrs[node_name]["labels"][label_key] = label_value

def parse_node_data_and_metadata(T, node_data, metadata):
def parse_node_data_and_metadata(node_names, node_data, metadata):
node_data_names = set()
metadata_names = set()

# assign everything to node_attrs, exclusions considered later
node_attrs = {clade.name: {} for clade in T.root.find_clades()}
node_attrs = {name: {} for name in node_names}

# first pass: metadata
for metadata_id, node in metadata.items():
Expand All @@ -1089,7 +1109,7 @@ def parse_node_data_and_metadata(T, node_data, metadata):
# third pass: create `branch_attrs`. The data comes from
# (a) some keys within `node_data['nodes']` (for legacy reasons)
# (b) the `node_data['branches']` dictionary, which currently only defines labels
branch_attrs = {clade.name: defaultdict(dict) for clade in T.root.find_clades()}
branch_attrs = {name: defaultdict(dict) for name in node_names}
create_branch_mutations(branch_attrs, node_data)
create_branch_labels(branch_attrs, node_data['nodes'], node_data.get('branches', {}))

Expand Down Expand Up @@ -1174,9 +1194,9 @@ def run(args):
metadata_file = {}

# parse input files
T = Phylo.read(args.tree, 'newick')
(trees, node_names) = read_trees(args.tree)
node_data, node_attrs, node_data_names, metadata_names, branch_attrs = \
parse_node_data_and_metadata(T, node_data_file, metadata_file)
parse_node_data_and_metadata(node_names, node_data_file, metadata_file)
config = get_config(args)
additional_metadata_columns = get_additional_metadata_columns(config, args.metadata_columns, metadata_names)

Expand Down Expand Up @@ -1205,15 +1225,18 @@ def run(args):
sys.exit(2)
set_filters(data_json, config)

# set tree structure
data_json["tree"] = convert_tree_to_json_structure(T.root, node_attrs, node_div(T, node_attrs))
set_node_attrs_on_tree(data_json, node_attrs, additional_metadata_columns)
set_branch_attrs_on_tree(data_json, branch_attrs)
# maintain a list of (sub)trees so that we are always dealing with a list even in the (common) case of a single tree
trees = [convert_tree_to_json_structure(tree.root, node_attrs, node_div(tree, node_attrs)) for tree in trees]
set_node_attrs_on_tree(trees, data_json['meta'], node_attrs, additional_metadata_columns)
set_branch_attrs_on_tree(trees, branch_attrs)

set_geo_resolutions(data_json, config, args.geo_resolutions, read_lat_longs(args.lat_longs), node_attrs)
set_geo_resolutions(trees, data_json['meta'], config, args.geo_resolutions, read_lat_longs(args.lat_longs), node_attrs)
set_panels(data_json, config, args.panels)
set_data_provenance(data_json, config)

# If we have a single tree then we set this directly as <json>.tree
data_json['tree'] = trees[0] if len(trees)==1 else trees

# pass through any extensions block in the auspice config JSON without any changes / checking
if config.get("extensions"):
data_json["meta"]["extensions"] = config["extensions"]
Expand Down
36 changes: 28 additions & 8 deletions augur/validate_export.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import sys
from collections import defaultdict

def ensure_no_duplicate_names(root, ValidateError):
def ensure_no_duplicate_names(tree, ValidateError):
"""
Check that all node names are identical, which is required for auspice (v2) JSONs.
"""
Expand All @@ -18,10 +18,14 @@ def recurse(node):
names.add(node["name"])
if "children" in node:
[recurse(child) for child in node["children"]]
recurse(root)
if isinstance(tree, list):
for subtree in tree:
recurse(subtree)
else:
recurse(tree)


def collectTreeAttrsV2(root, warn):
def collectTreeAttrsV2(tree, warn):
"""
Collect all keys specified on `node["node_attrs"]` throughout the tree
and the values associated with them. Note that this will only look at
Expand All @@ -47,7 +51,12 @@ def recurse(node):
[recurse(child) for child in node["children"]]
else:
num_terminal += 1
recurse(root)

if isinstance(tree, list):
for subtree in tree:
recurse(subtree)
else:
recurse(tree)

for data in seen.values():
if data["count"] == num_nodes:
Expand All @@ -56,7 +65,7 @@ def recurse(node):
return(seen, num_terminal)


def collectMutationGenes(root):
def collectMutationGenes(tree):
"""
Returns a set of all genes specified in the tree in the "aa_muts" objects
"""
Expand All @@ -67,17 +76,28 @@ def recurse(node):
genes.update(mutations.keys())
if "children" in node:
[recurse(child) for child in node["children"]]
recurse(root)

if isinstance(tree, list):
for subtree in tree:
recurse(subtree)
else:
recurse(tree)

genes -= {"nuc"}
return genes

def collectBranchLabels(root):
def collectBranchLabels(tree):
labels = set()
def recurse(node):
labels.update(node.get("branch_attrs", {}).get("labels", {}).keys())
if "children" in node:
[recurse(child) for child in node["children"]]
recurse(root)

if isinstance(tree, list):
for subtree in tree:
recurse(subtree)
else:
recurse(tree)
return labels

def verifyMainJSONIsInternallyConsistent(data, ValidateError):
Expand Down
10 changes: 10 additions & 0 deletions tests/functional/export_v2/cram/duplicate-node-names.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
$ source "$TESTDIR"/_setup.sh

$ echo "(tipA:1,(tipA:1,tipA:1)internalBC:2,(tipD:3,tipE:4,tipF:1)internalDEF:5)ROOT:0;" > tree1.nwk

$ ${AUGUR} export v2 \
> --tree tree1.nwk \
> --output minimal.json
ERROR: 1 node names occur multiple times in the tree: 'tipA'
[2]

25 changes: 14 additions & 11 deletions tests/functional/export_v2/cram/minify-output.t
Original file line number Diff line number Diff line change
Expand Up @@ -16,27 +16,29 @@ names to create larger file sizes with less recursion.
> }

A small tree is not automatically minified.
The unminified output is 13KB which is considered acceptable.
The unminified output is 15KB which is considered acceptable.

$ echo "$(generate_newick 10);" > small_tree_raw.nwk

Use augur refine to add internal node names
$ ${AUGUR} refine --tree small_tree_raw.nwk --output-tree small_tree.nwk &>/dev/null
Comment on lines +23 to +24
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggestion: do this in generate_newick directly. I just tested locally and something like this seems to work:

- echo "($(generate_newick $((n-1))),$prefix$n)"
+ echo "($prefix$n,$(generate_newick $((n-1)))):INTERNAL_$n"

This runs much faster than augur refine, especially for the large tree.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Without running refine we fail validation due to the absence of any divergence values on the tree.

I just tested locally and something like this seems to work:

Do you have a patch which passes the tests?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would skip validation since validity should be tested elsewhere. Patch: d1c3997


$ echo "$(generate_newick 10);" > small_tree.nwk

$ ${AUGUR} export v2 \
> --tree small_tree.nwk \
> --skip-validation \
> --output output.json &>/dev/null

$ head -c 20 output.json
{
"version": "v2", (no-eol)

$ ls -l output.json | awk '{print $5}'
13813
15039

It can be forcefully minified with an argument.

$ ${AUGUR} export v2 \
> --tree small_tree.nwk \
> --skip-validation \
> --minify-json \
> --output output.json &>/dev/null

Expand All @@ -48,21 +50,22 @@ even if it may seem "falsey".

$ AUGUR_MINIFY_JSON=0 ${AUGUR} export v2 \
> --tree small_tree.nwk \
> --skip-validation \
> --output output.json &>/dev/null

$ head -c 20 output.json
{"version": "v2", "m (no-eol)


A large tree, when forcefully not minified, has an output size of 6MB which is
A large tree, when forcefully not minified, has an output size of 8MB which is
considered large.

$ echo "$(generate_newick 500);" > big_tree.nwk
$ echo "$(generate_newick 500);" > big_tree_raw.nwk

Use augur refine to add internal node names
$ ${AUGUR} refine --tree big_tree_raw.nwk --output-tree big_tree.nwk &>/dev/null

$ ${AUGUR} export v2 \
> --tree big_tree.nwk \
> --skip-validation \
> --no-minify-json \
> --output output.json &>/dev/null

Expand All @@ -71,7 +74,7 @@ considered large.
"version": "v2", (no-eol)

$ ls -l output.json | awk '{print $5}'
6568454
8591420

This means it is automatically minified.

Expand All @@ -84,4 +87,4 @@ This means it is automatically minified.
{"version": "v2", "m (no-eol)

$ ls -l output.json | awk '{print $5}'
561436
576414
18 changes: 18 additions & 0 deletions tests/functional/export_v2/cram/missing-internal-nodes.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
Setup

$ source "$TESTDIR"/_setup.sh

Test a tree with unlabelled internal nodes.
In Augur v24.3.0 and earlier these would work _sometimes_ depending
on what code paths were hit, but in most real-life cases would raise
an uncaught error. We now check for these when we parse the tree.

$ echo "(tipA:1,(tipB:1,tipC:1):2,(tipD:3,tipE:4,tipF:1):5):0;" > tree1.nwk

$ ${AUGUR} export v2 \
> --tree tree1.nwk \
> --metadata "$TESTDIR/../data/dataset1_metadata_with_name.tsv" \
> --node-data "$TESTDIR/../data/div_node-data.json" \
> --output test1.json
ERROR: Tree contains unnamed nodes.+ (re)
[2]
Loading
Loading