From 9e71d58796cfb9e17398e12c3d693fc10a4e3b82 Mon Sep 17 00:00:00 2001 From: james hadfield Date: Mon, 25 Nov 2024 13:59:08 +1300 Subject: [PATCH] WIP add config schema & generate HTML docs Generate docs using json-schema-for-humans: ``` generate-schema-doc --config template_name=js config/schema.yaml public/schema.html ``` --- Snakefile | 12 ++ config/schema.yaml | 375 ++++++++++++++++++++++++++++++++++++++ public/schema.html | 19 ++ public/schema_doc.css | 181 ++++++++++++++++++ public/schema_doc.min.js | 1 + scripts/validate_utils.py | 29 +++ 6 files changed, 617 insertions(+) create mode 100644 config/schema.yaml create mode 100644 public/schema.html create mode 100644 public/schema_doc.css create mode 100644 public/schema_doc.min.js create mode 100644 scripts/validate_utils.py diff --git a/Snakefile b/Snakefile index 6e18a81..a510058 100755 --- a/Snakefile +++ b/Snakefile @@ -19,6 +19,18 @@ if os.path.exists("config.yaml"): from pprint import pp; pp(config, stream=sys.stderr) # TODO XXX remove class InvalidConfigError(Exception): pass +# Before we validate the config against the schema, check to see if we've failed to provide one (this guard is placed after the InvalidConfigError definition so that the raise below can resolve the name) +if not config: + print("-"*80 + "\nNo config loaded!", file=sys.stderr) + print("Avian-flu is intended to be run from the snakefile inside a subdir " + "(e.g. gisaid/Snakefile) which will pick up the default configfile for that workflow. " + "Alternatively you can pass in the config via `--configfile`", file=sys.stderr) + print("-"*80, file=sys.stderr) + raise InvalidConfigError("No config") + +from scripts.validate_utils import validate +validate(config) + diff --git a/config/schema.yaml b/config/schema.yaml new file mode 100644 index 0000000..2a372de --- /dev/null +++ b/config/schema.yaml @@ -0,0 +1,375 @@ +$schema: http://json-schema.org/draft-07/schema# +type: object +title: Avian-flu config schema +description: > + This is the schema for the Nextstrain avian-flu phylogenetic workflow + <https://github.com/nextstrain/avian-flu>. 
The readme (viewable at that URL) + provides general information including how to run the workflow. This schema + presents the interface into the phylogenetic workflow(s). +__aliases: + # We use YAML aliases to get the behaviour we want from $refs, namely we want the objects + # to be merged. This is not the case in draft-07 - any properties in an object with a '$ref' + # key are dropped. + generic_workflow_file: &generic_workflow_file + description: > + Relative path which is to be found in the analysis directory, the entry snakefile directory + (e.g. `avian-flu/gisaid`) or the workflow directory (`avian-flu`). First match is used. + File path which may include wildcards (see examples). + # NOTE: Some places where this is used provide their own description and thus this one isn't used. + type: string + # TODO: it's not clear whether spaces in filenames will work everywhere... + wildcard_param_property: &wildcard_param_property + # While the intention of the config is to distinguish between types (e.g. numbers, strings) + # In reality this doesn't matter _most_ of the time. We could create many such definitions to + # handle the combination of allowable types, but we can't pass in the type to this + # definition where it's used (e.g. via the $ref). + # Leaving this here as a potential TODO + oneOf: + - type: object + patternProperties: + "^[^_/]+/[^_/]+/[^_/]+$": + type: ['string', 'number', 'boolean', 'integer'] + description: &wildcard_param_property_wildcard_description > + An object to link wildcard values to the parameter to use. The keys are a `/`-separated + string of three parts corresponding to the subtype, segment and time wildcards. You can use + a `*` character for any part in order to match any wildcard value. For a given build this + object is searched for matching wildcard combinations, and the highest specificity key + is chosen. 
Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then + we have a search order of: + - `h5nx/pb2/2y` ─ all 3 wildcard values specified + - `h5nx/pb2/*` ┐ + - `h5nx/*/2y` ├ 2/3 wildcard values specified + - `*/pb2/2y` ┘ + - `h5nx/*/*` ┐ + - `*/pb2/*` ├ 1/3 wildcard values specified + - `*/*/2y` ┘ + - `*/*/*` ─ default / fall-back + and the first key present in the config is used. + + The expected value type is dependent on the specific config parameter. + # Note: either YAML doesn't preserve multiple spaces or the generated HTML doesn't + # render them correctly + - type: ['string', 'number', 'boolean', 'integer'] + description: > + A scalar value which will be used for all builds, i.e. it does not change with + the wildcards. The type is dependent on the specific config parameter. +additionalProperties: False +required: + - builds + - target_patterns + - inputs + - subtype_lookup + - reference + - auspice_config + - colors + - lat_longs + - include_strains + - dropped_strains + - clades_file + - description + - filter + - refine + - ancestral + - traits + - export +properties: + builds: + title: Target subtype/segment/time combinations + description: > + Each element defines one or more subtypes, segments and time resolutions + which are expanded to produce all combinations. You can supply multiple elements + here in order to define different combinations. + + NOTE: H5N1 cattle-outbreak schemas should not define `time`. + examples: + - | + - subtype: + - h7n9 + - h9n2 + segment: + - ha + - na + time: + - all-time + type: array + items: + $ref: "#/$defs/build_element" + target_patterns: + default: "auspice/avian-flu_{subtype}_{segment}_{time}.json" + description: > + You can modify the target(s) the workflow will produce, using the `subtype`, `segment` and `time` wildcards which + will be filled in as per the builds the config defines. + Alternatively you can specify target filenames when you invoke the pipeline. 
+ type: array + items: + type: string + minItems: 1 + custom_rules: + title: Additional snakefiles to include + description: > + TODO XXX describe where these can be located + type: array + items: + type: string + inputs: + title: Starting inputs + description: > + Commonly used by the base configs to define starting metadata/sequences on S3. + Override this if you don't want to use these canonical starting points, e.g. if you want to use locally ingested data. + type: array + items: + $ref: "#/$defs/input_item" + additional_inputs: + title: Additional starting inputs + description: > + Additional inputs. These will be merged with any `inputs` (see above) with the additional inputs taking priority. + See README for more information. + type: array + items: + $ref: "#/$defs/input_item" + subtype_lookup: + title: Mapping between wildcard subtype and metadata subtypes + description: > + The elements in each list are used by `augur filter` to filter the entire data + to produce metadata/sequences for each of the associated builds. + patternProperties: + "^(/[^/]+)+$": + type: array + minItems: 1 + items: + type: string + same_strains_per_segment: + description: > + When true we enforce each segment to use the same set of strains. + Not used for the h5n1-cattle-outbreak builds + type: boolean + reference: + title: Alignment reference (GenBank file) + description: > + align + FIX ME XXX + <<: *generic_workflow_file + auspice_config: + title: Auspice config JSON + <<: *generic_workflow_file + colors: + title: Colors TSV file + description: > + For GISAID builds this is used as augur export's `--colors` argument. + For h5n1-cattle-flu builds we additionally append colors which are not (yet) + config-definable. 
+ <<: *generic_workflow_file + lat_longs: + title: Additional lat-longs + <<: *generic_workflow_file + include_strains: + title: TXT file listing strains which will be included + <<: *generic_workflow_file + dropped_strains: + title: TXT file listing strains which will be dropped + <<: *generic_workflow_file + clades_file: + title: H5 clades TSV file (H5 builds only) + <<: *generic_workflow_file + description: + title: Markdown file describing the footer shown in Auspice + <<: *generic_workflow_file + filter: + title: "Parameters for the filter step of the pipeline" + type: object + additionalProperties: False + required: + - target_sequences_per_tree + - min_length + - min_date + - group_by + - exclude_where + properties: + target_sequences_per_tree: + title: Target this many sequences per Auspice dataset + description: Augur filter `--subsample-max-sequences` value + <<: *wildcard_param_property + min_length: + title: Minimum length for sequences to be included + description: Augur filter `--min-length` value + <<: *wildcard_param_property + min_date: + title: Minimum date for sequences to be included + examples: + - "2y" + - 1996 + <<: *wildcard_param_property + group_by: + description: Augur filter `--group-by` value + <<: *wildcard_param_property + exclude_where: + description: Augur filter `--exclude-where` value + <<: *wildcard_param_property + refine: + title: "Parameters for the augur refine step of the pipeline" + type: object + additionalProperties: False + required: + - coalescent + - date_inference + - clock_filter_iqd + - clock_rates + - root + properties: + coalescent: + title: Coalescent time scale + description: > + Value passed to `augur refine --coalescent`. + <<: *wildcard_param_property + date_inference: + title: Date inference method + description: > + Value passed to `augur refine --date-inference`. 
+ <<: *wildcard_param_property + clock_filter_iqd: + title: Filter out sequences which fall outside the inferred clock + description: > + Value passed to `augur refine --clock-filter-iqd`. If you supply a falsey + value then this argument will not be provided. + <<: *wildcard_param_property + clock_rates: + title: Clock rate and std dev + oneOf: + - type: object + description: *wildcard_param_property_wildcard_description + patternProperties: + "^[^_/]+/[^_/]+/[^_/]+$": + $ref: "#/$defs/clock_rate_element" + - $ref: "#/$defs/clock_rate_element" + # Can't provide a description here as $ref will clobber it + root: + title: Root or rooting mechanism + description: > + Value passed to `augur refine --root`. If you supply a falsey + value then this argument will not be provided. + <<: *wildcard_param_property + segment_lengths: + title: Sequence lengths for each segment + description: > + This option is only used for the `h5n1-cattle-outbreak` genome builds. + We use these values to calculate the clock-rate for the genome build from the supplied values + for each segment (via `clock_rates` config). + Values must be integers. + <<: *wildcard_param_property + ancestral: + title: "Parameters for the augur ancestral step of the pipeline" + type: object + additionalProperties: False + required: + - inference + - root_seq + properties: + inference: + title: Inference method + description: Passed to `augur ancestral --inference` + <<: *wildcard_param_property + root_seq: + title: Root sequence + description: > + Value passed to `augur ancestral --root-sequence`. If you supply a falsey + value then this argument will not be provided. 
+ <<: *wildcard_param_property + traits: + title: "Parameters for the augur traits step of the pipeline (DTA)" + type: object + additionalProperties: False + required: + - columns + - sampling_bias_correction + - confidence + properties: + columns: + title: DTA columns + description: > + Whitespace-separated columns on which to run `augur traits` + <<: *wildcard_param_property + sampling_bias_correction: + title: Sampling Bias Correction + description: > + Provide a falsey value to disable this correction, otherwise the provided value `X` is + passed as `augur traits --sampling-bias-correction X` + <<: *wildcard_param_property + confidence: + title: Infer confidence? + description: Value should be truthy or falsey + <<: *wildcard_param_property + export: + title: "Parameters for the augur export step of the pipeline" + description: "See also: `config.auspice_config`, above" + type: object + additionalProperties: False + required: + - title + properties: + title: + description: > + Use this property to override the auspice-config defined title. + Falsey values will use the auspice-config title. + <<: *wildcard_param_property +$defs: + string_or_array_of_strings: &string_or_array_of_strings + oneOf: + - type: string + - type: array + minItems: 1 + items: + type: string + build_element: + type: object + required: ['subtype', segment] + additionalProperties: False + properties: + subtype: + examples: + - H5N1 + - ['H5N1', 'H7N9'] + <<: *string_or_array_of_strings + segment: + examples: + - HA + - ['PB1', 'PB2'] + <<: *string_or_array_of_strings + time: + description: | + Note that this is unused for the h5n1-cattle-flu outbreak workflows + For GISAID workflows this is required. + examples: + - 2y + - ['all-time', '2y'] + <<: *string_or_array_of_strings + clock_rate_element: + description: The clock rate & std dev. Provide an empty string to infer this value instead. 
+ oneOf: + - type: array + items: + - type: number + title: Clock rate (subs/site/year) + - type: number + title: Clock std dev + - enum: [''] + input_item: + type: object + required: ['name'] + properties: + name: + type: string + title: Name of the input + description: > + May be used in intermediate filepaths and in merged metadata columns. + Please avoid spaces. + metadata: + type: string + title: Metadata TSV filepath or S3 URI + sequences: + title: Sequence FASTA, 1 per segment + oneOf: + - type: string + title: Filepath/address with {segment} wildcard + - type: object + title: Map of segment name to filepath/address + diff --git a/public/schema.html b/public/schema.html new file mode 100644 index 0000000..ed71e41 --- /dev/null +++ b/public/schema.html @@ -0,0 +1,19 @@ + Avian-flu config schema

Avian-flu config schema

Type: object

This is the schema for the Nextstrain avian-flu phylogenetic workflow <https://github.com/nextstrain/avian-flu>. The readme (viewable at that URL) provides general information including how to run the workflow. This schema presents the interface into the phylogenetic workflow(s).

No Additional Properties

Target subtype/segment/time combinations

Type: array

Each element defines one or more subtypes, segments and time resolutions which are expanded to produce all combinations. You can supply multiple elements here in order to define different combinations.
NOTE: H5N1 cattle-outbreak schemas should not define time.

No Additional Items

Each item of this array must be:

Type: object
No Additional Properties


Type: array of string

Must contain a minimum of 1 items

No Additional Items

Each item of this array must be:


Examples:

"H5N1"
+
[
+    "H5N1",
+    "H7N9"
+]
+


Type: array of string

Must contain a minimum of 1 items

No Additional Items

Each item of this array must be:


Examples:

"HA"
+
[
+    "PB1",
+    "PB2"
+]
+


Note that this is unused for the h5n1-cattle-flu outbreak workflows
For GISAID workflows this is required.

Type: string
Type: array of string

Must contain a minimum of 1 items

No Additional Items

Each item of this array must be:


Examples:

"2y"
+
[
+    "all-time",
+    "2y"
+]
+

Example:

"- subtype:\n    - h7n9\n    - h9n2\n  segment:\n    - ha\n    - na\n  time:\n    - all-time\n"
+

Type: array of string Default: "auspice/avian-flu_{subtype}_{segment}_{time}.json"

You can modify the target(s) the workflow will produce, using the subtype, segment and time wildcards which will be filled in as per the builds the config defines. Alternatively you can specify target filenames when you invoke the pipeline.

Must contain a minimum of 1 items

No Additional Items

Each item of this array must be:

Additional snakefiles to include

Type: array of string

TODO XXX describe where these can be located

No Additional Items

Each item of this array must be:

Type: string

Starting inputs

Type: array

Commonly used by the base configs to define starting metadata/sequences on S3. Override this if you don't want to use these canonical starting points, e.g. if you want to use locally ingested data.

No Additional Items

Each item of this array must be:

Type: object

Name of the input

Type: string

May be used in intermediate filepaths and in merged metadata columns. Please avoid spaces.

Metadata TSV filepath or S3 URI

Type: string

Sequence FASTA, 1 per segment


Additional starting inputs

Type: array

Additional inputs. These will be merged with any inputs (see above) with the additional inputs taking priority. See README for more information.

No Additional Items

Each item of this array must be:

Mapping between wildcard subtype and metadata subtypes

Type: object

The elements in each list are used by augur filter to filter the entire data to produce metadata/sequences for each of the associated builds.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^(/[^/]+)+$
Type: array of string

Must contain a minimum of 1 items

No Additional Items

Each item of this array must be:

Type: boolean

When true we enforce each segment to use the same set of strains. Not used for the h5n1-cattle-outbreak builds

Alignment reference (GenBank file)

Type: string

align FIX ME XXX

Auspice config JSON

Type: string

Relative path which is to be found in the analysis directory, the entry snakefile directory (e.g. avian-flu/gisaid) or the workflow directory (avian-flu). First match is used. File path which may include wildcards (see examples).

Colors TSV file

Type: string

For GISAID builds this is used as augur export's --colors argument. For h5n1-cattle-flu builds we additionally append colors which are not (yet) config-definable.

Additional lat-longs

Type: string

Relative path which is to be found in the analysis directory, the entry snakefile directory (e.g. avian-flu/gisaid) or the workflow directory (avian-flu). First match is used. File path which may include wildcards (see examples).

TXT file listing strains which will be included

Type: string

Relative path which is to be found in the analysis directory, the entry snakefile directory (e.g. avian-flu/gisaid) or the workflow directory (avian-flu). First match is used. File path which may include wildcards (see examples).

TXT file listing strains which will be dropped

Type: string

Relative path which is to be found in the analysis directory, the entry snakefile directory (e.g. avian-flu/gisaid) or the workflow directory (avian-flu). First match is used. File path which may include wildcards (see examples).

H5 clades TSV file (H5 builds only)

Type: string

Relative path which is to be found in the analysis directory, the entry snakefile directory (e.g. avian-flu/gisaid) or the workflow directory (avian-flu). First match is used. File path which may include wildcards (see examples).

Markdown file describing the footer shown in Auspice

Type: string

Relative path which is to be found in the analysis directory, the entry snakefile directory (e.g. avian-flu/gisaid) or the workflow directory (avian-flu). First match is used. File path which may include wildcards (see examples).

Parameters for the filter step of the pipeline

Type: object
No Additional Properties

Target this many sequences per Auspice dataset


Augur filter --subsample-max-sequences value

Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.

Minimum length for sequences to be included


Augur filter --min-length value

Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.

Minimum date for sequences to be included


Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.


Examples:

"2y"
+
1996
+


Augur filter --group-by value

Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.


Augur filter --exclude-where value

Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.

Parameters for the augur refine step of the pipeline

Type: object
No Additional Properties

Coalescent time scale


Value passed to augur refine --coalescent.

Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.

Date inference method


Value passed to augur refine --date-inference.

Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.

Filter out sequences which fall outside the inferred clock


Value passed to augur refine --clock-filter-iqd. If you supply a falsey value then this argument will not be provided.

Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.

Clock rate and std dev


Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: object

The clock rate & std dev. Provide an empty string to infer this value instead.

Type: array
No Additional Items

Tuple Validation

Item at 1 must be:
Item at 2 must be:
Type: enum (of string)

Must be one of:

  • ""
Type: object

The clock rate & std dev. Provide an empty string to infer this value instead.

Same definition as refine_clock_rates_oneOf_i0_pattern1

Root or rooting mechanism


Value passed to augur refine --root. If you supply a falsey value then this argument will not be provided.

Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.

Sequence lengths for each segment


This option is only used for the h5n1-cattle-outbreak genome builds. We use these values to calculate the clock-rate for the genome build from the supplied values for each segment (via clock_rates config). Values must be integers.

Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.

Parameters for the augur ancestral step of the pipeline

Type: object
No Additional Properties

Inference method


Passed to augur ancestral --inference

Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.

Root sequence


Value passed to augur ancestral --root-sequence. If you supply a falsey value then this argument will not be provided.

Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.

Parameters for the augur traits step of the pipeline (DTA)

Type: object
No Additional Properties

DTA columns


Whitespace-separated columns on which to run augur traits

Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.

Sampling Bias Correction


Provide a falsey value to disable this correction, otherwise the provided value X is passed as augur traits --sampling-bias-correction X

Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.

Infer confidence?


Value should be truthy or falsey

Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.

Parameters for the augur export step of the pipeline

Type: object

See also: config.auspice_config, above

No Additional Properties


Use this property to override the auspice-config defined title. Falsey values will use the auspice-config title.

Type: object

An object to link wildcard values to the parameter to use. The keys are a /-separated string of three parts corresponding to the subtype, segment and time wildcards. You can use a * character for any part in order to match any wildcard value. For a given build this object is searched for matching wildcard combinations, and the highest specificity key is chosen. Given example wildcard values of {subtype=h5nx, segment=pb2, time=2y} then we have a search order of:
- h5nx/pb2/2y ─ all 3 wildcard values specified
- h5nx/pb2/*
- h5nx/*/2y ├ 2/3 wildcard values specified
- */pb2/2y
- h5nx/*/*
- */pb2/* ├ 1/3 wildcard values specified
- */*/2y
- */*/* ─ default / fall-back
and the first key present in the config is used.
The expected value type is dependent on the specific config parameter.

All properties whose name matches the following regular expression must respect the following conditions

Property name regular expression: ^[^_/]+/[^_/]+/[^_/]+$
Type: string, number, boolean or integer
Type: string, number, boolean or integer

A scalar value which will be used for all builds, i.e. it does not change with the wildcards. The type is dependent on the specific config parameter.

\ No newline at end of file diff --git a/public/schema_doc.css b/public/schema_doc.css new file mode 100644 index 0000000..e1f3a51 --- /dev/null +++ b/public/schema_doc.css @@ -0,0 +1,181 @@ +body { + font: 16px/1.5em "Overpass", "Open Sans", Helvetica, sans-serif; + color: #333; + font-weight: 300; + padding: 40px; +} + +.btn.btn-link { + font-size: 18px; + user-select: text; +} + +.jsfh-animated-property { + animation: eclair; + animation-iteration-count: 1; + animation-fill-mode: forwards; + animation-duration: .75s; + +} + +@keyframes eclair { + 0%,100% { + transform: scale(1); + } + 50% { + transform: scale(1.03); + } +} + +.btn.btn-primary { + margin: 10px; +} + +.btn.example-show.collapsed:before { + content: "show" +} + +.btn.example-show:before { + content: "hide" +} + +.description.collapse:not(.show) { + max-height: 100px !important; + overflow: hidden; + + display: -webkit-box; + -webkit-line-clamp: 2; + -webkit-box-orient: vertical; +} + +.description.collapsing { + min-height: 100px !important; +} + +.collapse-description-link.collapsed:after { + content: '+ Read More'; +} + +.collapse-description-link:not(.collapsed):after { + content: '- Read Less'; +} + +.badge { + font-size: 100%; + margin-bottom: 0.5rem; + margin-top: 0.5rem; +} + +.badge.value-type { + font-size: 120%; + margin-right: 5px; + margin-bottom: 10px; +} + + +.badge.default-value { + font-size: 120%; + margin-left: 5px; + margin-bottom: 10px; +} + +.badge.restriction { + display: inline-block; +} + +.badge.required-property,.badge.deprecated-property,.badge.pattern-property,.badge.no-additional { + font-size: 100%; + margin-left: 10px; +} + +.accordion div.card:only-child { + border-bottom: 1px solid rgba(0, 0, 0, 0.125); +} + +.examples { + padding: 1rem !important; +} + +.examples pre { + margin-bottom: 0; +} + +.highlight.jumbotron { + padding: 1rem !important; +} + +.generated-by-footer { + margin-top: 1em; + text-align: right; +} + +/* From 
https://github.com/richleland/pygments-css/blob/master/friendly.css, see https://github.com/trentm/python-markdown2/wiki/fenced-code-blocks */ +.highlight { background: #e9ecef; } /* Changed from #f0f0f0 in the original style to be the same as bootstrap's jumbotron */ +.highlight .hll { background-color: #ffffcc } +.highlight .c { color: #60a0b0; font-style: italic } /* Comment */ +.highlight .err { border: 1px solid #FF0000 } /* Error */ +.highlight .k { color: #007020; font-weight: bold } /* Keyword */ +.highlight .o { color: #666666 } /* Operator */ +.highlight .ch { color: #60a0b0; font-style: italic } /* Comment.Hashbang */ +.highlight .cm { color: #60a0b0; font-style: italic } /* Comment.Multiline */ +.highlight .cp { color: #007020 } /* Comment.Preproc */ +.highlight .cpf { color: #60a0b0; font-style: italic } /* Comment.PreprocFile */ +.highlight .c1 { color: #60a0b0; font-style: italic } /* Comment.Single */ +.highlight .cs { color: #60a0b0; background-color: #fff0f0 } /* Comment.Special */ +.highlight .gd { color: #A00000 } /* Generic.Deleted */ +.highlight .ge { font-style: italic } /* Generic.Emph */ +.highlight .gr { color: #FF0000 } /* Generic.Error */ +.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */ +.highlight .gi { color: #00A000 } /* Generic.Inserted */ +.highlight .go { color: #888888 } /* Generic.Output */ +.highlight .gp { color: #c65d09; font-weight: bold } /* Generic.Prompt */ +.highlight .gs { font-weight: bold } /* Generic.Strong */ +.highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */ +.highlight .gt { color: #0044DD } /* Generic.Traceback */ +.highlight .kc { color: #007020; font-weight: bold } /* Keyword.Constant */ +.highlight .kd { color: #007020; font-weight: bold } /* Keyword.Declaration */ +.highlight .kn { color: #007020; font-weight: bold } /* Keyword.Namespace */ +.highlight .kp { color: #007020 } /* Keyword.Pseudo */ +.highlight .kr { color: #007020; font-weight: bold } /* 
Keyword.Reserved */ +.highlight .kt { color: #902000 } /* Keyword.Type */ +.highlight .m { color: #40a070 } /* Literal.Number */ +.highlight .s { color: #4070a0 } /* Literal.String */ +.highlight .na { color: #4070a0 } /* Name.Attribute */ +.highlight .nb { color: #007020 } /* Name.Builtin */ +.highlight .nc { color: #0e84b5; font-weight: bold } /* Name.Class */ +.highlight .no { color: #60add5 } /* Name.Constant */ +.highlight .nd { color: #555555; font-weight: bold } /* Name.Decorator */ +.highlight .ni { color: #d55537; font-weight: bold } /* Name.Entity */ +.highlight .ne { color: #007020 } /* Name.Exception */ +.highlight .nf { color: #06287e } /* Name.Function */ +.highlight .nl { color: #002070; font-weight: bold } /* Name.Label */ +.highlight .nn { color: #0e84b5; font-weight: bold } /* Name.Namespace */ +.highlight .nt { color: #062873; font-weight: bold } /* Name.Tag */ +.highlight .nv { color: #bb60d5 } /* Name.Variable */ +.highlight .ow { color: #007020; font-weight: bold } /* Operator.Word */ +.highlight .w { color: #bbbbbb } /* Text.Whitespace */ +.highlight .mb { color: #40a070 } /* Literal.Number.Bin */ +.highlight .mf { color: #40a070 } /* Literal.Number.Float */ +.highlight .mh { color: #40a070 } /* Literal.Number.Hex */ +.highlight .mi { color: #40a070 } /* Literal.Number.Integer */ +.highlight .mo { color: #40a070 } /* Literal.Number.Oct */ +.highlight .sa { color: #4070a0 } /* Literal.String.Affix */ +.highlight .sb { color: #4070a0 } /* Literal.String.Backtick */ +.highlight .sc { color: #4070a0 } /* Literal.String.Char */ +.highlight .dl { color: #4070a0 } /* Literal.String.Delimiter */ +.highlight .sd { color: #4070a0; font-style: italic } /* Literal.String.Doc */ +.highlight .s2 { color: #4070a0 } /* Literal.String.Double */ +.highlight .se { color: #4070a0; font-weight: bold } /* Literal.String.Escape */ +.highlight .sh { color: #4070a0 } /* Literal.String.Heredoc */ +.highlight .si { color: #70a0d0; font-style: italic } /* 
Literal.String.Interpol */ +.highlight .sx { color: #c65d09 } /* Literal.String.Other */ +.highlight .sr { color: #235388 } /* Literal.String.Regex */ +.highlight .s1 { color: #4070a0 } /* Literal.String.Single */ +.highlight .ss { color: #517918 } /* Literal.String.Symbol */ +.highlight .bp { color: #007020 } /* Name.Builtin.Pseudo */ +.highlight .fm { color: #06287e } /* Name.Function.Magic */ +.highlight .vc { color: #bb60d5 } /* Name.Variable.Class */ +.highlight .vg { color: #bb60d5 } /* Name.Variable.Global */ +.highlight .vi { color: #bb60d5 } /* Name.Variable.Instance */ +.highlight .vm { color: #bb60d5 } /* Name.Variable.Magic */ +.highlight .il { color: #40a070 } /* Literal.Number.Integer.Long */ diff --git a/public/schema_doc.min.js b/public/schema_doc.min.js new file mode 100644 index 0000000..17eceaf --- /dev/null +++ b/public/schema_doc.min.js @@ -0,0 +1 @@ +$(document).on("click",'a[href^="#"]',function(event){event.preventDefault();history.pushState({},"",this.href)});function flashElement(elementId){myElement=document.getElementById(elementId);myElement.classList.add("jsfh-animated-property");setTimeout(function(){myElement.classList.remove("jsfh-animated-property")},1e3)}function setAnchor(anchorLinkDestination){history.pushState({},"",anchorLinkDestination)}function anchorOnLoad(){let linkTarget=decodeURIComponent(window.location.hash.split("?")[0].split("&")[0]);if(linkTarget[0]==="#"){linkTarget=linkTarget.substr(1)}if(linkTarget.length>0){anchorLink(linkTarget)}}function anchorLink(linkTarget){const target=$("#"+linkTarget);target.parents().addBack().filter(".collapse:not(.show), .tab-pane, [role='tab']").each(function(index){if($(this).hasClass("collapse")){$(this).collapse("show")}else if($(this).hasClass("tab-pane")){const tabToShow=$("a[href='#"+$(this).attr("id")+"']");if(tabToShow){tabToShow.tab("show")}}else if($(this).attr("role")==="tab"){$(this).tab("show")}});setTimeout(function(){let 
targetElement=document.getElementById(linkTarget);if(targetElement){targetElement.scrollIntoView({block:"center",behavior:"smooth"});setTimeout(function(){flashElement(linkTarget)},500)}},1e3)} \ No newline at end of file diff --git a/scripts/validate_utils.py b/scripts/validate_utils.py new file mode 100644 index 0000000..ae6cd50 --- /dev/null +++ b/scripts/validate_utils.py @@ -0,0 +1,29 @@ + +class ValidateError(Exception): + pass + +def validate(config): + # prototype based on + from importlib import metadata + # + assert str(metadata.version("jsonschema")).startswith('3.'), "jsonschema must be version 3" + import jsonschema + import jsonschema.exceptions + import yte + from os import path + + with open(path.join(path.dirname(path.realpath(__file__)), "../config/schema.yaml"), encoding='utf-8') as f: + schema = yte.process_yaml(f, require_use_yte=True) + + Validator = jsonschema.validators.validator_for(schema) + + try: + Validator.check_schema(schema) + except jsonschema.exceptions.SchemaError as err: + raise ValidateError(f"Internal error: config schema is not a valid JSON Schema ({Validator.META_SCHEMA['$schema']}). Error: {err}") + + # Here we're validating the merged schema. We could also validate the user config on its own by making (all?) properties optional? + from augur.validate import validate_json + validate_json(config, Validator(schema), "config") + + # There are more checks we can do by using code, as desired.