nextstrain · genehack · Jun 11, 2024 · May 20, 2024 · May 21, 2024 · May 21, 2024
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
@@ -0,0 +1,12 @@
+name: CI
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  ci:
+    uses: nextstrain/.github/.github/workflows/pathogen-repo-ci.yaml@master
diff --git a/README.md b/README.md
@@ -8,7 +8,10 @@ Built starting from the [Nextstrain pathogen repository template][].
 
 ## Working on this repo
 
-This repo is configured to use [pre-commit](https://pre-commit.com).
+This repo is configured to use [pre-commit](https://pre-commit.com)
+to help automatically catch common coding errors
+and syntax issues in changes
+before they are committed to the repo.
 If you will be writing new code or otherwise working within this repo,
 please do the following to get started:
 
@@ -17,4 +20,9 @@ please do the following to get started:
    preferred package management solution
 2. install the local git hooks by running `pre-commit install` from
    the root of the repo
-3. get to coding!
+3. when problems are detected, correct them in your local working tree
+   before committing your changes.
+
+Note that these pre-commit checks are also run in a GitHub Action when
+changes are pushed to GitHub, so correcting issues locally will
+prevent extra cycles of correction.
diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -1,14 +1,11 @@
-VIRUSES = ["229e", "nl63", "oc43", "hku1"]
+# Use default configuration values. Override with Snakemake's --configfile/--config options.
+configfile: "config/defaults.yaml"
 
 
 rule all:
     input:
-        expand("results/{virus}/sequences.fasta", virus=VIRUSES),
-        expand("results/{virus}/metadata.tsv", virus=VIRUSES),
-
-
-# Use default configuration values. Override with Snakemake's --configfile/--config options.
-configfile: "config/defaults.yaml"
+        expand("results/{virus}/sequences.fasta", virus=config["viruses"]),
+        expand("results/{virus}/metadata.tsv", virus=config["viruses"]),
 
 
 include: "rules/fetch_from_ncbi.smk"

diff --git a/ingest/build-configs/ci/config.yaml b/ingest/build-configs/ci/config.yaml
@@ -0,0 +1,2 @@
+viruses:
+  - "hku1"
diff --git a/ingest/config/defaults.yaml b/ingest/config/defaults.yaml
@@ -1,21 +1,24 @@
+# virus species in this dataset
+# NOTE: when adding a new species, you must also add a top-level key
+# named after the species that holds its virus-specific settings.
+viruses:
+  - "229e"
+  - "nl63"
+  - "oc43"
+  - "hku1"
+
 # virus-specific information
 # for each virus, give the NCBI taxon ID (required for data fetch from
 # NCBI datasets) and the path to the manual annotations file, relative
 # to the ingest directory
-## TODO see if the `annotations` key can be refactored out into the
-## Snakemake file with a `{virus}` wildcard
 229e:
   ncbi_taxon_id: "11137"
-  annotations: "config/229e/annotations.tsv"
 nl63:
   ncbi_taxon_id: "277944"
-  annotations: "config/nl63/annotations.tsv"
 oc43:
   ncbi_taxon_id: "31631"
-  annotations: "config/oc43/annotations.tsv"
 hku1:
   ncbi_taxon_id: "290028"
-  annotations: "config/hku1/annotations.tsv"
 
 # Optional fields to add to the NCBI Datasets output
 ncbi_dataset_fields: []
@@ -26,9 +29,11 @@ curate:
   # For the Nextstrain team, this is currently
   # 'https://raw.githubusercontent.com/nextstrain/ncov-ingest/@/source-data/gisaid_geoLocationRules.tsv'
   geolocation_rules_url: "https://raw.githubusercontent.com/nextstrain/ncov-ingest/@/source-data/gisaid_geoLocationRules.tsv"
+
   # The path to the local geolocation rules within the pathogen repo
   # The path should be relative to the ingest directory.
   local_geolocation_rules: "config/geolocation_rules.tsv"
+
   # List of field names to change in the format of <old_field_name>=<new_field_name>
   # This is the first step in the pipeline, so any references to field names
   # in the configs below should use the new field names
@@ -44,8 +49,10 @@ curate:
     Submitter Affiliation: institution
     SRA Accessions: sra_accession
     passage_type: passage_type
+
   # List of date fields to standardize to ISO format YYYY-MM-DD
   date_fields: ["date"]
+
   # List of expected date formats that are present in the date fields provided above
   # These date formats should use directives expected by datetime
   # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes
@@ -54,6 +61,7 @@ curate:
     - "%Y-%m"
     - "%Y-%m-%d"
     - "%Y-%m-%dT%H:%M:%SZ"
+
   titlecase:
     # Abbreviations not cast to titlecase, keeps uppercase
     abbreviations: ["USA"]
@@ -78,20 +86,28 @@ curate:
       - sur
       - the
       - y
+
     # List of string fields to titlecase
     fields: ["region", "country", "division", "location"]
+
   # Metadata field that contains the list of authors associated with the sequence
   authors_field: "authors"
+
   # Default value to use if the authors field is empty
   authors_default_value: "?"
+
   # Name to use for the generated abbreviated authors field
   abbr_authors_field: "abbr_authors"
+
   # The ID field in the metadata to use to merge the manual annotations
   annotations_id: "strain"
+
   # The ID field in the metadata to use as the sequence id in the output FASTA file
   output_id_field: "strain"
+
   # The field in the NDJSON record that contains the actual genomic sequence
   output_sequence_field: "sequence"
+
   # The list of metadata columns to keep in the final output of the curation pipeline.
   metadata_columns:
     - strain

diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
@@ -65,7 +65,7 @@ rule curate:
         sequences_ndjson="data/{virus}/ncbi.ndjson",
         # Change the geolocation_rules input path if you are removing the above two rules
         all_geolocation_rules="data/all-geolocation-rules.tsv",
-        annotations=lambda wildcards: config[wildcards.virus]["annotations"],
+        annotations="config/{virus}/annotations.tsv",
     output:
         metadata="results/{virus}/all_metadata.tsv",
         sequences="results/{virus}/sequences.fasta",

diff --git a/phylogenetic/Snakefile b/phylogenetic/Snakefile
@@ -1,13 +1,10 @@
-VIRUSES = ["229e", "nl63", "oc43", "hku1"]
+# Use default configuration values. Override with Snakemake's --configfile/--config options.
+configfile: "config/defaults.yaml"
 
 
 rule all:
     input:
-        expand("auspice/{virus}.json", virus=VIRUSES),
-
-
-# Use default configuration values. Override with Snakemake's --configfile/--config options.
-configfile: "config/defaults.yaml"
+        expand("auspice/{virus}.json", virus=config["viruses"]),
 
 
 include: "rules/prepare_sequences.smk"

diff --git a/phylogenetic/build-configs/ci/config.yaml b/phylogenetic/build-configs/ci/config.yaml
@@ -0,0 +1,2 @@
+viruses:
+  - "hku1"
diff --git a/phylogenetic/config/defaults.yaml b/phylogenetic/config/defaults.yaml
@@ -1,3 +1,12 @@
+# virus species in this dataset
+# NOTE: when adding a new species, you must also add a top-level key
+# named after the species that holds its virus-specific parameters.
+viruses:
+  - "229e"
+  - "nl63"
+  - "oc43"
+  - "hku1"
+
 # virus-specific information
 # For each virus, provide the following parameter values. All paths
 # should be relative to the phylogenetic directory.
@@ -39,7 +48,6 @@
   annotate_phylogeny:
     inference: "joint"
     columns: "country"
-
 nl63:
   reference: "config/nl63/reference.fasta"
   genemap: "config/nl63/genemap.gff"
@@ -58,7 +66,6 @@ nl63:
   annotate_phylogeny:
     inference: "joint"
     columns: "country"
-
 oc43:
   reference: "config/oc43/reference.fasta"
   genemap: "config/oc43/genemap.gff"
@@ -77,7 +84,6 @@ oc43:
   annotate_phylogeny:
     inference: "joint"
     columns: "country"
-
 hku1:
   reference: "config/hku1/reference.fasta"
   genemap: "config/hku1/genemap.gff"