Merge pull request #18 from nextstrain/james/dedup-segments

Group segments by strains
nextstrain · Oct 11, 2024 · 06b3e6e · 06b3e6e
2 parents 357dbfe + 091dc7f
commit 06b3e6e
Show file tree

Hide file tree

Showing 29 changed files with 5,423 additions and 2,347 deletions.
diff --git a/.github/workflows/phylogenetic.yaml b/.github/workflows/phylogenetic.yaml
@@ -30,16 +30,10 @@ on:
           If set, builds will be deployed to s3://nextstrain-staging/oropouche_trials_<trial_name>_*
         required: false
         type: string
-      sequences_url:
+      ingest_url_prefix:
         description: |
-          URL for a sequences.fasta.zst file.
-          If not provided, will use default sequences_url from phylogenetic/defaults/config.yaml
-        required: false
-        type: string
-      metadata_url:
-        description: |
-          URL for a metadata.tsv.zst file.
-          If not provided, will use default metadata_url from phylogenetic/defaults/config.yaml
+          URL prefix of the ingested data; file suffixes such as 'metadata.tsv.zst' and
+          'S/sequences.fasta.zst' are appended to it.
         required: false
         type: string
 
@@ -51,8 +45,7 @@ jobs:
         name: Set config overrides
         env:
           TRIAL_NAME: ${{ inputs.trial_name }}
-          SEQUENCES_URL: ${{ inputs.sequences_url }}
-          METADATA_URL: ${{ inputs.metadata_url }}
+          INGEST_URL_PREFIX: ${{ inputs.ingest_url_prefix }}
         run: |
           config=""
 
@@ -61,11 +54,7 @@ jobs:
           fi
 
           # Only add the override when the `ingest_url_prefix` workflow input was
           # provided; INGEST_URL_PREFIX is the variable exported in `env` above.
           if [[ "$INGEST_URL_PREFIX" ]]; then
-            config+=" sequences_url='"$SEQUENCES_URL"'"
-          fi
-
-          if [[ "$METADATA_URL" ]]; then
-            config+=" metadata_url='"$METADATA_URL"'"
+            config+=" ingest_url_prefix='"$INGEST_URL_PREFIX"'"
           fi
 
           if [[ $config ]]; then

diff --git a/ingest/README.md b/ingest/README.md
@@ -22,8 +22,12 @@ nextstrain build .
 
 This produces the default outputs of the ingest workflow:
 
-- metadata      = results/metadata.tsv
-- sequences     = results/sequences.fasta
+```
+results/metadata.tsv
+results/S/sequences.fasta
+results/M/sequences.fasta
+results/L/sequences.fasta
+```
 
 ### Dumping the full raw metadata from NCBI Datasets
 

diff --git a/ingest/Snakefile b/ingest/Snakefile
@@ -23,8 +23,7 @@ wildcard_constraints:
 rule all:
+    # Default targets: one sequences FASTA per segment plus a single metadata TSV.
     input:
         sequences=expand("results/{segment}/sequences.fasta", segment=segments),
-        metadata=expand("results/{segment}/metadata.tsv", segment=segments),
-        metadata_all="results/all/metadata.tsv",
+        metadata="results/metadata.tsv",
 
 
 # Note that only PATHOGEN-level customizations should be added to these
@@ -35,6 +34,7 @@ rule all:
 include: "rules/fetch_from_ncbi.smk"
 include: "rules/curate.smk"
 include: "rules/nextclade.smk"
+include: "rules/group_segments.smk"
 
 
 # We are pushing to standardize ingest workflows with Nextclade runs to include

diff --git a/ingest/build-configs/nextstrain-automation/config.yaml b/ingest/build-configs/nextstrain-automation/config.yaml
@@ -14,14 +14,12 @@ cloudfront_domain: "data.nextstrain.org"
 # Replace <pathogen> with the pathogen repo name.
 s3_dst: "s3://nextstrain-data/files/workflows/oropouche"
 
-# Mapping of files to upload
+# Mapping of files to upload.
+# Keys here are the target S3 key (after combining with "s3_dst")
+# Values are the local files produced by the workflow
 files_to_upload:
   ncbi.ndjson.zst: data/ncbi.ndjson
-  all/metadata.tsv.zst: results/all/metadata.tsv
-  all/sequences.fasta.zst: results/all/sequences.fasta
-  L/metadata.tsv.zst: results/L/metadata.tsv
+  metadata.tsv.zst: results/metadata.tsv
   L/sequences.fasta.zst: results/L/sequences.fasta
-  M/metadata.tsv.zst: results/M/metadata.tsv
   M/sequences.fasta.zst: results/M/sequences.fasta
-  S/metadata.tsv.zst: results/S/metadata.tsv
   S/sequences.fasta.zst: results/S/sequences.fasta
diff --git a/ingest/defaults/annotations.tsv b/ingest/defaults/annotations.tsv
@@ -3,7 +3,12 @@
 # id to match existing metadata, field name, and field value
 # If there are multiple annotations for the same id and field, then the last value is used
 # Lines starting with '#' are treated as comments
-# Any '#' after the field value are treated as comments.    
+# Any '#' after the field value are treated as comments.
+
+# NOTE: Here we use accession as the ID; however, using strain name would be better going forward as it would reduce
+# the duplication needed in the current format. We can't (currently) do this in oropouche because strain names are
+# added _after_ the curate chain runs.
+
 PP952119	region	North America # strain IRCCS-SCDC_1/2024 from traveler, L segment
 PP952119	country	Cuba # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11212459/
 PP952119	date	2024-06-11
@@ -13,3 +18,20 @@ PP952118	date	2024-06-11
 PP952117	region	North America # strain IRCCS-SCDC_1/2024 from traveler, S segment
 PP952117	country	Cuba # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11212459/
 PP952117	date	2024-06-11
+
+# Strain 'H498913', 'date' had 2 observed values: HQ830423, HQ830388: 1988-XX-XX; HQ830457: 1990-XX-XX
+HQ830457	date	1988-XX-XX
+
+# When grouped by strain these segments have similar (but different) authors - we change them to the most complete author list
+PP477303	authors	Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
+PP477315	authors	Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
+PP477304	authors	Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
+PP477316	authors	Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
+PP477305	authors	Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
+PP477317	authors	Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
+PP477306	authors	Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
+PP477318	authors	Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
+PP477307	authors	Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
+PP477319	authors	Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
+PP477308	authors	Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
+PP477320	authors	Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
diff --git a/ingest/defaults/config.yaml b/ingest/defaults/config.yaml
@@ -101,10 +101,10 @@ curate:
   # The field in the NDJSON record that contains the actual genomic sequence
   output_sequence_field: "sequence"
   # The list of metadata columns to keep in the final output of the curation pipeline.
+  # (We do not export 'strain' here as it is added separately via an Entrez call)
   metadata_columns: [
     "accession",
     "accession_version",
-    "strain",
     "date",
     "region",
     "country",
@@ -123,3 +123,17 @@ curate:
 nextclade:
   segment_reference: "../shared/oropouche_{segment}.fasta"
   min_seed_cover: 0.01
+
+
+# Settings read by the `group_segments` rule (rules/group_segments.smk).
+grouping:
+  # Metadata columns passed to scripts/group_segments.py as `--common-strain-fields`.
+  # NOTE(review): presumably these are the fields expected to agree across all
+  # segments of one strain - confirm against the script.
+  common_strain_fields:
+    - date
+    - region
+    - country
+    - division
+    - location
+    - host
+    - authors
+    - abbr_authors
+    - institution
+  # Path to the YAML file passed to scripts/group_segments.py as `--resolutions`.
+  resolutions: defaults/segment_resolutions.yaml
diff --git a/ingest/defaults/segment_resolutions.yaml b/ingest/defaults/segment_resolutions.yaml
@@ -0,0 +1,45 @@
+# Per-strain segment resolutions, handed to scripts/group_segments.py via
+# `--resolutions` (path configured as `grouping.resolutions`).
+# Each entry names a strain, one of its segments, and a single accession.
+# NOTE(review): presumably the listed accession is the one kept when a strain has
+# several candidate sequences for that segment - confirm against the script.
+- strain: TRVL9760
+  accession: KP026181 # matches the metadata for the other segments for this strain
+  segment: S
+- strain: BeAn19991
+  accession: KP052851
+  segment: M
+- strain: BeAn19991
+  accession: KP052852
+  segment: S
+- strain: BeAn_626990
+  accession: MG747521
+  segment: S
+- strain: BeH_543629
+  accession: MG747572
+  segment: S
+- strain: BeH_543857
+  accession: MG747578
+  segment: S
+- strain: BeAn_206119
+  accession: MG747539
+  segment: S
+- strain: BeH_543760
+  accession: MG747575
+  segment: S
+- strain: BeAn_208402
+  accession: MG747542
+  segment: S
+- strain: BeAn_208819
+  accession: MG747545
+  segment: S
+- strain: PPS_523_H_669315
+  accession: MG747584
+  segment: S
+- strain: PPS_522_H_669314
+  accession: MG747581
+  segment: S
+- strain: PMOH_682426
+  accession: MG747587
+  segment: S
+- strain: PMOH_682431
+  accession: MG747590
+  segment: S
+- strain: BeH505764 # Note: strain is dropped by phylo exclude rule
+  accession: PP357050
+  segment: S
diff --git a/ingest/rules/curate.smk b/ingest/rules/curate.smk
@@ -61,8 +61,8 @@ rule curate:
         all_geolocation_rules="data/all-geolocation-rules.tsv",
         annotations=config["curate"]["annotations"],
     output:
-        metadata="data/all_metadata.tsv",
-        sequences="results/all/sequences.fasta",
+        metadata="data/metadata_curated.tsv",
+        sequences="data/sequences.fasta",
     log:
         "logs/curate.txt",
     benchmark:
@@ -116,28 +116,15 @@ rule curate:
                 --output-seq-field {params.sequence_field} ) 2>> {log}
         """
 
-
-rule replace_strain_names:
-    input:
-        metadata="data/all_metadata.tsv",
-        strains = "data/strain-names.tsv"
-    output:
-        metadata="data/all_metadata_with_strains.tsv",
-    shell:
-        """
-        tsv-select -H --exclude strain {input.metadata} | \
-        tsv-join -H --filter-file {input.strains} --key-fields accession --append-fields strain > {output.metadata}
-        """
-
-rule subset_metadata:
+rule subset_curated_metadata_columns:
+    """
+    Keep only the columns listed in config["curate"]["metadata_columns"] from the
+    curated metadata, writing the result to data/metadata_subset.tsv.
+    """
     input:
-        metadata="data/all_metadata_with_strains.tsv",
+        metadata="data/metadata_curated.tsv",
     output:
-        metadata="results/all/metadata.tsv",
+        metadata="data/metadata_subset.tsv",
     params:
         metadata_fields=",".join(config["curate"]["metadata_columns"]),
     shell:
-        """
+        r"""
         tsv-select -H -f {params.metadata_fields} \
             {input.metadata} > {output.metadata}
         """
diff --git a/ingest/rules/fetch_from_ncbi.smk b/ingest/rules/fetch_from_ncbi.smk
@@ -138,7 +138,7 @@ rule entrez_via_accessions:
     accessions
     """
     input:
-        metadata="data/all_metadata.tsv",
+        metadata="data/metadata_curated.tsv",
     output:
         genbank="data/genbank.gb",
     benchmark:

diff --git a/ingest/rules/group_segments.smk b/ingest/rules/group_segments.smk
@@ -0,0 +1,43 @@
+
+rule group_segments:
+    """
+    Run scripts/group_segments.py on the merged metadata to produce the final
+    results/metadata.tsv.
+
+    NOTE(review): presumably the script collapses per-accession rows into one row
+    per strain and uses the resolutions file to choose between competing
+    accessions - confirm against the script.
+    """
+    input:
+        metadata="data/metadata_merged.tsv",
+        resolutions=config["grouping"]["resolutions"],
+    output:
+        metadata="results/metadata.tsv"
+    params:
+        common_strain_fields = config["grouping"]["common_strain_fields"],
+        segments = segments,
+    shell:
+        r"""
+        python3 scripts/group_segments.py \
+            --metadata {input.metadata} \
+            --common-strain-fields {params.common_strain_fields} \
+            --segments {params.segments} \
+            --resolutions {input.resolutions} \
+            --output-metadata {output.metadata}
+        """
+
+rule subset_sequences_by_segment:
+    """
+    Write one FASTA per segment whose records are renamed from accession to strain.
+
+    1. Build a headerless two-column TSV (accession_{segment}, strain) from the
+       metadata rows that have a non-empty accession for this segment.
+    2. Rename every sequence through that key/value file; names with no entry in
+       the file are replaced by the `drop_key` placeholder.
+    3. Discard the records that carry the placeholder.
+
+    `{{kv}}` is brace-escaped so that seqkit receives a literal `{kv}`.
+    """
+    input:
+        metadata = "results/metadata.tsv",
+        sequences = "data/sequences.fasta",
+    output:
+        kv_map = temp("data/kv-map_{segment}.tsv"),
+        sequences = "results/{segment}/sequences.fasta",
+    params:
+        columns = lambda w: f"accession_{w.segment},strain",
+        filter_exp = lambda w: f"len($accession_{w.segment})>0",
+        drop_key = "__DROP__",
+    shell:
+        r"""
+        cat {input.metadata} \
+            | csvtk cut -t -f {params.columns} \
+            | csvtk filter2 -t -U -f {params.filter_exp:q} \
+            > {output.kv_map} && \
+        seqkit replace \
+            -p "(.*)" --replacement "{{kv}}" --kv-file {output.kv_map} -m {params.drop_key} \
+            {input.sequences} \
+            | seqkit grep -v -r -p '^{params.drop_key}$' \
+            > {output.sequences}
+        """
diff --git a/ingest/rules/nextclade.smk b/ingest/rules/nextclade.smk
@@ -20,35 +20,57 @@ https://docs.nextstrain.org/projects/nextclade/page/user/nextclade-cli.html
 
 rule run_nextclade_to_identify_segment:
+    """
+    Run Nextclade on the curated sequences against one segment's reference and
+    keep only the resulting per-segment TSV (marked as a temp file).
+    """
     input:
-        sequences = "results/all/sequences.fasta",
+        sequences = "data/sequences.fasta",
         segment_reference = config["nextclade"]["segment_reference"],
     output:
-        sequences = "results/{segment}/sequences.fasta",
+        nextclade = temp("data/nextclade_{segment}.tsv"),
     params:
         min_seed_cover = config["nextclade"]["min_seed_cover"],
     shell:
-        """
+        r"""
         nextclade run \
             --input-ref {input.segment_reference} \
-            --output-fasta {output.sequences} \
+            --output-tsv {output.nextclade} \
             --min-seed-cover {params.min_seed_cover} \
             --silent \
             {input.sequences}
         """
 
-rule subset_metadata_by_segment:
+rule parse_nextclade_tsv:
+    """
+    Reduce one segment's Nextclade TSV to a small summary table.
+
+    The `seqName` and `qc.overallStatus` columns are kept and renamed to
+    `accession` and `qc_{segment}`; a `segment_{segment}` column is inserted at
+    position 2 holding "1" when the QC status is non-empty and "0" otherwise.
+    Finally a one-line count of aligned vs. total sequences is echoed.
+    """
     input:
-        metadata = "results/all/metadata.tsv",
-        sequences = "results/{segment}/sequences.fasta",
+        nextclade = "data/nextclade_{segment}.tsv",
     output:
-        metadata = "results/{segment}/metadata.tsv",
+        summary = "data/nextclade_{segment}_summary.tsv",
     params:
-        strain_id_field = config["curate"]["output_id_field"],
+        nextclade_cols = 'seqName,qc.overallStatus',
+        new_cols = lambda w: f'accession,qc_{w.segment}',
+        mutate_exp = lambda w: f'len($qc_{w.segment})>0 ? "1" : "0"',
+        segment_col = lambda w: f'segment_{w.segment}',
     shell:
+        r"""
+        csvtk cut -t -H -f {params.nextclade_cols:q} {input.nextclade:q} \
+            | csvtk rename -t -f {params.nextclade_cols:q} -n {params.new_cols:q} \
+            | csvtk mutate2 -t -n {params.segment_col:q} --at 2 -e {params.mutate_exp:q} \
+            > {output.summary:q}
+
+        echo "Nextclade aligned $(( $(cat {output.summary} | csvtk grep -t -f {params.segment_col} -p '1' -U | wc -l) ))/$(( $(wc -l < {input.nextclade}) -1 )) sequences to segment {wildcards.segment}"
         """
-        augur filter \
-            --sequences {input.sequences} \
-            --metadata {input.metadata} \
-            --metadata-id-columns {params.strain_id_field} \
+
+
+rule merge_metadata:
+    """
+    Combine the strain names, the column-subset curated metadata and every
+    per-segment Nextclade summary into one table with `augur merge`, using
+    `accession` as the ID column.
+    """
+    input:
+        strain="data/strain-names.tsv",
+        main="data/metadata_subset.tsv",
+        segments=expand("data/nextclade_{segment}_summary.tsv", segment=segments),
+    output:
+        metadata="data/metadata_merged.tsv",
+    params:
+        # augur merge requires NAME=FILEPATH arguments, so we transform the inputs here
+        # into "s_0=<path> s_1=<path> ..." (one entry per segment summary file):
+        segments = lambda w,input: " ".join([f"s_{idx}={s}" for idx,s in enumerate(input.segments)])
+    shell:
+        r"""
+        augur merge \
+            --metadata strains={input.strain} main={input.main} {params.segments} \
+            --metadata-id-columns accession \
             --output-metadata {output.metadata}
         """