diff --git a/ingest/Snakefile b/ingest/Snakefile index 7589a39..9902536 100644 --- a/ingest/Snakefile +++ b/ingest/Snakefile @@ -45,7 +45,7 @@ rule all: include: "workflow/snakemake_rules/fetch_sequences.smk" -include: "workflow/snakemake_rules/transform.smk" +include: "workflow/snakemake_rules/curate.smk" if "s3_dst" in config: include: "workflow/snakemake_rules/upload.smk" diff --git a/ingest/bin/ndjson-to-tsv-and-fasta b/ingest/bin/ndjson-to-tsv-and-fasta deleted file mode 100755 index 017bcc0..0000000 --- a/ingest/bin/ndjson-to-tsv-and-fasta +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python3 -""" -Parses NDJSON records from stdin to two different files: a metadata TSV and a -sequences FASTA. - -Records that do not have an ID or sequence will be excluded from the output files. -""" -import argparse -import csv -import json -from sys import stderr, stdin - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--metadata", metavar="TSV", default="data/metadata.tsv", - help="The output metadata TSV file") - parser.add_argument("--fasta", metavar="FASTA", default="data/sequences.fasta", - help="The output sequences FASTA file") - parser.add_argument("--metadata-columns", nargs="+", - help="List of fields from the NDJSON records to include as columns in the metadata TSV. " + - "Metadata TSV columns will be in the order of the columns provided.") - parser.add_argument("--id-field", default='strain', - help="Field from the records to use as the sequence ID in the FASTA file.") - parser.add_argument("--sequence-field", default='sequence', - help="Field from the record that holds the genomic sequence for the FASTA file.") - - args = parser.parse_args() - - with open(args.metadata, 'wt') as metadata_output: - with open(args.fasta, 'wt') as fasta_output: - metadata_csv = csv.DictWriter( - metadata_output, - args.metadata_columns, - restval="", - extrasaction='ignore', - delimiter='\t' - ) - metadata_csv.writeheader() - - for index, record in enumerate(stdin): - record = json.loads(record) - - sequence_id = str(record.get(args.id_field, '')) - sequence = str(record.get(args.sequence_field, '')) - - if not sequence_id: - print( - f"WARNING: Record number {index} does not have a sequence ID.", - "This record will be excluded from the output files.", - file=stderr - ) - elif not sequence: - print( - f"WARNING: Record number {index} does not have a sequence.", - "This record will be excluded from the output files.", - file=stderr - ) - else: - metadata_csv.writerow(record) - - print(f">{sequence_id}", file=fasta_output) - print(f"{sequence}" , file= fasta_output) diff --git a/ingest/config/config.yaml b/ingest/config/config.yaml index f7f820e..de18a1b 100644 --- a/ingest/config/config.yaml +++ b/ingest/config/config.yaml @@ -7,8 +7,8 @@ ncbi_taxon_id: b: "208895" general: "11250" -# Params for the transform rulegeneral -transform: +# Params for the curate rule +curate: # Fields to rename. # This is the first step in the pipeline, so any references to field names # in the configs below should use the new field names @@ -24,6 +24,8 @@ transform: # These date formats should use directives expected by datetime # See https://docs.python.org/3.9/library/datetime.html#strftime-and-strptime-format-codes expected_date_formats: ['%Y', '%Y-%m', '%Y-%m-%d', '%Y-%m-%dT%H:%M:%SZ'] + # The expected field that contains the GenBank geo_loc_name + genbank_location_field: location # Titlecase rules titlecase: # Abbreviations not cast to titlecase, keeps uppercase @@ -68,7 +70,6 @@ transform: 'date_submitted', 'sra_accession', 'abbr_authors', - 'reverse', 'authors', - 'institution' + 'institution', ] diff --git a/ingest/source-data/ncbi-dataset-field-map.tsv b/ingest/source-data/ncbi-dataset-field-map.tsv index eb79418..182b977 100644 --- a/ingest/source-data/ncbi-dataset-field-map.tsv +++ b/ingest/source-data/ncbi-dataset-field-map.tsv @@ -14,4 +14,4 @@ BioProjects bioproject_accession BioSample accession biosample_accession SRA Accessions sra_accession Submitter Names authors -Submitter Affiliation submitting_organization +Submitter Affiliation institution diff --git a/ingest/vendored/.cramrc b/ingest/vendored/.cramrc deleted file mode 100644 index 153d20f..0000000 --- a/ingest/vendored/.cramrc +++ /dev/null @@ -1,3 +0,0 @@ -[cram] -shell = /bin/bash -indent = 2 diff --git a/ingest/vendored/.github/dependabot.yml b/ingest/vendored/.github/dependabot.yml new file mode 100644 index 0000000..89bd084 --- /dev/null +++ b/ingest/vendored/.github/dependabot.yml @@ -0,0 +1,17 @@ +# Dependabot configuration file +# +# +# Each ecosystem is checked on a scheduled interval defined below. To trigger +# a check manually, go to +# +# https://github.com/nextstrain/ingest/network/updates +# +# and look for a "Check for updates" button. You may need to click around a +# bit first. +--- +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + schedule: + interval: "weekly" diff --git a/ingest/vendored/.github/workflows/ci.yaml b/ingest/vendored/.github/workflows/ci.yaml index c6a218a..c716277 100644 --- a/ingest/vendored/.github/workflows/ci.yaml +++ b/ingest/vendored/.github/workflows/ci.yaml @@ -11,13 +11,5 @@ jobs: shellcheck: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - uses: nextstrain/.github/actions/shellcheck@master - - cram: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - uses: actions/setup-python@v4 - - run: pip install cram - - run: cram tests/ \ No newline at end of file diff --git a/ingest/vendored/.github/workflows/pre-commit.yaml b/ingest/vendored/.github/workflows/pre-commit.yaml new file mode 100644 index 0000000..70da533 --- /dev/null +++ b/ingest/vendored/.github/workflows/pre-commit.yaml @@ -0,0 +1,14 @@ +name: pre-commit + +on: + - push + +jobs: + pre-commit: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-python@v5 + with: + python-version: "3.12" + - uses: pre-commit/action@v3.0.1 diff --git a/ingest/vendored/.gitrepo b/ingest/vendored/.gitrepo index d0ee1d3..d1afbca 100644 --- a/ingest/vendored/.gitrepo +++ b/ingest/vendored/.gitrepo @@ -6,7 +6,7 @@ [subrepo] remote = https://github.com/nextstrain/ingest branch = main - commit = 7617c39fae05e5882c5e6c065c5b47d500c998af - parent = 7a69d8383d5729f3d5aa873a194adb223aa8045e + commit = 258ab8ce898a88089bc88caee336f8d683a0e79a + parent = 5690058759d693517833af6d9e5c854190164ee8 method = merge cmdver = 0.4.6 diff --git a/ingest/vendored/.pre-commit-config.yaml b/ingest/vendored/.pre-commit-config.yaml new file mode 100644 index 0000000..2cdf88b --- /dev/null +++ b/ingest/vendored/.pre-commit-config.yaml @@ -0,0 +1,40 @@ +default_language_version: + python: python3 +repos: + - repo: https://github.com/pre-commit/sync-pre-commit-deps + rev: v0.0.1 + hooks: + - id: sync-pre-commit-deps + - repo: https://github.com/shellcheck-py/shellcheck-py + rev: v0.10.0.1 + hooks: + - id: shellcheck + - repo: https://github.com/rhysd/actionlint + rev: v1.6.27 + hooks: + - id: actionlint + entry: env SHELLCHECK_OPTS='--exclude=SC2027' actionlint + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.6.0 + hooks: + - id: trailing-whitespace + - id: check-ast + - id: check-case-conflict + - id: check-docstring-first + - id: check-json + - id: check-executables-have-shebangs + - id: check-merge-conflict + - id: check-shebang-scripts-are-executable + - id: check-symlinks + - id: check-toml + - id: check-yaml + - id: destroyed-symlinks + - id: detect-private-key + - id: end-of-file-fixer + - id: fix-byte-order-marker + - repo: https://github.com/astral-sh/ruff-pre-commit + # Ruff version. + rev: v0.4.6 + hooks: + # Run the linter. + - id: ruff diff --git a/ingest/vendored/README.md b/ingest/vendored/README.md index fa91891..a2b54cb 100644 --- a/ingest/vendored/README.md +++ b/ingest/vendored/README.md @@ -2,7 +2,7 @@ Shared internal tooling for pathogen data ingest. Used by our individual pathogen repos which produce Nextstrain builds. Expected to be vendored by -each pathogen repo using `git subtree`. +each pathogen repo using `git subrepo`. Some tools may only live here temporarily before finding a permanent home in `augur curate` or Nextstrain CLI. Others may happily live out their days here. @@ -12,6 +12,9 @@ Some tools may only live here temporarily before finding a permanent home in Nextstrain maintained pathogen repos will use [`git subrepo`](https://github.com/ingydotnet/git-subrepo) to vendor ingest scripts. (See discussion on this decision in https://github.com/nextstrain/ingest/issues/3) +For a list of Nextstrain repos that are currently using this method, use [this +GitHub code search](https://github.com/search?type=code&q=org%3Anextstrain+subrepo+%22remote+%3D+https%3A%2F%2Fgithub.com%2Fnextstrain%2Fingest%22). + If you don't already have `git subrepo` installed, follow the [git subrepo installation instructions](https://github.com/ingydotnet/git-subrepo#installation). Then add the latest ingest scripts to the pathogen repo by running: @@ -54,14 +57,14 @@ commit hash if needed. Much of this tooling originated in [ncov-ingest](https://github.com/nextstrain/ncov-ingest) and was passaged thru -[monkeypox's ingest/](https://github.com/nextstrain/monkeypox/tree/@/ingest/). -It subsequently proliferated from [monkeypox][] to other pathogen repos -([rsv][], [zika][], [dengue][], [hepatitisB][], [forecasts-ncov][]) primarily -thru copying. To [counter that +[mpox's ingest/](https://github.com/nextstrain/mpox/tree/@/ingest/). It +subsequently proliferated from [mpox][] to other pathogen repos ([rsv][], +[zika][], [dengue][], [hepatitisB][], [forecasts-ncov][]) primarily thru +copying. To [counter that proliferation](https://bedfordlab.slack.com/archives/C7SDVPBLZ/p1688577879947079), this repo was made. -[monkeypox]: https://github.com/nextstrain/monkeypox +[mpox]: https://github.com/nextstrain/mpox [rsv]: https://github.com/nextstrain/rsv [zika]: https://github.com/nextstrain/zika/pull/24 [dengue]: https://github.com/nextstrain/dengue/pull/10 @@ -114,15 +117,6 @@ Potential Nextstrain CLI scripts - [download-from-s3](download-from-s3) - Download file from AWS S3 bucket with decompression based on file extension in S3 URL. Skips download if the local file already exists and has a hash identical to the S3 object's metadata `sha256sum`. -Potential augur curate scripts - -- [apply-geolocation-rules](apply-geolocation-rules) - Applies user curated geolocation rules to NDJSON records -- [merge-user-metadata](merge-user-metadata) - Merges user annotations with NDJSON records -- [transform-authors](transform-authors) - Abbreviates full author lists to ' et al.' -- [transform-field-names](transform-field-names) - Rename fields of NDJSON records -- [transform-genbank-location](transform-genbank-location) - Parses `location` field with the expected pattern `"[:][, ]"` based on [GenBank's country field](https://www.ncbi.nlm.nih.gov/genbank/collab/country/) -- [transform-strain-names](transform-strain-names) - Ordered search for strain names across several fields. - ## Software requirements Some scripts may require Bash ≥4. If you are running these scripts on macOS, the builtin Bash (`/bin/bash`) does not meet this requirement. You can install [Homebrew's Bash](https://formulae.brew.sh/formula/bash) which is more up to date. @@ -131,7 +125,24 @@ Some scripts may require Bash ≥4. If you are running these scripts on macOS, t Most scripts are untested within this repo, relying on "testing in production". That is the only practical testing option for some scripts such as the ones interacting with S3 and Slack. -For more locally testable scripts, Cram-style functional tests live in `tests` and are run as part of CI. To run these locally, +## Working on this repo + +This repo is configured to use [pre-commit](https://pre-commit.com), +to help automatically catch common coding errors and syntax issues +with changes before they are committed to the repo. + +If you will be writing new code or otherwise working within this repo, +please do the following to get started: + +1. [install `pre-commit`](https://pre-commit.com/#install) by running + either `python -m pip install pre-commit` or `brew install + pre-commit`, depending on your preferred package management + solution +2. install the local git hooks by running `pre-commit install` from + the root of the repo +3. when problems are detected, correct them in your local working tree + before committing them. -1. Download Cram: `pip install cram` -2. Run the tests: `cram tests/` +Note that these pre-commit checks are also run in a GitHub Action when +changes are pushed to GitHub, so correcting issues locally will +prevent extra cycles of correction. diff --git a/ingest/vendored/apply-geolocation-rules b/ingest/vendored/apply-geolocation-rules deleted file mode 100755 index 776cf16..0000000 --- a/ingest/vendored/apply-geolocation-rules +++ /dev/null @@ -1,234 +0,0 @@ -#!/usr/bin/env python3 -""" -Applies user curated geolocation rules to the geolocation fields in the NDJSON -records from stdin. The modified records are output to stdout. This does not do -any additional transformations on top of the user curations. -""" -import argparse -import json -from collections import defaultdict -from sys import exit, stderr, stdin, stdout - - -class CyclicGeolocationRulesError(Exception): - pass - - -def load_geolocation_rules(geolocation_rules_file): - """ - Loads the geolocation rules from the provided *geolocation_rules_file*. - Returns the rules as a dict: - { - regions: { - countries: { - divisions: { - locations: corrected_geolocations_tuple - } - } - } - } - """ - geolocation_rules = defaultdict(lambda: defaultdict(lambda: defaultdict(dict))) - with open(geolocation_rules_file, 'r') as rules_fh: - for line in rules_fh: - # ignore comments - if line.strip()=="" or line.lstrip()[0] == '#': - continue - - row = line.strip('\n').split('\t') - # Skip lines that cannot be split into raw and annotated geolocations - if len(row) != 2: - print( - f"WARNING: Could not decode geolocation rule {line!r}.", - "Please make sure rules are formatted as", - "'region/country/division/locationregion/country/division/location'.", - file=stderr) - continue - - # remove trailing comments - row[-1] = row[-1].partition('#')[0].rstrip() - raw , annot = tuple( row[0].split('/') ) , tuple( row[1].split('/') ) - - # Skip lines where raw or annotated geolocations cannot be split into 4 fields - if len(raw) != 4: - print( - f"WARNING: Could not decode the raw geolocation {row[0]!r}.", - "Please make sure it is formatted as 'region/country/division/location'.", - file=stderr - ) - continue - - if len(annot) != 4: - print( - f"WARNING: Could not decode the annotated geolocation {row[1]!r}.", - "Please make sure it is formatted as 'region/country/division/location'.", - file=stderr - ) - continue - - - geolocation_rules[raw[0]][raw[1]][raw[2]][raw[3]] = annot - - return geolocation_rules - - -def get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal = None): - """ - Gets the annotated geolocation for the *raw_geolocation* in the provided - *geolocation_rules*. - - Recursively traverses the *geolocation_rules* until we get the annotated - geolocation, which must be a Tuple. Returns `None` if there are no - applicable rules for the provided *raw_geolocation*. - - Rules are applied in the order of region, country, division, location. - First checks the provided raw values for geolocation fields, then if there - are not matches, tries to use general rules marked with '*'. - """ - # Always instantiate the rule traversal as an empty list if not provided, - # e.g. the first call of this recursive function - if rule_traversal is None: - rule_traversal = [] - - current_rules = geolocation_rules - # Traverse the geolocation rules based using the rule_traversal values - for field_value in rule_traversal: - current_rules = current_rules.get(field_value) - # If we hit `None`, then we know there are no matching rules, so stop the rule traversal - if current_rules is None: - break - - # We've found the tuple of the annotated geolocation - if isinstance(current_rules, tuple): - return current_rules - - # We've reach the next level of geolocation rules, - # so try to traverse the rules with the next target in raw_geolocation - if isinstance(current_rules, dict): - next_traversal_target = raw_geolocation[len(rule_traversal)] - rule_traversal.append(next_traversal_target) - return get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal) - - # We did not find any matching rule for the last traversal target - if current_rules is None: - # If we've used all general rules and we still haven't found a match, - # then there are no applicable rules for this geolocation - if all(value == '*' for value in rule_traversal): - return None - - # If we failed to find matching rule with a general rule as the last - # traversal target, then delete all trailing '*'s to reset rule_traversal - # to end with the last index that is currently NOT a '*' - # [A, *, B, *] => [A, *, B] - # [A, B, *, *] => [A, B] - # [A, *, *, *] => [A] - if rule_traversal[-1] == '*': - # Find the index of the first of the consecutive '*' from the - # end of the rule_traversal - # [A, *, B, *] => first_consecutive_general_rule_index = 3 - # [A, B, *, *] => first_consecutive_general_rule_index = 2 - # [A, *, *, *] => first_consecutive_general_rule_index = 1 - for index, field_value in reversed(list(enumerate(rule_traversal))): - if field_value == '*': - first_consecutive_general_rule_index = index - else: - break - - rule_traversal = rule_traversal[:first_consecutive_general_rule_index] - - # Set the final value to '*' in hopes that by moving to a general rule, - # we can find a matching rule. - rule_traversal[-1] = '*' - - return get_annotated_geolocation(geolocation_rules, raw_geolocation, rule_traversal) - - -def transform_geolocations(geolocation_rules, geolocation): - """ - Transform the provided *geolocation* by looking it up in the provided - *geolocation_rules*. - - This will use all rules that apply to the geolocation and rules will - be applied in the order of region, country, division, location. - - Returns the original geolocation if no geolocation rules apply. - - Raises a `CyclicGeolocationRulesError` if more than 1000 rules have - been applied to the raw geolocation. - """ - transformed_values = geolocation - rules_applied = 0 - continue_to_apply = True - - while continue_to_apply: - annotated_values = get_annotated_geolocation(geolocation_rules, transformed_values) - - # Stop applying rules if no annotated values were found - if annotated_values is None: - continue_to_apply = False - else: - rules_applied += 1 - - if rules_applied > 1000: - raise CyclicGeolocationRulesError( - "ERROR: More than 1000 geolocation rules applied on the same entry {geolocation!r}." - ) - - # Create a new list of values for comparison to previous values - new_values = list(transformed_values) - for index, value in enumerate(annotated_values): - # Keep original value if annotated value is '*' - if value != '*': - new_values[index] = value - - # Stop applying rules if this rule did not change the values, - # since this means we've reach rules with '*' that no longer change values - if new_values == transformed_values: - continue_to_apply = False - - transformed_values = new_values - - return transformed_values - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--region-field", default="region", - help="Field that contains regions in NDJSON records.") - parser.add_argument("--country-field", default="country", - help="Field that contains countries in NDJSON records.") - parser.add_argument("--division-field", default="division", - help="Field that contains divisions in NDJSON records.") - parser.add_argument("--location-field", default="location", - help="Field that contains location in NDJSON records.") - parser.add_argument("--geolocation-rules", metavar="TSV", required=True, - help="TSV file of geolocation rules with the format: " + - "'' where the raw and annotated geolocations " + - "are formatted as '///'. " + - "If creating a general rule, then the raw field value can be substituted with '*'." + - "Lines starting with '#' will be ignored as comments." + - "Trailing '#' will be ignored as comments.") - - args = parser.parse_args() - - location_fields = [args.region_field, args.country_field, args.division_field, args.location_field] - - geolocation_rules = load_geolocation_rules(args.geolocation_rules) - - for record in stdin: - record = json.loads(record) - - try: - annotated_values = transform_geolocations(geolocation_rules, [record.get(field, '') for field in location_fields]) - except CyclicGeolocationRulesError as e: - print(e, file=stderr) - exit(1) - - for index, field in enumerate(location_fields): - record[field] = annotated_values[index] - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/vendored/merge-user-metadata b/ingest/vendored/merge-user-metadata deleted file mode 100755 index 341c2df..0000000 --- a/ingest/vendored/merge-user-metadata +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env python3 -""" -Merges user curated annotations with the NDJSON records from stdin, with the user -curations overwriting the existing fields. The modified records are output -to stdout. This does not do any additional transformations on top of the user -curations. -""" -import argparse -import csv -import json -from collections import defaultdict -from sys import exit, stdin, stderr, stdout - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--annotations", metavar="TSV", required=True, - help="Manually curated annotations TSV file. " + - "The TSV should not have a header and should have exactly three columns: " + - "id to match existing metadata, field name, and field value. " + - "If there are multiple annotations for the same id and field, then the last value is used. " + - "Lines starting with '#' are treated as comments. " + - "Any '#' after the field value are treated as comments.") - parser.add_argument("--id-field", default="accession", - help="The ID field in the metadata to use to merge with the annotations.") - - args = parser.parse_args() - - annotations = defaultdict(dict) - with open(args.annotations, 'r') as annotations_fh: - csv_reader = csv.reader(annotations_fh, delimiter='\t') - for row in csv_reader: - if not row or row[0].lstrip()[0] == '#': - continue - elif len(row) != 3: - print("WARNING: Could not decode annotation line " + "\t".join(row), file=stderr) - continue - id, field, value = row - annotations[id][field] = value.partition('#')[0].rstrip() - - for record in stdin: - record = json.loads(record) - - record_id = record.get(args.id_field) - if record_id is None: - print(f"ERROR: ID field {args.id_field!r} does not exist in record", file=stderr) - exit(1) - - record.update(annotations.get(record_id, {})) - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/vendored/tests/transform-strain-names/transform-strain-names.t b/ingest/vendored/tests/transform-strain-names/transform-strain-names.t deleted file mode 100644 index 1c05df7..0000000 --- a/ingest/vendored/tests/transform-strain-names/transform-strain-names.t +++ /dev/null @@ -1,17 +0,0 @@ -Look for strain name in "strain" or a list of backup fields. - -If strain entry exists, do not do anything. - - $ echo '{"strain": "i/am/a/strain", "strain_s": "other"}' \ - > | $TESTDIR/../../transform-strain-names \ - > --strain-regex '^.+$' \ - > --backup-fields strain_s accession - {"strain":"i/am/a/strain","strain_s":"other"} - -If strain entry does not exists, search the backup fields - - $ echo '{"strain_s": "other"}' \ - > | $TESTDIR/../../transform-strain-names \ - > --strain-regex '^.+$' \ - > --backup-fields accession strain_s - {"strain_s":"other","strain":"other"} \ No newline at end of file diff --git a/ingest/vendored/transform-authors b/ingest/vendored/transform-authors deleted file mode 100755 index 0bade20..0000000 --- a/ingest/vendored/transform-authors +++ /dev/null @@ -1,66 +0,0 @@ -#!/usr/bin/env python3 -""" -Abbreviates a full list of authors to be ' et al.' of the NDJSON -record from stdin and outputs modified records to stdout. - -Note: This is a "best effort" approach and can potentially mangle the author name. -""" -import argparse -import json -import re -from sys import stderr, stdin, stdout - - -def parse_authors(record: dict, authors_field: str, default_value: str, - index: int, abbr_authors_field: str = None) -> dict: - # Strip and normalize whitespace - new_authors = re.sub(r'\s+', ' ', record[authors_field]) - - if new_authors == "": - new_authors = default_value - else: - # Split authors list on comma/semicolon - # OR "and"/"&" with at least one space before and after - new_authors = re.split(r'(?:\s*[,,;;]\s*|\s+(?:and|&)\s+)', new_authors)[0] - - # if it does not already end with " et al.", add it - if not new_authors.strip('. ').endswith(" et al"): - new_authors += ' et al' - - if abbr_authors_field: - if record.get(abbr_authors_field): - print( - f"WARNING: the {abbr_authors_field!r} field already exists", - f"in record {index} and will be overwritten!", - file=stderr - ) - - record[abbr_authors_field] = new_authors - else: - record[authors_field] = new_authors - - return record - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--authors-field", default="authors", - help="The field containing list of authors.") - parser.add_argument("--default-value", default="?", - help="Default value to use if authors list is empty.") - parser.add_argument("--abbr-authors-field", - help="The field for the generated abbreviated authors. " + - "If not provided, the original authors field will be modified.") - - args = parser.parse_args() - - for index, record in enumerate(stdin): - record = json.loads(record) - - parse_authors(record, args.authors_field, args.default_value, index, args.abbr_authors_field) - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/vendored/transform-field-names b/ingest/vendored/transform-field-names deleted file mode 100755 index fde223f..0000000 --- a/ingest/vendored/transform-field-names +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env python3 -""" -Renames fields of the NDJSON record from stdin and outputs modified records -to stdout. -""" -import argparse -import json -from sys import stderr, stdin, stdout - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--field-map", nargs="+", - help="Fields names in the NDJSON record mapped to new field names, " + - "formatted as '{old_field_name}={new_field_name}'. " + - "If the old field does not exist in record, the new field will be added with an empty string value." + - "If the new field already exists in record, then the renaming of the old field will be skipped.") - parser.add_argument("--force", action="store_true", - help="Force renaming of old field even if the new field already exists. " + - "Please keep in mind this will overwrite the value of the new field.") - - args = parser.parse_args() - - field_map = {} - for field in args.field_map: - old_name, new_name = field.split('=') - field_map[old_name] = new_name - - for record in stdin: - record = json.loads(record) - - for old_field, new_field in field_map.items(): - - if record.get(new_field) and not args.force: - print( - f"WARNING: skipping rename of {old_field} because record", - f"already has a field named {new_field}.", - file=stderr - ) - continue - - record[new_field] = record.pop(old_field, '') - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/vendored/transform-genbank-location b/ingest/vendored/transform-genbank-location deleted file mode 100755 index 70ba56f..0000000 --- a/ingest/vendored/transform-genbank-location +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env python3 -""" -Parses GenBank's 'location' field of the NDJSON record from stdin to 3 separate -fields: 'country', 'division', and 'location'. Checks that a record is from -GenBank by verifying that the 'database' field has a value of "GenBank" or "RefSeq". - -Outputs the modified record to stdout. -""" -import json -from sys import stdin, stdout - - -def parse_location(record: dict) -> dict: - # Expected pattern for the location field is "[:][, ]" - # See GenBank docs for their "country" field: - # https://www.ncbi.nlm.nih.gov/genbank/collab/country/ - geographic_data = record['location'].split(':') - - country = geographic_data[0] - division = '' - location = '' - - if len(geographic_data) == 2: - division , _ , location = geographic_data[1].partition(',') - - record['country'] = country.strip() - record['division'] = division.strip() - record['location'] = location.strip() - - return record - - -if __name__ == '__main__': - - for record in stdin: - record = json.loads(record) - - database = record.get('database', '') - if database in {'GenBank', 'RefSeq'}: - parse_location(record) - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/vendored/transform-strain-names b/ingest/vendored/transform-strain-names deleted file mode 100755 index d86c0e4..0000000 --- a/ingest/vendored/transform-strain-names +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env python3 -""" -Verifies strain name pattern in the 'strain' field of the NDJSON record from -stdin. Adds a 'strain' field to the record if it does not already exist. - -Outputs the modified records to stdout. -""" -import argparse -import json -import re -from sys import stderr, stdin, stdout - - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.ArgumentDefaultsHelpFormatter - ) - parser.add_argument("--strain-regex", default="^.+$", - help="Regex pattern for strain names. " + - "Strain names that do not match the pattern will be dropped.") - parser.add_argument("--backup-fields", nargs="*", - help="List of backup fields to use as strain name if the value in 'strain' " + - "does not match the strain regex pattern. " + - "If multiple fields are provided, will use the first field that has a non-empty string.") - - args = parser.parse_args() - - strain_name_pattern = re.compile(args.strain_regex) - - for index, record in enumerate(stdin): - record = json.loads(record) - - # Verify strain name matches the strain regex pattern - if strain_name_pattern.match(record.get('strain', '')) is None: - # Default to empty string if not matching pattern - record['strain'] = '' - # Use non-empty value of backup fields if provided - if args.backup_fields: - for field in args.backup_fields: - if record.get(field): - record['strain'] = str(record[field]) - break - - if record['strain'] == '': - print(f"WARNING: Record number {index} has an empty string as the strain name.", file=stderr) - - - json.dump(record, stdout, allow_nan=False, indent=None, separators=',:') - print() diff --git a/ingest/workflow/snakemake_rules/curate.smk b/ingest/workflow/snakemake_rules/curate.smk new file mode 100644 index 0000000..b8de48e --- /dev/null +++ b/ingest/workflow/snakemake_rules/curate.smk @@ -0,0 +1,109 @@ +""" +This part of the workflow handles transforming the data into standardized +formats and expects input file + + sequences_ndjson = "data/sequences.ndjson" + +This will produce output files as + + metadata = "data/metadata.tsv" + sequences = "data/sequences.fasta" + +Parameters are expected to be defined in `config.transform`. +""" + +rule fetch_general_geolocation_rules: + output: + general_geolocation_rules = "data/general-geolocation-rules.tsv" + params: + geolocation_rules_url = config['curate']['geolocation_rules_url'] + shell: + """ + curl {params.geolocation_rules_url} > {output.general_geolocation_rules} + """ + +rule concat_geolocation_rules: + input: + general_geolocation_rules = "data/general-geolocation-rules.tsv", + local_geolocation_rules = config['curate']['local_geolocation_rules'] + output: + all_geolocation_rules = "data/all-geolocation-rules.tsv" + shell: + """ + cat {input.general_geolocation_rules} {input.local_geolocation_rules} >> {output.all_geolocation_rules} + """ + + + + +rule curate: + input: + sequences_ndjson = "data/sequences.ndjson", + all_geolocation_rules = "data/all-geolocation-rules.tsv", + annotations = config['curate']['annotations'], + output: + metadata = "data/curated_metadata.tsv", + sequences = "data/sequences.fasta" + log: + "logs/curate.txt" + params: + field_map = config['curate']['field_map'], + strain_regex = config['curate']['strain_regex'], + strain_backup_fields = config['curate']['strain_backup_fields'], + date_fields = config['curate']['date_fields'], + expected_date_formats = config['curate']['expected_date_formats'], + genbank_location_field=config["curate"]["genbank_location_field"], + articles = config['curate']['titlecase']['articles'], + abbreviations = config['curate']['titlecase']['abbreviations'], + titlecase_fields = config['curate']['titlecase']['fields'], + authors_field = config['curate']['authors_field'], + authors_default_value = config['curate']['authors_default_value'], + abbr_authors_field = config['curate']['abbr_authors_field'], + annotations_id = config['curate']['annotations_id'], + id_field = config['curate']['id_field'], + sequence_field = config['curate']['sequence_field'] + shell: + """ + (cat {input.sequences_ndjson} \ + | augur curate rename \ + --field-map {params.field_map} \ + | augur curate normalize-strings \ + | augur curate transform-strain-name \ + --strain-regex {params.strain_regex} \ + --backup-fields {params.strain_backup_fields} \ + | augur curate format-dates \ + --date-fields {params.date_fields} \ + --expected-date-formats {params.expected_date_formats} \ + | augur curate parse-genbank-location \ + --location-field {params.genbank_location_field} \ + | augur curate titlecase \ + --titlecase-fields {params.titlecase_fields} \ + --articles {params.articles} \ + --abbreviations {params.abbreviations} \ + | augur curate abbreviate-authors \ + --authors-field {params.authors_field} \ + --default-value {params.authors_default_value} \ + --abbr-authors-field {params.abbr_authors_field} \ + | augur curate apply-geolocation-rules \ + --geolocation-rules {input.all_geolocation_rules} \ + | augur curate apply-record-annotations \ + --annotations {input.annotations} \ + --id-field {params.annotations_id} \ + --output-fasta {output.sequences} \ + --output-metadata {output.metadata} \ + --output-id-field {params.id_field} \ + --output-seq-field {params.sequence_field} ) 2>> {log} + """ + +rule subset_metadata: + input: + metadata = "data/curated_metadata.tsv", + output: + subset_metadata="data/metadata.tsv", + params: + metadata_fields=",".join(config["curate"]["metadata_columns"]), + shell: + """ + tsv-select -H -f {params.metadata_fields} \ + {input.metadata} > {output.subset_metadata} + """ \ No newline at end of file diff --git a/ingest/workflow/snakemake_rules/sort.smk b/ingest/workflow/snakemake_rules/sort.smk index fda5046..9680c02 100644 --- a/ingest/workflow/snakemake_rules/sort.smk +++ b/ingest/workflow/snakemake_rules/sort.smk @@ -12,7 +12,7 @@ It produces output files as rule sort: input: - sequences = rules.transform.output.sequences + sequences = rules.curate.output.sequences output: "data/a/sequences.fasta", "data/b/sequences.fasta" @@ -27,7 +27,7 @@ rule sort: rule metadata: input: - metadata = rules.transform.output.metadata, + metadata = rules.subset_metadata.output.subset_metadata, sequences = "data/{type}/sequences.fasta" output: metadata = "data/{type}/metadata_raw.tsv" diff --git a/ingest/workflow/snakemake_rules/transform.smk b/ingest/workflow/snakemake_rules/transform.smk deleted file mode 100644 index 1dd82fc..0000000 --- a/ingest/workflow/snakemake_rules/transform.smk +++ /dev/null @@ -1,97 +0,0 @@ -""" -This part of the workflow handles transforming the data into standardized -formats and expects input file - - sequences_ndjson = "data/sequences.ndjson" - -This will produce output files as - - metadata = "data/metadata.tsv" - sequences = "data/sequences.fasta" - -Parameters are expected to be defined in `config.transform`. -""" - -rule fetch_general_geolocation_rules: - output: - general_geolocation_rules = "data/general-geolocation-rules.tsv" - params: - geolocation_rules_url = config['transform']['geolocation_rules_url'] - shell: - """ - curl {params.geolocation_rules_url} > {output.general_geolocation_rules} - """ - -rule concat_geolocation_rules: - input: - general_geolocation_rules = "data/general-geolocation-rules.tsv", - local_geolocation_rules = config['transform']['local_geolocation_rules'] - output: - all_geolocation_rules = "data/all-geolocation-rules.tsv" - shell: - """ - cat {input.general_geolocation_rules} {input.local_geolocation_rules} >> {output.all_geolocation_rules} - """ - - - - -rule transform: - input: - sequences_ndjson = "data/sequences.ndjson", - all_geolocation_rules = "data/all-geolocation-rules.tsv", - annotations = config['transform']['annotations'], - output: - metadata = "data/metadata.tsv", - sequences = "data/sequences.fasta" - log: - "logs/transform.txt" - params: - field_map = config['transform']['field_map'], - strain_regex = config['transform']['strain_regex'], - strain_backup_fields = config['transform']['strain_backup_fields'], - date_fields = config['transform']['date_fields'], - expected_date_formats = config['transform']['expected_date_formats'], - articles = config['transform']['titlecase']['articles'], - abbreviations = config['transform']['titlecase']['abbreviations'], - titlecase_fields = config['transform']['titlecase']['fields'], - authors_field = config['transform']['authors_field'], - authors_default_value = config['transform']['authors_default_value'], - abbr_authors_field = config['transform']['abbr_authors_field'], - annotations_id = config['transform']['annotations_id'], - metadata_columns = config['transform']['metadata_columns'], - id_field = config['transform']['id_field'], - sequence_field = config['transform']['sequence_field'] - shell: - """ - (cat {input.sequences_ndjson} \ - | ./vendored/transform-field-names \ - --field-map {params.field_map} \ - | augur curate normalize-strings \ - | ./vendored/transform-strain-names \ - --strain-regex {params.strain_regex} \ - --backup-fields {params.strain_backup_fields} \ - | augur curate format-dates \ - --date-fields {params.date_fields} \ - --expected-date-formats {params.expected_date_formats} \ - | ./vendored/transform-genbank-location \ - | augur curate titlecase \ - --titlecase-fields {params.titlecase_fields} \ - --articles {params.articles} \ - --abbreviations {params.abbreviations} \ - | ./vendored/transform-authors \ - --authors-field {params.authors_field} \ - --default-value {params.authors_default_value} \ - --abbr-authors-field {params.abbr_authors_field} \ - | ./vendored/apply-geolocation-rules \ - --geolocation-rules {input.all_geolocation_rules} \ - | ./vendored/merge-user-metadata \ - --annotations {input.annotations} \ - --id-field {params.annotations_id} \ - | ./bin/ndjson-to-tsv-and-fasta \ - --fasta {output.sequences} \ - --metadata-columns {params.metadata_columns} \ - --metadata {output.metadata} \ - --id-field {params.id_field} \ - --sequence-field {params.sequence_field} ) 2>> {log} - """