diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index c2699ab..5af26ef 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -41,7 +41,7 @@ jobs:
           docker load --input /tmp/recognizer.tar
           docker image ls -a
       - name: Default annotation
-        run: docker run recognizer /bin/bash -c "recognizer -f reCOGnizer/cicd/proteomes.fasta -rd resources_directory --quiet -dbs COG,TIGRFAM --test-run"
+        run: docker run recognizer /bin/bash -c "recognizer -f reCOGnizer/cicd/genes.fasta -rd resources_directory --quiet -dbs COG,TIGRFAM --test-run"

   taxonomy-based-annotation:
     runs-on: ubuntu-latest
@@ -57,7 +57,7 @@ jobs:
           docker load --input /tmp/recognizer.tar
           docker image ls -a
       - name: Taxonomy-based annotation
-        run: docker run recognizer /bin/bash -c "recognizer -f reCOGnizer/cicd/proteomes.fasta -rd resources_directory --tax-file reCOGnizer/cicd/UPIMAPI_results.tsv --tax-col 'Taxonomic lineage IDs (SPECIES)' --protein-id-col qseqid --species-taxids --quiet --test-run"
+        run: docker run recognizer /bin/bash -c "recognizer -f reCOGnizer/cicd/genes.fasta -rd resources_directory --tax-file reCOGnizer/cicd/UPIMAPI_results.tsv --tax-col 'Taxonomic lineage IDs (SPECIES)' --protein-id-col qseqid --species-taxids --quiet --test-run"

   custom-database-annotation:
     runs-on: ubuntu-latest
@@ -73,4 +73,20 @@ jobs:
           docker load --input /tmp/recognizer.tar
           docker image ls -a
       - name: Custom database annotation
-        run: docker run recognizer /bin/bash -c "mkdir resources_directory; tar -xzf reCOGnizer/cicd/cdd.tar.gz -C resources_directory; makeprofiledb -in reCOGnizer/cicd/ci.pn -out resources_directory/db; recognizer -f reCOGnizer/cicd/proteomes.fasta -rd resources_directory --quiet -dbs resources_directory/db --custom-databases --test-run"
+        run: docker run recognizer /bin/bash -c "mkdir resources_directory; tar -xzf reCOGnizer/cicd/cdd.tar.gz -C resources_directory; makeprofiledb -in reCOGnizer/cicd/ci.pn -out resources_directory/db; recognizer -f reCOGnizer/cicd/genes.fasta -rd resources_directory --quiet -dbs resources_directory/db --custom-databases --test-run"
+
+  prebuilt-databases-annotation:
+    runs-on: ubuntu-latest
+    needs: build
+    steps:
+      - name: Download artifact
+        uses: actions/download-artifact@v2
+        with:
+          name: recognizer
+          path: /tmp
+      - name: Load Docker image
+        run: |
+          docker load --input /tmp/recognizer.tar
+          docker image ls -a
+      - name: Pre-built databases annotation
+        run: docker run recognizer /bin/bash -c "recognizer -f reCOGnizer/cicd/genes.fasta --quiet -dbs COG,TIGRFAM --use-prebuilt-dbs --test-run"
diff --git a/cicd/proteomes.fasta b/cicd/genes.fasta
similarity index 100%
rename from cicd/proteomes.fasta
rename to cicd/genes.fasta
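The new prebuilt-databases-annotation job exercises the --use-prebuilt-dbs flag added to recognizer.py below: instead of building profile databases locally, reCOGnizer fetches NCBI's pre-built little-endian tarballs. A minimal sketch of the URLs involved, using the db_prefixes table introduced in this diff (per the code comments further down, NCBIfam and Smart are skipped in prebuilt mode):

    # Sketch: the tarballs download_little_endian() fetches, one per database.
    db_prefixes = {  # database name -> (db prefix, smp prefix), as defined in recognizer.py
        'CDD': ('Cdd', 'cd'), 'Pfam': ('Pfam', 'pfam'), 'NCBIfam': ('NF', 'NF'), 'ProteinClusters': ('Prk', 'PRK'),
        'Smart': ('Smart', 'smart'), 'TIGRFAM': ('Tigr', 'TIGR'), 'COG': ('Cog', 'COG'), 'KOG': ('Kog', 'KOG')}
    for db, (db_prefix, _) in db_prefixes.items():
        print(f'https://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/{db_prefix}_LE.tar.gz')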
[reCOGnizer_results]", default='reCOGnizer_results') - parser.add_argument( - "-dr", "--download-resources", default=None, - help='This parameter is deprecated. Please do not use it [None]') parser.add_argument( "-rd", "--resources-directory", default=os.path.expanduser('~/recognizer_resources'), help="Output directory for storing databases and other resources [~/recognizer_resources]") parser.add_argument( - "-dbs", "--databases", default="CDD,Pfam,NCBIfam,Protein_Clusters,Smart,TIGRFAM,COG,KOG", - help="Databases to include in functional annotation (comma-separated) [all available]") + "-dbs", "--databases", default="CDD,Pfam,NCBIfam,ProteinClusters,Smart,TIGRFAM,COG,KOG", + help="Databases to include in functional annotation (comma-separated) " + "[CDD,Pfam,NCBIfam,ProteinClusters,Smart,TIGRFAM,COG,KOG]") + parser.add_argument( + "--use-prebuilt-dbs", action="store_true", default=False, + help="Bypass database formation and directly download databases from " + "https://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/") parser.add_argument( "--custom-databases", action="store_true", default=False, help="If databases inputted were NOT produced by reCOGnizer [False]. Default databases of reCOGnizer " @@ -71,9 +77,6 @@ def get_arguments(): parser.add_argument( "--output-rpsbproc-cols", action="store_true", default=False, help="Output columns obtained with RPSBPROC - 'Superfamilies', 'Sites' and 'Motifs'.") - parser.add_argument( - "-sd", "--skip-downloaded", default=None, - help="This parameter is deprecated. Please do not use it [None]") parser.add_argument( "--keep-intermediates", default=False, action='store_true', help="Keep intermediate annotation files generated in reCOGnizer's workflow, " @@ -106,15 +109,6 @@ def get_arguments(): args = parser.parse_args() - # Check for deprecated parameters. reCOGnizer will still fail, but it will inform on them. - for param in ['skip_downloaded', 'download_resources']: # deprecated_db_params - if getattr(args, param): - sys.exit(f"""Error: The parameter '--{param.replace("_", "-")}' is deprecated and no longer - needed. If you have already downloaded resources for version 1.10 and forward, just remove this - parameter. If not, please remove the file at '[resources_directory]/recognizer_dwnl.timestamp', and - reCOGnizer will re-download everything correctly. From now on, reCOGnizer relies on the timestamp to know - if databases have been correctly downloaded.""") - args.output = args.output.rstrip('/') args.resources_directory = args.resources_directory.rstrip('/') args.databases = args.databases.split(',') @@ -125,18 +119,18 @@ def get_arguments(): # If default databases, check if all are recognized. If using both default and custom, exit. if not args.custom_databases: for database in args.databases: - if database not in ["CDD", "Pfam", "NCBIfam", "Protein_Clusters", "Smart", "TIGRFAM", "COG", "KOG"]: + if database not in ["CDD", "Pfam", "NCBIfam", "ProteinClusters", "Smart", "TIGRFAM", "COG", "KOG"]: exit(f'Default database {database} not recognized. Exiting.') else: for database in args.databases: - if database in ["CDD", "Pfam", "NCBIfam", "Protein_Clusters", "Smart", "TIGRFAM", "COG", "KOG"]: + if database in ["CDD", "Pfam", "NCBIfam", "ProteinClusters", "Smart", "TIGRFAM", "COG", "KOG"]: exit(f"Default database {database} can't be used with custom databases.") if not is_db_good(database): exit(f"Custom database {database} not in correct format. 
Exiting.") if args.file: for directory in [f'{args.output}/{folder}' for folder in ['asn', 'blast', 'rpsbproc', 'tmp']] + [ - f'{args.resources_directory}/dbs']: + f'{args.resources_directory}/dbs']: if not os.path.isdir(directory): Path(directory).mkdir(parents=True, exist_ok=True) print(f'Created {directory}') @@ -197,7 +191,16 @@ def get_tabular_taxonomy(output): os.remove('taxonomy.rdf') -def download_resources(directory, quiet=False, test_run=False): +def download_little_endian(directory, dbs=db_prefixes.keys()): + for db in dbs: + url = f"https://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/{db_prefixes[db][0]}_LE.tar.gz" + filename = url.split('/')[-1] + run_command(f'wget -P {directory} {url}') + run_command(f'tar -xzf {directory}/{filename} -C {directory}/dbs') + run_command(f'rm {directory}/{filename}') + + +def download_resources(directory, quiet=False, test_run=False, download_le=False): timestamp_file = f'{directory}/recognizer_dwnl.timestamp' if os.path.isfile(timestamp_file): with open(timestamp_file) as f: @@ -242,9 +245,12 @@ def download_resources(directory, quiet=False, test_run=False): ] for location in web_locations: filename = location.split('/')[-1] - if filename == 'cdd.tar.gz' and test_run: # test on github, this makes it quicker + if filename == 'cdd.tar.gz' and test_run: # "test_run" allows CI/CD tests to run on GHA os.rename('reCOGnizer/cicd/cdd.tar.gz', f'{directory}/cdd.tar.gz') continue + if filename == 'cdd.tar.gz' and download_le: # download prebuilt databases + download_little_endian(directory) + continue if os.path.isfile(f"{directory}/{filename}"): print(f"Removing {directory}/{filename}") os.remove(f"{directory}/{filename}") @@ -256,17 +262,14 @@ def download_resources(directory, quiet=False, test_run=False): run_command(f'gunzip {directory}/{filename}', print_command=True) # Extract the smps - if os.path.isfile(f'{directory}/cdd.tar.gz'): - os.rename(f'{directory}/cdd.tar.gz', f'{directory}/smps/cdd.tar.gz') - wd = os.getcwd() - os.chdir(f'{directory}/smps') - run_pipe_command( - f'{"gtar" if sys.platform == "darwin" else "tar"} -xzf cdd.tar.gz --wildcards "*.smp"', print_command=True) - os.remove('cdd.tar.gz') # no idea why I put it doing this, maybe to free space? - os.chdir(wd) + if not download_le: + run_pipe_command( + f'{"gtar" if sys.platform == "darwin" else "tar"} -xzf {directory}/cdd.tar.gz --wildcards "{directory}/smps/*.smp"', + print_command=True) + #os.remove('cdd.tar.gz') # no idea why I put it doing this, maybe to free space? get_tabular_taxonomy(f'{directory}/taxonomy.tsv') - with open(f'{directory}/recognizer_dwnl.timestamp', 'w') as f: + with open(timestamp_file, 'w') as f: f.write(strftime("%Y-%m-%d %H:%M:%S", gmtime())) @@ -681,10 +684,10 @@ def cog_taxonomic_workflow( db_report.to_csv(f'{output}/rpsbproc/COG_report.tsv', sep='\t') -def check_regular_database(smp_directory, db_directory, db_prefix): +def validate_regular_database(smp_directory, db_directory, db_prefix, smps_prefix): if not is_db_good(f'{db_directory}/{db_prefix}'): print(f'Some part of {db_prefix} was not valid! 
@@ -692,6 +695,19 @@
         print(f'A valid {db_prefix} split database was found!')
 
 
+def validate_prebuilt_database(db_directory, db_prefix):
+    with open(f'{db_directory}/{db_prefix}.pal') as f:
+        lines = f.readlines()
+    for line in lines:
+        if line.startswith('DBLIST'):
+            dbs = line.split('DBLIST ')[-1][1:-3].split('" "')  # DBLIST "Prk.00" "Prk.01" -> ['Prk.00', 'Prk.01']
+            for db in dbs:
+                if not is_db_good(f'{db_directory}/{db}'):
+                    exit(f'Some part of prebuilt {db_prefix} was not valid! Exiting...')
+            return True
+    return False
+
+
 def load_relational_tables(resources_directory, tax_file=None):
     timed_message('Loading relational tables')
     cddid = parse_cddid(f'{resources_directory}/cddid_all.tbl')
@@ -840,20 +856,21 @@ def custom_database_workflow(output, databases, threads=15, max_target_seqs=1, e
 
 
 def taxonomic_workflow(
-        output, resources_directory, threads, lineages, all_taxids, databases_prefixes, base, hmm_pgap,
+        output, resources_directory, threads, lineages, all_taxids, db_prefixes, base, hmm_pgap,
         max_target_seqs=1, evalue=1e-5):
     all_taxids += ['131567', '0']  # cellular organisms and no taxonomy
-    hmm_pgap_taxids = get_hmm_pgap_taxids(all_taxids, databases_prefixes[base], hmm_pgap)
+    hmm_pgap_taxids = get_hmm_pgap_taxids(all_taxids, db_prefixes[base][1], hmm_pgap)
     taxids_with_db = check_tax_databases(
-        f'{resources_directory}/smps', f'{resources_directory}/dbs', databases_prefixes[base], hmm_pgap_taxids,
+        f'{resources_directory}/smps', f'{resources_directory}/dbs', db_prefixes[base][0], hmm_pgap_taxids,
         hmm_pgap)  # for proteins with no taxonomy
-    check_regular_database(f'{resources_directory}/smps', f'{resources_directory}/dbs', databases_prefixes[base])
+    validate_regular_database(
+        f'{resources_directory}/smps', f'{resources_directory}/dbs', db_prefixes[base][0], db_prefixes[base][1])
     dbs = {taxid: [
-        f'{resources_directory}/dbs/{databases_prefixes[base]}_{parent_taxid}' for parent_taxid in
+        f'{resources_directory}/dbs/{db_prefixes[base][0]}_{parent_taxid}' for parent_taxid in
         lineages[taxid] + ['0'] if parent_taxid in taxids_with_db] for taxid in lineages.keys()}
     dbs = {**dbs,
-           **{'0': [f'{resources_directory}/dbs/{databases_prefixes[base]}']}}  # no taxonomy is annotated with all
+           **{'0': [f'{resources_directory}/dbs/{db_prefixes[base][0]}']}}  # no taxonomy is annotated with all
     db_report = pd.DataFrame(columns=['qseqid', 'sseqid', 'Superfamilies', 'Sites', 'Motifs'])
     for taxid in list(lineages.keys()) + ['0']:
         if os.path.isfile(f'{output}/tmp/{taxid}.fasta'):
@@ -883,13 +900,22 @@ def taxonomic_workflow(
 
 
 def multiprocess_workflow(
-        output, resources_directory, threads, databases_prefixes, base, max_target_seqs=5, evalue=0.01):
-    check_regular_database(f'{resources_directory}/smps', f'{resources_directory}/dbs', databases_prefixes[base])
+        output, resources_directory, threads, db_prefixes, base, max_target_seqs=5, evalue=0.01,
+        use_prebuilt_dbs=False):
+    if use_prebuilt_dbs:
+        if base in ['NCBIfam', 'Smart']:
+            return  # prebuilt NCBIfam and Smart are not used from https://ftp.ncbi.nih.gov/pub/mmdb/cdd/little_endian/
+        validate_prebuilt_database(f'{resources_directory}/dbs', db_prefixes[base][0])
+    else:
+        validate_regular_database(
+            f'{resources_directory}/smps', f'{resources_directory}/dbs', db_prefixes[base][0],
+            db_prefixes[base][1])
     # Run RPS-BLAST
     with Pool(processes=threads) as p:
         p.starmap(run_rpsblast, [(
             f'{output}/tmp/tmp_{i}.fasta', f'{output}/asn/{base}_{i}_aligned.asn',
-            f'{resources_directory}/dbs/{databases_prefixes[base]}', '1',
+            f'{resources_directory}/dbs/{db_prefixes[base][0]}', '1',
             max_target_seqs, evalue) for i in range(threads)])
     # Convert ASN-11 to TAB-6
     with Pool(processes=threads) as p:
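validate_prebuilt_database above reads the BLAST alias file (.pal) shipped with each prebuilt database, whose DBLIST line names the database volumes. A self-contained parse of that line, using re for robustness; the quoting convention follows the inline comment above:

    import re

    def dblist_volumes(pal_path):
        # 'DBLIST "Prk.00" "Prk.01"' -> ['Prk.00', 'Prk.01']
        with open(pal_path) as f:
            for line in f:
                if line.startswith('DBLIST'):
                    return re.findall(r'"([^"]+)"', line)
        return []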
@@ -916,7 +942,7 @@ def complete_report(report, db, resources_directory, output, hmm_pgap, fun):
     cols = ['qseqid', 'DB ID', 'product_name', 'DB description', 'ec_number', 'KO', 'CDD ID',
             'taxonomic_range_name', 'taxonomic_range', 'Superfamilies', 'Sites', 'Motifs', 'pident', 'length',
             'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
-    if db in ['CDD', 'Pfam', 'NCBIfam', 'Protein_Clusters', 'TIGRFAM']:
+    if db in ['CDD', 'Pfam', 'NCBIfam', 'ProteinClusters', 'TIGRFAM']:
         report = pd.merge(report, hmm_pgap, left_on='DB ID', right_on='source_identifier', how='left')
         if db == 'CDD':
             report['ec_number'] = report['DB description'].apply(get_db_ec, suffix=")")
@@ -953,12 +979,15 @@
 
 def organize_results(
         file, output, resources_directory, databases, hmm_pgap, cddid, fun, no_output_sequences=False,
-        include_rpsbproc_cols=False):
+        include_rpsbproc_cols=False, use_prebuilt_dbs=False):
     timed_message("Organizing annotation results")
     i = 1
     xlsx_report = pd.ExcelWriter(f'{output}/reCOGnizer_results.xlsx', engine='xlsxwriter')
-    all_reports = pd.DataFrame(columns=['qseqid', 'DB ID'])  # intialize with these columns so if it has no rows, at least it has the columns to groupby
+    all_reports = pd.DataFrame(
+        columns=['qseqid', 'DB ID'])  # initialize with these columns so even with no rows, it has the columns to groupby
     for db in databases:
+        if db in ['NCBIfam', 'Smart'] and use_prebuilt_dbs:
+            continue
         run_pipe_command(f'cat {output}/blast/{db}_*_aligned.blast', file=f'{output}/blast/{db}_aligned.blast')
         print(f'[{i}/{len(databases)}] Handling {db} annotation')
         blast_res = parse_blast(f'{output}/blast/{db}_aligned.blast')
@@ -986,7 +1015,8 @@
 
 def main():
     args = get_arguments()
-    download_resources(args.resources_directory, quiet=args.quiet, test_run=args.test_run)
+    download_resources(
+        args.resources_directory, quiet=args.quiet, test_run=args.test_run, download_le=args.use_prebuilt_dbs)
 
     if not args.file:
         exit('No input file specified. Exiting.')
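The pre-seeded columns in all_reports are what keep the later groupby from failing when no annotations survive; a two-line pandas demonstration:

    import pandas as pd

    # An empty frame with declared columns groups cleanly instead of raising KeyError.
    empty = pd.DataFrame(columns=['qseqid', 'DB ID'])
    print(empty.groupby('qseqid').size())  # empty Series, no exception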
@@ -994,7 +1024,7 @@ def main():
     cddid, hmm_pgap, fun, taxonomy_df, members_df = load_relational_tables(
         args.resources_directory, tax_file=args.tax_file)
 
-    if not args.keep_spaces: # if alters input file, internally alters args.file
+    if not args.keep_spaces:  # if it alters the input file, it internally alters args.file
         args.file = replace_spaces_with_underscores(args.file, f'{args.output}/tmp')
 
     # this splitting is always necessary, even with taxonomy inputted, since some databases don't have taxonomy-based
@@ -1015,15 +1045,12 @@ def main():
             f'{args.output}/tmp/{taxid}.fasta', f'{args.output}/tmp/tmp_{taxid}', args.threads)
     else:
         tax_file = lineages = all_taxids = None
-    databases_prefixes = {
-        'CDD': 'cd', 'Pfam': 'pfam', 'NCBIfam': 'NF', 'Protein_Clusters': 'PRK', 'Smart': 'smart',
-        'TIGRFAM': 'TIGR', 'COG': 'COG', 'KOG': 'KOG'}
     for base in args.databases:
-        db_hmm_pgap = hmm_pgap[hmm_pgap['source_identifier'].str.startswith(databases_prefixes[base])]
+        db_hmm_pgap = hmm_pgap[hmm_pgap['source_identifier'].str.startswith(db_prefixes[base][1])]
         timed_message(f'Running annotation with RPS-BLAST and {base} database as reference.')
-        if args.tax_file is not None and base in ['Pfam', 'NCBIfam', 'Protein_Clusters', 'TIGRFAM']:
+        if args.tax_file is not None and base in ['Pfam', 'NCBIfam', 'ProteinClusters', 'TIGRFAM']:
             taxonomic_workflow(
-                args.output, args.resources_directory, args.threads, lineages, all_taxids, databases_prefixes,
+                args.output, args.resources_directory, args.threads, lineages, all_taxids, db_prefixes,
                 base, db_hmm_pgap, max_target_seqs=args.max_target_seqs, evalue=args.evalue)
         elif base in ['COG'] and args.tax_file is not None and args.species_taxids:
             cog_taxonomic_workflow(
@@ -1031,11 +1058,12 @@ def main():
                 max_target_seqs=args.max_target_seqs, evalue=args.evalue)
         else:
             multiprocess_workflow(
-                args.output, args.resources_directory, args.threads, databases_prefixes, base,
-                max_target_seqs=args.max_target_seqs, evalue=args.evalue)
+                args.output, args.resources_directory, args.threads, db_prefixes, base,
+                max_target_seqs=args.max_target_seqs, evalue=args.evalue, use_prebuilt_dbs=args.use_prebuilt_dbs)
     organize_results(
         args.file, args.output, args.resources_directory, args.databases, hmm_pgap, cddid, fun,
-        no_output_sequences=args.no_output_sequences, include_rpsbproc_cols=args.output_rpsbproc_cols)
+        no_output_sequences=args.no_output_sequences, include_rpsbproc_cols=args.output_rpsbproc_cols,
+        use_prebuilt_dbs=args.use_prebuilt_dbs)
     if not args.keep_intermediates:
         for directory in [f'{args.output}/{folder}' for folder in ['asn', 'blast', 'rpsbproc', 'tmp']]:
             shutil.rmtree(directory)
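Taken together, the new flag threads through main() as above; a condensed sketch of the prebuilt path (function and argument names are from this diff, the command line from the new CI job):

    # recognizer -f reCOGnizer/cicd/genes.fasta --quiet -dbs COG,TIGRFAM --use-prebuilt-dbs --test-run
    args = get_arguments()
    download_resources(args.resources_directory, quiet=args.quiet, download_le=args.use_prebuilt_dbs)
    for base in args.databases:
        multiprocess_workflow(
            args.output, args.resources_directory, args.threads, db_prefixes, base,
            max_target_seqs=args.max_target_seqs, evalue=args.evalue,
            use_prebuilt_dbs=args.use_prebuilt_dbs)  # returns early for NCBIfam/Smart in prebuilt mode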