-
Notifications
You must be signed in to change notification settings - Fork 0
/
retrieve_most_common_taxonids_in_LCA_output.pl
163 lines (139 loc) · 5.04 KB
/
retrieve_most_common_taxonids_in_LCA_output.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env perl
# Retrieve NN most frequently matched species, genera, or families.
# Input table is output of retrieve_top_blast_hits_LCA_for_each_sequence.pl with parameter
# print_matched_accession_numbers set to true (1), with column titles (tab-separated):
# - sequence_name
# - LCA_taxon_id
# - LCA_taxon_rank
# - LCA_taxon_species
# - LCA_taxon_genus
# - LCA_taxon_family
# - evalue_of_top_hits
# - lowest_pident_of_top_hits
# - mean_pident_of_top_hits
# - highest_pident_of_top_hits
# - lowest_qcovs_of_top_hits
# - mean_qcovs_of_top_hits
# - highest_qcovs_of_top_hits
# - number_top_hits
# - matched_accession_numbers
# Usage:
# perl retrieve_most_common_taxonids_in_LCA_output.pl
# [output of retrieve_top_blast_hits_LCA_for_each_sequence.pl for one blast search]
# [species, genus, or family] [number most frequent matched species, genera, or families to output, or 0 to output all]
# [minimum number reads matched by a taxon to report it]
# Prints to console. To print to file, use
# perl retrieve_most_common_taxonids_in_LCA_output.pl
# [output of retrieve_top_blast_hits_LCA_for_each_sequence.pl for one blast search]
# [species, genus, or family] [number most frequent matched species, genera, or families to output, or 0 to output all]
# [minimum number reads matched by a taxon to report it] > [output list of taxon ids, one per line]
use strict;
use warnings;
my $LCA_matches = $ARGV[0]; # output of retrieve_top_blast_hits_LCA_for_each_sequence.pl
my $rank_of_taxa_to_examine = $ARGV[1]; # species, genus, or family
my $number_most_frequent_matched_taxa = $ARGV[2];
my $minimum_number_reads_matched_per_taxon = $ARGV[3];
my $PRINT_NUMBER_SEQUENCES_MAPPED_TO_EACH_TAXONID = 0;
my $NO_DATA = "NA";
my $NEWLINE = "\n";
my $DELIMITER = "\t";
# blast LCA table
my $sequence_name_column = 0;
my $LCA_taxon_id_column = 1;
my $LCA_taxon_rank_column = 2;
my $LCA_taxon_species_column = 3;
my $LCA_taxon_genus_column = 4;
my $LCA_taxon_family_column = 5;
my $evalue_of_top_hits_column = 6;
my $lowest_pident_of_top_hits_column = 7;
my $mean_pident_of_top_hits_column = 8;
my $highest_pident_of_top_hits_column = 9;
my $lowest_qcovs_of_top_hits_column = 10;
my $mean_qcovs_of_top_hits_column = 11;
my $highest_qcovs_of_top_hits_column = 12;
my $number_top_hits_column = 13;
my $matched_accession_numbers_column = 14;
# in input parameter rank_of_taxa_to_examine
my $SPECIES = "species";
my $GENUS = "genus";
my $FAMILY = "family";
# verifies that inputs exist and are non-empty
if(!$LCA_matches or !-e $LCA_matches or -z $LCA_matches)
{
print STDERR "Error: LCA matches file not provided, does not exist, or empty:\n\t"
.$LCA_matches."\nExiting.\n";
die;
}
if(!$rank_of_taxa_to_examine)
{
print STDERR "Error: No minimum rank to examine provided. Must be species, genus, or "
."family.\nExiting.\n";
die;
}
if($rank_of_taxa_to_examine ne $SPECIES
and $rank_of_taxa_to_examine ne $GENUS
and $rank_of_taxa_to_examine ne $FAMILY)
{
print STDERR "Error: Non-valid minimum rank to examine provided. Must be species, ".
"genus, or family.\nExiting.\n";
die;
}
# reads in species, genera, or families of matched taxon ids
my %taxon_id_to_number_matches = (); # key: species-, genus-, or family-level ancestor of each taxon id matched -> number sequences matched
my %taxon_id_to_matched_accession_numbers = (); # key: species-, genus-, or family-level ancestor of each taxon id matched -> accession numbers matched by taxon or its descendants
my $first_row = 1;
open LCA_MATCHES, "<$LCA_matches" || die "Could not open $LCA_matches to read\n";
while(<LCA_MATCHES>)
{
chomp;
if($_ =~ /\S/ and !$first_row)
{
my @items = split($DELIMITER, $_);
my $species = $items[$LCA_taxon_species_column];;
my $genus = $items[$LCA_taxon_genus_column];
my $family = $items[$LCA_taxon_family_column];
# determines ancestor taxon id
my $ancestor_taxon_id = "";
if($rank_of_taxa_to_examine eq $SPECIES
and $species and $species ne $NO_DATA)
{
$ancestor_taxon_id = $species;
}
if($rank_of_taxa_to_examine eq $GENUS
and $genus and $genus ne $NO_DATA)
{
$ancestor_taxon_id = $genus;
}
if($rank_of_taxa_to_examine eq $FAMILY
and $family and $family ne $NO_DATA)
{
$ancestor_taxon_id = $family;
}
if($ancestor_taxon_id)
{
# increments number matches for this ancestor taxon id
$taxon_id_to_number_matches{$ancestor_taxon_id}++;
}
}
$first_row = 0;
}
close LCA_MATCHES;
# prints most frequently matched species, genus, or family taxon ids
my $number_matched_taxon_ids_examined = 0;
foreach my $matched_taxon_id(
sort {$taxon_id_to_number_matches{$b} <=> $taxon_id_to_number_matches{$a}}
keys %taxon_id_to_number_matches)
{
if(($number_matched_taxon_ids_examined < $number_most_frequent_matched_taxa | !$number_most_frequent_matched_taxa)
and $taxon_id_to_number_matches{$matched_taxon_id} >= $minimum_number_reads_matched_per_taxon)
{
print $matched_taxon_id;
if($PRINT_NUMBER_SEQUENCES_MAPPED_TO_EACH_TAXONID)
{
print " (".$taxon_id_to_number_matches{$matched_taxon_id}.")";
}
print $NEWLINE;
$number_matched_taxon_ids_examined++;
}
}
# January 2, 2023