-
Notifications
You must be signed in to change notification settings - Fork 2
/
esl-alistat-cinfo-count-mutations.pl
executable file
·151 lines (138 loc) · 5.61 KB
/
esl-alistat-cinfo-count-mutations.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env perl
# EPN, Tue May 22 11:46:21 2018
# esl-alistat-cinfo-count-mutations.pl
# Summarize an esl-alistat --cinfo DNA or RNA alignment file.
#
use warnings;
use strict;
use Getopt::Long;
my $usage;
$usage = "perl esl-alistat-cinfo-count-mutations.pl [OPTIONS] <DNA or RNA alignment esl-alistat --cinfo output file>\n";
$usage .= "\tOPTIONS:\n";
$usage .= "\t\t--bpinfo <s>: name of bpinfo file to use to define single stranded vs paired positions\n";
$usage .= "\t\t--frame: break down number of changes by frame (for DNA alignments of protein coding regions)\n";
$usage .= "\t\t (w/--frame you may want to remove all insert columns first, e.g. 'esl-alimask --rf-is-mask')\n\n";
my $do_frame = 0;
my $bpinfo_file = undef;
&GetOptions( "bpinfo=s" => \$bpinfo_file,
"frame" => \$do_frame);
if(scalar(@ARGV) != 1) { die $usage; }
if($do_frame && (defined $bpinfo_file)) {
die "ERROR can't use both --frame and --bpinfo files, pick one.";
}
my ($cinfo_file) = (@ARGV);
open(CINFO, $cinfo_file) || die "ERROR unable to open $cinfo_file";
my %is_paired_H = ();
my $have_pairs = 0;
my $line;
if(defined $bpinfo_file) {
$have_pairs = 1;
open(BPINFO, $bpinfo_file) || die "ERROR unable to open $bpinfo_file for reading";
while($line = <BPINFO>) {
chomp $line;
if(($line !~ m/^\/\/$/) && ($line !~ m/^\#/)) {
$line =~ s/^\s+//;
$line =~ s/\s+$//;
my @el_A = split(/\s+/, $line);
if(scalar(@el_A) != 18) { die "ERROR expected 18 tokens on data line but got a different number: $line\n"; }
my ($lpos, $rpos) = ($el_A[0], $el_A[1]);
$is_paired_H{$lpos} = 1;
$is_paired_H{$rpos} = 1;
}
}
close(BPINFO);
}
#######
# Beginning of cinfo file
## Per column residue counts:
## Alignment file: syn-s2.refined.pgo1a-plus2.supported.cmbuild.stk
## Alignment idx: 1
## Alignment name: syn-s2.pgo1a-plus2.supported
## Number of sequences: 19
## Ambiguities were averaged (e.g. 1 'N' = 0.25 'A', 0.25 'C', 0.25 'G' and 0.25 'U')
## Sequence weights from alignment were ignored (if they existed).
##
## alnpos A C G U
## ------- ------- ------- ------- -------
# 1 0.0 1.0 18.0 0.0
# 2 0.0 1.0 1.0 17.0
my $tot_nmaj_paired = 0;
my $tot_ndiff_paired = 0;
my $tot_nmaj_unpaired = 0;
my $tot_ndiff_unpaired = 0;
my %tot_nmaj_unpaired_by_frame_H = ();
my %tot_ndiff_unpaired_by_frame_H = ();
my $nmaj = 0;
my $ndiff = 0;
while($line = <CINFO>) {
chomp $line;
if(($line !~ m/^\#/) && ($line !~ m/^\/\/$/)) {
# data line
chomp $line;
$line =~ s/^\s+//;
my @el_A = split(/\s+/, $line);
if(scalar(@el_A) != 5) { die "ERROR expected 5 tokens on data line but got a different number: $line\n"; }
my ($apos, $na, $nc, $ng, $nu) = (@el_A);
my $frame = (($apos-1) % 3) + 1; # 1 -> 1; 2 -> 2; 3 -> 3; 4 -> 1, etc.
if(($na >= $nc) &&
($na >= $ng) &&
($na >= $nu)) {
# a is most common:
$nmaj = $na;
$ndiff = $nc + $ng + $nu;
}
elsif(($nc >= $na) &&
($nc >= $ng) &&
($nc >= $nu)) {
# c is most common:
$nmaj = $nc;
$ndiff = $na + $ng + $nu;
}
elsif(($ng >= $na) &&
($ng >= $nc) &&
($ng >= $nu)) {
# g is most common:
$nmaj = $ng;
$ndiff = $na + $nc + $nu;
}
elsif(($nu >= $na) &&
($nu >= $nc) &&
($nu >= $ng)) {
# u is most common:
$nmaj = $nu;
$ndiff = $na + $nc + $ng;
}
else {
die "ERROR coudn't determine dominant nt in line: $line";
}
if(exists $is_paired_H{$apos}) {
$tot_nmaj_paired += $nmaj;
$tot_ndiff_paired += $ndiff;
}
else {
$tot_nmaj_unpaired += $nmaj;
$tot_ndiff_unpaired += $ndiff;
$tot_nmaj_unpaired_by_frame_H{$frame} += $nmaj;
$tot_ndiff_unpaired_by_frame_H{$frame} += $ndiff;
# only need by_frame updates for unpaired because --frame and --bpinfo are incompatible options
}
}
}
if($have_pairs) {
printf("Number of most common nucleotide in paired columns: %5d [%5.3f]\n", $tot_nmaj_paired, $tot_nmaj_paired / ($tot_nmaj_paired + $tot_ndiff_paired));
printf("Number of ! most common nucleotide in paired columns: %5d [%5.3f]\n", $tot_ndiff_paired, $tot_ndiff_paired / ($tot_nmaj_paired + $tot_ndiff_paired));
printf("Number of most common nucleotide in unpaired columns: %5d [%5.3f]\n", $tot_nmaj_unpaired, $tot_nmaj_unpaired / ($tot_nmaj_unpaired + $tot_ndiff_unpaired));
printf("Number of ! most common nucleotide in unpaired columns: %5d [%5.3f]\n", $tot_ndiff_unpaired, $tot_ndiff_unpaired / ($tot_nmaj_unpaired + $tot_ndiff_unpaired));
}
else {
if($do_frame) {
foreach my $frame ("1", "2", "3") {
printf("Number of most common nucleotide in all frame $frame columns: %5d [%5.3f]\n", $tot_nmaj_unpaired_by_frame_H{$frame}, $tot_nmaj_unpaired_by_frame_H{$frame} / ($tot_nmaj_unpaired_by_frame_H{$frame} + $tot_ndiff_unpaired_by_frame_H{$frame}));
printf("Number of ! most common nucleotide in all frame $frame columns: %5d [%5.3f]\n", $tot_ndiff_unpaired_by_frame_H{$frame}, $tot_ndiff_unpaired_by_frame_H{$frame} / ($tot_nmaj_unpaired_by_frame_H{$frame} + $tot_ndiff_unpaired_by_frame_H{$frame}));
}
}
else { # ! $do_frame
printf("Number of most common nucleotide in all columns: %5d [%5.3f]\n", $tot_nmaj_unpaired, $tot_nmaj_unpaired / ($tot_nmaj_unpaired + $tot_ndiff_unpaired));
printf("Number of ! most common nucleotide in all columns: %5d [%5.3f]\n", $tot_ndiff_unpaired, $tot_ndiff_unpaired / ($tot_nmaj_unpaired + $tot_ndiff_unpaired));
}
}