-
Notifications
You must be signed in to change notification settings - Fork 0
/
add_column_with_number_occurrences_of_column_values.pl
180 lines (152 loc) · 4.9 KB
/
add_column_with_number_occurrences_of_column_values.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#!/usr/bin/env perl
# Adds column indicating number occurrences of that row's value(s) in the entirety of the
# parameter column(s).
# Usage:
# perl add_column_with_number_occurrences_of_column_values.pl [tab-separated table]
# "[column title]" "[optional additional column title]"
# "[optional additional column title]" [etc.]
# Prints to console. To print to file, use
# perl add_column_with_number_occurrences_of_column_values.pl [tab-separated table]
# "[column title]" "[optional additional column title]"
# "[optional additional column title]" [etc.] > [output table path]
use strict;
use warnings;
my $table = $ARGV[0];
my @column_titles = @ARGV[1..$#ARGV];
my $NEWLINE = "\n";
my $DELIMITER = "\t";
# for generating output column title
my $OUTPUT_COUNT_COLUMN_TITLE_ADDITION = "count";
my $OUTPUT_COUNT_COLUMN_TITLE_DELIMITER = "_";
# verifies that input file exists and is not empty
if(!$table or !-e $table or -z $table)
{
print STDERR "Error: table not provided, does not exist, or empty:\n\t"
.$table."\nExiting.\n";
die;
}
# verifies that input table columns provided
if(scalar @column_titles < 1)
{
print STDERR "Error: no column titles provided. Exiting.\n";
die;
}
# generates output column title
my $output_column_title = join($OUTPUT_COUNT_COLUMN_TITLE_DELIMITER, @column_titles,
$OUTPUT_COUNT_COLUMN_TITLE_ADDITION);
# prepares to read in column titles
my $first_line = 1;
my %column_title_found = (); # key: column title -> value: 1 if column title has been found
my %column_title_to_column = (); # key: title of column of interest -> value: column (0-indexed)
my %column_title_of_interest = (); # key: title of column of interest -> value: 1
foreach my $column_title(@column_titles)
{
$column_title_of_interest{$column_title} = 1;
}
# reads in table
my %column_value_combination_to_count = (); # key: values in columns of interest, tab-separated -> value: number of rows that combination appears in
open TABLE, "<$table" || die "Could not open $table to read; terminating =(\n";
while(<TABLE>) # for each row in the file
{
chomp;
my $line = $_;
if($line =~ /\S/) # if row not empty
{
my @items_in_line = split($DELIMITER, $line, -1);
if($first_line) # column titles
{
# identifies column to merge by and columns to save
my $column = 0;
foreach my $column_title(@items_in_line)
{
if(defined $column_title and $column_title_of_interest{$column_title})
{
$column_title_found{$column_title} = 1;
$column_title_to_column{$column_title} = $column;
}
$column++;
}
# verifies that we have found all columns of interest
foreach my $column_title(@column_titles)
{
if(!$column_title_found{$column_title})
{
print STDERR "Warning: input column of interest ".$column_title." not "
."found. Exiting.\n";
die;
}
}
$first_line = 0; # next line is not column titles
}
else # column values (not column titles)
{
# retrieves values of columns of interest for this row and adds to their value
# combination's count
my $column_values_string = "";
foreach my $column_title(@column_titles) # for each column of interest
{
# retrieves column value
my $column = $column_title_to_column{$column_title};
my $column_value = $items_in_line[$column];
if(!defined $column_value)
{
$column_value = "";
}
if($column_values_string)
{
$column_values_string .= $DELIMITER;
}
$column_values_string .= $column_value;
}
# increments count for each column of interest value combination from this row
$column_value_combination_to_count{$column_values_string}++;
}
}
}
close TABLE;
# reads in table a second time, printing previously generated counts in new column
$first_line = 1;
open TABLE, "<$table" || die "Could not open $table to read; terminating =(\n";
while(<TABLE>) # for each row in the file
{
chomp;
my $line = $_;
if($line =~ /\S/) # if row not empty
{
my @items_in_line = split($DELIMITER, $line, -1);
if($first_line) # column titles
{
# prints column titles
print $line.$DELIMITER;
print $output_column_title.$NEWLINE;
$first_line = 0; # next line is not column titles
}
else # column values (not column titles)
{
# retrieves values of columns of interest for this row and adds to their value
# combination's count
my $column_values_string = "";
foreach my $column_title(@column_titles) # for each column of interest
{
# retrieves column value
my $column = $column_title_to_column{$column_title};
my $column_value = $items_in_line[$column];
if(!defined $column_value)
{
$column_value = "";
}
if($column_values_string)
{
$column_values_string .= $DELIMITER;
}
$column_values_string .= $column_value;
}
# prints column values
print $line.$DELIMITER;
print $column_value_combination_to_count{$column_values_string}.$NEWLINE;
}
}
}
close TABLE;
# December 9, 2021
# September 30, 2022