-
Notifications
You must be signed in to change notification settings - Fork 0
/
merge_tables_by_first_column_values.pl
145 lines (125 loc) · 3.65 KB
/
merge_tables_by_first_column_values.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#!/usr/bin/env perl
# Merges (takes union of) multiple tables by the values in the first column.
# Usage:
# perl merge_tables_by_first_column_values.pl [table] [another table] [another table]
# [etc.]
# Prints to console. To print to file, use
# perl merge_tables_by_first_column_values.pl [table] [another table] [another table]
# [etc.] > [merged output table path]
use strict;
use warnings;
my @input_tables = @ARGV;
my $NEWLINE = "\n";
my $DELIMITER = "\t";
my $NO_DATA = "NA";
# reads in all values in first column of each table
my %first_column_values = (); # key: value in any table's first column -> value: 1
foreach my $input_table(@input_tables)
{
my $first_line = 1;
open TABLE, "<".$input_table || die "Could not open ".$input_table." to read; terminating =(\n";
while(<TABLE>) # for each row in the file
{
chomp;
if($_ =~ /\S/) # if row not empty
{
my @items_in_line = split($DELIMITER, $_, -1);
if($first_line) # column titles
{
$first_line = 0;
}
else
{
my $first_column_value = $items_in_line[0];
$first_column_values{$first_column_value} = 1;
}
}
}
close TABLE;
}
# reads in each table and builds row for each first column value
my $header_line = ""; # header line to print to output
my %first_column_value_to_row = (); # key: first column value -> value: row to print
my $first_table = 1;
foreach my $input_table(@input_tables)
{
# reads in table and adds its columns
my $first_line = 1;
my $number_columns_in_table = 0;
my %first_column_value_found_in_table = (); # key: first column value -> value: 1 if found in table
open TABLE, "<".$input_table || die "Could not open ".$input_table." to read; terminating =(\n";
while(<TABLE>) # for each row in the file
{
chomp;
if($_ =~ /\S/) # if row not empty
{
my @items_in_line = split($DELIMITER, $_, -1);
if($first_line) # column titles
{
# removes first column title from header row
my $header_row_without_first_column = $_;
if(!$first_table)
{
if($_ =~ /^.+$DELIMITER(.*)$/)
{
$header_row_without_first_column = $1;
}
else
{
print STDERR "Error: header row does not contain delimiter.\n";
}
}
# saves column titles
if($header_line)
{
$header_line .= $DELIMITER;
}
$header_line .= $header_row_without_first_column;
$number_columns_in_table = scalar @items_in_line;
$first_line = 0;
}
else
{
my $first_column_value = $items_in_line[0];
$first_column_value_found_in_table{$first_column_value} = 1;
# removes first column value from row
my $row_without_first_column = "";
if($_ =~ /^$first_column_value$DELIMITER(.*)$/)
{
$row_without_first_column = $1;
}
else
{
print STDERR "Error: first column value not found in its row. "
."Something is wrong with the code.\n";
}
# saves this row
$first_column_value_to_row{$first_column_value} .= $DELIMITER;
$first_column_value_to_row{$first_column_value} .= $row_without_first_column;
}
}
}
close TABLE;
$first_table = 0;
# adds empty columns for values not found
foreach my $first_column_value(keys %first_column_values)
{
if(!$first_column_value_found_in_table{$first_column_value}) # if this first column value did not appear in this table
{
for(my $count = 0; $count < $number_columns_in_table-1; $count++)
{
$first_column_value_to_row{$first_column_value} .= $DELIMITER;
}
}
}
}
# prints output table
print $header_line;
print $NEWLINE;
foreach my $first_column_value(sort keys %first_column_values)
{
print $first_column_value;
print $first_column_value_to_row{$first_column_value};
print $NEWLINE;
}
# April 29, 2023