-
Notifications
You must be signed in to change notification settings - Fork 1
/
simple_freq_counter.pl
50 lines (41 loc) · 1.23 KB
/
simple_freq_counter.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# A simple frequency counter for English character n-grams.
# (corpus in from STDIN; "char-sequence,frequency," lines out to STDOUT)
# Note that, each line of STDIN is expected to be a "sentence".
# Takes one optional argument: the length of the character n-gram (default 3).
# How many unique entries per length?
# (* 26 26 26) ;; 17576
# (* 26 26 26 26) ;; 456976
# (* 26 26 26 26 26) ;; 11881376 --- this is the target for now.
# (* 26 26 26 26 26 26) ;; 308915776 --- a bit too big for my aire mem? nope. okay.
use warnings;
use strict;
# Length of the character n-grams to count.  Defaults to 3 and can be
# overridden by the first command-line argument.
my $CHARLEN = 3;

# Maps each n-gram (possibly including the '^' / '$' boundary markers) to the
# number of times it was seen across all input lines.
my %table;

# The argument, when given, must be a positive integer.  Previously any true
# value was taken as-is: a non-numeric string was treated as length 0 (so the
# empty string got counted), a negative value made substr() truncate from the
# end of the line, and "0" silently fell back to the default.  Reject all of
# those up front instead of producing a bogus table.
if (@ARGV)
{
    my $requested = $ARGV[0];
    if ($requested !~ /\A[0-9]+\z/ || $requested < 1)
    {
        die "usage: $0 [ngram-length]\n"
          . "ngram-length must be a positive integer, got '$requested'\n";
    }
    $CHARLEN = $requested + 0;    # normalise e.g. "03" to 3
}

# Number of non-empty (after filtering) lines processed so far; only used to
# drive the progress indicator on STDERR.
my $linecount = 0;

while (my $line = <STDIN>)
{
    # Drop every character that is not an ASCII letter (this also removes the
    # newline).  Upper and lower case are both kept and counted as distinct.
    $line =~ tr/a-zA-Z//cd;
    next if length($line) == 0;

    # Wrap the line in start ('^') and end ('$') markers so that n-grams at
    # the line boundaries are distinguishable from interior ones.
    $line = '^' . $line . '$';

    # Slide a window of $CHARLEN characters over the marked line.  When the
    # line is shorter than the window, $last_start is negative and nothing is
    # counted for it.
    my $last_start = length($line) - $CHARLEN;
    for (my $start = 0; $start <= $last_start; $start++)
    {
        # ++ on a missing key starts from zero, so no initialisation needed.
        $table{ substr($line, $start, $CHARLEN) }++;
    }

    # Progress indicator: one dot on STDERR per 10000 counted lines.
    $linecount++;
    print STDERR "." if ($linecount % 10000 == 0);
}

# Emit the table sorted by n-gram (default string order), one
# "ngram,count," record per line.
foreach my $ngram (sort keys %table)
{
    print "$ngram,$table{$ngram},\n";
}