Skip to content

Commit

Permalink
Merge pull request #20 from jehiah/aggregate_params_20
Browse files Browse the repository at this point in the history
improve aggregate parameter handling
  • Loading branch information
jehiah committed Jan 8, 2015
2 parents 5b02092 + 06310ae commit eacacf6
Show file tree
Hide file tree
Showing 3 changed files with 25 additions and 13 deletions.
4 changes: 2 additions & 2 deletions README.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,8 @@ Generate an ascii bar chart for input data (this is like a visualization of `uni
14:40 [ 49] ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
14:41 [ 14] ∎∎∎∎∎∎∎∎∎∎∎∎∎∎

bar_chart.py also supports ingesting aggregated values. Simply provide a two column input of key<space>value:
`bar_chart.py` and `histogram.py` also support ingesting pre-aggregated values. Simply provide a two column input of `count<whitespace>value` for `-a` or `value<whitespace>count` for `-A`:

$ cat data | uniq -c | bar_chart.py --sort-keys --agg-values
$ cat data | uniq -c | bar_chart.py -a

This is very convenient if you pull data that is already aggregated out of, say, Hadoop or MySQL.
13 changes: 10 additions & 3 deletions data_hacks/bar_chart.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,16 @@ def run(input_stream, options):
data = defaultdict(int)
total = 0
for row in input_stream:
if options.agg_values:
kv = row.replace('\t', ' ').split(' ',2);
if options.agg_key_value:
kv = row.rstrip().rsplit(None, 1)
value = int(kv[1])
data[kv[0]] += value
total += value
elif options.agg_value_key:
kv = row.lstrip().split(None, 1)
value = int(kv[0])
data[kv[1]] += value
total += value
else:
data[row] += 1
total += 1
Expand Down Expand Up @@ -85,7 +90,9 @@ def run(input_stream, options):
if __name__ == "__main__":
parser = OptionParser()
parser.usage = "cat data | %prog [options]"
parser.add_option("-a", "--agg-values", dest="agg_values", default=False, action="store_true",
parser.add_option("-a", "--agg", dest="agg_value_key", default=False, action="store_true",
help="Two column input format, space seperated with value<space>key")
parser.add_option("-A", "--agg-key-value", dest="agg_key_value", default=False, action="store_true",
help="Two column input format, space seperated with key<space>value")
parser.add_option("-k", "--sort-keys", dest="sort_keys", default=True, action="store_true",
help="sort by the key [default]")
Expand Down
21 changes: 13 additions & 8 deletions data_hacks/histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ def test_mvsd():
assert '%.2f' % mvsd.var() == "8.25"
assert '%.14f' % mvsd.sd() == "2.87228132326901"

def load_stream(input_stream, agg):
def load_stream(input_stream, agg_value_key, agg_key_value):
for line in input_stream:
clean_line = line.strip()
if not clean_line:
Expand All @@ -85,11 +85,14 @@ def load_stream(input_stream, agg):
if clean_line[0] in ['"', "'"]:
clean_line = clean_line.strip("\"'")
try:
if agg:
value, count = line.replace("\t", ' ').split(' ', 2)
yield DataPoint(Decimal(value), int(count))
continue
yield DataPoint(Decimal(clean_line), 1)
if agg_key_value:
key, value = clean_line.rstrip().rsplit(None, 1)
yield DataPoint(Decimal(key), int(value))
elif agg_value_key:
value, key = clean_line.lstrip().split(None, 1)
yield DataPoint(Decimal(key), int(value))
else:
yield DataPoint(Decimal(clean_line), 1)
except:
logging.exception('failed %r', line)
print >>sys.stderr, "invalid line %r" % line
Expand Down Expand Up @@ -219,7 +222,9 @@ def histogram(stream, options):
if __name__ == "__main__":
parser = OptionParser()
parser.usage = "cat data | %prog [options]"
parser.add_option("-a", "--agg", dest="agg", default=False, action="store_true",
parser.add_option("-a", "--agg", dest="agg_value_key", default=False, action="store_true",
help="Two column input format, space seperated with value<space>key")
parser.add_option("-A", "--agg-key-value", dest="agg_key_value", default=False, action="store_true",
help="Two column input format, space seperated with key<space>value")
parser.add_option("-m", "--min", dest="min",
help="minimum value for graph")
Expand All @@ -242,5 +247,5 @@ def histogram(stream, options):
parser.print_usage()
print "for more help use --help"
sys.exit(1)
histogram(load_stream(sys.stdin, options.agg), options)
histogram(load_stream(sys.stdin, options.agg_value_key, options.agg_key_value), options)

0 comments on commit eacacf6

Please sign in to comment.