Skip to content

Commit

Permalink
Merge pull request #17 from jehiah/histogram_agg_17
Browse files Browse the repository at this point in the history
Support ingesting Aggregate data from histogram.py
  • Loading branch information
SeanOC committed Aug 26, 2014
2 parents 3658aec + 8d9143e commit 7a761b4
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 19 deletions.
2 changes: 1 addition & 1 deletion data_hacks/bar_chart.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def run(input_stream, options):
data = defaultdict(lambda:0)
for row in input_stream:
if options.agg_values:
kv = row.split(' ',2);
kv = row.replace('\t', ' ').split(' ',2);
data[kv[0]]+= int(kv[1])
else:
data[row]+=1
Expand Down
51 changes: 33 additions & 18 deletions data_hacks/histogram.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@

import sys
from decimal import Decimal
import logging
import math
from optparse import OptionParser
from collections import namedtuple

class MVSD(object):
""" A class that calculates a running Mean / Variance / Standard Deviation"""
Expand Down Expand Up @@ -63,6 +65,8 @@ def sd(self):
def mean(self):
return self.m

# One histogram input record: a numeric `value` and the number of times
# (`count`) it occurred. load_stream yields a count of 1 for plain
# one-value-per-line input.
DataPoint = namedtuple('DataPoint', ['value', 'count'])

def test_mvsd():
mvsd = MVSD()
for x in range(10):
Expand All @@ -72,28 +76,35 @@ def test_mvsd():
assert '%.2f' % mvsd.var() == "8.25"
assert '%.14f' % mvsd.sd() == "2.87228132326901"

def load_stream(input_stream, agg=False):
    """Yield a ``DataPoint`` for every parseable line of ``input_stream``.

    input_stream -- an iterable of text lines (e.g. ``sys.stdin``).
    agg -- when true, each line holds two whitespace-separated columns,
           ``<value> <count>``; otherwise each line is a single value that
           is given a count of 1.

    Blank lines are skipped.  If a line starts with a quote character,
    quotes are stripped from both of its ends before parsing.  Lines that
    cannot be parsed are logged and reported on stderr, then skipped.
    """
    for line in input_stream:
        clean_line = line.strip()
        if not clean_line:
            # skip empty lines (ie: newlines)
            continue
        if clean_line[0] in ('"', "'"):
            clean_line = clean_line.strip("\"'")
        try:
            if agg:
                # split() with no separator accepts tabs and runs of
                # spaces; anything other than exactly two columns fails
                # the unpacking with ValueError and is reported below.
                value, count = clean_line.split()
                point = DataPoint(Decimal(value), int(count))
            else:
                point = DataPoint(Decimal(clean_line), 1)
        except (ArithmeticError, ValueError):
            # Decimal() raises InvalidOperation (an ArithmeticError);
            # int() and tuple unpacking raise ValueError.
            logging.exception('failed %r', line)
            sys.stderr.write("invalid line %r\n" % line)
            continue
        # Yield outside the try block so that GeneratorExit (raised at the
        # yield when the consumer closes the generator) is never swallowed.
        yield point

def median(values, key=None):
    """Return the median of ``values``.

    values -- a sized iterable (``len()`` is called on it) of items.
    key -- optional one-argument function that extracts the number to sort
           by and to average from each item; defaults to the item itself.

    For an odd number of items the middle item's key is returned (divided
    by 1); for an even number, the two middle keys are summed and divided
    by 2 with the ``/`` operator.

    Raises ValueError if ``values`` is empty.
    """
    if key is None:
        key = lambda x: x
    length = len(values)
    if not length:
        raise ValueError('median() requires at least one value')
    # Floor division keeps the indices integral on Python 3 as well as 2.
    middle = length // 2
    if length % 2:
        median_indeces = [middle]
    else:
        median_indeces = [middle - 1, middle]

    values = sorted(values, key=key)
    return sum(key(values[i]) for i in median_indeces) / len(median_indeces)

def test_median():
assert 6 == median([8,7,9,1,2,6,3]) # odd-sized list
Expand All @@ -117,11 +128,13 @@ def histogram(stream, options):
if options.min:
min_v = Decimal(options.min)
else:
min_v = min(data)
min_v = min(data, key=lambda x: x.value)
min_v = min_v.value
if options.max:
max_v = Decimal(options.max)
else:
max_v = max(data)
max_v = max(data, key=lambda x: x.value)
max_v = max_v.value

if not max_v > min_v:
raise ValueError('max must be > min. max:%s min:%s' % (max_v, min_v))
Expand Down Expand Up @@ -163,18 +176,18 @@ def histogram(stream, options):
samples = 0
mvsd = MVSD()
accepted_data = []
for value in data:
samples +=1
for record in data:
samples += record.count
if options.mvsd:
mvsd.add(value)
accepted_data.append(value)
mvsd.add(record.value, record.count)
accepted_data.append(record)
# find the bucket this goes in
if value < min_v or value > max_v:
skipped +=1
if record.value < min_v or record.value > max_v:
skipped += record.count
continue
for bucket_postion, boundary in enumerate(boundaries):
if value <= boundary:
bucket_counts[bucket_postion] +=1
if record.value <= boundary:
bucket_counts[bucket_postion] += record.count
break

# auto-pick the hash scale
Expand All @@ -185,7 +198,7 @@ def histogram(stream, options):
if skipped:
print "# %d value%s outside of min/max" % (skipped, skipped > 1 and 's' or '')
if options.mvsd:
print "# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd.mean(), mvsd.var(), mvsd.sd(), median(accepted_data))
print "# Mean = %f; Variance = %f; SD = %f; Median %f" % (mvsd.mean(), mvsd.var(), mvsd.sd(), median(accepted_data, key=lambda x: x.value))
print "# each ∎ represents a count of %d" % bucket_scale
bucket_min = min_v
bucket_max = min_v
Expand All @@ -202,6 +215,8 @@ def histogram(stream, options):
if __name__ == "__main__":
parser = OptionParser()
parser.usage = "cat data | %prog [options]"
parser.add_option("-a", "--agg", dest="agg", default=False, action="store_true",
help="Two column input format, space seperated with key<space>value")
parser.add_option("-m", "--min", dest="min",
help="minimum value for graph")
parser.add_option("-x", "--max", dest="max",
Expand All @@ -219,5 +234,5 @@ def histogram(stream, options):
parser.print_usage()
print "for more help use --help"
sys.exit(1)
histogram(load_stream(sys.stdin), options)
histogram(load_stream(sys.stdin, options.agg), options)

0 comments on commit 7a761b4

Please sign in to comment.