Skip to content

Commit

Permalink
Merge pull request #19 from jehiah/bar_chart_percentage_19
Browse files Browse the repository at this point in the history
bar chart/histogram percentage
  • Loading branch information
randyau committed Dec 19, 2014
2 parents 7a761b4 + ed3b010 commit 5b02092
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 50 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
build
build
dist
49 changes: 19 additions & 30 deletions README.markdown
Original file line number Diff line number Diff line change
Expand Up @@ -18,20 +18,21 @@ A utility that parses input data points and outputs a text histogram

Example:

$ cat /tmp/data | histogram.py
# NumSamples = 29; Max = 10.00; Min = 1.00
# Mean = 4.379310; Variance = 5.131986; SD = 2.265389
# each * represents a count of 1
1.0000 - 1.9000 [ 1]: *
1.9000 - 2.8000 [ 5]: *****
2.8000 - 3.7000 [ 8]: ********
3.7000 - 4.6000 [ 3]: ***
4.6000 - 5.5000 [ 4]: ****
5.5000 - 6.4000 [ 2]: **
6.4000 - 7.3000 [ 3]: ***
7.3000 - 8.2000 [ 1]: *
8.2000 - 9.1000 [ 1]: *
9.1000 - 10.0000 [ 1]: *
$ cat /tmp/data | histogram.py --percentage --max=1000 --min=0
# NumSamples = 60; Min = 0.00; Max = 1000.00
# 1 value outside of min/max
# Mean = 332.666667; Variance = 471056.055556; SD = 686.335236; Median 191.000000
# each ∎ represents a count of 1
0.0000 - 100.0000 [ 28]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎ (46.67%)
100.0000 - 200.0000 [ 2]: ∎∎ (3.33%)
200.0000 - 300.0000 [ 2]: ∎∎ (3.33%)
300.0000 - 400.0000 [ 8]: ∎∎∎∎∎∎∎∎ (13.33%)
400.0000 - 500.0000 [ 8]: ∎∎∎∎∎∎∎∎ (13.33%)
500.0000 - 600.0000 [ 7]: ∎∎∎∎∎∎∎ (11.67%)
600.0000 - 700.0000 [ 3]: ∎∎∎ (5.00%)
700.0000 - 800.0000 [ 0]: (0.00%)
800.0000 - 900.0000 [ 1]: ∎ (1.67%)
900.0000 - 1000.0000 [ 0]: (0.00%)

ninety_five_percent.py
----------------------
Expand Down Expand Up @@ -67,22 +68,10 @@ bar_chart.py

Generate an ASCII bar chart for input data (this is like a visualization of `uniq -c`)

$ cat data | bar_chart.py --sort-keys
# each * represents a count of 2
19:0 [ 1]
19:1 [ 24] ************
19:2 [ 3] *
19:3 [ 9] ****
19:4 [ 5] **
19:5 [ 41] ********************
20:0 [ 115] *********************************************************
20:1 [ 181] ******************************************************************************************
20:2 [ 136] ********************************************************************
20:3 [ 155] *****************************************************************************
20:4 [ 150] ***************************************************************************
20:5 [ 79] ***************************************
21:0 [ 64] ********************************
21:1 [ 8] ****
$ cat data | bar_chart.py
# each ∎ represents a count of 1. total 63
14:40 [ 49] ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎
14:41 [ 14] ∎∎∎∎∎∎∎∎∎∎∎∎∎∎

bar_chart.py also supports ingesting aggregated values. Simply provide a two-column input of key<space>value:

Expand Down
27 changes: 18 additions & 9 deletions data_hacks/bar_chart.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2010 bit.ly
# Copyright 2010 Bitly
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
Expand All @@ -18,7 +18,7 @@
"""
Generate an ascii bar chart for input data
http://github.com/bitly/data_hacks
https://github.com/bitly/data_hacks
"""
import sys
import math
Expand All @@ -38,13 +38,17 @@ def load_stream(input_stream):
yield clean_line

def run(input_stream, options):
data = defaultdict(lambda:0)
data = defaultdict(int)
total = 0
for row in input_stream:
if options.agg_values:
kv = row.replace('\t', ' ').split(' ',2);
data[kv[0]]+= int(kv[1])
value = int(kv[1])
data[kv[0]] += value
total += value
else:
data[row]+=1
data[row] += 1
total += 1

if not data:
print "Error: no data"
Expand All @@ -57,7 +61,7 @@ def run(input_stream, options):
scale = int(math.ceil(float(max_value) / value_characters))
scale = max(1, scale)

print "# each ∎ represents a count of %d" % scale
print "# each ∎ represents a count of %d. total %d" % (scale, total)

if options.sort_values:
data = [[value, key] for key, value in data.items()]
Expand All @@ -71,9 +75,12 @@ def run(input_stream, options):
else:
data.sort(key=lambda x: x[1], reverse=options.reverse_sort)

format = "%" + str(max_length) + "s [%6d] %s"
for value,key in data:
print format % (key[:max_length], value, (value / scale) * "∎")
str_format = "%" + str(max_length) + "s [%6d] %s%s"
percentage = ""
for value, key in data:
if options.percentage:
percentage = " (%0.2f%%)" % (100 * Decimal(value) / Decimal(total))
print str_format % (key[:max_length], value, (value / scale) * "∎", percentage)

if __name__ == "__main__":
parser = OptionParser()
Expand All @@ -88,6 +95,8 @@ def run(input_stream, options):
help="reverse the sort")
parser.add_option("-n", "--numeric-sort", dest="numeric_sort", default=False, action="store_true",
help="sort keys by numeric sequencing")
parser.add_option("-p", "--percentage", dest="percentage", default=False, action="store_true",
help="List percentage for each bar")

(options, args) = parser.parse_args()

Expand Down
14 changes: 11 additions & 3 deletions data_hacks/histogram.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright 2010 bit.ly
# Copyright 2010 Bitly
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
Expand All @@ -21,7 +21,7 @@
This is a loose port to python of the Perl version at
http://www.pandamatak.com/people/anand/xfer/histo
http://github.com/bitly/data_hacks
https://github.com/bitly/data_hacks
"""

import sys
Expand Down Expand Up @@ -202,14 +202,18 @@ def histogram(stream, options):
print "# each ∎ represents a count of %d" % bucket_scale
bucket_min = min_v
bucket_max = min_v
percentage = ""
format_string = options.format + ' - ' + options.format + ' [%6d]: %s%s'
for bucket in range(buckets):
bucket_min = bucket_max
bucket_max = boundaries[bucket]
bucket_count = bucket_counts[bucket]
star_count = 0
if bucket_count:
star_count = bucket_count / bucket_scale
print '%10.4f - %10.4f [%6d]: %s' % (bucket_min, bucket_max, bucket_count, '∎' * star_count)
if options.percentage:
percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) / Decimal(samples))
print format_string % (bucket_min, bucket_max, bucket_count, '∎' * star_count, percentage)


if __name__ == "__main__":
Expand All @@ -227,6 +231,10 @@ def histogram(stream, options):
help="Comma seperated list of bucket edges for the histogram")
parser.add_option("--no-mvsd", dest="mvsd", action="store_false", default=True,
help="Disable the calculation of Mean, Variance and SD (improves performance)")
parser.add_option("-f", "--bucket-format", dest="format", default="%10.4f",
help="format for bucket numbers")
parser.add_option("-p", "--percentage", dest="percentage", default=False, action="store_true",
help="List percentage for each bar")

(options, args) = parser.parse_args()
if sys.stdin.isatty():
Expand Down
4 changes: 2 additions & 2 deletions data_hacks/ninety_five_percent.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
#
# Copyright 2010 bit.ly
# Copyright 2010 Bitly
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
Expand All @@ -17,7 +17,7 @@
"""
Calculate the 95% time from a list of times given on stdin
http://github.com/bitly/data_hacks
https://github.com/bitly/data_hacks
"""

import sys
Expand Down
4 changes: 2 additions & 2 deletions data_hacks/run_for.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
#
# Copyright 2010 bit.ly
# Copyright 2010 Bitly
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
Expand All @@ -17,7 +17,7 @@
"""
Pass through data for a specified amount of time
http://github.com/bitly/data_hacks
https://github.com/bitly/data_hacks
"""

import time
Expand Down
4 changes: 2 additions & 2 deletions data_hacks/sample.py
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python
#
# Copyright 2010 bit.ly
# Copyright 2010 Bitly
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License. You may obtain
Expand All @@ -17,7 +17,7 @@
"""
Pass through a sampled percentage of data
http://github.com/bitly/data_hacks
https://github.com/bitly/data_hacks
"""

import sys
Expand Down
2 changes: 1 addition & 1 deletion setup.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
description='Command line utilities for data analysis',
author='Jehiah Czebotar',
author_email='[email protected]',
url='http://github.com/bitly/data_analysis',
url='https://github.com/bitly/data_hacks',
classifiers=[
'Development Status :: 4 - Beta',
'Programming Language :: Python',
Expand Down

0 comments on commit 5b02092

Please sign in to comment.