diff --git a/.gitignore b/.gitignore index c795b05..9d0b71a 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -build \ No newline at end of file +build +dist diff --git a/README.markdown b/README.markdown index 3bfbf8c..d68202f 100644 --- a/README.markdown +++ b/README.markdown @@ -18,20 +18,21 @@ A utility that parses input data points and outputs a text histogram Example: - $ cat /tmp/data | histogram.py - # NumSamples = 29; Max = 10.00; Min = 1.00 - # Mean = 4.379310; Variance = 5.131986; SD = 2.265389 - # each * represents a count of 1 - 1.0000 - 1.9000 [ 1]: * - 1.9000 - 2.8000 [ 5]: ***** - 2.8000 - 3.7000 [ 8]: ******** - 3.7000 - 4.6000 [ 3]: *** - 4.6000 - 5.5000 [ 4]: **** - 5.5000 - 6.4000 [ 2]: ** - 6.4000 - 7.3000 [ 3]: *** - 7.3000 - 8.2000 [ 1]: * - 8.2000 - 9.1000 [ 1]: * - 9.1000 - 10.0000 [ 1]: * + $ cat /tmp/data | histogram.py --percentage --max=1000 --min=0 + # NumSamples = 60; Min = 0.00; Max = 1000.00 + # 1 value outside of min/max + # Mean = 332.666667; Variance = 471056.055556; SD = 686.335236; Median 191.000000 + # each ∎ represents a count of 1 + 0.0000 - 100.0000 [ 28]: ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎ (46.67%) + 100.0000 - 200.0000 [ 2]: ∎∎ (3.33%) + 200.0000 - 300.0000 [ 2]: ∎∎ (3.33%) + 300.0000 - 400.0000 [ 8]: ∎∎∎∎∎∎∎∎ (13.33%) + 400.0000 - 500.0000 [ 8]: ∎∎∎∎∎∎∎∎ (13.33%) + 500.0000 - 600.0000 [ 7]: ∎∎∎∎∎∎∎ (11.67%) + 600.0000 - 700.0000 [ 3]: ∎∎∎ (5.00%) + 700.0000 - 800.0000 [ 0]: (0.00%) + 800.0000 - 900.0000 [ 1]: ∎ (1.67%) + 900.0000 - 1000.0000 [ 0]: (0.00%) ninety_five_percent.py ---------------------- @@ -67,22 +68,10 @@ bar_chart.py Generate an ascii bar chart for input data (this is like a visualization of `uniq -c`) - $ cat data | bar_chart.py --sort-keys - # each * represents a count of 2 - 19:0 [ 1] - 19:1 [ 24] ************ - 19:2 [ 3] * - 19:3 [ 9] **** - 19:4 [ 5] ** - 19:5 [ 41] ******************** - 20:0 [ 115] ********************************************************* - 20:1 [ 181] ****************************************************************************************** - 20:2 [ 136] ******************************************************************** - 20:3 [ 155] ***************************************************************************** - 20:4 [ 150] *************************************************************************** - 20:5 [ 79] *************************************** - 21:0 [ 64] ******************************** - 21:1 [ 8] **** + $ cat data | bar_chart.py + # each ∎ represents a count of 1. total 63 + 14:40 [ 49] ∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎∎ + 14:41 [ 14] ∎∎∎∎∎∎∎∎∎∎∎∎∎∎ bar_chart.py also supports ingesting aggregated values. Simply provide a two column input of keyvalue: diff --git a/data_hacks/bar_chart.py b/data_hacks/bar_chart.py index 3eaf6f2..3551860 100755 --- a/data_hacks/bar_chart.py +++ b/data_hacks/bar_chart.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright 2010 bit.ly +# Copyright 2010 Bitly # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -18,7 +18,7 @@ """ Generate an ascii bar chart for input data -http://github.com/bitly/data_hacks +https://github.com/bitly/data_hacks """ import sys import math @@ -38,13 +38,17 @@ def load_stream(input_stream): yield clean_line def run(input_stream, options): - data = defaultdict(lambda:0) + data = defaultdict(int) + total = 0 for row in input_stream: if options.agg_values: kv = row.replace('\t', ' ').split(' ',2); - data[kv[0]]+= int(kv[1]) + value = int(kv[1]) + data[kv[0]] += value + total += value else: - data[row]+=1 + data[row] += 1 + total += 1 if not data: print "Error: no data" @@ -57,7 +61,7 @@ def run(input_stream, options): scale = int(math.ceil(float(max_value) / value_characters)) scale = max(1, scale) - print "# each ∎ represents a count of %d" % scale + print "# each ∎ represents a count of %d. total %d" % (scale, total) if options.sort_values: data = [[value, key] for key, value in data.items()] @@ -71,9 +75,12 @@ def run(input_stream, options): else: data.sort(key=lambda x: x[1], reverse=options.reverse_sort) - format = "%" + str(max_length) + "s [%6d] %s" - for value,key in data: - print format % (key[:max_length], value, (value / scale) * "∎") + str_format = "%" + str(max_length) + "s [%6d] %s%s" + percentage = "" + for value, key in data: + if options.percentage: + percentage = " (%0.2f%%)" % (100 * Decimal(value) / Decimal(total)) + print str_format % (key[:max_length], value, (value / scale) * "∎", percentage) if __name__ == "__main__": parser = OptionParser() @@ -88,6 +95,8 @@ def run(input_stream, options): help="reverse the sort") parser.add_option("-n", "--numeric-sort", dest="numeric_sort", default=False, action="store_true", help="sort keys by numeric sequencing") + parser.add_option("-p", "--percentage", dest="percentage", default=False, action="store_true", + help="List percentage for each bar") (options, args) = parser.parse_args() diff --git a/data_hacks/histogram.py b/data_hacks/histogram.py index 1a5f200..72b3806 100755 --- a/data_hacks/histogram.py +++ b/data_hacks/histogram.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright 2010 bit.ly +# Copyright 2010 Bitly # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -21,7 +21,7 @@ This is a loose port to python of the Perl version at http://www.pandamatak.com/people/anand/xfer/histo -http://github.com/bitly/data_hacks +https://github.com/bitly/data_hacks """ import sys @@ -202,6 +202,8 @@ def histogram(stream, options): print "# each ∎ represents a count of %d" % bucket_scale bucket_min = min_v bucket_max = min_v + percentage = "" + format_string = options.format + ' - ' + options.format + ' [%6d]: %s%s' for bucket in range(buckets): bucket_min = bucket_max bucket_max = boundaries[bucket] @@ -209,7 +211,9 @@ def histogram(stream, options): star_count = 0 if bucket_count: star_count = bucket_count / bucket_scale - print '%10.4f - %10.4f [%6d]: %s' % (bucket_min, bucket_max, bucket_count, '∎' * star_count) + if options.percentage: + percentage = " (%0.2f%%)" % (100 * Decimal(bucket_count) / Decimal(samples)) + print format_string % (bucket_min, bucket_max, bucket_count, '∎' * star_count, percentage) if __name__ == "__main__": @@ -227,6 +231,10 @@ def histogram(stream, options): help="Comma seperated list of bucket edges for the histogram") parser.add_option("--no-mvsd", dest="mvsd", action="store_false", default=True, help="Disable the calculation of Mean, Variance and SD (improves performance)") + parser.add_option("-f", "--bucket-format", dest="format", default="%10.4f", + help="format for bucket numbers") + parser.add_option("-p", "--percentage", dest="percentage", default=False, action="store_true", + help="List percentage for each bar") (options, args) = parser.parse_args() if sys.stdin.isatty(): diff --git a/data_hacks/ninety_five_percent.py b/data_hacks/ninety_five_percent.py old mode 100644 new mode 100755 index 8459fbc..9a51432 --- a/data_hacks/ninety_five_percent.py +++ b/data_hacks/ninety_five_percent.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# Copyright 2010 bit.ly +# Copyright 2010 Bitly # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -17,7 +17,7 @@ """ Calculate the 95% time from a list of times given on stdin -http://github.com/bitly/data_hacks +https://github.com/bitly/data_hacks """ import sys diff --git a/data_hacks/run_for.py b/data_hacks/run_for.py old mode 100644 new mode 100755 index 4485a23..a8ea21f --- a/data_hacks/run_for.py +++ b/data_hacks/run_for.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# Copyright 2010 bit.ly +# Copyright 2010 Bitly # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -17,7 +17,7 @@ """ Pass through data for a specified amount of time -http://github.com/bitly/data_hacks +https://github.com/bitly/data_hacks """ import time diff --git a/data_hacks/sample.py b/data_hacks/sample.py old mode 100644 new mode 100755 index 744b562..c3296ab --- a/data_hacks/sample.py +++ b/data_hacks/sample.py @@ -1,6 +1,6 @@ #!/usr/bin/env python # -# Copyright 2010 bit.ly +# Copyright 2010 Bitly # # Licensed under the Apache License, Version 2.0 (the "License"); you may # not use this file except in compliance with the License. You may obtain @@ -17,7 +17,7 @@ """ Pass through a sampled percentage of data -http://github.com/bitly/data_hacks +https://github.com/bitly/data_hacks """ import sys diff --git a/setup.py b/setup.py old mode 100644 new mode 100755 index ea37d5a..d0fc881 --- a/setup.py +++ b/setup.py @@ -8,7 +8,7 @@ description='Command line utilities for data analysis', author='Jehiah Czebotar', author_email='jehiah@gmail.com', - url='http://github.com/bitly/data_analysis', + url='https://github.com/bitly/data_hacks', classifiers=[ 'Development Status :: 4 - Beta', 'Programming Language :: Python',