Commit
Updated requirements, extract_ir perf improvements, benchmark reporting
* Using precise pip package versions in requirements.txt, to avoid
  unwanted upgrades.

* Fix in extract_ir (authored by [email protected]), speedup in
  extraction by a few orders of magnitude.

* Tools to post-process JSON benchmark reports when benchmarks collect
  perf counters; this helps validate hypotheses about
  improvements/regressions (benchmarks:
  http://github.com/google/benchmark)
mtrofin committed May 26, 2021
1 parent 9bf0460 commit 345011d
Showing 5 changed files with 392 additions and 12 deletions.
187 changes: 187 additions & 0 deletions compiler_opt/tools/benchmark_report.py
@@ -0,0 +1,187 @@
# coding=utf-8
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Analysis for benchmark results.json."""

import collections
import math
import statistics

from typing import Any
from typing import Dict
from typing import Iterable
from typing import List
from typing import Tuple

# For each benchmark, and for each counter, capture the recorded values.
PerBenchmarkResults = Dict[str, Dict[str, List[float]]]

# Benchmark data, as captured by the benchmark json output: a dictionary from
# benchmark names to a list of run results. Each run result is a dictionary of
# key-value pairs, e.g. counter name - value.
BenchmarkRunResults = Dict[str, List[Dict[str, Any]]]

# A comparison per benchmark, per counter: a tuple of (improvement as a
# fraction of the base, base stdev/geomean, experiment stdev/geomean).
ABComparison = Dict[str, Dict[str, Tuple[float, float, float]]]


def _geomean(data: List[float]):
  return math.exp(sum([math.log(x) for x in data]) / len(data))


def _stdev(data: List[float]):
  assert data
  return 0.0 if len(data) == 1 else statistics.stdev(data)


class BenchmarkReport:
  """The counter values collected for benchmarks in a benchmark suite."""

  def __init__(self, suite_name: str, json_data: BenchmarkRunResults,
               counter_names: Iterable[str]):
    self._suite_name = suite_name
    self._load_values(json_data, counter_names)

  def suite_name(self):
    return self._suite_name

  def values(self):
    return self._values

  def names(self):
    return self._names

  def counters(self):
    return self._counters

  def raw_measurements(self):
    return self._raw_measurements

  def counter_means(self, benchmark: str, counter: str) -> Tuple[float, float]:
    if counter not in self.counters():
      raise ValueError('unknown counter')
    if benchmark not in self.names():
      raise ValueError('unknown benchmark')
    return (_geomean(self._values[benchmark][counter]),
            _stdev(self._values[benchmark][counter]))

  def zero_counters(self):
    ret = set()
    for name in self.names():
      for counter in self.values()[name]:
        if 0.0 in self.values()[name][counter]:
          ret.add((name, counter))
    return frozenset(ret)

  def large_variation_counters(self, variation: float):
    ret = set()
    for name in self.names():
      for counter in self.values()[name]:
        vals = self.values()[name][counter]
        swing = _stdev(vals) / _geomean(vals)
        if swing > variation:
          ret.add((name, counter, swing))
    return frozenset(ret)

  def _load_values(self, data: BenchmarkRunResults,
                   names: Iterable[str]) -> PerBenchmarkResults:
    """Organize json values per-benchmark, per counter.
    Args:
      data: json data
      names: perf counter names
    Returns:
      benchmark data organized per-benchmark, per-counter name.
    """
    runs = data['benchmarks']
    self._values = collections.defaultdict(
        lambda: collections.defaultdict(list))
    self._raw_measurements = collections.defaultdict(
        lambda: collections.defaultdict(list))
    self._counters = set()
    self._names = set()

    for r in runs:
      benchmark_name = r['name']
      for counter in names:
        value = float(r[counter])
        iters = float(r['iterations'])
        self._raw_measurements[benchmark_name][counter].append(value * iters)
        self._values[benchmark_name][counter].append(value)
        self._counters.add(counter)
      self._names.add(benchmark_name)
    self._counters = frozenset(self._counters)
    self._names = frozenset(self._names)

class BenchmarkComparison:
  """Analysis of 2 benchmark runs."""

  def __init__(self, base_report: BenchmarkReport, exp_report: BenchmarkReport):
    if base_report.suite_name() != exp_report.suite_name():
      raise ValueError('cannot compare different suites')
    if set(base_report.names()) != set(exp_report.names()):
      raise ValueError('suite runs have different benchmark names')
    if set(base_report.counters()) != set(exp_report.counters()):
      raise ValueError(
          'counter names are different between base and experiment')

    self._base = base_report
    self._exp = exp_report

  def suite_name(self):
    return self._base.suite_name()

  def summarize(self) -> ABComparison:
    """Summarize the results from two runs (base/experiment).
    Returns:
      A per-benchmark, per-counter summary of the improvement/regression
      between the 2 runs, as fractions (e.g. 0.05 means 5%).
    """
    base_results = self._base.values()
    exp_results = self._exp.values()

    ret = {}
    for bname in base_results:
      ret[bname] = {}
      for counter in base_results[bname]:
        base_vals = base_results[bname][counter]
        exp_vals = exp_results[bname][counter]
        base_geomean = _geomean(base_vals)
        exp_geomean = _geomean(exp_vals)
        improvement = 1 - exp_geomean / base_geomean
        base_stdev = _stdev(base_vals)
        exp_stdev = _stdev(exp_vals)
        ret[bname][counter] = (improvement, base_stdev / base_geomean,
                               exp_stdev / exp_geomean)
    return ret

  def names(self):
    return self._base.names()

  def counters(self):
    return self._base.counters()

  def total_improvement(self, counter: str):
    assert counter in self.counters()
    logsum = 0
    # We look at the geomean of the improvement for each benchmark.
    for bname in self.names():
      b_geomean, _ = self._base.counter_means(bname, counter)
      e_geomean, _ = self._exp.counter_means(bname, counter)
      logsum += math.log(e_geomean / b_geomean)
    return 1.0 - math.exp(logsum / len(self.names()))
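
Not part of the commit: a minimal usage sketch of the classes above, assuming two google/benchmark JSON reports already exist on disk; the file paths, suite name, and the CYCLES counter are placeholders.

import json

from compiler_opt.tools import benchmark_report

# Load the base and experiment reports (hypothetical paths).
with open('/tmp/base_report.json') as b, open('/tmp/exp_report.json') as e:
  base = benchmark_report.BenchmarkReport('my_suite', json.load(b), ['CYCLES'])
  exp = benchmark_report.BenchmarkReport('my_suite', json.load(e), ['CYCLES'])

comparison = benchmark_report.BenchmarkComparison(base, exp)

# Per-benchmark, per-counter tuples of (improvement, base stdev/geomean,
# experiment stdev/geomean).
for name, counters in comparison.summarize().items():
  print(name, counters)

# Aggregate (geomean) improvement across all benchmarks, for one counter.
print(comparison.total_improvement('CYCLES'))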
79 changes: 79 additions & 0 deletions compiler_opt/tools/benchmark_report_converter.py
@@ -0,0 +1,79 @@
# coding=utf-8
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""Convert benchmark results.json to csv.
To run:
python3 compiler_opt/tools/benchmark_report_counter.py \
--base=/tmp/base_report.json \
--exp=/tmp/exp_report.json \
--counters=INSTRUCTIONS \
--counters=CYCLES \
--output=/tmp/summary.csv
optionally, add --suite_name=<name of benchmark>, if batch-processing multiple
benchmarks' reports.
Assuming /tmp/{base|exp}_report.json were produced from benchmark runs, which
were asked to collect the counters named INSTRUCTIONS and CYCLES.
"""

import csv
import json

from typing import Sequence

from absl import app
from absl import flags

import tensorflow.compat.v2 as tf

from compiler_opt.tools import benchmark_report

flags.DEFINE_string('suite_name', 'benchmark_suite',
                    'The name of the benchmark suite (for reporting).')
flags.DEFINE_string('base', None,
                    'JSON report produced by the base benchmark run.')
flags.DEFINE_string('exp', None,
                    'JSON report produced by the experiment benchmark run.')
flags.DEFINE_string('output', 'reports.csv', 'CSV output')
flags.DEFINE_multi_string(
    'counters', None,
    'Counter names. Should match exactly the names used when running the '
    'benchmark.')

FLAGS = flags.FLAGS


def main(argv: Sequence[str]) -> None:
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  with tf.io.gfile.GFile(FLAGS.base, 'r') as b:
    with tf.io.gfile.GFile(FLAGS.exp, 'r') as e:
      b = benchmark_report.BenchmarkReport(FLAGS.suite_name, json.load(b),
                                           FLAGS.counters)
      e = benchmark_report.BenchmarkReport(FLAGS.suite_name, json.load(e),
                                           FLAGS.counters)
      comparison = benchmark_report.BenchmarkComparison(b, e)
      summary = comparison.summarize()
  with tf.io.gfile.GFile(FLAGS.output, 'w+') as o:
    co = csv.writer(o)
    for bm in summary:
      for c in summary[bm]:
        co.writerow([bm, c] + list(summary[bm][c]))


if __name__ == '__main__':
  app.run(main)
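
Not part of the commit: for context, one plausible way to produce the input reports with a google/benchmark binary. --benchmark_out and --benchmark_out_format are standard google/benchmark flags; --benchmark_perf_counters is assumed here to be the perf-counter collection flag, and the binary name/paths are placeholders.

./bm_binary --benchmark_perf_counters=INSTRUCTIONS,CYCLES \
  --benchmark_out_format=json --benchmark_out=/tmp/base_report.json
# Repeat with the experiment build of the same benchmarks to produce
# /tmp/exp_report.json, then feed both files to benchmark_report_converter.py.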
117 changes: 117 additions & 0 deletions compiler_opt/tools/benchmark_report_test.py
@@ -0,0 +1,117 @@
# coding=utf-8
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for compiler_opt.tools.benchmark_report_converter."""

from absl.testing import absltest

from compiler_opt.tools import benchmark_report


base_data = {
    'benchmarks': [
        {
            'PerfCounter_0': 10,
            'PerfCounter_1': 20,
            'iterations': 10,
            'name': 'BM_A',
        },
        {
            'PerfCounter_0': 11,
            'PerfCounter_1': 19,
            'iterations': 11,
            'name': 'BM_A',
        },
        {
            'PerfCounter_0': 60,
            'PerfCounter_1': 50,
            'iterations': 15,
            'name': 'BM_B',
        },
    ]
}

exp_data = {
    'benchmarks': [
        {
            'PerfCounter_0': 9,
            'PerfCounter_1': 11,
            'iterations': 11,
            'name': 'BM_A',
        },
        {
            'PerfCounter_0': 8,
            'PerfCounter_1': 10,
            'iterations': 8,
            'name': 'BM_A',
        },
        {
            'PerfCounter_0': 62,
            'PerfCounter_1': 54,
            'iterations': 14,
            'name': 'BM_B',
        },
    ]
}


class BenchmarkReportConverterTest(absltest.TestCase):

  def test_loading(self):
    report = benchmark_report.BenchmarkReport(
        'foo', base_data, ['PerfCounter_0', 'PerfCounter_1'])
    self.assertEqual(
        report.values(), {
            'BM_A': {
                'PerfCounter_0': [10, 11],
                'PerfCounter_1': [20, 19]
            },
            'BM_B': {
                'PerfCounter_0': [60],
                'PerfCounter_1': [50],
            }
        })
    self.assertSetEqual(report.names(), set(['BM_A', 'BM_B']))
    self.assertSetEqual(report.counters(),
                        set(['PerfCounter_0', 'PerfCounter_1']))
    self.assertEqual(
        report.counter_means('BM_A', 'PerfCounter_0'),
        (10.488088481701517, 0.7071067811865476))

  def test_summarize_results(self):
    b_values = benchmark_report.BenchmarkReport(
        'foo', base_data, ['PerfCounter_0', 'PerfCounter_1'])
    e_values = benchmark_report.BenchmarkReport(
        'foo', exp_data, ['PerfCounter_0', 'PerfCounter_1'])
    summary = benchmark_report.BenchmarkComparison(b_values, e_values)
    self.assertDictEqual(
        summary.summarize(), {
            'BM_A': {
                'PerfCounter_0': (0.19096016504410973, 0.0674199862463242,
                                  0.08333333333333334),
                'PerfCounter_1':
                    (0.4619724131510293, 0.0362738125055006, 0.0674199862463242)
            },
            'BM_B': {
                'PerfCounter_0': (-0.03333333333333366, 0.0, 0.0),
                'PerfCounter_1': (-0.0800000000000003, 0.0, 0.0)
            }
        })
    self.assertEqual(
        summary.total_improvement('PerfCounter_0'), 0.08566536243319522)


if __name__ == '__main__':
  absltest.main()
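
Not part of the commit: the expected constants above follow directly from the helpers in benchmark_report.py. A quick standalone check for BM_A / PerfCounter_0 (base values [10, 11], experiment values [9, 8]):

import math
import statistics

def geomean(xs):
  return math.exp(sum(math.log(x) for x in xs) / len(xs))

base_vals = [10.0, 11.0]
exp_vals = [9.0, 8.0]

print(geomean(base_vals))           # 10.488088481701517, as in test_loading
print(statistics.stdev(base_vals))  # 0.7071067811865476
# Improvement reported by summarize(): 1 - exp_geomean / base_geomean.
print(1 - geomean(exp_vals) / geomean(base_vals))  # ~0.19096, i.e. ~19% fewer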