Commit
Updated requirements, extract_ir perf improvements, benchmark reporting
* Using precise pip package versions in requirements.txt, to avoid
  unwanted upgrades.

* Fix in extract_ir (authored by [email protected]), speedup in
  extraction by a few orders of magnitude.

* Tools to post-process JSON benchmark reports when benchmarks collect
  perf counters; this helps validate hypotheses about
  improvements/regressions (benchmarks:
  http://github.com/google/benchmark)
mtrofin committed May 26, 2021
1 parent 9bf0460 commit 345011d
Showing 5 changed files with 392 additions and 12 deletions.
187 changes: 187 additions & 0 deletions compiler_opt/tools/benchmark_report.py
@@ -0,0 +1,187 @@
# coding=utf-8
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Analysis for benchmark results.json."""

import collections
import math
import statistics

from typing import Any
from typing import Dict
from typing import Iterable
from typing import List
from typing import Tuple

# For each benchmark, and for each counter, capture the recorded values.
PerBenchmarkResults = Dict[str, Dict[str, List[float]]]

# Benchmark data, as captured by the benchmark json output: a dictionary from
# benchmark names to a list of run results. Each run result is a dictionary of
# key-value pairs, e.g. counter name - value.
BenchmarkRunResults = Dict[str, List[Dict[str, Any]]]

# A comparison per benchmark, per counter: a tuple of (improvement as a
# fraction of the base, base stdev/geomean, experiment stdev/geomean).
ABComparison = Dict[str, Dict[str, Tuple[float, float, float]]]


def _geomean(data: List[float]):
  return math.exp(sum([math.log(x) for x in data]) / len(data))


def _stdev(data: List[float]):
  assert data
  return 0.0 if len(data) == 1 else statistics.stdev(data)


class BenchmarkReport:
  """The counter values collected for benchmarks in a benchmark suite."""

  def __init__(self, suite_name: str, json_data: BenchmarkRunResults,
               counter_names: Iterable[str]):
    self._suite_name = suite_name
    self._load_values(json_data, counter_names)

  def suite_name(self):
    return self._suite_name

  def values(self):
    return self._values

  def names(self):
    return self._names

  def counters(self):
    return self._counters

  def raw_measurements(self):
    return self._raw_measurements

  def counter_means(self, benchmark: str, counter: str) -> Tuple[float, float]:
    if counter not in self.counters():
      raise ValueError('unknown counter')
    if benchmark not in self.names():
      raise ValueError('unknown benchmark')
    return (_geomean(self._values[benchmark][counter]),
            _stdev(self._values[benchmark][counter]))

  def zero_counters(self):
    ret = set()
    for name in self.names():
      for counter in self.values()[name]:
        if 0.0 in self.values()[name][counter]:
          ret.add((name, counter))
    return frozenset(ret)

  def large_variation_counters(self, variation: float):
    ret = set()
    for name in self.names():
      for counter in self.values()[name]:
        vals = self.values()[name][counter]
        swing = _stdev(vals) / _geomean(vals)
        if swing > variation:
          ret.add((name, counter, swing))
    return frozenset(ret)

  def _load_values(self, data: BenchmarkRunResults,
                   names: Iterable[str]) -> PerBenchmarkResults:
    """Organize json values per-benchmark, per counter.
    Args:
      data: json data
      names: perf counter names
    Returns:
      benchmark data organized per-benchmark, per-counter name.
    """
    runs = data['benchmarks']
    self._values = collections.defaultdict(
        lambda: collections.defaultdict(list))
    self._raw_measurements = collections.defaultdict(
        lambda: collections.defaultdict(list))
    self._counters = set()
    self._names = set()

    for r in runs:
      benchmark_name = r['name']
      for counter in names:
        value = float(r[counter])
        iters = float(r['iterations'])
        self._raw_measurements[benchmark_name][counter].append(value * iters)
        self._values[benchmark_name][counter].append(value)
        self._counters.add(counter)
      self._names.add(benchmark_name)
    self._counters = frozenset(self._counters)
    self._names = frozenset(self._names)

class BenchmarkComparison:
  """Analysis of 2 benchmark runs."""

  def __init__(self, base_report: BenchmarkReport, exp_report: BenchmarkReport):
    if base_report.suite_name() != exp_report.suite_name():
      raise ValueError('cannot compare different suites')
    if set(base_report.names()) != set(exp_report.names()):
      raise ValueError('suite runs have different benchmark names')
    if set(base_report.counters()) != set(exp_report.counters()):
      raise ValueError(
          'counter names are different between base and experiment')

    self._base = base_report
    self._exp = exp_report

  def suite_name(self):
    return self._base.suite_name()

  def summarize(self) -> ABComparison:
    """Summarize the results from two runs (base/experiment).
    Returns:
      A per-benchmark, per-counter summary of the improvement/regression
      between the 2 runs, as fractions (e.g. 0.05 means 5%).
    """
    base_results = self._base.values()
    exp_results = self._exp.values()

    ret = {}
    for bname in base_results:
      ret[bname] = {}
      for counter in base_results[bname]:
        base_vals = base_results[bname][counter]
        exp_vals = exp_results[bname][counter]
        base_geomean = _geomean(base_vals)
        exp_geomean = _geomean(exp_vals)
        improvement = 1 - exp_geomean / base_geomean
        base_stdev = _stdev(base_vals)
        exp_stdev = _stdev(exp_vals)
        ret[bname][counter] = (improvement, base_stdev / base_geomean,
                               exp_stdev / exp_geomean)
    return ret

  def names(self):
    return self._base.names()

  def counters(self):
    return self._base.counters()

  def total_improvement(self, counter: str):
    assert counter in self.counters()
    logsum = 0
    # We look at the geomean of the improvement for each benchmark.
    for bname in self.names():
      b_geomean, _ = self._base.counter_means(bname, counter)
      e_geomean, _ = self._exp.counter_means(bname, counter)
      logsum += math.log(e_geomean / b_geomean)
    return 1.0 - math.exp(logsum / len(self.names()))
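
Not part of the commit: a minimal usage sketch of the classes above, assuming two google/benchmark JSON reports already exist on disk; the file paths, suite name, and the CYCLES counter are placeholders.

import json

from compiler_opt.tools import benchmark_report

# Load the base and experiment reports (hypothetical paths).
with open('/tmp/base_report.json') as b, open('/tmp/exp_report.json') as e:
  base = benchmark_report.BenchmarkReport('my_suite', json.load(b), ['CYCLES'])
  exp = benchmark_report.BenchmarkReport('my_suite', json.load(e), ['CYCLES'])

comparison = benchmark_report.BenchmarkComparison(base, exp)

# Per-benchmark, per-counter tuples of (improvement, base stdev/geomean,
# experiment stdev/geomean).
for name, counters in comparison.summarize().items():
  print(name, counters)

# Aggregate (geomean) improvement across all benchmarks, for one counter.
print(comparison.total_improvement('CYCLES'))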
79 changes: 79 additions & 0 deletions compiler_opt/tools/benchmark_report_converter.py
@@ -0,0 +1,79 @@
# coding=utf-8
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

r"""Convert benchmark results.json to csv.
To run:
python3 compiler_opt/tools/benchmark_report_counter.py \
--base=/tmp/base_report.json \
--exp=/tmp/exp_report.json \
--counters=INSTRUCTIONS \
--counters=CYCLES \
--output=/tmp/summary.csv
optionally, add --suite_name=<name of benchmark>, if batch-processing multiple
benchmarks' reports.
Assuming /tmp/{base|exp}_report.json were produced from benchmark runs, which
were asked to collect the counters named INSTRUCTIONS and CYCLES.
"""

import csv
import json

from typing import Sequence

from absl import app
from absl import flags

import tensorflow.compat.v2 as tf

from compiler_opt.tools import benchmark_report

flags.DEFINE_string('suite_name', 'benchmark_suite',
                    'The name of the benchmark suite (for reporting).')
flags.DEFINE_string('base', None,
                    'JSON report produced by the base benchmark run.')
flags.DEFINE_string('exp', None,
                    'JSON report produced by the experiment benchmark run.')
flags.DEFINE_string('output', 'reports.csv', 'CSV output')
flags.DEFINE_multi_string(
    'counters', None,
    'Counter names. Should match exactly the names used when running the '
    'benchmark.')

FLAGS = flags.FLAGS


def main(argv: Sequence[str]) -> None:
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')
  with tf.io.gfile.GFile(FLAGS.base, 'r') as b:
    with tf.io.gfile.GFile(FLAGS.exp, 'r') as e:
      b = benchmark_report.BenchmarkReport(FLAGS.suite_name, json.load(b),
                                           FLAGS.counters)
      e = benchmark_report.BenchmarkReport(FLAGS.suite_name, json.load(e),
                                           FLAGS.counters)
      comparison = benchmark_report.BenchmarkComparison(b, e)
      summary = comparison.summarize()
  with tf.io.gfile.GFile(FLAGS.output, 'w+') as o:
    co = csv.writer(o)
    for bm in summary:
      for c in summary[bm]:
        co.writerow([bm, c] + list(summary[bm][c]))


if __name__ == '__main__':
  app.run(main)
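
Not part of the commit: for context, one plausible way to produce the input reports with a google/benchmark binary. --benchmark_out and --benchmark_out_format are standard google/benchmark flags; --benchmark_perf_counters is assumed here to be the perf-counter collection flag, and the binary name/paths are placeholders.

./bm_binary --benchmark_perf_counters=INSTRUCTIONS,CYCLES \
  --benchmark_out_format=json --benchmark_out=/tmp/base_report.json
# Repeat with the experiment build of the same benchmarks to produce
# /tmp/exp_report.json, then feed both files to benchmark_report_converter.py.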
117 changes: 117 additions & 0 deletions compiler_opt/tools/benchmark_report_test.py
@@ -0,0 +1,117 @@
# coding=utf-8
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Tests for compiler_opt.tools.benchmark_report_converter."""

from absl.testing import absltest

from compiler_opt.tools import benchmark_report


base_data = {
    'benchmarks': [
        {
            'PerfCounter_0': 10,
            'PerfCounter_1': 20,
            'iterations': 10,
            'name': 'BM_A',
        },
        {
            'PerfCounter_0': 11,
            'PerfCounter_1': 19,
            'iterations': 11,
            'name': 'BM_A',
        },
        {
            'PerfCounter_0': 60,
            'PerfCounter_1': 50,
            'iterations': 15,
            'name': 'BM_B',
        },
    ]
}

exp_data = {
    'benchmarks': [
        {
            'PerfCounter_0': 9,
            'PerfCounter_1': 11,
            'iterations': 11,
            'name': 'BM_A',
        },
        {
            'PerfCounter_0': 8,
            'PerfCounter_1': 10,
            'iterations': 8,
            'name': 'BM_A',
        },
        {
            'PerfCounter_0': 62,
            'PerfCounter_1': 54,
            'iterations': 14,
            'name': 'BM_B',
        },
    ]
}


class BenchmarkReportConverterTest(absltest.TestCase):

  def test_loading(self):
    report = benchmark_report.BenchmarkReport(
        'foo', base_data, ['PerfCounter_0', 'PerfCounter_1'])
    self.assertEqual(
        report.values(), {
            'BM_A': {
                'PerfCounter_0': [10, 11],
                'PerfCounter_1': [20, 19]
            },
            'BM_B': {
                'PerfCounter_0': [60],
                'PerfCounter_1': [50],
            }
        })
    self.assertSetEqual(report.names(), set(['BM_A', 'BM_B']))
    self.assertSetEqual(report.counters(),
                        set(['PerfCounter_0', 'PerfCounter_1']))
    self.assertEqual(
        report.counter_means('BM_A', 'PerfCounter_0'),
        (10.488088481701517, 0.7071067811865476))

  def test_summarize_results(self):
    b_values = benchmark_report.BenchmarkReport(
        'foo', base_data, ['PerfCounter_0', 'PerfCounter_1'])
    e_values = benchmark_report.BenchmarkReport(
        'foo', exp_data, ['PerfCounter_0', 'PerfCounter_1'])
    summary = benchmark_report.BenchmarkComparison(b_values, e_values)
    self.assertDictEqual(
        summary.summarize(), {
            'BM_A': {
                'PerfCounter_0': (0.19096016504410973, 0.0674199862463242,
                                  0.08333333333333334),
                'PerfCounter_1':
                    (0.4619724131510293, 0.0362738125055006, 0.0674199862463242)
            },
            'BM_B': {
                'PerfCounter_0': (-0.03333333333333366, 0.0, 0.0),
                'PerfCounter_1': (-0.0800000000000003, 0.0, 0.0)
            }
        })
    self.assertEqual(
        summary.total_improvement('PerfCounter_0'), 0.08566536243319522)


if __name__ == '__main__':
  absltest.main()
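
Not part of the commit: the expected constants above follow directly from the helpers in benchmark_report.py. A quick standalone check for BM_A / PerfCounter_0 (base values [10, 11], experiment values [9, 8]):

import math
import statistics

def geomean(xs):
  return math.exp(sum(math.log(x) for x in xs) / len(xs))

base_vals = [10.0, 11.0]
exp_vals = [9.0, 8.0]

print(geomean(base_vals))           # 10.488088481701517, as in test_loading
print(statistics.stdev(base_vals))  # 0.7071067811865476
# Improvement reported by summarize(): 1 - exp_geomean / base_geomean.
print(1 - geomean(exp_vals) / geomean(base_vals))  # ~0.19096, i.e. ~19% fewer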