Analysis Software
Documentation for sPHENIX simulation software
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
BenchmarkTools.cpp
Go to the documentation of this file. Or view the newest version in sPHENIX GitHub for file BenchmarkTools.cpp
1 // This file is part of the Acts project.
2 //
3 // Copyright (C) 2020 CERN for the benefit of the Acts project
4 //
5 // This Source Code Form is subject to the terms of the Mozilla Public
6 // License, v. 2.0. If a copy of the MPL was not distributed with this
7 // file, You can obtain one at http://mozilla.org/MPL/2.0/.
8 
9 #include <boost/test/data/test_case.hpp>
10 #include <boost/test/unit_test.hpp>
11 
13 
15 
16 #include <cmath>
17 #include <complex>
18 #include <iostream>
19 #include <sstream>
20 #include <tuple>
21 
22 namespace Acts {
23 namespace Test {
24 
25 // Basic non-timing tests do not validate the core performance aspects of the
26 // benchmark tools, but have the advantage of being runnable on any system.
27 BOOST_AUTO_TEST_SUITE(benchmark_tools)
28 
29 BOOST_AUTO_TEST_CASE(assume_accessed) {
30  int x = 42;
31  assumeAccessed(x);
32  BOOST_CHECK_EQUAL(x, 42);
33 }
34 
35 BOOST_AUTO_TEST_CASE(assume_read) {
36  float x = 4.2f;
37  assumeRead(x);
38  BOOST_CHECK_EQUAL(x, 4.2f);
39 
40  const std::string y = "LOL";
41  assumeRead(x);
42  BOOST_CHECK_EQUAL(y, "LOL");
43 
44  assumeRead(std::make_tuple(1, false, 3.5));
45 }
46 
47 BOOST_AUTO_TEST_CASE(assume_written) {
48  std::complex c(1.2, 3.4);
49  assumeWritten(c);
50  BOOST_CHECK_EQUAL(c, std::complex(1.2, 3.4));
51 }
52 
53 BOOST_AUTO_TEST_CASE(micro_benchmark_result) {
55  res.iters_per_run = 42;
56  res.run_timings = {
57  std::chrono::microseconds(420), std::chrono::microseconds(21),
58  std::chrono::milliseconds(4), std::chrono::microseconds(84),
59  std::chrono::microseconds(294), std::chrono::microseconds(378),
60  std::chrono::microseconds(126), std::chrono::milliseconds(42)};
61 
62  CHECK_CLOSE_REL(res.totalTime().count() / 1'000'000., 47.323, 1e-6);
63 
64  const auto sorted = res.sortedRunTimes();
65  BOOST_CHECK_EQUAL(sorted.size(), res.run_timings.size());
66  BOOST_CHECK_EQUAL(sorted[0].count(), 21'000.);
67  BOOST_CHECK_EQUAL(sorted[1].count(), 84'000.);
68  BOOST_CHECK_EQUAL(sorted[2].count(), 126'000.);
69  BOOST_CHECK_EQUAL(sorted[3].count(), 294'000.);
70  BOOST_CHECK_EQUAL(sorted[4].count(), 378'000.);
71  BOOST_CHECK_EQUAL(sorted[5].count(), 420'000.);
72  BOOST_CHECK_EQUAL(sorted[6].count(), 4'000'000.);
73  BOOST_CHECK_EQUAL(sorted[7].count(), 42'000'000.);
74 
75  CHECK_CLOSE_REL(res.runTimeMedian().count() / 1000., (294. + 378.) / 2.,
76  1e-6);
77 
78  const auto [firstq, thirdq] = res.runTimeQuartiles();
79  CHECK_CLOSE_REL(firstq.count() / 1000., (84. + 126.) / 2., 1e-6);
80  CHECK_CLOSE_REL(thirdq.count() / 1000., (420. + 4000.) / 2., 1e-6);
81 
82  const auto robustRTStddev = res.runTimeRobustStddev();
83  CHECK_CLOSE_REL(robustRTStddev.count(), (thirdq - firstq).count() / 1.349,
84  1e-3);
85 
86  const auto runTimeError = res.runTimeError();
88  runTimeError.count(),
89  1.2533 * robustRTStddev.count() / std::sqrt(res.run_timings.size()),
90  1e-3);
91 
92  CHECK_CLOSE_REL(res.iterTimeAverage().count(),
93  res.runTimeMedian().count() / res.iters_per_run, 1e-6);
94 
95  CHECK_CLOSE_REL(res.iterTimeError().count(),
96  runTimeError.count() / std::sqrt(res.iters_per_run), 1e-6);
97 
98  std::ostringstream os;
99  os << res;
100  BOOST_CHECK_EQUAL(os.str(),
101  "8 runs of 42 iteration(s), 47.3ms total, "
102  "336.0000+/-1355.2296µs per run, "
103  "8000.000+/-209116.462ns per iteration");
104 }
105 
106 BOOST_AUTO_TEST_CASE(micro_benchmark) {
107  int counter = 0;
108  microBenchmark([&] { ++counter; }, 15, 7, std::chrono::milliseconds(0));
109  BOOST_CHECK_EQUAL(counter, 15 * 7);
110 
111  counter = 0;
113  [&] {
114  ++counter;
115  return counter;
116  },
117  17, 11, std::chrono::milliseconds(0));
118  BOOST_CHECK_EQUAL(counter, 17 * 11);
119 
120  counter = 0;
121  int previous = 64;
122  std::vector<int> ints{1, 2, 4, 8, 16, 32, 64};
124  [&](int input) {
125  if (input == 1) {
126  BOOST_CHECK_EQUAL(previous, 64);
127  counter = 1;
128  } else {
129  BOOST_CHECK_EQUAL(input, previous * 2);
130  counter += input;
131  }
132  previous = input;
133  },
134  ints, 123, std::chrono::milliseconds(3));
135  BOOST_CHECK_EQUAL(counter, 127);
136 
137  counter = 0;
138  previous = -81;
139  std::vector<char> chars{-1, 3, -9, 27, -81};
141  [&](int input) {
142  if (input == -1) {
143  BOOST_CHECK_EQUAL(previous, -81);
144  counter = -1;
145  } else {
146  BOOST_CHECK_EQUAL(input, -previous * 3);
147  counter += input;
148  }
149  previous = input;
150  return &previous;
151  },
152  chars, 456, std::chrono::milliseconds(8));
153  BOOST_CHECK_EQUAL(counter, -61);
154 }
155 
156 BOOST_AUTO_TEST_SUITE_END()
157 
158 // Timing tests are perhaps the most important ones for validation of
159 // benchmarking tools, but they cannot be run by default for two reasons:
160 // - They take a while to run, and therefore slow down the testing cycle
161 // - They require a quiet system to succeed, and will likely fail when invoked
162 // by a parallel run of CTest or when run on a continuous integration VM.
163 //
164 // If you can ensure both of these preconditions, you can run the test with
165 // ./BenchmarkTools --run_test=benchmark_timings
166 BOOST_AUTO_TEST_SUITE(benchmark_timings, *boost::unit_test::disabled())
167 
168 constexpr size_t bench_iters = 1'000;
169 
170 BOOST_AUTO_TEST_CASE(micro_benchmark) {
171  using namespace std::literals::chrono_literals;
172 
173  // For simple microbenchmarking needs, plain use of microBenchmark is enough.
174  //
175  // For example, here, the microbenchmark loop isn't optimized out even though
176  // each iteration does literally nothing. If it were optimized out, the time
177  // per iteration would change, since we wouldn't get linear scaling anymore.
178  auto nop = [] {};
179  const auto nop_x10 = microBenchmark(nop, 10 * bench_iters);
180  std::cout << "nop (10x iters): " << nop_x10 << std::endl;
181  const auto nop_x100 = microBenchmark(nop, 100 * bench_iters);
182  std::cout << "nop (100x iters): " << nop_x100 << std::endl;
183  const double nop_x10_iter_ns = nop_x10.iterTimeAverage().count();
184  const double nop_x100_iter_ns = nop_x100.iterTimeAverage().count();
185  CHECK_CLOSE_REL(nop_x10_iter_ns, nop_x100_iter_ns, 0.1);
186 
187 // These tests reason about the performance characteristics of _optimized_ code,
188 // and should therefore be compiled out of debug/coverage builds.
189 #ifdef __OPTIMIZE__
190  // The microbenchmarking harness is super low overhead, less than 1
191  // nanosecond per iteration on a modern CPU.
192  BOOST_CHECK_LT(nop_x100_iter_ns, 1.0);
193 
194  // With a well-chosen iteration count that keeps per-run times under the OS
195  // scheduling quantum (typically 1ms), the noise is also super low.
196  BOOST_CHECK_LT(nop_x100.iterTimeError().count(), 0.1);
197 
198  // You can measure the overhead of any operation as long as it's not
199  // _obnoxiously_ amenable to compiler const-propagation or dead code
200  // elimination. For example, this sqrt throughput microbenchmark works,
201  // because microBenchmark forces the compiler to assume that "x", "y" and "z"
202  // are modified on every benchmark iteration...
203  const double x = 1.2, y = 3.4, z = 5.6;
204  auto sqrt = microBenchmark(
205  [&] { return std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x); },
206  bench_iters);
207  std::cout << "sqrt (correct): " << sqrt << std::endl;
208  BOOST_CHECK_GT(sqrt.iterTimeAverage().count(), 10. * nop_x100_iter_ns);
209 
210  // ...but this variant doesn't work, because the compiler can trivially
211  // precompute the square root when optimizing the inner lambda...
212  const auto sqrt_constprop = microBenchmark(
213  [] {
214  return std::sqrt(1.2 * 3.4) + std::sqrt(3.4 * 5.6) +
215  std::sqrt(5.6 * 1.2);
216  },
217  bench_iters * 20);
218  std::cout << "sqrt (constprop'd): " << sqrt_constprop << std::endl;
219  BOOST_CHECK_LT(sqrt_constprop.iterTimeAverage().count(),
220  sqrt.iterTimeAverage().count() / 5.);
221 
222  // ...and this one doesn't work either, because the compiler can trivially
223  // infer that the result of the computation is unused and stop computing it.
224  //
225  // The lower tolerance of this test is needed because current GCC doesn't
226  // optimize _everything_ out in its default configuration, as sqrt could still
227  // have side-effects like setting the errno thread-local variable...
228  const auto sqrt_deadcode = microBenchmark(
229  [&] { (void)(std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x)); },
230  bench_iters * 10);
231  std::cout << "sqrt (deadcode'd): " << sqrt_deadcode << std::endl;
232  BOOST_CHECK_LT(sqrt_deadcode.iterTimeAverage().count(),
233  sqrt.iterTimeAverage().count() / 3.);
234 #endif
235 }
236 
237 // These tests reason about the performance characteristics of _optimized_ code,
238 // and should therefore be compiled out of debug/coverage builds.
239 #ifdef __OPTIMIZE__
240 BOOST_AUTO_TEST_CASE(assume_read) {
241  // You can use assumeRead when you want the compiler to assume that the result
242  // of some computation has been read and therefore the computation shouldn't
243  // be optimized out. This is what microBenchmark implicitly does to the value
244  // returned by the benchmark iteration function, if any.
245  //
246  // For example, these two computations are almost equivalent. Notice that
247  // assumeRead can be used on temporaries.
248  const double x = 1.2, y = 3.4, z = 5.6;
249  const auto tuple_return = microBenchmark(
250  [&] {
251  return std::make_tuple(
252  std::sqrt(x * y), std::complex(std::sqrt(y * z), std::sqrt(z * x)));
253  },
254  bench_iters);
255  std::cout << "tuple return: " << tuple_return << std::endl;
256  const auto assumeread = microBenchmark(
257  [&] {
258  assumeRead(std::sqrt(x * y));
259  assumeRead(std::complex(std::sqrt(y * z), std::sqrt(z * x)));
260  },
261  bench_iters);
262  std::cout << "assumeRead: " << assumeread << std::endl;
263  const double tuple_return_iter_ns = tuple_return.iterTimeAverage().count();
264  const double assumeRead_iter_ns = assumeread.iterTimeAverage().count();
265  CHECK_CLOSE_REL(tuple_return_iter_ns, assumeRead_iter_ns, 1e-2);
266 }
267 #endif
268 
269 BOOST_AUTO_TEST_CASE(assume_written) {
270  // You can use assumeWritten when you want the compiler to assume that some
271  // variables have been written to, and every dependent computation must
272  // therefore be recomputed. This is what microBenchmark implicitly does to
273  // every variable captured by the benchmark iteration lambda.
274  //
275  // Since assumeWritten operates on variables in memory, it cannot be used on
276  // temporaries, but only on mutable variables.
277  double x = 1.2, y = 3.4, z = 5.6;
278  auto sqrt_sum = microBenchmark(
279  [&] { return std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x); },
280  bench_iters);
281  std::cout << "sqrt sum: " << sqrt_sum << std::endl;
282  auto sqrt_2sums = microBenchmark(
283  [&] {
284  double tmp = std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x);
285  assumeWritten(x);
286  assumeWritten(y);
287  assumeWritten(z);
288  return tmp + std::sqrt(x * y) + std::sqrt(y * z) + std::sqrt(z * x);
289  },
290  bench_iters);
291  std::cout << "2x(sqrt sum): " << sqrt_2sums << std::endl;
292  const double sqrt_sum_iter_ns = sqrt_sum.iterTimeAverage().count();
293  const double sqrt_2sums_iter_ns = sqrt_2sums.iterTimeAverage().count();
294  CHECK_CLOSE_REL(2. * sqrt_sum_iter_ns, sqrt_2sums_iter_ns, 1e-2);
295 }
296 
297 BOOST_AUTO_TEST_SUITE_END()
298 
299 } // namespace Test
300 } // namespace Acts