Implementation of random interleaving. #1105
src/benchmark.cc
@@ -32,7 +32,9 @@
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <limits>
#include <memory>
#include <random>
#include <string>
#include <thread>
#include <utility>
@@ -51,6 +53,18 @@
#include "thread_manager.h"
#include "thread_timer.h"

// Each benchmark can be repeated a number of times, and within each
// *repetition*, we run the user-defined benchmark function a number of
// *iterations*. The number of repetitions is determined based on flags
// (--benchmark_repetitions).
namespace {

// Attempt to make each repetition run for at least this much time.
constexpr double kDefaultMinTimeTotalSecs = 0.5;
constexpr int64_t kRandomInterleavingDefaultRepetitions = 12;

}  // namespace

// Print a list of benchmarks. This option overrides all other options.
DEFINE_bool(benchmark_list_tests, false);
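For example (numbers purely illustrative): with --benchmark_repetitions=3, a registered benchmark is repeated three times, and within each of those repetitions the framework picks an iteration count large enough for that repetition to meet the minimum running time; kDefaultMinTimeTotalSecs above is the target for the total measured time across all repetitions.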
@@ -59,16 +73,39 @@ DEFINE_bool(benchmark_list_tests, false);
// linked into the binary are run.
DEFINE_string(benchmark_filter, ".");

// Minimum number of seconds we should run benchmark before results are
// considered significant.  For cpu-time based tests, this is the lower bound
// on the total cpu time used by all threads that make up the test.  For
// real-time based tests, this is the lower bound on the elapsed time of the
// benchmark execution, regardless of number of threads.
DEFINE_double(benchmark_min_time, 0.5);
// Do NOT read these flags directly. Use Get*() to read them.
namespace do_not_read_flag_directly {

// Minimum number of seconds we should run benchmark per repetition before
// results are considered significant. For cpu-time based tests, this is the
// lower bound on the total cpu time used by all threads that make up the test.
// For real-time based tests, this is the lower bound on the elapsed time of the
// benchmark execution, regardless of number of threads. If left unset, will use
// kDefaultMinTimeTotalSecs / FLAGS_benchmark_repetitions if random
// interleaving is enabled. Otherwise, will use kDefaultMinTimeTotalSecs.
// Do NOT read this flag directly. Use GetMinTime() to read this flag.
DEFINE_double(benchmark_min_time, -1.0);

// The number of runs of each benchmark. If greater than 1, the mean and
// standard deviation of the runs will be reported.
DEFINE_int32(benchmark_repetitions, 1);
// standard deviation of the runs will be reported. By default, the number of
// repetitions is 1 if random interleaving is disabled, and up to
// kDefaultRepetitions if random interleaving is enabled. (Read the
// documentation for random interleaving to see why it might be less than
// kDefaultRepetitions.)
// Do NOT read this flag directly. Use GetRepetitions() to access this flag.
DEFINE_int32(benchmark_repetitions, -1);

}  // namespace do_not_read_flag_directly

// The maximum overhead allowed for random interleaving. A value X means total
// execution time under random interleaving is limited by
// (1 + X) * original total execution time. Set to 'inf' to allow infinite
// overhead.
DEFINE_double(benchmark_random_interleaving_max_overhead, 0.4);

// If set, enable random interleaving. See
// http://github.com/google/benchmark/issues/1051 for details.
DEFINE_bool(benchmark_enable_random_interleaving, false);

// Report the result of each benchmark repetitions. When 'true' is specified
// only the mean, standard deviation, and other statistics are reported for
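To spell out how these defaults interact (the arithmetic follows directly from the constants defined in this patch): with --benchmark_enable_random_interleaving set and neither --benchmark_repetitions nor --benchmark_min_time given, GetRepetitions() falls back to kRandomInterleavingDefaultRepetitions = 12 and GetMinTime() falls back to kDefaultMinTimeTotalSecs / 12 = 0.5 / 12 ≈ 0.042 s per repetition, so the total measured time still targets about 0.5 s. Without random interleaving and with no flags given, the fallbacks are 1 repetition and 0.5 s, i.e. the pre-existing behavior.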
@@ -110,6 +147,30 @@ namespace benchmark {

namespace internal {

// Performance measurements always come with random variances. Defines a
// factor by which the required number of iterations is overestimated in order
// to reduce the probability that the minimum time requirement will not be met.
const double kSafetyMultiplier = 1.4;

// Wraps --benchmark_min_time and returns valid default values if not supplied.
double GetMinTime() {
  const double default_min_time = kDefaultMinTimeTotalSecs / GetRepetitions();
  const double flag_min_time =
      do_not_read_flag_directly::FLAGS_benchmark_min_time;
  return flag_min_time >= 0.0 ? flag_min_time : default_min_time;
}

// Wraps --benchmark_repetitions and returns a valid default value if not
// supplied.
int64_t GetRepetitions() {
  const int64_t default_repetitions =
      FLAGS_benchmark_enable_random_interleaving
          ? kRandomInterleavingDefaultRepetitions
          : 1;
  const int64_t flag_repetitions =
      do_not_read_flag_directly::FLAGS_benchmark_repetitions;
  return flag_repetitions >= 0 ? flag_repetitions : default_repetitions;
}

// FIXME: wouldn't LTO mess this up?
void UseCharPointer(char const volatile*) {}
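To see why the overestimation factor matters, consider a hypothetical trial (the iteration-scaling code itself is not part of this excerpt, so the numbers and the scaling step are only an illustration of how such a factor is typically applied): a repetition of 1000 iterations measured 0.01 s, and GetMinTime() is 0.042 s. Scaling linearly would suggest 0.042 / 0.01 * 1000 = 4200 iterations for the next attempt; multiplying by kSafetyMultiplier = 1.4 bumps that to about 5880 iterations, reducing the chance that timing noise leaves the repetition short of the minimum time.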
@@ -222,15 +283,15 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
  CHECK(display_reporter != nullptr);

  // Determine the width of the name field using a minimum width of 10.
  bool might_have_aggregates = FLAGS_benchmark_repetitions > 1;
  bool might_have_aggregates = GetRepetitions() > 1;
  size_t name_field_width = 10;
  size_t stat_field_width = 0;
  for (const BenchmarkInstance& benchmark : benchmarks) {
    name_field_width =
        std::max<size_t>(name_field_width, benchmark.name.str().size());
    might_have_aggregates |= benchmark.repetitions > 1;
        std::max<size_t>(name_field_width, benchmark.name().str().size());
    might_have_aggregates |= benchmark.repetitions() > 1;

    for (const auto& Stat : *benchmark.statistics)
    for (const auto& Stat : *benchmark.statistics())
      stat_field_width = std::max<size_t>(stat_field_width, Stat.name_.size());
  }
  if (might_have_aggregates) name_field_width += 1 + stat_field_width;
@@ -255,23 +316,56 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
  flushStreams(display_reporter);
  flushStreams(file_reporter);

  for (const auto& benchmark : benchmarks) {
    RunResults run_results = RunBenchmark(benchmark, &complexity_reports);

    auto report = [&run_results](BenchmarkReporter* reporter,
                                 bool report_aggregates_only) {
      assert(reporter);
      // If there are no aggregates, do output non-aggregates.
      report_aggregates_only &= !run_results.aggregates_only.empty();
      if (!report_aggregates_only)
        reporter->ReportRuns(run_results.non_aggregates);
      if (!run_results.aggregates_only.empty())
        reporter->ReportRuns(run_results.aggregates_only);
    };

    report(display_reporter, run_results.display_report_aggregates_only);
  // Without random interleaving, benchmarks are executed in the order of:
  //   A, A, ..., A, B, B, ..., B, C, C, ..., C, ...
  // That is, repetition is within RunBenchmark(), hence the name
  // inner_repetitions.
  // With random interleaving, benchmarks are executed in the order of:
  //   {Random order of A, B, C, ...}, {Random order of A, B, C, ...}, ...
  // That is, repetitions is outside of RunBenchmark(), hence the name
  // outer_repetitions.
  int64_t inner_repetitions =
      FLAGS_benchmark_enable_random_interleaving ? 1 : GetRepetitions();
  int64_t outer_repetitions =
      FLAGS_benchmark_enable_random_interleaving ? GetRepetitions() : 1;
  std::vector<size_t> benchmark_indices(benchmarks.size());
  for (size_t i = 0; i < benchmarks.size(); ++i) {
    benchmark_indices[i] = i;
  }

  // 'run_results_vector' and 'benchmarks' are parallel arrays.
  std::vector<RunResults> run_results_vector(benchmarks.size());
  for (int64_t i = 0; i < outer_repetitions; i++) {
    if (FLAGS_benchmark_enable_random_interleaving) {
      std::random_shuffle(benchmark_indices.begin(), benchmark_indices.end());
    }
    for (size_t j : benchmark_indices) {
      // Repetitions will be automatically adjusted under random interleaving.
      if (!FLAGS_benchmark_enable_random_interleaving ||
          i < benchmarks[j].random_interleaving_repetitions()) {
        RunBenchmark(benchmarks[j], outer_repetitions, inner_repetitions,
                     &complexity_reports, &run_results_vector[j]);
      }
    }
  }

  auto report = [](BenchmarkReporter* reporter, bool report_aggregates_only,
                   const RunResults& run_results) {
    assert(reporter);
    // If there are no aggregates, do output non-aggregates.
    report_aggregates_only &= !run_results.aggregates_only.empty();
    if (!report_aggregates_only)
      reporter->ReportRuns(run_results.non_aggregates);
    if (!run_results.aggregates_only.empty())
      reporter->ReportRuns(run_results.aggregates_only);
  };

  for (const RunResults& run_results : run_results_vector) {
    report(display_reporter, run_results.display_report_aggregates_only,
           run_results);
    if (file_reporter)
      report(file_reporter, run_results.file_report_aggregates_only);
      report(file_reporter, run_results.file_report_aggregates_only,
             run_results);

    flushStreams(display_reporter);
    flushStreams(file_reporter);
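To make the comment about execution order concrete (benchmark names invented for illustration): with three benchmarks A, B, C and three repetitions, the classic mode runs A, A, A, B, B, B, C, C, C, while random interleaving runs three shuffled rounds such as B, C, A, then A, C, B, then C, B, A, so repeated measurements of the same benchmark are spread across the whole run and are less correlated with transient machine state. A benchmark can also drop out of later rounds once its adjusted random_interleaving_repetitions() count has been reached, which is how the overhead budget is enforced.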
@@ -399,7 +493,7 @@ size_t RunSpecifiedBenchmarks(BenchmarkReporter* display_reporter,

  if (FLAGS_benchmark_list_tests) {
    for (auto const& benchmark : benchmarks)
      Out << benchmark.name.str() << "\n";
      Out << benchmark.name().str() << "\n";
  } else {
    internal::RunBenchmarks(benchmarks, display_reporter, file_reporter);
  }
@@ -439,10 +533,16 @@ void ParseCommandLineFlags(int* argc, char** argv) {
    if (ParseBoolFlag(argv[i], "benchmark_list_tests",
                      &FLAGS_benchmark_list_tests) ||
        ParseStringFlag(argv[i], "benchmark_filter", &FLAGS_benchmark_filter) ||
        ParseDoubleFlag(argv[i], "benchmark_min_time",
                        &FLAGS_benchmark_min_time) ||
        ParseInt32Flag(argv[i], "benchmark_repetitions",
                       &FLAGS_benchmark_repetitions) ||
        ParseDoubleFlag(
            argv[i], "benchmark_min_time",
            &do_not_read_flag_directly::FLAGS_benchmark_min_time) ||
        ParseInt32Flag(
            argv[i], "benchmark_repetitions",
            &do_not_read_flag_directly::FLAGS_benchmark_repetitions) ||
        ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving",
                      &FLAGS_benchmark_enable_random_interleaving) ||
        ParseDoubleFlag(argv[i], "benchmark_random_interleaving_max_overhead",
                        &FLAGS_benchmark_random_interleaving_max_overhead) ||
        ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
                      &FLAGS_benchmark_report_aggregates_only) ||
        ParseBoolFlag(argv[i], "benchmark_display_aggregates_only",
src/benchmark_adjust_repetitions.cc (new file)
@@ -0,0 +1,111 @@
#include "benchmark_adjust_repetitions.h"

#include <algorithm>  // for std::max / std::min

#include "benchmark_api_internal.h"
#include "log.h"

namespace benchmark {
namespace internal {

namespace {

constexpr double kNanosecondInSecond = 0.000000001;

}  // namespace

size_t ComputeRandomInterleavingRepetitions(
    InternalRandomInterleavingRepetitionsInput input) {
  // Find the repetitions such that total overhead is bounded. Let
  //   n = desired number of repetitions, i.e., the output of this method.
  //   t = total real execution time per repetition including overhead
  //       (input.total_execution_time_per_repetition).
  //   o = maximum allowed increase in total real execution time due to random
  //       interleaving, measured as a fraction (input.max_overhead).
  //   e = estimated total execution time without Random Interleaving.
  // We want
  //   t * n / e <= 1 + o
  // I.e.,
  //   n <= (1 + o) * e / t
  //
  // Let
  //   h = overhead per repetition, which includes all setup / teardown time
  //       and also the execution time of preliminary trials used to search for
  //       the correct number of iterations.
  //   r = real execution time per repetition not including overhead
  //       (input.real_time_used_per_repetition).
  //   s = measured execution time per repetition not including overhead,
  //       which can be either real or CPU time
  //       (input.time_used_per_repetition).
  // We have
  //   h = t - r
  //
  // Let
  //   m = total minimum measured execution time for all repetitions
  //       (input.min_time_per_repetition * input.max_repetitions).
  // Let
  //   f = m / s
  // f is the scale factor between m and s, and will be used to estimate
  // l, the total real execution time for all repetitions excluding the
  // overhead. It's reasonable to assume that the real execution time excluding
  // the overhead is proportional to the measured time. Hence we expect l / r
  // to be equal to m / s. That is, l / r = f, thus l = r * f. Then the
  // total execution time e can be estimated by h + l, which is h + r * f.
  //   e = h + r * f
  // Note that this might be an underestimation. If the number of repetitions
  // is reduced, we may need to run more iterations per repetition, and that
  // may increase the number of preliminary trials needed to find the correct
  // number of iterations.

  double h = std::max(0.0, input.total_execution_time_per_repetition -
                               input.real_time_used_per_repetition);
  double r =
      std::max(input.real_time_used_per_repetition, kNanosecondInSecond);
  double s =
      std::max(input.time_used_per_repetition, kNanosecondInSecond);
  double m = input.min_time_per_repetition * input.max_repetitions;

  //   f = m / s
  // RunBenchmark() always overshoots the iteration count by kSafetyMultiplier.
  // Apply the same factor here.
  //   f = kSafetyMultiplier * m / s
  // Also we want to make sure 1 <= f <= input.max_repetitions. Note that we
  // may not be able to reach m because the total iters per repetition is
  // upper bounded by --benchmark_max_iters. This behavior is preserved in
  // Random Interleaving, as we won't run more than input.max_repetitions
  // repetitions to reach m.

  double f = kSafetyMultiplier * m / s;
  f = std::min(std::max(f, 1.0), static_cast<double>(input.max_repetitions));

  double e = h + r * f;
  //   n <= (1 + o) * e / t = (1 + o) * e / (h + r)
  // Also we want to make sure 1 <= n <= input.max_repetitions, and (h + r) > 0.
  double n = (1 + input.max_overhead) * e / (h + r);
  n = std::min(std::max(n, 1.0), static_cast<double>(input.max_repetitions));

  size_t n_size_t = static_cast<size_t>(n);

  VLOG(2) << "Computed random interleaving repetitions"
          << "\n  input.total_execution_time_per_repetition: "
          << input.total_execution_time_per_repetition
          << "\n  input.time_used_per_repetition: "
          << input.time_used_per_repetition
          << "\n  input.real_time_used_per_repetition: "
          << input.real_time_used_per_repetition
          << "\n  input.min_time_per_repetition: "
          << input.min_time_per_repetition
          << "\n  input.max_repetitions: " << input.max_repetitions
          << "\n  input.max_overhead: " << input.max_overhead
          << "\n  h: " << h
          << "\n  r: " << r
          << "\n  s: " << s
          << "\n  f: " << f
          << "\n  m: " << m
          << "\n  e: " << e
          << "\n  n: " << n
          << "\n  n_size_t: " << n_size_t;

  return n_size_t;
}

}  // namespace internal
}  // namespace benchmark
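A worked example with invented numbers may help make the derivation concrete: suppose the first repetition took t = 0.2 s of wall time including overhead, of which r = 0.05 s was the real running time, the measured time s is also 0.05 s, the minimum time per repetition is 0.042 s, max_repetitions is 12, and max_overhead is 0.4. Then h = 0.2 - 0.05 = 0.15, m = 0.042 * 12 ≈ 0.5, f = 1.4 * 0.5 / 0.05 = 14, clamped to 12, e = 0.15 + 0.05 * 12 = 0.75, and n = 1.4 * 0.75 / (0.15 + 0.05) ≈ 5.25, so about 5 repetitions would be run instead of 12, keeping the interleaving overhead within the 40% budget.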
src/benchmark_adjust_repetitions.h (new file)
@@ -0,0 +1,28 @@
#ifndef BENCHMARK_ADJUST_REPETITIONS_H
#define BENCHMARK_ADJUST_REPETITIONS_H

#include "benchmark/benchmark.h"
#include "commandlineflags.h"

namespace benchmark {
namespace internal {

// Defines the input tuple to ComputeRandomInterleavingRepetitions().
struct InternalRandomInterleavingRepetitionsInput {
  double total_execution_time_per_repetition;
  double time_used_per_repetition;
  double real_time_used_per_repetition;
  double min_time_per_repetition;
  double max_overhead;
  size_t max_repetitions;
};

// Should be called right after the first repetition is completed to estimate
// the number of repetitions.
size_t ComputeRandomInterleavingRepetitions(
    InternalRandomInterleavingRepetitionsInput input);

}  // end namespace internal
}  // end namespace benchmark

#endif  // BENCHMARK_ADJUST_REPETITIONS_H
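A minimal usage sketch of this header (all numeric values are invented, and AdjustRepetitionsExample is a hypothetical caller; the real call site lives in the runner code, which is not part of this excerpt): fill the input struct with timings gathered from the first completed repetition, then ask how many repetitions random interleaving should actually run.

#include "benchmark_adjust_repetitions.h"

void AdjustRepetitionsExample() {
  benchmark::internal::InternalRandomInterleavingRepetitionsInput input;
  input.total_execution_time_per_repetition = 0.2;  // wall time incl. overhead
  input.time_used_per_repetition = 0.05;            // measured (CPU or real) time
  input.real_time_used_per_repetition = 0.05;       // real time excl. overhead
  input.min_time_per_repetition = 0.042;            // e.g. what GetMinTime() returned
  input.max_overhead = 0.4;    // --benchmark_random_interleaving_max_overhead
  input.max_repetitions = 12;  // e.g. what GetRepetitions() returned
  size_t repetitions =
      benchmark::internal::ComputeRandomInterleavingRepetitions(input);
  (void)repetitions;  // with these numbers, roughly 5 (see the worked example above)
}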