Skip to content

Commit f293839

Browse files
committed
Random interleaving of benchmark repetitions - the sequel (fixes #1051)
Based on the original implementation by Hai Huang (@haih-g) from #1105.
1 parent d17ea66 commit f293839

File tree

10 files changed

+254
-48
lines changed

10 files changed

+254
-48
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,8 @@ too (`-lkstat`).
299299

300300
[Setting the Time Unit](#setting-the-time-unit)
301301

302+
[Random Interleaving](docs/random_interleaving.md)
303+
302304
[User-Requested Performance Counters](docs/perf_counters.md)
303305

304306
[Preventing Optimization](#preventing-optimization)

docs/random_interleaving.md

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
<a name="interleaving" />
2+
3+
# Random Interleaving
4+
5+
[Random Interleaving](https://github.com/google/benchmark/issues/1051) is a
6+
technique to lower run-to-run variance. It randomly interleaves repetitions of a
7+
microbenchmark with repetitions from other microbenchmarks in the same benchmark
8+
test. Data shows it is able to lower run-to-run variance by
9+
[40%](https://github.com/google/benchmark/issues/1051) on average.
10+
11+
To use, you mainly need to set `--benchmark_enable_random_interleaving=true`,
12+
and optionally specify non-zero repetition count `--benchmark_repetitions=9`
13+
and optionally decrease the per-repetition time `--benchmark_min_time=0.1`.

include/benchmark/benchmark.h

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1472,6 +1472,19 @@ class BenchmarkReporter {
14721472
int64_t max_bytes_used;
14731473
};
14741474

1475+
struct PerFamilyRunReports {
1476+
PerFamilyRunReports() : num_runs_total(0), num_runs_done(0) {}
1477+
1478+
// How many runs will all instances of this benchmark perform?
1479+
int num_runs_total;
1480+
1481+
// How many runs have happened already?
1482+
int num_runs_done;
1483+
1484+
// The reports about (non-erroneous!) runs of this family.
1485+
std::vector<BenchmarkReporter::Run> Runs;
1486+
};
1487+
14751488
// Construct a BenchmarkReporter with the output stream set to 'std::cout'
14761489
// and the error stream set to 'std::cerr'
14771490
BenchmarkReporter();

src/benchmark.cc

Lines changed: 63 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -33,8 +33,10 @@
3333
#include <cstdlib>
3434
#include <fstream>
3535
#include <iostream>
36+
#include <limits>
3637
#include <map>
3738
#include <memory>
39+
#include <random>
3840
#include <string>
3941
#include <thread>
4042
#include <utility>
@@ -73,6 +75,10 @@ DEFINE_double(benchmark_min_time, 0.5);
7375
// standard deviation of the runs will be reported.
7476
DEFINE_int32(benchmark_repetitions, 1);
7577

78+
// If set, enable random interleaving of repetitions of all benchmarks.
79+
// See http://github.com/google/benchmark/issues/1051 for details.
80+
DEFINE_bool(benchmark_enable_random_interleaving, false);
81+
7682
// Report the result of each benchmark repetitions. When 'true' is specified
7783
// only the mean, standard deviation, and other statistics are reported for
7884
// repeated benchmarks. Affects all reporters.
@@ -297,23 +303,69 @@ void RunBenchmarks(const std::vector<BenchmarkInstance>& benchmarks,
297303
context.name_field_width = name_field_width;
298304

299305
// Keep track of running times of all instances of each benchmark family.
300-
std::map<int /*family_index*/, std::vector<BenchmarkReporter::Run>>
301-
complexity_reports;
306+
std::map<int /*family_index*/, BenchmarkReporter::PerFamilyRunReports>
307+
per_family_reports;
302308

303309
if (display_reporter->ReportContext(context) &&
304310
(!file_reporter || file_reporter->ReportContext(context))) {
305311
FlushStreams(display_reporter);
306312
FlushStreams(file_reporter);
307313

314+
size_t num_repetitions_total = 0;
315+
316+
std::vector<internal::BenchmarkRunner> runners;
317+
runners.reserve(benchmarks.size());
308318
for (const BenchmarkInstance& benchmark : benchmarks) {
309-
std::vector<BenchmarkReporter::Run>* complexity_reports_for_family =
310-
nullptr;
319+
BenchmarkReporter::PerFamilyRunReports* reports_for_family = nullptr;
311320
if (benchmark.complexity() != oNone)
312-
complexity_reports_for_family =
313-
&complexity_reports[benchmark.family_index()];
321+
reports_for_family = &per_family_reports[benchmark.family_index()];
322+
323+
runners.emplace_back(benchmark, reports_for_family);
324+
int num_repeats_of_this_instance = runners.back().GetNumRepeats();
325+
num_repetitions_total += num_repeats_of_this_instance;
326+
if (reports_for_family)
327+
reports_for_family->num_runs_total += num_repeats_of_this_instance;
328+
}
329+
assert(runners.size() == benchmarks.size() && "Unexpected runner count.");
330+
331+
std::vector<int> repetition_indices;
332+
repetition_indices.reserve(num_repetitions_total);
333+
for (size_t runner_index = 0, num_runners = runners.size();
334+
runner_index != num_runners; ++runner_index) {
335+
const internal::BenchmarkRunner& runner = runners[runner_index];
336+
std::fill_n(std::back_inserter(repetition_indices),
337+
runner.GetNumRepeats(), runner_index);
338+
}
339+
assert(repetition_indices.size() == num_repetitions_total &&
340+
"Unexpected number of repetition indexes.");
341+
342+
if (FLAGS_benchmark_enable_random_interleaving) {
343+
std::random_device rd;
344+
std::mt19937 g(rd());
345+
std::shuffle(repetition_indices.begin(), repetition_indices.end(), g);
346+
}
314347

315-
RunResults run_results =
316-
RunBenchmark(benchmark, complexity_reports_for_family);
348+
for (size_t repetition_index : repetition_indices) {
349+
internal::BenchmarkRunner& runner = runners[repetition_index];
350+
runner.DoOneRepetition();
351+
if (runner.HasRepeatsRemaining()) continue;
352+
// FIXME: report each repetition separately, not all of them in bulk.
353+
354+
RunResults run_results = runner.GetResults();
355+
356+
// Maybe calculate complexity report
357+
if (BenchmarkReporter::PerFamilyRunReports* reports_for_family =
358+
runner.GetReportsForFamily()) {
359+
if (reports_for_family->num_runs_done ==
360+
reports_for_family->num_runs_total) {
361+
auto additional_run_stats = ComputeBigO(reports_for_family->Runs);
362+
run_results.aggregates_only.insert(run_results.aggregates_only.end(),
363+
additional_run_stats.begin(),
364+
additional_run_stats.end());
365+
per_family_reports.erase(
366+
(int)reports_for_family->Runs.front().family_index);
367+
}
368+
}
317369

318370
Report(display_reporter, file_reporter, run_results);
319371
}
@@ -471,6 +523,7 @@ void PrintUsageAndExit() {
471523
" [--benchmark_filter=<regex>]\n"
472524
" [--benchmark_min_time=<min_time>]\n"
473525
" [--benchmark_repetitions=<num_repetitions>]\n"
526+
" [--benchmark_enable_random_interleaving={true|false}]\n"
474527
" [--benchmark_report_aggregates_only={true|false}]\n"
475528
" [--benchmark_display_aggregates_only={true|false}]\n"
476529
" [--benchmark_format=<console|json|csv>]\n"
@@ -495,6 +548,8 @@ void ParseCommandLineFlags(int* argc, char** argv) {
495548
&FLAGS_benchmark_min_time) ||
496549
ParseInt32Flag(argv[i], "benchmark_repetitions",
497550
&FLAGS_benchmark_repetitions) ||
551+
ParseBoolFlag(argv[i], "benchmark_enable_random_interleaving",
552+
&FLAGS_benchmark_enable_random_interleaving) ||
498553
ParseBoolFlag(argv[i], "benchmark_report_aggregates_only",
499554
&FLAGS_benchmark_report_aggregates_only) ||
500555
ParseBoolFlag(argv[i], "benchmark_display_aggregates_only",

src/benchmark_api_internal.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,8 +39,6 @@ class BenchmarkInstance {
3939
IterationCount iterations() const { return iterations_; }
4040
int threads() const { return threads_; }
4141

42-
bool last_benchmark_instance;
43-
4442
State Run(IterationCount iters, int thread_id, internal::ThreadTimer* timer,
4543
internal::ThreadManager* manager,
4644
internal::PerfCountersMeasurement* perf_counters_measurement) const;

src/benchmark_register.cc

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -166,7 +166,6 @@ bool BenchmarkFamilies::FindBenchmarks(
166166
const auto full_name = instance.name().str();
167167
if ((re.Match(full_name) && !isNegativeFilter) ||
168168
(!re.Match(full_name) && isNegativeFilter)) {
169-
instance.last_benchmark_instance = (&args == &family->args_.back());
170169
benchmarks->push_back(std::move(instance));
171170

172171
++per_family_instance_index;

src/benchmark_runner.cc

Lines changed: 20 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -143,9 +143,9 @@ void RunInThread(const BenchmarkInstance* b, IterationCount iters,
143143

144144
BenchmarkRunner::BenchmarkRunner(
145145
const benchmark::internal::BenchmarkInstance& b_,
146-
std::vector<BenchmarkReporter::Run>* complexity_reports_)
146+
BenchmarkReporter::PerFamilyRunReports* reports_for_family_)
147147
: b(b_),
148-
complexity_reports(complexity_reports_),
148+
reports_for_family(reports_for_family_),
149149
min_time(!IsZero(b.min_time()) ? b.min_time() : FLAGS_benchmark_min_time),
150150
repeats(b.repetitions() != 0 ? b.repetitions()
151151
: FLAGS_benchmark_repetitions),
@@ -172,22 +172,6 @@ BenchmarkRunner::BenchmarkRunner(
172172
perf_counters_measurement.IsValid())
173173
<< "Perf counters were requested but could not be set up.";
174174
}
175-
176-
for (int repetition_num = 0; repetition_num < repeats; repetition_num++) {
177-
DoOneRepetition(repetition_num);
178-
}
179-
180-
// Calculate additional statistics
181-
run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
182-
183-
// Maybe calculate complexity report
184-
if (complexity_reports && b.last_benchmark_instance) {
185-
auto additional_run_stats = ComputeBigO(*complexity_reports);
186-
run_results.aggregates_only.insert(run_results.aggregates_only.end(),
187-
additional_run_stats.begin(),
188-
additional_run_stats.end());
189-
complexity_reports->clear();
190-
}
191175
}
192176

193177
BenchmarkRunner::IterationResults BenchmarkRunner::DoNIterations() {
@@ -283,8 +267,10 @@ bool BenchmarkRunner::ShouldReportIterationResults(
283267
((i.results.real_time_used >= 5 * min_time) && !b.use_manual_time());
284268
}
285269

286-
void BenchmarkRunner::DoOneRepetition(int64_t repetition_index) {
287-
const bool is_the_first_repetition = repetition_index == 0;
270+
void BenchmarkRunner::DoOneRepetition() {
271+
assert(HasRepeatsRemaining() && "Already done all repetitions?");
272+
273+
const bool is_the_first_repetition = num_repetitions_done == 0;
288274
IterationResults i;
289275

290276
// We *may* be gradually increasing the length (iteration count)
@@ -337,19 +323,25 @@ void BenchmarkRunner::DoOneRepetition(int64_t repetition_index) {
337323
// Ok, now actually report.
338324
BenchmarkReporter::Run report =
339325
CreateRunReport(b, i.results, memory_iterations, memory_result, i.seconds,
340-
repetition_index, repeats);
326+
num_repetitions_done, repeats);
341327

342-
if (complexity_reports && !report.error_occurred)
343-
complexity_reports->push_back(report);
328+
if (reports_for_family) {
329+
++reports_for_family->num_runs_done;
330+
if (!report.error_occurred) reports_for_family->Runs.push_back(report);
331+
}
344332

345333
run_results.non_aggregates.push_back(report);
334+
335+
++num_repetitions_done;
346336
}
347337

348-
RunResults RunBenchmark(
349-
const benchmark::internal::BenchmarkInstance& b,
350-
std::vector<BenchmarkReporter::Run>* complexity_reports) {
351-
internal::BenchmarkRunner r(b, complexity_reports);
352-
return r.get_results();
338+
RunResults&& BenchmarkRunner::GetResults() {
339+
assert(!HasRepeatsRemaining() && "Did not run all repetitions yet?");
340+
341+
// Calculate additional statistics over the repetitions of this instance.
342+
run_results.aggregates_only = ComputeStats(run_results.non_aggregates);
343+
344+
return std::move(run_results);
353345
}
354346

355347
} // end namespace internal

src/benchmark_runner.h

Lines changed: 17 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -50,20 +50,34 @@ struct RunResults {
5050
class BenchmarkRunner {
5151
public:
5252
BenchmarkRunner(const benchmark::internal::BenchmarkInstance& b_,
53-
std::vector<BenchmarkReporter::Run>* complexity_reports_);
53+
BenchmarkReporter::PerFamilyRunReports* reports_for_family);
5454

55-
RunResults&& get_results() { return std::move(run_results); }
55+
int GetNumRepeats() const { return repeats; }
56+
57+
bool HasRepeatsRemaining() const {
58+
return GetNumRepeats() != num_repetitions_done;
59+
}
60+
61+
void DoOneRepetition();
62+
63+
RunResults&& GetResults();
64+
65+
BenchmarkReporter::PerFamilyRunReports* GetReportsForFamily() const {
66+
return reports_for_family;
67+
};
5668

5769
private:
5870
RunResults run_results;
5971

6072
const benchmark::internal::BenchmarkInstance& b;
61-
std::vector<BenchmarkReporter::Run>* complexity_reports;
73+
BenchmarkReporter::PerFamilyRunReports* reports_for_family;
6274

6375
const double min_time;
6476
const int repeats;
6577
const bool has_explicit_iteration_count;
6678

79+
int num_repetitions_done = 0;
80+
6781
std::vector<std::thread> pool;
6882

6983
IterationCount iters; // preserved between repetitions!
@@ -83,14 +97,8 @@ class BenchmarkRunner {
8397
IterationCount PredictNumItersNeeded(const IterationResults& i) const;
8498

8599
bool ShouldReportIterationResults(const IterationResults& i) const;
86-
87-
void DoOneRepetition(int64_t repetition_index);
88100
};
89101

90-
RunResults RunBenchmark(
91-
const benchmark::internal::BenchmarkInstance& b,
92-
std::vector<BenchmarkReporter::Run>* complexity_reports);
93-
94102
} // namespace internal
95103

96104
} // end namespace benchmark

test/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,7 @@ if (BENCHMARK_ENABLE_GTEST_TESTS)
199199

200200
add_gtest(benchmark_gtest)
201201
add_gtest(benchmark_name_gtest)
202+
add_gtest(benchmark_random_interleaving_gtest)
202203
add_gtest(commandlineflags_gtest)
203204
add_gtest(statistics_gtest)
204205
add_gtest(string_util_gtest)

0 commit comments

Comments
 (0)