diff --git a/benchmarks/inc/skewed_allocator.hpp b/benchmarks/inc/skewed_allocator.hpp new file mode 100644 index 00000000000..ad2c31ff4c3 --- /dev/null +++ b/benchmarks/inc/skewed_allocator.hpp @@ -0,0 +1,76 @@ +// Copyright (c) Microsoft Corporation. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include +#include +#include + +template +struct skewed_allocator { + using value_type = T; + static_assert(Alignment % alignof(T) == 0, "Chosen Alignment will produce unaligned T objects"); + static_assert(Skew % alignof(T) == 0, "Chosen Skew will produce unaligned T objects"); + + template + struct rebind { + using other = skewed_allocator; + }; + + skewed_allocator() = default; + template + skewed_allocator(const skewed_allocator&) {} + + template + bool operator==(const skewed_allocator&) const { + return true; + } + + T* allocate(const size_t n) { + const auto p = static_cast(_aligned_malloc(n * sizeof(T) + Skew, Alignment)); + if (!p) { + throw std::bad_alloc{}; + } + return reinterpret_cast(p + Skew); + } + + void deallocate(T* const p, size_t) { + if (p) { + _aligned_free(reinterpret_cast(p) - Skew); + } + } +}; + +// The purpose is to provide consistent behavior for benchmarks. +// 64 would be a reasonable alignment for practical perf uses, +// as it is both the cache line size and the maximum vector instruction size (on x64). +// However, aligning to the page size will provide even more consistency +// by ensuring that the same number of page boundaries is crossed each time. +inline constexpr size_t page_size = 4096; + +// A realistic skew relative to allocation granularity, when a variable is placed +// next to a pointer in a structure or on the stack. Also corresponds to the default packing. +inline constexpr size_t realistic_skew = 8; + +template +using highly_aligned_allocator = skewed_allocator; + +template +using not_highly_aligned_allocator = skewed_allocator; + +#pragma warning(push) +#pragma warning(disable : 4324) // structure was padded due to alignment specifier + +template +struct alignas(page_size) highly_aligned { + T value; +}; + +template +struct alignas(page_size) not_highly_aligned { + char pad[realistic_skew]; + T value; +}; + +#pragma warning(pop) diff --git a/benchmarks/src/swap_ranges.cpp b/benchmarks/src/swap_ranges.cpp index 7e72c7311fc..19d3906e232 100644 --- a/benchmarks/src/swap_ranges.cpp +++ b/benchmarks/src/swap_ranges.cpp @@ -7,13 +7,17 @@ #include #include +#include "skewed_allocator.hpp" + using namespace std; -template +template class Padder> void std_swap(benchmark::State& state) { - T a[N]; + Padder padded_a; + auto& a = padded_a.value; memset(a, 'a', sizeof(a)); - T b[N]; + Padder padded_b; + auto& b = padded_b.value; memset(b, 'b', sizeof(b)); for (auto _ : state) { @@ -23,10 +27,10 @@ void std_swap(benchmark::State& state) { } } -template +template class Alloc> void std_swap_ranges(benchmark::State& state) { - vector a(static_cast(state.range(0)), T{'a'}); - vector b(static_cast(state.range(0)), T{'b'}); + vector> a(static_cast(state.range(0)), T{'a'}); + vector> b(static_cast(state.range(0)), T{'b'}); for (auto _ : state) { swap_ranges(a.begin(), a.end(), b.begin()); @@ -35,18 +39,41 @@ void std_swap_ranges(benchmark::State& state) { } } -BENCHMARK(std_swap<1, uint8_t>); -BENCHMARK(std_swap<5, uint8_t>); -BENCHMARK(std_swap<15, uint8_t>); -BENCHMARK(std_swap<26, uint8_t>); -BENCHMARK(std_swap<38, uint8_t>); -BENCHMARK(std_swap<60, uint8_t>); -BENCHMARK(std_swap<125, uint8_t>); -BENCHMARK(std_swap<800, uint8_t>); -BENCHMARK(std_swap<3000, uint8_t>); -BENCHMARK(std_swap<9000, uint8_t>); - -BENCHMARK(std_swap_ranges) +BENCHMARK(std_swap<1, uint8_t, highly_aligned>); +BENCHMARK(std_swap<5, uint8_t, highly_aligned>); +BENCHMARK(std_swap<15, uint8_t, highly_aligned>); +BENCHMARK(std_swap<26, uint8_t, highly_aligned>); +BENCHMARK(std_swap<38, uint8_t, highly_aligned>); +BENCHMARK(std_swap<60, uint8_t, highly_aligned>); +BENCHMARK(std_swap<125, uint8_t, highly_aligned>); +BENCHMARK(std_swap<800, uint8_t, highly_aligned>); +BENCHMARK(std_swap<3000, uint8_t, highly_aligned>); +BENCHMARK(std_swap<9000, uint8_t, highly_aligned>); + +BENCHMARK(std_swap<1, uint8_t, not_highly_aligned>); +BENCHMARK(std_swap<5, uint8_t, not_highly_aligned>); +BENCHMARK(std_swap<15, uint8_t, not_highly_aligned>); +BENCHMARK(std_swap<26, uint8_t, not_highly_aligned>); +BENCHMARK(std_swap<38, uint8_t, not_highly_aligned>); +BENCHMARK(std_swap<60, uint8_t, not_highly_aligned>); +BENCHMARK(std_swap<125, uint8_t, not_highly_aligned>); +BENCHMARK(std_swap<800, uint8_t, not_highly_aligned>); +BENCHMARK(std_swap<3000, uint8_t, not_highly_aligned>); +BENCHMARK(std_swap<9000, uint8_t, not_highly_aligned>); + +BENCHMARK(std_swap_ranges) + ->Arg(1) + ->Arg(5) + ->Arg(15) + ->Arg(26) + ->Arg(38) + ->Arg(60) + ->Arg(125) + ->Arg(800) + ->Arg(3000) + ->Arg(9000); + +BENCHMARK(std_swap_ranges) ->Arg(1) ->Arg(5) ->Arg(15)