Skip to content

Commit 871ff0d

Browse files
authored
Simplify ReorderGlobals using new topological sort utils (#6885)
Use the new TopologicalSort and MinTopologicalSortOf utilities instead of the old CRTP topological sort utility and a bespoke heap-based topological sort in ReorderGlobals. Since there is no longer a heap to pop from, the direction of the custom comparator is now much more intuitive. Further simplify the code by switching from tracking the new order of globals using a sequence of new indices to tracking the order using a sequence of old indices. This change also makes the pass about 20% faster on a large real-world module.
1 parent b63aead commit 871ff0d

1 file changed

Lines changed: 62 additions & 141 deletions

File tree

src/passes/ReorderGlobals.cpp

Lines changed: 62 additions & 141 deletions
Original file line number | Diff line number | Diff line change
@@ -35,7 +35,7 @@
3535

3636
#include "ir/find_all.h"
3737
#include "pass.h"
38-
#include "support/topological_sort.h"
38+
#include "support/topological_orders.h"
3939
#include "wasm.h"
4040

4141
namespace wasm {
@@ -75,8 +75,8 @@ struct ReorderGlobals : public Pass {
7575

7676
// For efficiency we will use global indices rather than names. That is, we
7777
// use the index of the global in the original ordering to identify each
78-
// global. A different ordering is then a vector of new indices, saying where
79-
// each one moves, which is logically a mapping between indices.
78+
// global. A different ordering is then a vector of old indices, saying where
79+
// each element comes from, which is logically a mapping between indices.
8080
using IndexIndexMap = std::vector<Index>;
8181

8282
// We will also track counts of uses for each global. We use floating-point
@@ -190,26 +190,16 @@ struct ReorderGlobals : public Pass {
190190
double const EXPONENTIAL_FACTOR = 0.095;
191191
IndexCountMap sumCounts(globals.size()), exponentialCounts(globals.size());
192192

193-
struct Sort : public TopologicalSort<Index, Sort> {
194-
const Dependencies& deps;
195-
196-
Sort(Index numGlobals, const Dependencies& deps) : deps(deps) {
197-
for (Index i = 0; i < numGlobals; i++) {
198-
push(i);
193+
std::vector<std::vector<size_t>> dependenceGraph(globals.size());
194+
for (size_t i = 0; i < globals.size(); ++i) {
195+
if (auto it = deps.dependsOn.find(i); it != deps.dependsOn.end()) {
196+
for (auto dep : it->second) {
197+
dependenceGraph[i].push_back(dep);
199198
}
200199
}
200+
}
201201

202-
void pushPredecessors(Index global) {
203-
auto iter = deps.dependedUpon.find(global);
204-
if (iter == deps.dependedUpon.end()) {
205-
return;
206-
}
207-
for (auto dep : iter->second) {
208-
push(dep);
209-
}
210-
}
211-
} sort(globals.size(), deps);
212-
202+
auto sort = *TopologicalSort(dependenceGraph);
213203
for (auto global : sort) {
214204
// We can compute this global's count as in the sorted order all the
215205
// values it cares about are resolved. Start with the self-count, then
@@ -236,160 +226,91 @@ struct ReorderGlobals : public Pass {
236226
}
237227

238228
// Apply the indices we computed.
239-
std::vector<std::unique_ptr<Global>> old(std::move(globals));
229+
auto old = std::move(globals);
240230
globals.resize(old.size());
241231
for (Index i = 0; i < old.size(); i++) {
242-
globals[(*best)[i]] = std::move(old[i]);
232+
globals[i] = std::move(old[(*best)[i]]);
243233
}
244234
module->updateMaps();
245235
}
246236

247237
IndexIndexMap doSort(const IndexCountMap& counts,
248-
const Dependencies& originalDeps,
238+
const Dependencies& deps,
249239
Module* module) {
250240
auto& globals = module->globals;
251241

252-
// Copy the deps as we will operate on them as we go.
253-
auto deps = originalDeps;
254-
255242
// To sort the globals we do a simple greedy approach of always picking the
256243
// global with the highest count at every point in time, subject to the
257244
// constraint that we can only emit globals that have all of their
258-
// dependencies already emitted. To do so we keep a list of the "available"
259-
// globals, which are those with no remaining dependencies. Then by keeping
260-
// the list of available globals in heap form we can simply pop the largest
261-
// from the heap each time, and add new available ones as they become so.
245+
// dependencies already emitted.
262246
//
263-
// Other approaches here could be to do a topological sort, but the optimal
264-
// order may not require strict ordering by topological depth, e.g.:
265-
/*
266-
// $c - $a
267-
// /
268-
// $e
269-
// \
270-
// $d - $b
271-
*/
272-
// Here $e depends on $c and $d, $c depends on $a, and $d on $b. This is a
273-
// partial order, as $d can be before or after $a, for example. As a result,
274-
// if we sorted topologically by sub-trees here then we'd keep $c and $a
275-
// together, and $d and $b, but a better order might interleave them. A good
276-
// order also may not keep topological depths separated, e.g. we may want to
277-
// put $a in between $c and $d despite it having a greater depth.
247+
// The greedy approach here may also be suboptimal, however. Consider that
248+
// we might see that the best available global is $a, but if we instead
249+
// selected some other global $b, that would allow us to select a third
250+
// global $c that depends on $b, and $c might have a much higher use count
251+
// than $a. For that reason we try several variations of this with different
252+
// counts, see earlier.
278253
//
279-
// The greedy approach here may also be unoptimal, however. Consider that we
280-
// might see that the best available global is $a, but if we popped $b
281-
// instead that could unlock $c which depends on $b, and $c may have a much
282-
// higher use count than $a. For that reason we try several variations of
283-
// this with different counts, see earlier.
284-
std::vector<Index> availableHeap;
285-
286-
// Comparison function. Given a and b, returns if a should be before b. This
287-
// is used in a heap, where "highest" means "popped first", so see the notes
288-
// below on how we order.
289-
auto cmp = [&](Index a, Index b) {
290-
// Imports always go first. The binary writer takes care of this itself
291-
// anyhow, but it is better to do it here in the IR so we can actually
292-
// see what the final layout will be.
293-
auto aImported = globals[a]->imported();
294-
auto bImported = globals[b]->imported();
295-
// The highest items will be popped first off the heap, so we want imports
296-
// to be at higher indexes, that is,
297-
//
298-
// unimported, unimported, imported, imported.
299-
//
300-
// Then the imports are popped first.
301-
if (aImported != bImported) {
302-
return bImported;
303-
}
304-
305-
// Sort by the counts. We want higher counts at higher indexes so they are
306-
// popped first, that is,
307-
//
308-
// 10, 20, 30, 40
309-
//
310-
auto aCount = counts[a];
311-
auto bCount = counts[b];
312-
if (aCount != bCount) {
313-
return aCount < bCount;
314-
}
315-
316-
// Break ties using the original order, which means just using the
317-
// indices we have. We need lower indexes at the top so they are popped
318-
// first, that is,
319-
//
320-
// 3, 2, 1, 0
321-
//
322-
return a > b;
323-
};
324-
325-
// Push an item that just became available to the available heap.
326-
auto push = [&](Index global) {
327-
availableHeap.push_back(global);
328-
std::push_heap(availableHeap.begin(), availableHeap.end(), cmp);
329-
};
330-
331-
// The initially available globals are those with no dependencies.
332-
for (Index i = 0; i < globals.size(); i++) {
333-
if (deps.dependsOn[i].empty()) {
334-
push(i);
335-
}
254+
// Sort the globals into the optimal order based on the counts, ignoring
255+
// dependencies for now.
256+
std::vector<Index> sortedGlobals;
257+
sortedGlobals.resize(globals.size());
258+
for (Index i = 0; i < globals.size(); ++i) {
259+
sortedGlobals[i] = i;
336260
}
261+
std::sort(
262+
sortedGlobals.begin(), sortedGlobals.end(), [&](Index a, Index b) {
263+
// Imports always go first. The binary writer takes care of this itself
264+
// anyhow, but it is better to do it here in the IR so we can actually
265+
// see what the final layout will be.
266+
auto aImported = globals[a]->imported();
267+
auto bImported = globals[b]->imported();
268+
if (aImported != bImported) {
269+
return aImported;
270+
}
337271

338-
// Pop off the heap: Emit the global and its final, sorted index. Keep
339-
// doing that until we finish processing all the globals.
340-
IndexIndexMap sortedindices(globals.size());
341-
Index numSortedindices = 0;
342-
while (!availableHeap.empty()) {
343-
std::pop_heap(availableHeap.begin(), availableHeap.end(), cmp);
344-
auto global = availableHeap.back();
345-
sortedindices[global] = numSortedindices++;
346-
availableHeap.pop_back();
347-
348-
// Each time we pop we emit the global, which means anything that only
349-
// depended on it becomes available to be popped as well.
350-
for (auto other : deps.dependedUpon[global]) {
351-
assert(deps.dependsOn[other].count(global));
352-
deps.dependsOn[other].erase(global);
353-
if (deps.dependsOn[other].empty()) {
354-
push(other);
272+
// Sort by the counts. Higher counts come first.
273+
auto aCount = counts[a];
274+
auto bCount = counts[b];
275+
if (aCount != bCount) {
276+
return aCount > bCount;
355277
}
278+
279+
// Break ties using the original order, which means just using the
280+
// indices we have.
281+
return a < b;
282+
});
283+
284+
// Now use that optimal order to create an ordered graph that includes the
285+
// dependencies. The final order will be the minimum topological sort of
286+
// this graph.
287+
std::vector<std::pair<Index, std::vector<Index>>> graph;
288+
graph.reserve(globals.size());
289+
for (auto i : sortedGlobals) {
290+
std::vector<Index> children;
291+
if (auto it = deps.dependedUpon.find(i); it != deps.dependedUpon.end()) {
292+
children = std::vector<Index>(it->second.begin(), it->second.end());
356293
}
294+
graph.emplace_back(i, std::move(children));
357295
}
358296

359-
// All globals must have been handled. Cycles would prevent this, but they
360-
// cannot exist in valid IR.
361-
assert(numSortedindices == globals.size());
362-
363-
return sortedindices;
297+
return *MinTopologicalSortOf<Index>(graph.begin(), graph.end());
364298
}
365299

366300
// Given an indexing of the globals and the counts of how many times each is
367301
// used, estimate the size of relevant parts of the wasm binary (that is, of
368302
// LEBs in global.gets).
369303
double computeSize(IndexIndexMap& indices, IndexCountMap& counts) {
370-
// |indices| maps each old index to its new position in the sort. We need
371-
// the reverse map here, which at index 0 has the old index of the global
372-
// that will be first, and so forth.
373-
IndexIndexMap actualOrder(indices.size());
374-
for (Index i = 0; i < indices.size(); i++) {
375-
// Each global has a unique index, so we only replace 0's here, and they
376-
// must be in bounds.
377-
assert(indices[i] < indices.size());
378-
assert(actualOrder[indices[i]] == 0);
379-
380-
actualOrder[indices[i]] = i;
381-
}
382-
383304
if (always) {
384305
// In this mode we gradually increase the cost of later globals, in an
385306
// unrealistic but smooth manner.
386307
double total = 0;
387-
for (Index i = 0; i < actualOrder.size(); i++) {
308+
for (Index i = 0; i < indices.size(); i++) {
388309
// Multiply the count for this global by a smoothed LEB factor, which
389310
// starts at 1 (for 1 byte) at index 0, and then increases linearly with
390311
// i, so that after 128 globals we reach 2 (which is the true index at
391312
// which the LEB size normally jumps from 1 to 2), and so forth.
392-
total += counts[actualOrder[i]] * (1.0 + (i / 128.0));
313+
total += counts[indices[i]] * (1.0 + (i / 128.0));
393314
}
394315
return total;
395316
}
@@ -401,7 +322,7 @@ struct ReorderGlobals : public Pass {
401322
// forth.
402323
size_t sizeInBits = 0;
403324
size_t nextSizeIncrease = 0;
404-
for (Index i = 0; i < actualOrder.size(); i++) {
325+
for (Index i = 0; i < indices.size(); i++) {
405326
if (i == nextSizeIncrease) {
406327
sizeInBits++;
407328
// At the current size we have 7 * sizeInBits bits to use. For example,
@@ -410,7 +331,7 @@ struct ReorderGlobals : public Pass {
410331
// larger LEB.
411332
nextSizeIncrease = 1 << (7 * sizeInBits);
412333
}
413-
total += counts[actualOrder[i]] * sizeInBits;
334+
total += counts[indices[i]] * sizeInBits;
414335
}
415336
return total;
416337
}

0 commit comments

Comments
 (0)