3535
3636#include " ir/find_all.h"
3737#include " pass.h"
38- #include " support/topological_sort .h"
38+ #include " support/topological_orders .h"
3939#include " wasm.h"
4040
4141namespace wasm {
@@ -75,8 +75,8 @@ struct ReorderGlobals : public Pass {
7575
7676 // For efficiency we will use global indices rather than names. That is, we
7777 // use the index of the global in the original ordering to identify each
78- // global. A different ordering is then a vector of new indices, saying where
79- // each one moves , which is logically a mapping between indices.
78+ // global. A different ordering is then a vector of old indices, saying where
79+ // each element comes from , which is logically a mapping between indices.
8080 using IndexIndexMap = std::vector<Index>;
8181
8282 // We will also track counts of uses for each global. We use floating-point
@@ -190,26 +190,16 @@ struct ReorderGlobals : public Pass {
190190 double const EXPONENTIAL_FACTOR = 0.095 ;
191191 IndexCountMap sumCounts (globals.size ()), exponentialCounts (globals.size ());
192192
193- struct Sort : public TopologicalSort <Index, Sort> {
194- const Dependencies& deps;
195-
196- Sort (Index numGlobals, const Dependencies& deps) : deps(deps) {
197- for (Index i = 0 ; i < numGlobals; i++) {
198- push (i);
193+ std::vector<std::vector<size_t >> dependenceGraph (globals.size ());
194+ for (size_t i = 0 ; i < globals.size (); ++i) {
195+ if (auto it = deps.dependsOn .find (i); it != deps.dependsOn .end ()) {
196+ for (auto dep : it->second ) {
197+ dependenceGraph[i].push_back (dep);
199198 }
200199 }
200+ }
201201
202- void pushPredecessors (Index global) {
203- auto iter = deps.dependedUpon .find (global);
204- if (iter == deps.dependedUpon .end ()) {
205- return ;
206- }
207- for (auto dep : iter->second ) {
208- push (dep);
209- }
210- }
211- } sort (globals.size (), deps);
212-
202+ auto sort = *TopologicalSort (dependenceGraph);
213203 for (auto global : sort) {
214204 // We can compute this global's count as in the sorted order all the
215205 // values it cares about are resolved. Start with the self-count, then
@@ -236,160 +226,91 @@ struct ReorderGlobals : public Pass {
236226 }
237227
238228 // Apply the indices we computed.
239- std::vector<std::unique_ptr<Global>> old ( std::move (globals) );
229+ auto old = std::move (globals);
240230 globals.resize (old.size ());
241231 for (Index i = 0 ; i < old.size (); i++) {
242- globals[(*best)[i]] = std::move (old[i ]);
232+ globals[i] = std::move (old[(*best)[i] ]);
243233 }
244234 module ->updateMaps ();
245235 }
246236
247237 IndexIndexMap doSort (const IndexCountMap& counts,
248- const Dependencies& originalDeps ,
238+ const Dependencies& deps ,
249239 Module* module ) {
250240 auto & globals = module ->globals ;
251241
252- // Copy the deps as we will operate on them as we go.
253- auto deps = originalDeps;
254-
255242 // To sort the globals we do a simple greedy approach of always picking the
256243 // global with the highest count at every point in time, subject to the
257244 // constraint that we can only emit globals that have all of their
258- // dependencies already emitted. To do so we keep a list of the "available"
259- // globals, which are those with no remaining dependencies. Then by keeping
260- // the list of available globals in heap form we can simply pop the largest
261- // from the heap each time, and add new available ones as they become so.
245+ // dependencies already emitted.
262246 //
263- // Other approaches here could be to do a topological sort, but the optimal
264- // order may not require strict ordering by topological depth, e.g.:
265- /*
266- // $c - $a
267- // /
268- // $e
269- // \
270- // $d - $b
271- */
272- // Here $e depends on $c and $d, $c depends on $a, and $d on $b. This is a
273- // partial order, as $d can be before or after $a, for example. As a result,
274- // if we sorted topologically by sub-trees here then we'd keep $c and $a
275- // together, and $d and $b, but a better order might interleave them. A good
276- // order also may not keep topological depths separated, e.g. we may want to
277- // put $a in between $c and $d despite it having a greater depth.
247+ // The greedy approach here may also be suboptimal, however. Consider that
248+ // we might see that the best available global is $a, but if we instead
249+ // selected some other global $b, that would allow us to select a third
250+ // global $c that depends on $b, and $c might have a much higher use count
251+ // than $a. For that reason we try several variations of this with different
252+ // counts, see earlier.
278253 //
279- // The greedy approach here may also be unoptimal, however. Consider that we
280- // might see that the best available global is $a, but if we popped $b
281- // instead that could unlock $c which depends on $b, and $c may have a much
282- // higher use count than $a. For that reason we try several variations of
283- // this with different counts, see earlier.
284- std::vector<Index> availableHeap;
285-
286- // Comparison function. Given a and b, returns if a should be before b. This
287- // is used in a heap, where "highest" means "popped first", so see the notes
288- // below on how we order.
289- auto cmp = [&](Index a, Index b) {
290- // Imports always go first. The binary writer takes care of this itself
291- // anyhow, but it is better to do it here in the IR so we can actually
292- // see what the final layout will be.
293- auto aImported = globals[a]->imported ();
294- auto bImported = globals[b]->imported ();
295- // The highest items will be popped first off the heap, so we want imports
296- // to be at higher indexes, that is,
297- //
298- // unimported, unimported, imported, imported.
299- //
300- // Then the imports are popped first.
301- if (aImported != bImported) {
302- return bImported;
303- }
304-
305- // Sort by the counts. We want higher counts at higher indexes so they are
306- // popped first, that is,
307- //
308- // 10, 20, 30, 40
309- //
310- auto aCount = counts[a];
311- auto bCount = counts[b];
312- if (aCount != bCount) {
313- return aCount < bCount;
314- }
315-
316- // Break ties using the original order, which means just using the
317- // indices we have. We need lower indexes at the top so they are popped
318- // first, that is,
319- //
320- // 3, 2, 1, 0
321- //
322- return a > b;
323- };
324-
325- // Push an item that just became available to the available heap.
326- auto push = [&](Index global) {
327- availableHeap.push_back (global);
328- std::push_heap (availableHeap.begin (), availableHeap.end (), cmp);
329- };
330-
331- // The initially available globals are those with no dependencies.
332- for (Index i = 0 ; i < globals.size (); i++) {
333- if (deps.dependsOn [i].empty ()) {
334- push (i);
335- }
254+ // Sort the globals into the optimal order based on the counts, ignoring
255+ // dependencies for now.
256+ std::vector<Index> sortedGlobals;
257+ sortedGlobals.resize (globals.size ());
258+ for (Index i = 0 ; i < globals.size (); ++i) {
259+ sortedGlobals[i] = i;
336260 }
261+ std::sort (
262+ sortedGlobals.begin (), sortedGlobals.end (), [&](Index a, Index b) {
263+ // Imports always go first. The binary writer takes care of this itself
264+ // anyhow, but it is better to do it here in the IR so we can actually
265+ // see what the final layout will be.
266+ auto aImported = globals[a]->imported ();
267+ auto bImported = globals[b]->imported ();
268+ if (aImported != bImported) {
269+ return aImported;
270+ }
337271
338- // Pop off the heap: Emit the global and its final, sorted index. Keep
339- // doing that until we finish processing all the globals.
340- IndexIndexMap sortedindices (globals.size ());
341- Index numSortedindices = 0 ;
342- while (!availableHeap.empty ()) {
343- std::pop_heap (availableHeap.begin (), availableHeap.end (), cmp);
344- auto global = availableHeap.back ();
345- sortedindices[global] = numSortedindices++;
346- availableHeap.pop_back ();
347-
348- // Each time we pop we emit the global, which means anything that only
349- // depended on it becomes available to be popped as well.
350- for (auto other : deps.dependedUpon [global]) {
351- assert (deps.dependsOn [other].count (global));
352- deps.dependsOn [other].erase (global);
353- if (deps.dependsOn [other].empty ()) {
354- push (other);
272+ // Sort by the counts. Higher counts come first.
273+ auto aCount = counts[a];
274+ auto bCount = counts[b];
275+ if (aCount != bCount) {
276+ return aCount > bCount;
355277 }
278+
279+ // Break ties using the original order, which means just using the
280+ // indices we have.
281+ return a < b;
282+ });
283+
284+ // Now use that optimal order to create an ordered graph that includes the
285+ // dependencies. The final order will be the minimum topological sort of
286+ // this graph.
287+ std::vector<std::pair<Index, std::vector<Index>>> graph;
288+ graph.reserve (globals.size ());
289+ for (auto i : sortedGlobals) {
290+ std::vector<Index> children;
291+ if (auto it = deps.dependedUpon .find (i); it != deps.dependedUpon .end ()) {
292+ children = std::vector<Index>(it->second .begin (), it->second .end ());
356293 }
294+ graph.emplace_back (i, std::move (children));
357295 }
358296
359- // All globals must have been handled. Cycles would prevent this, but they
360- // cannot exist in valid IR.
361- assert (numSortedindices == globals.size ());
362-
363- return sortedindices;
297+ return *MinTopologicalSortOf<Index>(graph.begin (), graph.end ());
364298 }
365299
366300 // Given an indexing of the globals and the counts of how many times each is
367301 // used, estimate the size of relevant parts of the wasm binary (that is, of
368302 // LEBs in global.gets).
369303 double computeSize (IndexIndexMap& indices, IndexCountMap& counts) {
370- // |indices| maps each old index to its new position in the sort. We need
371- // the reverse map here, which at index 0 has the old index of the global
372- // that will be first, and so forth.
373- IndexIndexMap actualOrder (indices.size ());
374- for (Index i = 0 ; i < indices.size (); i++) {
375- // Each global has a unique index, so we only replace 0's here, and they
376- // must be in bounds.
377- assert (indices[i] < indices.size ());
378- assert (actualOrder[indices[i]] == 0 );
379-
380- actualOrder[indices[i]] = i;
381- }
382-
383304 if (always) {
384305 // In this mode we gradually increase the cost of later globals, in an
385306 // unrealistic but smooth manner.
386307 double total = 0 ;
387- for (Index i = 0 ; i < actualOrder .size (); i++) {
308+ for (Index i = 0 ; i < indices .size (); i++) {
388309 // Multiply the count for this global by a smoothed LEB factor, which
389310 // starts at 1 (for 1 byte) at index 0, and then increases linearly with
390311 // i, so that after 128 globals we reach 2 (which is the true index at
391312 // which the LEB size normally jumps from 1 to 2), and so forth.
392- total += counts[actualOrder [i]] * (1.0 + (i / 128.0 ));
313+ total += counts[indices [i]] * (1.0 + (i / 128.0 ));
393314 }
394315 return total;
395316 }
@@ -401,7 +322,7 @@ struct ReorderGlobals : public Pass {
401322 // forth.
402323 size_t sizeInBits = 0 ;
403324 size_t nextSizeIncrease = 0 ;
404- for (Index i = 0 ; i < actualOrder .size (); i++) {
325+ for (Index i = 0 ; i < indices .size (); i++) {
405326 if (i == nextSizeIncrease) {
406327 sizeInBits++;
407328 // At the current size we have 7 * sizeInBits bits to use. For example,
@@ -410,7 +331,7 @@ struct ReorderGlobals : public Pass {
410331 // larger LEB.
411332 nextSizeIncrease = 1 << (7 * sizeInBits);
412333 }
413- total += counts[actualOrder [i]] * sizeInBits;
334+ total += counts[indices [i]] * sizeInBits;
414335 }
415336 return total;
416337 }
0 commit comments