|
91 | 91 |
|
92 | 92 | const rightEffectController = { |
93 | 93 | algo: 'Reduce 3 (Subgroup Reduce)', |
94 | | - currentAlgo: uniform( 0 ), |
| 94 | + currentAlgo: uniform( 3 ), |
95 | 95 | highlight: uniform( 0 ), |
96 | 96 | displayMode: 'Input Element 0', |
97 | 97 | state: 'Run Algo', |
|
179 | 179 | } ) ); |
180 | 180 |
|
// Scalar view of the input: one 'uint' per element. PBO-backed (presumably to allow CPU readback — confirm) and
// named per display side so the left/right panels get distinct storage buffers.
const inputStorage = instancedArray( array, 'uint', size ).setPBO( true ).setName( `Current_${leftSideDisplay ? 'Left' : 'Right'}` );
// vec4-packed view over the same source data, used by the vectorized reduce path so each fetch reads four
// elements at once (see the dot( val, vec4( 1 ) ) horizontal sums below).
const inputVectorizedStorage = instancedArray( array, 'vec4' ).setPBO( true ).setName( `CurrentVectorized_${leftSideDisplay ? 'Left' : 'Right'}` );
// Single-slot atomic counter; judging by its name it accumulates the global reduction total across workgroups.
const atomicAccumulator = instancedArray( new Uint32Array( 1 ), 'uint' ).setPBO( true ).toAtomic();
183 | 184 |
|
184 | 185 | // Reduce 3 Calculations |
|
575 | 576 | } ).compute( 32, [ 32 ] ) |
576 | 577 | ]; |
577 | 578 |
|
/**
 * Builds the "Reduce 4" TSL compute function: a vectorized subgroup reduction.
 * Each thread reads `workPerThread` vec4 values (4 scalars per read), sums them
 * with a dot product, reduces within its subgroup, then the per-subgroup totals
 * are combined through workgroup shared memory into the global atomic accumulator.
 *
 * @param {Object} createReduce4FnProps
 * @param {number} createReduce4FnProps.workgroupSize - Threads per workgroup.
 * @param {number} createReduce4FnProps.workPerThread - vec4 reads per thread.
 * @param {number} createReduce4FnProps.numElements - Total scalar element count of the input.
 * @param {number} createReduce4FnProps.minSubgroupSize - Smallest subgroup size the device may use.
 * @returns The TSL Fn node; callers dispatch it via `.compute( ... )`.
 */
const createReduce4Fn = ( createReduce4FnProps ) => {

	const { workgroupSize, workPerThread, numElements, minSubgroupSize } = createReduce4FnProps;

	// Number of vec4 elements consumed by one workgroup per dispatch.
	const partitionSize = uint( workgroupSize * workPerThread );

	// Length of the vectorized input: the scalar input is packed four-wide.
	const vecSize = uint( Math.ceil( numElements / 4 ) );

	// Workgroups needed to cover the whole vectorized input.
	const numThreadBlocks = Math.ceil( ( numElements / 4 ) / ( workgroupSize * workPerThread ) );

	// Worst-case number of subgroups in a workgroup; sizes the shared reduction array.
	const MAX_REDUCE_SIZE = workgroupSize / minSubgroupSize;

	const fnDef = Fn( () => {

		// Shared workgroup memory: one slot per subgroup for its partial sum.
		const workgroupReductionArray = createSubgroupArray( 'uint', workgroupSize, minSubgroupSize );

		// Get the index of the subgroup within the workgroup.
		const subgroupMetaRank = invocationLocalIndex.div( subgroupSize );

		// Each thread strides across `workPerThread` subgroup-sized chunks, so the
		// subgroup's base offset is scaled by workPerThread before adding the lane index.
		const subgroupOffset = subgroupMetaRank.mul( subgroupSize ).mul( workPerThread );

		// Per workgroup, offset by the number of vec4 elements scanned per workgroup.
		const workgroupOffset = workgroupId.x.mul( partitionSize );

		// Mutable read cursor into the vectorized input (needs .toVar() so addAssign works).
		const startThread = subgroupOffset.add( invocationSubgroupIndex ).add( workgroupOffset ).toVar();

		// Running per-thread sum (mutable, hence .toVar()).
		const subgroupReduction = uint( 0 ).toVar();

		// All workgroups except the last cover a full partition: no bounds check needed.
		If( workgroupId.x.lessThan( numThreadBlocks - 1 ), () => {

			const readIndex = uint( 0 ).toVar();

			Loop( readIndex.lessThan( workPerThread ), () => {

				// Fetch a vectorized element and sum its four lanes via a dot product.
				const val = inputVectorizedStorage.element( startThread );
				subgroupReduction.addAssign( dot( val, vec4( 1 ) ) );

				// Advance the cursor one subgroup-width ahead for the next read.
				startThread.addAssign( subgroupSize );

				readIndex.addAssign( 1 );

			} );

		} );

		// The last workgroup may own a partial partition: mask out-of-range reads to zero.
		If( workgroupId.x.equal( numThreadBlocks - 1 ), () => {

			const readIndex = uint( 0 ).toVar();

			Loop( readIndex.lessThan( workPerThread ), () => {

				const inputValue = inputVectorizedStorage.element( startThread );

				// Out-of-bounds lanes contribute vec4( 0 ) instead of garbage.
				const val = select( startThread.lessThan( vecSize ), inputValue, vec4( 0 ) );
				subgroupReduction.addAssign( dot( val, vec4( 1 ) ) );

				startThread.addAssign( subgroupSize );

				readIndex.addAssign( 1 );

			} );

		} );

		// Collapse the per-thread sums within each subgroup.
		subgroupReduction.assign( subgroupAdd( subgroupReduction ) );

		// Delegate one thread per subgroup to publish its subgroup total to shared memory.
		If( invocationSubgroupIndex.equal( 0 ), () => {

			workgroupReductionArray.element( subgroupMetaRank ).assign( subgroupReduction );

		} );

		// Ensure every subgroup has populated the shared array before reducing it.
		// NOTE(review): assumes workgroupBarrier/atomicAdd are imported from three/tsl
		// alongside the other TSL nodes used here — confirm against the file's imports.
		workgroupBarrier();

		// The first MAX_REDUCE_SIZE threads fold the per-subgroup totals; thread 0
		// then adds the workgroup's total into the global atomic accumulator.
		If( invocationLocalIndex.lessThan( MAX_REDUCE_SIZE ), () => {

			const workgroupReduction = subgroupAdd( workgroupReductionArray.element( invocationLocalIndex ) );

			If( invocationLocalIndex.equal( 0 ), () => {

				atomicAdd( atomicAccumulator.element( 0 ), workgroupReduction );

			} );

		} );

	} );

	return fnDef;

};
// Compute dispatches for the baseline variant, run over the full input size.
// (Named "incorrect" — presumably a deliberately-wrong reference implementation
// used for comparison in the demo; confirm against the rest of the file.)
const incorrectBaselineCalls = [
	createIncorrectBaselineFn().compute( size ),
];
|
0 commit comments