Skip to content

Commit 7bd68df

Browse files
authored
Loop Alignment support for Arm64 (#60135)
* Enable FEATURE_LOOP_ALIGN for Arm64
* basic loop alignment for arm64
* misc changes
* perf score should account for align
* some fixes
* updated some asserts
* jit format
* Fix test cases
* Misc changes
* jit format
* Review comments
1 parent 2616e76 commit 7bd68df

File tree

9 files changed

+249
-74
lines changed

9 files changed

+249
-74
lines changed

src/coreclr/jit/compiler.cpp

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2677,16 +2677,38 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
26772677
opts.compJitAlignLoopMaxCodeSize = DEFAULT_MAX_LOOPSIZE_FOR_ALIGN;
26782678
#endif
26792679

2680+
#ifdef TARGET_XARCH
26802681
if (opts.compJitAlignLoopAdaptive)
26812682
{
2683+
// For adaptive alignment, padding limit is equal to the max instruction encoding
2684+
// size which is 15 bytes. Hence (32 >> 1) - 1 = 15 bytes.
26822685
opts.compJitAlignPaddingLimit = (opts.compJitAlignLoopBoundary >> 1) - 1;
26832686
}
26842687
else
26852688
{
2689+
// For non-adaptive alignment, padding limit is 1 less than the alignment boundary
2690+
// specified.
26862691
opts.compJitAlignPaddingLimit = opts.compJitAlignLoopBoundary - 1;
26872692
}
2693+
#elif TARGET_ARM64
2694+
if (opts.compJitAlignLoopAdaptive)
2695+
{
2696+
// For adaptive alignment, padding limit is half of the alignment
2697+
// boundary because all instructions are 4 bytes long. Hence (32 >> 1) = 16 bytes.
2698+
opts.compJitAlignPaddingLimit = (opts.compJitAlignLoopBoundary >> 1);
2699+
}
2700+
else
2701+
{
2702+
// For non-adaptive alignment, padding limit is the same as the specified alignment boundary.
2703+
opts.compJitAlignPaddingLimit = opts.compJitAlignLoopBoundary;
2704+
}
2705+
#endif
26882706

26892707
assert(isPow2(opts.compJitAlignLoopBoundary));
2708+
#ifdef TARGET_ARM64
2709+
// The minimum encoding size for Arm64 is 4 bytes.
2710+
assert(opts.compJitAlignLoopBoundary >= 4);
2711+
#endif
26902712

26912713
#if REGEN_SHORTCUTS || REGEN_CALLPAT
26922714
// We never want to have debugging enabled when regenerating GC encoding patterns

src/coreclr/jit/emit.cpp

Lines changed: 138 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4796,6 +4796,81 @@ void emitter::emitJumpDistBind()
47964796

47974797
#if FEATURE_LOOP_ALIGN
47984798

4799+
//-----------------------------------------------------------------------------
4800+
//
4801+
// The next instruction will be a loop head entry point
4802+
// So insert an alignment instruction here to ensure that
4803+
// we can properly align the code.
4804+
//
4805+
void emitter::emitLoopAlign(unsigned short paddingBytes)
4806+
{
4807+
/* Insert a pseudo-instruction to ensure that we align
4808+
the next instruction properly */
4809+
instrDescAlign* id = emitNewInstrAlign();
4810+
4811+
#if defined(TARGET_XARCH)
4812+
assert(paddingBytes <= MAX_ENCODED_SIZE);
4813+
id->idCodeSize(paddingBytes);
4814+
#elif defined(TARGET_ARM64)
4815+
assert(paddingBytes == INSTR_ENCODED_SIZE);
4816+
#endif
4817+
4818+
id->idaIG = emitCurIG;
4819+
4820+
/* Append this instruction to this IG's alignment list */
4821+
id->idaNext = emitCurIGAlignList;
4822+
4823+
emitCurIGsize += paddingBytes;
4824+
4825+
dispIns(id);
4826+
emitCurIGAlignList = id;
4827+
}
4828+
4829+
//-----------------------------------------------------------------------------
4830+
//
4831+
// The next instruction will be a loop head entry point
4832+
// So insert alignment instruction(s) here to ensure that
4833+
// we can properly align the code.
4834+
//
4835+
// This emits more than one `INS_align` instruction depending on the
4836+
// alignmentBoundary parameter.
4837+
//
4838+
void emitter::emitLongLoopAlign(unsigned short alignmentBoundary)
4839+
{
4840+
#if defined(TARGET_XARCH)
4841+
unsigned short nPaddingBytes = alignmentBoundary - 1;
4842+
unsigned short nAlignInstr = (nPaddingBytes + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE;
4843+
unsigned short insAlignCount = nPaddingBytes / MAX_ENCODED_SIZE;
4844+
unsigned short lastInsAlignSize = nPaddingBytes % MAX_ENCODED_SIZE;
4845+
unsigned short paddingBytes = MAX_ENCODED_SIZE;
4846+
#elif defined(TARGET_ARM64)
4847+
unsigned short nAlignInstr = alignmentBoundary / INSTR_ENCODED_SIZE;
4848+
unsigned short insAlignCount = nAlignInstr;
4849+
unsigned short paddingBytes = INSTR_ENCODED_SIZE;
4850+
#endif
4851+
4852+
unsigned short instrDescSize = nAlignInstr * sizeof(instrDescAlign);
4853+
4854+
// Ensure that all align instructions fall in same IG.
4855+
if (emitCurIGfreeNext + instrDescSize >= emitCurIGfreeEndp)
4856+
{
4857+
emitForceNewIG = true;
4858+
}
4859+
4860+
/* Insert a pseudo-instruction to ensure that we align
4861+
the next instruction properly */
4862+
4863+
while (insAlignCount)
4864+
{
4865+
emitLoopAlign(paddingBytes);
4866+
insAlignCount--;
4867+
}
4868+
4869+
#if defined(TARGET_XARCH)
4870+
emitLoopAlign(lastInsAlignSize);
4871+
#endif
4872+
}
4873+
47994874
//-----------------------------------------------------------------------------
48004875
// emitLoopAlignment: Insert an align instruction at the end of emitCurIG and
48014876
// mark it as IGF_LOOP_ALIGN to indicate that next IG is a
@@ -4805,6 +4880,9 @@ void emitter::emitLoopAlignment()
48054880
{
48064881
unsigned short paddingBytes;
48074882

4883+
#if defined(TARGET_XARCH)
4884+
// For xarch, each align instruction can be a maximum of MAX_ENCODED_SIZE bytes, and if
4885+
// more padding is needed, multiple MAX_ENCODED_SIZE bytes instructions are added.
48084886
if ((emitComp->opts.compJitAlignLoopBoundary > 16) && (!emitComp->opts.compJitAlignLoopAdaptive))
48094887
{
48104888
paddingBytes = emitComp->opts.compJitAlignLoopBoundary;
@@ -4815,6 +4893,19 @@ void emitter::emitLoopAlignment()
48154893
paddingBytes = MAX_ENCODED_SIZE;
48164894
emitLoopAlign(paddingBytes);
48174895
}
4896+
#elif defined(TARGET_ARM64)
4897+
// For Arm64, each align instruction is 4-bytes long because of fixed-length encoding.
4898+
// The padding added will always be a multiple of 4 bytes.
4899+
if (emitComp->opts.compJitAlignLoopAdaptive)
4900+
{
4901+
paddingBytes = emitComp->opts.compJitAlignLoopBoundary >> 1;
4902+
}
4903+
else
4904+
{
4905+
paddingBytes = emitComp->opts.compJitAlignLoopBoundary;
4906+
}
4907+
emitLongLoopAlign(paddingBytes);
4908+
#endif
48184909

48194910
// Mark this IG as needing alignment so that during emission we can check the instruction count heuristics of
48204911
// all IGs that follow this IG and participate in a loop.
@@ -5042,6 +5133,7 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock)
50425133
{
50435134
assert(!markedLastLoop);
50445135
assert(alignInstr->idaIG->isLoopAlign());
5136+
50455137
alignInstr->idaIG->igFlags &= ~IGF_LOOP_ALIGN;
50465138
markedLastLoop = true;
50475139
JITDUMP("** Skip alignment for aligned loop IG%02u ~ IG%02u because it encloses the current loop "
@@ -5054,6 +5146,20 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock)
50545146
break;
50555147
}
50565148

5149+
#if defined(TARGET_XARCH)
5150+
if (!emitComp->opts.compJitAlignLoopAdaptive)
5151+
#endif
5152+
{
5153+
// If there are multiple align instructions, skip the align instructions after
5154+
// the first align instruction and fast forward to the next IG
5155+
insGroup* alignIG = alignInstr->idaIG;
5156+
while ((alignInstr != nullptr) && (alignInstr->idaNext != nullptr) &&
5157+
(alignInstr->idaNext->idaIG == alignIG))
5158+
{
5159+
alignInstr = alignInstr->idaNext;
5160+
}
5161+
}
5162+
50575163
alignInstr = alignInstr->idaNext;
50585164
}
50595165

@@ -5126,26 +5232,45 @@ void emitter::emitLoopAlignAdjustments()
51265232
alignIG->igFlags &= ~IGF_LOOP_ALIGN;
51275233
}
51285234

5235+
#ifdef TARGET_XARCH
51295236
if (emitComp->opts.compJitAlignLoopAdaptive)
51305237
{
51315238
assert(actualPaddingNeeded < MAX_ENCODED_SIZE);
51325239
alignInstr->idCodeSize(actualPaddingNeeded);
51335240
}
51345241
else
5242+
#endif
51355243
{
51365244
unsigned paddingToAdj = actualPaddingNeeded;
51375245

51385246
#ifdef DEBUG
5247+
#if defined(TARGET_XARCH)
51395248
int instrAdjusted =
51405249
(emitComp->opts.compJitAlignLoopBoundary + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE;
5141-
#endif
5250+
#elif defined(TARGET_ARM64)
5251+
unsigned short instrAdjusted = (emitComp->opts.compJitAlignLoopBoundary >> 1) / INSTR_ENCODED_SIZE;
5252+
if (!emitComp->opts.compJitAlignLoopAdaptive)
5253+
{
5254+
instrAdjusted = emitComp->opts.compJitAlignLoopBoundary / INSTR_ENCODED_SIZE;
5255+
}
5256+
#endif // TARGET_XARCH & TARGET_ARM64
5257+
#endif // DEBUG
51425258
// Adjust the padding amount in all align instructions in this IG
51435259
instrDescAlign *alignInstrToAdj = alignInstr, *prevAlignInstr = nullptr;
51445260
for (; alignInstrToAdj != nullptr && alignInstrToAdj->idaIG == alignInstr->idaIG;
51455261
alignInstrToAdj = alignInstrToAdj->idaNext)
51465262
{
5263+
5264+
#if defined(TARGET_XARCH)
51475265
unsigned newPadding = min(paddingToAdj, MAX_ENCODED_SIZE);
51485266
alignInstrToAdj->idCodeSize(newPadding);
5267+
#elif defined(TARGET_ARM64)
5268+
unsigned newPadding = min(paddingToAdj, INSTR_ENCODED_SIZE);
5269+
if (newPadding == 0)
5270+
{
5271+
alignInstrToAdj->idInsOpt(INS_OPTS_NONE);
5272+
}
5273+
#endif
51495274
paddingToAdj -= newPadding;
51505275
prevAlignInstr = alignInstrToAdj;
51515276
#ifdef DEBUG
@@ -5191,7 +5316,7 @@ void emitter::emitLoopAlignAdjustments()
51915316
}
51925317

51935318
//-----------------------------------------------------------------------------
5194-
// emitCalculatePaddingForLoopAlignment: Calculate the padding to insert at the
5319+
// emitCalculatePaddingForLoopAlignment: Calculate the padding amount to insert at the
51955320
// end of 'ig' so the loop that starts after 'ig' is aligned.
51965321
//
51975322
// Arguments:
@@ -5268,16 +5393,25 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs
52685393
if (emitComp->opts.compJitAlignLoopAdaptive)
52695394
{
52705395
// adaptive loop alignment
5271-
unsigned nMaxPaddingBytes = (1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1)) - 1;
5272-
unsigned nPaddingBytes = (-(int)(size_t)offset) & (alignmentBoundary - 1);
5396+
unsigned nMaxPaddingBytes = (1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1));
5397+
#ifdef TARGET_XARCH
5398+
// Max padding for adaptive alignment has alignmentBoundary of 32 bytes with
5399+
// max padding limit of 15 bytes ((alignmentBoundary >> 1) - 1)
5400+
nMaxPaddingBytes -= 1;
5401+
#endif
5402+
unsigned nPaddingBytes = (-(int)(size_t)offset) & (alignmentBoundary - 1);
52735403

52745404
// Check if the alignment exceeds maxPadding limit
52755405
if (nPaddingBytes > nMaxPaddingBytes)
52765406
{
5407+
#ifdef TARGET_XARCH
52775408
// Cannot align to 32B, so try to align to 16B boundary.
5409+
// Only applicable for xarch. For arm64, it is recommended to align
5410+
// at 32B only.
52785411
alignmentBoundary >>= 1;
52795412
nMaxPaddingBytes = 1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1);
52805413
nPaddingBytes = (-(int)(size_t)offset) & (alignmentBoundary - 1);
5414+
#endif
52815415

52825416
// Check if the loop is already at new alignment boundary
52835417
if (nPaddingBytes == 0)

src/coreclr/jit/emit.h

Lines changed: 23 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,7 @@ class emitter
583583
instruction _idIns : 10;
584584
#define MAX_ENCODED_SIZE 15
585585
#elif defined(TARGET_ARM64)
586+
#define INSTR_ENCODED_SIZE 4
586587
static_assert_no_msg(INS_count <= 512);
587588
instruction _idIns : 9;
588589
#else // !(defined(TARGET_XARCH) || defined(TARGET_ARM64))
@@ -890,6 +891,12 @@ class emitter
890891
}
891892

892893
#elif defined(TARGET_ARM64)
894+
895+
inline bool idIsEmptyAlign() const
896+
{
897+
return (idIns() == INS_align) && (idInsOpt() == INS_OPTS_NONE);
898+
}
899+
893900
unsigned idCodeSize() const
894901
{
895902
int size = 4;
@@ -913,6 +920,12 @@ class emitter
913920
size = 8;
914921
}
915922
break;
923+
case IF_SN_0A:
924+
if (idIsEmptyAlign())
925+
{
926+
size = 0;
927+
}
928+
break;
916929
default:
917930
break;
918931
}
@@ -1371,7 +1384,11 @@ class emitter
13711384
instrDescAlign* idaNext; // next align in the group/method
13721385
insGroup* idaIG; // containing group
13731386
};
1374-
#endif
1387+
1388+
void emitLoopAlign(unsigned short paddingBytes);
1389+
void emitLongLoopAlign(unsigned short alignmentBoundary);
1390+
1391+
#endif // FEATURE_LOOP_ALIGN
13751392

13761393
#if !defined(TARGET_ARM64) // This shouldn't be needed for ARM32, either, but I don't want to touch the ARM32 JIT.
13771394
struct instrDescLbl : instrDescJmp
@@ -2569,6 +2586,11 @@ inline emitter::instrDescAlign* emitter::emitNewInstrAlign()
25692586
{
25702587
instrDescAlign* newInstr = emitAllocInstrAlign();
25712588
newInstr->idIns(INS_align);
2589+
2590+
#ifdef TARGET_ARM64
2591+
newInstr->idInsFmt(IF_SN_0A);
2592+
newInstr->idInsOpt(INS_OPTS_ALIGN);
2593+
#endif
25722594
return newInstr;
25732595
}
25742596
#endif

0 commit comments

Comments
 (0)