dotnet · kunalspathak · Oct 20, 2021 · Sep 29, 2021 · Sep 29, 2021 · Sep 30, 2021
diff --git a/src/coreclr/jit/compiler.cpp b/src/coreclr/jit/compiler.cpp
@@ -2647,16 +2647,38 @@ void Compiler::compInitOptions(JitFlags* jitFlags)
     opts.compJitAlignLoopMaxCodeSize    = DEFAULT_MAX_LOOPSIZE_FOR_ALIGN;
 #endif
 
+#ifdef TARGET_XARCH
     if (opts.compJitAlignLoopAdaptive)
     {
+        // For adaptive alignment, padding limit is equal to the max instruction encoding
+        // size which is 15 bytes. Hence (32 >> 1) - 1 = 15 bytes.
         opts.compJitAlignPaddingLimit = (opts.compJitAlignLoopBoundary >> 1) - 1;
     }
     else
     {
+        // For non-adaptive alignment, padding limit is 1 less than the alignment boundary
+        // specified.
         opts.compJitAlignPaddingLimit = opts.compJitAlignLoopBoundary - 1;
     }
+#elif TARGET_ARM64
+    if (opts.compJitAlignLoopAdaptive)
+    {
+        // For adaptive alignment, padding limit is same as specified by the alignment
+        // boundary because all instructions are 4 bytes long. Hence (32 >> 1) = 16 bytes.
+        opts.compJitAlignPaddingLimit = (opts.compJitAlignLoopBoundary >> 1);
+    }
+    else
+    {
+        // For non-adaptive, padding limit is same as specified by the alignment.
+        opts.compJitAlignPaddingLimit = opts.compJitAlignLoopBoundary;
+    }
+#endif
 
     assert(isPow2(opts.compJitAlignLoopBoundary));
+#ifdef TARGET_ARM64
+    // The minimum encoding size for Arm64 is 4 bytes.
+    assert(opts.compJitAlignLoopBoundary >= 4);
+#endif
 
 #if REGEN_SHORTCUTS || REGEN_CALLPAT
     // We never want to have debugging enabled when regenerating GC encoding patterns

diff --git a/src/coreclr/jit/emit.cpp b/src/coreclr/jit/emit.cpp
@@ -4796,6 +4796,81 @@ void emitter::emitJumpDistBind()
 
 #if FEATURE_LOOP_ALIGN
 
+//-----------------------------------------------------------------------------
+//
+//  The next instruction will be a loop head entry point
+//  So insert an alignment instruction here to ensure that
+//  we can properly align the code.
+//
+void emitter::emitLoopAlign(unsigned short paddingBytes)
+{
+    /* Insert a pseudo-instruction to ensure that we align
+       the next instruction properly */
+    instrDescAlign* id = emitNewInstrAlign();
+
+#if defined(TARGET_XARCH)
+    assert(paddingBytes <= MAX_ENCODED_SIZE);
+    id->idCodeSize(paddingBytes);
+#elif defined(TARGET_ARM64)
+    assert(paddingBytes == INSTR_ENCODED_SIZE);
+#endif
+
+    id->idaIG = emitCurIG;
+
+    /* Append this instruction to this IG's alignment list */
+    id->idaNext = emitCurIGAlignList;
+
+    emitCurIGsize += paddingBytes;
+
+    dispIns(id);
+    emitCurIGAlignList = id;
+}
+
+//-----------------------------------------------------------------------------
+//
+//  The next instruction will be a loop head entry point
+//  So insert alignment instruction(s) here to ensure that
+//  we can properly align the code.
+//
+//  This emits more than one `INS_align` instruction depending on the
+//  alignmentBoundary parameter.
+//
+void emitter::emitLongLoopAlign(unsigned short alignmentBoundary)
+{
+#if defined(TARGET_XARCH)
+    unsigned short nPaddingBytes    = alignmentBoundary - 1;
+    unsigned short nAlignInstr      = (nPaddingBytes + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE;
+    unsigned short insAlignCount    = nPaddingBytes / MAX_ENCODED_SIZE;
+    unsigned short lastInsAlignSize = nPaddingBytes % MAX_ENCODED_SIZE;
+    unsigned short paddingBytes     = MAX_ENCODED_SIZE;
+#elif defined(TARGET_ARM64)
+    unsigned short nAlignInstr   = alignmentBoundary / INSTR_ENCODED_SIZE;
+    unsigned short insAlignCount = nAlignInstr;
+    unsigned short paddingBytes  = INSTR_ENCODED_SIZE;
+#endif
+
+    unsigned short instrDescSize = nAlignInstr * sizeof(instrDescAlign);
+
+    // Ensure that all align instructions fall in same IG.
+    if (emitCurIGfreeNext + instrDescSize >= emitCurIGfreeEndp)
+    {
+        emitForceNewIG = true;
+    }
+
+    /* Insert a pseudo-instruction to ensure that we align
+    the next instruction properly */
+
+    while (insAlignCount)
+    {
+        emitLoopAlign(paddingBytes);
+        insAlignCount--;
+    }
+
+#if defined(TARGET_XARCH)
+    emitLoopAlign(lastInsAlignSize);
+#endif
+}
+
 //-----------------------------------------------------------------------------
 // emitLoopAlignment: Insert an align instruction at the end of emitCurIG and
 //                    mark it as IGF_LOOP_ALIGN to indicate that next IG  is a
@@ -4805,6 +4880,9 @@ void emitter::emitLoopAlignment()
 {
     unsigned short paddingBytes;
 
+#if defined(TARGET_XARCH)
+    // For xarch, each align instruction can be maximum of MAX_ENCODED_SIZE bytes and if
+    // more padding is needed, multiple MAX_ENCODED_SIZE bytes instructions are added.
     if ((emitComp->opts.compJitAlignLoopBoundary > 16) && (!emitComp->opts.compJitAlignLoopAdaptive))
     {
         paddingBytes = emitComp->opts.compJitAlignLoopBoundary;
@@ -4815,6 +4893,19 @@ void emitter::emitLoopAlignment()
         paddingBytes = MAX_ENCODED_SIZE;
         emitLoopAlign(paddingBytes);
     }
+#elif defined(TARGET_ARM64)
+    // For Arm64, each align instruction is 4-bytes long because of fixed-length encoding.
+    // The padding added will be always be in multiple of 4-bytes.
+    if (emitComp->opts.compJitAlignLoopAdaptive)
+    {
+        paddingBytes = emitComp->opts.compJitAlignLoopBoundary >> 1;
+    }
+    else
+    {
+        paddingBytes = emitComp->opts.compJitAlignLoopBoundary;
+    }
+    emitLongLoopAlign(paddingBytes);
+#endif
 
     // Mark this IG as need alignment so during emitter we can check the instruction count heuristics of
     // all IGs that follows this IG and participate in a loop.
@@ -5042,6 +5133,7 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock)
                 {
                     assert(!markedLastLoop);
                     assert(alignInstr->idaIG->isLoopAlign());
+
                     alignInstr->idaIG->igFlags &= ~IGF_LOOP_ALIGN;
                     markedLastLoop = true;
                     JITDUMP("** Skip alignment for aligned loop IG%02u ~ IG%02u because it encloses the current loop "
@@ -5054,6 +5146,20 @@ void emitter::emitSetLoopBackEdge(BasicBlock* loopTopBlock)
                     break;
                 }
 
+#if defined(TARGET_XARCH)
+                if (!emitComp->opts.compJitAlignLoopAdaptive)
+#endif
+                {
+                    // If there are multiple align instructions, skip the align instructions after
+                    // the first align instruction and fast forward to the next IG
+                    insGroup* alignIG = alignInstr->idaIG;
+                    while ((alignInstr != nullptr) && (alignInstr->idaNext != nullptr) &&
+                           (alignInstr->idaNext->idaIG == alignIG))
+                    {
+                        alignInstr = alignInstr->idaNext;
+                    }
+                }
+
                 alignInstr = alignInstr->idaNext;
             }
 
@@ -5126,26 +5232,45 @@ void emitter::emitLoopAlignAdjustments()
                 alignIG->igFlags &= ~IGF_LOOP_ALIGN;
             }
 
+#ifdef TARGET_XARCH
             if (emitComp->opts.compJitAlignLoopAdaptive)
             {
                 assert(actualPaddingNeeded < MAX_ENCODED_SIZE);
                 alignInstr->idCodeSize(actualPaddingNeeded);
             }
             else
+#endif
             {
                 unsigned paddingToAdj = actualPaddingNeeded;
 
 #ifdef DEBUG
+#if defined(TARGET_XARCH)
                 int instrAdjusted =
                     (emitComp->opts.compJitAlignLoopBoundary + (MAX_ENCODED_SIZE - 1)) / MAX_ENCODED_SIZE;
-#endif
+#elif defined(TARGET_ARM64)
+                unsigned short instrAdjusted = (emitComp->opts.compJitAlignLoopBoundary >> 1) / INSTR_ENCODED_SIZE;
+                if (!emitComp->opts.compJitAlignLoopAdaptive)
+                {
+                    instrAdjusted = emitComp->opts.compJitAlignLoopBoundary / INSTR_ENCODED_SIZE;
+                }
+#endif // TARGET_XARCH & TARGET_ARM64
+#endif // DEBUG
                 // Adjust the padding amount in all align instructions in this IG
                 instrDescAlign *alignInstrToAdj = alignInstr, *prevAlignInstr = nullptr;
                 for (; alignInstrToAdj != nullptr && alignInstrToAdj->idaIG == alignInstr->idaIG;
                      alignInstrToAdj = alignInstrToAdj->idaNext)
                 {
+
+#if defined(TARGET_XARCH)
                     unsigned newPadding = min(paddingToAdj, MAX_ENCODED_SIZE);
                     alignInstrToAdj->idCodeSize(newPadding);
+#elif defined(TARGET_ARM64)
+                    unsigned newPadding = min(paddingToAdj, INSTR_ENCODED_SIZE);
+                    if (newPadding == 0)
+                    {
+                        alignInstrToAdj->idInsOpt(INS_OPTS_NONE);
+                    }
+#endif
                     paddingToAdj -= newPadding;
                     prevAlignInstr = alignInstrToAdj;
 #ifdef DEBUG
@@ -5191,7 +5316,7 @@ void emitter::emitLoopAlignAdjustments()
 }
 
 //-----------------------------------------------------------------------------
-//  emitCalculatePaddingForLoopAlignment: Calculate the padding to insert at the
+//  emitCalculatePaddingForLoopAlignment: Calculate the padding amount to insert at the
 //    end of 'ig' so the loop that starts after 'ig' is aligned.
 //
 //  Arguments:
@@ -5268,16 +5393,25 @@ unsigned emitter::emitCalculatePaddingForLoopAlignment(insGroup* ig, size_t offs
     if (emitComp->opts.compJitAlignLoopAdaptive)
     {
         // adaptive loop alignment
-        unsigned nMaxPaddingBytes = (1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1)) - 1;
-        unsigned nPaddingBytes    = (-(int)(size_t)offset) & (alignmentBoundary - 1);
+        unsigned nMaxPaddingBytes = (1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1));
+#ifdef TARGET_XARCH
+        // Max padding for adaptive alignment has alignmentBoundary of 32 bytes with
+        // max padding limit of 15 bytes ((alignmentBoundary >> 1) - 1)
+        nMaxPaddingBytes -= 1;
+#endif
+        unsigned nPaddingBytes = (-(int)(size_t)offset) & (alignmentBoundary - 1);
 
         // Check if the alignment exceeds maxPadding limit
         if (nPaddingBytes > nMaxPaddingBytes)
         {
+#ifdef TARGET_XARCH
             // Cannot align to 32B, so try to align to 16B boundary.
+            // Only applicable for xarch. For arm64, it is recommended to align
+            // at 32B only.
             alignmentBoundary >>= 1;
             nMaxPaddingBytes = 1 << (maxLoopBlocksAllowed - minBlocksNeededForLoop + 1);
             nPaddingBytes    = (-(int)(size_t)offset) & (alignmentBoundary - 1);
+#endif
 
             // Check if the loop is already at new alignment boundary
             if (nPaddingBytes == 0)

diff --git a/src/coreclr/jit/emit.h b/src/coreclr/jit/emit.h
@@ -583,6 +583,7 @@ class emitter
         instruction _idIns : 10;
 #define MAX_ENCODED_SIZE 15
 #elif defined(TARGET_ARM64)
+#define INSTR_ENCODED_SIZE 4
         static_assert_no_msg(INS_count <= 512);
         instruction _idIns : 9;
 #else  // !(defined(TARGET_XARCH) || defined(TARGET_ARM64))
@@ -890,6 +891,12 @@ class emitter
         }
 
 #elif defined(TARGET_ARM64)
+
+        inline bool idIsEmptyAlign() const
+        {
+            return (idIns() == INS_align) && (idInsOpt() == INS_OPTS_NONE);
+        }
+
         unsigned idCodeSize() const
         {
             int size = 4;
@@ -913,6 +920,12 @@ class emitter
                         size = 8;
                     }
                     break;
+                case IF_SN_0A:
+                    if (idIsEmptyAlign())
+                    {
+                        size = 0;
+                    }
+                    break;
                 default:
                     break;
             }
@@ -1371,7 +1384,11 @@ class emitter
         instrDescAlign* idaNext; // next align in the group/method
         insGroup*       idaIG;   // containing group
     };
-#endif
+
+    void emitLoopAlign(unsigned short paddingBytes);
+    void emitLongLoopAlign(unsigned short alignmentBoundary);
+
+#endif // FEATURE_LOOP_ALIGN
 
 #if !defined(TARGET_ARM64) // This shouldn't be needed for ARM32, either, but I don't want to touch the ARM32 JIT.
     struct instrDescLbl : instrDescJmp
@@ -2569,6 +2586,11 @@ inline emitter::instrDescAlign* emitter::emitNewInstrAlign()
 {
     instrDescAlign* newInstr = emitAllocInstrAlign();
     newInstr->idIns(INS_align);
+
+#ifdef TARGET_ARM64
+    newInstr->idInsFmt(IF_SN_0A);
+    newInstr->idInsOpt(INS_OPTS_ALIGN);
+#endif
     return newInstr;
 }
 #endif