24 commits
2b00468
Enable ARM (32-bit) deb/rpm package generation
Dec 9, 2025
e648f54
Fix #74020: Optimize consecutive shifts in JIT Lowering
Dec 14, 2025
4aa8c26
Enhance shift optimization: Handle LSH, Overshift, and Mixed shifts
Dec 15, 2025
cca8115
Fix signed/unsigned comparison warning in LowerShift
Dec 15, 2025
f8f91a3
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Dec 15, 2025
6e6dbfa
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Dec 15, 2025
a1c8cb4
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Dec 15, 2025
f6b8ebd
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Dec 16, 2025
e007bbe
Fix mixed shift types logic and build/analysis warnings
Dec 17, 2025
c6eae44
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Dec 17, 2025
5ef1f97
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Dec 18, 2025
d842ec1
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Dec 18, 2025
8c2c205
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Dec 22, 2025
b2a98d0
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Jan 2, 2026
7f01503
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Jan 26, 2026
2f8851e
Remove accidental changes to Directory.Build.props
Jan 27, 2026
5bd93c6
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Jan 27, 2026
d12a9ae
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Jan 27, 2026
2f6f21f
Fix NativeAOT regression: Correctly handle MixedOp (RSH over RSZ) ove…
Jan 27, 2026
e2579d0
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Jan 27, 2026
abb1684
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Jan 28, 2026
7537d9e
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Jan 29, 2026
99b56bf
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Jan 29, 2026
227fdb7
Merge branch 'main' into fix/74020-jit-shift-opt
csa7mdm Jan 31, 2026
142 changes: 142 additions & 0 deletions src/coreclr/jit/lower.cpp
@@ -8668,6 +8668,148 @@ void Lowering::LowerShift(GenTreeOp* shift)
shift->gtOp2->ClearContained();
}

if (comp->opts.OptimizationEnabled() && shift->OperIs(GT_LSH, GT_RSH, GT_RSZ) && shift->gtGetOp2()->IsCnsIntOrI())
{
GenTree* op1 = shift->gtGetOp1();
ssize_t c2 = shift->gtGetOp2()->AsIntCon()->IconValue();
unsigned bitWidth = genTypeSize(shift->TypeGet()) * 8;
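// For example, on a 32-bit value, (x >>> 3) >>> 4 folds to x >>> 7 and
// (x << 2) << 5 folds to x << 7, saving a shift instruction and a constant.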

// Case 1: (shift (shift x c1) c2)
// We can combine if:
// 1. The operations match (LSH/LSH, RSH/RSH, RSZ/RSZ), or
// 2. An RSH is applied over an RSZ: with c1 > 0 the inner RSZ has already
// cleared the sign bit, so the outer RSH behaves exactly like an RSZ.
bool sameOp = op1->OperIs(shift->OperGet());
bool mixedOp = shift->OperIs(GT_RSH) && op1->OperIs(GT_RSZ);

if ((sameOp || mixedOp) && op1->gtGetOp2()->IsCnsIntOrI() && !op1->IsMultiRegNode())
{
ssize_t c1 = op1->gtGetOp2()->AsIntCon()->IconValue();

// Only optimize if types match (simplifies width checks)
if (op1->TypeGet() == shift->TypeGet())
{
// Compute the sum as ssize_t: shift counts are small in practice, but
// conceptually c1 + c2 can exceed the bit width (and must not overflow).
ssize_t combined = c1 + c2;

if ((c1 > 0) && (c2 > 0))
{
if (combined < (ssize_t)bitWidth)
{
JITDUMP("Optimizing consecutive shifts: (x %s %d) %s %d -> x %s %d\n",
GenTree::OpName(op1->OperGet()), (int)c1, GenTree::OpName(shift->OperGet()), (int)c2,
GenTree::OpName(shift->OperGet()), (int)combined);
// If we had RSH(RSZ), result is RSZ.
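// e.g. (x >>> 5) >> 3 becomes x >>> 8: the RSZ left the sign bit clear,
// so the arithmetic shift cannot smear sign bits back in.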
if (mixedOp)
{
shift->SetOper(GT_RSZ);
}

shift->gtGetOp2()->AsIntCon()->SetIconValue(combined);
shift->gtOp1 = op1->gtGetOp1();
op1->gtGetOp1()->ClearContained();
BlockRange().Remove(op1->gtGetOp2());
BlockRange().Remove(op1);
}
else
{
// Overshift Case
JITDUMP("Optimizing overshift: (x %s %d) %s %d\n", GenTree::OpName(op1->OperGet()), (int)c1,
GenTree::OpName(shift->OperGet()), (int)c2);

if (shift->OperIs(GT_RSH) && !mixedOp)
{
// RSH saturates to sign bit (shift by bitWidth - 1)
// (x >> 30) >> 30 -> x >> 31 (for 32-bit)
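// After shifting by bitWidth - 1 every bit is a copy of the sign bit,
// so any further arithmetic shift leaves the value unchanged.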
JITDUMP(" -> x >> %d\n", bitWidth - 1);

shift->gtGetOp2()->AsIntCon()->SetIconValue(static_cast<ssize_t>(bitWidth) - 1);
shift->gtOp1 = op1->gtGetOp1();
op1->gtGetOp1()->ClearContained();
BlockRange().Remove(op1->gtGetOp2());
BlockRange().Remove(op1);
}
else
{
// LSH, RSZ, and mixed RSH-over-RSZ all produce 0 on overshift:
// (x << 30) << 2 -> 0
// (x >>> 30) >>> 2 -> 0
JITDUMP(" -> 0\n");

GenTree* zero = comp->gtNewZeroConNode(shift->TypeGet());
BlockRange().InsertAfter(shift, zero);

LIR::Use use;
if (BlockRange().TryGetUse(shift, &use))
{
use.ReplaceWith(zero);
}
else
{
zero->SetUnusedValue();
}
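// If the shift's result had no use, the zero must be explicitly flagged
// unused so LIR liveness checks remain satisfied.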

// Remove the outer shift and its count constant; the LIR::Use check
// above has already rewired (or flagged) the result.
BlockRange().Remove(shift->gtGetOp2());
BlockRange().Remove(shift);

// 'op1' stays in place: it is now disconnected from 'shift', and
// liveness/DCE will remove it if nothing else uses it.
}
}
}
}
}
// Case 2: (shift (cast (shift x c1)) c2)
// Optimization for: RSZ(CAST(RSZ(x, c1)), c2) -> CAST(RSZ(x, c1 + c2))
else if (shift->OperIs(GT_RSZ) && op1->OperIs(GT_CAST) && !op1->gtOverflow() && !op1->IsMultiRegNode())
{
GenTree* cast = op1;
GenTree* innerShift = cast->gtGetOp1();

// Only optimize when the cast widens or preserves width; a narrowing cast
// discards high bits between the two shifts, so combining can change results.
// Widening example: (long)(intVar >>> 30) >>> 2
// Combining would give (long)(intVar >>> 32), an overshift of the 32-bit
// inner value; the (c1 + c2) < innerBitWidth guard below rejects it.
// Narrowing example: (short)(intVar >>> 16) >>> 1
// The truncating cast (and any subsequent sign extension) interacts with
// the combined shift in subtle ways, so narrowing casts are excluded.
bool isNarrowing = genTypeSize(cast->TypeGet()) < genTypeSize(innerShift->TypeGet());
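// A safe widening example: (ulong)(u >>> 3) >>> 4 -> (ulong)(u >>> 7); the
// zero-extending cast preserves every bit the combined shift produces.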

if (!isNarrowing && innerShift->OperIs(GT_RSZ) && innerShift->gtGetOp2()->IsCnsIntOrI() && !innerShift->IsMultiRegNode())
{
ssize_t c1 = innerShift->gtGetOp2()->AsIntCon()->IconValue();
unsigned innerBitWidth = genTypeSize(innerShift->TypeGet()) * 8;

if ((c1 > 0) && (c2 > 0) && ((c1 + c2) < (ssize_t)innerBitWidth))
{
JITDUMP("Optimizing distinct type shifts: (cast (x >> %d)) >> %d -> cast (x >> %d)\n", (int)c1,
(int)c2, (int)(c1 + c2));

innerShift->gtGetOp2()->AsIntCon()->SetIconValue(c1 + c2);

// Replace uses of 'shift' with 'cast', bypassing 'shift'
LIR::Use use;
if (BlockRange().TryGetUse(shift, &use))
{
use.ReplaceWith(cast);
}
else
{
// No remaining use: flag 'cast' unused, mirroring the zero-node path above
cast->SetUnusedValue();
}

// Remove the 'c2' constant and turn the now-redundant 'shift' into a NOP
BlockRange().Remove(shift->gtGetOp2());
shift->gtBashToNOP();
}
}
}
}

ContainCheckShiftRotate(shift);

#ifdef TARGET_ARM64