From 8669dd477ba56401c4256e42562078797bb2a86d Mon Sep 17 00:00:00 2001
From: Dylan Reimerink
Date: Thu, 17 Apr 2025 15:36:52 +0200
Subject: [PATCH 1/2] asm: Add handling for atomic operations

As it stands, we only had proper handling of atomic add operations.
`lock *(u32 *)(r1 + 0x1) += w2` was the only instruction that was properly
handled. Atomic operations share the same opcode and are differentiated by the
imm value. So far we have not been looking at the imm value, so all atomic
operations look like atomic adds to us.

This commit adds decoding for all current atomic operations. We handle them
similarly to how we handle ISAv4 instructions, which use the offset to further
specify the instruction.

In #1193 we expanded the opcode from a u8 to u16, which is bigger than the
actual size. This allowed us to still represent functionally different opcodes
in Go, even though the kernel uses other bits of the instruction. So our
opcode in Go is not identical to the opcode in the kernel. We translate during
marshaling and unmarshaling. So far, we have only needed a few additional
bits, but the atomic ops need 9 bits of imm to fully encode all possibilities.
Since 9 + 8 > 16 we have to grow the opcode to 32 bits.

During unmarshaling, we simply take the lower 9 bits of the imm, shift them
left by 8 bits and OR them with the opcode. During marshaling this process is
reversed.

Signed-off-by: Dylan Reimerink
--- asm/instruction.go | 21 ++++-- asm/instruction_test.go | 94 ++++++++++++++++++++++++ asm/load_store.go | 151 ++++++++++++++++++++++++++++++++++++--- asm/load_store_string.go | 4 +- asm/opcode.go | 42 ++++++++--- 5 files changed, 287 insertions(+), 25 deletions(-) diff --git a/asm/instruction.go b/asm/instruction.go index 10ca2547e..cf970e57a 100644 --- a/asm/instruction.go +++ b/asm/instruction.go @@ -62,6 +62,10 @@ func (ins *Instruction) Unmarshal(r io.Reader, bo binary.ByteOrder, platform str ins.Offset = int16(bo.Uint16(data[2:4])) + // Convert to int32 before widening to int64 + // to ensure the signed bit is carried over. + ins.Constant = int64(int32(bo.Uint32(data[4:8]))) + if ins.IsBuiltinCall() { fn, err := BuiltinFuncForPlatform(platform, uint32(ins.Constant)) if err != nil { @@ -93,12 +97,14 @@ func (ins *Instruction) Unmarshal(r io.Reader, bo binary.ByteOrder, platform str ins.Offset = 0 } } + } else if ins.OpCode.Class() == StXClass && + ins.OpCode.Mode() == AtomicMode { + // For atomic ops, part of the opcode is stored in the + // constant field. Shift left by 8 bits so we can OR with the actual opcode and + // apply `atomicMask` to avoid merging unknown bits that may be added in the future. + ins.OpCode |= (OpCode((ins.Constant << 8)) & atomicMask) } - // Convert to int32 before widening to int64 - // to ensure the signed bit is carried over. 
- ins.Constant = int64(int32(bo.Uint32(data[4:8]))) - if !ins.OpCode.IsDWordLoad() { return nil } @@ -171,6 +177,9 @@ func (ins Instruction) Marshal(w io.Writer, bo binary.ByteOrder) (uint64, error) return 0, fmt.Errorf("extended ALU opcodes should have an .Offset of 0: %s", ins) } ins.Offset = newOffset + } else if atomic := ins.OpCode.AtomicOp(); atomic != InvalidAtomic { + ins.OpCode = ins.OpCode &^ atomicMask + ins.Constant = int64(atomic >> 8) } op, err := ins.OpCode.bpfOpCode() @@ -382,8 +391,8 @@ func (ins Instruction) Format(f fmt.State, c rune) { fmt.Fprintf(f, "dst: %s src: %s imm: %d", ins.Dst, ins.Src, ins.Constant) case MemMode, MemSXMode: fmt.Fprintf(f, "dst: %s src: %s off: %d imm: %d", ins.Dst, ins.Src, ins.Offset, ins.Constant) - case XAddMode: - fmt.Fprintf(f, "dst: %s src: %s", ins.Dst, ins.Src) + case AtomicMode: + fmt.Fprintf(f, "dst: %s src: %s off: %d", ins.Dst, ins.Src, ins.Offset) } case cls.IsALU(): diff --git a/asm/instruction_test.go b/asm/instruction_test.go index 04f160ecb..c505c5b41 100644 --- a/asm/instruction_test.go +++ b/asm/instruction_test.go @@ -328,6 +328,82 @@ func (t testFDer) FD() int { return int(t) } +func TestAtomics(t *testing.T) { + rawInsns := []byte{ + 0xc3, 0x21, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, // lock *(u32 *)(r1 + 0x1) += w2 + 0xc3, 0x21, 0x01, 0x00, 0x50, 0x00, 0x00, 0x00, // lock *(u32 *)(r1 + 0x1) &= w2 + 0xc3, 0x21, 0x01, 0x00, 0xa0, 0x00, 0x00, 0x00, // lock *(u32 *)(r1 + 0x1) ^= w2 + 0xc3, 0x21, 0x01, 0x00, 0x40, 0x00, 0x00, 0x00, // lock *(u32 *)(r1 + 0x1) |= w2 + + 0xdb, 0x21, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, // lock *(u64 *)(r1 + 0x1) += r2 + 0xdb, 0x21, 0x01, 0x00, 0x50, 0x00, 0x00, 0x00, // lock *(u64 *)(r1 + 0x1) &= r2 + 0xdb, 0x21, 0x01, 0x00, 0xa0, 0x00, 0x00, 0x00, // lock *(u64 *)(r1 + 0x1) ^= r2 + 0xdb, 0x21, 0x01, 0x00, 0x40, 0x00, 0x00, 0x00, // lock *(u64 *)(r1 + 0x1) |= r2 + + 0xc3, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, // w0 = atomic_fetch_add((u32 *)(r1 + 0x0), w0) + 0xc3, 0x01, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, // w0 = atomic_fetch_and((u32 *)(r1 + 0x0), w0) + 0xc3, 0x01, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, // w0 = atomic_fetch_xor((u32 *)(r1 + 0x0), w0) + 0xc3, 0x01, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00, // w0 = atomic_fetch_or((u32 *)(r1 + 0x0), w0) + + 0xdb, 0x01, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, // r0 = atomic_fetch_add((u64 *)(r1 + 0x0), r0) + 0xdb, 0x01, 0x00, 0x00, 0x51, 0x00, 0x00, 0x00, // r0 = atomic_fetch_and((u64 *)(r1 + 0x0), r0) + 0xdb, 0x01, 0x00, 0x00, 0xa1, 0x00, 0x00, 0x00, // r0 = atomic_fetch_xor((u64 *)(r1 + 0x0), r0) + 0xdb, 0x01, 0x00, 0x00, 0x41, 0x00, 0x00, 0x00, // r0 = atomic_fetch_or((u64 *)(r1 + 0x0), r0) + + 0xc3, 0x01, 0x00, 0x00, 0xe1, 0x00, 0x00, 0x00, // w0 = xchg32_32(r1 + 0x0, w0) + 0xdb, 0x01, 0x00, 0x00, 0xe1, 0x00, 0x00, 0x00, // r0 = xchg_64(r1 + 0x0, r0) + + 0xc3, 0x11, 0x00, 0x00, 0xf1, 0x00, 0x00, 0x00, // w0 = cmpxchg32_32(r1 + 0x0, w0, w1) + 0xdb, 0x11, 0x00, 0x00, 0xf1, 0x00, 0x00, 0x00, // r0 = cmpxchg_64(r1 + 0x0, r0, r1) + } + + insns, err := AppendInstructions(nil, bytes.NewReader(rawInsns), binary.LittleEndian, platform.Linux) + if err != nil { + t.Fatal(err) + } + + lines := []string{ + "StXAtomicAddW dst: r1 src: r2 off: 1", + "StXAtomicAndW dst: r1 src: r2 off: 1", + "StXAtomicXorW dst: r1 src: r2 off: 1", + "StXAtomicOrW dst: r1 src: r2 off: 1", + "StXAtomicAddDW dst: r1 src: r2 off: 1", + "StXAtomicAndDW dst: r1 src: r2 off: 1", + "StXAtomicXorDW dst: r1 src: r2 off: 1", + "StXAtomicOrDW dst: r1 src: r2 off: 1", + "StXAtomicFetchAddW dst: r1 src: 
r0 off: 0", + "StXAtomicFetchAndW dst: r1 src: r0 off: 0", + "StXAtomicFetchXorW dst: r1 src: r0 off: 0", + "StXAtomicFetchOrW dst: r1 src: r0 off: 0", + "StXAtomicFetchAddDW dst: r1 src: r0 off: 0", + "StXAtomicFetchAndDW dst: r1 src: r0 off: 0", + "StXAtomicFetchXorDW dst: r1 src: r0 off: 0", + "StXAtomicFetchOrDW dst: r1 src: r0 off: 0", + "StXAtomicXchgW dst: r1 src: r0 off: 0", + "StXAtomicXchgDW dst: r1 src: r0 off: 0", + "StXAtomicCmpXchgW dst: r1 src: r1 off: 0", + "StXAtomicCmpXchgDW dst: r1 src: r1 off: 0", + } + + for i, ins := range insns { + if want, got := lines[i], fmt.Sprint(ins); want != got { + t.Errorf("Expected %q, got %q", want, got) + } + } + + // Marshal and unmarshal again to make sure the instructions are + // still valid. + var buf bytes.Buffer + err = insns.Marshal(&buf, binary.LittleEndian) + if err != nil { + t.Fatal(err) + } + + if !bytes.Equal(buf.Bytes(), rawInsns) { + t.Error("Expected instructions to be equal after marshalling") + } +} + func TestISAv4(t *testing.T) { rawInsns := []byte{ 0xd7, 0x01, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, // r1 = bswap16 r1 @@ -355,6 +431,16 @@ func TestISAv4(t *testing.T) { 0x3c, 0x31, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, // w1 s/= w3 0x9c, 0x42, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, // w2 s%= w4 + + 0xd3, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, // w0 = load_acquire((u8 *)(r1 + 0x0)) + 0xcb, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, // w0 = load_acquire((u16 *)(r1 + 0x0)) + 0xc3, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, // w0 = load_acquire((u32 *)(r1 + 0x0)) + 0xdb, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, // r0 = load_acquire((u64 *)(r1 + 0x0)) + + 0xd3, 0x21, 0x00, 0x00, 0x10, 0x01, 0x00, 0x00, // store_release((u8 *)(r1 + 0x0), w2) + 0xcb, 0x21, 0x00, 0x00, 0x10, 0x01, 0x00, 0x00, // store_release((u16 *)(r1 + 0x0), w2) + 0xc3, 0x21, 0x00, 0x00, 0x10, 0x01, 0x00, 0x00, // store_release((u32 *)(r1 + 0x0), w2) + 0xdb, 0x21, 0x00, 0x00, 0x10, 0x01, 0x00, 0x00, // store_release((u64 *)(r1 + 0x0), r2) } insns, err := AppendInstructions(nil, bytes.NewReader(rawInsns), binary.LittleEndian, platform.Linux) @@ -381,6 +467,14 @@ func TestISAv4(t *testing.T) { "SModReg dst: r2 src: r4", "SDivReg32 dst: r1 src: r3", "SModReg32 dst: r2 src: r4", + "StXAtomicLdAcqB dst: r0 src: r1 off: 0", + "StXAtomicLdAcqH dst: r0 src: r1 off: 0", + "StXAtomicLdAcqW dst: r0 src: r1 off: 0", + "StXAtomicLdAcqDW dst: r0 src: r1 off: 0", + "StXAtomicStRelB dst: r1 src: r2 off: 0", + "StXAtomicStRelH dst: r1 src: r2 off: 0", + "StXAtomicStRelW dst: r1 src: r2 off: 0", + "StXAtomicStRelDW dst: r1 src: r2 off: 0", } for i, ins := range insns { diff --git a/asm/load_store.go b/asm/load_store.go index cdb5c5cfa..b4eb099d7 100644 --- a/asm/load_store.go +++ b/asm/load_store.go @@ -1,5 +1,7 @@ package asm +import "fmt" + //go:generate go run golang.org/x/tools/cmd/stringer@latest -output load_store_string.go -type=Mode,Size // Mode for load and store operations @@ -26,10 +28,147 @@ const ( MemMode Mode = 0x60 // MemSXMode - load from memory, sign extension MemSXMode Mode = 0x80 - // XAddMode - add atomically across processors. - XAddMode Mode = 0xc0 + // AtomicMode - add atomically across processors. 
+ AtomicMode Mode = 0xc0 +) + +const atomicMask OpCode = 0x0001_ff00 + +type AtomicOp uint32 + +const ( + InvalidAtomic AtomicOp = 0xffff_ffff + + // AddAtomic - add src to memory address dst atomically + AddAtomic AtomicOp = AtomicOp(Add) << 8 + // AndAtomic - bitwise AND src with memory address at dst atomically + AndAtomic AtomicOp = AtomicOp(And) << 8 + // OrAtomic - bitwise OR src with memory address at dst atomically + OrAtomic AtomicOp = AtomicOp(Or) << 8 + // XorAtomic - bitwise XOR src with memory address at dst atomically + XorAtomic AtomicOp = AtomicOp(Xor) << 8 + + // xchgAtomic - atomically exchange the old value with the new value + xchgAtomic AtomicOp = 0x0000_e000 + // cmpXchgAtomic - atomically compare and exchange the old value with the new value + cmpXchgAtomic AtomicOp = 0x0000_f000 + + // fetch modifier for copy-modify-write atomics + fetch AtomicOp = 0x0000_0100 + // loadAcquireAtomic - atomically load with acquire semantics + loadAcquireAtomic AtomicOp = 0x0001_0000 + // storeReleaseAtomic - atomically store with release semantics + storeReleaseAtomic AtomicOp = 0x0001_1000 ) +func (op AtomicOp) String() string { + var name string + switch op { + case AddAtomic, AndAtomic, OrAtomic, XorAtomic: + name = ALUOp(op >> 8).String() + case AddAtomic | fetch, AndAtomic | fetch, OrAtomic | fetch, XorAtomic | fetch: + name = "Fetch" + ALUOp((op^fetch)>>8).String() + case xchgAtomic | fetch: + name = "Xchg" + case cmpXchgAtomic | fetch: + name = "CmpXchg" + case loadAcquireAtomic: + name = "LdAcq" + case storeReleaseAtomic: + name = "StRel" + default: + name = fmt.Sprintf("AtomicOp(%#x)", uint32(op)) + } + + return name +} + +func (op AtomicOp) OpCode(size Size) OpCode { + switch op { + case AddAtomic, AndAtomic, OrAtomic, XorAtomic, + AddAtomic | fetch, AndAtomic | fetch, OrAtomic | fetch, XorAtomic | fetch, + xchgAtomic | fetch, cmpXchgAtomic | fetch: + switch size { + case Byte, Half: + // 8-bit and 16-bit atomic copy-modify-write atomics are not supported + return InvalidOpCode + } + } + + return OpCode(StXClass).SetMode(AtomicMode).SetSize(size).SetAtomicOp(op) +} + +// Mem emits `*(size *)(dst + offset) (op) src`. +func (op AtomicOp) Mem(dst, src Register, size Size, offset int16) Instruction { + switch op { + case xchgAtomic, cmpXchgAtomic: + // XchgAtomic and CmpXchgAtomic always have fetch set, FetchMem must be used + return Instruction{ + OpCode: InvalidOpCode, + Dst: dst, + Src: src, + Offset: offset, + } + } + + return Instruction{ + OpCode: op.OpCode(size), + Dst: dst, + Src: src, + Offset: offset, + } +} + +// FetchMem is like Mem but also stores the result in src. +func (op AtomicOp) FetchMem(dst, src Register, size Size, offset int16) Instruction { + fetchOp := op | fetch + ins := fetchOp.Mem(src, dst, size, offset) + return ins +} + +// Emits `lock-acquire dst = *(size *)(src + offset)`. +func LoadAcquire(dst, src Register, size Size, offset int16) Instruction { + return Instruction{ + OpCode: loadAcquireAtomic.OpCode(size), + Dst: dst, + Src: src, + Offset: offset, + } +} + +// Emits `lock-release *(size *)(dst + offset) = src`. +func StoreRelease(dst, src Register, size Size, offset int16) Instruction { + return Instruction{ + OpCode: storeReleaseAtomic.OpCode(size), + Dst: dst, + Src: src, + Offset: offset, + } +} + +// Emits `src = xchg(*(size *)(dst + offset), src)`. +// src gets populated with the old value of *(size *)(dst + offset). 
+func AtomicXchg(dst, src Register, size Size, offset int16, fetch bool) Instruction { + return Instruction{ + OpCode: xchgAtomic.OpCode(size), + Dst: dst, + Src: src, + Offset: offset, + } +} + +// Emits `r0 = cmpxchg(*(size *)(dst + offset), r0, src)`. +// Compares R0 and *(size *)(dst + offset), writes src to *(size *)(dst + offset) on match. +// R0 gets populated with the old value of *(size *)(dst + offset), even if no exchange occurs. +func AtomicCmpXchg(dst, src Register, size Size, offset int16, fetch bool) Instruction { + return Instruction{ + OpCode: cmpXchgAtomic.OpCode(size), + Dst: dst, + Src: src, + Offset: offset, + } +} + // Size of load and store operations // // msb lsb @@ -212,14 +351,10 @@ func StoreImm(dst Register, offset int16, value int64, size Size) Instruction { // StoreXAddOp returns the OpCode to atomically add a register to a value in memory. func StoreXAddOp(size Size) OpCode { - return OpCode(StXClass).SetMode(XAddMode).SetSize(size) + return AddAtomic.OpCode(size) } // StoreXAdd atomically adds src to *dst. func StoreXAdd(dst, src Register, size Size) Instruction { - return Instruction{ - OpCode: StoreXAddOp(size), - Dst: dst, - Src: src, - } + return AddAtomic.Mem(dst, src, size, 0) } diff --git a/asm/load_store_string.go b/asm/load_store_string.go index c48080327..bbed58b66 100644 --- a/asm/load_store_string.go +++ b/asm/load_store_string.go @@ -14,7 +14,7 @@ func _() { _ = x[IndMode-64] _ = x[MemMode-96] _ = x[MemSXMode-128] - _ = x[XAddMode-192] + _ = x[AtomicMode-192] } const ( @@ -23,7 +23,7 @@ const ( _Mode_name_2 = "IndMode" _Mode_name_3 = "MemMode" _Mode_name_4 = "MemSXMode" - _Mode_name_5 = "XAddMode" + _Mode_name_5 = "AtomicMode" _Mode_name_6 = "InvalidMode" ) diff --git a/asm/opcode.go b/asm/opcode.go index 1dfd0b171..c82a1f8fb 100644 --- a/asm/opcode.go +++ b/asm/opcode.go @@ -71,24 +71,29 @@ func (cls Class) isJumpOrALU() bool { // // The encoding varies based on a 3-bit Class: // -// 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -// ??? | CLS +// 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +// ??? | CLS // // For ALUClass and ALUCLass32: // -// 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -// OPC |S| CLS +// 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +// 0 | OPC |S| CLS // // For LdClass, LdXclass, StClass and StXClass: // -// 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -// 0 | MDE |SIZ| CLS +// 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +// 0 | MDE |SIZ| CLS +// +// For StXClass where MDE == AtomicMode: +// +// 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +// 0 | ATOMIC OP | MDE |SIZ| CLS // // For JumpClass, Jump32Class: // -// 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 -// 0 | OPC |S| CLS -type OpCode uint16 +// 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 7 6 5 4 3 2 1 0 +// 0 | OPC |S| CLS +type OpCode uint32 // InvalidOpCode is returned by setters on OpCode const InvalidOpCode OpCode = 0xffff @@ -138,6 +143,14 @@ func (op OpCode) Size() Size { return Size(op & sizeMask) } +// AtomicOp returns the type of atomic operation. +func (op OpCode) AtomicOp() AtomicOp { + if op.Class() != StXClass || op.Mode() != AtomicMode { + return InvalidAtomic + } + return AtomicOp(op & atomicMask) +} + // Source returns the source for branch and ALU operations. 
func (op OpCode) Source() Source { if !op.Class().isJumpOrALU() || op.ALUOp() == Swap { @@ -199,6 +212,13 @@ func (op OpCode) SetSize(size Size) OpCode { return (op & ^sizeMask) | OpCode(size) } +func (op OpCode) SetAtomicOp(atomic AtomicOp) OpCode { + if op.Class() != StXClass || op.Mode() != AtomicMode || !valid(OpCode(atomic), atomicMask) { + return InvalidOpCode + } + return (op & ^atomicMask) | OpCode(atomic) +} + // SetSource sets the source on jump and ALU operations. // // Returns InvalidOpCode if op is of the wrong class. @@ -247,6 +267,10 @@ func (op OpCode) String() string { mode := op.Mode() f.WriteString(strings.TrimSuffix(mode.String(), "Mode")) + if atomic := op.AtomicOp(); atomic != InvalidAtomic { + f.WriteString(strings.TrimSuffix(atomic.String(), "Atomic")) + } + switch op.Size() { case DWord: f.WriteString("DW") From 1a95ecb3b72eaa87333abadeef42b7bce8ecf2df Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 30 Apr 2025 10:22:43 +0100 Subject: [PATCH 2/2] WIP export Fetch variants Signed-off-by: Lorenz Bauer --- asm/load_store.go | 90 ++++++++++++++++------------------------------- 1 file changed, 31 insertions(+), 59 deletions(-) diff --git a/asm/load_store.go b/asm/load_store.go index b4eb099d7..30917b847 100644 --- a/asm/load_store.go +++ b/asm/load_store.go @@ -41,24 +41,37 @@ const ( // AddAtomic - add src to memory address dst atomically AddAtomic AtomicOp = AtomicOp(Add) << 8 + // FetchAdd - add src to memory address dst atomically, store result in src + FetchAdd AtomicOp = AddAtomic | fetch // AndAtomic - bitwise AND src with memory address at dst atomically AndAtomic AtomicOp = AtomicOp(And) << 8 + // FetchAnd - bitwise AND src with memory address at dst atomically, store result in src + FetchAnd AtomicOp = AndAtomic | fetch // OrAtomic - bitwise OR src with memory address at dst atomically OrAtomic AtomicOp = AtomicOp(Or) << 8 + // FetchOr - bitwise OR src with memory address at dst atomically, store result in src + FetchOr AtomicOp = OrAtomic | fetch // XorAtomic - bitwise XOR src with memory address at dst atomically XorAtomic AtomicOp = AtomicOp(Xor) << 8 - - // xchgAtomic - atomically exchange the old value with the new value - xchgAtomic AtomicOp = 0x0000_e000 - // cmpXchgAtomic - atomically compare and exchange the old value with the new value - cmpXchgAtomic AtomicOp = 0x0000_f000 + // FetchXor - bitwise XOR src with memory address at dst atomically, store result in src + FetchXor AtomicOp = XorAtomic | fetch + + // Xchg - atomically exchange the old value with the new value + // + // src gets populated with the old value of *(size *)(dst + offset). + Xchg AtomicOp = 0x0000_e000 | fetch + // CmpXchg - atomically compare and exchange the old value with the new value + // + // Compares R0 and *(size *)(dst + offset), writes src to *(size *)(dst + offset) on match. + // R0 gets populated with the old value of *(size *)(dst + offset), even if no exchange occurs. 
+ CmpXchg AtomicOp = 0x0000_f000 | fetch // fetch modifier for copy-modify-write atomics fetch AtomicOp = 0x0000_0100 - // loadAcquireAtomic - atomically load with acquire semantics - loadAcquireAtomic AtomicOp = 0x0001_0000 - // storeReleaseAtomic - atomically store with release semantics - storeReleaseAtomic AtomicOp = 0x0001_1000 + // loadAcquire - atomically load with acquire semantics + loadAcquire AtomicOp = 0x0001_0000 + // storeRelease - atomically store with release semantics + storeRelease AtomicOp = 0x0001_1000 ) func (op AtomicOp) String() string { @@ -66,15 +79,15 @@ func (op AtomicOp) String() string { switch op { case AddAtomic, AndAtomic, OrAtomic, XorAtomic: name = ALUOp(op >> 8).String() - case AddAtomic | fetch, AndAtomic | fetch, OrAtomic | fetch, XorAtomic | fetch: + case FetchAdd, FetchAnd, FetchOr, FetchXor: name = "Fetch" + ALUOp((op^fetch)>>8).String() - case xchgAtomic | fetch: + case Xchg: name = "Xchg" - case cmpXchgAtomic | fetch: + case CmpXchg: name = "CmpXchg" - case loadAcquireAtomic: + case loadAcquire: name = "LdAcq" - case storeReleaseAtomic: + case storeRelease: name = "StRel" default: name = fmt.Sprintf("AtomicOp(%#x)", uint32(op)) @@ -86,8 +99,8 @@ func (op AtomicOp) String() string { func (op AtomicOp) OpCode(size Size) OpCode { switch op { case AddAtomic, AndAtomic, OrAtomic, XorAtomic, - AddAtomic | fetch, AndAtomic | fetch, OrAtomic | fetch, XorAtomic | fetch, - xchgAtomic | fetch, cmpXchgAtomic | fetch: + FetchAdd, FetchAnd, FetchOr, FetchXor, + Xchg, CmpXchg: switch size { case Byte, Half: // 8-bit and 16-bit atomic copy-modify-write atomics are not supported @@ -100,17 +113,6 @@ func (op AtomicOp) OpCode(size Size) OpCode { // Mem emits `*(size *)(dst + offset) (op) src`. func (op AtomicOp) Mem(dst, src Register, size Size, offset int16) Instruction { - switch op { - case xchgAtomic, cmpXchgAtomic: - // XchgAtomic and CmpXchgAtomic always have fetch set, FetchMem must be used - return Instruction{ - OpCode: InvalidOpCode, - Dst: dst, - Src: src, - Offset: offset, - } - } - return Instruction{ OpCode: op.OpCode(size), Dst: dst, @@ -119,17 +121,10 @@ func (op AtomicOp) Mem(dst, src Register, size Size, offset int16) Instruction { } } -// FetchMem is like Mem but also stores the result in src. -func (op AtomicOp) FetchMem(dst, src Register, size Size, offset int16) Instruction { - fetchOp := op | fetch - ins := fetchOp.Mem(src, dst, size, offset) - return ins -} - // Emits `lock-acquire dst = *(size *)(src + offset)`. func LoadAcquire(dst, src Register, size Size, offset int16) Instruction { return Instruction{ - OpCode: loadAcquireAtomic.OpCode(size), + OpCode: loadAcquire.OpCode(size), Dst: dst, Src: src, Offset: offset, @@ -139,30 +134,7 @@ func LoadAcquire(dst, src Register, size Size, offset int16) Instruction { // Emits `lock-release *(size *)(dst + offset) = src`. func StoreRelease(dst, src Register, size Size, offset int16) Instruction { return Instruction{ - OpCode: storeReleaseAtomic.OpCode(size), - Dst: dst, - Src: src, - Offset: offset, - } -} - -// Emits `src = xchg(*(size *)(dst + offset), src)`. -// src gets populated with the old value of *(size *)(dst + offset). -func AtomicXchg(dst, src Register, size Size, offset int16, fetch bool) Instruction { - return Instruction{ - OpCode: xchgAtomic.OpCode(size), - Dst: dst, - Src: src, - Offset: offset, - } -} - -// Emits `r0 = cmpxchg(*(size *)(dst + offset), r0, src)`. -// Compares R0 and *(size *)(dst + offset), writes src to *(size *)(dst + offset) on match. 
-// R0 gets populated with the old value of *(size *)(dst + offset), even if no exchange occurs. -func AtomicCmpXchg(dst, src Register, size Size, offset int16, fetch bool) Instruction { - return Instruction{ - OpCode: cmpXchgAtomic.OpCode(size), + OpCode: storeRelease.OpCode(size), Dst: dst, Src: src, Offset: offset,
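
For reference, a minimal sketch of how the helpers added by these two patches could be used from calling code. This assumes the WIP API from PATCH 2/2 keeps the names shown above (AddAtomic, FetchAdd, Xchg, AtomicOp.Mem, LoadAcquire, StoreRelease); the rest (asm.Instructions, asm.R0..R2, asm.DWord, Instructions.Marshal) is existing cilium/ebpf API:

package main

import (
	"bytes"
	"encoding/binary"
	"fmt"

	"github.com/cilium/ebpf/asm"
)

func main() {
	insns := asm.Instructions{
		// lock *(u64 *)(r1 + 0x0) += r2
		asm.AddAtomic.Mem(asm.R1, asm.R2, asm.DWord, 0),
		// r2 = atomic_fetch_add((u64 *)(r1 + 0x0), r2)
		asm.FetchAdd.Mem(asm.R1, asm.R2, asm.DWord, 0),
		// r2 = xchg_64(r1 + 0x0, r2)
		asm.Xchg.Mem(asm.R1, asm.R2, asm.DWord, 0),
		// r0 = load_acquire((u64 *)(r1 + 0x0))
		asm.LoadAcquire(asm.R0, asm.R1, asm.DWord, 0),
		// store_release((u64 *)(r1 + 0x0), r2)
		asm.StoreRelease(asm.R1, asm.R2, asm.DWord, 0),
	}

	// The String output reflects the decoded atomic op,
	// e.g. "StXAtomicAddDW dst: r1 src: r2 off: 0".
	for _, ins := range insns {
		fmt.Println(ins)
	}

	// Marshaling moves the atomic-op bits of the widened Go opcode back
	// into the imm field of the on-wire instruction, as described in the
	// first commit message.
	var buf bytes.Buffer
	if err := insns.Marshal(&buf, binary.LittleEndian); err != nil {
		panic(err)
	}
	fmt.Printf("% x\n", buf.Bytes())
}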