diff --git a/.github/workflows/buildAndTestRyzenAI.yml b/.github/workflows/buildAndTestRyzenAI.yml index 2c1b86815ff..67d7bc4a95d 100644 --- a/.github/workflows/buildAndTestRyzenAI.yml +++ b/.github/workflows/buildAndTestRyzenAI.yml @@ -61,7 +61,7 @@ jobs: matrix: runner_type: [ amd7940hs, amdhx370 ] env: - IRON_CACHE_HOME: ${{ github.workspace }}/iron-cache-${{ matrix.runner_type }}-${{ github.run_id }} + NPU_CACHE_HOME: ${{ github.workspace }}/iron-cache-${{ matrix.runner_type }}-${{ github.run_id }} PIP_CACHE_DIR: ${{ github.workspace }}/.pip-cache-${{ matrix.runner_type }}-${{ github.run_id }} steps: - uses: actions/checkout@v4 @@ -130,18 +130,21 @@ jobs: $CMAKE_ARGS # Create runner-specific cache directory - rm -rf $IRON_CACHE_HOME - mkdir $IRON_CACHE_HOME + rm -rf $NPU_CACHE_HOME + mkdir $NPU_CACHE_HOME + + # Set number of contexts to maintain in cache per process + export XRT_CONTEXT_CACHE_SIZE=3 ninja install ninja check-aie ninja check-aie-concurrency popd - - name: Cleanup IRON_CACHE_HOME + - name: Cleanup NPU_CACHE_HOME if: always() run: | - rm -rf $IRON_CACHE_HOME + rm -rf $NPU_CACHE_HOME rm -rf $PIP_CACHE_DIR build-quick-setup: @@ -152,7 +155,7 @@ jobs: matrix: runner_type: [ amd7940hs, amdhx370 ] env: - IRON_CACHE_HOME: ${{ github.workspace }}/iron-cache-${{ matrix.runner_type }}-${{ github.run_id }} + NPU_CACHE_HOME: ${{ github.workspace }}/iron-cache-${{ matrix.runner_type }}-${{ github.run_id }} PIP_CACHE_DIR: ${{ github.workspace }}/.pip-cache-${{ matrix.runner_type }}-${{ github.run_id }} steps: - uses: actions/checkout@v4 @@ -200,17 +203,20 @@ jobs: LIT_OPTS="-j12 $LIT_OPTS" fi + # Set number of contexts to maintain in cache per process + export XRT_CONTEXT_CACHE_SIZE=3 + # Create runner-specific cache directory - rm -rf $IRON_CACHE_HOME - mkdir $IRON_CACHE_HOME + rm -rf $NPU_CACHE_HOME + mkdir $NPU_CACHE_HOME ninja install ninja check-reference-designs ninja check-programming-guide popd - - name: Cleanup IRON_CACHE_HOME + - name: Cleanup 
NPU_CACHE_HOME if: always() run: | - rm -rf $IRON_CACHE_HOME + rm -rf $NPU_CACHE_HOME rm -rf $PIP_CACHE_DIR diff --git a/.github/workflows/buildAndTestVitis.yml b/.github/workflows/buildAndTestVitis.yml index 5fbac0e7979..5f8b25e6fe3 100644 --- a/.github/workflows/buildAndTestVitis.yml +++ b/.github/workflows/buildAndTestVitis.yml @@ -61,7 +61,7 @@ jobs: matrix: runner_type: [ ubuntu-vitis ] env: - IRON_CACHE_HOME: ${{ github.workspace }}/iron-cache-${{ matrix.runner_type }}-${{ github.run_id }} + NPU_CACHE_HOME: ${{ github.workspace }}/iron-cache-${{ matrix.runner_type }}-${{ github.run_id }} steps: - uses: actions/checkout@v4 with: @@ -125,13 +125,13 @@ jobs: $CMAKE_ARGS # Create runner-specific cache directory - rm -rf $IRON_CACHE_HOME - mkdir $IRON_CACHE_HOME + rm -rf $NPU_CACHE_HOME + mkdir $NPU_CACHE_HOME ninja install ninja check-aie popd - - name: Cleanup IRON_CACHE_HOME + - name: Cleanup NPU_CACHE_HOME if: always() - run: rm -rf $IRON_CACHE_HOME + run: rm -rf $NPU_CACHE_HOME diff --git a/programming_examples/basic/matrix_multiplication/makefile-common b/programming_examples/basic/matrix_multiplication/makefile-common index 44e0216288c..1d7c98768c4 100644 --- a/programming_examples/basic/matrix_multiplication/makefile-common +++ b/programming_examples/basic/matrix_multiplication/makefile-common @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2025, Advanced Micro Devices, Inc. +# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. 
# ##===----------------------------------------------------------------------===## @@ -196,13 +196,13 @@ run: ${targetname}.exe ${xclbin_target} trace: ${targetname}.exe ${trace_xclbin_target} ${insts_target} export XRT_HACK_UNSECURE_LOADING_XCLBIN=1 && \ ${powershell} ./$< -x ${trace_xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N ${runargs} -t ${trace_size} - ${srcdir}/../../../../python/utils/parse_trace.py --input trace.txt --mlir ${trace_mlir_target} --output trace_mm.json + ${srcdir}/../../../../python/utils/trace/parse.py --input trace.txt --mlir ${trace_mlir_target} --output trace_mm.json # ${powershell} ./$< -x ${trace_xclbin_target} -i ${insts_target} -k MLIR_AIE -M $M -K $K -N $N -v 1 --warmup 0 --iters 1 -t ${trace_size} .PHONY: parse_trace parse_trace: - ${srcdir}/../../../../python/utils/parse_trace.py --input trace.txt --mlir ${trace_mlir_target} --output trace_mm.json + ${srcdir}/../../../../python/utils/trace/parse.py --input trace.txt --mlir ${trace_mlir_target} --output trace_mm.json .PHONY: clean clean: clean_trace diff --git a/programming_examples/basic/matrix_multiplication/single_core/single_core.py b/programming_examples/basic/matrix_multiplication/single_core/single_core.py index a1bd9a4c847..0a9ccbbc2da 100644 --- a/programming_examples/basic/matrix_multiplication/single_core/single_core.py +++ b/programming_examples/basic/matrix_multiplication/single_core/single_core.py @@ -3,7 +3,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 AMD Inc. +# (c) Copyright 2025-2026 AMD Inc. 
import argparse import numpy as np import sys @@ -12,7 +12,6 @@ from aie.dialects.aie import * from aie.dialects.aiex import * import aie.utils.trace as trace_utils -from aie.utils.trace import PortEvent from aie.iron.controlflow import range_ from aie.iron.dtype import str_to_dtype @@ -280,28 +279,28 @@ def sequence(A, B, C): trace_size=trace_size, coretile_events=[ # captures input A (PORT_RUNNING_0, at port number 1, master for inputs) - trace_utils.PortEvent( - trace_utils.CoreEvent.PORT_RUNNING_0, + trace_utils.events.PortEvent( + trace_utils.events.CoreEvent.PORT_RUNNING_0, port_number=1, master=True, ), # captures input B (PORT_RUNNING_1, at port number 2, master for inputs) - trace_utils.PortEvent( - trace_utils.CoreEvent.PORT_RUNNING_1, + trace_utils.events.PortEvent( + trace_utils.events.CoreEvent.PORT_RUNNING_1, port_number=2, master=True, ), # captures output C (PORT_RUNNING_2, at port number 1, slave for outputs) - trace_utils.PortEvent( - trace_utils.CoreEvent.PORT_RUNNING_2, + trace_utils.events.PortEvent( + trace_utils.events.CoreEvent.PORT_RUNNING_2, port_number=1, master=False, ), - trace_utils.CoreEvent.INSTR_EVENT_0, - trace_utils.CoreEvent.INSTR_EVENT_1, - trace_utils.CoreEvent.MEMORY_STALL, - trace_utils.CoreEvent.LOCK_STALL, - trace_utils.CoreEvent.INSTR_VECTOR, + trace_utils.events.CoreEvent.INSTR_EVENT_0, + trace_utils.events.CoreEvent.INSTR_EVENT_1, + trace_utils.events.CoreEvent.MEMORY_STALL, + trace_utils.events.CoreEvent.LOCK_STALL, + trace_utils.events.CoreEvent.INSTR_VECTOR, ], ) diff --git a/programming_examples/basic/packet_switch/test.cpp b/programming_examples/basic/packet_switch/test.cpp index c9b6fd6e572..0502f93c4a6 100644 --- a/programming_examples/basic/packet_switch/test.cpp +++ b/programming_examples/basic/packet_switch/test.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2025, Advanced Micro Devices, Inc. 
+// Copyright (C) 2025-2026, Advanced Micro Devices, Inc. // //===----------------------------------------------------------------------===// @@ -113,10 +113,13 @@ int main(int argc, const char *argv[]) { ref = srcVecA[i] + 2; // ref for the second input packet } if (*(bufOut + i) != ref) { - if (errors < 10) { + if (errors < 100) { std::cout << "Error in output " << i << "; Input: " << srcVecA[i] << "; Output: " << *(bufOut + i) << " != reference:" << ref << std::endl; + } else if (errors == 100) { + std::cout << "..." << std::endl; + std::cout << "[Errors truncated]" << std::endl; } errors++; } diff --git a/programming_examples/basic/passthrough_kernel/Makefile b/programming_examples/basic/passthrough_kernel/Makefile index 9a9afc3f0e8..6f6bea62865 100755 --- a/programming_examples/basic/passthrough_kernel/Makefile +++ b/programming_examples/basic/passthrough_kernel/Makefile @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
# ##===----------------------------------------------------------------------===## @@ -82,13 +82,13 @@ run_py: build/final_${data_size}.xclbin build/insts_${data_size}.bin trace: ${targetname}_${data_size}.exe build/final_trace_${data_size}.xclbin build/insts_${data_size}.bin ${powershell} ./$< -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.bin -k MLIR_AIE -t ${trace_size} - ${srcdir}/../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie2_trace_lineBased_8b_${data_size}.mlir --output trace_passthrough_kernel.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input trace_passthrough_kernel.json + ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie2_trace_lineBased_8b_${data_size}.mlir --output trace_passthrough_kernel.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_passthrough_kernel.json trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.bin ${powershell} python3 ${srcdir}/test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.bin -k MLIR_AIE -t ${trace_size} -i1s ${in1_size} -os ${out_size} - ${srcdir}/../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie2_trace_lineBased_8b_${data_size}.mlir --output trace_${targetname}.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input trace_${targetname}.json + ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie2_trace_lineBased_8b_${data_size}.mlir --output trace_${targetname}.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_${targetname}.json clean_trace: rm -rf tmpTrace trace.txt parse*json trace*json diff --git a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit index e9793b181ec..ac8241c7510 100755 --- a/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit +++ 
b/programming_examples/basic/passthrough_kernel/run_strix_makefile.lit @@ -1,4 +1,4 @@ -// (c) Copyright 2024 Advanced Micro Devices, Inc. +// (c) Copyright 2024-2026 Advanced Micro Devices, Inc. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // REQUIRES: ryzen_ai_npu2, peano @@ -9,4 +9,4 @@ // RUN: make -f %S/Makefile devicename=npu2 // RUN: %run_on_npu2% make -f %S/Makefile run devicename=npu2 // RUN: make -f %S/Makefile clean -// RUN: env CHESS=false %run_on_npu2% make -f %S/Makefile trace devicename=npu2 +// RUN: env CHESS=false %run_on_npu2% make -f %S/Makefile trace devicename=npu2 diff --git a/programming_examples/basic/passthrough_kernel/test.py b/programming_examples/basic/passthrough_kernel/test.py index a5df3f2d1ad..2e4c1c2a6f9 100644 --- a/programming_examples/basic/passthrough_kernel/test.py +++ b/programming_examples/basic/passthrough_kernel/test.py @@ -4,16 +4,16 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. 
or its affiliates import numpy as np import sys -import aie.utils.xrt as xrt_utils import aie.utils.test as test_utils +import aie.iron as iron +from aie.utils import DefaultNPURuntime def main(opts): in1_size = int(opts.in1_size) # in bytes - in2_size = int(opts.in2_size) # in bytes out_size = int(opts.out_size) # in bytes # -------------------------------------------------------------------------- @@ -36,28 +36,23 @@ def main(opts): assert out_size == in1_size # Initialize data - in1_data = np.arange(0, in1_volume, dtype=in1_dtype) - out_data = np.zeros([out_volume], dtype=out_dtype) - - # Define reference data - ref = in1_data + ref = np.arange(0, in1_volume, dtype=in1_dtype) + in1 = iron.tensor(ref, dtype=in1_dtype) + out = iron.zeros([out_volume], dtype=out_dtype) # -------------------------------------------------------------------------- print("Running...\n") - res = xrt_utils.setup_and_run_aie( - in1_dtype, - None, - out_dtype, - in1_data, - None, - out_data, - in1_volume, - None, - out_volume, - ref, - opts, + npu_opts = test_utils.create_npu_kernel(opts) + res = DefaultNPURuntime.run_test( + npu_opts.npu_kernel, + [in1, out], + {1: ref}, + verify=npu_opts.verify, + verbosity=npu_opts.verbosity, ) + if res == 0: + print("\nPASS!\n") sys.exit(res) diff --git a/programming_examples/basic/passthrough_pykernel/passthrough_pykernel.ipynb b/programming_examples/basic/passthrough_pykernel/passthrough_pykernel.ipynb index 7d7d387050c..83fa6c0524c 100644 --- a/programming_examples/basic/passthrough_pykernel/passthrough_pykernel.ipynb +++ b/programming_examples/basic/passthrough_pykernel/passthrough_pykernel.ipynb @@ -24,7 +24,9 @@ "from aie.helpers.dialects.func import func\n", "from aie.iron.controlflow import range_\n", "\n", - "from aie.utils.xrt import setup_aie, execute as execute_on_aie\n", + "from aie.utils import NPUKernel, DefaultNPURuntime, get_current_device\n", + "import aie.iron as iron\n", + "from aie.iron.device import NPU2\n", "import aie.utils.test 
as test_utils" ] }, @@ -69,7 +71,11 @@ " N = vector_size\n", " lineWidthInBytes = N // 4 # chop input in 4 sub-tensors\n", "\n", - " @device(AIEDevice.npu1_1col)\n", + " d = get_current_device()\n", + " if d is None:\n", + " raise RuntimeError(\"Could not determine current device\")\n", + "\n", + " @device(d.resolve())\n", " def device_body():\n", " # define types\n", " line_ty = np.ndarray[(lineWidthInBytes,), np.dtype[np.uint8]]\n", @@ -106,20 +112,15 @@ "\n", " @runtime_sequence(vector_ty, vector_ty, vector_ty)\n", " def sequence(inTensor, outTensor, notUsed):\n", - " npu_dma_memcpy_nd(\n", - " metadata=of_in,\n", - " bd_id=0,\n", - " mem=inTensor,\n", - " sizes=[1, 1, 1, N],\n", - " issue_token=True,\n", + " in_task = shim_dma_single_bd_task(\n", + " of_in, inTensor, sizes=[1, 1, 1, N], issue_token=True\n", " )\n", - " npu_dma_memcpy_nd(\n", - " metadata=of_out,\n", - " bd_id=1,\n", - " mem=outTensor,\n", - " sizes=[1, 1, 1, N],\n", + " out_task = shim_dma_single_bd_task(\n", + " of_out, outTensor, sizes=[1, 1, 1, N], issue_token=True\n", " )\n", - " dma_wait(of_in, of_out)" + "\n", + " dma_start_task(in_task, out_task)\n", + " dma_await_task(in_task, out_task)" ] }, { @@ -181,19 +182,22 @@ "source": [ "dtype = np.uint8\n", "\n", - "app = setup_aie(\n", + "npu_kernel = NPUKernel(\n", " \"notebook_build/notebook.xclbin\",\n", " \"notebook_build/notebook_insts.bin\",\n", - " VECTOR_SIZE,\n", - " dtype,\n", - " None,\n", - " None,\n", - " VECTOR_SIZE,\n", - " dtype,\n", + " kernel_name=\"MLIR_AIE\",\n", ")\n", + "kernel_handle = DefaultNPURuntime.load(npu_kernel)\n", + "\n", "input = np.arange(1, VECTOR_SIZE + 1, dtype=dtype)\n", + "in_tensor = iron.tensor(input, dtype=dtype)\n", + "out_tensor = iron.zeros(VECTOR_SIZE, dtype=dtype)\n", + "\n", + "buffers = [in_tensor, out_tensor]\n", + "\n", "print(\"Running...\")\n", - "aie_output = execute_on_aie(app, input)\n", + "DefaultNPURuntime.run(kernel_handle, buffers)\n", + "aie_output = out_tensor.numpy()\n", "\n", "# 
Copy output results and verify they are correct\n", "errors = 0\n", diff --git a/programming_examples/basic/passthrough_pykernel/test.py b/programming_examples/basic/passthrough_pykernel/test.py index 1d7c5b6608f..46b57654f16 100644 --- a/programming_examples/basic/passthrough_pykernel/test.py +++ b/programming_examples/basic/passthrough_pykernel/test.py @@ -4,11 +4,12 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates import numpy as np import sys -from aie.utils.xrt import setup_aie, execute import aie.utils.test as test_utils +import aie.iron as iron +from aie.utils import DefaultNPURuntime def main(opts): @@ -17,34 +18,21 @@ def main(opts): data_size = int(opts.size) dtype = np.uint8 - app = setup_aie( - opts.xclbin, - opts.instr, - data_size, - dtype, - None, - None, - data_size, - dtype, + input_data = np.arange(1, data_size + 1, dtype=dtype) + in1 = iron.tensor(input_data, dtype=dtype) + out = iron.zeros(data_size, dtype=dtype) + + npu_opts = test_utils.create_npu_kernel(opts) + res = DefaultNPURuntime.run_test( + npu_opts.npu_kernel, + [in1, out], + {1: input_data}, + verify=npu_opts.verify, + verbosity=npu_opts.verbosity, ) - input = np.arange(1, data_size + 1, dtype=dtype) - aie_output = execute(app, input) - - # Copy output results and verify they are correct - errors = 0 - if opts.verify: - if opts.verbosity >= 1: - print("Verifying results ...") - e = np.equal(input, aie_output) - errors = np.size(e) - np.count_nonzero(e) - - if not errors: + if res == 0: print("\nPASS!\n") - exit(0) - else: - print("\nError count: ", errors) - print("\nFailed.\n") - exit(-1) + sys.exit(res) if __name__ == "__main__": diff --git a/programming_examples/basic/tiling_exploration/per_tile/test.py b/programming_examples/basic/tiling_exploration/per_tile/test.py 
index 048c09a09b6..cf364a82c6f 100644 --- a/programming_examples/basic/tiling_exploration/per_tile/test.py +++ b/programming_examples/basic/tiling_exploration/per_tile/test.py @@ -4,12 +4,15 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates import argparse import numpy as np from aie.helpers.taplib import TensorTiler2D -from aie.utils.xrt import setup_aie, execute +import aie.utils.test as test_utils +import aie.iron as iron +from aie.utils import DefaultNPURuntime +import sys def main(opts): @@ -23,57 +26,23 @@ def main(opts): ) reference_access_order = reference_tiler.access_order() - app = setup_aie( - opts.xclbin, - opts.instr, - None, - None, - None, - None, - data_size, - dtype, - ) - aie_output = execute(app) - aie_output = aie_output.reshape((opts.tensor_height, opts.tensor_width)) - - # Copy output results and verify they are correct - errors = 0 - if opts.verbosity >= 1: - print("Verifying results ...") - e = np.equal(reference_access_order, aie_output) - errors = np.size(e) - np.count_nonzero(e) + out = iron.zeros(data_size, dtype=dtype) - if not errors: + npu_opts = test_utils.create_npu_kernel(opts) + res = DefaultNPURuntime.run_test( + npu_opts.npu_kernel, + [out], + {0: reference_access_order.flatten()}, + verify=npu_opts.verify, + verbosity=npu_opts.verbosity, + ) + if res == 0: print("\nPASS!\n") - exit(0) - else: - print("\nError count: ", errors) - print("\nFailed.\n") - exit(-1) + sys.exit(res) def get_arg_parser(): - p = argparse.ArgumentParser() - p.add_argument( - "-x", "--xclbin", default="final.xclbin", dest="xclbin", help="the xclbin path" - ) - p.add_argument( - "-k", - "--kernel", - dest="kernel", - default="MLIR_AIE", - help="the kernel name in the XCLBIN (for instance MLIR_AIE)", - ) - p.add_argument( - "-v", "--verbosity", 
default=0, type=int, help="the verbosity of the output" - ) - p.add_argument( - "-i", - "--instr", - dest="instr", - default="instr.bin", - help="path of file containing userspace instructions sent to the NPU", - ) + p = test_utils.create_default_argparser() p.add_argument("--tensor-height", required=True, help="Tensor height", type=int) p.add_argument("--tensor-width", required=True, help="Tensor width", type=int) p.add_argument("--tile-height", required=True, help="Tile height", type=int) diff --git a/programming_examples/basic/tiling_exploration/tile_group/test.py b/programming_examples/basic/tiling_exploration/tile_group/test.py index a7261296b2f..cf364a82c6f 100644 --- a/programming_examples/basic/tiling_exploration/tile_group/test.py +++ b/programming_examples/basic/tiling_exploration/tile_group/test.py @@ -4,12 +4,15 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. 
or its affiliates import argparse import numpy as np from aie.helpers.taplib import TensorTiler2D -from aie.utils.xrt import setup_aie, execute as execute_on_aie +import aie.utils.test as test_utils +import aie.iron as iron +from aie.utils import DefaultNPURuntime +import sys def main(opts): @@ -23,57 +26,23 @@ def main(opts): ) reference_access_order = reference_tiler.access_order() - app = setup_aie( - opts.xclbin, - opts.instr, - None, - None, - None, - None, - data_size, - dtype, - ) - aie_output = execute_on_aie(app) - aie_output = aie_output.reshape((opts.tensor_height, opts.tensor_width)) - - # Copy output results and verify they are correct - errors = 0 - if opts.verbosity >= 1: - print("Verifying results ...") - e = np.equal(reference_access_order, aie_output) - errors = np.size(e) - np.count_nonzero(e) + out = iron.zeros(data_size, dtype=dtype) - if not errors: + npu_opts = test_utils.create_npu_kernel(opts) + res = DefaultNPURuntime.run_test( + npu_opts.npu_kernel, + [out], + {0: reference_access_order.flatten()}, + verify=npu_opts.verify, + verbosity=npu_opts.verbosity, + ) + if res == 0: print("\nPASS!\n") - exit(0) - else: - print("\nError count: ", errors) - print("\nFailed.\n") - exit(-1) + sys.exit(res) def get_arg_parser(): - p = argparse.ArgumentParser() - p.add_argument( - "-x", "--xclbin", default="final.xclbin", dest="xclbin", help="the xclbin path" - ) - p.add_argument( - "-k", - "--kernel", - dest="kernel", - default="MLIR_AIE", - help="the kernel name in the XCLBIN (for instance MLIR_AIE)", - ) - p.add_argument( - "-v", "--verbosity", default=0, type=int, help="the verbosity of the output" - ) - p.add_argument( - "-i", - "--instr", - dest="instr", - default="instr.bin", - help="path of file containing userspace instructions sent to the NPU", - ) + p = test_utils.create_default_argparser() p.add_argument("--tensor-height", required=True, help="Tensor height", type=int) p.add_argument("--tensor-width", required=True, help="Tensor width", 
type=int) p.add_argument("--tile-height", required=True, help="Tile height", type=int) diff --git a/programming_examples/basic/vector_exp/test.cpp b/programming_examples/basic/vector_exp/test.cpp index 5d3cd509d7b..1c179c0d4df 100644 --- a/programming_examples/basic/vector_exp/test.cpp +++ b/programming_examples/basic/vector_exp/test.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2023, Advanced Micro Devices, Inc. +// Copyright (C) 2023-2026, Advanced Micro Devices, Inc. // //===----------------------------------------------------------------------===// @@ -45,7 +45,12 @@ int verify(int CSize, std::vector A, std::vector C, int verbosity) { if (std::isnan(ref) || std::isnan(C[i])) break; if (!test_utils::nearly_equal(ref, C[i], 0.128)) { - std::cout << "Error in output " << C[i] << " != " << ref << std::endl; + if (errors < 100) { + std::cout << "Error in output " << C[i] << " != " << ref << std::endl; + } else if (errors == 100) { + std::cout << "..." << std::endl; + std::cout << "[Errors truncated]" << std::endl; + } errors++; } else { if (verbosity > 1) diff --git a/programming_examples/basic/vector_reduce_add/Makefile b/programming_examples/basic/vector_reduce_add/Makefile index 170850a6404..3ccd394ced5 100644 --- a/programming_examples/basic/vector_reduce_add/Makefile +++ b/programming_examples/basic/vector_reduce_add/Makefile @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
# ##===----------------------------------------------------------------------===## @@ -60,7 +60,7 @@ run: ${targetname}.exe build/final.xclbin ${powershell} ./$< -x build/final.xclbin -i build/insts.bin -k MLIR_AIE trace: - ../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie.mlir --output parse_eventIR_vs.json + ../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie.mlir --output parse_eventIR_vs.json clean_trace: rm -rf tmpTrace trace.txt diff --git a/programming_examples/basic/vector_reduce_max/multi_column_designs/Makefile b/programming_examples/basic/vector_reduce_max/multi_column_designs/Makefile index f2f0a656390..d86e6d938a0 100755 --- a/programming_examples/basic/vector_reduce_max/multi_column_designs/Makefile +++ b/programming_examples/basic/vector_reduce_max/multi_column_designs/Makefile @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2025, Advanced Micro Devices, Inc. +# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. 
# ##===----------------------------------------------------------------------===## @@ -98,11 +98,11 @@ run_py: build/final.xclbin build/insts.bin trace: ${targetname}.exe build/final_trace.xclbin build/insts.bin ${powershell} ./$< -x build/final_trace.xclbin -i build/insts.bin -k MLIR_AIE -t ${trace_size} - ${srcdir}/../../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie2_trace.mlir --output trace_vector_reduce_max.json - ${srcdir}/../../../../python/utils/get_trace_summary.py --input trace_vector_reduce_max.json + ${srcdir}/../../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie2_trace.mlir --output trace_vector_reduce_max.json + ${srcdir}/../../../../python/utils/trace/get_trace_summary.py --input trace_vector_reduce_max.json clean_trace: rm -rf tmpTrace trace.txt parse*json trace*json clean: - rm -rf build _build ${targetname}*.exe \ No newline at end of file + rm -rf build _build ${targetname}*.exe diff --git a/programming_examples/basic/vector_reduce_max/single_column_designs/Makefile b/programming_examples/basic/vector_reduce_max/single_column_designs/Makefile index af9f1f908bf..7c4a8e7a0c7 100644 --- a/programming_examples/basic/vector_reduce_max/single_column_designs/Makefile +++ b/programming_examples/basic/vector_reduce_max/single_column_designs/Makefile @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2025, Advanced Micro Devices, Inc. +# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. 
# ##===----------------------------------------------------------------------===## @@ -99,11 +99,11 @@ run_py: build/final.xclbin build/insts.bin trace: ${targetname}.exe build/final_trace.xclbin build/insts.bin ${powershell} ./$< -x build/final_trace.xclbin -i build/insts.bin -k MLIR_AIE -t ${trace_size} - ${srcdir}/../../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie2_trace.mlir --output trace_vector_reduce_max.json - ${srcdir}/../../../../python/utils/get_trace_summary.py --input trace_vector_reduce_max.json + ${srcdir}/../../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie2_trace.mlir --output trace_vector_reduce_max.json + ${srcdir}/../../../../python/utils/trace/get_trace_summary.py --input trace_vector_reduce_max.json clean_trace: rm -rf tmpTrace trace.txt parse*json trace*json clean: - rm -rf build _build ${targetname}*.exe \ No newline at end of file + rm -rf build _build ${targetname}*.exe diff --git a/programming_examples/basic/vector_reduce_max/single_core_designs/Makefile b/programming_examples/basic/vector_reduce_max/single_core_designs/Makefile index 283a0fbf3b6..679da4117b0 100755 --- a/programming_examples/basic/vector_reduce_max/single_core_designs/Makefile +++ b/programming_examples/basic/vector_reduce_max/single_core_designs/Makefile @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2025, Advanced Micro Devices, Inc. +# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. 
# ##===----------------------------------------------------------------------===## @@ -88,11 +88,11 @@ run_py: build/final.xclbin build/insts.bin trace: ${targetname}.exe build/final_trace.xclbin build/insts.bin ${powershell} ./$< -x build/final_trace.xclbin -i build/insts.bin -k MLIR_AIE -t ${trace_size} - ${srcdir}/../../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie2_trace.mlir --output trace_vector_reduce_max.json - ${srcdir}/../../../../python/utils/get_trace_summary.py --input trace_vector_reduce_max.json + ${srcdir}/../../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie2_trace.mlir --output trace_vector_reduce_max.json + ${srcdir}/../../../../python/utils/trace/get_trace_summary.py --input trace_vector_reduce_max.json clean_trace: rm -rf tmpTrace trace.txt parse*json trace*json clean: - rm -rf build _build ${targetname}*.exe \ No newline at end of file + rm -rf build _build ${targetname}*.exe diff --git a/programming_examples/basic/vector_reduce_min/Makefile b/programming_examples/basic/vector_reduce_min/Makefile index 682bb54c6ec..d4b6c61be20 100755 --- a/programming_examples/basic/vector_reduce_min/Makefile +++ b/programming_examples/basic/vector_reduce_min/Makefile @@ -60,7 +60,7 @@ run: ${targetname}.exe build/final.xclbin ${powershell} ./$< -x build/final.xclbin -i build/insts.bin -k MLIR_AIE --warmup 10 --iters 20 trace: - ../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie.mlir --output parse_eventIR_vs.json + ../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie.mlir --output parse_eventIR_vs.json clean_trace: rm -rf tmpTrace trace.txt diff --git a/programming_examples/basic/vector_reduce_min/README.md b/programming_examples/basic/vector_reduce_min/README.md index 91c94aae2fc..5aac518b7c3 100644 --- a/programming_examples/basic/vector_reduce_min/README.md +++ b/programming_examples/basic/vector_reduce_min/README.md @@ -1,101 +1,101 @@ - - -# Vector Reduce Min: - -This 
example showcases both **JIT** and **non-JIT** approaches for running IRON designs. A single tile performs a very simple reduction operation where the kernel loads data from local memory, performs the `min` reduction and stores the resulting value back. - -Input data is brought to the local memory of the Compute tile from a Shim tile. The size of the input data `N` from the Shim tile is configurable (default: `1024xi32` for the non-JIT version, customizable via command-line arguments for the JIT version). The data is copied to the AIE tile, where the reduction is performed. The single output data value is copied from the AIE tile to the Shim tile. Both approaches offer different compilation workflows with the JIT version adding microseconds runtime overhead. - -## Source Files Overview - -### JIT Approach Files - -1. **`vector_reduce_min_jit.py`**: A JIT (Just-In-Time) compiled version using IRON's `@iron.jit` decorator. This approach offers faster development iteration by compiling and executing the design at runtime, with support for command-line arguments to customize the number of elements. - -### Non-JIT Approach Files - -1. **`vector_reduce_min.py`**: A Python script that defines the AIE array structural design using MLIR-AIE operations. This generates MLIR that is then compiled using `aiecc.py` to produce design binaries (ie. XCLBIN and inst.bin for the NPU in Ryzen™ AI). - -1. **`vector_reduce_min_placed.py`**: An alternative version of the design in `vector_reduce_min.py`, that is expressed in a lower-level version of IRON. - -1. **`test.cpp`**: This C++ code is a testbench for the non-JIT design example targetting Ryzen™ AI (AIE2). The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the program verifies the results. - -### Shared Files - -1. **`reduce_min.cc`**: A C++ implementation of a vectorized `min` reduction operation for AIE cores. 
The code uses the AIE API, which is a C++ header-only library providing types and operations that get translated into efficient low-level intrinsics, and whose documentation can be found [here](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/index.html). The source can be found [here](../../../aie_kernels/aie2/reduce_min.cc). - -## Usage - -### JIT Approach (Just-In-Time Compilation) - -The JIT approach uses IRON's `@iron.jit` decorator for runtime compilation, offering faster development iteration and more flexible parameterization. - -#### Running the JIT Version - -To run the JIT version with default parameters (1024 elements): -```shell -python vector_reduce_min_jit.py -``` - -To run with custom number of elements: -```shell -python vector_reduce_min_jit.py --num-elements 2048 -``` - -Or using the short form: -```shell -python vector_reduce_min_jit.py -n 512 -``` - -### Non-JIT Approach - -The non-JIT approach uses traditional MLIR-AIE compilation where the design is compiled ahead-of-time to produce binaries. 
- -#### Compilation - -To compile the design: -```shell -make -``` - -To compile the placed design: -```shell -env use_placed=1 make -``` - -To compile the C++ testbench: -```shell -make vector_reduce_min.exe -``` - -#### C++ Testbench - -To run the design: -```shell -make run -``` - -#### JIT vs Non-JIT Comparison - -| Aspect | Non-JIT Approach | JIT Approach | -|--------|------------------|--------------| -| **Compilation** | Ahead-of-time via `aiecc.py` | Runtime compilation | -| **Development Speed** | Slower (manual make/compilation) | Faster (compilation integrated) | -| **Host Code** | C++ testbench (`test.cpp`) | Python script | -| **Performance** | Baseline execution time | Microseconds overhead from JIT runtime | -| **Flexibility** | Fixed at compile time | Runtime parameterization | -| **Use Case** | Explicit XCLBIN management | Dynamic compilation | -| **Binary Output** | Generates XCLBIN/inst.bin | Cached binaries in `IRON_CACHE_HOME` (defaults to `~/.iron/`) | - -**When to use each approach:** -- **Use JIT** for rapid prototyping, experimentation, runtime flexibility, and when you don't need control over XCLBINs -- **Use non-JIT** when you need explicit XCLBIN control, working with existing MLIR-AIE workflows, or distributing pre-compiled binaries - + + +# Vector Reduce Min: + +This example showcases both **JIT** and **non-JIT** approaches for running IRON designs. A single tile performs a very simple reduction operation where the kernel loads data from local memory, performs the `min` reduction and stores the resulting value back. + +Input data is brought to the local memory of the Compute tile from a Shim tile. The size of the input data `N` from the Shim tile is configurable (default: `1024xi32` for the non-JIT version, customizable via command-line arguments for the JIT version). The data is copied to the AIE tile, where the reduction is performed. The single output data value is copied from the AIE tile to the Shim tile. 
Both approaches offer different compilation workflows with the JIT version adding microseconds runtime overhead. + +## Source Files Overview + +### JIT Approach Files + +1. **`vector_reduce_min_jit.py`**: A JIT (Just-In-Time) compiled version using IRON's `@iron.jit` decorator. This approach offers faster development iteration by compiling and executing the design at runtime, with support for command-line arguments to customize the number of elements. + +### Non-JIT Approach Files + +1. **`vector_reduce_min.py`**: A Python script that defines the AIE array structural design using MLIR-AIE operations. This generates MLIR that is then compiled using `aiecc.py` to produce design binaries (i.e. XCLBIN and inst.bin for the NPU in Ryzen™ AI). + +1. **`vector_reduce_min_placed.py`**: An alternative version of the design in `vector_reduce_min.py`, that is expressed in a lower-level version of IRON. + +1. **`test.cpp`**: This C++ code is a testbench for the non-JIT design example targeting Ryzen™ AI (AIE2). The code is responsible for loading the compiled XCLBIN file, configuring the AIE module, providing input data, and executing the AIE design on the NPU. After executing, the program verifies the results. + +### Shared Files + +1. **`reduce_min.cc`**: A C++ implementation of a vectorized `min` reduction operation for AIE cores. The code uses the AIE API, which is a C++ header-only library providing types and operations that get translated into efficient low-level intrinsics, and whose documentation can be found [here](https://www.xilinx.com/htmldocs/xilinx2023_2/aiengine_api/aie_api/doc/index.html). The source can be found [here](../../../aie_kernels/aie2/reduce_min.cc). + +## Usage + +### JIT Approach (Just-In-Time Compilation) + +The JIT approach uses IRON's `@iron.jit` decorator for runtime compilation, offering faster development iteration and more flexible parameterization. 
+ +#### Running the JIT Version + +To run the JIT version with default parameters (1024 elements): +```shell +python vector_reduce_min_jit.py +``` + +To run with custom number of elements: +```shell +python vector_reduce_min_jit.py --num-elements 2048 +``` + +Or using the short form: +```shell +python vector_reduce_min_jit.py -n 512 +``` + +### Non-JIT Approach + +The non-JIT approach uses traditional MLIR-AIE compilation where the design is compiled ahead-of-time to produce binaries. + +#### Compilation + +To compile the design: +```shell +make +``` + +To compile the placed design: +```shell +env use_placed=1 make +``` + +To compile the C++ testbench: +```shell +make vector_reduce_min.exe +``` + +#### C++ Testbench + +To run the design: +```shell +make run +``` + +#### JIT vs Non-JIT Comparison + +| Aspect | Non-JIT Approach | JIT Approach | +|--------|------------------|--------------| +| **Compilation** | Ahead-of-time via `aiecc.py` | Runtime compilation | +| **Development Speed** | Slower (manual make/compilation) | Faster (compilation integrated) | +| **Host Code** | C++ testbench (`test.cpp`) | Python script | +| **Performance** | Baseline execution time | Microseconds overhead from JIT runtime | +| **Flexibility** | Fixed at compile time | Runtime parameterization | +| **Use Case** | Explicit XCLBIN management | Dynamic compilation | +| **Binary Output** | Generates XCLBIN/inst.bin | Cached binaries in `NPU_CACHE_HOME` (defaults to `~/.npu/`) | + +**When to use each approach:** +- **Use JIT** for rapid prototyping, experimentation, runtime flexibility, and when you don't need control over XCLBINs +- **Use non-JIT** when you need explicit XCLBIN control, working with existing MLIR-AIE workflows, or distributing pre-compiled binaries + diff --git a/programming_examples/basic/vector_scalar_add/test.cpp b/programming_examples/basic/vector_scalar_add/test.cpp index 8d9ade9ee7b..08fd32df5e2 100644 --- a/programming_examples/basic/vector_scalar_add/test.cpp +++ 
b/programming_examples/basic/vector_scalar_add/test.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2023, Advanced Micro Devices, Inc. +// Copyright (C) 2023-2026, Advanced Micro Devices, Inc. // //===----------------------------------------------------------------------===// @@ -126,8 +126,13 @@ int main(int argc, const char *argv[]) { for (uint32_t i = 0; i < OUT_SIZE; i++) { uint32_t ref = i + 2; if (*(bufOut + i) != ref) { - std::cout << "Error in output " << *(bufOut + i) << " != " << ref - << std::endl; + if (errors < 100) { + std::cout << "Error in output " << *(bufOut + i) << " != " << ref + << std::endl; + } else if (errors == 100) { + std::cout << "..." << std::endl; + std::cout << "[Errors truncated]" << std::endl; + } errors++; } else { std::cout << "Correct output " << *(bufOut + i) << " == " << ref diff --git a/programming_examples/basic/vector_scalar_add_runlist/test.cpp b/programming_examples/basic/vector_scalar_add_runlist/test.cpp index 33def5c3233..a16e7c5cd60 100644 --- a/programming_examples/basic/vector_scalar_add_runlist/test.cpp +++ b/programming_examples/basic/vector_scalar_add_runlist/test.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2023, Advanced Micro Devices, Inc. +// Copyright (C) 2023-2026, Advanced Micro Devices, Inc. // //===----------------------------------------------------------------------===// @@ -157,8 +157,13 @@ int main(int argc, const char *argv[]) { for (uint32_t i = 0; i < OUT_SIZE; i++) { uint32_t ref = i + 2; if (*(bufOut_0 + i) != ref) { - std::cout << "Error in output " << *(bufOut_0 + i) << " != " << ref - << std::endl; + if (errors < 100) { + std::cout << "Error in output " << *(bufOut_0 + i) << " != " << ref + << std::endl; + } else if (errors == 100) { + std::cout << "..." 
<< std::endl; + std::cout << "[Errors truncated]" << std::endl; + } errors++; } else { std::cout << "Correct output " << *(bufOut_0 + i) << " == " << ref @@ -170,8 +175,13 @@ int main(int argc, const char *argv[]) { for (uint32_t i = 0; i < OUT_SIZE; i++) { uint32_t ref = i + 3; if (*(bufOut_1 + i) != ref) { - std::cout << "Error in output " << *(bufOut_1 + i) << " != " << ref - << std::endl; + if (errors < 100) { + std::cout << "Error in output " << *(bufOut_1 + i) << " != " << ref + << std::endl; + } else if (errors == 100) { + std::cout << "..." << std::endl; + std::cout << "[Errors truncated]" << std::endl; + } errors++; } else { std::cout << "Correct output " << *(bufOut_1 + i) << " == " << ref diff --git a/programming_examples/basic/vector_scalar_mul/Makefile b/programming_examples/basic/vector_scalar_mul/Makefile index ac90082ce97..cddb9dc4cc5 100644 --- a/programming_examples/basic/vector_scalar_mul/Makefile +++ b/programming_examples/basic/vector_scalar_mul/Makefile @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
# ##===----------------------------------------------------------------------===## @@ -107,13 +107,13 @@ run_py: build/final_${data_size}.xclbin build/insts_${data_size}.bin trace: ${targetname}_${data_size}.exe build/final_trace_${data_size}.xclbin build/insts_${data_size}.bin ${powershell} ./$< -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.bin -k MLIR_AIE -t ${trace_size} - ${srcdir}/../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie_trace_${data_size}.mlir --output trace_${targetname}.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input trace_${targetname}.json + ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace_${data_size}.mlir --output trace_${targetname}.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_${targetname}.json trace_py: build/final_trace_${data_size}.xclbin build/insts_${data_size}.bin ${powershell} python3 ${srcdir}/test.py -x build/final_trace_${data_size}.xclbin -i build/insts_${data_size}.bin -k MLIR_AIE -t ${trace_size} -i1s ${in1_size} -i2s ${in2_size} -os ${out_size} - ${srcdir}/../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie_trace_${data_size}.mlir --output trace_${targetname}.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input trace_${targetname}.json + ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace_${data_size}.mlir --output trace_${targetname}.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_${targetname}.json clean_trace: diff --git a/programming_examples/basic/vector_scalar_mul/test.py b/programming_examples/basic/vector_scalar_mul/test.py index 7d497f22bc0..de12559390e 100644 --- a/programming_examples/basic/vector_scalar_mul/test.py +++ b/programming_examples/basic/vector_scalar_mul/test.py @@ -4,11 +4,12 @@ # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates import numpy as np import sys -import aie.utils.xrt as xrt_utils import aie.utils.test as test_utils +import aie.iron as iron +from aie.utils import DefaultNPURuntime def main(opts): @@ -41,30 +42,29 @@ def main(opts): scale_factor = 3 # Initialize data - in1_data = np.arange(1, in1_volume + 1, dtype=in1_dtype) + ref = np.arange(1, in1_volume + 1, dtype=in1_dtype) + in1 = iron.tensor(ref, dtype=in1_dtype) in2_data = np.array([scale_factor], dtype=in2_dtype) - out_data = np.zeros([out_volume], dtype=out_dtype) - - # Define reference data - ref = np.arange(1, in1_volume + 1, dtype=out_dtype) * scale_factor + in2 = iron.tensor(in2_data, dtype=in2_dtype) + out = iron.zeros([out_volume], dtype=out_dtype) + ref = ref * scale_factor # -------------------------------------------------------------------------- + npu_opts = test_utils.create_npu_kernel(opts) + if npu_opts.npu_kernel.trace_config: + npu_opts.npu_kernel.trace_config.enable_ctrl_pkts = True + print("Running...\n") - res = xrt_utils.setup_and_run_aie( - in1_dtype, - in2_dtype, - out_dtype, - in1_data, - in2_data, - out_data, - in1_volume, - in2_volume, - out_volume, - ref, - opts, - enable_ctrl_pkts=True, + res = DefaultNPURuntime.run_test( + npu_opts.npu_kernel, + [in1, in2, out], + {2: ref}, + verify=npu_opts.verify, + verbosity=npu_opts.verbosity, ) + if res == 0: + print("\nPASS!\n") sys.exit(res) diff --git a/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_placed.py b/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_placed.py index 18db3758fd4..3c31921abb0 100644 --- a/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_placed.py +++ b/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_placed.py @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license 
information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates import numpy as np import argparse import sys @@ -15,8 +15,7 @@ from aie.iron.controlflow import range_ import aie.utils.trace as trace_utils -from aie.utils.trace import PortEvent -from aie.utils.trace_events.aie2 import CoreEvent, MemEvent +from aie.utils.trace.events import PortEvent, CoreEvent, MemEvent def my_vector_scalar_mul(dev, in1_size, in2_size, out_size, int_bit_width, trace_size): diff --git a/programming_examples/basic/vector_vector_add/run.lit b/programming_examples/basic/vector_vector_add/run.lit index 069ea7599a6..a13c5369c88 100644 --- a/programming_examples/basic/vector_vector_add/run.lit +++ b/programming_examples/basic/vector_vector_add/run.lit @@ -1,6 +1,6 @@ -// (c) Copyright 2025 Advanced Micro Devices, Inc. +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // REQUIRES: ryzen_ai_npu1, peano // -// RUN: %run_on_npu1% python3 %S/vector_vector_add.py --device=npu \ No newline at end of file +// RUN: %run_on_npu1% python3 %S/vector_vector_add.py \ No newline at end of file diff --git a/programming_examples/basic/vector_vector_add/run_placed.lit b/programming_examples/basic/vector_vector_add/run_placed.lit index 419d448d1a3..413bdc9e1bf 100644 --- a/programming_examples/basic/vector_vector_add/run_placed.lit +++ b/programming_examples/basic/vector_vector_add/run_placed.lit @@ -1,6 +1,6 @@ -// (c) Copyright 2025 Advanced Micro Devices, Inc. +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // REQUIRES: ryzen_ai, peano // -// RUN: %run_on_npu1% python3 %S/vector_vector_add_placed.py --device=npu \ No newline at end of file +// RUN: %run_on_npu1% python3 %S/vector_vector_add_placed.py \ No newline at end of file diff --git a/programming_examples/basic/vector_vector_add/run_strix.lit b/programming_examples/basic/vector_vector_add/run_strix.lit index 37a1c9360ff..9ce51ab2946 100644 --- a/programming_examples/basic/vector_vector_add/run_strix.lit +++ b/programming_examples/basic/vector_vector_add/run_strix.lit @@ -1,6 +1,6 @@ -// (c) Copyright 2025 Advanced Micro Devices, Inc. +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // REQUIRES: ryzen_ai_npu2, peano // -// RUN: %run_on_npu2% python3 %S/vector_vector_add.py --device=npu2 \ No newline at end of file +// RUN: %run_on_npu2% python3 %S/vector_vector_add.py \ No newline at end of file diff --git a/programming_examples/basic/vector_vector_add/run_strix_placed.lit b/programming_examples/basic/vector_vector_add/run_strix_placed.lit index b2d3335ec5f..a3bc9d855bc 100644 --- a/programming_examples/basic/vector_vector_add/run_strix_placed.lit +++ b/programming_examples/basic/vector_vector_add/run_strix_placed.lit @@ -1,6 +1,6 @@ -// (c) Copyright 2025 Advanced Micro Devices, Inc. +// (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // // REQUIRES: ryzen_ai_npu2, peano // -// RUN: %run_on_npu2% python3 %S/vector_vector_add_placed.py --device=npu2 \ No newline at end of file +// RUN: %run_on_npu2% python3 %S/vector_vector_add_placed.py \ No newline at end of file diff --git a/programming_examples/basic/vector_vector_add/vector_vector_add.py b/programming_examples/basic/vector_vector_add/vector_vector_add.py index 15189ee0739..72a482036ce 100644 --- a/programming_examples/basic/vector_vector_add/vector_vector_add.py +++ b/programming_examples/basic/vector_vector_add/vector_vector_add.py @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates import argparse import sys @@ -85,21 +85,10 @@ def core_body(of_in1, of_in2, of_out): def main(): - device_map = { - "npu": NPU1Col1(), - "npu2": NPU2Col1(), - } - parser = argparse.ArgumentParser() parser.add_argument( "-v", "--verbose", action="store_true", help="Enable verbose output" ) - parser.add_argument( - "-d", - "--device", - choices=["npu", "npu2"], - help="Target device", - ) parser.add_argument( "-n", "--num-elements", @@ -115,9 +104,6 @@ def main(): input1 = iron.randint(0, 100, (args.num_elements,), dtype=np.int32, device="npu") output = iron.zeros_like(input0) - if args.device: - iron.set_current_device(device_map[args.device]) - # JIT-compile the kernel then launches the kernel with the given arguments. 
Future calls # to the kernel will use the same compiled kernel and loaded code objects vector_vector_add(input0, input1, output) diff --git a/programming_examples/basic/vector_vector_add/vector_vector_add_placed.py b/programming_examples/basic/vector_vector_add/vector_vector_add_placed.py index ca4a3b10916..5fa1dbdbdac 100644 --- a/programming_examples/basic/vector_vector_add/vector_vector_add_placed.py +++ b/programming_examples/basic/vector_vector_add/vector_vector_add_placed.py @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024-2025 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates import argparse import sys @@ -48,7 +48,7 @@ def vector_vector_add(input0, input1, output): buffer_depth = 2 - @device(iron.get_current_device()) + @device(iron.get_current_device().resolve()) def device_body(): tensor_ty = np.ndarray[(num_elements,), np.dtype[dtype]] tile_ty = np.ndarray[(n,), np.dtype[dtype]] @@ -97,22 +97,10 @@ def sequence(A, B, C): def main(): - device_map = { - "npu": AIEDevice.npu1_1col, - "npu2": AIEDevice.npu2_1col, - "xcvc1902": AIEDevice.xcvc1902, - } - parser = argparse.ArgumentParser() parser.add_argument( "-v", "--verbose", action="store_true", help="Enable verbose output" ) - parser.add_argument( - "-d", - "--device", - choices=["npu", "npu2", "xcvc1902"], - help="Target device", - ) parser.add_argument( "-n", "--num-elements", @@ -128,9 +116,6 @@ def main(): input1 = iron.randint(0, 100, (args.num_elements,), dtype=np.int32, device="npu") output = iron.zeros_like(input0) - if args.device: - iron.set_current_device(device_map[args.device]) - # JIT-compile the kernel then launches the kernel with the given arguments. 
Future calls # to the kernel will use the same compiled kernel and loaded code objects vector_vector_add(input0, input1, output) diff --git a/programming_examples/basic/vector_vector_add_BDs_init_values/test.cpp b/programming_examples/basic/vector_vector_add_BDs_init_values/test.cpp index 2e72d137804..eaccc0fd500 100644 --- a/programming_examples/basic/vector_vector_add_BDs_init_values/test.cpp +++ b/programming_examples/basic/vector_vector_add_BDs_init_values/test.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2024, Advanced Micro Devices, Inc. +// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. // //===----------------------------------------------------------------------===// @@ -140,12 +140,14 @@ int main(int argc, const char *argv[]) { for (uint32_t i = 0; i < OUT_SIZE; i++) { if (*(bufOut + i) != *(bufInA + i) + *(bufInB + i)) { - std::cout << "Error in output " << *(bufOut + i) - << " != " << *(bufInA + i) << " + " << *(bufInB + i) - << std::endl; + if (verbosity >= 1) { + std::cout << "Error in output " << *(bufOut + i) + << " != " << *(bufInA + i) << " + " << *(bufInB + i) + << std::endl; + } errors++; } else { - if (verbosity > 1) + if (verbosity >= 1) std::cout << "Correct output " << *(bufOut + i) << " == " << *(bufInA + i) + *(bufInB + i) << std::endl; } diff --git a/programming_examples/basic/vector_vector_modulo/test.cpp b/programming_examples/basic/vector_vector_modulo/test.cpp index c050918961c..92e9c1abcf1 100644 --- a/programming_examples/basic/vector_vector_modulo/test.cpp +++ b/programming_examples/basic/vector_vector_modulo/test.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2023, Advanced Micro Devices, Inc. +// Copyright (C) 2023-2026, Advanced Micro Devices, Inc. 
// //===----------------------------------------------------------------------===// @@ -140,12 +140,14 @@ int main(int argc, const char *argv[]) { for (uint32_t i = 0; i < OUT_SIZE; i++) { if (*(bufOut + i) != *(bufInA + i) % *(bufInB + i)) { - std::cout << "Error in output " << *(bufOut + i) - << " != " << *(bufInA + i) << " + " << *(bufInB + i) - << std::endl; + if (verbosity >= 1) { + std::cout << "Error in output " << *(bufOut + i) + << " != " << *(bufInA + i) << " + " << *(bufInB + i) + << std::endl; + } errors++; } else { - if (verbosity > 1) + if (verbosity >= 1) std::cout << "Correct output " << *(bufOut + i) << " == " << *(bufInA + i) + *(bufInB + i) << std::endl; } diff --git a/programming_examples/basic/vector_vector_mul/test.cpp b/programming_examples/basic/vector_vector_mul/test.cpp index c209135074f..ecb55886f0f 100644 --- a/programming_examples/basic/vector_vector_mul/test.cpp +++ b/programming_examples/basic/vector_vector_mul/test.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2023, Advanced Micro Devices, Inc. +// Copyright (C) 2023-2026, Advanced Micro Devices, Inc. 
// //===----------------------------------------------------------------------===// @@ -140,14 +140,15 @@ int main(int argc, const char *argv[]) { for (uint32_t i = 0; i < OUT_SIZE; i++) { if (*(bufOut + i) != *(bufInA + i) * *(bufInB + i)) { - std::cout << "Error in output " << *(bufOut + i) - << " != " << *(bufInA + i) << " * " << *(bufInB + i) - << std::endl; + if (verbosity >= 1) { + std::cout << "Error in output " << *(bufOut + i) + << " != " << *(bufInA + i) << " * " << *(bufInB + i) + << std::endl; + } errors++; - } else { - if (verbosity > 1) - std::cout << "Correct output " << *(bufOut + i) - << " == " << *(bufInA + i) * *(bufInB + i) << std::endl; + } else if (verbosity >= 1) { + std::cout << "Correct output " << *(bufOut + i) + << " == " << *(bufInA + i) * *(bufInB + i) << std::endl; } } diff --git a/programming_examples/ml/block_datatypes/bfp_conversion/test.cpp b/programming_examples/ml/block_datatypes/bfp_conversion/test.cpp index 732488a53dc..0f785c104e5 100644 --- a/programming_examples/ml/block_datatypes/bfp_conversion/test.cpp +++ b/programming_examples/ml/block_datatypes/bfp_conversion/test.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2025, Advanced Micro Devices, Inc. +// Copyright (C) 2025-2026, Advanced Micro Devices, Inc. // //===----------------------------------------------------------------------===// @@ -209,11 +209,13 @@ int main(int argc, const char *argv[]) { // handled for any bfp type. 
if (!test_utils::nearly_equal(outputTransformed[i], expectedResultVec[i], 0.25, 3.5)) { - std::cout << "Error in output " << outputTransformed[i] - << " != " << expectedResultVec[i] << std::endl; + if (verbosity >= 1) { + std::cout << "Error in output " << outputTransformed[i] + << " != " << expectedResultVec[i] << std::endl; + } errors++; } else { - if (verbosity > 1) + if (verbosity >= 1) std::cout << "Correct output " << outputTransformed[i] << " ~= " << expectedResultVec[i] << std::endl; } diff --git a/programming_examples/ml/block_datatypes/vector_passthrough/test.cpp b/programming_examples/ml/block_datatypes/vector_passthrough/test.cpp index 523f88125b0..5a51dadbd13 100644 --- a/programming_examples/ml/block_datatypes/vector_passthrough/test.cpp +++ b/programming_examples/ml/block_datatypes/vector_passthrough/test.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2025, Advanced Micro Devices, Inc. +// Copyright (C) 2025-2026, Advanced Micro Devices, Inc. 
// //===----------------------------------------------------------------------===// @@ -162,11 +162,13 @@ int main(int argc, const char *argv[]) { std::cout << "Block " << i / 9 << "\n"; } if (*(bufOut + i) != *(bufInA + i)) { - std::cout << "Error in output " << int(*(bufOut + i)) - << " != " << int(*(bufInA + i)) << std::endl; + if (verbosity >= 1) { + std::cout << "Error in output " << int(*(bufOut + i)) + << " != " << int(*(bufInA + i)) << std::endl; + } errors++; } else { - if (verbosity > 1) + if (verbosity >= 1) std::cout << "Correct output " << int(*(bufOut + i)) << " == " << int(*(bufInA + i)) << std::endl; } diff --git a/programming_examples/ml/bottleneck/test.py b/programming_examples/ml/bottleneck/test.py index 43213048c06..f186d96d3f3 100644 --- a/programming_examples/ml/bottleneck/test.py +++ b/programming_examples/ml/bottleneck/test.py @@ -3,7 +3,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
import torch import torch.nn as nn @@ -13,8 +13,10 @@ import time import os import numpy as np -from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute import aie.utils.test as test_utils +import aie.iron as iron +from aie.utils import TraceConfig, HostRuntime, NPUKernel, DefaultNPURuntime +from pathlib import Path torch.use_deterministic_algorithms(True) torch.manual_seed(0) @@ -79,18 +81,8 @@ def main(opts): # ------------------------------------------------------ # Get device, load the xclbin & kernel and register them # ------------------------------------------------------ - app = setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, - ) + npu_kernel = NPUKernel(xclbin_path, insts_path) + kernel_handle = DefaultNPURuntime.load(npu_kernel) # ------------------------------------------------------ # Define your golden reference @@ -156,22 +148,41 @@ def forward(self, x): total_wts = np.concatenate((wts1, wts2, wts3), axis=None) total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + # ------------------------------------------------------ + # Setup buffers + # ------------------------------------------------------ + in1 = iron.tensor(ifm_mem_fmt, dtype=dtype_in) + in2 = iron.tensor(total_wts, dtype=dtype_wts) + out_size = np.prod(shape_out) * dtype_out.itemsize + out = iron.zeros(out_size, dtype=dtype_out) + + buffers = [in1, in2, out] + + trace_config = None + if enable_trace: + trace_config = TraceConfig( + trace_size=trace_size, + trace_file=trace_file, + trace_after_last_tensor=False, + enable_ctrl_pkts=False, + last_tensor_shape=out.shape, + last_tensor_dtype=out.dtype, + ) + HostRuntime.prepare_args_for_trace(buffers, trace_config) + # ------------------------------------------------------ # Main run loop # ------------------------------------------------------ for i in 
range(num_iter): - start = time.time_ns() - aie_output = execute(app, ifm_mem_fmt, total_wts) * inp_scale4 - stop = time.time_ns() + ret = DefaultNPURuntime.run(kernel_handle, buffers) - if enable_trace: - aie_output, trace = extract_trace( - aie_output, shape_out, dtype_out, trace_size - ) - write_out_trace(trace, trace_file) + if trace_config: + trace_buffer, _ = HostRuntime.extract_trace_from_args(buffers, trace_config) + trace_buffer = trace_buffer.view(np.uint32) + trace_config.write_trace(trace_buffer) - npu_time = stop - start - npu_time_total = npu_time_total + npu_time + aie_output = out.numpy() * inp_scale4 + npu_time_total = npu_time_total + ret.npu_time # ------------------------------------------------------ # Reorder output data-layout diff --git a/programming_examples/ml/conv2d/Makefile b/programming_examples/ml/conv2d/Makefile index 27bf7fa10c9..68ce8677cdc 100755 --- a/programming_examples/ml/conv2d/Makefile +++ b/programming_examples/ml/conv2d/Makefile @@ -3,7 +3,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) @@ -96,8 +96,8 @@ run_py: build/final.xclbin trace_py: build/final_trace.xclbin ${powershell} python3 ${srcdir}/test.py -x build/final_trace.xclbin -i build/insts_trace.bin -k MLIR_AIE -wd ${width} -ht ${height} -ic ${in_channels} -oc ${out_channels} -t ${trace_size} - ${srcdir}/../../../python/utils/parse_trace.py --input log/trace_conv2d.txt --mlir build/aie2_trace.mlir --output log/trace_conv2d.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input log/trace_conv2d.json + ${srcdir}/../../../python/utils/trace/parse.py --input log/trace_conv2d.txt --mlir build/aie2_trace.mlir --output log/trace_conv2d.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input log/trace_conv2d.json clean: rm -rf build *.elf* *.lst *.bif aie2.mlir.prj log* *.xclbin sim \ diff --git a/programming_examples/ml/conv2d/test.py b/programming_examples/ml/conv2d/test.py index 7dfc8effba7..815164b760c 100644 --- a/programming_examples/ml/conv2d/test.py +++ b/programming_examples/ml/conv2d/test.py @@ -3,7 +3,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
import torch import torch.nn as nn @@ -13,8 +13,10 @@ import time import os import numpy as np -from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute import aie.utils.test as test_utils +import aie.iron as iron +from aie.utils import TraceConfig, HostRuntime, NPUKernel, DefaultNPURuntime +from pathlib import Path torch.use_deterministic_algorithms(True) torch.manual_seed(0) @@ -69,19 +71,8 @@ def main(opts): # ------------------------------------------------------ # Get device, load the xclbin & kernel and register them # ------------------------------------------------------ - app = setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, - trace_after_output=True, - ) + npu_kernel = NPUKernel(xclbin_path, insts_path, kernel_name=opts.kernel) + kernel_handle = DefaultNPURuntime.load(npu_kernel) # ------------------------------------------------------ # Define your golden reference @@ -124,29 +115,40 @@ def forward(self, x): total_wts = np.concatenate((wts1), axis=None) total_wts.tofile(log_folder + "/weights_mem_fmt_final.txt", sep=",", format="%d") + in1 = iron.tensor(ifm_mem_fmt, dtype=dtype_in) + in2 = iron.tensor(total_wts, dtype=dtype_wts) + out_size = np.prod(shape_out) * dtype_out.itemsize + out = iron.zeros(out_size, dtype=dtype_out) + + buffers = [in1, in2, out] + + trace_config = None + if enable_trace: + trace_config = TraceConfig( + trace_size=trace_size, + trace_file=trace_file, + trace_after_last_tensor=True, + enable_ctrl_pkts=False, + last_tensor_shape=out.shape, + last_tensor_dtype=out.dtype, + ) + HostRuntime.prepare_args_for_trace(buffers, trace_config) + # ------------------------------------------------------ # Main run loop # ------------------------------------------------------ for i in range(num_iter): - start = time.time_ns() - entire_buffer = execute(app, ifm_mem_fmt, total_wts) - stop = 
time.time_ns() - + ret = DefaultNPURuntime.run(kernel_handle, buffers) if enable_trace: - # Separate data and trace - data_buffer, trace_buffer = extract_trace( - entire_buffer, shape_out, dtype_out, trace_size - ) - # Scale the data - data_buffer = data_buffer * int8_scale - # Write out the trace - write_out_trace(trace_buffer, trace_file) - else: - data_buffer = entire_buffer * int8_scale - trace_buffer = None - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time + trace_buffer, _ = HostRuntime.extract_trace_from_args(buffers, trace_config) + trace_buffer = trace_buffer.view(np.uint32) + trace_config.write_trace(trace_buffer) + + out_tensor = buffers[-1] + if not isinstance(out_tensor, np.ndarray): + out_tensor = out_tensor.numpy() + data_buffer = out_tensor * int8_scale + npu_time_total = npu_time_total + ret.npu_time # ------------------------------------------------------ # Reorder output data-layout diff --git a/programming_examples/ml/conv2d_14x14/Makefile b/programming_examples/ml/conv2d_14x14/Makefile index 0466f9f38a0..8c9a91a3a9a 100755 --- a/programming_examples/ml/conv2d_14x14/Makefile +++ b/programming_examples/ml/conv2d_14x14/Makefile @@ -3,7 +3,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) @@ -98,8 +98,8 @@ run_py: build/final.xclbin trace_py: build/final_trace.xclbin ${powershell} python3 ${srcdir}/test.py -x build/final_trace.xclbin -i build/insts_trace.bin -k MLIR_AIE -wd ${width} -ht ${height} -ic ${in_channels} -oc ${out_channels} -ksz ${kernel_size} -na ${num_act} -t ${trace_size} - ${srcdir}/../../../python/utils/parse_trace.py --input log/trace_conv2d.txt --mlir build/aie2_trace.mlir --output log/trace_conv2d_14x14.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input log/trace_conv2d_14x14.json + ${srcdir}/../../../python/utils/trace/parse.py --input log/trace_conv2d.txt --mlir build/aie2_trace.mlir --output log/trace_conv2d_14x14.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input log/trace_conv2d_14x14.json clean: rm -rf build *.elf* *.lst *.bif aie2.mlir.prj log* *.xclbin sim \ diff --git a/programming_examples/ml/conv2d_14x14/conv2dk14_32core_placed.py b/programming_examples/ml/conv2d_14x14/conv2dk14_32core_placed.py index 3b638fb70f4..b28fdbbca2d 100644 --- a/programming_examples/ml/conv2d_14x14/conv2dk14_32core_placed.py +++ b/programming_examples/ml/conv2d_14x14/conv2dk14_32core_placed.py @@ -3,7 +3,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
import numpy as np import sys @@ -12,8 +12,13 @@ from aie.extras.context import mlir_mod_ctx from aie.iron.controlflow import range_ import aie.utils.trace as trace_utils -from aie.utils.trace import PortEvent -from aie.utils.trace_events.aie2 import CoreEvent, MemEvent, ShimTileEvent, MemTileEvent +from aie.utils.trace.events import ( + PortEvent, + CoreEvent, + MemEvent, + ShimTileEvent, + MemTileEvent, +) from aie.helpers.taplib import TensorTiler2D, TensorAccessSequence diff --git a/programming_examples/ml/conv2d_14x14/conv2dk14_placed.py b/programming_examples/ml/conv2d_14x14/conv2dk14_placed.py index 786f33d64f2..ca2ee6cf6c8 100644 --- a/programming_examples/ml/conv2d_14x14/conv2dk14_placed.py +++ b/programming_examples/ml/conv2d_14x14/conv2dk14_placed.py @@ -3,7 +3,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. import numpy as np import sys @@ -12,8 +12,13 @@ from aie.extras.context import mlir_mod_ctx from aie.iron.controlflow import range_ import aie.utils.trace as trace_utils -from aie.utils.trace import PortEvent -from aie.utils.trace_events.aie2 import CoreEvent, MemEvent, ShimTileEvent, MemTileEvent +from aie.utils.trace.events import ( + PortEvent, + CoreEvent, + MemEvent, + ShimTileEvent, + MemTileEvent, +) def conv2dk14( diff --git a/programming_examples/ml/conv2d_14x14/test.py b/programming_examples/ml/conv2d_14x14/test.py index 96b4e34d2a4..5489acfbeb8 100644 --- a/programming_examples/ml/conv2d_14x14/test.py +++ b/programming_examples/ml/conv2d_14x14/test.py @@ -3,7 +3,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
import torch import torch.nn as nn @@ -13,8 +13,10 @@ import time import os import numpy as np -from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute import aie.utils.test as test_utils +import aie.iron as iron +from aie.utils import TraceConfig, HostRuntime, NPUKernel, DefaultNPURuntime +from pathlib import Path torch.use_deterministic_algorithms(True) torch.manual_seed(0) @@ -110,19 +112,8 @@ def main(opts): # ------------------------------------------------------ # Get device, load the xclbin & kernel and register them # ------------------------------------------------------ - app = setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, - trace_after_output=False, - ) + npu_kernel = NPUKernel(xclbin_path, insts_path) + kernel_handle = DefaultNPURuntime.load(npu_kernel) # ------------------------------------------------------ # Define your golden reference @@ -210,31 +201,37 @@ def forward(self, x): # ------------------------------------------------------ # Main run loop # ------------------------------------------------------ + in1 = iron.tensor(ifm_mem_fmt_grp, dtype=dtype_in) + in2 = iron.tensor(total_wts, dtype=dtype_wts) + out_size = np.prod(shape_out) * dtype_out.itemsize + out = iron.zeros(out_size, dtype=dtype_out) + + buffers = [in1, in2, out] + + trace_config = None + if enable_trace: + trace_config = TraceConfig( + trace_size=trace_size, + trace_file=trace_file, + trace_after_last_tensor=False, + enable_ctrl_pkts=False, + last_tensor_shape=out.shape, + last_tensor_dtype=out.dtype, + ) + HostRuntime.prepare_args_for_trace(buffers, trace_config) for i in range(num_iter): - start = time.time_ns() - if enable_trace: - data_buffer, trace_buffer = execute( - app, ifm_mem_fmt_grp, total_wts, enable_trace, False - ) - else: - entire_buffer = execute( - app, ifm_mem_fmt_grp, total_wts, enable_trace, False - ) - stop 
= time.time_ns() - if enable_trace: + trace_buffer = None + ret = DefaultNPURuntime.run(kernel_handle, buffers) + + if trace_config: + trace_buffer, _ = HostRuntime.extract_trace_from_args(buffers, trace_config) trace_buffer = trace_buffer.view(np.uint32) - # Scale the data - scaled_data_buffer = data_buffer * int8_scale - # Write out the trace - write_out_trace(trace_buffer, trace_file) - else: - data_buffer = entire_buffer - scaled_data_buffer = entire_buffer * int8_scale - trace_buffer = None - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time + trace_config.write_trace(trace_buffer) + + data_buffer = out.numpy() + scaled_data_buffer = data_buffer * int8_scale + npu_time_total = npu_time_total + ret.npu_time # ------------------------------------------------------ # Reorder output data-layout diff --git a/programming_examples/ml/conv2d_fused_relu/test.py b/programming_examples/ml/conv2d_fused_relu/test.py index e0d2e18606d..d8001458975 100644 --- a/programming_examples/ml/conv2d_fused_relu/test.py +++ b/programming_examples/ml/conv2d_fused_relu/test.py @@ -3,7 +3,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
import torch import torch.nn as nn @@ -13,8 +13,10 @@ import time import os import numpy as np -from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute import aie.utils.test as test_utils +import aie.iron as iron +from aie.utils import TraceConfig, HostRuntime, NPUKernel, DefaultNPURuntime +from pathlib import Path torch.use_deterministic_algorithms(True) torch.manual_seed(0) @@ -61,18 +63,8 @@ def main(opts): # ------------------------------------------------------ # Get device, load the xclbin & kernel and register them # ------------------------------------------------------ - app = setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, - ) + npu_kernel = NPUKernel(xclbin_path, insts_path) + kernel_handle = DefaultNPURuntime.load(npu_kernel) # ------------------------------------------------------ # Define your golden reference @@ -118,19 +110,32 @@ def forward(self, x): # ------------------------------------------------------ # Main run loop # ------------------------------------------------------ - for i in range(num_iter): - start = time.time_ns() - aie_output = execute(app, ifm_mem_fmt, total_wts) * relu_scale - stop = time.time_ns() + in1 = iron.tensor(ifm_mem_fmt, dtype=dtype_in) + in2 = iron.tensor(total_wts, dtype=dtype_wts) + out_size = np.prod(shape_out) * dtype_out.itemsize + out = iron.zeros(out_size, dtype=dtype_out) + trace_config = None + for i in range(num_iter): + buffers = [in1, in2, out] if enable_trace: - aie_output, trace = extract_trace( - aie_output, shape_out, dtype_out, trace_size + trace_config = TraceConfig( + trace_size=trace_size, + trace_file=trace_file, + trace_after_last_tensor=False, + enable_ctrl_pkts=False, + last_tensor_shape=out.shape, + last_tensor_dtype=out.dtype, ) - write_out_trace(trace, trace_file) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time + 
HostRuntime.prepare_args_for_trace(buffers, trace_config) + ret = DefaultNPURuntime.run(kernel_handle, buffers) + + if trace_config: + trace_buffer, _ = HostRuntime.extract_trace_from_args(buffers, trace_config) + trace_buffer = trace_buffer.view(np.uint32) + trace_config.write_trace(trace_buffer) + aie_output = out.numpy() * relu_scale + npu_time_total = npu_time_total + ret.npu_time # ------------------------------------------------------ # Reorder output data-layout diff --git a/programming_examples/ml/eltwise_add/Makefile b/programming_examples/ml/eltwise_add/Makefile index e37abfdd661..371da4b5401 100644 --- a/programming_examples/ml/eltwise_add/Makefile +++ b/programming_examples/ml/eltwise_add/Makefile @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. # ##===----------------------------------------------------------------------===## @@ -69,8 +69,8 @@ run: ${targetname}.exe build/final.xclbin trace: ${targetname}.exe build/final_trace.xclbin ${powershell} ./$< -x build/final_trace.xclbin -i build/insts.bin -k MLIR_AIE -t ${trace_size} - ${srcdir}/../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie_trace.mlir --output trace_eltwise_add.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input trace_eltwise_add.json + ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir --output trace_eltwise_add.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_eltwise_add.json clean: rm -rf build _build ${targetname}.exe diff --git a/programming_examples/ml/eltwise_add/test.cpp b/programming_examples/ml/eltwise_add/test.cpp index 5f9e8095df6..fbb1d911657 100644 --- a/programming_examples/ml/eltwise_add/test.cpp +++ b/programming_examples/ml/eltwise_add/test.cpp @@ -4,7 +4,7 @@ // See 
https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2023, Advanced Micro Devices, Inc. +// Copyright (C) 2023-2026, Advanced Micro Devices, Inc. // //===----------------------------------------------------------------------===// @@ -40,8 +40,13 @@ int verify(int size, std::vector A, std::vector B, std::vector C, for (uint32_t i = 0; i < size; i++) { T ref = A[i] + B[i]; if (!test_utils::nearly_equal(ref, C[i], 0.00390625)) { - std::cout << "Error in output " << C[i] << " != " << ref << " from " - << A[i] << " + " << B[i] << std::endl; + if (errors < 100) { + std::cout << "Error in output " << C[i] << " != " << ref << " from " + << A[i] << " + " << B[i] << std::endl; + } else if (errors == 100) { + std::cout << "..." << std::endl; + std::cout << "[Errors truncated]" << std::endl; + } errors++; } else { if (verbosity > 1) diff --git a/programming_examples/ml/eltwise_mul/Makefile b/programming_examples/ml/eltwise_mul/Makefile index 50efedc66bc..cc09fc316f5 100644 --- a/programming_examples/ml/eltwise_mul/Makefile +++ b/programming_examples/ml/eltwise_mul/Makefile @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
# ##===----------------------------------------------------------------------===## @@ -64,8 +64,8 @@ run: ${targetname}.exe build/final.xclbin trace: ${targetname}.exe build/final_trace.xclbin ${powershell} ./$< -x build/final_trace.xclbin -i build/insts.bin -k MLIR_AIE - ${srcdir}/../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie_trace.mlir --output trace_eltwise_mul.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input trace_eltwise_mul.json + ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir --output trace_eltwise_mul.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_eltwise_mul.json clean: rm -rf build _build ${targetname}.exe diff --git a/programming_examples/ml/eltwise_mul/test.cpp b/programming_examples/ml/eltwise_mul/test.cpp index 09f21a75ff0..36aea4639cc 100644 --- a/programming_examples/ml/eltwise_mul/test.cpp +++ b/programming_examples/ml/eltwise_mul/test.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2023, Advanced Micro Devices, Inc. +// Copyright (C) 2023-2026, Advanced Micro Devices, Inc. // //===----------------------------------------------------------------------===// @@ -40,8 +40,13 @@ int verify(int size, std::vector A, std::vector B, std::vector C, for (uint32_t i = 0; i < size; i++) { T ref = A[i] * B[i]; if (!test_utils::nearly_equal(ref, C[i], 0.00390625)) { - std::cout << "Error in output " << C[i] << " != " << ref << " from " - << A[i] << " * " << B[i] << std::endl; + if (errors < 100) { + std::cout << "Error in output " << C[i] << " != " << ref << " from " + << A[i] << " * " << B[i] << std::endl; + } else if (errors == 100) { + std::cout << "..." 
<< std::endl; + std::cout << "[Errors truncated]" << std::endl; + } errors++; } else { if (verbosity > 1) diff --git a/programming_examples/ml/layernorm/Makefile b/programming_examples/ml/layernorm/Makefile index 9109d86d457..37eeebe0cb7 100644 --- a/programming_examples/ml/layernorm/Makefile +++ b/programming_examples/ml/layernorm/Makefile @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2025, Advanced Micro Devices, Inc. +# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. # ##===----------------------------------------------------------------------===## @@ -71,8 +71,8 @@ run: ${targetname}.exe build/final.xclbin trace: ${targetname}.exe build/final_trace.xclbin ${powershell} ./$< -x build/final_trace.xclbin -i build/insts.bin -k MLIR_AIE -t ${trace_size} - ${srcdir}/../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie_trace.mlir --output trace_${targetname}.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input trace_${targetname}.json + ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir --output trace_${targetname}.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_${targetname}.json clean: rm -rf build _build ${targetname}.exe diff --git a/programming_examples/ml/resnet/layers_conv2_x/test.py b/programming_examples/ml/resnet/layers_conv2_x/test.py index 06989d55fa2..2af20fbbd15 100755 --- a/programming_examples/ml/resnet/layers_conv2_x/test.py +++ b/programming_examples/ml/resnet/layers_conv2_x/test.py @@ -3,7 +3,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
import torch import torch.nn as nn @@ -13,8 +13,10 @@ import time import os import numpy as np -from aie.utils.xrt import setup_aie, extract_trace, write_out_trace, execute import aie.utils.test as test_utils +import aie.iron as iron +from aie.utils import TraceConfig, HostRuntime, NPUKernel, DefaultNPURuntime +from pathlib import Path torch.use_deterministic_algorithms(True) torch.manual_seed(0) @@ -152,18 +154,8 @@ def main(opts): # ------------------------------------------------------ # Get device, load the xclbin & kernel and register them # ------------------------------------------------------ - app = setup_aie( - xclbin_path, - insts_path, - shape_in_act, - dtype_in, - shape_total_wts, - dtype_wts, - shape_out, - dtype_out, - enable_trace=enable_trace, - trace_size=trace_size, - ) + npu_kernel = NPUKernel(xclbin_path, insts_path, kernel_name=opts.kernel) + kernel_handle = DefaultNPURuntime.load(npu_kernel) # ------------------------------------------------------ # Define your golden reference @@ -440,19 +432,33 @@ def forward(self, x): # ------------------------------------------------------ # Main run loop # ------------------------------------------------------ + in1 = iron.tensor(ifm_mem_fmt, dtype=dtype_in) + in2 = iron.tensor(total_wts3, dtype=dtype_wts) + out_size = np.prod(shape_out) * dtype_out.itemsize + out = iron.zeros(out_size, dtype=dtype_out) + + buffers = [in1, in2, out] + + trace_config = None + if enable_trace: + trace_config = TraceConfig( + trace_size=trace_size, + trace_file=trace_file, + trace_after_last_tensor=False, + enable_ctrl_pkts=False, + last_tensor_shape=out.shape, + last_tensor_dtype=out.dtype, + ) + HostRuntime.prepare_args_for_trace(buffers, trace_config) for i in range(num_iter): - start = time.time_ns() - aie_output = execute(app, ifm_mem_fmt, total_wts3) * block_2_relu_3 - stop = time.time_ns() - - if enable_trace: - aie_output, trace = extract_trace( - aie_output, shape_out, dtype_out, trace_size - ) - 
write_out_trace(trace, trace_file) - - npu_time = stop - start - npu_time_total = npu_time_total + npu_time + ret = DefaultNPURuntime.run(kernel_handle, buffers) + + if trace_config: + trace_buffer, _ = HostRuntime.extract_trace_from_args(buffers, trace_config) + trace_buffer = trace_buffer.view(np.uint32) + trace_config.write_trace(trace_buffer) + aie_output = out.numpy() * block_2_relu_3 + npu_time_total = npu_time_total + ret.npu_time # ------------------------------------------------------ # Reorder output data-layout diff --git a/programming_examples/ml/rmsnorm/Makefile b/programming_examples/ml/rmsnorm/Makefile index 6a9823104fe..8ebcd435803 100644 --- a/programming_examples/ml/rmsnorm/Makefile +++ b/programming_examples/ml/rmsnorm/Makefile @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2025, Advanced Micro Devices, Inc. +# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. 
# ##===----------------------------------------------------------------------===## @@ -70,10 +70,10 @@ run: ${targetname}.exe build/final.xclbin trace: ${targetname}.exe build/final_trace.xclbin ${powershell} ./$< -x build/final_trace.xclbin -i build/insts.bin -k MLIR_AIE -t ${trace_size} - ${srcdir}/../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie_trace.mlir --output trace_${targetname}.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input trace_${targetname}.json + ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir --output trace_${targetname}.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_${targetname}.json clean: rm -rf build _build ${targetname}.exe -.PHONY: all run trace clean \ No newline at end of file +.PHONY: all run trace clean diff --git a/programming_examples/ml/rope/Makefile b/programming_examples/ml/rope/Makefile index 1b88a2f9f8c..f2382a7e42f 100644 --- a/programming_examples/ml/rope/Makefile +++ b/programming_examples/ml/rope/Makefile @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2025, Advanced Micro Devices, Inc. +# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. 
# ##===----------------------------------------------------------------------===## @@ -70,10 +70,10 @@ run: ${targetname}.exe build/final.xclbin trace: ${targetname}.exe build/final_trace.xclbin ${powershell} ./$< -x build/final_trace.xclbin -i build/insts.bin -k MLIR_AIE -t ${trace_size} - ${srcdir}/../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie_trace.mlir --output trace_${targetname}.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input trace_${targetname}.json + ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir --output trace_${targetname}.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_${targetname}.json clean: rm -rf build _build ${targetname}.exe -.PHONY: all run trace clean \ No newline at end of file +.PHONY: all run trace clean diff --git a/programming_examples/ml/scale_shift/Makefile b/programming_examples/ml/scale_shift/Makefile index c570aaeb3fc..77f22598ac6 100644 --- a/programming_examples/ml/scale_shift/Makefile +++ b/programming_examples/ml/scale_shift/Makefile @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2025, Advanced Micro Devices, Inc. +# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. 
# ##===----------------------------------------------------------------------===## @@ -74,8 +74,8 @@ run: ${targetname}.exe build/final.xclbin trace: ${targetname}.exe build/final_trace.xclbin ${powershell} ./$< -x build/final_trace.xclbin -i build/insts.bin -k MLIR_AIE -t ${trace_size} - ${srcdir}/../../../python/utils/parse_trace.py --filename trace.txt --mlir build/aie_trace.mlir > trace_eltwise_mul.json - ${srcdir}/../../../python/utils/get_trace_summary.py --filename trace_eltwise_mul.json + ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir --output trace_eltwise_mul.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_eltwise_mul.json clean: rm -rf build _build ${targetname}.exe diff --git a/programming_examples/ml/scale_shift/test.cpp b/programming_examples/ml/scale_shift/test.cpp index 2eec7930def..15ee4c0a1cb 100644 --- a/programming_examples/ml/scale_shift/test.cpp +++ b/programming_examples/ml/scale_shift/test.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2025, Advanced Micro Devices, Inc. +// Copyright (C) 2025-2026, Advanced Micro Devices, Inc. 
// //===----------------------------------------------------------------------===// @@ -41,11 +41,13 @@ int verify(int size, std::vector A, std::vector B, std::vector C, for (uint32_t i = 0; i < size; i++) { T ref = A[i] * B[i] + C[i]; if (!test_utils::nearly_equal(ref, D[i], 0.002)) { - std::cout << "Error in output " << D[i] << " != " << ref << " from " - << A[i] << " * " << B[i] << " + " << C[i] << std::endl; + if (verbosity >= 1) { + std::cout << "Error in output " << D[i] << " != " << ref << " from " + << A[i] << " * " << B[i] << " + " << C[i] << std::endl; + } errors++; } else { - if (verbosity > 1) + if (verbosity >= 1) std::cout << "Correct output " << D[i] << " == " << ref << std::endl; } } diff --git a/programming_examples/ml/softmax/Makefile b/programming_examples/ml/softmax/Makefile index 5d5813d2e7f..9e99b5c00d6 100755 --- a/programming_examples/ml/softmax/Makefile +++ b/programming_examples/ml/softmax/Makefile @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024-2025, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
# ##===----------------------------------------------------------------------===## @@ -111,9 +111,8 @@ profile: ${targetname}.exe build/final.xclbin trace: ${targetname}.exe build/final_trace.xclbin ${powershell} ./$< -x build/final_trace.xclbin -i build/insts.bin -k MLIR_AIE -t ${trace_size} - ${srcdir}/../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie_trace.mlir --output trace_softmax.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input trace_softmax.json + ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir --output trace_softmax.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_softmax.json clean: rm -rf build _build ${targetname}.exe - diff --git a/programming_examples/ml/softmax/test.cpp b/programming_examples/ml/softmax/test.cpp index 21b7f677707..84e8d4e626c 100644 --- a/programming_examples/ml/softmax/test.cpp +++ b/programming_examples/ml/softmax/test.cpp @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2023, Advanced Micro Devices, Inc. +// Copyright (C) 2023-2026, Advanced Micro Devices, Inc. 
// //===----------------------------------------------------------------------===// @@ -64,13 +64,14 @@ int verify(int size, int tile_size, std::vector A, std::vector B, for (uint32_t i = 0; i < size; i++) { if (!test_utils::nearly_equal(RefVec[i], B[i], 0.04, 0.001)) { - std::cout << "Error in output " << B[i] << " != " << RefVec[i] - << std::endl; - errors++; - } else { - if (verbosity > 1) - std::cout << "Correct output " << B[i] << " == " << RefVec[i] + if (verbosity >= 1) { + std::cout << "Error in output " << B[i] << " != " << RefVec[i] << std::endl; + } + errors++; + } else if (verbosity >= 1) { + std::cout << "Correct output " << B[i] << " == " << RefVec[i] + << std::endl; } } return errors; diff --git a/programming_guide/iron_configuration.md b/programming_guide/iron_configuration.md index 1b1c6cb3d6c..d640054e6be 100644 --- a/programming_guide/iron_configuration.md +++ b/programming_guide/iron_configuration.md @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2025, Advanced Micro Devices, Inc. +// Copyright (C) 2025-2026, Advanced Micro Devices, Inc. // //===----------------------------------------------------------------------===//--> @@ -14,21 +14,20 @@ There are several options that exist to configure the IRON Python programming en ## Default IRON Tensor Class -This is a variable that controls the types of [```iron.Tensor```](../python/iron/hostruntime/tensor.py)s that are produced by the utility functions ```tensor```, ```ones```, etc. Right now there are two tensor implementations: [```CPUOnlyTensor```](../python/iron/hostruntime/tensor.py) and [```XRTTensor```](../python/iron/hostruntime/xrtruntime/tensor.py). +This is a variable that controls the types of [```aie.utils.Tensor```](../python/utils/tensor.py)s that are produced by the utility functions ```tensor```, ```ones```, etc. 
Right now there are two tensor implementations: [```CPUOnlyTensor```](../python/utils/tensor.py) and [```XRTTensor```](../python/utils/xrtruntime/tensor.py). -By default, if ```pyxrt``` is available, the ```DEFAULT_IRON_TENSOR_CLASS``` is set to ```XRTTensor```. However, you can also manually set this value through the ```set_iron_tensor_class()```, e.g.: +By default, if ```pyxrt``` is available, the ```DEFAULT_TENSOR_CLASS``` is set to ```XRTTensor```. However, you can also manually set this value through the ```set_tensor_class()```, e.g.: ```python ->>> import aie.iron as iron >>> import numpy as np ->>> print(iron.hostruntime.tensor.DEFAULT_IRON_TENSOR_CLASS.__name__) +>>> print(aie.utils.tensor.DEFAULT_TENSOR_CLASS.__name__) XRTTensor >>> type(iron.tensor((2, 2), np.int32)) - ->>> iron.set_iron_tensor_class(iron.hostruntime.tensor.CPUOnlyTensor) ->>> print(iron.hostruntime.tensor.DEFAULT_IRON_TENSOR_CLASS.__name__) + +>>> aie.utils.set_tensor_class(aie.utils.tensor.CPUOnlyTensor) +>>> print(aie.utils.tensor.DEFAULT_TENSOR_CLASS.__name__) CPUOnlyTensor ->>> type(iron.tensor((2, 2), np.int32)) - +>>> type(aie.utils.tensor((2, 2), np.int32)) + ``` ## Default IRON Device @@ -44,4 +43,12 @@ If the IRON device is not set, many designs will try it fetch it on demand using ## IRON Cache Location -The IRON jit feature caches compiled objects in a directory defined by ```IRON_CACHE_DIR```. By default this value is the user's home directory. \ No newline at end of file +The IRON jit feature caches compiled objects in a directory defined by ```NPU_CACHE_DIR```. By default this value is the user's home directory. + +## IRON XRT Runtime Cache Size + +The `CachedXRTRuntime` caches XRT contexts to improve performance. The size of this cache can be configured using the `XRT_CONTEXT_CACHE_SIZE` environment variable. This is particularly useful in CI environments where multiple tests run in parallel and might exhaust the available NPU contexts. 
+ +```bash +export XRT_CONTEXT_CACHE_SIZE=1 +``` \ No newline at end of file diff --git a/programming_guide/section-3/test.py b/programming_guide/section-3/test.py index e99188c87ec..73061c3185b 100644 --- a/programming_guide/section-3/test.py +++ b/programming_guide/section-3/test.py @@ -4,67 +4,36 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. or its affiliates import numpy as np -import pyxrt as xrt +from pathlib import Path import sys -import aie.utils.xrt as xrt_utils +import aie.iron as iron +import aie.utils import aie.utils.test as test_utils +from aie.utils.npukernel import NPUKernel def main(opts): - - # Load instruction sequence - instr_v = xrt_utils.read_insts(opts.instr) - # ------------------------------------------------------------ # Configure this to match your design's buffer size and type # ------------------------------------------------------------ - INOUT0_VOLUME = int(4096) # Input only, 64x uint32_t in this example - INOUT1_VOLUME = int(1) # Input only, 1 uint32_t scale factor - INOUT2_VOLUME = int(4096) # Output only, 64x uint32_t in this example - - INOUT0_DATATYPE = np.int32 - INOUT1_DATATYPE = np.int32 - INOUT2_DATATYPE = np.int32 - - INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize - INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize - INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize - - OUT_SIZE = INOUT2_SIZE - - # ------------------------------------------------------ - # Get device, load the xclbin & kernel and register them - # ------------------------------------------------------ - (device, kernel) = test_utils.init_xrt_load_kernel(opts) - - # ------------------------------------------------------ - # Initialize input/ output buffer sizes and sync them - # ------------------------------------------------------ - 
bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(1)) - bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(3)) - bo_inout1 = xrt.bo(device, INOUT1_SIZE, xrt.bo.host_only, kernel.group_id(4)) - bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(5)) - - # Initialize instruction buffer - bo_instr.write(instr_v, 0) - # Initialize data buffers - inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) - scale_factor = np.array([3], dtype=INOUT1_DATATYPE) - inout2 = np.zeros(OUT_SIZE, dtype=np.uint8) - bo_inout0.write(inout0, 0) - bo_inout1.write(scale_factor, 0) - bo_inout2.write(inout2, 0) + # Initialize data buffers and reference for verification + ref_buffer = np.arange(1, 64 + 1, dtype=np.int32) + in_buffer = iron.tensor(ref_buffer, dtype=np.int32) + scale_factor = 3 + in_factor = iron.tensor([scale_factor], dtype=np.int32) + out = iron.zeros(64, dtype=np.int32) + ref_buffer = ref_buffer * scale_factor - # Sync buffers to update input buffer values - bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_inout0.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + # ---------------------------------------------------- + # Prepare buffers and load compiled artifacts onto the device + # ---------------------------------------------------- + npu_kernel = NPUKernel(opts.xclbin, opts.instr) + kernel_handle = aie.utils.DefaultNPURuntime.load(npu_kernel) # ------------------------------------------------------ # Initialize run configs @@ -78,19 +47,14 @@ def main(opts): # Run kernel if opts.verbosity >= 1: print("Running Kernel.") - opcode = 3 - h = kernel(opcode, bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2) - h.wait() - bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) + npu_time = aie.utils.DefaultNPURuntime.run( + 
kernel_handle, [in_buffer, in_factor, out] + ) - # Copy output results and verify they are correct - entire_buffer = bo_inout2.read(OUT_SIZE, 0).view(np.uint32) - output_buffer = entire_buffer[:INOUT2_VOLUME] if opts.verify: if opts.verbosity >= 1: print("Verifying results ...") - ref = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) * scale_factor - e = np.equal(output_buffer, ref) + e = np.equal(out, ref_buffer) errors = errors + np.size(e) - np.count_nonzero(e) # ------------------------------------------------------ diff --git a/programming_guide/section-4/section-4a/test.py b/programming_guide/section-4/section-4a/test.py index 39fdb5ad548..14cff98aa06 100644 --- a/programming_guide/section-4/section-4a/test.py +++ b/programming_guide/section-4/section-4a/test.py @@ -4,68 +4,36 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. 
or its affiliates import numpy as np -import pyxrt as xrt import sys -import time +from pathlib import Path -import aie.utils.xrt as xrt_utils +import aie.iron as iron +import aie.utils import aie.utils.test as test_utils +from aie.utils.npukernel import NPUKernel def main(opts): - - # Load instruction sequence - instr_v = xrt_utils.read_insts(opts.instr) - # ------------------------------------------------------------ # Configure this to match your design's buffer size and type # ------------------------------------------------------------ - INOUT0_VOLUME = int(4096) # Input only, 64x uint32_t in this example - INOUT1_VOLUME = int(1) # Input only, 1 uint32_t scale factor - INOUT2_VOLUME = int(4096) # Output only, 64x uint32_t in this example - - INOUT0_DATATYPE = np.int32 - INOUT1_DATATYPE = np.int32 - INOUT2_DATATYPE = np.int32 - INOUT0_SIZE = INOUT0_VOLUME * INOUT0_DATATYPE().itemsize - INOUT1_SIZE = INOUT1_VOLUME * INOUT1_DATATYPE().itemsize - INOUT2_SIZE = INOUT2_VOLUME * INOUT2_DATATYPE().itemsize + # Initialize data buffers and reference for verification + ref_buffer = np.arange(1, 64 + 1, dtype=np.int32) + in_buffer = iron.tensor(ref_buffer, dtype=np.int32) + scale_factor = 3 + in_factor = iron.tensor([scale_factor], dtype=np.int32) + out = iron.zeros(64, dtype=np.int32) + ref_buffer = ref_buffer * scale_factor - OUT_SIZE = INOUT2_SIZE - - # ------------------------------------------------------ - # Get device, load the xclbin & kernel and register them - # ------------------------------------------------------ - (device, kernel) = test_utils.init_xrt_load_kernel(opts) - - # ------------------------------------------------------ - # Initialize input/ output buffer sizes and sync them - # ------------------------------------------------------ - bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(1)) - bo_inout0 = xrt.bo(device, INOUT0_SIZE, xrt.bo.host_only, kernel.group_id(3)) - bo_inout1 = xrt.bo(device, INOUT1_SIZE, 
xrt.bo.host_only, kernel.group_id(4)) - bo_inout2 = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(5)) - - # Initialize instruction buffer - bo_instr.write(instr_v, 0) - - # Initialize data buffers - inout0 = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) - scale_factor = np.array([3], dtype=INOUT1_DATATYPE) - inout2 = np.zeros(OUT_SIZE, dtype=np.uint8) - bo_inout0.write(inout0, 0) - bo_inout1.write(scale_factor, 0) - bo_inout2.write(inout2, 0) - - # Sync buffers to update input buffer values - bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_inout0.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_inout1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + # ---------------------------------------------------- + # Prepare buffers and load compiled artifacts onto the device + # ---------------------------------------------------- + npu_kernel = NPUKernel(opts.xclbin, opts.instr) + kernel_handle = aie.utils.DefaultNPURuntime.load(npu_kernel) # ------------------------------------------------------ # Initialize run configs @@ -81,30 +49,22 @@ def main(opts): # ------------------------------------------------------ for i in range(num_iter): # Run kernel - if opts.verbosity >= 1: - print("Running Kernel.") - start = time.time_ns() - opcode = 3 - h = kernel(opcode, bo_instr, len(instr_v), bo_inout0, bo_inout1, bo_inout2) - h.wait() - stop = time.time_ns() - bo_inout2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) + result = aie.utils.DefaultNPURuntime.run( + kernel_handle, [in_buffer, in_factor, out] + ) + npu_time = result.npu_time # Warmup iterations do not count towards average runtime. 
if i < opts.warmup_iters: continue # Copy output results and verify they are correct - entire_buffer = bo_inout2.read(OUT_SIZE, 0).view(np.uint32) - output_buffer = entire_buffer[:INOUT2_VOLUME] if opts.verify: if opts.verbosity >= 1: print("Verifying results ...") - ref = np.arange(1, INOUT0_VOLUME + 1, dtype=INOUT0_DATATYPE) * scale_factor - e = np.equal(output_buffer, ref) + e = np.equal(out, ref_buffer) errors = errors + np.size(e) - np.count_nonzero(e) - npu_time = stop - start npu_time_total = npu_time_total + npu_time npu_time_min = min(npu_time_min, npu_time) npu_time_max = max(npu_time_max, npu_time) diff --git a/programming_guide/section-4/section-4b/Makefile b/programming_guide/section-4/section-4b/Makefile index 7b7a63d74ae..2c8fde8fe30 100644 --- a/programming_guide/section-4/section-4b/Makefile +++ b/programming_guide/section-4/section-4b/Makefile @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
# ##===----------------------------------------------------------------------===## @@ -79,13 +79,13 @@ run_py: build/final.xclbin build/insts.bin trace: ${targetname}.exe build/final_trace.xclbin build/insts.bin ${powershell} ./$< -x build/final_trace.xclbin -i build/insts.bin -k MLIR_AIE -t ${trace_size} - ${srcdir}/../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie_trace.mlir --output trace_4b.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input trace_4b.json + ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir --output trace_4b.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_4b.json trace_py: build/final_trace.xclbin build/insts.bin ${powershell} python3 ${srcdir}/test.py -x build/final_trace.xclbin -i build/insts.bin -k MLIR_AIE -t ${trace_size} -i1s ${in1_size} -i2s ${in2_size} -os ${out_size} - ${srcdir}/../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie_trace.mlir --output trace_4b.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input trace_4b.json + ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir --output trace_4b.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_4b.json clean_trace: diff --git a/programming_guide/section-4/section-4b/README.md b/programming_guide/section-4/section-4b/README.md index ead716e1206..6466005dcf3 100644 --- a/programming_guide/section-4/section-4b/README.md +++ b/programming_guide/section-4/section-4b/README.md @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2024, Advanced Micro Devices, Inc. +// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
// //===----------------------------------------------------------------------===//--> @@ -237,13 +237,13 @@ These convenience python wrappers perform the `sync` steps under the hood when t Just like the C/C++ host code wrapper `setup_and_run_aie` found in [../../../runtime_lib/test_lib/xrt_test_wrapper_.h](../../../runtime_lib/test_lib/xrt_test_wrapper.h), for python, we have a similar wrapper `setup_and_run_aie` in [../../../python/utils/xrt.py](../../../python/utils/xrt.py). This likewise simplifies the `test.py` and can be used as a template for design patterns. ## 3. Parse text file to generate a waveform json file -Once the packet trace text file is generated (`trace.txt`), we use a python-based trace parser ([parse_trace.py](../../../python/utils/parse_trace.py)) to interpret the trace values and generate a waveform json file for visualization (with Perfetto). This is a step in the [Makefile](./Makefile) but can be executed from the command line as well. +Once the packet trace text file is generated (`trace.txt`), we use a python-based trace parser ([parse_trace.py](../../../python/utils/trace/parse.py)) to interpret the trace values and generate a waveform json file for visualization (with Perfetto). This is a step in the [Makefile](./Makefile) but can be executed from the command line as well. ```Makefile - ../../../python/utils/parse_trace.py --input trace.txt --mlir build/aie_trace.mlir --output trace_4b.json + ../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir --output trace_4b.json ``` This leverages the python parse scripts under [python/utils](../../../python/utils/). See the [README.md](../../../python/utils/README.md) to get more details about how to use the python parse scripts. 
-In our example [Makefile](./Makefile), we also run [get_trace_summary.py](../../../python/utils/get_trace_summary.py) to analyze the generated JSON trace file to count the number of invocations of the kernel and the cycle count of those invocations. This depends on the kernel having an `event0` and `event1` function call at the beginning and end of the kernel, which our example does. `event0` and `event1` are functions that generate an internal event and is helpful for us to mark the boundaries of a function call. +In our example [Makefile](./Makefile), we also run [get_trace_summary.py](../../../python/utils/trace/get_trace_summary.py) to analyze the generated JSON trace file to count the number of invocations of the kernel and the cycle count of those invocations. This depends on the kernel having an `event0` and `event1` function call at the beginning and end of the kernel, which our example does. `event0` and `event1` are functions that generate an internal event and is helpful for us to mark the boundaries of a function call. ## 4. Open json file in a visualization tool like Perfetto Open https://ui.perfetto.dev in your browser and then open up the waveform json file generated in step 3. You can navigate the waveform viewer as you would a standard waveform viewer and can even zoom/pan the waveform with the a,s,w,d keyboard keys. @@ -273,11 +273,11 @@ Open https://ui.perfetto.dev in your browser and then open up the waveform json * `INSTR_VECTOR` - Vector instructions like vector MAC or vector load/store. Here, we are running a scalar implementation so there are no vector events. * `PORT_RUNNING_0` up to `PORT_RUNNING_7` - You can listen for a variety of events, such as `PORT_RUNNING`, `PORT_IDLE` or `PORT_STALLED` on up to 7 ports. To select which port to listen to, use the `PortEvent` Python class as your event. 
For example, to listen to master port 1: ``` - from aie.utils.trace import configure_simple_tracing_aie2, PortEvent - from aie.utils.trace_events.aie2 import CoreEvent, MemEvent, PLEvent, MemTileEvent + from aie.utils.trace import configure_simple_tracing_aie2 + from aie.utils.trace.events import PortEvent, CoreEvent, MemEvent, PLEvent, MemTileEvent trace_utils.configure_simple_tracing_aie2( # ... other arguments as above - events=[trace_utils.PortEvent(CoreEvent.PORT_RUNNING_0, 1, master=True)] + events=[trace_utils.events.PortEvent(CoreEvent.PORT_RUNNING_0, 1, master=True)] ) ``` * `PORT_RUNNING_1` - Mapped to Port 1 which is by default configured to the MM2S0 output (DMA from local memory to stream). This is usually the first output. diff --git a/programming_guide/section-4/section-4b/test.py b/programming_guide/section-4/section-4b/test.py index 1234a02165e..8c0c956d205 100644 --- a/programming_guide/section-4/section-4b/test.py +++ b/programming_guide/section-4/section-4b/test.py @@ -4,11 +4,12 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. 
or its affiliates import numpy as np import sys -import aie.utils.xrt as xrt_utils import aie.utils.test as test_utils +import aie.iron as iron +from aie.utils import DefaultNPURuntime def main(opts): @@ -41,29 +42,30 @@ def main(opts): scale_factor = 3 # Initialize data - in1_data = np.arange(1, in1_volume + 1, dtype=in1_dtype) - in2_data = np.array([scale_factor], dtype=in2_dtype) - out_data = np.zeros([out_volume], dtype=out_dtype) + ref = np.arange(1, in1_volume + 1, dtype=in1_dtype) + in1 = iron.tensor(ref, dtype=in1_dtype) + + in2 = iron.tensor([scale_factor], dtype=in2_dtype) + out = iron.zeros([out_volume], dtype=out_dtype) # Define reference data - ref = np.arange(1, in1_volume + 1, dtype=out_dtype) * scale_factor + ref = ref * scale_factor # -------------------------------------------------------------------------- print("Running...\n") - res = xrt_utils.setup_and_run_aie( - in1_dtype, - in2_dtype, - out_dtype, - in1_data, - in2_data, - out_data, - in1_volume, - in2_volume, - out_volume, - ref, - opts, + npu_opts = test_utils.create_npu_kernel(opts) + res = DefaultNPURuntime.run_test( + npu_opts.npu_kernel, + [in1, in2, out], + {2: ref}, + verify=npu_opts.verify, + verbosity=npu_opts.verbosity, ) + if not res: + print("PASS!") + else: + print("Failed.") sys.exit(res) diff --git a/programming_guide/section-4/section-4c/README.md b/programming_guide/section-4/section-4c/README.md index c5465071777..803c782327a 100644 --- a/programming_guide/section-4/section-4c/README.md +++ b/programming_guide/section-4/section-4c/README.md @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2024, Advanced Micro Devices, Inc. +// Copyright (C) 2024-2026, Advanced Micro Devices, Inc. // //===----------------------------------------------------------------------===//--> @@ -120,7 +120,7 @@ In this example, the vectorization strategy was relatively straight forward. 
Ins ## Vectorization Exercises 1. Let's take a look at the trace for our vector scalar design. First, let's edit our [vector_scalar_mul design](../../../programming_examples/basic/vector_scalar_mul/) so that the [vector_scalar_mul.py](../../../programming_examples/basic/vector_scalar_mul/vector_scalar_mul.py) source file has `vectorized=False`. In the [vector_scalar_mul.py](../../../programming_examples/basic/vector_scalar_mul/vector_scalar_mul.py) source code, we now have selected the scalar version of the kernel function. We're also going to build the 32-bit integer version of the design by passing the environment variales `int_bit-width=32` to our `makefile` command, by running `make int_bit_width=32 trace`. This makefile argument is defined in our makefile to customize datatypes and buffer sizes in our design code (`vector_scalar_mul.py`) and our host code (`test.cpp`). After the trace compilation is complete, open `trace_vector_scalar_mul.json` in https://ui.perfetto.dev and measure the delta between `event 0` and `event 1`. Note that in the Perfetto waveform, 1 us is equal to 1 clock cycle. How many cycles did you measure? - You may notice that in our `vector_scalar_mul` example, we call `python/utils/get_trace_summary.py` to analyze the generated json file and measure the delta between `event 0` and `event 1` automatically, providing the number of kernel invocations, and the first/ min/ avg/ max number of cycles. This is a handy utility for summarizing kernel performance for single core designs. + You may notice that in our `vector_scalar_mul` example, we call `python/utils/trace/get_trace_summary.py` to analyze the generated json file and measure the delta between `event 0` and `event 1` automatically, providing the number of kernel invocations, and the first/ min/ avg/ max number of cycles. This is a handy utility for summarizing kernel performance for single core designs. 1. Now let's turn vectorization back on by changing `vectorized=True`. 
But we're also going to disable any pragma guided optimization first to see its effect. In the [scale.cc](../../../aie_kernels/aie2/scale.cc), comment out the lines before the `for loop` that says `AIE_PREPARE_FOR_PIPELINING AIE_LOOP_MIN_ITERATION_COUNT(16)`. **NOTE** Be sure to edit both the general template and the `int32_t` template specialization as we will be testing that case next. Then rerun the compilation (`make clean; make int_bit_width=32 trace`). Measure the delta between `event 0` and `event 1` again. What value do you see now? @@ -268,4 +268,4 @@ Let's examine this more closely in our example. 1. Now go back and uncomment the pragma lines again and rerun the build and cleanup script (`make clean; make trace; /ironenv/lib/python/site-packages/llvm-aie/bin/llvm-objdump -dr build/core_0_2.elf > diassembly_0_2.txt`). Search for `vector_scalar_mul_vector` again and count the number of inner loop lines, as well as `VMUL/VMAC` lines again. How many do you see? This matches with our hand calculation that the inner loop is limited to 2 because of the vector stores. ----- -[[Prev]](../section-4b) [[Up]](../../section-4) [[Next - Section 5]](../../section-5) \ No newline at end of file +[[Prev]](../section-4b) [[Up]](../../section-4) [[Next - Section 5]](../../section-5) diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index adce7a5675f..1afe566b23e 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -1,4 +1,4 @@ -# Copyright (C) 2022-2025, Advanced Micro Devices, Inc. All rights reserved. +# Copyright (C) 2022-2026, Advanced Micro Devices, Inc. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception include(AddMLIRPython) @@ -35,16 +35,31 @@ declare_mlir_python_sources(AIEPythonSources.Tools declare_mlir_python_sources(AIEPythonSources.Utils ADD_TO_PARENT AIEPythonSources SOURCES + utils/__init__.py utils/test.py - utils/xrt.py utils/ml.py - utils/trace.py utils/config.py - utils/parse_trace.py - utils/parse_eventIR.py - utils/get_trace_summary.py - utils/trace_utils.py - utils/trace_events/__init__.py + utils/jit.py + utils/npukernel.py + utils/hostruntime/__init__.py + utils/hostruntime/hostruntime.py + utils/hostruntime/tensor_class.py + utils/hostruntime/xrtruntime/__init__.py + utils/hostruntime/xrtruntime/hostruntime.py + utils/hostruntime/xrtruntime/tensor.py + utils/compile/__init__.py + utils/compile/link.py + utils/compile/utils.py + utils/compile/cache/circular_cache.py + utils/compile/cache/utils.py + utils/trace/__init__.py + utils/trace/config.py + utils/trace/events/__init__.py + utils/trace/event_ir.py + utils/trace/get_trace_summary.py + utils/trace/parse.py + utils/trace/setup.py + utils/trace/utils.py ) declare_mlir_python_sources(AIEPythonSources.Helpers @@ -415,10 +430,10 @@ else () get_filename_component(_filename ${_file} NAME) add_custom_command( TARGET AIEPythonModules POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory ${AIE_PYTHON_PACKAGES_DIR}/aie/utils/trace_events + COMMAND ${CMAKE_COMMAND} -E make_directory ${AIE_PYTHON_PACKAGES_DIR}/aie/utils/trace/events COMMAND ${CMAKE_COMMAND} -E copy_if_different ${_file} - ${AIE_PYTHON_PACKAGES_DIR}/aie/utils/trace_events/${_filename} + ${AIE_PYTHON_PACKAGES_DIR}/aie/utils/trace/events/${_filename} COMMENT "Copying ${_filename} to build Python package" ) endforeach() @@ -427,7 +442,7 @@ else () install( FILES ${GENERATED_EVENTS_ENUM_FILES} - DESTINATION ${AIE_PYTHON_INSTALL_DIR}/aie/utils/trace_events + DESTINATION ${AIE_PYTHON_INSTALL_DIR}/aie/utils/trace/events COMPONENT aie-python ) diff --git a/python/Doxyfile 
b/python/Doxyfile index b586c7f5adc..e97bdc233a3 100644 --- a/python/Doxyfile +++ b/python/Doxyfile @@ -4,7 +4,7 @@ # Project related configuration options #--------------------------------------------------------------------------- PROJECT_NAME = "IRON" -PROJECT_NUMBER = "1.0" +PROJECT_NUMBER = $(GITHUB_SHA) OUTPUT_DIRECTORY = "../docs/python" CREATE_SUBDIRS = NO EXTRACT_ALL = YES @@ -14,7 +14,7 @@ EXTRACT_STATIC = YES #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- -INPUT = helpers/taplib iron +INPUT = helpers/taplib iron utils/hostruntime utils/__init__.py utils/jit.py utils/npukernel.py FILE_PATTERNS = *.py RECURSIVE = YES diff --git a/python/helpers/util.py b/python/helpers/util.py index 983c83d09f3..0687787a209 100644 --- a/python/helpers/util.py +++ b/python/helpers/util.py @@ -1,3 +1,4 @@ +# (c) Copyright 2026 Advanced Micro Devices, Inc. from collections import defaultdict import numpy as np from typing import Sequence, get_args, get_origin @@ -191,7 +192,9 @@ def np_ndarray_type_get_shape(ndarray_type: type[np.ndarray]) -> tuple[int, ...] shape = get_args(ndarray_type)[0] assert isinstance(shape, tuple), "np.ndarray shape must be a tuple of integers" for elem in shape: - assert isinstance(elem, int), "np.ndarray shape must be a tuple of integers" + assert isinstance( + elem, (int, np.integer) + ), "np.ndarray shape must be a tuple of Python or numpy integer types" return shape diff --git a/python/iron/__init__.py b/python/iron/__init__.py index f700ccfd8da..06ec1d34fe4 100644 --- a/python/iron/__init__.py +++ b/python/iron/__init__.py @@ -1,3 +1,4 @@ +# (c) Copyright 2026 Advanced Micro Devices, Inc. 
from .buffer import Buffer from .kernel import ExternalFunction, Kernel from .program import Program @@ -5,9 +6,8 @@ from .runtime import Runtime from .dataflow import ObjectFifo from .dtype import str_to_dtype, dtype_to_str -from .hostruntime.jit import jit -from .hostruntime.config import get_current_device, set_current_device -from .hostruntime.tensor import ( +from aie.utils.jit import jit +from aie.utils import ( tensor, ones, zeros, @@ -15,5 +15,6 @@ rand, arange, zeros_like, - set_iron_tensor_class, + set_tensor_class, + get_current_device, ) diff --git a/python/iron/compile/__init__.py b/python/iron/compile/__init__.py deleted file mode 100644 index 79dc0b887f3..00000000000 --- a/python/iron/compile/__init__.py +++ /dev/null @@ -1,22 +0,0 @@ -# __init__.py -*- Python -*- -# -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2025 Advanced Micro Devices, Inc. - -import os -from pathlib import Path - -from .link import merge_object_files -from .utils import ( - compile_cxx_core_function, - compile_mlir_module, - compile_external_kernel, -) - -# The `iron.compiledesign` decorator below caches compiled kenrels inside the `IRON_CACHE_HOME` directory. -# Kernels are cached based on their hash value of the MLIR module string. If during compilation, -# we hit in the cache, the `iron.jit` will load the xclbin and instruction binary files from the cache. -IRON_CACHE_HOME = os.environ.get("IRON_CACHE_HOME", Path.home() / ".iron" / "cache") diff --git a/python/iron/hostruntime/config.py b/python/iron/hostruntime/config.py deleted file mode 100644 index d4e726a1829..00000000000 --- a/python/iron/hostruntime/config.py +++ /dev/null @@ -1,93 +0,0 @@ -# config.py -*- Python -*- -# -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. 
-# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2025 Advanced Micro Devices, Inc. -import shutil -import subprocess - -from ..device import NPU1, NPU2 - - -def detect_npu_device(): - """Detects the current device in the system. - This assumes XRT and XDNA driver is installed - and the system has NPU hardware. - - Returns: - The current system device. - """ - try: - # Run `xrt-smi examine` and capture output - xrt_smi = shutil.which("xrt-smi") - result = subprocess.run( - [xrt_smi, "examine"], - check=True, - stdout=subprocess.PIPE, - stderr=subprocess.DEVNULL, - text=True, - ) - output = result.stdout - - # Match strings for NPU2 or NPU1 - # Sets generic "whole array" devices. Overkill. - if any( - keyword.lower() in output.lower() - for keyword in [ - "NPU Strix", - "NPU Strix Halo", - "NPU Krackan", - "RyzenAI-npu4", - "RyzenAI-npu6", - ] - ): - return NPU2() - elif any( - keyword.lower() in output.lower() - for keyword in [ - "NPU", - "NPU Phoenix", - "RyzenAI-npu1", - ] - ): - return NPU1() - else: - raise RuntimeError("No supported NPU device found.") - - except FileNotFoundError: - raise RuntimeError("xrt-smi not found. Make sure XRT is installed.") - except subprocess.CalledProcessError: - raise RuntimeError("Failed to run xrt-smi examine.") - - -config = {} - - -def set_current_device(device): - """Sets the current device. - - Args: - device: Device to set as the current device. - - Returns: - The previously set device. - """ - global config - previous_device = config.get("device") - config["device"] = device - return previous_device - - -def get_current_device(): - """Gets the current device. - - Returns: - The currently set device. 
- """ - global config - if "device" not in config: - config["device"] = detect_npu_device() - - return config["device"] diff --git a/python/iron/hostruntime/kernelrunner.py b/python/iron/hostruntime/kernelrunner.py deleted file mode 100644 index 0731a190a4e..00000000000 --- a/python/iron/hostruntime/kernelrunner.py +++ /dev/null @@ -1,124 +0,0 @@ -# kernelrunner.py -*- Python -*- -# -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2025 Advanced Micro Devices, Inc. -import numpy as np - - -class NPUKernel: - """ - NPUKernel class wrapper for NPU kernels. - """ - - def __init__( - self, xclbin_path, insts_path, device_index=0, kernel_name="PP_FD_PRE" - ): - """ - Initialize the NPUKernel object. - Parameters: - xclbin_path (str): Path to the XCLBIN file containing the kernel. - insts_path (str): Path to the instruction binary file for the kernel. - device_index (int, optional): Index of the device. Defaults to 0. - kernel_name (str, optional): Name of the kernel. Defaults to "PP_FD_PRE". - """ - import pyxrt as xrt - from ...utils.xrt import read_insts_binary - - self.__device = xrt.device(device_index) - - # Find kernel by name in the xclbin - self.__xclbin = xrt.xclbin(xclbin_path) - kernels = self.__xclbin.get_kernels() - - try: - xkernel = [k for k in kernels if kernel_name == k.get_name()][0] - except KeyError: - raise NPUKernel_Error("No such kernel: " + kernel_name) - - self.__device.register_xclbin(self.__xclbin) - self.__context = xrt.hw_context(self.__device, self.__xclbin.get_uuid()) - self.__kernel = xrt.kernel(self.__context, xkernel.get_name()) - - # Set up instruction stream - insts = read_insts_binary(insts_path) - self.__n_insts = len(insts) - insts_buffers_bytes = self.__n_insts * np.dtype(insts.dtype).itemsize - - # Magic number for RyzenAI group id that will be fixed in the future. 
See same code at XRT: - # https://github.com/Xilinx/XRT/blob/56222ed5cfd119dff0d5bd920735b87024e8c829/src/runtime_src/core/common/api/xrt_module.cpp#L1621 - group_id = 1 - - self.__insts_buffer_bo = xrt.bo( - self.__device, - insts_buffers_bytes, - xrt.bo.cacheable, - group_id, - ) - - # Copy into a temporary numpy buffer - insts_buffer_bo_np = np.frombuffer( - self.__insts_buffer_bo.map(), dtype=insts.dtype - ).reshape(insts.shape) - insts_buffer_bo_np[:] = insts - - # Always sync to the device in the constructor - self.__insts_buffer_bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - - # Blocking call. - def __call__(self, *args): - """ - Allows the kernel to be called as a function with the provided arguments. - - Parameters: - args (IRON Tensors): Arguments to pass to the kernel. - """ - import pyxrt as xrt - - opcode = 3 - kernel_args = [] - - for tensor in args: - # Skip callable arguments since these are inlined in the kernel - if callable(tensor): - continue - if not hasattr(tensor, "buffer_object"): - raise TypeError( - f"Expected Tensor with .buffer_object(), got {type(tensor)}" - ) - kernel_args.append(tensor.buffer_object()) - - h = self.__kernel(opcode, self.__insts_buffer_bo, self.__n_insts, *kernel_args) - r = h.wait() - if r != xrt.ert_cmd_state.ERT_CMD_STATE_COMPLETED: - raise NPUKernel_Error(f"Kernel returned {r}") - - def __del__(self): - """ - Destructor to clean up resources and delete the kernel and device objects. 
- """ - if hasattr(self, "_NPUKernel__insts_buffer_bo"): - del self.__insts_buffer_bo - self.__insts_buffer_bo = None - if hasattr(self, "_NPUKernel__kernel"): - del self.__kernel - self.__kernel = None - if hasattr(self, "_NPUKernel__context"): - del self.__context - self.__context = None - if hasattr(self, "_NPUKernel__xclbin"): - del self.__xclbin - self.__xclbin = None - if hasattr(self, "_NPUKernel__device"): - del self.__device - self.__device = None - - -class NPUKernel_Error(Exception): - """ - Error raised when a NPU kernel encounters an error during execution. - """ - - pass diff --git a/python/iron/runtime/runtime.py b/python/iron/runtime/runtime.py index a5b0290e643..15239d15f42 100644 --- a/python/iron/runtime/runtime.py +++ b/python/iron/runtime/runtime.py @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. from __future__ import annotations from collections import defaultdict @@ -313,7 +313,7 @@ def sequence(*args): # tiles_to_trace=[ tiles_to_trace[0] ], tiles_to_trace=tiles_to_trace, shim=trace_shim_tile, - trace_size=self._trace_size, + trace_size=self._trace_size // 4, trace_offset=( self._trace_offset if self._trace_offset is not None else 0 ), diff --git a/python/utils/__init__.py b/python/utils/__init__.py new file mode 100644 index 00000000000..4d4012bbcac --- /dev/null +++ b/python/utils/__init__.py @@ -0,0 +1,174 @@ +# __init__.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
+import sys +from .hostruntime.tensor_class import Tensor + +try: + import pyxrt + + has_xrt = True +except ImportError as e: + print( + f"Failed to import PyXRT: {e}, proceeding without runtime libraries.", + file=sys.stderr, + ) + has_xrt = False + +if has_xrt: + from .hostruntime.xrtruntime.tensor import XRTTensor + + DEFAULT_TENSOR_CLASS = XRTTensor +else: + from .hostruntime.tensor_class import CPUOnlyTensor + + DEFAULT_TENSOR_CLASS = CPUOnlyTensor + + +def tensor(*args, **kwargs): + """ + Create a tensor using the default tensor class. + + Args: + *args: Arguments passed to the tensor constructor. + **kwargs: Keyword arguments passed to the tensor constructor. + + Returns: + Tensor: The created tensor. + """ + return DEFAULT_TENSOR_CLASS(*args, **kwargs) + + +def ones(*args, **kwargs): + """ + Create a tensor filled with ones using the default tensor class. + + Args: + *args: Arguments passed to the ones method. + **kwargs: Keyword arguments passed to the ones method. + + Returns: + Tensor: The created tensor. + """ + return DEFAULT_TENSOR_CLASS.ones(*args, **kwargs) + + +def zeros(*args, **kwargs): + """ + Create a tensor filled with zeros using the default tensor class. + + Args: + *args: Arguments passed to the zeros method. + **kwargs: Keyword arguments passed to the zeros method. + + Returns: + Tensor: The created tensor. + """ + return DEFAULT_TENSOR_CLASS.zeros(*args, **kwargs) + + +def randint(*args, **kwargs): + """ + Create a tensor filled with random integers using the default tensor class. + + Args: + *args: Arguments passed to the randint method. + **kwargs: Keyword arguments passed to the randint method. + + Returns: + Tensor: The created tensor. + """ + return DEFAULT_TENSOR_CLASS.randint(*args, **kwargs) + + +def rand(*args, **kwargs): + """ + Create a tensor filled with random values using the default tensor class. + + Args: + *args: Arguments passed to the rand method. + **kwargs: Keyword arguments passed to the rand method. 
+ + Returns: + Tensor: The created tensor. + """ + return DEFAULT_TENSOR_CLASS.rand(*args, **kwargs) + + +def arange(*args, **kwargs): + """ + Create a tensor with a range of values using the default tensor class. + + Args: + *args: Arguments passed to the arange method. + **kwargs: Keyword arguments passed to the arange method. + + Returns: + Tensor: The created tensor. + """ + return DEFAULT_TENSOR_CLASS.arange(*args, **kwargs) + + +def zeros_like(*args, **kwargs): + """ + Create a tensor filled with zeros with the same shape as another tensor using the default tensor class. + + Args: + *args: Arguments passed to the zeros_like method. + **kwargs: Keyword arguments passed to the zeros_like method. + + Returns: + Tensor: The created tensor. + """ + return DEFAULT_TENSOR_CLASS.zeros_like(*args, **kwargs) + + +def set_tensor_class(cls): + """ + Set the default tensor class. + + Args: + cls: The new default tensor class. Must inherit from Tensor. + + Raises: + ValueError: If cls does not inherit from Tensor. + """ + if not issubclass(cls, Tensor): + raise ValueError( + f"Tensors must inherit from the Tensor class but {cls} does not." + ) + global DEFAULT_TENSOR_CLASS + DEFAULT_TENSOR_CLASS = cls + + +from .hostruntime import set_current_device +from . import hostruntime +from .hostruntime.hostruntime import HostRuntime +from .trace import TraceConfig +from .npukernel import NPUKernel + +if has_xrt: + from .hostruntime.xrtruntime.hostruntime import CachedXRTRuntime + + DefaultNPURuntime = CachedXRTRuntime() +else: + DefaultNPURuntime = None + + +def get_current_device(): + """ + Get the current NPU device. + + Returns: + Device | None: The current device if available, else None. 
+ """ + if hostruntime._CURRENT_DEVICE: + return hostruntime._CURRENT_DEVICE + elif DefaultNPURuntime: + return DefaultNPURuntime.device() + else: + return None diff --git a/python/utils/compile/__init__.py b/python/utils/compile/__init__.py new file mode 100644 index 00000000000..701b4dd60d4 --- /dev/null +++ b/python/utils/compile/__init__.py @@ -0,0 +1,22 @@ +# __init__.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. + +import os +from pathlib import Path + +from .link import merge_object_files +from .utils import ( + compile_cxx_core_function, + compile_mlir_module, + compile_external_kernel, +) + +# Compiled kernels are cached inside the `NPU_CACHE_HOME` directory. +NPU_CACHE_HOME = Path( + os.environ.get("NPU_CACHE_HOME", Path.home() / ".npu" / "cache") +).resolve() diff --git a/python/iron/compile/cache/circular_cache.py b/python/utils/compile/cache/circular_cache.py similarity index 94% rename from python/iron/compile/cache/circular_cache.py rename to python/utils/compile/cache/circular_cache.py index 6d51e204fde..ae6fae0bdb2 100644 --- a/python/iron/compile/cache/circular_cache.py +++ b/python/utils/compile/cache/circular_cache.py @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 Advanced Micro Devices, Inc. +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
class CircularCache: def __init__(self, max_size): self.max_size = max_size diff --git a/python/iron/compile/cache/utils.py b/python/utils/compile/cache/utils.py similarity index 82% rename from python/iron/compile/cache/utils.py rename to python/utils/compile/cache/utils.py index 8fa311d4efe..e45cb288e89 100644 --- a/python/iron/compile/cache/utils.py +++ b/python/utils/compile/cache/utils.py @@ -4,14 +4,13 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 Advanced Micro Devices, Inc. +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. import contextlib import fcntl import os import time -from ...hostruntime.tensor import Tensor -from ...kernel import ExternalFunction +from aie.utils.hostruntime.tensor_class import Tensor def _create_function_cache_key(function, args, kwargs): @@ -31,37 +30,38 @@ def _create_function_cache_key(function, args, kwargs): if isinstance(arg, Tensor): # Tensor argument - include shape and dtype signature_parts.append(f"tensor_{arg.shape}_{arg.dtype}") - elif isinstance(arg, ExternalFunction): - # ExternalFunction argument - use its custom hash method - func_hash = hash(arg) - signature_parts.append(f"externalfunction_{func_hash}") elif callable(arg): if hasattr(arg, "__code__"): # Use bytecode and constants hash for Python functions/lambdas code = arg.__code__ - func_hash = hash((code.co_code, code.co_consts, code.co_names)) + defaults = arg.__defaults__ if hasattr(arg, "__defaults__") else None + func_hash = hash( + (code.co_code, code.co_consts, code.co_names, defaults) + ) signature_parts.append(f"function_{func_hash}") else: # Function argument - use hash of function address for uniqueness func_hash = hash(arg) signature_parts.append(f"function_{func_hash}") else: - # Unsupported type - use type name - signature_parts.append(f"{type(arg).__name__}") + # Other type - use type name + arg_hash = hash(arg) + 
signature_parts.append(f"{type(arg).__name__}_{arg_hash}") for key, value in sorted(kwargs.items()): if isinstance(value, Tensor): # Tensor argument - include shape and dtype signature_parts.append(f"{key}_tensor_{value.shape}_{value.dtype}") - elif isinstance(value, ExternalFunction): - # ExternalFunction argument - use its custom hash method - func_hash = hash(value) - signature_parts.append(f"{key}_externalfunction_{func_hash}") elif callable(value): if hasattr(value, "__code__"): # Use bytecode and constants hash for Python functions/lambdas code = value.__code__ - func_hash = hash((code.co_code, code.co_consts, code.co_names)) + defaults = ( + value.__defaults__ if hasattr(value, "__defaults__") else None + ) + func_hash = hash( + (code.co_code, code.co_consts, code.co_names, defaults) + ) signature_parts.append(f"{key}_function_{func_hash}") else: # Function argument - use hash of function address for uniqueness diff --git a/python/iron/compile/link.py b/python/utils/compile/link.py similarity index 96% rename from python/iron/compile/link.py rename to python/utils/compile/link.py index a820be21c44..4556b7d40de 100644 --- a/python/iron/compile/link.py +++ b/python/utils/compile/link.py @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 Advanced Micro Devices, Inc. +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. import subprocess from os import PathLike diff --git a/python/iron/compile/utils.py b/python/utils/compile/utils.py similarity index 99% rename from python/iron/compile/utils.py rename to python/utils/compile/utils.py index ca5aa4aa3ba..458df555c61 100644 --- a/python/iron/compile/utils.py +++ b/python/utils/compile/utils.py @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 Advanced Micro Devices, Inc. 
+# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. import os import shutil import subprocess diff --git a/python/utils/hostruntime/__init__.py b/python/utils/hostruntime/__init__.py new file mode 100644 index 00000000000..56e1acc3cfa --- /dev/null +++ b/python/utils/hostruntime/__init__.py @@ -0,0 +1,51 @@ +# __init__.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 Advanced Micro Devices, Inc. +from typing import TYPE_CHECKING +from ml_dtypes import bfloat16 +import numpy as np +from .tensor_class import Tensor + +if TYPE_CHECKING: + from aie.iron.device import Device + +_CURRENT_DEVICE = None + + +def set_current_device(device: "Device"): + """ + Set the current device. + + Args: + device (Device): The device to set as current. + """ + global _CURRENT_DEVICE + _CURRENT_DEVICE = device + + +def bfloat16_safe_allclose(dtype, arr1, arr2): + """ + Check if two arrays are element-wise equal within a tolerance, handling bfloat16 safely. + + Args: + dtype: The data type of the arrays. + arr1: First input array. + arr2: Second input array. + + Returns: + bool: True if the arrays are equal within tolerance, False otherwise. + """ + if dtype == bfloat16: + if isinstance(arr1, Tensor): + arr1 = np.array(arr1, dtype=np.float16) + else: + arr1 = arr1.astype(np.float16) + if isinstance(arr2, Tensor): + arr2 = np.array(arr2, dtype=np.float16) + else: + arr2 = arr2.astype(np.float16) + return np.allclose(arr1, arr2) diff --git a/python/utils/hostruntime/hostruntime.py b/python/utils/hostruntime/hostruntime.py new file mode 100644 index 00000000000..d4ebb3140d5 --- /dev/null +++ b/python/utils/hostruntime/hostruntime.py @@ -0,0 +1,466 @@ +# SPDX-FileCopyrightText: Copyright (C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +from abc import ABC, abstractmethod +import numpy as np +from pathlib import Path +import sys +from typing import TYPE_CHECKING + +from .. import tensor + +if TYPE_CHECKING: + from aie.iron.device import Device +from .tensor_class import Tensor +from ..trace import TraceConfig +from ..trace.utils import create_ctrl_pkt, extract_tile +from ..npukernel import NPUKernel +from . import bfloat16_safe_allclose + + +class HostRuntimeError(Exception): + """ + Error raised when a NPU kernel encounters an error during runtime operations. + """ + + pass + + +class KernelHandle(ABC): + """ + Abstract representation that represents a kernel already registered/loaded with a runtime. + """ + + ... + + +class KernelResult(ABC): + """A wrapper around data produced as the result of running a kernel""" + + def __init__( + self, + npu_time: int, + trace_config: TraceConfig | None = None, + ): + """ + Initialize the KernelResult. + + Args: + npu_time (int): The execution time on the NPU in nanoseconds. + trace_config (TraceConfig | None, optional): Configuration for tracing. Defaults to None. + """ + self._npu_time = npu_time + self._trace_config = trace_config + + @property + def npu_time(self) -> int: + """ + Get the NPU execution time. + + Returns: + int: The execution time in nanoseconds. + """ + return self._npu_time + + @property + def trace_config(self) -> TraceConfig | None: + """ + Get the trace configuration. + + Returns: + TraceConfig | None: The trace configuration if available, else None. + """ + return self._trace_config + + def has_trace(self) -> bool: + """ + Check if trace data is available. + + Returns: + bool: True if trace configuration is present, False otherwise. + """ + return not (self._trace_config is None) + + @abstractmethod + def is_success(self) -> bool: + """ + Check if the kernel execution was successful. + + Returns: + bool: True if successful, False otherwise. 
+ """ + pass + + +class HostRuntime(ABC): + """An abstract class for a generic host runtime""" + + def check_device_consistency(self): + """ + Check if the overridden device matches the runtime device. + """ + mod = sys.modules[__package__] + override = getattr(mod, "_CURRENT_DEVICE", None) + if override: + runtime_device = self.device() + if getattr(override, "_device", None) != getattr( + runtime_device, "_device", None + ): + raise RuntimeError( + f"Overridden device {override} does not match runtime device {runtime_device}" + ) + + @abstractmethod + def load(self, npu_kernel: NPUKernel, **kwargs) -> KernelHandle: + """ + Load an NPU kernel into the runtime. + + Args: + npu_kernel (NPUKernel): The NPU kernel to load. + **kwargs: Additional arguments for loading. + + Returns: + KernelHandle: A handle to the loaded kernel. + """ + pass + + @abstractmethod + def run( + self, + kernel_handle: KernelHandle, + *args, + trace_config: TraceConfig | None = None, + only_if_loaded=False, + ) -> KernelResult: + """ + Run a loaded kernel. + + Args: + kernel_handle (KernelHandle): The handle to the loaded kernel. + *args: Arguments to pass to the kernel. + trace_config (TraceConfig | None, optional): Configuration for tracing. Defaults to None. + only_if_loaded (bool, optional): If True, only run if already loaded. Defaults to False. + + Returns: + KernelResult: The result of the kernel execution. + """ + pass + + def load_and_run( + self, + npu_kernel: NPUKernel, + run_args: list, + **kwargs, + ) -> tuple[KernelHandle, KernelResult]: + """ + Load and run an NPU kernel. + + Args: + npu_kernel (NPUKernel): The NPU kernel to load and run. + run_args (list): Arguments to pass to the kernel. + **kwargs: Additional arguments passed to load. + + Returns: + tuple[KernelHandle, KernelResult]: A tuple containing the kernel handle and the execution result. 
+ """ + trace_config = npu_kernel.trace_config + handle = self.load(npu_kernel, **kwargs) + if trace_config: + if trace_config.trace_after_last_tensor and len(run_args) > 0: + trace_config.last_tensor_shape = run_args[-1].shape + trace_config.last_tensor_dtype = np.dtype(run_args[-1].dtype) + self.prepare_args_for_trace(run_args, trace_config) + + ret = self.run(handle, list(run_args), trace_config=trace_config) + + if trace_config: + trace_buffer, ctrl_buffer = self.extract_trace_from_args( + run_args, trace_config + ) + self.process_trace(trace_buffer, ctrl_buffer, trace_config) + + return handle, ret + + @abstractmethod + def device(self) -> "Device": + """ + Get the device associated with this runtime. + + Returns: + Device: The device object. + """ + pass + + # Read instruction stream from bin file and reformat it to be passed into the + # instruction buffer for the xrt.kernel call + @classmethod + def read_insts_binary(cls, insts_path: Path): + """ + Reads instructions from a binary file. + + Args: + insts_path (Path): Path to the binary instruction file. + + Returns: + np.ndarray: Array of uint32 instructions. + """ + with open(insts_path, "rb") as f: + data = f.read() + # Interpret the binary data as an array of uint32 values. + return np.frombuffer(data, dtype=np.uint32) + + @classmethod + def read_insts(cls, insts_path: Path): + """ + Reads instructions from the given file. + + If the file extension is .bin, uses binary read. + If the file extension is .txt, uses sequence (text) read. + + Args: + insts_path (Path): Path to the instruction file. + + Returns: + np.ndarray: Array of instructions. + + Raises: + HostRuntimeError: If the file extension is not supported. 
+ """ + ext = insts_path.suffix.lower() + if ext == ".bin": + return cls.read_insts_binary(insts_path) + else: + raise HostRuntimeError( + "Unsupported file extension for instruction file: expected .bin" + ) + + @classmethod + def prepare_args_for_trace( + cls, args: list[Tensor], trace_config: TraceConfig + ) -> list[Tensor]: + """ + Prepare arguments for tracing by appending necessary buffers. + + Args: + args (list[Tensor]): List of input/output tensors. + trace_config (TraceConfig): Trace configuration. + + Returns: + list[Tensor]: The updated list of tensors with trace buffers appended. + """ + if trace_config.trace_after_last_tensor: + # Create a new, extended out tensor. + out_size = trace_config.trace_size + if len(args) > 0: + out_size += args[-1].nbytes + # TODO: should really copy previous contents of output into this buffer...? What if it's in/out? + args[-1] = tensor((out_size,), dtype=np.uint8) + else: + out = tensor((out_size,), dtype=np.uint8) + args.append(out) + else: + pad_until = trace_config.DEFAULT_TRACE_BUFFER_INDEX + if trace_config.enable_ctrl_pkts: + pad_until -= 1 + while len(args) < pad_until: + # TODO out always needed so register buf 7 succeeds (not needed in C/C++ host code) + filler = tensor((1,), dtype=np.uint32) + args.append(filler) + + if trace_config.enable_ctrl_pkts: + # write ctrl packets + ctrl_pkts = [ + create_ctrl_pkt(1, 0, 0x32004), # core status + create_ctrl_pkt(1, 0, 0x340D8), # trace status + ] + # Pad to 8 words + ctrl_pkts += [0] * (8 - len(ctrl_pkts)) + + header = tensor(np.array(ctrl_pkts, dtype=np.uint32)) + args.append(header) + + # Allocate extra space for control packets if enabled + alloc_size = trace_config.trace_size + if trace_config.enable_ctrl_pkts: + alloc_size = trace_config.trace_size * 4 + + trace_buff = tensor((alloc_size,), dtype=np.uint8) + args.append(trace_buff) + return args + + @classmethod + def extract_trace_from_args( + cls, args: list[Tensor], trace_config: TraceConfig + ) -> tuple[Tensor, 
Tensor | None]: + """ + Extract trace and control buffers from the arguments. + + Args: + args (list[Tensor]): List of tensors used in execution. + trace_config (TraceConfig): Trace configuration. + + Returns: + tuple[Tensor, Tensor | None]: A tuple containing the trace buffer and optionally the control buffer. + """ + trace_buff = None + ctrl_buff = None + + if trace_config.trace_after_last_tensor: + args[-1], trace_buff = cls._extract_prefix( + args[-1], trace_config.last_tensor_shape, trace_config.last_tensor_dtype + ) + else: + # The trace position is always last. + trace_buff = args[-1].numpy() + + if trace_config.enable_ctrl_pkts: + trace_buff, ctrl_buff = cls._extract_prefix( + trace_buff, trace_config.trace_size, np.dtype(np.uint8) + ) + trace_buff = trace_buff.view(np.uint32).reshape( + trace_config.trace_size // np.dtype(np.uint32).itemsize + ) + return trace_buff, ctrl_buff + + @classmethod + def _extract_prefix(cls, tensor, prefix_shape, prefix_dtype): + """ + Separate output data and trace data from a single output buffer stream. + + Args: + tensor (Tensor | np.ndarray): The combined tensor. + prefix_shape (tuple): Shape of the prefix (output data). + prefix_dtype (np.dtype): Data type of the prefix. + + Returns: + tuple[np.ndarray, np.ndarray]: A tuple containing the output prefix and the suffix (trace data). + """ + # Wrapper function to separate output data and trace data from a single output buffer stream + if not isinstance(tensor, np.ndarray): + tensor = tensor.numpy() + flat_tensor = tensor.reshape((-1,)).view(np.uint8) + prefix_bytes = np.prod(prefix_shape) * prefix_dtype.itemsize + output_prefix = ( + flat_tensor[:prefix_bytes].view(prefix_dtype).reshape(prefix_shape).copy() + ) + output_suffix = flat_tensor[prefix_bytes:].copy() + return output_prefix, output_suffix + + @classmethod + def process_trace(cls, trace_buffer, ctrl_buffer, trace_config, verbosity=0): + """ + Process the trace buffer and control buffer. 
+ + Args: + trace_buffer (np.ndarray): The trace data buffer. + ctrl_buffer (np.ndarray): The control packet buffer. + trace_config (TraceConfig): Trace configuration. + verbosity (int, optional): Verbosity level. Defaults to 0. + """ + if verbosity >= 1: + print("trace_buffer shape: ", trace_buffer.shape) + print("trace_buffer dtype: ", trace_buffer.dtype) + trace_config.write_trace(trace_buffer) + + if trace_config.enable_ctrl_pkts: + if verbosity >= 1: + print("ctrl_buffer shape: ", ctrl_buffer.shape) + print("ctrl_buffer dtype: ", ctrl_buffer.dtype) + print("ctrl buffer: ", [hex(d) for d in ctrl_buffer]) + for i in range(ctrl_buffer.size // 2): + col, row, pkt_type, pkt_id = extract_tile(ctrl_buffer[i * 2]) + overflow = True if (ctrl_buffer[i * 2 + 1] >> 8) == 3 else False + if overflow: + print( + f"WARNING: Trace overflow detected in tile({row},{col}). Trace results may be invalid." + ) + + @classmethod + def verify_results(cls, io_args, refs={}, verbosity=0): + """ + Verify the results of the kernel execution against reference data. + + Args: + io_args (list[Tensor]): List of input/output tensors. + refs (dict, optional): Dictionary mapping index to reference numpy array. Defaults to {}. + verbosity (int, optional): Verbosity level. Defaults to 0. + + Returns: + int: Number of errors found. + + Raises: + HostRuntimeError: If a reference index is out of bounds. + """ + errors = 0 + if verbosity >= 1: + print("Verifying results ...") + + for idx, ref in refs.items(): + if idx >= len(io_args): + raise HostRuntimeError( + f"Error: Reference index {idx} out of bounds for {len(io_args)} IO buffers" + ) + io_args[idx].to("cpu") + o = io_args[idx].numpy() + e = bfloat16_safe_allclose(ref.dtype, ref, o) + errors += np.size(e) - np.count_nonzero(e) + return errors + + def run_test( + self, + npu_kernel, + io_args, + ref, + verify: bool = True, + verbosity: int = 0, + ) -> int: + """ + Run a test for the given NPU kernel. 
+ + Args: + npu_kernel (NPUKernel): The NPU kernel to test. + io_args (list[Tensor]): List of input/output tensors. + ref (dict): Reference data for verification. + verify (bool, optional): Whether to verify results. Defaults to True. + verbosity (int, optional): Verbosity level. Defaults to 0. + + Returns: + int: 0 if successful, 1 otherwise. + """ + kernel_handle = self.load(npu_kernel) + trace_config = npu_kernel.trace_config + + # Ensure io_args is a list + if not isinstance(io_args, list): + io_args = [io_args] if io_args else [] + + buffers = io_args + last_out = buffers[-1] if buffers else None + + if trace_config: + trace_config.last_tensor_shape = last_out.shape if last_out else None + trace_config.last_tensor_dtype = last_out.dtype if last_out else None + self.prepare_args_for_trace(buffers, trace_config) + + ret = self.run(kernel_handle, buffers) + + if verbosity >= 1: + print("npu_time: ", ret.npu_time / 1000.0, " us") + + if trace_config: + trace_buffer, ctrl_buffer = self.extract_trace_from_args( + buffers, trace_config + ) + self.process_trace(trace_buffer, ctrl_buffer, trace_config, verbosity) + + errors = 0 + if verify: + errors = self.verify_results(io_args, ref, verbosity) + + if not errors: + return 0 + else: + if verbosity >= 1: + print("\nError count: ", errors) + print("\nFailed.\n") + return 1 diff --git a/python/iron/hostruntime/tensor.py b/python/utils/hostruntime/tensor_class.py similarity index 84% rename from python/iron/hostruntime/tensor.py rename to python/utils/hostruntime/tensor_class.py index d4654a4c97f..d0215b01a8b 100644 --- a/python/iron/hostruntime/tensor.py +++ b/python/utils/hostruntime/tensor_class.py @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 Advanced Micro Devices, Inc. +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
from abc import ABC, abstractmethod from functools import cached_property import numpy as np @@ -28,7 +28,7 @@ def __init__(self, shape_or_data, dtype=np.uint32, device="npu"): """ Initialize the tensor. - Parameters: + Args: shape_or_data (tuple or array-like): - If a tuple, creates a new tensor with the given shape and dtype. - If array-like, wraps the data into a tensor with optional dtype casting. @@ -43,13 +43,23 @@ def __init__(self, shape_or_data, dtype=np.uint32, device="npu"): @property @abstractmethod def data(self): - """Subclasses must implement a data property.""" + """ + Subclasses must implement a data property. + + Returns: + np.ndarray: The underlying data of the tensor. + """ pass @property @abstractmethod def shape(self): - """Subclasses must implement a shape property.""" + """ + Subclasses must implement a shape property. + + Returns: + tuple: The shape of the tensor. + """ pass def __repr__(self): @@ -89,7 +99,7 @@ def __getitem__(self, index): """ Retrieves the value at a specific index in the tensor. - Parameters: + Args: index (int): The index of the value to retrieve. Returns: @@ -106,7 +116,7 @@ def __setitem__(self, index, value): """ Sets the value at a specific index in the tensor. - Parameters: + Args: index (int): The index of the value to set. value: The new value to assign. @@ -121,6 +131,15 @@ def __setitem__(self, index, value): self._sync_to_device() def __len__(self): + """ + Return the length of the tensor. + + Returns: + int: The length of the tensor (size of the first dimension). + + Raises: + TypeError: If the tensor is 0-dimensional. + """ if self.data.ndim == 0: raise TypeError("len() of a 0-d tensor") return self.shape[0] @@ -143,7 +162,7 @@ def to(self, target_device: str): """ Moves the tensor to a specified target device. - Parameters: + Args: target_device (str): The target device. 
Returns: @@ -166,6 +185,8 @@ def to(self, target_device: str): def _sync_to_device(self): """ Syncs the tensor data from the host to the device memory. + + This method should be implemented by subclasses to handle device-specific synchronization. """ ... @@ -173,11 +194,29 @@ def _sync_to_device(self): def _sync_from_device(self): """ Syncs the tensor data from the device to the host memory. + + This method should be implemented by subclasses to handle device-specific synchronization. """ ... @classmethod def __check_or_create(cls, *size, out=None, dtype=None, device=None, **kwargs): + """ + Internal helper to check an output tensor or create a new one. + + Args: + *size: Shape of the tensor. + out (Tensor, optional): Output tensor to check. + dtype (np.dtype, optional): Data type. + device (str, optional): Device. + **kwargs: Additional arguments for tensor creation. + + Returns: + Tensor: The checked or created tensor. + + Raises: + ValueError: If `out` tensor does not match shape, dtype, or device. + """ # Normalize shape if len(size) == 1 and isinstance(size[0], (tuple, list)): shape = tuple(size[0]) @@ -218,6 +257,12 @@ def numpy(self): def to_torch(self): """ Returns a torch tensor with a copy of the data in this tensor. + + Returns: + torch.Tensor: A torch tensor containing the data. + + Raises: + ImportError: If torch is not installed. """ try: import torch @@ -234,6 +279,17 @@ def to_torch(self): def from_torch(cls, torch_tensor, device=None, **kwargs): """ Returns a tensor with a copy of the data in the torch_tensor. + + Args: + torch_tensor (torch.Tensor): The source torch tensor. + device (str, optional): The target device. Defaults to None. + **kwargs: Additional arguments for tensor creation. + + Returns: + Tensor: A new tensor containing the data from the torch tensor. + + Raises: + ImportError: If torch is not installed. """ try: import torch @@ -257,7 +313,7 @@ def fill_(self, value): """ Fills the tensor with a scalar value (in-place operation). 
- Parameters: + Args: value: The scalar value to fill the tensor with. Note: For NPU tensors, this method syncs the filled data to device after modification. @@ -280,10 +336,8 @@ def ones(cls, *size, out=None, dtype=None, device=None, **kwargs): """ Returns a tensor filled with ones, with shape defined by size. - Parameters: + Args: *size (int...): Shape of the tensor, passed as separate ints or a single tuple/list. - - Keyword Arguments: out (Tensor, optional): Optional output tensor to write into. dtype (np.dtype, optional): Desired dtype. Defaults to np.float32. device (str, optional): Target device. Defaults to 'npu'. @@ -301,10 +355,8 @@ def zeros(cls, *size, out=None, dtype=None, device=None, **kwargs): """ Returns a tensor filled with zeros, with shape defined by size. - Parameters: + Args: *size (int...): Shape of the tensor, passed as separate ints or a single tuple/list. - - Keyword Arguments: out (Tensor, optional): Optional output tensor to write into. dtype (np.dtype, optional): Desired dtype. Defaults to np.float32. device (str, optional): Target device. Defaults to 'npu'. @@ -322,12 +374,10 @@ def randint(cls, low, high, size, *, out=None, dtype=None, device=None, **kwargs """ Returns a tensor filled with random integers uniformly sampled from [low, high). - Parameters: + Args: low (int): Lowest integer to be drawn (inclusive). high (int): One above the highest integer to be drawn (exclusive). size (tuple): Shape of the returned tensor. - - Keyword Arguments: out (Tensor, optional): Optional tensor to write the result into. dtype (np.dtype, optional): Data type. Defaults to np.int64. device (str, optional): Target device. Defaults to 'npu'. @@ -354,10 +404,8 @@ def rand(cls, *size, out=None, dtype=None, device=None, **kwargs): """ Returns a tensor filled with random numbers from a uniform distribution on [0, 1). - Parameters: + Args: *size (int...): Variable number of integers or a single tuple defining the shape. 
- - Keyword Arguments: out (Tensor, optional): Output tensor to write into. dtype (np.dtype, optional): Desired data type. Defaults to np.float32. device (str, optional): Target device. Defaults to 'npu'. @@ -402,12 +450,10 @@ def arange( """ Returns a 1-D tensor with values from the interval [start, end) with spacing `step`. - Parameters: + Args: start (number): Start of interval. Defaults to 0. end (number): End of interval (exclusive). Required if only one argument is given. step (number): Gap between elements. Defaults to 1. - - Keyword Arguments: dtype (np.dtype, optional): Desired output data type. Inferred if not provided. out (Tensor, optional): Optional tensor to write output to (must match shape and dtype). device (str, optional): Target device. Defaults to 'npu'. @@ -450,7 +496,7 @@ def zeros_like(cls, other, dtype=None, device=None, **kwargs): """ Creates a new tensor with the same shape as `other`, filled with zeros. - Parameters: + Args: other (Tensor): The reference tensor to copy shape from. dtype (np.dtype, optional): Data type of the new tensor. Defaults to other's dtype. device (str, optional): Target device. Defaults to other's device. @@ -480,6 +526,16 @@ class CPUOnlyTensor(Tensor): DEFAULT_DEVICE = "cpu" def __init__(self, shape_or_data, dtype=np.uint32, device="cpu"): + """ + Initialize the CPUOnlyTensor. + + Args: + shape_or_data (tuple or array-like): + - If a tuple, creates a new tensor with the given shape and dtype. + - If array-like, wraps the data into a tensor with optional dtype casting. + dtype (np.dtype, optional): Data type of the tensor. Defaults to np.uint32. + device (str, optional): Device string identifier. Defaults to 'cpu'. 
+ """ super().__init__(shape_or_data, dtype=dtype, device=device) if not isinstance(shape_or_data, tuple): self._data = np.array(shape_or_data, dtype=dtype) @@ -489,62 +545,36 @@ def __init__(self, shape_or_data, dtype=np.uint32, device="cpu"): @property def data(self): + """ + Get the underlying numpy array. + + Returns: + np.ndarray: The underlying data. + """ return self._data @property def shape(self): + """ + Get the shape of the tensor. + + Returns: + tuple: The shape of the tensor. + """ return self._shape def _sync_to_device(self): + """ + Syncs the tensor data from the host to the device memory. + For CPUOnlyTensor, this is a no-op. + """ # Nothing to do for CPU only pass def _sync_from_device(self): + """ + Syncs the tensor data from the device to the host memory. + For CPUOnlyTensor, this is a no-op. + """ # Nothing to do for CPU only pass - - -# Set default tensor class -try: - from .xrtruntime.tensor import XRTTensor - - DEFAULT_IRON_TENSOR_CLASS = XRTTensor -except ImportError: - DEFAULT_IRON_TENSOR_CLASS = CPUOnlyTensor - - -def tensor(*args, **kwargs): - return DEFAULT_IRON_TENSOR_CLASS(*args, **kwargs) - - -def ones(*args, **kwargs): - return DEFAULT_IRON_TENSOR_CLASS.ones(*args, **kwargs) - - -def zeros(*args, **kwargs): - return DEFAULT_IRON_TENSOR_CLASS.zeros(*args, **kwargs) - - -def randint(*args, **kwargs): - return DEFAULT_IRON_TENSOR_CLASS.randint(*args, **kwargs) - - -def rand(*args, **kwargs): - return DEFAULT_IRON_TENSOR_CLASS.rand(*args, **kwargs) - - -def arange(*args, **kwargs): - return DEFAULT_IRON_TENSOR_CLASS.arange(*args, **kwargs) - - -def zeros_like(*args, **kwargs): - return DEFAULT_IRON_TENSOR_CLASS.zeros_like(*args, **kwargs) - - -def set_iron_tensor_class(cls): - if not issubclass(cls, Tensor): - raise ValueError( - f"IRON Tensors must inherit from the Tensor class but {cls} does not." 
- ) - global DEFAULT_IRON_TENSOR_CLASS - DEFAULT_IRON_TENSOR_CLASS = cls diff --git a/python/iron/hostruntime/xrtruntime/__init__.py b/python/utils/hostruntime/xrtruntime/__init__.py similarity index 86% rename from python/iron/hostruntime/xrtruntime/__init__.py rename to python/utils/hostruntime/xrtruntime/__init__.py index eafee760124..fcd24d67cbc 100644 --- a/python/iron/hostruntime/xrtruntime/__init__.py +++ b/python/utils/hostruntime/xrtruntime/__init__.py @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 Advanced Micro Devices, Inc. +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. try: import pyxrt as xrt diff --git a/python/utils/hostruntime/xrtruntime/hostruntime.py b/python/utils/hostruntime/xrtruntime/hostruntime.py new file mode 100644 index 00000000000..eb512258909 --- /dev/null +++ b/python/utils/hostruntime/xrtruntime/hostruntime.py @@ -0,0 +1,555 @@ +# SPDX-FileCopyrightText: Copyright (C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +""" +XRT-based implementation of the HostRuntime +""" +import atexit +import logging +from collections import OrderedDict +import os +import time +import weakref +from pathlib import Path +from typing import TYPE_CHECKING +import numpy as np +import pyxrt + +from ..hostruntime import HostRuntime, HostRuntimeError, KernelHandle, KernelResult + +if TYPE_CHECKING: + from aie.iron.device import Device +from .tensor import XRTTensor + + +# XRTKernelHandle(kernel, xclbin, context, insts_path) +class XRTKernelHandle(KernelHandle): + """ + Handle for a loaded XRT kernel. + """ + + def __init__(self, kernel, xclbin, context, insts, insts_bo=None): + """ + Initialize the XRTKernelHandle. + + Args: + kernel: The XRT kernel object. + xclbin: The XRT xclbin object. + context: The XRT context object. + insts: The instructions for the kernel. 
+ insts_bo (optional): The instruction buffer object. Defaults to None. + """ + self.kernel = kernel + self.xclbin = xclbin + self.context = context + self.insts = insts + self.insts_bo = insts_bo + + +class XRTKernelResult(KernelResult): + """A wrapper around data produced as the result of running a kernel with the PyXRT runtime""" + + def __init__( + self, + ret: pyxrt.ert_cmd_state, + npu_time: int, + trace_data: XRTTensor | None = None, + ): + super().__init__(npu_time, trace_data) + self.ret = ret + + def is_success(self) -> bool: + return self.ret == pyxrt.ert_cmd_state.ERT_CMD_STATE_COMPLETED + + +class XRTHostRuntime(HostRuntime): + """Singleton manager for AIE XRT resources.""" + + # TODO: this is duplicated from the LIT helpers. + # NPU Model mappings - centralized for easy updates + # Maps generation name to list of model strings that may appear in xrt-smi + NPU_MODELS = { + "npu1": ["npu1", "Phoenix"], + "npu2": ["npu4", "Strix", "npu5", "Strix Halo", "npu6", "Krackan"], + } + _tensor_class = XRTTensor + + def __init__(self): + """ + Initialize the XRTHostRuntime. + """ + self._device = pyxrt.device(0) + self._device_type_str = self._device.get_info(pyxrt.xrt_info_device.name) + + self.npu_str = None + for key, value in self.NPU_MODELS.items(): + if any([model in self._device_type_str for model in self.NPU_MODELS[key]]): + self.npu_str = key + break + if not self.npu_str: + raise RuntimeError(f"Unknown device type: {self._device_type_str}") + + @classmethod + def read_insts(cls, insts_path: Path): + """ + Reads instructions from the given file, with XRT-specific handling for ELF files. + + Args: + insts_path (Path): Path to the instruction file. + + Returns: + The instructions (either as bytes/array or XRT module). + """ + # Overload the function in the generic class so we can use xrt-specific handling of elf files. 
+ ext = insts_path.suffix.lower() + if ext == ".elf" and hasattr(pyxrt, "module"): + elf = pyxrt.elf(str(insts_path)) + return pyxrt.module(elf) + else: + return super().read_insts(insts_path) + + def load( + self, + npu_kernel, + **kwargs, + ) -> XRTKernelHandle: + """ + Load an NPU kernel into the XRT runtime. + + Args: + npu_kernel: The NPU kernel to load. + **kwargs: Additional arguments for loading. + + Returns: + XRTKernelHandle: A handle to the loaded kernel. + + Raises: + HostRuntimeError: If xclbin or insts files do not exist, or if kernel is not found. + """ + self.check_device_consistency() + xclbin_path = Path(npu_kernel.xclbin_path).resolve() + insts_path = Path(npu_kernel.insts_path).resolve() + kernel_name = npu_kernel.kernel_name + + if not xclbin_path.exists() or not xclbin_path.is_file(): + raise HostRuntimeError( + f"xclbin {xclbin_path} does not exist or is not a file." + ) + if not insts_path.exists() or not insts_path.is_file(): + raise HostRuntimeError( + f"insts {insts_path} does not exist or is not a file." 
+ ) + + xclbin = pyxrt.xclbin(str(xclbin_path)) + self._device.register_xclbin(xclbin) + xclbin_uuid = xclbin.get_uuid() + context = pyxrt.hw_context(self._device, xclbin_uuid) + + if kernel_name is None: + kernels = xclbin.get_kernels() + if not kernels: + raise RuntimeError("No kernels found in xclbin") + kernel_name = kernels[0].get_name() + else: + if not kernel_name in [k.get_name() for k in xclbin.get_kernels()]: + raise HostRuntimeError( + f"Kernel {kernel_name} not found in xclbin (kernels found: {[k.get_name() for k in xclbin.get_kernels()]})" + ) + + insts = self.read_insts(insts_path) + if hasattr(pyxrt, "module") and isinstance(insts, pyxrt.module): + kernel = pyxrt.ext.kernel(context, insts, kernel_name) + else: + kernel = pyxrt.kernel(context, kernel_name) + + kernel_handle = XRTKernelHandle(kernel, xclbin, context, insts) + return kernel_handle + + def run( + self, + kernel_handle: XRTKernelHandle, + args, + trace_config=None, + fail_on_error: bool = True, + **kwargs, + ) -> XRTKernelResult: + """ + Run a loaded XRT kernel. + + Args: + kernel_handle (XRTKernelHandle): The handle to the loaded kernel. + args: Arguments to pass to the kernel. + trace_config (optional): Configuration for tracing. Defaults to None. + fail_on_error (bool, optional): Whether to raise an exception on kernel failure. Defaults to True. + **kwargs: Additional arguments. + + Returns: + XRTKernelResult: The result of the kernel execution. + + Raises: + HostRuntimeError: If arguments are invalid or kernel execution fails (and fail_on_error is True). 
+ """ + self.check_device_consistency() + # Filter out callable functions and check arg types + args = [a for a in args if not callable(a)] + if not all([isinstance(a, self._tensor_class) for a in args]): + raise HostRuntimeError( + f"The {self.__class__.__name__} can only take {self._tensor_class.__name__} as arguments, but got: {args}" + ) + [a.to("npu") for a in args] + buffers = [a.buffer_object() for a in args] + + insts_bo = None + insts_bytes = 0 + try: + is_module = hasattr(pyxrt, "module") and isinstance( + kernel_handle.insts, pyxrt.module + ) + if not is_module: + insts_bytes = kernel_handle.insts.nbytes + if kernel_handle.insts_bo: + insts_bo = kernel_handle.insts_bo + else: + insts_bo = self._tensor_class( + kernel_handle.insts, + flags=pyxrt.bo.cacheable, + group_id=kernel_handle.kernel.group_id(1), + ).buffer_object() + + start = time.time_ns() + h = kernel_handle.kernel(3, insts_bo, insts_bytes, *buffers) + r = h.wait() + stop = time.time_ns() + + if fail_on_error and r != pyxrt.ert_cmd_state.ERT_CMD_STATE_COMPLETED: + raise HostRuntimeError(f"Kernel returned {str(r)}") + finally: + # delete insts buffer if it was created locally + if insts_bo and not kernel_handle.insts_bo: + del insts_bo + + return XRTKernelResult(r, stop - start) + + def device(self) -> "Device": + """ + Get the device associated with this runtime. + + Returns: + Device: The device object (NPU1 or NPU2). + + Raises: + HostRuntimeError: If the device string is unknown. + """ + from aie.iron.device import NPU1, NPU2 + + devices = { + "npu1": NPU1(), + "npu2": NPU2(), + } + + if self.npu_str in devices: + return devices[self.npu_str] + else: + raise HostRuntimeError( + f"Unknown device string: {self.npu_str}: expected npu1 or npu2" + ) + + +class CachedXRTKernelHandle(XRTKernelHandle): + """ + A cached handle for a loaded XRT kernel. + """ + + def __init__(self, kernel, xclbin, context, insts, insts_bo=None): + """ + Initialize the CachedXRTKernelHandle. 
+ + Args: + kernel: The XRT kernel object. + xclbin: The XRT xclbin object. + context: The XRT context object. + insts: The instructions for the kernel. + insts_bo (optional): The instruction buffer object. Defaults to None. + """ + super().__init__(kernel, xclbin, context, insts, insts_bo) + self._is_valid = True + + def invalidate(self): + """ + Invalidate the handle and release resources. + """ + self._is_valid = False + if hasattr(self, "context"): + del self.context + if hasattr(self, "kernel"): + del self.kernel + if hasattr(self, "xclbin"): + del self.xclbin + if hasattr(self, "insts"): + del self.insts + if hasattr(self, "insts_bo"): + del self.insts_bo + + +class CachedXRTRuntime(XRTHostRuntime): + """ + A cached version of XRTHostRuntime that caches up to n contexts, + depending on the type of NPU. + It reuses contexts for the same xclbin (identified by path and mtime). + """ + + # I got these values through experimentation on two machines + # These values are primarily determined by the hardware/driver, and could change + # in the future. But currently, if you exceed these sizes, you will fail to be + # able to create a new context. At the driver level, the cached contexts are + # a system-wide constrained resource, so caching on systems with many concurrent + # processes trying to create contexts (as in parallel CI jobs) can be flaky. + # TODO: use some sort of file system artifact or figure out how to query the driver + # for the state of the cache, and how to make loading operations atomic between processes. + NPU_CONTEXT_CACHE_SIZE = { + "npu1": 6, + "npu2": 32, + } + + def __init__(self): + """ + Initialize the CachedXRTRuntime. 
+ """ + super().__init__() + # We use OrderedDict so that we can use Fifo behavior for LRU eviction policies + self._context_cache = OrderedDict() + self._insts_cache = OrderedDict() + + # Set default from dict if present + self._cache_size = None + if self.npu_str in self.NPU_CONTEXT_CACHE_SIZE.keys(): + self._cache_size = self.NPU_CONTEXT_CACHE_SIZE[self.npu_str] + + # Environment variable always override default values + # TODO: should probably emit warning if exceeds recorded max size. + self._cache_size = os.environ.get("XRT_CONTEXT_CACHE_SIZE", self._cache_size) + + # Error if no default and no env var + if self._cache_size is None: + raise HostRuntimeError(f"No known cache size for {self.npu_str}") + + atexit.register(self.cleanup) + + def cleanup(self): + """ + Clean up the cache by evicting all entries. + """ + while self._context_cache: + self._evict() + while self._insts_cache: + self._evict_insts() + + def _cleanup_entry(self, entry): + context = entry["context"] + handles = entry["handles"] + + # Invalidate all handles + for ref in handles: + handle = ref() + if handle: + handle.invalidate() + + # Explicitly delete context + del context + + def _evict(self): + # Pop the oldest item + key, entry = self._context_cache.popitem(last=False) + self._cleanup_entry(entry) + + def _cleanup_insts_entry(self, entry): + insts_bo = entry["insts_bo"] + del insts_bo + + def _evict_insts(self): + key, entry = self._insts_cache.popitem(last=False) + self._cleanup_insts_entry(entry) + + def run( + self, + kernel_handle: XRTKernelHandle, + args, + trace_config=None, + fail_on_error: bool = True, + only_if_loaded: bool = False, + **kwargs, + ) -> XRTKernelResult: + """ + Run a loaded XRT kernel. + + Args: + kernel_handle (XRTKernelHandle): The handle to the loaded kernel. + args: Arguments to pass to the kernel. + trace_config (optional): Configuration for tracing. Defaults to None. + fail_on_error (bool, optional): Whether to raise an exception on kernel failure. 
Defaults to True. + only_if_loaded (bool, optional): If True, only run if the kernel is currently loaded in the cache. Defaults to False. + **kwargs: Additional arguments. + + Returns: + XRTKernelResult: The result of the kernel execution. + + Raises: + HostRuntimeError: If arguments are invalid, kernel execution fails, or kernel is not loaded (if only_if_loaded=True). + """ + if only_if_loaded: + if ( + isinstance(kernel_handle, CachedXRTKernelHandle) + and not kernel_handle._is_valid + ): + raise HostRuntimeError("Kernel not loaded (evicted from cache)") + + return super().run(kernel_handle, args, trace_config, fail_on_error, **kwargs) + + def load( + self, + npu_kernel, + retry: bool = True, + **kwargs, + ) -> XRTKernelHandle: + """ + Load an NPU kernel into the cached XRT runtime. + + Args: + npu_kernel: The NPU kernel to load. + retry (bool, optional): Whether to retry loading if context creation fails due to resource limits. Defaults to True. + **kwargs: Additional arguments for loading. + + Returns: + XRTKernelHandle: A handle to the loaded kernel. + + Raises: + HostRuntimeError: If xclbin or insts files do not exist, or if kernel is not found. + """ + self.check_device_consistency() + xclbin_path = Path(npu_kernel.xclbin_path).resolve() + insts_path = Path(npu_kernel.insts_path).resolve() + kernel_name = npu_kernel.kernel_name + + if not xclbin_path.exists() or not xclbin_path.is_file(): + raise HostRuntimeError( + f"xclbin {xclbin_path} does not exist or is not a file." + ) + if not insts_path.exists() or not insts_path.is_file(): + raise HostRuntimeError( + f"insts {insts_path} does not exist or is not a file." 
+            )
+
+        xclbin_mtime = xclbin_path.stat().st_mtime
+        insts_mtime = insts_path.stat().st_mtime
+
+        # Context Cache Lookup
+        context_key = (str(xclbin_path), xclbin_mtime)
+
+        try:
+            if context_key in self._context_cache:
+                entry = self._context_cache[context_key]
+                self._context_cache.move_to_end(context_key)
+                context = entry["context"]
+                xclbin = entry["xclbin"]
+                # Clean up dead handles
+                entry["handles"] = [
+                    ref for ref in entry["handles"] if ref() is not None
+                ]
+            else:
+                xclbin = pyxrt.xclbin(str(xclbin_path))
+                xclbin_uuid = xclbin.get_uuid()
+
+                if len(self._context_cache) >= self._cache_size:
+                    self._evict()
+
+                self._device.register_xclbin(xclbin)
+
+                # Try to create context, evicting if necessary
+                context = None
+                retries = 0
+                max_retries = len(self._context_cache) if retry else 0
+                while context is None:
+                    try:
+                        context = pyxrt.hw_context(self._device, xclbin_uuid)
+                    except RuntimeError as e:
+                        # If we hit a resource limit (the driver surfaces this as err=-2 / ENOENT,
+                        # "No such file or directory") and we have items in the cache, try evicting.
+ if ( + "No such file or directory" in str(e) + and self._context_cache + and retries < max_retries + ): + self._evict() + retries += 1 + else: + raise e + + entry = { + "context": context, + "xclbin": xclbin, + "handles": [], + "uuid": xclbin_uuid, + } + self._context_cache[context_key] = entry + + # Kernel Name Resolution + if kernel_name is None: + kernels = xclbin.get_kernels() + if not kernels: + raise RuntimeError("No kernels found in xclbin") + kernel_name = kernels[0].get_name() + else: + if not kernel_name in [k.get_name() for k in xclbin.get_kernels()]: + raise HostRuntimeError( + f"Kernel {kernel_name} not found in xclbin (kernels found: {[k.get_name() for k in xclbin.get_kernels()]})" + ) + + insts = self.read_insts(insts_path) + insts_bo = None + if hasattr(pyxrt, "module") and isinstance(insts, pyxrt.module): + kernel = pyxrt.ext.kernel(context, insts, kernel_name) + else: + kernel = pyxrt.kernel(context, kernel_name) + + # Magic number for RyzenAI group id that will be fixed in the future. 
See same code at XRT: + # https://github.com/Xilinx/XRT/blob/56222ed5cfd119dff0d5bd920735b87024e8c829/src/runtime_src/core/common/api/xrt_module.cpp#L1621 + group_id = kernel.group_id(1) + insts_key = (str(insts_path), insts_mtime, group_id) + + if insts_key in self._insts_cache: + insts_entry = self._insts_cache[insts_key] + self._insts_cache.move_to_end(insts_key) + insts_bo = insts_entry["insts_bo"] + else: + if len(self._insts_cache) >= self._cache_size: + self._evict_insts() + + insts_bo = self._tensor_class( + insts, + flags=pyxrt.bo.cacheable, + group_id=group_id, + ).buffer_object() + + insts_entry = { + "insts_bo": insts_bo, + } + self._insts_cache[insts_key] = insts_entry + + kernel_handle = CachedXRTKernelHandle( + kernel, xclbin, context, insts, insts_bo + ) + entry["handles"].append(weakref.ref(kernel_handle)) + + return kernel_handle + + except Exception: + if context_key in self._context_cache: + entry = self._context_cache[context_key] + # Clean up dead handles + entry["handles"] = [ + ref for ref in entry["handles"] if ref() is not None + ] + if not entry["handles"]: + del self._context_cache[context_key] + self._cleanup_entry(entry) + raise diff --git a/python/iron/hostruntime/xrtruntime/tensor.py b/python/utils/hostruntime/xrtruntime/tensor.py similarity index 58% rename from python/iron/hostruntime/xrtruntime/tensor.py rename to python/utils/hostruntime/xrtruntime/tensor.py index 23882b5a10f..3c803a613f0 100644 --- a/python/iron/hostruntime/xrtruntime/tensor.py +++ b/python/utils/hostruntime/xrtruntime/tensor.py @@ -4,13 +4,13 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 Advanced Micro Devices, Inc. +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. 
import numpy as np import pyxrt as xrt -from ..tensor import Tensor -from ....helpers.util import np_ndarray_type_get_shape +from ..tensor_class import Tensor +from aie.helpers.util import np_ndarray_type_get_shape class XRTTensor(Tensor): @@ -22,7 +22,26 @@ class XRTTensor(Tensor): """ - def __init__(self, shape_or_data, dtype=np.uint32, device="npu"): + def __init__( + self, + shape_or_data, + dtype=np.uint32, + device="npu", + flags=xrt.bo.host_only, + group_id=0, + ): + """ + Initialize the XRTTensor. + + Args: + shape_or_data (tuple or array-like): + - If a tuple, creates a new tensor with the given shape and dtype. + - If array-like, wraps the data into a tensor with optional dtype casting. + dtype (np.dtype, optional): Data type of the tensor. Defaults to np.uint32. + device (str, optional): Device string identifier. Defaults to 'npu'. + flags (optional): XRT buffer object flags. Defaults to xrt.bo.host_only. + group_id (int, optional): XRT buffer object group ID. Defaults to 0. + """ super().__init__(shape_or_data, dtype=dtype, device=device) device_index = 0 self.xrt_device = xrt.device(device_index) @@ -43,16 +62,17 @@ def __init__(self, shape_or_data, dtype=np.uint32, device="npu"): self._shape = np_data.shape # Ideally, we use xrt::ext::bo host-only BO but there are no bindings for that currently. - # Eventually, xrt:ext::bo uses the 0 magic number that shall be fixed in the future. + + # Eventually, xrt:ext::bo uses the 0 magic number that shall be fixed in the future, so that is used as a default. 
# https://github.com/Xilinx/XRT/blob/9b114f18c4fcf4e3558291aa2d78f6d97c406365/src/runtime_src/core/common/api/xrt_bo.cpp#L1626 - group_id = 0 - self.bo = xrt.bo( + self._bo = xrt.bo( self.xrt_device, int(np.prod(self._shape) * np.dtype(self.dtype).itemsize), - xrt.bo.host_only, + flags, group_id, ) - ptr = self.bo.map() + + ptr = self._bo.map() self._data = np.frombuffer(ptr, dtype=self.dtype).reshape(self._shape) if not isinstance(shape_or_data, tuple): @@ -65,17 +85,35 @@ def __init__(self, shape_or_data, dtype=np.uint32, device="npu"): @property def data(self): + """ + Get the underlying numpy array. + + Returns: + np.ndarray: The underlying data. + """ return self._data @property def shape(self): + """ + Get the shape of the tensor. + + Returns: + tuple: The shape of the tensor. + """ return self._shape def _sync_to_device(self): - return self.bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) + """ + Syncs the tensor data from the host to the device memory. + """ + return self._bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) def _sync_from_device(self): - return self.bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) + """ + Syncs the tensor data from the device to the host memory. + """ + return self._bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) def __del__(self): """ @@ -83,15 +121,15 @@ def __del__(self): Releases associated device memory (e.g., XRT buffer object). """ - if hasattr(self, "bo"): - del self.bo - self.bo = None + if hasattr(self, "_bo"): + del self._bo + self._bo = None def buffer_object(self): """ Returns the XRT buffer object associated with this tensor. Returns: - xrt.bo: The XRT buffer object associated with this tensor. + buffer_object: The XRT buffer object associated with this tensor. 
""" - return self.bo + return self._bo diff --git a/python/iron/hostruntime/jit.py b/python/utils/jit.py similarity index 51% rename from python/iron/hostruntime/jit.py rename to python/utils/jit.py index 090195ffe69..e81fd54c1e8 100644 --- a/python/iron/hostruntime/jit.py +++ b/python/utils/jit.py @@ -4,71 +4,20 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 Advanced Micro Devices, Inc. +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. import os import functools import hashlib -import fcntl -import contextlib -import time from aie.extras.context import mlir_mod_ctx -from ..device import NPU1, NPU2, NPU1Col1, NPU2Col1 -from ..compile import compile_mlir_module, compile_external_kernel -from ..kernel import ExternalFunction -from .config import get_current_device -from .kernelrunner import NPUKernel +from .compile import compile_mlir_module, compile_external_kernel +from .npukernel import NPUKernel from aie.dialects.aie import AIEDevice -from ..compile.cache.circular_cache import CircularCache -from ..compile.cache.utils import _create_function_cache_key -from ..compile import IRON_CACHE_HOME -from ..compile.utils import _cleanup_failed_compilation - - -@contextlib.contextmanager -def file_lock(lock_file_path, timeout_seconds=60): - """ - Context manager for file locking using flock to prevent race conditions. 
- - Args: - lock_file_path (str): Path to the lock file - timeout_seconds (int): Maximum time to wait for lock acquisition in seconds - """ - lock_file = None - try: - # Create lock file if it doesn't exist - os.makedirs(os.path.dirname(lock_file_path), exist_ok=True) - try: - f = os.open(lock_file_path, os.O_CREAT | os.O_EXCL) - os.close(f) - except FileExistsError: - pass # File already exists - lock_file = open(lock_file_path, "a") - - # Try to acquire exclusive lock with timeout - start_time = time.time() - while True: - try: - fcntl.flock(lock_file.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB) - break - except OSError: - # Lock is held by another process - if time.time() - start_time > timeout_seconds: - raise TimeoutError( - f"Could not acquire lock on {lock_file_path} within {timeout_seconds} seconds" - ) - time.sleep(0.1) - - yield lock_file - - finally: - if lock_file is not None: - try: - fcntl.flock(lock_file.fileno(), fcntl.LOCK_UN) - except OSError: - pass # Ignore errors when releasing lock - lock_file.close() +from .compile.cache.circular_cache import CircularCache +from .compile.cache.utils import _create_function_cache_key, file_lock +from .compile import NPU_CACHE_HOME +from .compile.utils import _cleanup_failed_compilation # Global cache for compiled kernels at the function level @@ -79,17 +28,30 @@ def file_lock(lock_file_path, timeout_seconds=60): def jit(function=None, is_placed=True, use_cache=True): """ - Decorator to compile an IRON kernel into a binary to run on the NPU. + Decorator to compile an NPU kernel into a binary to run on the NPU. + + Args: + function (callable, optional): The function to compile. + is_placed (bool, optional): Whether the kernel is using explicit or implicit placement. Defaults to True. + use_cache (bool, optional): Use cached MLIR module if available. Defaults to True. - Parameters: - - is_placed (bool): Whether the kernel is using explicit or implicit placement Defaults to True. 
- - use_cache (bool): Use cached MLIR module if available. Defaults to True. + Returns: + callable: The decorated function. """ if function is None: return functools.partial(jit, is_placed=is_placed, use_cache=use_cache) @functools.wraps(function) def decorator(*args, **kwargs): + from aie.iron.device import NPU1, NPU2, NPU1Col1, NPU2Col1 + from aie.iron.kernel import ExternalFunction + from . import DefaultNPURuntime + + if DefaultNPURuntime is None: + raise Exception("Cannot use JIT; DefaultNPURuntime not set.") + + trace_config = kwargs.get("trace_config") + # Check if we already have a compiled kernel for this function signature cache_key = _create_function_cache_key(function, args, kwargs) if cache_key in _compiled_kernels: @@ -109,18 +71,15 @@ def decorator(*args, **kwargs): external_kernels.append(value) # Execute the function to generate MLIR - try: - if is_placed: - with mlir_mod_ctx() as ctx: - function(*args, **kwargs) - assert ( - ctx.module.operation.verify() - ), f"Verification failed for '{function.__name__}'" - mlir_module = ctx.module - else: - mlir_module = function(*args, **kwargs) - except Exception as e: - raise + if is_placed: + with mlir_mod_ctx() as ctx: + function(*args, **kwargs) + assert ( + ctx.module.operation.verify() + ), f"Verification failed for '{function.__name__}'" + mlir_module = ctx.module + else: + mlir_module = function(*args, **kwargs) # Compile all ExternalFunction instances that were created during this JIT compilation for func in ExternalFunction._instances: @@ -130,28 +89,25 @@ def decorator(*args, **kwargs): external_kernels.append(func) # Determine target architecture based on device type - try: - current_device = get_current_device() - - # Determine target architecture based on device type - if isinstance(current_device, (NPU2, NPU2Col1)): - target_arch = "aie2p" - elif isinstance(current_device, (NPU1, NPU1Col1)): - target_arch = "aie2" - elif current_device in (AIEDevice.npu2, AIEDevice.npu2_1col): - target_arch = 
"aie2p" - elif current_device in (AIEDevice.npu1, AIEDevice.npu1_1col): - target_arch = "aie2" - else: - raise RuntimeError(f"Unsupported device type: {type(current_device)}") - except Exception as e: - raise + current_device = DefaultNPURuntime.device() + + # Determine target architecture based on device type + if isinstance(current_device, (NPU2, NPU2Col1)): + target_arch = "aie2p" + elif isinstance(current_device, (NPU1, NPU1Col1)): + target_arch = "aie2" + elif current_device in (AIEDevice.npu2, AIEDevice.npu2_1col): + target_arch = "aie2p" + elif current_device in (AIEDevice.npu1, AIEDevice.npu1_1col): + target_arch = "aie2" + else: + raise RuntimeError(f"Unsupported device type: {type(current_device)}") # Hash of the IR string, ExternalFunction compiler options, and target architecture module_hash = hash_module(mlir_module, external_kernels, target_arch) - kernel_dir = os.path.join(IRON_CACHE_HOME, f"{module_hash}") - lock_file_path = os.path.join(kernel_dir, ".lock") - mlir_path = os.path.join(kernel_dir, "aie.mlir") + kernel_dir = NPU_CACHE_HOME / f"{module_hash}" + lock_file_path = kernel_dir / ".lock" + mlir_path = kernel_dir / "aie.mlir" # Use file locking to prevent race conditions when accessing cache directory with file_lock(lock_file_path): @@ -161,8 +117,8 @@ def decorator(*args, **kwargs): # Write MLIR to file if not already cached inst_filename = "insts.bin" xclbin_filename = "final.xclbin" - xclbin_path = os.path.join(kernel_dir, xclbin_filename) - inst_path = os.path.join(kernel_dir, inst_filename) + xclbin_path = kernel_dir / xclbin_filename + inst_path = kernel_dir / inst_filename xclbin_exists = os.path.exists(xclbin_path) inst_exists = os.path.exists(inst_path) @@ -188,17 +144,13 @@ def decorator(*args, **kwargs): _cleanup_failed_compilation(kernel_dir) raise e - kernel_name = "MLIR_AIE" - try: - kernel = NPUKernel(xclbin_path, inst_path, kernel_name=kernel_name) - - # Cache the kernel for this function signature - 
_compiled_kernels[cache_key] = kernel - - result = kernel(*args, **kwargs) - return result - except Exception as e: - raise + _compiled_kernels[cache_key] = NPUKernel( + xclbin_path, + inst_path, + kernel_name="MLIR_AIE", + trace_config=trace_config, + ) + _compiled_kernels[cache_key](*args) return decorator @@ -206,13 +158,20 @@ def decorator(*args, **kwargs): def hash_module(module, external_kernels=None, target_arch=None): """ Hash the MLIR module and ExternalFunction compiler options to create a unique identifier. + + Args: + module: The MLIR module. + external_kernels (list, optional): List of external kernels. Defaults to None. + target_arch (str, optional): Target architecture. Defaults to None. + + Returns: + str: The hash string. """ mlir_str = str(module) # Include ExternalFunction compiler options and source code in the hash if external_kernels: running_hash = "" - source_contents = [] for func in external_kernels: running_hash += str(hash(func)) diff --git a/python/utils/ml.py b/python/utils/ml.py index c44c2822155..8160a557e9b 100644 --- a/python/utils/ml.py +++ b/python/utils/ml.py @@ -4,7 +4,16 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. +""" +ML related utilties + +* class `CSVLogger` +* `load_class_label` +* `unpickle` +* `fuse_single_conv_bn_pair` +* class `DataShaper` +""" import csv import json import math diff --git a/python/utils/npukernel.py b/python/utils/npukernel.py new file mode 100644 index 00000000000..dd99fe06839 --- /dev/null +++ b/python/utils/npukernel.py @@ -0,0 +1,99 @@ +# npukernel.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. +from pathlib import Path +from .trace import TraceConfig + + +class NPUKernel: + """ + Represents a compiled NPU kernel. + """ + + def __init__( + self, + xclbin_path, + insts_path, + device_index=0, + kernel_name="MLIR_AIE", + trace_config: TraceConfig | None = None, + ): + """ + Initialize the NPUKernel. + + Args: + xclbin_path (str | Path): Path to the xclbin file. + insts_path (str | Path): Path to the instructions file. + device_index (int, optional): Device index. Defaults to 0. + kernel_name (str, optional): Name of the kernel. Defaults to "MLIR_AIE". + trace_config (TraceConfig | None, optional): Trace configuration. Defaults to None. + """ + self._xclbin_path = xclbin_path + self._insts_path = insts_path + self._kernel_name = kernel_name + self._trace_config = trace_config + + @property + def trace_config(self) -> TraceConfig | None: + """ + Get the trace configuration. + + Returns: + TraceConfig | None: The trace configuration. + """ + return self._trace_config + + @property + def xclbin_path(self): + """ + Get the path to the xclbin file. + + Returns: + str | Path: The xclbin path. + """ + return self._xclbin_path + + @property + def insts_path(self): + """ + Get the path to the instructions file. + + Returns: + str | Path: The instructions path. + """ + return self._insts_path + + @property + def kernel_name(self): + """ + Get the kernel name. + + Returns: + str: The kernel name. + """ + return self._kernel_name + + # Blocking call. + def __call__(self, *args, **kwargs): + """ + Run the kernel with the given arguments. + This is a blocking call. + + Args: + *args: Arguments passed to the kernel. + **kwargs: Additional arguments passed to the runtime load_and_run method. + + Returns: + KernelResult: The result of the kernel execution. + """ + from . 
import DefaultNPURuntime + + return DefaultNPURuntime.load_and_run( + self, + list(args), + **kwargs, + ) diff --git a/python/utils/test.py b/python/utils/test.py index ca4ef472b82..a104c992737 100644 --- a/python/utils/test.py +++ b/python/utils/test.py @@ -4,10 +4,19 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. +# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. +""" +Test/ Host code utilities. +* `create_default_argparser` + * This creates a ArgumentParser with the following args: --xclbin, --kernel, --instr, -v, --verify, --iters, --warmup, --trace_sz, --trace_file + * It returns the ArgumentParser which allows you to add more arguments +* `parse_args` + * Calls create_default_argparser and returns the parsed results + * Useful if you don't need additional custom args +""" import argparse -import pyxrt as xrt +from aie.utils import TraceConfig, NPUKernel # Add default args to standard parser object @@ -56,76 +65,75 @@ def create_default_argparser(): ) p.add_argument( "-t", - "--trace_sz", + "--trace-sz", dest="trace_size", default=0, type=int, help="trace size in bytes", ) p.add_argument( - "--trace_file", + "--trace-file", dest="trace_file", default="trace.txt", help="where to store trace output", ) p.add_argument( "-i1s", - "--in1_size", + "--in1-size", dest="in1_size", default=0, help="Input 1 buffer size in bytes", ) p.add_argument( "-i2s", - "--in2_size", + "--in2-size", dest="in2_size", default=0, help="Input 2 buffer size in bytes", ) p.add_argument( "-os", - "--out_size", + "--out-size", dest="out_size", default=0, help="Output buffer size in bytes", ) + p.add_argument( + "--trace-after-output", + dest="trace_after_output", + action="store_true", + help="Trace after output", + ) + p.add_argument( + "--enable-ctrl-pkts", + dest="enable_ctrl_pkts", + action="store_true", + help="Enable control packets", + ) return p +def 
create_npu_kernel(opts): + trace_config = None + trace_size = getattr(opts, "trace_size", 0) + if trace_size > 0: + trace_config = TraceConfig( + trace_size=trace_size, + trace_file=getattr(opts, "trace_file", "trace.txt"), + trace_after_last_tensor=getattr(opts, "trace_after_output", False), + enable_ctrl_pkts=getattr(opts, "enable_ctrl_pkts", False), + ) + opts.npu_kernel = NPUKernel( + xclbin_path=opts.xclbin, + insts_path=opts.instr, + kernel_name=opts.kernel, + trace_config=trace_config, + ) + return opts + + # options def parse_args(args): p = create_default_argparser() - return p.parse_args(args) - - -# -# Create new device and kernel based on xclbin -# -# If you want to setup XRT buffers as well, look at xrt.py/setup_aie -# to setup your environment -# -def init_xrt_load_kernel(opts): - # Get a device handle - device = xrt.device(0) - - # Load the xclbin - xclbin = xrt.xclbin(opts.xclbin) - - # Load the kernel - kernels = xclbin.get_kernels() - try: - xkernel = [k for k in kernels if opts.kernel in k.get_name()][0] - except: - print(f"Kernel '{opts.kernel}' not found in '{opts.xclbin}'") - exit(-1) - - # Register xclbin - device.register_xclbin(xclbin) - - # Get a hardware context - context = xrt.hw_context(device, xclbin.get_uuid()) - - # get a kernel handle - kernel = xrt.kernel(context, xkernel.get_name()) - - return (device, kernel) + opts = p.parse_args(args) + return create_npu_kernel(opts) diff --git a/python/utils/README.md b/python/utils/trace/__init__.py similarity index 84% rename from python/utils/README.md rename to python/utils/trace/__init__.py index f54d2cd5310..7ab875af8b7 100644 --- a/python/utils/README.md +++ b/python/utils/trace/__init__.py @@ -1,54 +1,15 @@ - - -# Python Utilities - -The python utilties are designed to simplify commonly repeated tasks and wrap them up into helper functions. 
They are divided into separate categories and can be added to any python code via: -``` -import aie.utils.trace as trace_utils -import aie.utils.test as test_utils -improt aie.utils.xrt as xrt_utils -``` -Thereafter, functions defined in the particular utils file such as `trace_utils` can be called via `trace_utils.configure_packet_tracing_aie2(...)`. - -- [Test utilities](#test-utilites-testpy) ([test.py](./test.py)) -- [Trace utilities](#trace-utilites-tracepy) ([trace.py](./trace.py)) - - [Trace Mechanisms and Explanations](#trace-mechanisms-and-explanations) - - [Trace parser](#trace-parser-parse_tracepy) ([parse_trace.py](./parse_trace.py)) - - [Trace parser - eventIR based](#trace-parser---eventir-based-parse_eventirpy) ([parse_eventIR.py](./parse_eventIR.py)) -- [XRT utilities](#xrt-utilites-xrtpy) ([xrt.py](./xrt.py)) -- [Machine Learning (ML) utilities](#machine-language-ml-utilites-mlpyss) ([ml.py](./ml.py)) - -## Test utilites ([test.py](./test.py)) -Test/ Host code utilities. -* `create_default_argparser` - * This creates a ArgumentParser with the following args: --xclbin, --kernel, --instr, -v, --verify, --iters, --warmup, --trace_sz, --trace_file - * It returns the ArgumentParser which allows you to add more arguments -* `parse_args` - * Calls create_default_argparser and returns the parsed results - * Useful if you don't need additional custom args -* `init_xrt_load_kernel` - * Helpful wrapper for a number of commonly used XRT calls in `test.py` - * Declare an XRT `device` - * Load the xclbin file and register the `xclbin` - * Declare hardware context and use that to return the `device` and `kernel` - - -## Trace utilites ([trace.py](./trace.py)) +# SPDX-FileCopyrightText: Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +""" +Trace utilites Trace utilities are designed to take the low level tile cofigurations need to configure trace and wrap them into convenient wrapper functions. We will go over descriptions of the functions and then dive deeper into Trace mechanisms and explanations. * `class GenericEvent` * `PortEventCodes`, `MemTileEventcodes`, `ShimTileEventCodes` * event codes for port events for the different tile types: core, memtile, shimtile -* `class PacketType` +* `class PacketType` * We use the packet type field in the packet header to help differentiate the tile that the packet came from. Since packet types don't inherently have meaning, we assign numerical values to each tile type: core, mem (for core), shimtilem, memtile * `class PortEvent`, `class MemTilePortEvent`, `class ShimTilePortEvent` * class for port events to define accesses and `get_register_writes` @@ -73,9 +34,9 @@ * Configures timer in each tile type to reset based on an `event` * `configure_broadcast_core_aie2` - * Configure broadcast event based on an internal triggered event. + * Configure broadcast event based on an internal triggered event. function arguments: - * `num` - broadcaast number we want to broadcast on + * `num` - broadcaast number we want to broadcast on * `event` - the triggering broadcast event * `configure_event_gen_core_aie2` @@ -89,10 +50,10 @@ 1. Configure core tile based on start/ stop, events, and flow id. The flow id needs to be unique per flow. 2. Configure timer based on broadcast event (default is 15). This ensures all tiles keying off this event has a synchronized timer so their trace are synchronized. This event is also used as the start event for tracing. 3. Configure shim tile to receive this flow and move the data to offset/ size. - It does this by calling `configure_coretile_tracing_aie2`, `configure_time_ctrl_coretile_aie2` and `configure_shimtile_dma_tracing_aie2`. 
+ It does this by calling `configure_coretile_tracing_aie2`, `configure_time_ctrl_coretile_aie2` and `configure_shimtile_dma_tracing_aie2`. * `configure_packet_tracing_flow` - * Wrapper around packeflows to itereate over tiles_to_trace and route them to the shim for outputing the trace to L3 memory. This uses default values for the packet id that increases for each tile we trace, starting with 1. This should match the tile trace config that's set by configure_coretile_packet_tracing_aie2. + * Wrapper around packeflows to itereate over tiles_to_trace and route them to the shim for outputing the trace to L3 memory. This uses default values for the packet id that increases for each tile we trace, starting with 1. This should match the tile trace config that's set by configure_coretile_packet_tracing_aie2. * *NOTE* - Because we do it this way, we inherently cannot trace more than 31 tiles. Function arguments: @@ -120,7 +81,7 @@ * `configure_packet_tracing_aie2` (packet switched multi-tile tracing) * This wrapper function iterates over the `tiles_to_trace` array and calls the right version of `configure_*tile_packet_tracing_aie2`. A key distinction is made to choose the right start and stop event depending on the tile type. We pass in 3 sets of optional event arguments that allows them to be customized depending on the tile type. - + Function arguments: * `tiles to trace` - array of tiles to trace * `shim tile` - Single shim tile to configure for writing trace packets to DDR @@ -143,7 +104,7 @@ ``` * `configure_simple_tracing_aie2` (**DEPRECATED** cicuit switched single tile tracing) - * This function abstracts a number of python functions for configuring a core tile and an associated shim tile. It does not define the trace packet routing between the two however. + * This function abstracts a number of python functions for configuring a core tile and an associated shim tile. It does not define the trace packet routing between the two however. 
Function arguments: * `channel` - S2MM channel used @@ -178,7 +139,7 @@ PortEvent(CoreEvent.PORT_RUNNING_0, 1, True), # master(1) PortEvent(CoreEvent.PORT_RUNNING_1, 1, False), # slave(1) ] - ``` + ``` A more common use case might be: ```python @@ -190,18 +151,18 @@ * Additional helper functions can be found in the `trace.py` and are documented in the source directly. -## Trace Mechanisms and Explanations +Trace Mechanisms and Explanations The basic concept for trace configuration as summarized in [section-4b](../../programming_guide/section-4/section-4b/). MOre details about the trace hardware can be found for AIE-ML/AIE2 at [am020](https://docs.amd.com/r/en-US/am020-versal-aie-ml/Trace). ### Trace Packet Routing -Digging one level lower, tracing can be configured such that trace data is moved via circuit switch routing or packet switched routing. The deprecated `configure_simple_tracing_aie2` uses circuit switch tracing but this mechanism utilizes a dedicate stream along the stream switch path and limits the number of parallel tiles that can be traced. The preferred default mechanism is to use packet swtiched routing instead. This has the benefit of using a shared stream to route multiple tiles' trace packets. In practice, if a large amount trace data is being produced among a large number of tiles and aggregated into a single stream, there can be a limit to how much data that stream can support which may exert back pressure can cause overrun of trace data leading to invalid trace results. One limitation of packet switched routing is the additional packet header prepended to each packet (32b header for 7x 32b of data payload). This reduces the effective bandwidth of the trace data but the benefit of packet switched routing far outweigh this overhead limitation. +Digging one level lower, tracing can be configured such that trace data is moved via circuit switch routing or packet switched routing. 
The deprecated `configure_simple_tracing_aie2` uses circuit switch tracing but this mechanism utilizes a dedicate stream along the stream switch path and limits the number of parallel tiles that can be traced. The preferred default mechanism is to use packet swtiched routing instead. This has the benefit of using a shared stream to route multiple tiles' trace packets. In practice, if a large amount trace data is being produced among a large number of tiles and aggregated into a single stream, there can be a limit to how much data that stream can support which may exert back pressure can cause overrun of trace data leading to invalid trace results. One limitation of packet switched routing is the additional packet header prepended to each packet (32b header for 7x 32b of data payload). This reduces the effective bandwidth of the trace data but the benefit of packet switched routing far outweigh this overhead limitation. ### Trace Array Level Configuration (packet switched routing) We have already discussed configuring individual trace units in each tile to enable tracing and packetization of the trace data, configuring packet flows to route the trace data packets to a shim, and configuring the shim to write that data to DDR. However, a key aspect of full array level configuration involves supporting multi-tile trace which requires synchronization of trace data. This is done via using broadcasted user events as both local timer reset and start and stop synchronization, as explained below: 1. Configure shim to generate a custom user event (#1) and broadcast event (#15) throughout array -2. Reset all timers in shim and traced tiles based on this broadcast event so all timers are synchronized. NOTE: In practice, there is a slight delay since the delay of this signal can be a few clock cycles between tiles. +2. Reset all timers in shim and traced tiles based on this broadcast event so all timers are synchronized. 
NOTE: In practice, there is a slight delay since the delay of this signal can be a few clock cycles between tiles. 3. Configure tiles to use this broadcast event (#15) as the start event 4. Continue with rest of runtime sequence (e.g. data movement for input and output buffers) 5. Generate another user event (#0) and broadcast event (#14) throughout array. This will be used as the trace stop event for all tiles @@ -238,7 +199,7 @@ `PortEvent` is defined in [trace.py](../../python/utils/trace.py) and `CoreEvent` is defined in [trace_events_enum.py](../../python/utils/trace_events_enum.py). Likewise for memtiles and shimtiles, we have `MemTilePortEvent` and `ShimTilePortEvent` in [trace.py](../../python/utils/trace.py) and `MemTileEvent` and `ShimTileEvent` are in [trace_events_enum.py](../../python/utils/trace_events_enum.py). ### Configure tile trace settings -Under the hood of `configure_coretile_tracing_aie2`/ `configure_memtile_tracing_aie2`/ `configure_shimtile_tracing_aie2`, we perform trace configurations by writing specific values to trace configuration registers. This is done within the `aiex.runtime_sequence` block, where we call a set of configuration register writes (`aiex.npu.write32`) to configure the tile trace units and (`aiex.npu.writebd`) to configure the shimDMA. +Under the hood of `configure_coretile_tracing_aie2`/ `configure_memtile_tracing_aie2`/ `configure_shimtile_tracing_aie2`, we perform trace configurations by writing specific values to trace configuration registers. This is done within the `aiex.runtime_sequence` block, where we call a set of configuration register writes (`aiex.npu.write32`) to configure the tile trace units and (`aiex.npu.writebd`) to configure the shimDMA. For a give AIE2 tile, we configure the trace control registers for the tile core and tile memory separately. There are 4 registers we generally use to configure the trace unit behavior. 
2 are for configuring the general trace control and the other 2 are to specify which events our tile's trace hardware is monitoring. @@ -247,7 +208,7 @@ | Config Register | Address | Field | Bits | Reset Value | Description | |-----------------|---------|-------|------|-------------|-------------| -| Trace Control 0 | 0x340D0 | Stop Event | [30:24], 0xNN------ | 0 | Event to stop trace capture | +| Trace Control 0 | 0x340D0 | Stop Event | [30:24], 0xNN------ | 0 | Event to stop trace capture | | Trace Control 0 | 0x340D0 | Start Event | [22:16], 0x--NN---- | 0 | Event to start trace capture | | Trace Control 0 | 0x340D0 | Mode | [1:0], 0x-------N | 0 | Trace mode. 00=event-time, 01=event-PC, 10=execution | | Trace Control 1 | 0x340D4 | Packet Type | [14:12], 0x----N--- | 0 | Detination trace packet - packet type | @@ -304,10 +265,10 @@ | Core Instruction - Event 1 |INSTR_EVENT_1| 0x22| 34 | | Vector Instructions (e.g. VMAC, VADD, VCMP) |INSTR_VECTOR| 0x25| 37 | | Lock acquire requests |INSTR_LOCK_ACQUIRE_REQ| 0x2C| 44 | -| Lock release requests |INSTR_LOCK_RELEASE_REQ| 0x2D| 45 | +| Lock release requests |INSTR_LOCK_RELEASE_REQ| 0x2D| 45 | | Lock stall |LOCK_STALL| 0x1A| 26 | | Core Port Running 1 |PORT_RUNNING_1| 0x4F| 79 | -| Core Port Running 0 |PORT_RUNNING_0| 0x4B| 75 | +| Core Port Running 0 |PORT_RUNNING_0| 0x4B| 75 | **NOTE**: The "Core Instruction - Event 0/1" are special intrinsics you can add to your kernel code to trigger an event during the running of your core program. 
Within the kernel code, they look like: ```c++ @@ -328,7 +289,7 @@ // Vector instrucitons (0x25) // Core Instruction - Event 0 (0x21) // Core Instruction - Event 1 (0x22) -// Core Port Running 0 (0x4B) +// Core Port Running 0 (0x4B) aiex.npu.write32 { column = 0 : i32, row = 4 : i32, address = 0x340E0 : ui32, value = 0x4B222125 : ui32 } // Events 4-7 monitored @@ -346,7 +307,7 @@ npu_write32(column=0, row=4, address=0x340E4, value=*events[4:8],) ``` -Some configurations like the Port Running 0/1 events are further configured by a secondary configuration register. In this case, we route the port activity from the stream switch to Port running 0 or 1. +Some configurations like the Port Running 0/1 events are further configured by a secondary configuration register. In this case, we route the port activity from the stream switch to Port running 0 or 1. | Config Register | Address | Field | Bits | Reset Value | Description | |-----------------|---------|-------|------|-------------|-------------| | Stream Switch Event Port Selection 1 | 0x3FF04 | Port 7 Master/Slave | [29], 0xN------- | 0 | Master or slave for port 7, 1=master, 0=slave | @@ -399,7 +360,7 @@ * `buffer_offset` - specifies in bytes where the trace buffer starts in the output buffer and occurs after the main output buffer ends. If the output buffer size in words is 65,536, then the buffer offset would be 4*65,536 = 262,144 bytes. * `bd_id` - unique bd (out of 16 bds) to program for data movement. Since we're delcaring this manually, it's important that we dont' overlap with existing (and possibly auto-declared bd id values). In the matmul design, we needed this value to be at least 13. * `column` - this shimDMA's column -* `ddr_id` - very important to indicate which inout buffer DDR region we're mapping to. +* `ddr_id` - very important to indicate which inout buffer DDR region we're mapping to. 
An example ddr_id to inout buffer mapping is below: | ddr ID value | buffer | group_id | @@ -482,9 +443,9 @@ use_next_bd=0, valid_bd=1, ) -``` +``` -### Trace parser ([parse_trace.py](./parse_trace.py)) +### Trace parser ([parse_trace.py](./parse.py)) The text file generated by the host code (`test.cpp` or `test.py`) are formatted as 32-bit hex values, one per line. This python script parses the raw trace packet data and creates a waveform json file for view on Perfetto http://ui.perfetto.dev. The script syntax is: ```bash @@ -495,11 +456,11 @@ * **--mlir** : MLIR source. This is needed to parse what events and tiles we are monitoring to generate labels for our waveform visualizer. * **--colshift (optional)** : runtime column shift. This specifies how much the actual design was shifted from the default position when it was scheduled and called. The reason we need this is becuase even if our design is configured for column 0, the actual loading and execution of the design may place it in column 1, 2, 3 etc. We account for this shift since the parser needs to match the actual column location of the generated trace data. For npu devices (phoenix), this is typically 1 while npu2 (strix) uses 0. The script should be able to automatically figure out the starting column and set this correctly but can be overrided via this argument. - **NOTE** - the underlying tools currently default to column 1 to avoid using column 0 on Ryzen AI since that column does not have a shimDMA and is therefore avoided at the moment. + **NOTE** - the underlying tools currently default to column 1 to avoid using column 0 on Ryzen AI since that column does not have a shimDMA and is therefore avoided at the moment. 
* **--output** : output json file -### Trace parser - eventIR based ([parse_eventIR.py](./parse_eventIR.py)) +### Trace parser - eventIR based ([event_ir.py](./event_ir.py)) The text file generated by the host code (`test.cpp` or `test.py`) are formatted as 32-bit hex values, one per line. This python script executes a number of steps in order to transform it from trace packet text file into a waveform json file. **NOTE** - There seems to be some inconsistencies in the results generated by this parser. As of now, it is used to compare to existing the `hwfrontend` tool only. @@ -526,7 +487,7 @@ We prepend `0x` before each hex line and save it `prep.` since the `hwfrontend` utility expects it. #### 2. Parse MLIR to build event table -The MLIR parser is pretty rudimentary as it scans the source mlir file looking for `aiex.npu.write32` calls and does a pattern match for trace unit config address and then grab the hex events, which it looks up from an internal table to provide waveform labels. It would be better to use an MLIR pass that already has the config information and cross reference it with a more official event-to-label lookup table instead. +The MLIR parser is pretty rudimentary as it scans the source mlir file looking for `aiex.npu.write32` calls and does a pattern match for trace unit config address and then grab the hex events, which it looks up from an internal table to provide waveform labels. It would be better to use an MLIR pass that already has the config information and cross reference it with a more official event-to-label lookup table instead. #### 3. Create .target file Create a dummy file (`.target`) in the `tmpTrace` with the file content 'hw' since `hwfrontend` utility expects it. @@ -537,7 +498,7 @@ #### 5. Run Vitis/aietools hwfrontend utility to parse raw trace data --> generates eventIR.txt This is the main parse utility that generates a much more friendly eventIR file format. 
This utilty is the same one used by the adf tools for aiesimulator. However, the utility is very particular and some combinations of trace packet data might confuse the parser or cause an error. See the **Tips** section at the end for workarounds to known issues. -#### 6. Convert eventIR.txt to perfetto_compatible.json +#### 6. Convert eventIR.txt to perfetto_compatible.json While the Perfetto-compliant json file format is not the same as the eventIR file format. The conversion between them is more straightforward that between trace packets and Perfetto-compliant json. Having said that, it is still possible this pass to be further tested and improved. #### Tips @@ -560,43 +521,31 @@ 0x0005d0f7 ``` which reduces the timer from 11,091,042 cycles to 381,175 seems to fix it. - -## XRT utilites ([xrt.py](./xrt.py)) -XRT wrapped utilities. These classes and utilities help simplify the the declaration and instantiation of XRT components in the host code. - -In particular, `setup_and_run_aie` is a helpful convenience wrapper to simplify the setup and runnin of kernel with 1 or 2 inputs buffers and 1 output buffer. See [vector_scalar_mul](../../programming_examples/basic/vector_scalar_mul/) for an template example of how this is ued. - -* class `AIE_Application` - * This class configures and invokes the XRT components needed to run an AIE Application. This includes xrt.device, xrt.kernel, xrt.hw_context and XRT buffers as enacpuslated by the AIE_Buffer class. You can use this class to simplify and reduce the amount of code needed to set up an AIE application. - * `__init__` - Registers xclbin to set up the device, hw context and kernel. 
This also sets up the instruction stream - * `register_buffer` - Registers an AIE_Buffer class object given group_id, datatype and shape - * `run` - This syncs the instruction buffer to the device and then invokes the `call` function before wait for the call to complete - * `call` - Wrapper for xrt.kernel function passing in opcode and buffers objects -* class `AIE_Buffer` - * This class wraps up access to the xrt.bo buffer object where sync calls are added to read and write calls to ensure data is synchronized. - * `__init__` - Declare xrt.bo object given group_id, datatype, shape - * `read` - Synchronize data from device before reading xrt.bo data - * `write` - Write data to xrt.bo and synchronize data to device - * `sync_to_device` - Wrapper for xrt.bo.sync call (to device) - * `sync_from_device` - Wrapper for xrt.bo.sync call (from device) -* class `AIE_Application_Error` -* `read_insts` - Read instruction stream from text file and reformat it to be passed into the instructoin buffer for the xrt.kernel call -* `setup_aie` - * Sets up the AIE application with support for up to 2 input buffers, 1 output buffer, and an optional trace buffer. Under the hood, we call declare an AIE_Application object and register the buffers used given the buffer datatype and shapes. -* `execute` - * Wrapper function to write buffer arguments into registered input buffers, then call `run` function for AIE Application, and finally return the output buffer data. -* `extract_trace` - * Wrapper function to separate output data and trace data from a single output buffer stream -* `write_out_trace` - * Wrapper function to write trace buffer values to a text file -* `setup_and_run_aie` - * This wrapper function abstracts the full set of functions to setup the aie and run the kernel program including check for functional correctness and reporting the run time. Under the hood, we call `setup_aie` to set up the AIE application before calling `execute` and checking results. 
The datatypes and shape for the 2 inputs and 1 output buffers are passed in as arguments, along with the gold reference data to compare it against. Trace buffers is also written out to a text file if trace is enabled. - -## Machine Language (ML) utilites ([ml.py](./ml.py)) -ML related utilties - -* class `CSVLogger` -* `load_class_label` -* `unpickle` -* `fuse_single_conv_bn_pair` -* class `DataShaper` +""" + +from .config import TraceConfig +from .parse import parse_trace +from .setup import ( + configure_coremem_tracing_aie2, + configure_coretile_tracing_aie2, + configure_memtile_tracing_aie2, + configure_shimtile_tracing_aie2, + configure_packet_tracing_flow, + configure_shim_trace_start_aie2, + gen_trace_done_aie2, + configure_packet_tracing_aie2, + configure_simple_tracing_aie2, + configure_packet_ctrl_flow, + config_ctrl_pkts_aie, +) +from .utils import ( + parity, + extract_tile, + pack4bytes, + create_ctrl_pkt, + get_kernel_code, + extract_buffers, + get_cycles, + get_cycles_summary, + get_vector_time, +) diff --git a/python/utils/trace/config.py b/python/utils/trace/config.py new file mode 100644 index 00000000000..5892462ac6c --- /dev/null +++ b/python/utils/trace/config.py @@ -0,0 +1,51 @@ +# SPDX-FileCopyrightText: Copyright (C) 2025-2026 Advanced Micro Devices, Inc. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 + +import numpy as np +import json +from .parse import parse_trace +from .utils import parity, extract_tile + + +class TraceConfig: + DEFAULT_TRACE_BUFFER_INDEX = 4 + + def __init__( + self, + trace_size: int, + trace_file: str = "trace.txt", + trace_after_last_tensor: bool = False, + enable_ctrl_pkts: bool = False, + last_tensor_shape=None, + last_tensor_dtype=None, + ): + if trace_size <= 0: + raise ValueError(f"Invalid trace size: {trace_size}") + self.trace_size = trace_size + self.trace_file = trace_file + self.trace_after_last_tensor = trace_after_last_tensor + self.enable_ctrl_pkts = enable_ctrl_pkts + self.last_tensor_shape = last_tensor_shape + self.last_tensor_dtype = last_tensor_dtype + + def write_trace(self, trace): + out_str = "\n".join(f"{i:0{8}x}" for i in trace if i != 0) + with open(self.trace_file, "w") as f: + f.write(out_str) + + def read_trace(self): + with open(self.trace_file, "r") as f: + trace_data = [int(line.strip(), 16) for line in f if line.strip()] + return np.array(trace_data, dtype=np.uint32) + + def trace_to_json(self, mlir_file: str, output_name: str = "trace.json"): + """Wrapper over parse_trace.py utility.""" + trace_buffer = self.read_trace() + + with open(mlir_file, "r") as f: + mlir_module_str = f.read() + + trace_events = parse_trace(trace_buffer, mlir_module_str) + + with open(output_name, "w") as f: + json.dump(trace_events, f, indent=2) diff --git a/python/utils/parse_eventIR.py b/python/utils/trace/event_ir.py similarity index 74% rename from python/utils/parse_eventIR.py rename to python/utils/trace/event_ir.py index e6fd7e897f2..c4c43df8141 100755 --- a/python/utils/parse_eventIR.py +++ b/python/utils/trace/event_ir.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +# (c) Copyright 2026 Advanced Micro Devices, Inc. 
import json import argparse import sys @@ -6,14 +7,21 @@ import subprocess import shutil import os -from aie.utils.trace_events.aie2 import CoreEvent, MemEvent, ShimTileEvent, MemTileEvent - -# Number of different trace types, currently 4 -# core: pkt type 0 -# mem: pkt type 1 -# intfc: pkt type 2 -# memtile: pkt type 3 -NumTraceTypes = 4 + +from .utils import ( + parse_pkt_hdr_in_stream, + trace_pkts_de_interleave, + convert_to_byte_stream, + convert_to_commands, +) +from .trace.events import ( + NUM_TRACE_TYPES, + CoreEvent, + MemEvent, + ShimTileEvent, + MemTileEvent, +) + NUM_EVENTS = 8 # number of events we can view per trace rowoffset = 1 # TODO tmeporary workaround to figure out row offset for AIE2 for tiles @@ -42,244 +50,6 @@ def parse_args(): return parser.parse_args(sys.argv[1:]) -def check_odd_word_parity(word): - val = 0 - for i in range(32): - val = val ^ ((word >> i) & 0x1) - return val == 1 - - -def parse_pkt_hdr_in_stream(word): - hdr = dict() - w = int(word) - hdr["valid"] = check_odd_word_parity(w) - # TODO can we assume non used fields must be 0 to rule out other data packets? - # what about bit[5:10]? - if (((w >> 5) & 0x7F) != 0) or (((w >> 19) & 0x1) != 0) or (((w >> 28) & 0x7) != 0): - hdr["valid"] = False - else: - # TODO Do we need to check for valid row/col for given device? 
- hdr["col"] = (w >> 21) & 0x7F - hdr["row"] = (w >> 16) & 0x1F - hdr["type"] = (w >> 12) & 0x3 - hdr["id"] = w & 0x1F - return hdr - - -# toks_list: list (idx = types of traces, currently 4, value = stream_dict) -# stream_dict: dict (key = row,col, value = list of word streams) -def core_trace_and_mem_trace_de_interleave(word_stream): - toks_list = list() - for t in range(NumTraceTypes): - toks_list.append(dict()) - - # core_streams = dict() # pkt type 0 - # mem_stream = dict() # pkt type 1 - # intfc_stream = dict() # pkt type 2 - # memtile_stream = dict() # pkt type 3 - - # index lists based on row/col and if its not null, that means it already exists - - curr_pkt_type = 0 - curr_loc = "" - curr_vld = False # only used in the beginning - - # print(len(word_stream)) - for i in range(len(word_stream)): - if word_stream[i] == "": - break # TODO Assumes a blank line is the last line - if (i % 8) == 0: - # print(str(i)+':'+word_stream[i]) - pkt_hdr = parse_pkt_hdr_in_stream(int(word_stream[i], 16)) - if pkt_hdr["valid"]: - curr_loc = str(pkt_hdr["row"]) + "," + str(pkt_hdr["col"]) - valid_type_found = False - for tt in range(NumTraceTypes): - if pkt_hdr["type"] == tt: - curr_pkt_type = tt - if toks_list[tt].get(curr_loc) == None: - toks_list[tt][curr_loc] = list() - valid_type_found = True - if not valid_type_found: - sys.exit("Error: Invalid packet type") - # Crate a list for the loc if it doesn't exist - curr_vld = True - else: - if ( - curr_vld - ): # ignores first 8 chunks of data is pkt hdr was invalid. TODO Is this right? 
- # or shoudl we require valid header in first chunk of data - toks_list[curr_pkt_type][curr_loc].append( - word_stream[i] - ) # TODO assuem curr_pkt_type is valid - # for tt in range(NumTraceTypes): - # if curr_pkt_type == tt: - # toks_list[tt][curr_loc].append(word_stream[i]) - return toks_list - - -# toks_list is a list of toks dictionaries where each dictionary is a type (core, mem, intfc, memtile) -# each dictionary key is a tile location (row,col) whose value is a list of stream data -def convert_to_byte_stream(toks_list): - byte_stream_list = list() - for l in toks_list: - byte_stream_dict = dict() - for loc, stream in l.items(): - # byte_stream = list() - byte_stream_dict[loc] = list() - f = ["", "a5a5a5a5"] - toks = [t for t in stream if not t in f] - events = [int(t, 16) for t in toks] - for event in events: - for top in range(4): - byte = 3 - top - opcode = event >> (byte * 8) & 0xFF - byte_stream_dict[loc].append(opcode) - # for key, value in l.items(): - # # byte_stream = list() - # byte_stream_dict[key] = list() - # f = ['', 'a5a5a5a5'] - # toks = [t for t in value if not t in f] - # events = [int(t,16) for t in toks] - # for (i,event) in enumerate(events): - # if ((i % 8) == 0): # assumed hdr every 8 words and ignores it - # pass - # else: # breaks line into list of bytes - # for top in range(4): - # byte = 3-top - # opcode = (event >> (byte * 8) & 0xff) - # byte_stream_dict[key].append(opcode) - byte_stream_list.append(byte_stream_dict) - return byte_stream_list - - -# byte_stream_list: list (idx = trace type, value = word_stream_dict) -# word_stream_dict: dict (key = row,col, value = list of words) -# -# return commands: list (idx = trace type, value = byte_stream_dict) -# byte_stream_dict: dict (key = row,col, value = list of commands) -# -# command: dict -# keys: type (Single0/1, Multiple0/1/2, Start, Repeat0/1, EventSync) -# event (integer value) -# cycles (integer value) -# event# (integer value matching event number #) -# repeats (integer 
value) -def convert_to_commands(byte_stream_list, zero=True): - # commands = dict() - commands = list() - for t in range(NumTraceTypes): - commands.append(dict()) - - for t in range(NumTraceTypes): - for key, byte_stream in byte_stream_list[t].items(): - cursor = 0 - commands[t][key] = list() - try: - while True: - if (byte_stream[cursor] & 0b11111011) == 0b11110000: - com = {"type": "Start", "timer_value": 0} - if not zero: - for i in range(7): - com["timer_value"] += (byte_stream[cursor + i + 1]) * ( - 256 ** (6 - i) - ) - commands[t][key].append(com) - cursor = cursor + 8 - if (byte_stream[cursor] & 0b11111100) == 0b11011100: - # We don't care about these - cursor = cursor + 4 - if (byte_stream[cursor] & 0b10000000) == 0b00000000: - com = {"type": "Single0"} - com["event"] = (byte_stream[cursor]) >> 4 & 0b111 - com["cycles"] = (byte_stream[cursor]) & 0b1111 - commands[t][key].append(com) - cursor = cursor + 1 - if (byte_stream[cursor] & 0b11100000) == 0b10000000: - com = {"type": "Single1"} - com["event"] = (byte_stream[cursor]) >> 2 & 0b111 - com["cycles"] = ((byte_stream[cursor]) & 0b11) * 256 - com["cycles"] += byte_stream[cursor + 1] - commands[t][key].append(com) - cursor = cursor + 2 - if (byte_stream[cursor] & 0b11100000) == 0b10100000: - com = {"type": "Single2"} - com["event"] = (byte_stream[cursor]) >> 2 & 0b111 - com["cycles"] = ((byte_stream[cursor]) & 0b11) * 256 * 256 - com["cycles"] += byte_stream[cursor + 1] * 256 - com["cycles"] += byte_stream[cursor + 2] - commands[t][key].append(com) - cursor = cursor + 3 - if (byte_stream[cursor] & 0b11110000) == 0b11000000: - com = {"type": "Multiple0"} - com["cycles"] = byte_stream[cursor + 1] & 0b1111 - events = (byte_stream[cursor] & 0b1111) << 4 - events = events + (byte_stream[cursor + 1] >> 4) - for i in range(0, 8): - e = (events >> i) & 0b1 - if e: - com["event" + str(i)] = ( - i # TODO is this how event# is stored in IR? 
- ) - commands[t][key].append(com) - cursor = cursor + 2 - if (byte_stream[cursor] & 0b11111100) == 0b11010000: - # TODO Don't we need to extract events here? - # print("Multiple1") - com = {"type": "Multiple1"} - cycles = (byte_stream[cursor + 1] & 0b11) << 8 - com["cycles"] = cycles + (byte_stream[cursor + 2]) - events = (byte_stream[cursor] & 0b11) << 6 - events = events + (byte_stream[cursor + 1] >> 2) - for i in range(0, 8): - e = (events >> i) & 0b1 - if e: - com["event" + str(i)] = ( - i # TODO is this how event# is stored in IR? - ) - commands[t][key].append(com) - cursor = cursor + 3 - if (byte_stream[cursor] & 0b11111100) == 0b11010100: - # TODO Don't we need to extract events here? - # print("Multiple2") - com = {"type": "Multiple2"} - cycles = (byte_stream[cursor + 1] & 0b11) << 16 - cycles = cycles + ((byte_stream[cursor + 2]) << 8) - com["cycles"] = cycles + (byte_stream[cursor + 3]) - events = (byte_stream[cursor] & 0b11) << 6 - events = events + (byte_stream[cursor + 1] >> 2) - for i in range(0, 8): - e = (events >> i) & 0b1 - if e: - com["event" + str(i)] = ( - i # TODO is this how event# is stored in IR? 
- ) - commands[t][key].append(com) - cursor = cursor + 4 - if (byte_stream[cursor] & 0b11110000) == 0b11100000: - com = {"type": "Repeat0"} - com["repeats"] = (byte_stream[cursor]) & 0b1111 - commands[t][key].append(com) - cursor = cursor + 1 - if (byte_stream[cursor] & 0b11111100) == 0b11011000: - com = {"type": "Repeat1"} - com["repeats"] = ((byte_stream[cursor]) & 0b11) * 256 - com["repeats"] += byte_stream[cursor + 1] - commands[t][key].append(com) - cursor = cursor + 2 - if (byte_stream[cursor] & 0b11111111) == 0b11111110: - # No one likes you filler, get out of here - cursor = cursor + 1 - if (byte_stream[cursor] & 0b11111111) == 0b11111111: - com = {"type": "Event_Sync"} - commands[t][key].append(com) - cursor = cursor + 1 - except IndexError: - pass - - return commands - - def make_event_lists(commands): events = {} ts = 0 @@ -600,7 +370,7 @@ def parse_mlir_trace_events(lines): pattern = r"aiex.npu.write32\s*\{\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*,\s*(\w+)\s*=\s*(0x)?(\w+)\s*:\s*\w+\s*\}" pid_events = list() - for t in range(NumTraceTypes): + for t in range(NUM_TRACE_TYPES): pid_events.append(dict()) for i in range(len(lines)): @@ -833,7 +603,7 @@ def lookup_event_name_by_type(trace_type, code): # NOTE: This assume the pid_events has already be analyzed and populated. 
def setup_trace_metadata(trace_events, pid_events): pid = 0 - for t in range(NumTraceTypes): + for t in range(NUM_TRACE_TYPES): # for j in len(pid_events[i]): for loc in pid_events[t]: # return loc process_name_metadata(trace_events, pid, t, loc) @@ -1058,7 +828,6 @@ def print_config_json(pid_events): f.write(" 0,\n") f.write(" 0,\n") f.write(" 0,\n") - f.write(" 0,\n") f.write(" 0\n") f.write(" ],\n") f.write(' "group_event_config": {\n') @@ -1170,69 +939,71 @@ def run_hwfrontend(fileInName, fileOutName): # Script execution start - Open trace file and convert to commands # ------------------------------------------------------------------------------ -lines = list() -pid_events = list() -trace_events = list() -opts = parse_args() +if __name__ == "__main__": + lines = list() + pid_events = list() + trace_events = list() -# set colshift based on optional argument -colshift = int(opts.colshift) if opts.colshift else 0 + opts = parse_args() -try: - os.mkdir(tmpTraceDirName) -except FileExistsError: - pass -if opts.verbose: - print("created temporary directory", tmpTraceDirName) -tmpTraceDir = os.path.abspath(tmpTraceDirName) + # set colshift based on optional argument + colshift = int(opts.colshift) if opts.colshift else 0 -mlirFile = os.path.abspath(opts.mlir) -rawTraceFile = os.path.abspath(opts.filename) -srcTraceFileName = "prep." + str(opts.filename) -srcTraceFile = os.path.join(tmpTraceDir, srcTraceFileName) + try: + os.mkdir(tmpTraceDirName) + except FileExistsError: + pass + if opts.verbose: + print("created temporary directory", tmpTraceDirName) + tmpTraceDir = os.path.abspath(tmpTraceDirName) -# Check source file and prepend 0x -fix_raw_trace_data(rawTraceFile, srcTraceFile) + mlirFile = os.path.abspath(opts.mlir) + rawTraceFile = os.path.abspath(opts.filename) + srcTraceFileName = "prep." 
+ str(opts.filename) + srcTraceFile = os.path.join(tmpTraceDir, srcTraceFileName) -if opts.mlir: - try: - with open(opts.mlir, "rt") as mf: - mlir_lines = mf.read().split("\n") - pid_events = parse_mlir_trace_events(mlir_lines) - except Exception as e: - print(e) - sys.exit(1) + # Check source file and prepend 0x + fix_raw_trace_data(rawTraceFile, srcTraceFile) + + if opts.mlir: + try: + with open(opts.mlir, "rt") as mf: + mlir_lines = mf.read().split("\n") + pid_events = parse_mlir_trace_events(mlir_lines) + except Exception as e: + print(e) + sys.exit(1) -os.chdir(tmpTraceDirName) + os.chdir(tmpTraceDirName) -create_target() + create_target() -print_config_json(pid_events) + print_config_json(pid_events) -run_hwfrontend(srcTraceFile, eventIRFile) + run_hwfrontend(srcTraceFile, eventIRFile) -# with open(opts.filename, "r") as f: -try: - with open(eventIRFile, "rt") as f: - lines = f.read().split("\n") - ignore = [""] - lines = [l for l in lines if not l in ignore] -except Exception as e: - print(e) - sys.exit(1) + # with open(opts.filename, "r") as f: + try: + with open(eventIRFile, "rt") as f: + lines = f.read().split("\n") + ignore = [""] + lines = [l for l in lines if not l in ignore] + except Exception as e: + print(e) + sys.exit(1) -if DEBUG: - print("\nDEBUG: lines\n") - print(lines) - print("\n\n") + if DEBUG: + print("\nDEBUG: lines\n") + print(lines) + print("\n\n") -setup_trace_metadata(trace_events, pid_events) -if DEBUG: - print("\nDEBUG: pid events\n") - print(pid_events) - print("\n\n") + setup_trace_metadata(trace_events, pid_events) + if DEBUG: + print("\nDEBUG: pid events\n") + print(pid_events) + print("\n\n") -convert_eventIR_to_json(trace_events, lines, pid_events) + convert_eventIR_to_json(trace_events, lines, pid_events) -print(json.dumps(trace_events)) + print(json.dumps(trace_events)) diff --git a/python/utils/trace/events/__init__.py b/python/utils/trace/events/__init__.py new file mode 100644 index 00000000000..b1990fa4076 --- /dev/null 
+++ b/python/utils/trace/events/__init__.py @@ -0,0 +1,149 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. or its affiliates +# +"""Trace events enumerations for AIE architectures. + +Available modules: +- aie: AIE1 architecture events +- aie2: AIE2/AIEML architecture events +- aie2p: AIE2P architecture events +""" +from enum import IntEnum +import typing + +from . import aie +from . import aie2 +from . import aie2p + +from .aie2 import ( + CoreEvent, + MemEvent, + ShimTileEvent, + MemTileEvent, +) + + +# We use the packet type field in the packet header to help differentiate the tile +# that the packet came from. Since packet types don't inherently have meaning, we +# assign numerical values to each tile type: core, mem (for core), shimtilem, memtile +class PacketType(IntEnum): + CORE = 0 + MEM = 1 + SHIMTILE = 2 + MEMTILE = 3 + + +# Number of different trace types +NUM_TRACE_TYPES = len(PacketType) + + +def get_events_for_device(device: str): + if "xcvc1902" in device: + return aie + elif "npu2p" in device: + return aie2p + else: + return aie2 + + +def _get_port_events(enum_class): + events = set() + for i in range(8): + for type in ["IDLE", "RUNNING", "STALLED", "TLAST"]: + events.add(getattr(enum_class, f"PORT_{type}_{i}")) + return events + + +PortEventCodes = _get_port_events(CoreEvent) +MemTilePortEventCodes = _get_port_events(MemTileEvent) +ShimTilePortEventCodes = _get_port_events(ShimTileEvent) + + +class GenericEvent: + def __init__( + self, code: typing.Union[CoreEvent, MemEvent, ShimTileEvent, MemTileEvent] + ): + # For backwards compatibility, allow integer as event + if isinstance(code, int): + code = CoreEvent(code) + self.code: typing.Union[CoreEvent, MemEvent, ShimTileEvent, MemTileEvent] = code + + def get_register_writes(self): + """ + 
Sub-classes for specific events that require writing to a specific + register should overwrite this method to return a dicitionary + address -> register value. + + Note that if multiple event(-types) request writing to the same + register, their writes will be ORed together. (This makes sense if + configuration requires only writing some bits of the whole register.) + """ + return {} + + +class BasePortEvent(GenericEvent): + def __init__( + self, code, port_number, master=True, enum_class=None, valid_codes=None + ): + # For backwards compatibility, allow integer as event + if isinstance(code, int) and enum_class: + code = enum_class(code) + if valid_codes: + assert code in valid_codes + + self.event_number = int(code.name.split("_")[-1]) + self.port_number = port_number + self.master = master + super().__init__(code) + + def get_register_writes(self): + def master(port): + return port | (1 << 5) + + def slave(port): + return port + + # 0x3FF00: Stream switch event port selection 0 + # 0x3FF04: Stream switch event port selection 1 + base_addr = self.get_base_address() + address = base_addr if self.event_number < 4 else base_addr + 4 + value = master(self.port_number) if self.master else slave(self.port_number) + + value = (value & 0xFF) << 8 * (self.event_number % 4) + + ret = {base_addr: 0, base_addr + 4: 0} + ret[address] = value + + return ret + + def get_base_address(self): + raise NotImplementedError + + +class PortEvent(BasePortEvent): + def __init__(self, code, port_number, master=True): + super().__init__(code, port_number, master, CoreEvent, PortEventCodes) + + def get_base_address(self): + return 0x3FF00 + + +class MemTilePortEvent(BasePortEvent): + def __init__(self, code, port_number, master=True): + super().__init__(code, port_number, master, MemTileEvent, MemTilePortEventCodes) + + def get_base_address(self): + return 0xB0F00 + + +class ShimTilePortEvent(BasePortEvent): + def __init__(self, code, port_number, master=True): + super().__init__( + code, 
port_number, master, ShimTileEvent, ShimTilePortEventCodes + ) + + def get_base_address(self): + return 0x3FF00 diff --git a/python/utils/get_trace_summary.py b/python/utils/trace/get_trace_summary.py similarity index 50% rename from python/utils/get_trace_summary.py rename to python/utils/trace/get_trace_summary.py index 2bd8c8da53d..1e43efa5bdc 100755 --- a/python/utils/get_trace_summary.py +++ b/python/utils/trace/get_trace_summary.py @@ -1,9 +1,10 @@ #!/usr/bin/env python3 +# (c) Copyright 2026 Advanced Micro Devices, Inc. import json import argparse import sys import re -import trace_utils +from aie.utils.trace.utils import get_cycles_summary def parse_args(): @@ -21,22 +22,23 @@ def parse_args(): return parser.parse_args(sys.argv[1:]) -opts = parse_args() -cycles = trace_utils.get_cycles_summary(opts.input) +if __name__ == "__main__": + opts = parse_args() + cycles = get_cycles_summary(opts.input) -# print(cycles) -for i in range(len(cycles)): - print(cycles[i][0]) - runs = len(cycles[i]) - 1 - print("Total number of full kernel invocations is " + str(runs)) - if runs > 0: - print( - "First/Min/Avg/Max cycles is " - + str(cycles[i][1]) - + "/ " - + str(min(cycles[i][1:])) - + "/ " - + str(sum(cycles[i][1:]) / (len(cycles[i]) - 1)) - + "/ " - + str(max(cycles[i][1:])) - ) + # print(cycles) + for i in range(len(cycles)): + print(cycles[i][0]) + runs = len(cycles[i]) - 1 + print("Total number of full kernel invocations is " + str(runs)) + if runs > 0: + print( + "First/Min/Avg/Max cycles is " + + str(cycles[i][1]) + + "/ " + + str(min(cycles[i][1:])) + + "/ " + + str(sum(cycles[i][1:]) / (len(cycles[i]) - 1)) + + "/ " + + str(max(cycles[i][1:])) + ) diff --git a/python/utils/parse_trace.py b/python/utils/trace/parse.py similarity index 66% rename from python/utils/parse_trace.py rename to python/utils/trace/parse.py index 19a6ffab2da..8d81a3ab02c 100755 --- a/python/utils/parse_trace.py +++ b/python/utils/trace/parse.py @@ -1,4 +1,5 @@ #!/usr/bin/env python3 +# 
(c) Copyright 2026 Advanced Micro Devices, Inc. import json import argparse import sys @@ -6,17 +7,28 @@ from aie.extras.util import find_ops from aie.ir import Context, Module, Location -from aie.utils.trace_events.aie2 import CoreEvent, MemEvent, ShimTileEvent, MemTileEvent +from aie.utils.trace.utils import ( + parity, + extract_tile, + parse_pkt_hdr_in_stream, + trace_pkts_de_interleave, + convert_to_byte_stream, + convert_to_commands, + trim_trace_pkts, +) +from aie.utils.trace.events import ( + NUM_TRACE_TYPES, + PacketType, + CoreEvent, + MemEvent, + ShimTileEvent, + MemTileEvent, + get_events_for_device, +) import aie.dialects.aie as aiedialect import aie.dialects.aiex as aiexdialect -# Number of different trace types, currently 4 -# core: pkt type 0 -# mem: pkt type 1 -# shim: pkt type 2 -# memtile: pkt type 3 -NumTraceTypes = 4 NUM_EVENTS = 8 # number of events we can view per trace @@ -57,260 +69,6 @@ def check_for_valid_trace(filename, trace_pkts, of=None, debug=False): return True -def trim_trace_pkts(trace_pkts): - for i in range(len(trace_pkts)): - if trace_pkts[i] == "fefefefe" or trace_pkts[i] == "FEFEFEFE": - if i + 2 < len(trace_pkts): - if trace_pkts[i + 1] == "00000000" and trace_pkts[i + 2] == "00000000": - return trace_pkts[0 : i + 1] - return trace_pkts - - -def check_odd_word_parity(word): - val = 0 - for i in range(32): - val = val ^ ((word >> i) & 0x1) - return val == 1 - - -def parse_pkt_hdr_in_stream(word): - hdr = dict() - w = int(word) - hdr["valid"] = check_odd_word_parity(w) - # TODO can we assume non used fields must be 0 to rule out other data packets? - # what about bit[5:10]? - if (((w >> 5) & 0x7F) != 0) or (((w >> 19) & 0x1) != 0) or (((w >> 28) & 0x7) != 0): - hdr["valid"] = False - else: - # TODO Do we need to check for valid row/col for given device? 
- hdr["col"] = (w >> 21) & 0x7F - hdr["row"] = (w >> 16) & 0x1F - hdr["type"] = (w >> 12) & 0x3 - hdr["id"] = w & 0x1F - return hdr - - -# Sorts list of trace packets into a list indexed by trace type (core, mem, shim, memtile) -# and the value is dictionary tile location (key) and trace packets (value) -# -# trace_pkts_sorted: list (idx = types of traces, currently 4, value = stream_dict) -# stream_dict: dict (key = row,col, value = list of word streams) -def trace_pkts_de_interleave(word_stream): - trace_pkts_sorted = list() - for t in range(NumTraceTypes): - trace_pkts_sorted.append(dict()) - - # core_streams = dict() # pkt type 0 - # mem_stream = dict() # pkt type 1 - # shim_stream = dict() # pkt type 2 - # memtile_stream = dict() # pkt type 3 - - # index lists based on row/col and if its not null, that means it already exists - - curr_pkt_type = 0 - curr_loc = "" - curr_vld = False # only used in the beginning - - # print(len(word_stream)) - for i in range(len(word_stream)): - if word_stream[i] == "": - break # TODO Assumes a blank line is the last line - if (i % 8) == 0: - # print(str(i)+':'+word_stream[i]) - pkt_hdr = parse_pkt_hdr_in_stream(int(word_stream[i], 16)) - if pkt_hdr["valid"]: - curr_loc = str(pkt_hdr["row"]) + "," + str(pkt_hdr["col"]) - valid_type_found = False - for tt in range(NumTraceTypes): - if pkt_hdr["type"] == tt: - curr_pkt_type = tt - if trace_pkts_sorted[tt].get(curr_loc) == None: - trace_pkts_sorted[tt][curr_loc] = list() - valid_type_found = True - if not valid_type_found: - sys.exit("Error: Invalid packet type") - # Crate a list for the loc if it doesn't exist - curr_vld = True - else: - if ( - curr_vld - ): # ignores first 8 chunks of data is pkt hdr was invalid. TODO Is this right? 
- # or shoudl we require valid header in first chunk of data - trace_pkts_sorted[curr_pkt_type][curr_loc].append( - word_stream[i] - ) # TODO assume curr_pkt_type is valid - # for tt in range(NumTraceTypes): - # if curr_pkt_type == tt: - # toks_list[tt][curr_loc].append(word_stream[i]) - return trace_pkts_sorted - - -# Convert trace packets into byte streams -# -# toks_list is a list of toks dictionaries where each dictionary is a type (core, mem, shim, memtile) -# each dictionary key is a tile location (row,col) whose value is a list of stream data -def convert_to_byte_stream(toks_list): - byte_stream_list = list() - for l in toks_list: - byte_stream_dict = dict() - for loc, stream in l.items(): - # byte_stream = list() - byte_stream_dict[loc] = list() - f = ["", "a5a5a5a5"] - toks = [t for t in stream if not t in f] - events = [int(t, 16) for t in toks] - for event in events: - for top in range(4): - byte = 3 - top - opcode = event >> (byte * 8) & 0xFF - byte_stream_dict[loc].append(opcode) - # for key, value in l.items(): - # # byte_stream = list() - # byte_stream_dict[key] = list() - # f = ['', 'a5a5a5a5'] - # toks = [t for t in value if not t in f] - # events = [int(t,16) for t in toks] - # for (i,event) in enumerate(events): - # if ((i % 8) == 0): # assumed hdr every 8 words and ignores it - # pass - # else: # breaks line into list of bytes - # for top in range(4): - # byte = 3-top - # opcode = (event >> (byte * 8) & 0xff) - # byte_stream_dict[key].append(opcode) - byte_stream_list.append(byte_stream_dict) - return byte_stream_list - - -# Convert byte streams to equivalent packet commands -# -# byte_stream_list: list (idx = trace type, value = word_stream_dict) -# word_stream_dict: dict (key = row,col, value = list of words) -# -# return commands: list (idx = trace type, value = byte_stream_dict) -# byte_stream_dict: dict (key = row,col, value = list of commands) -# -# command: dict -# keys: type (Single0/1, Multiple0/1/2, Start, Repeat0/1, EventSync) -# 
event (integer value) -# cycles (integer value) -# event# (integer value matching event number #) -# repeats (integer value) -def convert_to_commands(byte_stream_list, zero=True): - # commands = dict() - commands = list() - for t in range(NumTraceTypes): - commands.append(dict()) - - for t in range(NumTraceTypes): - for key, byte_stream in byte_stream_list[t].items(): - cursor = 0 - commands[t][key] = list() - try: - while True: - if (byte_stream[cursor] & 0b11111011) == 0b11110000: - com = {"type": "Start", "timer_value": 0} - if not zero: - for i in range(7): - com["timer_value"] += (byte_stream[cursor + i + 1]) * ( - 256 ** (6 - i) - ) - commands[t][key].append(com) - cursor = cursor + 8 - if (byte_stream[cursor] & 0b11111100) == 0b11011100: - # We don't care about these - cursor = cursor + 4 - if (byte_stream[cursor] & 0b10000000) == 0b00000000: - com = {"type": "Single0"} - com["event"] = (byte_stream[cursor]) >> 4 & 0b111 - com["cycles"] = (byte_stream[cursor]) & 0b1111 - commands[t][key].append(com) - cursor = cursor + 1 - if (byte_stream[cursor] & 0b11100000) == 0b10000000: - com = {"type": "Single1"} - com["event"] = (byte_stream[cursor]) >> 2 & 0b111 - com["cycles"] = ((byte_stream[cursor]) & 0b11) * 256 - com["cycles"] += byte_stream[cursor + 1] - commands[t][key].append(com) - cursor = cursor + 2 - if (byte_stream[cursor] & 0b11100000) == 0b10100000: - com = {"type": "Single2"} - com["event"] = (byte_stream[cursor]) >> 2 & 0b111 - com["cycles"] = ((byte_stream[cursor]) & 0b11) * 256 * 256 - com["cycles"] += byte_stream[cursor + 1] * 256 - com["cycles"] += byte_stream[cursor + 2] - commands[t][key].append(com) - cursor = cursor + 3 - if (byte_stream[cursor] & 0b11110000) == 0b11000000: - com = {"type": "Multiple0"} - com["cycles"] = byte_stream[cursor + 1] & 0b1111 - events = (byte_stream[cursor] & 0b1111) << 4 - events = events + (byte_stream[cursor + 1] >> 4) - for i in range(0, 8): - e = (events >> i) & 0b1 - if e: - com["event" + str(i)] = ( - i # 
TODO is this how event# is stored in IR? - ) - commands[t][key].append(com) - cursor = cursor + 2 - if (byte_stream[cursor] & 0b11111100) == 0b11010000: - # TODO Don't we need to extract events here? - # print("Multiple1") - com = {"type": "Multiple1"} - cycles = (byte_stream[cursor + 1] & 0b11) << 8 - com["cycles"] = cycles + (byte_stream[cursor + 2]) - events = (byte_stream[cursor] & 0b11) << 6 - events = events + (byte_stream[cursor + 1] >> 2) - for i in range(0, 8): - e = (events >> i) & 0b1 - if e: - com["event" + str(i)] = ( - i # TODO is this how event# is stored in IR? - ) - commands[t][key].append(com) - cursor = cursor + 3 - if (byte_stream[cursor] & 0b11111100) == 0b11010100: - # TODO Don't we need to extract events here? - # print("Multiple2") - com = {"type": "Multiple2"} - cycles = (byte_stream[cursor + 1] & 0b11) << 16 - cycles = cycles + ((byte_stream[cursor + 2]) << 8) - com["cycles"] = cycles + (byte_stream[cursor + 3]) - events = (byte_stream[cursor] & 0b11) << 6 - events = events + (byte_stream[cursor + 1] >> 2) - for i in range(0, 8): - e = (events >> i) & 0b1 - if e: - com["event" + str(i)] = ( - i # TODO is this how event# is stored in IR? 
- ) - commands[t][key].append(com) - cursor = cursor + 4 - if (byte_stream[cursor] & 0b11110000) == 0b11100000: - com = {"type": "Repeat0"} - com["repeats"] = (byte_stream[cursor]) & 0b1111 - commands[t][key].append(com) - cursor = cursor + 1 - if (byte_stream[cursor] & 0b11111100) == 0b11011000: - com = {"type": "Repeat1"} - com["repeats"] = ((byte_stream[cursor]) & 0b11) * 256 - com["repeats"] += byte_stream[cursor + 1] - commands[t][key].append(com) - cursor = cursor + 2 - if (byte_stream[cursor] & 0b11111111) == 0b11111110: - # No one likes you filler, get out of here - cursor = cursor + 1 - if (byte_stream[cursor] & 0b11111111) == 0b11111111: - com = {"type": "Event_Sync"} - commands[t][key].append(com) - cursor = cursor + 1 - except IndexError: - pass - - return commands - - def make_event_lists(commands): events = {} ts = 0 @@ -413,6 +171,7 @@ def deactivate_events( loc, pid_events, trace_events, + events_module, ): for k in active_events.keys(): # an active event if cycles > 0 or (cycles == 0 and not k in multiples): @@ -420,7 +179,7 @@ def deactivate_events( if active_events[k] > 0: trace_event = { "name": lookup_event_name_by_type( - trace_type, pid_events[trace_type][loc][k] + trace_type, pid_events[trace_type][loc][k], events_module ) } # TODO remove trace_event["ts"] = timer @@ -433,11 +192,15 @@ def deactivate_events( # Assert a begin siganl for the current event unless the event is still active -def activate_event(event, tt, loc, timer, pid, active_events, pid_events, trace_events): +def activate_event( + event, tt, loc, timer, pid, active_events, pid_events, trace_events, events_module +): try: if active_events[event] == 0: trace_event = { - "name": lookup_event_name_by_type(tt, pid_events[tt][loc][event]) + "name": lookup_event_name_by_type( + tt, pid_events[tt][loc][event], events_module + ) } trace_event["ts"] = timer trace_event["ph"] = "B" @@ -455,7 +218,9 @@ def activate_event(event, tt, loc, timer, pid, active_events, pid_events, trace_ # # 
commands: list (idx = trace type, value = byte_stream_dict) # byte_stream_dict: dict (key = row,col, value = list of commands) -def convert_commands_to_json(trace_events, commands, pid_events, of=None, debug=False): +def convert_commands_to_json( + trace_events, commands, pid_events, events_module, of=None, debug=False +): # byte_stream_dict for each trace type. for [tt, byte_stream_dict] in enumerate(commands): # tt = trace type @@ -518,6 +283,7 @@ def convert_commands_to_json(trace_events, commands, pid_events, of=None, debug= loc, pid_events, trace_events, + events_module, ) timer = timer + cycles activate_event( @@ -529,6 +295,7 @@ def convert_commands_to_json(trace_events, commands, pid_events, of=None, debug= active_events, pid_events, trace_events, + events_module, ) elif "Multiple" in t: @@ -548,6 +315,7 @@ def convert_commands_to_json(trace_events, commands, pid_events, of=None, debug= loc, pid_events, trace_events, + events_module, ) timer = timer + cycles @@ -562,6 +330,7 @@ def convert_commands_to_json(trace_events, commands, pid_events, of=None, debug= active_events, pid_events, trace_events, + events_module, ) elif "Repeat" in t: @@ -582,6 +351,7 @@ def convert_commands_to_json(trace_events, commands, pid_events, of=None, debug= loc, pid_events, trace_events, + events_module, ) timer = timer + cycles if len(multiple_list) > 1: @@ -596,6 +366,7 @@ def convert_commands_to_json(trace_events, commands, pid_events, of=None, debug= active_events, pid_events, trace_events, + events_module, ) else: activate_event( @@ -607,6 +378,7 @@ def convert_commands_to_json(trace_events, commands, pid_events, of=None, debug= active_events, pid_events, trace_events, + events_module, ) @@ -615,22 +387,20 @@ def process_name_metadata(trace_events, pid, trace_type, loc): trace_event["ph"] = "M" trace_event["pid"] = pid trace_event["args"] = {} - # if (pid == 0 or pid == 2): - if trace_type == 0: - trace_event["args"]["name"] = "core_trace for tile" + str(loc) - # if (pid == 
1 or pid == 3): - elif trace_type == 1: - trace_event["args"]["name"] = "mem_trace for tile" + str(loc) - elif trace_type == 2: - trace_event["args"]["name"] = "shim_trace for tile" + str(loc) - elif trace_type == 3: - trace_event["args"]["name"] = "memtile_trace for tile" + str(loc) + pt = PacketType(trace_type) + name = pt.name.lower() + if name == "shimtile": + name = "shim" + + trace_event["args"]["name"] = f"{name}_trace for tile{loc}" trace_events.append(trace_event) # def thread_name_metadata(trace_events, pid, tid, pid_events): -def thread_name_metadata(trace_events, trace_type, loc, pid, tid, pid_events): +def thread_name_metadata( + trace_events, trace_type, loc, pid, tid, pid_events, events_module +): # def thread_name_metadata(trace_events, trace_type, pid, tid): trace_event = {"name": "thread_name"} trace_event["ph"] = "M" @@ -639,7 +409,7 @@ def thread_name_metadata(trace_events, trace_type, loc, pid, tid, pid_events): trace_event["args"] = {} # trace_event['args']['name'] = lookupEventNameInStr(str(tid), pid, pid_events) trace_event["args"]["name"] = lookup_event_name_by_type( - trace_type, pid_events[trace_type][loc][tid] + trace_type, pid_events[trace_type][loc][tid], events_module ) trace_events.append(trace_event) @@ -655,7 +425,7 @@ def thread_name_metadata(trace_events, trace_type, loc, pid, tid, pid_events): def parse_mlir_trace_events(mlir_module_str, colshift=None): pid_events = list() - for t in range(NumTraceTypes): + for t in range(NUM_TRACE_TYPES): pid_events.append(dict()) with Context(), Location.unknown(): @@ -671,6 +441,7 @@ def parse_mlir_trace_events(mlir_module_str, colshift=None): ) device = aiedialect.AIEDevice(int(device[0].device)) target_model = aiedialect.get_target_model(device) + events_module = get_events_for_device(str(device)) for write32 in write32s: address = None @@ -776,27 +547,26 @@ def parse_mlir_trace_events(mlir_module_str, colshift=None): # print("row:",j['row'],", col: ",j['col']) # print("0: ", j[0], "1: ", 
j[1], "2: ", j[2], "3: ", j[3]) # print("4: ", j[4], "5: ", j[5], "6: ", j[6], "7: ", j[7]) - return pid_events - - -def lookup_event_name_by_type(trace_type, code): - # def lookup_event_name_by_type(trace_type, loc, event, pid_events): - event = "" - # code = pid_events[trace_type][loc][event] - events_enum = None - if trace_type == 0: # Core traces - events_enum = CoreEvent - elif trace_type == 1: # Mem traces - events_enum = MemEvent - elif trace_type == 2: # Shim traces - events_enum = ShimTileEvent - elif trace_type == 3: # MemTile traces - events_enum = MemTileEvent - if events_enum is not None and code in set(x.value for x in events_enum): - event = events_enum(code).name + return pid_events, events_module + + +def lookup_event_name_by_type(trace_type, code, events_module): + if trace_type == PacketType.CORE: + enum_class = events_module.CoreEvent + elif trace_type == PacketType.MEM: + enum_class = events_module.MemEvent + elif trace_type == PacketType.SHIMTILE: + enum_class = events_module.ShimTileEvent + elif trace_type == PacketType.MEMTILE: + enum_class = events_module.MemTileEvent else: - event = "Unknown" - return event + return "Unknown" + + try: + return enum_class(code).name + except ValueError: + pass + return "Unknown" # def lookup_event_name_by_code(code, traceType): @@ -888,14 +658,16 @@ def lookup_event_name_by_type(trace_type, code): # This sets up the trace metadata and also assigned the unique pid that's referred # eleswhere for each process (combination of tile(row,col) and trace type). # NOTE: This assume the pid_events has already be analyzed and populated. 
-def setup_trace_metadata(trace_events, pid_events): +def setup_trace_metadata(trace_events, pid_events, events_module): pid = 0 - for t in range(NumTraceTypes): + for t in range(NUM_TRACE_TYPES): # for j in len(pid_events[i]): for loc in pid_events[t]: # return loc process_name_metadata(trace_events, pid, t, loc) for e in range(8): - thread_name_metadata(trace_events, t, loc, pid, e, pid_events) + thread_name_metadata( + trace_events, t, loc, pid, e, pid_events, events_module + ) pid_events[t][loc].append(pid) # assign unique pid pid = pid + 1 @@ -906,7 +678,7 @@ def setup_trace_metadata(trace_events, pid_events): def align_column_start_index(events, commands): # find min column of commands min_commands_col = float("inf") - for t in range(NumTraceTypes): + for t in range(NUM_TRACE_TYPES): for loc in commands[t]: col = int(loc.split(",")[1]) if col < min_commands_col: @@ -914,7 +686,7 @@ def align_column_start_index(events, commands): # find min column of events min_events_col = float("inf") - for t in range(NumTraceTypes): + for t in range(NUM_TRACE_TYPES): for loc in events[t]: col = int(loc.split(",")[1]) if col < min_events_col: @@ -926,7 +698,7 @@ def align_column_start_index(events, commands): # Shift all event keys by colshift new_events = [] - for t in range(NumTraceTypes): + for t in range(NUM_TRACE_TYPES): updated = {} for loc, l in events[t].items(): row, col = map(int, loc.split(",")) @@ -964,7 +736,7 @@ def parse_trace(trace_buffer, mlir_module_str, colshift=None, debug=False): trace_pkts.append(hex_str) # Parse MLIR to extract event configuration - pid_events = parse_mlir_trace_events(mlir_module_str, colshift) + pid_events, events_module = parse_mlir_trace_events(mlir_module_str, colshift) # Check for valid trace if not check_for_valid_trace("", trace_pkts, of=None, debug=debug): @@ -990,10 +762,12 @@ def parse_trace(trace_buffer, mlir_module_str, colshift=None, debug=False): trace_events = [] # Setup metadata (process names, thread names, assign 
PIDs) - setup_trace_metadata(trace_events, pid_events) + setup_trace_metadata(trace_events, pid_events, events_module) # Convert commands to Chrome Trace Event Format - convert_commands_to_json(trace_events, commands, pid_events, of=None, debug=debug) + convert_commands_to_json( + trace_events, commands, pid_events, events_module, of=None, debug=debug + ) return trace_events @@ -1029,7 +803,7 @@ def main(): try: with open(opts.mlir, "r") as mf: mlir_module_str = mf.read() - pid_events = parse_mlir_trace_events(mlir_module_str, colshift) + pid_events, events_module = parse_mlir_trace_events(mlir_module_str, colshift) except Exception as e: print("ERROR:", opts.mlir, "could not be opened. Check for valid MLIR file.", e) sys.exit(1) @@ -1108,9 +882,11 @@ def main(): trace_events = list() - setup_trace_metadata(trace_events, pid_events) + setup_trace_metadata(trace_events, pid_events, events_module) - convert_commands_to_json(trace_events, commands_0, pid_events, of, DEBUG) + convert_commands_to_json( + trace_events, commands_0, pid_events, events_module, of, DEBUG + ) print(json.dumps(trace_events).replace("'", '"').replace(", {", ",\n{"), file=of) diff --git a/python/utils/trace.py b/python/utils/trace/setup.py similarity index 78% rename from python/utils/trace.py rename to python/utils/trace/setup.py index 2363a1accaf..a1d9d0e4277 100644 --- a/python/utils/trace.py +++ b/python/utils/trace/setup.py @@ -1,271 +1,28 @@ -# trace.py -*- Python -*- -# -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. +# SPDX-FileCopyrightText: Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2024 Advanced Micro Devices, Inc. 
-import typing -from aie.dialects.aie import * -from aie.dialects.aiex import * +from aie.dialects.aie import packetflow, WireBundle +from aie.dialects.aiex import ( + npu_write32, + npu_writebd, + npu_maskwrite32, + npu_address_patch, + npu_sync, +) from aie.dialects.aie import get_target_model -from aie.utils.trace_events.aie2 import CoreEvent, MemEvent, ShimTileEvent, MemTileEvent -from enum import IntEnum - - -class GenericEvent: - def __init__( - self, code: typing.Union[CoreEvent, MemEvent, ShimTileEvent, MemTileEvent] - ): - # For backwards compatibility, allow integer as event - if isinstance(code, int): - code = CoreEvent(code) - self.code: typing.Union[CoreEvent, MemEvent, ShimTileEvent, MemTileEvent] = code - - def get_register_writes(self): - """ - Sub-classes for specific events that require writing to a specific - register should overwrite this method to return a dicitionary - address -> register value. - - Note that if multiple event(-types) request writing to the same - register, their writes will be ORed together. (This makes sense if - configuration requires only writing some bits of the whole register.) 
- """ - return {} - - -# fmt: off -PortEventCodes = { CoreEvent.PORT_IDLE_0, CoreEvent.PORT_IDLE_1, - CoreEvent.PORT_IDLE_2, CoreEvent.PORT_IDLE_3, - CoreEvent.PORT_IDLE_4, CoreEvent.PORT_IDLE_5, - CoreEvent.PORT_IDLE_6, CoreEvent.PORT_IDLE_7, - CoreEvent.PORT_RUNNING_0, CoreEvent.PORT_RUNNING_1, - CoreEvent.PORT_RUNNING_2, CoreEvent.PORT_RUNNING_3, - CoreEvent.PORT_RUNNING_4, CoreEvent.PORT_RUNNING_5, - CoreEvent.PORT_RUNNING_6, CoreEvent.PORT_RUNNING_7, - CoreEvent.PORT_STALLED_0, CoreEvent.PORT_STALLED_1, - CoreEvent.PORT_STALLED_2, CoreEvent.PORT_STALLED_3, - CoreEvent.PORT_STALLED_4, CoreEvent.PORT_STALLED_5, - CoreEvent.PORT_STALLED_6, CoreEvent.PORT_STALLED_7, - CoreEvent.PORT_TLAST_0, CoreEvent.PORT_TLAST_1, - CoreEvent.PORT_TLAST_2, CoreEvent.PORT_TLAST_3, - CoreEvent.PORT_TLAST_4, CoreEvent.PORT_TLAST_5, - CoreEvent.PORT_TLAST_6, CoreEvent.PORT_TLAST_7, } - -MemTilePortEventCodes = { MemTileEvent.PORT_IDLE_0, MemTileEvent.PORT_IDLE_1, - MemTileEvent.PORT_IDLE_2, MemTileEvent.PORT_IDLE_3, - MemTileEvent.PORT_IDLE_4, MemTileEvent.PORT_IDLE_5, - MemTileEvent.PORT_IDLE_6, MemTileEvent.PORT_IDLE_7, - MemTileEvent.PORT_RUNNING_0, MemTileEvent.PORT_RUNNING_1, - MemTileEvent.PORT_RUNNING_2, MemTileEvent.PORT_RUNNING_3, - MemTileEvent.PORT_RUNNING_4, MemTileEvent.PORT_RUNNING_5, - MemTileEvent.PORT_RUNNING_6, MemTileEvent.PORT_RUNNING_7, - MemTileEvent.PORT_STALLED_0, MemTileEvent.PORT_STALLED_1, - MemTileEvent.PORT_STALLED_2, MemTileEvent.PORT_STALLED_3, - MemTileEvent.PORT_STALLED_4, MemTileEvent.PORT_STALLED_5, - MemTileEvent.PORT_STALLED_6, MemTileEvent.PORT_STALLED_7, - MemTileEvent.PORT_TLAST_0, MemTileEvent.PORT_TLAST_1, - MemTileEvent.PORT_TLAST_2, MemTileEvent.PORT_TLAST_3, - MemTileEvent.PORT_TLAST_4, MemTileEvent.PORT_TLAST_5, - MemTileEvent.PORT_TLAST_6, MemTileEvent.PORT_TLAST_7, } - - -ShimTilePortEventCodes = { ShimTileEvent.PORT_IDLE_0, ShimTileEvent.PORT_IDLE_1, - ShimTileEvent.PORT_IDLE_2, ShimTileEvent.PORT_IDLE_3, - ShimTileEvent.PORT_IDLE_4, 
ShimTileEvent.PORT_IDLE_5, - ShimTileEvent.PORT_IDLE_6, ShimTileEvent.PORT_IDLE_7, - ShimTileEvent.PORT_RUNNING_0, ShimTileEvent.PORT_RUNNING_1, - ShimTileEvent.PORT_RUNNING_2, ShimTileEvent.PORT_RUNNING_3, - ShimTileEvent.PORT_RUNNING_4, ShimTileEvent.PORT_RUNNING_5, - ShimTileEvent.PORT_RUNNING_6, ShimTileEvent.PORT_RUNNING_7, - ShimTileEvent.PORT_STALLED_0, ShimTileEvent.PORT_STALLED_1, - ShimTileEvent.PORT_STALLED_2, ShimTileEvent.PORT_STALLED_3, - ShimTileEvent.PORT_STALLED_4, ShimTileEvent.PORT_STALLED_5, - ShimTileEvent.PORT_STALLED_6, ShimTileEvent.PORT_STALLED_7, - ShimTileEvent.PORT_TLAST_0, ShimTileEvent.PORT_TLAST_1, - ShimTileEvent.PORT_TLAST_2, ShimTileEvent.PORT_TLAST_3, - ShimTileEvent.PORT_TLAST_4, ShimTileEvent.PORT_TLAST_5, - ShimTileEvent.PORT_TLAST_6, ShimTileEvent.PORT_TLAST_7, } - -# fmt: on - - -# We use the packet type field in the packet header to help differentiate the tile -# that the packet came from. Since packet types don't inherently have meaning, we -# assign numerical values to each tile type: core, mem (for core), shimtilem, memtile -class PacketType(IntEnum): - CORE = 0 - MEM = 1 - SHIMTILE = 2 - MEMTILE = 3 - - -class PortEvent(GenericEvent): - def __init__(self, code, port_number, master=True): - # For backwards compatibility, allow integer as event - if isinstance(code, int): - code = CoreEvent(code) - assert code in PortEventCodes - # fmt: off - self.event_number = ( - 0 if code in { CoreEvent.PORT_IDLE_0, CoreEvent.PORT_RUNNING_0, - CoreEvent.PORT_STALLED_0, CoreEvent.PORT_TLAST_0 } - else 1 if code in { CoreEvent.PORT_IDLE_1, CoreEvent.PORT_RUNNING_1, - CoreEvent.PORT_STALLED_1, CoreEvent.PORT_TLAST_1, } - else 2 if code in { CoreEvent.PORT_IDLE_2, CoreEvent.PORT_RUNNING_2, - CoreEvent.PORT_STALLED_2, CoreEvent.PORT_TLAST_2 } - else 3 if code in { CoreEvent.PORT_IDLE_3, CoreEvent.PORT_RUNNING_3, - CoreEvent.PORT_STALLED_3, CoreEvent.PORT_TLAST_3 } - else 4 if code in { CoreEvent.PORT_IDLE_4, CoreEvent.PORT_RUNNING_4, - 
CoreEvent.PORT_STALLED_4, CoreEvent.PORT_TLAST_4 } - else 5 if code in { CoreEvent.PORT_IDLE_5, CoreEvent.PORT_RUNNING_5, - CoreEvent.PORT_STALLED_5, CoreEvent.PORT_TLAST_5 } - else 6 if code in { CoreEvent.PORT_IDLE_6, CoreEvent.PORT_RUNNING_6, - CoreEvent.PORT_STALLED_6, CoreEvent.PORT_TLAST_6 } - else 7 - ) - # fmt: on - self.port_number = port_number - self.master = master - super().__init__(code) - - def get_register_writes(self): - def master(port): - return port | (1 << 5) - - def slave(port): - return port - - # 0x3FF00: Stream switch event port selection 0 - # 0x3FF04: Stream switch event port selection 1 - address = 0x3FF00 if self.event_number < 4 else 0x3FF04 - value = master(self.port_number) if self.master else slave(self.port_number) - - value = (value & 0xFF) << 8 * (self.event_number % 4) - - ret = {0x3FF00: 0, 0x3FF04: 0} - ret[address] = value - - return ret - - -class MemTilePortEvent(GenericEvent): - def __init__(self, code, port_number, master=True): - # For backwards compatibility, allow integer as event - if isinstance(code, int): - code = MemTileEvent(code) - assert code in MemTilePortEventCodes - # fmt: off - self.event_number = ( - 0 if code in { MemTileEvent.PORT_IDLE_0, MemTileEvent.PORT_RUNNING_0, - MemTileEvent.PORT_STALLED_0, MemTileEvent.PORT_TLAST_0 } - else 1 if code in { MemTileEvent.PORT_IDLE_1, MemTileEvent.PORT_RUNNING_1, - MemTileEvent.PORT_STALLED_1, MemTileEvent.PORT_TLAST_1, } - else 2 if code in { MemTileEvent.PORT_IDLE_2, MemTileEvent.PORT_RUNNING_2, - MemTileEvent.PORT_STALLED_2, MemTileEvent.PORT_TLAST_2 } - else 3 if code in { MemTileEvent.PORT_IDLE_3, MemTileEvent.PORT_RUNNING_3, - MemTileEvent.PORT_STALLED_3, MemTileEvent.PORT_TLAST_3 } - else 4 if code in { MemTileEvent.PORT_IDLE_4, MemTileEvent.PORT_RUNNING_4, - MemTileEvent.PORT_STALLED_4, MemTileEvent.PORT_TLAST_4 } - else 5 if code in { MemTileEvent.PORT_IDLE_5, MemTileEvent.PORT_RUNNING_5, - MemTileEvent.PORT_STALLED_5, MemTileEvent.PORT_TLAST_5 } - else 6 if 
code in { MemTileEvent.PORT_IDLE_6, MemTileEvent.PORT_RUNNING_6, - MemTileEvent.PORT_STALLED_6, MemTileEvent.PORT_TLAST_6 } - else 7 - ) - # fmt: on - self.port_number = port_number - self.master = master - super().__init__(code) - - def get_register_writes(self): - def master(port): - return port | (1 << 5) - - def slave(port): - return port - - # 0x3FF00: Stream switch event port selection 0 - # 0x3FF04: Stream switch event port selection 1 - address = 0xB0F00 if self.event_number < 4 else 0xB0F04 - value = master(self.port_number) if self.master else slave(self.port_number) - - value = (value & 0xFF) << 8 * (self.event_number % 4) - - ret = {0xB0F00: 0, 0xB0F04: 0} - ret[address] = value - - return ret - - -class ShimTilePortEvent(GenericEvent): - def __init__(self, code, port_number, master=True): - # For backwards compatibility, allow integer as event - if isinstance(code, int): - code = ShimTileEvent(code) - assert code in ShimTilePortEventCodes - # fmt: off - self.event_number = ( - 0 if code in { ShimTileEvent.PORT_IDLE_0, ShimTileEvent.PORT_RUNNING_0, - ShimTileEvent.PORT_STALLED_0, ShimTileEvent.PORT_TLAST_0 } - else 1 if code in { ShimTileEvent.PORT_IDLE_1, ShimTileEvent.PORT_RUNNING_1, - ShimTileEvent.PORT_STALLED_1, ShimTileEvent.PORT_TLAST_1, } - else 2 if code in { ShimTileEvent.PORT_IDLE_2, ShimTileEvent.PORT_RUNNING_2, - ShimTileEvent.PORT_STALLED_2, ShimTileEvent.PORT_TLAST_2 } - else 3 if code in { ShimTileEvent.PORT_IDLE_3, ShimTileEvent.PORT_RUNNING_3, - ShimTileEvent.PORT_STALLED_3, ShimTileEvent.PORT_TLAST_3 } - else 4 if code in { ShimTileEvent.PORT_IDLE_4, ShimTileEvent.PORT_RUNNING_4, - ShimTileEvent.PORT_STALLED_4, ShimTileEvent.PORT_TLAST_4 } - else 5 if code in { ShimTileEvent.PORT_IDLE_5, ShimTileEvent.PORT_RUNNING_5, - ShimTileEvent.PORT_STALLED_5, ShimTileEvent.PORT_TLAST_5 } - else 6 if code in { ShimTileEvent.PORT_IDLE_6, ShimTileEvent.PORT_RUNNING_6, - ShimTileEvent.PORT_STALLED_6, ShimTileEvent.PORT_TLAST_6 } - else 7 - ) - # 
fmt: on - self.port_number = port_number - self.master = master - super().__init__(code) - - def get_register_writes(self): - def master(port): - return port | (1 << 5) - - def slave(port): - return port - - # 0x3FF00: Stream switch event port selection 0 - # 0x3FF04: Stream switch event port selection 1 - address = 0x3FF00 if self.event_number < 4 else 0x3FF04 - value = master(self.port_number) if self.master else slave(self.port_number) - - value = (value & 0xFF) << 8 * (self.event_number % 4) - - ret = {0x3FF00: 0, 0x3FF04: 0} - ret[address] = value - - return ret - - -# TODO Should be expanded to be check for valid shim tile based on device -# Checks if tile is a shim tile (for now, assumes row 0 is only shim tile, true for aie1/aie2/aie2p) -def isShimTile(tile): - return int(tile.row) == 0 - - -# TODO Should be expanded to be check for valid shim tile based on device -# Checks if tile is a Mem tile (for now, assumes row 1 is only mem tile, true for aie1/aie2/aie2p) -def isMemTile(tile): - return int(tile.row) == 1 - - -# TODO Should be expanded to be check for valid shim tile based on device -# Checks if tile is a Core tile (for now, assumes any row > 1 is core tile, true for aie1/aie2/aie2p) -# though we're not checking max row value so this isn't 100% accurate -def isCoreTile(tile): - return int(tile.row) > 1 +from .events import ( + GenericEvent, + PortEvent, + CoreEvent, + MemEvent, + ShimTileEvent, + MemTileEvent, + MemTilePortEvent, + ShimTilePortEvent, + PacketType, + PortEventCodes, +) +from .utils import pack4bytes # Globally defined constants @@ -273,14 +30,6 @@ def isCoreTile(tile): direction_mm2s = 1 -def pack4bytes(b3, b2, b1, b0): - w = (b3 & 0xFF) << 24 - w |= (b2 & 0xFF) << 16 - w |= (b1 & 0xFF) << 8 - w |= (b0 & 0xFF) << 0 - return w - - # This function configures the a tile's memory trace unit given a set of configurations as described below: # # function arguments: @@ -765,11 +514,13 @@ def configure_timer_ctrl_shimtile_aie2(tile, event): # 
* `num` - broadcaast number we want to broadcast on # * `event` - the triggering broadcast event def configure_broadcast_core_aie2(tile, num, event): - if isShimTile(tile): + tm = get_target_model(tile.parent.attributes["device"]) + col, row = int(tile.col), int(tile.row) + if tm.is_shim_noc_or_pl_tile(col, row): base_addr = 0x34010 - elif isMemTile(tile): + elif tm.is_mem_tile(col, row): base_addr = 0x94010 - elif isCoreTile(tile): + elif tm.is_core_tile(col, row): base_addr = 0x34010 else: raise ValueError( @@ -1393,10 +1144,14 @@ def configure_packet_tracing_aie2( else: p_id = i + 1 - if isShimTile(tiles_to_trace[i]): - if tiles_to_trace[i] == shim: + tile = tiles_to_trace[i] + tm = get_target_model(tile.parent.attributes["device"]) + col, row = int(tile.col), int(tile.row) + + if tm.is_shim_noc_or_pl_tile(col, row): + if tile == shim: configure_shimtile_tracing_aie2( - tile=tiles_to_trace[i], + tile=tile, start=start_user_event, stop=stop_user_event, events=shimtile_events, @@ -1404,10 +1159,10 @@ def configure_packet_tracing_aie2( packet_id=p_id, packet_type=PacketType.SHIMTILE, ) - configure_timer_ctrl_shimtile_aie2(tiles_to_trace[i], start_user_event) + configure_timer_ctrl_shimtile_aie2(tile, start_user_event) else: configure_shimtile_tracing_aie2( - tile=tiles_to_trace[i], + tile=tile, start=start_shimtile_broadcast_event, stop=stop_shimtile_broadcast_event, events=shimtile_events, @@ -1415,12 +1170,10 @@ def configure_packet_tracing_aie2( packet_id=p_id, packet_type=PacketType.SHIMTILE, ) - configure_timer_ctrl_shimtile_aie2( - tiles_to_trace[i], start_shimtile_broadcast_event - ) - elif isMemTile(tiles_to_trace[i]): + configure_timer_ctrl_shimtile_aie2(tile, start_shimtile_broadcast_event) + elif tm.is_mem_tile(col, row): configure_memtile_tracing_aie2( - tile=tiles_to_trace[i], + tile=tile, start=start_memtile_broadcast_event, stop=stop_memtile_broadcast_event, events=memtile_events, @@ -1428,13 +1181,11 @@ def configure_packet_tracing_aie2( 
packet_id=p_id, packet_type=PacketType.MEMTILE, ) - configure_timer_ctrl_memtile_aie2( - tiles_to_trace[i], start_memtile_broadcast_event - ) - elif isCoreTile(tiles_to_trace[i]): - if tiles_to_trace[i] not in exist_core_tile_traces: + configure_timer_ctrl_memtile_aie2(tile, start_memtile_broadcast_event) + elif tm.is_core_tile(col, row): + if tile not in exist_core_tile_traces: configure_coretile_tracing_aie2( - tile=tiles_to_trace[i], + tile=tile, start=start_core_broadcast_event, stop=stop_core_broadcast_event, events=coretile_events, @@ -1442,13 +1193,11 @@ def configure_packet_tracing_aie2( packet_id=p_id, packet_type=PacketType.CORE, ) - configure_timer_ctrl_coretile_aie2( - tiles_to_trace[i], start_core_broadcast_event - ) - exist_core_tile_traces.append(tiles_to_trace[i]) + configure_timer_ctrl_coretile_aie2(tile, start_core_broadcast_event) + exist_core_tile_traces.append(tile) else: configure_coremem_tracing_aie2( - tile=tiles_to_trace[i], + tile=tile, start=start_core_mem_broadcast_event, stop=stop_core_mem_broadcast_event, events=coremem_events, @@ -1456,15 +1205,13 @@ def configure_packet_tracing_aie2( packet_id=p_id, packet_type=PacketType.MEM, ) - configure_timer_ctrl_coremem_aie2( - tiles_to_trace[i], start_core_mem_broadcast_event - ) + configure_timer_ctrl_coremem_aie2(tile, start_core_mem_broadcast_event) else: raise ValueError( "Invalid tile(" - + str(tiles_to_trace[i].col) + + str(tile.col) + "," - + str(tiles_to_trace[i].row) + + str(tile.row) + "). Check tile coordinates are within a valid range." 
) configure_shimtile_dma_aie2( @@ -1472,7 +1219,7 @@ def configure_packet_tracing_aie2( channel=1, bd_id=15, ddr_id=ddr_id, - size=trace_size, + size=trace_size // 4, # convert to words offset=trace_offset, enable_token=enable_token, shim_burst_length=shim_burst_length, diff --git a/python/utils/trace/utils.py b/python/utils/trace/utils.py new file mode 100644 index 00000000000..e51c79b2547 --- /dev/null +++ b/python/utils/trace/utils.py @@ -0,0 +1,391 @@ +# Copyright (C) 2024-2026 Advanced Micro Devices, Inc. All rights reserved. +# SPDX-License-Identifier: MIT + +# from CppHeaderParser import CppHeader +import numpy as np +import json +import re +import os +import sys +from .events import NUM_TRACE_TYPES + + +# checks # of bits. Odd number returns a 1. Even returns 0. +def parity(x): + return x.bit_count() & 1 + + +def extract_tile(data): + col = (data >> 21) & 0x7F + row = (data >> 16) & 0x1F + pkt_type = (data >> 12) & 0x3 + pkt_id = data & 0x1F + return (col, row, pkt_type, pkt_id) + + +def pack4bytes(b3, b2, b1, b0): + w = (b3 & 0xFF) << 24 + w |= (b2 & 0xFF) << 16 + w |= (b1 & 0xFF) << 8 + w |= (b0 & 0xFF) << 0 + return w + + +def create_ctrl_pkt( + operation, + beats, + addr, + ctrl_pkt_read_id=28, # global id used for all ctrl packet reads + # WARNING: this needs to match the packet id used in packetflow/.py +): + header = (ctrl_pkt_read_id << 24) | (operation << 22) | (beats << 20) | addr + header |= (0x1 ^ parity(header)) << 31 + return header + + +def get_kernel_code(test: dict, solutions_path: str = None) -> str: + """Fetch the kernel code from the provided solution path, if none provided default + to canonical solution.""" + if not solutions_path: + return test["prompt"] + test["canonical_solution"] + + with open( + os.path.join(solutions_path, f"{test['kernel_name']}.json"), "r" + ) as sol_file: + solution = json.load(sol_file) + if not solution.get("code"): + print(f"No code available in {solutions_path} for {test['kernel_name']}") + return None + + 
srccode = solution["code"] + + # if gpt decides to be too helpful and adds a main()... remove it + srccode = re.sub( + r"int\s+main\s*\([^)]*\)\s*{[^{}]*({[^{}]*}[^{}]*)*}", + "", + srccode, + flags=re.DOTALL, + ) + + # cppheaderparser will complain if we don't remove trailing comments + srccode = srccode.split('// extern "C"')[0] + + return srccode + + +def extract_buffers(test): + """Specific helper for the AIEval dataset - parses the test dictionary and returns + input buffers, output buffers and RTPs as separate lists. + """ + input_buffers = [] + for x in test["test_vectors"]["inputs"]: + array, dtype = list(x.values()) + input_buffers.append(np.array(array, dtype=dtype)) + + output_buffers = [] + for x in test["test_vectors"]["outputs"]: + array, dtype = list(x.values()) + output_buffers.append(np.array(array, dtype=dtype)) + + rtps = [] + if test["test_vectors"].get("rtps") != None: + for rtp in test["test_vectors"]["rtps"]: + array, dtype = rtp.values() + rtps.append(np.array(array, dtype=dtype)) + # rtp_names.append(list(rtp.keys())[0]) + + return input_buffers, output_buffers, rtps + + +def get_cycles(trace_path): + """This helper function should only be used to extract cycle counts + from NPUEval trace files where the expectation is to have exactly 1 of + each event0 and event1. + """ + with open(trace_path, "r") as f: + data = json.load(f) + + event0 = [] + event1 = [] + try: + for x in data: + if (x["name"] == "INSTR_EVENT_0") and (x["ph"] == "B"): + event0.append(x["ts"]) + tmp = x["ts"] + # print("event0 found at "+str(event0[0])) + + if x["name"] == "INSTR_EVENT_1" and x["ph"] == "B": + event1.append(x["ts"]) + # print("event1 found at "+str(event1[0])) + + return event1[0] - event0[0] + except: + return np.inf + + +def get_cycles_summary(trace_path): + """This helper function is used to extract cycle counts from a trace json + file and returns an array of cycles between pairs of event0 and event1. 
+ This always assumes each event0 is followed by an event1 and ignores + extra event0 and event1's. + """ + with open(trace_path, "r") as f: + data = json.load(f) + + try: + deltas = [] + in_kernel = [] + event0 = [] + for x in data: + if x["name"] == "process_name": + deltas.append([x["args"]["name"]]) + in_kernel.append(False) + event0.append(0) + + for x in data: + idx = int(x["pid"]) + if (x["name"] == "INSTR_EVENT_0") and (x["ph"] == "B"): + if in_kernel[idx] == False: + event0[idx] = x["ts"] + # print("event0 found at "+str(event0)) + in_kernel[idx] = True + + if x["name"] == "INSTR_EVENT_1" and x["ph"] == "B": + if in_kernel[idx] == True: + # print("event1 found at "+str(x['ts'])) + deltas[idx].append(x["ts"] - event0[idx]) + in_kernel[idx] = False + + return deltas + except Exception as e: + print("Exception found", e) + return np.inf + + +def get_vector_time(trace): + """This function extracts the total time spent on the vectorized unit + from an NPUEval AIE trace (this must have exactly 1 event0 and 1 event1 + sandwiching the kernel call). 
+ """ + with open(trace, "r") as f: + data = json.load(f) + + start, end = None, None + + # find start and end + for x in data: + if (x["name"] == "INSTR_EVENT_0") and (x["ph"] == "B"): + start = x["ts"] + if x["name"] == "INSTR_EVENT_1" and x["ph"] == "B": + end = x["ts"] + + if not start or not end: + return 0 + + total_duration = 0 + stack = [] + + for event in data: + if event["name"] == "INSTR_VECTOR": + if event["ts"] < start: + continue + + if event["ts"] > end: + continue + + if event["ph"] == "B": + stack.append(event) + elif event["ph"] == "E" and stack: + # Get matching begin event + begin_event = stack.pop() + # Calculate duration for this pair + duration = event["ts"] - begin_event["ts"] + total_duration += duration + + return total_duration / (end - start) + + +def check_odd_word_parity(word): + val = 0 + for i in range(32): + val = val ^ ((word >> i) & 0x1) + return val == 1 + + +def parse_pkt_hdr_in_stream(word): + hdr = dict() + w = int(word) + hdr["valid"] = check_odd_word_parity(w) + # TODO can we assume non used fields must be 0 to rule out other data packets? + # what about bit[5:10]? + if (((w >> 5) & 0x7F) != 0) or (((w >> 19) & 0x1) != 0) or (((w >> 28) & 0x7) != 0): + hdr["valid"] = False + else: + # TODO Do we need to check for valid row/col for given device? 
+ col, row, pkt_type, pkt_id = extract_tile(w) + hdr["col"] = col + hdr["row"] = row + hdr["type"] = pkt_type + hdr["id"] = pkt_id + return hdr + + +def trace_pkts_de_interleave(word_stream): + trace_pkts_sorted = list() + for t in range(NUM_TRACE_TYPES): + trace_pkts_sorted.append(dict()) + + curr_pkt_type = 0 + curr_loc = "" + curr_vld = False # only used in the beginning + + for i in range(len(word_stream)): + if word_stream[i] == "": + break # TODO Assumes a blank line is the last line + if (i % 8) == 0: + pkt_hdr = parse_pkt_hdr_in_stream(int(word_stream[i], 16)) + if pkt_hdr["valid"]: + curr_loc = str(pkt_hdr["row"]) + "," + str(pkt_hdr["col"]) + valid_type_found = False + for tt in range(NUM_TRACE_TYPES): + if pkt_hdr["type"] == tt: + curr_pkt_type = tt + if trace_pkts_sorted[tt].get(curr_loc) == None: + trace_pkts_sorted[tt][curr_loc] = list() + valid_type_found = True + if not valid_type_found: + sys.exit("Error: Invalid packet type") + curr_vld = True + else: + if curr_vld: + trace_pkts_sorted[curr_pkt_type][curr_loc].append(word_stream[i]) + return trace_pkts_sorted + + +def convert_to_byte_stream(toks_list): + byte_stream_list = list() + for l in toks_list: + byte_stream_dict = dict() + for loc, stream in l.items(): + byte_stream_dict[loc] = list() + f = ["", "a5a5a5a5"] + toks = [t for t in stream if not t in f] + events = [int(t, 16) for t in toks] + for event in events: + for top in range(4): + byte = 3 - top + opcode = event >> (byte * 8) & 0xFF + byte_stream_dict[loc].append(opcode) + byte_stream_list.append(byte_stream_dict) + return byte_stream_list + + +def convert_to_commands(byte_stream_list, zero=True): + commands = list() + for t in range(NUM_TRACE_TYPES): + commands.append(dict()) + + for t in range(NUM_TRACE_TYPES): + for key, byte_stream in byte_stream_list[t].items(): + cursor = 0 + commands[t][key] = list() + try: + while True: + if (byte_stream[cursor] & 0b11111011) == 0b11110000: + com = {"type": "Start", "timer_value": 0} + if not 
zero: + for i in range(7): + com["timer_value"] += (byte_stream[cursor + i + 1]) * ( + 256 ** (6 - i) + ) + commands[t][key].append(com) + cursor = cursor + 8 + if (byte_stream[cursor] & 0b11111100) == 0b11011100: + cursor = cursor + 4 + if (byte_stream[cursor] & 0b10000000) == 0b00000000: + com = {"type": "Single0"} + com["event"] = (byte_stream[cursor]) >> 4 & 0b111 + com["cycles"] = (byte_stream[cursor]) & 0b1111 + commands[t][key].append(com) + cursor = cursor + 1 + if (byte_stream[cursor] & 0b11100000) == 0b10000000: + com = {"type": "Single1"} + com["event"] = (byte_stream[cursor]) >> 2 & 0b111 + com["cycles"] = ((byte_stream[cursor]) & 0b11) * 256 + com["cycles"] += byte_stream[cursor + 1] + commands[t][key].append(com) + cursor = cursor + 2 + if (byte_stream[cursor] & 0b11100000) == 0b10100000: + com = {"type": "Single2"} + com["event"] = (byte_stream[cursor]) >> 2 & 0b111 + com["cycles"] = ((byte_stream[cursor]) & 0b11) * 256 * 256 + com["cycles"] += byte_stream[cursor + 1] * 256 + com["cycles"] += byte_stream[cursor + 2] + commands[t][key].append(com) + cursor = cursor + 3 + if (byte_stream[cursor] & 0b11110000) == 0b11000000: + com = {"type": "Multiple0"} + com["cycles"] = byte_stream[cursor + 1] & 0b1111 + events = (byte_stream[cursor] & 0b1111) << 4 + events = events + (byte_stream[cursor + 1] >> 4) + for i in range(0, 8): + e = (events >> i) & 0b1 + if e: + com["event" + str(i)] = i + commands[t][key].append(com) + cursor = cursor + 2 + if (byte_stream[cursor] & 0b11111100) == 0b11010000: + com = {"type": "Multiple1"} + cycles = (byte_stream[cursor + 1] & 0b11) << 8 + com["cycles"] = cycles + (byte_stream[cursor + 2]) + events = (byte_stream[cursor] & 0b11) << 6 + events = events + (byte_stream[cursor + 1] >> 2) + for i in range(0, 8): + e = (events >> i) & 0b1 + if e: + com["event" + str(i)] = i + commands[t][key].append(com) + cursor = cursor + 3 + if (byte_stream[cursor] & 0b11111100) == 0b11010100: + com = {"type": "Multiple2"} + cycles = 
(byte_stream[cursor + 1] & 0b11) << 16 + cycles = cycles + ((byte_stream[cursor + 2]) << 8) + com["cycles"] = cycles + (byte_stream[cursor + 3]) + events = (byte_stream[cursor] & 0b11) << 6 + events = events + (byte_stream[cursor + 1] >> 2) + for i in range(0, 8): + e = (events >> i) & 0b1 + if e: + com["event" + str(i)] = i + commands[t][key].append(com) + cursor = cursor + 4 + if (byte_stream[cursor] & 0b11110000) == 0b11100000: + com = {"type": "Repeat0"} + com["repeats"] = (byte_stream[cursor]) & 0b1111 + commands[t][key].append(com) + cursor = cursor + 1 + if (byte_stream[cursor] & 0b11111100) == 0b11011000: + com = {"type": "Repeat1"} + com["repeats"] = ((byte_stream[cursor]) & 0b11) * 256 + com["repeats"] += byte_stream[cursor + 1] + commands[t][key].append(com) + cursor = cursor + 2 + if (byte_stream[cursor] & 0b11111111) == 0b11111110: + cursor = cursor + 1 + if (byte_stream[cursor] & 0b11111111) == 0b11111111: + com = {"type": "Event_Sync"} + commands[t][key].append(com) + cursor = cursor + 1 + except IndexError: + pass + + return commands + + +def trim_trace_pkts(trace_pkts): + for i in range(len(trace_pkts)): + if trace_pkts[i] == "fefefefe" or trace_pkts[i] == "FEFEFEFE": + if i + 2 < len(trace_pkts): + if trace_pkts[i + 1] == "00000000" and trace_pkts[i + 2] == "00000000": + return trace_pkts[0 : i + 1] + return trace_pkts diff --git a/python/utils/trace_events/__init__.py b/python/utils/trace_events/__init__.py deleted file mode 100644 index 6c3f3554ab4..00000000000 --- a/python/utils/trace_events/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2025 Advanced Micro Devices, Inc. or its affiliates -# -"""Trace events enumerations for AIE architectures. 
- -Available modules: -- aie: AIE1 architecture events -- aie2: AIE2/AIEML architecture events -- aie2p: AIE2P architecture events -""" diff --git a/python/utils/trace_events_enum.py b/python/utils/trace_events_enum.py deleted file mode 100644 index e2c1e23d9b6..00000000000 --- a/python/utils/trace_events_enum.py +++ /dev/null @@ -1,561 +0,0 @@ -# Enumeration of AIE2 trace events -# Automatically generated from utils/generate_events_enum.py - -from enum import Enum - - -class CoreEvent(Enum): - NONE = 0 - TRUE = 1 - GROUP_0 = 2 - TIMER_SYNC = 3 - TIMER_VALUE_REACHED = 4 - PERF_CNT_0 = 5 - PERF_CNT_1 = 6 - PERF_CNT_2 = 7 - PERF_CNT_3 = 8 - COMBO_EVENT_0 = 9 - COMBO_EVENT_1 = 10 - COMBO_EVENT_2 = 11 - COMBO_EVENT_3 = 12 - EDGE_DETECTION_EVENT_0 = 13 - EDGE_DETECTION_EVENT_1 = 14 - GROUP_PC_EVENT = 15 - PC_0 = 16 - PC_1 = 17 - PC_2 = 18 - PC_3 = 19 - PC_RANGE_0_1 = 20 - PC_RANGE_2_3 = 21 - GROUP_STALL = 22 - MEMORY_STALL = 23 - STREAM_STALL = 24 - CASCADE_STALL = 25 - LOCK_STALL = 26 - DEBUG_HALTED = 27 - ACTIVE = 28 - DISABLED = 29 - ECC_ERROR_STALL = 30 - ECC_SCRUBBING_STALL = 31 - GROUP_PROGRAM_FLOW = 32 - INSTR_EVENT_0 = 33 - INSTR_EVENT_1 = 34 - INSTR_CALL = 35 - INSTR_RETURN = 36 - INSTR_VECTOR = 37 - INSTR_LOAD = 38 - INSTR_STORE = 39 - INSTR_STREAM_GET = 40 - INSTR_STREAM_PUT = 41 - INSTR_CASCADE_GET = 42 - INSTR_CASCADE_PUT = 43 - INSTR_LOCK_ACQUIRE_REQ = 44 - INSTR_LOCK_RELEASE_REQ = 45 - GROUP_ERRORS_0 = 46 - GROUP_ERRORS_1 = 47 - SRS_OVERFLOW = 48 - UPS_OVERFLOW = 49 - FP_HUGE = 50 - INT_FP_0 = 51 - FP_INVALID = 52 - FP_INF = 53 - rsvd_54 = 54 - PM_REG_ACCESS_FAILURE = 55 - STREAM_PKT_PARITY_ERROR = 56 - CONTROL_PKT_ERROR = 57 - AXI_MM_SLAVE_ERROR = 58 - INSTR_DECOMPRSN_ERROR = 59 - DM_ADDRESS_OUT_OF_RANGE = 60 - PM_ECC_ERROR_SCRUB_CORRECTED = 61 - PM_ECC_ERROR_SCRUB_2BIT = 62 - PM_ECC_ERROR_1BIT = 63 - PM_ECC_ERROR_2BIT = 64 - PM_ADDRESS_OUT_OF_RANGE = 65 - DM_ACCESS_TO_UNAVAILABLE = 66 - LOCK_ACCESS_TO_UNAVAILABLE = 67 - INSTR_WARNING = 68 - INSTR_ERROR 
= 69 - DECOMPRESSION_UNDERFLOW = 70 - STREAM_SWITCH_PORT_PARITY_ERROR = 71 - PROCESSOR_BUS_ERROR = 72 - GROUP_STREAM_SWITCH = 73 - PORT_IDLE_0 = 74 - PORT_RUNNING_0 = 75 - PORT_STALLED_0 = 76 - PORT_TLAST_0 = 77 - PORT_IDLE_1 = 78 - PORT_RUNNING_1 = 79 - PORT_STALLED_1 = 80 - PORT_TLAST_1 = 81 - PORT_IDLE_2 = 82 - PORT_RUNNING_2 = 83 - PORT_STALLED_2 = 84 - PORT_TLAST_2 = 85 - PORT_IDLE_3 = 86 - PORT_RUNNING_3 = 87 - PORT_STALLED_3 = 88 - PORT_TLAST_3 = 89 - PORT_IDLE_4 = 90 - PORT_RUNNING_4 = 91 - PORT_STALLED_4 = 92 - PORT_TLAST_4 = 93 - PORT_IDLE_5 = 94 - PORT_RUNNING_5 = 95 - PORT_STALLED_5 = 96 - PORT_TLAST_5 = 97 - PORT_IDLE_6 = 98 - PORT_RUNNING_6 = 99 - PORT_STALLED_6 = 100 - PORT_TLAST_6 = 101 - PORT_IDLE_7 = 102 - PORT_RUNNING_7 = 103 - PORT_STALLED_7 = 104 - PORT_TLAST_7 = 105 - GROUP_BROADCAST = 106 - BROADCAST_0 = 107 - BROADCAST_1 = 108 - BROADCAST_2 = 109 - BROADCAST_3 = 110 - BROADCAST_4 = 111 - BROADCAST_5 = 112 - BROADCAST_6 = 113 - BROADCAST_7 = 114 - BROADCAST_8 = 115 - BROADCAST_9 = 116 - BROADCAST_10 = 117 - BROADCAST_11 = 118 - BROADCAST_12 = 119 - BROADCAST_13 = 120 - BROADCAST_14 = 121 - BROADCAST_15 = 122 - GROUP_USER_EVENT = 123 - USER_EVENT_0 = 124 - USER_EVENT_1 = 125 - USER_EVENT_2 = 126 - USER_EVENT_3 = 127 - - -class MemEvent(Enum): - NONE = 0 - TRUE = 1 - GROUP_0 = 2 - TIMER_SYNC = 3 - TIMER_VALUE_REACHED = 4 - PERF_CNT_0 = 5 - PERF_CNT_1 = 6 - COMBO_EVENT_0 = 7 - COMBO_EVENT_1 = 8 - COMBO_EVENT_2 = 9 - COMBO_EVENT_3 = 10 - EDGE_DETECTION_EVENT_0 = 11 - EDGE_DETECTION_EVENT_1 = 12 - rsvd_13 = 13 - rsvd_14 = 14 - GROUP_WATCHPOINT = 15 - WATCHPOINT_0 = 16 - WATCHPOINT_1 = 17 - GROUP_DMA_ACTIVITY = 18 - DMA_S2MM_0_START_TASK = 19 - DMA_S2MM_1_START_TASK = 20 - DMA_MM2S_0_START_TASK = 21 - DMA_MM2S_1_START_TASK = 22 - DMA_S2MM_0_FINISHED_BD = 23 - DMA_S2MM_1_FINISHED_BD = 24 - DMA_MM2S_0_FINISHED_BD = 25 - DMA_MM2S_1_FINISHED_BD = 26 - DMA_S2MM_0_FINISHED_TASK = 27 - DMA_S2MM_1_FINISHED_TASK = 28 - DMA_MM2S_0_FINISHED_TASK = 29 - 
DMA_MM2S_1_FINISHED_TASK = 30 - DMA_S2MM_0_STALLED_LOCK = 31 - DMA_S2MM_1_STALLED_LOCK = 32 - DMA_MM2S_0_STALLED_LOCK = 33 - DMA_MM2S_1_STALLED_LOCK = 34 - DMA_S2MM_0_STREAM_STARVATION = 35 - DMA_S2MM_1_STREAM_STARVATION = 36 - DMA_MM2S_0_STREAM_BACKPRESSURE = 37 - DMA_MM2S_1_STREAM_BACKPRESSURE = 38 - DMA_S2MM_0_MEMORY_BACKPRESSURE = 39 - DMA_S2MM_1_MEMORY_BACKPRESSURE = 40 - DMA_MM2S_0_MEMORY_STARVATION = 41 - DMA_MM2S_1_MEMORY_STARVATION = 42 - GROUP_LOCK = 43 - LOCK_SEL0_ACQ_EQ = 44 - LOCK_SEL0_ACQ_GE = 45 - LOCK_0_REL = 46 - LOCK_SEL0_EQUAL_TO_VALUE = 47 - LOCK_SEL1_ACQ_EQ = 48 - LOCK_SEL1_ACQ_GE = 49 - LOCK_1_REL = 50 - LOCK_SEL1_EQUAL_TO_VALUE = 51 - LOCK_SEL2_ACQ_EQ = 52 - LOCK_SEL2_ACQ_GE = 53 - LOCK_2_REL = 54 - LOCK_SEL2_EQUAL_TO_VALUE = 55 - LOCK_SEL3_ACQ_EQ = 56 - LOCK_SEL3_ACQ_GE = 57 - LOCK_3_REL = 58 - LOCK_SEL3_EQUAL_TO_VALUE = 59 - LOCK_SEL4_ACQ_EQ = 60 - LOCK_SEL4_ACQ_GE = 61 - LOCK_4_REL = 62 - LOCK_SEL4_EQUAL_TO_VALUE = 63 - LOCK_SEL5_ACQ_EQ = 64 - LOCK_SEL5_ACQ_GE = 65 - LOCK_5_REL = 66 - LOCK_SEL5_EQUAL_TO_VALUE = 67 - LOCK_SEL6_ACQ_EQ = 68 - LOCK_SEL6_ACQ_GE = 69 - LOCK_6_REL = 70 - LOCK_SEL6_EQUAL_TO_VALUE = 71 - LOCK_SEL7_ACQ_EQ = 72 - LOCK_SEL7_ACQ_GE = 73 - LOCK_7_REL = 74 - LOCK_SEL7_EQUAL_TO_VALUE = 75 - GROUP_MEMORY_CONFLICT = 76 - CONFLICT_DM_BANK_0 = 77 - CONFLICT_DM_BANK_1 = 78 - CONFLICT_DM_BANK_2 = 79 - CONFLICT_DM_BANK_3 = 80 - CONFLICT_DM_BANK_4 = 81 - CONFLICT_DM_BANK_5 = 82 - CONFLICT_DM_BANK_6 = 83 - CONFLICT_DM_BANK_7 = 84 - rsvd_85 = 85 - GROUP_ERRORS = 86 - DM_ECC_ERROR_SCRUB_CORRECTED = 87 - DM_ECC_ERROR_SCRUB_2BIT = 88 - DM_ECC_ERROR_1BIT = 89 - DM_ECC_ERROR_2BIT = 90 - DM_PARITY_ERROR_BANK_2 = 91 - DM_PARITY_ERROR_BANK_3 = 92 - DM_PARITY_ERROR_BANK_4 = 93 - DM_PARITY_ERROR_BANK_5 = 94 - DM_PARITY_ERROR_BANK_6 = 95 - DM_PARITY_ERROR_BANK_7 = 96 - DMA_S2MM_0_ERROR = 97 - DMA_S2MM_1_ERROR = 98 - DMA_MM2S_0_ERROR = 99 - DMA_MM2S_1_ERROR = 100 - LOCK_ERROR = 101 - DMA_TASK_TOKEN_STALL = 102 - rsvd_103 = 103 - rsvd_104 = 104 
- rsvd_105 = 105 - GROUP_BROADCAST = 106 - BROADCAST_0 = 107 - BROADCAST_1 = 108 - BROADCAST_2 = 109 - BROADCAST_3 = 110 - BROADCAST_4 = 111 - BROADCAST_5 = 112 - BROADCAST_6 = 113 - BROADCAST_7 = 114 - BROADCAST_8 = 115 - BROADCAST_9 = 116 - BROADCAST_10 = 117 - BROADCAST_11 = 118 - BROADCAST_12 = 119 - BROADCAST_13 = 120 - BROADCAST_14 = 121 - BROADCAST_15 = 122 - GROUP_USER_EVENT = 123 - USER_EVENT_0 = 124 - USER_EVENT_1 = 125 - USER_EVENT_2 = 126 - USER_EVENT_3 = 127 - - -class ShimTileEvent(Enum): - NONE = 0 - TRUE = 1 - GROUP_0 = 2 - TIMER_SYNC = 3 - TIMER_VALUE_REACHED = 4 - PERF_CNT_0 = 5 - PERF_CNT_1 = 6 - COMBO_EVENT_0 = 7 - COMBO_EVENT_1 = 8 - COMBO_EVENT_2 = 9 - COMBO_EVENT_3 = 10 - EDGE_DETECTION_EVENT_0 = 11 - EDGE_DETECTION_EVENT_1 = 12 - GROUP_DMA_ACTIVITY = 13 - DMA_S2MM_0_START_TASK = 14 - DMA_S2MM_1_START_TASK = 15 - DMA_MM2S_0_START_TASK = 16 - DMA_MM2S_1_START_TASK = 17 - DMA_S2MM_0_FINISHED_BD = 18 - DMA_S2MM_1_FINISHED_BD = 19 - DMA_MM2S_0_FINISHED_BD = 20 - DMA_MM2S_1_FINISHED_BD = 21 - DMA_S2MM_0_FINISHED_TASK = 22 - DMA_S2MM_1_FINISHED_TASK = 23 - DMA_MM2S_0_FINISHED_TASK = 24 - DMA_MM2S_1_FINISHED_TASK = 25 - DMA_S2MM_0_STALLED_LOCK = 26 - DMA_S2MM_1_STALLED_LOCK = 27 - DMA_MM2S_0_STALLED_LOCK = 28 - DMA_MM2S_1_STALLED_LOCK = 29 - DMA_S2MM_0_STREAM_STARVATION = 30 - DMA_S2MM_1_STREAM_STARVATION = 31 - DMA_MM2S_0_STREAM_BACKPRESSURE = 32 - DMA_MM2S_1_STREAM_BACKPRESSURE = 33 - DMA_S2MM_0_MEMORY_BACKPRESSURE = 34 - DMA_S2MM_1_MEMORY_BACKPRESSURE = 35 - DMA_MM2S_0_MEMORY_STARVATION = 36 - DMA_MM2S_1_MEMORY_STARVATION = 37 - GROUP_LOCK = 38 - LOCK_0_ACQ_EQ = 39 - LOCK_0_ACQ_GE = 40 - LOCK_0_REL = 41 - LOCK_0_EQUAL_TO_VALUE = 42 - LOCK_1_ACQ_EQ = 43 - LOCK_1_ACQ_GE = 44 - LOCK_1_REL = 45 - LOCK_1_EQUAL_TO_VALUE = 46 - LOCK_2_ACQ_EQ = 47 - LOCK_2_ACQ_GE = 48 - LOCK_2_REL = 49 - LOCK_2_EQUAL_TO_VALUE = 50 - LOCK_3_ACQ_EQ = 51 - LOCK_3_ACQ_GE = 52 - LOCK_3_REL = 53 - LOCK_3_EQUAL_TO_VALUE = 54 - LOCK_4_ACQ_EQ = 55 - LOCK_4_ACQ_GE = 56 - 
LOCK_4_REL = 57 - LOCK_4_EQUAL_TO_VALUE = 58 - LOCK_5_ACQ_EQ = 59 - LOCK_5_ACQ_GE = 60 - LOCK_5_REL = 61 - LOCK_5_EQUAL_TO_VALUE = 62 - GROUP_ERRORS = 63 - AXI_MM_SLAVE_ERROR = 64 - CONTROL_PKT_ERROR = 65 - STREAM_SWITCH_PARITY_ERROR = 66 - AXI_MM_DECODE_NSU_ERROR = 67 - AXI_MM_SLAVE_NSU_ERROR = 68 - AXI_MM_UNSUPPORTED_TRAFFIC = 69 - AXI_MM_UNSECURE_ACCESS_IN_SECURE_MODE = 70 - AXI_MM_BYTE_STROBE_ERROR = 71 - DMA_S2MM_ERROR = 72 - DMA_MM2S_ERROR = 73 - LOCK_ERROR = 74 - DMA_TASK_TOKEN_STALL = 75 - GROUP_STREAM_SWITCH = 76 - PORT_IDLE_0 = 77 - PORT_RUNNING_0 = 78 - PORT_STALLED_0 = 79 - PORT_TLAST_0 = 80 - PORT_IDLE_1 = 81 - PORT_RUNNING_1 = 82 - PORT_STALLED_1 = 83 - PORT_TLAST_1 = 84 - PORT_IDLE_2 = 85 - PORT_RUNNING_2 = 86 - PORT_STALLED_2 = 87 - PORT_TLAST_2 = 88 - PORT_IDLE_3 = 89 - PORT_RUNNING_3 = 90 - PORT_STALLED_3 = 91 - PORT_TLAST_3 = 92 - PORT_IDLE_4 = 93 - PORT_RUNNING_4 = 94 - PORT_STALLED_4 = 95 - PORT_TLAST_4 = 96 - PORT_IDLE_5 = 97 - PORT_RUNNING_5 = 98 - PORT_STALLED_5 = 99 - PORT_TLAST_5 = 100 - PORT_IDLE_6 = 101 - PORT_RUNNING_6 = 102 - PORT_STALLED_6 = 103 - PORT_TLAST_6 = 104 - PORT_IDLE_7 = 105 - PORT_RUNNING_7 = 106 - PORT_STALLED_7 = 107 - PORT_TLAST_7 = 108 - GROUP_BROADCAST_A = 109 - BROADCAST_A_0 = 110 - BROADCAST_A_1 = 111 - BROADCAST_A_2 = 112 - BROADCAST_A_3 = 113 - BROADCAST_A_4 = 114 - BROADCAST_A_5 = 115 - BROADCAST_A_6 = 116 - BROADCAST_A_7 = 117 - BROADCAST_A_8 = 118 - BROADCAST_A_9 = 119 - BROADCAST_A_10 = 120 - BROADCAST_A_11 = 121 - BROADCAST_A_12 = 122 - BROADCAST_A_13 = 123 - BROADCAST_A_14 = 124 - BROADCAST_A_15 = 125 - USER_EVENT_0 = 126 - USER_EVENT_1 = 127 - - -class MemTileEvent(Enum): - NONE = 0 - TRUE = 1 - GROUP_0 = 2 - TIMER_SYNC = 3 - TIMER_VALUE_REACHED = 4 - PERF_CNT0_EVENT = 5 - PERF_CNT1_EVENT = 6 - PERF_CNT2_EVENT = 7 - PERF_CNT3_EVENT = 8 - COMBO_EVENT_0 = 9 - COMBO_EVENT_1 = 10 - COMBO_EVENT_2 = 11 - COMBO_EVENT_3 = 12 - EDGE_DETECTION_EVENT_0 = 13 - EDGE_DETECTION_EVENT_1 = 14 - GROUP_WATCHPOINT = 15 - 
WATCHPOINT_0 = 16 - WATCHPOINT_1 = 17 - WATCHPOINT_2 = 18 - WATCHPOINT_3 = 19 - GROUP_DMA_ACTIVITY = 20 - DMA_S2MM_SEL0_START_TASK = 21 - DMA_S2MM_SEL1_START_TASK = 22 - DMA_MM2S_SEL0_START_TASK = 23 - DMA_MM2S_SEL1_START_TASK = 24 - DMA_S2MM_SEL0_FINISHED_BD = 25 - DMA_S2MM_SEL1_FINISHED_BD = 26 - DMA_MM2S_SEL0_FINISHED_BD = 27 - DMA_MM2S_SEL1_FINISHED_BD = 28 - DMA_S2MM_SEL0_FINISHED_TASK = 29 - DMA_S2MM_SEL1_FINISHED_TASK = 30 - DMA_MM2S_SEL0_FINISHED_TASK = 31 - DMA_MM2S_SEL1_FINISHED_TASK = 32 - DMA_S2MM_SEL0_STALLED_LOCK = 33 - DMA_S2MM_SEL1_STALLED_LOCK = 34 - DMA_MM2S_SEL0_STALLED_LOCK = 35 - DMA_MM2S_SEL1_STALLED_LOCK = 36 - DMA_S2MM_SEL0_STREAM_STARVATION = 37 - DMA_S2MM_SEL1_STREAM_STARVATION = 38 - DMA_MM2S_SEL0_STREAM_BACKPRESSURE = 39 - DMA_MM2S_SEL1_STREAM_BACKPRESSURE = 40 - DMA_S2MM_SEL0_MEMORY_BACKPRESSURE = 41 - DMA_S2MM_SEL1_MEMORY_BACKPRESSURE = 42 - DMA_MM2S_SEL0_MEMORY_STARVATION = 43 - DMA_MM2S_SEL1_MEMORY_STARVATION = 44 - GROUP_LOCK = 45 - LOCK_SEL0_ACQ_EQ = 46 - LOCK_SEL0_ACQ_GE = 47 - LOCK_SEL0_REL = 48 - LOCK_SEL0_EQUAL_TO_VALUE = 49 - LOCK_SEL1_ACQ_EQ = 50 - LOCK_SEL1_ACQ_GE = 51 - LOCK_SEL1_REL = 52 - LOCK_SEL1_EQUAL_TO_VALUE = 53 - LOCK_SEL2_ACQ_EQ = 54 - LOCK_SEL2_ACQ_GE = 55 - LOCK_SEL2_REL = 56 - LOCK_SEL2_EQUAL_TO_VALUE = 57 - LOCK_SEL3_ACQ_EQ = 58 - LOCK_SEL3_ACQ_GE = 59 - LOCK_SEL3_REL = 60 - LOCK_SEL3_EQUAL_TO_VALUE = 61 - LOCK_SEL4_ACQ_EQ = 62 - LOCK_SEL4_ACQ_GE = 63 - LOCK_SEL4_REL = 64 - LOCK_SEL4_EQUAL_TO_VALUE = 65 - LOCK_SEL5_ACQ_EQ = 66 - LOCK_SEL5_ACQ_GE = 67 - LOCK_SEL5_REL = 68 - LOCK_SEL5_EQUAL_TO_VALUE = 69 - LOCK_SEL6_ACQ_EQ = 70 - LOCK_SEL6_ACQ_GE = 71 - LOCK_SEL6_REL = 72 - LOCK_SEL6_EQUAL_TO_VALUE = 73 - LOCK_SEL7_ACQ_EQ = 74 - LOCK_SEL7_ACQ_GE = 75 - LOCK_SEL7_REL = 76 - LOCK_SEL7_EQUAL_TO_VALUE = 77 - GROUP_STREAM_SWITCH = 78 - PORT_IDLE_0 = 79 - PORT_RUNNING_0 = 80 - PORT_STALLED_0 = 81 - PORT_TLAST_0 = 82 - PORT_IDLE_1 = 83 - PORT_RUNNING_1 = 84 - PORT_STALLED_1 = 85 - PORT_TLAST_1 = 86 - PORT_IDLE_2 = 87 - 
PORT_RUNNING_2 = 88 - PORT_STALLED_2 = 89 - PORT_TLAST_2 = 90 - PORT_IDLE_3 = 91 - PORT_RUNNING_3 = 92 - PORT_STALLED_3 = 93 - PORT_TLAST_3 = 94 - PORT_IDLE_4 = 95 - PORT_RUNNING_4 = 96 - PORT_STALLED_4 = 97 - PORT_TLAST_4 = 98 - PORT_IDLE_5 = 99 - PORT_RUNNING_5 = 100 - PORT_STALLED_5 = 101 - PORT_TLAST_5 = 102 - PORT_IDLE_6 = 103 - PORT_RUNNING_6 = 104 - PORT_STALLED_6 = 105 - PORT_TLAST_6 = 106 - PORT_IDLE_7 = 107 - PORT_RUNNING_7 = 108 - PORT_STALLED_7 = 109 - PORT_TLAST_7 = 110 - GROUP_MEMORY_CONFLICT = 111 - CONFLICT_DM_BANK_0 = 112 - CONFLICT_DM_BANK_1 = 113 - CONFLICT_DM_BANK_2 = 114 - CONFLICT_DM_BANK_3 = 115 - CONFLICT_DM_BANK_4 = 116 - CONFLICT_DM_BANK_5 = 117 - CONFLICT_DM_BANK_6 = 118 - CONFLICT_DM_BANK_7 = 119 - CONFLICT_DM_BANK_8 = 120 - CONFLICT_DM_BANK_9 = 121 - CONFLICT_DM_BANK_10 = 122 - CONFLICT_DM_BANK_11 = 123 - CONFLICT_DM_BANK_12 = 124 - CONFLICT_DM_BANK_13 = 125 - CONFLICT_DM_BANK_14 = 126 - CONFLICT_DM_BANK_15 = 127 - GROUP_ERRORS = 128 - DM_ECC_ERROR_SCRUB_CORRECTED = 129 - DM_ECC_ERROR_SCRUB_2BIT = 130 - DM_ECC_ERROR_1BIT = 131 - DM_ECC_ERROR_2BIT = 132 - DMA_S2MM_ERROR = 133 - DMA_MM2S_ERROR = 134 - STREAM_SWITCH_PARITY_ERROR = 135 - STREAM_PKT_ERROR = 136 - CONTROL_PKT_ERROR = 137 - AXI_MM_SLAVE_ERROR = 138 - LOCK_ERROR = 139 - DMA_TASK_TOKEN_STALL = 140 - GROUP_BROADCAST = 141 - BROADCAST_0 = 142 - BROADCAST_1 = 143 - BROADCAST_2 = 144 - BROADCAST_3 = 145 - BROADCAST_4 = 146 - BROADCAST_5 = 147 - BROADCAST_6 = 148 - BROADCAST_7 = 149 - BROADCAST_8 = 150 - BROADCAST_9 = 151 - BROADCAST_10 = 152 - BROADCAST_11 = 153 - BROADCAST_12 = 154 - BROADCAST_13 = 155 - BROADCAST_14 = 156 - BROADCAST_15 = 157 - GROUP_USER_EVENT = 158 - USER_EVENT_0 = 159 - USER_EVENT_1 = 160 diff --git a/python/utils/trace_utils.py b/python/utils/trace_utils.py deleted file mode 100644 index 433cb004b7d..00000000000 --- a/python/utils/trace_utils.py +++ /dev/null @@ -1,201 +0,0 @@ -# Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. 
-# SPDX-License-Identifier: MIT - -# from CppHeaderParser import CppHeader -import numpy as np -import subprocess -import json -import re -import os - - -def get_kernel_code(test: dict, solutions_path: str = None) -> str: - """Fetch the kernel code from the provided solution path, if none provided default - to canonical solution.""" - if not solutions_path: - return test["prompt"] + test["canonical_solution"] - - with open( - os.path.join(solutions_path, f"{test['kernel_name']}.json"), "r" - ) as sol_file: - solution = json.load(sol_file) - if not solution.get("code"): - print(f"No code available in {solutions_path} for {test['kernel_name']}") - return None - - srccode = solution["code"] - - # if gpt decides to be too helpful and adds a main()... remove it - srccode = re.sub( - r"int\s+main\s*\([^)]*\)\s*{[^{}]*({[^{}]*}[^{}]*)*}", - "", - srccode, - flags=re.DOTALL, - ) - - # cppheaderparser will complain if we don't remove trailing comments - srccode = srccode.split('// extern "C"')[0] - - return srccode - - -def extract_buffers(test): - """Specific helper for the AIEval dataset - parses the test dictionary and returns - input buffers, output buffers and RTPs as separate lists. - """ - input_buffers = [] - for x in test["test_vectors"]["inputs"]: - array, dtype = list(x.values()) - input_buffers.append(np.array(array, dtype=dtype)) - - output_buffers = [] - for x in test["test_vectors"]["outputs"]: - array, dtype = list(x.values()) - output_buffers.append(np.array(array, dtype=dtype)) - - rtps = [] - if test["test_vectors"].get("rtps") != None: - for rtp in test["test_vectors"]["rtps"]: - array, dtype = rtp.values() - rtps.append(np.array(array, dtype=dtype)) - # rtp_names.append(list(rtp.keys())[0]) - - return input_buffers, output_buffers, rtps - - -def trace_to_json(trace_file: str, mlir_file: str, output_name: str = "trace.json"): - """Subprocesses wrapper over parse_trace.py utility. 
- Parameters - ---------- - trace_file : str - The .txt trace file of 32-byte codes. - mlir_file : str - Path to the corresponding MLIR file for the design being traced. - output_name : str, optional - Path to output json file. You can analyze it using tools like https://ui.perfetto.dev - """ - command = [ - os.environ["MLIR_AIE_INSTALL_DIR"] + "/../../python/utils/parse_trace.py", - "--input", - trace_file, - "--mlir", - mlir_file, - ] - - try: - result = subprocess.check_output(command, stderr=subprocess.STDOUT, text=True) - with open(output_name, "w") as f: - f.write(result) - print(f"Trace written to {output_name}") - return True - except subprocess.CalledProcessError as e: - print(f"Trace failed\n{e.output}") - return e.output - - -def get_cycles(trace_path): - """This helper function should only be used to extract cycle counts - from NPUEval trace files where the expectation is to have exactly 1 of - each event0 and event1. - """ - with open(trace_path, "r") as f: - data = json.load(f) - - event0 = [] - event1 = [] - try: - for x in data: - if (x["name"] == "INSTR_EVENT_0") and (x["ph"] == "B"): - event0.append(x["ts"]) - tmp = x["ts"] - # print("event0 found at "+str(event0[0])) - - if x["name"] == "INSTR_EVENT_1" and x["ph"] == "B": - event1.append(x["ts"]) - # print("event1 found at "+str(event1[0])) - - return event1[0] - event0[0] - except: - return np.inf - - -def get_cycles_summary(trace_path): - """This helper function is used to extract cycle counts from a trace json - file and returns an array of cycles between pairs of event0 and event1. - This always assumes each event0 is followed by an event1 and ignores - extra event0 and event1's. 
- """ - with open(trace_path, "r") as f: - data = json.load(f) - - try: - deltas = [] - in_kernel = [] - event0 = [] - for x in data: - if x["name"] == "process_name": - deltas.append([x["args"]["name"]]) - in_kernel.append(False) - event0.append(0) - - for x in data: - idx = int(x["pid"]) - if (x["name"] == "INSTR_EVENT_0") and (x["ph"] == "B"): - if in_kernel[idx] == False: - event0[idx] = x["ts"] - # print("event0 found at "+str(event0)) - in_kernel[idx] = True - - if x["name"] == "INSTR_EVENT_1" and x["ph"] == "B": - if in_kernel[idx] == True: - # print("event1 found at "+str(x['ts'])) - deltas[idx].append(x["ts"] - event0[idx]) - in_kernel[idx] = False - - return deltas - except Exception as e: - print("Exception found", e) - return np.inf - - -def get_vector_time(trace): - """This function extracts the total time spent on the vectorized unit - from an NPUEval AIE trace (this must have exactly 1 event0 and 1 event1 - sandwiching the kernel call). - """ - with open(trace, "r") as f: - data = json.load(f) - - start, end = None, None - - # find start and end - for x in data: - if (x["name"] == "INSTR_EVENT_0") and (x["ph"] == "B"): - start = x["ts"] - if x["name"] == "INSTR_EVENT_1" and x["ph"] == "B": - end = x["ts"] - - if not start or not end: - return 0 - - total_duration = 0 - stack = [] - - for event in data: - if event["name"] == "INSTR_VECTOR": - if event["ts"] < start: - continue - - if event["ts"] > end: - continue - - if event["ph"] == "B": - stack.append(event) - elif event["ph"] == "E" and stack: - # Get matching begin event - begin_event = stack.pop() - # Calculate duration for this pair - duration = event["ts"] - begin_event["ts"] - total_duration += duration - - return total_duration / (end - start) diff --git a/python/utils/xrt.py b/python/utils/xrt.py deleted file mode 100644 index c755917855e..00000000000 --- a/python/utils/xrt.py +++ /dev/null @@ -1,540 +0,0 @@ -# xrt.py -*- Python -*- -# -# This file is licensed under the Apache License v2.0 
with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2024 Advanced Micro Devices, Inc. -import numpy as np -import copy -import time -import pyxrt as xrt -import os - - -# -# AI Engine Application class -# -# This class configures and invokes the XRT components needed to run an AIE -# Application. This includes xrt.device, xrt.kernel, xrt.hw_context and XRT -# buffers as enacpuslated by the AIE_Buffer class. You can use this class to -# simplify and reduce the amount of code needed to set up an AIE application. -# -class AIE_Application: - # Registers xclbin to set up the device, hw context and kernel. This - # also sets up the instruction stream - def __init__(self, xclbin_path, insts_path, kernel_name="PP_FD_PRE"): - self.device = None - self.kernel = None - self.buffers = [None] * 8 - self.device = xrt.device(0) - - # Find kernel by name in the xclbin - self.xclbin = xrt.xclbin(xclbin_path) - kernels = self.xclbin.get_kernels() - try: - xkernel = [k for k in kernels if kernel_name == k.get_name()][0] - except KeyError: - raise AIE_Application_Error("No such kernel: " + kernel_name) - self.device.register_xclbin(self.xclbin) - self.context = xrt.hw_context(self.device, self.xclbin.get_uuid()) - self.kernel = xrt.kernel(self.context, xkernel.get_name()) - - ## Set up instruction stream - insts = read_insts(insts_path) - self.n_insts = len(insts) - self.insts_buffer = AIE_Buffer( - self, 1, insts.dtype, insts.shape, xrt.bo.cacheable - ) - self.insts_buffer.write(insts) - - # Registers an AIE_Buffer class object given group_id, datatype and shape - def register_buffer(self, group_id, *args, **kwargs): - self.buffers[group_id] = AIE_Buffer(self, group_id, *args, **kwargs) - - # This syncs the instruction buffer to the device and then invokes the - # `call` function before wait for the call to complete - def run(self): - self.insts_buffer.sync_to_device() - h = 
self.call() - r = h.wait() - if r != xrt.ert_cmd_state.ERT_CMD_STATE_COMPLETED: - raise Exception(f"Kernel returned {r}") - - # Wrapper for xrt.kernel function passing in opcode and buffers objects - # class `AIE_Buffer` - def call(self): - opcode = 3 - h = self.kernel( - opcode, - self.insts_buffer.bo, - self.n_insts, - *[b.bo for b in self.buffers if b is not None], - ) - return h - - def __del__(self): - if hasattr(self, "kernel"): - del self.kernel - self.kernel = None - if hasattr(self, "device"): - del self.device - self.device = None - - -# This class wraps up access to the xrt.bo buffer object where sync calls are added -# to read and write calls to ensure data is synchronized. -class AIE_Buffer: - - # Declare xrt.bo object given group_id, datatype, shape - def __init__(self, application, group_id, dtype, shape, flags=xrt.bo.host_only): - self.application = application - self.dtype = dtype - self.shape = shape - self.len_bytes = np.prod(shape) * np.dtype(dtype).itemsize - self.bo = xrt.bo( - application.device, - self.len_bytes, - flags, - application.kernel.group_id(group_id), - ) - - # Synchronize data from device before reading xrt.bo data - def read(self): - self.sync_from_device() - return self.bo.read(self.len_bytes, 0).view(self.dtype).reshape(self.shape) - - # Write data to xrt.bo and synchronize data to device - def write(self, v, offset=0): - self.bo.write(v.view(np.uint8), offset) - self.sync_to_device() - - # Wrapper for xrt.bo.sync call (to device) - def sync_to_device(self): - return self.bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - - # Wrapper for xrt.bo.sync call (from device) - def sync_from_device(self): - return self.bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) - - def __del__(self): - if hasattr(self, "bo"): - del self.bo - self.bo = None - - -class AIE_Application_Error(Exception): - pass - - -insts_cache = {} - - -# Read instruction stream from text file and reformat it to be passed into the -# instruction 
buffer for the xrt.kernel call -def read_insts_sequence(insts_path): - """Reads instructions from a text file (hex numbers, one per line).""" - global insts_cache - if insts_path in insts_cache: - # Speed up things if we re-configure the array a lot: Don't re-parse - # the insts.txt each time - return insts_cache[insts_path] - with open(insts_path, "r") as f: - insts_text = f.readlines() - insts_text = [l for l in insts_text if l != ""] - insts_v = np.array([int(c, 16) for c in insts_text], dtype=np.uint32) - insts_cache[insts_path] = insts_v - return insts_v - - -# Read instruction stream from bin file and reformat it to be passed into the -# instruction buffer for the xrt.kernel call -def read_insts_binary(insts_path): - """Reads instructions from a binary file.""" - global insts_cache - if insts_path in insts_cache: - # Speed up things if we re-configure the array a lot: Don't re-parse - # the insts.bin each time - return insts_cache[insts_path] - with open(insts_path, "rb") as f: - data = f.read() - # Interpret the binary data as an array of uint32 values. - insts_v = np.frombuffer(data, dtype=np.uint32) - insts_cache[insts_path] = insts_v - return insts_v - - -def read_insts(insts_path): - """ - Reads instructions from the given file. - If the file extension is .bin, uses binary read. - If the file extension is .txt, uses sequence (text) read. - """ - _, ext = os.path.splitext(insts_path) - ext = ext.lower() - - if ext == ".bin": - return read_insts_binary(insts_path) - elif ext == ".txt": - return read_insts_sequence(insts_path) - else: - raise ValueError("Unsupported file extension: expected .bin or .txt") - - -# Sets up the AIE application with support for up to 2 input buffers, 1 output -# buffer, and an optional trace buffer. Under the hood, we call declare an -# AIE_Application object and register the buffers used given the buffer datatype -# and shapes. 
-def setup_aie( - xclbin_path, - insts_path, - in_0_shape, - in_0_dtype, - in_1_shape, - in_1_dtype, - out_buf_shape, - out_buf_dtype, - enable_trace=False, - kernel_name="MLIR_AIE", - trace_size=16384, - verbosity=0, - trace_after_output=False, -): - app = AIE_Application(xclbin_path, insts_path, kernel_name) - - if in_0_shape and in_0_dtype: - if verbosity >= 1: - print( - f"register 1st input to group_id 3: size: {in_0_shape}, dtype: {in_0_dtype}" - ) - app.register_buffer(3, shape=in_0_shape, dtype=in_0_dtype) - if in_1_shape and in_1_dtype: - if verbosity >= 1: - print( - f"register 2nd input to group_id 4: size: {in_1_shape}, dtype: {in_1_dtype}" - ) - app.register_buffer(4, shape=in_1_shape, dtype=in_1_dtype) - - if enable_trace: - if trace_after_output: - out_buf_len_bytes = ( - np.prod(out_buf_shape) * np.dtype(out_buf_dtype).itemsize - ) - out_buf_shape = (out_buf_len_bytes + trace_size,) - out_buf_dtype = np.uint8 - - if in_1_shape and in_1_dtype: - if verbosity >= 1: - print( - f"register output to group_id 5: size: {out_buf_shape}, dtype: {out_buf_dtype}" - ) - app.register_buffer(5, shape=out_buf_shape, dtype=out_buf_dtype) - else: - if verbosity >= 1: - print( - f"register output to group_id 4: size: {out_buf_shape}, dtype: {out_buf_dtype}" - ) - app.register_buffer(4, shape=out_buf_shape, dtype=out_buf_dtype) - if verbosity >= 1: - print( - "register placeholder buffer (32b) to group_id 5: size: 1, dtype: uint32" - ) - app.register_buffer( - 5, shape=(1,), dtype=np.uint32 - ) # TODO Needed so register buf 7 succeeds (not needed in C/C++ host code) - - if enable_trace: - if not trace_after_output: - # trace_buf_shape = (trace_size,) - # trace_buf_shape = (trace_size+8,) - trace_buf_shape = (trace_size * 4,) - trace_buf_dtype = np.uint8 - - if verbosity >= 1: - # print("register placeholder buffer (32b) to group_id 6") - # print("register 2x 32b words for ctrl packets to group_id 6") - print("register for ctrl packets to group_id 6: size: 8, dtype: 
uint32") - app.register_buffer( - # 6, shape=(1,), dtype=np.uint32 - # 6, shape=(2,), dtype=np.uint32 - 6, - shape=(8,), - dtype=np.uint32, - ) # TODO Needed so register buf 7 succeeds (not needed in C/C++ host code) - - if verbosity >= 1: - print( - f"register trace on 7: size: {trace_buf_shape}, dtype: {trace_buf_dtype}" - ) - app.register_buffer(7, shape=trace_buf_shape, dtype=trace_buf_dtype) - - return app - - -# checks # of bits. Odd number returns a 1. Even returns 0. -def parity(x): - return x.bit_count() & 1 - - -# create control packet -def create_ctrl_pkt( - operation, - beats, - addr, - ctrl_pkt_read_id=28, # global id used for all ctrl packet reads - # WARNING: this needs to match the packet id used in packetflow/.py -): - header = (ctrl_pkt_read_id << 24) | (operation << 22) | (beats << 20) | addr - header |= (0x1 ^ parity(header)) << 31 - return header - - -def setup_buffer_data( - app, - input_one=None, - input_two=None, - enable_trace=False, - enable_ctrl_pkts=False, - verbosity=False, -): - if not (input_one is None): - app.buffers[3].write(input_one) - if not (input_two is None): - app.buffers[4].write(input_two) - - deadbeef_string = "EFBEADDE" * 10 - - # Convert the hex string to a bytes object - byte_data = bytes.fromhex(deadbeef_string) - - # Create the NumPy array from the bytes object - init_trace_data = np.frombuffer(byte_data, dtype=np.uint32) - - if enable_trace: - if enable_ctrl_pkts: - # write ctrl packets - header = np.array( - [ - create_ctrl_pkt(1, 0, 0x32004), # core status - create_ctrl_pkt(1, 0, 0x340D8), # trace status - ], - dtype=np.uint32, - ) - if verbosity: - print("header", [hex(x) for x in header]) - app.buffers[6].write(header) - - app.buffers[7].write(init_trace_data) - - # print("ctrl, buffers[6]: ", [hex(x) for x in app.buffers[6].read()]) - # print("init, buffers[7]: ", [hex(x) for x in app.buffers[7].read()]) - - -def return_buffer_results( - app, input_one=None, input_two=None, enable_trace=False, 
trace_after_output=False -): - if trace_after_output or not enable_trace: - if not (input_two is None): - return app.buffers[5].read() - else: - return app.buffers[4].read() - else: - - if not (input_two is None): - return app.buffers[5].read(), app.buffers[7].read() - else: - return app.buffers[4].read(), app.buffers[7].read() - - -# Wrapper function to write buffer arguments into registered input buffers, then call -# `run` function for AIE Application, and finally return the output buffer data. -def execute( - app, - input_one=None, - input_two=None, - enable_trace=False, - enable_ctrl_pkts=False, - trace_after_output=False, - verbosity=False, -): - setup_buffer_data( - app, input_one, input_two, enable_trace, enable_ctrl_pkts, verbosity - ) - app.run() - return return_buffer_results( - app, input_one, input_two, enable_trace, trace_after_output - ) - - -# Wrapper for execute but we do the host time delta directly around the app.run() call -# so buffer init and read are not included -def execute_timed( - app, - input_one=None, - input_two=None, - enable_trace=False, - enable_ctrl_pkts=False, - trace_after_output=False, - verbosity=False, -): - setup_buffer_data( - app, input_one, input_two, enable_trace, enable_ctrl_pkts, verbosity - ) - start = time.time_ns() - app.run() - stop = time.time_ns() - npu_time = stop - start - ret = return_buffer_results( - app, input_one, input_two, enable_trace, trace_after_output - ) - if enable_trace: - return ret + (npu_time,) - else: - return (ret, npu_time) - - -# Wrapper function to separate output data and trace data from a single output buffer stream -def extract_trace(out_buf, out_buf_shape, out_buf_dtype, trace_size): - trace_size_words = trace_size // 4 - out_buf_flat = out_buf.reshape((-1,)).view(np.uint32) - output_prefix = ( - out_buf_flat[:-trace_size_words].view(out_buf_dtype).reshape(out_buf_shape) - ) - trace_suffix = out_buf_flat[-trace_size_words:] - return output_prefix, trace_suffix - - -def 
extract_tile(data): - col = (data >> 21) & 0x7F - row = (data >> 16) & 0x1F - pkt_type = (data >> 12) & 0x3 - pkt_id = data & 0x1F - return (col, row, pkt_type, pkt_id) - - -# Wrapper function to write trace buffer values to a text file -def write_out_trace(trace, file_name): - out_str = "\n".join(f"{i:0{8}x}" for i in trace if i != 0) - with open(file_name, "w") as f: - f.write(out_str) - - -# This wrapper function abstracts the full set of functions to setup the aie and run -# the kernel program including check for functional correctness and reporting the -# run time. Under the hood, we call `setup_aie` to set up the AIE application before -# calling `execute` and checking results. The datatypes and shape for the 2 inputs -# and 1 output buffers are passed in as arguments, along with the gold reference data -# to compare it against. Trace buffers is also written out to a text file if trace is -# enabled. -def setup_and_run_aie( - in1_dtype, - in2_dtype, - out_dtype, - in1_data, - in2_data, - out_data, - in1_volume, - in2_volume, - out_volume, - ref, - opts, - trace_after_output=False, - enable_ctrl_pkts=False, -): - enable_trace = opts.trace_size > 0 - if opts.verbosity >= 1: - print("trace size = ", str(opts.trace_size)) - print("enable_trace = ", str(enable_trace)) - - app = setup_aie( - opts.xclbin, - opts.instr, - in1_volume, - in1_dtype, - in2_volume, - in2_dtype, - out_volume, - out_dtype, - enable_trace=enable_trace, - trace_size=opts.trace_size, - verbosity=opts.verbosity, - trace_after_output=trace_after_output, - ) - - out_size = out_volume * out_data.itemsize - if opts.verbosity >= 1: - print("out_size: " + str(out_size)) - - if enable_trace: - full_output, trace_and_ctrl_buffer, npu_time = execute_timed( - app, - in1_data, - in2_data, - enable_trace, - enable_ctrl_pkts, - trace_after_output, - opts.verbosity, - ) - else: - full_output, npu_time = execute_timed( - app, - in1_data, - in2_data, - enable_trace, - enable_ctrl_pkts, - trace_after_output, - 
opts.verbosity, - ) - - print("npu_time: ", npu_time / 1000.0, " us") - - aie_output = full_output[:out_size].view(out_dtype) - if enable_trace: - # trace_size_words = opts.trace_size // 4 - - if trace_after_output: - trace_buffer = full_output[out_size:].view(np.uint32) - else: - if opts.verbosity >= 1: - print("trace_and_ctrl_buffer shape: ", trace_and_ctrl_buffer.shape) - print("trace_and_ctrl_buffer dtype: ", trace_and_ctrl_buffer.dtype) - trace_buffer = trace_and_ctrl_buffer[: opts.trace_size].view(np.uint32) - if enable_ctrl_pkts: - ctrl_buffer = trace_and_ctrl_buffer[opts.trace_size :].view(np.uint32) - - if enable_trace: - if opts.verbosity >= 1: - print("trace_buffer shape: ", trace_buffer.shape) - print("trace_buffer dtype: ", trace_buffer.dtype) - if enable_ctrl_pkts: - print("ctrl_buffer shape: ", ctrl_buffer.shape) - print("ctrl_buffer dtype: ", ctrl_buffer.dtype) - print("ctrl buffer: ", [hex(d) for d in ctrl_buffer]) - # [hex(ctrl_buffer[0]), hex(ctrl_buffer[1])]) - - write_out_trace(trace_buffer, str(opts.trace_file)) - - if enable_ctrl_pkts: - for i in range(ctrl_buffer.size // 2): - col, row, pkt_type, pkt_id = extract_tile(ctrl_buffer[i * 2]) - overflow = True if (ctrl_buffer[i * 2 + 1] >> 8) == 3 else False - if overflow: - print( - f"WARNING: Trace overflow detected in tile({row},{col}). Trace results may be invalid." - ) - - # Copy output results and verify they are correct - errors = 0 - if opts.verify: - if opts.verbosity >= 1: - print("Verifying results ...") - e = np.equal(ref, aie_output) - errors = np.size(e) - np.count_nonzero(e) - - if not errors: - print("\nPASS!\n") - return 0 - else: - print("\nError count: ", errors) - print("\nFailed.\n") - return 1 diff --git a/test/lit.cfg.py b/test/lit.cfg.py index 8009fe906a1..cd3edd57188 100644 --- a/test/lit.cfg.py +++ b/test/lit.cfg.py @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2021 Xilinx Inc. +# (c) Copyright 2021-2026 Xilinx Inc. import os import sys @@ -155,6 +155,9 @@ # Concurrency tests control their own parallelism, so run them serially lit_config.parallelism_groups["concurrency"] = 1 +# NPU XRT tests should run serially to avoid resource contention +lit_config.parallelism_groups["npu-xrt"] = 1 + if config.python_passes: config.available_features.add("python_passes") diff --git a/test/npu-xrt/add_one_objFifo/run.lit b/test/npu-xrt/add_one_objFifo/run.lit index 7bb95a5b363..d736abad840 100644 --- a/test/npu-xrt/add_one_objFifo/run.lit +++ b/test/npu-xrt/add_one_objFifo/run.lit @@ -1,7 +1,7 @@ -// (c) Copyright 2023 Advanced Micro Devices, Inc. +// (c) Copyright 2023-2026 Advanced Micro Devices, Inc. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// REQUIRES: ryzen_ai +// REQUIRES: ryzen_ai, xrt_python_bindings // // RUN: cp %S/aie.mlir aie_arch.mlir // RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir diff --git a/test/npu-xrt/add_one_objFifo/test.py b/test/npu-xrt/add_one_objFifo/test.py index a2a7b72b909..19e9a8ec1dd 100644 --- a/test/npu-xrt/add_one_objFifo/test.py +++ b/test/npu-xrt/add_one_objFifo/test.py @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2025, Advanced Micro Devices, Inc. +# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. 
# # ===----------------------------------------------------------------------===# @@ -12,143 +12,38 @@ # REQUIRES: dont_run # RUN: echo FAIL | FileCheck %s # CHECK: PASS - -import argparse -import pyxrt as xrt +import sys import numpy as np -import os -import struct + +import aie.utils.test as test_utils +import aie.iron as iron +from aie.utils import DefaultNPURuntime IN_SIZE = 64 OUT_SIZE = 64 -def check_file_exists(filepath): - if not os.path.isfile(filepath): - raise FileNotFoundError(f"File not found: {filepath}") - - -def load_instr_binary(filepath): - with open(filepath, "rb") as f: - return list(struct.unpack(f"{os.path.getsize(filepath)//4}I", f.read())) - - -def main(): - # Argument parsing - parser = argparse.ArgumentParser(description="PyXRT Test Script") - parser.add_argument("-x", "--xclbin", required=True, help="The input xclbin path") - parser.add_argument( - "-k", - "--kernel", - required=True, - help="The kernel name in the XCLBIN (e.g., PP_PRE_FD)", - ) - parser.add_argument( - "-v", "--verbosity", type=int, default=0, help="The verbosity of the output" - ) - parser.add_argument( - "-i", - "--instr", - required=True, - help="Path of file containing userspace instructions to be sent to the LX6", - ) - args = parser.parse_args() - - # Check if files exist - check_file_exists(args.xclbin) - check_file_exists(args.instr) - - # Load instruction binary - instr_v = load_instr_binary(args.instr) - - if args.verbosity >= 1: - print(f"Sequence instr count: {len(instr_v)}") - - device = xrt.device(0) - - # Load the xclbin - if args.verbosity >= 1: - print(f"Loading xclbin: {args.xclbin}") - xclbin = xrt.xclbin(args.xclbin) - - if args.verbosity >= 1: - print(f"Kernel opcode: {args.kernel}") - - # Get the kernel from the xclbin - xkernels = xclbin.get_kernels() - xkernel = [k for k in xkernels if args.kernel in k.get_name()][0] - kernel_name = xkernel.get_name() - - if args.verbosity >= 1: - print(f"Registering xclbin: {args.xclbin}") - 
device.register_xclbin(xclbin) - - # Get a hardware context - if args.verbosity >= 1: - print("Getting hardware context.") - context = xrt.hw_context(device, xclbin.get_uuid()) - - # Get a kernel handle - if args.verbosity >= 1: - print(f"Getting handle to kernel: {kernel_name}") - kernel = xrt.kernel(context, kernel_name) - - # Create buffer objects - bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(1)) - bo_inA = xrt.bo(device, IN_SIZE * 4, xrt.bo.host_only, kernel.group_id(3)) - bo_inB = xrt.bo(device, IN_SIZE * 4, xrt.bo.host_only, kernel.group_id(4)) - bo_out = xrt.bo(device, OUT_SIZE * 4, xrt.bo.host_only, kernel.group_id(5)) - - if args.verbosity >= 1: - print("Writing data into buffer objects.") - - # Fill input buffer A - buf_inA = np.arange(1, IN_SIZE + 1, dtype=np.uint32) - bo_inA.write(buf_inA, 0) - - # Fill instruction buffer - buf_instr = np.array(instr_v, dtype=np.uint32) - bo_instr.write(buf_instr, 0) - - bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_inA.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - - if args.verbosity >= 1: - print("Running Kernel.") - - # Run the kernel - opcode = 3 - run = kernel(opcode, bo_instr, len(instr_v), bo_inA, bo_inB, bo_out) - r = run.wait() - - if r != xrt.ert_cmd_state.ERT_CMD_STATE_COMPLETED: - print(f"Kernel did not complete. 
Returned status: {r}") - return 1 - - bo_out.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) - - # Read output buffer - buf_out = np.empty(OUT_SIZE, dtype=np.uint32) - buf_out = bo_out.read(buf_out.size * buf_out.itemsize, 0).view(dtype=buf_out.dtype) - - # Verify output - errors = 0 - for i in range(OUT_SIZE): - ref = i + 42 - if buf_out[i] != ref: - print(f"Error in output {buf_out[i]} != {ref}") - errors += 1 - else: - # print(f"Correct output {buf_out[i]} == {ref}") - pass - - if errors == 0: - print("\nPASS!\n") - return 0 +def main(opts): + ref_data = np.arange(1, IN_SIZE + 1, dtype=np.uint32) + inA = iron.tensor(ref_data, dtype=np.uint32) + inB = iron.tensor(ref_data, dtype=np.uint32) + out = iron.zeros((OUT_SIZE,), dtype=np.uint32) + ref_data = ref_data + 41 + + npu_opts = test_utils.create_npu_kernel(opts) + if not DefaultNPURuntime.run_test( + npu_opts.npu_kernel, + [inA, inB, out], + {2: ref_data}, + verify=npu_opts.verify, + verbosity=npu_opts.verbosity, + ): + print("PASS!") else: - print("\nfailed.\n") - return 1 + print("Failed.") if __name__ == "__main__": - main() + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + sys.exit(main(opts)) diff --git a/test/npu-xrt/add_one_objFifo_elf/run.lit b/test/npu-xrt/add_one_objFifo_elf/run.lit index f3a97637ca3..ef4d313e59f 100644 --- a/test/npu-xrt/add_one_objFifo_elf/run.lit +++ b/test/npu-xrt/add_one_objFifo_elf/run.lit @@ -1,7 +1,7 @@ -// (c) Copyright 2023 Advanced Micro Devices, Inc. +// (c) Copyright 2023-2026 Advanced Micro Devices, Inc. 
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// REQUIRES: ryzen_ai +// REQUIRES: ryzen_ai, xrt_python_bindings // // RUN: cp %S/aie.mlir aie_arch.mlir // RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir diff --git a/test/npu-xrt/add_one_objFifo_elf/test.py b/test/npu-xrt/add_one_objFifo_elf/test.py index 8983d1b96c4..19e9a8ec1dd 100644 --- a/test/npu-xrt/add_one_objFifo_elf/test.py +++ b/test/npu-xrt/add_one_objFifo_elf/test.py @@ -4,7 +4,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2025, Advanced Micro Devices, Inc. +# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. # # ===----------------------------------------------------------------------===# @@ -12,128 +12,38 @@ # REQUIRES: dont_run # RUN: echo FAIL | FileCheck %s # CHECK: PASS - -import argparse -import pyxrt as xrt +import sys import numpy as np -import os -import struct + +import aie.utils.test as test_utils +import aie.iron as iron +from aie.utils import DefaultNPURuntime IN_SIZE = 64 OUT_SIZE = 64 -def check_file_exists(filepath): - if not os.path.isfile(filepath): - raise FileNotFoundError(f"File not found: {filepath}") - - -def main(): - # Argument parsing - parser = argparse.ArgumentParser(description="PyXRT Test Script") - parser.add_argument("-x", "--xclbin", required=True, help="The input xclbin path") - parser.add_argument( - "-k", - "--kernel", - required=True, - help="The kernel name in the XCLBIN (e.g., PP_PRE_FD)", - ) - parser.add_argument( - "-v", "--verbosity", type=int, default=0, help="The verbosity of the output" - ) - parser.add_argument( - "-i", - "--instr", - required=True, - help="Path of file containing userspace instructions to be sent to the LX6", - ) - args = parser.parse_args() - - # Check if files exist - check_file_exists(args.xclbin) - check_file_exists(args.instr) - - device = xrt.device(0) - - # Load the xclbin - if args.verbosity >= 1: - 
print(f"Loading xclbin: {args.xclbin}") - xclbin = xrt.xclbin(args.xclbin) - - if args.verbosity >= 1: - print(f"Kernel opcode: {args.kernel}") - - # Get the kernel from the xclbin - xkernels = xclbin.get_kernels() - xkernel = [k for k in xkernels if args.kernel in k.get_name()][0] - kernel_name = xkernel.get_name() - - if args.verbosity >= 1: - print(f"Registering xclbin: {args.xclbin}") - device.register_xclbin(xclbin) - - elf = xrt.elf(args.instr) - mod = xrt.module(elf) - - # Get a hardware context - if args.verbosity >= 1: - print("Getting hardware context.") - context = xrt.hw_context(device, xclbin.get_uuid()) - - # Get a kernel handle - if args.verbosity >= 1: - print(f"Getting handle to kernel: {kernel_name}") - kernel = xrt.ext.kernel(context, mod, kernel_name) - - # Create buffer objects - bo_inA = xrt.ext.bo(device, IN_SIZE * 4) - bo_inB = xrt.ext.bo(device, IN_SIZE * 4) - bo_out = xrt.ext.bo(device, OUT_SIZE * 4) - - if args.verbosity >= 1: - print("Writing data into buffer objects.") - - # Fill input buffer A - buf_inA = np.arange(1, IN_SIZE + 1, dtype=np.uint32) - bo_inA.write(buf_inA, 0) - bo_inA.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - - if args.verbosity >= 1: - print("Running Kernel.") - - # Run the kernel - opcode = 3 - run = kernel(opcode, 0, 0, bo_inA, bo_inB, bo_out) - r = run.wait() - - if r != xrt.ert_cmd_state.ERT_CMD_STATE_COMPLETED: - print(f"Kernel did not complete. 
Returned status: {r}") - return 1 - - bo_out.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) - - # Read output buffer - buf_out = np.ones(OUT_SIZE, dtype=np.uint32) - buf_out = bo_out.read(buf_out.size * buf_out.itemsize, 0).view(dtype=buf_out.dtype) - - # Verify output - errors = 0 - for i in range(OUT_SIZE): - ref = i + 42 - if buf_out[i] != ref: - print(f"Error in output {buf_out[i]} != {ref}") - errors += 1 - else: - # print(f"Correct output {buf_out[i]} == {ref}") - pass - - if errors == 0: - print("\nPASS!\n") - return 0 +def main(opts): + ref_data = np.arange(1, IN_SIZE + 1, dtype=np.uint32) + inA = iron.tensor(ref_data, dtype=np.uint32) + inB = iron.tensor(ref_data, dtype=np.uint32) + out = iron.zeros((OUT_SIZE,), dtype=np.uint32) + ref_data = ref_data + 41 + + npu_opts = test_utils.create_npu_kernel(opts) + if not DefaultNPURuntime.run_test( + npu_opts.npu_kernel, + [inA, inB, out], + {2: ref_data}, + verify=npu_opts.verify, + verbosity=npu_opts.verbosity, + ): + print("PASS!") else: - print("\nfailed.\n") - return 1 + print("Failed.") if __name__ == "__main__": - main() + p = test_utils.create_default_argparser() + opts = p.parse_args(sys.argv[1:]) + sys.exit(main(opts)) diff --git a/test/npu-xrt/lit.local.cfg b/test/npu-xrt/lit.local.cfg index ec350cc9837..d05972e2a31 100644 --- a/test/npu-xrt/lit.local.cfg +++ b/test/npu-xrt/lit.local.cfg @@ -2,7 +2,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2023 AMD Inc. +# (c) Copyright 2023-2026 AMD Inc. 
config.suffixes = [".lit", ".py"] @@ -10,3 +10,5 @@ if 'AIE2' not in config.vitis_components and 'AIE2P' not in config.vitis_compone config.unsupported = True config.excludes.add("util.py") + +config.parallelism_group = "npu-xrt" diff --git a/test/npu-xrt/vec_mul_event_trace/test.py b/test/npu-xrt/vec_mul_event_trace/test.py index 85e4a2f8284..7ba213606aa 100644 --- a/test/npu-xrt/vec_mul_event_trace/test.py +++ b/test/npu-xrt/vec_mul_event_trace/test.py @@ -5,11 +5,11 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2025, Advanced Micro Devices, Inc. +# Copyright (C) 2025-2026, Advanced Micro Devices, Inc. # # ===-----------------------------------------------------------------------===# # -# REQUIRES: ryzen_ai_npu1, chess +# REQUIRES: ryzen_ai_npu1, xrt_python_bindings # # Build the test @@ -19,76 +19,17 @@ # Run the test # RUN: %run_on_npu1% %python %S/test.py --xclbin final.xclbin --instr insts.bin --kernel MLIR_AIE --trace-sz 8192 --mlir %S/aie.mlir | FileCheck %s # CHECK: PASS! 
- -import argparse import numpy as np import sys from pathlib import Path -import pyxrt as xrt -from aie.utils.parse_trace import parse_trace - - -# Buffer sizes -IN1_SIZE = 16384 # bytes -IN2_SIZE = 4 # bytes -OUT_SIZE = 16384 # bytes - -DATATYPE_IN1 = np.int32 -DATATYPE_IN2 = np.int32 -DATATYPE_OUT = np.int32 - -IN1_VOLUME = IN1_SIZE // np.dtype(DATATYPE_IN1).itemsize -IN2_VOLUME = IN2_SIZE // np.dtype(DATATYPE_IN2).itemsize -OUT_VOLUME = OUT_SIZE // np.dtype(DATATYPE_OUT).itemsize - - -def initialize_bufIn1(size): - """Initialize Input buffer 1""" - rng = np.random.default_rng(seed=42) - return rng.integers(1, 100, size=size, dtype=DATATYPE_IN1) - - -def initialize_bufIn2(size): - """Initialize Input buffer 2""" - buf = np.zeros(size, dtype=DATATYPE_IN2) - buf[0] = 3 # scaleFactor - return buf - - -def initialize_bufOut(size): - """Initialize Output buffer""" - return np.zeros(size, dtype=DATATYPE_OUT) - - -def verify_vector_scalar_mul(bufIn1, bufIn2, bufOut, size, verbosity=0): - """Functional correctness verifier""" - errors = 0 - - for i in range(size): - ref = bufIn1[i] * bufIn2[0] - test = bufOut[i] - if test != ref: - if verbosity >= 1: - print(f"Error in output {test} != {ref}") - errors += 1 - else: - if verbosity >= 1: - print(f"Correct output {test} == {ref}") +from aie.utils.trace import parse_trace +import aie.utils.test as test_utils +import aie.iron as iron +from aie.utils import DefaultNPURuntime - return errors - - -def load_instr_binary(instr_file): - """Load instruction binary file""" - try: - with open(instr_file, "rb") as f: - instr_data = f.read() - # Convert to uint32 array - instr_v = np.frombuffer(instr_data, dtype=np.uint32) - return instr_v - except FileNotFoundError: - print(f"ERROR: Instruction file '{instr_file}' not found.") - sys.exit(1) +IN_OUT_SIZE = 4096 +IN_OUT_DTYPE = np.int32 +SCALAR_FACTOR = 3 def generate_trace_json(trace_buffer, mlir_file, verbosity=0): @@ -119,152 +60,49 @@ def generate_trace_json(trace_buffer, mlir_file, 
verbosity=0): return False -def parse_args(): - """Parse command line arguments""" - parser = argparse.ArgumentParser() - - parser.add_argument("--xclbin", required=True, help="Path to XCLBIN file") - parser.add_argument( - "--instr", required=True, help="Path to instruction binary file" - ) - parser.add_argument("--kernel", default="MLIR_AIE", help="Kernel name") - parser.add_argument( - "--verbosity", "-v", type=int, default=0, help="Verbosity level" - ) - parser.add_argument( - "--trace-sz", type=int, default=8192, help="Trace buffer size in bytes" - ) - parser.add_argument("--mlir", help="MLIR source file for trace parsing") - - return parser.parse_args() - - -def main(): - args = parse_args() - - assert args.trace_sz > 0, "Trace size must be greater than 0" - - if args.verbosity >= 1: +def main(opts): + assert opts.trace_size > 0, "Trace size must be greater than 0" + if opts.verbosity >= 1: print("=" * 80) - print(f"XCLBIN: {args.xclbin}") - print(f"Instruction file: {args.instr}") - print(f"Kernel: {args.kernel}") - print( - f"IN1_VOLUME: {IN1_VOLUME}, IN2_VOLUME: {IN2_VOLUME}, OUT_VOLUME: {OUT_VOLUME}" - ) + print(f"XCLBIN: {opts.xclbin}") + print(f"Instruction file: {opts.instr}") + print(f"Kernel: {opts.kernel}") print("=" * 80) - # Load instruction sequence - instr_v = load_instr_binary(args.instr) - if args.verbosity >= 1: - print(f"Sequence instr count: {len(instr_v)}") + rng = np.random.default_rng(seed=42) + input_data = rng.integers(1, 100, size=IN_OUT_SIZE, dtype=IN_OUT_DTYPE) + in1 = iron.tensor(input_data, dtype=IN_OUT_DTYPE) + in2 = iron.tensor([SCALAR_FACTOR], dtype=IN_OUT_DTYPE) + out = iron.zeros(IN_OUT_SIZE, dtype=IN_OUT_DTYPE) + ref_data = input_data * SCALAR_FACTOR # Start the XRT context and load the kernel - if args.verbosity >= 1: - print("Loading device and kernel...") - - device = xrt.device(0) - xclbin = xrt.xclbin(args.xclbin) - device.register_xclbin(xclbin) - - # Get kernel name from xclbin - xkernels = xclbin.get_kernels() - 
xkernel = None - for k in xkernels: - if k.get_name() == args.kernel: - xkernel = k - break - - if xkernel is None: - print(f"ERROR: Kernel '{args.kernel}' not found in xclbin") - sys.exit(1) - - kernel = xrt.kernel( - device, xclbin.get_uuid(), args.kernel, xrt.kernel.cu_access_mode.exclusive - ) - - if args.verbosity >= 1: - print(f"Kernel loaded: {args.kernel}") - - # Set up buffer objects - bo_instr = xrt.bo(device, len(instr_v) * 4, xrt.bo.cacheable, kernel.group_id(1)) - bo_in1 = xrt.bo(device, IN1_SIZE, xrt.bo.host_only, kernel.group_id(3)) - bo_in2 = xrt.bo(device, IN2_SIZE, xrt.bo.host_only, kernel.group_id(4)) - bo_out = xrt.bo(device, OUT_SIZE, xrt.bo.host_only, kernel.group_id(5)) - bo_extra = xrt.bo(device, 8, xrt.bo.host_only, kernel.group_id(6)) - - # Workaround: allocate trace buffer 4x size - tmp_trace_size = args.trace_sz * 4 - bo_trace = xrt.bo(device, tmp_trace_size, xrt.bo.host_only, kernel.group_id(7)) - - if args.verbosity >= 1: - print("Writing data into buffer objects...") - - # Map buffers and initialize - # Note: pyxrt write() expects bytes - bo_instr.write(instr_v.tobytes(), 0) - - bufIn1 = initialize_bufIn1(IN1_VOLUME) - bufIn2 = initialize_bufIn2(IN2_VOLUME) - bufOut = initialize_bufOut(OUT_VOLUME) - - bo_in1.write(bufIn1.tobytes(), 0) - bo_in2.write(bufIn2.tobytes(), 0) - bo_out.write(bufOut.tobytes(), 0) - - # Initialize trace buffer with zeros - trace_init = np.zeros(tmp_trace_size, dtype=np.uint8) - bo_trace.write(trace_init.tobytes(), 0) - - # Sync host to device - bo_instr.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_in1.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_in2.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - bo_out.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - - bo_trace.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_TO_DEVICE) - - # Run kernel - if args.verbosity >= 1: - print("Running Kernel...") - - opcode = 3 - run = kernel( - opcode, bo_instr, len(instr_v), bo_in1, bo_in2, 
bo_out, bo_extra, bo_trace - ) - run.wait() - - # Sync device to host - bo_out.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) - bo_trace.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) - - # Read output buffer - bufOut_bytes = bo_out.read(OUT_SIZE, 0) - # Ensure bytes for frombuffer - if not isinstance(bufOut_bytes, (bytes, bytearray)): - bufOut_bytes = bytes(bufOut_bytes) - bufOut = np.frombuffer(bufOut_bytes, dtype=DATATYPE_OUT) - - # Verify results - if args.verbosity >= 1: - print("Verifying results...") - - errors = verify_vector_scalar_mul( - bufIn1, bufIn2, bufOut, IN1_VOLUME, args.verbosity - ) - - # Write trace to file - trace_data_bytes = bo_trace.read(args.trace_sz, 0) + if opts.verbosity >= 1: + print("Running...\n") + + opts.trace_size = IN_OUT_SIZE * 4 + + npu_opts = test_utils.create_npu_kernel(opts) + if DefaultNPURuntime.run_test( + npu_opts.npu_kernel, + [in1, in2, out], + {2: ref_data}, + verify=npu_opts.verify, + verbosity=npu_opts.verbosity, + ): + print("Failed.") + return 1 + errors = 0 - # Convert to uint32 array for proper formatting - trace_buffer = np.frombuffer(trace_data_bytes, dtype=np.uint32) + # Read trace from file + trace_buffer = npu_opts.npu_kernel.trace_config.read_trace() - if args.verbosity >= 1: + if opts.verbosity >= 1: print(f"Trace buffer shape: {trace_buffer.shape}") print(f"Trace buffer dtype: {trace_buffer.dtype}") trace_events = generate_trace_json( - trace_buffer, args.mlir, verbosity=args.verbosity + trace_buffer, opts.mlir, verbosity=opts.verbosity ) if not trace_events: @@ -284,7 +122,7 @@ def main(): if event.get("name") == "INSTR_EVENT_1" and event.get("ph") == "B" ) - if args.verbosity >= 1: + if opts.verbosity >= 1: print(f"INSTR_EVENT_0 count: {instr_event_0_count}") print(f"INSTR_EVENT_1 count: {instr_event_1_count}") @@ -306,4 +144,7 @@ def main(): if __name__ == "__main__": - sys.exit(main()) + p = test_utils.create_default_argparser() + p.add_argument("--mlir", dest="mlir", help="MLIR file 
for trace parsing") + opts = p.parse_args(sys.argv[1:]) + sys.exit(main(opts)) diff --git a/test/parse-trace/test1/Makefile b/test/parse-trace/test1/Makefile index 9f16e575fd0..e4777870d04 100644 --- a/test/parse-trace/test1/Makefile +++ b/test/parse-trace/test1/Makefile @@ -4,15 +4,15 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. # ##===----------------------------------------------------------------------===## srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) all: - ${srcdir}/../../../python/utils/parse_trace.py --input ${srcdir}/trace_test1.txt --mlir ${srcdir}/aie_test1.mlir --output trace_test1.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input trace_test1.json + ${srcdir}/../../../python/utils/trace/parse.py --input ${srcdir}/trace_test1.txt --mlir ${srcdir}/aie_test1.mlir --output trace_test1.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_test1.json diff: diff trace_test1.json ${srcdir}/golden_json.txt diff --git a/test/parse-trace/test2/Makefile b/test/parse-trace/test2/Makefile index 4ab8042e421..94220d42252 100644 --- a/test/parse-trace/test2/Makefile +++ b/test/parse-trace/test2/Makefile @@ -4,15 +4,15 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# Copyright (C) 2024, Advanced Micro Devices, Inc. +# Copyright (C) 2024-2026, Advanced Micro Devices, Inc. 
# ##===----------------------------------------------------------------------===## srcdir := $(shell dirname $(realpath $(firstword $(MAKEFILE_LIST)))) all: - ${srcdir}/../../../python/utils/parse_trace.py --input ${srcdir}/trace_test2.txt --mlir ${srcdir}/aie_test2.mlir --output trace_test2.json - ${srcdir}/../../../python/utils/get_trace_summary.py --input trace_test2.json + ${srcdir}/../../../python/utils/trace/parse.py --input ${srcdir}/trace_test2.txt --mlir ${srcdir}/aie_test2.mlir --output trace_test2.json + ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace_test2.json diff: diff trace_test2.json ${srcdir}/golden_json.txt diff --git a/test/python/compile_link.py b/test/python/compile_link.py deleted file mode 100644 index 91dc998c399..00000000000 --- a/test/python/compile_link.py +++ /dev/null @@ -1,100 +0,0 @@ -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2025 AMD Inc. 
- -# RUN: %run_on_npu1% %pytest %s -# RUN: %run_on_npu2% %pytest %s - -import pytest -import os -import tempfile - -from aie.iron.compile import compile_cxx_core_function -from aie.iron.compile import merge_object_files - -SOURCE_STRING1 = """ -extern "C" { -void add_one(int* input, int* output, int tile_size) { - for (int i = 0; i < tile_size; i++) { - output[i] = input[i] + 1; - } -} -}""" - -SOURCE_STRING2 = """ -extern "C" { -void add_two(int* input, int* output, int tile_size) { - for (int i = 0; i < tile_size; i++) { - output[i] = input[i] + 2; - } -} -}""" - - -def test_compile(): - """Test compilation of a C++ source file to an object file.""" - with tempfile.NamedTemporaryFile( - mode="w", suffix=".cpp", delete_on_close=False, delete=True - ) as source_file, tempfile.NamedTemporaryFile( - mode="r", suffix=".o", delete_on_close=True - ) as output_file: - source_file.write(SOURCE_STRING1) - source_file.close() - assert os.path.getsize(source_file.name) > 0 - - assert os.path.getsize(output_file.name) == 0 - compile_cxx_core_function( - source_path=source_file.name, - target_arch="aie2", - output_path=output_file.name, - compile_args=["-DTEST"], - ) - assert os.path.getsize(output_file.name) > 0 - - -def test_compile_and_link(): - """Test compilation of two C++ source files and link them.""" - with tempfile.NamedTemporaryFile( - mode="w", suffix=".cpp", delete_on_close=False, delete=True - ) as source_file1, tempfile.NamedTemporaryFile( - mode="w", suffix=".cpp", delete_on_close=False, delete=True - ) as source_file2, tempfile.NamedTemporaryFile( - mode="r", suffix=".o", delete_on_close=True - ) as output_file1, tempfile.NamedTemporaryFile( - mode="r", suffix=".o", delete_on_close=True - ) as output_file2, tempfile.NamedTemporaryFile( - mode="r", suffix=".o", delete_on_close=True - ) as combined_output_file: - - source_file1.write(SOURCE_STRING1) - source_file1.close() - assert os.path.getsize(source_file1.name) > 0 - - source_file2.write(SOURCE_STRING2) - 
source_file2.close() - assert os.path.getsize(source_file2.name) > 0 - - assert os.path.getsize(output_file1.name) == 0 - compile_cxx_core_function( - source_path=source_file1.name, - target_arch="aie2", - output_path=output_file1.name, - ) - assert os.path.getsize(output_file1.name) > 0 - - assert os.path.getsize(output_file2.name) == 0 - compile_cxx_core_function( - source_path=source_file2.name, - target_arch="aie2", - output_path=output_file2.name, - ) - assert os.path.getsize(output_file2.name) > 0 - - assert os.path.getsize(combined_output_file.name) == 0 - merge_object_files( - object_paths=[output_file1.name, output_file2.name], - output_path=combined_output_file.name, - ) - assert os.path.getsize(combined_output_file.name) > 0 diff --git a/test/python/lit.local.cfg b/test/python/lit.local.cfg index 12377510b7f..7ad21c2240c 100644 --- a/test/python/lit.local.cfg +++ b/test/python/lit.local.cfg @@ -2,8 +2,8 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2023 Advanced Micro Devices Inc. +# (c) Copyright 2023-2026 Advanced Micro Devices Inc. if not config.enable_python_tests: config.unsupported = True -config.excludes.add("util.py") \ No newline at end of file +config.excludes.add("util.py") diff --git a/test/python/npu-xrt/lit.local.cfg b/test/python/npu-xrt/lit.local.cfg new file mode 100644 index 00000000000..c57c0697c8f --- /dev/null +++ b/test/python/npu-xrt/lit.local.cfg @@ -0,0 +1,11 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2023-2026 Advanced Micro Devices Inc. 
+if not config.enable_python_tests: + config.unsupported = True + +config.excludes.add("util.py") + +config.parallelism_group = "npu-xrt" diff --git a/test/python/npu-xrt/test_cached_xrt_runtime.py b/test/python/npu-xrt/test_cached_xrt_runtime.py new file mode 100644 index 00000000000..082fc25538d --- /dev/null +++ b/test/python/npu-xrt/test_cached_xrt_runtime.py @@ -0,0 +1,495 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 AMD Inc. + +# RUN: %run_on_npu1% %pytest %s +# RUN: %run_on_npu2% %pytest %s +# REQUIRES: xrt_python_bindings + +import pytest +import numpy as np +import time +import os +import aie.iron as iron +from aie.iron import ObjectFifo, Worker, Runtime, Program +from aie.iron.placers import SequentialPlacer +from aie.iron.controlflow import range_ +import aie.utils +import aie.utils.jit +from aie.utils.hostruntime.xrtruntime.hostruntime import ( + CachedXRTRuntime, + XRTHostRuntime, +) + + +@pytest.fixture +def runtime(): + # Create new runtime instance + rt = CachedXRTRuntime() + + # Save old values + old_utils_runtime = aie.utils.DefaultNPURuntime + + # Set new values + aie.utils.DefaultNPURuntime = rt + + yield rt + + # Restore + aie.utils.DefaultNPURuntime = old_utils_runtime + rt.cleanup() + + +@iron.jit(is_placed=False) +def transform(input, output, func): + """Transform kernel that applies a function to input tensor and stores result in output tensor.""" + if input.shape != output.shape: + raise ValueError( + f"Input shapes are not equal ({input.shape} != {output.shape})."
+ ) + num_elements = np.size(input) + + if isinstance(func, iron.ExternalFunction): + tile_size = func.tile_size(0) + else: + tile_size = 16 if num_elements >= 16 else 1 + + if num_elements % tile_size != 0: + raise ValueError( + f"Number of elements ({num_elements}) must be a multiple of {tile_size}." + ) + num_tiles = num_elements // tile_size + + if input.dtype != output.dtype: + raise ValueError( + f"Input data types are not the same ({input.dtype} != {output.dtype})." + ) + + dtype = input.dtype + + # Define tensor types + tensor_ty = np.ndarray[(num_elements,), np.dtype[dtype]] + tile_ty = np.ndarray[(tile_size,), np.dtype[dtype]] + + # AIE-array data movement with object fifos + of_in = ObjectFifo(tile_ty, name="in") + of_out = ObjectFifo(tile_ty, name="out") + + # Define a task that will run on a compute tile + def core_body(of_in, of_out, func_to_apply): + for _ in range_(num_tiles): + elem_in = of_in.acquire(1) + elem_out = of_out.acquire(1) + if isinstance(func_to_apply, iron.ExternalFunction): + func_to_apply(elem_in, elem_out, tile_size) + else: + for j in range_(tile_size): + elem_out[j] = func_to_apply(elem_in[j]) + of_in.release(1) + of_out.release(1) + + # Create a worker to run the task on a compute tile + worker = Worker(core_body, fn_args=[of_in.cons(), of_out.prod(), func]) + + # Runtime operations to move data to/from the AIE-array + rt = Runtime() + with rt.sequence(tensor_ty, tensor_ty) as (A, B): + rt.start(worker) + rt.fill(of_in.prod(), A) + rt.drain(of_out.cons(), B, wait=True) + + # Place program components (assign them resources on the device) and generate an MLIR module + return Program(iron.get_current_device(), rt).resolve_program(SequentialPlacer()) + + +def test_runtime_caching_reuse(runtime): + """Test that CachedXRTRuntime reuses contexts for the same kernel.""" + + input_tensor = iron.arange(32, dtype=np.int32) + + # First run with lambda + transform(input_tensor, input_tensor, lambda x: x + 1) + + assert 
len(runtime._context_cache) == 1 + + # Get the context from the cache + key1 = list(runtime._context_cache.keys())[0] + entry1 = runtime._context_cache[key1] + context1 = entry1["context"] + + # Second run with same lambda (jit cache should hit, returning same NPUKernel) + transform(input_tensor, input_tensor, lambda x: x + 1) + + assert len(runtime._context_cache) == 1 + + # Verify it's the same context + key2 = list(runtime._context_cache.keys())[0] + entry2 = runtime._context_cache[key2] + context2 = entry2["context"] + + assert key1 == key2 + assert context1 is context2 + + +def test_runtime_caching_multiple_kernels(runtime): + """Test that CachedXRTRuntime caches multiple different kernels.""" + + input_tensor = iron.arange(32, dtype=np.int32) + + # Run first kernel (add 1) + transform(input_tensor, input_tensor, lambda x: x + 1) + assert len(runtime._context_cache) == 1 + + # Run second kernel (multiply by 2) + transform(input_tensor, input_tensor, lambda x: x * 2) + + # Should have 2 entries now + assert len(runtime._context_cache) == 2 + + +def test_runtime_eviction_logic(runtime): + """Test eviction logic by artificially lowering cache size.""" + + original_size = runtime._cache_size + runtime._cache_size = 1 # Set small cache size + + try: + input_tensor = iron.arange(32, dtype=np.int32) + + # Run first kernel + transform(input_tensor, input_tensor, lambda x: x + 1) + assert len(runtime._context_cache) == 1 + key1 = list(runtime._context_cache.keys())[0] + + # Run second kernel (different lambda -> different xclbin) + transform(input_tensor, input_tensor, lambda x: x * 2) + + assert len(runtime._context_cache) == 1 + key2 = list(runtime._context_cache.keys())[0] + + # Verify key changed (eviction happened) + assert key1 != key2 + + finally: + runtime._cache_size = original_size + + +def test_runtime_cache_fill(runtime): + """Test filling the cache to its capacity.""" + + # Ensure cache is empty + runtime.cleanup() + + input_tensor = iron.arange(32, 
dtype=np.int32) + + # Load kernels up to capacity + 1 + limit = runtime._cache_size + first_key = None + + for i in range(limit + 1): + transform(input_tensor, input_tensor, lambda x, val=i: x + val) + + if i == 0: + first_key = list(runtime._context_cache.keys())[0] + + # Check size + expected_size = min(i + 1, limit) + assert len(runtime._context_cache) == expected_size + + # Verify the first one was evicted (since we went to limit + 1) + assert first_key not in runtime._context_cache + + +def test_runtime_mtime_sensitivity(runtime): + """Test that updating the file (changing mtime) causes a reload.""" + + input_tensor = iron.arange(32, dtype=np.int32) + # Load kernel + transform(input_tensor, input_tensor, lambda x: x + 1) + assert len(runtime._context_cache) == 1 + + # Get the xclbin path from the cache key + key = list(runtime._context_cache.keys())[0] + xclbin_path = key[0] + + # Wait a bit to ensure mtime changes + time.sleep(0.01) + + # Touch the xclbin file + os.utime(xclbin_path, None) + + # Load again + transform(input_tensor, input_tensor, lambda x: x + 1) + + # Should have 2 entries now (old one and new one with new mtime) + # Because CachedXRTRuntime keys include mtime, and it doesn't automatically evict old mtime entries for same path unless LRU kicks in. 
+ assert len(runtime._context_cache) == 2 + + keys = list(runtime._context_cache.keys()) + assert keys[0][0] == keys[1][0] # Same path + assert keys[0][1] != keys[1][1] # Different mtime + + +def test_runtime_handle_invalidation(runtime): + """Test that handles are invalidated when context is evicted.""" + + original_size = runtime._cache_size + runtime._cache_size = 1 + + # Capture load calls to get paths + original_load = runtime.load + captured_kernels = [] + + def side_effect_load(npu_kernel, **kwargs): + captured_kernels.append(npu_kernel) + return original_load(npu_kernel, **kwargs) + + runtime.load = side_effect_load + + try: + input_tensor = iron.arange(32, dtype=np.int32) + + # Load first kernel to generate artifacts + transform(input_tensor, input_tensor, lambda x: x + 1) + + # Restore load + runtime.load = original_load + + # Manually load to get a strong reference to the handle + npu_kernel_captured = captured_kernels[0] + xclbin_path = npu_kernel_captured.xclbin_path + insts_path = npu_kernel_captured.insts_path + + class MockNPUKernel: + def __init__(self, x, i): + self.xclbin_path = x + self.insts_path = i + self.kernel_name = "MLIR_AIE" + + npu_kernel = MockNPUKernel(xclbin_path, insts_path) + handle = runtime.load(npu_kernel) + + assert handle is not None + assert handle._is_valid + + # Load second kernel to force eviction + transform(input_tensor, input_tensor, lambda x: x * 2) + + # Verify handle is invalidated + assert not handle._is_valid + + finally: + runtime._cache_size = original_size + + +def test_runtime_cleanup(runtime): + """Test that cleanup clears the cache and invalidates handles.""" + + input_tensor = iron.arange(32, dtype=np.int32) + + # Capture load calls to get paths + original_load = runtime.load + captured_kernels = [] + + def side_effect_load(npu_kernel, **kwargs): + captured_kernels.append(npu_kernel) + return original_load(npu_kernel, **kwargs) + + runtime.load = side_effect_load + + # Load kernel to generate artifacts + 
transform(input_tensor, input_tensor, lambda x: x + 1) + + # Restore load + runtime.load = original_load + + # Manually load to get a strong reference to the handle + npu_kernel_captured = captured_kernels[0] + xclbin_path = npu_kernel_captured.xclbin_path + insts_path = npu_kernel_captured.insts_path + + class MockNPUKernel: + def __init__(self, x, i): + self.xclbin_path = x + self.insts_path = i + self.kernel_name = "MLIR_AIE" + + npu_kernel = MockNPUKernel(xclbin_path, insts_path) + handle = runtime.load(npu_kernel) + + assert handle is not None + assert handle._is_valid + + # Cleanup + runtime.cleanup() + + assert len(runtime._context_cache) == 0 + assert not handle._is_valid + + +def test_base_runtime_load_run(runtime): + """Test that the base XRTHostRuntime works correctly (no caching).""" + + input_tensor = iron.arange(32, dtype=np.int32) + + # Capture load calls to get paths + original_load = runtime.load + captured_kernels = [] + + def side_effect_load(npu_kernel, **kwargs): + captured_kernels.append(npu_kernel) + return original_load(npu_kernel, **kwargs) + + runtime.load = side_effect_load + + # Run transform to generate artifacts using the cached runtime (fixture) + transform(input_tensor, input_tensor, lambda x: x + 1) + + # Restore load + runtime.load = original_load + + # Verify result + res = input_tensor.numpy() + expected = np.arange(32, dtype=np.int32) + 1 + np.testing.assert_array_equal(res, expected) + + # Get paths from cached runtime to use with base runtime + npu_kernel_captured = captured_kernels[0] + xclbin_path = npu_kernel_captured.xclbin_path + insts_path = npu_kernel_captured.insts_path + + class MockNPUKernel: + def __init__(self, x, i): + self.xclbin_path = x + self.insts_path = i + self.kernel_name = "MLIR_AIE" + self.trace_config = None + + npu_kernel = MockNPUKernel(xclbin_path, insts_path) + + # Create base runtime + base_runtime = XRTHostRuntime() + + # Load + handle = base_runtime.load(npu_kernel) + assert handle is not None + 
+ # Run + base_runtime.run(handle, [input_tensor, input_tensor]) + + # Verify result + res = input_tensor.numpy() + expected = expected + 1 + np.testing.assert_array_equal(res, expected) + + # Verify no caching in base runtime + assert not hasattr(base_runtime, "_context_cache") + + +def test_cache_size_limit(runtime): + """Test that cache size is set correctly based on device.""" + cache_sizes = CachedXRTRuntime.NPU_CONTEXT_CACHE_SIZE + npu1_size = cache_sizes["npu1"] + npu2_size = cache_sizes["npu2"] + + expected_size = npu1_size if runtime.npu_str == "npu1" else npu2_size + + env_cache_size = os.environ.get("XRT_CONTEXT_CACHE_SIZE") + if env_cache_size is not None: + expected_size = min(expected_size, int(env_cache_size)) + + assert runtime._cache_size == expected_size + + +def test_runtime_retry_disable(runtime): + """Test that retry=False is accepted.""" + input_tensor = iron.arange(32, dtype=np.int32) + + # Capture load calls to get paths + original_load = runtime.load + captured_kernels = [] + + def side_effect_load(npu_kernel, **kwargs): + captured_kernels.append(npu_kernel) + return original_load(npu_kernel, **kwargs) + + runtime.load = side_effect_load + + # Run transform to generate artifacts + transform(input_tensor, input_tensor, lambda x: x + 1) + + # Restore load + runtime.load = original_load + + # Get paths + npu_kernel_captured = captured_kernels[0] + xclbin_path = npu_kernel_captured.xclbin_path + insts_path = npu_kernel_captured.insts_path + + class MockNPUKernel: + def __init__(self, x, i): + self.xclbin_path = x + self.insts_path = i + self.kernel_name = "MLIR_AIE" + self.trace_config = None + + npu_kernel = MockNPUKernel(xclbin_path, insts_path) + + # Load with retry=False + handle = runtime.load(npu_kernel, retry=False) + assert handle is not None + + +def test_runtime_run_only_if_loaded(runtime): + """Test that run with only_if_loaded=True fails if kernel is evicted.""" + input_tensor = iron.arange(32, dtype=np.int32) + + # Capture load 
calls to get paths + original_load = runtime.load + captured_kernels = [] + + def side_effect_load(npu_kernel, **kwargs): + captured_kernels.append(npu_kernel) + return original_load(npu_kernel, **kwargs) + + runtime.load = side_effect_load + + # Run transform to generate artifacts + transform(input_tensor, input_tensor, lambda x: x + 1) + + # Restore load + runtime.load = original_load + + # Get paths + npu_kernel_captured = captured_kernels[0] + xclbin_path = npu_kernel_captured.xclbin_path + insts_path = npu_kernel_captured.insts_path + + class MockNPUKernel: + def __init__(self, x, i): + self.xclbin_path = x + self.insts_path = i + self.kernel_name = "MLIR_AIE" + self.trace_config = None + + npu_kernel = MockNPUKernel(xclbin_path, insts_path) + + # Load + handle = runtime.load(npu_kernel) + assert handle is not None + assert handle._is_valid + + # Run with only_if_loaded=True (should succeed) + runtime.run(handle, [input_tensor, input_tensor], only_if_loaded=True) + + # Invalidate handle (simulate eviction) + handle.invalidate() + assert not handle._is_valid + + # Run with only_if_loaded=True (should fail) + from aie.utils.hostruntime.hostruntime import HostRuntimeError + + with pytest.raises(HostRuntimeError, match="Kernel not loaded"): + runtime.run(handle, [input_tensor, input_tensor], only_if_loaded=True) diff --git a/test/python/npu-xrt/test_cached_xrt_runtime_insts.py b/test/python/npu-xrt/test_cached_xrt_runtime_insts.py new file mode 100644 index 00000000000..e402f97b1d3 --- /dev/null +++ b/test/python/npu-xrt/test_cached_xrt_runtime_insts.py @@ -0,0 +1,219 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 AMD Inc. 
+ +# RUN: %run_on_npu1% %pytest %s +# RUN: %run_on_npu2% %pytest %s +# REQUIRES: xrt_python_bindings + +import pytest +import numpy as np +import time +import os +import aie.iron as iron +from aie.iron import ObjectFifo, Worker, Runtime, Program +from aie.iron.placers import SequentialPlacer +from aie.iron.controlflow import range_ +import aie.utils +import aie.utils.jit +from aie.utils.hostruntime.xrtruntime.hostruntime import ( + CachedXRTRuntime, + XRTHostRuntime, +) + + +@pytest.fixture +def runtime(): + # Create new runtime instance + rt = CachedXRTRuntime() + + # Save old values + old_utils_runtime = aie.utils.DefaultNPURuntime + + # Set new values + aie.utils.DefaultNPURuntime = rt + + yield rt + + # Restore + aie.utils.DefaultNPURuntime = old_utils_runtime + rt.cleanup() + + +@iron.jit(is_placed=False) +def transform(input, output, func): + """Transform kernel that applies a function to input tensor and stores result in output tensor.""" + if input.shape != output.shape: + raise ValueError( + f"Input shapes are not equal ({input.shape} != {output.shape})." + ) + num_elements = np.size(input) + + if isinstance(func, iron.ExternalFunction): + tile_size = func.tile_size(0) + else: + tile_size = 16 if num_elements >= 16 else 1 + + if num_elements % tile_size != 0: + raise ValueError( + f"Number of elements ({num_elements}) must be a multiple of {tile_size}." + ) + num_tiles = num_elements // tile_size + + if input.dtype != output.dtype: + raise ValueError( + f"Input data types are not the same ({input.dtype} != {output.dtype})."
+ ) + + dtype = input.dtype + + # Define tensor types + tensor_ty = np.ndarray[(num_elements,), np.dtype[dtype]] + tile_ty = np.ndarray[(tile_size,), np.dtype[dtype]] + + # AIE-array data movement with object fifos + of_in = ObjectFifo(tile_ty, name="in") + of_out = ObjectFifo(tile_ty, name="out") + + # Define a task that will run on a compute tile + def core_body(of_in, of_out, func_to_apply): + for _ in range_(num_tiles): + elem_in = of_in.acquire(1) + elem_out = of_out.acquire(1) + if isinstance(func_to_apply, iron.ExternalFunction): + func_to_apply(elem_in, elem_out, tile_size) + else: + for j in range_(tile_size): + elem_out[j] = func_to_apply(elem_in[j]) + of_in.release(1) + of_out.release(1) + + # Create a worker to run the task on a compute tile + worker = Worker(core_body, fn_args=[of_in.cons(), of_out.prod(), func]) + + # Runtime operations to move data to/from the AIE-array + rt = Runtime() + with rt.sequence(tensor_ty, tensor_ty) as (A, B): + rt.start(worker) + rt.fill(of_in.prod(), A) + rt.drain(of_out.cons(), B, wait=True) + + # Place program components (assign them resources on the device) and generate an MLIR module + return Program(iron.get_current_device(), rt).resolve_program(SequentialPlacer()) + + +def test_insts_caching(runtime): + """Test that insts buffers are cached and reused.""" + + input_tensor = iron.arange(32, dtype=np.int32) + + # First run + transform(input_tensor, input_tensor, lambda x: x + 1) + + # Check if _insts_cache exists (it should after our changes) + if not hasattr(runtime, "_insts_cache"): + pytest.skip("CachedXRTRuntime does not have _insts_cache yet") + + assert len(runtime._insts_cache) == 1 + + # Get the insts_bo from the cache + key1 = list(runtime._insts_cache.keys())[0] + entry1 = runtime._insts_cache[key1] + insts_bo1 = entry1["insts_bo"] + + # Second run with same lambda (should reuse insts) + transform(input_tensor, input_tensor, lambda x: x + 1) + + assert len(runtime._insts_cache) == 1 + + # Verify it's the 
same insts_bo + key2 = list(runtime._insts_cache.keys())[0] + entry2 = runtime._insts_cache[key2] + insts_bo2 = entry2["insts_bo"] + + assert key1 == key2 + # Note: We can't easily check object identity of BOs if they are wrapped, + # but we can check if the entry is the same object. + assert entry1 is entry2 + + +def test_insts_initialization(runtime): + """Test that insts_bo is initialized during load.""" + + input_tensor = iron.arange(32, dtype=np.int32) + + # Capture load calls to get paths + original_load = runtime.load + captured_kernels = [] + + def side_effect_load(npu_kernel): + captured_kernels.append(npu_kernel) + return original_load(npu_kernel) + + runtime.load = side_effect_load + + # Run once to generate artifacts + transform(input_tensor, input_tensor, lambda x: x + 1) + + # Restore load + runtime.load = original_load + + if not hasattr(runtime, "_context_cache"): + pytest.skip("CachedXRTRuntime does not have _context_cache") + + # Manually load to get a strong reference + npu_kernel_captured = captured_kernels[0] + xclbin_path = npu_kernel_captured.xclbin_path + insts_path = npu_kernel_captured.insts_path + + class MockNPUKernel: + def __init__(self, x, i): + self.xclbin_path = x + self.insts_path = i + self.kernel_name = "MLIR_AIE" + + npu_kernel = MockNPUKernel(xclbin_path, insts_path) + handle = runtime.load(npu_kernel) + + assert handle is not None + # Check if handle has insts_bo (after our changes) + if hasattr(handle, "insts_bo"): + assert handle.insts_bo is not None + else: + pytest.skip("XRTKernelHandle does not have insts_bo yet") + + +def test_insts_mtime_sensitivity(runtime): + """Test that updating the insts file causes a reload.""" + + input_tensor = iron.arange(32, dtype=np.int32) + + # Load kernel + transform(input_tensor, input_tensor, lambda x: x + 1) + + if not hasattr(runtime, "_insts_cache"): + pytest.skip("CachedXRTRuntime does not have _insts_cache yet") + + assert len(runtime._insts_cache) == 1 + + # Get the insts path from 
the cache key + key = list(runtime._insts_cache.keys())[0] + insts_path = key[0] + + # Wait a bit to ensure mtime changes + time.sleep(0.01) + + # Touch the insts file + os.utime(insts_path, None) + + # Load again + transform(input_tensor, input_tensor, lambda x: x + 1) + + # Should have 2 entries now (old one and new one with new mtime) + assert len(runtime._insts_cache) == 2 + + keys = list(runtime._insts_cache.keys()) + assert keys[0][0] == keys[1][0] # Same path + assert keys[0][1] != keys[1][1] # Different mtime diff --git a/test/python/cache_functionality.py b/test/python/npu-xrt/test_compile_cache_functionality.py similarity index 91% rename from test/python/cache_functionality.py rename to test/python/npu-xrt/test_compile_cache_functionality.py index ac5a11edc2c..cd3f44fcea5 100644 --- a/test/python/cache_functionality.py +++ b/test/python/npu-xrt/test_compile_cache_functionality.py @@ -2,7 +2,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 AMD Inc. +# (c) Copyright 2025-2026 AMD Inc. 
# RUN: %run_on_npu1% %pytest %s # RUN: %run_on_npu2% %pytest %s @@ -85,15 +85,14 @@ def core_body(of_in, of_out, func_to_apply): def test_cache_lambda_functions(): """Test that caching works correctly with different lambda functions.""" # Create input tensor - input_tensor = iron.tensor((32,), dtype=np.int32) - input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + input_tensor = iron.arange(32, dtype=np.int32) # Test 1: First execution with lambda function transform(input_tensor, input_tensor, lambda x: x + 1) result1 = input_tensor.numpy().copy() # Reset tensor - input_tensor[:] = np.arange(1, 33, dtype=np.int32) + input_tensor[:] = np.arange(32, dtype=np.int32) # Test 2: Second execution with same lambda function (should use cache) transform(input_tensor, input_tensor, lambda x: x + 1) @@ -116,8 +115,7 @@ def test_cache_lambda_functions(): def test_cache_external_functions(): """Test that ExternalFunction caching works correctly during execution.""" # Create input tensor - input_tensor = iron.tensor((32,), dtype=np.int32) - input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + input_tensor = iron.arange(32, dtype=np.int32) # Test 1: First execution add_one_1 = ExternalFunction( @@ -139,7 +137,7 @@ def test_cache_external_functions(): result1 = input_tensor.numpy().copy() # Reset tensor - input_tensor[:] = np.arange(1, 33, dtype=np.int32) + input_tensor[:] = np.arange(32, dtype=np.int32) # Test 2: Second execution add_one_2 = ExternalFunction( @@ -180,7 +178,7 @@ def test_cache_external_functions(): ], ) - input_tensor[:] = np.arange(1, 33, dtype=np.int32) + input_tensor[:] = np.arange(32, dtype=np.int32) transform(input_tensor, input_tensor, multiply_two) result3 = input_tensor.numpy() @@ -193,8 +191,7 @@ def test_cache_external_functions(): def test_cache_compile_flags(): """Test that ExternalFunctions with different compile flags produce different results.""" # Create input tensor - input_tensor = iron.tensor((32,), 
dtype=np.int32) - input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + input_tensor = iron.arange(32, dtype=np.int32) # Create ExternalFunctions with different compile flags add_5 = ExternalFunction( @@ -236,7 +233,7 @@ def test_cache_compile_flags(): result_5 = input_tensor.numpy().copy() # Reset and test with ADD_VALUE=10 - input_tensor[:] = np.arange(1, 33, dtype=np.int32) + input_tensor[:] = np.arange(32, dtype=np.int32) transform(input_tensor, input_tensor, add_10) result_10 = input_tensor.numpy() @@ -246,8 +243,8 @@ def test_cache_compile_flags(): ) # Verify expected results - expected_5 = np.arange(1, 33, dtype=np.int32) + 5 - expected_10 = np.arange(1, 33, dtype=np.int32) + 10 + expected_5 = np.arange(32, dtype=np.int32) + 5 + expected_10 = np.arange(32, dtype=np.int32) + 10 np.testing.assert_array_equal(result_5, expected_5) np.testing.assert_array_equal(result_10, expected_10) @@ -360,8 +357,7 @@ def test_cache_file_source(): def test_cache_include_directories(): """Test that ExternalFunctions with include directories work correctly.""" # Create input tensor - input_tensor = iron.tensor((32,), dtype=np.int32) - input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + input_tensor = iron.arange(32, dtype=np.int32) # Create temporary directory with header file with tempfile.TemporaryDirectory() as temp_dir: @@ -394,7 +390,7 @@ def test_cache_include_directories(): result = input_tensor.numpy() # Verify expected results - expected = np.arange(1, 33, dtype=np.int32) + 42 + expected = np.arange(32, dtype=np.int32) + 42 np.testing.assert_array_equal(result, expected) @@ -405,8 +401,7 @@ def test_cache_tensor_shapes(): results = [] for size in sizes: - input_tensor = iron.tensor((size,), dtype=np.int32) - input_tensor[:] = np.arange(1, size + 1, dtype=np.int32) + input_tensor = iron.arange(size, dtype=np.int32) # Apply transformation transform(input_tensor, input_tensor, lambda x: x + 1) @@ -414,7 +409,7 @@ def 
test_cache_tensor_shapes(): results.append(result) # Verify expected results - expected = np.arange(1, size + 1, dtype=np.int32) + 1 + expected = np.arange(size, dtype=np.int32) + 1 np.testing.assert_array_equal(result, expected) @@ -425,8 +420,7 @@ def test_cache_tensor_dtypes(): results = [] for dtype in dtypes: - input_tensor = iron.tensor((32,), dtype=dtype) - input_tensor[:] = np.arange(1, 33, dtype=dtype) + input_tensor = iron.arange(32, dtype=dtype) # Apply transformation transform(input_tensor, input_tensor, lambda x: x + 1) @@ -434,5 +428,5 @@ def test_cache_tensor_dtypes(): results.append(result) # Verify expected results - expected = np.arange(1, 33, dtype=dtype) + 1 + expected = np.arange(32, dtype=dtype) + 1 np.testing.assert_array_equal(result, expected) diff --git a/test/python/npu-xrt/test_compile_link.py b/test/python/npu-xrt/test_compile_link.py new file mode 100644 index 00000000000..fbf8db4dd3f --- /dev/null +++ b/test/python/npu-xrt/test_compile_link.py @@ -0,0 +1,95 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 AMD Inc. 
+ +# RUN: %run_on_npu1% %pytest %s +# RUN: %run_on_npu2% %pytest %s + +import pytest +import os +import tempfile + +from aie.utils.compile import compile_cxx_core_function +from aie.utils.compile import merge_object_files + +SOURCE_STRING1 = """ +extern "C" { +void add_one(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + 1; + } +} +}""" + +SOURCE_STRING2 = """ +extern "C" { +void add_two(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + 2; + } +} +}""" + + +def test_compile(): + """Test compilation of a C++ source file to an object file.""" + with tempfile.TemporaryDirectory() as tmpdir: + source_path = os.path.join(tmpdir, "source.cpp") + output_path = os.path.join(tmpdir, "output.o") + + with open(source_path, "w") as f: + f.write(SOURCE_STRING1) + + assert os.path.getsize(source_path) > 0 + assert not os.path.exists(output_path) + + compile_cxx_core_function( + source_path=source_path, + target_arch="aie2", + output_path=output_path, + compile_args=["-DTEST"], + ) + assert os.path.getsize(output_path) > 0 + + +def test_compile_and_link(): + """Test compilation of two C++ source files and link them.""" + with tempfile.TemporaryDirectory() as tmpdir: + source_path1 = os.path.join(tmpdir, "source1.cpp") + source_path2 = os.path.join(tmpdir, "source2.cpp") + output_path1 = os.path.join(tmpdir, "output1.o") + output_path2 = os.path.join(tmpdir, "output2.o") + combined_output_path = os.path.join(tmpdir, "combined.o") + + with open(source_path1, "w") as f: + f.write(SOURCE_STRING1) + assert os.path.getsize(source_path1) > 0 + + with open(source_path2, "w") as f: + f.write(SOURCE_STRING2) + assert os.path.getsize(source_path2) > 0 + + assert not os.path.exists(output_path1) + compile_cxx_core_function( + source_path=source_path1, + target_arch="aie2", + output_path=output_path1, + ) + assert os.path.getsize(output_path1) > 0 + + assert not 
os.path.exists(output_path2) + compile_cxx_core_function( + source_path=source_path2, + target_arch="aie2", + output_path=output_path2, + ) + assert os.path.getsize(output_path2) > 0 + + assert not os.path.exists(combined_output_path) + merge_object_files( + object_paths=[output_path1, output_path2], + output_path=combined_output_path, + ) + assert os.path.getsize(combined_output_path) > 0 diff --git a/test/python/device.py b/test/python/npu-xrt/test_device.py similarity index 98% rename from test/python/device.py rename to test/python/npu-xrt/test_device.py index b71e06b04fa..3e5ef135bf5 100644 --- a/test/python/device.py +++ b/test/python/npu-xrt/test_device.py @@ -2,7 +2,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 AMD Inc. +# (c) Copyright 2025-2026 AMD Inc. # RUN: %run_on_npu1% %pytest %s # RUN: %run_on_npu2% %pytest %s diff --git a/test/python/npu-xrt/test_device_override.py b/test/python/npu-xrt/test_device_override.py new file mode 100644 index 00000000000..5f9300e1ba3 --- /dev/null +++ b/test/python/npu-xrt/test_device_override.py @@ -0,0 +1,66 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2026 AMD Inc. 
+ +# RUN: %run_on_npu1% %pytest %s +# RUN: %run_on_npu2% %pytest %s + +import pytest +from aie.iron.device import NPU1, NPU2 +import aie.utils as utils + + +def test_device_override(): + # Save current device if any (likely from runtime) + original_device = utils.get_current_device() + + # Create a dummy device + if isinstance(original_device, NPU1): + dummy_device = NPU2() + else: + dummy_device = NPU1() + + # Set override + utils.set_current_device(dummy_device) + + # Check if override works + assert utils.get_current_device() == dummy_device + + # Reset override + utils.set_current_device(None) + + # Check if reset works (should return original device) + # If runtime is present, it returns runtime device. + # If not, it returns None. + # In both cases, it should match original_device (assuming runtime didn't change/disappear) + assert type(utils.get_current_device()) == type(original_device) + + +def test_device_consistency(): + # Define a mock runtime class + class MockRuntime(utils.HostRuntime): + def load(self, *args, **kwargs): + pass + + def run(self, *args, **kwargs): + pass + + def device(self): + return NPU1() + + runtime = MockRuntime() + + # Set compatible override + utils.set_current_device(NPU1()) + # Should not raise + runtime.check_device_consistency() + + # Set incompatible override + utils.set_current_device(NPU2()) + with pytest.raises(RuntimeError, match="does not match runtime device"): + runtime.check_device_consistency() + + # Reset + utils.set_current_device(None) diff --git a/test/python/jit_compilation.py b/test/python/npu-xrt/test_jit_compilation.py similarity index 99% rename from test/python/jit_compilation.py rename to test/python/npu-xrt/test_jit_compilation.py index e0164b74cd3..d73fcbf0c03 100644 --- a/test/python/jit_compilation.py +++ b/test/python/npu-xrt/test_jit_compilation.py @@ -2,7 +2,7 @@ # See https://llvm.org/LICENSE.txt for license information. 
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 AMD Inc. +# (c) Copyright 2025-2026 AMD Inc. # RUN: %run_on_npu1% %pytest %s # RUN: %run_on_npu2% %pytest %s diff --git a/test/python/jit_extern_functions.py b/test/python/npu-xrt/test_jit_extern_functions.py similarity index 99% rename from test/python/jit_extern_functions.py rename to test/python/npu-xrt/test_jit_extern_functions.py index cf75aa6c50c..e3db9127ef2 100644 --- a/test/python/jit_extern_functions.py +++ b/test/python/npu-xrt/test_jit_extern_functions.py @@ -2,7 +2,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 AMD Inc. +# (c) Copyright 2025-2026 AMD Inc. # RUN: %run_on_npu1% %pytest %s # RUN: %run_on_npu2% %pytest %s diff --git a/test/python/jit_extern_functions_inside_jit.py b/test/python/npu-xrt/test_jit_extern_functions_inside_jit.py similarity index 99% rename from test/python/jit_extern_functions_inside_jit.py rename to test/python/npu-xrt/test_jit_extern_functions_inside_jit.py index bacc140b650..12f3c3e7476 100644 --- a/test/python/jit_extern_functions_inside_jit.py +++ b/test/python/npu-xrt/test_jit_extern_functions_inside_jit.py @@ -2,7 +2,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 AMD Inc. +# (c) Copyright 2025-2026 AMD Inc. # RUN: %run_on_npu1% %pytest %s # RUN: %run_on_npu2% %pytest %s diff --git a/test/python/npu-xrt/test_jit_trace.py b/test/python/npu-xrt/test_jit_trace.py new file mode 100644 index 00000000000..3a8e4298d74 --- /dev/null +++ b/test/python/npu-xrt/test_jit_trace.py @@ -0,0 +1,89 @@ +# test_jit_trace.py -*- Python -*- +# +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. + +# RUN: %run_on_npu1% %pytest %s +# RUN: %run_on_npu2% %pytest %s +# REQUIRES: xrt_python_bindings + +import pytest +import numpy as np +import os +import aie.iron as iron +from aie.utils.jit import jit +from aie.utils import tensor +from aie.utils.trace import TraceConfig, parse_trace +from aie.iron import Kernel, ObjectFifo, Program, Runtime, Worker +from aie.iron.placers import SequentialPlacer +from aie.iron.controlflow import range_ + + +# Define kernel function +def scale_scalar(of_in, of_out, factor, N): + elem_in = of_in.acquire(1) + elem_out = of_out.acquire(1) + for i in range_(N): + elem_out[i] = elem_in[i] + of_in.release(1) + of_out.release(1) + + +@jit(is_placed=False) +def design(a_in, c_out, trace_config=None): + N = 1024 + # Construct types for sequence + a_type = np.ndarray[(1024,), np.dtype[np.int32]] + c_type = np.ndarray[(1024,), np.dtype[np.int32]] + + # Define ObjectFifos + of_in = ObjectFifo(a_type, depth=2) + of_out = ObjectFifo(c_type, depth=2) + + # Define Worker + worker = Worker(scale_scalar, fn_args=[of_in.cons(), of_out.prod(), 2, N]) + + rt = Runtime() + with rt.sequence(a_type, c_type) as (a, c): + if trace_config: + rt.enable_trace(trace_config.trace_size, workers=[worker]) + + # In runtime sequence: + rt.fill(of_in.prod(), a) + rt.start(worker) + rt.drain(of_out.cons(), c, wait=True) + return Program(iron.get_current_device(), rt).resolve_program(SequentialPlacer()) + + +@pytest.mark.parametrize("trace_size", [8192]) +def test_jit_trace(trace_size): + N = 1024 + ref = np.arange(N, dtype=np.int32) + a = tensor(ref, dtype=np.int32) + c = tensor(np.zeros(N, dtype=np.int32), dtype=np.int32) + + trace_config = TraceConfig(trace_size=trace_size, trace_after_last_tensor=False) + + # Run JIT kernel with tracing + design(a, c, trace_config=trace_config) + + # Sync output from device + c.to("cpu") + + # Verify results + 
assert np.array_equal(c.numpy(), ref) + + # Verify trace file exists + assert os.path.exists(trace_config.trace_file) + + # Parse trace + # Get MLIR module from the wrapped function + mlir_module = design.__wrapped__(a, c, trace_config=trace_config) + + trace_buffer = trace_config.read_trace() + trace_events = parse_trace(trace_buffer, str(mlir_module)) + + assert len(trace_events) > 0 diff --git a/test/python/objectfifo.py b/test/python/npu-xrt/test_objectfifo.py similarity index 99% rename from test/python/objectfifo.py rename to test/python/npu-xrt/test_objectfifo.py index dd94c4d1b10..d582b1c3a1d 100644 --- a/test/python/objectfifo.py +++ b/test/python/npu-xrt/test_objectfifo.py @@ -2,7 +2,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 AMD Inc. +# (c) Copyright 2025-2026 AMD Inc. # RUN: %run_on_npu1% %pytest %s # RUN: %run_on_npu2% %pytest %s diff --git a/test/python/tensor.py b/test/python/npu-xrt/test_tensor.py similarity index 88% rename from test/python/tensor.py rename to test/python/npu-xrt/test_tensor.py index 33e5df4510b..040025065b8 100644 --- a/test/python/tensor.py +++ b/test/python/npu-xrt/test_tensor.py @@ -2,7 +2,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 AMD Inc. +# (c) Copyright 2025-2026 AMD Inc. 
# RUN: %run_on_npu1% %pytest %s # RUN: %run_on_npu2% %pytest %s @@ -11,33 +11,22 @@ import pytest import numpy as np import aie.iron as iron -from aie.iron.hostruntime.tensor import CPUOnlyTensor, XRTTensor, Tensor +from aie.utils.hostruntime.tensor_class import CPUOnlyTensor, Tensor +from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor +from aie.utils.hostruntime import bfloat16_safe_allclose from ml_dtypes import bfloat16 TENSOR_CLASSES = [CPUOnlyTensor, XRTTensor] TEST_DTYPES = [np.float32, np.int32, bfloat16] -def bfloat16_safe_allclose(dtype, arr1, arr2): - if dtype == bfloat16: - if isinstance(arr1, Tensor): - arr1 = np.array(arr1, dtype=np.float16) - else: - arr1 = arr1.astype(np.float16) - if isinstance(arr2, Tensor): - arr2 = np.array(arr2, dtype=np.float16) - else: - arr2 = arr2.astype(np.float16) - return np.allclose(arr1, arr2) - - @pytest.mark.parametrize("dtype", TEST_DTYPES) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_tensor_creation(dtype, tensorclass): for d in tensorclass.DEVICES: t = tensorclass((2, 2), dtype=dtype, device=d) assert t.dtype == dtype - assert isinstance(t, iron.hostruntime.tensor.Tensor) + assert isinstance(t, Tensor) assert isinstance(t, tensorclass) expected = np.zeros((2, 2), dtype=dtype) assert bfloat16_safe_allclose(dtype, t, expected) @@ -48,10 +37,10 @@ def test_tensor_creation(dtype, tensorclass): @pytest.mark.parametrize("dtype", TEST_DTYPES) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_to_device(dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) for d in tensorclass.DEVICES: t = iron.ones((2, 2), dtype=dtype, device=d) - assert isinstance(t, iron.hostruntime.tensor.Tensor) + assert isinstance(t, Tensor) assert isinstance(t, tensorclass) assert t.dtype == dtype for d2 in tensorclass.DEVICES: @@ -61,7 +50,7 @@ def test_to_device(dtype, tensorclass): @pytest.mark.parametrize("dtype", TEST_DTYPES) 
@pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_zeros(dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) t = iron.zeros(2, 3, dtype=dtype) assert isinstance(t, tensorclass) assert bfloat16_safe_allclose(dtype, t, np.zeros((2, 3), dtype=dtype)) @@ -70,7 +59,7 @@ def test_zeros(dtype, tensorclass): @pytest.mark.parametrize("dtype", TEST_DTYPES) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_ones(dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) t = iron.ones((2, 2), dtype=dtype) assert isinstance(t, tensorclass) assert bfloat16_safe_allclose(dtype, t, np.ones((2, 2), dtype=dtype)) @@ -81,7 +70,7 @@ def test_ones(dtype, tensorclass): ) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_random_with_bounds(dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) for d in tensorclass.DEVICES: t = iron.randint(0, 32, (2, 4), dtype=dtype, device=d) assert t.shape == (2, 4) @@ -92,7 +81,7 @@ def test_random_with_bounds(dtype, tensorclass): @pytest.mark.parametrize("dtype", TEST_DTYPES) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_rand(dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) for d in tensorclass.DEVICES: t = iron.rand(2, 2, dtype=dtype, device=d) arr = t.numpy() @@ -104,7 +93,7 @@ def test_rand(dtype, tensorclass): ) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_arange_integer(dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) assert np.array_equal(iron.arange(3, 9, dtype=dtype), np.arange(3, 9, dtype=dtype)) @@ -113,7 +102,7 @@ def test_arange_integer(dtype, tensorclass): ) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_arange_floats(dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + 
iron.set_tensor_class(tensorclass) assert bfloat16_safe_allclose( dtype, iron.arange(1.0, 5.0, 1.5, dtype=dtype), @@ -125,7 +114,7 @@ def test_arange_floats(dtype, tensorclass): @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_fill(dtype, tensorclass): """Test the fill method for in-place tensor filling.""" - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) for d in tensorclass.DEVICES: t = iron.zeros((2, 3), dtype=dtype, device=d) @@ -147,7 +136,7 @@ def test_fill(dtype, tensorclass): @pytest.mark.parametrize("dtype", TEST_DTYPES) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_zeros_like(dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) t = iron.tensor([[1, 2], [3, 4]], dtype=dtype) z = iron.zeros_like(t) expected = np.zeros_like(t) @@ -158,7 +147,7 @@ def test_zeros_like(dtype, tensorclass): @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_tensor_repr(dtype, tensorclass): """Test that __repr__ properly syncs from device and shows correct data.""" - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) for d in tensorclass.DEVICES: t = iron.tensor([[1, 2], [3, 4]], dtype=dtype, device=d) # Modify data on device @@ -178,7 +167,7 @@ def test_tensor_repr(dtype, tensorclass): @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_tensor_getitem(dtype, tensorclass): """Test that __getitem__ properly syncs from device.""" - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) for d in tensorclass.DEVICES: t = iron.tensor([[1, 2], [3, 4]], dtype=dtype, device=d) # Modify data on device @@ -192,7 +181,7 @@ def test_tensor_getitem(dtype, tensorclass): @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_tensor_setitem(dtype, tensorclass): """Test that __setitem__ properly syncs to and from device.""" - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) 
for d in tensorclass.DEVICES: t = iron.tensor([[1, 2], [3, 4]], dtype=dtype, device=d) t[0, 1] = 42 @@ -208,7 +197,7 @@ def test_tensor_setitem(dtype, tensorclass): @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_tensor_getitem_setitem_consistency(dtype, tensorclass): """Test that getitem and setitem work consistently with device sync.""" - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) for d in tensorclass.DEVICES: t = iron.zeros((2, 2), dtype=dtype, device=d) # Set values @@ -232,7 +221,7 @@ def test_tensor_getitem_setitem_consistency(dtype, tensorclass): ) def test_cpu_tensor_no_sync(dtype, tensorclass): """Test that CPU tensors operations.""" - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) t = iron.tensor([[1, 2], [3, 4]], dtype=dtype, device="cpu") assert t[0, 1] == 2 t[0, 1] = 42 @@ -264,7 +253,7 @@ def test_device_attribute_update(dtype): @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_npu_tensor_sync_behavior(dtype, tensorclass): """Test that NPU tensors when implicit sync is required.""" - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) for d in tensorclass.DEVICES: t = iron.tensor([[1, 2], [3, 4]], dtype=dtype, device=d) assert t.device == d @@ -311,7 +300,7 @@ def test_mixed_device_operations(dtype): @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_rand_bfloat16_boundary(dtype, tensorclass): """Test that bfloat16 rand never produces 1.0 due to rounding.""" - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) # Generate many values to increase chance of hitting boundary t = iron.rand(10000, dtype=dtype) arr = t.numpy() diff --git a/test/python/torch_comparison.py b/test/python/npu-xrt/test_torch_comparison.py similarity index 92% rename from test/python/torch_comparison.py rename to test/python/npu-xrt/test_torch_comparison.py index 4db6e130484..061fc9a3744 100644 --- 
a/test/python/torch_comparison.py +++ b/test/python/npu-xrt/test_torch_comparison.py @@ -2,7 +2,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 AMD Inc. +# (c) Copyright 2025-2026 AMD Inc. # RUN: %run_on_npu1% %pytest %s # RUN: %run_on_npu2% %pytest %s @@ -13,7 +13,8 @@ import numpy as np import torch import aie.iron as iron -from aie.iron.hostruntime.tensor import CPUOnlyTensor, XRTTensor, Tensor +from aie.utils.hostruntime.tensor_class import CPUOnlyTensor, Tensor +from aie.utils.hostruntime.xrtruntime.tensor import XRTTensor from ml_dtypes import bfloat16 TENSOR_CLASSES = [CPUOnlyTensor, XRTTensor] @@ -39,7 +40,7 @@ def bfloat16_safe_allclose(dtype, arr1, arr2): @pytest.mark.parametrize("dtype, torch_dtype", zip(TEST_DTYPES, TORCH_DTYPES)) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_zeros(shape, dtype, torch_dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) iron_t = iron.zeros(shape, dtype=dtype) torch_t = torch.zeros(shape, dtype=torch_dtype) assert bfloat16_safe_allclose(dtype, iron_t, torch_t) @@ -49,7 +50,7 @@ def test_zeros(shape, dtype, torch_dtype, tensorclass): @pytest.mark.parametrize("dtype, torch_dtype", zip(TEST_DTYPES, TORCH_DTYPES)) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_ones(shape, dtype, torch_dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) iron_t = iron.ones(shape, dtype=dtype) torch_t = torch.ones(shape, dtype=torch_dtype) assert bfloat16_safe_allclose(dtype, iron_t, torch_t) @@ -65,7 +66,7 @@ def test_ones(shape, dtype, torch_dtype, tensorclass): ) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_randint(shape, dtype, torch_dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) low, high = 0, 32 iron_t = iron.randint(low, high, shape, 
dtype=dtype) torch_t = torch.randint(low, high, shape, dtype=torch_dtype) @@ -90,7 +91,7 @@ def test_randint(shape, dtype, torch_dtype, tensorclass): ) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_arange_integer(dtype, torch_dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) start, end = 3, 9 iron_t = iron.arange(start, end, dtype=dtype) torch_t = torch.arange(start, end, dtype=torch_dtype) @@ -106,7 +107,7 @@ def test_arange_integer(dtype, torch_dtype, tensorclass): ) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_arange_floats(dtype, torch_dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) start, end, step = 1.0, 5.0, 1.5 iron_t = iron.arange(start, end, step, dtype=dtype) torch_t = torch.arange(start, end, step, dtype=torch_dtype) @@ -123,7 +124,7 @@ def test_arange_floats(dtype, torch_dtype, tensorclass): ) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_rand(shape, dtype, torch_dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) if shape == (): with pytest.raises(ValueError, match="rand.. 
received no arguments"): iron.rand(*shape, dtype=dtype) @@ -146,7 +147,7 @@ def test_rand(shape, dtype, torch_dtype, tensorclass): @pytest.mark.parametrize("dtype, torch_dtype", zip(TEST_DTYPES, TORCH_DTYPES)) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_zeros_like(shape, dtype, torch_dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) iron_t_orig = iron.ones(shape, dtype=dtype) torch_t_orig = torch.ones(shape, dtype=torch_dtype) @@ -160,7 +161,7 @@ def test_zeros_like(shape, dtype, torch_dtype, tensorclass): @pytest.mark.parametrize("dtype, torch_dtype", zip(TEST_DTYPES, TORCH_DTYPES)) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_fill(shape, dtype, torch_dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) iron_t = iron.zeros(shape, dtype=dtype) torch_t = torch.zeros(shape, dtype=torch_dtype) @@ -175,7 +176,7 @@ def test_fill(shape, dtype, torch_dtype, tensorclass): @pytest.mark.parametrize("dtype, torch_dtype", zip(TEST_DTYPES, TORCH_DTYPES)) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_len(shape, dtype, torch_dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) iron_t = iron.zeros(shape, dtype=dtype) torch_t = torch.zeros(shape, dtype=torch_dtype) if not shape: @@ -192,7 +193,7 @@ def test_len(shape, dtype, torch_dtype, tensorclass): @pytest.mark.parametrize("dtype, torch_dtype", zip(TEST_DTYPES, TORCH_DTYPES)) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_to_torch(shape, dtype, torch_dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) iron_t = iron.ones(shape, dtype=dtype) torch_t = iron_t.to_torch() assert isinstance(torch_t, torch.Tensor) @@ -210,7 +211,7 @@ def test_to_torch(shape, dtype, torch_dtype, tensorclass): @pytest.mark.parametrize("dtype, torch_dtype", zip(TEST_DTYPES, TORCH_DTYPES)) 
@pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_from_torch(shape, dtype, torch_dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) torch_t = torch.ones(shape, dtype=torch_dtype) iron_t = tensorclass.from_torch(torch_t) assert isinstance(iron_t, Tensor) @@ -228,7 +229,7 @@ def test_from_torch(shape, dtype, torch_dtype, tensorclass): ) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_iron_torch_iron(shape, dtype, torch_dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) low, high = 0, 100 iron_t_orig = iron.randint(low, high, shape, dtype=dtype) torch_t = iron_t_orig.to_torch() @@ -246,7 +247,7 @@ def test_iron_torch_iron(shape, dtype, torch_dtype, tensorclass): ) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_torch_iron_torch(shape, dtype, torch_dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) low, high = 0, 100 torch_t_orig = torch.randint(low, high, shape, dtype=torch_dtype) iron_t = tensorclass.from_torch(torch_t_orig) @@ -264,7 +265,7 @@ def test_torch_iron_torch(shape, dtype, torch_dtype, tensorclass): ) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_iron_torch_iron_float(shape, dtype, torch_dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) if shape == (): return iron_t_orig = iron.rand(*shape, dtype=dtype) @@ -283,7 +284,7 @@ def test_iron_torch_iron_float(shape, dtype, torch_dtype, tensorclass): ) @pytest.mark.parametrize("tensorclass", TENSOR_CLASSES) def test_torch_iron_torch_float(shape, dtype, torch_dtype, tensorclass): - iron.set_iron_tensor_class(tensorclass) + iron.set_tensor_class(tensorclass) if shape == (): return torch_t_orig = torch.rand(*shape, dtype=torch_dtype) diff --git a/test/python/set_current_device.py b/test/python/set_current_device.py deleted file mode 100644 index 
715dc21851b..00000000000 --- a/test/python/set_current_device.py +++ /dev/null @@ -1,18 +0,0 @@ -# This file is licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# (c) Copyright 2025 AMD Inc. - -# RUN: %run_on_npu1% %pytest %s -# RUN: %run_on_npu2% %pytest %s - -import aie.iron as iron -from aie.iron.device import NPU2 - - -def test_set_current_device(): - device = NPU2() - iron.set_current_device(device) - current_device = iron.get_current_device() - assert current_device == device diff --git a/test/python/test_event_enums.py b/test/python/trace_events.py similarity index 97% rename from test/python/test_event_enums.py rename to test/python/trace_events.py index 20ec5d59365..0933454d772 100644 --- a/test/python/test_event_enums.py +++ b/test/python/trace_events.py @@ -3,7 +3,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2025 Advanced Micro Devices, Inc. +# (c) Copyright 2025-2026 Advanced Micro Devices, Inc. # RUN: %python %s | FileCheck %s # @@ -33,9 +33,9 @@ # CHECK: AIE2.CoreEvent.SRS_SATURATE does not exist # CHECK: All tests passed! -import aie.utils.trace_events.aie as aie1 -import aie.utils.trace_events.aie2 as aie2 -import aie.utils.trace_events.aie2p as aie2p +import aie.utils.trace.events.aie as aie1 +import aie.utils.trace.events.aie2 as aie2 +import aie.utils.trace.events.aie2p as aie2p def test_arch(name, core_cls, mem_cls, shim_cls, mem_tile_cls=None): diff --git a/test/python/trace_utils.py b/test/python/trace_utils.py index 13390e9f701..33d1c348de6 100644 --- a/test/python/trace_utils.py +++ b/test/python/trace_utils.py @@ -3,7 +3,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception # -# (c) Copyright 2024 Advanced Micro Devices, Inc. 
+# (c) Copyright 2024-2026 Advanced Micro Devices, Inc. # RUN: %python %s | FileCheck %s # @@ -24,6 +24,7 @@ from aie.iron.controlflow import range_ from aie.extras.context import mlir_mod_ctx from aie.utils.trace import * +from aie.utils.trace.events import * N = 1024 diff --git a/utils/env_setup.sh b/utils/env_setup.sh index 12667a4d2cc..58b20f72073 100644 --- a/utils/env_setup.sh +++ b/utils/env_setup.sh @@ -1,4 +1,5 @@ #!/bin/bash +# (c) Copyright 2026 Advanced Micro Devices, Inc. ##===- utils/env_setup.sh - Setup mlir-aie env to compile IRON designs --*- Script -*-===## # # This file licensed under the Apache License v2.0 with LLVM Exceptions. @@ -81,4 +82,4 @@ echo " $PEANO_INSTALL_DIR/bin" echo "" echo "PATH : $PATH" echo "LD_LIBRARY_PATH : $LD_LIBRARY_PATH" -echo "PYTHONPATH : $PYTHONPATH" +echo "PYTHONPATH : $PYTHONPATH" \ No newline at end of file