diff --git a/.github/workflows/buildAndTestRyzenAI.yml b/.github/workflows/buildAndTestRyzenAI.yml index 4c10bcec300..01a4374f3bd 100644 --- a/.github/workflows/buildAndTestRyzenAI.yml +++ b/.github/workflows/buildAndTestRyzenAI.yml @@ -60,6 +60,8 @@ jobs: fail-fast: false matrix: runner_type: [ amd7940hs, amdhx370 ] + env: + IRON_CACHE_HOME: ${{ github.workspace }}/iron-cache-${{ matrix.runner_type }}-${{ github.run_id }} steps: - uses: actions/checkout@v4 with: @@ -126,6 +128,9 @@ jobs: -DMLIR_DIR=$PWD/../mlir/lib/cmake/mlir \ $CMAKE_ARGS + # Create runner-specific cache directory + mkdir -p $IRON_CACHE_HOME + ninja install ninja check-aie popd @@ -137,6 +142,8 @@ jobs: fail-fast: false matrix: runner_type: [ amd7940hs, amdhx370 ] + env: + IRON_CACHE_HOME: ${{ github.workspace }}/iron-cache-${{ matrix.runner_type }}-${{ github.run_id }} steps: - uses: actions/checkout@v4 with: @@ -183,8 +190,10 @@ jobs: LIT_OPTS="-j12 $LIT_OPTS" fi + # Create runner-specific cache directory + mkdir -p $IRON_CACHE_HOME + ninja install ninja check-reference-designs ninja check-programming-guide - - popd \ No newline at end of file + popd diff --git a/python/iron/jit.py b/python/iron/jit.py index ec7252f5747..5bf38a938d3 100644 --- a/python/iron/jit.py +++ b/python/iron/jit.py @@ -23,10 +23,44 @@ from aie.dialects.aie import AIEDevice -# The `iron.jit` decorator below caches compiled kenrels inside the `IRON_CACHE_DIR` directory. +# The `iron.jit` decorator below caches compiled kernels inside the `IRON_CACHE_HOME` directory. # Kernels are cached based on their hash value of the MLIR module string. If during compilation, # we hit in the cache, the `iron.jit` will load the xclbin and instruction binary files from the cache. 
-IRON_CACHE_DIR = os.path.expanduser("~/.iron/cache") +IRON_CACHE_HOME = os.environ.get("IRON_CACHE_HOME", os.path.expanduser("~/.iron/cache")) + + +class CircularCache: + def __init__(self, max_size): + self.max_size = max_size + self.cache = [None] * max_size + self.keys = [None] * max_size + self.index = 0 + + def __contains__(self, key): + return key in self.keys + + def __getitem__(self, key): + idx = self.keys.index(key) + return self.cache[idx] + + def __setitem__(self, key, value): + self.cache[self.index] = value + self.keys[self.index] = key + self.index = (self.index + 1) % self.max_size + + def __len__(self): + return sum(1 for k in self.keys if k is not None) + + def clear(self): + self.cache = [None] * self.max_size + self.keys = [None] * self.max_size + self.index = 0 + + +# Global cache for compiled kernels at the function level +# Key: (function_name, args_signature) -> NPUKernel instance +# There is a limit on the number of kernels we have in cache +_compiled_kernels = CircularCache(max_size=1) class NPUKernel: @@ -117,8 +151,21 @@ def __del__(self): """ Destructor to clean up resources and delete the kernel and device objects. 
""" - del self.__kernel - del self.__device + if hasattr(self, "_NPUKernel__insts_buffer_bo"): + del self.__insts_buffer_bo + self.__insts_buffer_bo = None + if hasattr(self, "_NPUKernel__kernel"): + del self.__kernel + self.__kernel = None + if hasattr(self, "_NPUKernel__context"): + del self.__context + self.__context = None + if hasattr(self, "_NPUKernel__xclbin"): + del self.__xclbin + self.__xclbin = None + if hasattr(self, "_NPUKernel__device"): + del self.__device + self.__device = None class NPUKernel_Error(Exception): @@ -145,6 +192,12 @@ def jit(function=None, is_placed=True, use_cache=True): def decorator(*args, **kwargs): from .kernel import ExternalFunction + # Check if we already have a compiled kernel for this function signature + cache_key = _create_function_cache_key(function, args, kwargs) + if cache_key in _compiled_kernels: + cached_kernel = _compiled_kernels[cache_key] + return cached_kernel(*args, **kwargs) + # Clear any instances from previous runs to make sure if the user provided any broken code we don't try to recompile it ExternalFunction._instances.clear() @@ -198,7 +251,7 @@ def decorator(*args, **kwargs): # Hash of the IR string, ExternalFunction compiler options, and target architecture module_hash = hash_module(mlir_module, external_kernels, target_arch) - kernel_dir = os.path.join(IRON_CACHE_DIR, f"{module_hash}") + kernel_dir = os.path.join(IRON_CACHE_HOME, f"{module_hash}") mlir_path = os.path.join(kernel_dir, "aie.mlir") # Ensure cache directory exists @@ -238,6 +291,10 @@ def decorator(*args, **kwargs): kernel_name = "MLIR_AIE" try: kernel = NPUKernel(xclbin_path, inst_path, kernel_name=kernel_name) + + # Cache the kernel for this function signature + _compiled_kernels[cache_key] = kernel + result = kernel(*args, **kwargs) return result except Exception as e: @@ -313,15 +370,14 @@ def hash_module(module, external_kernels=None, target_arch=None): """ mlir_str = str(module) - # Include ExternalFunction compiler options in the hash 
+ # Include ExternalFunction compiler options and source code in the hash if external_kernels: - compiler_options = [] + running_hash = "" + source_contents = [] for func in external_kernels: - compiler_options.extend(func._include_dirs) - compiler_options.extend(func._compile_flags) + running_hash += str(hash(func)) - # Create a combined string for hashing - combined_str = mlir_str + "|" + "|".join(compiler_options) + combined_str = mlir_str + "|" + "|".join(running_hash) else: combined_str = mlir_str @@ -331,3 +387,52 @@ def hash_module(module, external_kernels=None, target_arch=None): hash_result = hashlib.sha256(combined_str.encode("utf-8")).hexdigest()[:16] return hash_result + + +def _hash_argument(arg, prefix=""): + """ + Helper function to hash supported argument types (tensors and callables). + Returns a string representation for cache key generation. + """ + from aie.iron.tensor import Tensor + from aie.iron.kernel import ExternalFunction + + if isinstance(arg, Tensor): + # Tensor argument - include shape and dtype + return f"{prefix}tensor_{arg.shape}_{arg.dtype}" + elif isinstance(arg, ExternalFunction): + # ExternalFunction argument - use its custom hash method + func_hash = hash(arg) + return f"{prefix}externalfunction_{func_hash}" + elif callable(arg): + # Function argument - use hash of function address for uniqueness + func_hash = hash(arg) + return f"{prefix}function_{func_hash}" + else: + # Unsupported type - use type name + return f"{prefix}{type(arg).__name__}" + + +def _create_function_cache_key(function, args, kwargs): + """ + Create a cache key for a function call based on function name and argument types/shapes. + This allows us to cache compiled kernels at the function level. + Note that it is not necessary that we cache the tensor shapes since the kernel may be agnostic + to the shape changes but we are doing it here for safety. 
+ """ + # Get function name + func_name = function.__name__ + + # Create signature from argument types and shapes + signature_parts = [] + + for arg in args: + result = _hash_argument(arg) + signature_parts.append(result) + + for key, value in sorted(kwargs.items()): + result = _hash_argument(value, f"{key}_") + signature_parts.append(result) + + signature = "_".join(signature_parts) + return (func_name, signature) diff --git a/python/iron/kernel.py b/python/iron/kernel.py index 220fce1edb3..d0b35f77209 100644 --- a/python/iron/kernel.py +++ b/python/iron/kernel.py @@ -186,6 +186,34 @@ def resolve( # Create the external function self._op = external_func(self._name, inputs=self._arg_types) + def __hash__(self): + """ + Compute a hash for the ExternalFunction based on its properties. + This allows ExternalFunction instances to be used in cache keys. + """ + import hashlib + + # Create a string representation of the function's key properties + hash_parts = [ + self._name, + str(self._arg_types), + str(sorted(self._include_dirs)), + str(sorted(self._compile_flags)), + ] + + # Include source content for uniqueness + # TODO: This solution needs to be extended to handle headers. See https://github.com/Xilinx/mlir-aie/issues/2543 + if self._source_string: + hash_parts.append(self._source_string) + elif self._source_file: + with open(self._source_file, "r") as f: + file_content = f.read() + hash_parts.append(file_content) + + # Create hash from combined string + combined = "|".join(hash_parts) + return int(hashlib.sha256(combined.encode("utf-8")).hexdigest()[:8], 16) + def __call__(self, *args, **kwargs): if not self._op: raise ValueError("Need to resolve ExternalFunction before it can be called") diff --git a/python/iron/tensor.py b/python/iron/tensor.py index 358cdd13ec3..24da9bbb1ad 100644 --- a/python/iron/tensor.py +++ b/python/iron/tensor.py @@ -484,8 +484,9 @@ def __del__(self): Releases associated device memory (e.g., XRT buffer object). 
""" - del self.bo - self.bo = None + if hasattr(self, "bo"): + del self.bo + self.bo = None def tensor(data, dtype=np.float32, device="npu"): diff --git a/python/utils/xrt.py b/python/utils/xrt.py index bc26f457613..162915a864d 100644 --- a/python/utils/xrt.py +++ b/python/utils/xrt.py @@ -74,8 +74,12 @@ def call(self): return h def __del__(self): - del self.kernel - del self.device + if hasattr(self, "kernel"): + del self.kernel + self.kernel = None + if hasattr(self, "device"): + del self.device + self.device = None # This class wraps up access to the xrt.bo buffer object where sync calls are added @@ -114,8 +118,9 @@ def sync_from_device(self): return self.bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) def __del__(self): - del self.bo - self.bo = None + if hasattr(self, "bo"): + del self.bo + self.bo = None class AIE_Application_Error(Exception): diff --git a/test/python/cache_functionality.py b/test/python/cache_functionality.py new file mode 100644 index 00000000000..926fd7af851 --- /dev/null +++ b/test/python/cache_functionality.py @@ -0,0 +1,437 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 AMD Inc. + +# RUN: %run_on_npu1% %pytest %s +# RUN: %run_on_npu2% %pytest %s + +import numpy as np +import tempfile +import os + + +import aie.iron as iron +from aie.iron import ExternalFunction +from aie.iron import ObjectFifo, Worker, Runtime, Program +from aie.iron.placers import SequentialPlacer +from aie.iron.controlflow import range_ + + +@iron.jit(is_placed=False) +def transform(input, output, func): + """Transform kernel that applies a function to input tensor and stores result in output tensor.""" + if input.shape != output.shape: + raise ValueError( + f"Input shapes are not the equal ({input.shape} != {output.shape})." 
+ ) + num_elements = np.size(input) + + if isinstance(func, iron.ExternalFunction): + tile_size = func.tile_size(0) + else: + tile_size = 16 if num_elements >= 16 else 1 + + if num_elements % tile_size != 0: + raise ValueError( + f"Number of elements ({num_elements}) must be a multiple of {tile_size}." + ) + num_tiles = num_elements // tile_size + + if input.dtype != output.dtype: + raise ValueError( + f"Input data types are not the same ({input.dtype} != {output.dtype})." + ) + + dtype = input.dtype + + # Define tensor types + tensor_ty = np.ndarray[(num_elements,), np.dtype[dtype]] + tile_ty = np.ndarray[(tile_size,), np.dtype[dtype]] + + # AIE-array data movement with object fifos + of_in = ObjectFifo(tile_ty, name="in") + of_out = ObjectFifo(tile_ty, name="out") + + # Define a task that will run on a compute tile + def core_body(of_in, of_out, func_to_apply): + for _ in range_(num_tiles): + elem_in = of_in.acquire(1) + elem_out = of_out.acquire(1) + if isinstance(func_to_apply, iron.ExternalFunction): + func_to_apply(elem_in, elem_out, tile_size) + else: + for j in range_(tile_size): + elem_out[j] = func_to_apply(elem_in[j]) + of_in.release(1) + of_out.release(1) + + # Create a worker to run the task on a compute tile + worker = Worker(core_body, fn_args=[of_in.cons(), of_out.prod(), func]) + + # Runtime operations to move data to/from the AIE-array + rt = Runtime() + with rt.sequence(tensor_ty, tensor_ty) as (A, B): + rt.start(worker) + rt.fill(of_in.prod(), A) + rt.drain(of_out.cons(), B, wait=True) + + # Place program components (assign them resources on the device) and generate an MLIR module + return Program(iron.get_current_device(), rt).resolve_program(SequentialPlacer()) + + +def test_cache_lambda_functions(): + """Test that caching works correctly with different lambda functions.""" + # Create input tensor + input_tensor = iron.tensor((32,), dtype=np.int32) + input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + + # Test 1: First 
execution with lambda function + transform(input_tensor, input_tensor, lambda x: x + 1) + result1 = input_tensor.numpy().copy() + + # Reset tensor + input_tensor[:] = np.arange(1, 33, dtype=np.int32) + + # Test 2: Second execution with same lambda function (should use cache) + transform(input_tensor, input_tensor, lambda x: x + 1) + result2 = input_tensor.numpy() + + # Results should be identical + np.testing.assert_array_equal(result1, result2) + + # Test 3: Different lambda function (should generate new cache entry) + input_tensor[:] = np.arange(1, 33, dtype=np.int32) + transform(input_tensor, input_tensor, lambda x: x * 2) + result3 = input_tensor.numpy() + + # Results should be different + np.testing.assert_raises( + AssertionError, np.testing.assert_array_equal, result1, result3 + ) + + +def test_cache_external_functions(): + """Test that ExternalFunction caching works correctly during execution.""" + # Create input tensor + input_tensor = iron.tensor((32,), dtype=np.int32) + input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + + # Test 1: First execution + add_one_1 = ExternalFunction( + "add_one", + source_string="""extern "C" { + void add_one(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + 1; + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + ) + transform(input_tensor, input_tensor, add_one_1) + result1 = input_tensor.numpy().copy() + + # Reset tensor + input_tensor[:] = np.arange(1, 33, dtype=np.int32) + + # Test 2: Second execution + add_one_2 = ExternalFunction( + "add_one", + source_string="""extern "C" { + void add_one(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + 1; + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + ) + transform(input_tensor, 
input_tensor, add_one_2) + result2 = input_tensor.numpy() + + # Results should be identical + np.testing.assert_array_equal(result1, result2) + + # Test 3: Different ExternalFunction (should generate new cache entry) + multiply_two = ExternalFunction( + "multiply_two", + source_string="""extern "C" { + void multiply_two(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] * 2; + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + ) + + input_tensor[:] = np.arange(1, 33, dtype=np.int32) + transform(input_tensor, input_tensor, multiply_two) + result3 = input_tensor.numpy() + + # Results should be different + np.testing.assert_raises( + AssertionError, np.testing.assert_array_equal, result1, result3 + ) + + +def test_cache_compile_flags(): + """Test that ExternalFunctions with different compile flags produce different results.""" + # Create input tensor + input_tensor = iron.tensor((32,), dtype=np.int32) + input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + + # Create ExternalFunctions with different compile flags + add_5 = ExternalFunction( + "add_value", + source_string="""extern "C" { + void add_value(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + ADD_VALUE; + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + compile_flags=["-DADD_VALUE=5"], + ) + + add_10 = ExternalFunction( + "add_value", + source_string="""extern "C" { + void add_value(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + ADD_VALUE; + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + compile_flags=["-DADD_VALUE=10"], + ) + + # Test with ADD_VALUE=5 + transform(input_tensor, 
input_tensor, add_5) + result_5 = input_tensor.numpy().copy() + + # Reset and test with ADD_VALUE=10 + input_tensor[:] = np.arange(1, 33, dtype=np.int32) + transform(input_tensor, input_tensor, add_10) + result_10 = input_tensor.numpy() + + # Results should be different + np.testing.assert_raises( + AssertionError, np.testing.assert_array_equal, result_5, result_10 + ) + + # Verify expected results + expected_5 = np.arange(1, 33, dtype=np.int32) + 5 + expected_10 = np.arange(1, 33, dtype=np.int32) + 10 + + np.testing.assert_array_equal(result_5, expected_5) + np.testing.assert_array_equal(result_10, expected_10) + + +def test_cache_source_changes(): + """Test that ExternalFunctions with different source content produce different results.""" + # Create input tensor + input_tensor = iron.tensor((32,), dtype=np.int32) + input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + + # Create ExternalFunctions with different source content + add_1 = ExternalFunction( + "add_one", + source_string="""extern "C" { + void add_one(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + 1; + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + ) + + add_2 = ExternalFunction( + "add_one", + source_string="""extern "C" { + void add_one(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + 2; // Different operation + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + ) + + # Test with add_1 + transform(input_tensor, input_tensor, add_1) + result_1 = input_tensor.numpy().copy() + + # Reset and test with add_2 + input_tensor[:] = np.arange(1, 33, dtype=np.int32) + transform(input_tensor, input_tensor, add_2) + result_2 = input_tensor.numpy() + + # Results should be different + np.testing.assert_raises( + 
AssertionError, np.testing.assert_array_equal, result_1, result_2 + ) + + # Verify expected results + expected_1 = np.arange(1, 33, dtype=np.int32) + 1 + expected_2 = np.arange(1, 33, dtype=np.int32) + 2 + + np.testing.assert_array_equal(result_1, expected_1) + np.testing.assert_array_equal(result_2, expected_2) + + +def test_cache_file_source(): + """Test that ExternalFunctions with file sources work correctly.""" + # Create input tensor + input_tensor = iron.tensor((32,), dtype=np.int32) + input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + + # Create temporary source file + with tempfile.NamedTemporaryFile(mode="w", suffix=".cc", delete=False) as f: + source_content = """extern "C" { + void add_one_from_file(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + 1; + } + } + }""" + f.write(source_content) + source_file_path = f.name + + try: + # Create ExternalFunction using source_file + add_one_from_file = ExternalFunction( + "add_one_from_file", + source_file=source_file_path, + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + ) + + # Test execution + transform(input_tensor, input_tensor, add_one_from_file) + result = input_tensor.numpy() + + # Verify expected results + expected = np.arange(1, 33, dtype=np.int32) + 1 + np.testing.assert_array_equal(result, expected) + + finally: + # Clean up the temporary file + os.unlink(source_file_path) + + +def test_cache_include_directories(): + """Test that ExternalFunctions with include directories work correctly.""" + # Create input tensor + input_tensor = iron.tensor((32,), dtype=np.int32) + input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + + # Create temporary directory with header file + with tempfile.TemporaryDirectory() as temp_dir: + # Create header file + header_file = os.path.join(temp_dir, "math_ops.h") + with open(header_file, "w") as f: + 
f.write("#define ADD_VALUE 42\n") + + # Create ExternalFunction that includes the header + add_value = ExternalFunction( + "add_value", + source_string="""extern "C" { + #include "math_ops.h" + void add_value(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + ADD_VALUE; + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + include_dirs=[temp_dir], + ) + + # Test execution + transform(input_tensor, input_tensor, add_value) + result = input_tensor.numpy() + + # Verify expected results + expected = np.arange(1, 33, dtype=np.int32) + 42 + np.testing.assert_array_equal(result, expected) + + +def test_cache_tensor_shapes(): + """Test that different tensor shapes work correctly with caching.""" + # Test with different tensor sizes + sizes = [16, 32, 64] + results = [] + + for size in sizes: + input_tensor = iron.tensor((size,), dtype=np.int32) + input_tensor[:] = np.arange(1, size + 1, dtype=np.int32) + + # Apply transformation + transform(input_tensor, input_tensor, lambda x: x + 1) + result = input_tensor.numpy() + results.append(result) + + # Verify expected results + expected = np.arange(1, size + 1, dtype=np.int32) + 1 + np.testing.assert_array_equal(result, expected) + + +def test_cache_tensor_dtypes(): + """Test that different tensor dtypes work correctly with caching.""" + # Test with different dtypes + dtypes = [np.int32, np.float32] + results = [] + + for dtype in dtypes: + input_tensor = iron.tensor((32,), dtype=dtype) + input_tensor[:] = np.arange(1, 33, dtype=dtype) + + # Apply transformation + transform(input_tensor, input_tensor, lambda x: x + 1) + result = input_tensor.numpy() + results.append(result) + + # Verify expected results + expected = np.arange(1, 33, dtype=dtype) + 1 + np.testing.assert_array_equal(result, expected)