Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 11 additions & 2 deletions .github/workflows/buildAndTestRyzenAI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@ jobs:
fail-fast: false
matrix:
runner_type: [ amd7940hs, amdhx370 ]
env:
IRON_CACHE_HOME: ${{ github.workspace }}/iron-cache-${{ matrix.runner_type }}-${{ github.run_id }}
steps:
- uses: actions/checkout@v4
with:
Expand Down Expand Up @@ -126,6 +128,9 @@ jobs:
-DMLIR_DIR=$PWD/../mlir/lib/cmake/mlir \
$CMAKE_ARGS

# Create runner-specific cache directory
mkdir -p $IRON_CACHE_HOME

ninja install
ninja check-aie
popd
Expand All @@ -137,6 +142,8 @@ jobs:
fail-fast: false
matrix:
runner_type: [ amd7940hs, amdhx370 ]
env:
IRON_CACHE_HOME: ${{ github.workspace }}/iron-cache-${{ matrix.runner_type }}-${{ github.run_id }}
steps:
- uses: actions/checkout@v4
with:
Expand Down Expand Up @@ -183,8 +190,10 @@ jobs:
LIT_OPTS="-j12 $LIT_OPTS"
fi

# Create runner-specific cache directory
mkdir -p $IRON_CACHE_HOME

ninja install
ninja check-reference-designs
ninja check-programming-guide

popd
popd
127 changes: 116 additions & 11 deletions python/iron/jit.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,44 @@
from aie.dialects.aie import AIEDevice


# The `iron.jit` decorator below caches compiled kenrels inside the `IRON_CACHE_DIR` directory.
# The `iron.jit` decorator below caches compiled kernels inside the `IRON_CACHE_HOME` directory.
# Kernels are cached by the hash of their MLIR module string. On a cache hit during compilation,
# `iron.jit` loads the xclbin and instruction binary files from the cache.
IRON_CACHE_DIR = os.path.expanduser("~/.iron/cache")
# Root directory of the on-disk kernel cache; overridable through the
# IRON_CACHE_HOME environment variable (used by CI to isolate runners).
IRON_CACHE_HOME = os.environ.get("IRON_CACHE_HOME", os.path.expanduser("~/.iron/cache"))


class CircularCache:
    """
    Fixed-capacity key/value cache with circular (oldest-slot-first) eviction.

    Used to bound how many compiled-kernel objects stay alive at once: when
    the cache is full, the oldest slot is overwritten and the evicted value
    becomes garbage-collectable.
    """

    # Sentinel for empty slots so that None remains a legal key and value.
    _EMPTY = object()

    def __init__(self, max_size):
        """
        Create a cache holding at most ``max_size`` entries.

        Raises:
            ValueError: if ``max_size`` is not a positive integer.
        """
        if max_size < 1:
            raise ValueError("max_size must be >= 1")
        self.max_size = max_size
        self.cache = [self._EMPTY] * max_size
        self.keys = [self._EMPTY] * max_size
        self.index = 0

    def __contains__(self, key):
        return key in self.keys

    def __getitem__(self, key):
        """Return the cached value for ``key``; raise KeyError if absent."""
        try:
            idx = self.keys.index(key)
        except ValueError:
            # Mapping convention: missing lookups raise KeyError, not ValueError.
            raise KeyError(key) from None
        return self.cache[idx]

    def __setitem__(self, key, value):
        # Update in place when the key already exists; otherwise the old
        # entry would shadow the new one via keys.index() and __getitem__
        # would keep returning the stale value.
        if key in self.keys:
            self.cache[self.keys.index(key)] = value
            return
        self.cache[self.index] = value
        self.keys[self.index] = key
        self.index = (self.index + 1) % self.max_size

    def __len__(self):
        return sum(k is not self._EMPTY for k in self.keys)

    def clear(self):
        """Drop all entries and reset the write position."""
        self.cache = [self._EMPTY] * self.max_size
        self.keys = [self._EMPTY] * self.max_size
        self.index = 0


# Global cache for compiled kernels at the function level.
# Key: (function_name, args_signature) -> NPUKernel instance.
# Capacity is deliberately small (currently 1); older entries are overwritten
# circularly. NOTE(review): the small bound presumably limits live NPU/XRT
# resources held by cached kernels — confirm before enlarging.
_compiled_kernels = CircularCache(max_size=1)


class NPUKernel:
def __del__(self):
    """
    Destructor to clean up resources and delete the kernel and device objects.

    Attributes are released in order: instruction buffer, kernel, context,
    xclbin, device (presumably reverse dependency order — confirm against
    XRT object lifetimes). Each delete is guarded with hasattr so teardown
    is safe when __init__ raised before the attribute was created; the
    mangled names (_NPUKernel__*) are how the double-underscore attributes
    appear on the instance.
    """
    if hasattr(self, "_NPUKernel__insts_buffer_bo"):
        del self.__insts_buffer_bo
        self.__insts_buffer_bo = None
    if hasattr(self, "_NPUKernel__kernel"):
        del self.__kernel
        self.__kernel = None
    if hasattr(self, "_NPUKernel__context"):
        del self.__context
        self.__context = None
    if hasattr(self, "_NPUKernel__xclbin"):
        del self.__xclbin
        self.__xclbin = None
    if hasattr(self, "_NPUKernel__device"):
        del self.__device
        self.__device = None


class NPUKernel_Error(Exception):
Expand All @@ -145,6 +192,12 @@ def jit(function=None, is_placed=True, use_cache=True):
def decorator(*args, **kwargs):
from .kernel import ExternalFunction

# Check if we already have a compiled kernel for this function signature
cache_key = _create_function_cache_key(function, args, kwargs)
if cache_key in _compiled_kernels:
cached_kernel = _compiled_kernels[cache_key]
return cached_kernel(*args, **kwargs)

# Clear any instances from previous runs to make sure if the user provided any broken code we don't try to recompile it
ExternalFunction._instances.clear()

Expand Down Expand Up @@ -198,7 +251,7 @@ def decorator(*args, **kwargs):

# Hash of the IR string, ExternalFunction compiler options, and target architecture
module_hash = hash_module(mlir_module, external_kernels, target_arch)
kernel_dir = os.path.join(IRON_CACHE_DIR, f"{module_hash}")
kernel_dir = os.path.join(IRON_CACHE_HOME, f"{module_hash}")
mlir_path = os.path.join(kernel_dir, "aie.mlir")

# Ensure cache directory exists
Expand Down Expand Up @@ -238,6 +291,10 @@ def decorator(*args, **kwargs):
kernel_name = "MLIR_AIE"
try:
kernel = NPUKernel(xclbin_path, inst_path, kernel_name=kernel_name)

# Cache the kernel for this function signature
_compiled_kernels[cache_key] = kernel

result = kernel(*args, **kwargs)
return result
except Exception as e:
Expand Down Expand Up @@ -313,15 +370,14 @@ def hash_module(module, external_kernels=None, target_arch=None):
"""
mlir_str = str(module)

# Include ExternalFunction compiler options in the hash
# Include ExternalFunction compiler options and source code in the hash
if external_kernels:
compiler_options = []
running_hash = ""
source_contents = []
for func in external_kernels:
compiler_options.extend(func._include_dirs)
compiler_options.extend(func._compile_flags)
running_hash += str(hash(func))

# Create a combined string for hashing
combined_str = mlir_str + "|" + "|".join(compiler_options)
combined_str = mlir_str + "|" + "|".join(running_hash)
else:
combined_str = mlir_str

Expand All @@ -331,3 +387,52 @@ def hash_module(module, external_kernels=None, target_arch=None):

hash_result = hashlib.sha256(combined_str.encode("utf-8")).hexdigest()[:16]
return hash_result


def _hash_argument(arg, prefix=""):
    """
    Render one call argument as a string token for cache-key construction.

    Tensors contribute their shape and dtype; ExternalFunction and other
    callables contribute their hash; anything else falls back to its type
    name. The optional ``prefix`` tags keyword arguments with their name.
    """
    from aie.iron.tensor import Tensor
    from aie.iron.kernel import ExternalFunction

    if isinstance(arg, Tensor):
        # Shape/dtype identify the tensor for caching purposes.
        return f"{prefix}tensor_{arg.shape}_{arg.dtype}"

    if isinstance(arg, ExternalFunction):
        # ExternalFunction defines a content-based __hash__.
        return f"{prefix}externalfunction_{hash(arg)}"

    if callable(arg):
        # Plain callables: identity-based hash distinguishes functions.
        return f"{prefix}function_{hash(arg)}"

    # Unsupported type — only its type name participates in the key.
    return f"{prefix}{type(arg).__name__}"


def _create_function_cache_key(function, args, kwargs):
    """
    Build a (function_name, signature) cache key for a decorated call.

    The signature string is derived from every positional and keyword
    argument via _hash_argument. Including tensor shapes is stricter than
    strictly necessary — a kernel may be agnostic to shape changes — but
    it is done here for safety.
    """
    parts = [_hash_argument(arg) for arg in args]
    parts.extend(
        _hash_argument(value, f"{key}_") for key, value in sorted(kwargs.items())
    )
    return (function.__name__, "_".join(parts))
28 changes: 28 additions & 0 deletions python/iron/kernel.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,34 @@ def resolve(
# Create the external function
self._op = external_func(self._name, inputs=self._arg_types)

def __hash__(self):
    """
    Hash this ExternalFunction by its compilation-relevant properties so
    instances can participate in kernel cache keys.

    The digest covers the kernel name, argument types, include directories,
    compile flags, and the kernel source (the inline string, or the file
    contents when only a path was given).
    """
    import hashlib

    # TODO: This solution needs to be extended to handle headers.
    # See https://github.com/Xilinx/mlir-aie/issues/2543
    fingerprint = [
        self._name,
        str(self._arg_types),
        str(sorted(self._include_dirs)),
        str(sorted(self._compile_flags)),
    ]
    if self._source_string:
        fingerprint.append(self._source_string)
    elif self._source_file:
        with open(self._source_file, "r") as f:
            fingerprint.append(f.read())

    digest = hashlib.sha256("|".join(fingerprint).encode("utf-8")).hexdigest()
    return int(digest[:8], 16)

def __call__(self, *args, **kwargs):
if not self._op:
raise ValueError("Need to resolve ExternalFunction before it can be called")
Expand Down
5 changes: 3 additions & 2 deletions python/iron/tensor.py
Original file line number Diff line number Diff line change
def __del__(self):
    """
    Destructor.

    Releases associated device memory (e.g., XRT buffer object). The
    hasattr guard keeps teardown safe when __init__ failed before `bo`
    was assigned.
    """
    if hasattr(self, "bo"):
        del self.bo
        self.bo = None


def tensor(data, dtype=np.float32, device="npu"):
Expand Down
13 changes: 9 additions & 4 deletions python/utils/xrt.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,8 +74,12 @@ def call(self):
return h

def __del__(self):
    """Drop XRT kernel and device references during teardown."""
    # hasattr guards: __init__ may have raised before these were assigned,
    # and __del__ must never raise during interpreter shutdown.
    if hasattr(self, "kernel"):
        del self.kernel
        self.kernel = None
    if hasattr(self, "device"):
        del self.device
        self.device = None


# This class wraps up access to the xrt.bo buffer object where sync calls are added
Expand Down Expand Up @@ -114,8 +118,9 @@ def sync_from_device(self):
return self.bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE)

def __del__(self):
    """Release the underlying buffer object reference, if any."""
    # hasattr guard: construction may have failed before `bo` was set.
    if hasattr(self, "bo"):
        del self.bo
        self.bo = None


class AIE_Application_Error(Exception):
Expand Down
Loading
Loading