diff --git a/.github/workflows/buildAndTestRyzenAI.yml b/.github/workflows/buildAndTestRyzenAI.yml index 4c10bcec300..01a4374f3bd 100644 --- a/.github/workflows/buildAndTestRyzenAI.yml +++ b/.github/workflows/buildAndTestRyzenAI.yml @@ -60,6 +60,8 @@ jobs: fail-fast: false matrix: runner_type: [ amd7940hs, amdhx370 ] + env: + IRON_CACHE_HOME: ${{ github.workspace }}/iron-cache-${{ matrix.runner_type }}-${{ github.run_id }} steps: - uses: actions/checkout@v4 with: @@ -126,6 +128,9 @@ jobs: -DMLIR_DIR=$PWD/../mlir/lib/cmake/mlir \ $CMAKE_ARGS + # Create runner-specific cache directory + mkdir -p $IRON_CACHE_HOME + ninja install ninja check-aie popd @@ -137,6 +142,8 @@ jobs: fail-fast: false matrix: runner_type: [ amd7940hs, amdhx370 ] + env: + IRON_CACHE_HOME: ${{ github.workspace }}/iron-cache-${{ matrix.runner_type }}-${{ github.run_id }} steps: - uses: actions/checkout@v4 with: @@ -183,8 +190,10 @@ jobs: LIT_OPTS="-j12 $LIT_OPTS" fi + # Create runner-specific cache directory + mkdir -p $IRON_CACHE_HOME + ninja install ninja check-reference-designs ninja check-programming-guide - - popd \ No newline at end of file + popd diff --git a/python/iron/jit.py b/python/iron/jit.py index ec7252f5747..5bf38a938d3 100644 --- a/python/iron/jit.py +++ b/python/iron/jit.py @@ -23,10 +23,44 @@ from aie.dialects.aie import AIEDevice -# The `iron.jit` decorator below caches compiled kenrels inside the `IRON_CACHE_DIR` directory. +# The `iron.jit` decorator below caches compiled kernels inside the `IRON_CACHE_HOME` directory. # Kernels are cached based on their hash value of the MLIR module string. If during compilation, # we hit in the cache, the `iron.jit` will load the xclbin and instruction binary files from the cache. 
-IRON_CACHE_DIR = os.path.expanduser("~/.iron/cache") +IRON_CACHE_HOME = os.environ.get("IRON_CACHE_HOME", os.path.expanduser("~/.iron/cache")) + + +class CircularCache: + def __init__(self, max_size): + self.max_size = max_size + self.cache = [None] * max_size + self.keys = [None] * max_size + self.index = 0 + + def __contains__(self, key): + return key in self.keys + + def __getitem__(self, key): + idx = self.keys.index(key) + return self.cache[idx] + + def __setitem__(self, key, value): + self.cache[self.index] = value + self.keys[self.index] = key + self.index = (self.index + 1) % self.max_size + + def __len__(self): + return sum(1 for k in self.keys if k is not None) + + def clear(self): + self.cache = [None] * self.max_size + self.keys = [None] * self.max_size + self.index = 0 + + +# Global cache for compiled kernels at the function level +# Key: (function_name, args_signature) -> NPUKernel instance +# There is a limit on the number of kernels we have in cache +_compiled_kernels = CircularCache(max_size=1) class NPUKernel: @@ -117,8 +151,21 @@ def __del__(self): """ Destructor to clean up resources and delete the kernel and device objects. 
""" - del self.__kernel - del self.__device + if hasattr(self, "_NPUKernel__insts_buffer_bo"): + del self.__insts_buffer_bo + self.__insts_buffer_bo = None + if hasattr(self, "_NPUKernel__kernel"): + del self.__kernel + self.__kernel = None + if hasattr(self, "_NPUKernel__context"): + del self.__context + self.__context = None + if hasattr(self, "_NPUKernel__xclbin"): + del self.__xclbin + self.__xclbin = None + if hasattr(self, "_NPUKernel__device"): + del self.__device + self.__device = None class NPUKernel_Error(Exception): @@ -145,6 +192,12 @@ def jit(function=None, is_placed=True, use_cache=True): def decorator(*args, **kwargs): from .kernel import ExternalFunction + # Check if we already have a compiled kernel for this function signature + cache_key = _create_function_cache_key(function, args, kwargs) + if cache_key in _compiled_kernels: + cached_kernel = _compiled_kernels[cache_key] + return cached_kernel(*args, **kwargs) + # Clear any instances from previous runs to make sure if the user provided any broken code we don't try to recompile it ExternalFunction._instances.clear() @@ -198,7 +251,7 @@ def decorator(*args, **kwargs): # Hash of the IR string, ExternalFunction compiler options, and target architecture module_hash = hash_module(mlir_module, external_kernels, target_arch) - kernel_dir = os.path.join(IRON_CACHE_DIR, f"{module_hash}") + kernel_dir = os.path.join(IRON_CACHE_HOME, f"{module_hash}") mlir_path = os.path.join(kernel_dir, "aie.mlir") # Ensure cache directory exists @@ -238,6 +291,10 @@ def decorator(*args, **kwargs): kernel_name = "MLIR_AIE" try: kernel = NPUKernel(xclbin_path, inst_path, kernel_name=kernel_name) + + # Cache the kernel for this function signature + _compiled_kernels[cache_key] = kernel + result = kernel(*args, **kwargs) return result except Exception as e: @@ -313,15 +370,14 @@ def hash_module(module, external_kernels=None, target_arch=None): """ mlir_str = str(module) - # Include ExternalFunction compiler options in the hash 
+ # Include ExternalFunction compiler options and source code in the hash if external_kernels: - compiler_options = [] + running_hash = "" + source_contents = [] for func in external_kernels: - compiler_options.extend(func._include_dirs) - compiler_options.extend(func._compile_flags) + running_hash += str(hash(func)) - # Create a combined string for hashing - combined_str = mlir_str + "|" + "|".join(compiler_options) + combined_str = mlir_str + "|" + "|".join(running_hash) else: combined_str = mlir_str @@ -331,3 +387,52 @@ def hash_module(module, external_kernels=None, target_arch=None): hash_result = hashlib.sha256(combined_str.encode("utf-8")).hexdigest()[:16] return hash_result + + +def _hash_argument(arg, prefix=""): + """ + Helper function to hash supported argument types (tensors and callables). + Returns a string representation for cache key generation. + """ + from aie.iron.tensor import Tensor + from aie.iron.kernel import ExternalFunction + + if isinstance(arg, Tensor): + # Tensor argument - include shape and dtype + return f"{prefix}tensor_{arg.shape}_{arg.dtype}" + elif isinstance(arg, ExternalFunction): + # ExternalFunction argument - use its custom hash method + func_hash = hash(arg) + return f"{prefix}externalfunction_{func_hash}" + elif callable(arg): + # Function argument - use hash of function address for uniqueness + func_hash = hash(arg) + return f"{prefix}function_{func_hash}" + else: + # Unsupported type - use type name + return f"{prefix}{type(arg).__name__}" + + +def _create_function_cache_key(function, args, kwargs): + """ + Create a cache key for a function call based on function name and argument types/shapes. + This allows us to cache compiled kernels at the function level. + Note that it is not necessary that we cache the tensor shapes since the kernel may be agnostic + to the shape changes but we are doing it here for safety. 
+ """ + # Get function name + func_name = function.__name__ + + # Create signature from argument types and shapes + signature_parts = [] + + for arg in args: + result = _hash_argument(arg) + signature_parts.append(result) + + for key, value in sorted(kwargs.items()): + result = _hash_argument(value, f"{key}_") + signature_parts.append(result) + + signature = "_".join(signature_parts) + return (func_name, signature) diff --git a/python/iron/kernel.py b/python/iron/kernel.py index 220fce1edb3..d0b35f77209 100644 --- a/python/iron/kernel.py +++ b/python/iron/kernel.py @@ -186,6 +186,34 @@ def resolve( # Create the external function self._op = external_func(self._name, inputs=self._arg_types) + def __hash__(self): + """ + Compute a hash for the ExternalFunction based on its properties. + This allows ExternalFunction instances to be used in cache keys. + """ + import hashlib + + # Create a string representation of the function's key properties + hash_parts = [ + self._name, + str(self._arg_types), + str(sorted(self._include_dirs)), + str(sorted(self._compile_flags)), + ] + + # Include source content for uniqueness + # TODO: This solution needs to be extended to handle headers. See https://github.com/Xilinx/mlir-aie/issues/2543 + if self._source_string: + hash_parts.append(self._source_string) + elif self._source_file: + with open(self._source_file, "r") as f: + file_content = f.read() + hash_parts.append(file_content) + + # Create hash from combined string + combined = "|".join(hash_parts) + return int(hashlib.sha256(combined.encode("utf-8")).hexdigest()[:8], 16) + def __call__(self, *args, **kwargs): if not self._op: raise ValueError("Need to resolve ExternalFunction before it can be called") diff --git a/python/iron/tensor.py b/python/iron/tensor.py index 358cdd13ec3..24da9bbb1ad 100644 --- a/python/iron/tensor.py +++ b/python/iron/tensor.py @@ -484,8 +484,9 @@ def __del__(self): Releases associated device memory (e.g., XRT buffer object). 
""" - del self.bo - self.bo = None + if hasattr(self, "bo"): + del self.bo + self.bo = None def tensor(data, dtype=np.float32, device="npu"): diff --git a/python/utils/xrt.py b/python/utils/xrt.py index bc26f457613..162915a864d 100644 --- a/python/utils/xrt.py +++ b/python/utils/xrt.py @@ -74,8 +74,12 @@ def call(self): return h def __del__(self): - del self.kernel - del self.device + if hasattr(self, "kernel"): + del self.kernel + self.kernel = None + if hasattr(self, "device"): + del self.device + self.device = None # This class wraps up access to the xrt.bo buffer object where sync calls are added @@ -114,8 +118,9 @@ def sync_from_device(self): return self.bo.sync(xrt.xclBOSyncDirection.XCL_BO_SYNC_BO_FROM_DEVICE) def __del__(self): - del self.bo - self.bo = None + if hasattr(self, "bo"): + del self.bo + self.bo = None class AIE_Application_Error(Exception): diff --git a/test/python/cache_functionality.py b/test/python/cache_functionality.py new file mode 100644 index 00000000000..926fd7af851 --- /dev/null +++ b/test/python/cache_functionality.py @@ -0,0 +1,437 @@ +# This file is licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# (c) Copyright 2025 AMD Inc. + +# RUN: %run_on_npu1% %pytest %s +# RUN: %run_on_npu2% %pytest %s + +import numpy as np +import tempfile +import os + + +import aie.iron as iron +from aie.iron import ExternalFunction +from aie.iron import ObjectFifo, Worker, Runtime, Program +from aie.iron.placers import SequentialPlacer +from aie.iron.controlflow import range_ + + +@iron.jit(is_placed=False) +def transform(input, output, func): + """Transform kernel that applies a function to input tensor and stores result in output tensor.""" + if input.shape != output.shape: + raise ValueError( + f"Input shapes are not the equal ({input.shape} != {output.shape})." 
+ ) + num_elements = np.size(input) + + if isinstance(func, iron.ExternalFunction): + tile_size = func.tile_size(0) + else: + tile_size = 16 if num_elements >= 16 else 1 + + if num_elements % tile_size != 0: + raise ValueError( + f"Number of elements ({num_elements}) must be a multiple of {tile_size}." + ) + num_tiles = num_elements // tile_size + + if input.dtype != output.dtype: + raise ValueError( + f"Input data types are not the same ({input.dtype} != {output.dtype})." + ) + + dtype = input.dtype + + # Define tensor types + tensor_ty = np.ndarray[(num_elements,), np.dtype[dtype]] + tile_ty = np.ndarray[(tile_size,), np.dtype[dtype]] + + # AIE-array data movement with object fifos + of_in = ObjectFifo(tile_ty, name="in") + of_out = ObjectFifo(tile_ty, name="out") + + # Define a task that will run on a compute tile + def core_body(of_in, of_out, func_to_apply): + for _ in range_(num_tiles): + elem_in = of_in.acquire(1) + elem_out = of_out.acquire(1) + if isinstance(func_to_apply, iron.ExternalFunction): + func_to_apply(elem_in, elem_out, tile_size) + else: + for j in range_(tile_size): + elem_out[j] = func_to_apply(elem_in[j]) + of_in.release(1) + of_out.release(1) + + # Create a worker to run the task on a compute tile + worker = Worker(core_body, fn_args=[of_in.cons(), of_out.prod(), func]) + + # Runtime operations to move data to/from the AIE-array + rt = Runtime() + with rt.sequence(tensor_ty, tensor_ty) as (A, B): + rt.start(worker) + rt.fill(of_in.prod(), A) + rt.drain(of_out.cons(), B, wait=True) + + # Place program components (assign them resources on the device) and generate an MLIR module + return Program(iron.get_current_device(), rt).resolve_program(SequentialPlacer()) + + +def test_cache_lambda_functions(): + """Test that caching works correctly with different lambda functions.""" + # Create input tensor + input_tensor = iron.tensor((32,), dtype=np.int32) + input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + + # Test 1: First 
execution with lambda function + transform(input_tensor, input_tensor, lambda x: x + 1) + result1 = input_tensor.numpy().copy() + + # Reset tensor + input_tensor[:] = np.arange(1, 33, dtype=np.int32) + + # Test 2: Second execution with same lambda function (should use cache) + transform(input_tensor, input_tensor, lambda x: x + 1) + result2 = input_tensor.numpy() + + # Results should be identical + np.testing.assert_array_equal(result1, result2) + + # Test 3: Different lambda function (should generate new cache entry) + input_tensor[:] = np.arange(1, 33, dtype=np.int32) + transform(input_tensor, input_tensor, lambda x: x * 2) + result3 = input_tensor.numpy() + + # Results should be different + np.testing.assert_raises( + AssertionError, np.testing.assert_array_equal, result1, result3 + ) + + +def test_cache_external_functions(): + """Test that ExternalFunction caching works correctly during execution.""" + # Create input tensor + input_tensor = iron.tensor((32,), dtype=np.int32) + input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + + # Test 1: First execution + add_one_1 = ExternalFunction( + "add_one", + source_string="""extern "C" { + void add_one(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + 1; + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + ) + transform(input_tensor, input_tensor, add_one_1) + result1 = input_tensor.numpy().copy() + + # Reset tensor + input_tensor[:] = np.arange(1, 33, dtype=np.int32) + + # Test 2: Second execution + add_one_2 = ExternalFunction( + "add_one", + source_string="""extern "C" { + void add_one(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + 1; + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + ) + transform(input_tensor, 
input_tensor, add_one_2) + result2 = input_tensor.numpy() + + # Results should be identical + np.testing.assert_array_equal(result1, result2) + + # Test 3: Different ExternalFunction (should generate new cache entry) + multiply_two = ExternalFunction( + "multiply_two", + source_string="""extern "C" { + void multiply_two(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] * 2; + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + ) + + input_tensor[:] = np.arange(1, 33, dtype=np.int32) + transform(input_tensor, input_tensor, multiply_two) + result3 = input_tensor.numpy() + + # Results should be different + np.testing.assert_raises( + AssertionError, np.testing.assert_array_equal, result1, result3 + ) + + +def test_cache_compile_flags(): + """Test that ExternalFunctions with different compile flags produce different results.""" + # Create input tensor + input_tensor = iron.tensor((32,), dtype=np.int32) + input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + + # Create ExternalFunctions with different compile flags + add_5 = ExternalFunction( + "add_value", + source_string="""extern "C" { + void add_value(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + ADD_VALUE; + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + compile_flags=["-DADD_VALUE=5"], + ) + + add_10 = ExternalFunction( + "add_value", + source_string="""extern "C" { + void add_value(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + ADD_VALUE; + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + compile_flags=["-DADD_VALUE=10"], + ) + + # Test with ADD_VALUE=5 + transform(input_tensor, 
input_tensor, add_5) + result_5 = input_tensor.numpy().copy() + + # Reset and test with ADD_VALUE=10 + input_tensor[:] = np.arange(1, 33, dtype=np.int32) + transform(input_tensor, input_tensor, add_10) + result_10 = input_tensor.numpy() + + # Results should be different + np.testing.assert_raises( + AssertionError, np.testing.assert_array_equal, result_5, result_10 + ) + + # Verify expected results + expected_5 = np.arange(1, 33, dtype=np.int32) + 5 + expected_10 = np.arange(1, 33, dtype=np.int32) + 10 + + np.testing.assert_array_equal(result_5, expected_5) + np.testing.assert_array_equal(result_10, expected_10) + + +def test_cache_source_changes(): + """Test that ExternalFunctions with different source content produce different results.""" + # Create input tensor + input_tensor = iron.tensor((32,), dtype=np.int32) + input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + + # Create ExternalFunctions with different source content + add_1 = ExternalFunction( + "add_one", + source_string="""extern "C" { + void add_one(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + 1; + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + ) + + add_2 = ExternalFunction( + "add_one", + source_string="""extern "C" { + void add_one(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + 2; // Different operation + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + ) + + # Test with add_1 + transform(input_tensor, input_tensor, add_1) + result_1 = input_tensor.numpy().copy() + + # Reset and test with add_2 + input_tensor[:] = np.arange(1, 33, dtype=np.int32) + transform(input_tensor, input_tensor, add_2) + result_2 = input_tensor.numpy() + + # Results should be different + np.testing.assert_raises( + 
AssertionError, np.testing.assert_array_equal, result_1, result_2 + ) + + # Verify expected results + expected_1 = np.arange(1, 33, dtype=np.int32) + 1 + expected_2 = np.arange(1, 33, dtype=np.int32) + 2 + + np.testing.assert_array_equal(result_1, expected_1) + np.testing.assert_array_equal(result_2, expected_2) + + +def test_cache_file_source(): + """Test that ExternalFunctions with file sources work correctly.""" + # Create input tensor + input_tensor = iron.tensor((32,), dtype=np.int32) + input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + + # Create temporary source file + with tempfile.NamedTemporaryFile(mode="w", suffix=".cc", delete=False) as f: + source_content = """extern "C" { + void add_one_from_file(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + 1; + } + } + }""" + f.write(source_content) + source_file_path = f.name + + try: + # Create ExternalFunction using source_file + add_one_from_file = ExternalFunction( + "add_one_from_file", + source_file=source_file_path, + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + ) + + # Test execution + transform(input_tensor, input_tensor, add_one_from_file) + result = input_tensor.numpy() + + # Verify expected results + expected = np.arange(1, 33, dtype=np.int32) + 1 + np.testing.assert_array_equal(result, expected) + + finally: + # Clean up the temporary file + os.unlink(source_file_path) + + +def test_cache_include_directories(): + """Test that ExternalFunctions with include directories work correctly.""" + # Create input tensor + input_tensor = iron.tensor((32,), dtype=np.int32) + input_tensor[:] = np.arange(1, 33, dtype=np.int32) # [1, 2, 3, ..., 32] + + # Create temporary directory with header file + with tempfile.TemporaryDirectory() as temp_dir: + # Create header file + header_file = os.path.join(temp_dir, "math_ops.h") + with open(header_file, "w") as f: + 
f.write("#define ADD_VALUE 42\n") + + # Create ExternalFunction that includes the header + add_value = ExternalFunction( + "add_value", + source_string="""extern "C" { + #include "math_ops.h" + void add_value(int* input, int* output, int tile_size) { + for (int i = 0; i < tile_size; i++) { + output[i] = input[i] + ADD_VALUE; + } + } + }""", + arg_types=[ + np.ndarray[(16,), np.dtype[np.int32]], + np.ndarray[(16,), np.dtype[np.int32]], + np.int32, + ], + include_dirs=[temp_dir], + ) + + # Test execution + transform(input_tensor, input_tensor, add_value) + result = input_tensor.numpy() + + # Verify expected results + expected = np.arange(1, 33, dtype=np.int32) + 42 + np.testing.assert_array_equal(result, expected) + + +def test_cache_tensor_shapes(): + """Test that different tensor shapes work correctly with caching.""" + # Test with different tensor sizes + sizes = [16, 32, 64] + results = [] + + for size in sizes: + input_tensor = iron.tensor((size,), dtype=np.int32) + input_tensor[:] = np.arange(1, size + 1, dtype=np.int32) + + # Apply transformation + transform(input_tensor, input_tensor, lambda x: x + 1) + result = input_tensor.numpy() + results.append(result) + + # Verify expected results + expected = np.arange(1, size + 1, dtype=np.int32) + 1 + np.testing.assert_array_equal(result, expected) + + +def test_cache_tensor_dtypes(): + """Test that different tensor dtypes work correctly with caching.""" + # Test with different dtypes + dtypes = [np.int32, np.float32] + results = [] + + for dtype in dtypes: + input_tensor = iron.tensor((32,), dtype=dtype) + input_tensor[:] = np.arange(1, 33, dtype=dtype) + + # Apply transformation + transform(input_tensor, input_tensor, lambda x: x + 1) + result = input_tensor.numpy() + results.append(result) + + # Verify expected results + expected = np.arange(1, 33, dtype=dtype) + 1 + np.testing.assert_array_equal(result, expected)