From 190ec66fb35f9269b3edba642d31653de10e8829 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 5 Mar 2026 12:29:09 -0700
Subject: [PATCH 01/28] [AIE] Add func-level link_with and
 AIEAssignCoreLinkFiles pass

- Add link_files StrArrayAttr to CoreOp (canonical post-pass list of .o paths)
- Deprecate CoreOp-level link_with in favour of func.func-level link_with
- Add AIEAssignCoreLinkFiles pass: traces call edges from each aie.core to
  func.func declarations carrying link_with, accumulates .o paths into
  CoreOp link_files, and migrates any legacy CoreOp-level link_with
- Wire link_files into BCF and LdScript emitters
- Register pass in CMakeLists.txt and AIEPasses.h

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 include/aie/Dialect/AIE/IR/AIEOps.td          |   3 +-
 .../aie/Dialect/AIE/Transforms/AIEPasses.h    |   2 +
 .../aie/Dialect/AIE/Transforms/AIEPasses.td   |  20 +++
 lib/Dialect/AIE/IR/AIEDialect.cpp             |   4 +
 .../AIE/Transforms/AIEAssignCoreLinkFiles.cpp | 125 ++++++++++++++++++
 lib/Dialect/AIE/Transforms/CMakeLists.txt     |   1 +
 lib/Targets/AIETargetBCF.cpp                  |  13 +-
 lib/Targets/AIETargetLdScript.cpp             |   7 +-
 8 files changed, 170 insertions(+), 5 deletions(-)
 create mode 100644 lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp
diff --git a/include/aie/Dialect/AIE/IR/AIEOps.td b/include/aie/Dialect/AIE/IR/AIEOps.td
index c1240d9b0e1..02f2eb39a7d 100644
--- a/include/aie/Dialect/AIE/IR/AIEOps.td
+++ b/include/aie/Dialect/AIE/IR/AIEOps.td
@@ -333,7 +333,8 @@ def AIE_CoreOp: AIE_Op<"core", [
   let arguments = (
     ins Index:$tile,
     DefaultValuedAttr<AIEI32Attr, "0x400">:$stack_size,
-    OptionalAttr<StrAttr>:$link_with,
+    OptionalAttr<StrAttr>:$link_with,   // deprecated: use link_with on func.func instead
+    OptionalAttr<StrArrayAttr>:$link_files, // canonical post-pass list of .o paths
     OptionalAttr<StrAttr>:$elf_file,
     OptionalAttr<BoolAttr>:$dynamic_objfifo_lowering
   );
diff --git a/include/aie/Dialect/AIE/Transforms/AIEPasses.h b/include/aie/Dialect/AIE/Transforms/AIEPasses.h
index fbee2c82429..0839249ab1e 100644
--- a/include/aie/Dialect/AIE/Transforms/AIEPasses.h
+++ b/include/aie/Dialect/AIE/Transforms/AIEPasses.h
@@ -24,6 +24,8 @@ namespace xilinx::AIE {
 #define GEN_PASS_DEF_AIEROUTEPATHFINDERFLOWS
 #include "aie/Dialect/AIE/Transforms/AIEPasses.h.inc"
 
+std::unique_ptr<mlir::OperationPass<DeviceOp>>
+createAIEAssignCoreLinkFilesPass();
 std::unique_ptr<mlir::OperationPass<DeviceOp>>
 createAIEAssignBufferAddressesPass();
 std::unique_ptr<mlir::OperationPass<DeviceOp>>
diff --git a/include/aie/Dialect/AIE/Transforms/AIEPasses.td b/include/aie/Dialect/AIE/Transforms/AIEPasses.td
index 1ee8bc3f0dd..f6de090ad8d 100644
--- a/include/aie/Dialect/AIE/Transforms/AIEPasses.td
+++ b/include/aie/Dialect/AIE/Transforms/AIEPasses.td
@@ -13,6 +13,26 @@
 
 include "mlir/Pass/PassBase.td"
 
+def AIEAssignCoreLinkFiles : Pass<"aie-assign-core-link-files", "DeviceOp"> {
+  let summary =
+      "Infer per-core link_files from func-level link_with attributes";
+  let description = [{
+    Walks each aie.core and collects the set of external object files it needs
+    by tracing call edges to func.func declarations that carry a "link_with"
+    string attribute.  The result is stored in the CoreOp's "link_files"
+    StrArrayAttr (a typed array of strings).
+
+    Core-level "link_with" (deprecated) is also migrated: its value is
+    added to the set and the attribute is removed from the CoreOp.
+  }];
+
+  let constructor = "xilinx::AIE::createAIEAssignCoreLinkFilesPass()";
+  let dependentDialects = [
+    "mlir::func::FuncDialect",
+    "xilinx::AIE::AIEDialect",
+  ];
+}
+
 def AIEAssignBufferAddresses : Pass<"aie-assign-buffer-addresses", "DeviceOp"> {
   let summary = "Assign memory locations for buffers in each tile";
   let description = [{
diff --git a/lib/Dialect/AIE/IR/AIEDialect.cpp b/lib/Dialect/AIE/IR/AIEDialect.cpp
index 8b82c1e6a80..994544e2209 100644
--- a/lib/Dialect/AIE/IR/AIEDialect.cpp
+++ b/lib/Dialect/AIE/IR/AIEDialect.cpp
@@ -1469,6 +1469,10 @@ LogicalResult CoreOp::verify() {
           "(consist of exactly one `aie.end` op).");
     }
   }
+  if (getLinkWith() && getLinkFiles())
+    return emitOpError(
+        "cannot specify both 'link_with' (deprecated) and 'link_files' "
+        "on the same core; run aie-assign-core-link-files to migrate");
   return success();
 }
 
diff --git a/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp b/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp
new file mode 100644
index 00000000000..14d7e196988
--- /dev/null
+++ b/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp
@@ -0,0 +1,125 @@
+//===- AIEAssignCoreLinkFiles.cpp -------------------------------*- C++ -*-===//
+//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2026 Advanced Micro Devices Inc.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass infers the per-core set of external object files required for
+// linking by tracing call edges from each core to func.func declarations that
+// carry a "link_with" attribute.
+//
+// After the pass runs, every CoreOp that needs external files will have a
+// "link_files" StrArrayAttr containing the (de-duplicated) list of .o paths.
+//
+// Core-level "link_with" (deprecated) is also migrated: its value is added to
+// the set and the attribute is removed from the CoreOp.
+//
+//===----------------------------------------------------------------------===//
+
+#include "aie/Dialect/AIE/IR/AIEDialect.h"
+#include "aie/Dialect/AIE/Transforms/AIEPasses.h"
+
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/Pass/Pass.h"
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SetVector.h"
+
+#define DEBUG_TYPE "aie-assign-core-link-files"
+
+using namespace mlir;
+using namespace xilinx;
+using namespace xilinx::AIE;
+
+struct AIEAssignCoreLinkFilesPass
+    : AIEAssignCoreLinkFilesBase<AIEAssignCoreLinkFilesPass> {
+  void getDependentDialects(DialectRegistry &registry) const override {
+    registry.insert<AIEDialect, mlir::func::FuncDialect>();
+  }
+
+  void runOnOperation() override {
+    DeviceOp device = getOperation();
+    OpBuilder builder(device.getContext());
+
+    // Build map: func name -> list of .o files (from "link_with" attr on
+    // func.func). Keys and values are interned in the MLIRContext so the
+    // StringRefs remain valid for the lifetime of the pass.
+    DenseMap<StringRef, SmallVector<StringRef, 2>> funcToObjs;
+    for (auto funcOp : device.getOps<mlir::func::FuncOp>()) {
+      if (auto attr = funcOp->getAttrOfType<mlir::StringAttr>("link_with")) {
+        funcToObjs[funcOp.getName()].push_back(attr.getValue());
+      }
+    }
+
+    // Track which funcs are actually called from any core.
+    llvm::DenseSet<StringRef> usedFuncs;
+
+    // Walk each core, collect all .o files needed.
+    device.walk([&](CoreOp core) {
+      // Always walk CallOps first to keep usedFuncs accurate even when the
+      // idempotency guard fires below (prevents false "never called" warnings
+      // on a second pass invocation).
+      core.walk(
+          [&](mlir::func::CallOp call) { usedFuncs.insert(call.getCallee()); });
+
+      // Early-out: pass already ran on this core and migration is done.
+      if (core.getLinkFiles() && !core.getLinkWith())
+        return;
+
+      // De-duplicate while preserving insertion order. StringRefs point into
+      // the MLIRContext attribute storage and remain valid throughout the pass.
+      llvm::SetVector<StringRef> needed;
+
+      // Migrate deprecated core-level attr: warn, consume it, and add to set.
+      if (auto lw = core.getLinkWith()) {
+        core.emitWarning(
+            "link_with on aie.core is deprecated; attach link_with to "
+            "the func.func declaration instead");
+        needed.insert(lw.value());
+        core->removeAttr("link_with");
+      }
+
+      // Trace func::CallOp ops to accumulate needed .o files.
+      core.walk([&](mlir::func::CallOp call) {
+        auto it = funcToObjs.find(call.getCallee());
+        if (it != funcToObjs.end())
+          for (StringRef obj : it->second)
+            needed.insert(obj);
+      });
+
+      // Warn on indirect calls: link_with cannot be statically resolved.
+      core.walk([&](mlir::func::CallIndirectOp indCall) {
+        indCall.emitWarning(
+            "indirect call in core body — link_with attributes on "
+            "indirectly-called functions are not automatically resolved; "
+            "declare the required .o files via link_with on the aie.core "
+            "or on a directly-called func.func");
+      });
+
+      if (!needed.empty())
+        core.setLinkFilesAttr(builder.getStrArrayAttr(needed.getArrayRef()));
+    });
+
+    // Warn about funcs with link_with that are never called from any core.
+    for (auto &[funcName, objs] : funcToObjs) {
+      if (!usedFuncs.count(funcName)) {
+        if (auto funcOp = device.lookupSymbol<mlir::func::FuncOp>(funcName))
+          funcOp.emitWarning("func '")
+              << funcName
+              << "' has link_with but is never called from any core; "
+                 "its .o file will not be linked";
+      }
+    }
+  }
+};
+
+std::unique_ptr<OperationPass<DeviceOp>>
+AIE::createAIEAssignCoreLinkFilesPass() {
+  return std::make_unique<AIEAssignCoreLinkFilesPass>();
+}
diff --git a/lib/Dialect/AIE/Transforms/CMakeLists.txt b/lib/Dialect/AIE/Transforms/CMakeLists.txt
index 89ed2ae12df..fabdf61a55e 100644
--- a/lib/Dialect/AIE/Transforms/CMakeLists.txt
+++ b/lib/Dialect/AIE/Transforms/CMakeLists.txt
@@ -8,6 +8,7 @@
 add_mlir_dialect_library(
   AIETransforms
   AIEAssignBuffers.cpp
+  AIEAssignCoreLinkFiles.cpp
   AIEAssignBufferDescriptorIDs.cpp
   AIEAssignLockIDs.cpp
   AIEFindFlows.cpp
diff --git a/lib/Targets/AIETargetBCF.cpp b/lib/Targets/AIETargetBCF.cpp
index 2e656d0f164..08826f31da4 100644
--- a/lib/Targets/AIETargetBCF.cpp
+++ b/lib/Targets/AIETargetBCF.cpp
@@ -139,9 +139,16 @@ LogicalResult AIETranslateToBCF(ModuleOp module, raw_ostream &output,
              << utohexstr(addressSpaceSize - dataMemoryEnd)
              << " // And everything else the core can't see\n";
 
-      if (tile.getCoreOp() && tile.getCoreOp().getLinkWith())
-        output << "_include _file "
-               << tile.getCoreOp().getLinkWith().value().str() << "\n";
+      if (auto coreOp = tile.getCoreOp()) {
+        if (auto filesAttr = coreOp.getLinkFiles()) {
+          for (auto f : filesAttr->getAsRange<mlir::StringAttr>())
+            output << "_include _file " << f.getValue() << "\n";
+        } else if (coreOp.getLinkWith()) {
+          // deprecated fallback
+          output << "_include _file " << coreOp.getLinkWith().value().str()
+                 << "\n";
+        }
+      }
       output << "_resolve _main core_" << tile.getCol() << "_" << tile.getRow()
              << "\n";
     }
diff --git a/lib/Targets/AIETargetLdScript.cpp b/lib/Targets/AIETargetLdScript.cpp
index d94ae5a0e03..f1d23f3ade3 100644
--- a/lib/Targets/AIETargetLdScript.cpp
+++ b/lib/Targets/AIETargetLdScript.cpp
@@ -177,8 +177,13 @@ SECTIONS
       output << "  .bss : { *(.bss*) } > data\n";
       output << "}\n";
       if (auto coreOp = tile.getCoreOp()) {
-        if (auto fileAttr = coreOp.getLinkWith())
+        if (auto filesAttr = coreOp.getLinkFiles()) {
+          for (auto f : filesAttr->getAsRange<mlir::StringAttr>())
+            output << "INPUT(" << f.getValue() << ")\n";
+        } else if (auto fileAttr = coreOp.getLinkWith()) {
+          // deprecated fallback
           output << "INPUT(" << fileAttr.value().str() << ")\n";
+        }
 
         output << "PROVIDE(main = core_" << tile.getCol() << "_"
                << tile.getRow() << ");\n";

From 46ac7c81a0a59d65a35412ffa02f07f5ec8e7f92 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 5 Mar 2026 14:27:34 -0700
Subject: [PATCH 02/28] [aiecc] Wire AIEAssignCoreLinkFiles into driver; add
 atomicCopyFile

- Run aie-assign-core-link-files pass in aiecc.cpp pipeline before BCF/LdScript
- Update Python aiecc driver (main.py) to invoke the pass and handle link_files
- Add atomicCopyFile helper for race-free kernel .o staging when multiple cores
  share the same kernel object file
- Use GEN_PASS_DEF_AIEASSIGNCORELINKFILES / impl:: base to be compatible with
  both wheel and source-built MLIR tablegen versions

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../AIE/Transforms/AIEAssignCoreLinkFiles.cpp |   54 +-
 python/aie_lit_utils/lit_config_helpers.py    |    2 +-
 python/compiler/aiecc/main.py                 | 2237 ++++++++++++++++-
 tools/aiecc/aiecc.cpp                         |  205 +-
 4 files changed, 2294 insertions(+), 204 deletions(-)

diff --git a/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp b/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp
index 14d7e196988..7a28b46c7b6 100644
--- a/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp
+++ b/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp
@@ -22,6 +22,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "aie/Dialect/AIE/IR/AIEDialect.h"
+#define GEN_PASS_DEF_AIEASSIGNCORELINKFILES
 #include "aie/Dialect/AIE/Transforms/AIEPasses.h"
 
 #include "mlir/Dialect/Func/IR/FuncOps.h"
@@ -38,7 +39,8 @@ using namespace xilinx;
 using namespace xilinx::AIE;
 
 struct AIEAssignCoreLinkFilesPass
-    : AIEAssignCoreLinkFilesBase<AIEAssignCoreLinkFilesPass> {
+    : xilinx::AIE::impl::AIEAssignCoreLinkFilesBase<
+          AIEAssignCoreLinkFilesPass> {
   void getDependentDialects(DialectRegistry &registry) const override {
     registry.insert<AIEDialect, mlir::func::FuncDialect>();
   }
@@ -61,17 +63,12 @@ struct AIEAssignCoreLinkFilesPass
     llvm::DenseSet<StringRef> usedFuncs;
 
     // Walk each core, collect all .o files needed.
+    // NOTE: only *direct* calls (func.call) are traced; transitive calls
+    // through intermediate helpers are not followed.  If an object file is
+    // needed only by a function reached through such a helper, call the
+    // func.func declaration that carries link_with directly from the core, or
+    // use the deprecated core-level link_with as a fallback.
     device.walk([&](CoreOp core) {
-      // Always walk CallOps first to keep usedFuncs accurate even when the
-      // idempotency guard fires below (prevents false "never called" warnings
-      // on a second pass invocation).
-      core.walk(
-          [&](mlir::func::CallOp call) { usedFuncs.insert(call.getCallee()); });
-
-      // Early-out: pass already ran on this core and migration is done.
-      if (core.getLinkFiles() && !core.getLinkWith())
-        return;
-
       // De-duplicate while preserving insertion order. StringRefs point into
       // the MLIRContext attribute storage and remain valid throughout the pass.
       llvm::SetVector<StringRef> needed;
@@ -85,21 +82,22 @@ struct AIEAssignCoreLinkFilesPass
         core->removeAttr("link_with");
       }
 
-      // Trace func::CallOp ops to accumulate needed .o files.
-      core.walk([&](mlir::func::CallOp call) {
-        auto it = funcToObjs.find(call.getCallee());
-        if (it != funcToObjs.end())
-          for (StringRef obj : it->second)
-            needed.insert(obj);
-      });
-
-      // Warn on indirect calls: link_with cannot be statically resolved.
-      core.walk([&](mlir::func::CallIndirectOp indCall) {
-        indCall.emitWarning(
-            "indirect call in core body — link_with attributes on "
-            "indirectly-called functions are not automatically resolved; "
-            "declare the required .o files via link_with on the aie.core "
-            "or on a directly-called func.func");
+      // Single walk: accumulate used funcs, collect .o files, warn on indirect
+      // calls — all in one pass over the core body.
+      core.walk([&](Operation *op) {
+        if (auto call = dyn_cast<mlir::func::CallOp>(op)) {
+          usedFuncs.insert(call.getCallee());
+          auto it = funcToObjs.find(call.getCallee());
+          if (it != funcToObjs.end())
+            for (StringRef obj : it->second)
+              needed.insert(obj);
+        } else if (auto indCall = dyn_cast<mlir::func::CallIndirectOp>(op)) {
+          indCall.emitWarning(
+              "indirect call in core body — link_with attributes on "
+              "indirectly-called functions are not automatically resolved; "
+              "add a direct func.call to the required func.func declaration "
+              "so that aie-assign-core-link-files can trace the dependency");
+        }
       });
 
       if (!needed.empty())
@@ -110,8 +108,8 @@ struct AIEAssignCoreLinkFilesPass
     for (auto &[funcName, objs] : funcToObjs) {
       if (!usedFuncs.count(funcName)) {
         if (auto funcOp = device.lookupSymbol<mlir::func::FuncOp>(funcName))
-          funcOp.emitWarning("func '")
-              << funcName
+          funcOp.emitWarning()
+              << "func '" << funcName
               << "' has link_with but is never called from any core; "
                  "its .o file will not be linked";
       }
diff --git a/python/aie_lit_utils/lit_config_helpers.py b/python/aie_lit_utils/lit_config_helpers.py
index 4d2d938155d..40ca1f63585 100644
--- a/python/aie_lit_utils/lit_config_helpers.py
+++ b/python/aie_lit_utils/lit_config_helpers.py
@@ -60,7 +60,7 @@ class LitConfigHelper:
     # Maps generation name to list of model strings that may appear in xrt-smi
     NPU_MODELS = {
         "npu1": ["npu1", "Phoenix"],
-        "npu2": ["npu4", "Strix", "npu5", "Strix Halo", "npu6", "Krackan"],
+        "npu2": ["npu4", "Strix", "npu5", "Strix Halo", "npu6", "Krackan", "Krackan 1"],
     }
 
     @staticmethod
diff --git a/python/compiler/aiecc/main.py b/python/compiler/aiecc/main.py
index 5bf87d3fa71..e6f901eaf2f 100644
--- a/python/compiler/aiecc/main.py
+++ b/python/compiler/aiecc/main.py
@@ -3,114 +3,2203 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #
 # (c) Copyright 2021 Xilinx Inc.
-# (c) Copyright 2024-2026 Advanced Micro Devices, Inc.
 
 """
-aiecc.py - AIE Compiler Driver (Python wrapper)
-
-This is a thin wrapper that delegates to the C++ aiecc binary.
-The C++ implementation provides better performance through
-in-memory MLIR pass execution instead of subprocess calls.
-
-All command-line arguments are passed through unchanged to the
-C++ binary, which handles host compilation flags (-I, -L, -l, -o),
-host source files (.cpp), and all other options directly.
+aiecc - AIE compiler driver for MLIR tools
 """
 
+import asyncio
+import glob
+import json
 import os
+import re
 import shutil
+import stat
 import subprocess
 import sys
 import tempfile
-import warnings
+from textwrap import dedent
+import time
+import uuid
+import struct
+
+from aie.extras.runtime.passes import Pipeline
+from aie.extras.util import find_ops
+import aiofiles
+import rich.progress as progress
+
+import aie.compiler.aiecc.cl_arguments
+import aie.compiler.aiecc.configure
+from aie.dialects import aie as aiedialect
+from aie.dialects import aiex as aiexdialect
+from aie.ir import (
+    Context,
+    Location,
+    Module,
+    InsertionPoint,
+    IndexType,
+    StringAttr,
+    IntegerAttr,
+    IntegerType,
+)
+from aie.passmanager import PassManager
+
 
+def _create_input_with_addresses_pipeline(
+    scheme,
+    dynamic_objFifos,
+    packet_sw_objFifos,
+    ctrl_pkt_overlay,
+    aie_target,
+    opt_level="2",
+):
+    pipeline = Pipeline()
 
-def _find_aiecc_binary():
-    """Find the C++ aiecc binary in PATH."""
-    path = shutil.which("aiecc")
-    if path:
-        return path
+    # Only add convert-vector-to-aievec for AIE2 and later targets
+    # AIE1 ("aie") does not support target_backend="llvmir"
+    if aie_target.lower() in ["aie2", "aieml", "aie2p"]:
+        # Hoist vector transfer pointers before scf-to-cf conversion (O3 and above only)
+        # This runs on the module and walks into aie.core regions
+        if int(opt_level) >= 3:
+            pipeline.add_pass("aie-hoist-vector-transfer-pointers")
+        pipeline.add_pass(
+            "convert-vector-to-aievec",
+            **{"aie-target": aie_target.lower(), "target-backend": "llvmir"},
+        )
 
-    raise FileNotFoundError(
-        "Could not find 'aiecc' binary. Ensure mlir-aie is properly installed "
-        "and the bin directory is in your PATH, or use the C++ aiecc directly."
+    # Build nested device pipeline with conditional passes
+    device_pipeline = (
+        Pipeline()
+        .add_pass("aie-trace-to-config")
+        .add_pass("aie-trace-pack-reg-writes")
+        .add_pass("aie-inline-trace-config")
+        .add_pass("aie-assign-lock-ids")
+        .add_pass("aie-register-objectFifos")
+        .add_pass(
+            "aie-objectFifo-stateful-transform",
+            **{
+                "dynamic-objFifos": dynamic_objFifos,
+                "packet-sw-objFifos": packet_sw_objFifos,
+            },
+        )
+        .add_pass("aie-assign-bd-ids")
+        .add_pass("aie-lower-cascade-flows")
+        .add_pass("aie-lower-broadcast-packet")
+        .add_pass("aie-lower-multicast")
+        .add_pass("aie-assign-tile-controller-ids")
+        .add_pass(
+            "aie-generate-column-control-overlay",
+            **{"route-shim-to-tile-ctrl": ctrl_pkt_overlay},
+        )
+        .add_pass("aie-assign-buffer-addresses", **{"alloc-scheme": scheme})
+        .add_pass("aie-assign-core-link-files")
+        .add_pass("aie-vector-transfer-lowering", **{"max-transfer-rank": 1})
     )
 
+    # Only add vector-to-pointer-loops for O3 and above
+    if int(opt_level) >= 3:
+        device_pipeline.add_pass("aie-vector-to-pointer-loops")
 
-def main():
-    """
-    Main entry point - delegates to C++ aiecc.
+    return (
+        pipeline.lower_affine()
+        .add_pass("aie-canonicalize-device")
+        .Nested("aie.device", device_pipeline)
+        .convert_scf_to_cf()
+    )
 
-    All command-line arguments are passed directly to the C++ binary unchanged.
-    """
-    try:
-        aiecc_bin = _find_aiecc_binary()
-    except FileNotFoundError as e:
-        print(f"Error: {e}", file=sys.stderr)
+
+INPUT_WITH_ADDRESSES_PIPELINE = _create_input_with_addresses_pipeline
+
+LOWER_TO_LLVM_PIPELINE = (
+    Pipeline()
+    .canonicalize()
+    .cse()
+    .expand_strided_metadata()
+    .lower_affine()
+    .arith_expand()
+    .finalize_memref_to_llvm()
+    .convert_func_to_llvm(use_bare_ptr_memref_call_conv=True)
+    .convert_to_llvm(dynamic=True)
+    .add_pass("convert-vector-to-llvm")
+    .add_pass("convert-ub-to-llvm")
+    .canonicalize()
+    .cse()
+)
+
+
+def _create_aie_lower_to_llvm_pipeline(
+    device_name=None, col=None, row=None, aie_target="aie2", opt_level="2"
+):
+    pipeline = (
+        Pipeline()
+        .Nested(
+            "aie.device",
+            Pipeline()
+            .add_pass("aie-localize-locks")
+            .add_pass("aie-normalize-address-spaces")
+            .add_pass("aie-transform-bfp-types"),
+        )
+        .add_pass("aie-standard-lowering", device=device_name, tilecol=col, tilerow=row)
+        .add_pass("aiex-standard-lowering")
+    )
+
+    # Only add aievec-split-load-ups-chains for O3 and above
+    if int(opt_level) >= 3:
+        pipeline.add_pass("aievec-split-load-ups-chains")
+
+    pipeline.add_pass("convert-aievec-to-llvm", **{"aie-target": aie_target.lower()})
+
+    return pipeline + LOWER_TO_LLVM_PIPELINE
+
+
+AIE_LOWER_TO_LLVM = _create_aie_lower_to_llvm_pipeline
+
+
+# pipeline to lower and legalize runtime sequence for NPU
+def _create_npu_lowering_pipeline(expand_load_pdis=False):
+    pipeline = Pipeline()
+    if opts.materialize_runtime_sequence:
+        pipeline = pipeline.add_pass("aie-materialize-runtime-sequences")
+    pipeline = pipeline.Nested(
+        "aie.device",
+        Pipeline()
+        .add_pass("aie-materialize-bd-chains")
+        .add_pass("aie-substitute-shim-dma-allocations")
+        .add_pass("aie-assign-runtime-sequence-bd-ids")
+        .add_pass("aie-dma-tasks-to-npu")
+        .add_pass("aie-dma-to-npu")
+        .add_pass("aie-lower-set-lock"),
+    )
+    if expand_load_pdis:
+        pipeline = pipeline.add_pass("aie-expand-load-pdi")
+    return pipeline
+
+
+async def read_file_async(file_path: str) -> str:
+    async with aiofiles.open(file_path, mode="r") as f:
+        contents = await f.read()
+    return contents
+
+
+async def write_file_async(file_content: str, file_path: str):
+    async with aiofiles.open(file_path, mode="w") as f:
+        await f.write(file_content)
+
+
+def emit_design_kernel_json(
+    kernel_name="MLIR_AIE",
+    kernel_id="0x901",
+    instance_name="MLIRAIE",
+    buffer_args=None,
+):
+    if buffer_args is None:
+        buffer_args = [f"bo{i}" for i in range(5)]
+
+    arguments = [
+        {
+            "name": "opcode",
+            "address-qualifier": "SCALAR",
+            "type": "uint64_t",
+            "offset": "0x00",
+        },
+    ]
+    offset = 0x08
+
+    inst_arguments = [
+        {
+            "name": "instr",
+            "memory-connection": "SRAM",
+            "address-qualifier": "GLOBAL",
+            "type": "char *",
+            "offset": str(hex(offset)),
+        },
+        {
+            "name": "ninstr",
+            "address-qualifier": "SCALAR",
+            "type": "uint32_t",
+            "offset": str(hex(offset + 8)),
+        },
+    ]
+    arguments.append(inst_arguments[0])
+    arguments.append(inst_arguments[1])
+    offset += 12
+
+    for buf in buffer_args:
+        arg = {
+            "name": buf,
+            "memory-connection": "HOST",
+            "address-qualifier": "GLOBAL",
+            "type": "void*",
+            "offset": str(hex(offset)),
+        }
+        arguments.append(arg)
+        offset += 0x8
+
+    return {
+        "ps-kernels": {
+            "kernels": [
+                {
+                    "name": kernel_name,
+                    "type": "dpu",
+                    "extended-data": {
+                        "subtype": "DPU",
+                        "functional": "0",
+                        "dpu_kernel_id": kernel_id,
+                    },
+                    "arguments": arguments,
+                    "instances": [{"name": instance_name}],
+                }
+            ]
+        }
+    }
+
+
+mem_topology = {
+    "mem_topology": {
+        "m_count": "2",
+        "m_mem_data": [
+            {
+                "m_type": "MEM_DRAM",
+                "m_used": "1",
+                "m_sizeKB": "0x10000",
+                "m_tag": "HOST",
+                "m_base_address": "0x4000000",
+            },
+            {
+                "m_type": "MEM_DRAM",
+                "m_used": "1",
+                "m_sizeKB": "0xc000",
+                "m_tag": "SRAM",
+                "m_base_address": "0x4000000",
+            },
+        ],
+    }
+}
+
+
+def emit_partition(mlir_module_str, device_op, design_pdi, kernel_id="0x901"):
+    with Context(), Location.unknown():
+        module = Module.parse(mlir_module_str)
+    device = aiedialect.AIEDevice(int(device_op.device))
+    num_cols = aiedialect.get_target_model(device).columns()
+
+    # It's arguable that this should come from the device model
+    # somehow.  Or perhaps that it shouldn't be needed in the
+    # XCLbin at all, since it is basically describing information
+    # which is already inherent in the CDO.
+    # For the time being, we just leave it here.
+    if device in [aiedialect.AIEDevice.npu1, aiedialect.AIEDevice.npu2]:
+        start_columns = [0]
+    else:
+        start_columns = list(range(1, 6 - num_cols))
+
+    # Generate a uuid
+    pdi_uuid = uuid.uuid4()
+    return {
+        "aie_partition": {
+            "name": "QoS",
+            "operations_per_cycle": "2048",
+            "inference_fingerprint": "23423",
+            "pre_post_fingerprint": "12345",
+            "partition": {
+                "column_width": num_cols,
+                "start_columns": start_columns,
+            },
+            "PDIs": [
+                {
+                    "uuid": str(pdi_uuid),
+                    "file_name": design_pdi,
+                    "cdo_groups": [
+                        {
+                            "name": "DPU",
+                            "type": "PRIMARY",
+                            "pdi_id": "0x01",
+                            "dpu_kernel_ids": [kernel_id],
+                            "pre_cdo_groups": ["0xC1"],
+                        }
+                    ],
+                }
+            ],
+        }
+    }
+
+
+def parse_file_as_mlir(mlir_module_str):
+    with Context(), Location.unknown():
+        return Module.parse(mlir_module_str)
+
+
+def generate_devices_list(module):
+    return [
+        (d, d.sym_name.value)
+        for d in find_ops(
+            module.operation,
+            lambda d: isinstance(d.operation.opview, aiedialect.DeviceOp),
+        )
+        if not opts.device_name or d.sym_name.value == opts.device_name
+    ]
+
+
+def _core_has_nonempty_body(core_op):
+    """Check if a CoreOp has a non-empty body (more than just aie.end)."""
+    for block in core_op.body:
+        if len(list(block)) > 1:
+            return True
+    return False
+
+
+def generate_cores_list(device_op):
+    def _link_files(c):
+        attr = c.link_files
+        if attr is None:
+            return []
+        return [attr[i].value for i in range(len(attr))]
+
+    return [
+        (
+            c.tile.owner.opview.col.value,
+            c.tile.owner.opview.row.value,
+            c.elf_file.value if c.elf_file is not None else None,
+            _link_files(c),
+        )
+        for c in find_ops(
+            device_op.operation,
+            lambda o: isinstance(o.operation.opview, aiedialect.CoreOp),
+        )
+        if c.elf_file is not None
+        or c.link_with is not None
+        or c.link_files is not None
+        or _core_has_nonempty_body(c)
+    ]
+
+
+def generate_runtime_sequences_list(device_op):
+    return [
+        (s, s.sym_name.value)
+        for s in find_ops(
+            device_op.operation,
+            lambda o: isinstance(o.operation.opview, aiexdialect.RuntimeSequenceOp),
+        )
+        if not opts.sequence_name or s.sym_name.value == opts.sequence_name
+    ]
+
+
+def find_aiebu_asm():
+    asm_bin = "aiebu-asm"
+    if shutil.which(asm_bin) is None:
+        asm_bin = os.path.join("/", "opt", "xilinx", "aiebu", "bin", "aiebu-asm")
+        if shutil.which(asm_bin) is None:
+            asm_bin = None
+    if asm_bin is None:
+        print(
+            "Error: aiebu-asm not found.",
+            file=sys.stderr,
+        )
         sys.exit(1)
+    return asm_bin
 
-    # Pass all arguments directly to C++ binary unchanged
-    result = subprocess.run([aiecc_bin, *sys.argv[1:]])
-    sys.exit(result.returncode)
 
+def create_device_id_mapping(devices):
+    """Assign an ID to each device in the MLIR; used later to assign IDs for each PDI"""
+    device_to_id = {}
+    for i, (device_op, device_name) in enumerate(devices, 1):
+        device_to_id[device_name] = i
+    return device_to_id
+
+
+def assign_load_pdi_ids(module, device_to_id_mapping):
+    """Transform symbolic aiex.npu.load_pdi references to numeric IDs"""
+    with module.context as context, Location.unknown():
+        for runtime_seq in find_ops(
+            module.operation,
+            lambda o: isinstance(o.operation.opview, aiexdialect.RuntimeSequenceOp),
+        ):
+            for load_pdi_op in find_ops(
+                runtime_seq.operation,
+                lambda o: isinstance(o.operation.opview, aiexdialect.NpuLoadPdiOp)
+                and hasattr(o, "device_ref")
+                and o.device_ref is not None,
+            ):
+                device_name = load_pdi_op.device_ref.value
+                if device_name not in device_to_id_mapping:
+                    print(
+                        f"Warning: Device '{device_name}' for load_pdi instruction does not have a matching device PDI."
+                    )
+                    sys.exit(1)
+                pdi_id = device_to_id_mapping[device_name]
+                load_pdi_op.id = IntegerAttr.get(
+                    IntegerType.get_signless(32, context=context), pdi_id
+                )
+
+
+def set_elf_file_for_core(core, path):
+    with InsertionPoint.at_block_terminator(
+        core.parent.regions[0].blocks[0]
+    ), Location.unknown():
+        result = IndexType.get()
+        new_core = aiedialect.CoreOp(result, core.tile)
+        for attr in core.attributes:
+            new_core.attributes[attr] = core.attributes[attr]
+        new_core.attributes["elf_file"] = StringAttr.get(path)
+        new_core_block = new_core.body.blocks.append()
+        with InsertionPoint(new_core_block):
+            aiedialect.EndOp()
+        new_core.move_before(core)
+    core.operation.erase()
+
+
+def emit_design_bif(
+    root_path, device_name, has_cores=True, enable_cores=True, unified=False
+):
+    if unified:
+        cdo_unified_file = f"file={root_path}/{device_name}_aie_cdo.bin"
+        files = f"{cdo_unified_file}"
+    else:
+        cdo_elfs_file = f"file={root_path}/{device_name}_aie_cdo_elfs.bin"
+        cdo_init_file = f"file={root_path}/{device_name}_aie_cdo_init.bin"
+        cdo_enable_file = (
+            f"file={root_path}/{device_name}_aie_cdo_enable.bin" if enable_cores else ""
+        )
+        files = f"{cdo_elfs_file} {cdo_init_file} {cdo_enable_file}"
+    return dedent(f"""\
+        all:
+        {{
+          id_code = 0x14ca8093
+          extended_id_code = 0x01
+          image
+          {{
+            name=aie_image, id=0x1c000000
+            {{ type=cdo {files} }}
+          }}
+        }}
+        """)
 
-def run(mlir_module, args=None):
-    """
-    Programmatic API for compiling MLIR modules.
 
-    DEPRECATED: This function is deprecated. Use the C++ aiecc binary
-    directly or the IRON Python API instead.
+# Extract included files from the given Chess linker script.
+# We rely on gnu linker scripts to stuff object files into a compile.  However, the Chess compiler doesn't
+# do this, so we have to explicitly specify included files on the link line.
+async def extract_input_files(file_core_bcf):
+    core_bcf = await read_file_async(file_core_bcf)
+    return " ".join(re.findall(r"^_include _file (.*)", core_bcf, re.MULTILINE))
+
+
+def do_run(command, verbose=False):
+    if verbose:
+        print(" ".join(command))
+    m = subprocess.PIPE
+    ret = subprocess.run(command, stdout=m, stderr=m, universal_newlines=True)
+    return ret
+
+
+def format_diagnostics_for_script(diagnostics):
+    """Format MLIR diagnostics for inclusion in repeater script."""
+    if not diagnostics:
+        return ""
+
+    diag_lines = "\n".join(diagnostics)
+    return f"""echo "Original MLIR Diagnostics:"
+cat << 'DIAGNOSTICS_EOF'
+{diag_lines}
+DIAGNOSTICS_EOF
+echo ""
+
+"""
+
+
+def generate_repeater_script(
+    mlir_file, pass_pipeline, output_file, timenow, description=None, diagnostics=None
+):
+    """
+    Generate a bash repeater script for reproducing a pass pipeline failure.
 
     Args:
-        mlir_module: MLIR module string or object with __str__ method
-        args: Optional list of command-line arguments
+        mlir_file: Path to the MLIR file that caused failure
+        pass_pipeline: The pass pipeline string
+        output_file: Where to write the repeater script
+        description: Optional description of what was being compiled
+        diagnostics: List of MLIR diagnostic messages
+    """
+    diag_section = format_diagnostics_for_script(diagnostics)
+
+    script_content = f"""#!/bin/bash
+#
+# AIECC Repeater Script
+# Generated: {timenow.isoformat()}
+#
+# This script reproduces a compilation failure from aiecc.py
+# Description: {description or 'N/A'}
+# Diagnostics: {len(diagnostics) if diagnostics else 0} messages captured
+#
+
+set -e  # Exit on error
+
+echo "=================================================="
+echo "AIECC Failure Reproduction Script"
+echo "=================================================="
+echo ""
+
+{diag_section}MLIR_FILE="{mlir_file}"
+PASS_PIPELINE='{pass_pipeline}'
+
+# Check if input file exists
+if [ ! -f "$MLIR_FILE" ]; then
+    echo "Error: Input MLIR file not found: $MLIR_FILE"
+    exit 1
+fi
+
+# Check if aie-opt is available
+if ! command -v aie-opt &> /dev/null; then
+    echo "Error: aie-opt not found in PATH"
+    echo "Please ensure mlir-aie tools are properly installed and in PATH"
+    exit 1
+fi
 
-    Raises:
-        RuntimeError: If compilation fails
+echo "Input MLIR: $MLIR_FILE"
+echo "Pass Pipeline: $PASS_PIPELINE"
+echo ""
+echo "Running aie-opt with debug flags..."
+echo ""
+
+# Run with debugging flags
+aie-opt \\
+    --mlir-print-ir-after-all \\
+    --mlir-disable-threading \\
+    --pass-pipeline="${{PASS_PIPELINE}}" \\
+    "$MLIR_FILE"
+
+echo ""
+echo "If the command succeeded, the issue may be non-deterministic."
+echo "Try running this script multiple times."
+"""
+
+    with open(output_file, "w") as f:
+        f.write(script_content)
+
+    # Make script executable
+    os.chmod(output_file, os.stat(output_file).st_mode | stat.S_IEXEC)
+
+
+def handle_pass_failure(
+    pass_pipeline,
+    mlir_ir,
+    description=None,
+    output_dir=None,
+    diagnostics=None,
+):
+    """
+    Handle failure of Python-based PassManager execution.
+    Saves intermediate MLIR and generates repeater script.
+
+    Args:
+        pass_pipeline: The pass pipeline that failed
+        mlir_ir: The MLIR IR before the failed pass
+        description: Human-readable description of what was being compiled
+        output_dir: Directory to save repeater scripts (default: temp dir)
+        diagnostics: List of diagnostic messages from MLIR
     """
-    warnings.warn(
-        "aiecc.run() is deprecated and will be removed in a future release. "
-        "Use the C++ aiecc binary directly or the IRON Python API instead.",
-        DeprecationWarning,
-        stacklevel=2,
+    import datetime
+
+    # Generate unique filename
+    timenow = datetime.datetime.now()
+    timestamp = timenow.strftime("%Y%m%d_%H%M%S")
+    failure_id = str(uuid.uuid4())[:8]
+
+    # Save MLIR to output directory
+    temp_dir = output_dir or tempfile.gettempdir()
+    mlir_filename = os.path.join(
+        temp_dir, f"aiecc_failure_{timestamp}_{failure_id}.mlir"
+    )
+    repeater_filename = os.path.join(
+        temp_dir, f"aiecc_repeater_{timestamp}_{failure_id}.sh"
     )
 
-    try:
-        aiecc_bin = _find_aiecc_binary()
-    except FileNotFoundError as e:
-        raise RuntimeError(str(e))
+    with open(mlir_filename, "w") as f:
+        f.write(mlir_ir)
+
+    # Generate repeater script
+    generate_repeater_script(
+        mlir_file=mlir_filename,
+        pass_pipeline=pass_pipeline,
+        output_file=repeater_filename,
+        timenow=timenow,
+        description=description,
+        diagnostics=diagnostics,
+    )
 
-    # Convert module to string if needed
-    mlir_str = str(mlir_module)
+    # Print diagnostic message
+    desc_str = f" ({description})" if description else ""
+    print("\n" + "=" * 80, file=sys.stderr)
+    print(f"AIECC COMPILATION FAILED{desc_str}", file=sys.stderr)
+    print("=" * 80, file=sys.stderr)
+    print(f"\nIntermediate MLIR saved to:", file=sys.stderr)
+    print(f"  {mlir_filename}", file=sys.stderr)
+    print(f"\nFor developers, the error can be reproduced with:", file=sys.stderr)
+    print(
+        f"  $ aie-opt --pass-pipeline='{pass_pipeline}' {mlir_filename}",
+        file=sys.stderr,
+    )
+    print(f"\nRepeater script generated:", file=sys.stderr)
+    print(f"  {repeater_filename}", file=sys.stderr)
+    print(f"  $ bash {repeater_filename}", file=sys.stderr)
+    print("=" * 80 + "\n", file=sys.stderr)
 
-    # Write MLIR to temp file
-    with tempfile.NamedTemporaryFile(mode="w", suffix=".mlir", delete=False) as f:
-        f.write(mlir_str)
-        mlir_path = f.name
 
-    try:
-        cmd = [aiecc_bin, mlir_path]
-        if args:
-            if isinstance(args, str):
-                cmd.extend(args.split())
+def corefile(dirname, device, core, ext):
+    col, row = core[0], core[1]
+    return os.path.join(dirname, f"{device}_core_{col}_{row}.{ext}")
+
+
+def aie_target_defines(aie_target):
+    if aie_target == "AIE2":
+        return ["-D__AIEARCH__=20"]
+    return ["-D__AIEARCH__=10"]
+
+
+def downgrade_ir_for_chess(llvmir_chesslinked):
+    llvmir_chesslinked = (
+        llvmir_chesslinked.replace("memory(none)", "readnone")
+        .replace("memory(read)", "readonly")
+        .replace("memory(write)", "writeonly")
+        .replace("memory(argmem: readwrite)", "argmemonly")
+        .replace("memory(argmem: read)", "argmemonly readonly")
+        .replace("memory(argmem: write)", "argmemonly writeonly")
+        .replace("memory(inaccessiblemem: readwrite)", "inaccessiblememonly")
+        .replace("memory(inaccessiblemem: read)", "inaccessiblememonly readonly")
+        .replace("memory(inaccessiblemem: write)", "inaccessiblememonly writeonly")
+        .replace(
+            "memory(argmem: readwrite, inaccessiblemem: readwrite)",
+            "inaccessiblemem_or_argmemonly",
+        )
+        .replace(
+            "memory(argmem: read, inaccessiblemem: read)",
+            "inaccessiblemem_or_argmemonly readonly",
+        )
+        .replace(
+            "memory(argmem: write, inaccessiblemem: write)",
+            "inaccessiblemem_or_argmemonly writeonly",
+        )
+        .replace("captures(none)", "nocapture")
+        .replace("getelementptr inbounds nuw", "getelementptr inbounds")
+    )
+    # Remove nocreateundeforpoison attribute (not supported by older LLVM in Chess toolchain)
+    llvmir_chesslinked = re.sub(r"\bnocreateundeforpoison\s+", "", llvmir_chesslinked)
+    return llvmir_chesslinked
+
+
+def downgrade_ir_for_peano(llvmir):
+    llvmir = llvmir.replace("getelementptr inbounds nuw", "getelementptr inbounds")
+    # Remove nocreateundeforpoison attribute (not supported by older LLVM in Peano toolchain)
+    llvmir = re.sub(r"\bnocreateundeforpoison\s+", "", llvmir)
+    return llvmir
+
+
+def drop_alignment_for_peano(llvmir):
+    # Remove any ", align <integer>" attribute occurrences
+    llvmir = re.sub(r",\s*align\s+\d+", "", llvmir)
+    return llvmir
+
+
+def get_peano_target(aie_target):
+    if not re.fullmatch("AIE.?.?", aie_target):
+        print(
+            "Unexpected target " + aie_target + ". Exiting...",
+            file=sys.stderr,
+        )
+        exit(-3)
+    aie_peano_target = aie_target.lower() + "-none-unknown-elf"
+    return aie_peano_target
+
+
+class FlowRunner:
+    def __init__(self, mlir_module_str, opts, tmpdirname):
+        self.mlir_module_str = mlir_module_str
+        self.opts = opts
+        self.tmpdirname = tmpdirname
+        self.runtimes = dict()
+        self.progress_bar = None
+        self.maxtasks = 5
+        self.stopall = False
+        self.peano_clang_path = os.path.join(opts.peano_install_dir, "bin", "clang")
+        self.peano_opt_path = os.path.join(opts.peano_install_dir, "bin", "opt")
+        self.peano_llc_path = os.path.join(opts.peano_install_dir, "bin", "llc")
+        self.repeater_output_dir = opts.repeater_output_dir or tempfile.gettempdir()
+
+    def prepend_tmp(self, x):
+        return os.path.join(self.tmpdirname, x)
+
+    def pdi_file_name(self, device_name):
+        return (
+            opts.pdi_name.format(device_name)
+            if opts.pdi
+            else self.prepend_tmp(f"{device_name}.pdi")
+        )
+
+    def npu_insts_file_name(self, device_name, seq_name):
+        return (
+            opts.insts_name.format(device_name, seq_name)
+            if opts.npu
+            else self.prepend_tmp(f"{device_name}_{seq_name}.bin")
+        )
+
+    def run_passes(
+        self,
+        pass_pipeline,
+        mlir_module,
+        outputfile=None,
+        description=None,
+    ):
+        """
+        Run a pass pipeline on MLIR module object.
+
+        Args:
+            pass_pipeline: Pipeline string to execute
+            mlir_module: Input MLIR module object
+            outputfile: Optional output file path
+            description: Human-readable description of this pass stage
+        """
+        if self.opts.verbose:
+            print("Running:", pass_pipeline)
+        diags = []
+        mlir_for_error_report = None  # Will be set before pass execution
+
+        def diagnostic_handler(d):
+            severity = str(d.severity).replace("DiagnosticSeverity.", "").lower()
+            diags.append(f"{d.location}: {severity}: {d.message}")
+            for note in d.notes:
+                diags.append(f"{note.location}: note: {note.message}")
+            if severity == "error":
+                # Generate repeater script on error
+                if self.opts.enable_repeater:
+                    handle_pass_failure(
+                        pass_pipeline=pass_pipeline,
+                        mlir_ir=mlir_for_error_report,
+                        description=description,
+                        output_dir=self.repeater_output_dir,
+                        diagnostics=diags,
+                    )
+                for d in diags:
+                    print(d, file=sys.stderr)
+                return False
+            return True
+
+        with mlir_module.context, Location.unknown():
+            mlir_module.context.emit_error_diagnostics = True
+            h = mlir_module.context.attach_diagnostic_handler(diagnostic_handler)
+            mlir_for_error_report = str(mlir_module)  # Save IR before transformation
+            pm = PassManager.parse(pass_pipeline)
+            pm.run(mlir_module.operation)
+            h.detach()
+            for d in diags:
+                print(d)
+            if outputfile:
+                mlir_module_str = str(mlir_module)
+                with open(outputfile, "w") as g:
+                    g.write(mlir_module_str)
+        return mlir_module
+
+    async def do_call(self, task_id, command, force=False):
+        if self.stopall:
+            return
+
+        commandstr = " ".join(command)
+        if task_id:
+            self.progress_bar.update(task_id, advance=0, command=commandstr[0:30])
+        start = time.time()
+        if self.opts.verbose:
+            print(commandstr)
+        if self.opts.execute or force:
+            proc = await asyncio.create_subprocess_exec(
+                *command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+            )
+            stdout, stderr = await proc.communicate()
+            ret = proc.returncode
+            if self.opts.verbose and stdout:
+                print(f"{stdout.decode()}")
+            if ret != 0 and stderr:
+                print(f"{stderr.decode()}", file=sys.stderr)
+        else:
+            ret = 0
+        end = time.time()
+        if self.opts.verbose:
+            print(f"Done in {end - start:.3f} sec: {commandstr}")
+        self.runtimes[commandstr] = end - start
+        if task_id:
+            self.progress_bar.update(task_id, advance=1, command="")
+            self.maxtasks = max(
+                self.progress_bar._tasks[task_id].completed, self.maxtasks
+            )
+            self.progress_bar.update(task_id, total=self.maxtasks)
+
+        if ret != 0:
+            if task_id:
+                self.progress_bar.update(task_id, description="[red] Error")
+            print("Error encountered while running: " + commandstr, file=sys.stderr)
+            sys.exit(ret)
+
+    # In order to run xchesscc on modern ll code, we need a bunch of hacks.
+    async def chesshack(self, task, llvmir, aie_target):
+        llvmir_chesshack = llvmir + "chesshack.ll"
+        llvmir_chesslinked_path = llvmir + "chesslinked.ll"
+        if not self.opts.execute:
+            return llvmir_chesslinked_path
+
+        install_path = aie.compiler.aiecc.configure.install_path()
+        runtime_lib_path = os.path.join(install_path, "aie_runtime_lib")
+        chess_intrinsic_wrapper_ll_path = os.path.join(
+            runtime_lib_path, aie_target.upper(), "chess_intrinsic_wrapper.ll"
+        )
+
+        llvmir_ir = await read_file_async(llvmir)
+        llvmir_hacked_ir = downgrade_ir_for_chess(llvmir_ir)
+        await write_file_async(llvmir_hacked_ir, llvmir_chesshack)
+
+        if aie_target.casefold() == "AIE2".casefold():
+            target = "target_aie_ml"
+        elif aie_target.casefold() == "AIE2P".casefold():
+            target = "target_aie2p"
+        else:
+            target = "target"
+        assert os.path.exists(llvmir_chesshack)
+        await self.do_call(
+            task,
+            [
+                # The path below is cheating a bit since it refers directly to the AIE1
+                # version of llvm-link, rather than calling the architecture-specific
+                # tool version.
+                opts.aietools_path
+                + "/tps/lnx64/"
+                + target
+                + "/bin/LNa64bin/chess-llvm-link",
+                llvmir_chesshack,
+                chess_intrinsic_wrapper_ll_path,
+                "-S",
+                "-o",
+                llvmir_chesslinked_path,
+            ],
+        )
+
+        return llvmir_chesslinked_path
+
+    # In order to run peano on modern ll code, we need a bunch of hacks.
+    async def peanohack(self, llvmir):
+        llvmir_peanohack = llvmir + "peanohack.ll"
+        if not self.opts.execute:
+            return llvmir_peanohack
+
+        llvmir_ir = await read_file_async(llvmir)
+        llvmir_hacked_ir = downgrade_ir_for_peano(llvmir_ir)
+        llvmir_hacked_ir = drop_alignment_for_peano(llvmir_hacked_ir)
+        await write_file_async(llvmir_hacked_ir, llvmir_peanohack)
+
+        return llvmir_peanohack
+
+    async def process_cores(
+        self,
+        device_op,
+        device_name,
+        file_with_addresses,
+        aie_target,
+        aie_peano_target,
+        parent_task_id,
+    ):
+        # If unified compilation is on, we create a single object file that
+        # contains the compiled code for all cores. If not, the equivalent
+        # of the below is created for each core inside of process_core
+        # (singular).
+
+        # fmt: off
+        if opts.unified:
+            file_opt_with_addresses = self.prepend_tmp(f"{device_name}_input_opt_with_addresses.mlir")
+            with Context(), Location.unknown():
+                module = Module.parse(await read_file_async(file_with_addresses))
+            self.run_passes(
+                str(AIE_LOWER_TO_LLVM(device_name, aie_target=aie_target, opt_level=opts.opt_level)),
+                module,
+                outputfile=file_opt_with_addresses,
+                description=f"LLVM lowering for unified compilation of {device_name}",
+            )
+
+            file_llvmir = self.prepend_tmp(f"{device_name}_input.ll")
+            await self.do_call(parent_task_id, ["aie-translate", "--mlir-to-llvmir", file_opt_with_addresses, "-o", file_llvmir])
+
+            unified_file_core_obj = self.prepend_tmp(f"{device_name}_input.o")
+            if opts.compile and opts.xchesscc:
+                file_llvmir_hacked = await self.chesshack(parent_task_id, file_llvmir, aie_target)
+                await self.do_call(parent_task_id, ["xchesscc_wrapper", aie_target.lower(), "+w", self.prepend_tmp("work"), "-c", "-d", "+Wclang,-xir", "-f", file_llvmir_hacked, "-o", unified_file_core_obj])
+            elif opts.compile:
+                file_llvmir_hacked = await self.peanohack(file_llvmir)
+                file_llvmir_opt = self.prepend_tmp(f"{device_name}_input.opt.ll")
+                opt_level = opts.opt_level
+                # Disable loop idiom memset for O3 and above.
+                # Rationale: memset is executed as scalar operation, while
+                # zeroinitializer will be executed as vector.
+                # Cap opt at O1 to prevent LLVM's SLP vectorizer from
+                # creating sub-512-bit vector types (e.g., <4 x i8>) that
+                # crash the AIE2 GlobalISel legalizer. This is still needed
+                # for any scalar ops in the core (e.g., memref.copy loops).
+                safe_opt = min(int(opt_level), 1)
+                opt_flags = [f"--passes=default<O{safe_opt}>"]
+                if int(opt_level) >= 3:
+                    opt_flags.append("-disable-loop-idiom-memset")
+                opt_flags.extend(["-inline-threshold=10", "-S", file_llvmir_hacked, "-o", file_llvmir_opt])
+                await self.do_call(parent_task_id, [self.peano_opt_path] + opt_flags)
+                await self.do_call(parent_task_id, [self.peano_llc_path, file_llvmir_opt, f"-O{opt_level}", "--march=" + aie_target.lower(), "--function-sections", "--filetype=obj", "-o", unified_file_core_obj])
+        else:
+            unified_file_core_obj = None
+        # fmt: on
+
+        # Now, process each individual core.
+        processes = []
+        cores = generate_cores_list(device_op)
+        for core in cores:
+            processes.append(
+                self.process_core(
+                    device_name,
+                    core,
+                    aie_target,
+                    aie_peano_target,
+                    file_with_addresses,
+                    unified_file_core_obj,
+                    parent_task_id,
+                )
+            )
+        device_elf_paths = await asyncio.gather(*processes)
+        elf_paths = {}
+        for (col, row, _, _lf), elf_path in zip(cores, device_elf_paths):
+            elf_paths[(col, row)] = elf_path
+
+        # copy the elfs left by process_core to the tmpdir for process_cdo
+        for elf in glob.glob("*.elf"):
+            try:
+                shutil.copy(elf, self.tmpdirname)
+            except shutil.SameFileError:
+                pass
+        for elf_map in glob.glob("*.elf.map"):
+            try:
+                shutil.copy(elf_map, self.tmpdirname)
+            except shutil.SameFileError:
+                pass
+
+        return elf_paths
+
+    async def process_core(
+        self,
+        device_name,
+        core,
+        aie_target,
+        aie_peano_target,
+        file_with_addresses,
+        unified_file_core_obj,
+        parent_task_id,
+    ):
+        async with self.limit:
+            if self.stopall:
+                return
+
+            install_path = aie.compiler.aiecc.configure.install_path()
+            runtime_lib_path = os.path.join(
+                install_path, "aie_runtime_lib", aie_target.upper()
+            )
+
+            # --gc-sections to eliminate unneeded code.
+            # --orphan-handling=error to ensure that the linker script is as expected.
+            # If there are orphaned input sections, then they'd likely end up outside of the normal program memory.
+            clang_link_args = ["-Wl,--gc-sections", "-Wl,--orphan-handling=error"]
+
+            task = self.progress_bar.add_task(
+                "[yellow] Core (%d, %d)" % core[0:2],
+                total=self.maxtasks,
+                command="starting",
+            )
+
+            # fmt: off
+            corecol, corerow, elf_file, link_files = core
+
+            # Copy external .o files to tmpdir so linker can find them.
+            for lf in link_files:
+                src = lf if os.path.isabs(lf) else os.path.join(
+                    os.path.dirname(opts.filename) or os.getcwd(), lf)
+                dst = os.path.join(self.tmpdirname, os.path.basename(lf))
+                if src != dst:
+                    shutil.copy2(src, dst)
+
+            if not opts.unified:
+                file_opt_core = corefile(self.tmpdirname, device_name, core, "opt.mlir")
+                with Context(), Location.unknown():
+                    module = Module.parse(await read_file_async(file_with_addresses))
+                self.run_passes(
+                    str(AIE_LOWER_TO_LLVM(device_name, corecol, corerow, opts.opt_level)),
+                    module,
+                    outputfile=file_opt_core,
+                    description=f"LLVM lowering for core ({corecol}, {corerow}) of {device_name}",
+                )
+            if self.opts.xbridge:
+                file_core_bcf = corefile(self.tmpdirname, device_name, core, "bcf")
+                await self.do_call(task, ["aie-translate", file_with_addresses, "--aie-generate-bcf", "--aie-device-name", device_name, "--tilecol=%d" % corecol, "--tilerow=%d" % corerow, "-o", file_core_bcf])
             else:
-                cmd.extend(args)
+                file_core_ldscript = corefile(self.tmpdirname, device_name, core, "ld.script")
+                await self.do_call(task, ["aie-translate", file_with_addresses, "--aie-generate-ldscript", "--aie-device-name", device_name, "--tilecol=%d" % corecol, "--tilerow=%d" % corerow, "-o", file_core_ldscript])
+            if not self.opts.unified:
+                file_core_llvmir = corefile(self.tmpdirname, device_name, core, "ll")
+                await self.do_call(task, ["aie-translate", "--mlir-to-llvmir", file_opt_core, "-o", file_core_llvmir])
+                file_core_obj = corefile(self.tmpdirname, device_name, core, "o")
+
+            file_core_elf = elf_file if elf_file else corefile(self.tmpdirname, device_name, core, "elf")
+
+            if opts.compile and opts.xchesscc:
+                if not opts.unified:
+                    file_core_llvmir_chesslinked = await self.chesshack(task, file_core_llvmir, aie_target)
+                    if self.opts.link and self.opts.xbridge:
+                        link_with_obj = await extract_input_files(file_core_bcf)
+                        await self.do_call(task, ["xchesscc_wrapper", aie_target.lower(), "+w", self.prepend_tmp("work"), "-d", "+Wclang,-xir", "-f", file_core_llvmir_chesslinked, link_with_obj, "+l", file_core_bcf, "-o", file_core_elf])
+                    elif self.opts.link:
+                        await self.do_call(task, ["xchesscc_wrapper", aie_target.lower(), "+w", self.prepend_tmp("work"), "-c", "-d", "+Wclang,-xir", "-f", file_core_llvmir_chesslinked, "-o", file_core_obj])
+                        opt_level = opts.opt_level
+                        await self.do_call(task, [self.peano_clang_path, f"-O{opt_level}", "--target=" + aie_peano_target, file_core_obj, *clang_link_args, "-Wl,-T," + file_core_ldscript, "-o", file_core_elf])
+                else:
+                    file_core_obj = unified_file_core_obj
+                    if opts.link and opts.xbridge:
+                        link_with_obj = await extract_input_files(file_core_bcf)
+                        await self.do_call(task, ["xchesscc_wrapper", aie_target.lower(), "+w", self.prepend_tmp("work"), "-d", "-f", file_core_obj, link_with_obj, "+l", file_core_bcf, "-o", file_core_elf])
+                    elif opts.link:
+                        opt_level = opts.opt_level
+                        await self.do_call(task, [self.peano_clang_path, f"-O{opt_level}", "--target=" + aie_peano_target, file_core_obj, *clang_link_args, "-Wl,-T," + file_core_ldscript, "-o", file_core_elf])
+
+            elif opts.compile:
+                if not opts.unified:
+                    file_core_llvmir_peanohacked = await self.peanohack(file_core_llvmir)
+                    file_core_llvmir_stripped = corefile(self.tmpdirname, device_name, core, "stripped.ll")
+                    opt_level = opts.opt_level
+                    # Disable loop idiom memset for O3 and above.
+                    # Rationale: memset is executed as scalar operation, while
+                    # zeroinitializer will be executed as vector.
+                    # Cap opt at O1 to prevent LLVM's SLP vectorizer from
+                    # creating sub-512-bit vector types (e.g., <4 x i8>) that
+                    # crash the AIE2 GlobalISel legalizer.
+                    safe_opt = min(int(opt_level), 1)
+                    opt_flags = [f"--passes=default<O{safe_opt}>,strip"]
+                    if int(opt_level) >= 3:
+                        opt_flags.append("-disable-loop-idiom-memset")
+                    opt_flags.extend(["-S", file_core_llvmir_peanohacked, "-o", file_core_llvmir_stripped])
+                    await self.do_call(task, [self.peano_opt_path] + opt_flags)
+                    await self.do_call(task, [self.peano_llc_path, file_core_llvmir_stripped, f"-O{opt_level}", "--march=" + aie_target.lower(), "--function-sections", "--filetype=obj", "-o", file_core_obj])
+                else:
+                    file_core_obj = unified_file_core_obj
+
+                if opts.link and opts.xbridge:
+                    link_with_obj = await extract_input_files(file_core_bcf)
+                    await self.do_call(task, ["xchesscc_wrapper", aie_target.lower(), "+w", self.prepend_tmp("work"), "-d", "-f", file_core_obj, link_with_obj, "+l", file_core_bcf, "-o", file_core_elf])
+                elif opts.link:
+                    opt_level = opts.opt_level
+                    await self.do_call(task, [self.peano_clang_path, f"-O{opt_level}", "--target=" + aie_peano_target, file_core_obj, *clang_link_args, "-Wl,-T," + file_core_ldscript, "-o", file_core_elf])
+
+            self.progress_bar.update(parent_task_id, advance=1)
+            self.progress_bar.update(task, advance=0, visible=False)
+            # fmt: on
+
+            return file_core_elf
+
+    async def write_elf_paths_to_mlir(self, input_physical, elf_paths):
+        # After core ELF files are generated, we create a new MLIR file with
+        # references to those generated files in place of their IR.
+        with Context(), Location.unknown():
+            input_physical_with_elfs_module = Module.parse(
+                await read_file_async(input_physical)
+            )
+            for device in find_ops(
+                input_physical_with_elfs_module.operation,
+                lambda o: isinstance(o.operation.opview, aiedialect.DeviceOp),
+            ):
+                device_name = device.sym_name.value
+                if device_name not in elf_paths:
+                    continue
+
+                for core in find_ops(
+                    device, lambda o: isinstance(o.operation.opview, aiedialect.CoreOp)
+                ):
+                    col = core.tile.owner.opview.col.value
+                    row = core.tile.owner.opview.row.value
+                    if (col, row) not in elf_paths[device_name]:
+                        continue
+
+                    set_elf_file_for_core(core, elf_paths[device_name][(col, row)])
+
+            input_physical_with_elfs_str = str(input_physical_with_elfs_module)
+            input_physical_with_elfs = self.prepend_tmp("input_physical_with_elfs.mlir")
 
-        result = subprocess.run(cmd, capture_output=True, text=True)
+            with open(input_physical_with_elfs, "w") as f:
+                f.write(input_physical_with_elfs_str)
+            return input_physical_with_elfs
 
-        if result.returncode != 0:
-            error_msg = result.stderr if result.stderr else result.stdout
-            raise RuntimeError(
-                f"aiecc failed with exit code {result.returncode}: {error_msg}"
+    async def process_cdo(self, module_str, device_name):
+        with Context(), Location.unknown():
+            input_physical = Module.parse(module_str)
+            aiedialect.generate_cdo(
+                input_physical.operation, self.tmpdirname, device_name
             )
 
-        return result.stdout
-    finally:
-        try:
-            os.unlink(mlir_path)
-        except OSError:
-            pass
+    async def process_txn(self, module, device_name):
+        file_txn = self.prepend_tmp(f"{device_name}_txn.mlir")
+        self.run_passes(
+            f"builtin.module(aie.device(convert-aie-to-transaction{{device-name={device_name} elf-dir={self.tmpdirname}}}))",
+            module,
+            outputfile=file_txn,
+            description=f"Transaction binary generation for {device_name}",
+        )
+        txn_dest = opts.txn_name.format(device_name)
+        if opts.verbose:
+            print(f"copy {file_txn} to {txn_dest}")
+        shutil.copy(file_txn, txn_dest)
+        return file_txn
 
+    async def aiebu_asm(
+        self, input_file, output_file, ctrl_packet_file=None, ctrl_packet_idx=0
+    ):
+        asm_bin = find_aiebu_asm()
 
-if __name__ == "__main__":
-    main()
+        args = [
+            asm_bin,
+            "-t",
+            "aie2txn",
+            "-c",
+            input_file,
+            "-o",
+            output_file,
+        ]
+
+        if ctrl_packet_file:
+            ctrl_packet_size = os.path.getsize(ctrl_packet_file)
+            exteral_buffers_json = {
+                "external_buffers": {
+                    "buffer_ctrl": {
+                        "xrt_id": ctrl_packet_idx,
+                        "logical_id": -1,
+                        "size_in_bytes": ctrl_packet_size,
+                        "ctrl_pkt_buffer": 1,
+                        "name": "runtime_control_packet",
+                    },
+                }
+            }
+            with open(self.prepend_tmp("external_buffers.json"), "w") as f:
+                json.dump(exteral_buffers_json, f, indent=2)
+            args = args + [
+                "-j",
+                self.prepend_tmp("external_buffers.json"),
+                "-p",
+                ctrl_packet_file,
+            ]
+
+        await self.do_call(None, args)
+
+    async def generate_full_elf_config_json(
+        self, devices, device_to_id_mapping, opts, parent_task=None
+    ):
+        config = {"xrt-kernels": []}
+
+        for device_op, device_name in devices:
+            sequences = generate_runtime_sequences_list(device_op)
+
+            # Skip devices with no runtime sequences (e.g., @empty device)
+            if not sequences:
+                continue
+
+            max_arg_count = max(
+                len(seq_op.body.blocks[0].arguments) for seq_op, seq_name in sequences
+            )
+            arguments = [
+                {"name": f"arg_{i}", "type": "char *", "offset": hex(i * 8)}
+                for i in range(max_arg_count)
+            ]
+
+            kernel_entry = {
+                "name": device_name,
+                "arguments": arguments,
+                "instance": [],
+                "PDIs": [],
+            }
+
+            for other_device_name, other_pdi_id in device_to_id_mapping.items():
+                pdi_filename = self.pdi_file_name(other_device_name)
+                kernel_entry["PDIs"].append(
+                    {"id": other_pdi_id, "PDI_file": pdi_filename}
+                )
+
+            for seq_op, seq_name in sequences:
+                insts_filename = self.npu_insts_file_name(device_name, seq_name)
+                kernel_entry["instance"].append(
+                    {"id": seq_name, "TXN_ctrl_code_file": insts_filename}
+                )
+
+            config["xrt-kernels"].append(kernel_entry)
+
+        return config
+
+    async def assemble_full_elf(
+        self, config_json_path, output_elf_path, parent_task=None
+    ):
+        asm_bin = find_aiebu_asm()
+        args = [
+            asm_bin,
+            "-t",
+            "aie2_config",
+            "-j",
+            config_json_path,
+            "-o",
+            output_elf_path,
+        ]
+        await self.do_call(parent_task, args)
+        if self.opts.verbose:
+            print(f"Generated full ELF: {output_elf_path}")
+
+    async def generate_full_elf(self, devices, device_to_id_mapping, parent_task=None):
+        """Generate config.json, then invoke aiebu-asm on it to assemble the full ELF."""
+        if parent_task:
+            self.progress_bar.update(
+                parent_task, advance=0, command="Generating config.json"
+            )
+        config = await self.generate_full_elf_config_json(
+            devices, device_to_id_mapping, self.opts, parent_task
+        )
+        config_json_path = self.prepend_tmp("config.json")
+        await write_file_async(json.dumps(config, indent=2), config_json_path)
+        # Only echo the generated path when verbose output was requested.
+        if self.opts.verbose:
+            print(f"Generated config.json: {config_json_path}")
+        if parent_task:
+            self.progress_bar.update(
+                parent_task, advance=1, command="Generating config.json"
+            )
+        full_elf_path = self.opts.full_elf_name or "aie.elf"
+        await self.assemble_full_elf(config_json_path, full_elf_path, parent_task)
+
+    async def process_ctrlpkt(self, module, device_op, device_name):
+        file_ctrlpkt_mlir = self.prepend_tmp(f"{device_name}_ctrlpkt.mlir")
+        file_ctrlpkt_bin = opts.ctrlpkt_name.format(device_name)
+        file_ctrlpkt_dma_seq_mlir = self.prepend_tmp(
+            f"{device_name}_ctrlpkt_dma_seq.mlir"
+        )
+        ctrlpkt_module = self.run_passes(
+            "builtin.module(aie.device(convert-aie-to-transaction{elf-dir="
+            + self.tmpdirname
+            + "},aie-txn-to-ctrl-packet,aie-legalize-ctrl-packet))",
+            module,
+            outputfile=file_ctrlpkt_mlir,
+            description="Transaction binary to control packet conversion",
+        )
+
+        # aie-translate --aie-ctrlpkt-to-bin -o ctrlpkt.bin
+        with ctrlpkt_module.context, Location.unknown():
+            ctrlpkt_bin = aiedialect.generate_control_packets(
+                ctrlpkt_module.operation, device_name
+            )
+        with open(file_ctrlpkt_bin, "wb") as f:
+            f.write(struct.pack("I" * len(ctrlpkt_bin), *ctrlpkt_bin))
+
+        # aie-opt --aie-ctrl-packet-to-dma -aie-dma-to-npu
+        ctrl_seq_module = self.run_passes(
+            "builtin.module(aie.device(aie-ctrl-packet-to-dma,aie-dma-to-npu))",
+            ctrlpkt_module,
+            outputfile=file_ctrlpkt_dma_seq_mlir,
+            description="Control packet to DMA sequence conversion",
+        )
+
+        # aie-translate --aie-npu-to-binary -o npu_insts.bin
+        with ctrl_seq_module.context, Location.unknown():
+            insts_bin = aiedialect.translate_npu_to_binary(
+                ctrl_seq_module.operation, device_name, opts.sequence_name
+            )
+        with open(opts.insts_name.format(device_name, "seq"), "wb") as f:
+            f.write(struct.pack("I" * len(insts_bin), *insts_bin))
+
+        ctrl_idx = 0
+        with Context(), Location.unknown():
+            # walk the device to find runtime sequence
+            seqs = find_ops(
+                device_op.operation,
+                lambda o: isinstance(o.operation.opview, aiexdialect.RuntimeSequenceOp),
+            )
+            if seqs:
+                ctrl_idx = len(seqs[0].regions[0].blocks[0].arguments.types)
+        await self.aiebu_asm(
+            opts.insts_name.format(device_name, "seq"),
+            opts.elf_name.format(device_name),
+            file_ctrlpkt_bin,
+            ctrl_idx,
+        )
+
+    async def process_elf(self, npu_insts_module, device_name):
+        # translate npu instructions to binary and write to file
+        npu_insts = aiedialect.translate_npu_to_binary(
+            npu_insts_module.operation, device_name, opts.sequence_name
+        )
+
+        npu_insts_bin = self.prepend_tmp(f"{device_name}_elf_insts.bin")
+        with open(npu_insts_bin, "wb") as f:
+            f.write(struct.pack("I" * len(npu_insts), *npu_insts))
+
+        await self.aiebu_asm(npu_insts_bin, opts.elf_name.format(device_name))
+
+    async def process_pdi_gen(self, device_name, file_design_pdi):
+        file_design_bif = self.prepend_tmp(f"{device_name}_design.bif")
+
+        await write_file_async(
+            emit_design_bif(self.tmpdirname, device_name), file_design_bif
+        )
+
+        await self.do_call(
+            None,
+            [
+                "bootgen",
+                "-arch",
+                "versal",
+                "-image",
+                file_design_bif,
+                "-o",
+                file_design_pdi,
+                "-w",
+            ],
+        )
+
+    # generate an xclbin. The inputs are self.mlir_module_str and the cdo
+    # binaries from the process_cdo step.
+    async def process_xclbin_gen(self, device_op, device_name):
+        task = self.progress_bar.add_task(
+            "[yellow] XCLBIN generation ", total=10, command="starting"
+        )
+
+        file_mem_topology = self.prepend_tmp(f"{device_name}_mem_topology.json")
+        file_partition = self.prepend_tmp(f"{device_name}_aie_partition.json")
+        file_input_partition = self.prepend_tmp(
+            f"{device_name}_aie_input_partition.json"
+        )
+        file_kernels = self.prepend_tmp(f"{device_name}_kernels.json")
+        file_pdi = self.pdi_file_name(device_name)
+
+        # collect the tasks to generate the inputs to xclbinutil
+        processes = []
+
+        # generate mem_topology.json
+        processes.append(
+            write_file_async(json.dumps(mem_topology, indent=2), file_mem_topology)
+        )
+
+        # generate aie_partition.json
+        processes.append(
+            write_file_async(
+                json.dumps(
+                    emit_partition(
+                        self.mlir_module_str, device_op, file_pdi, opts.kernel_id
+                    ),
+                    indent=2,
+                ),
+                file_partition,
+            )
+        )
+
+        # generate kernels.json
+        buffer_arg_names = [f"bo{i}" for i in range(5)]
+        processes.append(
+            write_file_async(
+                json.dumps(
+                    emit_design_kernel_json(
+                        opts.kernel_name,
+                        opts.kernel_id,
+                        opts.instance_name,
+                        buffer_arg_names,
+                    ),
+                    indent=2,
+                ),
+                file_kernels,
+            )
+        )
+
+        # generate pdi
+        processes.append(self.process_pdi_gen(device_name, file_pdi))
+
+        # get partition info from input xclbin, if present
+        if opts.xclbin_input:
+            processes.append(
+                self.do_call(
+                    task,
+                    [
+                        "xclbinutil",
+                        "--dump-section",
+                        f"AIE_PARTITION:JSON:{file_input_partition}",
+                        "--force",
+                        "--quiet",
+                        "--input",
+                        opts.xclbin_input,
+                    ],
+                )
+            )
+
+        # wait for all of the above to finish
+        await asyncio.gather(*processes)
+
+        # fmt: off
+        if opts.xclbin_input:
+            # patch the input partition json with the new partition information
+            with open(file_input_partition) as f:
+                input_partition = json.load(f)
+            with open(file_partition) as f:
+                new_partition = json.load(f)
+            input_partition["aie_partition"]["PDIs"].append(new_partition["aie_partition"]["PDIs"][0])
+            with open(file_partition, "w") as f:
+                json.dump(input_partition, f, indent=2)
+            flag = ['--input', opts.xclbin_input]
+        else:
+            flag = ["--add-replace-section", "MEM_TOPOLOGY:JSON:" + file_mem_topology]
+
+        # run xclbinutil to generate the xclbin
+        await self.do_call(task, ["xclbinutil"] + flag +
+                                 ["--add-kernel", file_kernels,
+                                  "--add-replace-section", "AIE_PARTITION:JSON:" + file_partition,
+                                  "--force", "--quiet", "--output", opts.xclbin_name.format(device_name)])
+        # fmt: on
+
+    async def process_host_cgen(self, aie_target, file_physical_with_elfs, device_name):
+        async with self.limit:
+            if self.stopall:
+                return
+
+            task = self.progress_bar.add_task(
+                "[yellow] Host compilation ", total=10, command="starting"
+            )
+
+            if opts.link_against_hsa:
+                file_inc_cpp = self.prepend_tmp("aie_data_movement.cpp")
+                await self.do_call(
+                    task,
+                    [
+                        "aie-translate",
+                        "--aie-generate-hsa",
+                        "--aie-device-name",
+                        device_name,
+                        file_physical_with_elfs,
+                        "-o",
+                        file_inc_cpp,
+                    ],
+                )
+
+            cmd = ["clang++", "-std=c++17"]
+            if opts.host_target:
+                cmd += ["--target=" + opts.host_target]
+                if (
+                    opts.aiesim
+                    and opts.host_target
+                    != aie.compiler.aiecc.configure.host_architecture
+                ):
+                    sys.exit(
+                        "Host cross-compile from "
+                        + aie.compiler.aiecc.configure.host_architecture
+                        + " to --target="
+                        + opts.host_target
+                        + " is not supported with --aiesim"
+                    )
+
+            if self.opts.sysroot:
+                cmd += ["--sysroot=" + opts.sysroot]
+                # In order to find the toolchain in the sysroot, we need to have
+                # a 'target' that includes 'linux' and for the 'lib/gcc/$target/$version'
+                # directory to have a corresponding 'include/gcc/$target/$version'.
+                # In some of our sysroots, it seems that we find a lib/gcc, but it
+                # doesn't have a corresponding include/gcc directory.  Instead
+                # force using '/usr/lib,include/gcc'
+                if opts.host_target == "aarch64-linux-gnu":
+                    cmd += [f"--gcc-toolchain={opts.sysroot}/usr"]
+                    # It looks like the G++ distribution is non standard, so add
+                    # an explicit handling of C++ library.
+                    # Perhaps related to https://discourse.llvm.org/t/add-gcc-install-dir-deprecate-gcc-toolchain-and-remove-gcc-install-prefix/65091/23
+                    cxx_include = glob.glob(f"{opts.sysroot}/usr/include/c++/*.*.*")[0]
+                    triple = os.path.basename(opts.sysroot)
+                    cmd += [f"-I{cxx_include}", f"-I{cxx_include}/{triple}"]
+                    gcc_lib = glob.glob(f"{opts.sysroot}/usr/lib/{triple}/*.*.*")[0]
+                    cmd += [f"-B{gcc_lib}", f"-L{gcc_lib}"]
+            install_path = aie.compiler.aiecc.configure.install_path()
+
+            # Setting everything up if linking against HSA
+            if opts.link_against_hsa:
+                cmd += ["-DHSA_RUNTIME"]
+                arch_name = opts.host_target.split("-")[0] + "-hsa"
+                hsa_path = os.path.join(aie.compiler.aiecc.configure.hsa_dir)
+                hsa_include_path = os.path.join(hsa_path, "..", "..", "..", "include")
+                hsa_lib_path = os.path.join(hsa_path, "..", "..")
+                hsa_so_path = os.path.join(hsa_lib_path, "libhsa-runtime64.so")
+            else:
+                arch_name = opts.host_target.split("-")[0]
+
+            # Getting a pointer to the libxaie include and library
+            runtime_xaiengine_path = os.path.join(
+                install_path, "runtime_lib", arch_name, "xaiengine"
+            )
+            xaiengine_include_path = os.path.join(runtime_xaiengine_path, "include")
+            xaiengine_lib_path = os.path.join(runtime_xaiengine_path, "lib")
+
+            # Getting a pointer to the library test_lib
+            runtime_testlib_path = os.path.join(
+                install_path,
+                "runtime_lib",
+                arch_name,
+                "test_lib",
+                "lib",
+            )
+
+            # Linking against the correct memory allocator
+            if opts.link_against_hsa:
+                memory_allocator = os.path.join(
+                    runtime_testlib_path, "libmemory_allocator_hsa.a"
+                )
+            else:
+                memory_allocator = os.path.join(
+                    runtime_testlib_path, "libmemory_allocator_ion.a"
+                )
+
+            cmd += [
+                memory_allocator,
+                "-I" + xaiengine_include_path,
+                "-L" + xaiengine_lib_path,
+                "-Wl,-R" + xaiengine_lib_path,
+                "-I" + self.tmpdirname,
+                "-fuse-ld=lld",
+                "-lm",
+                "-lxaienginecdo",
+            ]
+            # Linking against HSA
+            if opts.link_against_hsa:
+                cmd += [hsa_so_path]
+                cmd += ["-I%s" % hsa_include_path]
+                cmd += ["-Wl,-rpath,%s" % hsa_lib_path]
+
+            cmd += aie_target_defines(aie_target)
+
+            if len(opts.host_args) > 0:
+                await self.do_call(task, cmd + opts.host_args)
+
+            self.progress_bar.update(self.progress_bar.task_completed, advance=1)
+            self.progress_bar.update(task, advance=0, visible=False)
+
+    async def gen_sim(self, task, aie_target, file_physical, device_name):
+        # For simulation, we need to additionally parse the 'remaining' options to avoid things
+        # which conflict with the options below (e.g. -o)
+        host_opts = aie.compiler.aiecc.cl_arguments.strip_host_args_for_aiesim(
+            opts.host_args
+        )
+
+        sim_dir = self.prepend_tmp("sim")
+        shutil.rmtree(sim_dir, ignore_errors=True)
+        subdirs = ["arch", "reports", "config", "ps"]
+
+        def make_sim_dir(x):
+            dir = os.path.join(sim_dir, x)
+            os.makedirs(dir, exist_ok=True)
+            return dir
+
+        sim_arch_dir, sim_reports_dir, sim_config_dir, sim_ps_dir = map(
+            make_sim_dir, subdirs
+        )
+
+        install_path = aie.compiler.aiecc.configure.install_path()
+
+        # Setting everything up if linking against HSA
+        if opts.link_against_hsa:
+            arch_name = opts.host_target.split("-")[0] + "-hsa"
+        else:
+            arch_name = opts.host_target.split("-")[0]
+
+        runtime_simlib_path = os.path.join(
+            install_path, "aie_runtime_lib", aie_target.upper(), "aiesim"
+        )
+        runtime_testlib_path = os.path.join(
+            install_path,
+            "runtime_lib",
+            arch_name,
+            "test_lib",
+            "lib",
+        )
+        runtime_testlib_include_path = os.path.join(
+            install_path,
+            "runtime_lib",
+            arch_name,
+            "test_lib",
+            "include",
+        )
+        sim_genwrapper = os.path.join(runtime_simlib_path, "genwrapper_for_ps.cpp")
+        memory_allocator = os.path.join(
+            runtime_testlib_path, "libmemory_allocator_sim_aie.a"
+        )
+        # Getting a pointer to the libxaie include and library
+        runtime_xaiengine_path = os.path.join(
+            install_path, "runtime_lib", arch_name, "xaiengine"
+        )
+        xaiengine_include_path = os.path.join(runtime_xaiengine_path, "include")
+        xaiengine_lib_path = os.path.join(runtime_xaiengine_path, "lib")
+        sim_cc_args = [
+            "-fPIC",
+            "-flto",
+            "-fpermissive",
+            "-DAIE_OPTION_SCALAR_FLOAT_ON_VECTOR",
+            "-Wno-deprecated-declarations",
+            "-Wno-enum-constexpr-conversion",
+            "-Wno-format-security",
+            "-DSC_INCLUDE_DYNAMIC_PROCESSES",
+            "-D__AIESIM__",
+            "-D__PS_INIT_AIE__",
+            "-Og",
+            "-Dmain(...)=ps_main(...)",
+            "-I" + self.tmpdirname,
+            "-I" + opts.aietools_path + "/include",
+            "-I" + xaiengine_include_path,
+            "-I" + opts.aietools_path + "/data/osci_systemc/include",
+            "-I" + opts.aietools_path + "/include/xtlm/include",
+            "-I" + opts.aietools_path + "/include/common_cpp/common_cpp_v1_0/include",
+            "-I" + runtime_testlib_include_path,
+            memory_allocator,
+        ]  # clang is picky  # Pickup aie_inc.cpp
+
+        sim_link_args = [
+            "-L" + xaiengine_lib_path,
+            "-lxaienginecdo",
+            "-L" + opts.aietools_path + "/lib/lnx64.o",
+            "-L" + opts.aietools_path + "/lib/lnx64.o/Ubuntu",
+            "-L" + opts.aietools_path + "/data/osci_systemc/lib/lnx64",
+            "-Wl,--as-needed",
+            "-lsystemc",
+            "-lxtlm",
+        ]
+
+        processes = []
+        processes.append(
+            self.do_call(
+                task,
+                [
+                    "aie-translate",
+                    "--aie-mlir-to-xpe",
+                    "--aie-device-name",
+                    device_name,
+                    file_physical,
+                    "-o",
+                    os.path.join(sim_reports_dir, "graph.xpe"),
+                ],
+            )
+        )
+        processes.append(
+            self.do_call(
+                task,
+                [
+                    "aie-translate",
+                    "--aie-mlir-to-shim-solution",
+                    "--aie-device-name",
+                    device_name,
+                    file_physical,
+                    "-o",
+                    os.path.join(sim_arch_dir, "aieshim_solution.aiesol"),
+                ],
+            )
+        )
+        processes.append(
+            self.do_call(
+                task,
+                [
+                    "aie-translate",
+                    "--aie-mlir-to-scsim-config",
+                    "--aie-device-name",
+                    device_name,
+                    file_physical,
+                    "-o",
+                    os.path.join(sim_config_dir, "scsim_config.json"),
+                ],
+            )
+        )
+
+        flows_output = os.path.join(sim_dir, "flows_physical.mlir")
+        with Context(), Location.unknown():
+            module = Module.parse(await read_file_async(file_physical))
+        self.run_passes(
+            "builtin.module(aie.device(aie-find-flows))",
+            module,
+            outputfile=flows_output,
+            description="Finding flows for simulation",
+        )
+
+        processes.append(
+            self.do_call(
+                task,
+                [
+                    "clang++",
+                    "-O2",
+                    "-fuse-ld=lld",
+                    "-shared",
+                    "-o",
+                    os.path.join(sim_ps_dir, "ps.so"),
+                    sim_genwrapper,
+                    *aie_target_defines(aie_target),
+                    *host_opts,
+                    *sim_cc_args,
+                    *sim_link_args,
+                ],
+            )
+        )
+        await asyncio.gather(*processes)
+        await self.do_call(
+            task,
+            [
+                "aie-translate",
+                "--aie-device-name",
+                device_name,
+                "--aie-flows-to-json",
+                os.path.join(sim_dir, "flows_physical.mlir"),
+                "-o",
+                os.path.join(sim_dir, "flows_physical.json"),
+            ],
+        )
+
+        sim_script = self.prepend_tmp("aiesim.sh")
+        sim_script_template = dedent("""\
+            #!/bin/sh
+            prj_name=$(basename $(dirname $(realpath $0)))
+            root=$(dirname $(dirname $(realpath $0)))
+            vcd_filename=foo
+            if [ -n "$1" ]; then
+              vcd_filename=$1
+            fi
+            cd $root
+            aiesimulator --pkg-dir=${prj_name}/sim --dump-vcd ${vcd_filename}
+            """)
+        with open(sim_script, "wt") as sim_script_file:
+            sim_script_file.write(sim_script_template)
+        stats = os.stat(sim_script)
+        os.chmod(sim_script, stats.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
+
+        target = os.path.join(sim_dir, ".target")
+        with open(target, "wt") as target_file:
+            target_file.write("hw\n")
+
+        print("Simulation generated...")
+        print("To run simulation: " + sim_script)
+
+    async def get_aie_target_for_device(self, mlir_input_file, device_name):
+        t = do_run(
+            [
+                "aie-translate",
+                "--aie-generate-target-arch",
+                "--aie-device-name",
+                device_name,
+                mlir_input_file,
+            ],
+            self.opts.verbose,
+        )
+        aie_target = t.stdout.strip()
+        return (aie_target, get_peano_target(aie_target))
+
+    async def run_flow(self):
+        # First, we run some aie-opt passes that transform the MLIR for every
+        # device. Then, we generate the core code for each AIE core tile in
+        # every device. The result of this is an ELF file with each core's
+        # code; we generate a new MLIR file which references those generated
+        # ELF files in place of their IR code. We then generate artifacts for
+        # each device individually, using this last generated IR.
+
+        nworkers = int(opts.nthreads)
+        if nworkers == 0:
+            nworkers = os.cpu_count()
+
+        module = parse_file_as_mlir(self.mlir_module_str)
+
+        self.limit = asyncio.Semaphore(nworkers)
+        with progress.Progress(
+            *progress.Progress.get_default_columns(),
+            progress.TimeElapsedColumn(),
+            progress.MofNCompleteColumn(),
+            progress.TextColumn("{task.fields[command]}"),
+            redirect_stdout=False,
+            redirect_stderr=False,
+            disable=not opts.progress,
+        ) as progress_bar:
+            self.progress_bar = progress_bar
+
+            # 1.) MLIR transformations
+
+            task1 = progress_bar.add_task(
+                "[green] MLIR compilation", total=3, command="1 Worker"
+            )
+
+            self.progress_bar.update(task1, advance=1, command="Generating device list")
+            devices = generate_devices_list(module)
+            if len(devices) == 0:
+                print("error: input MLIR must contain at least one aie.device")
+                sys.exit(1)
+            aie_targets, aie_peano_targets = [], []
+            for device_op, device_name in devices:
+                aie_target, aie_peano_target = await self.get_aie_target_for_device(
+                    opts.filename, device_name
+                )
+                aie_targets.append(aie_target)
+                aie_peano_targets.append(aie_peano_target)
+
+            if len(aie_targets) == 0 or not all(
+                aie_target == aie_targets[0] for aie_target in aie_targets
+            ):
+                print("error: all device targets in the file must be the same")
+                # TODO: remove this restriction? currently only needed by AIEVec
+                sys.exit(1)
+            aie_target, aie_peano_target = aie_targets[0], aie_peano_targets[0]
+
+            pass_pipeline = INPUT_WITH_ADDRESSES_PIPELINE(
+                opts.alloc_scheme,
+                opts.dynamic_objFifos,
+                opts.packet_sw_objFifos,
+                opts.ctrl_pkt_overlay,
+                aie_target,
+                opts.opt_level,
+            ).materialize(module=True)
+
+            self.progress_bar.update(task1, advance=1, command=pass_pipeline[0:30])
+            file_with_addresses = self.prepend_tmp("input_with_addresses.mlir")
+            file_with_addresses_module = self.run_passes(
+                pass_pipeline,
+                module,
+                outputfile=file_with_addresses,
+                description="Resource allocation and Object FIFO lowering",
+            )
+
+            requires_routing = (
+                opts.xcl
+                or opts.cdo
+                or opts.pdi
+                or opts.compile
+                or opts.compile_host
+                or opts.aiesim
+            )
+            if requires_routing:
+                input_physical = self.prepend_tmp("input_physical.mlir")
+                self.run_passes(
+                    "builtin.module(aie.device(aie-create-pathfinder-flows))",
+                    file_with_addresses_module,
+                    outputfile=input_physical,
+                    description="Running Router",
+                )
+            else:
+                input_physical = file_with_addresses
+
+            self.progress_bar.update(task1, advance=1)
+
+            # 2.) Generate code for each core
+            requires_core_compilation = (
+                opts.xcl
+                or opts.cdo
+                or opts.pdi
+                or opts.compile
+                or opts.compile_host
+                or opts.aiesim
+            )
+            if requires_core_compilation:
+                task2 = progress_bar.add_task(
+                    "[green] Generating code for each core", total=3, command=""
+                )
+
+                # create core ELF files for each device and core
+                elf_paths = {}
+                for i, (device_op, device_name) in enumerate(devices):
+                    aie_target, aie_peano_target = aie_targets[i], aie_peano_targets[i]
+                    elf_paths[device_name] = await self.process_cores(
+                        device_op,
+                        device_name,
+                        file_with_addresses,
+                        aie_target,
+                        aie_peano_target,
+                        task2,
+                    )
+                input_physical_with_elfs = await self.write_elf_paths_to_mlir(
+                    input_physical, elf_paths
+                )
+            else:
+                input_physical_with_elfs = input_physical
+
+            # 3.) Targets that require the cores to be lowered but apply across all devices
+
+            npu_insts_module = None
+            if opts.npu or opts.elf or opts.generate_full_elf and not opts.ctrlpkt:
+                task3 = progress_bar.add_task(
+                    "[green] Lowering NPU instructions", total=2, command=""
+                )
+                with Context(), Location.unknown():
+                    input_physical_with_elfs_module = Module.parse(
+                        await read_file_async(input_physical_with_elfs)
+                    )
+                    npu_pipeline = _create_npu_lowering_pipeline(opts.expand_load_pdis)
+                    pass_pipeline = npu_pipeline.materialize(module=True)
+                    npu_insts_file = self.prepend_tmp(f"npu_insts.mlir")
+                    self.progress_bar.update(
+                        task3, advance=1, command=pass_pipeline[0:30]
+                    )
+                    npu_insts_module = self.run_passes(
+                        pass_pipeline,
+                        input_physical_with_elfs_module,
+                        npu_insts_file,
+                        description="NPU instruction lowering",
+                    )
+
+                    # If expand_load_pdis is enabled, the pass may have created new devices
+                    # (e.g., @empty), so we need to regenerate the device list from the transformed module
+                    if opts.expand_load_pdis:
+                        devices = generate_devices_list(npu_insts_module)
+                        input_physical_with_expanded = self.prepend_tmp(
+                            "input_physical_with_expanded.mlir"
+                        )
+                        await write_file_async(
+                            str(npu_insts_module), input_physical_with_expanded
+                        )
+                        # Update both input_physical and input_physical_with_elfs to point to the file with expanded devices
+                        input_physical = input_physical_with_expanded
+                        input_physical_with_elfs = input_physical_with_expanded
+
+                    if opts.generate_full_elf:
+                        device_to_id_mapping = create_device_id_mapping(devices)
+                        assign_load_pdi_ids(npu_insts_module, device_to_id_mapping)
+                        transformed_mlir_path = self.prepend_tmp(
+                            "npu_insts_with_pdi_ids.mlir"
+                        )
+                        await write_file_async(
+                            str(npu_insts_module), transformed_mlir_path
+                        )
+
+                    self.progress_bar.update(task3, advance=1)
+
+            # 4.) Generate compilation artifacts for each device
+
+            # create other artifacts for each device
+            task4 = progress_bar.add_task(
+                "[green] Generating device artifacts", total=len(devices), command=""
+            )
+            for device_op, device_name in devices:
+                aie_target, aie_peano_target = await self.get_aie_target_for_device(
+                    input_physical, device_name
+                )
+                await self.run_flow_for_device(
+                    input_physical,
+                    input_physical_with_elfs,
+                    npu_insts_module,
+                    device_op,
+                    device_name,
+                    aie_target,
+                    aie_peano_target,
+                    task4,
+                )
+
+            self.maxtasks = 2
+            task5 = progress_bar.add_task(
+                "[green] Creating full ELF", total=2, command=""
+            )
+            if opts.generate_full_elf:
+                await self.generate_full_elf(devices, device_to_id_mapping, task5)
+
+    async def run_flow_for_device(
+        self,
+        input_physical,
+        input_physical_with_elfs,
+        npu_insts_module,
+        device_op,
+        device_name,
+        aie_target,
+        aie_peano_target,
+        parent_task_id,
+    ):
+        """Emit every per-device artifact requested through the global ``opts``."""
+        pb = self.progress_bar
+
+        # Optionally generate insts.bin for NPU instruction stream
+        if opts.npu or (opts.generate_full_elf and not opts.ctrlpkt):
+            # write each runtime sequence binary into its own file
+            runtime_sequences = generate_runtime_sequences_list(device_op)
+            for seq_op, seq_name in runtime_sequences:
+                pb.update(
+                    parent_task_id,
+                    description="[green] Creating NPU instruction binary",
+                )
+                npu_insts = aiedialect.translate_npu_to_binary(
+                    npu_insts_module.operation, device_name, seq_name
+                )
+                npu_insts_path = self.npu_insts_file_name(device_name, seq_name)
+                with open(npu_insts_path, "wb") as f:
+                    f.write(struct.pack("I" * len(npu_insts), *npu_insts))
+                pb.update(parent_task_id, advance=1)
+
+        if opts.compile_host or opts.aiesim:
+            file_inc_cpp = self.prepend_tmp("aie_inc.cpp")
+            await self.do_call(
+                parent_task_id,
+                [
+                    "aie-translate",
+                    "--aie-generate-xaie",
+                    "--aie-device-name",
+                    device_name,
+                    input_physical_with_elfs,
+                    "-o",
+                    file_inc_cpp,
+                ],
+            )
+
+        if opts.compile_host and len(opts.host_args) > 0:
+            await self.process_host_cgen(
+                aie_target, input_physical_with_elfs, device_name
+            )
+
+        processes = []
+        if opts.aiesim:
+            processes.append(
+                self.gen_sim(parent_task_id, aie_target, input_physical, device_name)
+            )
+
+        input_physical_with_elfs_str = await read_file_async(input_physical_with_elfs)
+
+        if (
+            opts.cdo or opts.xcl or opts.pdi or opts.generate_full_elf
+        ) and opts.execute:
+            await self.process_cdo(input_physical_with_elfs_str, device_name)
+
+        if opts.xcl:
+            processes.append(self.process_xclbin_gen(device_op, device_name))
+        # self.process_pdi_gen is called in process_xclbin_gen,
+        # so don't call it again if opts.xcl is set
+        elif opts.pdi or opts.generate_full_elf:
+            processes.append(
+                self.process_pdi_gen(device_name, self.pdi_file_name(device_name))
+            )
+        with Context(), Location.unknown():
+            input_physical_with_elfs_module = Module.parse(input_physical_with_elfs_str)
+        if opts.txn and opts.execute:
+            input_physical_with_elfs = await self.process_txn(
+                input_physical_with_elfs_module, device_name
+            )
+
+        if opts.ctrlpkt and opts.execute:
+            processes.append(
+                self.process_ctrlpkt(
+                    input_physical_with_elfs_module, device_op, device_name
+                )
+            )
+
+        if opts.elf and not opts.ctrlpkt and opts.execute:
+            processes.append(self.process_elf(npu_insts_module, device_name))
+
+        await asyncio.gather(*processes)
+
+    def dumpprofile(self):
+        """Print up to 50 entries of ``self.runtimes``, largest runtime first."""
+        slowest = sorted(
+            self.runtimes.items(), key=lambda entry: entry[1], reverse=True
+        )[:50]
+        # Each entry is a (key, seconds) pair taken from self.runtimes.
+        for label, seconds in slowest:
+            print(f"{seconds:.4f} sec: {label}")
+
+
+def run(mlir_module, args=None):
+    """Prepare the tool environment and run the aiecc flow on ``mlir_module``.
+
+    ``args``, if not None, is parsed into the global ``opts`` before anything else.
+    """
+    global opts
+    if args is not None:
+        opts = aie.compiler.aiecc.cl_arguments.parse_args(args)
+
+    opts.aietools_path = None
+
+    # If Ryzen AI Software is installed then use it for aietools
+    try:
+        import ryzen_ai.__about__
+
+        version = ryzen_ai.__about__.__version__
+        path = os.path.realpath(ryzen_ai.__path__[0])
+        if opts.verbose:
+            print(f"Found Ryzen AI software version {version} at {path}")
+        # if ryzenai software is pip installed then the path is something like:
+        # <workdir>/venv/lib/python3.10/site-packages/
+        opts.aietools_path = os.path.realpath(os.path.join(path, ".."))
+    except Exception:  # best effort: Ryzen AI Software is optional
+        pass
+
+    # Try to find xchesscc in the path
+    xchesscc_path = shutil.which("xchesscc")
+    if xchesscc_path:
+        xchesscc_bin_path = os.path.dirname(os.path.realpath(xchesscc_path))
+        xchesscc_path = os.path.dirname(xchesscc_bin_path)
+        if opts.verbose:
+            print(f"Found xchesscc at {xchesscc_path}")
+        os.environ["PATH"] = os.pathsep.join([os.environ["PATH"], xchesscc_bin_path])
+        if opts.aietools_path is None:
+            opts.aietools_path = xchesscc_path
+    elif opts.verbose:
+        print("xchesscc not found.")
+
+    if opts.aietools_path is None:
+        if opts.verbose:
+            print("Could not find aietools from Vitis or Ryzen AI Software.")
+        opts.aietools_path = "<aietools not found>"
+
+    os.environ["AIETOOLS"] = opts.aietools_path
+
+    aie_path = aie.compiler.aiecc.configure.install_path()
+    peano_path = os.path.join(opts.peano_install_dir, "bin")
+    os.environ["PATH"] = os.pathsep.join([aie_path, os.environ["PATH"]])
+    os.environ["PATH"] = os.pathsep.join([peano_path, os.environ["PATH"]])
+
+    if opts.aiesim and not opts.xbridge:
+        sys.exit("AIE Simulation (--aiesim) currently requires --xbridge")
+
+    if opts.verbose:
+        print(f"Compiling {opts.filename}")
+
+    if opts.tmpdir:
+        tmpdirname = opts.tmpdir
+    elif opts.filename:
+        tmpdirname = os.path.basename(opts.filename) + ".prj"
+    else:
+        tmpdirname = tempfile.mkdtemp()
+    tmpdirname = os.path.abspath(tmpdirname)
+
+    os.makedirs(tmpdirname, exist_ok=True)  # tolerate a pre-existing project dir
+    if opts.verbose:
+        print("created temporary directory", tmpdirname)
+
+    # Create a temporary file holding the input ir, if opts.filename is None.
+    if opts.filename is None:
+        tmpinput_path = os.path.join(tmpdirname, "tmpinput.mlir")
+        with open(tmpinput_path, "w") as f:
+            f.write(str(mlir_module))
+        opts.filename = tmpinput_path
+
+    runner = FlowRunner(str(mlir_module), opts, tmpdirname)
+    asyncio.run(runner.run_flow())
+
+    if opts.profiling:
+        runner.dumpprofile()
+
+
+def main():
+    """Command-line entry point: parse arguments, load the input MLIR, call run()."""
+    global opts
+
+    # Default MLIR_AIE_INSTALL_DIR to the configured install path when it is unset.
+    if "MLIR_AIE_INSTALL_DIR" not in os.environ:
+        os.environ["MLIR_AIE_INSTALL_DIR"] = aie.compiler.aiecc.configure.install_path()
+
+    opts = aie.compiler.aiecc.cl_arguments.parse_args()
+
+    if opts.version:
+        print(f"aiecc.py {aie.compiler.aiecc.configure.git_commit}")
+        sys.exit(0)
+
+    if opts.filename is None:
+        print("error: the 'file' positional argument is required.")
+        sys.exit(1)
+
+    try:
+        with Context(), Location.unknown():
+            with open(opts.filename, "r") as src_file:
+                parsed = Module.parse(src_file.read())
+            module_str = str(parsed)
+    except Exception as err:
+        print(err)
+        sys.exit(1)
+    run(module_str)
diff --git a/tools/aiecc/aiecc.cpp b/tools/aiecc/aiecc.cpp
index 03dde182b4a..373c56e6221 100644
--- a/tools/aiecc/aiecc.cpp
+++ b/tools/aiecc/aiecc.cpp
@@ -115,6 +115,7 @@
 #include <string>
 #include <system_error>
 #include <thread>
+#include <unistd.h>
 #include <vector>
 
 #include "aiecc_aiesim.h"
@@ -1083,8 +1084,8 @@ static std::string getAIETargetForDevice(ModuleOp moduleOp,
 struct CoreInfo {
   std::int32_t col;
   std::int32_t row;
-  std::string linkWith; // External object files to link
-  std::string elfFile;  // Generated ELF path (if already specified)
+  SmallVector<std::string> linkFiles; // External object files to link
+  std::string elfFile; // Generated ELF path (if already specified)
 };
 
 /// Check if a CoreOp has a non-empty body (more than just aie.end).
@@ -1105,8 +1106,15 @@ static CoreInfo getCoreInfo(xilinx::AIE::CoreOp coreOp) {
     info.row = tileOp.getRow();
   }
 
-  if (auto linkWithAttr = coreOp.getLinkWithAttr()) {
-    info.linkWith = linkWithAttr.getValue().str();
+  // Prefer canonical link_files ArrayAttr (populated by AIEAssignCoreLinkFiles,
+  // which runs as part of the resource-allocation pipeline above).
+  if (auto filesAttr = coreOp.getLinkFiles()) {
+    for (auto f : filesAttr->getAsRange<mlir::StringAttr>())
+      info.linkFiles.push_back(f.getValue().str());
+  } else if (auto linkWithAttr = coreOp.getLinkWithAttr()) {
+    // Fallback: deprecated core-level link_with was not migrated by the pass
+    // (e.g., pipeline was not run). Treat it as a single-element list.
+    info.linkFiles.push_back(linkWithAttr.getValue().str());
   }
 
   if (auto elfAttr = coreOp.getElfFileAttr()) {
@@ -1387,6 +1395,9 @@ static LogicalResult runResourceAllocationPipeline(ModuleOp moduleOp,
   bufferOpts.clAllocScheme = allocScheme.getValue();
   devicePm.addPass(xilinx::AIE::createAIEAssignBufferAddressesPass(bufferOpts));
 
+  // Infer per-core link_files from func-level link_with attributes
+  devicePm.addPass(xilinx::AIE::createAIEAssignCoreLinkFilesPass());
+
   devicePm.addPass(xilinx::AIE::createAIEVectorTransferLoweringPass());
 
   // Step 5: Convert SCF to CF (module-level pass)
@@ -1783,6 +1794,48 @@ static LogicalResult runUnifiedLLVMLoweringPipeline(ModuleOp moduleOp,
   return success();
 }
 
+/// Copy \p src to \p destDir / \p destBasename atomically: the data is written
+/// to a uniquely named sibling file first and then renamed onto the
+/// destination.  On POSIX, rename(2) is atomic within one filesystem, so
+/// parallel compilations sharing that name do not corrupt each other's copy.
+static LogicalResult atomicCopyFile(StringRef src, StringRef destDir,
+                                    StringRef destBasename) {
+  SmallString<256> finalPath(destDir);
+  sys::path::append(finalPath, destBasename);
+
+  // Staging file model: "<destDir>/<stem>-%%%%%%<ext>".  Staging inside destDir
+  // keeps both paths on the same filesystem, so the rename below is never
+  // cross-device (no EXDEV failure).
+  SmallString<64> stagingName(sys::path::stem(destBasename));
+  stagingName += "-%%%%%%";
+  stagingName += sys::path::extension(destBasename);
+  SmallString<256> stagingModel(destDir);
+  sys::path::append(stagingModel, stagingName);
+  int stagingFD;
+  SmallString<256> stagingPath;
+  if (sys::fs::createUniqueFile(stagingModel, stagingFD, stagingPath)) {
+    llvm::errs() << "Error: could not create temp file in " << destDir << "\n";
+    return failure();
+  }
+  ::close(stagingFD);
+
+  std::error_code copyEC = sys::fs::copy_file(src, stagingPath);
+  if (copyEC) {
+    llvm::errs() << "Error: could not copy " << src << " to " << stagingPath
+                 << ": " << copyEC.message() << "\n";
+    sys::fs::remove(stagingPath);
+    return failure();
+  }
+
+  std::error_code renameEC = sys::fs::rename(stagingPath, finalPath);
+  if (!renameEC)
+    return success();
+  llvm::errs() << "Error: could not rename " << stagingPath << " to "
+               << finalPath << ": " << renameEC.message() << "\n";
+  sys::fs::remove(stagingPath);
+  return failure();
+}
+
 //===----------------------------------------------------------------------===//
 // Core Compilation
 //===----------------------------------------------------------------------===//
@@ -2165,32 +2218,17 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
         }
       }
 
-      // Copy to .prj directory
+      // Copy to .prj directory atomically to avoid races between parallel
+      // cores.
       SmallString<256> destPath(tmpDirName);
       sys::path::append(destPath, sys::path::filename(linkWithFile));
+      if (failed(atomicCopyFile(srcPath, tmpDirName,
+                                sys::path::filename(linkWithFile))))
+        return failure();
 
-      if (srcPath == destPath) {
-        if (verbose) {
-          std::lock_guard<std::mutex> lock(outputMutex);
-          llvm::outs() << "link_with file already in place: " << srcPath
-                       << "\n";
-        }
-      } else {
-        sys::fs::remove(destPath);
-        std::error_code ec = sys::fs::copy_file(srcPath, destPath);
-        if (ec) {
-          std::lock_guard<std::mutex> lock(outputMutex);
-          llvm::errs() << "Error: Could not copy link_with file: " << srcPath
-                       << " to " << destPath << ": " << ec.message() << "\n";
-          return failure();
-        }
-
-        if (verbose) {
-          std::lock_guard<std::mutex> lock(outputMutex);
-          llvm::outs() << "Copied link_with: " << srcPath << " -> " << destPath
-                       << "\n";
-        }
-      }
+      if (verbose)
+        llvm::outs() << "Copied link_with: " << srcPath << " -> " << destPath
+                     << "\n";
 
       if (!linkWithArgs.empty()) {
         linkWithArgs += " ";
@@ -2281,29 +2319,25 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
 
     linkCmd.push_back(std::string(objPath));
 
-    // Handle external object file if link_with attribute is specified
-    // The linker script generated by aie-translate will include an INPUT()
-    // directive for the link_with file, but it uses a relative path.
-    // We need to copy the file to the .prj directory so the linker can find it.
-    if (!core.linkWith.empty()) {
-      // Resolve the link_with path - check multiple locations:
-      // 1. If absolute, use as-is
-      // 2. Relative to current working directory (common for test cases)
-      // 3. Relative to input file directory (common for installed examples)
+    // Handle external object files specified via link_files (or deprecated
+    // link_with).  The linker script generated by aie-translate will include an
+    // INPUT() directive for each file, but uses a relative path.  We copy every
+    // file to the .prj directory so the linker can find them.
+    for (const auto &lf : core.linkFiles) {
       SmallString<256> srcLinkWith;
-      if (sys::path::is_absolute(core.linkWith)) {
-        srcLinkWith = core.linkWith;
+      if (sys::path::is_absolute(lf)) {
+        srcLinkWith = lf;
       } else {
         // First try current working directory
         SmallString<256> cwdPath;
         sys::fs::current_path(cwdPath);
-        sys::path::append(cwdPath, core.linkWith);
+        sys::path::append(cwdPath, lf);
         if (sys::fs::exists(cwdPath)) {
           srcLinkWith = cwdPath;
         } else {
           // Try tmpDirName (used in JIT where .o is pre-compiled there)
           SmallString<256> tmpPath(tmpDirName);
-          sys::path::append(tmpPath, core.linkWith);
+          sys::path::append(tmpPath, lf);
           if (sys::fs::exists(tmpPath)) {
             srcLinkWith = tmpPath;
           } else {
@@ -2314,45 +2348,27 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
               sys::fs::current_path(inputDir);
             }
             srcLinkWith = inputDir;
-            sys::path::append(srcLinkWith, core.linkWith);
+            sys::path::append(srcLinkWith, lf);
             sys::path::remove_dots(srcLinkWith, /*remove_dot_dot=*/true);
           }
         }
       }
 
       // Copy the object file to the .prj directory so the linker script's
-      // INPUT() directive can find it
+      // INPUT() directive can find it. Copy atomically to avoid races between
+      // parallel cores that share the same .o filename.
       SmallString<256> destLinkWith(tmpDirName);
-      sys::path::append(destLinkWith, sys::path::filename(core.linkWith));
-
-      if (srcLinkWith == destLinkWith) {
-        if (verbose) {
-          std::lock_guard<std::mutex> lock(outputMutex);
-          llvm::outs() << "link_with file already in place: " << srcLinkWith
-                       << "\n";
-        }
-      } else {
-        // Remove destination file first if it exists (to ensure overwrite)
-        sys::fs::remove(destLinkWith);
-
-        std::error_code ec = sys::fs::copy_file(srcLinkWith, destLinkWith);
-        if (ec) {
-          std::lock_guard<std::mutex> lock(outputMutex);
-          llvm::errs() << "Error: Could not copy link_with file: "
-                       << srcLinkWith << " to " << destLinkWith << "\n";
-          llvm::errs() << "Error: " << ec.message() << "\n";
-          return failure();
-        }
+      sys::path::append(destLinkWith, sys::path::filename(lf));
+      if (failed(
+              atomicCopyFile(srcLinkWith, tmpDirName, sys::path::filename(lf))))
+        return failure();
 
-        if (verbose) {
-          std::lock_guard<std::mutex> lock(outputMutex);
-          llvm::outs() << "Copied link_with object: " << srcLinkWith << " -> "
-                       << destLinkWith << "\n";
-        }
-      }
+      if (verbose)
+        llvm::outs() << "Copied link_with object: " << srcLinkWith << " -> "
+                     << destLinkWith << "\n";
 
-      // Note: We don't add the object file to linkStrs because the linker
-      // script already has an INPUT() directive for it
+      // Note: We don't add the object file to linkCmd because the linker
+      // script already has INPUT() directives for each file
     }
 
     // Make linker script path absolute
@@ -2412,10 +2428,10 @@ compileCores(MLIRContext &context, ModuleOp moduleOp, Operation *deviceOp,
 
   SmallVector<CoreInfo> cores;
   deviceOp->walk([&](xilinx::AIE::CoreOp coreOp) {
-    // Skip cores with no elf_file, no link_with, and empty body
+    // Skip cores with no elf_file, no link_with/link_files, and empty body
     // (e.g., @empty device ops created by --expand-load-pdis)
     if (coreOp.getElfFileAttr() || coreOp.getLinkWithAttr() ||
-        coreHasNonemptyBody(coreOp)) {
+        coreOp.getLinkFiles() || coreHasNonemptyBody(coreOp)) {
       cores.push_back(getCoreInfo(coreOp));
     }
   });
@@ -2585,10 +2601,10 @@ compileCoresUnified(MLIRContext &context, ModuleOp moduleOp,
 
   SmallVector<CoreInfo> cores;
   deviceOp->walk([&](xilinx::AIE::CoreOp coreOp) {
-    // Skip cores with no elf_file, no link_with, and empty body
+    // Skip cores with no elf_file, no link_with/link_files, and empty body
     // (e.g., @empty device ops created by --expand-load-pdis)
     if (coreOp.getElfFileAttr() || coreOp.getLinkWithAttr() ||
-        coreHasNonemptyBody(coreOp)) {
+        coreOp.getLinkFiles() || coreHasNonemptyBody(coreOp)) {
       cores.push_back(getCoreInfo(coreOp));
     }
   });
@@ -2906,15 +2922,9 @@ compileCoresUnified(MLIRContext &context, ModuleOp moduleOp,
 
         SmallString<256> destPath(tmpDirName);
         sys::path::append(destPath, sys::path::filename(linkWithFile));
-        if (srcPath == destPath) {
-          continue;
-        }
-        sys::fs::remove(destPath);
-        std::error_code ec = sys::fs::copy_file(srcPath, destPath);
-        if (ec) {
-          llvm::errs() << "Error copying link_with file: " << srcPath << "\n";
+        if (failed(atomicCopyFile(srcPath, tmpDirName,
+                                  sys::path::filename(linkWithFile))))
           return failure();
-        }
       }
 
       auto xchessccWrapperPath = sys::findProgramByName("xchesscc_wrapper");
@@ -2993,24 +3003,23 @@ compileCoresUnified(MLIRContext &context, ModuleOp moduleOp,
       SmallString<256> peanoLld(peanoBinDir);
       sys::path::append(peanoLld, "ld.lld");
 
-      // Handle link_with if specified
-      // Search order: current working directory, tmpDirName, input file
-      // directory
-      if (!core.linkWith.empty()) {
+      // Handle external object files specified via link_files (or deprecated
+      // link_with). Search order: absolute, cwd, tmpDirName, input file dir.
+      for (const auto &lf : core.linkFiles) {
         SmallString<256> srcLinkWith;
-        if (sys::path::is_absolute(core.linkWith)) {
-          srcLinkWith = core.linkWith;
+        if (sys::path::is_absolute(lf)) {
+          srcLinkWith = lf;
         } else {
           // First try current working directory
           SmallString<256> cwdPath;
           sys::fs::current_path(cwdPath);
-          sys::path::append(cwdPath, core.linkWith);
+          sys::path::append(cwdPath, lf);
           if (sys::fs::exists(cwdPath)) {
             srcLinkWith = cwdPath;
           } else {
             // Try tmpDirName (used in JIT where .o is pre-compiled there)
             SmallString<256> tmpPath(tmpDirName);
-            sys::path::append(tmpPath, core.linkWith);
+            sys::path::append(tmpPath, lf);
             if (sys::fs::exists(tmpPath)) {
               srcLinkWith = tmpPath;
             } else {
@@ -3021,23 +3030,17 @@ compileCoresUnified(MLIRContext &context, ModuleOp moduleOp,
                 sys::fs::current_path(inputDir);
               }
               srcLinkWith = inputDir;
-              sys::path::append(srcLinkWith, core.linkWith);
+              sys::path::append(srcLinkWith, lf);
               sys::path::remove_dots(srcLinkWith, /*remove_dot_dot=*/true);
             }
           }
         }
 
         SmallString<256> destLinkWith(tmpDirName);
-        sys::path::append(destLinkWith, sys::path::filename(core.linkWith));
-        if (srcLinkWith != destLinkWith) {
-          sys::fs::remove(destLinkWith);
-          std::error_code ec = sys::fs::copy_file(srcLinkWith, destLinkWith);
-          if (ec) {
-            llvm::errs() << "Error copying link_with file: " << srcLinkWith
-                         << "\n";
-            return failure();
-          }
-        }
+        sys::path::append(destLinkWith, sys::path::filename(lf));
+        if (failed(atomicCopyFile(srcLinkWith, tmpDirName,
+                                  sys::path::filename(lf))))
+          return failure();
       }
 
       SmallString<128> absLdScriptPath;

From 26aafad601728aec47c65cc7ff4d0e8b1eb49ee7 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 5 Mar 2026 14:27:56 -0700
Subject: [PATCH 03/28] [test] Add aiecc and npu-xrt tests for func-level
 link_with
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

aiecc dialect tests (test/aiecc/):
- cpp_link_with.mlir: migrate existing test to func-level link_with
- cpp_link_with_func_level.mlir: basic func.func link_with → CoreOp link_files
- cpp_link_with_both_attrs.mlir: link_with + link_files on the same CoreOp → verifier error
- cpp_link_with_deprecation.mlir: CoreOp-level link_with deprecation warning
- cpp_link_with_emitter_fallback.mlir: BCF/LdScript emit from link_files
- cpp_link_with_indirect_call.mlir: indirect call triggers warning
- cpp_link_with_mixed.mlir: mixed kernel .o per-core merged and deduped
- cpp_link_with_shared_func.mlir: same func.func called by multiple cores
- cpp_link_with_unused_func.mlir: unused func.func with link_with warns
- cpp_multi_link_with.mlir: multiple link_with attrs on different funcs

npu-xrt end-to-end tests (require physical NPU):
- add_one_func_link_with_{chess,peano}: single kernel via func-level link_with
- add_one_scale_func_link_with_{chess,peano}: two kernels (add_one + scale_by_two)
  each with its own func.func link_with, exercising multi-.o linking per core

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 test/aiecc/cpp_link_with.mlir                 |   4 +-
 test/aiecc/cpp_link_with_both_attrs.mlir      |  26 ++++
 test/aiecc/cpp_link_with_deprecation.mlir     |  38 ++++++
 .../aiecc/cpp_link_with_emitter_fallback.mlir |  34 +++++
 test/aiecc/cpp_link_with_func_level.mlir      |  58 +++++++++
 test/aiecc/cpp_link_with_indirect_call.mlir   |  30 +++++
 test/aiecc/cpp_link_with_mixed.mlir           |  49 +++++++
 test/aiecc/cpp_link_with_shared_func.mlir     |  57 ++++++++
 test/aiecc/cpp_link_with_unused_func.mlir     |  27 ++++
 test/aiecc/cpp_multi_link_with.mlir           |  62 +++++++++
 .../add_one_kernel.cc                         |  28 ++++
 .../add_one_func_link_with_chess/aie.mlir     |  60 +++++++++
 .../add_one_func_link_with_chess/run.lit      |  22 ++++
 .../add_one_func_link_with_chess/test.cpp     | 119 +++++++++++++++++
 .../add_one_kernel.cc                         |  29 +++++
 .../add_one_func_link_with_peano/aie.mlir     |  62 +++++++++
 .../add_one_func_link_with_peano/run.lit      |  23 ++++
 .../add_one_func_link_with_peano/test.cpp     | 119 +++++++++++++++++
 .../add_one_kernel.cc                         |  28 ++++
 .../aie.mlir                                  |  67 ++++++++++
 .../run.lit                                   |  31 +++++
 .../scale_kernel.cc                           |  41 ++++++
 .../test.cpp                                  | 122 ++++++++++++++++++
 .../add_one_kernel.cc                         |  28 ++++
 .../aie.mlir                                  |  67 ++++++++++
 .../run.lit                                   |  31 +++++
 .../scale_kernel.cc                           |  33 +++++
 .../test.cpp                                  | 122 ++++++++++++++++++
 28 files changed, 1415 insertions(+), 2 deletions(-)
 create mode 100644 test/aiecc/cpp_link_with_both_attrs.mlir
 create mode 100644 test/aiecc/cpp_link_with_deprecation.mlir
 create mode 100644 test/aiecc/cpp_link_with_emitter_fallback.mlir
 create mode 100644 test/aiecc/cpp_link_with_func_level.mlir
 create mode 100644 test/aiecc/cpp_link_with_indirect_call.mlir
 create mode 100644 test/aiecc/cpp_link_with_mixed.mlir
 create mode 100644 test/aiecc/cpp_link_with_shared_func.mlir
 create mode 100644 test/aiecc/cpp_link_with_unused_func.mlir
 create mode 100644 test/aiecc/cpp_multi_link_with.mlir
 create mode 100644 test/npu-xrt/add_one_func_link_with_chess/add_one_kernel.cc
 create mode 100644 test/npu-xrt/add_one_func_link_with_chess/aie.mlir
 create mode 100644 test/npu-xrt/add_one_func_link_with_chess/run.lit
 create mode 100644 test/npu-xrt/add_one_func_link_with_chess/test.cpp
 create mode 100644 test/npu-xrt/add_one_func_link_with_peano/add_one_kernel.cc
 create mode 100644 test/npu-xrt/add_one_func_link_with_peano/aie.mlir
 create mode 100644 test/npu-xrt/add_one_func_link_with_peano/run.lit
 create mode 100644 test/npu-xrt/add_one_func_link_with_peano/test.cpp
 create mode 100644 test/npu-xrt/add_one_scale_func_link_with_chess/add_one_kernel.cc
 create mode 100644 test/npu-xrt/add_one_scale_func_link_with_chess/aie.mlir
 create mode 100644 test/npu-xrt/add_one_scale_func_link_with_chess/run.lit
 create mode 100644 test/npu-xrt/add_one_scale_func_link_with_chess/scale_kernel.cc
 create mode 100644 test/npu-xrt/add_one_scale_func_link_with_chess/test.cpp
 create mode 100644 test/npu-xrt/add_one_scale_func_link_with_peano/add_one_kernel.cc
 create mode 100644 test/npu-xrt/add_one_scale_func_link_with_peano/aie.mlir
 create mode 100644 test/npu-xrt/add_one_scale_func_link_with_peano/run.lit
 create mode 100644 test/npu-xrt/add_one_scale_func_link_with_peano/scale_kernel.cc
 create mode 100644 test/npu-xrt/add_one_scale_func_link_with_peano/test.cpp

diff --git a/test/aiecc/cpp_link_with.mlir b/test/aiecc/cpp_link_with.mlir
index 0e0a36cc83c..96952a40b2f 100644
--- a/test/aiecc/cpp_link_with.mlir
+++ b/test/aiecc/cpp_link_with.mlir
@@ -29,7 +29,7 @@ module {
     aie.objectfifo @of_in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
     aie.objectfifo @of_out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
 
-    func.func private @external_func(memref<16xi32>, memref<16xi32>)
+    func.func private @external_func(memref<16xi32>, memref<16xi32>) attributes {link_with = "external.o"}
 
     %core_0_2 = aie.core(%tile_0_2) {
       %subview_in = aie.objectfifo.acquire @of_in(Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
@@ -43,7 +43,7 @@ module {
       aie.objectfifo.release @of_in(Consume, 1)
       aie.objectfifo.release @of_out(Produce, 1)
       aie.end
-    } {link_with = "external.o"}
+    }
 
     aie.runtime_sequence(%in : memref<16xi32>, %out : memref<16xi32>) {
       %c0 = arith.constant 0 : i64
diff --git a/test/aiecc/cpp_link_with_both_attrs.mlir b/test/aiecc/cpp_link_with_both_attrs.mlir
new file mode 100644
index 00000000000..f4a40cae290
--- /dev/null
+++ b/test/aiecc/cpp_link_with_both_attrs.mlir
@@ -0,0 +1,26 @@
+//===- cpp_link_with_both_attrs.mlir ----------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// Test that a core with both the deprecated 'link_with' scalar attr AND the
+// canonical 'link_files' array attr on the same CoreOp is rejected by the
+// CoreOp verifier.
+
+// RUN: aie-opt --verify-diagnostics %s
+
+module {
+  aie.device(npu1_1col) {
+    %tile_0_2 = aie.tile(0, 2)
+
+    // expected-error@+1 {{cannot specify both 'link_with' (deprecated) and 'link_files'}}
+    %core_0_2 = aie.core(%tile_0_2) {
+      aie.end
+    } {link_with = "a.o", link_files = ["b.o"]}
+  }
+}
diff --git a/test/aiecc/cpp_link_with_deprecation.mlir b/test/aiecc/cpp_link_with_deprecation.mlir
new file mode 100644
index 00000000000..1d450676407
--- /dev/null
+++ b/test/aiecc/cpp_link_with_deprecation.mlir
@@ -0,0 +1,38 @@
+//===- cpp_link_with_deprecation.mlir --------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// Test that core-level link_with still compiles but emits a deprecation warning,
+// and that the pass migrates the attribute to link_files on the core.
+
+// RUN: aie-opt --verify-diagnostics --aie-assign-core-link-files %s
+// RUN: aie-opt --aie-assign-core-link-files %s | FileCheck %s --check-prefix=MIGRATED
+
+// Verify the pass migrated the deprecated core-level attr into link_files and
+// removed link_with from the core.
+// MIGRATED:     link_files = ["legacy.o"]
+// MIGRATED-NOT: link_with = "legacy.o"
+
+module {
+  aie.device(npu1_1col) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_2 = aie.tile(0, 2)
+
+    aie.objectfifo @of(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+
+    // expected-warning@+1 {{link_with on aie.core is deprecated; attach link_with to the func.func declaration instead}}
+    %core_0_2 = aie.core(%tile_0_2) {
+      %buf = aie.objectfifo.acquire @of(Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
+      aie.objectfifo.release @of(Consume, 1)
+      aie.end
+    } {link_with = "legacy.o"}
+
+    aie.runtime_sequence() {}
+  }
+}
diff --git a/test/aiecc/cpp_link_with_emitter_fallback.mlir b/test/aiecc/cpp_link_with_emitter_fallback.mlir
new file mode 100644
index 00000000000..fb432c76463
--- /dev/null
+++ b/test/aiecc/cpp_link_with_emitter_fallback.mlir
@@ -0,0 +1,34 @@
+//===- cpp_link_with_emitter_fallback.mlir ---------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// Test the deprecated fallback path in the ldscript and BCF emitters:
+// when a core still has a core-level link_with (and no link_files), both
+// emitters should still emit the correct entry without running
+// aie-assign-core-link-files first.
+
+// RUN: aie-translate --aie-generate-ldscript --tilecol=0 --tilerow=2 %s | FileCheck %s --check-prefix=LDSCRIPT
+// RUN: aie-translate --aie-generate-bcf --tilecol=0 --tilerow=2 %s | FileCheck %s --check-prefix=BCF
+
+// LDSCRIPT: INPUT(fallback.o)
+// BCF: _include _file fallback.o
+
+// Use a bare core without objectfifo so no lowering is needed before
+// aie-translate can generate the ldscript/BCF.
+
+module {
+  aie.device(npu1_1col) {
+    %tile_0_2 = aie.tile(0, 2)
+
+    // Core keeps the old core-level link_with (no pass run, no link_files set).
+    %core_0_2 = aie.core(%tile_0_2) {
+      aie.end
+    } {link_with = "fallback.o"}
+  }
+}
diff --git a/test/aiecc/cpp_link_with_func_level.mlir b/test/aiecc/cpp_link_with_func_level.mlir
new file mode 100644
index 00000000000..9a53f0f4313
--- /dev/null
+++ b/test/aiecc/cpp_link_with_func_level.mlir
@@ -0,0 +1,58 @@
+//===- cpp_link_with_func_level.mlir ---------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// Canonical new style: link_with is on func.func, not on aie.core.
+// Verify that AIEAssignCoreLinkFiles populates link_files on the core and
+// that the ldscript/BCF emitters produce the correct entries.
+
+// RUN: aie-opt --verify-diagnostics --aie-assign-core-link-files %s | FileCheck %s --check-prefix=OPT
+// RUN: aie-opt --verify-diagnostics --aie-assign-core-link-files %s | aie-translate --aie-generate-ldscript --tilecol=0 --tilerow=2 | FileCheck %s --check-prefix=LDSCRIPT
+// RUN: aie-opt --verify-diagnostics --aie-assign-core-link-files %s | aie-translate --aie-generate-bcf --tilecol=0 --tilerow=2 | FileCheck %s --check-prefix=BCF
+
+// OPT: link_files = ["f.o"]
+
+// LDSCRIPT: INPUT(f.o)
+
+// BCF: _include _file f.o
+
+module {
+  aie.device(npu1_1col) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_2 = aie.tile(0, 2)
+
+    aie.objectfifo @of_in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+    aie.objectfifo @of_out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+
+    func.func private @f(memref<16xi32>, memref<16xi32>) attributes {link_with = "f.o"}
+
+    %core_0_2 = aie.core(%tile_0_2) {
+      %subview_in = aie.objectfifo.acquire @of_in(Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
+      %elem_in = aie.objectfifo.subview.access %subview_in[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
+
+      %subview_out = aie.objectfifo.acquire @of_out(Produce, 1) : !aie.objectfifosubview<memref<16xi32>>
+      %elem_out = aie.objectfifo.subview.access %subview_out[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
+
+      func.call @f(%elem_in, %elem_out) : (memref<16xi32>, memref<16xi32>) -> ()
+
+      aie.objectfifo.release @of_in(Consume, 1)
+      aie.objectfifo.release @of_out(Produce, 1)
+      aie.end
+    }
+
+    aie.runtime_sequence(%in : memref<16xi32>, %out : memref<16xi32>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c16 = arith.constant 16 : i64
+      aiex.npu.dma_memcpy_nd(%out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c16][%c0,%c0,%c0,%c1]) {metadata = @of_out, id = 1 : i64} : memref<16xi32>
+      aiex.npu.dma_memcpy_nd(%in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c16][%c0,%c0,%c0,%c1]) {metadata = @of_in, id = 0 : i64, issue_token = true} : memref<16xi32>
+      aiex.npu.dma_wait {symbol = @of_out}
+    }
+  }
+}
diff --git a/test/aiecc/cpp_link_with_indirect_call.mlir b/test/aiecc/cpp_link_with_indirect_call.mlir
new file mode 100644
index 00000000000..fa06dc43f36
--- /dev/null
+++ b/test/aiecc/cpp_link_with_indirect_call.mlir
@@ -0,0 +1,30 @@
+//===- cpp_link_with_indirect_call.mlir -------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// Test that an indirect call inside a core body triggers a warning from
+// aie-assign-core-link-files, since link_with on indirectly-called funcs
+// cannot be statically resolved.
+
+// RUN: aie-opt --verify-diagnostics --aie-assign-core-link-files %s
+
+module {
+  aie.device(npu1_1col) {
+    %tile_0_2 = aie.tile(0, 2)
+
+    func.func private @some_helper() -> ()
+
+    %core_0_2 = aie.core(%tile_0_2) {
+      %fptr = func.constant @some_helper : () -> ()
+      // expected-warning@+1 {{indirect call in core body}}
+      func.call_indirect %fptr() : () -> ()
+      aie.end
+    }
+  }
+}
diff --git a/test/aiecc/cpp_link_with_mixed.mlir b/test/aiecc/cpp_link_with_mixed.mlir
new file mode 100644
index 00000000000..57a25d366b1
--- /dev/null
+++ b/test/aiecc/cpp_link_with_mixed.mlir
@@ -0,0 +1,49 @@
+//===- cpp_link_with_mixed.mlir --------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// Test that a core with both a deprecated core-level link_with AND a call to
+// a func.func with its own link_with produces a merged, deduplicated link_files
+// set.  The core-level attr is consumed (removed) and both .o paths appear
+// exactly once in link_files.
+
+// RUN: aie-opt --verify-diagnostics --aie-assign-core-link-files %s | FileCheck %s --check-prefix=OPT
+// RUN: aie-opt --verify-diagnostics --aie-assign-core-link-files %s | aie-translate --aie-generate-ldscript --tilecol=0 --tilerow=2 | FileCheck %s --check-prefix=LDSCRIPT
+
+// The merged set must contain both files.
+// OPT-DAG: "core_only.o"
+// OPT-DAG: "func_only.o"
+// The deprecated core-level attr must be gone.
+// OPT-NOT: link_with = "core_only.o"
+
+// LDSCRIPT-DAG: INPUT(core_only.o)
+// LDSCRIPT-DAG: INPUT(func_only.o)
+
+module {
+  aie.device(npu1_1col) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_2 = aie.tile(0, 2)
+
+    aie.objectfifo @of(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+
+    func.func private @ext(memref<16xi32>) attributes {link_with = "func_only.o"}
+
+    // Core carries deprecated core-level link_with AND calls a func with its own.
+    // expected-warning@+1 {{link_with on aie.core is deprecated; attach link_with to the func.func declaration instead}}
+    %core_0_2 = aie.core(%tile_0_2) {
+      %buf = aie.objectfifo.acquire @of(Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
+      %elem = aie.objectfifo.subview.access %buf[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
+      func.call @ext(%elem) : (memref<16xi32>) -> ()
+      aie.objectfifo.release @of(Consume, 1)
+      aie.end
+    } {link_with = "core_only.o"}
+
+    aie.runtime_sequence() {}
+  }
+}
diff --git a/test/aiecc/cpp_link_with_shared_func.mlir b/test/aiecc/cpp_link_with_shared_func.mlir
new file mode 100644
index 00000000000..4da67aa4892
--- /dev/null
+++ b/test/aiecc/cpp_link_with_shared_func.mlir
@@ -0,0 +1,57 @@
+//===- cpp_link_with_shared_func.mlir --------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// Test that two cores each calling the same func.func @kernel {link_with="k.o"}
+// each produce exactly one INPUT(k.o) ldscript entry (no duplication of the
+// shared object file).  The BCF emitter is not exercised by this test.
+
+// RUN: aie-opt --aie-assign-core-link-files %s | FileCheck %s --check-prefix=OPT
+// RUN: aie-opt --aie-assign-core-link-files %s | aie-translate --aie-generate-ldscript --tilecol=0 --tilerow=2 | FileCheck %s --check-prefix=LDSCRIPT02
+// RUN: aie-opt --aie-assign-core-link-files %s | aie-translate --aie-generate-ldscript --tilecol=0 --tilerow=3 | FileCheck %s --check-prefix=LDSCRIPT03
+
+// OPT-COUNT-2: link_files = ["k.o"]
+
+// LDSCRIPT02: INPUT(k.o)
+// LDSCRIPT02-NOT: INPUT(k.o)
+
+// LDSCRIPT03: INPUT(k.o)
+// LDSCRIPT03-NOT: INPUT(k.o)
+
+module {
+  aie.device(npu1_1col) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_2 = aie.tile(0, 2)
+    %tile_0_3 = aie.tile(0, 3)
+
+    // Declare objectfifos before the cores that reference them.
+    aie.objectfifo @dummy_in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+    aie.objectfifo @dummy_in2(%tile_0_0, {%tile_0_3}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+
+    func.func private @kernel(memref<16xi32>) attributes {link_with = "k.o"}
+
+    %core_0_2 = aie.core(%tile_0_2) {
+      %buf = aie.objectfifo.acquire @dummy_in(Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
+      %elem = aie.objectfifo.subview.access %buf[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
+      func.call @kernel(%elem) : (memref<16xi32>) -> ()
+      aie.objectfifo.release @dummy_in(Consume, 1)
+      aie.end
+    }
+
+    %core_0_3 = aie.core(%tile_0_3) {
+      %buf = aie.objectfifo.acquire @dummy_in2(Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
+      %elem = aie.objectfifo.subview.access %buf[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
+      func.call @kernel(%elem) : (memref<16xi32>) -> ()
+      aie.objectfifo.release @dummy_in2(Consume, 1)
+      aie.end
+    }
+
+    aie.runtime_sequence() {}
+  }
+}
diff --git a/test/aiecc/cpp_link_with_unused_func.mlir b/test/aiecc/cpp_link_with_unused_func.mlir
new file mode 100644
index 00000000000..1804bc6a033
--- /dev/null
+++ b/test/aiecc/cpp_link_with_unused_func.mlir
@@ -0,0 +1,27 @@
+//===- cpp_link_with_unused_func.mlir ---------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// (c) Copyright 2026 Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// Test that a func.func carrying link_with that is never called from any core
+// produces a warning from aie-assign-core-link-files.
+
+// RUN: aie-opt --verify-diagnostics --aie-assign-core-link-files %s
+
+module {
+  aie.device(npu1_1col) {
+    %tile_0_2 = aie.tile(0, 2)
+
+    // expected-warning@+1 {{func 'never_called' has link_with but is never called from any core; its .o file will not be linked}}
+    func.func private @never_called(memref<16xi32>) attributes {link_with = "x.o"}
+
+    %core_0_2 = aie.core(%tile_0_2) {
+      aie.end
+    }
+  }
+}
diff --git a/test/aiecc/cpp_multi_link_with.mlir b/test/aiecc/cpp_multi_link_with.mlir
new file mode 100644
index 00000000000..db265d3034f
--- /dev/null
+++ b/test/aiecc/cpp_multi_link_with.mlir
@@ -0,0 +1,62 @@
+//===- cpp_multi_link_with.mlir --------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// Test that one core calling two func.func declarations each with a distinct
+// link_with attribute produces two INPUT() lines in the ldscript and two
+// _include _file lines in the BCF.
+
+// RUN: aie-opt --aie-assign-core-link-files %s | FileCheck %s --check-prefix=OPT
+// RUN: aie-opt --aie-assign-core-link-files %s | aie-translate --aie-generate-ldscript --tilecol=0 --tilerow=2 | FileCheck %s --check-prefix=LDSCRIPT
+// RUN: aie-opt --aie-assign-core-link-files %s | aie-translate --aie-generate-bcf --tilecol=0 --tilerow=2 | FileCheck %s --check-prefix=BCF
+
+// OPT: link_files = ["kernelA.o", "kernelB.o"]
+
+// LDSCRIPT: INPUT(kernelA.o)
+// LDSCRIPT: INPUT(kernelB.o)
+
+// BCF: _include _file kernelA.o
+// BCF: _include _file kernelB.o
+
+module {
+  aie.device(npu1_1col) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_2 = aie.tile(0, 2)
+
+    aie.objectfifo @of_in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+    aie.objectfifo @of_out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<16xi32>>
+
+    func.func private @kernelA(memref<16xi32>) attributes {link_with = "kernelA.o"}
+    func.func private @kernelB(memref<16xi32>) attributes {link_with = "kernelB.o"}
+
+    %core_0_2 = aie.core(%tile_0_2) {
+      %subview_in = aie.objectfifo.acquire @of_in(Consume, 1) : !aie.objectfifosubview<memref<16xi32>>
+      %elem_in = aie.objectfifo.subview.access %subview_in[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
+
+      %subview_out = aie.objectfifo.acquire @of_out(Produce, 1) : !aie.objectfifosubview<memref<16xi32>>
+      %elem_out = aie.objectfifo.subview.access %subview_out[0] : !aie.objectfifosubview<memref<16xi32>> -> memref<16xi32>
+
+      func.call @kernelA(%elem_in) : (memref<16xi32>) -> ()
+      func.call @kernelB(%elem_out) : (memref<16xi32>) -> ()
+
+      aie.objectfifo.release @of_in(Consume, 1)
+      aie.objectfifo.release @of_out(Produce, 1)
+      aie.end
+    }
+
+    aie.runtime_sequence(%in : memref<16xi32>, %out : memref<16xi32>) {
+      %c0 = arith.constant 0 : i64
+      %c1 = arith.constant 1 : i64
+      %c16 = arith.constant 16 : i64
+      aiex.npu.dma_memcpy_nd(%out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c16][%c0,%c0,%c0,%c1]) {metadata = @of_out, id = 1 : i64} : memref<16xi32>
+      aiex.npu.dma_memcpy_nd(%in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c16][%c0,%c0,%c0,%c1]) {metadata = @of_in, id = 0 : i64, issue_token = true} : memref<16xi32>
+      aiex.npu.dma_wait {symbol = @of_out}
+    }
+  }
+}
diff --git a/test/npu-xrt/add_one_func_link_with_chess/add_one_kernel.cc b/test/npu-xrt/add_one_func_link_with_chess/add_one_kernel.cc
new file mode 100644
index 00000000000..8e12df48812
--- /dev/null
+++ b/test/npu-xrt/add_one_func_link_with_chess/add_one_kernel.cc
@@ -0,0 +1,28 @@
+//===- add_one_kernel.cc -----------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+//
+// External AIE kernel compiled with xchesscc_wrapper and linked via func-level
+// link_with on func.func.  Increments every element of a buffer by 1.
+//
+//===----------------------------------------------------------------------===//
+
+#define NOCPP
+
+#include <stdint.h>
+
+extern "C" {
+
+void add_one(int32_t *__restrict in, int32_t *__restrict out, int32_t n) {
+  for (int32_t i = 0; i < n; i++)
+    out[i] = in[i] + 1;
+}
+
+} // extern "C"
diff --git a/test/npu-xrt/add_one_func_link_with_chess/aie.mlir b/test/npu-xrt/add_one_func_link_with_chess/aie.mlir
new file mode 100644
index 00000000000..03e67966a8e
--- /dev/null
+++ b/test/npu-xrt/add_one_func_link_with_chess/aie.mlir
@@ -0,0 +1,60 @@
+//===- aie.mlir ------------------------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+//
+// End-to-end test for func-level link_with (Chess/xbridge backend).
+//
+// A func.func declaration carries {link_with = "add_one_kernel.o"}.  The
+// aie-assign-core-link-files pass (run inside aiecc) traces the CallOp inside
+// the core and populates the core's link_files attribute, which the BCF emitter
+// turns into _include _file directives consumed by xbridge.
+//
+//===----------------------------------------------------------------------===//
+
+module {
+  aie.device(NPUDEVICE) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_2 = aie.tile(0, 2)
+
+    aie.objectfifo @of_in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+    aie.objectfifo @of_out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+
+    // func-level link_with: the kernel .o is declared here, not on aie.core.
+    func.func private @add_one(memref<8xi32>, memref<8xi32>, i32) attributes {link_with = "add_one_kernel.o"}
+
+    aie.core(%tile_0_2) {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c8 = arith.constant 8 : index
+      %n  = arith.constant 8 : i32
+
+      scf.for %i = %c0 to %c8 step %c1 {
+        %sub_in  = aie.objectfifo.acquire @of_in(Consume, 1)  : !aie.objectfifosubview<memref<8xi32>>
+        %elem_in = aie.objectfifo.subview.access %sub_in[0]   : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+        %sub_out = aie.objectfifo.acquire @of_out(Produce, 1) : !aie.objectfifosubview<memref<8xi32>>
+        %elem_out = aie.objectfifo.subview.access %sub_out[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+
+        func.call @add_one(%elem_in, %elem_out, %n) : (memref<8xi32>, memref<8xi32>, i32) -> ()
+
+        aie.objectfifo.release @of_in(Consume, 1)
+        aie.objectfifo.release @of_out(Produce, 1)
+      }
+      aie.end
+    }
+
+    aie.runtime_sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) {
+      %c0  = arith.constant 0 : i64
+      %c1  = arith.constant 1 : i64
+      %c64 = arith.constant 64 : i64
+      aiex.npu.dma_memcpy_nd(%out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0,%c1]) {metadata = @of_out, id = 1 : i64} : memref<64xi32>
+      aiex.npu.dma_memcpy_nd(%in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0,%c1])  {metadata = @of_in,  id = 0 : i64, issue_token = true} : memref<64xi32>
+      aiex.npu.dma_wait {symbol = @of_out}
+    }
+  }
+}
diff --git a/test/npu-xrt/add_one_func_link_with_chess/run.lit b/test/npu-xrt/add_one_func_link_with_chess/run.lit
new file mode 100644
index 00000000000..656c98d3972
--- /dev/null
+++ b/test/npu-xrt/add_one_func_link_with_chess/run.lit
@@ -0,0 +1,22 @@
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// End-to-end test for func-level link_with using the Chess/xbridge backend.
+//
+// The kernel is compiled to a .o by xchesscc_wrapper, then linked via func-level
+// link_with on the func.func declaration.  aiecc (C++ driver) runs the
+// aie-assign-core-link-files pass, which traces the func::CallOp inside the
+// core and populates link_files on the CoreOp, which the BCF emitter turns into
+// _include _file directives consumed by xbridge.
+//
+// RUN: cp %S/aie.mlir aie_arch.mlir
+// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir
+// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir
+// RUN: %run_on_npu1% xchesscc_wrapper aie2  -I %aietools/include -c %S/add_one_kernel.cc -o ./add_one_kernel.o
+// RUN: %run_on_npu2% xchesscc_wrapper aie2p -I %aietools/include -c %S/add_one_kernel.cc -o ./add_one_kernel.o
+// RUN: aiecc --xchesscc --xbridge --aie-generate-xclbin --xclbin-name=aie.xclbin --aie-generate-npu-insts --npu-insts-name=insts.bin ./aie_arch.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin
+// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin
diff --git a/test/npu-xrt/add_one_func_link_with_chess/test.cpp b/test/npu-xrt/add_one_func_link_with_chess/test.cpp
new file mode 100644
index 00000000000..939e9b6f742
--- /dev/null
+++ b/test/npu-xrt/add_one_func_link_with_chess/test.cpp
@@ -0,0 +1,119 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+//
+// Host test for add_one_func_link_with_chess.
+// Sends 64 i32 values (1..64) through the AIE core; each is incremented by 1
+// externally by add_one_kernel.o, linked via func-level link_with.
+// Expected output: 2..65.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "cxxopts.hpp"
+#include "test_utils.h"
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr int IN_SIZE = 64;
+constexpr int OUT_SIZE = 64;
+
+int main(int argc, const char *argv[]) {
+  cxxopts::Options options("add_one_func_link_with_chess");
+  test_utils::add_default_options(options);
+
+  cxxopts::ParseResult vm;
+  test_utils::parse_options(argc, argv, options, vm);
+
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_binary(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  std::string Node = vm["kernel"].as<std::string>();
+  auto xkernels = xclbin.get_kernels();
+  auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(),
+                               [Node](xrt::xclbin::kernel &k) {
+                                 return k.get_name().rfind(Node, 0) == 0;
+                               });
+  auto kernelName = xkernel.get_name();
+
+  device.register_xclbin(xclbin);
+  xrt::hw_context context(device, xclbin.get_uuid());
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+
+  // Fill input: values 1..64
+  uint32_t *bufInA = bo_inA.map<uint32_t *>();
+  for (int i = 0; i < IN_SIZE; i++)
+    bufInA[i] = i + 1;
+
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  if (verbosity >= 1)
+    std::cout << "Running Kernel.\n";
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  uint32_t *bufOut = bo_out.map<uint32_t *>();
+
+  int errors = 0;
+  for (int i = 0; i < OUT_SIZE; i++) {
+    uint32_t expected = i + 2; // input i+1, add_one adds 1 → i+2
+    if (bufOut[i] != expected) {
+      std::cout << "Error at [" << i << "]: got " << bufOut[i] << ", expected "
+                << expected << "\n";
+      errors++;
+    } else if (verbosity >= 1) {
+      std::cout << "OK [" << i << "]: " << bufOut[i] << "\n";
+    }
+  }
+
+  if (!errors) {
+    std::cout << "\nPASS!\n\n";
+    return 0;
+  }
+  std::cout << "\nfailed.\n\n";
+  return 1;
+}
diff --git a/test/npu-xrt/add_one_func_link_with_peano/add_one_kernel.cc b/test/npu-xrt/add_one_func_link_with_peano/add_one_kernel.cc
new file mode 100644
index 00000000000..1e28d014233
--- /dev/null
+++ b/test/npu-xrt/add_one_func_link_with_peano/add_one_kernel.cc
@@ -0,0 +1,29 @@
+//===- add_one_kernel.cc -----------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+//
+// External AIE kernel: add 1 to every element of a buffer.
+// Compiled to add_one_kernel.o and linked via func-level link_with on
+// func.func.
+//
+//===----------------------------------------------------------------------===//
+
+#define NOCPP
+
+#include <stdint.h>
+
+extern "C" {
+
+void add_one(int32_t *__restrict in, int32_t *__restrict out, int32_t n) {
+  for (int32_t i = 0; i < n; i++)
+    out[i] = in[i] + 1;
+}
+
+} // extern "C"
diff --git a/test/npu-xrt/add_one_func_link_with_peano/aie.mlir b/test/npu-xrt/add_one_func_link_with_peano/aie.mlir
new file mode 100644
index 00000000000..576104bbd5b
--- /dev/null
+++ b/test/npu-xrt/add_one_func_link_with_peano/aie.mlir
@@ -0,0 +1,62 @@
+//===- aie.mlir ------------------------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+//
+// End-to-end test for func-level link_with (Peano/lld backend).
+//
+// A func.func declaration carries {link_with = "add_one_kernel.o"}.  The
+// aie-assign-core-link-files pass (run inside aiecc) traces the CallOp inside
+// the core and populates the core's link_files attribute, which the ldscript
+// emitter turns into INPUT() directives.  The Peano copy loop copies the .o
+// to the .prj directory so lld can find it.
+//
+//===----------------------------------------------------------------------===//
+
+module {
+  aie.device(NPUDEVICE) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_2 = aie.tile(0, 2)
+
+    aie.objectfifo @of_in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+    aie.objectfifo @of_out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+
+    // func-level link_with: the kernel .o is declared here, not on aie.core.
+    func.func private @add_one(memref<8xi32>, memref<8xi32>, i32) attributes {link_with = "add_one_kernel.o"}
+
+    aie.core(%tile_0_2) {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c8 = arith.constant 8 : index
+      %n  = arith.constant 8 : i32
+
+      scf.for %i = %c0 to %c8 step %c1 {
+        %sub_in  = aie.objectfifo.acquire @of_in(Consume, 1)  : !aie.objectfifosubview<memref<8xi32>>
+        %elem_in = aie.objectfifo.subview.access %sub_in[0]   : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+        %sub_out = aie.objectfifo.acquire @of_out(Produce, 1) : !aie.objectfifosubview<memref<8xi32>>
+        %elem_out = aie.objectfifo.subview.access %sub_out[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+
+        func.call @add_one(%elem_in, %elem_out, %n) : (memref<8xi32>, memref<8xi32>, i32) -> ()
+
+        aie.objectfifo.release @of_in(Consume, 1)
+        aie.objectfifo.release @of_out(Produce, 1)
+      }
+      aie.end
+    }
+
+    aie.runtime_sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) {
+      %c0  = arith.constant 0 : i64
+      %c1  = arith.constant 1 : i64
+      %c8  = arith.constant 8 : i64
+      %c64 = arith.constant 64 : i64
+      aiex.npu.dma_memcpy_nd(%out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0,%c1]) {metadata = @of_out, id = 1 : i64} : memref<64xi32>
+      aiex.npu.dma_memcpy_nd(%in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0,%c1])  {metadata = @of_in,  id = 0 : i64, issue_token = true} : memref<64xi32>
+      aiex.npu.dma_wait {symbol = @of_out}
+    }
+  }
+}
diff --git a/test/npu-xrt/add_one_func_link_with_peano/run.lit b/test/npu-xrt/add_one_func_link_with_peano/run.lit
new file mode 100644
index 00000000000..dab28dd7299
--- /dev/null
+++ b/test/npu-xrt/add_one_func_link_with_peano/run.lit
@@ -0,0 +1,23 @@
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, peano
+//
+// End-to-end test for func-level link_with using the Peano/lld backend.
+//
+// The kernel is compiled to a .o by peano clang (target derived from the
+// device), then linked via func-level link_with on the func.func declaration.
+// aiecc (C++ driver) runs the aie-assign-core-link-files pass, which traces
+// the func::CallOp inside the core and populates link_files on the CoreOp,
+// which the ldscript emitter turns into an INPUT() directive.  The Peano
+// copy loop copies the .o to the .prj tmpdir so lld can find it.
+//
+// RUN: cp %S/aie.mlir aie_arch.mlir
+// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir
+// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir
+// RUN: %run_on_npu1% %PEANO_INSTALL_DIR/bin/clang --target=aie2-none-unknown-elf -O2 -c %S/add_one_kernel.cc -o ./add_one_kernel.o
+// RUN: %run_on_npu2% %PEANO_INSTALL_DIR/bin/clang --target=aie2p-none-unknown-elf -O2 -c %S/add_one_kernel.cc -o ./add_one_kernel.o
+// RUN: aiecc --no-xchesscc --no-xbridge --aie-generate-xclbin --xclbin-name=aie.xclbin --aie-generate-npu-insts --npu-insts-name=insts.bin ./aie_arch.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin
+// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin
diff --git a/test/npu-xrt/add_one_func_link_with_peano/test.cpp b/test/npu-xrt/add_one_func_link_with_peano/test.cpp
new file mode 100644
index 00000000000..458fc47f53f
--- /dev/null
+++ b/test/npu-xrt/add_one_func_link_with_peano/test.cpp
@@ -0,0 +1,119 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+//
+// Host test for add_one_func_link_with_peano.
+// Sends 64 i32 values (1..64) through the AIE core; each is incremented by 1
+// externally by add_one_kernel.o, linked via func-level link_with.
+// Expected output: 2..65.
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "cxxopts.hpp"
+#include "test_utils.h"
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr int IN_SIZE = 64;
+constexpr int OUT_SIZE = 64;
+
+int main(int argc, const char *argv[]) {
+  cxxopts::Options options("add_one_func_link_with_peano");
+  test_utils::add_default_options(options);
+
+  cxxopts::ParseResult vm;
+  test_utils::parse_options(argc, argv, options, vm);
+
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_binary(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  // Select the first kernel whose name starts with Node; fail if none does.
+  std::string Node = vm["kernel"].as<std::string>();
+  auto xkernels = xclbin.get_kernels();
+  auto xkernelIt = std::find_if(xkernels.begin(), xkernels.end(),
+                                [&Node](xrt::xclbin::kernel &k) {
+                                  return k.get_name().rfind(Node, 0) == 0;
+                                });
+  if (xkernelIt == xkernels.end()) {
+    std::cout << "No kernel with prefix '" << Node << "' found in xclbin.\n";
+    return 1;
+  }
+  auto kernelName = xkernelIt->get_name();
+  device.register_xclbin(xclbin);
+  xrt::hw_context context(device, xclbin.get_uuid());
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+
+  // Fill input: values 1..64
+  uint32_t *bufInA = bo_inA.map<uint32_t *>();
+  for (int i = 0; i < IN_SIZE; i++)
+    bufInA[i] = i + 1;
+
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  if (verbosity >= 1)
+    std::cout << "Running Kernel.\n";
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  uint32_t *bufOut = bo_out.map<uint32_t *>();
+
+  int errors = 0;
+  for (int i = 0; i < OUT_SIZE; i++) {
+    uint32_t expected = i + 2; // input i+1, add_one adds 1 → i+2
+    if (bufOut[i] != expected) {
+      std::cout << "Error at [" << i << "]: got " << bufOut[i] << ", expected "
+                << expected << "\n";
+      errors++;
+    } else if (verbosity >= 1) {
+      std::cout << "OK [" << i << "]: " << bufOut[i] << "\n";
+    }
+  }
+
+  std::cout << (errors ? "\nfailed.\n\n" : "\nPASS!\n\n");
+  return errors ? 1 : 0;
+}
diff --git a/test/npu-xrt/add_one_scale_func_link_with_chess/add_one_kernel.cc b/test/npu-xrt/add_one_scale_func_link_with_chess/add_one_kernel.cc
new file mode 100644
index 00000000000..694ee947ecc
--- /dev/null
+++ b/test/npu-xrt/add_one_scale_func_link_with_chess/add_one_kernel.cc
@@ -0,0 +1,28 @@
+//===- add_one_kernel.cc -----------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+//
+// External AIE kernel: copy input to output, adding 1 to every element.
+// Compiled to add_one_kernel.o and linked via func-level link_with.
+//
+//===----------------------------------------------------------------------===//
+
+#define NOCPP
+
+#include <stdint.h>
+
+extern "C" {
+
+void add_one(int32_t *__restrict in, int32_t *__restrict out, int32_t n) {
+  for (int32_t idx = 0; idx < n; ++idx)
+    out[idx] = 1 + in[idx];
+}
+
+} // extern "C"
diff --git a/test/npu-xrt/add_one_scale_func_link_with_chess/aie.mlir b/test/npu-xrt/add_one_scale_func_link_with_chess/aie.mlir
new file mode 100644
index 00000000000..dda65af4e99
--- /dev/null
+++ b/test/npu-xrt/add_one_scale_func_link_with_chess/aie.mlir
@@ -0,0 +1,67 @@
+//===- aie.mlir ------------------------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+//
+// End-to-end test for func-level link_with with multiple .o files (Chess/xbridge).
+//
+// Two func.func declarations each carry a distinct link_with attribute.
+// aie-assign-core-link-files (run inside aiecc) traces both CallOps inside
+// the core and produces link_files = ["add_one_kernel.o", "scale_kernel.o"]
+// on the CoreOp.  The BCF emitter turns each into an _include _file directive,
+// and xbridge links both .o files into the core ELF.
+//
+//===----------------------------------------------------------------------===//
+
+module {
+  aie.device(NPUDEVICE) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_2 = aie.tile(0, 2)
+
+    aie.objectfifo @of_in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+    aie.objectfifo @of_out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+
+    // Two func-level link_withs — each refers to a different .o file.
+    // aie-assign-core-link-files aggregates both into the core's link_files.
+    func.func private @add_one(memref<8xi32>, memref<8xi32>, i32) attributes {link_with = "add_one_kernel.o"}
+    func.func private @scale_by_two(memref<8xi32>, memref<8xi32>, i32) attributes {link_with = "scale_kernel.o"}
+
+    aie.core(%tile_0_2) {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c8 = arith.constant 8 : index
+      %n  = arith.constant 8 : i32
+
+      scf.for %i = %c0 to %c8 step %c1 {
+        %sub_in  = aie.objectfifo.acquire @of_in(Consume, 1)  : !aie.objectfifosubview<memref<8xi32>>
+        %elem_in = aie.objectfifo.subview.access %sub_in[0]   : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+        %sub_out = aie.objectfifo.acquire @of_out(Produce, 1) : !aie.objectfifosubview<memref<8xi32>>
+        %elem_out = aie.objectfifo.subview.access %sub_out[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+
+        // Step 1: add_one_kernel.o — out[i] = in[i] + 1
+        func.call @add_one(%elem_in, %elem_out, %n) : (memref<8xi32>, memref<8xi32>, i32) -> ()
+        // Step 2: scale_kernel.o — out[i] = out[i] * 2 (in-place via two-pointer form)
+        func.call @scale_by_two(%elem_out, %elem_out, %n) : (memref<8xi32>, memref<8xi32>, i32) -> ()
+
+        aie.objectfifo.release @of_in(Consume, 1)
+        aie.objectfifo.release @of_out(Produce, 1)
+      }
+      aie.end
+    }
+
+    // %buf is unused: it keeps %out at argument index 2, where test.cpp passes bo_out.
+    aie.runtime_sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) {
+      %c0  = arith.constant 0 : i64
+      %c1  = arith.constant 1 : i64
+      %c64 = arith.constant 64 : i64
+      aiex.npu.dma_memcpy_nd(%out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0,%c1]) {metadata = @of_out, id = 1 : i64} : memref<64xi32>
+      aiex.npu.dma_memcpy_nd(%in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0,%c1])  {metadata = @of_in,  id = 0 : i64, issue_token = true} : memref<64xi32>
+      aiex.npu.dma_wait {symbol = @of_out}
+    }
+  }
+}
diff --git a/test/npu-xrt/add_one_scale_func_link_with_chess/run.lit b/test/npu-xrt/add_one_scale_func_link_with_chess/run.lit
new file mode 100644
index 00000000000..860745d1618
--- /dev/null
+++ b/test/npu-xrt/add_one_scale_func_link_with_chess/run.lit
@@ -0,0 +1,31 @@
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, chess
+//
+// End-to-end test for func-level link_with with MULTIPLE .o files (Chess/xbridge).
+//
+// Two func.func declarations each carry a distinct link_with attribute:
+//   @add_one      → add_one_kernel.o
+//   @scale_by_two → scale_kernel.o
+// aie-assign-core-link-files traces both CallOps inside the core and
+// produces link_files = ["add_one_kernel.o", "scale_kernel.o"] on the
+// CoreOp.  The BCF emitter emits an _include _file directive for each,
+// and xbridge links both .o files into the core ELF.
+//
+// The kernel pipeline per tile iteration:
+//   1. add_one(in, out, n)   — out[i] = in[i] + 1
+//   2. scale_by_two(out, out, n)  — out[i] *= 2  (in-place, same buf for in and out)
+// Expected output: (input + 1) * 2.
+//
+// RUN: cp %S/aie.mlir aie_arch.mlir
+// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir
+// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir
+// RUN: %run_on_npu1% xchesscc_wrapper aie2  -I %aietools/include -c %S/add_one_kernel.cc -o ./add_one_kernel.o
+// RUN: %run_on_npu2% xchesscc_wrapper aie2p -I %aietools/include -c %S/add_one_kernel.cc -o ./add_one_kernel.o
+// RUN: %run_on_npu1% xchesscc_wrapper aie2  -I %aietools/include -c %S/scale_kernel.cc -o ./scale_kernel.o
+// RUN: %run_on_npu2% xchesscc_wrapper aie2p -I %aietools/include -c %S/scale_kernel.cc -o ./scale_kernel.o
+// RUN: aiecc --xchesscc --xbridge --aie-generate-xclbin --xclbin-name=aie.xclbin --aie-generate-npu-insts --npu-insts-name=insts.bin ./aie_arch.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin
+// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin
diff --git a/test/npu-xrt/add_one_scale_func_link_with_chess/scale_kernel.cc b/test/npu-xrt/add_one_scale_func_link_with_chess/scale_kernel.cc
new file mode 100644
index 00000000000..e8d2c4a258c
--- /dev/null
+++ b/test/npu-xrt/add_one_scale_func_link_with_chess/scale_kernel.cc
@@ -0,0 +1,41 @@
+//===- scale_kernel.cc -------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+//
+// External AIE kernel: multiply every element of `in` by 2 and store it in
+// `out`; the test calls it in place (in == out).  Manually unrolled for n=8
+// (the fixed tile buffer size) to avoid a chess compiler bug where software
+// pipelining sets lc=1 for loops with n < 9, causing only 1 iteration to
+// execute.  Compiled to scale_kernel.o and linked via func-level link_with
+// alongside add_one_kernel.o — exercises multi-.o linking through the
+// func-level link_with path.
+//
+//===----------------------------------------------------------------------===//
+
+#define NOCPP
+
+#include <stdint.h>
+
+extern "C" {
+
+void scale_by_two(int32_t *in, int32_t *out, int32_t n) {
+  // Unrolled for n=8 (chess lc=1 bug, n<9); no __restrict: in may alias out.
+  (void)n;
+  out[0] = in[0] + in[0];
+  out[1] = in[1] + in[1];
+  out[2] = in[2] + in[2];
+  out[3] = in[3] + in[3];
+  out[4] = in[4] + in[4];
+  out[5] = in[5] + in[5];
+  out[6] = in[6] + in[6];
+  out[7] = in[7] + in[7];
+}
+
+} // extern "C"
diff --git a/test/npu-xrt/add_one_scale_func_link_with_chess/test.cpp b/test/npu-xrt/add_one_scale_func_link_with_chess/test.cpp
new file mode 100644
index 00000000000..e17014cd899
--- /dev/null
+++ b/test/npu-xrt/add_one_scale_func_link_with_chess/test.cpp
@@ -0,0 +1,122 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+//
+// Host test for add_one_scale_func_link_with_chess.
+// Sends 64 i32 values (1..64) through the AIE core; each is first
+// incremented by 1 (add_one_kernel.o) then doubled (scale_kernel.o),
+// both linked via func-level link_with.
+// Expected output: (i + 2) * 2  for i in 0..63  (i.e. input i+1 → i+2 →
+// 2*(i+2)).
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "cxxopts.hpp"
+#include "test_utils.h"
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr int IN_SIZE = 64;
+constexpr int OUT_SIZE = 64;
+
+int main(int argc, const char *argv[]) {
+  cxxopts::Options options("add_one_scale_func_link_with_chess");
+  test_utils::add_default_options(options);
+
+  cxxopts::ParseResult vm;
+  test_utils::parse_options(argc, argv, options, vm);
+
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_binary(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  // Select the first kernel whose name starts with Node; fail if none does.
+  std::string Node = vm["kernel"].as<std::string>();
+  auto xkernels = xclbin.get_kernels();
+  auto xkernelIt = std::find_if(xkernels.begin(), xkernels.end(),
+                                [&Node](xrt::xclbin::kernel &k) {
+                                  return k.get_name().rfind(Node, 0) == 0;
+                                });
+  if (xkernelIt == xkernels.end()) {
+    std::cout << "No kernel with prefix '" << Node << "' found in xclbin.\n";
+    return 1;
+  }
+  auto kernelName = xkernelIt->get_name();
+  device.register_xclbin(xclbin);
+  xrt::hw_context context(device, xclbin.get_uuid());
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+
+  // Fill input: values 1..64
+  uint32_t *bufInA = bo_inA.map<uint32_t *>();
+  for (int i = 0; i < IN_SIZE; i++)
+    bufInA[i] = i + 1;
+
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  if (verbosity >= 1)
+    std::cout << "Running Kernel.\n";
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  uint32_t *bufOut = bo_out.map<uint32_t *>();
+
+  int errors = 0;
+  for (int i = 0; i < OUT_SIZE; i++) {
+    // input[i] = i+1; add_one → i+2; scale_by_two → (i+2)*2
+    uint32_t expected = (i + 2) * 2;
+    if (bufOut[i] != expected) {
+      std::cout << "Error at [" << i << "]: got " << bufOut[i] << ", expected "
+                << expected << "\n";
+      errors++;
+    } else if (verbosity >= 1) {
+      std::cout << "OK [" << i << "]: " << bufOut[i] << "\n";
+    }
+  }
+
+  std::cout << (errors ? "\nfailed.\n\n" : "\nPASS!\n\n");
+  return errors ? 1 : 0;
+}
diff --git a/test/npu-xrt/add_one_scale_func_link_with_peano/add_one_kernel.cc b/test/npu-xrt/add_one_scale_func_link_with_peano/add_one_kernel.cc
new file mode 100644
index 00000000000..694ee947ecc
--- /dev/null
+++ b/test/npu-xrt/add_one_scale_func_link_with_peano/add_one_kernel.cc
@@ -0,0 +1,28 @@
+//===- add_one_kernel.cc -----------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+//
+// External AIE kernel: copy input to output, adding 1 to every element.
+// Compiled to add_one_kernel.o and linked via func-level link_with.
+//
+//===----------------------------------------------------------------------===//
+
+#define NOCPP
+
+#include <stdint.h>
+
+extern "C" {
+
+void add_one(int32_t *__restrict in, int32_t *__restrict out, int32_t n) {
+  for (int32_t idx = 0; idx < n; ++idx)
+    out[idx] = 1 + in[idx];
+}
+
+} // extern "C"
diff --git a/test/npu-xrt/add_one_scale_func_link_with_peano/aie.mlir b/test/npu-xrt/add_one_scale_func_link_with_peano/aie.mlir
new file mode 100644
index 00000000000..5eb113c2365
--- /dev/null
+++ b/test/npu-xrt/add_one_scale_func_link_with_peano/aie.mlir
@@ -0,0 +1,67 @@
+//===- aie.mlir ------------------------------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+//
+// End-to-end test for func-level link_with with multiple .o files (Peano/lld).
+//
+// Two func.func declarations each carry a distinct link_with attribute.
+// aie-assign-core-link-files (run inside aiecc) traces both CallOps inside
+// the core and produces link_files = ["add_one_kernel.o", "scale_kernel.o"]
+// on the CoreOp.  The ldscript emitter turns each into an INPUT() directive,
+// and the Peano copy loop copies both .o files to the .prj tmpdir for lld.
+//
+//===----------------------------------------------------------------------===//
+
+module {
+  aie.device(NPUDEVICE) {
+    %tile_0_0 = aie.tile(0, 0)
+    %tile_0_2 = aie.tile(0, 2)
+
+    aie.objectfifo @of_in(%tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+    aie.objectfifo @of_out(%tile_0_2, {%tile_0_0}, 2 : i32) : !aie.objectfifo<memref<8xi32>>
+
+    // Two func-level link_withs — each refers to a different .o file.
+    // aie-assign-core-link-files aggregates both into the core's link_files.
+    func.func private @add_one(memref<8xi32>, memref<8xi32>, i32) attributes {link_with = "add_one_kernel.o"}
+    func.func private @scale_by_two(memref<8xi32>, memref<8xi32>, i32) attributes {link_with = "scale_kernel.o"}
+
+    aie.core(%tile_0_2) {
+      %c0 = arith.constant 0 : index
+      %c1 = arith.constant 1 : index
+      %c8 = arith.constant 8 : index
+      %n  = arith.constant 8 : i32
+
+      scf.for %i = %c0 to %c8 step %c1 {
+        %sub_in  = aie.objectfifo.acquire @of_in(Consume, 1)  : !aie.objectfifosubview<memref<8xi32>>
+        %elem_in = aie.objectfifo.subview.access %sub_in[0]   : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+        %sub_out = aie.objectfifo.acquire @of_out(Produce, 1) : !aie.objectfifosubview<memref<8xi32>>
+        %elem_out = aie.objectfifo.subview.access %sub_out[0] : !aie.objectfifosubview<memref<8xi32>> -> memref<8xi32>
+
+        // Step 1: add_one_kernel.o — out[i] = in[i] + 1
+        func.call @add_one(%elem_in, %elem_out, %n) : (memref<8xi32>, memref<8xi32>, i32) -> ()
+        // Step 2: scale_kernel.o — out[i] = out[i] * 2 (in-place via two-pointer form)
+        func.call @scale_by_two(%elem_out, %elem_out, %n) : (memref<8xi32>, memref<8xi32>, i32) -> ()
+
+        aie.objectfifo.release @of_in(Consume, 1)
+        aie.objectfifo.release @of_out(Produce, 1)
+      }
+      aie.end
+    }
+
+    aie.runtime_sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) {
+      %c0  = arith.constant 0 : i64
+      %c1  = arith.constant 1 : i64
+      %c8  = arith.constant 8 : i64
+      %c64 = arith.constant 64 : i64
+      aiex.npu.dma_memcpy_nd(%out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0,%c1]) {metadata = @of_out, id = 1 : i64} : memref<64xi32>
+      aiex.npu.dma_memcpy_nd(%in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0,%c1])  {metadata = @of_in,  id = 0 : i64, issue_token = true} : memref<64xi32>
+      aiex.npu.dma_wait {symbol = @of_out}
+    }
+  }
+}
diff --git a/test/npu-xrt/add_one_scale_func_link_with_peano/run.lit b/test/npu-xrt/add_one_scale_func_link_with_peano/run.lit
new file mode 100644
index 00000000000..8211c420009
--- /dev/null
+++ b/test/npu-xrt/add_one_scale_func_link_with_peano/run.lit
@@ -0,0 +1,31 @@
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// REQUIRES: ryzen_ai, peano
+//
+// End-to-end test for func-level link_with with MULTIPLE .o files (Peano/lld).
+//
+// Two func.func declarations each carry a distinct link_with attribute:
+//   @add_one    → add_one_kernel.o
+//   @scale_by_two → scale_kernel.o
+// aie-assign-core-link-files traces both CallOps inside the core and
+// produces link_files = ["add_one_kernel.o", "scale_kernel.o"] on the
+// CoreOp.  The Peano copy loop copies both .o files to the .prj tmpdir so
+// lld links them together.
+//
+// The kernel pipeline per tile iteration:
+//   1. add_one(in, out, n)   — out[i] = in[i] + 1
+//   2. scale_by_two(out, out, n)  — out[i] *= 2  (in-place, same buf for in and out)
+// Expected output: (input + 1) * 2.
+//
+// RUN: cp %S/aie.mlir aie_arch.mlir
+// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir
+// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir
+// RUN: %run_on_npu1% %PEANO_INSTALL_DIR/bin/clang --target=aie2-none-unknown-elf  -O2 -c %S/add_one_kernel.cc -o ./add_one_kernel.o
+// RUN: %run_on_npu2% %PEANO_INSTALL_DIR/bin/clang --target=aie2p-none-unknown-elf -O2 -c %S/add_one_kernel.cc -o ./add_one_kernel.o
+// RUN: %run_on_npu1% %PEANO_INSTALL_DIR/bin/clang --target=aie2-none-unknown-elf  -O2 -c %S/scale_kernel.cc -o ./scale_kernel.o
+// RUN: %run_on_npu2% %PEANO_INSTALL_DIR/bin/clang --target=aie2p-none-unknown-elf -O2 -c %S/scale_kernel.cc -o ./scale_kernel.o
+// RUN: aiecc --no-xchesscc --no-xbridge --aie-generate-xclbin --xclbin-name=aie.xclbin --aie-generate-npu-insts --npu-insts-name=insts.bin ./aie_arch.mlir
+// RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags
+// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin
+// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin
diff --git a/test/npu-xrt/add_one_scale_func_link_with_peano/scale_kernel.cc b/test/npu-xrt/add_one_scale_func_link_with_peano/scale_kernel.cc
new file mode 100644
index 00000000000..06dfcbed6e0
--- /dev/null
+++ b/test/npu-xrt/add_one_scale_func_link_with_peano/scale_kernel.cc
@@ -0,0 +1,33 @@
+//===- scale_kernel.cc -------------------------------------------*- C++
+//-*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+//
+// External AIE kernel: multiply every element of `in` by 2 and store the
+// result in `out`.  The test passes the same memref for both in and out to
+// perform an in-place scale after add_one_kernel.  The pointers are left
+// without __restrict so the compiler may not assume in and out are distinct.
+// Compiled to scale_kernel.o and linked via func-level link_with alongside
+// add_one_kernel.o — exercises multi-.o linking through the func-level
+// link_with path.
+//
+//===----------------------------------------------------------------------===//
+
+#define NOCPP
+
+#include <stdint.h>
+
+extern "C" {
+
+void scale_by_two(int32_t *in, int32_t *out, int32_t n) {
+  for (int32_t idx = 0; idx < n; ++idx)
+    out[idx] = in[idx] + in[idx];
+}
+
+} // extern "C"
diff --git a/test/npu-xrt/add_one_scale_func_link_with_peano/test.cpp b/test/npu-xrt/add_one_scale_func_link_with_peano/test.cpp
new file mode 100644
index 00000000000..5f2267bf4a1
--- /dev/null
+++ b/test/npu-xrt/add_one_scale_func_link_with_peano/test.cpp
@@ -0,0 +1,122 @@
+//===- test.cpp -------------------------------------------------*- C++ -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+//
+// Host test for add_one_scale_func_link_with_peano.
+// Sends 64 i32 values (1..64) through the AIE core; each is first
+// incremented by 1 (add_one_kernel.o) then doubled (scale_kernel.o),
+// both linked via func-level link_with.
+// Expected output: (i + 2) * 2  for i in 0..63  (i.e. input i+1 → i+2 →
+// 2*(i+2)).
+//
+//===----------------------------------------------------------------------===//
+
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "cxxopts.hpp"
+#include "test_utils.h"
+#include "xrt/xrt_bo.h"
+#include "xrt/xrt_device.h"
+#include "xrt/xrt_kernel.h"
+
+constexpr int IN_SIZE = 64;
+constexpr int OUT_SIZE = 64;
+
+int main(int argc, const char *argv[]) {
+  cxxopts::Options options("add_one_scale_func_link_with_peano");
+  test_utils::add_default_options(options);
+
+  cxxopts::ParseResult vm;
+  test_utils::parse_options(argc, argv, options, vm);
+
+  std::vector<uint32_t> instr_v =
+      test_utils::load_instr_binary(vm["instr"].as<std::string>());
+
+  int verbosity = vm["verbosity"].as<int>();
+  if (verbosity >= 1)
+    std::cout << "Sequence instr count: " << instr_v.size() << "\n";
+
+  unsigned int device_index = 0;
+  auto device = xrt::device(device_index);
+
+  if (verbosity >= 1)
+    std::cout << "Loading xclbin: " << vm["xclbin"].as<std::string>() << "\n";
+  auto xclbin = xrt::xclbin(vm["xclbin"].as<std::string>());
+
+  // Select the first kernel whose name starts with Node; fail if none does.
+  std::string Node = vm["kernel"].as<std::string>();
+  auto xkernels = xclbin.get_kernels();
+  auto xkernelIt = std::find_if(xkernels.begin(), xkernels.end(),
+                                [&Node](xrt::xclbin::kernel &k) {
+                                  return k.get_name().rfind(Node, 0) == 0;
+                                });
+  if (xkernelIt == xkernels.end()) {
+    std::cout << "No kernel with prefix '" << Node << "' found in xclbin.\n";
+    return 1;
+  }
+  auto kernelName = xkernelIt->get_name();
+  device.register_xclbin(xclbin);
+  xrt::hw_context context(device, xclbin.get_uuid());
+  auto kernel = xrt::kernel(context, kernelName);
+
+  auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int),
+                          XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
+  auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
+  auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
+                        XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5));
+
+  // Fill input: values 1..64
+  uint32_t *bufInA = bo_inA.map<uint32_t *>();
+  for (int i = 0; i < IN_SIZE; i++)
+    bufInA[i] = i + 1;
+
+  void *bufInstr = bo_instr.map<void *>();
+  memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int));
+
+  bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+  bo_inA.sync(XCL_BO_SYNC_BO_TO_DEVICE);
+
+  if (verbosity >= 1)
+    std::cout << "Running Kernel.\n";
+  unsigned int opcode = 3;
+  auto run = kernel(opcode, bo_instr, instr_v.size(), bo_inA, bo_inB, bo_out);
+  ert_cmd_state r = run.wait();
+  if (r != ERT_CMD_STATE_COMPLETED) {
+    std::cout << "Kernel did not complete. Returned status: " << r << "\n";
+    return 1;
+  }
+
+  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+  uint32_t *bufOut = bo_out.map<uint32_t *>();
+
+  int errors = 0;
+  for (int i = 0; i < OUT_SIZE; i++) {
+    // input[i] = i+1; add_one → i+2; scale_by_two → (i+2)*2
+    uint32_t expected = (i + 2) * 2;
+    if (bufOut[i] != expected) {
+      std::cout << "Error at [" << i << "]: got " << bufOut[i] << ", expected "
+                << expected << "\n";
+      errors++;
+    } else if (verbosity >= 1) {
+      std::cout << "OK [" << i << "]: " << bufOut[i] << "\n";
+    }
+  }
+
+  std::cout << (errors ? "\nfailed.\n\n" : "\nPASS!\n\n");
+  return errors ? 1 : 0;
+}

From 496a1befee01cd83ab9066fa19d56d45d106af81 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 5 Mar 2026 14:39:30 -0700
Subject: [PATCH 04/28] [aiecc] Fix unistd.h include causing link symbol
 collision

Replace ::close(tmpFD) with sys::fs::closeFile() in atomicCopyFile so
that <unistd.h> does not need to be included. The system header was
conflicting with the existing cl::opt<bool> link variable at file scope,
causing a build error on Linux.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 tools/aiecc/aiecc.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tools/aiecc/aiecc.cpp b/tools/aiecc/aiecc.cpp
index 373c56e6221..401c70d1809 100644
--- a/tools/aiecc/aiecc.cpp
+++ b/tools/aiecc/aiecc.cpp
@@ -115,7 +115,6 @@
 #include <string>
 #include <system_error>
 #include <thread>
-#include <unistd.h>
 #include <vector>
 
 #include "aiecc_aiesim.h"
@@ -1818,7 +1817,7 @@ static LogicalResult atomicCopyFile(StringRef src, StringRef destDir,
     llvm::errs() << "Error: could not create temp file in " << destDir << "\n";
     return failure();
   }
-  ::close(tmpFD);
+  sys::fs::closeFile(tmpFD);
 
   if (std::error_code ec = sys::fs::copy_file(src, tmpPath)) {
     llvm::errs() << "Error: could not copy " << src << " to " << tmpPath << ": "

From 7326e89695c89419b201eaa5e75459a9bc7e2dca Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 5 Mar 2026 14:45:37 -0700
Subject: [PATCH 05/28] [AIE] Polish AIEAssignCoreLinkFiles: declaration order,
 redundant override, atomic copy

- Move createAIEAssignCoreLinkFilesPass() declaration to alphabetical
  position in AIEPasses.h (after the two AssignBufferAddresses overloads)
- Remove getDependentDialects() override from AIEAssignCoreLinkFilesPass;
  the tablegen dependentDialects field already registers FuncDialect and
  AIEDialect via the generated base class
- Use tempfile.mkstemp + os.replace in the Python aiecc driver's .o
  staging loop for the same atomic copy-then-rename semantics as the
  C++ atomicCopyFile helper

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 include/aie/Dialect/AIE/Transforms/AIEPasses.h        |  4 ++--
 lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp |  4 ----
 python/compiler/aiecc/main.py                         | 11 ++++++++++-
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/include/aie/Dialect/AIE/Transforms/AIEPasses.h b/include/aie/Dialect/AIE/Transforms/AIEPasses.h
index 0839249ab1e..60e56cbfbff 100644
--- a/include/aie/Dialect/AIE/Transforms/AIEPasses.h
+++ b/include/aie/Dialect/AIE/Transforms/AIEPasses.h
@@ -24,13 +24,13 @@ namespace xilinx::AIE {
 #define GEN_PASS_DEF_AIEROUTEPATHFINDERFLOWS
 #include "aie/Dialect/AIE/Transforms/AIEPasses.h.inc"
 
-std::unique_ptr<mlir::OperationPass<DeviceOp>>
-createAIEAssignCoreLinkFilesPass();
 std::unique_ptr<mlir::OperationPass<DeviceOp>>
 createAIEAssignBufferAddressesPass();
 std::unique_ptr<mlir::OperationPass<DeviceOp>>
 createAIEAssignBufferAddressesPass(
     const AIEAssignBufferAddressesOptions &options);
+std::unique_ptr<mlir::OperationPass<DeviceOp>>
+createAIEAssignCoreLinkFilesPass();
 std::unique_ptr<mlir::OperationPass<DeviceOp>> createAIEAssignLockIDsPass();
 std::unique_ptr<mlir::OperationPass<mlir::ModuleOp>>
 createAIECanonicalizeDevicePass();
diff --git a/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp b/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp
index 7a28b46c7b6..8a66ecd4b02 100644
--- a/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp
+++ b/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp
@@ -41,10 +41,6 @@ using namespace xilinx::AIE;
 struct AIEAssignCoreLinkFilesPass
     : xilinx::AIE::impl::AIEAssignCoreLinkFilesBase<
           AIEAssignCoreLinkFilesPass> {
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert<AIEDialect, mlir::func::FuncDialect>();
-  }
-
   void runOnOperation() override {
     DeviceOp device = getOperation();
     OpBuilder builder(device.getContext());
diff --git a/python/compiler/aiecc/main.py b/python/compiler/aiecc/main.py
index e6f901eaf2f..6ced09e9be2 100644
--- a/python/compiler/aiecc/main.py
+++ b/python/compiler/aiecc/main.py
@@ -1016,12 +1016,21 @@ async def process_core(
             corecol, corerow, elf_file, link_files = core
 
             # Copy external .o files to tmpdir so linker can find them.
+            # Use a temp-then-rename pattern so parallel core compilations
+            # that share the same .o filename do not corrupt each other's copy.
             for lf in link_files:
                 src = lf if os.path.isabs(lf) else os.path.join(
                     os.path.dirname(opts.filename) or os.getcwd(), lf)
                 dst = os.path.join(self.tmpdirname, os.path.basename(lf))
                 if src != dst:
-                    shutil.copy2(src, dst)
+                    tmp_fd, tmp_path = tempfile.mkstemp(dir=self.tmpdirname)
+                    try:
+                        os.close(tmp_fd)
+                        shutil.copy2(src, tmp_path)
+                        os.replace(tmp_path, dst)
+                    except Exception:
+                        os.unlink(tmp_path)
+                        raise
 
             if not opts.unified:
                 file_opt_core = corefile(self.tmpdirname, device_name, core, "opt.mlir")

From 2974552f0735cdb6679585e3ec8bfea7fe256295 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 5 Mar 2026 15:02:26 -0700
Subject: [PATCH 06/28] [python] Post-migration audit: remove dead branches and
 redundant code

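- worker.py: drop the `isinstance(arg, (Kernel, ExternalFunction))` branch and
  the per-Worker bin_names/link_with bookkeeping, including the link_with
  argument passed to @core (linking is now driven by link_with on the func.func
  declarations reached via func.call ops), remove unused import, update comment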
- kernel.py: update stale `bin_name` docstring to reflect func.func link_with usage
- jit.py: remove redundant pre-scan of ExternalFunction in args/kwargs; simplify
  _instances collection into a single list comprehension
- compile/utils.py: remove three no-op try/except blocks in
  compile_external_kernel that catch and immediately re-raise; drop the
  unused merge_object_files import

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 python/iron/kernel.py         |  7 +++++--
 python/iron/worker.py         | 25 ++++++-------------------
 python/utils/compile/utils.py | 34 ++++++++++++----------------------
 python/utils/jit.py           | 19 ++++---------------
 4 files changed, 27 insertions(+), 58 deletions(-)

diff --git a/python/iron/kernel.py b/python/iron/kernel.py
index bd9464dc1e2..a0ce506659a 100644
--- a/python/iron/kernel.py
+++ b/python/iron/kernel.py
@@ -58,7 +58,8 @@ def __init__(
 
         Args:
             name (str): The name of the function
-            bin_name (str): The name of the binary (used for linking to a compute core)
+            bin_name (str): The name of the object file (set as link_with on the func.func
+                declaration; also used as the output filename when compiling ExternalFunction sources)
             arg_types (list[type[np.ndarray]  |  np.dtype], optional): The type signature of the function. Defaults to [].
         """
         super().__init__(name, arg_types)
@@ -74,7 +75,9 @@ def resolve(
         ip: ir.InsertionPoint | None = None,
     ) -> None:
         if not self._op:
-            self._op = external_func(self._name, inputs=self._arg_types)
+            self._op = external_func(
+                self._name, inputs=self._arg_types, link_with=self._bin_name
+            )
 
 
 class ExternalFunction(Kernel):
diff --git a/python/iron/worker.py b/python/iron/worker.py
index ac75242a8c5..10f5225d056 100644
--- a/python/iron/worker.py
+++ b/python/iron/worker.py
@@ -15,7 +15,6 @@
 from .device import PlacementTile, AnyComputeTile, Tile
 from .dataflow.objectfifo import ObjectFifoHandle, ObjectFifo
 from .dataflow.endpoint import ObjectFifoEndpoint
-from .kernel import Kernel, ExternalFunction
 from .buffer import Buffer
 from .resolvable import Resolvable
 
@@ -73,18 +72,14 @@ def do_nothing_core_fun(*args) -> None:
             self.core_fn = do_nothing_core_fun
         else:
             self.core_fn = core_fn
-        self.link_with: str | None = None
         self.fn_args = fn_args
-        bin_names = set()
         self._fifos = []
         self._buffers = []
         self._barriers = []
 
         # Check arguments to the core. Some information is saved for resolution.
         for arg in self.fn_args:
-            if isinstance(arg, (Kernel, ExternalFunction)):
-                bin_names.add(arg.bin_name)
-            elif isinstance(arg, ObjectFifoHandle):
+            if isinstance(arg, ObjectFifoHandle):
                 arg.endpoint = self
                 self._fifos.append(arg)
             elif isinstance(arg, Buffer):
@@ -98,17 +93,10 @@ def do_nothing_core_fun(*args) -> None:
                 )
             elif isinstance(arg, WorkerRuntimeBarrier):
                 self._barriers.append(arg)
-            # We assume other arguments are metaprogramming (e.g, Python args)
-            # This could allow some errors to sink through, but we allow it for now.
-            # TODO: this could be cleaned up through creation of a MetaArgs struct, so you
-            # could access values through meta.my_var within the function.
-
-        if len(bin_names) > 1:
-            raise ValueError(
-                f"Currently, only one binary per works is supported. Found: {bin_names}"
-            )
-        if len(bin_names) == 1:
-            self.link_with = list(bin_names)[0]
+            # Kernel/ExternalFunction instances are valid fn_args — they resolve to
+            # func.call ops when invoked inside core_fn and carry link_with on their
+            # func.func declaration. Other unrecognized args are assumed to be
+            # metaprogramming values (Python scalars, etc.).
 
     def place(self, tile: Tile) -> None:
         """Set the placement of the Worker.
@@ -145,7 +133,6 @@ def resolve(
         if not self._tile:
             raise ValueError("Must place Worker before it can be resolved.")
         my_tile = self._tile.op
-        my_link = self.link_with
 
         # Create the necessary locks for the core operation to synchronize with the runtime sequence
         # and register them in the corresponding barriers.
@@ -153,7 +140,7 @@ def resolve(
             l = lock(my_tile)
             barrier._add_worker_lock(l)
 
-        @core(my_tile, link_with=my_link, stack_size=self.stack_size)
+        @core(my_tile, stack_size=self.stack_size)
         def core_body():
             for _ in range_(sys.maxsize) if self._while_true else range(1):
                 self.core_fn(*self.fn_args)
diff --git a/python/utils/compile/utils.py b/python/utils/compile/utils.py
index 458df555c61..3bae82747da 100644
--- a/python/utils/compile/utils.py
+++ b/python/utils/compile/utils.py
@@ -10,7 +10,6 @@
 import subprocess
 import aie.compiler.aiecc.main as aiecc
 import aie.utils.config as config
-from .link import merge_object_files
 
 
 def compile_cxx_core_function(
@@ -146,36 +145,27 @@ def compile_external_kernel(func, kernel_dir, target_arch):
     # Handle both source_string and source_file cases
     if func._source_string is not None:
         # Use source_string (write to file)
-        try:
-            with open(source_file, "w") as f:
-                f.write(func._source_string)
-        except Exception as e:
-            raise
+        with open(source_file, "w") as f:
+            f.write(func._source_string)
     elif func._source_file is not None:
         # Use source_file (copy existing file)
         # Check if source file exists before copying
         if os.path.exists(func._source_file):
-            try:
-                shutil.copy2(func._source_file, source_file)
-            except Exception as e:
-                raise
+            shutil.copy2(func._source_file, source_file)
         else:
             return
     else:
         raise ValueError("Neither source_string nor source_file is provided")
 
-    try:
-        compile_cxx_core_function(
-            source_path=source_file,
-            target_arch=target_arch,
-            output_path=output_file,
-            include_dirs=func._include_dirs,
-            compile_args=func._compile_flags,
-            cwd=kernel_dir,
-            verbose=False,
-        )
-    except Exception as e:
-        raise
+    compile_cxx_core_function(
+        source_path=source_file,
+        target_arch=target_arch,
+        output_path=output_file,
+        include_dirs=func._include_dirs,
+        compile_args=func._compile_flags,
+        cwd=kernel_dir,
+        verbose=False,
+    )
 
     # Mark the function as compiled
     func._compiled = True
diff --git a/python/utils/jit.py b/python/utils/jit.py
index e1dab95074f..9a45d37edd3 100644
--- a/python/utils/jit.py
+++ b/python/utils/jit.py
@@ -72,15 +72,6 @@ def decorator(*args, **kwargs):
         # Clear any instances from previous runs to make sure if the user provided any broken code we don't try to recompile it
         ExternalFunction._instances.clear()
 
-        # Find ExternalFunction instances in arguments and kwargs
-        external_kernels = []
-        for arg in args:
-            if isinstance(arg, ExternalFunction):
-                external_kernels.append(arg)
-        for value in kwargs.values():
-            if isinstance(value, ExternalFunction):
-                external_kernels.append(value)
-
         # Execute the function to generate MLIR
         if is_placed:
             with mlir_mod_ctx() as ctx:
@@ -92,12 +83,10 @@ def decorator(*args, **kwargs):
         else:
             mlir_module = function(*args, **kwargs)
 
-        # Compile all ExternalFunction instances that were created during this JIT compilation
-        for func in ExternalFunction._instances:
-            if (
-                not hasattr(func, "_compiled") or not func._compiled
-            ):  # Don't compile if already compiled
-                external_kernels.append(func)
+        # Collect ExternalFunction instances registered during this JIT compilation
+        external_kernels = [
+            func for func in ExternalFunction._instances if not func._compiled
+        ]
 
         # Determine target architecture based on device type
         current_device = DefaultNPURuntime.device()

From cb35c9717d09d61433432385c4e9045a394c5135 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 5 Mar 2026 16:14:47 -0700
Subject: [PATCH 07/28] [aiecc] Fix Peano linker unable to find external .o in
 JIT flow

The Peano linker (ld.lld via clang) resolves bare INPUT() filenames in
the generated ldscript against the linker process's working directory.
When invoked via do_call with no cwd, this inherited the Python process's
cwd rather than tmpdirname where the .o files live.

Fix by adding a cwd parameter to do_call and passing cwd=self.tmpdirname
to all three Peano linker invocations in process_core.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 python/compiler/aiecc/main.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/python/compiler/aiecc/main.py b/python/compiler/aiecc/main.py
index 6ced09e9be2..aab348762f9 100644
--- a/python/compiler/aiecc/main.py
+++ b/python/compiler/aiecc/main.py
@@ -797,7 +797,7 @@ def diagnostic_handler(d):
                     g.write(mlir_module_str)
         return mlir_module
 
-    async def do_call(self, task_id, command, force=False):
+    async def do_call(self, task_id, command, force=False, cwd=None):
         if self.stopall:
             return
 
@@ -809,7 +809,10 @@ async def do_call(self, task_id, command, force=False):
             print(commandstr)
         if self.opts.execute or force:
             proc = await asyncio.create_subprocess_exec(
-                *command, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
+                *command,
+                stdout=asyncio.subprocess.PIPE,
+                stderr=asyncio.subprocess.PIPE,
+                cwd=cwd,
             )
             stdout, stderr = await proc.communicate()
             ret = proc.returncode
@@ -1064,7 +1067,7 @@ async def process_core(
                     elif self.opts.link:
                         await self.do_call(task, ["xchesscc_wrapper", aie_target.lower(), "+w", self.prepend_tmp("work"), "-c", "-d", "+Wclang,-xir", "-f", file_core_llvmir_chesslinked, "-o", file_core_obj])
                         opt_level = opts.opt_level
-                        await self.do_call(task, [self.peano_clang_path, f"-O{opt_level}", "--target=" + aie_peano_target, file_core_obj, *clang_link_args, "-Wl,-T," + file_core_ldscript, "-o", file_core_elf])
+                        await self.do_call(task, [self.peano_clang_path, f"-O{opt_level}", "--target=" + aie_peano_target, file_core_obj, *clang_link_args, "-Wl,-T," + file_core_ldscript, "-o", file_core_elf], cwd=self.tmpdirname)
                 else:
                     file_core_obj = unified_file_core_obj
                     if opts.link and opts.xbridge:
@@ -1072,7 +1075,7 @@ async def process_core(
                         await self.do_call(task, ["xchesscc_wrapper", aie_target.lower(), "+w", self.prepend_tmp("work"), "-d", "-f", file_core_obj, link_with_obj, "+l", file_core_bcf, "-o", file_core_elf])
                     elif opts.link:
                         opt_level = opts.opt_level
-                        await self.do_call(task, [self.peano_clang_path, f"-O{opt_level}", "--target=" + aie_peano_target, file_core_obj, *clang_link_args, "-Wl,-T," + file_core_ldscript, "-o", file_core_elf])
+                        await self.do_call(task, [self.peano_clang_path, f"-O{opt_level}", "--target=" + aie_peano_target, file_core_obj, *clang_link_args, "-Wl,-T," + file_core_ldscript, "-o", file_core_elf], cwd=self.tmpdirname)
 
             elif opts.compile:
                 if not opts.unified:
@@ -1100,7 +1103,7 @@ async def process_core(
                     await self.do_call(task, ["xchesscc_wrapper", aie_target.lower(), "+w", self.prepend_tmp("work"), "-d", "-f", file_core_obj, link_with_obj, "+l", file_core_bcf, "-o", file_core_elf])
                 elif opts.link:
                     opt_level = opts.opt_level
-                    await self.do_call(task, [self.peano_clang_path, f"-O{opt_level}", "--target=" + aie_peano_target, file_core_obj, *clang_link_args, "-Wl,-T," + file_core_ldscript, "-o", file_core_elf])
+                    await self.do_call(task, [self.peano_clang_path, f"-O{opt_level}", "--target=" + aie_peano_target, file_core_obj, *clang_link_args, "-Wl,-T," + file_core_ldscript, "-o", file_core_elf], cwd=self.tmpdirname)
 
             self.progress_bar.update(parent_task_id, advance=1)
             self.progress_bar.update(task, advance=0, visible=False)

From 83f756a1337118fe3c3699f85f9464ff75ee4102 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 5 Mar 2026 16:18:03 -0700
Subject: [PATCH 08/28] [python] Remove merge_object_files; add link_with to
 external_func

The merge_object_files approach (link.py) is superseded by the link_with
attribute on func.func declarations, which aiecc aggregates onto CoreOp
via AIEAssignCoreLinkFiles. Remove link.py, its CMakeLists entry, and
the test_compile_and_link test that exercised it.

Add a link_with keyword argument to external_func in aie.py so Python/IRON
callers can attach the link_with attribute directly when declaring external
kernel functions.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 python/CMakeLists.txt                    |  1 -
 python/dialects/aie.py                   |  6 ++-
 python/utils/compile/__init__.py         |  1 -
 python/utils/compile/link.py             | 51 ------------------------
 test/python/npu-xrt/test_compile_link.py | 51 ------------------------
 5 files changed, 5 insertions(+), 105 deletions(-)
 delete mode 100644 python/utils/compile/link.py

diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index c09b51ed147..12657ab44db 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -49,7 +49,6 @@ declare_mlir_python_sources(AIEPythonSources.Utils
     utils/hostruntime/xrtruntime/hostruntime.py
     utils/hostruntime/xrtruntime/tensor.py
     utils/compile/__init__.py
-    utils/compile/link.py
     utils/compile/utils.py
     utils/compile/cache/circular_cache.py
     utils/compile/cache/utils.py
diff --git a/python/dialects/aie.py b/python/dialects/aie.py
index ddff535e37f..405baced79b 100644
--- a/python/dialects/aie.py
+++ b/python/dialects/aie.py
@@ -88,7 +88,9 @@ def __init__(self, buffer, index, value, loc=None, ip=None):
 
 
 class external_func(FuncOp):
-    def __init__(self, name: str, inputs, outputs=None, visibility="private"):
+    def __init__(
+        self, name: str, inputs, outputs=None, visibility="private", link_with=None
+    ):
         if outputs is None:
             outputs = []
         for i, ty in enumerate(inputs):
@@ -102,6 +104,8 @@ def __init__(self, name: str, inputs, outputs=None, visibility="private"):
         super().__init__(
             name=name, type=FunctionType.get(inputs, outputs), visibility=visibility
         )
+        if link_with is not None:
+            self.operation.attributes["link_with"] = StringAttr.get(link_with)
 
     def __call__(self, *call_args):
         return call(self, call_args)
diff --git a/python/utils/compile/__init__.py b/python/utils/compile/__init__.py
index 701b4dd60d4..9206688a30e 100644
--- a/python/utils/compile/__init__.py
+++ b/python/utils/compile/__init__.py
@@ -9,7 +9,6 @@
 import os
 from pathlib import Path
 
-from .link import merge_object_files
 from .utils import (
     compile_cxx_core_function,
     compile_mlir_module,
diff --git a/python/utils/compile/link.py b/python/utils/compile/link.py
deleted file mode 100644
index 4556b7d40de..00000000000
--- a/python/utils/compile/link.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# link.py -*- Python -*-
-#
-# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
-# See https://llvm.org/LICENSE.txt for license information.
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-#
-# (c) Copyright 2025-2026 Advanced Micro Devices, Inc.
-
-import subprocess
-from os import PathLike
-
-import aie.utils.config as config
-
-
-def merge_object_files(
-    object_paths: list[PathLike],
-    output_path: PathLike,
-    cwd=None,
-    verbose=False,
-) -> None:
-    """
-    Merges multiple object files into a single output file.
-
-    Args:
-        object_files (list of str): List of paths to object files to merge.
-        output_file (str): Path to the output object file.
-        cwd (str, optional): Overrides the current working directory.
-        verbose (bool): If True, enable verbose output.
-    """
-    cmd = [
-        config.peano_linker_path(),
-        "-r",  # relocatable output
-        "-o",
-        str(output_path),
-        *[str(obj) for obj in object_paths],
-    ]
-    if verbose:
-        print("Linking object files with:", " ".join(cmd))
-    ret = subprocess.run(
-        cmd,
-        cwd=cwd,
-        check=False,
-        capture_output=True,
-    )
-    if verbose and ret.stdout:
-        print(f"{ret.stdout.decode()}")
-    if ret.returncode != 0:
-        if ret.stderr:
-            raise RuntimeError(f"[Peano] object linking failed:\n{ret.stderr.decode()}")
-        else:
-            raise RuntimeError("[Peano] object linking failed")
diff --git a/test/python/npu-xrt/test_compile_link.py b/test/python/npu-xrt/test_compile_link.py
index fbf8db4dd3f..5b1fcb542e6 100644
--- a/test/python/npu-xrt/test_compile_link.py
+++ b/test/python/npu-xrt/test_compile_link.py
@@ -12,7 +12,6 @@
 import tempfile
 
 from aie.utils.compile import compile_cxx_core_function
-from aie.utils.compile import merge_object_files
 
 SOURCE_STRING1 = """
 extern "C" {
@@ -23,15 +22,6 @@
 }
 }"""
 
-SOURCE_STRING2 = """
-extern "C" {
-void add_two(int* input, int* output, int tile_size) {
-    for (int i = 0; i < tile_size; i++) {
-        output[i] = input[i] + 2;
-    }
-}
-}"""
-
 
 def test_compile():
     """Test compilation of a C++ source file to an object file."""
@@ -52,44 +42,3 @@ def test_compile():
             compile_args=["-DTEST"],
         )
         assert os.path.getsize(output_path) > 0
-
-
-def test_compile_and_link():
-    """Test compilation of two C++ source files and link them."""
-    with tempfile.TemporaryDirectory() as tmpdir:
-        source_path1 = os.path.join(tmpdir, "source1.cpp")
-        source_path2 = os.path.join(tmpdir, "source2.cpp")
-        output_path1 = os.path.join(tmpdir, "output1.o")
-        output_path2 = os.path.join(tmpdir, "output2.o")
-        combined_output_path = os.path.join(tmpdir, "combined.o")
-
-        with open(source_path1, "w") as f:
-            f.write(SOURCE_STRING1)
-        assert os.path.getsize(source_path1) > 0
-
-        with open(source_path2, "w") as f:
-            f.write(SOURCE_STRING2)
-        assert os.path.getsize(source_path2) > 0
-
-        assert not os.path.exists(output_path1)
-        compile_cxx_core_function(
-            source_path=source_path1,
-            target_arch="aie2",
-            output_path=output_path1,
-        )
-        assert os.path.getsize(output_path1) > 0
-
-        assert not os.path.exists(output_path2)
-        compile_cxx_core_function(
-            source_path=source_path2,
-            target_arch="aie2",
-            output_path=output_path2,
-        )
-        assert os.path.getsize(output_path2) > 0
-
-        assert not os.path.exists(combined_output_path)
-        merge_object_files(
-            object_paths=[output_path1, output_path2],
-            output_path=combined_output_path,
-        )
-        assert os.path.getsize(combined_output_path) > 0

From a4e7c1132e153972312860101dae49f2f979dce5 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 5 Mar 2026 17:02:13 -0700
Subject: [PATCH 09/28] [aiecc] Replace Python aiecc with thin wrapper from PR
 #2925

During rebase onto origin/main, later commits in our branch re-introduced
the 2217-line Python aiecc implementation (via replay of the cwd/JIT fix).
Replace it with the 116-line thin wrapper from dde51cb04c that delegates
all compilation to the C++ aiecc binary, consistent with PR #2925's
deprecation of aiecc.py.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 python/compiler/aiecc/main.py | 2249 ++-------------------------------
 1 file changed, 74 insertions(+), 2175 deletions(-)

diff --git a/python/compiler/aiecc/main.py b/python/compiler/aiecc/main.py
index aab348762f9..5bf87d3fa71 100644
--- a/python/compiler/aiecc/main.py
+++ b/python/compiler/aiecc/main.py
@@ -3,2215 +3,114 @@
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 #
 # (c) Copyright 2021 Xilinx Inc.
+# (c) Copyright 2024-2026 Advanced Micro Devices, Inc.
 
 """
-aiecc - AIE compiler driver for MLIR tools
+aiecc.py - AIE Compiler Driver (Python wrapper)
+
+This is a thin wrapper that delegates to the C++ aiecc binary.
+The C++ implementation provides better performance through
+in-memory MLIR pass execution instead of subprocess calls.
+
+All command-line arguments are passed through unchanged to the
+C++ binary, which handles host compilation flags (-I, -L, -l, -o),
+host source files (.cpp), and all other options directly.
 """
 
-import asyncio
-import glob
-import json
 import os
-import re
 import shutil
-import stat
 import subprocess
 import sys
 import tempfile
-from textwrap import dedent
-import time
-import uuid
-import struct
-
-from aie.extras.runtime.passes import Pipeline
-from aie.extras.util import find_ops
-import aiofiles
-import rich.progress as progress
-
-import aie.compiler.aiecc.cl_arguments
-import aie.compiler.aiecc.configure
-from aie.dialects import aie as aiedialect
-from aie.dialects import aiex as aiexdialect
-from aie.ir import (
-    Context,
-    Location,
-    Module,
-    InsertionPoint,
-    IndexType,
-    StringAttr,
-    IntegerAttr,
-    IntegerType,
-)
-from aie.passmanager import PassManager
-
-
-def _create_input_with_addresses_pipeline(
-    scheme,
-    dynamic_objFifos,
-    packet_sw_objFifos,
-    ctrl_pkt_overlay,
-    aie_target,
-    opt_level="2",
-):
-    pipeline = Pipeline()
-
-    # Only add convert-vector-to-aievec for AIE2 and later targets
-    # AIE1 ("aie") does not support target_backend="llvmir"
-    if aie_target.lower() in ["aie2", "aieml", "aie2p"]:
-        # Hoist vector transfer pointers before scf-to-cf conversion (O3 and above only)
-        # This runs on the module and walks into aie.core regions
-        if int(opt_level) >= 3:
-            pipeline.add_pass("aie-hoist-vector-transfer-pointers")
-        pipeline.add_pass(
-            "convert-vector-to-aievec",
-            **{"aie-target": aie_target.lower(), "target-backend": "llvmir"},
-        )
-
-    # Build nested device pipeline with conditional passes
-    device_pipeline = (
-        Pipeline()
-        .add_pass("aie-trace-to-config")
-        .add_pass("aie-trace-pack-reg-writes")
-        .add_pass("aie-inline-trace-config")
-        .add_pass("aie-assign-lock-ids")
-        .add_pass("aie-register-objectFifos")
-        .add_pass(
-            "aie-objectFifo-stateful-transform",
-            **{
-                "dynamic-objFifos": dynamic_objFifos,
-                "packet-sw-objFifos": packet_sw_objFifos,
-            },
-        )
-        .add_pass("aie-assign-bd-ids")
-        .add_pass("aie-lower-cascade-flows")
-        .add_pass("aie-lower-broadcast-packet")
-        .add_pass("aie-lower-multicast")
-        .add_pass("aie-assign-tile-controller-ids")
-        .add_pass(
-            "aie-generate-column-control-overlay",
-            **{"route-shim-to-tile-ctrl": ctrl_pkt_overlay},
-        )
-        .add_pass("aie-assign-buffer-addresses", **{"alloc-scheme": scheme})
-        .add_pass("aie-assign-core-link-files")
-        .add_pass("aie-vector-transfer-lowering", **{"max-transfer-rank": 1})
-    )
-
-    # Only add vector-to-pointer-loops for O3 and above
-    if int(opt_level) >= 3:
-        device_pipeline.add_pass("aie-vector-to-pointer-loops")
-
-    return (
-        pipeline.lower_affine()
-        .add_pass("aie-canonicalize-device")
-        .Nested("aie.device", device_pipeline)
-        .convert_scf_to_cf()
-    )
-
-
-INPUT_WITH_ADDRESSES_PIPELINE = _create_input_with_addresses_pipeline
-
-LOWER_TO_LLVM_PIPELINE = (
-    Pipeline()
-    .canonicalize()
-    .cse()
-    .expand_strided_metadata()
-    .lower_affine()
-    .arith_expand()
-    .finalize_memref_to_llvm()
-    .convert_func_to_llvm(use_bare_ptr_memref_call_conv=True)
-    .convert_to_llvm(dynamic=True)
-    .add_pass("convert-vector-to-llvm")
-    .add_pass("convert-ub-to-llvm")
-    .canonicalize()
-    .cse()
-)
-
-
-def _create_aie_lower_to_llvm_pipeline(
-    device_name=None, col=None, row=None, aie_target="aie2", opt_level="2"
-):
-    pipeline = (
-        Pipeline()
-        .Nested(
-            "aie.device",
-            Pipeline()
-            .add_pass("aie-localize-locks")
-            .add_pass("aie-normalize-address-spaces")
-            .add_pass("aie-transform-bfp-types"),
-        )
-        .add_pass("aie-standard-lowering", device=device_name, tilecol=col, tilerow=row)
-        .add_pass("aiex-standard-lowering")
-    )
-
-    # Only add aievec-split-load-ups-chains for O3 and above
-    if int(opt_level) >= 3:
-        pipeline.add_pass("aievec-split-load-ups-chains")
-
-    pipeline.add_pass("convert-aievec-to-llvm", **{"aie-target": aie_target.lower()})
+import warnings
 
-    return pipeline + LOWER_TO_LLVM_PIPELINE
 
+def _find_aiecc_binary():
+    """Find the C++ aiecc binary in PATH."""
+    path = shutil.which("aiecc")
+    if path:
+        return path
 
-AIE_LOWER_TO_LLVM = _create_aie_lower_to_llvm_pipeline
-
-
-# pipeline to lower and legalize runtime sequence for NPU
-def _create_npu_lowering_pipeline(expand_load_pdis=False):
-    pipeline = Pipeline()
-    if opts.materialize_runtime_sequence:
-        pipeline = pipeline.add_pass("aie-materialize-runtime-sequences")
-    pipeline = pipeline.Nested(
-        "aie.device",
-        Pipeline()
-        .add_pass("aie-materialize-bd-chains")
-        .add_pass("aie-substitute-shim-dma-allocations")
-        .add_pass("aie-assign-runtime-sequence-bd-ids")
-        .add_pass("aie-dma-tasks-to-npu")
-        .add_pass("aie-dma-to-npu")
-        .add_pass("aie-lower-set-lock"),
+    raise FileNotFoundError(
+        "Could not find 'aiecc' binary. Ensure mlir-aie is properly installed "
+        "and the bin directory is in your PATH, or use the C++ aiecc directly."
     )
-    if expand_load_pdis:
-        pipeline = pipeline.add_pass("aie-expand-load-pdi")
-    return pipeline
-
-
-async def read_file_async(file_path: str) -> str:
-    async with aiofiles.open(file_path, mode="r") as f:
-        contents = await f.read()
-    return contents
-
-
-async def write_file_async(file_content: str, file_path: str):
-    async with aiofiles.open(file_path, mode="w") as f:
-        await f.write(file_content)
-
-
-def emit_design_kernel_json(
-    kernel_name="MLIR_AIE",
-    kernel_id="0x901",
-    instance_name="MLIRAIE",
-    buffer_args=None,
-):
-    if buffer_args is None:
-        buffer_args = [f"bo{i}" for i in range(5)]
-
-    arguments = [
-        {
-            "name": "opcode",
-            "address-qualifier": "SCALAR",
-            "type": "uint64_t",
-            "offset": "0x00",
-        },
-    ]
-    offset = 0x08
-
-    inst_arguments = [
-        {
-            "name": "instr",
-            "memory-connection": "SRAM",
-            "address-qualifier": "GLOBAL",
-            "type": "char *",
-            "offset": str(hex(offset)),
-        },
-        {
-            "name": "ninstr",
-            "address-qualifier": "SCALAR",
-            "type": "uint32_t",
-            "offset": str(hex(offset + 8)),
-        },
-    ]
-    arguments.append(inst_arguments[0])
-    arguments.append(inst_arguments[1])
-    offset += 12
-
-    for buf in buffer_args:
-        arg = {
-            "name": buf,
-            "memory-connection": "HOST",
-            "address-qualifier": "GLOBAL",
-            "type": "void*",
-            "offset": str(hex(offset)),
-        }
-        arguments.append(arg)
-        offset += 0x8
-
-    return {
-        "ps-kernels": {
-            "kernels": [
-                {
-                    "name": kernel_name,
-                    "type": "dpu",
-                    "extended-data": {
-                        "subtype": "DPU",
-                        "functional": "0",
-                        "dpu_kernel_id": kernel_id,
-                    },
-                    "arguments": arguments,
-                    "instances": [{"name": instance_name}],
-                }
-            ]
-        }
-    }
-
-
-mem_topology = {
-    "mem_topology": {
-        "m_count": "2",
-        "m_mem_data": [
-            {
-                "m_type": "MEM_DRAM",
-                "m_used": "1",
-                "m_sizeKB": "0x10000",
-                "m_tag": "HOST",
-                "m_base_address": "0x4000000",
-            },
-            {
-                "m_type": "MEM_DRAM",
-                "m_used": "1",
-                "m_sizeKB": "0xc000",
-                "m_tag": "SRAM",
-                "m_base_address": "0x4000000",
-            },
-        ],
-    }
-}
-
-
-def emit_partition(mlir_module_str, device_op, design_pdi, kernel_id="0x901"):
-    with Context(), Location.unknown():
-        module = Module.parse(mlir_module_str)
-    device = aiedialect.AIEDevice(int(device_op.device))
-    num_cols = aiedialect.get_target_model(device).columns()
-
-    # It's arguable that this should should come from the device model
-    # somehow.  Or perhaps that it shouldn't be needed in the
-    # XCLbin at all, since it is basically describing information
-    # which is already inherent in the CDO.
-    # For the time being, we just leave it here.
-    if device in [aiedialect.AIEDevice.npu1, aiedialect.AIEDevice.npu2]:
-        start_columns = [0]
-    else:
-        start_columns = list(range(1, 6 - num_cols))
-
-    # Generate a uuid
-    pdi_uuid = uuid.uuid4()
-    return {
-        "aie_partition": {
-            "name": "QoS",
-            "operations_per_cycle": "2048",
-            "inference_fingerprint": "23423",
-            "pre_post_fingerprint": "12345",
-            "partition": {
-                "column_width": num_cols,
-                "start_columns": start_columns,
-            },
-            "PDIs": [
-                {
-                    "uuid": str(pdi_uuid),
-                    "file_name": design_pdi,
-                    "cdo_groups": [
-                        {
-                            "name": "DPU",
-                            "type": "PRIMARY",
-                            "pdi_id": "0x01",
-                            "dpu_kernel_ids": [kernel_id],
-                            "pre_cdo_groups": ["0xC1"],
-                        }
-                    ],
-                }
-            ],
-        }
-    }
-
-
-def parse_file_as_mlir(mlir_module_str):
-    with Context(), Location.unknown():
-        return Module.parse(mlir_module_str)
 
 
-def generate_devices_list(module):
-    return [
-        (d, d.sym_name.value)
-        for d in find_ops(
-            module.operation,
-            lambda d: isinstance(d.operation.opview, aiedialect.DeviceOp),
-        )
-        if not opts.device_name or d.sym_name.value == opts.device_name
-    ]
-
-
-def _core_has_nonempty_body(core_op):
-    """Check if a CoreOp has a non-empty body (more than just aie.end)."""
-    for block in core_op.body:
-        if len(list(block)) > 1:
-            return True
-    return False
-
-
-def generate_cores_list(device_op):
-    def _link_files(c):
-        attr = c.link_files
-        if attr is None:
-            return []
-        return [attr[i].value for i in range(len(attr))]
-
-    return [
-        (
-            c.tile.owner.opview.col.value,
-            c.tile.owner.opview.row.value,
-            c.elf_file.value if c.elf_file is not None else None,
-            _link_files(c),
-        )
-        for c in find_ops(
-            device_op.operation,
-            lambda o: isinstance(o.operation.opview, aiedialect.CoreOp),
-        )
-        if c.elf_file is not None
-        or c.link_with is not None
-        or c.link_files is not None
-        or _core_has_nonempty_body(c)
-    ]
-
-
-def generate_runtime_sequences_list(device_op):
-    return [
-        (s, s.sym_name.value)
-        for s in find_ops(
-            device_op.operation,
-            lambda o: isinstance(o.operation.opview, aiexdialect.RuntimeSequenceOp),
-        )
-        if not opts.sequence_name or s.sym_name.value == opts.sequence_name
-    ]
-
-
-def find_aiebu_asm():
-    asm_bin = "aiebu-asm"
-    if shutil.which(asm_bin) is None:
-        asm_bin = os.path.join("/", "opt", "xilinx", "aiebu", "bin", "aiebu-asm")
-        if shutil.which(asm_bin) is None:
-            asm_bin = None
-    if asm_bin is None:
-        print(
-            "Error: aiebu-asm not found.",
-            file=sys.stderr,
-        )
-        sys.exit(1)
-    return asm_bin
-
-
-def create_device_id_mapping(devices):
-    """Assign an ID to each device in the MLIR; used later to assign IDs for each PDI"""
-    device_to_id = {}
-    for i, (device_op, device_name) in enumerate(devices, 1):
-        device_to_id[device_name] = i
-    return device_to_id
-
-
-def assign_load_pdi_ids(module, device_to_id_mapping):
-    """Transform symbolic aiex.npu.load_pdi references to numeric IDs"""
-    with module.context as context, Location.unknown():
-        for runtime_seq in find_ops(
-            module.operation,
-            lambda o: isinstance(o.operation.opview, aiexdialect.RuntimeSequenceOp),
-        ):
-            for load_pdi_op in find_ops(
-                runtime_seq.operation,
-                lambda o: isinstance(o.operation.opview, aiexdialect.NpuLoadPdiOp)
-                and hasattr(o, "device_ref")
-                and o.device_ref is not None,
-            ):
-                device_name = load_pdi_op.device_ref.value
-                if device_name not in device_to_id_mapping:
-                    print(
-                        f"Warning: Device '{device_name}' for load_pdi instruction does not have a matching device PDI."
-                    )
-                    sys.exit(1)
-                pdi_id = device_to_id_mapping[device_name]
-                load_pdi_op.id = IntegerAttr.get(
-                    IntegerType.get_signless(32, context=context), pdi_id
-                )
-
-
-def set_elf_file_for_core(core, path):
-    with InsertionPoint.at_block_terminator(
-        core.parent.regions[0].blocks[0]
-    ), Location.unknown():
-        result = IndexType.get()
-        new_core = aiedialect.CoreOp(result, core.tile)
-        for attr in core.attributes:
-            new_core.attributes[attr] = core.attributes[attr]
-        new_core.attributes["elf_file"] = StringAttr.get(path)
-        new_core_block = new_core.body.blocks.append()
-        with InsertionPoint(new_core_block):
-            aiedialect.EndOp()
-        new_core.move_before(core)
-    core.operation.erase()
-
-
-def emit_design_bif(
-    root_path, device_name, has_cores=True, enable_cores=True, unified=False
-):
-    if unified:
-        cdo_unified_file = f"file={root_path}/{device_name}_aie_cdo.bin"
-        files = f"{cdo_unified_file}"
-    else:
-        cdo_elfs_file = f"file={root_path}/{device_name}_aie_cdo_elfs.bin"
-        cdo_init_file = f"file={root_path}/{device_name}_aie_cdo_init.bin"
-        cdo_enable_file = (
-            f"file={root_path}/{device_name}_aie_cdo_enable.bin" if enable_cores else ""
-        )
-        files = f"{cdo_elfs_file} {cdo_init_file} {cdo_enable_file}"
-    return dedent(f"""\
-        all:
-        {{
-          id_code = 0x14ca8093
-          extended_id_code = 0x01
-          image
-          {{
-            name=aie_image, id=0x1c000000
-            {{ type=cdo {files} }}
-          }}
-        }}
-        """)
-
-
-# Extract included files from the given Chess linker script.
-# We rely on gnu linker scripts to stuff object files into a compile.  However, the Chess compiler doesn't
-# do this, so we have to explicitly specify included files on the link line.
-async def extract_input_files(file_core_bcf):
-    core_bcf = await read_file_async(file_core_bcf)
-    return " ".join(re.findall(r"^_include _file (.*)", core_bcf, re.MULTILINE))
-
-
-def do_run(command, verbose=False):
-    if verbose:
-        print(" ".join(command))
-    m = subprocess.PIPE
-    ret = subprocess.run(command, stdout=m, stderr=m, universal_newlines=True)
-    return ret
-
-
-def format_diagnostics_for_script(diagnostics):
-    """Format MLIR diagnostics for inclusion in repeater script."""
-    if not diagnostics:
-        return ""
-
-    diag_lines = "\n".join(diagnostics)
-    return f"""echo "Original MLIR Diagnostics:"
-cat << 'DIAGNOSTICS_EOF'
-{diag_lines}
-DIAGNOSTICS_EOF
-echo ""
-
-"""
-
-
-def generate_repeater_script(
-    mlir_file, pass_pipeline, output_file, timenow, description=None, diagnostics=None
-):
+def main():
     """
-    Generate a bash repeater script for reproducing a pass pipeline failure.
+    Main entry point - delegates to C++ aiecc.
 
-    Args:
-        mlir_file: Path to the MLIR file that caused failure
-        pass_pipeline: The pass pipeline string
-        output_file: Where to write the repeater script
-        description: Optional description of what was being compiled
-        diagnostics: List of MLIR diagnostic messages
+    All command-line arguments are passed directly to the C++ binary unchanged.
     """
-    diag_section = format_diagnostics_for_script(diagnostics)
-
-    script_content = f"""#!/bin/bash
-#
-# AIECC Repeater Script
-# Generated: {timenow.isoformat()}
-#
-# This script reproduces a compilation failure from aiecc.py
-# Description: {description or 'N/A'}
-# Diagnostics: {len(diagnostics) if diagnostics else 0} messages captured
-#
-
-set -e  # Exit on error
-
-echo "=================================================="
-echo "AIECC Failure Reproduction Script"
-echo "=================================================="
-echo ""
-
-{diag_section}MLIR_FILE="{mlir_file}"
-PASS_PIPELINE='{pass_pipeline}'
-
-# Check if input file exists
-if [ ! -f "$MLIR_FILE" ]; then
-    echo "Error: Input MLIR file not found: $MLIR_FILE"
-    exit 1
-fi
-
-# Check if aie-opt is available
-if ! command -v aie-opt &> /dev/null; then
-    echo "Error: aie-opt not found in PATH"
-    echo "Please ensure mlir-aie tools are properly installed and in PATH"
-    exit 1
-fi
-
-echo "Input MLIR: $MLIR_FILE"
-echo "Pass Pipeline: $PASS_PIPELINE"
-echo ""
-echo "Running aie-opt with debug flags..."
-echo ""
-
-# Run with debugging flags
-aie-opt \\
-    --mlir-print-ir-after-all \\
-    --mlir-disable-threading \\
-    --pass-pipeline="${{PASS_PIPELINE}}" \\
-    "$MLIR_FILE"
-
-echo ""
-echo "If the command succeeded, the issue may be non-deterministic."
-echo "Try running this script multiple times."
-"""
-
-    with open(output_file, "w") as f:
-        f.write(script_content)
+    try:
+        aiecc_bin = _find_aiecc_binary()
+    except FileNotFoundError as e:
+        print(f"Error: {e}", file=sys.stderr)
+        sys.exit(1)
 
-    # Make script executable
-    os.chmod(output_file, os.stat(output_file).st_mode | stat.S_IEXEC)
+    # Pass all arguments directly to C++ binary unchanged
+    result = subprocess.run([aiecc_bin, *sys.argv[1:]])
+    sys.exit(result.returncode)
 
 
-def handle_pass_failure(
-    pass_pipeline,
-    mlir_ir,
-    description=None,
-    output_dir=None,
-    diagnostics=None,
-):
-    """
-    Handle failure of Python-based PassManager execution.
-    Saves intermediate MLIR and generates repeater script.
-
-    Args:
-        pass_pipeline: The pass pipeline that failed
-        mlir_ir: The MLIR IR before the failed pass
-        description: Human-readable description of what was being compiled
-        output_dir: Directory to save repeater scripts (default: temp dir)
-        diagnostics: List of diagnostic messages from MLIR
+def run(mlir_module, args=None):
     """
-    import datetime
-
-    # Generate unique filename
-    timenow = datetime.datetime.now()
-    timestamp = timenow.strftime("%Y%m%d_%H%M%S")
-    failure_id = str(uuid.uuid4())[:8]
-
-    # Save MLIR to output directory
-    temp_dir = output_dir or tempfile.gettempdir()
-    mlir_filename = os.path.join(
-        temp_dir, f"aiecc_failure_{timestamp}_{failure_id}.mlir"
-    )
-    repeater_filename = os.path.join(
-        temp_dir, f"aiecc_repeater_{timestamp}_{failure_id}.sh"
-    )
-
-    with open(mlir_filename, "w") as f:
-        f.write(mlir_ir)
-
-    # Generate repeater script
-    generate_repeater_script(
-        mlir_file=mlir_filename,
-        pass_pipeline=pass_pipeline,
-        output_file=repeater_filename,
-        timenow=timenow,
-        description=description,
-        diagnostics=diagnostics,
-    )
-
-    # Print diagnostic message
-    desc_str = f" ({description})" if description else ""
-    print("\n" + "=" * 80, file=sys.stderr)
-    print(f"AIECC COMPILATION FAILED{desc_str}", file=sys.stderr)
-    print("=" * 80, file=sys.stderr)
-    print(f"\nIntermediate MLIR saved to:", file=sys.stderr)
-    print(f"  {mlir_filename}", file=sys.stderr)
-    print(f"\nFor developers, the error can be reproduced with:", file=sys.stderr)
-    print(
-        f"  $ aie-opt --pass-pipeline='{pass_pipeline}' {mlir_filename}",
-        file=sys.stderr,
-    )
-    print(f"\nRepeater script generated:", file=sys.stderr)
-    print(f"  {repeater_filename}", file=sys.stderr)
-    print(f"  $ bash {repeater_filename}", file=sys.stderr)
-    print("=" * 80 + "\n", file=sys.stderr)
+    Programmatic API for compiling MLIR modules.
 
+    DEPRECATED: This function is deprecated. Use the C++ aiecc binary
+    directly or the IRON Python API instead.
 
-def corefile(dirname, device, core, ext):
-    col, row = core[0], core[1]
-    return os.path.join(dirname, f"{device}_core_{col}_{row}.{ext}")
-
-
-def aie_target_defines(aie_target):
-    if aie_target == "AIE2":
-        return ["-D__AIEARCH__=20"]
-    return ["-D__AIEARCH__=10"]
-
+    Args:
+        mlir_module: MLIR module string or object with __str__ method
+        args: Optional list of command-line arguments
 
-def downgrade_ir_for_chess(llvmir_chesslinked):
-    llvmir_chesslinked = (
-        llvmir_chesslinked.replace("memory(none)", "readnone")
-        .replace("memory(read)", "readonly")
-        .replace("memory(write)", "writeonly")
-        .replace("memory(argmem: readwrite)", "argmemonly")
-        .replace("memory(argmem: read)", "argmemonly readonly")
-        .replace("memory(argmem: write)", "argmemonly writeonly")
-        .replace("memory(inaccessiblemem: readwrite)", "inaccessiblememonly")
-        .replace("memory(inaccessiblemem: read)", "inaccessiblememonly readonly")
-        .replace("memory(inaccessiblemem: write)", "inaccessiblememonly writeonly")
-        .replace(
-            "memory(argmem: readwrite, inaccessiblemem: readwrite)",
-            "inaccessiblemem_or_argmemonly",
-        )
-        .replace(
-            "memory(argmem: read, inaccessiblemem: read)",
-            "inaccessiblemem_or_argmemonly readonly",
-        )
-        .replace(
-            "memory(argmem: write, inaccessiblemem: write)",
-            "inaccessiblemem_or_argmemonly writeonly",
-        )
-        .replace("captures(none)", "nocapture")
-        .replace("getelementptr inbounds nuw", "getelementptr inbounds")
+    Raises:
+        RuntimeError: If compilation fails
+    """
+    warnings.warn(
+        "aiecc.run() is deprecated and will be removed in a future release. "
+        "Use the C++ aiecc binary directly or the IRON Python API instead.",
+        DeprecationWarning,
+        stacklevel=2,
     )
-    # Remove nocreateundeforpoison attribute (not supported by older LLVM in Chess toolchain)
-    llvmir_chesslinked = re.sub(r"\bnocreateundeforpoison\s+", "", llvmir_chesslinked)
-    return llvmir_chesslinked
-
-
-def downgrade_ir_for_peano(llvmir):
-    llvmir = llvmir.replace("getelementptr inbounds nuw", "getelementptr inbounds")
-    # Remove nocreateundeforpoison attribute (not supported by older LLVM in Peano toolchain)
-    llvmir = re.sub(r"\bnocreateundeforpoison\s+", "", llvmir)
-    return llvmir
-
-
-def drop_alignment_for_peano(llvmir):
-    # Remove any ", align <integer>" attribute occurrences
-    llvmir = re.sub(r",\s*align\s+\d+", "", llvmir)
-    return llvmir
-
-
-def get_peano_target(aie_target):
-    if not re.fullmatch("AIE.?.?", aie_target):
-        print(
-            "Unexpected target " + aie_target + ". Exiting...",
-            file=sys.stderr,
-        )
-        exit(-3)
-    aie_peano_target = aie_target.lower() + "-none-unknown-elf"
-    return aie_peano_target
-
-
-class FlowRunner:
-    def __init__(self, mlir_module_str, opts, tmpdirname):
-        self.mlir_module_str = mlir_module_str
-        self.opts = opts
-        self.tmpdirname = tmpdirname
-        self.runtimes = dict()
-        self.progress_bar = None
-        self.maxtasks = 5
-        self.stopall = False
-        self.peano_clang_path = os.path.join(opts.peano_install_dir, "bin", "clang")
-        self.peano_opt_path = os.path.join(opts.peano_install_dir, "bin", "opt")
-        self.peano_llc_path = os.path.join(opts.peano_install_dir, "bin", "llc")
-        self.repeater_output_dir = opts.repeater_output_dir or tempfile.gettempdir()
-
-    def prepend_tmp(self, x):
-        return os.path.join(self.tmpdirname, x)
-
-    def pdi_file_name(self, device_name):
-        return (
-            opts.pdi_name.format(device_name)
-            if opts.pdi
-            else self.prepend_tmp(f"{device_name}.pdi")
-        )
-
-    def npu_insts_file_name(self, device_name, seq_name):
-        return (
-            opts.insts_name.format(device_name, seq_name)
-            if opts.npu
-            else self.prepend_tmp(f"{device_name}_{seq_name}.bin")
-        )
-
-    def run_passes(
-        self,
-        pass_pipeline,
-        mlir_module,
-        outputfile=None,
-        description=None,
-    ):
-        """
-        Run a pass pipeline on MLIR module object.
-
-        Args:
-            pass_pipeline: Pipeline string to execute
-            mlir_module: Input MLIR module object
-            outputfile: Optional output file path
-            description: Human-readable description of this pass stage
-        """
-        if self.opts.verbose:
-            print("Running:", pass_pipeline)
-        diags = []
-        mlir_for_error_report = None  # Will be set before pass execution
-
-        def diagnostic_handler(d):
-            severity = str(d.severity).replace("DiagnosticSeverity.", "").lower()
-            diags.append(f"{d.location}: {severity}: {d.message}")
-            for note in d.notes:
-                diags.append(f"{note.location}: note: {note.message}")
-            if severity == "error":
-                # Generate repeater script on error
-                if self.opts.enable_repeater:
-                    handle_pass_failure(
-                        pass_pipeline=pass_pipeline,
-                        mlir_ir=mlir_for_error_report,
-                        description=description,
-                        output_dir=self.repeater_output_dir,
-                        diagnostics=diags,
-                    )
-                for d in diags:
-                    print(d, file=sys.stderr)
-                return False
-            return True
-
-        with mlir_module.context, Location.unknown():
-            mlir_module.context.emit_error_diagnostics = True
-            h = mlir_module.context.attach_diagnostic_handler(diagnostic_handler)
-            mlir_for_error_report = str(mlir_module)  # Save IR before transformation
-            pm = PassManager.parse(pass_pipeline)
-            pm.run(mlir_module.operation)
-            h.detach()
-            for d in diags:
-                print(d)
-            if outputfile:
-                mlir_module_str = str(mlir_module)
-                with open(outputfile, "w") as g:
-                    g.write(mlir_module_str)
-        return mlir_module
-
-    async def do_call(self, task_id, command, force=False, cwd=None):
-        if self.stopall:
-            return
-
-        commandstr = " ".join(command)
-        if task_id:
-            self.progress_bar.update(task_id, advance=0, command=commandstr[0:30])
-        start = time.time()
-        if self.opts.verbose:
-            print(commandstr)
-        if self.opts.execute or force:
-            proc = await asyncio.create_subprocess_exec(
-                *command,
-                stdout=asyncio.subprocess.PIPE,
-                stderr=asyncio.subprocess.PIPE,
-                cwd=cwd,
-            )
-            stdout, stderr = await proc.communicate()
-            ret = proc.returncode
-            if self.opts.verbose and stdout:
-                print(f"{stdout.decode()}")
-            if ret != 0 and stderr:
-                print(f"{stderr.decode()}", file=sys.stderr)
-        else:
-            ret = 0
-        end = time.time()
-        if self.opts.verbose:
-            print(f"Done in {end - start:.3f} sec: {commandstr}")
-        self.runtimes[commandstr] = end - start
-        if task_id:
-            self.progress_bar.update(task_id, advance=1, command="")
-            self.maxtasks = max(
-                self.progress_bar._tasks[task_id].completed, self.maxtasks
-            )
-            self.progress_bar.update(task_id, total=self.maxtasks)
-
-        if ret != 0:
-            if task_id:
-                self.progress_bar.update(task_id, description="[red] Error")
-            print("Error encountered while running: " + commandstr, file=sys.stderr)
-            sys.exit(ret)
-
-    # In order to run xchesscc on modern ll code, we need a bunch of hacks.
-    async def chesshack(self, task, llvmir, aie_target):
-        llvmir_chesshack = llvmir + "chesshack.ll"
-        llvmir_chesslinked_path = llvmir + "chesslinked.ll"
-        if not self.opts.execute:
-            return llvmir_chesslinked_path
-
-        install_path = aie.compiler.aiecc.configure.install_path()
-        runtime_lib_path = os.path.join(install_path, "aie_runtime_lib")
-        chess_intrinsic_wrapper_ll_path = os.path.join(
-            runtime_lib_path, aie_target.upper(), "chess_intrinsic_wrapper.ll"
-        )
-
-        llvmir_ir = await read_file_async(llvmir)
-        llvmir_hacked_ir = downgrade_ir_for_chess(llvmir_ir)
-        await write_file_async(llvmir_hacked_ir, llvmir_chesshack)
-
-        if aie_target.casefold() == "AIE2".casefold():
-            target = "target_aie_ml"
-        elif aie_target.casefold() == "AIE2P".casefold():
-            target = "target_aie2p"
-        else:
-            target = "target"
-        assert os.path.exists(llvmir_chesshack)
-        await self.do_call(
-            task,
-            [
-                # The path below is cheating a bit since it refers directly to the AIE1
-                # version of llvm-link, rather than calling the architecture-specific
-                # tool version.
-                opts.aietools_path
-                + "/tps/lnx64/"
-                + target
-                + "/bin/LNa64bin/chess-llvm-link",
-                llvmir_chesshack,
-                chess_intrinsic_wrapper_ll_path,
-                "-S",
-                "-o",
-                llvmir_chesslinked_path,
-            ],
-        )
-
-        return llvmir_chesslinked_path
-
-    # In order to run peano on modern ll code, we need a bunch of hacks.
-    async def peanohack(self, llvmir):
-        llvmir_peanohack = llvmir + "peanohack.ll"
-        if not self.opts.execute:
-            return llvmir_peanohack
-
-        llvmir_ir = await read_file_async(llvmir)
-        llvmir_hacked_ir = downgrade_ir_for_peano(llvmir_ir)
-        llvmir_hacked_ir = drop_alignment_for_peano(llvmir_hacked_ir)
-        await write_file_async(llvmir_hacked_ir, llvmir_peanohack)
-
-        return llvmir_peanohack
-
-    async def process_cores(
-        self,
-        device_op,
-        device_name,
-        file_with_addresses,
-        aie_target,
-        aie_peano_target,
-        parent_task_id,
-    ):
-        # If unified compilation is on, we create a single object file that
-        # contains the compiled code for all cores. If not, the equivalent
-        # of the below is created for each core inside of process_core
-        # (singular).
-
-        # fmt: off
-        if opts.unified:
-            file_opt_with_addresses = self.prepend_tmp(f"{device_name}_input_opt_with_addresses.mlir")
-            with Context(), Location.unknown():
-                module = Module.parse(await read_file_async(file_with_addresses))
-            self.run_passes(
-                str(AIE_LOWER_TO_LLVM(device_name, aie_target=aie_target, opt_level=opts.opt_level)),
-                module,
-                outputfile=file_opt_with_addresses,
-                description=f"LLVM lowering for unified compilation of {device_name}",
-            )
-
-            file_llvmir = self.prepend_tmp(f"{device_name}_input.ll")
-            await self.do_call(parent_task_id, ["aie-translate", "--mlir-to-llvmir", file_opt_with_addresses, "-o", file_llvmir])
-
-            unified_file_core_obj = self.prepend_tmp(f"{device_name}_input.o")
-            if opts.compile and opts.xchesscc:
-                file_llvmir_hacked = await self.chesshack(parent_task_id, file_llvmir, aie_target)
-                await self.do_call(parent_task_id, ["xchesscc_wrapper", aie_target.lower(), "+w", self.prepend_tmp("work"), "-c", "-d", "+Wclang,-xir", "-f", file_llvmir_hacked, "-o", unified_file_core_obj])
-            elif opts.compile:
-                file_llvmir_hacked = await self.peanohack(file_llvmir)
-                file_llvmir_opt = self.prepend_tmp(f"{device_name}_input.opt.ll")
-                opt_level = opts.opt_level
-                # Disable loop idiom memset for O3 and above.
-                # Rationale: memset is executed as scalar operation, while
-                # zeroinitializer will be executed as vector.
-                # Cap opt at O1 to prevent LLVM's SLP vectorizer from
-                # creating sub-512-bit vector types (e.g., <4 x i8>) that
-                # crash the AIE2 GlobalISel legalizer. This is still needed
-                # for any scalar ops in the core (e.g., memref.copy loops).
-                safe_opt = min(int(opt_level), 1)
-                opt_flags = [f"--passes=default<O{safe_opt}>"]
-                if int(opt_level) >= 3:
-                    opt_flags.append("-disable-loop-idiom-memset")
-                opt_flags.extend(["-inline-threshold=10", "-S", file_llvmir_hacked, "-o", file_llvmir_opt])
-                await self.do_call(parent_task_id, [self.peano_opt_path] + opt_flags)
-                await self.do_call(parent_task_id, [self.peano_llc_path, file_llvmir_opt, f"-O{opt_level}", "--march=" + aie_target.lower(), "--function-sections", "--filetype=obj", "-o", unified_file_core_obj])
-        else:
-            unified_file_core_obj = None
-        # fmt: on
-
-        # Now, process each individual core.
-        processes = []
-        cores = generate_cores_list(device_op)
-        for core in cores:
-            processes.append(
-                self.process_core(
-                    device_name,
-                    core,
-                    aie_target,
-                    aie_peano_target,
-                    file_with_addresses,
-                    unified_file_core_obj,
-                    parent_task_id,
-                )
-            )
-        device_elf_paths = await asyncio.gather(*processes)
-        elf_paths = {}
-        for (col, row, _, _lf), elf_path in zip(cores, device_elf_paths):
-            elf_paths[(col, row)] = elf_path
-
-        # copy the elfs left by proess_core to the tmpdir for process_cdo
-        for elf in glob.glob("*.elf"):
-            try:
-                shutil.copy(elf, self.tmpdirname)
-            except shutil.SameFileError:
-                pass
-        for elf_map in glob.glob("*.elf.map"):
-            try:
-                shutil.copy(elf_map, self.tmpdirname)
-            except shutil.SameFileError:
-                pass
-
-        return elf_paths
-
-    async def process_core(
-        self,
-        device_name,
-        core,
-        aie_target,
-        aie_peano_target,
-        file_with_addresses,
-        unified_file_core_obj,
-        parent_task_id,
-    ):
-        async with self.limit:
-            if self.stopall:
-                return
-
-            install_path = aie.compiler.aiecc.configure.install_path()
-            runtime_lib_path = os.path.join(
-                install_path, "aie_runtime_lib", aie_target.upper()
-            )
-
-            # --gc-sections to eliminate unneeded code.
-            # --orphan-handling=error to ensure that the linker script is as expected.
-            # If there are orphaned input sections, then they'd likely end up outside of the normal program memory.
-            clang_link_args = ["-Wl,--gc-sections", "-Wl,--orphan-handling=error"]
-
-            task = self.progress_bar.add_task(
-                "[yellow] Core (%d, %d)" % core[0:2],
-                total=self.maxtasks,
-                command="starting",
-            )
-
-            # fmt: off
-            corecol, corerow, elf_file, link_files = core
-
-            # Copy external .o files to tmpdir so linker can find them.
-            # Use a temp-then-rename pattern so parallel core compilations
-            # that share the same .o filename do not corrupt each other's copy.
-            for lf in link_files:
-                src = lf if os.path.isabs(lf) else os.path.join(
-                    os.path.dirname(opts.filename) or os.getcwd(), lf)
-                dst = os.path.join(self.tmpdirname, os.path.basename(lf))
-                if src != dst:
-                    tmp_fd, tmp_path = tempfile.mkstemp(dir=self.tmpdirname)
-                    try:
-                        os.close(tmp_fd)
-                        shutil.copy2(src, tmp_path)
-                        os.replace(tmp_path, dst)
-                    except Exception:
-                        os.unlink(tmp_path)
-                        raise
-
-            if not opts.unified:
-                file_opt_core = corefile(self.tmpdirname, device_name, core, "opt.mlir")
-                with Context(), Location.unknown():
-                    module = Module.parse(await read_file_async(file_with_addresses))
-                self.run_passes(
-                    str(AIE_LOWER_TO_LLVM(device_name, corecol, corerow, opts.opt_level)),
-                    module,
-                    outputfile=file_opt_core,
-                    description=f"LLVM lowering for core ({corecol}, {corerow}) of {device_name}",
-                )
-            if self.opts.xbridge:
-                file_core_bcf = corefile(self.tmpdirname, device_name, core, "bcf")
-                await self.do_call(task, ["aie-translate", file_with_addresses, "--aie-generate-bcf", "--aie-device-name", device_name, "--tilecol=%d" % corecol, "--tilerow=%d" % corerow, "-o", file_core_bcf])
-            else:
-                file_core_ldscript = corefile(self.tmpdirname, device_name, core, "ld.script")
-                await self.do_call(task, ["aie-translate", file_with_addresses, "--aie-generate-ldscript", "--aie-device-name", device_name, "--tilecol=%d" % corecol, "--tilerow=%d" % corerow, "-o", file_core_ldscript])
-            if not self.opts.unified:
-                file_core_llvmir = corefile(self.tmpdirname, device_name, core, "ll")
-                await self.do_call(task, ["aie-translate", "--mlir-to-llvmir", file_opt_core, "-o", file_core_llvmir])
-                file_core_obj = corefile(self.tmpdirname, device_name, core, "o")
-
-            file_core_elf = elf_file if elf_file else corefile(self.tmpdirname, device_name, core, "elf")
-
-            if opts.compile and opts.xchesscc:
-                if not opts.unified:
-                    file_core_llvmir_chesslinked = await self.chesshack(task, file_core_llvmir, aie_target)
-                    if self.opts.link and self.opts.xbridge:
-                        link_with_obj = await extract_input_files(file_core_bcf)
-                        await self.do_call(task, ["xchesscc_wrapper", aie_target.lower(), "+w", self.prepend_tmp("work"), "-d", "+Wclang,-xir", "-f", file_core_llvmir_chesslinked, link_with_obj, "+l", file_core_bcf, "-o", file_core_elf])
-                    elif self.opts.link:
-                        await self.do_call(task, ["xchesscc_wrapper", aie_target.lower(), "+w", self.prepend_tmp("work"), "-c", "-d", "+Wclang,-xir", "-f", file_core_llvmir_chesslinked, "-o", file_core_obj])
-                        opt_level = opts.opt_level
-                        await self.do_call(task, [self.peano_clang_path, f"-O{opt_level}", "--target=" + aie_peano_target, file_core_obj, *clang_link_args, "-Wl,-T," + file_core_ldscript, "-o", file_core_elf], cwd=self.tmpdirname)
-                else:
-                    file_core_obj = unified_file_core_obj
-                    if opts.link and opts.xbridge:
-                        link_with_obj = await extract_input_files(file_core_bcf)
-                        await self.do_call(task, ["xchesscc_wrapper", aie_target.lower(), "+w", self.prepend_tmp("work"), "-d", "-f", file_core_obj, link_with_obj, "+l", file_core_bcf, "-o", file_core_elf])
-                    elif opts.link:
-                        opt_level = opts.opt_level
-                        await self.do_call(task, [self.peano_clang_path, f"-O{opt_level}", "--target=" + aie_peano_target, file_core_obj, *clang_link_args, "-Wl,-T," + file_core_ldscript, "-o", file_core_elf], cwd=self.tmpdirname)
-
-            elif opts.compile:
-                if not opts.unified:
-                    file_core_llvmir_peanohacked = await self.peanohack(file_core_llvmir)
-                    file_core_llvmir_stripped = corefile(self.tmpdirname, device_name, core, "stripped.ll")
-                    opt_level = opts.opt_level
-                    # Disable loop idiom memset for O3 and above.
-                    # Rationale: memset is executed as scalar operation, while
-                    # zeroinitializer will be executed as vector.
-                    # Cap opt at O1 to prevent LLVM's SLP vectorizer from
-                    # creating sub-512-bit vector types (e.g., <4 x i8>) that
-                    # crash the AIE2 GlobalISel legalizer.
-                    safe_opt = min(int(opt_level), 1)
-                    opt_flags = [f"--passes=default<O{safe_opt}>,strip"]
-                    if int(opt_level) >= 3:
-                        opt_flags.append("-disable-loop-idiom-memset")
-                    opt_flags.extend(["-S", file_core_llvmir_peanohacked, "-o", file_core_llvmir_stripped])
-                    await self.do_call(task, [self.peano_opt_path] + opt_flags)
-                    await self.do_call(task, [self.peano_llc_path, file_core_llvmir_stripped, f"-O{opt_level}", "--march=" + aie_target.lower(), "--function-sections", "--filetype=obj", "-o", file_core_obj])
-                else:
-                    file_core_obj = unified_file_core_obj
-
-                if opts.link and opts.xbridge:
-                    link_with_obj = await extract_input_files(file_core_bcf)
-                    await self.do_call(task, ["xchesscc_wrapper", aie_target.lower(), "+w", self.prepend_tmp("work"), "-d", "-f", file_core_obj, link_with_obj, "+l", file_core_bcf, "-o", file_core_elf])
-                elif opts.link:
-                    opt_level = opts.opt_level
-                    await self.do_call(task, [self.peano_clang_path, f"-O{opt_level}", "--target=" + aie_peano_target, file_core_obj, *clang_link_args, "-Wl,-T," + file_core_ldscript, "-o", file_core_elf], cwd=self.tmpdirname)
-
-            self.progress_bar.update(parent_task_id, advance=1)
-            self.progress_bar.update(task, advance=0, visible=False)
-            # fmt: on
-
-            return file_core_elf
-
-    async def write_elf_paths_to_mlir(self, input_physical, elf_paths):
-        # After core ELF files are generated, we create a new MLIR file with
-        # references to those generated files in place of their IR.
-        with Context(), Location.unknown():
-            input_physical_with_elfs_module = Module.parse(
-                await read_file_async(input_physical)
-            )
-            for device in find_ops(
-                input_physical_with_elfs_module.operation,
-                lambda o: isinstance(o.operation.opview, aiedialect.DeviceOp),
-            ):
-                device_name = device.sym_name.value
-                if device_name not in elf_paths:
-                    continue
 
-                for core in find_ops(
-                    device, lambda o: isinstance(o.operation.opview, aiedialect.CoreOp)
-                ):
-                    col = core.tile.owner.opview.col.value
-                    row = core.tile.owner.opview.row.value
-                    if (col, row) not in elf_paths[device_name]:
-                        continue
-
-                    set_elf_file_for_core(core, elf_paths[device_name][(col, row)])
-
-            input_physical_with_elfs_str = str(input_physical_with_elfs_module)
-            input_physical_with_elfs = self.prepend_tmp("input_physical_with_elfs.mlir")
-
-            with open(input_physical_with_elfs, "w") as f:
-                f.write(input_physical_with_elfs_str)
-            return input_physical_with_elfs
-
-    async def process_cdo(self, module_str, device_name):
-        with Context(), Location.unknown():
-            input_physical = Module.parse(module_str)
-            aiedialect.generate_cdo(
-                input_physical.operation, self.tmpdirname, device_name
-            )
-
-    async def process_txn(self, module, device_name):
-        file_txn = self.prepend_tmp(f"{device_name}_txn.mlir")
-        self.run_passes(
-            f"builtin.module(aie.device(convert-aie-to-transaction{{device-name={device_name} elf-dir={self.tmpdirname}}}))",
-            module,
-            outputfile=file_txn,
-            description=f"Transaction binary generation for {device_name}",
-        )
-        txn_dest = opts.txn_name.format(device_name)
-        if opts.verbose:
-            print(f"copy {file_txn} to {txn_dest}")
-        shutil.copy(file_txn, txn_dest)
-        return file_txn
-
-    async def aiebu_asm(
-        self, input_file, output_file, ctrl_packet_file=None, ctrl_packet_idx=0
-    ):
-        asm_bin = find_aiebu_asm()
-
-        args = [
-            asm_bin,
-            "-t",
-            "aie2txn",
-            "-c",
-            input_file,
-            "-o",
-            output_file,
-        ]
-
-        if ctrl_packet_file:
-            ctrl_packet_size = os.path.getsize(ctrl_packet_file)
-            exteral_buffers_json = {
-                "external_buffers": {
-                    "buffer_ctrl": {
-                        "xrt_id": ctrl_packet_idx,
-                        "logical_id": -1,
-                        "size_in_bytes": ctrl_packet_size,
-                        "ctrl_pkt_buffer": 1,
-                        "name": "runtime_control_packet",
-                    },
-                }
-            }
-            with open(self.prepend_tmp("external_buffers.json"), "w") as f:
-                json.dump(exteral_buffers_json, f, indent=2)
-            args = args + [
-                "-j",
-                self.prepend_tmp("external_buffers.json"),
-                "-p",
-                ctrl_packet_file,
-            ]
-
-        await self.do_call(None, args)
-
-    async def generate_full_elf_config_json(
-        self, devices, device_to_id_mapping, opts, parent_task=None
-    ):
-        config = {"xrt-kernels": []}
-
-        for device_op, device_name in devices:
-            sequences = generate_runtime_sequences_list(device_op)
-
-            # Skip devices with no runtime sequences (e.g., @empty device)
-            if not sequences:
-                continue
-
-            max_arg_count = max(
-                len(seq_op.body.blocks[0].arguments) for seq_op, seq_name in sequences
-            )
-            arguments = [
-                {"name": f"arg_{i}", "type": "char *", "offset": hex(i * 8)}
-                for i in range(max_arg_count)
-            ]
-
-            kernel_entry = {
-                "name": device_name,
-                "arguments": arguments,
-                "instance": [],
-                "PDIs": [],
-            }
-
-            for other_device_name, other_pdi_id in device_to_id_mapping.items():
-                pdi_filename = self.pdi_file_name(other_device_name)
-                kernel_entry["PDIs"].append(
-                    {"id": other_pdi_id, "PDI_file": pdi_filename}
-                )
-
-            for seq_op, seq_name in sequences:
-                insts_filename = self.npu_insts_file_name(device_name, seq_name)
-                kernel_entry["instance"].append(
-                    {"id": seq_name, "TXN_ctrl_code_file": insts_filename}
-                )
-
-            config["xrt-kernels"].append(kernel_entry)
-
-        return config
-
-    async def assemble_full_elf(
-        self, config_json_path, output_elf_path, parent_task=None
-    ):
-        asm_bin = find_aiebu_asm()
-        args = [
-            asm_bin,
-            "-t",
-            "aie2_config",
-            "-j",
-            config_json_path,
-            "-o",
-            output_elf_path,
-        ]
-        await self.do_call(parent_task, args)
-        if self.opts.verbose:
-            print(f"Generated full ELF: {output_elf_path}")
-
-    async def generate_full_elf(self, devices, device_to_id_mapping, parent_task=None):
-        """Generate config.json and invoke aiebu-asm after all artifacts are ready"""
-        if parent_task:
-            self.progress_bar.update(
-                parent_task, advance=0, command="Generating config.json"
-            )
-        config = await self.generate_full_elf_config_json(
-            devices, device_to_id_mapping, self.opts, parent_task
-        )
-        config_json_path = self.prepend_tmp("config.json")
-        await write_file_async(json.dumps(config, indent=2), config_json_path)
-        if self.opts.verbose:
-            if self.opts.verbose:
-                print(f"Generated config.json: {config_json_path}")
-        if parent_task:
-            self.progress_bar.update(
-                parent_task, advance=1, command="Generating config.json"
-            )
-        full_elf_path = self.opts.full_elf_name or "aie.elf"
-        await self.assemble_full_elf(config_json_path, full_elf_path, parent_task)
-
-    async def process_ctrlpkt(self, module, device_op, device_name):
-        file_ctrlpkt_mlir = self.prepend_tmp(f"{device_name}_ctrlpkt.mlir")
-        file_ctrlpkt_bin = opts.ctrlpkt_name.format(device_name)
-        file_ctrlpkt_dma_seq_mlir = self.prepend_tmp(
-            f"{device_name}_ctrlpkt_dma_seq.mlir"
-        )
-        ctrlpkt_module = self.run_passes(
-            "builtin.module(aie.device(convert-aie-to-transaction{elf-dir="
-            + self.tmpdirname
-            + "},aie-txn-to-ctrl-packet,aie-legalize-ctrl-packet))",
-            module,
-            outputfile=file_ctrlpkt_mlir,
-            description="Transaction binary to control packet conversion",
-        )
-
-        # aie-translate --aie-ctrlpkt-to-bin -o ctrlpkt.bin
-        with ctrlpkt_module.context, Location.unknown():
-            ctrlpkt_bin = aiedialect.generate_control_packets(
-                ctrlpkt_module.operation, device_name
-            )
-        with open(file_ctrlpkt_bin, "wb") as f:
-            f.write(struct.pack("I" * len(ctrlpkt_bin), *ctrlpkt_bin))
-
-        # aie-opt --aie-ctrl-packet-to-dma -aie-dma-to-npu
-        ctrl_seq_module = self.run_passes(
-            "builtin.module(aie.device(aie-ctrl-packet-to-dma,aie-dma-to-npu))",
-            ctrlpkt_module,
-            outputfile=file_ctrlpkt_dma_seq_mlir,
-            description="Control packet to DMA sequence conversion",
-        )
-
-        # aie-translate --aie-npu-to-binary -o npu_insts.bin
-        with ctrl_seq_module.context, Location.unknown():
-            insts_bin = aiedialect.translate_npu_to_binary(
-                ctrl_seq_module.operation, device_name, opts.sequence_name
-            )
-        with open(opts.insts_name.format(device_name, "seq"), "wb") as f:
-            f.write(struct.pack("I" * len(insts_bin), *insts_bin))
-
-        ctrl_idx = 0
-        with Context(), Location.unknown():
-            # walk the device to find runtime sequence
-            seqs = find_ops(
-                device_op.operation,
-                lambda o: isinstance(o.operation.opview, aiexdialect.RuntimeSequenceOp),
-            )
-            if seqs:
-                ctrl_idx = len(seqs[0].regions[0].blocks[0].arguments.types)
-        await self.aiebu_asm(
-            opts.insts_name.format(device_name, "seq"),
-            opts.elf_name.format(device_name),
-            file_ctrlpkt_bin,
-            ctrl_idx,
-        )
-
-    async def process_elf(self, npu_insts_module, device_name):
-        # translate npu instructions to binary and write to file
-        npu_insts = aiedialect.translate_npu_to_binary(
-            npu_insts_module.operation, device_name, opts.sequence_name
-        )
-
-        npu_insts_bin = self.prepend_tmp(f"{device_name}_elf_insts.bin")
-        with open(npu_insts_bin, "wb") as f:
-            f.write(struct.pack("I" * len(npu_insts), *npu_insts))
-
-        await self.aiebu_asm(npu_insts_bin, opts.elf_name.format(device_name))
-
-    async def process_pdi_gen(self, device_name, file_design_pdi):
-        file_design_bif = self.prepend_tmp(f"{device_name}_design.bif")
-
-        await write_file_async(
-            emit_design_bif(self.tmpdirname, device_name), file_design_bif
-        )
-
-        await self.do_call(
-            None,
-            [
-                "bootgen",
-                "-arch",
-                "versal",
-                "-image",
-                file_design_bif,
-                "-o",
-                file_design_pdi,
-                "-w",
-            ],
-        )
-
-    # generate an xclbin. The inputs are self.mlir_module_str and the cdo
-    # binaries from the process_cdo step.
-    async def process_xclbin_gen(self, device_op, device_name):
-        task = self.progress_bar.add_task(
-            "[yellow] XCLBIN generation ", total=10, command="starting"
-        )
-
-        file_mem_topology = self.prepend_tmp(f"{device_name}_mem_topology.json")
-        file_partition = self.prepend_tmp(f"{device_name}_aie_partition.json")
-        file_input_partition = self.prepend_tmp(
-            f"{device_name}_aie_input_partition.json"
-        )
-        file_kernels = self.prepend_tmp(f"{device_name}_kernels.json")
-        file_pdi = self.pdi_file_name(device_name)
-
-        # collect the tasks to generate the inputs to xclbinutil
-        processes = []
-
-        # generate mem_topology.json
-        processes.append(
-            write_file_async(json.dumps(mem_topology, indent=2), file_mem_topology)
-        )
-
-        # generate aie_partition.json
-        processes.append(
-            write_file_async(
-                json.dumps(
-                    emit_partition(
-                        self.mlir_module_str, device_op, file_pdi, opts.kernel_id
-                    ),
-                    indent=2,
-                ),
-                file_partition,
-            )
-        )
-
-        # generate kernels.json
-        buffer_arg_names = [f"bo{i}" for i in range(5)]
-        processes.append(
-            write_file_async(
-                json.dumps(
-                    emit_design_kernel_json(
-                        opts.kernel_name,
-                        opts.kernel_id,
-                        opts.instance_name,
-                        buffer_arg_names,
-                    ),
-                    indent=2,
-                ),
-                file_kernels,
-            )
-        )
-
-        # generate pdi
-        processes.append(self.process_pdi_gen(device_name, file_pdi))
-
-        # get partition info from input xclbin, if present
-        if opts.xclbin_input:
-            processes.append(
-                self.do_call(
-                    task,
-                    [
-                        "xclbinutil",
-                        "--dump-section",
-                        f"AIE_PARTITION:JSON:{file_input_partition}",
-                        "--force",
-                        "--quiet",
-                        "--input",
-                        opts.xclbin_input,
-                    ],
-                )
-            )
-
-        # wait for all of the above to finish
-        await asyncio.gather(*processes)
-
-        # fmt: off
-        if opts.xclbin_input:
-            # patch the input partition json with the new partition information
-            with open(file_input_partition) as f:
-                input_partition = json.load(f)
-            with open(file_partition) as f:
-                new_partition = json.load(f)
-            input_partition["aie_partition"]["PDIs"].append(new_partition["aie_partition"]["PDIs"][0])
-            with open(file_partition, "w") as f:
-                json.dump(input_partition, f, indent=2)
-            flag = ['--input', opts.xclbin_input]
-        else:
-            flag = ["--add-replace-section", "MEM_TOPOLOGY:JSON:" + file_mem_topology]
-
-        # run xclbinutil to generate the xclbin
-        await self.do_call(task, ["xclbinutil"] + flag +
-                                 ["--add-kernel", file_kernels,
-                                  "--add-replace-section", "AIE_PARTITION:JSON:" + file_partition,
-                                  "--force", "--quiet", "--output", opts.xclbin_name.format(device_name)])
-        # fmt: on
-
-    async def process_host_cgen(self, aie_target, file_physical_with_elfs, device_name):
-        async with self.limit:
-            if self.stopall:
-                return
-
-            task = self.progress_bar.add_task(
-                "[yellow] Host compilation ", total=10, command="starting"
-            )
-
-            if opts.link_against_hsa:
-                file_inc_cpp = self.prepend_tmp("aie_data_movement.cpp")
-                await self.do_call(
-                    task,
-                    [
-                        "aie-translate",
-                        "--aie-generate-hsa",
-                        "--aie-device-name",
-                        device_name,
-                        file_physical_with_elfs,
-                        "-o",
-                        file_inc_cpp,
-                    ],
-                )
-
-            cmd = ["clang++", "-std=c++17"]
-            if opts.host_target:
-                cmd += ["--target=" + opts.host_target]
-                if (
-                    opts.aiesim
-                    and opts.host_target
-                    != aie.compiler.aiecc.configure.host_architecture
-                ):
-                    sys.exit(
-                        "Host cross-compile from "
-                        + aie.compiler.aiecc.configure.host_architecture
-                        + " to --target="
-                        + opts.host_target
-                        + " is not supported with --aiesim"
-                    )
-
-            if self.opts.sysroot:
-                cmd += ["--sysroot=" + opts.sysroot]
-                # In order to find the toolchain in the sysroot, we need to have
-                # a 'target' that includes 'linux' and for the 'lib/gcc/$target/$version'
-                # directory to have a corresponding 'include/gcc/$target/$version'.
-                # In some of our sysroots, it seems that we find a lib/gcc, but it
-                # doesn't have a corresponding include/gcc directory.  Instead
-                # force using '/usr/lib,include/gcc'
-                if opts.host_target == "aarch64-linux-gnu":
-                    cmd += [f"--gcc-toolchain={opts.sysroot}/usr"]
-                    # It looks like the G++ distribution is non standard, so add
-                    # an explicit handling of C++ library.
-                    # Perhaps related to https://discourse.llvm.org/t/add-gcc-install-dir-deprecate-gcc-toolchain-and-remove-gcc-install-prefix/65091/23
-                    cxx_include = glob.glob(f"{opts.sysroot}/usr/include/c++/*.*.*")[0]
-                    triple = os.path.basename(opts.sysroot)
-                    cmd += [f"-I{cxx_include}", f"-I{cxx_include}/{triple}"]
-                    gcc_lib = glob.glob(f"{opts.sysroot}/usr/lib/{triple}/*.*.*")[0]
-                    cmd += [f"-B{gcc_lib}", f"-L{gcc_lib}"]
-            install_path = aie.compiler.aiecc.configure.install_path()
-
-            # Setting everything up if linking against HSA
-            if opts.link_against_hsa:
-                cmd += ["-DHSA_RUNTIME"]
-                arch_name = opts.host_target.split("-")[0] + "-hsa"
-                hsa_path = os.path.join(aie.compiler.aiecc.configure.hsa_dir)
-                hsa_include_path = os.path.join(hsa_path, "..", "..", "..", "include")
-                hsa_lib_path = os.path.join(hsa_path, "..", "..")
-                hsa_so_path = os.path.join(hsa_lib_path, "libhsa-runtime64.so")
-            else:
-                arch_name = opts.host_target.split("-")[0]
-
-            # Getting a pointer to the libxaie include and library
-            runtime_xaiengine_path = os.path.join(
-                install_path, "runtime_lib", arch_name, "xaiengine"
-            )
-            xaiengine_include_path = os.path.join(runtime_xaiengine_path, "include")
-            xaiengine_lib_path = os.path.join(runtime_xaiengine_path, "lib")
-
-            # Getting a pointer to the library test_lib
-            runtime_testlib_path = os.path.join(
-                install_path,
-                "runtime_lib",
-                arch_name,
-                "test_lib",
-                "lib",
-            )
-
-            # Linking against the correct memory allocator
-            if opts.link_against_hsa:
-                memory_allocator = os.path.join(
-                    runtime_testlib_path, "libmemory_allocator_hsa.a"
-                )
-            else:
-                memory_allocator = os.path.join(
-                    runtime_testlib_path, "libmemory_allocator_ion.a"
-                )
-
-            cmd += [
-                memory_allocator,
-                "-I" + xaiengine_include_path,
-                "-L" + xaiengine_lib_path,
-                "-Wl,-R" + xaiengine_lib_path,
-                "-I" + self.tmpdirname,
-                "-fuse-ld=lld",
-                "-lm",
-                "-lxaienginecdo",
-            ]
-            # Linking against HSA
-            if opts.link_against_hsa:
-                cmd += [hsa_so_path]
-                cmd += ["-I%s" % hsa_include_path]
-                cmd += ["-Wl,-rpath,%s" % hsa_lib_path]
-
-            cmd += aie_target_defines(aie_target)
-
-            if len(opts.host_args) > 0:
-                await self.do_call(task, cmd + opts.host_args)
-
-            self.progress_bar.update(self.progress_bar.task_completed, advance=1)
-            self.progress_bar.update(task, advance=0, visible=False)
-
-    async def gen_sim(self, task, aie_target, file_physical, device_name):
-        # For simulation, we need to additionally parse the 'remaining' options to avoid things
-        # which conflict with the options below (e.g. -o)
-        host_opts = aie.compiler.aiecc.cl_arguments.strip_host_args_for_aiesim(
-            opts.host_args
-        )
-
-        sim_dir = self.prepend_tmp("sim")
-        shutil.rmtree(sim_dir, ignore_errors=True)
-        subdirs = ["arch", "reports", "config", "ps"]
-
-        def make_sim_dir(x):
-            dir = os.path.join(sim_dir, x)
-            os.makedirs(dir, exist_ok=True)
-            return dir
-
-        sim_arch_dir, sim_reports_dir, sim_config_dir, sim_ps_dir = map(
-            make_sim_dir, subdirs
-        )
-
-        install_path = aie.compiler.aiecc.configure.install_path()
-
-        # Setting everything up if linking against HSA
-        if opts.link_against_hsa:
-            arch_name = opts.host_target.split("-")[0] + "-hsa"
-        else:
-            arch_name = opts.host_target.split("-")[0]
-
-        runtime_simlib_path = os.path.join(
-            install_path, "aie_runtime_lib", aie_target.upper(), "aiesim"
-        )
-        runtime_testlib_path = os.path.join(
-            install_path,
-            "runtime_lib",
-            arch_name,
-            "test_lib",
-            "lib",
-        )
-        runtime_testlib_include_path = os.path.join(
-            install_path,
-            "runtime_lib",
-            arch_name,
-            "test_lib",
-            "include",
-        )
-        sim_genwrapper = os.path.join(runtime_simlib_path, "genwrapper_for_ps.cpp")
-        memory_allocator = os.path.join(
-            runtime_testlib_path, "libmemory_allocator_sim_aie.a"
-        )
-        # Getting a pointer to the libxaie include and library
-        runtime_xaiengine_path = os.path.join(
-            install_path, "runtime_lib", arch_name, "xaiengine"
-        )
-        xaiengine_include_path = os.path.join(runtime_xaiengine_path, "include")
-        xaiengine_lib_path = os.path.join(runtime_xaiengine_path, "lib")
-        sim_cc_args = [
-            "-fPIC",
-            "-flto",
-            "-fpermissive",
-            "-DAIE_OPTION_SCALAR_FLOAT_ON_VECTOR",
-            "-Wno-deprecated-declarations",
-            "-Wno-enum-constexpr-conversion",
-            "-Wno-format-security",
-            "-DSC_INCLUDE_DYNAMIC_PROCESSES",
-            "-D__AIESIM__",
-            "-D__PS_INIT_AIE__",
-            "-Og",
-            "-Dmain(...)=ps_main(...)",
-            "-I" + self.tmpdirname,
-            "-I" + opts.aietools_path + "/include",
-            "-I" + xaiengine_include_path,
-            "-I" + opts.aietools_path + "/data/osci_systemc/include",
-            "-I" + opts.aietools_path + "/include/xtlm/include",
-            "-I" + opts.aietools_path + "/include/common_cpp/common_cpp_v1_0/include",
-            "-I" + runtime_testlib_include_path,
-            memory_allocator,
-        ]  # clang is picky  # Pickup aie_inc.cpp
-
-        sim_link_args = [
-            "-L" + xaiengine_lib_path,
-            "-lxaienginecdo",
-            "-L" + opts.aietools_path + "/lib/lnx64.o",
-            "-L" + opts.aietools_path + "/lib/lnx64.o/Ubuntu",
-            "-L" + opts.aietools_path + "/data/osci_systemc/lib/lnx64",
-            "-Wl,--as-needed",
-            "-lsystemc",
-            "-lxtlm",
-        ]
-
-        processes = []
-        processes.append(
-            self.do_call(
-                task,
-                [
-                    "aie-translate",
-                    "--aie-mlir-to-xpe",
-                    "--aie-device-name",
-                    device_name,
-                    file_physical,
-                    "-o",
-                    os.path.join(sim_reports_dir, "graph.xpe"),
-                ],
-            )
-        )
-        processes.append(
-            self.do_call(
-                task,
-                [
-                    "aie-translate",
-                    "--aie-mlir-to-shim-solution",
-                    "--aie-device-name",
-                    device_name,
-                    file_physical,
-                    "-o",
-                    os.path.join(sim_arch_dir, "aieshim_solution.aiesol"),
-                ],
-            )
-        )
-        processes.append(
-            self.do_call(
-                task,
-                [
-                    "aie-translate",
-                    "--aie-mlir-to-scsim-config",
-                    "--aie-device-name",
-                    device_name,
-                    file_physical,
-                    "-o",
-                    os.path.join(sim_config_dir, "scsim_config.json"),
-                ],
-            )
-        )
-
-        flows_output = os.path.join(sim_dir, "flows_physical.mlir")
-        with Context(), Location.unknown():
-            module = Module.parse(await read_file_async(file_physical))
-        self.run_passes(
-            "builtin.module(aie.device(aie-find-flows))",
-            module,
-            outputfile=flows_output,
-            description="Finding flows for simulation",
-        )
-
-        processes.append(
-            self.do_call(
-                task,
-                [
-                    "clang++",
-                    "-O2",
-                    "-fuse-ld=lld",
-                    "-shared",
-                    "-o",
-                    os.path.join(sim_ps_dir, "ps.so"),
-                    sim_genwrapper,
-                    *aie_target_defines(aie_target),
-                    *host_opts,
-                    *sim_cc_args,
-                    *sim_link_args,
-                ],
-            )
-        )
-        await asyncio.gather(*processes)
-        await self.do_call(
-            task,
-            [
-                "aie-translate",
-                "--aie-device-name",
-                device_name,
-                "--aie-flows-to-json",
-                os.path.join(sim_dir, "flows_physical.mlir"),
-                "-o",
-                os.path.join(sim_dir, "flows_physical.json"),
-            ],
-        )
-
-        sim_script = self.prepend_tmp("aiesim.sh")
-        sim_script_template = dedent("""\
-            #!/bin/sh
-            prj_name=$(basename $(dirname $(realpath $0)))
-            root=$(dirname $(dirname $(realpath $0)))
-            vcd_filename=foo
-            if [ -n "$1" ]; then
-              vcd_filename=$1
-            fi
-            cd $root
-            aiesimulator --pkg-dir=${prj_name}/sim --dump-vcd ${vcd_filename}
-            """)
-        with open(sim_script, "wt") as sim_script_file:
-            sim_script_file.write(sim_script_template)
-        stats = os.stat(sim_script)
-        os.chmod(sim_script, stats.st_mode | stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)
-
-        target = os.path.join(sim_dir, ".target")
-        with open(target, "wt") as target_file:
-            target_file.write("hw\n")
-
-        print("Simulation generated...")
-        print("To run simulation: " + sim_script)
-
-    async def get_aie_target_for_device(self, mlir_input_file, device_name):
-        t = do_run(
-            [
-                "aie-translate",
-                "--aie-generate-target-arch",
-                "--aie-device-name",
-                device_name,
-                mlir_input_file,
-            ],
-            self.opts.verbose,
-        )
-        aie_target = t.stdout.strip()
-        return (aie_target, get_peano_target(aie_target))
-
-    async def run_flow(self):
-        # First, we run some aie-opt passes that transform the MLIR for every
-        # device. Then, we generate the core code for each AIE core tile in
-        # every device. The result of this is an ELF file with each core's
-        # code; we generate a new MLIR file which referencees those generated
-        # ELF files in place of their IR code. We then generate artifacts for
-        # each device individually, using this last generated IR.
-
-        nworkers = int(opts.nthreads)
-        if nworkers == 0:
-            nworkers = os.cpu_count()
-
-        module = parse_file_as_mlir(self.mlir_module_str)
-
-        self.limit = asyncio.Semaphore(nworkers)
-        with progress.Progress(
-            *progress.Progress.get_default_columns(),
-            progress.TimeElapsedColumn(),
-            progress.MofNCompleteColumn(),
-            progress.TextColumn("{task.fields[command]}"),
-            redirect_stdout=False,
-            redirect_stderr=False,
-            disable=not opts.progress,
-        ) as progress_bar:
-            self.progress_bar = progress_bar
-
-            # 1.) MLIR transformations
-
-            task1 = progress_bar.add_task(
-                "[green] MLIR compilation", total=3, command="1 Worker"
-            )
-
-            self.progress_bar.update(task1, advance=1, command="Generating device list")
-            devices = generate_devices_list(module)
-            if len(devices) == 0:
-                print("error: input MLIR must contain at least one aie.device")
-                sys.exit(1)
-            aie_targets, aie_peano_targets = [], []
-            for device_op, device_name in devices:
-                aie_target, aie_peano_target = await self.get_aie_target_for_device(
-                    opts.filename, device_name
-                )
-                aie_targets.append(aie_target)
-                aie_peano_targets.append(aie_peano_target)
-
-            if len(aie_targets) == 0 or not all(
-                aie_target == aie_targets[0] for aie_target in aie_targets
-            ):
-                print("error: all device targets in the file must be the same")
-                # TODO: remove this restriction? currently only needed by AIEVec
-                sys.exit(1)
-            aie_target, aie_peano_target = aie_targets[0], aie_peano_targets[0]
-
-            pass_pipeline = INPUT_WITH_ADDRESSES_PIPELINE(
-                opts.alloc_scheme,
-                opts.dynamic_objFifos,
-                opts.packet_sw_objFifos,
-                opts.ctrl_pkt_overlay,
-                aie_target,
-                opts.opt_level,
-            ).materialize(module=True)
-
-            self.progress_bar.update(task1, advance=1, command=pass_pipeline[0:30])
-            file_with_addresses = self.prepend_tmp("input_with_addresses.mlir")
-            file_with_addresses_module = self.run_passes(
-                pass_pipeline,
-                module,
-                outputfile=file_with_addresses,
-                description="Resource allocation and Object FIFO lowering",
-            )
-
-            requires_routing = (
-                opts.xcl
-                or opts.cdo
-                or opts.pdi
-                or opts.compile
-                or opts.compile_host
-                or opts.aiesim
-            )
-            if requires_routing:
-                input_physical = self.prepend_tmp("input_physical.mlir")
-                self.run_passes(
-                    "builtin.module(aie.device(aie-create-pathfinder-flows))",
-                    file_with_addresses_module,
-                    outputfile=input_physical,
-                    description="Running Router",
-                )
-            else:
-                input_physical = file_with_addresses
-
-            self.progress_bar.update(task1, advance=1)
-
-            # 2.) Generate code for each core
-            requires_core_compilation = (
-                opts.xcl
-                or opts.cdo
-                or opts.pdi
-                or opts.compile
-                or opts.compile_host
-                or opts.aiesim
-            )
-            if requires_core_compilation:
-                task2 = progress_bar.add_task(
-                    "[green] Generating code for each core", total=3, command=""
-                )
-
-                # create core ELF files for each device and core
-                elf_paths = {}
-                for i, (device_op, device_name) in enumerate(devices):
-                    aie_target, aie_peano_target = aie_targets[i], aie_peano_targets[i]
-                    elf_paths[device_name] = await self.process_cores(
-                        device_op,
-                        device_name,
-                        file_with_addresses,
-                        aie_target,
-                        aie_peano_target,
-                        task2,
-                    )
-                input_physical_with_elfs = await self.write_elf_paths_to_mlir(
-                    input_physical, elf_paths
-                )
-            else:
-                input_physical_with_elfs = input_physical
-
-            # 3.) Targets that require the cores to be lowered but apply across all devices
-
-            npu_insts_module = None
-            if opts.npu or opts.elf or opts.generate_full_elf and not opts.ctrlpkt:
-                task3 = progress_bar.add_task(
-                    "[green] Lowering NPU instructions", total=2, command=""
-                )
-                with Context(), Location.unknown():
-                    input_physical_with_elfs_module = Module.parse(
-                        await read_file_async(input_physical_with_elfs)
-                    )
-                    npu_pipeline = _create_npu_lowering_pipeline(opts.expand_load_pdis)
-                    pass_pipeline = npu_pipeline.materialize(module=True)
-                    npu_insts_file = self.prepend_tmp(f"npu_insts.mlir")
-                    self.progress_bar.update(
-                        task3, advance=1, command=pass_pipeline[0:30]
-                    )
-                    npu_insts_module = self.run_passes(
-                        pass_pipeline,
-                        input_physical_with_elfs_module,
-                        npu_insts_file,
-                        description="NPU instruction lowering",
-                    )
-
-                    # If expand_load_pdis is enabled, the pass may have created new devices
-                    # (e.g., @empty), so we need to regenerate the device list from the transformed module
-                    if opts.expand_load_pdis:
-                        devices = generate_devices_list(npu_insts_module)
-                        input_physical_with_expanded = self.prepend_tmp(
-                            "input_physical_with_expanded.mlir"
-                        )
-                        await write_file_async(
-                            str(npu_insts_module), input_physical_with_expanded
-                        )
-                        # Update both input_physical and input_physical_with_elfs to point to the file with expanded devices
-                        input_physical = input_physical_with_expanded
-                        input_physical_with_elfs = input_physical_with_expanded
-
-                    if opts.generate_full_elf:
-                        device_to_id_mapping = create_device_id_mapping(devices)
-                        assign_load_pdi_ids(npu_insts_module, device_to_id_mapping)
-                        transformed_mlir_path = self.prepend_tmp(
-                            "npu_insts_with_pdi_ids.mlir"
-                        )
-                        await write_file_async(
-                            str(npu_insts_module), transformed_mlir_path
-                        )
-
-                    self.progress_bar.update(task3, advance=1)
-
-            # 4.) Generate compilation artifacts for each device
-
-            # create other artifacts for each device
-            task4 = progress_bar.add_task(
-                "[green] Generating device artifacts", total=len(devices), command=""
-            )
-            for device_op, device_name in devices:
-                aie_target, aie_peano_target = await self.get_aie_target_for_device(
-                    input_physical, device_name
-                )
-                await self.run_flow_for_device(
-                    input_physical,
-                    input_physical_with_elfs,
-                    npu_insts_module,
-                    device_op,
-                    device_name,
-                    aie_target,
-                    aie_peano_target,
-                    task4,
-                )
-
-            self.maxtasks = 2
-            task5 = progress_bar.add_task(
-                "[green] Creating full ELF", total=2, command=""
-            )
-            if opts.generate_full_elf:
-                await self.generate_full_elf(devices, device_to_id_mapping, task5)
-
-    async def run_flow_for_device(
-        self,
-        input_physical,
-        input_physical_with_elfs,
-        npu_insts_module,
-        device_op,
-        device_name,
-        aie_target,
-        aie_peano_target,
-        parent_task_id,
-    ):
-        pb = self.progress_bar
-        nworkers = int(opts.nthreads)
-
-        # Optionally generate insts.bin for NPU instruction stream
-        if opts.npu or opts.generate_full_elf and not opts.ctrlpkt:
-            # write each runtime sequence binary into its own file
-            runtime_sequences = generate_runtime_sequences_list(device_op)
-            for seq_op, seq_name in runtime_sequences:
-                pb.update(
-                    parent_task_id,
-                    description=f"[green] Creating NPU instruction binary",
-                )
-                npu_insts = aiedialect.translate_npu_to_binary(
-                    npu_insts_module.operation, device_name, seq_name
-                )
-                npu_insts_path = self.npu_insts_file_name(device_name, seq_name)
-                with open(npu_insts_path, "wb") as f:
-                    f.write(struct.pack("I" * len(npu_insts), *npu_insts))
-                pb.update(parent_task_id, advance=1)
-
-        if opts.compile_host or opts.aiesim:
-            file_inc_cpp = self.prepend_tmp("aie_inc.cpp")
-            await self.do_call(
-                parent_task_id,
-                [
-                    "aie-translate",
-                    "--aie-generate-xaie",
-                    "--aie-device-name",
-                    device_name,
-                    input_physical_with_elfs,
-                    "-o",
-                    file_inc_cpp,
-                ],
-            )
-
-        if opts.compile_host and len(opts.host_args) > 0:
-            await self.process_host_cgen(
-                aie_target, input_physical_with_elfs, device_name
-            )
-
-        processes = []
-        if opts.aiesim:
-            processes.append(
-                self.gen_sim(parent_task_id, aie_target, input_physical, device_name)
-            )
-
-        input_physical_with_elfs_str = await read_file_async(input_physical_with_elfs)
-
-        if (
-            opts.cdo or opts.xcl or opts.pdi or opts.generate_full_elf
-        ) and opts.execute:
-            await self.process_cdo(input_physical_with_elfs_str, device_name)
-
-        if opts.xcl:
-            processes.append(self.process_xclbin_gen(device_op, device_name))
-        # self.process_pdi_gen is called in process_xclbin_gen,
-        # so don't call it again if opts.xcl is set
-        elif opts.pdi or opts.generate_full_elf:
-            processes.append(
-                self.process_pdi_gen(device_name, self.pdi_file_name(device_name))
-            )
-        with Context(), Location.unknown():
-            input_physical_with_elfs_module = Module.parse(input_physical_with_elfs_str)
-        if opts.txn and opts.execute:
-            input_physical_with_elfs = await self.process_txn(
-                input_physical_with_elfs_module, device_name
-            )
-
-        if opts.ctrlpkt and opts.execute:
-            processes.append(
-                self.process_ctrlpkt(
-                    input_physical_with_elfs_module, device_op, device_name
-                )
-            )
-
-        if opts.elf and not opts.ctrlpkt and opts.execute:
-            processes.append(self.process_elf(npu_insts_module, device_name))
-
-        await asyncio.gather(*processes)
-
-    def dumpprofile(self):
-        sortedruntimes = sorted(
-            self.runtimes.items(), key=lambda item: item[1], reverse=True
-        )
-        for i in range(50):
-            if i < len(sortedruntimes):
-                s1, s0 = sortedruntimes[i][1], sortedruntimes[i][0]
-                print(f"{s1:.4f} sec: {s0}")
-
-
-def run(mlir_module, args=None):
-    global opts
-    if args is not None:
-        opts = aie.compiler.aiecc.cl_arguments.parse_args(args)
-
-    opts.aietools_path = None
-
-    # If Ryzen AI Software is installed then use it for aietools
     try:
-        import ryzen_ai.__about__
-
-        version = ryzen_ai.__about__.__version__
-        path = os.path.realpath(ryzen_ai.__path__[0])
-        if opts.verbose:
-            print(f"Found Ryzen AI software version {version} at {path}")
-        # if ryzenai software is pip installed then the path is something like:
-        # <workdir>/venv/lib/python3.10/site-packages/
-        opts.aietools_path = os.path.realpath(os.path.join(path, ".."))
-    except:
-        pass
-
-    # Try to find xchesscc in the path
-    xchesscc_path = shutil.which("xchesscc")
-    if xchesscc_path:
-        xchesscc_bin_path = os.path.dirname(os.path.realpath(xchesscc_path))
-        xchesscc_path = os.path.dirname(xchesscc_bin_path)
-        if opts.verbose:
-            print(f"Found xchesscc at {xchesscc_path}")
-        os.environ["PATH"] = os.pathsep.join([os.environ["PATH"], xchesscc_bin_path])
-        if opts.aietools_path is None:
-            opts.aietools_path = xchesscc_path
-    else:
-        if opts.verbose:
-            print("xchesscc not found.")
-
-    if opts.aietools_path is None:
-        if opts.verbose:
-            print("Could not find aietools from Vitis or Ryzen AI Software.")
-        opts.aietools_path = "<aietools not found>"
+        aiecc_bin = _find_aiecc_binary()
+    except FileNotFoundError as e:
+        raise RuntimeError(str(e))
 
-    os.environ["AIETOOLS"] = opts.aietools_path
+    # Convert module to string if needed
+    mlir_str = str(mlir_module)
 
-    aie_path = aie.compiler.aiecc.configure.install_path()
-    peano_path = os.path.join(opts.peano_install_dir, "bin")
-    os.environ["PATH"] = os.pathsep.join([aie_path, os.environ["PATH"]])
-    os.environ["PATH"] = os.pathsep.join([peano_path, os.environ["PATH"]])
-
-    if opts.aiesim and not opts.xbridge:
-        sys.exit("AIE Simulation (--aiesim) currently requires --xbridge")
-
-    if opts.verbose:
-        print(f"Compiling {opts.filename}")
-
-    if opts.tmpdir:
-        tmpdirname = opts.tmpdir
-    elif opts.filename:
-        tmpdirname = os.path.basename(opts.filename) + ".prj"
-    else:
-        tmpdirname = tempfile.mkdtemp()
-    tmpdirname = os.path.abspath(tmpdirname)
+    # Write MLIR to temp file
+    with tempfile.NamedTemporaryFile(mode="w", suffix=".mlir", delete=False) as f:
+        f.write(mlir_str)
+        mlir_path = f.name
 
     try:
-        os.mkdir(tmpdirname)
-    except FileExistsError:
-        pass
-    if opts.verbose:
-        print("created temporary directory", tmpdirname)
-
-    # Create a temporary file holding the input ir, if opts.filename is None.
-    if opts.filename == None:
-        tmpinput_path = os.path.join(tmpdirname, "tmpinput.mlir")
-        with open(tmpinput_path, "w") as f:
-            f.write(str(mlir_module))
-        opts.filename = tmpinput_path
-
-    runner = FlowRunner(str(mlir_module), opts, tmpdirname)
-    asyncio.run(runner.run_flow())
-
-    if opts.profiling:
-        runner.dumpprofile()
-
-
-def main():
-    global opts
+        cmd = [aiecc_bin, mlir_path]
+        if args:
+            if isinstance(args, str):
+                cmd.extend(args.split())
+            else:
+                cmd.extend(args)
 
-    # Set MLIR_AIE_INSTALL_DIR if not already set
-    if "MLIR_AIE_INSTALL_DIR" not in os.environ:
-        install_dir = aie.compiler.aiecc.configure.install_path()
-        os.environ["MLIR_AIE_INSTALL_DIR"] = install_dir
+        result = subprocess.run(cmd, capture_output=True, text=True)
 
-    opts = aie.compiler.aiecc.cl_arguments.parse_args()
+        if result.returncode != 0:
+            error_msg = result.stderr if result.stderr else result.stdout
+            raise RuntimeError(
+                f"aiecc failed with exit code {result.returncode}: {error_msg}"
+            )
 
-    if opts.version:
-        print(f"aiecc.py {aie.compiler.aiecc.configure.git_commit}")
-        sys.exit(0)
+        return result.stdout
+    finally:
+        try:
+            os.unlink(mlir_path)
+        except OSError:
+            pass
 
-    if opts.filename is None:
-        print("error: the 'file' positional argument is required.")
-        sys.exit(1)
 
-    try:
-        with Context() as ctx, Location.unknown():
-            with open(opts.filename, "r") as f:
-                module = Module.parse(f.read())
-            module_str = str(module)
-    except Exception as e:
-        print(e)
-        sys.exit(1)
-    run(module_str)
+if __name__ == "__main__":
+    main()

From 4141ceaa97bb558857063e87c2f1a30bce50dd57 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 5 Mar 2026 17:27:54 -0700
Subject: [PATCH 10/28] [aiecc/jit] Fix JIT external function path resolution
 in C++ aiecc path

Four bugs found during post-rebase testing:

1. aiecc.cpp: atomicCopyFile was called unconditionally even when src==dest
   (JIT case where the .o is already in tmpDirName). Guard the copy with
   a src!=dest check in both compileCores and compileCoresUnified Peano
   sections.

2. aiecc.cpp: ld.lld resolves INPUT() paths relative to process cwd, not
   the linker script directory. Before generating the ldscript, patch the
   CoreOp's link_files attribute to use absolute paths (tmpDirName-prefixed)
   so INPUT() directives are always absolute.

3. python/utils/compile/utils.py: when work_dir is provided, write aie.mlir
   into work_dir and call aiecc directly on that path so relative link_with
   filenames (e.g. "add_one.o") resolve against work_dir where
   compile_external_kernel placed the compiled .o.

4. python/utils/jit.py: the post-migration audit (2974552f) incorrectly
   removed the pre-scan of args/kwargs for ExternalFunction instances.
   Since _instances.add() runs in __init__ (before the JIT call), and
   _instances.clear() runs before function() executes, the post-scan always
   found nothing. Restore the pre-scan so externally-constructed
   ExternalFunction instances are collected before _instances.clear().

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 python/utils/compile/utils.py | 22 +++++++++++++---
 python/utils/jit.py           | 21 ++++++++++++---
 tools/aiecc/aiecc.cpp         | 49 ++++++++++++++++++++++++++++-------
 3 files changed, 74 insertions(+), 18 deletions(-)

diff --git a/python/utils/compile/utils.py b/python/utils/compile/utils.py
index 3bae82747da..fdc4d9b7488 100644
--- a/python/utils/compile/utils.py
+++ b/python/utils/compile/utils.py
@@ -115,10 +115,24 @@ def compile_mlir_module(
         args.append("--verbose")
     if options:
         args.extend(options)
-    try:
-        aiecc.run(mlir_module, args)
-    except Exception as e:
-        raise RuntimeError("[aiecc] Compilation failed") from e
+    # Write the MLIR to a file co-located with the work_dir so that the C++
+    # aiecc binary resolves relative link_with paths (e.g. "add_one.o") against
+    # the same directory where compile_external_kernel placed the object files.
+    # If no work_dir is provided, fall back to a temporary file.
+    if work_dir:
+        mlir_file = os.path.join(work_dir, "aie.mlir")
+        with open(mlir_file, "w") as f:
+            f.write(str(mlir_module))
+        aiecc_bin = aiecc._find_aiecc_binary()
+        result = subprocess.run([aiecc_bin, mlir_file] + args, capture_output=True, text=True)
+        if result.returncode != 0:
+            error_msg = result.stderr if result.stderr else result.stdout
+            raise RuntimeError(f"[aiecc] Compilation failed with exit code {result.returncode}: {error_msg}")
+    else:
+        try:
+            aiecc.run(mlir_module, args)
+        except Exception as e:
+            raise RuntimeError("[aiecc] Compilation failed") from e
 
 
 def compile_external_kernel(func, kernel_dir, target_arch):
diff --git a/python/utils/jit.py b/python/utils/jit.py
index 9a45d37edd3..e9fd1f8afe5 100644
--- a/python/utils/jit.py
+++ b/python/utils/jit.py
@@ -69,6 +69,18 @@ def decorator(*args, **kwargs):
             tensor_args = _filter_tensor_args(args)
             return cached_kernel(*tensor_args, **kwargs)
 
+        # Collect ExternalFunction instances passed directly as arguments.
+        # These are captured before _instances.clear() since __init__ adds to
+        # _instances at construction time (outside the JIT call), so they would
+        # be lost after the clear below.
+        external_kernels = []
+        for arg in args:
+            if isinstance(arg, ExternalFunction):
+                external_kernels.append(arg)
+        for value in kwargs.values():
+            if isinstance(value, ExternalFunction):
+                external_kernels.append(value)
+
         # Clear any instances from previous runs to make sure if the user provided any broken code we don't try to recompile it
         ExternalFunction._instances.clear()
 
@@ -83,10 +95,11 @@ def decorator(*args, **kwargs):
         else:
             mlir_module = function(*args, **kwargs)
 
-        # Collect ExternalFunction instances registered during this JIT compilation
-        external_kernels = [
-            func for func in ExternalFunction._instances if not func._compiled
-        ]
+        # Also collect any ExternalFunction instances created during function()
+        # execution (e.g. inside algorithm helpers that construct them internally).
+        for func in ExternalFunction._instances:
+            if not func._compiled and func not in external_kernels:
+                external_kernels.append(func)
 
         # Determine target architecture based on device type
         current_device = DefaultNPURuntime.device()
diff --git a/tools/aiecc/aiecc.cpp b/tools/aiecc/aiecc.cpp
index 401c70d1809..cd5f009858a 100644
--- a/tools/aiecc/aiecc.cpp
+++ b/tools/aiecc/aiecc.cpp
@@ -1933,7 +1933,29 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
                                       std::to_string(core.row) + ".ld.script");
 
   if (!xbridge) {
-    // Generate linker script to file using the original (unmodified) module
+    // Before generating the linker script, patch the CoreOp's link_files
+    // attribute to use absolute paths so that ld.lld's INPUT() directives
+    // resolve correctly regardless of the linker's working directory.
+    moduleOp->walk([&](xilinx::AIE::CoreOp coreOp) {
+      auto tileOp =
+          dyn_cast<xilinx::AIE::TileOp>(coreOp.getTile().getDefiningOp());
+      if (!tileOp || tileOp.getCol() != core.col || tileOp.getRow() != core.row)
+        return;
+      if (auto filesAttr = coreOp.getLinkFiles()) {
+        SmallVector<mlir::Attribute> absFiles;
+        for (auto f : filesAttr->getAsRange<mlir::StringAttr>()) {
+          StringRef name = f.getValue();
+          SmallString<256> absPath(tmpDirName);
+          sys::path::append(absPath, sys::path::filename(name));
+          absFiles.push_back(
+              mlir::StringAttr::get(moduleOp->getContext(), absPath));
+        }
+        coreOp.setLinkFilesAttr(
+            mlir::ArrayAttr::get(moduleOp->getContext(), absFiles));
+      }
+    });
+
+    // Generate linker script to file using the module with absolute link paths
     std::error_code ec;
     raw_fd_ostream ldScriptFile(ldScriptPath, ec);
     if (ec) {
@@ -2358,13 +2380,18 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
       // parallel cores that share the same .o filename.
       SmallString<256> destLinkWith(tmpDirName);
       sys::path::append(destLinkWith, sys::path::filename(lf));
-      if (failed(
-              atomicCopyFile(srcLinkWith, tmpDirName, sys::path::filename(lf))))
-        return failure();
+      if (srcLinkWith != destLinkWith) {
+        if (failed(atomicCopyFile(srcLinkWith, tmpDirName,
+                                  sys::path::filename(lf))))
+          return failure();
 
-      if (verbose)
-        llvm::outs() << "Copied link_with object: " << srcLinkWith << " -> "
-                     << destLinkWith << "\n";
+        if (verbose)
+          llvm::outs() << "Copied link_with object: " << srcLinkWith << " -> "
+                       << destLinkWith << "\n";
+      } else if (verbose) {
+        llvm::outs() << "link_with object already in place: " << srcLinkWith
+                     << "\n";
+      }
 
       // Note: We don't add the object file to linkCmd because the linker
       // script already has INPUT() directives for each file
@@ -3037,9 +3064,11 @@ compileCoresUnified(MLIRContext &context, ModuleOp moduleOp,
 
         SmallString<256> destLinkWith(tmpDirName);
         sys::path::append(destLinkWith, sys::path::filename(lf));
-        if (failed(atomicCopyFile(srcLinkWith, tmpDirName,
-                                  sys::path::filename(lf))))
-          return failure();
+        if (srcLinkWith != destLinkWith) {
+          if (failed(atomicCopyFile(srcLinkWith, tmpDirName,
+                                    sys::path::filename(lf))))
+            return failure();
+        }
       }
 
       SmallString<128> absLdScriptPath;

From 2d3931f719ca6d4245c98748694a69657a7c986e Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 5 Mar 2026 17:30:43 -0700
Subject: [PATCH 11/28] [format] Apply black formatting to compile/utils.py

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 python/utils/compile/utils.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/python/utils/compile/utils.py b/python/utils/compile/utils.py
index fdc4d9b7488..25267a5b4a6 100644
--- a/python/utils/compile/utils.py
+++ b/python/utils/compile/utils.py
@@ -124,10 +124,14 @@ def compile_mlir_module(
         with open(mlir_file, "w") as f:
             f.write(str(mlir_module))
         aiecc_bin = aiecc._find_aiecc_binary()
-        result = subprocess.run([aiecc_bin, mlir_file] + args, capture_output=True, text=True)
+        result = subprocess.run(
+            [aiecc_bin, mlir_file] + args, capture_output=True, text=True
+        )
         if result.returncode != 0:
             error_msg = result.stderr if result.stderr else result.stdout
-            raise RuntimeError(f"[aiecc] Compilation failed with exit code {result.returncode}: {error_msg}")
+            raise RuntimeError(
+                f"[aiecc] Compilation failed with exit code {result.returncode}: {error_msg}"
+            )
     else:
         try:
             aiecc.run(mlir_module, args)

From 4063532084168fbe5ffbea66931cee28b4ed3b0d Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 5 Mar 2026 17:49:17 -0700
Subject: [PATCH 12/28] [quality] Production-level audit fixes across
 aiecc.cpp, jit.py, utils.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

aiecc.cpp:
- Fix thread-safety: ldscript CoreOp link_files patching was mutating the
  shared moduleOp from parallel core threads. Move the walk+patch onto
  coreModule (the per-thread clone) and pass it to AIETranslateToLdScript.
- Add missing verbose logging in compileCoresUnified Peano copy loop to
  match the logging in compileCores.
- Fix stale comment "above" → "as part of" in getCoreInfo.

python/utils/compile/utils.py:
- Replace the call to the private API aiecc._find_aiecc_binary() with
  shutil.which("aiecc") (the shutil module is already imported). Raise a
  clear error if the binary is not found.
- Improve error message format: include a newline before stderr output
  to separate it visually from the RuntimeError prefix.

python/utils/jit.py:
- Replace O(n) list `in` check for deduplication with an id-based set,
  making intent explicit: deduplication is by object identity, not equality.
- Remove duplicate "Determine target architecture based on device type"
  comment.
- Tighten collection of arg-passed ExternalFunction into a list
  comprehension for clarity.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 python/utils/compile/utils.py | 19 +++++++++++++------
 python/utils/jit.py           | 31 +++++++++++++++----------------
 tools/aiecc/aiecc.cpp         | 28 +++++++++++++++++-----------
 3 files changed, 45 insertions(+), 33 deletions(-)

diff --git a/python/utils/compile/utils.py b/python/utils/compile/utils.py
index 25267a5b4a6..cd481a9b049 100644
--- a/python/utils/compile/utils.py
+++ b/python/utils/compile/utils.py
@@ -115,22 +115,29 @@ def compile_mlir_module(
         args.append("--verbose")
     if options:
         args.extend(options)
-    # Write the MLIR to a file co-located with the work_dir so that the C++
-    # aiecc binary resolves relative link_with paths (e.g. "add_one.o") against
-    # the same directory where compile_external_kernel placed the object files.
-    # If no work_dir is provided, fall back to a temporary file.
+    # Write the MLIR to a file co-located with work_dir so that the C++ aiecc
+    # binary resolves relative link_with paths (e.g. "add_one.o") against the
+    # same directory where compile_external_kernel placed the compiled objects.
+    # If no work_dir is provided, fall back to the aiecc.run() helper which
+    # writes to a temporary file internally.
     if work_dir:
+        aiecc_bin = shutil.which("aiecc")
+        if not aiecc_bin:
+            raise RuntimeError(
+                "Could not find 'aiecc' binary. Ensure mlir-aie is installed "
+                "and its bin directory is in PATH."
+            )
         mlir_file = os.path.join(work_dir, "aie.mlir")
         with open(mlir_file, "w") as f:
             f.write(str(mlir_module))
-        aiecc_bin = aiecc._find_aiecc_binary()
         result = subprocess.run(
             [aiecc_bin, mlir_file] + args, capture_output=True, text=True
         )
         if result.returncode != 0:
             error_msg = result.stderr if result.stderr else result.stdout
             raise RuntimeError(
-                f"[aiecc] Compilation failed with exit code {result.returncode}: {error_msg}"
+                f"[aiecc] Compilation failed with exit code {result.returncode}:\n"
+                f"{error_msg}"
             )
     else:
         try:
diff --git a/python/utils/jit.py b/python/utils/jit.py
index e9fd1f8afe5..70278080d24 100644
--- a/python/utils/jit.py
+++ b/python/utils/jit.py
@@ -69,19 +69,18 @@ def decorator(*args, **kwargs):
             tensor_args = _filter_tensor_args(args)
             return cached_kernel(*tensor_args, **kwargs)
 
-        # Collect ExternalFunction instances passed directly as arguments.
-        # These are captured before _instances.clear() since __init__ adds to
-        # _instances at construction time (outside the JIT call), so they would
-        # be lost after the clear below.
-        external_kernels = []
-        for arg in args:
-            if isinstance(arg, ExternalFunction):
-                external_kernels.append(arg)
-        for value in kwargs.values():
-            if isinstance(value, ExternalFunction):
-                external_kernels.append(value)
-
-        # Clear any instances from previous runs to make sure if the user provided any broken code we don't try to recompile it
+        # Collect ExternalFunction instances passed as direct arguments first.
+        # ExternalFunction.__init__ registers to _instances at construction time
+        # (before this JIT call), so they must be captured before the clear below.
+        # Note: ExternalFunction instances nested inside containers are not
+        # collected here; top-level args cover all known call patterns.
+        external_kernels = [
+            arg for arg in args if isinstance(arg, ExternalFunction)
+        ] + [v for v in kwargs.values() if isinstance(v, ExternalFunction)]
+        seen = set(id(k) for k in external_kernels)
+
+        # Clear stale instances from previous (possibly failed) runs so that a
+        # broken kernel doesn't prevent a corrected one from being recompiled.
         ExternalFunction._instances.clear()
 
         # Execute the function to generate MLIR
@@ -95,13 +94,13 @@ def decorator(*args, **kwargs):
         else:
             mlir_module = function(*args, **kwargs)
 
-        # Also collect any ExternalFunction instances created during function()
+        # Also collect ExternalFunction instances created during function()
         # execution (e.g. inside algorithm helpers that construct them internally).
         for func in ExternalFunction._instances:
-            if not func._compiled and func not in external_kernels:
+            if not func._compiled and id(func) not in seen:
                 external_kernels.append(func)
+                seen.add(id(func))
 
-        # Determine target architecture based on device type
         current_device = DefaultNPURuntime.device()
 
         # Determine target architecture based on device type
diff --git a/tools/aiecc/aiecc.cpp b/tools/aiecc/aiecc.cpp
index cd5f009858a..6957933ec67 100644
--- a/tools/aiecc/aiecc.cpp
+++ b/tools/aiecc/aiecc.cpp
@@ -1106,7 +1106,7 @@ static CoreInfo getCoreInfo(xilinx::AIE::CoreOp coreOp) {
   }
 
   // Prefer canonical link_files ArrayAttr (populated by AIEAssignCoreLinkFiles,
-  // which runs as part of the resource-allocation pipeline above).
+  // which runs as part of the resource-allocation pipeline).
   if (auto filesAttr = coreOp.getLinkFiles()) {
     for (auto f : filesAttr->getAsRange<mlir::StringAttr>())
       info.linkFiles.push_back(f.getValue().str());
@@ -1933,10 +1933,11 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
                                       std::to_string(core.row) + ".ld.script");
 
   if (!xbridge) {
-    // Before generating the linker script, patch the CoreOp's link_files
-    // attribute to use absolute paths so that ld.lld's INPUT() directives
-    // resolve correctly regardless of the linker's working directory.
-    moduleOp->walk([&](xilinx::AIE::CoreOp coreOp) {
+    // Rewrite link_files on this core's clone to use absolute paths so that
+    // ld.lld's INPUT() directives resolve correctly regardless of the linker's
+    // working directory.  We operate on coreModule (the per-thread clone), not
+    // on the shared moduleOp, to avoid data races with parallel core threads.
+    coreModule->walk([&](xilinx::AIE::CoreOp coreOp) {
       auto tileOp =
           dyn_cast<xilinx::AIE::TileOp>(coreOp.getTile().getDefiningOp());
       if (!tileOp || tileOp.getCol() != core.col || tileOp.getRow() != core.row)
@@ -1944,18 +1945,17 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
       if (auto filesAttr = coreOp.getLinkFiles()) {
         SmallVector<mlir::Attribute> absFiles;
         for (auto f : filesAttr->getAsRange<mlir::StringAttr>()) {
-          StringRef name = f.getValue();
           SmallString<256> absPath(tmpDirName);
-          sys::path::append(absPath, sys::path::filename(name));
+          sys::path::append(absPath, sys::path::filename(f.getValue()));
           absFiles.push_back(
-              mlir::StringAttr::get(moduleOp->getContext(), absPath));
+              mlir::StringAttr::get(coreModule->getContext(), absPath));
         }
         coreOp.setLinkFilesAttr(
-            mlir::ArrayAttr::get(moduleOp->getContext(), absFiles));
+            mlir::ArrayAttr::get(coreModule->getContext(), absFiles));
       }
     });
 
-    // Generate linker script to file using the module with absolute link paths
+    // Generate linker script from the patched clone.
     std::error_code ec;
     raw_fd_ostream ldScriptFile(ldScriptPath, ec);
     if (ec) {
@@ -1966,7 +1966,7 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
     }
 
     if (failed(xilinx::AIE::AIETranslateToLdScript(
-            moduleOp, ldScriptFile, core.col, core.row, deviceName))) {
+            *coreModule, ldScriptFile, core.col, core.row, deviceName))) {
       std::lock_guard<std::mutex> lock(outputMutex);
       llvm::errs() << "Error generating linker script\n";
       return failure();
@@ -3068,6 +3068,12 @@ compileCoresUnified(MLIRContext &context, ModuleOp moduleOp,
           if (failed(atomicCopyFile(srcLinkWith, tmpDirName,
                                     sys::path::filename(lf))))
             return failure();
+          if (verbose)
+            llvm::outs() << "Copied link_with object: " << srcLinkWith << " -> "
+                         << destLinkWith << "\n";
+        } else if (verbose) {
+          llvm::outs() << "link_with object already in place: " << srcLinkWith
+                       << "\n";
         }
       }
 

From 3a09dbca32946bab74a5188a0bdd6ed102dd7e96 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Thu, 5 Mar 2026 18:11:25 -0700
Subject: [PATCH 13/28] [fix] Fix two bugs exposed by running tests without
 warm cache
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

aiecc.cpp — ldscript clone was already LLVM-lowered:
  The ldscript patch rewrote link_files on coreModule (the per-thread
  clone), but coreModule had already been destructively lowered to LLVM IR
  by runLLVMLoweringPipeline.
  AIETranslateToLdScript expects AIE dialect input, so it failed for every
  first-time compilation.  Fix: clone moduleOp (the shared pre-lowering
  module) into a fresh ldScriptModule specifically for this purpose.
  This also correctly addresses the thread-safety concern: ldScriptModule
  is a per-call local clone so mutations are never visible to other threads.

python/utils/jit.py — is_placed=True captured empty outer context:
  Program.resolve_program() creates its own mlir_mod_ctx() internally and
  returns its module.  The jit decorator was wrapping function() in an
  outer mlir_mod_ctx() and capturing ctx.module, which was always empty
  because all AIE ops were generated into resolve_program's inner context.
  Fix: always use the function's return value directly; the is_placed=True
  / is_placed=False split is now unnecessary and removed.  Also remove the
  now-unused mlir_mod_ctx import.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 python/utils/jit.py   | 19 ++++++++-----------
 tools/aiecc/aiecc.cpp | 21 ++++++++++++---------
 2 files changed, 20 insertions(+), 20 deletions(-)

diff --git a/python/utils/jit.py b/python/utils/jit.py
index 70278080d24..44be513270a 100644
--- a/python/utils/jit.py
+++ b/python/utils/jit.py
@@ -11,7 +11,6 @@
 import hashlib
 import numpy as np
 
-from aie.extras.context import mlir_mod_ctx
 from .compile import compile_mlir_module, compile_external_kernel
 from .npukernel import NPUKernel
 from aie.dialects.aie import AIEDevice
@@ -83,16 +82,14 @@ def decorator(*args, **kwargs):
         # broken kernel doesn't prevent a corrected one from being recompiled.
         ExternalFunction._instances.clear()
 
-        # Execute the function to generate MLIR
-        if is_placed:
-            with mlir_mod_ctx() as ctx:
-                function(*args, **kwargs)
-                assert (
-                    ctx.module.operation.verify()
-                ), f"Verification failed for '{function.__name__}'"
-                mlir_module = ctx.module
-        else:
-            mlir_module = function(*args, **kwargs)
+        # Execute the function to generate MLIR.
+        # resolve_program() opens its own mlir_mod_ctx() internally and returns
+        # its module.  Capturing ctx.module from an outer context would give an
+        # empty module, so we always use the function's return value.
+        mlir_module = function(*args, **kwargs)
+        assert (
+            mlir_module.operation.verify()
+        ), f"Verification failed for '{function.__name__}'"
 
         # Also collect ExternalFunction instances created during function()
         # execution (e.g. inside algorithm helpers that construct them internally).
diff --git a/tools/aiecc/aiecc.cpp b/tools/aiecc/aiecc.cpp
index 6957933ec67..f830e490b14 100644
--- a/tools/aiecc/aiecc.cpp
+++ b/tools/aiecc/aiecc.cpp
@@ -1933,11 +1933,14 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
                                       std::to_string(core.row) + ".ld.script");
 
   if (!xbridge) {
-    // Rewrite link_files on this core's clone to use absolute paths so that
-    // ld.lld's INPUT() directives resolve correctly regardless of the linker's
-    // working directory.  We operate on coreModule (the per-thread clone), not
-    // on the shared moduleOp, to avoid data races with parallel core threads.
-    coreModule->walk([&](xilinx::AIE::CoreOp coreOp) {
+    // Clone the pre-lowering module for ldscript generation.  We need a
+    // separate clone here because coreModule will be destructively lowered to
+    // LLVM IR by runLLVMLoweringPipeline below, making it unsuitable for
+    // AIETranslateToLdScript.  We also cannot mutate the shared moduleOp
+    // (data race with parallel core threads), so this per-thread clone is the
+    // correct place to rewrite link_files to absolute paths.
+    OwningOpRef<ModuleOp> ldScriptModule = moduleOp.clone();
+    ldScriptModule->walk([&](xilinx::AIE::CoreOp coreOp) {
       auto tileOp =
           dyn_cast<xilinx::AIE::TileOp>(coreOp.getTile().getDefiningOp());
       if (!tileOp || tileOp.getCol() != core.col || tileOp.getRow() != core.row)
@@ -1948,14 +1951,14 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
           SmallString<256> absPath(tmpDirName);
           sys::path::append(absPath, sys::path::filename(f.getValue()));
           absFiles.push_back(
-              mlir::StringAttr::get(coreModule->getContext(), absPath));
+              mlir::StringAttr::get(ldScriptModule->getContext(), absPath));
         }
         coreOp.setLinkFilesAttr(
-            mlir::ArrayAttr::get(coreModule->getContext(), absFiles));
+            mlir::ArrayAttr::get(ldScriptModule->getContext(), absFiles));
       }
     });
 
-    // Generate linker script from the patched clone.
+    // Generate linker script from the pre-lowering clone with absolute paths.
     std::error_code ec;
     raw_fd_ostream ldScriptFile(ldScriptPath, ec);
     if (ec) {
@@ -1966,7 +1969,7 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
     }
 
     if (failed(xilinx::AIE::AIETranslateToLdScript(
-            *coreModule, ldScriptFile, core.col, core.row, deviceName))) {
+            *ldScriptModule, ldScriptFile, core.col, core.row, deviceName))) {
       std::lock_guard<std::mutex> lock(outputMutex);
       llvm::errs() << "Error generating linker script\n";
       return failure();

From 9f6a19d58cf03d708c2f34df439ba3fde5407bf5 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 6 Mar 2026 10:21:34 -0700
Subject: [PATCH 14/28] [fix] Fix add_one_scale_func_link_with_chess output
 buffer mismatch

The runtime_sequence had two parameters (%in, %out), mapping %out to
XRT group_id(4) = bo_inB. But test.cpp reads results from bo_out at
group_id(5), which was never written, causing all-zero output.

Add the standard dummy middle buffer (%buf : memref<32xi32>) to shift
%out to group_id(5) = bo_out, matching test.cpp's three-buffer layout.
Also remove the unused %c8 constant from the sequence body.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 test/npu-xrt/add_one_scale_func_link_with_chess/aie.mlir | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/npu-xrt/add_one_scale_func_link_with_chess/aie.mlir b/test/npu-xrt/add_one_scale_func_link_with_chess/aie.mlir
index dda65af4e99..05e2107a73f 100644
--- a/test/npu-xrt/add_one_scale_func_link_with_chess/aie.mlir
+++ b/test/npu-xrt/add_one_scale_func_link_with_chess/aie.mlir
@@ -54,10 +54,9 @@ module {
       aie.end
     }
 
-    aie.runtime_sequence(%in : memref<64xi32>, %out : memref<64xi32>) {
+    aie.runtime_sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) {
       %c0  = arith.constant 0 : i64
       %c1  = arith.constant 1 : i64
-      %c8  = arith.constant 8 : i64
       %c64 = arith.constant 64 : i64
       aiex.npu.dma_memcpy_nd(%out[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0,%c1]) {metadata = @of_out, id = 1 : i64} : memref<64xi32>
       aiex.npu.dma_memcpy_nd(%in[%c0,%c0,%c0,%c0][%c1,%c1,%c1,%c64][%c0,%c0,%c0,%c1])  {metadata = @of_in,  id = 0 : i64, issue_token = true} : memref<64xi32>

From c9f4feb313a4aa055c3a9fa6d8851bd41b20a6e3 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 6 Mar 2026 11:49:12 -0700
Subject: [PATCH 15/28] [fix] Restore is_placed branch in jit.py for placed
 designs

Commit 3a09dbca32 ("Fix two bugs exposed by running tests without warm
cache") correctly observed that non-placed designs (using
Program.resolve_program()) already open their own mlir_mod_ctx()
internally and return the module, so wrapping them in an outer context
gives an empty module.

However, the fix over-reached: it also removed the outer mlir_mod_ctx()
for placed designs, which use the raw @device(...) DSL.  Placed designs
populate the context as a side effect and return nothing, so they still
require an outer context to be active when their decorator arguments are
evaluated.  Without it, iron.get_current_device().resolve() fails with:

  RuntimeError: An MLIR function requires a Context but none was provided

Fix: restore the is_placed branch so placed designs get an outer
mlir_mod_ctx() (capturing ctx.module), while non-placed designs
continue to use the function's return value directly.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 python/utils/jit.py | 26 +++++++++++++++++++-------
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/python/utils/jit.py b/python/utils/jit.py
index 44be513270a..9e6990bf669 100644
--- a/python/utils/jit.py
+++ b/python/utils/jit.py
@@ -11,6 +11,7 @@
 import hashlib
 import numpy as np
 
+from aie.extras.context import mlir_mod_ctx
 from .compile import compile_mlir_module, compile_external_kernel
 from .npukernel import NPUKernel
 from aie.dialects.aie import AIEDevice
@@ -83,13 +84,24 @@ def decorator(*args, **kwargs):
         ExternalFunction._instances.clear()
 
         # Execute the function to generate MLIR.
-        # resolve_program() opens its own mlir_mod_ctx() internally and returns
-        # its module.  Capturing ctx.module from an outer context would give an
-        # empty module, so we always use the function's return value.
-        mlir_module = function(*args, **kwargs)
-        assert (
-            mlir_module.operation.verify()
-        ), f"Verification failed for '{function.__name__}'"
+        # Placed designs use the raw @device(...) DSL and populate the context
+        # as a side effect (returning nothing), so we must provide an outer
+        # mlir_mod_ctx() and capture ctx.module.
+        # Non-placed designs use Program.resolve_program(), which opens its own
+        # mlir_mod_ctx() internally and returns the module directly; wrapping
+        # them in an outer context would give an empty module.
+        if is_placed:
+            with mlir_mod_ctx() as ctx:
+                function(*args, **kwargs)
+                assert (
+                    ctx.module.operation.verify()
+                ), f"Verification failed for '{function.__name__}'"
+                mlir_module = ctx.module
+        else:
+            mlir_module = function(*args, **kwargs)
+            assert (
+                mlir_module.operation.verify()
+            ), f"Verification failed for '{function.__name__}'"
 
         # Also collect ExternalFunction instances created during function()
         # execution (e.g. inside algorithm helpers that construct them internally).

From 19066a165f467dc8a2ef4e92c54abc6d5473f1db Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Fri, 6 Mar 2026 11:58:14 -0700
Subject: [PATCH 16/28] [fix] Fix Windows build: use file_t-free
 createUniqueFile in atomicCopyFile

On Windows, llvm::sys::fs::file_t is HANDLE (not int), so calling
closeFile(int tmpFD) fails to compile with MSVC:

  error C2664: cannot convert argument 1 from 'int' to 'file_t &'

The open+close sequence was unnecessary: we only needed a unique path,
not an open file descriptor, since copy_file immediately overwrites the
temp file. Switch to the two-argument createUniqueFile(model, path)
overload which reserves a unique filename without opening it, and drop
the closeFile call entirely. This is correct on all platforms.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 tools/aiecc/aiecc.cpp | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/tools/aiecc/aiecc.cpp b/tools/aiecc/aiecc.cpp
index f830e490b14..d0f70f604c6 100644
--- a/tools/aiecc/aiecc.cpp
+++ b/tools/aiecc/aiecc.cpp
@@ -1812,12 +1812,10 @@ static LogicalResult atomicCopyFile(StringRef src, StringRef destDir,
   tmpFilename += sys::path::extension(destBasename);
   sys::path::append(tmpModel, tmpFilename);
   SmallString<256> tmpPath;
-  int tmpFD;
-  if (sys::fs::createUniqueFile(tmpModel, tmpFD, tmpPath)) {
+  if (sys::fs::createUniqueFile(tmpModel, tmpPath)) {
     llvm::errs() << "Error: could not create temp file in " << destDir << "\n";
     return failure();
   }
-  sys::fs::closeFile(tmpFD);
 
   if (std::error_code ec = sys::fs::copy_file(src, tmpPath)) {
     llvm::errs() << "Error: could not copy " << src << " to " << tmpPath << ": "

From df9a92f186c5d2e31236d5a0ef01c1cfa87c5c76 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Tue, 10 Mar 2026 15:55:23 -0600
Subject: [PATCH 17/28] Update tests and examples to use func-level link_with

---
 mlir_exercises/tutorial-8/aie.mlir            |   8 +-
 mlir_exercises/tutorial-8/answers/aie.mlir    |   8 +-
 mlir_exercises/tutorial-9/README.md           |  10 +-
 mlir_exercises/tutorial-9/aie.mlir            |   4 +-
 .../tutorial-9/answers/aie_matmul.mlir        |   4 +-
 .../basic/event_trace/aie_trace.mlir          |   4 +-
 .../basic/event_trace/aie_trace.py            |   3 +-
 .../matrix_multiplication/cascade/cascade.py  |  11 +-
 .../cascade/cascade_placed.py                 |  11 +-
 .../matrix_vector/matrix_vector.py            |   9 +-
 .../matrix_vector/matrix_vector_placed.py     |   9 +-
 .../single_core/single_core.py                |  10 +-
 .../single_core/single_core_placed.py         |  10 +-
 .../whole_array/whole_array.py                |  10 +-
 .../whole_array/whole_array_placed.py         |  10 +-
 .../basic/packet_switch/aie_add_placed.py     |   8 +-
 .../basic/packet_switch/aie_mul_placed.py     |   8 +-
 .../passthrough_kernel_placed.py              |   6 +-
 .../row_wise_bias_add_placed.py               |   6 +-
 .../vector_reduce_add_placed.py               |   6 +-
 .../row_wise_vector_reduce_max_placed.py      |  12 +-
 .../vector_reduce_max_chained_placed.py       |  12 +-
 .../vector_reduce_max_memtile_placed.py       |  14 +-
 .../vector_reduce_max_shared_placed.py        |  12 +-
 .../vector_reduce_max_placed.py               |  10 +-
 .../vector_reduce_min_placed.py               |   6 +-
 .../vector_scalar_mul_placed.py               |   3 +-
 .../bfp_conversion/bfp_conversion_placed.py   |   9 +-
 .../vector_passthrough_placed.py              |   6 +-
 .../ml/bottleneck/bottleneck_placed.py        |  11 +-
 .../ml/conv2d/conv2d_placed.py                |   3 +-
 .../conv2d_14x14/conv2dk14_32core_placed.py   |   3 +-
 .../ml/conv2d_14x14/conv2dk14_placed.py       |   3 +-
 .../conv2d_fused_relu_placed.py               |   3 +-
 .../ml/magika/group0_placed.py                |   6 +-
 .../ml/magika/group1_placed.py                |  18 +-
 .../ml/magika/group2_placed.py                |   3 +-
 .../ml/resnet/layers_conv2_x/aie.mlir         |  34 +-
 .../ml/resnet/layers_conv2_x/resnet_placed.py |  13 +-
 .../MM_2x2/circuit_switched_version/aie.mlir  |  10 +-
 .../aie.mlir                                  |  10 +-
 .../MM_2x2/packet_switched_version/aie.mlir   |  10 +-
 .../aie.mlir                                  |   8 +-
 .../aie_fp32.mlir                             |   8 +-
 .../HDIFF_single_AIE_objectFIFO/aie.mlir      |   4 +-
 .../HDIFF_single_AIE_objectFIFO/aie_fp32.mlir |   4 +-
 .../aie.mlir                                  |   4 +-
 .../aie_fp32.mlir                             |   4 +-
 .../aie.mlir                                  |  66 +-
 .../aie.mlir                                  |  12 +-
 .../aie_fp32.mlir                             |  12 +-
 .../aie.mlir                                  |  30 +-
 .../aie_1.mlir                                |  30 +-
 .../aie_16.mlir                               | 390 ++++-----
 .../aie_2.mlir                                |  54 +-
 .../aie_3.mlir                                |  78 +-
 .../aie_32.mlir                               | 774 +++++++++---------
 .../aie_4.mlir                                | 102 +--
 .../aie_8.mlir                                | 198 ++---
 programming_examples/mlir/idct/aie.mlir       |  12 +-
 .../aie.mlir                                  |  12 +-
 .../color_detect/color_detect_placed.py       |  11 +-
 .../color_threshold/color_threshold_placed.py |   9 +-
 .../vision/edge_detect/edge_detect_placed.py  |  12 +-
 .../aie2_lineBased_8b_1080.mlir               |   4 +-
 .../aie2_lineBased_8b_8k.mlir                 |   4 +-
 .../aie2_lineBased_8b_tiny.mlir               |   4 +-
 .../vision_passthrough_placed.py              |   6 +-
 .../section-4/section-4b/aie2_placed.py       |   3 +-
 test/Integration/julia_by_lines/aie.mlir      |   6 +-
 test/npu-xrt/cascade_flows/aie.mlir           |  16 +-
 .../aie_bufferx4.mlir                         |  14 +-
 .../aie_cascadex4.mlir                        |  20 +-
 .../aie_plainx1.mlir                          |  10 +-
 .../aie_plainx4.mlir                          |  16 +-
 test/npu-xrt/runtime_cumsum/aie.mlir          |   6 +-
 test/npu-xrt/tile_mapped_read/aie.mlir        |   4 +-
 test/npu-xrt/two_col/aie.mlir                 |  10 +-
 test/npu-xrt/vec_mul_event_trace/aie.mlir     |   4 +-
 test/npu-xrt/vector_scalar_using_dma/aie.mlir |   4 +-
 test/parse-trace/test1/aie_test1.mlir         |   4 +-
 test/parse-trace/test2/aie_test2.mlir         |   4 +-
 test/unit_tests/aie/12_julia/aie.mlir         |   4 +-
 test/unit_tests/aie/13_julia_fp/aie.mlir      |   4 +-
 .../01_precompiled_core_function/aie.mlir     |   4 +-
 .../aie2/03_cascade_core_functions/aie.mlir   |   8 +-
 .../aie2/05_shim_dma_core_function/aie.mlir   |   4 +-
 .../aie.mlir                                  |   4 +-
 .../01_precompiled_core_function/aie.mlir     |   4 +-
 .../03_cascade_core_functions/aie.mlir        |  12 +-
 .../05_shim_dma_core_function/aie.mlir        |   4 +-
 .../aie.mlir                                  |   4 +-
 .../01_precompiled_core_function/aie.mlir     |   4 +-
 .../03_cascade_core_functions/aie.mlir        |   8 +-
 .../05_shim_dma_core_function/aie.mlir        |   4 +-
 .../aie.mlir                                  |   4 +-
 96 files changed, 1261 insertions(+), 1150 deletions(-)

diff --git a/mlir_exercises/tutorial-8/aie.mlir b/mlir_exercises/tutorial-8/aie.mlir
index 9bd2b304635..be23b4da763 100644
--- a/mlir_exercises/tutorial-8/aie.mlir
+++ b/mlir_exercises/tutorial-8/aie.mlir
@@ -29,8 +29,8 @@ module @tutorial_8 {
 
     // declare 2 kernel functions name "extern_kernel1" and "extern_kernel2"
     // with one positional function argument, in this case mapped to a memref
-    func.func private @extern_kernel1() -> ()
-    func.func private @extern_kernel2(%b: memref<256xi32>) -> ()
+    func.func private @extern_kernel1() -> () attributes {link_with = "kernel1.o"}
+    func.func private @extern_kernel2(%b: memref<256xi32>) -> () attributes {link_with = "kernel2.o"}
 
     // Declare shared lock (belonging to tile(2,4), lock ID=1)
     // %lock13_1 = aie.lock(%tile13, 1) { sym_name = "lock_13_1" }
@@ -49,7 +49,7 @@ module @tutorial_8 {
 
         // aie.use_lock(%lock13_1, "Release", 1)
         aie.end
-    } { link_with="kernel1.o" }
+    }
 
     // Define core algorithm for tile(2,4) which reads value set by tile(1,4)
     // buf[5] = buf[3] + 100
@@ -74,6 +74,6 @@ module @tutorial_8 {
         // This release means our 2nd core is done
         aie.use_lock(%lock13_2, "Release", 1)
         aie.end
-    } { link_with="kernel2.o" }
+    }
 
 }
diff --git a/mlir_exercises/tutorial-8/answers/aie.mlir b/mlir_exercises/tutorial-8/answers/aie.mlir
index 00e484b2182..4ea44a0446c 100755
--- a/mlir_exercises/tutorial-8/answers/aie.mlir
+++ b/mlir_exercises/tutorial-8/answers/aie.mlir
@@ -30,8 +30,8 @@ module @tutorial_8 {
 
     // declare 2 kernel functions name "extern_kernel1" and "extern_kernel2"
     // with one positional function argument, in this case mapped to a memref
-    func.func private @extern_kernel1() -> ()
-    func.func private @extern_kernel2(%b: memref<256xi32>) -> ()
+    func.func private @extern_kernel1() -> () attributes {link_with = "kernel1.o"}
+    func.func private @extern_kernel2(%b: memref<256xi32>) -> () attributes {link_with = "kernel2.o"}
 
     // Declare shared lock (belonging to tile(2,4), lock ID=1), do not change symbolic name to allow reuse of test.cpp
 
@@ -52,7 +52,7 @@ module @tutorial_8 {
 
         // aie.use_lock(%lock23_1, "Release", 1)
         aie.end
-    } { link_with="kernel2.o" }
+    }
 
     // Define core algorithm for tile(2,4) which reads value set by tile(1,4)
     // buf[5] = buf[3] + 100
@@ -73,6 +73,6 @@ module @tutorial_8 {
 
         // aie.use_lock(%lock24_1, "Release", 0)
         aie.end
-    } { link_with="kernel1.o" }
+    }
 
 }
diff --git a/mlir_exercises/tutorial-9/README.md b/mlir_exercises/tutorial-9/README.md
index aa63c1da39d..394b76bc9ac 100755
--- a/mlir_exercises/tutorial-9/README.md
+++ b/mlir_exercises/tutorial-9/README.md
@@ -15,14 +15,14 @@ MLIR gives us the ability to leverage different dialects such as [arith](https:/
 
 Specifically, to support external functions, we use the operators `func.func` and `func.call` as follows:
 ```
-func.func private @extern_kernel(%b: memref<256xi32>) -> ()
+func.func private @extern_kernel(%b: memref<256xi32>) -> () attributes {link_with = "kernel.o"}
 
 %core14 = AIE.core(%tile14) {
     func.call @extern_kernel(%buf) : (memref<256xi32>) -> ()
     AIE.end
-} { link_with="kernel.o"}
+}
 ```
-In this MLIR code snippet, we see that we first call `func.func` to declare a private function whose function signature matches that of the AIE C/C++ function. The function name after the @ (e.g. `@external_kernel`) should match the C function name and the number of arguments should match the number of C function arguments.  C++ name mangling is not supported.  Argument types are converted according to the MLIR ['bare pointer' calling convention](https://mlir.llvm.org/docs/TargetLLVMIR/#bare-pointer-calling-convention-for-ranked-memref) (see below). 
+In this MLIR code snippet, we see that we first call `func.func` to declare a private function whose function signature matches that of the AIE C/C++ function. The function name after the @ (e.g. `@extern_kernel`) should match the C function name and the number of arguments should match the number of C function arguments.  C++ name mangling is not supported.  Argument types are converted according to the MLIR ['bare pointer' calling convention](https://mlir.llvm.org/docs/TargetLLVMIR/#bare-pointer-calling-convention-for-ranked-memref) (see below).
 
 | MLIR type   | C type      |
 | ----------- | ----------- |
@@ -31,9 +31,9 @@ In this MLIR code snippet, we see that we first call `func.func` to declare a pr
 | Memref      | C pointer   |
 | index       | int64_t     |
 
-Then, within the `AIE.core` operator, we use `func.call` to call the previously defined function from within our core, being sure to pass the appropriate function arguments. In this case, we pass in the the `AIE.buffer` `%buf`. 
+Then, within the `AIE.core` operator, we use `func.call` to call the previously defined function from within our core, being sure to pass the appropriate function arguments. In this case, we pass in the `AIE.buffer` `%buf`.
 
-The final step is to tell our tools where to look for the object code that the function whose name we defined in `func.func`/ `func.call`. Using the additional operator definition `link_with="kernel.o"`, we point to the file `kernel.o` in the current directory and link it in to create the final kernel object file.
+The final step is to tell our tools where to find the object code for the function whose name we declared with `func.func` and invoked with `func.call`. Using the `link_with` attribute on the `func.func` declaration (e.g. `attributes {link_with = "kernel.o"}`), we point to the file `kernel.o` in the current directory and link it in to create the final kernel object file.
 > Note that this allows us to call the function multiple times within the `AIE.core` or even separate functions in the same `AIE.core` if they are both defined within the single linked object file.
 
 ## <ins>Kernel object file generation</ins>
diff --git a/mlir_exercises/tutorial-9/aie.mlir b/mlir_exercises/tutorial-9/aie.mlir
index 4544eb57168..31c02ce4a64 100644
--- a/mlir_exercises/tutorial-9/aie.mlir
+++ b/mlir_exercises/tutorial-9/aie.mlir
@@ -35,7 +35,7 @@ module @tutorial_9 {
 
     // declare kernel function name "extern_kernel" with one positional
     // function argument, in this case mapped to a memref
-    func.func private @extern_kernel(%b: memref<256xi32>) -> ()
+    func.func private @extern_kernel(%b: memref<256xi32>) -> () attributes {link_with = "kernel.o"}
 
     // Define the algorithm for the core of tile(1, 4)
     // buf[3] = 14
@@ -52,6 +52,6 @@ module @tutorial_9 {
         // by acquiring this lock (with value 1).
         aie.use_lock(%lock14_0, "Release", 1)
         aie.end
-    } { link_with="kernel.o" } // indicate kernel object name used by this core
+    } // kernel object for this core is named by link_with on the func.func declaration above
 
 }
diff --git a/mlir_exercises/tutorial-9/answers/aie_matmul.mlir b/mlir_exercises/tutorial-9/answers/aie_matmul.mlir
index 6ab5b0aa15d..d4c898a0eba 100644
--- a/mlir_exercises/tutorial-9/answers/aie_matmul.mlir
+++ b/mlir_exercises/tutorial-9/answers/aie_matmul.mlir
@@ -38,7 +38,7 @@ module @tutorial_9 {
 
     // declare kernel function name "extern_kernel" with one positional
     // function argument, in this case mapped to a memref
-    func.func private @extern_kernel(%a: memref<32xi32>, %b: memref<32xi32>, %acc: memref<32xi32>, %c: memref<32xi32>) -> ()
+    func.func private @extern_kernel(%a: memref<32xi32>, %b: memref<32xi32>, %acc: memref<32xi32>, %c: memref<32xi32>) -> () attributes {link_with = "kernel_matmul.o"}
 
     // Define the algorithm for the core of tile(1, 4)
     // buf[3] = 14
@@ -55,6 +55,6 @@ module @tutorial_9 {
         // by acquiring this lock (with value 1).
         aie.use_lock(%lock14_0, "Release", 1)
         aie.end
-    } { link_with="kernel_matmul.o" } // indicate kernel object name used by this core
+    } // kernel object file is specified by link_with on the @extern_kernel func.func declaration
 
 }
diff --git a/programming_examples/basic/event_trace/aie_trace.mlir b/programming_examples/basic/event_trace/aie_trace.mlir
index a69d7b567dc..5b1575fc93e 100644
--- a/programming_examples/basic/event_trace/aie_trace.mlir
+++ b/programming_examples/basic/event_trace/aie_trace.mlir
@@ -20,7 +20,7 @@
 module {
   aie.device(npu1_1col) {
     // External kernel function declaration
-    func.func private @vector_scalar_mul_aie_scalar(memref<1024xi32>, memref<1024xi32>, memref<1xi32>, i32)
+    func.func private @vector_scalar_mul_aie_scalar(memref<1024xi32>, memref<1024xi32>, memref<1xi32>, i32) attributes {link_with = "scale.o"}
 
     // Tile declarations
     %shim_noc_tile_0_0 = aie.tile(0, 0)
@@ -55,7 +55,7 @@ module {
         aie.objectfifo.release @infactor(Consume, 1)
       }
       aie.end
-    } {link_with = "scale.o"}
+    }
 
     // ========================================================================
     // TRACE CONFIGURATION
diff --git a/programming_examples/basic/event_trace/aie_trace.py b/programming_examples/basic/event_trace/aie_trace.py
index 652412fa98c..3b4bc6f7b9a 100644
--- a/programming_examples/basic/event_trace/aie_trace.py
+++ b/programming_examples/basic/event_trace/aie_trace.py
@@ -44,6 +44,7 @@ def device_body():
         scale = external_func(
             "vector_scalar_mul_aie_scalar",
             inputs=[tile_ty, tile_ty, scalar_ty, np.int32],
+            link_with="scale.o",
         )
 
         # Tile declarations
@@ -56,7 +57,7 @@ def device_body():
         of_out = object_fifo("out", tile_0_2, shim_noc_tile_0_0, 2, tile_ty)
 
         # Core computation
-        @core(tile_0_2, "scale.o")
+        @core(tile_0_2)
         def core_body():
             for _ in range_(sys.maxsize):
                 elem_factor = of_factor.acquire(ObjectFifoPort.Consume, 1)
diff --git a/programming_examples/basic/matrix_multiplication/cascade/cascade.py b/programming_examples/basic/matrix_multiplication/cascade/cascade.py
index 0d86477e8ae..bad349d14f1 100644
--- a/programming_examples/basic/matrix_multiplication/cascade/cascade.py
+++ b/programming_examples/basic/matrix_multiplication/cascade/cascade.py
@@ -136,18 +136,25 @@ def device_body():
         C_l1_ty = np.ndarray[(m, n), np.dtype[dtype_out]]
 
         # AIE Core Function declarations
-        zero_scalar = external_func(f"zero_scalar_{dtype_out_str}", inputs=[C_l1_ty])
+        zero_scalar = external_func(
+            f"zero_scalar_{dtype_out_str}",
+            inputs=[C_l1_ty],
+            link_with=f"mm_{m}x{k}x{n}.o",
+        )
         matmul_scalar_cascade_get_only = external_func(
             f"matmul_scalar_cascade_get_only_{dtype_in_str}_{dtype_out_str}",
             inputs=[A_l1_ty, B_l1_ty, C_l1_ty],
+            link_with=f"mm_{m}x{k}x{n}.o",
         )
         matmul_scalar_cascade_put_only = external_func(
             f"matmul_scalar_cascade_put_only_{dtype_in_str}_{dtype_out_str}",
             inputs=[A_l1_ty, B_l1_ty, C_l1_ty],
+            link_with=f"mm_{m}x{k}x{n}.o",
         )
         matmul_scalar_cascade_put_get = external_func(
             f"matmul_scalar_cascade_put_get_{dtype_in_str}_{dtype_out_str}",
             inputs=[A_l1_ty, B_l1_ty, C_l1_ty],
+            link_with=f"mm_{m}x{k}x{n}.o",
         )
 
         # Tile declarations as tile[row][col]
@@ -278,7 +285,7 @@ def device_body():
         for row in range(n_aie_rows):
             for col in range(n_aie_cols):
 
-                @core(core_tiles[row][col], f"mm_{m}x{k}x{n}.o")
+                @core(core_tiles[row][col])
                 def core_body():
                     for _ in range_(0xFFFFFFFF):
                         loop = (
diff --git a/programming_examples/basic/matrix_multiplication/cascade/cascade_placed.py b/programming_examples/basic/matrix_multiplication/cascade/cascade_placed.py
index e36d7b169b9..361d3920b0a 100644
--- a/programming_examples/basic/matrix_multiplication/cascade/cascade_placed.py
+++ b/programming_examples/basic/matrix_multiplication/cascade/cascade_placed.py
@@ -163,18 +163,25 @@ def device_body():
         C_l1_ty = np.ndarray[(m, n), np.dtype[dtype_out]]
 
         # AIE Core Function declarations
-        zero_scalar = external_func(f"zero_scalar_{dtype_out_str}", inputs=[C_l1_ty])
+        zero_scalar = external_func(
+            f"zero_scalar_{dtype_out_str}",
+            inputs=[C_l1_ty],
+            link_with=f"mm_{m}x{k}x{n}.o",
+        )
         matmul_scalar_cascade_get_only = external_func(
             f"matmul_scalar_cascade_get_only_{dtype_in_str}_{dtype_out_str}",
             inputs=[A_l1_ty, B_l1_ty, C_l1_ty],
+            link_with=f"mm_{m}x{k}x{n}.o",
         )
         matmul_scalar_cascade_put_only = external_func(
             f"matmul_scalar_cascade_put_only_{dtype_in_str}_{dtype_out_str}",
             inputs=[A_l1_ty, B_l1_ty, C_l1_ty],
+            link_with=f"mm_{m}x{k}x{n}.o",
         )
         matmul_scalar_cascade_put_get = external_func(
             f"matmul_scalar_cascade_put_get_{dtype_in_str}_{dtype_out_str}",
             inputs=[A_l1_ty, B_l1_ty, C_l1_ty],
+            link_with=f"mm_{m}x{k}x{n}.o",
         )
 
         # Tile declarations as tile[row][col]
@@ -305,7 +312,7 @@ def device_body():
         for row in range(n_aie_rows):
             for col in range(n_aie_cols):
 
-                @core(core_tiles[row][col], f"mm_{m}x{k}x{n}.o")
+                @core(core_tiles[row][col])
                 def core_body():
                     for _ in range_(0xFFFFFFFF):
                         loop = (
diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/matrix_vector.py b/programming_examples/basic/matrix_multiplication/matrix_vector/matrix_vector.py
index 5d8f4463b3c..138d729736a 100644
--- a/programming_examples/basic/matrix_multiplication/matrix_vector/matrix_vector.py
+++ b/programming_examples/basic/matrix_multiplication/matrix_vector/matrix_vector.py
@@ -57,10 +57,15 @@ def device_body():
 
             # AIE Core Function declarations
             func_type = "vectorized" if vectorized else "scalar"
-            zero = external_func(f"zero_{func_type}_{dtype_out_str}", inputs=[outC_ty])
+            zero = external_func(
+                f"zero_{func_type}_{dtype_out_str}",
+                inputs=[outC_ty],
+                link_with=f"mv_{m}x{k}.o",
+            )
             matvec = external_func(
                 f"matvec_{func_type}_{dtype_in_str}_{dtype_out_str}",
                 inputs=[A_ty, inB_ty, outC_ty],
+                link_with=f"mv_{m}x{k}.o",
             )
 
             # Tile declarations
@@ -122,7 +127,7 @@ def device_body():
             # Set up compute tiles
             for i in range(n_cores):
                 # Compute tile i
-                @core(cores[i], f"mv_{m}x{k}.o")
+                @core(cores[i])
                 def core_body():
                     for _ in range_(0xFFFFFFFF):
                         elem_out = outC_fifos[i].acquire(
diff --git a/programming_examples/basic/matrix_multiplication/matrix_vector/matrix_vector_placed.py b/programming_examples/basic/matrix_multiplication/matrix_vector/matrix_vector_placed.py
index 3c7506dc897..0ae9a97216c 100644
--- a/programming_examples/basic/matrix_multiplication/matrix_vector/matrix_vector_placed.py
+++ b/programming_examples/basic/matrix_multiplication/matrix_vector/matrix_vector_placed.py
@@ -57,10 +57,15 @@ def device_body():
 
             # AIE Core Function declarations
             func_type = "vectorized" if vectorized else "scalar"
-            zero = external_func(f"zero_{func_type}_{dtype_out_str}", inputs=[outC_ty])
+            zero = external_func(
+                f"zero_{func_type}_{dtype_out_str}",
+                inputs=[outC_ty],
+                link_with=f"mv_{m}x{k}.o",
+            )
             matvec = external_func(
                 f"matvec_{func_type}_{dtype_in_str}_{dtype_out_str}",
                 inputs=[A_ty, inB_ty, outC_ty],
+                link_with=f"mv_{m}x{k}.o",
             )
 
             # Tile declarations
@@ -122,7 +127,7 @@ def device_body():
             # Set up compute tiles
             for i in range(n_cores):
                 # Compute tile i
-                @core(cores[i], f"mv_{m}x{k}.o")
+                @core(cores[i])
                 def core_body():
                     for _ in range_(0xFFFFFFFF):
                         elem_out = outC_fifos[i].acquire(
diff --git a/programming_examples/basic/matrix_multiplication/single_core/single_core.py b/programming_examples/basic/matrix_multiplication/single_core/single_core.py
index 0a9ccbbc2da..375a9f7b9fe 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/single_core.py
+++ b/programming_examples/basic/matrix_multiplication/single_core/single_core.py
@@ -15,7 +15,6 @@
 from aie.iron.controlflow import range_
 from aie.iron.dtype import str_to_dtype
 
-
 microkernel_mac_dim_map = {
     "npu": {
         "bf16": (4, 8, 4),
@@ -146,11 +145,16 @@ def device_body():
 
             # AIE Core Function declarations
             func_type = "" if vectorized else "scalar_"
-            zero = external_func(f"zero_{func_type}{dtype_out_str}", inputs=[c_ty])
+            zero = external_func(
+                f"zero_{func_type}{dtype_out_str}",
+                inputs=[c_ty],
+                link_with=f"mm_{m}x{k}x{n}.o",
+            )
             matmul_func_name = f"matmul_{func_type}{dtype_in_str}_{dtype_out_str}"
             matmul = external_func(
                 matmul_func_name,
                 inputs=[a_ty, b_ty, c_ty],
+                link_with=f"mm_{m}x{k}x{n}.o",
             )
 
             # Tile declarations
@@ -244,7 +248,7 @@ def device_body():
             # Exceding the stack size leads to wrong results from the kernel, but no error is triggered.
             # Stack usage can be checked as explained here:
             # https://github.com/Xilinx/llvm-aie/issues/487#issuecomment-2969438585
-            @core(compute_tile2, f"mm_{m}x{k}x{n}.o", stack_size=0xD00)
+            @core(compute_tile2, stack_size=0xD00)
             def core_body():
                 for _ in range_(0xFFFFFFFF):
                     for _ in range_(tiles) if tiles > 1 else range(1):  # issue #1547
diff --git a/programming_examples/basic/matrix_multiplication/single_core/single_core_placed.py b/programming_examples/basic/matrix_multiplication/single_core/single_core_placed.py
index 373a2996238..857bc02fd08 100644
--- a/programming_examples/basic/matrix_multiplication/single_core/single_core_placed.py
+++ b/programming_examples/basic/matrix_multiplication/single_core/single_core_placed.py
@@ -19,7 +19,6 @@
 from aie.iron.controlflow import range_
 from aie.iron.dtype import str_to_dtype
 
-
 microkernel_mac_dim_map = {
     "npu": {
         "bf16": (4, 8, 4),
@@ -175,11 +174,16 @@ def device_body():
 
         # AIE Core Function declarations
         func_type = "" if vectorized else "scalar_"
-        zero = external_func(f"zero_{func_type}{dtype_out_str}", inputs=[c_ty])
+        zero = external_func(
+            f"zero_{func_type}{dtype_out_str}",
+            inputs=[c_ty],
+            link_with=f"mm_{m}x{k}x{n}.o",
+        )
         matmul_func_name = f"matmul_{func_type}{dtype_in_str}_{dtype_out_str}"
         matmul = external_func(
             matmul_func_name,
             inputs=[a_ty, b_ty, c_ty],
+            link_with=f"mm_{m}x{k}x{n}.o",
         )
 
         # Tile declarations
@@ -269,7 +273,7 @@ def device_body():
         # Set up compute tiles
 
         # Compute tile 2
-        @core(compute_tile2, f"mm_{m}x{k}x{n}.o", stack_size=0xD00)
+        @core(compute_tile2, stack_size=0xD00)
         def core_body():
             for _ in range_(0xFFFFFFFF):
                 for _ in range_(tiles) if tiles > 1 else range(1):  # issue #1547
diff --git a/programming_examples/basic/matrix_multiplication/whole_array/whole_array.py b/programming_examples/basic/matrix_multiplication/whole_array/whole_array.py
index 394dc6165a8..1ce32e832a2 100644
--- a/programming_examples/basic/matrix_multiplication/whole_array/whole_array.py
+++ b/programming_examples/basic/matrix_multiplication/whole_array/whole_array.py
@@ -16,7 +16,6 @@
 
 from aie.iron import str_to_dtype
 
-
 microkernel_mac_dim_map = {
     "npu": {
         "bf16": (4, 8, 4),
@@ -222,10 +221,15 @@ def device_body():
 
         # AIE Core Function declarations
         scalar_suffix = "_scalar" if use_scalar else ""
-        zero = external_func(f"zero{scalar_suffix}_{dtype_out_str}", inputs=[C_l1_ty])
+        zero = external_func(
+            f"zero{scalar_suffix}_{dtype_out_str}",
+            inputs=[C_l1_ty],
+            link_with=f"mm_{m}x{k}x{n}.o",
+        )
         matmul = external_func(
             f"matmul{scalar_suffix}_{dtype_in_str}_{dtype_out_str}",
             inputs=[A_l1_ty, B_l1_ty, C_l1_ty],
+            link_with=f"mm_{m}x{k}x{n}.o",
         )
 
         # Tile declarations as tile[row][col]
@@ -397,7 +401,7 @@ def device_body():
                 # Exceding the stack size leads to wrong results from the kernel, but no error is triggered.
                 # Stack usage can be checked as explained here:
                 # https://github.com/Xilinx/llvm-aie/issues/487#issuecomment-2969438585
-                @core(core_tiles[row][col], f"mm_{m}x{k}x{n}.o", stack_size=0xD00)
+                @core(core_tiles[row][col], stack_size=0xD00)
                 def core_body():
                     for _ in range_(0xFFFFFFFF):
                         loop = (
diff --git a/programming_examples/basic/matrix_multiplication/whole_array/whole_array_placed.py b/programming_examples/basic/matrix_multiplication/whole_array/whole_array_placed.py
index 95703a66f79..85c6467f8c6 100644
--- a/programming_examples/basic/matrix_multiplication/whole_array/whole_array_placed.py
+++ b/programming_examples/basic/matrix_multiplication/whole_array/whole_array_placed.py
@@ -15,7 +15,6 @@
 from aie.helpers.taplib import TensorTiler2D, TensorAccessSequence
 from aie.iron import str_to_dtype
 
-
 microkernel_mac_dim_map = {
     "npu": {
         "bf16": (4, 8, 4),
@@ -211,11 +210,16 @@ def device_body():
         C_l1_ty = np.ndarray[(m, n), np.dtype[dtype_out]]
 
         # AIE Core Function declarations
-        zero = external_func(f"zero_{dtype_out_str}", inputs=[C_l1_ty])
+        zero = external_func(
+            f"zero_{dtype_out_str}",
+            inputs=[C_l1_ty],
+            link_with=f"mm_{m}x{k}x{n}.o",
+        )
         matmul_vectorized_func_name = f"matmul_{dtype_in_str}_{dtype_out_str}"
         matmul = external_func(
             matmul_vectorized_func_name,
             inputs=[A_l1_ty, B_l1_ty, C_l1_ty],
+            link_with=f"mm_{m}x{k}x{n}.o",
         )
 
         # Tile declarations as tile[row][col]
@@ -364,7 +368,7 @@ def device_body():
         for row in range(n_aie_rows):
             for col in range(n_aie_cols):
 
-                @core(core_tiles[row][col], f"mm_{m}x{k}x{n}.o", stack_size=0xD00)
+                @core(core_tiles[row][col], stack_size=0xD00)
                 def core_body():
                     for _ in range_(0xFFFFFFFF):
                         loop = (
diff --git a/programming_examples/basic/packet_switch/aie_add_placed.py b/programming_examples/basic/packet_switch/aie_add_placed.py
index 3386c74b127..ceb88f5e53f 100644
--- a/programming_examples/basic/packet_switch/aie_add_placed.py
+++ b/programming_examples/basic/packet_switch/aie_add_placed.py
@@ -36,8 +36,8 @@ def device_body():
         # Size of input vector + 4 bytes for the packet header (used in memtile_0_1 DMA logic)
         vector_with_packet_ty = np.ndarray[(in_out_size + 4,), in_out_ty]
 
-        add_func = external_func("add", [vector_ty, vector_ty])
-        mult_func = external_func("mul", [vector_ty, vector_ty])
+        add_func = external_func("add", [vector_ty, vector_ty], link_with="add_mul.o")
+        mult_func = external_func("mul", [vector_ty, vector_ty], link_with="add_mul.o")
 
         ShimTile_0_0 = tile(0, 0)
         MemTile_0_1 = tile(0, 1)
@@ -159,7 +159,7 @@ def device_body():
         )
 
         # core_0_2 compute
-        @core(CT_0_2, "add_mul.o")
+        @core(CT_0_2)
         def core_body():
             for _ in range_(sys.maxsize):
                 # Acquire locks to read core02_buff_in and write core02_buff_out
@@ -193,7 +193,7 @@ def m(block):
                 EndOp()
 
         # core_0_3 compute
-        @core(CT_0_3, "add_mul.o")
+        @core(CT_0_3)
         def core_body():
             for _ in range_(sys.maxsize):
                 # Acquire locks to read core03_buff_in and write core03_buff_out
diff --git a/programming_examples/basic/packet_switch/aie_mul_placed.py b/programming_examples/basic/packet_switch/aie_mul_placed.py
index 24f304ed4cd..2a03fdc8c23 100644
--- a/programming_examples/basic/packet_switch/aie_mul_placed.py
+++ b/programming_examples/basic/packet_switch/aie_mul_placed.py
@@ -36,8 +36,8 @@ def device_body():
         # Size of input vector + 4 bytes for the packet header (used in memtile_0_1 DMA logic)
         vector_with_packet_ty = np.ndarray[(in_out_size + 4,), in_out_ty]
 
-        add_func = external_func("add", [vector_ty, vector_ty])
-        mult_func = external_func("mul", [vector_ty, vector_ty])
+        add_func = external_func("add", [vector_ty, vector_ty], link_with="add_mul.o")
+        mult_func = external_func("mul", [vector_ty, vector_ty], link_with="add_mul.o")
 
         ShimTile_0_0 = tile(0, 0)
         MemTile_0_1 = tile(0, 1)
@@ -159,7 +159,7 @@ def device_body():
         )
 
         # core_0_2 compute
-        @core(CT_0_2, "add_mul.o")
+        @core(CT_0_2)
         def core_body():
             for _ in range_(sys.maxsize):
                 # Acquire locks to read core02_buff_in and write core02_buff_out
@@ -193,7 +193,7 @@ def m(block):
                 EndOp()
 
         # core_0_3 compute
-        @core(CT_0_3, "add_mul.o")
+        @core(CT_0_3)
         def core_body():
             for _ in range_(sys.maxsize):
                 # Acquire locks to read core03_buff_in and write core03_buff_out
diff --git a/programming_examples/basic/passthrough_kernel/passthrough_kernel_placed.py b/programming_examples/basic/passthrough_kernel/passthrough_kernel_placed.py
index a9fe504cced..aa752706e56 100644
--- a/programming_examples/basic/passthrough_kernel/passthrough_kernel_placed.py
+++ b/programming_examples/basic/passthrough_kernel/passthrough_kernel_placed.py
@@ -36,7 +36,9 @@ def device_body():
 
         # AIE Core Function declarations
         passThroughLine = external_func(
-            "passThroughLine", inputs=[line_ty, line_ty, np.int32]
+            "passThroughLine",
+            inputs=[line_ty, line_ty, np.int32],
+            link_with="passThrough.cc.o",
         )
 
         # Tile declarations
@@ -55,7 +57,7 @@ def device_body():
         # Set up compute tiles
 
         # Compute tile 2
-        @core(ComputeTile2, "passThrough.cc.o")
+        @core(ComputeTile2)
         def core_body():
             for _ in range_(sys.maxsize):
                 elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
diff --git a/programming_examples/basic/row_wise_bias_add/row_wise_bias_add_placed.py b/programming_examples/basic/row_wise_bias_add/row_wise_bias_add_placed.py
index bc2acbdb574..d2ec2133ab4 100644
--- a/programming_examples/basic/row_wise_bias_add/row_wise_bias_add_placed.py
+++ b/programming_examples/basic/row_wise_bias_add/row_wise_bias_add_placed.py
@@ -26,7 +26,9 @@ def device_body():
         bias_ty = np.ndarray[(n,), np.dtype[np.float32]]
 
         kernel_func = external_func(
-            f"row_wise_bias_add_f32_f32", inputs=[tensor_ty, bias_ty, tensor_ty]
+            f"row_wise_bias_add_f32_f32",
+            inputs=[tensor_ty, bias_ty, tensor_ty],
+            link_with="kernel.o",
         )
 
         shim_tile = tile(0, 0)
@@ -36,7 +38,7 @@ def device_body():
         bias_fifo = object_fifo("bias_fifo", shim_tile, compute_tile, 2, bias_ty)
         out_fifo = object_fifo("out_fifo", compute_tile, shim_tile, 2, tensor_ty)
 
-        @core(compute_tile, "kernel.o")
+        @core(compute_tile)
         def core_body():
             for _ in range_(0xFFFFFFFF):
                 for _ in range_(N // n):
diff --git a/programming_examples/basic/vector_reduce_add/vector_reduce_add_placed.py b/programming_examples/basic/vector_reduce_add/vector_reduce_add_placed.py
index 870e47b88b4..c6d84bed1b0 100644
--- a/programming_examples/basic/vector_reduce_add/vector_reduce_add_placed.py
+++ b/programming_examples/basic/vector_reduce_add/vector_reduce_add_placed.py
@@ -38,7 +38,9 @@ def device_body():
 
         # AIE Core Function declarations
         reduce_add_vector = external_func(
-            "reduce_add_vector", inputs=[in_ty, out_ty, np.int32]
+            "reduce_add_vector",
+            inputs=[in_ty, out_ty, np.int32],
+            link_with="reduce_add.cc.o",
         )
 
         # Tile declarations
@@ -52,7 +54,7 @@ def device_body():
         # Set up compute tiles
 
         # Compute tile 2
-        @core(ComputeTile2, "reduce_add.cc.o")
+        @core(ComputeTile2)
         def core_body():
             for _ in range_(0xFFFFFFFF):
                 elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
diff --git a/programming_examples/basic/vector_reduce_max/multi_column_designs/row_wise_vector_reduce_max_placed.py b/programming_examples/basic/vector_reduce_max/multi_column_designs/row_wise_vector_reduce_max_placed.py
index 502e3fff26a..de4bea88913 100644
--- a/programming_examples/basic/vector_reduce_max/multi_column_designs/row_wise_vector_reduce_max_placed.py
+++ b/programming_examples/basic/vector_reduce_max/multi_column_designs/row_wise_vector_reduce_max_placed.py
@@ -53,9 +53,15 @@ def device_body():
         # AIE Core Function declarations
         suffix = "_bfloat16" if dtype_str == "bf16" else ""
         reduce_max_vector = external_func(
-            f"reduce_max_vector{suffix}", [op_ty, out_ty, np.int32]
+            f"reduce_max_vector{suffix}",
+            [op_ty, out_ty, np.int32],
+            link_with="reduce_max.cc.o",
+        )
+        compute_max = external_func(
+            f"compute_max{suffix}",
+            [out_ty, out_ty, out_ty],
+            link_with="reduce_max.cc.o",
         )
-        compute_max = external_func(f"compute_max{suffix}", [out_ty, out_ty, out_ty])
         min_val = (
             np.array([bfloat16(float("-inf"))], dtype=dtype)
             if dtype_str == "bf16"
@@ -155,7 +161,7 @@ def device_body():
                 initial_value=min_val,
             )
 
-            @core(cores[i], "reduce_max.cc.o")
+            @core(cores[i])
             def core_body():
                 elem_out = out_fifos[i].acquire(ObjectFifoPort.Produce, 1)
                 for _ in range_(num_iter):
diff --git a/programming_examples/basic/vector_reduce_max/single_column_designs/vector_reduce_max_chained_placed.py b/programming_examples/basic/vector_reduce_max/single_column_designs/vector_reduce_max_chained_placed.py
index 5cc0e56eff8..3b2d9fa037d 100644
--- a/programming_examples/basic/vector_reduce_max/single_column_designs/vector_reduce_max_chained_placed.py
+++ b/programming_examples/basic/vector_reduce_max/single_column_designs/vector_reduce_max_chained_placed.py
@@ -46,10 +46,14 @@ def device_body():
         # AIE Core Function declarations
         suffix = "_bfloat16" if dtype_str == "bf16" else ""
         reduce_max_vector = external_func(
-            f"reduce_max_vector{suffix}", inputs=[op_ty, out_ty, np.int32]
+            f"reduce_max_vector{suffix}",
+            inputs=[op_ty, out_ty, np.int32],
+            link_with="reduce_max.cc.o",
         )
         compute_max = external_func(
-            f"compute_max{suffix}", inputs=[out_ty, out_ty, out_ty]
+            f"compute_max{suffix}",
+            inputs=[out_ty, out_ty, out_ty],
+            link_with="reduce_max.cc.o",
         )
         min_val = (
             np.array([bfloat16(float("-inf"))], dtype=dtype)
@@ -116,7 +120,7 @@ def device_body():
             )
             if i == n_cores - 1:
 
-                @core(cores[i], "reduce_max.cc.o")
+                @core(cores[i])
                 def core_body():
                     elem_out = out_fifos[i].acquire(ObjectFifoPort.Produce, 1)
                     for _ in range_(num_iter):
@@ -129,7 +133,7 @@ def core_body():
 
             else:
 
-                @core(cores[i], "reduce_max.cc.o")
+                @core(cores[i])
                 def core_body():
                     for _ in range_(num_iter):
                         elem_in = in_fifos[i].acquire(ObjectFifoPort.Consume, 1)
diff --git a/programming_examples/basic/vector_reduce_max/single_column_designs/vector_reduce_max_memtile_placed.py b/programming_examples/basic/vector_reduce_max/single_column_designs/vector_reduce_max_memtile_placed.py
index 96bc7b16e24..f941906caac 100644
--- a/programming_examples/basic/vector_reduce_max/single_column_designs/vector_reduce_max_memtile_placed.py
+++ b/programming_examples/basic/vector_reduce_max/single_column_designs/vector_reduce_max_memtile_placed.py
@@ -48,13 +48,19 @@ def device_body():
 
         suffix = "_bfloat16" if dtype_str == "bf16" else ""
         reduce_max_vector = external_func(
-            f"reduce_max_vector{suffix}", inputs=[op_ty, out_ty, np.int32]
+            f"reduce_max_vector{suffix}",
+            inputs=[op_ty, out_ty, np.int32],
+            link_with="reduce_max.cc.o",
         )
         reduce_max_scalar = external_func(
-            f"reduce_max_scalar{suffix}", inputs=[int_ty, out_ty, np.int32]
+            f"reduce_max_scalar{suffix}",
+            inputs=[int_ty, out_ty, np.int32],
+            link_with="reduce_max.cc.o",
         )
         compute_max = external_func(
-            f"compute_max{suffix}", inputs=[out_ty, out_ty, out_ty]
+            f"compute_max{suffix}",
+            inputs=[out_ty, out_ty, out_ty],
+            link_with="reduce_max.cc.o",
         )
         min_val = (
             np.array([bfloat16(float("-inf"))], dtype=dtype)
@@ -136,7 +142,7 @@ def device_body():
                 initial_value=min_val,
             )
 
-            @core(cores[i], "reduce_max.cc.o")
+            @core(cores[i])
             def core_body():
                 elem_out = out_fifos[i].acquire(ObjectFifoPort.Produce, 1)
                 for _ in range_(num_iter):
diff --git a/programming_examples/basic/vector_reduce_max/single_column_designs/vector_reduce_max_shared_placed.py b/programming_examples/basic/vector_reduce_max/single_column_designs/vector_reduce_max_shared_placed.py
index feae246ea75..20816f6cd38 100644
--- a/programming_examples/basic/vector_reduce_max/single_column_designs/vector_reduce_max_shared_placed.py
+++ b/programming_examples/basic/vector_reduce_max/single_column_designs/vector_reduce_max_shared_placed.py
@@ -46,10 +46,14 @@ def device_body():
         # AIE Core Function declarations
         suffix = "_bfloat16" if dtype_str == "bf16" else ""
         reduce_max_vector = external_func(
-            f"reduce_max_vector{suffix}", inputs=[op_ty, out_ty, np.int32]
+            f"reduce_max_vector{suffix}",
+            inputs=[op_ty, out_ty, np.int32],
+            link_with="reduce_max.cc.o",
         )
         compute_max = external_func(
-            f"compute_max{suffix}", inputs=[out_ty, out_ty, out_ty]
+            f"compute_max{suffix}",
+            inputs=[out_ty, out_ty, out_ty],
+            link_with="reduce_max.cc.o",
         )
         min_val = (
             np.array([bfloat16(float("-inf"))], dtype=dtype)
@@ -123,7 +127,7 @@ def device_body():
             )
             if i != 1:
 
-                @core(cores[i], "reduce_max.cc.o")
+                @core(cores[i])
                 def core_body():
                     elem_out = out_fifos[i].acquire(ObjectFifoPort.Produce, 1)
                     for _ in range_(num_iter):
@@ -136,7 +140,7 @@ def core_body():
 
             else:
 
-                @core(cores[i], "reduce_max.cc.o")
+                @core(cores[i])
                 def core_body():
                     for _ in range_(num_iter):
                         elem_in = in_fifos[i].acquire(ObjectFifoPort.Consume, 1)
diff --git a/programming_examples/basic/vector_reduce_max/single_core_designs/vector_reduce_max_placed.py b/programming_examples/basic/vector_reduce_max/single_core_designs/vector_reduce_max_placed.py
index f5c00b8e0ce..0d6514fff6f 100644
--- a/programming_examples/basic/vector_reduce_max/single_core_designs/vector_reduce_max_placed.py
+++ b/programming_examples/basic/vector_reduce_max/single_core_designs/vector_reduce_max_placed.py
@@ -38,11 +38,15 @@ def device_body():
         # AIE Core Function declarations
         if dtype_str == "bf16":
             reduce_max_vector = external_func(
-                "reduce_max_vector_bfloat16", inputs=[in_ty, out_ty, np.int32]
+                "reduce_max_vector_bfloat16",
+                inputs=[in_ty, out_ty, np.int32],
+                link_with="reduce_max.cc.o",
             )
         else:
             reduce_max_vector = external_func(
-                "reduce_max_vector", inputs=[in_ty, out_ty, np.int32]
+                "reduce_max_vector",
+                inputs=[in_ty, out_ty, np.int32],
+                link_with="reduce_max.cc.o",
             )
 
         # Tile declarations
@@ -61,7 +65,7 @@ def device_body():
         # Set up compute tiles
 
         # Compute tile 2
-        @core(ComputeTile2, "reduce_max.cc.o")
+        @core(ComputeTile2)
         def core_body():
             for _ in range_(0xFFFFFFFF):
                 elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
diff --git a/programming_examples/basic/vector_reduce_min/vector_reduce_min_placed.py b/programming_examples/basic/vector_reduce_min/vector_reduce_min_placed.py
index 5073caed50b..d4e5fd50e9f 100644
--- a/programming_examples/basic/vector_reduce_min/vector_reduce_min_placed.py
+++ b/programming_examples/basic/vector_reduce_min/vector_reduce_min_placed.py
@@ -38,7 +38,9 @@ def device_body():
 
         # AIE Core Function declarations
         reduce_min_vector = external_func(
-            "reduce_min_vector", inputs=[in_ty, out_ty, np.int32]
+            "reduce_min_vector",
+            inputs=[in_ty, out_ty, np.int32],
+            link_with="reduce_min.cc.o",
         )
 
         # Tile declarations
@@ -52,7 +54,7 @@ def device_body():
         # Set up compute tiles
 
         # Compute tile 2
-        @core(ComputeTile2, "reduce_min.cc.o")
+        @core(ComputeTile2)
         def core_body():
             for _ in range_(0xFFFFFFFF):
                 elem_out = of_out.acquire(ObjectFifoPort.Produce, 1)
diff --git a/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_placed.py b/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_placed.py
index 3c31921abb0..21eebb35cb6 100644
--- a/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_placed.py
+++ b/programming_examples/basic/vector_scalar_mul/vector_scalar_mul_placed.py
@@ -51,6 +51,7 @@ def device_body():
         scale = external_func(
             f"vector_scalar_mul_{func_type}",
             inputs=[tile_ty, tile_ty, scalar_ty, np.int32],
+            link_with="scale.o",
         )
 
         # Tile declarations
@@ -65,7 +66,7 @@ def device_body():
 
         # Set up compute tiles
         # Compute tile 2
-        @core(ComputeTile2, "scale.o")
+        @core(ComputeTile2)
         def core_body():
             # Effective while(1)
             for _ in range_(sys.maxsize):
diff --git a/programming_examples/ml/block_datatypes/bfp_conversion/bfp_conversion_placed.py b/programming_examples/ml/block_datatypes/bfp_conversion/bfp_conversion_placed.py
index 56651de802c..8b892f2ee74 100644
--- a/programming_examples/ml/block_datatypes/bfp_conversion/bfp_conversion_placed.py
+++ b/programming_examples/ml/block_datatypes/bfp_conversion/bfp_conversion_placed.py
@@ -39,10 +39,13 @@ def device_body():
         conversion_func = external_func(
             "bf16_to_bfp_conversion",
             [tile_bf16_ty, tile_bf16_ty, tile_bfp16_ty, tile_bfp16_ty],
+            link_with="kernel.o",
         )
 
         multiplication_func = external_func(
-            "bfp16_matrix_multiplication", [tile_bfp16_ty, tile_bfp16_ty, tile_bfp16_ty]
+            "bfp16_matrix_multiplication",
+            [tile_bfp16_ty, tile_bfp16_ty, tile_bfp16_ty],
+            link_with="kernel.o",
         )
 
         # Tile declarations
@@ -64,7 +67,7 @@ def device_body():
         # Set up compute tiles
 
         # Compute tile 2
-        @core(ComputeTile2, "kernel.o")
+        @core(ComputeTile2)
         def core_body():
             for _ in range_(sys.maxsize):
                 elem_in1 = of_in1.acquire(ObjectFifoPort.Consume, 1)
@@ -80,7 +83,7 @@ def core_body():
                 of_intermediate2.release(ObjectFifoPort.Produce, 1)
 
         # Compute tile 3
-        @core(ComputeTile3, "kernel.o")
+        @core(ComputeTile3)
         def core_body():
             for _ in range_(sys.maxsize):
                 elem_in1 = of_intermediate1.acquire(ObjectFifoPort.Consume, 1)
diff --git a/programming_examples/ml/block_datatypes/vector_passthrough/vector_passthrough_placed.py b/programming_examples/ml/block_datatypes/vector_passthrough/vector_passthrough_placed.py
index 671e7d1ec60..5c145e4105d 100644
--- a/programming_examples/ml/block_datatypes/vector_passthrough/vector_passthrough_placed.py
+++ b/programming_examples/ml/block_datatypes/vector_passthrough/vector_passthrough_placed.py
@@ -28,7 +28,9 @@ def device_body():
         tensor_ty = np.ndarray[(N,), np.dtype[v8bfp16ebs8]]
         tile_ty = np.ndarray[(n,), np.dtype[v8bfp16ebs8]]
 
-        kernel_func = external_func("bfp16_passthrough_vectorized", [tile_ty, tile_ty])
+        kernel_func = external_func(
+            "bfp16_passthrough_vectorized", [tile_ty, tile_ty], link_with="kernel.o"
+        )
 
         # Tile declarations
         ShimTile = tile(int(sys.argv[1]), 0)
@@ -41,7 +43,7 @@ def device_body():
         # Set up compute tiles
 
         # Compute tile 2
-        @core(ComputeTile2, "kernel.o")
+        @core(ComputeTile2)
         def core_body():
             for _ in range_(sys.maxsize):
                 elem_in1 = of_in1.acquire(ObjectFifoPort.Consume, 1)
diff --git a/programming_examples/ml/bottleneck/bottleneck_placed.py b/programming_examples/ml/bottleneck/bottleneck_placed.py
index cbf3388ef2a..61e8e37976b 100644
--- a/programming_examples/ml/bottleneck/bottleneck_placed.py
+++ b/programming_examples/ml/bottleneck/bottleneck_placed.py
@@ -105,6 +105,7 @@ def deviceBody():
                     np.int32,
                     np.int32,
                 ],
+                link_with="conv2dk1.o",
             )
             conv2dk3 = external_func(
                 "conv2dk3_ui8",
@@ -123,6 +124,7 @@ def deviceBody():
                     np.int32,
                     np.int32,
                 ],
+                link_with="conv2dk3.o",
             )
             conv2dk1_skip = external_func(
                 "conv2dk1_skip_i8",
@@ -138,6 +140,7 @@ def deviceBody():
                     np.int32,
                     np.int32,
                 ],
+                link_with="conv2dk1_skip.o",
             )
 
             ShimTile = tile(0, 0)
@@ -242,7 +245,7 @@ def deviceBody():
             )
 
             # 1x1 conv2d
-            @core(ComputeTile2, "conv2dk1.o")
+            @core(ComputeTile2)
             def core_body():
                 for _ in range_(sys.maxsize):
                     use_lock(lock2, LockAction.Acquire, value=1)
@@ -270,7 +273,7 @@ def core_body():
                     of_wts_buf_00.release(ObjectFifoPort.Consume, 1)
 
             # 3x3 conv2d OFM 0-31
-            @core(ComputeTile3, "conv2dk3.o")
+            @core(ComputeTile3)
             def core_body():
                 scale = 11
                 for _ in range_(sys.maxsize):
@@ -353,7 +356,7 @@ def core_body():
                     wts_buf_01.release(ObjectFifoPort.Consume, 1)
 
             # 3x3 conv2d OFM 32-63
-            @core(ComputeTile5, "conv2dk3.o")
+            @core(ComputeTile5)
             def core_body():
                 scale = 11
                 for _ in range_(sys.maxsize):
@@ -435,7 +438,7 @@ def core_body():
                     wts_buf_01.release(ObjectFifoPort.Consume, 1)
 
             # # 1x1 conv2d and add skip
-            @core(ComputeTile4, "conv2dk1_skip.o", stack_size=0xA00)
+            @core(ComputeTile4, stack_size=0xA00)
             def core_body():
                 for _ in range_(sys.maxsize):
 
diff --git a/programming_examples/ml/conv2d/conv2d_placed.py b/programming_examples/ml/conv2d/conv2d_placed.py
index 6455a8fc734..4f81d7856dc 100644
--- a/programming_examples/ml/conv2d/conv2d_placed.py
+++ b/programming_examples/ml/conv2d/conv2d_placed.py
@@ -57,6 +57,7 @@ def device_body():
                     np.int32,
                     np.int32,
                 ],
+                link_with="conv2dk1_i8.o",
             )
 
             # Tile declarations
@@ -99,7 +100,7 @@ def device_body():
             )
 
             # Compute tile 2
-            @core(ComputeTile2, "conv2dk1_i8.o", stack_size=0x600)
+            @core(ComputeTile2, stack_size=0x600)
             def core_body():
                 y_dim = height
                 x_dim = width
diff --git a/programming_examples/ml/conv2d_14x14/conv2dk14_32core_placed.py b/programming_examples/ml/conv2d_14x14/conv2dk14_32core_placed.py
index b28fdbbca2d..ad5d352a06f 100644
--- a/programming_examples/ml/conv2d_14x14/conv2dk14_32core_placed.py
+++ b/programming_examples/ml/conv2d_14x14/conv2dk14_32core_placed.py
@@ -87,6 +87,7 @@ def device_body():
                     np.int32,
                     np.int32,
                 ],
+                link_with="conv2dk14.o",
             )
 
             # Tile declarations
@@ -225,7 +226,7 @@ def device_body():
             for i in range(n_aie_cols):
                 for j in range(n_aie_rows):
 
-                    @core(core_tiles[j][i], "conv2dk14.o", stack_size=0xC00)
+                    @core(core_tiles[j][i], stack_size=0xC00)
                     def core_body():
                         y_dim = height // (kernel_size * 4)
                         x_blocks = 4
diff --git a/programming_examples/ml/conv2d_14x14/conv2dk14_placed.py b/programming_examples/ml/conv2d_14x14/conv2dk14_placed.py
index ca2ee6cf6c8..5c4c44f2e3b 100644
--- a/programming_examples/ml/conv2d_14x14/conv2dk14_placed.py
+++ b/programming_examples/ml/conv2d_14x14/conv2dk14_placed.py
@@ -78,6 +78,7 @@ def device_body():
                     np.int32,
                     np.int32,
                 ],
+                link_with="conv2dk14.o",
             )
 
             # Tile declarations
@@ -169,7 +170,7 @@ def device_body():
             )
 
             # Compute tile 2
-            @core(ComputeTile2, "conv2dk14.o", stack_size=0xC00)
+            @core(ComputeTile2, stack_size=0xC00)
             def core_body():
                 y_dim = height // kernel_size
                 x_blocks = 4
diff --git a/programming_examples/ml/conv2d_fused_relu/conv2d_fused_relu_placed.py b/programming_examples/ml/conv2d_fused_relu/conv2d_fused_relu_placed.py
index 2e9da2ae19f..06d1136e881 100644
--- a/programming_examples/ml/conv2d_fused_relu/conv2d_fused_relu_placed.py
+++ b/programming_examples/ml/conv2d_fused_relu/conv2d_fused_relu_placed.py
@@ -60,6 +60,7 @@ def device_body():
                     np.int32,
                     np.int32,
                 ],
+                link_with="conv2dk1.o",
             )
 
             # Tile declarations
@@ -98,7 +99,7 @@ def device_body():
             )
 
             # Compute tile 2
-            @core(ComputeTile2, "conv2dk1.o", stack_size=0xA00)
+            @core(ComputeTile2, stack_size=0xA00)
             def core_body():
                 y_dim = 32
                 x_dim = 32
diff --git a/programming_examples/ml/magika/group0_placed.py b/programming_examples/ml/magika/group0_placed.py
index 0ac1b667842..d735aceab33 100644
--- a/programming_examples/ml/magika/group0_placed.py
+++ b/programming_examples/ml/magika/group0_placed.py
@@ -60,9 +60,10 @@ def __init__(
         group0a_func = external_func(
             "group0a_kernel",
             inputs=[din_ty, dout_ty, lut0a_ty, scalar_ty, scalar_ty],
+            link_with=_objectArchive,
         )
 
-        @core(self.computeTile, self.objectArchive, stack_size=4096)
+        @core(self.computeTile, stack_size=4096)
         def core_body():
             for _ in range_(sys.maxsize):
                 di = self.din.acquire(ObjectFifoPort.Consume, 1)
@@ -123,9 +124,10 @@ def __init__(
         group0b_func = external_func(
             "group0b_kernel",
             inputs=[din_ty, dout_ty, lut0b_a_ty, lut0b_b_ty],
+            link_with=_objectArchive,
         )
 
-        @core(self.computeTile, self.objectArchive)
+        @core(self.computeTile)
         def core_body():
             for _ in range_(sys.maxsize):
                 for ite in range_(32):  # 256/8
diff --git a/programming_examples/ml/magika/group1_placed.py b/programming_examples/ml/magika/group1_placed.py
index a2da7ed3d4f..8554f4cca64 100644
--- a/programming_examples/ml/magika/group1_placed.py
+++ b/programming_examples/ml/magika/group1_placed.py
@@ -46,9 +46,10 @@ def __init__(
         group1_func = external_func(
             f"group1_{id}_kernel",
             inputs=[din_ty, dout_ty],
+            link_with=_objectArchive,
         )
 
-        @core(self.computeTile, self.objectArchive)
+        @core(self.computeTile)
         def core_body():
             for _ in range_(sys.maxsize):
                 do = self.dout.acquire(ObjectFifoPort.Produce, 1)
@@ -104,14 +105,16 @@ def __init__(
         group1a_func = external_func(
             f"group1_{id1}_kernel",
             inputs=[din_ty, dout2_ty],
+            link_with=_objectArchive,
         )
 
         group1b_func = external_func(
             f"group1_{id2}_kernel",
             inputs=[din_ty, dout2_ty, dout_ty],
+            link_with=_objectArchive,
         )
 
-        @core(self.computeTile1, self.objectArchive)
+        @core(self.computeTile1)
         def core_body():
             for _ in range_(sys.maxsize):
                 do = self.of_int.acquire(ObjectFifoPort.Produce, 1)
@@ -120,7 +123,7 @@ def core_body():
                 self.din.release(ObjectFifoPort.Consume, 1)
                 self.of_int.release(ObjectFifoPort.Produce, 1)
 
-        @core(self.computeTile2, self.objectArchive)
+        @core(self.computeTile2)
         def core_body():
             for _ in range_(sys.maxsize):
                 do = self.dout.acquire(ObjectFifoPort.Produce, 1)
@@ -188,19 +191,22 @@ def __init__(
         group1a_func = external_func(
             f"group1_{id1}_kernel",
             inputs=[din_ty, dout2_ty],
+            link_with=_objectArchive,
         )
 
         group1b_func = external_func(
             f"group1_{id2}_kernel",
             inputs=[din_ty, dout2_ty, dout2_ty, dout_ty],
+            link_with=_objectArchive,
         )
 
         group1c_func = external_func(
             f"group1_{id3}_kernel",
             inputs=[din_ty, dout2_ty],
+            link_with=_objectArchive,
         )
 
-        @core(self.computeTile1, self.objectArchive)
+        @core(self.computeTile1)
         def core_body():
             for _ in range_(sys.maxsize):
                 do = self.of_int.acquire(ObjectFifoPort.Produce, 1)
@@ -209,7 +215,7 @@ def core_body():
                 self.din.release(ObjectFifoPort.Consume, 1)
                 self.of_int.release(ObjectFifoPort.Produce, 1)
 
-        @core(self.computeTile2, self.objectArchive)
+        @core(self.computeTile2)
         def core_body():
             for _ in range_(sys.maxsize):
                 do = self.dout.acquire(ObjectFifoPort.Produce, 1)
@@ -222,7 +228,7 @@ def core_body():
                 self.of_int3.release(ObjectFifoPort.Consume, 1)
                 self.dout.release(ObjectFifoPort.Produce, 1)
 
-        @core(self.computeTile3, self.objectArchive)
+        @core(self.computeTile3)
         def core_body():
             for _ in range_(sys.maxsize):
                 do = self.of_int2.acquire(ObjectFifoPort.Produce, 1)
diff --git a/programming_examples/ml/magika/group2_placed.py b/programming_examples/ml/magika/group2_placed.py
index a9f02af3841..b7394b9fc74 100644
--- a/programming_examples/ml/magika/group2_placed.py
+++ b/programming_examples/ml/magika/group2_placed.py
@@ -91,13 +91,14 @@ def __init__(
         group2_func = external_func(
             "group2_kernel",
             inputs=[din_ty, lut0_ty, lut1_ty, lut2_ty, lut3_ty],
+            link_with=_objectArchive,
         )
 
         output_lock = lock(
             self.computeTile, lock_id=8, init=0
         )  # chooose id=8, objfifo doesn't use it
 
-        @core(self.computeTile, self.objectArchive)
+        @core(self.computeTile)
         def core_body():
             for _ in range_(sys.maxsize):
                 di = self.din.acquire(ObjectFifoPort.Consume, 1)
diff --git a/programming_examples/ml/resnet/layers_conv2_x/aie.mlir b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir
index 088bc9be826..191efe3020d 100755
--- a/programming_examples/ml/resnet/layers_conv2_x/aie.mlir
+++ b/programming_examples/ml/resnet/layers_conv2_x/aie.mlir
@@ -114,12 +114,12 @@ aie.device(npu1_3col) {
     aie.objectfifo @outOFL2L3(%tile24, {%tile10}, 2 : i32) : !aie.objectfifo<memref<32x1x256xui8>> //32x1x64
 
   // ___________________________Kernel Call___________________________
-    func.func private @conv2dk1_i8(memref<32x1x64xi8>, memref<4096xi8>, memref<32x1x64xui8>,i32,i32,i32,i32) -> ()
-    func.func private @conv2dk3_ui8(memref<32x1x64xui8>,memref<32x1x64xui8>, memref<32x1x64xui8>,  memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> ()
-    func.func private @conv2dk1_skip_init_i8(memref<32x1x32xui8>,memref<32x1x32xui8>, memref<32768xi8>,memref<32x1x256xui8>,memref<32x1x64xi8>,i32,i32,i32,i32,i32,i32,i32) -> ()
+    func.func private @conv2dk1_i8(memref<32x1x64xi8>, memref<4096xi8>, memref<32x1x64xui8>,i32,i32,i32,i32) -> () attributes {link_with = "conv2dk1_i8.o"}
+    func.func private @conv2dk3_ui8(memref<32x1x64xui8>,memref<32x1x64xui8>, memref<32x1x64xui8>,  memref<36864xi8>,memref<32x1x32xui8>,i32,i32,i32,i32,i32,i32,i32,i32) -> () attributes {link_with = "conv2dk3.o"}
+    func.func private @conv2dk1_skip_init_i8(memref<32x1x32xui8>,memref<32x1x32xui8>, memref<32768xi8>,memref<32x1x256xui8>,memref<32x1x64xi8>,i32,i32,i32,i32,i32,i32,i32) -> () attributes {link_with = "conv2dk1_skip_init.o"}
 
-    func.func private @conv2dk1_ui8(memref<32x1x256xui8>, memref<16384xi8>, memref<32x1x64xui8>,i32,i32,i32,i32) -> ()
-    func.func private @conv2dk1_skip_ui8(memref<32x1x32xui8>,memref<32x1x32xui8>, memref<16384xi8>,memref<32x1x256xui8>,memref<32x1x256xui8>,i32,i32,i32,i32,i32) -> ()
+    func.func private @conv2dk1_ui8(memref<32x1x256xui8>, memref<16384xi8>, memref<32x1x64xui8>,i32,i32,i32,i32) -> () attributes {link_with = "conv2dk1_ui8.o"}
+    func.func private @conv2dk1_skip_ui8(memref<32x1x32xui8>,memref<32x1x32xui8>, memref<16384xi8>,memref<32x1x256xui8>,memref<32x1x256xui8>,i32,i32,i32,i32,i32) -> () attributes {link_with = "conv2dk1_skip.o"}
   // ___________________________Bottleneck 1___________________________
     // 1x1 conv
     aie.core(%tile02) {
@@ -156,7 +156,7 @@ aie.device(npu1_3col) {
         aie.objectfifo.release @wts_buf_00(Consume, 1)
       }
       aie.end
-    } { link_with="conv2dk1_i8.o" }
+    }
 
     // 3x3 conv
     aie.core(%tile03) {
@@ -239,7 +239,7 @@ aie.device(npu1_3col) {
       }
         // aie.objectfifo.release<Consume>(%inOF_wts_0_L3L2 : !aie.objectfifo<memref<32x32x3x3xi32>>, 1)
       aie.end
-    } { link_with="conv2dk3.o" }
+    }
 
     // 3x3 conv
     aie.core(%tile04) {
@@ -321,7 +321,7 @@ aie.device(npu1_3col) {
         }
         aie.end
 
-    } { link_with="conv2dk3.o" }
+    }
 
     // 1x1 conv with skip
     aie.core(%tile05) {
@@ -373,7 +373,7 @@ aie.device(npu1_3col) {
         aie.objectfifo.release @wts_buf_02(Consume, 1)
       }
       aie.end
-    } { link_with="conv2dk1_skip_init.o" }
+    }
   // ___________________________Bottleneck 2___________________________
     // 1x1 conv
     aie.core(%tile15) {
@@ -410,7 +410,7 @@ aie.device(npu1_3col) {
         aie.objectfifo.release @wts_buf_10(Consume, 1)
       }
       aie.end
-    } { link_with="conv2dk1_ui8.o" }
+    }
 
     // 3x3 conv
     aie.core(%tile12) {
@@ -493,7 +493,7 @@ aie.device(npu1_3col) {
       }
         // aie.objectfifo.release<Consume>(%inOF_wts_0_L3L2 : !aie.objectfifo<memref<32x32x3x3xi32>>, 1)
       aie.end
-    } { link_with="conv2dk3.o" }
+    }
 
     // 3x3 conv
     aie.core(%tile14) {
@@ -575,7 +575,7 @@ aie.device(npu1_3col) {
         }
         aie.end
 
-    } { link_with="conv2dk3.o" }
+    }
 
     // 1x1 conv with skip
     aie.core(%tile13) {
@@ -624,7 +624,7 @@ aie.device(npu1_3col) {
           aie.objectfifo.release @wts_buf_12(Consume, 1)
         }
         aie.end
-      } { link_with="conv2dk1_skip.o" }
+      }
 
 
   // ___________________________Bottleneck 3___________________________
@@ -663,7 +663,7 @@ aie.device(npu1_3col) {
         aie.objectfifo.release @wts_buf_20(Consume, 1)
       }
       aie.end
-    } { link_with="conv2dk1_ui8.o" }
+    }
 
     // 3x3 conv
     aie.core(%tile23) {
@@ -746,7 +746,7 @@ aie.device(npu1_3col) {
       }
         // aie.objectfifo.release<Consume>(%inOF_wts_0_L3L2 : !aie.objectfifo<memref<32x32x3x3xi32>>, 1)
       aie.end
-    } { link_with="conv2dk3.o" }
+    }
 
     // 3x3 conv
     aie.core(%tile25) {
@@ -828,7 +828,7 @@ aie.device(npu1_3col) {
           }
         aie.end
 
-    } { link_with="conv2dk3.o" }
+    }
 
     // 1x1 conv with skip
     aie.core(%tile24) {
@@ -877,7 +877,7 @@ aie.device(npu1_3col) {
         aie.objectfifo.release @wts_buf_22(Consume, 1)
       }
       aie.end
-    } { link_with="conv2dk1_skip.o" }
+    }
 
 
   aie.runtime_sequence(%in0 : memref<16384xi32>, %wts0 : memref<53248xi32>, %out : memref<65536xi32>) {
diff --git a/programming_examples/ml/resnet/layers_conv2_x/resnet_placed.py b/programming_examples/ml/resnet/layers_conv2_x/resnet_placed.py
index 2f29be4113b..26c55e22894 100755
--- a/programming_examples/ml/resnet/layers_conv2_x/resnet_placed.py
+++ b/programming_examples/ml/resnet/layers_conv2_x/resnet_placed.py
@@ -147,6 +147,7 @@ def deviceBody():
                     np.int32,
                     np.int32,
                 ],
+                link_with="conv2dk1_i8.o",
             )
             conv2dk3 = external_func(
                 "conv2dk3_ui8",
@@ -165,6 +166,7 @@ def deviceBody():
                     np.int32,
                     np.int32,
                 ],
+                link_with="conv2dk3.o",
             )
             conv2dk1_skip_init_i8 = external_func(
                 "conv2dk1_skip_init_i8",
@@ -182,6 +184,7 @@ def deviceBody():
                     np.int32,
                     np.int32,
                 ],
+                link_with="conv2dk1_skip_init.o",
             )
             conv2dk1_ui8 = external_func(
                 "conv2dk1_ui8",
@@ -194,6 +197,7 @@ def deviceBody():
                     np.int32,
                     np.int32,
                 ],
+                link_with="conv2dk1_ui8.o",
             )
 
             conv2dk1_skip_ui8 = external_func(
@@ -210,6 +214,7 @@ def deviceBody():
                     np.int32,
                     np.int32,
                 ],
+                link_with="conv2dk1_skip.o",
             )
 
             ShimTile00 = tile(0, 0)
@@ -579,7 +584,7 @@ def deviceBody():
             # # 1x1 conv2d
             for i in range(n_cols):
 
-                @core(cores[i][0], conv1_kernels[i])
+                @core(cores[i][0])
                 def core_body():
                     for _ in range_(sys.maxsize):
 
@@ -623,7 +628,7 @@ def core_body():
             # 3x3 conv2d OFM 0-31
             for i in range(n_cols):
 
-                @core(cores[i][1], "conv2dk3.o")
+                @core(cores[i][1])
                 def core_body():
                     scale = 1
                     for _ in range_(sys.maxsize):
@@ -715,7 +720,7 @@ def core_body():
 
             for i in range(n_cols):
 
-                @core(cores[i][3], "conv2dk3.o")
+                @core(cores[i][3])
                 def core_body():
                     scale = 1
                     for _ in range_(sys.maxsize):
@@ -807,7 +812,7 @@ def core_body():
             # # 1x1 conv2d and add skip
             for i in range(n_cols):
 
-                @core(cores[i][2], conv3_kernels[i], stack_size=0xA00)
+                @core(cores[i][2], stack_size=0xA00)
                 def core_body():
                     for _ in range_(sys.maxsize):
 
diff --git a/programming_examples/mlir/MM_2x2/circuit_switched_version/aie.mlir b/programming_examples/mlir/MM_2x2/circuit_switched_version/aie.mlir
index f3b8316da7b..0a39e4ab0bd 100755
--- a/programming_examples/mlir/MM_2x2/circuit_switched_version/aie.mlir
+++ b/programming_examples/mlir/MM_2x2/circuit_switched_version/aie.mlir
@@ -205,7 +205,7 @@ module @MM_2x2 {
     aie.end
   }
 
-  func.func private @extern_kernel(%A: memref<1024xi32>, %B: memref<1024xi32>, %acc: memref<1024xi32>, %C: memref<1024xi32>) -> ()
+  func.func private @extern_kernel(%A: memref<1024xi32>, %B: memref<1024xi32>, %acc: memref<1024xi32>, %C: memref<1024xi32>) -> () attributes {link_with = "kernel.o"}
 
   %core63 = aie.core(%t63) {
     aie.use_lock(%lock63_0, "Acquire", 1)
@@ -216,7 +216,7 @@ module @MM_2x2 {
     aie.use_lock(%lock63_1, "Release", 0)
     aie.use_lock(%lock63_0, "Release", 0)
     aie.end
-  } { link_with="kernel.o" }
+  }
 
   %core64 = aie.core(%t64) {
     aie.use_lock(%lock63_3, "Acquire", 1)
@@ -229,7 +229,7 @@ module @MM_2x2 {
     aie.use_lock(%lock64_0, "Release", 0)
     aie.use_lock(%lock63_3, "Release", 0)
     aie.end
-  } { link_with="kernel.o" }
+  }
 
   %m73 = aie.mem(%t73)  {
     aie.dma_start("S2MM", 0, ^bd0, ^dma0)
@@ -283,7 +283,7 @@ module @MM_2x2 {
     aie.use_lock(%lock73_1, "Release", 0)
     aie.use_lock(%lock73_0, "Release", 0)
     aie.end
-  } { link_with="kernel.o" }
+  }
 
   %core74 = aie.core(%t74) {
     aie.use_lock(%lock73_2, "Acquire", 1)
@@ -296,5 +296,5 @@ module @MM_2x2 {
     aie.use_lock(%lock74_0, "Release", 0)
     aie.use_lock(%lock73_2, "Release", 0)
     aie.end
-  } { link_with="kernel.o" }
+  }
 }
diff --git a/programming_examples/mlir/MM_2x2/objectFifo_circuit_switched_version/aie.mlir b/programming_examples/mlir/MM_2x2/objectFifo_circuit_switched_version/aie.mlir
index 880622c31e4..288d09316e0 100755
--- a/programming_examples/mlir/MM_2x2/objectFifo_circuit_switched_version/aie.mlir
+++ b/programming_examples/mlir/MM_2x2/objectFifo_circuit_switched_version/aie.mlir
@@ -62,7 +62,7 @@ aie.device(xcvc1902) {
   %buf63 = aie.buffer(%t63) {sym_name = "buf63"} : memref<1024xi32>  //Accumulator0
   %buf73 = aie.buffer(%t73) {sym_name = "buf73"} : memref<1024xi32>  //Accumulator1
 
-  func.func private @extern_kernel(%A: memref<1024xi32>, %B: memref<1024xi32>, %acc: memref<1024xi32>, %C: memref<1024xi32>) -> ()
+  func.func private @extern_kernel(%A: memref<1024xi32>, %B: memref<1024xi32>, %acc: memref<1024xi32>, %C: memref<1024xi32>) -> () attributes {link_with = "kernel.o"}
 
   %core63 = aie.core(%t63) {
     %LHS0Subview = aie.objectfifo.acquire @of_LHS0 (Consume, 1) : !aie.objectfifosubview<memref<1024xi32>>
@@ -81,7 +81,7 @@ aie.device(xcvc1902) {
     aie.objectfifo.release @of_acc0 (Produce, 1)
 
     aie.end
-  } { link_with="kernel.o" }
+  }
 
   %core64 = aie.core(%t64) {
     %LHS1Subview = aie.objectfifo.acquire @of_LHS1 (Consume, 1) : !aie.objectfifosubview<memref<1024xi32>>
@@ -104,7 +104,7 @@ aie.device(xcvc1902) {
     aie.objectfifo.release @of_out0 (Produce, 1)
 
     aie.end
-  } { link_with="kernel.o" }
+  }
 
   %core73 = aie.core(%t73) {
     %LHS0Subview = aie.objectfifo.acquire @of_LHS0 (Consume, 1) : !aie.objectfifosubview<memref<1024xi32>>
@@ -123,7 +123,7 @@ aie.device(xcvc1902) {
     aie.objectfifo.release @of_acc1 (Produce, 1)
 
     aie.end
-  } { link_with="kernel.o" }
+  }
 
   %core74 = aie.core(%t74) {
     %LHS1Subview = aie.objectfifo.acquire @of_LHS1 (Consume, 1) : !aie.objectfifosubview<memref<1024xi32>>
@@ -146,5 +146,5 @@ aie.device(xcvc1902) {
     aie.objectfifo.release @of_out1 (Produce, 1)
 
     aie.end
-  } { link_with="kernel.o" }
+  }
 }
diff --git a/programming_examples/mlir/MM_2x2/packet_switched_version/aie.mlir b/programming_examples/mlir/MM_2x2/packet_switched_version/aie.mlir
index 9490746e3a7..5ab33ba82c9 100644
--- a/programming_examples/mlir/MM_2x2/packet_switched_version/aie.mlir
+++ b/programming_examples/mlir/MM_2x2/packet_switched_version/aie.mlir
@@ -207,7 +207,7 @@ module @MM_2x2 {
     aie.end
   }
 
-  func.func private @extern_kernel(%A: memref<1024xi32>, %B: memref<1024xi32>, %acc: memref<1024xi32>, %C: memref<1024xi32>) -> ()
+  func.func private @extern_kernel(%A: memref<1024xi32>, %B: memref<1024xi32>, %acc: memref<1024xi32>, %C: memref<1024xi32>) -> () attributes {link_with = "kernel.o"}
 
 
   %lock63_3 = aie.lock(%t63, 3)
@@ -221,7 +221,7 @@ module @MM_2x2 {
     aie.use_lock(%lock63_0, "Release", 0)
     
     aie.end
-  } { link_with="kernel.o" }
+  }
 
 
   %core64 = aie.core(%t64) {
@@ -235,7 +235,7 @@ module @MM_2x2 {
     aie.use_lock(%lock64_0, "Release", 0)
     aie.use_lock(%lock63_3, "Release", 0)
     aie.end
-  } { link_with="kernel.o" }
+  }
 
 
   %lock73_0 = aie.lock(%t73, 0)
@@ -300,7 +300,7 @@ module @MM_2x2 {
     aie.use_lock(%lock73_1, "Release", 0)
     aie.use_lock(%lock73_0, "Release", 0)
     aie.end
-  } { link_with="kernel.o" }
+  }
 
   %core74 = aie.core(%t74) {
     aie.use_lock(%lock73_2, "Acquire", 1)
@@ -313,7 +313,7 @@ module @MM_2x2 {
     aie.use_lock(%lock74_0, "Release", 0)
     aie.use_lock(%lock73_2, "Release", 0)
     aie.end
-  } { link_with="kernel.o" }
+  }
 
 
 }
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_dual_AIE_objectFIFO_ping_pong/aie.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_dual_AIE_objectFIFO_ping_pong/aie.mlir
index 985f4b38503..214061a0982 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_dual_AIE_objectFIFO_ping_pong/aie.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_dual_AIE_objectFIFO_ping_pong/aie.mlir
@@ -30,7 +30,7 @@ module @hdiff_multi_AIE{
   aie.objectfifo.register_external_buffers @obj_in (%t70, {%ext_buffer_in0}) : (memref<1536xi32>)
   aie.objectfifo.register_external_buffers @obj_out_flux (%t70, {%ext_buffer_out}) : (memref<512xi32>)
 
-  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> ()
+  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> () attributes {link_with = "hdiff_lap.o"}
   
   %c13 = aie.core(%t71) {
     %lb = arith.constant 0 : index
@@ -59,9 +59,9 @@ module @hdiff_multi_AIE{
     aie.objectfifo.release @obj_in (Consume, 4)
 
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
-func.func private @hdiff_flux(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OF: memref<256xi32>) -> ()
+func.func private @hdiff_flux(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OF: memref<256xi32>) -> () attributes {link_with = "hdiff_flux.o"}
   
   %c14 = aie.core(%t72) {
     %lb = arith.constant 0 : index
@@ -91,5 +91,5 @@ func.func private @hdiff_flux(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  m
     aie.objectfifo.release @obj_in (Consume, 4)
 
     aie.end
-  } { link_with="hdiff_flux.o" }
+  }
 }
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_dual_AIE_objectFIFO_ping_pong/aie_fp32.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_dual_AIE_objectFIFO_ping_pong/aie_fp32.mlir
index 78a78e8eb68..a554c95c0e5 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_dual_AIE_objectFIFO_ping_pong/aie_fp32.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_dual_AIE_objectFIFO_ping_pong/aie_fp32.mlir
@@ -29,7 +29,7 @@ module @hdiff_multi_AIE{
   aie.objectfifo.register_external_buffers @obj_in (%t70, {%ext_buffer_in0}) : (memref<1536xf32>)
   aie.objectfifo.register_external_buffers @obj_out_flux (%t70, {%ext_buffer_out}) : (memref<512xf32>)
 
-  func.func private @hdiff_lap_fp32(%AL: memref<256xf32>,%BL: memref<256xf32>, %CL:  memref<256xf32>, %DL: memref<256xf32>, %EL:  memref<256xf32>,  %OLL1: memref<256xf32>,  %OLL2: memref<256xf32>,  %OLL3: memref<256xf32>,  %OLL4: memref<256xf32>) -> ()
+  func.func private @hdiff_lap_fp32(%AL: memref<256xf32>,%BL: memref<256xf32>, %CL:  memref<256xf32>, %DL: memref<256xf32>, %EL:  memref<256xf32>,  %OLL1: memref<256xf32>,  %OLL2: memref<256xf32>,  %OLL3: memref<256xf32>,  %OLL4: memref<256xf32>) -> () attributes {link_with = "hdiff_lap_fp32.o"}
   
   %c13 = aie.core(%t71) {
     
@@ -60,9 +60,9 @@ module @hdiff_multi_AIE{
     aie.objectfifo.release @obj_in (Consume, 4)
 
     aie.end
-  } { link_with="hdiff_lap_fp32.o" }
+  }
 
-func.func private @hdiff_flux_fp32(%AF: memref<256xf32>,%BF: memref<256xf32>, %CF:  memref<256xf32>,   %OLF1: memref<256xf32>,  %OLF2: memref<256xf32>,  %OLF3: memref<256xf32>,  %OLF4: memref<256xf32>,  %OF: memref<256xf32>) -> ()
+func.func private @hdiff_flux_fp32(%AF: memref<256xf32>,%BF: memref<256xf32>, %CF:  memref<256xf32>,   %OLF1: memref<256xf32>,  %OLF2: memref<256xf32>,  %OLF3: memref<256xf32>,  %OLF4: memref<256xf32>,  %OF: memref<256xf32>) -> () attributes {link_with = "hdiff_flux_fp32.o"}
   
   %c14 = aie.core(%t72) {
     
@@ -95,7 +95,7 @@ func.func private @hdiff_flux_fp32(%AF: memref<256xf32>,%BF: memref<256xf32>, %C
     aie.objectfifo.release @obj_in (Consume, 4)
 
     aie.end
-  } { link_with="hdiff_flux_fp32.o" }
+  }
 
 
 
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO/aie.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO/aie.mlir
index 6b8907f080b..968ec361dcf 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO/aie.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO/aie.mlir
@@ -27,7 +27,7 @@ module @hdiff_single_AIE {
   aie.objectfifo.register_external_buffers @obj_in (%t70, {%ext_buffer_in0}) : (memref<1536xi32>)
   aie.objectfifo.register_external_buffers @obj_out (%t70, {%ext_buffer_out}) : (memref<512xi32>)
 
-  func.func private @vec_hdiff(%A: memref<256xi32>,%B: memref<256xi32>, %C:  memref<256xi32>, %D: memref<256xi32>, %E:  memref<256xi32>,  %O: memref<256xi32>) -> ()
+  func.func private @vec_hdiff(%A: memref<256xi32>,%B: memref<256xi32>, %C:  memref<256xi32>, %D: memref<256xi32>, %E:  memref<256xi32>,  %O: memref<256xi32>) -> () attributes {link_with = "hdiff.o"}
 
   %c13 = aie.core(%t71) {
     %lb = arith.constant 0 : index
@@ -53,5 +53,5 @@ module @hdiff_single_AIE {
     aie.objectfifo.release @obj_in (Consume, 4)
 
     aie.end
-  } { link_with="hdiff.o" }
+  }
 }
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO/aie_fp32.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO/aie_fp32.mlir
index 1533c5052eb..10f6adc49df 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO/aie_fp32.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO/aie_fp32.mlir
@@ -27,7 +27,7 @@ module @hdiff_single_AIE_fp32{
   aie.objectfifo.register_external_buffers @obj_in (%t70, {%ext_buffer_in0}) : (memref<1536xf32>)
   aie.objectfifo.register_external_buffers @obj_out (%t70, {%ext_buffer_out}) : (memref<512xf32>)
 
-  func.func private @vec_hdiff_fp32(%A: memref<256xf32>,%B: memref<256xf32>, %C:  memref<256xf32>, %D: memref<256xf32>, %E:  memref<256xf32>,  %O: memref<256xf32>) -> ()
+  func.func private @vec_hdiff_fp32(%A: memref<256xf32>,%B: memref<256xf32>, %C:  memref<256xf32>, %D: memref<256xf32>, %E:  memref<256xf32>,  %O: memref<256xf32>) -> () attributes {link_with = "hdiff_fp32.o"}
 
   %c13 = aie.core(%t71) {
     
@@ -55,7 +55,7 @@ module @hdiff_single_AIE_fp32{
     aie.objectfifo.release @obj_in (Consume, 4)
 
     aie.end
-  } { link_with="hdiff_fp32.o" }
+  }
 
 }
 
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO_ping_pong/aie.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO_ping_pong/aie.mlir
index c3f43f522d8..455f6e9a3e5 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO_ping_pong/aie.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO_ping_pong/aie.mlir
@@ -26,7 +26,7 @@ module @hdiff_single_AIE {
   aie.objectfifo.register_external_buffers @obj_in (%t70, {%ext_buffer_in0}) : (memref<1536xi32>)
   aie.objectfifo.register_external_buffers @obj_out (%t70, {%ext_buffer_out}) : (memref<512xi32>)
 
-  func.func private @vec_hdiff(%A: memref<256xi32>, %B: memref<256xi32>, %C:  memref<256xi32>, %D: memref<256xi32>, %E:  memref<256xi32>,  %O: memref<256xi32>) -> ()
+  func.func private @vec_hdiff(%A: memref<256xi32>, %B: memref<256xi32>, %C:  memref<256xi32>, %D: memref<256xi32>, %E:  memref<256xi32>,  %O: memref<256xi32>) -> () attributes {link_with = "hdiff.o"}
 
   %c13 = aie.core(%t71) {
     %lb = arith.constant 0 : index
@@ -52,5 +52,5 @@ module @hdiff_single_AIE {
     aie.objectfifo.release @obj_in (Consume, 4)
 
     aie.end
-  } { link_with="hdiff.o" }
+  }
 }
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO_ping_pong/aie_fp32.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO_ping_pong/aie_fp32.mlir
index 2384b35a6ec..75e61f7a774 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO_ping_pong/aie_fp32.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO_ping_pong/aie_fp32.mlir
@@ -28,7 +28,7 @@ module @hdiff_single_AIE_fp32{
   aie.objectfifo.register_external_buffers @obj_in (%t70, {%ext_buffer_in0}) : (memref<1536xf32>)
   aie.objectfifo.register_external_buffers @obj_out (%t70, {%ext_buffer_out}) : (memref<512xf32>)
 
-  func.func private @vec_hdiff_fp32(%A: memref<256xf32>,%B: memref<256xf32>, %C:  memref<256xf32>, %D: memref<256xf32>, %E:  memref<256xf32>,  %O: memref<256xf32>) -> ()
+  func.func private @vec_hdiff_fp32(%A: memref<256xf32>,%B: memref<256xf32>, %C:  memref<256xf32>, %D: memref<256xf32>, %E:  memref<256xf32>,  %O: memref<256xf32>) -> () attributes {link_with = "hdiff_fp32.o"}
 
   %c13 = aie.core(%t71) {
     
@@ -56,7 +56,7 @@ module @hdiff_single_AIE_fp32{
     aie.objectfifo.release @obj_in (Consume, 4)
 
     aie.end
-  } { link_with="hdiff_fp32.o" }
+  }
 
 }
 
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO_ping_pong_scaled/aie.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO_ping_pong_scaled/aie.mlir
index a5700619bd0..5d86cfe169c 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO_ping_pong_scaled/aie.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_single_AIE_objectFIFO_ping_pong_scaled/aie.mlir
@@ -398,7 +398,7 @@ module @hdiff_large_0 {
   aie.objectfifo.register_external_buffers(%tile47_0, %buf_out_31_2_shim_47  : !aie.objectfifo<memref<256xi32>>, {%ext_buffer_out_31_2}) : (memref<512xi32>)
 
 
-  func.func private @vec_hdiff(%A: memref<256xi32>,%B: memref<256xi32>, %C:  memref<256xi32>, %D: memref<256xi32>, %E:  memref<256xi32>,  %O: memref<256xi32>) -> ()
+  func.func private @vec_hdiff(%A: memref<256xi32>,%B: memref<256xi32>, %C:  memref<256xi32>, %D: memref<256xi32>, %E:  memref<256xi32>,  %O: memref<256xi32>) -> () attributes {link_with = "hdiff.o"}
 
   %core0_2 = aie.core(%tile0_2) {
     %lb = arith.constant 0 : index
@@ -420,7 +420,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_0_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core1_2 = aie.core(%tile1_2) {
     %lb = arith.constant 0 : index
@@ -442,7 +442,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_1_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core2_2 = aie.core(%tile2_2) {
     %lb = arith.constant 0 : index
@@ -464,7 +464,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_2_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core3_2 = aie.core(%tile3_2) {
     %lb = arith.constant 0 : index
@@ -486,7 +486,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_3_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core4_2 = aie.core(%tile4_2) {
     %lb = arith.constant 0 : index
@@ -508,7 +508,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_4_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core5_2 = aie.core(%tile5_2) {
     %lb = arith.constant 0 : index
@@ -530,7 +530,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_5_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core6_2 = aie.core(%tile6_2) {
     %lb = arith.constant 0 : index
@@ -552,7 +552,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_6_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core7_2 = aie.core(%tile7_2) {
     %lb = arith.constant 0 : index
@@ -574,7 +574,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_7_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core8_2 = aie.core(%tile8_2) {
     %lb = arith.constant 0 : index
@@ -596,7 +596,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_8_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core9_2 = aie.core(%tile9_2) {
     %lb = arith.constant 0 : index
@@ -618,7 +618,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_9_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core10_2 = aie.core(%tile10_2) {
     %lb = arith.constant 0 : index
@@ -640,7 +640,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_10_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core11_2 = aie.core(%tile11_2) {
     %lb = arith.constant 0 : index
@@ -662,7 +662,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_11_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core12_2 = aie.core(%tile12_2) {
     %lb = arith.constant 0 : index
@@ -684,7 +684,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_12_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core13_2 = aie.core(%tile13_2) {
     %lb = arith.constant 0 : index
@@ -706,7 +706,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_13_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core14_2 = aie.core(%tile14_2) {
     %lb = arith.constant 0 : index
@@ -728,7 +728,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_14_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core15_2 = aie.core(%tile15_2) {
     %lb = arith.constant 0 : index
@@ -750,7 +750,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_15_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core16_2 = aie.core(%tile16_2) {
     %lb = arith.constant 0 : index
@@ -772,7 +772,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_16_shim_26: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core17_2 = aie.core(%tile17_2) {
     %lb = arith.constant 0 : index
@@ -794,7 +794,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_17_shim_26: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core18_2 = aie.core(%tile18_2) {
     %lb = arith.constant 0 : index
@@ -816,7 +816,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_18_shim_27: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core19_2 = aie.core(%tile19_2) {
     %lb = arith.constant 0 : index
@@ -838,7 +838,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_19_shim_27: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core20_2 = aie.core(%tile20_2) {
     %lb = arith.constant 0 : index
@@ -860,7 +860,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_20_shim_34: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core21_2 = aie.core(%tile21_2) {
     %lb = arith.constant 0 : index
@@ -882,7 +882,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_21_shim_34: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core22_2 = aie.core(%tile22_2) {
     %lb = arith.constant 0 : index
@@ -904,7 +904,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_22_shim_35: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core23_2 = aie.core(%tile23_2) {
     %lb = arith.constant 0 : index
@@ -926,7 +926,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_23_shim_35: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core24_2 = aie.core(%tile24_2) {
     %lb = arith.constant 0 : index
@@ -948,7 +948,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_24_shim_42: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core25_2 = aie.core(%tile25_2) {
     %lb = arith.constant 0 : index
@@ -970,7 +970,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_25_shim_42: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core26_2 = aie.core(%tile26_2) {
     %lb = arith.constant 0 : index
@@ -992,7 +992,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_26_shim_43: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core27_2 = aie.core(%tile27_2) {
     %lb = arith.constant 0 : index
@@ -1014,7 +1014,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_27_shim_43: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core28_2 = aie.core(%tile28_2) {
     %lb = arith.constant 0 : index
@@ -1036,7 +1036,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_28_shim_46: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core29_2 = aie.core(%tile29_2) {
     %lb = arith.constant 0 : index
@@ -1058,7 +1058,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_29_shim_46: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core30_2 = aie.core(%tile30_2) {
     %lb = arith.constant 0 : index
@@ -1080,7 +1080,7 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_30_shim_47: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
   %core31_2 = aie.core(%tile31_2) {
     %lb = arith.constant 0 : index
@@ -1102,6 +1102,6 @@ module @hdiff_large_0 {
 
   aie.objectfifo.release<Consume>(%buf_in_31_shim_47: !aie.objectfifo<memref<256xi32>>, 4)
   aie.end
- } { link_with="hdiff.o" }
+ }
 
 }
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong/aie.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong/aie.mlir
index 1566168882c..523a6cc6a80 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong/aie.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong/aie.mlir
@@ -32,9 +32,9 @@ module @hdiff_tri_AIE {
   aie.objectfifo.register_external_buffers @obj_in (%t70, {%ext_buffer_in0}) : (memref<1536xi32>)
   aie.objectfifo.register_external_buffers @obj_out_flux (%t70, {%ext_buffer_out}) : (memref<512xi32>)
 
-  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> ()
-  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> ()
-  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> ()
+  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> () attributes {link_with = "hdiff_lap.o"}
+  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> () attributes {link_with = "hdiff_flux1.o"}
+  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> () attributes {link_with = "hdiff_flux2.o"}
 
   %c13 = aie.core(%t71) {
     %lb = arith.constant 0 : index
@@ -62,7 +62,7 @@ module @hdiff_tri_AIE {
     aie.objectfifo.release @obj_in (Consume, 4)
 
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %c14 = aie.core(%t72) {
     %lb = arith.constant 0 : index
@@ -97,7 +97,7 @@ module @hdiff_tri_AIE {
     aie.objectfifo.release @obj_in (Consume, 4)
 
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %c15 = aie.core(%t73) {
     %lb = arith.constant 0 : index
@@ -123,5 +123,5 @@ module @hdiff_tri_AIE {
     aie.use_lock(%lock73_14, "Acquire", 0) // stop the timer
 
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 }
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong/aie_fp32.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong/aie_fp32.mlir
index e91361b78cd..3e55315f3c5 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong/aie_fp32.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong/aie_fp32.mlir
@@ -33,9 +33,9 @@ module @hdiff_tri_AIE{
   aie.objectfifo.register_external_buffers @obj_out_flux (%t70, {%ext_buffer_out}) : (memref<512xf32>)
 
 
-  func.func private @hdiff_lap_fp32(%AL: memref<256xf32>,%BL: memref<256xf32>, %CL:  memref<256xf32>, %DL: memref<256xf32>, %EL:  memref<256xf32>,  %OLL1: memref<256xf32>,  %OLL2: memref<256xf32>,  %OLL3: memref<256xf32>,  %OLL4: memref<256xf32>) -> ()
-  func.func private @hdiff_flux1_fp32(%AF: memref<256xf32>,%BF: memref<256xf32>, %CF:  memref<256xf32>,   %OLF1: memref<256xf32>,  %OLF2: memref<256xf32>,  %OLF3: memref<256xf32>,  %OLF4: memref<256xf32>,  %OFI1: memref<512xf32>,  %OFI2: memref<512xf32>,  %OFI3: memref<512xf32>,  %OFI4: memref<512xf32>,  %OFI5: memref<512xf32>) -> ()
-  func.func private @hdiff_flux2_fp32( %Inter1: memref<512xf32>,%Inter2: memref<512xf32>, %Inter3: memref<512xf32>,%Inter4: memref<512xf32>,%Inter5: memref<512xf32>,  %Out: memref<256xf32>) -> ()
+  func.func private @hdiff_lap_fp32(%AL: memref<256xf32>,%BL: memref<256xf32>, %CL:  memref<256xf32>, %DL: memref<256xf32>, %EL:  memref<256xf32>,  %OLL1: memref<256xf32>,  %OLL2: memref<256xf32>,  %OLL3: memref<256xf32>,  %OLL4: memref<256xf32>) -> () attributes {link_with = "hdiff_lap_fp32.o"}
+  func.func private @hdiff_flux1_fp32(%AF: memref<256xf32>,%BF: memref<256xf32>, %CF:  memref<256xf32>,   %OLF1: memref<256xf32>,  %OLF2: memref<256xf32>,  %OLF3: memref<256xf32>,  %OLF4: memref<256xf32>,  %OFI1: memref<512xf32>,  %OFI2: memref<512xf32>,  %OFI3: memref<512xf32>,  %OFI4: memref<512xf32>,  %OFI5: memref<512xf32>) -> () attributes {link_with = "hdiff_flux1_fp32.o"}
+  func.func private @hdiff_flux2_fp32( %Inter1: memref<512xf32>,%Inter2: memref<512xf32>, %Inter3: memref<512xf32>,%Inter4: memref<512xf32>,%Inter5: memref<512xf32>,  %Out: memref<256xf32>) -> () attributes {link_with = "hdiff_flux2_fp32.o"}
 
   %c13 = aie.core(%t71) {
     
@@ -66,7 +66,7 @@ module @hdiff_tri_AIE{
     aie.objectfifo.release @obj_in (Consume, 4)
 
     aie.end
-  } { link_with="hdiff_lap_fp32.o" }
+  }
 
 
   %c14 = aie.core(%t72) {
@@ -103,7 +103,7 @@ module @hdiff_tri_AIE{
     aie.objectfifo.release @obj_in (Consume, 4)
 
     aie.end
-  } { link_with="hdiff_flux1_fp32.o" }
+  }
 
   %c15 = aie.core(%t73) {
     %lb = arith.constant 0 : index
@@ -131,7 +131,7 @@ module @hdiff_tri_AIE{
     aie.use_lock(%lock73_14, "Acquire", 0) // stop the timer
 
     aie.end
-  } { link_with="hdiff_flux2_fp32.o" }
+  }
 
 
 
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie.mlir
index 78de65d0928..b313711740f 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie.mlir
@@ -57,9 +57,9 @@ module @hdiff_bundle_1 {
   aie.objectfifo.register_external_buffers(%tile2_0, %block_0_buf_out_shim_2 : !aie.objectfifo<memref<256xi32>>, {%ext_buffer_out_0}) : (memref<2048xi32>)
 
 
-  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> ()
-  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> ()
-  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> ()
+  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> () attributes {link_with = "hdiff_lap.o"}
+  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> () attributes {link_with = "hdiff_flux1.o"}
+  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> () attributes {link_with = "hdiff_flux2.o"}
 
   %block_0_core0_1 = aie.core(%tile0_1) {
     %lb = arith.constant 0 : index
@@ -87,7 +87,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_1 = aie.core(%tile1_1) {
     %lb = arith.constant 0 : index
@@ -120,7 +120,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_1 = aie.core(%tile2_1) {
     %lb = arith.constant 0 : index
@@ -144,7 +144,7 @@ module @hdiff_bundle_1 {
     }
     aie.use_lock(%lock21_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_2 = aie.core(%tile0_2) {
     %lb = arith.constant 0 : index
@@ -171,7 +171,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_2 = aie.core(%tile1_2) {
     %lb = arith.constant 0 : index
@@ -204,7 +204,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_0_core2_2 = aie.core(%tile2_2) {
@@ -248,7 +248,7 @@ module @hdiff_bundle_1 {
       aie.objectfifo.release<Produce>(%block_0_buf_out_shim_2:!aie.objectfifo<memref<256xi32>>, 4)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_3 = aie.core(%tile0_3) {
     %lb = arith.constant 0 : index
@@ -275,7 +275,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_3 = aie.core(%tile1_3) {
     %lb = arith.constant 0 : index
@@ -308,7 +308,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_3 = aie.core(%tile2_3) {
     %lb = arith.constant 0 : index
@@ -331,7 +331,7 @@ module @hdiff_bundle_1 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_4 = aie.core(%tile0_4) {
     %lb = arith.constant 0 : index
@@ -358,7 +358,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_4 = aie.core(%tile1_4) {
     %lb = arith.constant 0 : index
@@ -391,7 +391,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_4 = aie.core(%tile2_4) {
     %lb = arith.constant 0 : index
@@ -414,6 +414,6 @@ module @hdiff_bundle_1 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
 }
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_1.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_1.mlir
index a4b619191ad..2a3dcddf6bc 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_1.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_1.mlir
@@ -57,9 +57,9 @@ module @hdiff_bundle_1 {
   aie.objectfifo.register_external_buffers(%tile2_0, %block_0_buf_out_shim_2 : !aie.objectfifo<memref<256xi32>>, {%ext_buffer_out_0}) : (memref<2048xi32>)
 
 
-  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> ()
-  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> ()
-  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> ()
+  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> () attributes {link_with = "hdiff_lap.o"}
+  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> () attributes {link_with = "hdiff_flux1.o"}
+  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> () attributes {link_with = "hdiff_flux2.o"}
 
   %block_0_core0_1 = aie.core(%tile0_1) {
     %lb = arith.constant 0 : index
@@ -87,7 +87,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_1 = aie.core(%tile1_1) {
     %lb = arith.constant 0 : index
@@ -120,7 +120,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_1 = aie.core(%tile2_1) {
     %lb = arith.constant 0 : index
@@ -144,7 +144,7 @@ module @hdiff_bundle_1 {
     }
     aie.use_lock(%lock21_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_2 = aie.core(%tile0_2) {
     %lb = arith.constant 0 : index
@@ -171,7 +171,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_2 = aie.core(%tile1_2) {
     %lb = arith.constant 0 : index
@@ -204,7 +204,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_0_core2_2 = aie.core(%tile2_2) {
@@ -248,7 +248,7 @@ module @hdiff_bundle_1 {
       aie.objectfifo.release<Produce>(%block_0_buf_out_shim_2:!aie.objectfifo<memref<256xi32>>, 4)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_3 = aie.core(%tile0_3) {
     %lb = arith.constant 0 : index
@@ -275,7 +275,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_3 = aie.core(%tile1_3) {
     %lb = arith.constant 0 : index
@@ -308,7 +308,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_3 = aie.core(%tile2_3) {
     %lb = arith.constant 0 : index
@@ -331,7 +331,7 @@ module @hdiff_bundle_1 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_4 = aie.core(%tile0_4) {
     %lb = arith.constant 0 : index
@@ -358,7 +358,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_4 = aie.core(%tile1_4) {
     %lb = arith.constant 0 : index
@@ -391,7 +391,7 @@ module @hdiff_bundle_1 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_4 = aie.core(%tile2_4) {
     %lb = arith.constant 0 : index
@@ -414,6 +414,6 @@ module @hdiff_bundle_1 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
 }
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_16.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_16.mlir
index 588bcab596d..ea6abb91863 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_16.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_16.mlir
@@ -687,9 +687,9 @@ module @hdiff_bundle_16 {
   aie.objectfifo.register_external_buffers(%tile19_0, %block_15_buf_out_shim_19 : !aie.objectfifo<memref<256xi32>>, {%ext_buffer_out_15}) : (memref<2048xi32>)
 
 
-  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> ()
-  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> ()
-  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> ()
+  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> () attributes {link_with = "hdiff_lap.o"}
+  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> () attributes {link_with = "hdiff_flux1.o"}
+  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> () attributes {link_with = "hdiff_flux2.o"}
 
   %block_0_core0_1 = aie.core(%tile0_1) {
     %lb = arith.constant 0 : index
@@ -716,7 +716,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_1 = aie.core(%tile1_1) {
     %lb = arith.constant 0 : index
@@ -749,7 +749,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_1 = aie.core(%tile2_1) {
     %lb = arith.constant 0 : index
@@ -772,7 +772,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_2 = aie.core(%tile0_2) {
     %lb = arith.constant 0 : index
@@ -800,7 +800,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_2 = aie.core(%tile1_2) {
     %lb = arith.constant 0 : index
@@ -833,7 +833,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_0_core2_2 = aie.core(%tile2_2) {
@@ -878,7 +878,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock22_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_3 = aie.core(%tile0_3) {
     %lb = arith.constant 0 : index
@@ -905,7 +905,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_3 = aie.core(%tile1_3) {
     %lb = arith.constant 0 : index
@@ -938,7 +938,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_3 = aie.core(%tile2_3) {
     %lb = arith.constant 0 : index
@@ -961,7 +961,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_4 = aie.core(%tile0_4) {
     %lb = arith.constant 0 : index
@@ -988,7 +988,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_4 = aie.core(%tile1_4) {
     %lb = arith.constant 0 : index
@@ -1021,7 +1021,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_4 = aie.core(%tile2_4) {
     %lb = arith.constant 0 : index
@@ -1044,7 +1044,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_5 = aie.core(%tile0_5) {
     %lb = arith.constant 0 : index
@@ -1071,7 +1071,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_5 = aie.core(%tile1_5) {
     %lb = arith.constant 0 : index
@@ -1104,7 +1104,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_5 = aie.core(%tile2_5) {
     %lb = arith.constant 0 : index
@@ -1127,7 +1127,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_6 = aie.core(%tile0_6) {
     %lb = arith.constant 0 : index
@@ -1155,7 +1155,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_6 = aie.core(%tile1_6) {
     %lb = arith.constant 0 : index
@@ -1188,7 +1188,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_1_core2_6 = aie.core(%tile2_6) {
@@ -1233,7 +1233,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock26_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_7 = aie.core(%tile0_7) {
     %lb = arith.constant 0 : index
@@ -1260,7 +1260,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_7 = aie.core(%tile1_7) {
     %lb = arith.constant 0 : index
@@ -1293,7 +1293,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_7 = aie.core(%tile2_7) {
     %lb = arith.constant 0 : index
@@ -1316,7 +1316,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_8 = aie.core(%tile0_8) {
     %lb = arith.constant 0 : index
@@ -1343,7 +1343,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_8 = aie.core(%tile1_8) {
     %lb = arith.constant 0 : index
@@ -1376,7 +1376,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_8 = aie.core(%tile2_8) {
     %lb = arith.constant 0 : index
@@ -1399,7 +1399,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_1 = aie.core(%tile3_1) {
     %lb = arith.constant 0 : index
@@ -1426,7 +1426,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_1 = aie.core(%tile4_1) {
     %lb = arith.constant 0 : index
@@ -1459,7 +1459,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_2_core5_1 = aie.core(%tile5_1) {
     %lb = arith.constant 0 : index
@@ -1482,7 +1482,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_2_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_2 = aie.core(%tile3_2) {
     %lb = arith.constant 0 : index
@@ -1510,7 +1510,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_2 = aie.core(%tile4_2) {
     %lb = arith.constant 0 : index
@@ -1543,7 +1543,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_2_core5_2 = aie.core(%tile5_2) {
@@ -1588,7 +1588,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock52_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_3 = aie.core(%tile3_3) {
     %lb = arith.constant 0 : index
@@ -1615,7 +1615,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_3 = aie.core(%tile4_3) {
     %lb = arith.constant 0 : index
@@ -1648,7 +1648,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_2_core5_3 = aie.core(%tile5_3) {
     %lb = arith.constant 0 : index
@@ -1671,7 +1671,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_2_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_4 = aie.core(%tile3_4) {
     %lb = arith.constant 0 : index
@@ -1698,7 +1698,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_4 = aie.core(%tile4_4) {
     %lb = arith.constant 0 : index
@@ -1731,7 +1731,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_2_core5_4 = aie.core(%tile5_4) {
     %lb = arith.constant 0 : index
@@ -1754,7 +1754,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_2_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_5 = aie.core(%tile3_5) {
     %lb = arith.constant 0 : index
@@ -1781,7 +1781,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_5 = aie.core(%tile4_5) {
     %lb = arith.constant 0 : index
@@ -1814,7 +1814,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_3_core5_5 = aie.core(%tile5_5) {
     %lb = arith.constant 0 : index
@@ -1837,7 +1837,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_3_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_6 = aie.core(%tile3_6) {
     %lb = arith.constant 0 : index
@@ -1865,7 +1865,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_6 = aie.core(%tile4_6) {
     %lb = arith.constant 0 : index
@@ -1898,7 +1898,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_3_core5_6 = aie.core(%tile5_6) {
@@ -1943,7 +1943,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock56_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_7 = aie.core(%tile3_7) {
     %lb = arith.constant 0 : index
@@ -1970,7 +1970,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_7 = aie.core(%tile4_7) {
     %lb = arith.constant 0 : index
@@ -2003,7 +2003,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_3_core5_7 = aie.core(%tile5_7) {
     %lb = arith.constant 0 : index
@@ -2026,7 +2026,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_3_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_8 = aie.core(%tile3_8) {
     %lb = arith.constant 0 : index
@@ -2053,7 +2053,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_8 = aie.core(%tile4_8) {
     %lb = arith.constant 0 : index
@@ -2086,7 +2086,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_3_core5_8 = aie.core(%tile5_8) {
     %lb = arith.constant 0 : index
@@ -2109,7 +2109,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_3_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_4_core6_1 = aie.core(%tile6_1) {
     %lb = arith.constant 0 : index
@@ -2136,7 +2136,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_4_core7_1 = aie.core(%tile7_1) {
     %lb = arith.constant 0 : index
@@ -2169,7 +2169,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_4_core8_1 = aie.core(%tile8_1) {
     %lb = arith.constant 0 : index
@@ -2192,7 +2192,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_4_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_4_core6_2 = aie.core(%tile6_2) {
     %lb = arith.constant 0 : index
@@ -2220,7 +2220,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_4_core7_2 = aie.core(%tile7_2) {
     %lb = arith.constant 0 : index
@@ -2253,7 +2253,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_4_core8_2 = aie.core(%tile8_2) {
@@ -2298,7 +2298,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock82_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_4_core6_3 = aie.core(%tile6_3) {
     %lb = arith.constant 0 : index
@@ -2325,7 +2325,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_4_core7_3 = aie.core(%tile7_3) {
     %lb = arith.constant 0 : index
@@ -2358,7 +2358,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_4_core8_3 = aie.core(%tile8_3) {
     %lb = arith.constant 0 : index
@@ -2381,7 +2381,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_4_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_4_core6_4 = aie.core(%tile6_4) {
     %lb = arith.constant 0 : index
@@ -2408,7 +2408,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_4_core7_4 = aie.core(%tile7_4) {
     %lb = arith.constant 0 : index
@@ -2441,7 +2441,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_4_core8_4 = aie.core(%tile8_4) {
     %lb = arith.constant 0 : index
@@ -2464,7 +2464,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_4_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_5_core6_5 = aie.core(%tile6_5) {
     %lb = arith.constant 0 : index
@@ -2491,7 +2491,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_5_core7_5 = aie.core(%tile7_5) {
     %lb = arith.constant 0 : index
@@ -2524,7 +2524,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_5_core8_5 = aie.core(%tile8_5) {
     %lb = arith.constant 0 : index
@@ -2547,7 +2547,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_5_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_5_core6_6 = aie.core(%tile6_6) {
     %lb = arith.constant 0 : index
@@ -2575,7 +2575,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_5_core7_6 = aie.core(%tile7_6) {
     %lb = arith.constant 0 : index
@@ -2608,7 +2608,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_5_core8_6 = aie.core(%tile8_6) {
@@ -2653,7 +2653,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock86_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_5_core6_7 = aie.core(%tile6_7) {
     %lb = arith.constant 0 : index
@@ -2680,7 +2680,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_5_core7_7 = aie.core(%tile7_7) {
     %lb = arith.constant 0 : index
@@ -2713,7 +2713,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_5_core8_7 = aie.core(%tile8_7) {
     %lb = arith.constant 0 : index
@@ -2736,7 +2736,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_5_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_5_core6_8 = aie.core(%tile6_8) {
     %lb = arith.constant 0 : index
@@ -2763,7 +2763,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_5_core7_8 = aie.core(%tile7_8) {
     %lb = arith.constant 0 : index
@@ -2796,7 +2796,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_5_core8_8 = aie.core(%tile8_8) {
     %lb = arith.constant 0 : index
@@ -2819,7 +2819,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_5_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_6_core9_1 = aie.core(%tile9_1) {
     %lb = arith.constant 0 : index
@@ -2846,7 +2846,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_6_core10_1 = aie.core(%tile10_1) {
     %lb = arith.constant 0 : index
@@ -2879,7 +2879,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_6_core11_1 = aie.core(%tile11_1) {
     %lb = arith.constant 0 : index
@@ -2902,7 +2902,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_6_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_6_core9_2 = aie.core(%tile9_2) {
     %lb = arith.constant 0 : index
@@ -2930,7 +2930,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_6_core10_2 = aie.core(%tile10_2) {
     %lb = arith.constant 0 : index
@@ -2963,7 +2963,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_6_core11_2 = aie.core(%tile11_2) {
@@ -3008,7 +3008,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock112_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_6_core9_3 = aie.core(%tile9_3) {
     %lb = arith.constant 0 : index
@@ -3035,7 +3035,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_6_core10_3 = aie.core(%tile10_3) {
     %lb = arith.constant 0 : index
@@ -3068,7 +3068,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_6_core11_3 = aie.core(%tile11_3) {
     %lb = arith.constant 0 : index
@@ -3091,7 +3091,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_6_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_6_core9_4 = aie.core(%tile9_4) {
     %lb = arith.constant 0 : index
@@ -3118,7 +3118,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_6_core10_4 = aie.core(%tile10_4) {
     %lb = arith.constant 0 : index
@@ -3151,7 +3151,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_6_core11_4 = aie.core(%tile11_4) {
     %lb = arith.constant 0 : index
@@ -3174,7 +3174,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_6_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_7_core9_5 = aie.core(%tile9_5) {
     %lb = arith.constant 0 : index
@@ -3201,7 +3201,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_7_core10_5 = aie.core(%tile10_5) {
     %lb = arith.constant 0 : index
@@ -3234,7 +3234,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_7_core11_5 = aie.core(%tile11_5) {
     %lb = arith.constant 0 : index
@@ -3257,7 +3257,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_7_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_7_core9_6 = aie.core(%tile9_6) {
     %lb = arith.constant 0 : index
@@ -3285,7 +3285,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_7_core10_6 = aie.core(%tile10_6) {
     %lb = arith.constant 0 : index
@@ -3318,7 +3318,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_7_core11_6 = aie.core(%tile11_6) {
@@ -3363,7 +3363,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock116_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_7_core9_7 = aie.core(%tile9_7) {
     %lb = arith.constant 0 : index
@@ -3390,7 +3390,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_7_core10_7 = aie.core(%tile10_7) {
     %lb = arith.constant 0 : index
@@ -3423,7 +3423,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_7_core11_7 = aie.core(%tile11_7) {
     %lb = arith.constant 0 : index
@@ -3446,7 +3446,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_7_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_7_core9_8 = aie.core(%tile9_8) {
     %lb = arith.constant 0 : index
@@ -3473,7 +3473,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_7_core10_8 = aie.core(%tile10_8) {
     %lb = arith.constant 0 : index
@@ -3506,7 +3506,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_7_core11_8 = aie.core(%tile11_8) {
     %lb = arith.constant 0 : index
@@ -3529,7 +3529,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_7_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_8_core12_1 = aie.core(%tile12_1) {
     %lb = arith.constant 0 : index
@@ -3556,7 +3556,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_8_core13_1 = aie.core(%tile13_1) {
     %lb = arith.constant 0 : index
@@ -3589,7 +3589,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_8_core14_1 = aie.core(%tile14_1) {
     %lb = arith.constant 0 : index
@@ -3612,7 +3612,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_8_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_8_core12_2 = aie.core(%tile12_2) {
     %lb = arith.constant 0 : index
@@ -3640,7 +3640,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_8_core13_2 = aie.core(%tile13_2) {
     %lb = arith.constant 0 : index
@@ -3673,7 +3673,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_8_core14_2 = aie.core(%tile14_2) {
@@ -3718,7 +3718,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock142_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_8_core12_3 = aie.core(%tile12_3) {
     %lb = arith.constant 0 : index
@@ -3745,7 +3745,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_8_core13_3 = aie.core(%tile13_3) {
     %lb = arith.constant 0 : index
@@ -3778,7 +3778,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_8_core14_3 = aie.core(%tile14_3) {
     %lb = arith.constant 0 : index
@@ -3801,7 +3801,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_8_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_8_core12_4 = aie.core(%tile12_4) {
     %lb = arith.constant 0 : index
@@ -3828,7 +3828,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_8_core13_4 = aie.core(%tile13_4) {
     %lb = arith.constant 0 : index
@@ -3861,7 +3861,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_8_core14_4 = aie.core(%tile14_4) {
     %lb = arith.constant 0 : index
@@ -3884,7 +3884,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_8_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_9_core12_5 = aie.core(%tile12_5) {
     %lb = arith.constant 0 : index
@@ -3911,7 +3911,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_9_core13_5 = aie.core(%tile13_5) {
     %lb = arith.constant 0 : index
@@ -3944,7 +3944,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_9_core14_5 = aie.core(%tile14_5) {
     %lb = arith.constant 0 : index
@@ -3967,7 +3967,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_9_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_9_core12_6 = aie.core(%tile12_6) {
     %lb = arith.constant 0 : index
@@ -3995,7 +3995,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_9_core13_6 = aie.core(%tile13_6) {
     %lb = arith.constant 0 : index
@@ -4028,7 +4028,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_9_core14_6 = aie.core(%tile14_6) {
@@ -4073,7 +4073,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock146_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_9_core12_7 = aie.core(%tile12_7) {
     %lb = arith.constant 0 : index
@@ -4100,7 +4100,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_9_core13_7 = aie.core(%tile13_7) {
     %lb = arith.constant 0 : index
@@ -4133,7 +4133,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_9_core14_7 = aie.core(%tile14_7) {
     %lb = arith.constant 0 : index
@@ -4156,7 +4156,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_9_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_9_core12_8 = aie.core(%tile12_8) {
     %lb = arith.constant 0 : index
@@ -4183,7 +4183,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_9_core13_8 = aie.core(%tile13_8) {
     %lb = arith.constant 0 : index
@@ -4216,7 +4216,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_9_core14_8 = aie.core(%tile14_8) {
     %lb = arith.constant 0 : index
@@ -4239,7 +4239,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_9_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_10_core15_1 = aie.core(%tile15_1) {
     %lb = arith.constant 0 : index
@@ -4266,7 +4266,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_10_core16_1 = aie.core(%tile16_1) {
     %lb = arith.constant 0 : index
@@ -4299,7 +4299,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_10_core17_1 = aie.core(%tile17_1) {
     %lb = arith.constant 0 : index
@@ -4322,7 +4322,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_10_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_10_core15_2 = aie.core(%tile15_2) {
     %lb = arith.constant 0 : index
@@ -4350,7 +4350,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_10_core16_2 = aie.core(%tile16_2) {
     %lb = arith.constant 0 : index
@@ -4383,7 +4383,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_10_core17_2 = aie.core(%tile17_2) {
@@ -4428,7 +4428,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock172_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_10_core15_3 = aie.core(%tile15_3) {
     %lb = arith.constant 0 : index
@@ -4455,7 +4455,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_10_core16_3 = aie.core(%tile16_3) {
     %lb = arith.constant 0 : index
@@ -4488,7 +4488,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_10_core17_3 = aie.core(%tile17_3) {
     %lb = arith.constant 0 : index
@@ -4511,7 +4511,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_10_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_10_core15_4 = aie.core(%tile15_4) {
     %lb = arith.constant 0 : index
@@ -4538,7 +4538,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_10_core16_4 = aie.core(%tile16_4) {
     %lb = arith.constant 0 : index
@@ -4571,7 +4571,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_10_core17_4 = aie.core(%tile17_4) {
     %lb = arith.constant 0 : index
@@ -4594,7 +4594,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_10_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_11_core15_5 = aie.core(%tile15_5) {
     %lb = arith.constant 0 : index
@@ -4621,7 +4621,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_11_core16_5 = aie.core(%tile16_5) {
     %lb = arith.constant 0 : index
@@ -4654,7 +4654,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_11_core17_5 = aie.core(%tile17_5) {
     %lb = arith.constant 0 : index
@@ -4677,7 +4677,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_11_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_11_core15_6 = aie.core(%tile15_6) {
     %lb = arith.constant 0 : index
@@ -4705,7 +4705,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_11_core16_6 = aie.core(%tile16_6) {
     %lb = arith.constant 0 : index
@@ -4738,7 +4738,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_11_core17_6 = aie.core(%tile17_6) {
@@ -4783,7 +4783,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock176_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_11_core15_7 = aie.core(%tile15_7) {
     %lb = arith.constant 0 : index
@@ -4810,7 +4810,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_11_core16_7 = aie.core(%tile16_7) {
     %lb = arith.constant 0 : index
@@ -4843,7 +4843,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_11_core17_7 = aie.core(%tile17_7) {
     %lb = arith.constant 0 : index
@@ -4866,7 +4866,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_11_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_11_core15_8 = aie.core(%tile15_8) {
     %lb = arith.constant 0 : index
@@ -4893,7 +4893,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_11_core16_8 = aie.core(%tile16_8) {
     %lb = arith.constant 0 : index
@@ -4926,7 +4926,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_11_core17_8 = aie.core(%tile17_8) {
     %lb = arith.constant 0 : index
@@ -4949,7 +4949,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_11_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_12_core18_1 = aie.core(%tile18_1) {
     %lb = arith.constant 0 : index
@@ -4976,7 +4976,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_12_core19_1 = aie.core(%tile19_1) {
     %lb = arith.constant 0 : index
@@ -5009,7 +5009,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_12_core20_1 = aie.core(%tile20_1) {
     %lb = arith.constant 0 : index
@@ -5032,7 +5032,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_12_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_12_core18_2 = aie.core(%tile18_2) {
     %lb = arith.constant 0 : index
@@ -5060,7 +5060,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_12_core19_2 = aie.core(%tile19_2) {
     %lb = arith.constant 0 : index
@@ -5093,7 +5093,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_12_core20_2 = aie.core(%tile20_2) {
@@ -5138,7 +5138,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock202_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_12_core18_3 = aie.core(%tile18_3) {
     %lb = arith.constant 0 : index
@@ -5165,7 +5165,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_12_core19_3 = aie.core(%tile19_3) {
     %lb = arith.constant 0 : index
@@ -5198,7 +5198,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_12_core20_3 = aie.core(%tile20_3) {
     %lb = arith.constant 0 : index
@@ -5221,7 +5221,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_12_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_12_core18_4 = aie.core(%tile18_4) {
     %lb = arith.constant 0 : index
@@ -5248,7 +5248,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_12_core19_4 = aie.core(%tile19_4) {
     %lb = arith.constant 0 : index
@@ -5281,7 +5281,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_12_core20_4 = aie.core(%tile20_4) {
     %lb = arith.constant 0 : index
@@ -5304,7 +5304,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_12_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_13_core18_5 = aie.core(%tile18_5) {
     %lb = arith.constant 0 : index
@@ -5331,7 +5331,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_13_core19_5 = aie.core(%tile19_5) {
     %lb = arith.constant 0 : index
@@ -5364,7 +5364,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_13_core20_5 = aie.core(%tile20_5) {
     %lb = arith.constant 0 : index
@@ -5387,7 +5387,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_13_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_13_core18_6 = aie.core(%tile18_6) {
     %lb = arith.constant 0 : index
@@ -5415,7 +5415,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_13_core19_6 = aie.core(%tile19_6) {
     %lb = arith.constant 0 : index
@@ -5448,7 +5448,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_13_core20_6 = aie.core(%tile20_6) {
@@ -5493,7 +5493,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock206_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_13_core18_7 = aie.core(%tile18_7) {
     %lb = arith.constant 0 : index
@@ -5520,7 +5520,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_13_core19_7 = aie.core(%tile19_7) {
     %lb = arith.constant 0 : index
@@ -5553,7 +5553,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_13_core20_7 = aie.core(%tile20_7) {
     %lb = arith.constant 0 : index
@@ -5576,7 +5576,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_13_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_13_core18_8 = aie.core(%tile18_8) {
     %lb = arith.constant 0 : index
@@ -5603,7 +5603,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_13_core19_8 = aie.core(%tile19_8) {
     %lb = arith.constant 0 : index
@@ -5636,7 +5636,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_13_core20_8 = aie.core(%tile20_8) {
     %lb = arith.constant 0 : index
@@ -5659,7 +5659,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_13_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_14_core21_1 = aie.core(%tile21_1) {
     %lb = arith.constant 0 : index
@@ -5686,7 +5686,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_14_core22_1 = aie.core(%tile22_1) {
     %lb = arith.constant 0 : index
@@ -5719,7 +5719,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_14_core23_1 = aie.core(%tile23_1) {
     %lb = arith.constant 0 : index
@@ -5742,7 +5742,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_14_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_14_core21_2 = aie.core(%tile21_2) {
     %lb = arith.constant 0 : index
@@ -5770,7 +5770,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_14_core22_2 = aie.core(%tile22_2) {
     %lb = arith.constant 0 : index
@@ -5803,7 +5803,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_14_core23_2 = aie.core(%tile23_2) {
@@ -5848,7 +5848,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock232_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_14_core21_3 = aie.core(%tile21_3) {
     %lb = arith.constant 0 : index
@@ -5875,7 +5875,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_14_core22_3 = aie.core(%tile22_3) {
     %lb = arith.constant 0 : index
@@ -5908,7 +5908,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_14_core23_3 = aie.core(%tile23_3) {
     %lb = arith.constant 0 : index
@@ -5931,7 +5931,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_14_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_14_core21_4 = aie.core(%tile21_4) {
     %lb = arith.constant 0 : index
@@ -5958,7 +5958,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_14_core22_4 = aie.core(%tile22_4) {
     %lb = arith.constant 0 : index
@@ -5991,7 +5991,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_14_core23_4 = aie.core(%tile23_4) {
     %lb = arith.constant 0 : index
@@ -6014,7 +6014,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_14_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_15_core21_5 = aie.core(%tile21_5) {
     %lb = arith.constant 0 : index
@@ -6041,7 +6041,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_15_core22_5 = aie.core(%tile22_5) {
     %lb = arith.constant 0 : index
@@ -6074,7 +6074,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_15_core23_5 = aie.core(%tile23_5) {
     %lb = arith.constant 0 : index
@@ -6097,7 +6097,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_15_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_15_core21_6 = aie.core(%tile21_6) {
     %lb = arith.constant 0 : index
@@ -6125,7 +6125,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_15_core22_6 = aie.core(%tile22_6) {
     %lb = arith.constant 0 : index
@@ -6158,7 +6158,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_15_core23_6 = aie.core(%tile23_6) {
@@ -6203,7 +6203,7 @@ module @hdiff_bundle_16 {
     }
     aie.use_lock(%lock236_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_15_core21_7 = aie.core(%tile21_7) {
     %lb = arith.constant 0 : index
@@ -6230,7 +6230,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_15_core22_7 = aie.core(%tile22_7) {
     %lb = arith.constant 0 : index
@@ -6263,7 +6263,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_15_core23_7 = aie.core(%tile23_7) {
     %lb = arith.constant 0 : index
@@ -6286,7 +6286,7 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_15_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_15_core21_8 = aie.core(%tile21_8) {
     %lb = arith.constant 0 : index
@@ -6313,7 +6313,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_15_core22_8 = aie.core(%tile22_8) {
     %lb = arith.constant 0 : index
@@ -6346,7 +6346,7 @@ module @hdiff_bundle_16 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_15_core23_8 = aie.core(%tile23_8) {
     %lb = arith.constant 0 : index
@@ -6369,6 +6369,6 @@ module @hdiff_bundle_16 {
       aie.objectfifo.release<Produce>(%block_15_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
 }
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_2.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_2.mlir
index e2c70180b01..4e77bd5bf6b 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_2.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_2.mlir
@@ -99,9 +99,9 @@ module @hdiff_bundle_2 {
   aie.objectfifo.register_external_buffers(%tile2_0, %block_1_buf_out_shim_2 : !aie.objectfifo<memref<256xi32>>, {%ext_buffer_out_1}) : (memref<2048xi32>)
 
 
-  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> ()
-  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> ()
-  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> ()
+  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> () attributes {link_with = "hdiff_lap.o"}
+  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> () attributes {link_with = "hdiff_flux1.o"}
+  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> () attributes {link_with = "hdiff_flux2.o"}
 
   %block_0_core0_1 = aie.core(%tile0_1) {
     %lb = arith.constant 0 : index
@@ -128,7 +128,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_1 = aie.core(%tile1_1) {
     %lb = arith.constant 0 : index
@@ -161,7 +161,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_1 = aie.core(%tile2_1) {
     %lb = arith.constant 0 : index
@@ -184,7 +184,7 @@ module @hdiff_bundle_2 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_2 = aie.core(%tile0_2) {
     %lb = arith.constant 0 : index
@@ -212,7 +212,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_2 = aie.core(%tile1_2) {
     %lb = arith.constant 0 : index
@@ -245,7 +245,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_0_core2_2 = aie.core(%tile2_2) {
@@ -290,7 +290,7 @@ module @hdiff_bundle_2 {
     }
     aie.use_lock(%lock22_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_3 = aie.core(%tile0_3) {
     %lb = arith.constant 0 : index
@@ -317,7 +317,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_3 = aie.core(%tile1_3) {
     %lb = arith.constant 0 : index
@@ -350,7 +350,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_3 = aie.core(%tile2_3) {
     %lb = arith.constant 0 : index
@@ -373,7 +373,7 @@ module @hdiff_bundle_2 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_4 = aie.core(%tile0_4) {
     %lb = arith.constant 0 : index
@@ -400,7 +400,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_4 = aie.core(%tile1_4) {
     %lb = arith.constant 0 : index
@@ -433,7 +433,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_4 = aie.core(%tile2_4) {
     %lb = arith.constant 0 : index
@@ -456,7 +456,7 @@ module @hdiff_bundle_2 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_5 = aie.core(%tile0_5) {
     %lb = arith.constant 0 : index
@@ -483,7 +483,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_5 = aie.core(%tile1_5) {
     %lb = arith.constant 0 : index
@@ -516,7 +516,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_5 = aie.core(%tile2_5) {
     %lb = arith.constant 0 : index
@@ -539,7 +539,7 @@ module @hdiff_bundle_2 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_6 = aie.core(%tile0_6) {
     %lb = arith.constant 0 : index
@@ -567,7 +567,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_6 = aie.core(%tile1_6) {
     %lb = arith.constant 0 : index
@@ -600,7 +600,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_1_core2_6 = aie.core(%tile2_6) {
@@ -645,7 +645,7 @@ module @hdiff_bundle_2 {
     }
     aie.use_lock(%lock26_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_7 = aie.core(%tile0_7) {
     %lb = arith.constant 0 : index
@@ -672,7 +672,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_7 = aie.core(%tile1_7) {
     %lb = arith.constant 0 : index
@@ -705,7 +705,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_7 = aie.core(%tile2_7) {
     %lb = arith.constant 0 : index
@@ -728,7 +728,7 @@ module @hdiff_bundle_2 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_8 = aie.core(%tile0_8) {
     %lb = arith.constant 0 : index
@@ -755,7 +755,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_8 = aie.core(%tile1_8) {
     %lb = arith.constant 0 : index
@@ -788,7 +788,7 @@ module @hdiff_bundle_2 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_8 = aie.core(%tile2_8) {
     %lb = arith.constant 0 : index
@@ -811,6 +811,6 @@ module @hdiff_bundle_2 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
 }
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_3.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_3.mlir
index 284bcf77522..cf422d16767 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_3.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_3.mlir
@@ -141,9 +141,9 @@ module @hdiff_bundle_3 {
   aie.objectfifo.register_external_buffers(%tile3_0, %block_2_buf_out_shim_3 : !aie.objectfifo<memref<256xi32>>, {%ext_buffer_out_2}) : (memref<2048xi32>)
 
 
-  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> ()
-  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> ()
-  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> ()
+  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> () attributes {link_with = "hdiff_lap.o"}
+  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> () attributes {link_with = "hdiff_flux1.o"}
+  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> () attributes {link_with = "hdiff_flux2.o"}
 
   %block_0_core0_1 = aie.core(%tile0_1) {
     %lb = arith.constant 0 : index
@@ -171,7 +171,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_1 = aie.core(%tile1_1) {
     %lb = arith.constant 0 : index
@@ -204,7 +204,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_1 = aie.core(%tile2_1) {
     %lb = arith.constant 0 : index
@@ -228,7 +228,7 @@ module @hdiff_bundle_3 {
     }
     aie.use_lock(%lock21_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_2 = aie.core(%tile0_2) {
     %lb = arith.constant 0 : index
@@ -255,7 +255,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_2 = aie.core(%tile1_2) {
     %lb = arith.constant 0 : index
@@ -288,7 +288,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_0_core2_2 = aie.core(%tile2_2) {
@@ -332,7 +332,7 @@ module @hdiff_bundle_3 {
       aie.objectfifo.release<Produce>(%block_0_buf_out_shim_2:!aie.objectfifo<memref<256xi32>>, 4)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_3 = aie.core(%tile0_3) {
     %lb = arith.constant 0 : index
@@ -359,7 +359,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_3 = aie.core(%tile1_3) {
     %lb = arith.constant 0 : index
@@ -392,7 +392,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_3 = aie.core(%tile2_3) {
     %lb = arith.constant 0 : index
@@ -415,7 +415,7 @@ module @hdiff_bundle_3 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_4 = aie.core(%tile0_4) {
     %lb = arith.constant 0 : index
@@ -442,7 +442,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_4 = aie.core(%tile1_4) {
     %lb = arith.constant 0 : index
@@ -475,7 +475,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_4 = aie.core(%tile2_4) {
     %lb = arith.constant 0 : index
@@ -498,7 +498,7 @@ module @hdiff_bundle_3 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_5 = aie.core(%tile0_5) {
     %lb = arith.constant 0 : index
@@ -526,7 +526,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_5 = aie.core(%tile1_5) {
     %lb = arith.constant 0 : index
@@ -559,7 +559,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_5 = aie.core(%tile2_5) {
     %lb = arith.constant 0 : index
@@ -583,7 +583,7 @@ module @hdiff_bundle_3 {
     }
     aie.use_lock(%lock25_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_6 = aie.core(%tile0_6) {
     %lb = arith.constant 0 : index
@@ -610,7 +610,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_6 = aie.core(%tile1_6) {
     %lb = arith.constant 0 : index
@@ -643,7 +643,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_1_core2_6 = aie.core(%tile2_6) {
@@ -687,7 +687,7 @@ module @hdiff_bundle_3 {
       aie.objectfifo.release<Produce>(%block_1_buf_out_shim_2:!aie.objectfifo<memref<256xi32>>, 4)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_7 = aie.core(%tile0_7) {
     %lb = arith.constant 0 : index
@@ -714,7 +714,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_7 = aie.core(%tile1_7) {
     %lb = arith.constant 0 : index
@@ -747,7 +747,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_7 = aie.core(%tile2_7) {
     %lb = arith.constant 0 : index
@@ -770,7 +770,7 @@ module @hdiff_bundle_3 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_8 = aie.core(%tile0_8) {
     %lb = arith.constant 0 : index
@@ -797,7 +797,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_8 = aie.core(%tile1_8) {
     %lb = arith.constant 0 : index
@@ -830,7 +830,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_8 = aie.core(%tile2_8) {
     %lb = arith.constant 0 : index
@@ -853,7 +853,7 @@ module @hdiff_bundle_3 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_1 = aie.core(%tile3_1) {
     %lb = arith.constant 0 : index
@@ -881,7 +881,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_1 = aie.core(%tile4_1) {
     %lb = arith.constant 0 : index
@@ -914,7 +914,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_2_core5_1 = aie.core(%tile5_1) {
     %lb = arith.constant 0 : index
@@ -938,7 +938,7 @@ module @hdiff_bundle_3 {
     }
     aie.use_lock(%lock51_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_2 = aie.core(%tile3_2) {
     %lb = arith.constant 0 : index
@@ -965,7 +965,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_2 = aie.core(%tile4_2) {
     %lb = arith.constant 0 : index
@@ -998,7 +998,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_2_core5_2 = aie.core(%tile5_2) {
@@ -1042,7 +1042,7 @@ module @hdiff_bundle_3 {
       aie.objectfifo.release<Produce>(%block_2_buf_out_shim_3:!aie.objectfifo<memref<256xi32>>, 4)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_3 = aie.core(%tile3_3) {
     %lb = arith.constant 0 : index
@@ -1069,7 +1069,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_3 = aie.core(%tile4_3) {
     %lb = arith.constant 0 : index
@@ -1102,7 +1102,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_2_core5_3 = aie.core(%tile5_3) {
     %lb = arith.constant 0 : index
@@ -1125,7 +1125,7 @@ module @hdiff_bundle_3 {
       aie.objectfifo.release<Produce>(%block_2_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_4 = aie.core(%tile3_4) {
     %lb = arith.constant 0 : index
@@ -1152,7 +1152,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_4 = aie.core(%tile4_4) {
     %lb = arith.constant 0 : index
@@ -1185,7 +1185,7 @@ module @hdiff_bundle_3 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_2_core5_4 = aie.core(%tile5_4) {
     %lb = arith.constant 0 : index
@@ -1208,6 +1208,6 @@ module @hdiff_bundle_3 {
       aie.objectfifo.release<Produce>(%block_2_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
 }
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_32.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_32.mlir
index 7db60b8d6ae..bea715cd9e2 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_32.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_32.mlir
@@ -1358,9 +1358,9 @@ module @hdiff_bundle_32 {
   aie.objectfifo.register_external_buffers(%tile47_0, %block_31_buf_out_shim_47 : !aie.objectfifo<memref<256xi32>>, {%ext_buffer_out_31}) : (memref<2048xi32>)
 
 
-  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> ()
-  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> ()
-  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> ()
+  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> () attributes {link_with = "hdiff_lap.o"}
+  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> () attributes {link_with = "hdiff_flux1.o"}
+  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> () attributes {link_with = "hdiff_flux2.o"}
 
   %block_0_core0_1 = aie.core(%tile0_1) {
     %lb = arith.constant 0 : index
@@ -1387,7 +1387,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_1 = aie.core(%tile1_1) {
     %lb = arith.constant 0 : index
@@ -1420,7 +1420,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_1 = aie.core(%tile2_1) {
     %lb = arith.constant 0 : index
@@ -1443,7 +1443,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_2 = aie.core(%tile0_2) {
     %lb = arith.constant 0 : index
@@ -1471,7 +1471,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_2 = aie.core(%tile1_2) {
     %lb = arith.constant 0 : index
@@ -1504,7 +1504,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_0_core2_2 = aie.core(%tile2_2) {
@@ -1549,7 +1549,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock22_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_3 = aie.core(%tile0_3) {
     %lb = arith.constant 0 : index
@@ -1576,7 +1576,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_3 = aie.core(%tile1_3) {
     %lb = arith.constant 0 : index
@@ -1609,7 +1609,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_3 = aie.core(%tile2_3) {
     %lb = arith.constant 0 : index
@@ -1632,7 +1632,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_4 = aie.core(%tile0_4) {
     %lb = arith.constant 0 : index
@@ -1659,7 +1659,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_4 = aie.core(%tile1_4) {
     %lb = arith.constant 0 : index
@@ -1692,7 +1692,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_4 = aie.core(%tile2_4) {
     %lb = arith.constant 0 : index
@@ -1715,7 +1715,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_5 = aie.core(%tile0_5) {
     %lb = arith.constant 0 : index
@@ -1742,7 +1742,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_5 = aie.core(%tile1_5) {
     %lb = arith.constant 0 : index
@@ -1775,7 +1775,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_5 = aie.core(%tile2_5) {
     %lb = arith.constant 0 : index
@@ -1798,7 +1798,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_6 = aie.core(%tile0_6) {
     %lb = arith.constant 0 : index
@@ -1826,7 +1826,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_6 = aie.core(%tile1_6) {
     %lb = arith.constant 0 : index
@@ -1859,7 +1859,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_1_core2_6 = aie.core(%tile2_6) {
@@ -1904,7 +1904,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock26_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_7 = aie.core(%tile0_7) {
     %lb = arith.constant 0 : index
@@ -1931,7 +1931,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_7 = aie.core(%tile1_7) {
     %lb = arith.constant 0 : index
@@ -1964,7 +1964,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_7 = aie.core(%tile2_7) {
     %lb = arith.constant 0 : index
@@ -1987,7 +1987,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_8 = aie.core(%tile0_8) {
     %lb = arith.constant 0 : index
@@ -2014,7 +2014,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_8 = aie.core(%tile1_8) {
     %lb = arith.constant 0 : index
@@ -2047,7 +2047,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_8 = aie.core(%tile2_8) {
     %lb = arith.constant 0 : index
@@ -2070,7 +2070,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_1 = aie.core(%tile3_1) {
     %lb = arith.constant 0 : index
@@ -2097,7 +2097,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_1 = aie.core(%tile4_1) {
     %lb = arith.constant 0 : index
@@ -2130,7 +2130,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_2_core5_1 = aie.core(%tile5_1) {
     %lb = arith.constant 0 : index
@@ -2153,7 +2153,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_2_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_2 = aie.core(%tile3_2) {
     %lb = arith.constant 0 : index
@@ -2181,7 +2181,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_2 = aie.core(%tile4_2) {
     %lb = arith.constant 0 : index
@@ -2214,7 +2214,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_2_core5_2 = aie.core(%tile5_2) {
@@ -2259,7 +2259,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock52_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_3 = aie.core(%tile3_3) {
     %lb = arith.constant 0 : index
@@ -2286,7 +2286,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_3 = aie.core(%tile4_3) {
     %lb = arith.constant 0 : index
@@ -2319,7 +2319,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_2_core5_3 = aie.core(%tile5_3) {
     %lb = arith.constant 0 : index
@@ -2342,7 +2342,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_2_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_4 = aie.core(%tile3_4) {
     %lb = arith.constant 0 : index
@@ -2369,7 +2369,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_4 = aie.core(%tile4_4) {
     %lb = arith.constant 0 : index
@@ -2402,7 +2402,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_2_core5_4 = aie.core(%tile5_4) {
     %lb = arith.constant 0 : index
@@ -2425,7 +2425,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_2_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_5 = aie.core(%tile3_5) {
     %lb = arith.constant 0 : index
@@ -2452,7 +2452,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_5 = aie.core(%tile4_5) {
     %lb = arith.constant 0 : index
@@ -2485,7 +2485,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_3_core5_5 = aie.core(%tile5_5) {
     %lb = arith.constant 0 : index
@@ -2508,7 +2508,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_3_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_6 = aie.core(%tile3_6) {
     %lb = arith.constant 0 : index
@@ -2536,7 +2536,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_6 = aie.core(%tile4_6) {
     %lb = arith.constant 0 : index
@@ -2569,7 +2569,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_3_core5_6 = aie.core(%tile5_6) {
@@ -2614,7 +2614,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock56_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_7 = aie.core(%tile3_7) {
     %lb = arith.constant 0 : index
@@ -2641,7 +2641,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_7 = aie.core(%tile4_7) {
     %lb = arith.constant 0 : index
@@ -2674,7 +2674,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_3_core5_7 = aie.core(%tile5_7) {
     %lb = arith.constant 0 : index
@@ -2697,7 +2697,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_3_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_8 = aie.core(%tile3_8) {
     %lb = arith.constant 0 : index
@@ -2724,7 +2724,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_8 = aie.core(%tile4_8) {
     %lb = arith.constant 0 : index
@@ -2757,7 +2757,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_3_core5_8 = aie.core(%tile5_8) {
     %lb = arith.constant 0 : index
@@ -2780,7 +2780,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_3_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_4_core6_1 = aie.core(%tile6_1) {
     %lb = arith.constant 0 : index
@@ -2807,7 +2807,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_4_core7_1 = aie.core(%tile7_1) {
     %lb = arith.constant 0 : index
@@ -2840,7 +2840,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_4_core8_1 = aie.core(%tile8_1) {
     %lb = arith.constant 0 : index
@@ -2863,7 +2863,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_4_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_4_core6_2 = aie.core(%tile6_2) {
     %lb = arith.constant 0 : index
@@ -2891,7 +2891,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_4_core7_2 = aie.core(%tile7_2) {
     %lb = arith.constant 0 : index
@@ -2924,7 +2924,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_4_core8_2 = aie.core(%tile8_2) {
@@ -2969,7 +2969,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock82_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_4_core6_3 = aie.core(%tile6_3) {
     %lb = arith.constant 0 : index
@@ -2996,7 +2996,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_4_core7_3 = aie.core(%tile7_3) {
     %lb = arith.constant 0 : index
@@ -3029,7 +3029,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_4_core8_3 = aie.core(%tile8_3) {
     %lb = arith.constant 0 : index
@@ -3052,7 +3052,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_4_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_4_core6_4 = aie.core(%tile6_4) {
     %lb = arith.constant 0 : index
@@ -3079,7 +3079,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_4_core7_4 = aie.core(%tile7_4) {
     %lb = arith.constant 0 : index
@@ -3112,7 +3112,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_4_core8_4 = aie.core(%tile8_4) {
     %lb = arith.constant 0 : index
@@ -3135,7 +3135,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_4_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_5_core6_5 = aie.core(%tile6_5) {
     %lb = arith.constant 0 : index
@@ -3162,7 +3162,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_5_core7_5 = aie.core(%tile7_5) {
     %lb = arith.constant 0 : index
@@ -3195,7 +3195,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_5_core8_5 = aie.core(%tile8_5) {
     %lb = arith.constant 0 : index
@@ -3218,7 +3218,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_5_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_5_core6_6 = aie.core(%tile6_6) {
     %lb = arith.constant 0 : index
@@ -3246,7 +3246,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_5_core7_6 = aie.core(%tile7_6) {
     %lb = arith.constant 0 : index
@@ -3279,7 +3279,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_5_core8_6 = aie.core(%tile8_6) {
@@ -3324,7 +3324,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock86_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_5_core6_7 = aie.core(%tile6_7) {
     %lb = arith.constant 0 : index
@@ -3351,7 +3351,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_5_core7_7 = aie.core(%tile7_7) {
     %lb = arith.constant 0 : index
@@ -3384,7 +3384,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_5_core8_7 = aie.core(%tile8_7) {
     %lb = arith.constant 0 : index
@@ -3407,7 +3407,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_5_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_5_core6_8 = aie.core(%tile6_8) {
     %lb = arith.constant 0 : index
@@ -3434,7 +3434,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_5_core7_8 = aie.core(%tile7_8) {
     %lb = arith.constant 0 : index
@@ -3467,7 +3467,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_5_core8_8 = aie.core(%tile8_8) {
     %lb = arith.constant 0 : index
@@ -3490,7 +3490,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_5_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_6_core9_1 = aie.core(%tile9_1) {
     %lb = arith.constant 0 : index
@@ -3517,7 +3517,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_6_core10_1 = aie.core(%tile10_1) {
     %lb = arith.constant 0 : index
@@ -3550,7 +3550,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_6_core11_1 = aie.core(%tile11_1) {
     %lb = arith.constant 0 : index
@@ -3573,7 +3573,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_6_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_6_core9_2 = aie.core(%tile9_2) {
     %lb = arith.constant 0 : index
@@ -3601,7 +3601,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_6_core10_2 = aie.core(%tile10_2) {
     %lb = arith.constant 0 : index
@@ -3634,7 +3634,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_6_core11_2 = aie.core(%tile11_2) {
@@ -3679,7 +3679,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock112_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_6_core9_3 = aie.core(%tile9_3) {
     %lb = arith.constant 0 : index
@@ -3706,7 +3706,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_6_core10_3 = aie.core(%tile10_3) {
     %lb = arith.constant 0 : index
@@ -3739,7 +3739,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_6_core11_3 = aie.core(%tile11_3) {
     %lb = arith.constant 0 : index
@@ -3762,7 +3762,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_6_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_6_core9_4 = aie.core(%tile9_4) {
     %lb = arith.constant 0 : index
@@ -3789,7 +3789,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_6_core10_4 = aie.core(%tile10_4) {
     %lb = arith.constant 0 : index
@@ -3822,7 +3822,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_6_core11_4 = aie.core(%tile11_4) {
     %lb = arith.constant 0 : index
@@ -3845,7 +3845,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_6_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_7_core9_5 = aie.core(%tile9_5) {
     %lb = arith.constant 0 : index
@@ -3872,7 +3872,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_7_core10_5 = aie.core(%tile10_5) {
     %lb = arith.constant 0 : index
@@ -3905,7 +3905,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_7_core11_5 = aie.core(%tile11_5) {
     %lb = arith.constant 0 : index
@@ -3928,7 +3928,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_7_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_7_core9_6 = aie.core(%tile9_6) {
     %lb = arith.constant 0 : index
@@ -3956,7 +3956,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_7_core10_6 = aie.core(%tile10_6) {
     %lb = arith.constant 0 : index
@@ -3989,7 +3989,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_7_core11_6 = aie.core(%tile11_6) {
@@ -4034,7 +4034,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock116_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_7_core9_7 = aie.core(%tile9_7) {
     %lb = arith.constant 0 : index
@@ -4061,7 +4061,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_7_core10_7 = aie.core(%tile10_7) {
     %lb = arith.constant 0 : index
@@ -4094,7 +4094,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_7_core11_7 = aie.core(%tile11_7) {
     %lb = arith.constant 0 : index
@@ -4117,7 +4117,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_7_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_7_core9_8 = aie.core(%tile9_8) {
     %lb = arith.constant 0 : index
@@ -4144,7 +4144,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_7_core10_8 = aie.core(%tile10_8) {
     %lb = arith.constant 0 : index
@@ -4177,7 +4177,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_7_core11_8 = aie.core(%tile11_8) {
     %lb = arith.constant 0 : index
@@ -4200,7 +4200,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_7_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_8_core12_1 = aie.core(%tile12_1) {
     %lb = arith.constant 0 : index
@@ -4227,7 +4227,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_8_core13_1 = aie.core(%tile13_1) {
     %lb = arith.constant 0 : index
@@ -4260,7 +4260,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_8_core14_1 = aie.core(%tile14_1) {
     %lb = arith.constant 0 : index
@@ -4283,7 +4283,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_8_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_8_core12_2 = aie.core(%tile12_2) {
     %lb = arith.constant 0 : index
@@ -4311,7 +4311,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_8_core13_2 = aie.core(%tile13_2) {
     %lb = arith.constant 0 : index
@@ -4344,7 +4344,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_8_core14_2 = aie.core(%tile14_2) {
@@ -4389,7 +4389,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock142_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_8_core12_3 = aie.core(%tile12_3) {
     %lb = arith.constant 0 : index
@@ -4416,7 +4416,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_8_core13_3 = aie.core(%tile13_3) {
     %lb = arith.constant 0 : index
@@ -4449,7 +4449,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_8_core14_3 = aie.core(%tile14_3) {
     %lb = arith.constant 0 : index
@@ -4472,7 +4472,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_8_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_8_core12_4 = aie.core(%tile12_4) {
     %lb = arith.constant 0 : index
@@ -4499,7 +4499,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_8_core13_4 = aie.core(%tile13_4) {
     %lb = arith.constant 0 : index
@@ -4532,7 +4532,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_8_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_8_core14_4 = aie.core(%tile14_4) {
     %lb = arith.constant 0 : index
@@ -4555,7 +4555,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_8_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_9_core12_5 = aie.core(%tile12_5) {
     %lb = arith.constant 0 : index
@@ -4582,7 +4582,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_9_core13_5 = aie.core(%tile13_5) {
     %lb = arith.constant 0 : index
@@ -4615,7 +4615,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_9_core14_5 = aie.core(%tile14_5) {
     %lb = arith.constant 0 : index
@@ -4638,7 +4638,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_9_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_9_core12_6 = aie.core(%tile12_6) {
     %lb = arith.constant 0 : index
@@ -4666,7 +4666,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_9_core13_6 = aie.core(%tile13_6) {
     %lb = arith.constant 0 : index
@@ -4699,7 +4699,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_9_core14_6 = aie.core(%tile14_6) {
@@ -4744,7 +4744,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock146_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_9_core12_7 = aie.core(%tile12_7) {
     %lb = arith.constant 0 : index
@@ -4771,7 +4771,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_9_core13_7 = aie.core(%tile13_7) {
     %lb = arith.constant 0 : index
@@ -4804,7 +4804,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_9_core14_7 = aie.core(%tile14_7) {
     %lb = arith.constant 0 : index
@@ -4827,7 +4827,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_9_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_9_core12_8 = aie.core(%tile12_8) {
     %lb = arith.constant 0 : index
@@ -4854,7 +4854,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_9_core13_8 = aie.core(%tile13_8) {
     %lb = arith.constant 0 : index
@@ -4887,7 +4887,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_9_buf_in_shim_10: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_9_core14_8 = aie.core(%tile14_8) {
     %lb = arith.constant 0 : index
@@ -4910,7 +4910,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_9_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_10_core15_1 = aie.core(%tile15_1) {
     %lb = arith.constant 0 : index
@@ -4937,7 +4937,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_10_core16_1 = aie.core(%tile16_1) {
     %lb = arith.constant 0 : index
@@ -4970,7 +4970,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_10_core17_1 = aie.core(%tile17_1) {
     %lb = arith.constant 0 : index
@@ -4993,7 +4993,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_10_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_10_core15_2 = aie.core(%tile15_2) {
     %lb = arith.constant 0 : index
@@ -5021,7 +5021,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_10_core16_2 = aie.core(%tile16_2) {
     %lb = arith.constant 0 : index
@@ -5054,7 +5054,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_10_core17_2 = aie.core(%tile17_2) {
@@ -5099,7 +5099,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock172_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_10_core15_3 = aie.core(%tile15_3) {
     %lb = arith.constant 0 : index
@@ -5126,7 +5126,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_10_core16_3 = aie.core(%tile16_3) {
     %lb = arith.constant 0 : index
@@ -5159,7 +5159,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_10_core17_3 = aie.core(%tile17_3) {
     %lb = arith.constant 0 : index
@@ -5182,7 +5182,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_10_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_10_core15_4 = aie.core(%tile15_4) {
     %lb = arith.constant 0 : index
@@ -5209,7 +5209,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_10_core16_4 = aie.core(%tile16_4) {
     %lb = arith.constant 0 : index
@@ -5242,7 +5242,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_10_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_10_core17_4 = aie.core(%tile17_4) {
     %lb = arith.constant 0 : index
@@ -5265,7 +5265,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_10_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_11_core15_5 = aie.core(%tile15_5) {
     %lb = arith.constant 0 : index
@@ -5292,7 +5292,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_11_core16_5 = aie.core(%tile16_5) {
     %lb = arith.constant 0 : index
@@ -5325,7 +5325,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_11_core17_5 = aie.core(%tile17_5) {
     %lb = arith.constant 0 : index
@@ -5348,7 +5348,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_11_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_11_core15_6 = aie.core(%tile15_6) {
     %lb = arith.constant 0 : index
@@ -5376,7 +5376,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_11_core16_6 = aie.core(%tile16_6) {
     %lb = arith.constant 0 : index
@@ -5409,7 +5409,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_11_core17_6 = aie.core(%tile17_6) {
@@ -5454,7 +5454,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock176_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_11_core15_7 = aie.core(%tile15_7) {
     %lb = arith.constant 0 : index
@@ -5481,7 +5481,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_11_core16_7 = aie.core(%tile16_7) {
     %lb = arith.constant 0 : index
@@ -5514,7 +5514,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_11_core17_7 = aie.core(%tile17_7) {
     %lb = arith.constant 0 : index
@@ -5537,7 +5537,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_11_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_11_core15_8 = aie.core(%tile15_8) {
     %lb = arith.constant 0 : index
@@ -5564,7 +5564,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_11_core16_8 = aie.core(%tile16_8) {
     %lb = arith.constant 0 : index
@@ -5597,7 +5597,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_11_buf_in_shim_11: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_11_core17_8 = aie.core(%tile17_8) {
     %lb = arith.constant 0 : index
@@ -5620,7 +5620,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_11_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_12_core18_1 = aie.core(%tile18_1) {
     %lb = arith.constant 0 : index
@@ -5647,7 +5647,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_12_core19_1 = aie.core(%tile19_1) {
     %lb = arith.constant 0 : index
@@ -5680,7 +5680,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_12_core20_1 = aie.core(%tile20_1) {
     %lb = arith.constant 0 : index
@@ -5703,7 +5703,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_12_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_12_core18_2 = aie.core(%tile18_2) {
     %lb = arith.constant 0 : index
@@ -5731,7 +5731,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_12_core19_2 = aie.core(%tile19_2) {
     %lb = arith.constant 0 : index
@@ -5764,7 +5764,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_12_core20_2 = aie.core(%tile20_2) {
@@ -5809,7 +5809,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock202_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_12_core18_3 = aie.core(%tile18_3) {
     %lb = arith.constant 0 : index
@@ -5836,7 +5836,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_12_core19_3 = aie.core(%tile19_3) {
     %lb = arith.constant 0 : index
@@ -5869,7 +5869,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_12_core20_3 = aie.core(%tile20_3) {
     %lb = arith.constant 0 : index
@@ -5892,7 +5892,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_12_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_12_core18_4 = aie.core(%tile18_4) {
     %lb = arith.constant 0 : index
@@ -5919,7 +5919,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_12_core19_4 = aie.core(%tile19_4) {
     %lb = arith.constant 0 : index
@@ -5952,7 +5952,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_12_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_12_core20_4 = aie.core(%tile20_4) {
     %lb = arith.constant 0 : index
@@ -5975,7 +5975,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_12_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_13_core18_5 = aie.core(%tile18_5) {
     %lb = arith.constant 0 : index
@@ -6002,7 +6002,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_13_core19_5 = aie.core(%tile19_5) {
     %lb = arith.constant 0 : index
@@ -6035,7 +6035,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_13_core20_5 = aie.core(%tile20_5) {
     %lb = arith.constant 0 : index
@@ -6058,7 +6058,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_13_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_13_core18_6 = aie.core(%tile18_6) {
     %lb = arith.constant 0 : index
@@ -6086,7 +6086,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_13_core19_6 = aie.core(%tile19_6) {
     %lb = arith.constant 0 : index
@@ -6119,7 +6119,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_13_core20_6 = aie.core(%tile20_6) {
@@ -6164,7 +6164,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock206_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_13_core18_7 = aie.core(%tile18_7) {
     %lb = arith.constant 0 : index
@@ -6191,7 +6191,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_13_core19_7 = aie.core(%tile19_7) {
     %lb = arith.constant 0 : index
@@ -6224,7 +6224,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_13_core20_7 = aie.core(%tile20_7) {
     %lb = arith.constant 0 : index
@@ -6247,7 +6247,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_13_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_13_core18_8 = aie.core(%tile18_8) {
     %lb = arith.constant 0 : index
@@ -6274,7 +6274,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_13_core19_8 = aie.core(%tile19_8) {
     %lb = arith.constant 0 : index
@@ -6307,7 +6307,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_13_buf_in_shim_18: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_13_core20_8 = aie.core(%tile20_8) {
     %lb = arith.constant 0 : index
@@ -6330,7 +6330,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_13_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_14_core21_1 = aie.core(%tile21_1) {
     %lb = arith.constant 0 : index
@@ -6357,7 +6357,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_14_core22_1 = aie.core(%tile22_1) {
     %lb = arith.constant 0 : index
@@ -6390,7 +6390,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_14_core23_1 = aie.core(%tile23_1) {
     %lb = arith.constant 0 : index
@@ -6413,7 +6413,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_14_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_14_core21_2 = aie.core(%tile21_2) {
     %lb = arith.constant 0 : index
@@ -6441,7 +6441,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_14_core22_2 = aie.core(%tile22_2) {
     %lb = arith.constant 0 : index
@@ -6474,7 +6474,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_14_core23_2 = aie.core(%tile23_2) {
@@ -6519,7 +6519,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock232_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_14_core21_3 = aie.core(%tile21_3) {
     %lb = arith.constant 0 : index
@@ -6546,7 +6546,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_14_core22_3 = aie.core(%tile22_3) {
     %lb = arith.constant 0 : index
@@ -6579,7 +6579,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_14_core23_3 = aie.core(%tile23_3) {
     %lb = arith.constant 0 : index
@@ -6602,7 +6602,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_14_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_14_core21_4 = aie.core(%tile21_4) {
     %lb = arith.constant 0 : index
@@ -6629,7 +6629,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_14_core22_4 = aie.core(%tile22_4) {
     %lb = arith.constant 0 : index
@@ -6662,7 +6662,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_14_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_14_core23_4 = aie.core(%tile23_4) {
     %lb = arith.constant 0 : index
@@ -6685,7 +6685,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_14_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_15_core21_5 = aie.core(%tile21_5) {
     %lb = arith.constant 0 : index
@@ -6712,7 +6712,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_15_core22_5 = aie.core(%tile22_5) {
     %lb = arith.constant 0 : index
@@ -6745,7 +6745,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_15_core23_5 = aie.core(%tile23_5) {
     %lb = arith.constant 0 : index
@@ -6768,7 +6768,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_15_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_15_core21_6 = aie.core(%tile21_6) {
     %lb = arith.constant 0 : index
@@ -6796,7 +6796,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_15_core22_6 = aie.core(%tile22_6) {
     %lb = arith.constant 0 : index
@@ -6829,7 +6829,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_15_core23_6 = aie.core(%tile23_6) {
@@ -6874,7 +6874,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock236_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_15_core21_7 = aie.core(%tile21_7) {
     %lb = arith.constant 0 : index
@@ -6901,7 +6901,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_15_core22_7 = aie.core(%tile22_7) {
     %lb = arith.constant 0 : index
@@ -6934,7 +6934,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_15_core23_7 = aie.core(%tile23_7) {
     %lb = arith.constant 0 : index
@@ -6957,7 +6957,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_15_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_15_core21_8 = aie.core(%tile21_8) {
     %lb = arith.constant 0 : index
@@ -6984,7 +6984,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_15_core22_8 = aie.core(%tile22_8) {
     %lb = arith.constant 0 : index
@@ -7017,7 +7017,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_15_buf_in_shim_19: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_15_core23_8 = aie.core(%tile23_8) {
     %lb = arith.constant 0 : index
@@ -7040,7 +7040,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_15_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_16_core24_1 = aie.core(%tile24_1) {
     %lb = arith.constant 0 : index
@@ -7067,7 +7067,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_16_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_16_core25_1 = aie.core(%tile25_1) {
     %lb = arith.constant 0 : index
@@ -7100,7 +7100,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_16_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_16_core26_1 = aie.core(%tile26_1) {
     %lb = arith.constant 0 : index
@@ -7123,7 +7123,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_16_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_16_core24_2 = aie.core(%tile24_2) {
     %lb = arith.constant 0 : index
@@ -7151,7 +7151,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_16_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_16_core25_2 = aie.core(%tile25_2) {
     %lb = arith.constant 0 : index
@@ -7184,7 +7184,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_16_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_16_core26_2 = aie.core(%tile26_2) {
@@ -7229,7 +7229,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock262_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_16_core24_3 = aie.core(%tile24_3) {
     %lb = arith.constant 0 : index
@@ -7256,7 +7256,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_16_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_16_core25_3 = aie.core(%tile25_3) {
     %lb = arith.constant 0 : index
@@ -7289,7 +7289,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_16_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_16_core26_3 = aie.core(%tile26_3) {
     %lb = arith.constant 0 : index
@@ -7312,7 +7312,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_16_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_16_core24_4 = aie.core(%tile24_4) {
     %lb = arith.constant 0 : index
@@ -7339,7 +7339,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_16_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_16_core25_4 = aie.core(%tile25_4) {
     %lb = arith.constant 0 : index
@@ -7372,7 +7372,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_16_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_16_core26_4 = aie.core(%tile26_4) {
     %lb = arith.constant 0 : index
@@ -7395,7 +7395,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_16_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_17_core24_5 = aie.core(%tile24_5) {
     %lb = arith.constant 0 : index
@@ -7422,7 +7422,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_17_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_17_core25_5 = aie.core(%tile25_5) {
     %lb = arith.constant 0 : index
@@ -7455,7 +7455,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_17_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_17_core26_5 = aie.core(%tile26_5) {
     %lb = arith.constant 0 : index
@@ -7478,7 +7478,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_17_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_17_core24_6 = aie.core(%tile24_6) {
     %lb = arith.constant 0 : index
@@ -7506,7 +7506,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_17_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_17_core25_6 = aie.core(%tile25_6) {
     %lb = arith.constant 0 : index
@@ -7539,7 +7539,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_17_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_17_core26_6 = aie.core(%tile26_6) {
@@ -7584,7 +7584,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock266_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_17_core24_7 = aie.core(%tile24_7) {
     %lb = arith.constant 0 : index
@@ -7611,7 +7611,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_17_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_17_core25_7 = aie.core(%tile25_7) {
     %lb = arith.constant 0 : index
@@ -7644,7 +7644,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_17_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_17_core26_7 = aie.core(%tile26_7) {
     %lb = arith.constant 0 : index
@@ -7667,7 +7667,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_17_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_17_core24_8 = aie.core(%tile24_8) {
     %lb = arith.constant 0 : index
@@ -7694,7 +7694,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_17_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_17_core25_8 = aie.core(%tile25_8) {
     %lb = arith.constant 0 : index
@@ -7727,7 +7727,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_17_buf_in_shim_26: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_17_core26_8 = aie.core(%tile26_8) {
     %lb = arith.constant 0 : index
@@ -7750,7 +7750,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_17_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_18_core27_1 = aie.core(%tile27_1) {
     %lb = arith.constant 0 : index
@@ -7777,7 +7777,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_18_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_18_core28_1 = aie.core(%tile28_1) {
     %lb = arith.constant 0 : index
@@ -7810,7 +7810,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_18_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_18_core29_1 = aie.core(%tile29_1) {
     %lb = arith.constant 0 : index
@@ -7833,7 +7833,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_18_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_18_core27_2 = aie.core(%tile27_2) {
     %lb = arith.constant 0 : index
@@ -7861,7 +7861,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_18_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_18_core28_2 = aie.core(%tile28_2) {
     %lb = arith.constant 0 : index
@@ -7894,7 +7894,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_18_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_18_core29_2 = aie.core(%tile29_2) {
@@ -7939,7 +7939,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock292_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_18_core27_3 = aie.core(%tile27_3) {
     %lb = arith.constant 0 : index
@@ -7966,7 +7966,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_18_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_18_core28_3 = aie.core(%tile28_3) {
     %lb = arith.constant 0 : index
@@ -7999,7 +7999,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_18_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_18_core29_3 = aie.core(%tile29_3) {
     %lb = arith.constant 0 : index
@@ -8022,7 +8022,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_18_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_18_core27_4 = aie.core(%tile27_4) {
     %lb = arith.constant 0 : index
@@ -8049,7 +8049,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_18_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_18_core28_4 = aie.core(%tile28_4) {
     %lb = arith.constant 0 : index
@@ -8082,7 +8082,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_18_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_18_core29_4 = aie.core(%tile29_4) {
     %lb = arith.constant 0 : index
@@ -8105,7 +8105,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_18_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_19_core27_5 = aie.core(%tile27_5) {
     %lb = arith.constant 0 : index
@@ -8132,7 +8132,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_19_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_19_core28_5 = aie.core(%tile28_5) {
     %lb = arith.constant 0 : index
@@ -8165,7 +8165,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_19_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_19_core29_5 = aie.core(%tile29_5) {
     %lb = arith.constant 0 : index
@@ -8188,7 +8188,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_19_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_19_core27_6 = aie.core(%tile27_6) {
     %lb = arith.constant 0 : index
@@ -8216,7 +8216,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_19_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_19_core28_6 = aie.core(%tile28_6) {
     %lb = arith.constant 0 : index
@@ -8249,7 +8249,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_19_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_19_core29_6 = aie.core(%tile29_6) {
@@ -8294,7 +8294,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock296_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_19_core27_7 = aie.core(%tile27_7) {
     %lb = arith.constant 0 : index
@@ -8321,7 +8321,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_19_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_19_core28_7 = aie.core(%tile28_7) {
     %lb = arith.constant 0 : index
@@ -8354,7 +8354,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_19_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_19_core29_7 = aie.core(%tile29_7) {
     %lb = arith.constant 0 : index
@@ -8377,7 +8377,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_19_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_19_core27_8 = aie.core(%tile27_8) {
     %lb = arith.constant 0 : index
@@ -8404,7 +8404,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_19_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_19_core28_8 = aie.core(%tile28_8) {
     %lb = arith.constant 0 : index
@@ -8437,7 +8437,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_19_buf_in_shim_27: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_19_core29_8 = aie.core(%tile29_8) {
     %lb = arith.constant 0 : index
@@ -8460,7 +8460,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_19_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_20_core30_1 = aie.core(%tile30_1) {
     %lb = arith.constant 0 : index
@@ -8487,7 +8487,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_20_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_20_core31_1 = aie.core(%tile31_1) {
     %lb = arith.constant 0 : index
@@ -8520,7 +8520,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_20_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_20_core32_1 = aie.core(%tile32_1) {
     %lb = arith.constant 0 : index
@@ -8543,7 +8543,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_20_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_20_core30_2 = aie.core(%tile30_2) {
     %lb = arith.constant 0 : index
@@ -8571,7 +8571,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_20_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_20_core31_2 = aie.core(%tile31_2) {
     %lb = arith.constant 0 : index
@@ -8604,7 +8604,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_20_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_20_core32_2 = aie.core(%tile32_2) {
@@ -8649,7 +8649,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock322_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_20_core30_3 = aie.core(%tile30_3) {
     %lb = arith.constant 0 : index
@@ -8676,7 +8676,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_20_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_20_core31_3 = aie.core(%tile31_3) {
     %lb = arith.constant 0 : index
@@ -8709,7 +8709,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_20_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_20_core32_3 = aie.core(%tile32_3) {
     %lb = arith.constant 0 : index
@@ -8732,7 +8732,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_20_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_20_core30_4 = aie.core(%tile30_4) {
     %lb = arith.constant 0 : index
@@ -8759,7 +8759,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_20_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_20_core31_4 = aie.core(%tile31_4) {
     %lb = arith.constant 0 : index
@@ -8792,7 +8792,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_20_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_20_core32_4 = aie.core(%tile32_4) {
     %lb = arith.constant 0 : index
@@ -8815,7 +8815,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_20_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_21_core30_5 = aie.core(%tile30_5) {
     %lb = arith.constant 0 : index
@@ -8842,7 +8842,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_21_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_21_core31_5 = aie.core(%tile31_5) {
     %lb = arith.constant 0 : index
@@ -8875,7 +8875,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_21_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_21_core32_5 = aie.core(%tile32_5) {
     %lb = arith.constant 0 : index
@@ -8898,7 +8898,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_21_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_21_core30_6 = aie.core(%tile30_6) {
     %lb = arith.constant 0 : index
@@ -8926,7 +8926,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_21_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_21_core31_6 = aie.core(%tile31_6) {
     %lb = arith.constant 0 : index
@@ -8959,7 +8959,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_21_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_21_core32_6 = aie.core(%tile32_6) {
@@ -9004,7 +9004,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock326_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_21_core30_7 = aie.core(%tile30_7) {
     %lb = arith.constant 0 : index
@@ -9031,7 +9031,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_21_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_21_core31_7 = aie.core(%tile31_7) {
     %lb = arith.constant 0 : index
@@ -9064,7 +9064,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_21_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_21_core32_7 = aie.core(%tile32_7) {
     %lb = arith.constant 0 : index
@@ -9087,7 +9087,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_21_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_21_core30_8 = aie.core(%tile30_8) {
     %lb = arith.constant 0 : index
@@ -9114,7 +9114,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_21_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_21_core31_8 = aie.core(%tile31_8) {
     %lb = arith.constant 0 : index
@@ -9147,7 +9147,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_21_buf_in_shim_34: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_21_core32_8 = aie.core(%tile32_8) {
     %lb = arith.constant 0 : index
@@ -9170,7 +9170,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_21_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_22_core33_1 = aie.core(%tile33_1) {
     %lb = arith.constant 0 : index
@@ -9197,7 +9197,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_22_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_22_core34_1 = aie.core(%tile34_1) {
     %lb = arith.constant 0 : index
@@ -9230,7 +9230,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_22_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_22_core35_1 = aie.core(%tile35_1) {
     %lb = arith.constant 0 : index
@@ -9253,7 +9253,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_22_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_22_core33_2 = aie.core(%tile33_2) {
     %lb = arith.constant 0 : index
@@ -9281,7 +9281,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_22_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_22_core34_2 = aie.core(%tile34_2) {
     %lb = arith.constant 0 : index
@@ -9314,7 +9314,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_22_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_22_core35_2 = aie.core(%tile35_2) {
@@ -9359,7 +9359,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock352_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_22_core33_3 = aie.core(%tile33_3) {
     %lb = arith.constant 0 : index
@@ -9386,7 +9386,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_22_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_22_core34_3 = aie.core(%tile34_3) {
     %lb = arith.constant 0 : index
@@ -9419,7 +9419,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_22_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_22_core35_3 = aie.core(%tile35_3) {
     %lb = arith.constant 0 : index
@@ -9442,7 +9442,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_22_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_22_core33_4 = aie.core(%tile33_4) {
     %lb = arith.constant 0 : index
@@ -9469,7 +9469,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_22_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_22_core34_4 = aie.core(%tile34_4) {
     %lb = arith.constant 0 : index
@@ -9502,7 +9502,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_22_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_22_core35_4 = aie.core(%tile35_4) {
     %lb = arith.constant 0 : index
@@ -9525,7 +9525,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_22_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_23_core33_5 = aie.core(%tile33_5) {
     %lb = arith.constant 0 : index
@@ -9552,7 +9552,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_23_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_23_core34_5 = aie.core(%tile34_5) {
     %lb = arith.constant 0 : index
@@ -9585,7 +9585,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_23_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_23_core35_5 = aie.core(%tile35_5) {
     %lb = arith.constant 0 : index
@@ -9608,7 +9608,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_23_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_23_core33_6 = aie.core(%tile33_6) {
     %lb = arith.constant 0 : index
@@ -9636,7 +9636,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_23_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_23_core34_6 = aie.core(%tile34_6) {
     %lb = arith.constant 0 : index
@@ -9669,7 +9669,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_23_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_23_core35_6 = aie.core(%tile35_6) {
@@ -9714,7 +9714,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock356_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_23_core33_7 = aie.core(%tile33_7) {
     %lb = arith.constant 0 : index
@@ -9741,7 +9741,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_23_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_23_core34_7 = aie.core(%tile34_7) {
     %lb = arith.constant 0 : index
@@ -9774,7 +9774,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_23_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_23_core35_7 = aie.core(%tile35_7) {
     %lb = arith.constant 0 : index
@@ -9797,7 +9797,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_23_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_23_core33_8 = aie.core(%tile33_8) {
     %lb = arith.constant 0 : index
@@ -9824,7 +9824,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_23_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_23_core34_8 = aie.core(%tile34_8) {
     %lb = arith.constant 0 : index
@@ -9857,7 +9857,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_23_buf_in_shim_35: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_23_core35_8 = aie.core(%tile35_8) {
     %lb = arith.constant 0 : index
@@ -9880,7 +9880,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_23_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_24_core36_1 = aie.core(%tile36_1) {
     %lb = arith.constant 0 : index
@@ -9907,7 +9907,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_24_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_24_core37_1 = aie.core(%tile37_1) {
     %lb = arith.constant 0 : index
@@ -9940,7 +9940,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_24_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_24_core38_1 = aie.core(%tile38_1) {
     %lb = arith.constant 0 : index
@@ -9963,7 +9963,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_24_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_24_core36_2 = aie.core(%tile36_2) {
     %lb = arith.constant 0 : index
@@ -9991,7 +9991,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_24_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_24_core37_2 = aie.core(%tile37_2) {
     %lb = arith.constant 0 : index
@@ -10024,7 +10024,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_24_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_24_core38_2 = aie.core(%tile38_2) {
@@ -10069,7 +10069,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock382_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_24_core36_3 = aie.core(%tile36_3) {
     %lb = arith.constant 0 : index
@@ -10096,7 +10096,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_24_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_24_core37_3 = aie.core(%tile37_3) {
     %lb = arith.constant 0 : index
@@ -10129,7 +10129,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_24_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_24_core38_3 = aie.core(%tile38_3) {
     %lb = arith.constant 0 : index
@@ -10152,7 +10152,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_24_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_24_core36_4 = aie.core(%tile36_4) {
     %lb = arith.constant 0 : index
@@ -10179,7 +10179,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_24_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_24_core37_4 = aie.core(%tile37_4) {
     %lb = arith.constant 0 : index
@@ -10212,7 +10212,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_24_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_24_core38_4 = aie.core(%tile38_4) {
     %lb = arith.constant 0 : index
@@ -10235,7 +10235,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_24_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_25_core36_5 = aie.core(%tile36_5) {
     %lb = arith.constant 0 : index
@@ -10262,7 +10262,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_25_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_25_core37_5 = aie.core(%tile37_5) {
     %lb = arith.constant 0 : index
@@ -10295,7 +10295,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_25_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_25_core38_5 = aie.core(%tile38_5) {
     %lb = arith.constant 0 : index
@@ -10318,7 +10318,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_25_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_25_core36_6 = aie.core(%tile36_6) {
     %lb = arith.constant 0 : index
@@ -10346,7 +10346,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_25_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_25_core37_6 = aie.core(%tile37_6) {
     %lb = arith.constant 0 : index
@@ -10379,7 +10379,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_25_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_25_core38_6 = aie.core(%tile38_6) {
@@ -10424,7 +10424,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock386_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_25_core36_7 = aie.core(%tile36_7) {
     %lb = arith.constant 0 : index
@@ -10451,7 +10451,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_25_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_25_core37_7 = aie.core(%tile37_7) {
     %lb = arith.constant 0 : index
@@ -10484,7 +10484,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_25_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_25_core38_7 = aie.core(%tile38_7) {
     %lb = arith.constant 0 : index
@@ -10507,7 +10507,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_25_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_25_core36_8 = aie.core(%tile36_8) {
     %lb = arith.constant 0 : index
@@ -10534,7 +10534,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_25_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_25_core37_8 = aie.core(%tile37_8) {
     %lb = arith.constant 0 : index
@@ -10567,7 +10567,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_25_buf_in_shim_42: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_25_core38_8 = aie.core(%tile38_8) {
     %lb = arith.constant 0 : index
@@ -10590,7 +10590,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_25_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_26_core39_1 = aie.core(%tile39_1) {
     %lb = arith.constant 0 : index
@@ -10617,7 +10617,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_26_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_26_core40_1 = aie.core(%tile40_1) {
     %lb = arith.constant 0 : index
@@ -10650,7 +10650,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_26_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_26_core41_1 = aie.core(%tile41_1) {
     %lb = arith.constant 0 : index
@@ -10673,7 +10673,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_26_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_26_core39_2 = aie.core(%tile39_2) {
     %lb = arith.constant 0 : index
@@ -10701,7 +10701,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_26_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_26_core40_2 = aie.core(%tile40_2) {
     %lb = arith.constant 0 : index
@@ -10734,7 +10734,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_26_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_26_core41_2 = aie.core(%tile41_2) {
@@ -10779,7 +10779,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock412_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_26_core39_3 = aie.core(%tile39_3) {
     %lb = arith.constant 0 : index
@@ -10806,7 +10806,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_26_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_26_core40_3 = aie.core(%tile40_3) {
     %lb = arith.constant 0 : index
@@ -10839,7 +10839,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_26_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_26_core41_3 = aie.core(%tile41_3) {
     %lb = arith.constant 0 : index
@@ -10862,7 +10862,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_26_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_26_core39_4 = aie.core(%tile39_4) {
     %lb = arith.constant 0 : index
@@ -10889,7 +10889,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_26_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_26_core40_4 = aie.core(%tile40_4) {
     %lb = arith.constant 0 : index
@@ -10922,7 +10922,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_26_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_26_core41_4 = aie.core(%tile41_4) {
     %lb = arith.constant 0 : index
@@ -10945,7 +10945,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_26_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_27_core39_5 = aie.core(%tile39_5) {
     %lb = arith.constant 0 : index
@@ -10972,7 +10972,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_27_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_27_core40_5 = aie.core(%tile40_5) {
     %lb = arith.constant 0 : index
@@ -11005,7 +11005,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_27_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_27_core41_5 = aie.core(%tile41_5) {
     %lb = arith.constant 0 : index
@@ -11028,7 +11028,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_27_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_27_core39_6 = aie.core(%tile39_6) {
     %lb = arith.constant 0 : index
@@ -11056,7 +11056,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_27_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_27_core40_6 = aie.core(%tile40_6) {
     %lb = arith.constant 0 : index
@@ -11089,7 +11089,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_27_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_27_core41_6 = aie.core(%tile41_6) {
@@ -11134,7 +11134,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock416_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_27_core39_7 = aie.core(%tile39_7) {
     %lb = arith.constant 0 : index
@@ -11161,7 +11161,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_27_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_27_core40_7 = aie.core(%tile40_7) {
     %lb = arith.constant 0 : index
@@ -11194,7 +11194,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_27_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_27_core41_7 = aie.core(%tile41_7) {
     %lb = arith.constant 0 : index
@@ -11217,7 +11217,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_27_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_27_core39_8 = aie.core(%tile39_8) {
     %lb = arith.constant 0 : index
@@ -11244,7 +11244,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_27_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_27_core40_8 = aie.core(%tile40_8) {
     %lb = arith.constant 0 : index
@@ -11277,7 +11277,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_27_buf_in_shim_43: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_27_core41_8 = aie.core(%tile41_8) {
     %lb = arith.constant 0 : index
@@ -11300,7 +11300,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_27_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_28_core42_1 = aie.core(%tile42_1) {
     %lb = arith.constant 0 : index
@@ -11327,7 +11327,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_28_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_28_core43_1 = aie.core(%tile43_1) {
     %lb = arith.constant 0 : index
@@ -11360,7 +11360,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_28_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_28_core44_1 = aie.core(%tile44_1) {
     %lb = arith.constant 0 : index
@@ -11383,7 +11383,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_28_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_28_core42_2 = aie.core(%tile42_2) {
     %lb = arith.constant 0 : index
@@ -11411,7 +11411,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_28_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_28_core43_2 = aie.core(%tile43_2) {
     %lb = arith.constant 0 : index
@@ -11444,7 +11444,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_28_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_28_core44_2 = aie.core(%tile44_2) {
@@ -11489,7 +11489,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock442_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_28_core42_3 = aie.core(%tile42_3) {
     %lb = arith.constant 0 : index
@@ -11516,7 +11516,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_28_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_28_core43_3 = aie.core(%tile43_3) {
     %lb = arith.constant 0 : index
@@ -11549,7 +11549,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_28_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_28_core44_3 = aie.core(%tile44_3) {
     %lb = arith.constant 0 : index
@@ -11572,7 +11572,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_28_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_28_core42_4 = aie.core(%tile42_4) {
     %lb = arith.constant 0 : index
@@ -11599,7 +11599,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_28_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_28_core43_4 = aie.core(%tile43_4) {
     %lb = arith.constant 0 : index
@@ -11632,7 +11632,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_28_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_28_core44_4 = aie.core(%tile44_4) {
     %lb = arith.constant 0 : index
@@ -11655,7 +11655,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_28_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_29_core42_5 = aie.core(%tile42_5) {
     %lb = arith.constant 0 : index
@@ -11682,7 +11682,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_29_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_29_core43_5 = aie.core(%tile43_5) {
     %lb = arith.constant 0 : index
@@ -11715,7 +11715,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_29_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_29_core44_5 = aie.core(%tile44_5) {
     %lb = arith.constant 0 : index
@@ -11738,7 +11738,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_29_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_29_core42_6 = aie.core(%tile42_6) {
     %lb = arith.constant 0 : index
@@ -11766,7 +11766,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_29_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_29_core43_6 = aie.core(%tile43_6) {
     %lb = arith.constant 0 : index
@@ -11799,7 +11799,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_29_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_29_core44_6 = aie.core(%tile44_6) {
@@ -11844,7 +11844,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock446_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_29_core42_7 = aie.core(%tile42_7) {
     %lb = arith.constant 0 : index
@@ -11871,7 +11871,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_29_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_29_core43_7 = aie.core(%tile43_7) {
     %lb = arith.constant 0 : index
@@ -11904,7 +11904,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_29_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_29_core44_7 = aie.core(%tile44_7) {
     %lb = arith.constant 0 : index
@@ -11927,7 +11927,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_29_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_29_core42_8 = aie.core(%tile42_8) {
     %lb = arith.constant 0 : index
@@ -11954,7 +11954,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_29_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_29_core43_8 = aie.core(%tile43_8) {
     %lb = arith.constant 0 : index
@@ -11987,7 +11987,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_29_buf_in_shim_46: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_29_core44_8 = aie.core(%tile44_8) {
     %lb = arith.constant 0 : index
@@ -12010,7 +12010,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_29_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_30_core45_1 = aie.core(%tile45_1) {
     %lb = arith.constant 0 : index
@@ -12037,7 +12037,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_30_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_30_core46_1 = aie.core(%tile46_1) {
     %lb = arith.constant 0 : index
@@ -12070,7 +12070,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_30_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_30_core47_1 = aie.core(%tile47_1) {
     %lb = arith.constant 0 : index
@@ -12093,7 +12093,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_30_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_30_core45_2 = aie.core(%tile45_2) {
     %lb = arith.constant 0 : index
@@ -12121,7 +12121,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_30_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_30_core46_2 = aie.core(%tile46_2) {
     %lb = arith.constant 0 : index
@@ -12154,7 +12154,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_30_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_30_core47_2 = aie.core(%tile47_2) {
@@ -12199,7 +12199,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock472_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_30_core45_3 = aie.core(%tile45_3) {
     %lb = arith.constant 0 : index
@@ -12226,7 +12226,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_30_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_30_core46_3 = aie.core(%tile46_3) {
     %lb = arith.constant 0 : index
@@ -12259,7 +12259,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_30_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_30_core47_3 = aie.core(%tile47_3) {
     %lb = arith.constant 0 : index
@@ -12282,7 +12282,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_30_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_30_core45_4 = aie.core(%tile45_4) {
     %lb = arith.constant 0 : index
@@ -12309,7 +12309,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_30_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_30_core46_4 = aie.core(%tile46_4) {
     %lb = arith.constant 0 : index
@@ -12342,7 +12342,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_30_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_30_core47_4 = aie.core(%tile47_4) {
     %lb = arith.constant 0 : index
@@ -12365,7 +12365,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_30_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_31_core45_5 = aie.core(%tile45_5) {
     %lb = arith.constant 0 : index
@@ -12392,7 +12392,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_31_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_31_core46_5 = aie.core(%tile46_5) {
     %lb = arith.constant 0 : index
@@ -12425,7 +12425,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_31_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_31_core47_5 = aie.core(%tile47_5) {
     %lb = arith.constant 0 : index
@@ -12448,7 +12448,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_31_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_31_core45_6 = aie.core(%tile45_6) {
     %lb = arith.constant 0 : index
@@ -12476,7 +12476,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_31_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_31_core46_6 = aie.core(%tile46_6) {
     %lb = arith.constant 0 : index
@@ -12509,7 +12509,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_31_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_31_core47_6 = aie.core(%tile47_6) {
@@ -12554,7 +12554,7 @@ module @hdiff_bundle_32 {
     }
     aie.use_lock(%lock476_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_31_core45_7 = aie.core(%tile45_7) {
     %lb = arith.constant 0 : index
@@ -12581,7 +12581,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_31_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_31_core46_7 = aie.core(%tile46_7) {
     %lb = arith.constant 0 : index
@@ -12614,7 +12614,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_31_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_31_core47_7 = aie.core(%tile47_7) {
     %lb = arith.constant 0 : index
@@ -12637,7 +12637,7 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_31_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_31_core45_8 = aie.core(%tile45_8) {
     %lb = arith.constant 0 : index
@@ -12664,7 +12664,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_31_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_31_core46_8 = aie.core(%tile46_8) {
     %lb = arith.constant 0 : index
@@ -12697,7 +12697,7 @@ module @hdiff_bundle_32 {
     }
     aie.objectfifo.release<Consume>(%block_31_buf_in_shim_47: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_31_core47_8 = aie.core(%tile47_8) {
     %lb = arith.constant 0 : index
@@ -12720,6 +12720,6 @@ module @hdiff_bundle_32 {
       aie.objectfifo.release<Produce>(%block_31_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
 }
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_4.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_4.mlir
index b1c53088895..1dd0746754a 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_4.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_4.mlir
@@ -183,9 +183,9 @@ module @hdiff_bundle_4 {
   aie.objectfifo.register_external_buffers(%tile3_0, %block_3_buf_out_shim_3 : !aie.objectfifo<memref<256xi32>>, {%ext_buffer_out_3}) : (memref<2048xi32>)
 
 
-  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> ()
-  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> ()
-  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> ()
+  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> () attributes {link_with = "hdiff_lap.o"}
+  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> () attributes {link_with = "hdiff_flux1.o"}
+  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> () attributes {link_with = "hdiff_flux2.o"}
 
   %block_0_core0_1 = aie.core(%tile0_1) {
     %lb = arith.constant 0 : index
@@ -212,7 +212,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_1 = aie.core(%tile1_1) {
     %lb = arith.constant 0 : index
@@ -245,7 +245,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_1 = aie.core(%tile2_1) {
     %lb = arith.constant 0 : index
@@ -268,7 +268,7 @@ module @hdiff_bundle_4 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_2 = aie.core(%tile0_2) {
     %lb = arith.constant 0 : index
@@ -296,7 +296,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_2 = aie.core(%tile1_2) {
     %lb = arith.constant 0 : index
@@ -329,7 +329,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_0_core2_2 = aie.core(%tile2_2) {
@@ -374,7 +374,7 @@ module @hdiff_bundle_4 {
     }
     aie.use_lock(%lock22_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_3 = aie.core(%tile0_3) {
     %lb = arith.constant 0 : index
@@ -401,7 +401,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_3 = aie.core(%tile1_3) {
     %lb = arith.constant 0 : index
@@ -434,7 +434,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_3 = aie.core(%tile2_3) {
     %lb = arith.constant 0 : index
@@ -457,7 +457,7 @@ module @hdiff_bundle_4 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_4 = aie.core(%tile0_4) {
     %lb = arith.constant 0 : index
@@ -484,7 +484,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_4 = aie.core(%tile1_4) {
     %lb = arith.constant 0 : index
@@ -517,7 +517,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_4 = aie.core(%tile2_4) {
     %lb = arith.constant 0 : index
@@ -540,7 +540,7 @@ module @hdiff_bundle_4 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_5 = aie.core(%tile0_5) {
     %lb = arith.constant 0 : index
@@ -567,7 +567,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_5 = aie.core(%tile1_5) {
     %lb = arith.constant 0 : index
@@ -600,7 +600,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_5 = aie.core(%tile2_5) {
     %lb = arith.constant 0 : index
@@ -623,7 +623,7 @@ module @hdiff_bundle_4 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_6 = aie.core(%tile0_6) {
     %lb = arith.constant 0 : index
@@ -651,7 +651,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_6 = aie.core(%tile1_6) {
     %lb = arith.constant 0 : index
@@ -684,7 +684,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_1_core2_6 = aie.core(%tile2_6) {
@@ -729,7 +729,7 @@ module @hdiff_bundle_4 {
     }
     aie.use_lock(%lock26_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_7 = aie.core(%tile0_7) {
     %lb = arith.constant 0 : index
@@ -756,7 +756,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_7 = aie.core(%tile1_7) {
     %lb = arith.constant 0 : index
@@ -789,7 +789,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_7 = aie.core(%tile2_7) {
     %lb = arith.constant 0 : index
@@ -812,7 +812,7 @@ module @hdiff_bundle_4 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_8 = aie.core(%tile0_8) {
     %lb = arith.constant 0 : index
@@ -839,7 +839,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_8 = aie.core(%tile1_8) {
     %lb = arith.constant 0 : index
@@ -872,7 +872,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_8 = aie.core(%tile2_8) {
     %lb = arith.constant 0 : index
@@ -895,7 +895,7 @@ module @hdiff_bundle_4 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_1 = aie.core(%tile3_1) {
     %lb = arith.constant 0 : index
@@ -922,7 +922,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_1 = aie.core(%tile4_1) {
     %lb = arith.constant 0 : index
@@ -955,7 +955,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_2_core5_1 = aie.core(%tile5_1) {
     %lb = arith.constant 0 : index
@@ -978,7 +978,7 @@ module @hdiff_bundle_4 {
       aie.objectfifo.release<Produce>(%block_2_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_2 = aie.core(%tile3_2) {
     %lb = arith.constant 0 : index
@@ -1006,7 +1006,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_2 = aie.core(%tile4_2) {
     %lb = arith.constant 0 : index
@@ -1039,7 +1039,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_2_core5_2 = aie.core(%tile5_2) {
@@ -1084,7 +1084,7 @@ module @hdiff_bundle_4 {
     }
     aie.use_lock(%lock52_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_3 = aie.core(%tile3_3) {
     %lb = arith.constant 0 : index
@@ -1111,7 +1111,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_3 = aie.core(%tile4_3) {
     %lb = arith.constant 0 : index
@@ -1144,7 +1144,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_2_core5_3 = aie.core(%tile5_3) {
     %lb = arith.constant 0 : index
@@ -1167,7 +1167,7 @@ module @hdiff_bundle_4 {
       aie.objectfifo.release<Produce>(%block_2_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_4 = aie.core(%tile3_4) {
     %lb = arith.constant 0 : index
@@ -1194,7 +1194,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_4 = aie.core(%tile4_4) {
     %lb = arith.constant 0 : index
@@ -1227,7 +1227,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_2_core5_4 = aie.core(%tile5_4) {
     %lb = arith.constant 0 : index
@@ -1250,7 +1250,7 @@ module @hdiff_bundle_4 {
       aie.objectfifo.release<Produce>(%block_2_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_5 = aie.core(%tile3_5) {
     %lb = arith.constant 0 : index
@@ -1277,7 +1277,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_5 = aie.core(%tile4_5) {
     %lb = arith.constant 0 : index
@@ -1310,7 +1310,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_3_core5_5 = aie.core(%tile5_5) {
     %lb = arith.constant 0 : index
@@ -1333,7 +1333,7 @@ module @hdiff_bundle_4 {
       aie.objectfifo.release<Produce>(%block_3_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_6 = aie.core(%tile3_6) {
     %lb = arith.constant 0 : index
@@ -1361,7 +1361,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_6 = aie.core(%tile4_6) {
     %lb = arith.constant 0 : index
@@ -1394,7 +1394,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_3_core5_6 = aie.core(%tile5_6) {
@@ -1439,7 +1439,7 @@ module @hdiff_bundle_4 {
     }
     aie.use_lock(%lock56_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_7 = aie.core(%tile3_7) {
     %lb = arith.constant 0 : index
@@ -1466,7 +1466,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_7 = aie.core(%tile4_7) {
     %lb = arith.constant 0 : index
@@ -1499,7 +1499,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_3_core5_7 = aie.core(%tile5_7) {
     %lb = arith.constant 0 : index
@@ -1522,7 +1522,7 @@ module @hdiff_bundle_4 {
       aie.objectfifo.release<Produce>(%block_3_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_8 = aie.core(%tile3_8) {
     %lb = arith.constant 0 : index
@@ -1549,7 +1549,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_8 = aie.core(%tile4_8) {
     %lb = arith.constant 0 : index
@@ -1582,7 +1582,7 @@ module @hdiff_bundle_4 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_3_core5_8 = aie.core(%tile5_8) {
     %lb = arith.constant 0 : index
@@ -1605,6 +1605,6 @@ module @hdiff_bundle_4 {
       aie.objectfifo.release<Produce>(%block_3_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
 }
diff --git a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_8.mlir b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_8.mlir
index b0e1bbd3946..959f1923cb1 100644
--- a/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_8.mlir
+++ b/programming_examples/mlir/horizontal_diffusion/HDIFF_tri_AIE_objectFIFO_ping_pong_scaled/aie_8.mlir
@@ -351,9 +351,9 @@ module @hdiff_bundle_8 {
   aie.objectfifo.register_external_buffers(%tile7_0, %block_7_buf_out_shim_7 : !aie.objectfifo<memref<256xi32>>, {%ext_buffer_out_7}) : (memref<2048xi32>)
 
 
-  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> ()
-  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> ()
-  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> ()
+  func.func private @hdiff_lap(%AL: memref<256xi32>,%BL: memref<256xi32>, %CL:  memref<256xi32>, %DL: memref<256xi32>, %EL:  memref<256xi32>,  %OLL1: memref<256xi32>,  %OLL2: memref<256xi32>,  %OLL3: memref<256xi32>,  %OLL4: memref<256xi32>) -> () attributes {link_with = "hdiff_lap.o"}
+  func.func private @hdiff_flux1(%AF: memref<256xi32>,%BF: memref<256xi32>, %CF:  memref<256xi32>,   %OLF1: memref<256xi32>,  %OLF2: memref<256xi32>,  %OLF3: memref<256xi32>,  %OLF4: memref<256xi32>,  %OFI1: memref<512xi32>,  %OFI2: memref<512xi32>,  %OFI3: memref<512xi32>,  %OFI4: memref<512xi32>,  %OFI5: memref<512xi32>) -> () attributes {link_with = "hdiff_flux1.o"}
+  func.func private @hdiff_flux2( %Inter1: memref<512xi32>,%Inter2: memref<512xi32>, %Inter3: memref<512xi32>,%Inter4: memref<512xi32>,%Inter5: memref<512xi32>,  %Out: memref<256xi32>) -> () attributes {link_with = "hdiff_flux2.o"}
 
   %block_0_core0_1 = aie.core(%tile0_1) {
     %lb = arith.constant 0 : index
@@ -380,7 +380,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_1 = aie.core(%tile1_1) {
     %lb = arith.constant 0 : index
@@ -413,7 +413,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_1 = aie.core(%tile2_1) {
     %lb = arith.constant 0 : index
@@ -436,7 +436,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_2 = aie.core(%tile0_2) {
     %lb = arith.constant 0 : index
@@ -464,7 +464,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_2 = aie.core(%tile1_2) {
     %lb = arith.constant 0 : index
@@ -497,7 +497,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_0_core2_2 = aie.core(%tile2_2) {
@@ -542,7 +542,7 @@ module @hdiff_bundle_8 {
     }
     aie.use_lock(%lock22_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_3 = aie.core(%tile0_3) {
     %lb = arith.constant 0 : index
@@ -569,7 +569,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_3 = aie.core(%tile1_3) {
     %lb = arith.constant 0 : index
@@ -602,7 +602,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_3 = aie.core(%tile2_3) {
     %lb = arith.constant 0 : index
@@ -625,7 +625,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_0_core0_4 = aie.core(%tile0_4) {
     %lb = arith.constant 0 : index
@@ -652,7 +652,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_0_core1_4 = aie.core(%tile1_4) {
     %lb = arith.constant 0 : index
@@ -685,7 +685,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_0_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_0_core2_4 = aie.core(%tile2_4) {
     %lb = arith.constant 0 : index
@@ -708,7 +708,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_0_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_5 = aie.core(%tile0_5) {
     %lb = arith.constant 0 : index
@@ -735,7 +735,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_5 = aie.core(%tile1_5) {
     %lb = arith.constant 0 : index
@@ -768,7 +768,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_5 = aie.core(%tile2_5) {
     %lb = arith.constant 0 : index
@@ -791,7 +791,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_6 = aie.core(%tile0_6) {
     %lb = arith.constant 0 : index
@@ -819,7 +819,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_6 = aie.core(%tile1_6) {
     %lb = arith.constant 0 : index
@@ -852,7 +852,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_1_core2_6 = aie.core(%tile2_6) {
@@ -897,7 +897,7 @@ module @hdiff_bundle_8 {
     }
     aie.use_lock(%lock26_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_7 = aie.core(%tile0_7) {
     %lb = arith.constant 0 : index
@@ -924,7 +924,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_7 = aie.core(%tile1_7) {
     %lb = arith.constant 0 : index
@@ -957,7 +957,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_7 = aie.core(%tile2_7) {
     %lb = arith.constant 0 : index
@@ -980,7 +980,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_1_core0_8 = aie.core(%tile0_8) {
     %lb = arith.constant 0 : index
@@ -1007,7 +1007,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_1_core1_8 = aie.core(%tile1_8) {
     %lb = arith.constant 0 : index
@@ -1040,7 +1040,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_1_buf_in_shim_2: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_1_core2_8 = aie.core(%tile2_8) {
     %lb = arith.constant 0 : index
@@ -1063,7 +1063,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_1_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_1 = aie.core(%tile3_1) {
     %lb = arith.constant 0 : index
@@ -1090,7 +1090,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_1 = aie.core(%tile4_1) {
     %lb = arith.constant 0 : index
@@ -1123,7 +1123,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_2_core5_1 = aie.core(%tile5_1) {
     %lb = arith.constant 0 : index
@@ -1146,7 +1146,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_2_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_2 = aie.core(%tile3_2) {
     %lb = arith.constant 0 : index
@@ -1174,7 +1174,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_2 = aie.core(%tile4_2) {
     %lb = arith.constant 0 : index
@@ -1207,7 +1207,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_2_core5_2 = aie.core(%tile5_2) {
@@ -1252,7 +1252,7 @@ module @hdiff_bundle_8 {
     }
     aie.use_lock(%lock52_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_3 = aie.core(%tile3_3) {
     %lb = arith.constant 0 : index
@@ -1279,7 +1279,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_3 = aie.core(%tile4_3) {
     %lb = arith.constant 0 : index
@@ -1312,7 +1312,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_2_core5_3 = aie.core(%tile5_3) {
     %lb = arith.constant 0 : index
@@ -1335,7 +1335,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_2_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_2_core3_4 = aie.core(%tile3_4) {
     %lb = arith.constant 0 : index
@@ -1362,7 +1362,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_2_core4_4 = aie.core(%tile4_4) {
     %lb = arith.constant 0 : index
@@ -1395,7 +1395,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_2_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_2_core5_4 = aie.core(%tile5_4) {
     %lb = arith.constant 0 : index
@@ -1418,7 +1418,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_2_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_5 = aie.core(%tile3_5) {
     %lb = arith.constant 0 : index
@@ -1445,7 +1445,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_5 = aie.core(%tile4_5) {
     %lb = arith.constant 0 : index
@@ -1478,7 +1478,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_3_core5_5 = aie.core(%tile5_5) {
     %lb = arith.constant 0 : index
@@ -1501,7 +1501,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_3_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_6 = aie.core(%tile3_6) {
     %lb = arith.constant 0 : index
@@ -1529,7 +1529,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_6 = aie.core(%tile4_6) {
     %lb = arith.constant 0 : index
@@ -1562,7 +1562,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_3_core5_6 = aie.core(%tile5_6) {
@@ -1607,7 +1607,7 @@ module @hdiff_bundle_8 {
     }
     aie.use_lock(%lock56_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_7 = aie.core(%tile3_7) {
     %lb = arith.constant 0 : index
@@ -1634,7 +1634,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_7 = aie.core(%tile4_7) {
     %lb = arith.constant 0 : index
@@ -1667,7 +1667,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_3_core5_7 = aie.core(%tile5_7) {
     %lb = arith.constant 0 : index
@@ -1690,7 +1690,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_3_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_3_core3_8 = aie.core(%tile3_8) {
     %lb = arith.constant 0 : index
@@ -1717,7 +1717,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_3_core4_8 = aie.core(%tile4_8) {
     %lb = arith.constant 0 : index
@@ -1750,7 +1750,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_3_buf_in_shim_3: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_3_core5_8 = aie.core(%tile5_8) {
     %lb = arith.constant 0 : index
@@ -1773,7 +1773,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_3_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_4_core6_1 = aie.core(%tile6_1) {
     %lb = arith.constant 0 : index
@@ -1800,7 +1800,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_4_core7_1 = aie.core(%tile7_1) {
     %lb = arith.constant 0 : index
@@ -1833,7 +1833,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_4_core8_1 = aie.core(%tile8_1) {
     %lb = arith.constant 0 : index
@@ -1856,7 +1856,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_4_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_4_core6_2 = aie.core(%tile6_2) {
     %lb = arith.constant 0 : index
@@ -1884,7 +1884,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_4_core7_2 = aie.core(%tile7_2) {
     %lb = arith.constant 0 : index
@@ -1917,7 +1917,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_4_core8_2 = aie.core(%tile8_2) {
@@ -1962,7 +1962,7 @@ module @hdiff_bundle_8 {
     }
     aie.use_lock(%lock82_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_4_core6_3 = aie.core(%tile6_3) {
     %lb = arith.constant 0 : index
@@ -1989,7 +1989,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_4_core7_3 = aie.core(%tile7_3) {
     %lb = arith.constant 0 : index
@@ -2022,7 +2022,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_4_core8_3 = aie.core(%tile8_3) {
     %lb = arith.constant 0 : index
@@ -2045,7 +2045,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_4_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_4_core6_4 = aie.core(%tile6_4) {
     %lb = arith.constant 0 : index
@@ -2072,7 +2072,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_4_core7_4 = aie.core(%tile7_4) {
     %lb = arith.constant 0 : index
@@ -2105,7 +2105,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_4_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_4_core8_4 = aie.core(%tile8_4) {
     %lb = arith.constant 0 : index
@@ -2128,7 +2128,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_4_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_5_core6_5 = aie.core(%tile6_5) {
     %lb = arith.constant 0 : index
@@ -2155,7 +2155,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_5_core7_5 = aie.core(%tile7_5) {
     %lb = arith.constant 0 : index
@@ -2188,7 +2188,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_5_core8_5 = aie.core(%tile8_5) {
     %lb = arith.constant 0 : index
@@ -2211,7 +2211,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_5_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_5_core6_6 = aie.core(%tile6_6) {
     %lb = arith.constant 0 : index
@@ -2239,7 +2239,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_5_core7_6 = aie.core(%tile7_6) {
     %lb = arith.constant 0 : index
@@ -2272,7 +2272,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_5_core8_6 = aie.core(%tile8_6) {
@@ -2317,7 +2317,7 @@ module @hdiff_bundle_8 {
     }
     aie.use_lock(%lock86_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_5_core6_7 = aie.core(%tile6_7) {
     %lb = arith.constant 0 : index
@@ -2344,7 +2344,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_5_core7_7 = aie.core(%tile7_7) {
     %lb = arith.constant 0 : index
@@ -2377,7 +2377,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_5_core8_7 = aie.core(%tile8_7) {
     %lb = arith.constant 0 : index
@@ -2400,7 +2400,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_5_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_5_core6_8 = aie.core(%tile6_8) {
     %lb = arith.constant 0 : index
@@ -2427,7 +2427,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_5_core7_8 = aie.core(%tile7_8) {
     %lb = arith.constant 0 : index
@@ -2460,7 +2460,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_5_buf_in_shim_6: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_5_core8_8 = aie.core(%tile8_8) {
     %lb = arith.constant 0 : index
@@ -2483,7 +2483,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_5_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_6_core9_1 = aie.core(%tile9_1) {
     %lb = arith.constant 0 : index
@@ -2510,7 +2510,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_6_core10_1 = aie.core(%tile10_1) {
     %lb = arith.constant 0 : index
@@ -2543,7 +2543,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_6_core11_1 = aie.core(%tile11_1) {
     %lb = arith.constant 0 : index
@@ -2566,7 +2566,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_6_buf_row_1_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_6_core9_2 = aie.core(%tile9_2) {
     %lb = arith.constant 0 : index
@@ -2594,7 +2594,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_6_core10_2 = aie.core(%tile10_2) {
     %lb = arith.constant 0 : index
@@ -2627,7 +2627,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_6_core11_2 = aie.core(%tile11_2) {
@@ -2672,7 +2672,7 @@ module @hdiff_bundle_8 {
     }
     aie.use_lock(%lock112_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_6_core9_3 = aie.core(%tile9_3) {
     %lb = arith.constant 0 : index
@@ -2699,7 +2699,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_6_core10_3 = aie.core(%tile10_3) {
     %lb = arith.constant 0 : index
@@ -2732,7 +2732,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_6_core11_3 = aie.core(%tile11_3) {
     %lb = arith.constant 0 : index
@@ -2755,7 +2755,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_6_buf_row_3_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_6_core9_4 = aie.core(%tile9_4) {
     %lb = arith.constant 0 : index
@@ -2782,7 +2782,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_6_core10_4 = aie.core(%tile10_4) {
     %lb = arith.constant 0 : index
@@ -2815,7 +2815,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_6_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_6_core11_4 = aie.core(%tile11_4) {
     %lb = arith.constant 0 : index
@@ -2838,7 +2838,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_6_buf_row_4_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_7_core9_5 = aie.core(%tile9_5) {
     %lb = arith.constant 0 : index
@@ -2865,7 +2865,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_7_core10_5 = aie.core(%tile10_5) {
     %lb = arith.constant 0 : index
@@ -2898,7 +2898,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_7_core11_5 = aie.core(%tile11_5) {
     %lb = arith.constant 0 : index
@@ -2921,7 +2921,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_7_buf_row_5_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_7_core9_6 = aie.core(%tile9_6) {
     %lb = arith.constant 0 : index
@@ -2949,7 +2949,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_7_core10_6 = aie.core(%tile10_6) {
     %lb = arith.constant 0 : index
@@ -2982,7 +2982,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   // Gathering Tile
   %block_7_core11_6 = aie.core(%tile11_6) {
@@ -3027,7 +3027,7 @@ module @hdiff_bundle_8 {
     }
     aie.use_lock(%lock116_14, "Acquire", 0) // stop the timer
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_7_core9_7 = aie.core(%tile9_7) {
     %lb = arith.constant 0 : index
@@ -3054,7 +3054,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_7_core10_7 = aie.core(%tile10_7) {
     %lb = arith.constant 0 : index
@@ -3087,7 +3087,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_7_core11_7 = aie.core(%tile11_7) {
     %lb = arith.constant 0 : index
@@ -3110,7 +3110,7 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_7_buf_row_7_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
   %block_7_core9_8 = aie.core(%tile9_8) {
     %lb = arith.constant 0 : index
@@ -3137,7 +3137,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 4)
     aie.end
-  } { link_with="hdiff_lap.o" }
+  }
 
   %block_7_core10_8 = aie.core(%tile10_8) {
     %lb = arith.constant 0 : index
@@ -3170,7 +3170,7 @@ module @hdiff_bundle_8 {
     }
     aie.objectfifo.release<Consume>(%block_7_buf_in_shim_7: !aie.objectfifo<memref<256xi32>>, 7)
     aie.end
-  } { link_with="hdiff_flux1.o" }
+  }
 
   %block_7_core11_8 = aie.core(%tile11_8) {
     %lb = arith.constant 0 : index
@@ -3193,6 +3193,6 @@ module @hdiff_bundle_8 {
       aie.objectfifo.release<Produce>(%block_7_buf_row_8_out_flx2 :!aie.objectfifo<memref<256xi32>>, 1)
     }
     aie.end
-  } { link_with="hdiff_flux2.o" }
+  }
 
 }
diff --git a/programming_examples/mlir/idct/aie.mlir b/programming_examples/mlir/idct/aie.mlir
index d2fab524f83..392cc9edee0 100644
--- a/programming_examples/mlir/idct/aie.mlir
+++ b/programming_examples/mlir/idct/aie.mlir
@@ -59,9 +59,9 @@ module @idct {
   aie.flow(%t74, DMA : 1, %t75, DMA : 0)
   aie.flow(%t75, DMA : 1, %t70, DMA : 0)
 
-  func.func private @dequant_8x8(%A: memref<64xi16>, %B: memref<64xi16>) -> ()
-  func.func private @idct_8x8_mmult_h(%A: memref<64xi16>, %B: memref<64xi16>) -> ()
-  func.func private @idct_8x8_mmult_v(%A: memref<64xi16>, %B: memref<64xi16>) -> ()
+  func.func private @dequant_8x8(%A: memref<64xi16>, %B: memref<64xi16>) -> () attributes {link_with = "dequant.o"}
+  func.func private @idct_8x8_mmult_h(%A: memref<64xi16>, %B: memref<64xi16>) -> () attributes {link_with = "idct_horizontal.o"}
+  func.func private @idct_8x8_mmult_v(%A: memref<64xi16>, %B: memref<64xi16>) -> () attributes {link_with = "idct_vertical.o"}
 
   %c13 = aie.core(%t73) {
     %lb = arith.constant 0 : index
@@ -83,7 +83,7 @@ module @idct {
     }
 
     aie.end
-  } { link_with="dequant.o" }
+  }
 
   %c74 = aie.core(%t74) {
     %lb = arith.constant 0 : index
@@ -105,7 +105,7 @@ module @idct {
     }
 
     aie.end
-  } { link_with="idct_horizontal.o" }
+  }
   
     %c75 = aie.core(%t75) {
     %lb = arith.constant 0 : index
@@ -127,7 +127,7 @@ module @idct {
     }
 
     aie.end
-  } { link_with="idct_vertical.o" }
+  }
 
   // Tile DMA
   %m73 = aie.mem(%t73) {
diff --git a/programming_examples/mlir/idct/objectFifo_circuit_switched_version/aie.mlir b/programming_examples/mlir/idct/objectFifo_circuit_switched_version/aie.mlir
index 26bfa40c29e..0c7a55f6da0 100755
--- a/programming_examples/mlir/idct/objectFifo_circuit_switched_version/aie.mlir
+++ b/programming_examples/mlir/idct/objectFifo_circuit_switched_version/aie.mlir
@@ -34,9 +34,9 @@ module @idct {
   aie.objectfifo.register_external_buffers @of_in (%t70, {%buffer_in}) : (memref<512xi16>)
   aie.objectfifo.register_external_buffers @of_out (%t70, {%buffer_out}) : (memref<512xi16>)
 
-  func.func private @dequant_8x8(%A: memref<64xi16>, %B: memref<64xi16>) -> ()
-  func.func private @idct_8x8_mmult_h(%A: memref<64xi16>, %B: memref<64xi16>) -> ()
-  func.func private @idct_8x8_mmult_v(%A: memref<64xi16>, %B: memref<64xi16>) -> ()
+  func.func private @dequant_8x8(%A: memref<64xi16>, %B: memref<64xi16>) -> () attributes {link_with = "dequant.o"}
+  func.func private @idct_8x8_mmult_h(%A: memref<64xi16>, %B: memref<64xi16>) -> () attributes {link_with = "idct_horizontal.o"}
+  func.func private @idct_8x8_mmult_v(%A: memref<64xi16>, %B: memref<64xi16>) -> () attributes {link_with = "idct_vertical.o"}
   func.func private @pass(%A: memref<64xi16>, %B: memref<64xi16>) -> ()
 
   %c13 = aie.core(%t73) {
@@ -63,7 +63,7 @@ module @idct {
     }
 
     aie.end
-  } { link_with="dequant.o" }
+  }
 
   %c74 = aie.core(%t74) {
     %lb = arith.constant 0 : index
@@ -89,7 +89,7 @@ module @idct {
     }
 
     aie.end
-  } { link_with="idct_horizontal.o" }
+  }
 
   %c75 = aie.core(%t75) {
     %lb = arith.constant 0 : index
@@ -115,5 +115,5 @@ module @idct {
     }
 
     aie.end
-  } { link_with="idct_vertical.o" }
+  }
 }
diff --git a/programming_examples/vision/color_detect/color_detect_placed.py b/programming_examples/vision/color_detect/color_detect_placed.py
index d0654fbd985..0554ce78ad4 100644
--- a/programming_examples/vision/color_detect/color_detect_placed.py
+++ b/programming_examples/vision/color_detect/color_detect_placed.py
@@ -37,11 +37,14 @@ def deviceBody():
 
         # AIE Core Function declarations
         rgba2hueLine = external_func(
-            "rgba2hueLine", inputs=[line_bytes_ty, line_ty, np.int32]
+            "rgba2hueLine",
+            inputs=[line_bytes_ty, line_ty, np.int32],
+            link_with="rgba2hue.cc.o",
         )
         thresholdLine = external_func(
             "thresholdLine",
             inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8],
+            link_with="threshold.cc.o",
         )
         bitwiseORLine = external_func(
             "bitwiseORLine", inputs=[line_ty, line_ty, line_ty, np.int32]
@@ -94,7 +97,7 @@ def deviceBody():
         # Set up compute tiles
 
         # Compute tile 2
-        @core(ComputeTile2, "rgba2hue.cc.o")
+        @core(ComputeTile2)
         def coreBody():
             for _ in range_(sys.maxsize):
                 elemIn = inOF_L3L2.acquire(ObjectFifoPort.Consume, 1)
@@ -104,7 +107,7 @@ def coreBody():
                 OF_2to34.release(ObjectFifoPort.Produce, 1)
 
         # Compute tile 3
-        @core(ComputeTile3, "threshold.cc.o")
+        @core(ComputeTile3)
         def coreBody():
             thresholdValueUpper1 = 40
             thresholdValueLower1 = 30
@@ -138,7 +141,7 @@ def coreBody():
                 OF_3to5.release(ObjectFifoPort.Produce, 1)
 
         # Compute tile 4
-        @core(ComputeTile4, "threshold.cc.o")
+        @core(ComputeTile4)
         def coreBody():
             thresholdValueUpper1 = 160
             thresholdValueLower1 = 90
diff --git a/programming_examples/vision/color_threshold/color_threshold_placed.py b/programming_examples/vision/color_threshold/color_threshold_placed.py
index 77bb5bb2da8..4ebc50b6c07 100644
--- a/programming_examples/vision/color_threshold/color_threshold_placed.py
+++ b/programming_examples/vision/color_threshold/color_threshold_placed.py
@@ -29,6 +29,7 @@ def device_body():
         thresholdLine = external_func(
             "thresholdLine",
             inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8],
+            link_with="threshold.cc.o",
         )
 
         # Tile declarations
@@ -103,7 +104,7 @@ def device_body():
         # Set up compute tiles
 
         # Compute tile 2
-        @core(ComputeTile2, "threshold.cc.o")
+        @core(ComputeTile2)
         def core_body():
             for _ in range_(sys.maxsize):
                 # RTPs written from the instruction stream must be synchronized with the runtime sequence
@@ -129,7 +130,7 @@ def core_body():
                 outOOB_L1L2_0.release(ObjectFifoPort.Produce, 1)
 
         # Compute tile 3
-        @core(ComputeTile3, "threshold.cc.o")
+        @core(ComputeTile3)
         def core_body():
             for _ in range_(sys.maxsize):
                 # RTPs written from the instruction stream must be synchronized with the runtime sequence
@@ -155,7 +156,7 @@ def core_body():
                 outOOB_L1L2_1.release(ObjectFifoPort.Produce, 1)
 
         # Compute tile 4
-        @core(ComputeTile4, "threshold.cc.o")
+        @core(ComputeTile4)
         def core_body():
             for _ in range_(sys.maxsize):
                 # RTPs written from the instruction stream must be synchronized with the runtime sequence
@@ -181,7 +182,7 @@ def core_body():
                 outOOB_L1L2_2.release(ObjectFifoPort.Produce, 1)
 
         # Compute tile 5
-        @core(ComputeTile5, "threshold.cc.o")
+        @core(ComputeTile5)
         def core_body():
             for _ in range_(sys.maxsize):
                 # RTPs written from the instruction stream must be synchronized with the runtime sequence
diff --git a/programming_examples/vision/edge_detect/edge_detect_placed.py b/programming_examples/vision/edge_detect/edge_detect_placed.py
index 2dac6746da7..c493ca23a78 100644
--- a/programming_examples/vision/edge_detect/edge_detect_placed.py
+++ b/programming_examples/vision/edge_detect/edge_detect_placed.py
@@ -34,15 +34,19 @@ def device_body():
 
         # AIE Core Function declarations
         rgba2gray_line = external_func(
-            "rgba2grayLine", inputs=[line_bytes_ty, line_ty, np.int32]
+            "rgba2grayLine",
+            inputs=[line_bytes_ty, line_ty, np.int32],
+            link_with="rgba2gray.cc.o",
         )
         filter2d_line = external_func(
             "filter2dLine",
             inputs=[line_ty, line_ty, line_ty, line_ty, np.int32, tensor_3x3_ty],
+            link_with="filter2d.cc.o",
         )
         threshold_line = external_func(
             "thresholdLine",
             inputs=[line_ty, line_ty, np.int32, np.int16, np.int16, np.int8],
+            link_with="threshold.cc.o",
         )
         gray2rgba_line = external_func(
             "gray2rgbaLine", inputs=[line_ty, line_bytes_ty, np.int32]
@@ -136,7 +140,7 @@ def device_body():
         # Set up compute tiles
 
         # Compute tile 2
-        @core(ComputeTile2, "rgba2gray.cc.o")
+        @core(ComputeTile2)
         def core_body():
             for _ in range_(sys.maxsize):
                 elem_in = inOF_L3L2.acquire(ObjectFifoPort.Consume, 1)
@@ -148,7 +152,7 @@ def core_body():
                 OF_2to3.release(ObjectFifoPort.Produce, 1)
 
         # Compute tile 3
-        @core(ComputeTile3, "filter2d.cc.o")
+        @core(ComputeTile3)
         def core_body():
             v0 = 0
             v1 = 4096
@@ -207,7 +211,7 @@ def core_body():
                 OF_3to4.release(ObjectFifoPort.Produce, 1)
 
         # Compute tile 4
-        @core(ComputeTile4, "threshold.cc.o")
+        @core(ComputeTile4)
         def core_body():
             v_thr = 10
             v_max = 255
diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir
index af7a4f05e7e..8797d4688f3 100644
--- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir
+++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_1080.mlir
@@ -14,7 +14,7 @@ module @passThroughLine_aie2 {
 
  	aie.device(npu) {
         // declare kernel external kernel function 
-        func.func private @passThroughLine(%in: memref<1920xui8>, %out: memref<1920xui8>, %tilewidth: i32) -> ()
+        func.func private @passThroughLine(%in: memref<1920xui8>, %out: memref<1920xui8>, %tilewidth: i32) -> () attributes {link_with = "passThrough.cc.o"}
         
         // Declare tile object of the AIE class located at position col 1, row 4
         %tile00 = aie.tile(0, 0)
@@ -44,7 +44,7 @@ module @passThroughLine_aie2 {
                 aie.objectfifo.release @outOF(Produce, 1)
             }
             aie.end
-        } { link_with="passThrough.cc.o" } // indicate kernel object name used by this core
+        } // kernel object for this core is named by link_with on the @passThroughLine declaration
 
         aie.runtime_sequence(%in : memref<518400xi32>, %arg1 : memref<1xi32>, %out : memref<518400xi32>) {
             %c0 = arith.constant 0 : i64
diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir
index 12d2855713b..0a8bca4e01e 100644
--- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir
+++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_8k.mlir
@@ -14,7 +14,7 @@ module @passThroughLine_aie2 {
 
  	aie.device(npu) {
         // declare kernel external kernel function 
-        func.func private @passThroughLine(%in: memref<7680xui8>, %out: memref<7680xui8>, %tilewidth: i32) -> ()
+        func.func private @passThroughLine(%in: memref<7680xui8>, %out: memref<7680xui8>, %tilewidth: i32) -> () attributes {link_with = "passThrough.cc.o"}
         
         // Declare tile object of the AIE class located at position col 1, row 4
         %tile00 = aie.tile(0, 0)
@@ -44,7 +44,7 @@ module @passThroughLine_aie2 {
                 aie.objectfifo.release @outOF(Produce, 1)
             }
             aie.end
-        } { link_with="passThrough.cc.o" } // indicate kernel object name used by this core
+        } // kernel object for this core is named by link_with on the @passThroughLine declaration
 
         aie.runtime_sequence(%in : memref<2073600xi32>, %arg1 : memref<1xi32>, %out : memref<2073600xi32>) {
             %c0 = arith.constant 0 : i64
diff --git a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir
index d4929df35f7..7107263f8b6 100644
--- a/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir
+++ b/programming_examples/vision/vision_passthrough/aie2_lineBased_8b_tiny.mlir
@@ -14,7 +14,7 @@ module @passThroughLine_aie2 {
 
  	aie.device(npu) {
         // declare kernel external kernel function 
-        func.func private @passThroughLine(%in: memref<512xui8>, %out: memref<512xui8>, %tilewidth: i32) -> ()
+        func.func private @passThroughLine(%in: memref<512xui8>, %out: memref<512xui8>, %tilewidth: i32) -> () attributes {link_with = "passThrough.cc.o"}
         
         // Declare tile object of the AIE class located at position col 1, row 4
         %tile00 = aie.tile(0, 0)
@@ -44,7 +44,7 @@ module @passThroughLine_aie2 {
                 aie.objectfifo.release @outOF(Produce, 1)
             }
             aie.end
-        } { link_with="passThrough.cc.o" } // indicate kernel object name used by this core
+        } // kernel object for this core is named by link_with on the @passThroughLine declaration
 
         aie.runtime_sequence(%in : memref<1152xi32>, %arg1 : memref<1xi32>, %out : memref<1152xi32>) {
             %c0 = arith.constant 0 : i64
diff --git a/programming_examples/vision/vision_passthrough/vision_passthrough_placed.py b/programming_examples/vision/vision_passthrough/vision_passthrough_placed.py
index 681932c7f8b..5d72215d4fd 100644
--- a/programming_examples/vision/vision_passthrough/vision_passthrough_placed.py
+++ b/programming_examples/vision/vision_passthrough/vision_passthrough_placed.py
@@ -29,7 +29,9 @@ def device_body():
 
         # AIE Core Function declarations
         passThroughLine = external_func(
-            "passThroughLine", inputs=[line_ty, line_ty, np.int32]
+            "passThroughLine",
+            inputs=[line_ty, line_ty, np.int32],
+            link_with="passThrough.cc.o",
         )
 
         # Tile declarations
@@ -46,7 +48,7 @@ def device_body():
         # Set up compute tiles
 
         # Compute tile 2
-        @core(ComputeTile2, "passThrough.cc.o")
+        @core(ComputeTile2)
         def core_body():
             for _ in range_(sys.maxsize):
                 for _ in range_(height):
diff --git a/programming_guide/section-4/section-4b/aie2_placed.py b/programming_guide/section-4/section-4b/aie2_placed.py
index ff60baaab20..73b9d91c763 100644
--- a/programming_guide/section-4/section-4b/aie2_placed.py
+++ b/programming_guide/section-4/section-4b/aie2_placed.py
@@ -39,6 +39,7 @@ def device_body():
         scale_scalar = external_func(
             "vector_scalar_mul_aie_scalar",
             inputs=[tile_ty, tile_ty, scalar_ty, in2_dtype],
+            link_with="scale.o",
         )
 
         # Tile declarations
@@ -52,7 +53,7 @@ def device_body():
 
         # Set up compute tiles
         # Compute tile 2
-        @core(ComputeTile2, "scale.o")
+        @core(ComputeTile2)
         def core_body():
             # Effective while(1)
             for _ in range_(sys.maxsize):
diff --git a/test/Integration/julia_by_lines/aie.mlir b/test/Integration/julia_by_lines/aie.mlir
index 6695dfe1c95..72acf9d224b 100644
--- a/test/Integration/julia_by_lines/aie.mlir
+++ b/test/Integration/julia_by_lines/aie.mlir
@@ -21,8 +21,8 @@ module @test {
 
   %lock13_3 = aie.lock(%tile13, 3)
 
-  func.func private @func(%A: memref<32x32xi32>, %MinRe : f32, %MaxRe : f32, %MinIm : f32, %MaxIm : f32) -> ()
-  func.func private @do_line(%A: memref<32x32xi32>, %MinRe : f32, %StepRe : f32, %Im : f32, %cols : i32) -> ()
+  func.func private @func(%A: memref<32x32xi32>, %MinRe : f32, %MaxRe : f32, %MinIm : f32, %MaxIm : f32) -> () attributes {link_with = "kernel.o"}
+  func.func private @do_line(%A: memref<32x32xi32>, %MinRe : f32, %StepRe : f32, %Im : f32, %cols : i32) -> () attributes {link_with = "kernel.o"}
 
   %core13 = aie.core(%tile13) {
     %MinRe = arith.constant -1.5 : f32
@@ -51,5 +51,5 @@ module @test {
       scf.yield %Im_next : f32
     }
     aie.end
-  } { link_with="kernel.o" }
+  }
 }
diff --git a/test/npu-xrt/cascade_flows/aie.mlir b/test/npu-xrt/cascade_flows/aie.mlir
index 9fb6d3c6ddb..e89c78cd00b 100644
--- a/test/npu-xrt/cascade_flows/aie.mlir
+++ b/test/npu-xrt/cascade_flows/aie.mlir
@@ -27,10 +27,10 @@ module {
     aie.objectfifo @objFifo_out0(%t01, {%t00}, 1 : i32) : !aie.objectfifo<memref<64xi32>>
     aie.objectfifo.link [@objFifo_out1] -> [@objFifo_out0] ([] [])
 
-    func.func private @extern_kernel1() -> ()
-    func.func private @extern_kernel2() -> ()
-    func.func private @extern_kernel3(%b: memref<64xi32>, %size: i32) -> ()
-  
+    func.func private @extern_kernel1() -> () attributes {link_with = "kernel1.o"}
+    func.func private @extern_kernel2() -> () attributes {link_with = "kernel2.o"}
+    func.func private @extern_kernel3(%b: memref<64xi32>, %size: i32) -> () attributes {link_with = "kernel3.o"}
+
     %core02 = aie.core(%t03) {
       %subview0 = aie.objectfifo.acquire @objFifo_in1(Consume, 1) : !aie.objectfifosubview<memref<64xi32>>
 
@@ -39,17 +39,17 @@ module {
       aie.objectfifo.release @objFifo_in1(Consume, 1)
 
       aie.end
-    } { link_with="kernel1.o" }
+    }
 
     %core13 = aie.core(%t13) {
       func.call @extern_kernel2() : () -> ()
 
       aie.end
-    } { link_with="kernel2.o" }
+    }
 
     %core12 = aie.core(%t12) {
       %size = arith.constant 64 : i32
- 
+
       %subview1 = aie.objectfifo.acquire @objFifo_out1(Produce, 1) : !aie.objectfifosubview<memref<64xi32>>
       %elem1 = aie.objectfifo.subview.access %subview1[0] : !aie.objectfifosubview<memref<64xi32>> -> memref<64xi32>
 
@@ -57,7 +57,7 @@ module {
 
       aie.objectfifo.release @objFifo_out1(Produce, 1)
       aie.end
-    } { link_with="kernel3.o" }
+    }
 
     aie.runtime_sequence(%in : memref<64xi32>, %buf : memref<32xi32>, %out : memref<64xi32>) {
       %c0 = arith.constant 0 : i64
diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir
index c614012de9c..570692fe7f0 100644
--- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir
+++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_bufferx4.mlir
@@ -8,9 +8,9 @@
 module {
   aie.device(npu1) {
     // <trace>
-    func.func private @flush_trace()
-    func.func private @event_0()
-    func.func private @event_1()
+    func.func private @flush_trace() attributes {link_with = "mm.o"}
+    func.func private @event_0() attributes {link_with = "mm.o"}
+    func.func private @event_1() attributes {link_with = "mm.o"}
     // </trace>
     %tile_0_0 = aie.tile(0, 0)
     %tile_0_1 = aie.tile(0, 1)
@@ -150,7 +150,7 @@ module {
       func.call @flush_trace() : () -> ()
       // </trace>
       cf.br ^bb1
-    } {link_with = "mm.o"}
+    }
     %mem_1_2 = aie.mem(%tile_1_2) {
       %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1)
     ^bb1:  // 2 preds: ^bb0, ^bb1
@@ -243,7 +243,7 @@ module {
       func.call @flush_trace() : () -> ()
       // </trace>
       cf.br ^bb1
-    } {link_with = "mm.o"}
+    }
     %mem_2_2 = aie.mem(%tile_2_2) {
       %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1)
     ^bb1:  // 2 preds: ^bb0, ^bb1
@@ -336,7 +336,7 @@ module {
       func.call @flush_trace() : () -> ()
       // </trace>
       cf.br ^bb1
-    } {link_with = "mm.o"}
+    }
     %mem_3_2 = aie.mem(%tile_3_2) {
       %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1)
     ^bb1:  // 2 preds: ^bb0, ^bb1
@@ -430,7 +430,7 @@ module {
       func.call @flush_trace() : () -> ()
       // </trace>
       cf.br ^bb1
-    } {link_with = "mm.o"}
+    }
     aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0)
     aie.flow(%tile_0_0, DMA : 1, %tile_1_1, DMA : 0)
     aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir
index af5deeaf2e8..b7a0a33d43b 100644
--- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir
+++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_cascadex4.mlir
@@ -7,13 +7,13 @@
 
 module {
   aie.device(npu1) {
-    func.func private @matmul_scalar_put_4x1x4_4x4x4_i32_i32(memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>)
-    func.func private @matmul_scalar_put_get_4x1x4_4x4x4_i32_i32(memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>)
-    func.func private @matmul_scalar_get_4x1x4_4x4x4_i32_i32(memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>)
+    func.func private @matmul_scalar_put_4x1x4_4x4x4_i32_i32(memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) attributes {link_with = "mm.o"}
+    func.func private @matmul_scalar_put_get_4x1x4_4x4x4_i32_i32(memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) attributes {link_with = "mm.o"}
+    func.func private @matmul_scalar_get_4x1x4_4x4x4_i32_i32(memref<1x4x4x4xi32, 2 : i32>, memref<4x1x4x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) attributes {link_with = "mm.o"}
     // <trace>
-    func.func private @event_0()
-    func.func private @event_1()
-    func.func private @flush_trace()
+    func.func private @event_0() attributes {link_with = "mm.o"}
+    func.func private @event_1() attributes {link_with = "mm.o"}
+    func.func private @flush_trace() attributes {link_with = "mm.o"}
     // </trace>
     %tile_0_0 = aie.tile(0, 0)
     %tile_0_1 = aie.tile(0, 1)
@@ -116,7 +116,7 @@ module {
       func.call @flush_trace() : () -> ()
       // </trace>
       cf.br ^bb1
-    } {link_with = "mm.o"}
+    }
     %mem_1_2 = aie.mem(%tile_1_2) {
       %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1)
     ^bb1:  // 2 preds: ^bb0, ^bb1
@@ -167,7 +167,7 @@ module {
       func.call @flush_trace() : () -> ()
       // </trace>
       cf.br ^bb1
-    } {link_with = "mm.o"}
+    }
     %mem_2_2 = aie.mem(%tile_2_2) {
       %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb3, repeat_count = 1)
     ^bb1:  // 2 preds: ^bb0, ^bb1
@@ -218,7 +218,7 @@ module {
       func.call @flush_trace() : () -> ()
       // </trace>
       cf.br ^bb1
-    } {link_with = "mm.o"}
+    }
     %mem_3_2 = aie.mem(%tile_3_2) {
       %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1)
     ^bb1:  // 2 preds: ^bb0, ^bb1
@@ -278,7 +278,7 @@ module {
       func.call @flush_trace() : () -> ()
       // </trace>
       cf.br ^bb1
-    } {link_with = "mm.o"}
+    }
     aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0)
     aie.flow(%tile_0_0, DMA : 1, %tile_1_1, DMA : 0)
     aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir
index c2610065706..129529356d2 100644
--- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir
+++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx1.mlir
@@ -1,10 +1,10 @@
 module {
   aie.device(npu1) {
-    func.func private @matmul_scalar_4x2x4_4x8x4_i32_i32(memref<2x4x4x8xi32, 2 : i32>, memref<4x2x8x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>)
+    func.func private @matmul_scalar_4x2x4_4x8x4_i32_i32(memref<2x4x4x8xi32, 2 : i32>, memref<4x2x8x4xi32, 2 : i32>, memref<4x4x4x4xi32, 2 : i32>) attributes {link_with = "mm.o"}
     // <trace>
-    func.func private @event_0()
-    func.func private @event_1()
-    func.func private @flush_trace()
+    func.func private @event_0() attributes {link_with = "mm.o"}
+    func.func private @event_1() attributes {link_with = "mm.o"}
+    func.func private @flush_trace() attributes {link_with = "mm.o"}
     // </trace>
     %tile_0_0 = aie.tile(0, 0)
     %tile_0_1 = aie.tile(0, 1)
@@ -88,7 +88,7 @@ module {
       func.call @flush_trace() : () -> ()
       // </trace>
       cf.br ^bb1
-    } {link_with = "mm.o"}
+    }
     aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0)
     aie.flow(%tile_0_0, DMA : 1, %tile_1_1, DMA : 0)
     aie.flow(%tile_2_1, DMA : 0, %tile_0_0, DMA : 0)
diff --git a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir
index 1c3585ba28c..9fa6fa77d91 100644
--- a/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir
+++ b/test/npu-xrt/matrix_multiplication_using_cascade/aie_plainx4.mlir
@@ -1,10 +1,10 @@
 module {
   aie.device(npu1) {
-    func.func private @matmul_scalar_2x2x2_4x8x4_i32_i32(memref<2x2x4x8xi32, 2 : i32>, memref<2x2x8x4xi32, 2 : i32>, memref<2x2x4x4xi32, 2 : i32>)
+    func.func private @matmul_scalar_2x2x2_4x8x4_i32_i32(memref<2x2x4x8xi32, 2 : i32>, memref<2x2x8x4xi32, 2 : i32>, memref<2x2x4x4xi32, 2 : i32>) attributes {link_with = "mm.o"}
     // <trace>
-    func.func private @event_0()
-    func.func private @event_1()
-    func.func private @flush_trace()
+    func.func private @event_0() attributes {link_with = "mm.o"}
+    func.func private @event_1() attributes {link_with = "mm.o"}
+    func.func private @flush_trace() attributes {link_with = "mm.o"}
     // </trace>
     %tile_0_0 = aie.tile(0, 0)
     %tile_0_1 = aie.tile(0, 1)
@@ -123,7 +123,7 @@ module {
       func.call @flush_trace() : () -> ()
       // </trace>
       cf.br ^bb1
-    } {link_with = "mm.o"}
+    }
     %mem_1_2 = aie.mem(%tile_1_2) {
       %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1)
     ^bb1:  // 2 preds: ^bb0, ^bb1
@@ -183,7 +183,7 @@ module {
       func.call @flush_trace() : () -> ()
       // </trace>
       cf.br ^bb1
-    } {link_with = "mm.o"}
+    }
     %mem_2_2 = aie.mem(%tile_2_2) {
       %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1)
     ^bb1:  // 2 preds: ^bb0, ^bb1
@@ -243,7 +243,7 @@ module {
       func.call @flush_trace() : () -> ()
       // </trace>
       cf.br ^bb1
-    } {link_with = "mm.o"}
+    }
     %mem_3_2 = aie.mem(%tile_3_2) {
       %0 = aie.dma_start(S2MM, 0, ^bb1, ^bb5, repeat_count = 1)
     ^bb1:  // 2 preds: ^bb0, ^bb1
@@ -303,7 +303,7 @@ module {
       func.call @flush_trace() : () -> ()
       // </trace>
       cf.br ^bb1
-    } {link_with = "mm.o"}
+    }
     aie.flow(%tile_0_0, DMA : 0, %tile_0_1, DMA : 0)
     aie.flow(%tile_0_0, DMA : 1, %tile_1_1, DMA : 0)
     aie.flow(%tile_0_1, DMA : 0, %tile_0_2, DMA : 0)
diff --git a/test/npu-xrt/runtime_cumsum/aie.mlir b/test/npu-xrt/runtime_cumsum/aie.mlir
index 841def30859..fff697a574e 100644
--- a/test/npu-xrt/runtime_cumsum/aie.mlir
+++ b/test/npu-xrt/runtime_cumsum/aie.mlir
@@ -34,8 +34,8 @@ module {
     aie.device(npu1_1col) {
 
         // AIE Core Function declarations
-        func.func private @sum(memref<16xi32>, memref<16xi32>)
-        func.func private @zero(memref<16xi32>)
+        func.func private @sum(memref<16xi32>, memref<16xi32>) attributes {link_with = "sum.o"}
+        func.func private @zero(memref<16xi32>) attributes {link_with = "sum.o"}
 
         %shim_noc_tile_0_0 = aie.tile(0, 0)
         %mem_tile_0_1 = aie.tile(0, 1)
@@ -87,7 +87,7 @@ module {
                 aie.objectfifo.release @out(Produce, 1)
             }
             aie.end
-        } {link_with = "sum.o"}
+        }
 
         aie.runtime_sequence @sequence(%xy: memref<128xi32>) {
             aiex.npu.rtp_write(@rtp2, 0, 1)
diff --git a/test/npu-xrt/tile_mapped_read/aie.mlir b/test/npu-xrt/tile_mapped_read/aie.mlir
index 63aa6daaed9..50e32c02851 100644
--- a/test/npu-xrt/tile_mapped_read/aie.mlir
+++ b/test/npu-xrt/tile_mapped_read/aie.mlir
@@ -10,7 +10,7 @@
 
 module {
   aie.device(npu1_1col) {
-    func.func private @read_processor_bus(memref<8xi32>, i32, i32, i32)
+    func.func private @read_processor_bus(memref<8xi32>, i32, i32, i32) attributes {link_with = "kernel.o"}
     %t00 = aie.tile(0, 0)
     %t01 = aie.tile(0, 1)
     %t02 = aie.tile(0, 2)
@@ -51,7 +51,7 @@ module {
         aie.objectfifo.release @objFifo_out1(Produce, 1)
       }
       aie.end
-    } {link_with = "kernel.o"}
+    }
 
     aie.runtime_sequence(%in : memref<64xi32>, %out : memref<64xi32>) {
       %c0 = arith.constant 0 : i64
diff --git a/test/npu-xrt/two_col/aie.mlir b/test/npu-xrt/two_col/aie.mlir
index 500a0e61e7e..199ef36d3db 100644
--- a/test/npu-xrt/two_col/aie.mlir
+++ b/test/npu-xrt/two_col/aie.mlir
@@ -31,7 +31,7 @@ module {
     aie.objectfifo @objFifo_out3(%4, {%1}, 2 : i32) : !aie.objectfifo<memref<128xui8>>
     aie.objectfifo @objFifo_out4(%5, {%1}, 2 : i32) : !aie.objectfifo<memref<128xui8>>
     aie.objectfifo.link [@objFifo_out1, @objFifo_out2, @objFifo_out3, @objFifo_out4] -> [@objFifo_out0] ([0, 128, 256, 384] [])
-    func.func private @thresholdLine(%in: memref<128xui8>, %out: memref<128xui8>, %lineWidth: i32,  %thresholdValue: i32, %maxValue: i32, %thresholdType: i8) -> ()
+    func.func private @thresholdLine(%in: memref<128xui8>, %out: memref<128xui8>, %lineWidth: i32,  %thresholdValue: i32, %maxValue: i32, %thresholdType: i8) -> () attributes {link_with = "threshold.o"}
     %24 = aie.core(%2) {
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
@@ -55,7 +55,7 @@ module {
         aie.objectfifo.release @objFifo_out1(Produce, 1)
       }
       aie.end
-    } {link_with = "threshold.o"}
+    }
     %34 = aie.core(%3) {
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
@@ -79,7 +79,7 @@ module {
         aie.objectfifo.release @objFifo_out2(Produce, 1)
       }
       aie.end
-    } {link_with = "threshold.o"}
+    }
     %44 = aie.core(%4) {
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
@@ -103,7 +103,7 @@ module {
         aie.objectfifo.release @objFifo_out3(Produce, 1)
       }
       aie.end
-    } {link_with = "threshold.o"}
+    }
     %54 = aie.core(%5) {
       %c0 = arith.constant 0 : index
       %c1 = arith.constant 1 : index
@@ -127,7 +127,7 @@ module {
         aie.objectfifo.release @objFifo_out4(Produce, 1)
       }
       aie.end
-    } {link_with = "threshold.o"}
+    }
     aie.runtime_sequence(%in : memref<2048xi32>, %buf : memref<32xi32>, %out : memref<2048xi32>) {
       %c0 = arith.constant 0 : i64
       %c1 = arith.constant 1 : i64
diff --git a/test/npu-xrt/vec_mul_event_trace/aie.mlir b/test/npu-xrt/vec_mul_event_trace/aie.mlir
index 60585538a0e..bce20dcb959 100644
--- a/test/npu-xrt/vec_mul_event_trace/aie.mlir
+++ b/test/npu-xrt/vec_mul_event_trace/aie.mlir
@@ -21,7 +21,7 @@
 module {
   aie.device(npu1_1col) {
     // External kernel function declaration
-    func.func private @vector_scalar_mul_aie_scalar(memref<1024xi32>, memref<1024xi32>, memref<1xi32>, i32)
+    func.func private @vector_scalar_mul_aie_scalar(memref<1024xi32>, memref<1024xi32>, memref<1xi32>, i32) attributes {link_with = "vector_scalar_mul.o"}
 
     // Tile declarations
     %shim_noc_tile_0_0 = aie.tile(0, 0)
@@ -56,7 +56,7 @@ module {
         aie.objectfifo.release @infactor(Consume, 1)
       }
       aie.end
-    } {link_with = "vector_scalar_mul.o"}
+    }
 
     // ========================================================================
     // Trace Packet Flow Configuration
diff --git a/test/npu-xrt/vector_scalar_using_dma/aie.mlir b/test/npu-xrt/vector_scalar_using_dma/aie.mlir
index 687882d3cc3..ce18c02e45d 100644
--- a/test/npu-xrt/vector_scalar_using_dma/aie.mlir
+++ b/test/npu-xrt/vector_scalar_using_dma/aie.mlir
@@ -11,7 +11,7 @@
 
 module {
   aie.device(npu1_1col) {
-    func.func private @scale_int32(memref<1024xi32>, memref<1024xi32>)
+    func.func private @scale_int32(memref<1024xi32>, memref<1024xi32>) attributes {link_with = "scale.o"}
 
     %tile_0_0 = aie.tile(0, 0)
     %tile_0_2 = aie.tile(0, 2)
@@ -56,7 +56,7 @@ module {
         }
       }
       aie.end
-    } {link_with = "scale.o"}
+    }
 
     aie.shim_dma_allocation @in (%tile_0_0, MM2S, 0)
 
diff --git a/test/parse-trace/test1/aie_test1.mlir b/test/parse-trace/test1/aie_test1.mlir
index 2fc666f6660..ae2ffe8acef 100644
--- a/test/parse-trace/test1/aie_test1.mlir
+++ b/test/parse-trace/test1/aie_test1.mlir
@@ -14,7 +14,7 @@ module {
     aie.objectfifo @infactor(%shim_noc_tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<1xi32>> 
     aie.objectfifo @in(%shim_noc_tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<1024xi16>> 
     aie.objectfifo @out(%tile_0_2, {%shim_noc_tile_0_0}, 2 : i32) : !aie.objectfifo<memref<1024xi16>> 
-    func.func private @vector_scalar_mul_vector(memref<1024xi16>, memref<1024xi16>, memref<1xi32>, i32)
+    func.func private @vector_scalar_mul_vector(memref<1024xi16>, memref<1024xi16>, memref<1xi32>, i32) attributes {link_with = "scale.o"}
     %core_0_2 = aie.core(%tile_0_2) {
       %c0 = arith.constant 0 : index
       %c9223372036854775807 = arith.constant 9223372036854775807 : index
@@ -38,7 +38,7 @@ module {
         aie.objectfifo.release @infactor(Consume, 1)
       }
       aie.end
-    } {link_with = "scale.o"}
+    }
     aie.packet_flow(1) {
       aie.packet_source<%tile_0_2, Trace : 0>
       aie.packet_dest<%shim_noc_tile_0_0, DMA : 1>
diff --git a/test/parse-trace/test2/aie_test2.mlir b/test/parse-trace/test2/aie_test2.mlir
index ef67fbd99d4..c6221fe58a9 100644
--- a/test/parse-trace/test2/aie_test2.mlir
+++ b/test/parse-trace/test2/aie_test2.mlir
@@ -9,7 +9,7 @@
 
 module {
   aie.device(npu1_1col) {
-    func.func private @vector_scalar_mul_vector(memref<1024xi16>, memref<1024xi16>, memref<1xi32>, i32)
+    func.func private @vector_scalar_mul_vector(memref<1024xi16>, memref<1024xi16>, memref<1xi32>, i32) attributes {link_with = "scale.o"}
     %shim_noc_tile_0_0 = aie.tile(0, 0)
     %tile_0_2 = aie.tile(0, 2)
     aie.objectfifo @in(%shim_noc_tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo<memref<1024xi16>> 
@@ -38,7 +38,7 @@ module {
         aie.objectfifo.release @infactor(Consume, 1)
       }
       aie.end
-    } {link_with = "scale.o"}
+    }
     aie.packet_flow(1) {
       aie.packet_source<%tile_0_2, Trace : 0>
       aie.packet_dest<%shim_noc_tile_0_0, DMA : 1>
diff --git a/test/unit_tests/aie/12_julia/aie.mlir b/test/unit_tests/aie/12_julia/aie.mlir
index ab51d46de49..8e296ee5e1a 100644
--- a/test/unit_tests/aie/12_julia/aie.mlir
+++ b/test/unit_tests/aie/12_julia/aie.mlir
@@ -23,14 +23,14 @@ aie.device(xcvc1902) {
   %buf13_1 = aie.buffer(%tile13) { sym_name = "b" } : memref<4096xi32>
   %lock13_3 = aie.lock(%tile13, 3) { sym_name = "output_lock" }
 
-  func.func private @func(%A: memref<2xi32>, %B: memref<4096xi32>) -> ()
+  func.func private @func(%A: memref<2xi32>, %B: memref<4096xi32>) -> () attributes {link_with = "kernel.o"}
 
   %core13 = aie.core(%tile13) {
     aie.use_lock(%lock13_3, "Acquire", 1) // acquire
     func.call @func(%buf13_0, %buf13_1) : (memref<2xi32>, memref<4096xi32>) -> ()
     aie.use_lock(%lock13_3, "Release", 0) // release for write
     aie.end
-  } { link_with="kernel.o" }
+  }
   
 }
 }
diff --git a/test/unit_tests/aie/13_julia_fp/aie.mlir b/test/unit_tests/aie/13_julia_fp/aie.mlir
index 90bf1c370b8..9af18d571e2 100644
--- a/test/unit_tests/aie/13_julia_fp/aie.mlir
+++ b/test/unit_tests/aie/13_julia_fp/aie.mlir
@@ -23,14 +23,14 @@ aie.device(xcvc1902) {
 
   %lock13_3 = aie.lock(%tile13, 3) { sym_name = "inout_lock" }
 
-  func.func private @func(%A: memref<256xf32>, %B: memref<256xf32>) -> ()
+  func.func private @func(%A: memref<256xf32>, %B: memref<256xf32>) -> () attributes {link_with = "kernel.o"}
 
   %core13 = aie.core(%tile13) {
     aie.use_lock(%lock13_3, "Acquire", 1) // acquire
     func.call @func(%buf13_0, %buf13_1) : (memref<256xf32>, memref<256xf32>) -> ()
     aie.use_lock(%lock13_3, "Release", 0) // release for write
     aie.end
-  } { link_with="kernel.o" }
+  }
   
 }
 }
diff --git a/test/unit_tests/aie2/01_precompiled_core_function/aie.mlir b/test/unit_tests/aie2/01_precompiled_core_function/aie.mlir
index a962626acd0..f603f512a5e 100644
--- a/test/unit_tests/aie2/01_precompiled_core_function/aie.mlir
+++ b/test/unit_tests/aie2/01_precompiled_core_function/aie.mlir
@@ -22,7 +22,7 @@ module @test_chesss_01_precompiled_core_function {
     %lock13_3 = aie.lock(%tile13, 3) { sym_name = "input_lock" }
     %lock13_5 = aie.lock(%tile13, 5) { sym_name = "output_lock" }
 
-    func.func private @func(%A: memref<256xi32>, %B: memref<256xi32>) -> ()
+    func.func private @func(%A: memref<256xi32>, %B: memref<256xi32>) -> () attributes {link_with = "kernel.o"}
 
     %core13 = aie.core(%tile13) {
       aie.use_lock(%lock13_3, "Acquire", 1) // acquire for read(e.g. input ping)
@@ -31,6 +31,6 @@ module @test_chesss_01_precompiled_core_function {
       aie.use_lock(%lock13_3, "Release", 0) // release for write
       aie.use_lock(%lock13_5, "Release", 1) // release for read
       aie.end
-    } { link_with="kernel.o" }
+    }
   }
 }
diff --git a/test/unit_tests/aie2/03_cascade_core_functions/aie.mlir b/test/unit_tests/aie2/03_cascade_core_functions/aie.mlir
index 0c6cb5645f4..7c8b727ec8a 100644
--- a/test/unit_tests/aie2/03_cascade_core_functions/aie.mlir
+++ b/test/unit_tests/aie2/03_cascade_core_functions/aie.mlir
@@ -25,14 +25,14 @@ module {
     %lock13_3 = aie.lock(%tile13, 3) { sym_name = "input_lock" } // input buffer lock
     %lock23_7 = aie.lock(%tile23, 7) { sym_name = "output_lock" } // output buffer lock
 
-    func.func private @do_mul(%A: memref<256xi32>) -> ()
-    func.func private @do_mac(%A: memref<256xi32>) -> ()
+    func.func private @do_mul(%A: memref<256xi32>) -> () attributes {link_with = "kernel.o"}
+    func.func private @do_mac(%A: memref<256xi32>) -> () attributes {link_with = "kernel.o"}
 
     %core13 = aie.core(%tile13) {
       aie.use_lock(%lock13_3, AcquireGreaterEqual, 1) // acquire for read(e.g. input ping)
       func.call @do_mul(%buf13_0) : (memref<256xi32>) -> ()
       aie.end
-    } { link_with="kernel.o" }
+    }
 
     %core23 = aie.core(%tile23) {
   //    %val1 = arith.constant 7 : i32
@@ -41,6 +41,6 @@ module {
       func.call @do_mac(%buf23_0) : (memref<256xi32>) -> ()
       aie.use_lock(%lock23_7, Release, 1) // release for read
       aie.end
-    } { link_with="kernel.o" }
+    }
   }
 }
diff --git a/test/unit_tests/aie2/05_shim_dma_core_function/aie.mlir b/test/unit_tests/aie2/05_shim_dma_core_function/aie.mlir
index c98e1743242..f90f5fc16e9 100644
--- a/test/unit_tests/aie2/05_shim_dma_core_function/aie.mlir
+++ b/test/unit_tests/aie2/05_shim_dma_core_function/aie.mlir
@@ -31,7 +31,7 @@ module @test_chess_05_shim_dma_core_function {
     %lock_b_read = aie.lock(%t73, 6)
     %lock_done = aie.lock(%t73, 7)
 
-    func.func private @func(%A: memref<16xi32>, %B: memref<16xi32>) -> ()
+    func.func private @func(%A: memref<16xi32>, %B: memref<16xi32>) -> () attributes {link_with = "kernel.o"}
 
     %c13 = aie.core(%t73) {
 
@@ -54,7 +54,7 @@ module @test_chess_05_shim_dma_core_function {
       }
 
       aie.end
-    } { link_with="kernel.o" }
+    }
 
     // Tile DMA
     %m73 = aie.mem(%t73) {
diff --git a/test/unit_tests/aie2/07_shim_dma_core_function_with_loop/aie.mlir b/test/unit_tests/aie2/07_shim_dma_core_function_with_loop/aie.mlir
index ce1d0c15c20..900607ab488 100644
--- a/test/unit_tests/aie2/07_shim_dma_core_function_with_loop/aie.mlir
+++ b/test/unit_tests/aie2/07_shim_dma_core_function_with_loop/aie.mlir
@@ -29,7 +29,7 @@ module @test_chess_04_deprecated_shim_dma_precompiled_kernel{
     %lock_b_ping = aie.lock(%t73, 5) // b_ping
     %lock_b_pong = aie.lock(%t73, 6) // b_pong
 
-    func.func private @func(%A: memref<64xi32>, %B: memref<64xi32>, %C: i32) -> ()
+    func.func private @func(%A: memref<64xi32>, %B: memref<64xi32>, %C: i32) -> () attributes {link_with = "kernel.o"}
 
     %c13 = aie.core(%t73) {
       %buffer_size =  arith.constant 64 : i32
@@ -59,7 +59,7 @@ module @test_chess_04_deprecated_shim_dma_precompiled_kernel{
       }
 
       aie.end
-    } { link_with="kernel.o" }
+    }
 
     // Tile DMA
     %m73 = aie.mem(%t73) {
diff --git a/test/unit_tests/chess_compiler_tests/01_precompiled_core_function/aie.mlir b/test/unit_tests/chess_compiler_tests/01_precompiled_core_function/aie.mlir
index 790397c0211..bf6a3859a47 100644
--- a/test/unit_tests/chess_compiler_tests/01_precompiled_core_function/aie.mlir
+++ b/test/unit_tests/chess_compiler_tests/01_precompiled_core_function/aie.mlir
@@ -28,7 +28,7 @@ aie.device(xcvc1902) {
   %lock13_3 = aie.lock(%tile13, 3) { sym_name = "input_lock" }
   %lock13_5 = aie.lock(%tile13, 5) { sym_name = "output_lock" }
 
-  func.func private @func(%A: memref<256xi32>, %B: memref<256xi32>) -> ()
+  func.func private @func(%A: memref<256xi32>, %B: memref<256xi32>) -> () attributes {link_with = "kernel.o"}
 
   %core13 = aie.core(%tile13) {
     aie.use_lock(%lock13_3, "Acquire", 1) // acquire for read(e.g. input ping)
@@ -37,7 +37,7 @@ aie.device(xcvc1902) {
     aie.use_lock(%lock13_3, "Release", 0) // release for write
     aie.use_lock(%lock13_5, "Release", 1) // release for read
     aie.end
-  } { link_with="kernel.o" }
+  }
 
 }
 }
diff --git a/test/unit_tests/chess_compiler_tests/03_cascade_core_functions/aie.mlir b/test/unit_tests/chess_compiler_tests/03_cascade_core_functions/aie.mlir
index ef6dc31097d..e7989b81570 100644
--- a/test/unit_tests/chess_compiler_tests/03_cascade_core_functions/aie.mlir
+++ b/test/unit_tests/chess_compiler_tests/03_cascade_core_functions/aie.mlir
@@ -29,16 +29,16 @@ aie.device(xcvc1902) {
   %lock13_3 = aie.lock(%tile13, 3) { sym_name = "input_lock" } // input buffer lock
   %lock23_7 = aie.lock(%tile23, 7) { sym_name = "output_lock" } // output buffer lock
   
-  func.func private @do_mul(%A: memref<256xi32>) -> ()
-  func.func private @do_mac(%A: memref<256xi32>) -> ()
-  
+  func.func private @do_mul(%A: memref<256xi32>) -> () attributes {link_with = "kernel.o"}
+  func.func private @do_mac(%A: memref<256xi32>) -> () attributes {link_with = "kernel.o"}
+
   %core13 = aie.core(%tile13) {
     aie.use_lock(%lock13_3, "Acquire", 1) // acquire for read(e.g. input ping)
     func.call @do_mul(%buf13_0) : (memref<256xi32>) -> ()
     aie.use_lock(%lock13_3, "Release", 0) // release for write
     aie.end
-  } { link_with="kernel.o" }
-  
+  }
+
   %core23 = aie.core(%tile23) {
 //    %val1 = arith.constant 7 : i32
 //    %idx1 = arith.constant 0 : index
@@ -47,7 +47,7 @@ aie.device(xcvc1902) {
     func.call @do_mac(%buf23_0) : (memref<256xi32>) -> ()
      aie.use_lock(%lock23_7, "Release", 1) // release for read
     aie.end
-  } { link_with="kernel.o" }
+  }
   
 }
 }
diff --git a/test/unit_tests/chess_compiler_tests/05_shim_dma_core_function/aie.mlir b/test/unit_tests/chess_compiler_tests/05_shim_dma_core_function/aie.mlir
index 1dcacc1319b..4102158c46f 100644
--- a/test/unit_tests/chess_compiler_tests/05_shim_dma_core_function/aie.mlir
+++ b/test/unit_tests/chess_compiler_tests/05_shim_dma_core_function/aie.mlir
@@ -35,7 +35,7 @@ aie.device(xcvc1902) {
   %lock_b_ping = aie.lock(%t73, 5) // b_ping
   %lock_b_pong = aie.lock(%t73, 6) // b_pong
 
-  func.func private @func(%A: memref<256xi32>, %B: memref<256xi32>) -> ()
+  func.func private @func(%A: memref<256xi32>, %B: memref<256xi32>) -> () attributes {link_with = "kernel.o"}
 
   %c13 = aie.core(%t73) {
     
@@ -60,7 +60,7 @@ aie.device(xcvc1902) {
     }
 
     aie.end
-  } { link_with="kernel.o" }
+  }
 
   // Tile DMA
   %m73 = aie.mem(%t73) {
diff --git a/test/unit_tests/chess_compiler_tests/07_shim_dma_core_function_with_loop/aie.mlir b/test/unit_tests/chess_compiler_tests/07_shim_dma_core_function_with_loop/aie.mlir
index 46a82f997c8..23a54d6521d 100644
--- a/test/unit_tests/chess_compiler_tests/07_shim_dma_core_function_with_loop/aie.mlir
+++ b/test/unit_tests/chess_compiler_tests/07_shim_dma_core_function_with_loop/aie.mlir
@@ -38,7 +38,7 @@ aie.device(xcvc1902) {
   %lock_b_ping = aie.lock(%t73, 5) // b_ping
   %lock_b_pong = aie.lock(%t73, 6) // b_pong
 
-  func.func private @func(%A: memref<64xi32>, %B: memref<64xi32>, %C: i32) -> ()
+  func.func private @func(%A: memref<64xi32>, %B: memref<64xi32>, %C: i32) -> () attributes {link_with = "kernel.o"}
 
   %c13 = aie.core(%t73) {
     %buffer_size =  arith.constant 64 : i32
@@ -68,7 +68,7 @@ aie.device(xcvc1902) {
     }
 
     aie.end
-  } { link_with="kernel.o" }
+  }
 
   // Tile DMA
   %m73 = aie.mem(%t73) {
diff --git a/test/unit_tests/chess_compiler_tests_aie2/01_precompiled_core_function/aie.mlir b/test/unit_tests/chess_compiler_tests_aie2/01_precompiled_core_function/aie.mlir
index efb4aa07444..18c588dc24f 100644
--- a/test/unit_tests/chess_compiler_tests_aie2/01_precompiled_core_function/aie.mlir
+++ b/test/unit_tests/chess_compiler_tests_aie2/01_precompiled_core_function/aie.mlir
@@ -27,7 +27,7 @@ module @test_chesss_01_precompiled_core_function {
     %lock13_3 = aie.lock(%tile13, 3) { sym_name = "input_lock" }
     %lock13_5 = aie.lock(%tile13, 5) { sym_name = "output_lock" }
 
-    func.func private @func(%A: memref<256xi32>, %B: memref<256xi32>) -> ()
+    func.func private @func(%A: memref<256xi32>, %B: memref<256xi32>) -> () attributes {link_with = "kernel.o"}
 
     %core13 = aie.core(%tile13) {
       aie.use_lock(%lock13_3, "Acquire", 1) // acquire for read(e.g. input ping)
@@ -36,6 +36,6 @@ module @test_chesss_01_precompiled_core_function {
       aie.use_lock(%lock13_3, "Release", 0) // release for write
       aie.use_lock(%lock13_5, "Release", 1) // release for read
       aie.end
-    } { link_with="kernel.o" }
+    }
   }
 }
diff --git a/test/unit_tests/chess_compiler_tests_aie2/03_cascade_core_functions/aie.mlir b/test/unit_tests/chess_compiler_tests_aie2/03_cascade_core_functions/aie.mlir
index d1234e54bda..4013037f04f 100644
--- a/test/unit_tests/chess_compiler_tests_aie2/03_cascade_core_functions/aie.mlir
+++ b/test/unit_tests/chess_compiler_tests_aie2/03_cascade_core_functions/aie.mlir
@@ -28,14 +28,14 @@ module {
     %lock13_3 = aie.lock(%tile13, 3) { sym_name = "input_lock" } // input buffer lock
     %lock23_7 = aie.lock(%tile23, 7) { sym_name = "output_lock" } // output buffer lock
 
-    func.func private @do_mul(%A: memref<256xi32>) -> ()
-    func.func private @do_mac(%A: memref<256xi32>) -> ()
+    func.func private @do_mul(%A: memref<256xi32>) -> () attributes {link_with = "kernel.o"}
+    func.func private @do_mac(%A: memref<256xi32>) -> () attributes {link_with = "kernel.o"}
 
     %core13 = aie.core(%tile13) {
       aie.use_lock(%lock13_3, AcquireGreaterEqual, 1) // acquire for read(e.g. input ping)
       func.call @do_mul(%buf13_0) : (memref<256xi32>) -> ()
       aie.end
-    } { link_with="kernel.o" }
+    }
 
     %core23 = aie.core(%tile23) {
   //    %val1 = arith.constant 7 : i32
@@ -44,6 +44,6 @@ module {
       func.call @do_mac(%buf23_0) : (memref<256xi32>) -> ()
       aie.use_lock(%lock23_7, Release, 1) // release for read
       aie.end
-    } { link_with="kernel.o" }
+    }
   }
 }
diff --git a/test/unit_tests/chess_compiler_tests_aie2/05_shim_dma_core_function/aie.mlir b/test/unit_tests/chess_compiler_tests_aie2/05_shim_dma_core_function/aie.mlir
index f9d28035713..d546d5a4325 100644
--- a/test/unit_tests/chess_compiler_tests_aie2/05_shim_dma_core_function/aie.mlir
+++ b/test/unit_tests/chess_compiler_tests_aie2/05_shim_dma_core_function/aie.mlir
@@ -38,7 +38,7 @@ module @test_chess_05_shim_dma_core_function {
     %lock_b_read = aie.lock(%t73, 6)
     %lock_done = aie.lock(%t73, 7)
 
-    func.func private @func(%A: memref<16xi32>, %B: memref<16xi32>) -> ()
+    func.func private @func(%A: memref<16xi32>, %B: memref<16xi32>) -> () attributes {link_with = "kernel.o"}
 
     %c13 = aie.core(%t73) {
 
@@ -61,7 +61,7 @@ module @test_chess_05_shim_dma_core_function {
       }
 
       aie.end
-    } { link_with="kernel.o" }
+    }
 
     // Tile DMA
     %m73 = aie.mem(%t73) {
diff --git a/test/unit_tests/chess_compiler_tests_aie2/07_shim_dma_core_function_with_loop/aie.mlir b/test/unit_tests/chess_compiler_tests_aie2/07_shim_dma_core_function_with_loop/aie.mlir
index 71f48b56609..93ebe7fc28e 100644
--- a/test/unit_tests/chess_compiler_tests_aie2/07_shim_dma_core_function_with_loop/aie.mlir
+++ b/test/unit_tests/chess_compiler_tests_aie2/07_shim_dma_core_function_with_loop/aie.mlir
@@ -34,7 +34,7 @@ module @test_chess_04_deprecated_shim_dma_precompiled_kernel{
     %lock_b_ping = aie.lock(%t73, 5) // b_ping
     %lock_b_pong = aie.lock(%t73, 6) // b_pong
 
-    func.func private @func(%A: memref<64xi32>, %B: memref<64xi32>, %C: i32) -> ()
+    func.func private @func(%A: memref<64xi32>, %B: memref<64xi32>, %C: i32) -> () attributes {link_with = "kernel.o"}
 
     %c13 = aie.core(%t73) {
       %buffer_size =  arith.constant 64 : i32
@@ -64,7 +64,7 @@ module @test_chess_04_deprecated_shim_dma_precompiled_kernel{
       }
 
       aie.end
-    } { link_with="kernel.o" }
+    }
 
     // Tile DMA
     %m73 = aie.mem(%t73) {

From 55c03f8462defb0281830abd89a34724cc928d4e Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Tue, 10 Mar 2026 16:39:35 -0600
Subject: [PATCH 18/28] [audit] Code quality pass: comments, docs, and minor
 cleanup

- Fix broken LLVM file header modeline in AIEAssignCoreLinkFiles.cpp
- Remove duplicate #include in aiecc.cpp
- Extract duplicated core-skipping predicate into coreNeedsCompilation()
- Normalize verbose output terminology ("external object" vs "link_with")
- Improve comments in AIEAssignCoreLinkFiles, BCF/ldscript targets, aiecc
- Add/improve docstrings in AIEOps.td, AIEPasses.td, kernel.py, aie.py, utils.py
- Extend indirect-call warning check in test to cover actionable message text
- Add comment explaining uninitialized bo_inB in npu-xrt tests

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 include/aie/Dialect/AIE/IR/AIEOps.td          | 20 +++++-
 .../aie/Dialect/AIE/Transforms/AIEPasses.td   | 17 +++--
 .../AIE/Transforms/AIEAssignCoreLinkFiles.cpp | 37 ++++++-----
 lib/Targets/AIETargetBCF.cpp                  |  4 +-
 lib/Targets/AIETargetLdScript.cpp             |  6 +-
 python/dialects/aie.py                        | 14 ++++
 python/iron/kernel.py                         | 63 ++++++++++++------
 python/utils/compile/utils.py                 | 50 ++++++++------
 python/utils/jit.py                           | 13 ++--
 test/aiecc/cpp_link_with_indirect_call.mlir   |  2 +-
 .../add_one_func_link_with_chess/test.cpp     |  3 +
 .../add_one_func_link_with_peano/test.cpp     |  3 +
 tools/aiecc/aiecc.cpp                         | 65 +++++++++++--------
 13 files changed, 199 insertions(+), 98 deletions(-)

diff --git a/include/aie/Dialect/AIE/IR/AIEOps.td b/include/aie/Dialect/AIE/IR/AIEOps.td
index 0119a544235..9d5392e12b2 100644
--- a/include/aie/Dialect/AIE/IR/AIEOps.td
+++ b/include/aie/Dialect/AIE/IR/AIEOps.td
@@ -322,8 +322,12 @@ def AIE_CoreOp: AIE_Op<"core", [
   let arguments = (
     ins Index:$tile,
     DefaultValuedAttr<AIEI32Attr, "0x400">:$stack_size,
-    OptionalAttr<StrAttr>:$link_with,   // deprecated: use link_with on func.func instead
-    OptionalAttr<StrArrayAttr>:$link_files, // canonical post-pass list of .o paths
+    // Deprecated: attach link_with to func.func declarations instead and run
+    // aie-assign-core-link-files to populate link_files.
+    OptionalAttr<StrAttr>:$link_with,
+    // Populated by aie-assign-core-link-files; consumed by BCF/ldscript emitters
+    // and the aiecc driver.  Specifying both link_with and link_files is an error.
+    OptionalAttr<StrArrayAttr>:$link_files,
     OptionalAttr<StrAttr>:$elf_file,
     OptionalAttr<BoolAttr>:$dynamic_objfifo_lowering
   );
@@ -348,6 +352,18 @@ def AIE_CoreOp: AIE_Op<"core", [
     This op has an optional `dynamic_objfifo_lowering` attribute, to finely control whether the
     objectfifos in this core should be lowered using the dynamic runtime lowering.
 
+    **External object files.**  The preferred mechanism is to attach a `link_with`
+    string attribute to each `func.func` declaration for an externally-defined
+    function, then run the `aie-assign-core-link-files` pass.  That pass traces
+    direct `func.call` edges from each core and writes the aggregated, de-duplicated
+    list of object file paths into the `link_files` attribute on this op.  The
+    BCF/ldscript emitters and the aiecc driver consume `link_files`.
+
+    The core-level `link_with` attribute is deprecated and kept only for
+    backward compatibility.  It is migrated by `aie-assign-core-link-files`
+    (its value is folded into `link_files` and then removed).  Specifying both
+    `link_with` and `link_files` on the same CoreOp is a verifier error.
+
     Examples:
     ```
     %tile = aie.tile(1, 1)
diff --git a/include/aie/Dialect/AIE/Transforms/AIEPasses.td b/include/aie/Dialect/AIE/Transforms/AIEPasses.td
index f6de090ad8d..29e8a0d3b59 100644
--- a/include/aie/Dialect/AIE/Transforms/AIEPasses.td
+++ b/include/aie/Dialect/AIE/Transforms/AIEPasses.td
@@ -18,12 +18,21 @@ def AIEAssignCoreLinkFiles : Pass<"aie-assign-core-link-files", "DeviceOp"> {
       "Infer per-core link_files from func-level link_with attributes";
   let description = [{
     Walks each aie.core and collects the set of external object files it needs
-    by tracing call edges to func.func declarations that carry a "link_with"
-    string attribute.  The result is stored in the CoreOp's "link_files"
-    StrArrayAttr (a typed array of strings).
+    by tracing direct func.call edges to func.func declarations that carry a
+    "link_with" string attribute.  The result is stored in the CoreOp's
+    "link_files" StrArrayAttr.
+
+    Only direct calls (func.call) are resolved.  Indirect calls
+    (func.call_indirect) inside a core body emit a warning and are not
+    resolved; add a direct func.call to the required func.func declaration
+    so the pass can trace the dependency.
 
     Core-level "link_with" (deprecated) is also migrated: its value is
-    added to the set and the attribute is removed from the CoreOp.
+    folded into the set and the attribute is removed from the CoreOp.
+
+    func.func declarations that carry "link_with" but are never called from
+    any core emit a warning; their object files will not appear in any
+    core's link_files.
   }];
 
   let constructor = "xilinx::AIE::createAIEAssignCoreLinkFilesPass()";
diff --git a/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp b/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp
index 8a66ecd4b02..a96db649ad2 100644
--- a/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp
+++ b/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp
@@ -1,5 +1,4 @@
-//===- AIEAssignCoreLinkFiles.cpp --------------------------------*- C++
-//-*-===//
+//===- AIEAssignCoreLinkFiles.cpp -------------------------------*- C++ -*-===//
 //
 // This file is licensed under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -45,9 +44,10 @@ struct AIEAssignCoreLinkFilesPass
     DeviceOp device = getOperation();
     OpBuilder builder(device.getContext());
 
-    // Build map: func name -> list of .o files (from "link_with" attr on
-    // func.func). Keys and values are interned in the MLIRContext so the
-    // StringRefs remain valid for the lifetime of the pass.
+    // Build a map from func name to the object file(s) it requires, sourced
+    // from the "link_with" string attribute on func.func declarations.
+    // StringRefs are views into MLIRContext-owned storage and remain valid
+    // for the entire pass run.
     DenseMap<StringRef, SmallVector<StringRef, 2>> funcToObjs;
     for (auto funcOp : device.getOps<mlir::func::FuncOp>()) {
       if (auto attr = funcOp->getAttrOfType<mlir::StringAttr>("link_with")) {
@@ -55,18 +55,18 @@ struct AIEAssignCoreLinkFilesPass
       }
     }
 
-    // Track which funcs are actually called from any core.
+    // Tracks which func.func symbols are directly called from at least one
+    // core; used to warn about link_with-bearing functions that are never
+    // called and whose object files would otherwise be silently omitted.
     llvm::DenseSet<StringRef> usedFuncs;
 
-    // Walk each core, collect all .o files needed.
-    // NOTE: only *direct* calls (func.call) are traced; transitive calls
-    // through intermediate helpers are not followed.  If an intermediate
-    // helper carries its own link_with, attach link_with to the intermediate
-    // helper *and* call it directly from the core, or use the deprecated
-    // core-level link_with as a fallback.
+    // Only direct func.call edges are traced.  func.call_indirect ops and
+    // calls through intermediate wrapper functions are not followed.  To
+    // handle transitive dependencies, attach link_with directly to every
+    // func.func declaration that a core calls, even thin wrappers.
+    // TODO: extend to transitive call resolution.
     device.walk([&](CoreOp core) {
-      // De-duplicate while preserving insertion order. StringRefs point into
-      // the MLIRContext attribute storage and remain valid throughout the pass.
+      // De-duplicate while preserving insertion order.
       llvm::SetVector<StringRef> needed;
 
       // Migrate deprecated core-level attr: warn, consume it, and add to set.
@@ -78,8 +78,8 @@ struct AIEAssignCoreLinkFilesPass
         core->removeAttr("link_with");
       }
 
-      // Single walk: accumulate used funcs, collect .o files, warn on indirect
-      // calls — all in one pass over the core body.
+      // Single walk over the core body: collect required object files and
+      // record called symbols (for the unused-func warning below).
       core.walk([&](Operation *op) {
         if (auto call = dyn_cast<mlir::func::CallOp>(op)) {
           usedFuncs.insert(call.getCallee());
@@ -96,8 +96,11 @@ struct AIEAssignCoreLinkFilesPass
         }
       });
 
-      if (!needed.empty())
+      if (!needed.empty()) {
+        // builder is used only for attribute construction; its insertion
+        // point is irrelevant and no ops are inserted.
         core.setLinkFilesAttr(builder.getStrArrayAttr(needed.getArrayRef()));
+      }
     });
 
     // Warn about funcs with link_with that are never called from any core.
diff --git a/lib/Targets/AIETargetBCF.cpp b/lib/Targets/AIETargetBCF.cpp
index 08826f31da4..4148841a53d 100644
--- a/lib/Targets/AIETargetBCF.cpp
+++ b/lib/Targets/AIETargetBCF.cpp
@@ -141,10 +141,12 @@ LogicalResult AIETranslateToBCF(ModuleOp module, raw_ostream &output,
 
       if (auto coreOp = tile.getCoreOp()) {
         if (auto filesAttr = coreOp.getLinkFiles()) {
+          // Canonical path: link_files populated by aie-assign-core-link-files.
           for (auto f : filesAttr->getAsRange<mlir::StringAttr>())
             output << "_include _file " << f.getValue() << "\n";
         } else if (coreOp.getLinkWith()) {
-          // deprecated fallback
+          // Deprecated fallback: core-level link_with was not migrated by
+          // aie-assign-core-link-files (e.g., the pass was not run).
           output << "_include _file " << coreOp.getLinkWith().value().str()
                  << "\n";
         }
diff --git a/lib/Targets/AIETargetLdScript.cpp b/lib/Targets/AIETargetLdScript.cpp
index f1d23f3ade3..1cf87059c3f 100644
--- a/lib/Targets/AIETargetLdScript.cpp
+++ b/lib/Targets/AIETargetLdScript.cpp
@@ -175,13 +175,17 @@ SECTIONS
                targetModel.getMemEastBaseAddress(), std::string("east"));
 
       output << "  .bss : { *(.bss*) } > data\n";
+      // INPUT() directives must follow the closing brace of SECTIONS; placing
+      // them inside SECTIONS is invalid linker script syntax.
       output << "}\n";
       if (auto coreOp = tile.getCoreOp()) {
         if (auto filesAttr = coreOp.getLinkFiles()) {
+          // Canonical path: link_files populated by aie-assign-core-link-files.
           for (auto f : filesAttr->getAsRange<mlir::StringAttr>())
             output << "INPUT(" << f.getValue() << ")\n";
         } else if (auto fileAttr = coreOp.getLinkWith()) {
-          // deprecated fallback
+          // Deprecated fallback: core-level link_with was not migrated by
+          // aie-assign-core-link-files (e.g., the pass was not run).
           output << "INPUT(" << fileAttr.value().str() << ")\n";
         }
 
diff --git a/python/dialects/aie.py b/python/dialects/aie.py
index 405baced79b..0472d476db8 100644
--- a/python/dialects/aie.py
+++ b/python/dialects/aie.py
@@ -88,6 +88,20 @@ def __init__(self, buffer, index, value, loc=None, ip=None):
 
 
 class external_func(FuncOp):
+    """A ``func.func`` declaration for an externally-defined AIE core function.
+
+    Args:
+        name: Symbol name of the function.
+        inputs: List of input types (numpy dtypes or MLIR types).
+        outputs: List of output types.  Defaults to [].
+        visibility: MLIR symbol visibility.  Defaults to ``"private"``.
+        link_with: Optional path to the object file (``.o``) that implements
+            this function.  Sets the ``link_with`` string attribute on the
+            generated ``func.func`` op; the ``aie-assign-core-link-files`` pass
+            reads this attribute and propagates it into the CoreOp's
+            ``link_files`` attribute for the linker.
+    """
+
     def __init__(
         self, name: str, inputs, outputs=None, visibility="private", link_with=None
     ):
diff --git a/python/iron/kernel.py b/python/iron/kernel.py
index 47de4fa7600..dfd38337e3b 100644
--- a/python/iron/kernel.py
+++ b/python/iron/kernel.py
@@ -56,14 +56,22 @@ def __init__(
         bin_name: str,
         arg_types: list[type[np.ndarray] | np.dtype] = [],
     ) -> None:
-        """A Kernel is an externally defined function that eventually resolves to a FuncOp. If it is called,
-        a CallOp will be generated.
+        """An externally pre-compiled AIE core function.
+
+        ``Kernel`` wraps a pre-built object file (``.o``).  Use
+        ``ExternalFunction`` instead when you want to compile from C/C++ source.
+
+        When the kernel is first called inside a core body, ``resolve()`` emits
+        a ``func.func`` declaration with a ``link_with`` attribute naming
+        ``bin_name``; the ``aie-assign-core-link-files`` pass later propagates
+        that into the CoreOp's ``link_files`` attribute for the linker.
 
         Args:
-            name (str): The name of the function
-            bin_name (str): The name of the object file (set as link_with on the func.func
-                declaration; also used as the output filename when compiling ExternalFunction sources)
-            arg_types (list[type[np.ndarray]  |  np.dtype], optional): The type signature of the function. Defaults to [].
+            name: Symbol name of the function as it appears in the object file.
+            bin_name: Filename of the pre-compiled object file (e.g.,
+                ``"add_one.o"``).  Must be available on the linker search path
+                at compile time.
+            arg_types: Type signature of the function arguments.  Defaults to [].
         """
         super().__init__(name, arg_types)
         self._bin_name = bin_name
@@ -84,6 +92,18 @@ def resolve(
 
 
 class ExternalFunction(Kernel):
+    """An AIE core function compiled from C/C++ source at JIT time.
+
+    Each instance is registered in ``_instances`` at construction time so that
+    the ``@jit`` decorator can discover and compile all source files before
+    invoking the MLIR compilation pipeline.  ``_instances`` is cleared at the
+    start of each ``@jit`` call to prevent stale registrations from a previous
+    (possibly failed) run.
+
+    Use the base ``Kernel`` class instead when you have a pre-built object file.
+    """
+
+    _instances: set  # Registry of all live ExternalFunction instances.
     _instances = set()
 
     def __init__(
@@ -97,18 +117,21 @@ def __init__(
         compile_flags: list[str] = [],
         debug: bool = False,
     ) -> None:
-        """An ExternalFunction is a C/C++ source file that gets compiled to an object file and eventually resolves to a FuncOp.
-        If it is called, a CallOp will be generated.
-
+        """
         Args:
-            name (str): The name of the function
-            object_file_name (str, optional): The name of the object file. If None, it will be name.o.
-            source_file (str): Path to the C/C++ source file
-            source_string (str): C/C++ source code as a string
-            arg_types (list[type[np.ndarray] | np.dtype], optional): The type signature of the function. Defaults to [].
-            include_dirs (list[str], optional): Additional include directories. Defaults to [].
-            compile_flags (list[str], optional): Additional compilation flags. Defaults to [].
-            debug (bool, optional): Enable debug logging. Defaults to False.
+            name: Symbol name of the function as it will appear in the object file.
+            object_file_name: Output object file name.  Defaults to ``<name>.o``.
+            source_file: Path to a C/C++ source file on disk.  Mutually
+                exclusive with ``source_string``.
+            source_string: Inline C/C++ source code.  Mutually exclusive with
+                ``source_file``.
+            arg_types: Type signature of the function arguments.  Defaults to [].
+            include_dirs: Additional ``-I`` directories passed to the Peano
+                compiler.  Defaults to [].
+            compile_flags: Additional flags passed verbatim to the Peano
+                compiler.  Defaults to [].
+            debug: If True, emit debug log messages during construction.
+                Defaults to False.
         """
         if not object_file_name:
             object_file_name = f"{name}.o"
@@ -128,7 +151,7 @@ def __init__(
             logger.debug("Include dirs: %s", include_dirs)
             logger.debug("Compile flags: %s", compile_flags)
 
-        # Track this instance for JIT compilation
+        # Register this instance so the @jit decorator can compile it.
         ExternalFunction._instances.add(self)
 
     def _setup_source(self, source_file: str | None, source_string: str | None) -> None:
@@ -143,11 +166,11 @@ def _setup_source(self, source_file: str | None, source_string: str | None) -> N
             self._source_string = source_string
 
     def __enter__(self):
-        """Enter the context."""
+        """Support use as a context manager (``with ExternalFunction(...) as f``)."""
         return self
 
     def __exit__(self, exc_type, exc_value, traceback):
-        """Exit the context."""
+        """No cleanup is performed on exit; the context manager is purely syntactic."""
         pass
 
     def tile_size(self, arg_index: int = 0) -> int:
diff --git a/python/utils/compile/utils.py b/python/utils/compile/utils.py
index 04ec56c430a..97e58c0e8d1 100644
--- a/python/utils/compile/utils.py
+++ b/python/utils/compile/utils.py
@@ -85,16 +85,25 @@ def compile_mlir_module(
     options=None,
 ):
     """
-    Compile an MLIR module to instruction, PDI, and/or xbclbin files using the aiecc module.
-    This function supports only the Peano compiler.
-    Parameters:
-        mlir_module (str): MLIR module to compile.
-        insts_path (str): Path to the instructions binary file.
-        pdi_path (str): Path to the PDI file.
-        xclbin_path (str): Path to the xclbin file.
-        verbose (bool): If True, enable verbose output.
-        work_dir (str): Compilation working directory.
-        options (list[str]): List of additional options.
+    Compile an MLIR module to instruction, PDI, and/or xclbin files using aiecc.
+
+    By default uses the Peano compiler backend (--no-xchesscc --no-xbridge).
+    Pass additional flags via ``options`` to override.
+
+    When ``work_dir`` is provided, the MLIR is written to a file inside that
+    directory so that the C++ aiecc binary resolves relative ``link_with``
+    paths on ``func.func`` declarations against the same directory where
+    ``compile_external_kernel`` placed the compiled object files.
+
+    Args:
+        mlir_module: MLIR module to compile.
+        insts_path: Output path for the NPU instruction binary.
+        pdi_path: Output path for the PDI file.
+        xclbin_path: Output path for the xclbin package.
+        verbose: If True, pass --verbose to aiecc.
+        work_dir: Compilation working directory; also determines where the
+            MLIR input file is written when invoking the C++ aiecc binary.
+        options: Additional aiecc command-line options.
     """
 
     args = [
@@ -150,26 +159,29 @@ def compile_external_kernel(func, kernel_dir, target_arch):
     """
     Compile an ExternalFunction to an object file in the kernel directory.
 
+    The output file is named ``func.bin_name`` and placed in ``kernel_dir``.
+    If the object file already exists in ``kernel_dir``, compilation is skipped.
+
     Args:
-        func: ExternalFunction instance to compile
-        kernel_dir: Directory to place the compiled object file
-        target_arch: Target architecture (e.g., "aie2" or "aie2p")
+        func: ExternalFunction instance to compile.
+        kernel_dir: Directory where the compiled object file will be placed.
+            Must be the same directory passed as ``work_dir`` to
+            ``compile_mlir_module`` so that relative link_with paths resolve
+            correctly.
+        target_arch: Peano target architecture string (e.g., "aie2", "aie2p").
     """
-    # Skip if already compiled
+    # Skip if already compiled in this session.
     if hasattr(func, "_compiled") and func._compiled:
         return
 
-    # Check if object file already exists in kernel directory
+    # Skip if the object file already exists (cache hit).
     output_file = os.path.join(kernel_dir, func.bin_name)
     if os.path.exists(output_file):
         return
 
-    # Create source file in kernel directory
     source_file = os.path.join(kernel_dir, f"{func._name}.cc")
 
-    # Handle both source_string and source_file cases
     if func._source_string is not None:
-        # Use source_string (write to file)
         with open(source_file, "w") as f:
             f.write(func._source_string)
     elif func._source_file is not None:
@@ -190,8 +202,6 @@ def compile_external_kernel(func, kernel_dir, target_arch):
         compile_args=func._compile_flags,
         cwd=kernel_dir,
     )
-
-    # Mark the function as compiled
     func._compiled = True
 
 
diff --git a/python/utils/jit.py b/python/utils/jit.py
index 9e6990bf669..3304b685266 100644
--- a/python/utils/jit.py
+++ b/python/utils/jit.py
@@ -184,13 +184,16 @@ def decorator(*args, **kwargs):
 
 def _filter_tensor_args(args):
     """
-    Filter out non-tensor arguments from args. Required for Algorithms because
-    they pass ExternalFunction and scalar values in their signature that should
-    not be interpreted as runtime sequence arguments.
+    Filter out non-tensor arguments from args.
+
+    Algorithm functions may include ExternalFunction instances and scalar
+    compile-time constants in their Python signature that must not be forwarded
+    to the NPU kernel as runtime buffer arguments.
 
     Removes:
-    - ExternalFunction instances
-    - Scalar values (int, float, np.integer, np.floating), embedded as MLIR constants
+    - ExternalFunction instances (resolved at compile time via link_with)
+    - Scalar values (int, float, np.integer, np.floating) used as MLIR constants
+    - Callables (e.g. lambda configuration helpers)
     """
     tensor_args = []
     for arg in args:
diff --git a/test/aiecc/cpp_link_with_indirect_call.mlir b/test/aiecc/cpp_link_with_indirect_call.mlir
index fa06dc43f36..5d31d3e9fba 100644
--- a/test/aiecc/cpp_link_with_indirect_call.mlir
+++ b/test/aiecc/cpp_link_with_indirect_call.mlir
@@ -22,7 +22,7 @@ module {
 
     %core_0_2 = aie.core(%tile_0_2) {
       %fptr = func.constant @some_helper : () -> ()
-      // expected-warning@+1 {{indirect call in core body}}
+      // expected-warning@+1 {{indirect call in core body — link_with attributes on indirectly-called functions are not automatically resolved}}
       func.call_indirect %fptr() : () -> ()
       aie.end
     }
diff --git a/test/npu-xrt/add_one_func_link_with_chess/test.cpp b/test/npu-xrt/add_one_func_link_with_chess/test.cpp
index 939e9b6f742..2d6de2f4cd1 100644
--- a/test/npu-xrt/add_one_func_link_with_chess/test.cpp
+++ b/test/npu-xrt/add_one_func_link_with_chess/test.cpp
@@ -69,6 +69,9 @@ int main(int argc, const char *argv[]) {
                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
   auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  // bo_inB corresponds to the unused %buf memref in the runtime_sequence; it
+  // is passed to satisfy the kernel argument count but is never read by the
+  // device kernel, so no initialization or sync is needed.
   auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
   auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
diff --git a/test/npu-xrt/add_one_func_link_with_peano/test.cpp b/test/npu-xrt/add_one_func_link_with_peano/test.cpp
index 458fc47f53f..c6d843886cd 100644
--- a/test/npu-xrt/add_one_func_link_with_peano/test.cpp
+++ b/test/npu-xrt/add_one_func_link_with_peano/test.cpp
@@ -69,6 +69,9 @@ int main(int argc, const char *argv[]) {
                           XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1));
   auto bo_inA = xrt::bo(device, IN_SIZE * sizeof(int32_t),
                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(3));
+  // bo_inB corresponds to the unused %buf memref in the runtime_sequence; it
+  // is passed to satisfy the kernel argument count but is never read by the
+  // device kernel, so no initialization or sync is needed.
   auto bo_inB = xrt::bo(device, IN_SIZE * sizeof(int32_t),
                         XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(4));
   auto bo_out = xrt::bo(device, OUT_SIZE * sizeof(int32_t),
diff --git a/tools/aiecc/aiecc.cpp b/tools/aiecc/aiecc.cpp
index f904f1c82b4..a427e8f7e01 100644
--- a/tools/aiecc/aiecc.cpp
+++ b/tools/aiecc/aiecc.cpp
@@ -117,8 +117,6 @@
 #include <thread>
 #include <vector>
 
-#include "aiecc_aiesim.h"
-
 using namespace llvm;
 using namespace mlir;
 
@@ -1080,14 +1078,22 @@ static std::string getAIETargetForDevice(ModuleOp moduleOp,
 // AIE Device and Core Discovery
 //===----------------------------------------------------------------------===//
 
+/// Per-core metadata extracted from a CoreOp before the compilation pipeline
+/// begins.  All fields are populated by getCoreInfo().
 struct CoreInfo {
-  std::int32_t col;
-  std::int32_t row;
-  SmallVector<std::string> linkFiles; // External object files to link
-  std::string elfFile; // Generated ELF path (if already specified)
+  std::int32_t col = 0; ///< Tile column (from TileOp).
+  std::int32_t row = 0; ///< Tile row (from TileOp).
+  /// External object files to link into this core's ELF.  Populated from
+  /// CoreOp::getLinkFiles() (canonical) or CoreOp::getLinkWith() (deprecated
+  /// fallback when aie-assign-core-link-files was not run).
+  SmallVector<std::string> linkFiles;
+  /// If non-empty, the ELF was provided via the elf_file attribute; no
+  /// compilation is needed.
+  std::string elfFile;
 };
 
-/// Check if a CoreOp has a non-empty body (more than just aie.end).
+/// Returns true if the CoreOp has a non-empty body (i.e., anything beyond the
+/// mandatory aie.end terminator).
 static bool coreHasNonemptyBody(xilinx::AIE::CoreOp coreOp) {
   for (auto &block : coreOp.getBody()) {
     if (block.getOperations().size() > 1)
@@ -1096,7 +1102,20 @@ static bool coreHasNonemptyBody(xilinx::AIE::CoreOp coreOp) {
   return false;
 }
 
-// Helper to extract core info from a CoreOp
+/// Returns true if a CoreOp requires compilation or linking.
+///
+/// Skips hollow cores created by --expand-load-pdis (empty body, no elf_file,
+/// no link files), which exist only to satisfy structural constraints.
+static bool coreNeedsCompilation(xilinx::AIE::CoreOp coreOp) {
+  return coreOp.getElfFileAttr() || coreOp.getLinkWithAttr() ||
+         coreOp.getLinkFiles() || coreHasNonemptyBody(coreOp);
+}
+
+/// Extracts tile coordinates and link-file metadata from a CoreOp.
+///
+/// Prefers the canonical link_files attribute (set by
+/// aie-assign-core-link-files). Falls back to the deprecated core-level
+/// link_with attribute if link_files is absent (e.g., the pass was not run).
 static CoreInfo getCoreInfo(xilinx::AIE::CoreOp coreOp) {
   CoreInfo info;
   auto tileOp = dyn_cast<xilinx::AIE::TileOp>(coreOp.getTile().getDefiningOp());
@@ -2203,11 +2222,10 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
       llvm::outs() << "Generated BCF: " << bcfPath << "\n";
     }
 
-    // Extract link_with files from BCF
+    // Extract external object files listed in the BCF's _include _file
+    // directives. Search order: current working directory, then tmpDirName (JIT
+    // cache), then the directory containing the input MLIR file.
     std::vector<std::string> linkWithFiles = extractInputFilesFromBCF(bcfPath);
-
-    // Handle link_with files: copy to .prj directory if needed
-    // Search order: current working directory, then input file directory
     std::string linkWithArgs;
     for (const auto &linkWithFile : linkWithFiles) {
       SmallString<256> srcPath;
@@ -2249,8 +2267,8 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
         return failure();
 
       if (verbose)
-        llvm::outs() << "Copied link_with: " << srcPath << " -> " << destPath
-                     << "\n";
+        llvm::outs() << "Copied external object: " << srcPath << " -> "
+                     << destPath << "\n";
 
       if (!linkWithArgs.empty()) {
         linkWithArgs += " ";
@@ -2278,7 +2296,8 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
         std::string(workDir), "-d",           "-f",
         std::string(objPath)};
 
-    // Add link_with files if any
+    // Append external object files (previously copied to tmpDirName) to the
+    // xchesscc_wrapper link command.
     for (const auto &linkWithFile : linkWithFiles) {
       SmallString<256> localPath(tmpDirName);
       sys::path::append(localPath, sys::path::filename(linkWithFile));
@@ -2387,10 +2406,10 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
           return failure();
 
         if (verbose)
-          llvm::outs() << "Copied link_with object: " << srcLinkWith << " -> "
+          llvm::outs() << "Copied external object: " << srcLinkWith << " -> "
                        << destLinkWith << "\n";
       } else if (verbose) {
-        llvm::outs() << "link_with object already in place: " << srcLinkWith
+        llvm::outs() << "External object already in place: " << srcLinkWith
                      << "\n";
       }
 
@@ -2455,12 +2474,8 @@ compileCores(MLIRContext &context, ModuleOp moduleOp, Operation *deviceOp,
 
   SmallVector<CoreInfo> cores;
   deviceOp->walk([&](xilinx::AIE::CoreOp coreOp) {
-    // Skip cores with no elf_file, no link_with/link_files, and empty body
-    // (e.g., @empty device ops created by --expand-load-pdis)
-    if (coreOp.getElfFileAttr() || coreOp.getLinkWithAttr() ||
-        coreOp.getLinkFiles() || coreHasNonemptyBody(coreOp)) {
+    if (coreNeedsCompilation(coreOp))
       cores.push_back(getCoreInfo(coreOp));
-    }
   });
 
   if (cores.empty()) {
@@ -2628,12 +2643,8 @@ compileCoresUnified(MLIRContext &context, ModuleOp moduleOp,
 
   SmallVector<CoreInfo> cores;
   deviceOp->walk([&](xilinx::AIE::CoreOp coreOp) {
-    // Skip cores with no elf_file, no link_with/link_files, and empty body
-    // (e.g., @empty device ops created by --expand-load-pdis)
-    if (coreOp.getElfFileAttr() || coreOp.getLinkWithAttr() ||
-        coreOp.getLinkFiles() || coreHasNonemptyBody(coreOp)) {
+    if (coreNeedsCompilation(coreOp))
       cores.push_back(getCoreInfo(coreOp));
-    }
   });
 
   if (cores.empty()) {

From 396b802892850d1c2260c98bf77facff4a56cf3a Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Tue, 10 Mar 2026 17:17:32 -0600
Subject: [PATCH 19/28] [audit] Fix correctness issues found in code review

aiecc.cpp:
- Add missing lock_guard(outputMutex) around new verbose log lines in
  compileCore and compileCoresUnified (data race with parallel compilation)
- Add src == dest guard before atomicCopyFile in the BCF linkWithFiles
  loop, consistent with the existing guard in the Peano loop

python/utils/compile/utils.py:
- Forward aiecc subprocess stdout/stderr to logger.debug (on success and on
  failure) so verbose output is not silently discarded
- Remove dead hasattr(func, "_compiled") guard; _compiled is always
  initialized in ExternalFunction.__init__

python/utils/jit.py:
- Replace assert mlir_module.operation.verify() with explicit
  if/raise so it cannot be silently disabled by python -O

lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp:
- Use mlir::Builder instead of mlir::OpBuilder; no ops are inserted

lib/Dialect/AIE/Transforms/CMakeLists.txt:
- Restore alphabetical ordering of source files

test/npu-xrt/add_one_scale_func_link_with_peano/run.lit:
- Fix wrong function signature in comment: scale_by_two takes three args

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../AIE/Transforms/AIEAssignCoreLinkFiles.cpp |  3 ++-
 lib/Dialect/AIE/Transforms/CMakeLists.txt     |  2 +-
 python/utils/compile/utils.py                 |  6 +++++-
 python/utils/jit.py                           | 14 +++++++------
 .../run.lit                                   |  2 +-
 tools/aiecc/aiecc.cpp                         | 21 ++++++++++++-------
 6 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp b/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp
index a96db649ad2..a0f4d712b6a 100644
--- a/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp
+++ b/lib/Dialect/AIE/Transforms/AIEAssignCoreLinkFiles.cpp
@@ -42,7 +42,8 @@ struct AIEAssignCoreLinkFilesPass
           AIEAssignCoreLinkFilesPass> {
   void runOnOperation() override {
     DeviceOp device = getOperation();
-    OpBuilder builder(device.getContext());
+    // Builder is used only for attribute construction; no ops are inserted.
+    Builder builder(device.getContext());
 
     // Build a map from func name to the object file(s) it requires, sourced
     // from the "link_with" string attribute on func.func declarations.
diff --git a/lib/Dialect/AIE/Transforms/CMakeLists.txt b/lib/Dialect/AIE/Transforms/CMakeLists.txt
index fabdf61a55e..cfb2c973d9f 100644
--- a/lib/Dialect/AIE/Transforms/CMakeLists.txt
+++ b/lib/Dialect/AIE/Transforms/CMakeLists.txt
@@ -7,9 +7,9 @@
 
 add_mlir_dialect_library(
   AIETransforms
+  AIEAssignBufferDescriptorIDs.cpp
   AIEAssignBuffers.cpp
   AIEAssignCoreLinkFiles.cpp
-  AIEAssignBufferDescriptorIDs.cpp
   AIEAssignLockIDs.cpp
   AIEFindFlows.cpp
   AIEPathFinder.cpp
diff --git a/python/utils/compile/utils.py b/python/utils/compile/utils.py
index 97e58c0e8d1..f3795b9da1f 100644
--- a/python/utils/compile/utils.py
+++ b/python/utils/compile/utils.py
@@ -142,6 +142,10 @@ def compile_mlir_module(
         result = subprocess.run(
             [aiecc_bin, mlir_file] + args, capture_output=True, text=True
         )
+        if result.stdout:
+            logger.debug("%s", result.stdout)
+        if result.stderr:
+            logger.debug("%s", result.stderr)
         if result.returncode != 0:
             error_msg = result.stderr if result.stderr else result.stdout
             raise RuntimeError(
@@ -171,7 +175,7 @@ def compile_external_kernel(func, kernel_dir, target_arch):
         target_arch: Peano target architecture string (e.g., "aie2", "aie2p").
     """
     # Skip if already compiled in this session.
-    if hasattr(func, "_compiled") and func._compiled:
+    if func._compiled:
         return
 
     # Skip if the object file already exists (cache hit).
diff --git a/python/utils/jit.py b/python/utils/jit.py
index 3304b685266..73bdae2e813 100644
--- a/python/utils/jit.py
+++ b/python/utils/jit.py
@@ -93,15 +93,17 @@ def decorator(*args, **kwargs):
         if is_placed:
             with mlir_mod_ctx() as ctx:
                 function(*args, **kwargs)
-                assert (
-                    ctx.module.operation.verify()
-                ), f"Verification failed for '{function.__name__}'"
+                if not ctx.module.operation.verify():
+                    raise RuntimeError(
+                        f"MLIR verification failed for '{function.__name__}'"
+                    )
                 mlir_module = ctx.module
         else:
             mlir_module = function(*args, **kwargs)
-            assert (
-                mlir_module.operation.verify()
-            ), f"Verification failed for '{function.__name__}'"
+            if not mlir_module.operation.verify():
+                raise RuntimeError(
+                    f"MLIR verification failed for '{function.__name__}'"
+                )
 
         # Also collect ExternalFunction instances created during function()
         # execution (e.g. inside algorithm helpers that construct them internally).
diff --git a/test/npu-xrt/add_one_scale_func_link_with_peano/run.lit b/test/npu-xrt/add_one_scale_func_link_with_peano/run.lit
index 8211c420009..a2ff6338849 100644
--- a/test/npu-xrt/add_one_scale_func_link_with_peano/run.lit
+++ b/test/npu-xrt/add_one_scale_func_link_with_peano/run.lit
@@ -15,7 +15,7 @@
 //
 // The kernel pipeline per tile iteration:
 //   1. add_one(in, out, n)   — out[i] = in[i] + 1
-//   2. scale_by_two(out, n)  — out[i] *= 2
+//   2. scale_by_two(out, out, n)  — out[i] *= 2  (in-place: same buffer for in and out)
 // Expected output: (input + 1) * 2.
 //
 // RUN: cp %S/aie.mlir aie_arch.mlir
diff --git a/tools/aiecc/aiecc.cpp b/tools/aiecc/aiecc.cpp
index a427e8f7e01..f8d93eab760 100644
--- a/tools/aiecc/aiecc.cpp
+++ b/tools/aiecc/aiecc.cpp
@@ -2262,13 +2262,17 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
       // cores.
       SmallString<256> destPath(tmpDirName);
       sys::path::append(destPath, sys::path::filename(linkWithFile));
-      if (failed(atomicCopyFile(srcPath, tmpDirName,
-                                sys::path::filename(linkWithFile))))
-        return failure();
+      if (srcPath != destPath) {
+        if (failed(atomicCopyFile(srcPath, tmpDirName,
+                                  sys::path::filename(linkWithFile))))
+          return failure();
 
-      if (verbose)
-        llvm::outs() << "Copied external object: " << srcPath << " -> "
-                     << destPath << "\n";
+        if (verbose) {
+          std::lock_guard<std::mutex> lock(outputMutex);
+          llvm::outs() << "Copied external object: " << srcPath << " -> "
+                       << destPath << "\n";
+        }
+      }
 
       if (!linkWithArgs.empty()) {
         linkWithArgs += " ";
@@ -2405,10 +2409,13 @@ static LogicalResult compileCore(MLIRContext &context, ModuleOp moduleOp,
                                   sys::path::filename(lf))))
           return failure();
 
-        if (verbose)
+        if (verbose) {
+          std::lock_guard<std::mutex> lock(outputMutex);
           llvm::outs() << "Copied external object: " << srcLinkWith << " -> "
                        << destLinkWith << "\n";
+        }
       } else if (verbose) {
+        std::lock_guard<std::mutex> lock(outputMutex);
         llvm::outs() << "External object already in place: " << srcLinkWith
                      << "\n";
       }

From 4d6f0ad18db99f29f38f2b8c6656b18824c1f5f0 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Tue, 10 Mar 2026 17:39:00 -0600
Subject: [PATCH 20/28] [docs] Document link_with parameter in
 quick_reference.md

Add link_with to the external_func signature entry and note that
multiple declarations may share the same .o file (automatically
deduplicated by aie-assign-core-link-files).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 programming_guide/quick_reference.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/programming_guide/quick_reference.md b/programming_guide/quick_reference.md
index 97dbcb1233e..6fb43de5a8f 100644
--- a/programming_guide/quick_reference.md
+++ b/programming_guide/quick_reference.md
@@ -22,7 +22,7 @@
 | Function Signature  | Definition | Parameters | Return Type | Example | 
 |---------------------|------------|------------|-------------|---------|
 | `tile(column, row)` | Declare AI Engine tile | `column`: column index number <br> `row`: row index number | `<tile>` | ComputeTile = tile(1,3) |
-| `external_func(name, inputs, output)` | Declare external kernel function that will run on AIE Cores|  `name`: external function name <br> `input`: list of input types <br> `output`: list of output types | `<external_func>` | scale_scalar = external_func("vector_scalar_mul_aie_scalar", inputs=[tensor_ty, tensor_ty, np.ndarray[(1,), np.dtype[np.int32]]]) | |
+| `external_func(name, inputs, output, link_with=None)` | Declare external kernel function that will run on AIE Cores. Multiple `external_func` declarations may share the same `link_with` object file; the compiler deduplicates automatically. |  `name`: external function name <br> `input`: list of input types <br> `output`: list of output types <br> `link_with` (optional): path to the compiled object file (`.o`) that implements this function | `<external_func>` | scale_scalar = external_func("vector_scalar_mul_aie_scalar", inputs=[tensor_ty, tensor_ty, np.ndarray[(1,), np.dtype[np.int32]]], link_with="scale.o") | |
 | `npu_dma_memcpy_nd(metadata, bd_id, mem, sizes)` | configure n-dimensional DMA accessing external memory | `metadata`:  ObjectFifo python object or string with name of `object_fifo`<br> `bd_id`: Identifier number<br> `mem`: memory for transfer<br> `sizes`: 4-D transfer size in 4B granularity | `None` | npu_dma_memcpy_nd(metadata="out", bd_id=0, mem=C, sizes=[1, 1, 1, N]) |
 | `dma_wait(object_fifo, ...)` | configure host-ShimDMA synchronization for accessing external memory | `metadata`: Identifies the ObjectFifo (by Python object or name string) whose half-DMA completion we are waiting on. This is a variable argument function that can accept one or more metadatas at once, to be waited on in order given, | `None` | dma_wait(of_out) |
 | `npu_sync(column, row, direction, channel, column_num=1, row_num=1)` | alternative method to configure host-ShimDMA synchronization for accessing external memory | `column` and `row`: Specify the tile location for initiating the synchronization. <br> `direction`: Indicates the DMA direction (0 for write to host, 1 for read from host). <br> `channel`: Identifies the DMA channel (0 or 1) for the synchronization token <br> `column_num` and `row_num` (optional): Define the range of tiles to wait for synchronization| `None` | npu_sync(column=0, row=0, direction=0, channel=1) |

From ef61e9a733bcb533083283eb021bc5ec58b84189 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Tue, 10 Mar 2026 17:50:16 -0600
Subject: [PATCH 21/28] [test] Fill three gaps in func-level link_with test
 coverage

cpp_link_with_no_link_with.mlir:
  Verify aie-assign-core-link-files is a no-op on designs with no
  link_with attributes; no link_files attr should appear on any CoreOp.

test/python/external_func_link_with.py:
  Four Python-level unit tests verifying that external_func(link_with=...)
  emits the correct func.func attribute in four cases: single func,
  two funcs sharing one .o, two funcs with separate .o files, and no link_with.

test/python/npu-xrt/test_jit_two_extern_functions.py:
  Hardware tests for the core new capability: a single Worker calling two
  distinct ExternalFunctions compiled to separate .o files, exercising
  the full multi-.o JIT pipeline (two compilations,
  aie-assign-core-link-files merging them, two INPUT() directives,
  successful lld multi-object link). Also tests the same-.o deduplication
  case.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 test/aiecc/cpp_link_with_no_link_with.mlir    |  28 +++
 test/python/external_func_link_with.py        | 106 +++++++++++
 .../npu-xrt/test_jit_two_extern_functions.py  | 166 ++++++++++++++++++
 3 files changed, 300 insertions(+)
 create mode 100644 test/aiecc/cpp_link_with_no_link_with.mlir
 create mode 100644 test/python/external_func_link_with.py
 create mode 100644 test/python/npu-xrt/test_jit_two_extern_functions.py

diff --git a/test/aiecc/cpp_link_with_no_link_with.mlir b/test/aiecc/cpp_link_with_no_link_with.mlir
new file mode 100644
index 00000000000..d55757e6a53
--- /dev/null
+++ b/test/aiecc/cpp_link_with_no_link_with.mlir
@@ -0,0 +1,28 @@
+//===- cpp_link_with_no_link_with.mlir -------------------------*- MLIR -*-===//
+//
+// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Copyright (C) 2026, Advanced Micro Devices, Inc.
+//
+//===----------------------------------------------------------------------===//
+
+// Verify that aie-assign-core-link-files is a no-op on designs that carry
+// no link_with attributes anywhere — no link_files attribute should appear
+// on any CoreOp, and no warnings should be emitted.
+
+// RUN: aie-opt --verify-diagnostics --aie-assign-core-link-files %s | FileCheck %s
+
+// CHECK-NOT: link_files
+// CHECK-NOT: link_with
+
+module {
+  aie.device(npu1_1col) {
+    %tile_0_2 = aie.tile(0, 2)
+
+    %core_0_2 = aie.core(%tile_0_2) {
+      aie.end
+    }
+  }
+}
diff --git a/test/python/external_func_link_with.py b/test/python/external_func_link_with.py
new file mode 100644
index 00000000000..ed72c5ddf3c
--- /dev/null
+++ b/test/python/external_func_link_with.py
@@ -0,0 +1,106 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Verify that the link_with keyword argument on external_func produces the
+# expected func.func attribute in the emitted MLIR.
+
+# RUN: %python %s | FileCheck %s
+
+import numpy as np
+from aie.dialects.aie import AIEDevice, Device, external_func, tile, end
+from aie.ir import Block, InsertionPoint
+
+from util import construct_and_print_module
+
+
+# Single external_func with link_with produces a func.func with the attribute.
+# CHECK-LABEL: TEST: single_func_link_with
+# CHECK: func.func private @scale({{.*}}) attributes {link_with = "scale.o"}
+@construct_and_print_module
+def single_func_link_with():
+    dev = Device(AIEDevice.npu1_1col)
+    dev_block = Block.create_at_start(dev.body_region)
+    with InsertionPoint(dev_block):
+        external_func(
+            "scale",
+            inputs=[np.ndarray[(16,), np.dtype[np.int32]]],
+            link_with="scale.o",
+        )
+        tile(0, 2)
+        end()
+
+
+# Two external_func declarations sharing the same object file each carry
+# their own link_with attribute.
+# CHECK-LABEL: TEST: two_funcs_same_object_file
+# CHECK-DAG: func.func private @add_one({{.*}}) attributes {link_with = "kernel.o"}
+# CHECK-DAG: func.func private @scale_by_two({{.*}}) attributes {link_with = "kernel.o"}
+@construct_and_print_module
+def two_funcs_same_object_file():
+    dev = Device(AIEDevice.npu1_1col)
+    dev_block = Block.create_at_start(dev.body_region)
+    with InsertionPoint(dev_block):
+        external_func(
+            "add_one",
+            inputs=[
+                np.ndarray[(16,), np.dtype[np.int32]],
+                np.ndarray[(16,), np.dtype[np.int32]],
+            ],
+            link_with="kernel.o",
+        )
+        external_func(
+            "scale_by_two",
+            inputs=[
+                np.ndarray[(16,), np.dtype[np.int32]],
+                np.ndarray[(16,), np.dtype[np.int32]],
+            ],
+            link_with="kernel.o",
+        )
+        tile(0, 2)
+        end()
+
+
+# Two external_func declarations pointing to different object files.
+# CHECK-LABEL: TEST: two_funcs_different_object_files
+# CHECK-DAG: func.func private @add_one({{.*}}) attributes {link_with = "add_one.o"}
+# CHECK-DAG: func.func private @scale_by_two({{.*}}) attributes {link_with = "scale_by_two.o"}
+@construct_and_print_module
+def two_funcs_different_object_files():
+    dev = Device(AIEDevice.npu1_1col)
+    dev_block = Block.create_at_start(dev.body_region)
+    with InsertionPoint(dev_block):
+        external_func(
+            "add_one",
+            inputs=[
+                np.ndarray[(16,), np.dtype[np.int32]],
+                np.ndarray[(16,), np.dtype[np.int32]],
+            ],
+            link_with="add_one.o",
+        )
+        external_func(
+            "scale_by_two",
+            inputs=[
+                np.ndarray[(16,), np.dtype[np.int32]],
+                np.ndarray[(16,), np.dtype[np.int32]],
+            ],
+            link_with="scale_by_two.o",
+        )
+        tile(0, 2)
+        end()
+
+
+# external_func without link_with produces no link_with attribute.
+# CHECK-LABEL: TEST: func_without_link_with
+# CHECK: func.func private @helper({{.*}})
+# CHECK-NOT: link_with
+@construct_and_print_module
+def func_without_link_with():
+    dev = Device(AIEDevice.npu1_1col)
+    dev_block = Block.create_at_start(dev.body_region)
+    with InsertionPoint(dev_block):
+        external_func(
+            "helper",
+            inputs=[np.ndarray[(16,), np.dtype[np.int32]]],
+        )
+        tile(0, 2)
+        end()
diff --git a/test/python/npu-xrt/test_jit_two_extern_functions.py b/test/python/npu-xrt/test_jit_two_extern_functions.py
new file mode 100644
index 00000000000..52ab992aa90
--- /dev/null
+++ b/test/python/npu-xrt/test_jit_two_extern_functions.py
@@ -0,0 +1,166 @@
+# This file is licensed under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# (c) Copyright 2026 AMD Inc.
+
+# RUN: %run_on_npu1% %pytest %s
+# RUN: %run_on_npu2% %pytest %s
+# REQUIRES: xrt_python_bindings
+
+# End-to-end test for the core new capability: a single Worker calling TWO
+# distinct ExternalFunction instances, each compiled to its own object file.
+# This exercises the full multi-.o JIT pipeline:
+#   1. Two separate source compilations (two .o files in the cache dir)
+#   2. aie-assign-core-link-files traces both func.call ops and emits
+#      link_files = ["add_one.o", "scale_by_two.o"] on the CoreOp
+#   3. Two INPUT() directives in the linker script (Peano path)
+#   4. Successful lld link with both object files
+#   5. Core executes both functions: output[i] = (input[i] + 1) * 2
+
+import numpy as np
+import pytest
+
+import aie.iron as iron
+from aie.iron import ExternalFunction, jit
+from aie.iron import ObjectFifo, Worker, Runtime, Program
+from aie.iron.placers import SequentialPlacer
+from aie.iron.controlflow import range_
+
+
+@jit(is_placed=False)
+def add_then_scale(input, output, add_func, scale_func):
+    """Apply add_func then scale_func sequentially on each tile."""
+    num_elements = np.size(input)
+    tile_size = add_func.tile_size(0)
+    num_tiles = num_elements // tile_size
+    dtype = input.dtype
+
+    tensor_ty = np.ndarray[(num_elements,), np.dtype[dtype]]
+    tile_ty = np.ndarray[(tile_size,), np.dtype[dtype]]
+
+    of_in = ObjectFifo(tile_ty, name="in")
+    of_tmp = ObjectFifo(tile_ty, name="tmp")
+    of_out = ObjectFifo(tile_ty, name="out")
+
+    def core_body(of_in, of_tmp, of_out, add_fn, scale_fn):
+        for _ in range_(num_tiles):
+            elem_in = of_in.acquire(1)
+            elem_tmp = of_tmp.acquire(1)
+            add_fn(elem_in, elem_tmp, tile_size)
+            of_in.release(1)
+            of_tmp.release(1)
+
+            elem_tmp2 = of_tmp.acquire(1)
+            elem_out = of_out.acquire(1)
+            scale_fn(elem_tmp2, elem_out, tile_size)
+            of_tmp.release(1)
+            of_out.release(1)
+
+    worker = Worker(
+        core_body,
+        fn_args=[of_in.cons(), of_tmp.prod(), of_out.prod(), add_func, scale_func],
+    )
+
+    rt = Runtime()
+    with rt.sequence(tensor_ty, tensor_ty) as (A, B):
+        rt.start(worker)
+        rt.fill(of_in.prod(), A)
+        rt.drain(of_out.cons(), B, wait=True)
+
+    return Program(iron.get_current_device(), rt).resolve_program(SequentialPlacer())
+
+
+def test_two_external_functions_different_objects():
+    """
+    One core calls two ExternalFunctions compiled to separate object files.
+    Expected result: output[i] = (input[i] + 1) * 2.
+    """
+    add_one = ExternalFunction(
+        "add_one",
+        source_string="""extern "C" {
+            void add_one(int* in, int* out, int n) {
+                for (int i = 0; i < n; i++) out[i] = in[i] + 1;
+            }
+        }""",
+        arg_types=[
+            np.ndarray[(16,), np.dtype[np.int32]],
+            np.ndarray[(16,), np.dtype[np.int32]],
+            np.int32,
+        ],
+    )
+
+    scale_by_two = ExternalFunction(
+        "scale_by_two",
+        source_string="""extern "C" {
+            void scale_by_two(int* in, int* out, int n) {
+                for (int i = 0; i < n; i++) out[i] = in[i] * 2;
+            }
+        }""",
+        arg_types=[
+            np.ndarray[(16,), np.dtype[np.int32]],
+            np.ndarray[(16,), np.dtype[np.int32]],
+            np.int32,
+        ],
+    )
+
+    input_tensor = iron.arange(32, dtype=np.int32)
+    output_tensor = iron.zeros((32,), dtype=np.int32)
+
+    add_then_scale(input_tensor, output_tensor, add_one, scale_by_two)
+
+    expected = (np.arange(32, dtype=np.int32) + 1) * 2
+    np.testing.assert_array_equal(output_tensor.numpy(), expected)
+
+
+def test_two_external_functions_same_object():
+    """
+    One core calls two ExternalFunctions that share the same compiled object
+    file. The aie-assign-core-link-files pass must deduplicate the .o path
+    so it appears only once in link_files and is linked exactly once.
+    Expected result: output[i] = (input[i] + 1) * 2 (same computation, shared .o).
+    """
+    # Both functions come from the same translation unit / object file name.
+    add_one = ExternalFunction(
+        "add_one_shared",
+        object_file_name="shared_kernel.o",
+        source_string="""extern "C" {
+            void add_one_shared(int* in, int* out, int n) {
+                for (int i = 0; i < n; i++) out[i] = in[i] + 1;
+            }
+            void scale_by_two_shared(int* in, int* out, int n) {
+                for (int i = 0; i < n; i++) out[i] = in[i] * 2;
+            }
+        }""",
+        arg_types=[
+            np.ndarray[(16,), np.dtype[np.int32]],
+            np.ndarray[(16,), np.dtype[np.int32]],
+            np.int32,
+        ],
+    )
+
+    scale_by_two = ExternalFunction(
+        "scale_by_two_shared",
+        object_file_name="shared_kernel.o",
+        source_string="""extern "C" {
+            void add_one_shared(int* in, int* out, int n) {
+                for (int i = 0; i < n; i++) out[i] = in[i] + 1;
+            }
+            void scale_by_two_shared(int* in, int* out, int n) {
+                for (int i = 0; i < n; i++) out[i] = in[i] * 2;
+            }
+        }""",
+        arg_types=[
+            np.ndarray[(16,), np.dtype[np.int32]],
+            np.ndarray[(16,), np.dtype[np.int32]],
+            np.int32,
+        ],
+    )
+
+    input_tensor = iron.arange(32, dtype=np.int32)
+    output_tensor = iron.zeros((32,), dtype=np.int32)
+
+    add_then_scale(input_tensor, output_tensor, add_one, scale_by_two)
+
+    expected = (np.arange(32, dtype=np.int32) + 1) * 2
+    np.testing.assert_array_equal(output_tensor.numpy(), expected)

From b4711ed9c78b03a65dc0f6fbf3c57375c5dea64e Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Tue, 10 Mar 2026 18:26:06 -0600
Subject: [PATCH 22/28] [python] Rationalize external kernel Python API

kernel.py (breaking changes):
- Rename Kernel(name, bin_name) -> Kernel(name, object_file_name) and
  .bin_name property -> .object_file_name; consistent with ExternalFunction
- Lift tile_size(), arg_types(), and arg-count validation to BaseKernel
- Remove ExternalFunction.debug constructor parameter (use logging.setLevel)
- Remove ExternalFunction no-op __enter__/__exit__ context manager
- Inline _setup_source into __init__; remove single-use indirection
- Fix _instances annotation to standard "name: type = value" form
- Fix redundant _arg_types/_op reassignments in ExternalFunction.__init__

aie.py:
- Core(link_with=...) now raises TypeError immediately; set link_with on
  external_func() declarations instead

jit.py:
- _filter_tensor_args filters any Kernel subclass (not just ExternalFunction)
- Document why external_kernels collects only ExternalFunction (not Kernel)

utils/compile/utils.py:
- compile_external_kernel uses func.object_file_name (renamed property)

programming_examples: migrate 5 placed examples from deprecated
@core(tile, "archive.a") to link_with on each external_func() declaration:
- basic/vector_exp/vector_exp_placed.py
- ml/softmax/softmax_placed.py
- ml/softmax/softmax_whole_array_placed.py
- vision/color_detect/color_detect_placed.py
- vision/edge_detect/edge_detect_placed.py

test/python/npu-xrt/test_jit_extern_functions.py:
- Replace with-statement context manager tests with direct construction

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../basic/vector_exp/vector_exp_placed.py     |   6 +-
 .../ml/softmax/softmax_placed.py              |   4 +-
 .../ml/softmax/softmax_whole_array_placed.py  |   4 +-
 .../color_detect/color_detect_placed.py       |  11 +-
 .../vision/edge_detect/edge_detect_placed.py  |   7 +-
 python/dialects/aie.py                        |   8 +-
 python/iron/kernel.py                         | 225 ++++++++----------
 python/utils/compile/utils.py                 |   4 +-
 python/utils/jit.py                           |  18 +-
 .../npu-xrt/test_jit_extern_functions.py      |  48 ++--
 10 files changed, 156 insertions(+), 179 deletions(-)

diff --git a/programming_examples/basic/vector_exp/vector_exp_placed.py b/programming_examples/basic/vector_exp/vector_exp_placed.py
index 11a1a711d09..cfb87b39925 100644
--- a/programming_examples/basic/vector_exp/vector_exp_placed.py
+++ b/programming_examples/basic/vector_exp/vector_exp_placed.py
@@ -60,7 +60,9 @@ def device_body():
 
         # AIE Core Function declarations
 
-        exp_bf16_1024 = external_func("exp_bf16_1024", inputs=[tile_ty, tile_ty])
+        exp_bf16_1024 = external_func(
+            "exp_bf16_1024", inputs=[tile_ty, tile_ty], link_with="kernels.a"
+        )
 
         # Tile declarations
         ShimTile = tile(0, 0)
@@ -105,7 +107,7 @@ def device_body():
         # Compute tile bodies
         for i in range(n_cores):
             # Compute tile i
-            @core(cores[i], "kernels.a")
+            @core(cores[i])
             def core_body():
                 for _ in range_(0xFFFFFFFF):
                     for _ in range_(tiles):
diff --git a/programming_examples/ml/softmax/softmax_placed.py b/programming_examples/ml/softmax/softmax_placed.py
index ddf0e6485ac..456707ce10e 100755
--- a/programming_examples/ml/softmax/softmax_placed.py
+++ b/programming_examples/ml/softmax/softmax_placed.py
@@ -46,7 +46,7 @@ def device_body():
         # AIE Core Function declarations
 
         softmax_bf16_vector = external_func(
-            "softmax_bf16", inputs=[tile_ty, tile_ty, np.int32]
+            "softmax_bf16", inputs=[tile_ty, tile_ty, np.int32], link_with="kernels.a"
         )
 
         # Tile declarations
@@ -94,7 +94,7 @@ def device_body():
         # Set up compute tiles
         for i in range(n_cores):
             # Compute tile i
-            @core(cores[i], "kernels.a")
+            @core(cores[i])
             def core_body():
                 for _ in range_(0xFFFFFFFF):
                     for _ in range_(tiles):
diff --git a/programming_examples/ml/softmax/softmax_whole_array_placed.py b/programming_examples/ml/softmax/softmax_whole_array_placed.py
index bdb93ba812e..ebae88126de 100644
--- a/programming_examples/ml/softmax/softmax_whole_array_placed.py
+++ b/programming_examples/ml/softmax/softmax_whole_array_placed.py
@@ -61,7 +61,7 @@ def device_body():
         # AIE Core Function declarations
 
         softmax_bf16_vector = external_func(
-            "softmax_bf16", inputs=[tile_ty, tile_ty, np.int32]
+            "softmax_bf16", inputs=[tile_ty, tile_ty, np.int32], link_with="kernels.a"
         )
 
         # Tile declarations
@@ -168,7 +168,7 @@ def device_body():
         # Set up compute tiles
         for i in range(n_cores):
             # Compute tile i
-            @core(cores[i], "kernels.a")
+            @core(cores[i])
             def core_body():
                 for _ in range_(0xFFFFFFFF):
                     for _ in range_(tiles):
diff --git a/programming_examples/vision/color_detect/color_detect_placed.py b/programming_examples/vision/color_detect/color_detect_placed.py
index 0554ce78ad4..16147571f9e 100644
--- a/programming_examples/vision/color_detect/color_detect_placed.py
+++ b/programming_examples/vision/color_detect/color_detect_placed.py
@@ -47,14 +47,19 @@ def deviceBody():
             link_with="threshold.cc.o",
         )
         bitwiseORLine = external_func(
-            "bitwiseORLine", inputs=[line_ty, line_ty, line_ty, np.int32]
+            "bitwiseORLine",
+            inputs=[line_ty, line_ty, line_ty, np.int32],
+            link_with="combined_bitwiseOR_gray2rgba_bitwiseAND.a",
         )
         gray2rgbaLine = external_func(
-            "gray2rgbaLine", inputs=[line_ty, line_bytes_ty, np.int32]
+            "gray2rgbaLine",
+            inputs=[line_ty, line_bytes_ty, np.int32],
+            link_with="combined_bitwiseOR_gray2rgba_bitwiseAND.a",
         )
         bitwiseANDLine = external_func(
             "bitwiseANDLine",
             inputs=[line_bytes_ty, line_bytes_ty, line_bytes_ty, np.int32],
+            link_with="combined_bitwiseOR_gray2rgba_bitwiseAND.a",
         )
 
         # Tile declarations
@@ -175,7 +180,7 @@ def coreBody():
                 OF_4to5.release(ObjectFifoPort.Produce, 1)
 
         # Compute tile 5
-        @core(ComputeTile5, "combined_bitwiseOR_gray2rgba_bitwiseAND.a")
+        @core(ComputeTile5)
         def coreBody():
             for _ in range_(sys.maxsize):
                 # bitwise OR
diff --git a/programming_examples/vision/edge_detect/edge_detect_placed.py b/programming_examples/vision/edge_detect/edge_detect_placed.py
index c493ca23a78..afe074fa2c7 100644
--- a/programming_examples/vision/edge_detect/edge_detect_placed.py
+++ b/programming_examples/vision/edge_detect/edge_detect_placed.py
@@ -49,7 +49,9 @@ def device_body():
             link_with="threshold.cc.o",
         )
         gray2rgba_line = external_func(
-            "gray2rgbaLine", inputs=[line_ty, line_bytes_ty, np.int32]
+            "gray2rgbaLine",
+            inputs=[line_ty, line_bytes_ty, np.int32],
+            link_with="combined_gray2rgba_addWeighted.a",
         )
         add_weighted_line = external_func(
             "addWeightedLine",
@@ -62,6 +64,7 @@ def device_body():
                 np.int16,
                 np.int8,
             ],
+            link_with="combined_gray2rgba_addWeighted.a",
         )
 
         # Tile declarations
@@ -227,7 +230,7 @@ def core_body():
                 OF_4to5.release(ObjectFifoPort.Produce, 1)
 
         # Compute tile 5
-        @core(ComputeTile5, "combined_gray2rgba_addWeighted.a")
+        @core(ComputeTile5)
         def core_body():
             for _ in range_(sys.maxsize):
                 elem_in = OF_4to5.acquire(ObjectFifoPort.Consume, 1)
diff --git a/python/dialects/aie.py b/python/dialects/aie.py
index 0472d476db8..43b6c1b26c1 100644
--- a/python/dialects/aie.py
+++ b/python/dialects/aie.py
@@ -292,11 +292,17 @@ class Core(CoreOp):
     def __init__(
         self, tile, link_with=None, dynamic_objfifo_lowering=None, stack_size=None
     ):
+        if link_with is not None:
+            raise TypeError(
+                "Core() no longer accepts link_with. "
+                "Set link_with= on each external_func() declaration instead; "
+                "the aie-assign-core-link-files pass aggregates them onto the core."
+            )
         super().__init__(
             result=T.index(),
             tile=tile,
             stack_size=stack_size,
-            link_with=link_with,
+            link_with=None,
             dynamic_objfifo_lowering=dynamic_objfifo_lowering,
         )
 
diff --git a/python/iron/kernel.py b/python/iron/kernel.py
index dfd38337e3b..d22f220c6ad 100644
--- a/python/iron/kernel.py
+++ b/python/iron/kernel.py
@@ -21,64 +21,104 @@
 
 
 class BaseKernel(Resolvable):
-    """Base class for kernel-like objects that resolve to FuncOp."""
+    """Base class for AIE core functions that resolve to a func.func declaration.
 
-    def __init__(self, name: str, arg_types: list[type[np.ndarray] | np.dtype] = []):
-        """Initialize base kernel.
+    Subclasses:
+        Kernel: wraps a pre-compiled object file.
+        ExternalFunction: compiles C/C++ source at JIT time.
+    """
 
+    def __init__(self, name: str, arg_types: list[type[np.ndarray] | np.dtype] = []):
+        """
         Args:
-            name (str): The name of the function
-            arg_types (list[type[np.ndarray] | np.dtype], optional): The type signature of the function. Defaults to [].
+            name: Symbol name of the function.
+            arg_types: Type signature of the function arguments.  Defaults to [].
         """
         if not name:
-            raise ValueError("The name of a kernel cannot be empty or null.")
+            raise ValueError("Kernel name cannot be empty.")
         self._name = name
         self._arg_types = arg_types
         self._op: FuncOp | None = None
 
+    def tile_size(self, arg_index: int = 0) -> int:
+        """Return the first dimension of the array argument at ``arg_index``.
+
+        Args:
+            arg_index: Index into ``arg_types``.  Defaults to 0.
+        """
+        if not self._arg_types:
+            raise ValueError("No argument types defined.")
+        if arg_index >= len(self._arg_types):
+            raise ValueError(
+                f"Argument index {arg_index} out of range "
+                f"(max: {len(self._arg_types) - 1})"
+            )
+        arg = self._arg_types[arg_index]
+
+        # numpy array type, e.g. np.ndarray[(16,), np.dtype[np.int32]]
+        if hasattr(arg, "__args__") and len(arg.__args__) > 0:
+            shape_arg = arg.__args__[0]
+            if isinstance(shape_arg, tuple) and len(shape_arg) > 0:
+                return shape_arg[0]
+
+        # MLIR MemRefType
+        if hasattr(arg, "shape") and len(arg.shape) > 0:
+            return arg.shape[0]
+
+        raise ValueError(
+            f"Argument {arg_index} does not have a shape or is not an array type."
+        )
+
+    def arg_types(self) -> list:
+        """Return a copy of the argument type list."""
+        return self._arg_types.copy()
+
     def __call__(self, *args, **kwargs):
-        """Call the kernel with the given arguments."""
+        """Emit a func.call to this kernel, validating argument count."""
         if not self._op:
-            raise ValueError("Need to resolve kernel before it can be called")
-        arg_ops = []
-        for a in args:
-            if isinstance(a, Buffer):
-                arg_ops.append(a.op)
-            else:
-                arg_ops.append(a)
+            raise ValueError("Kernel must be resolved before it can be called.")
+        if len(args) != len(self._arg_types):
+            raise ValueError(
+                f"Kernel '{self._name}' expects {len(self._arg_types)} "
+                f"argument(s), but {len(args)} were provided."
+            )
+        arg_ops = [a.op if isinstance(a, Buffer) else a for a in args]
         call(self._op, arg_ops, **kwargs)
 
 
 class Kernel(BaseKernel):
+    """An AIE core function backed by a pre-compiled object file.
+
+    Use :class:`ExternalFunction` instead when you want to compile from
+    C/C++ source at JIT time.
+
+    ``resolve()`` emits a ``func.func private`` declaration with a
+    ``link_with`` attribute naming ``object_file_name``.  The
+    ``aie-assign-core-link-files`` pass propagates this into the CoreOp's
+    ``link_files`` attribute so the linker knows which file to include.
+    """
+
     def __init__(
         self,
         name: str,
-        bin_name: str,
+        object_file_name: str,
         arg_types: list[type[np.ndarray] | np.dtype] = [],
     ) -> None:
-        """An externally pre-compiled AIE core function.
-
-        ``Kernel`` wraps a pre-built object file (``.o``).  Use
-        ``ExternalFunction`` instead when you want to compile from C/C++ source.
-
-        When the kernel is first called inside a core body, ``resolve()`` emits
-        a ``func.func`` declaration with a ``link_with`` attribute naming
-        ``bin_name``; the ``aie-assign-core-link-files`` pass later propagates
-        that into the CoreOp's ``link_files`` attribute for the linker.
-
+        """
         Args:
             name: Symbol name of the function as it appears in the object file.
-            bin_name: Filename of the pre-compiled object file (e.g.,
-                ``"add_one.o"``).  Must be available on the linker search path
+            object_file_name: Filename of the pre-compiled object file
+                (e.g. ``"add_one.o"``).  Must be on the linker search path
                 at compile time.
             arg_types: Type signature of the function arguments.  Defaults to [].
         """
         super().__init__(name, arg_types)
-        self._bin_name = bin_name
+        self._object_file_name = object_file_name
 
     @property
-    def bin_name(self) -> str:
-        return self._bin_name
+    def object_file_name(self) -> str:
+        """Filename of the compiled object file."""
+        return self._object_file_name
 
     def resolve(
         self,
@@ -87,7 +127,7 @@ def resolve(
     ) -> None:
         if not self._op:
             self._op = external_func(
-                self._name, inputs=self._arg_types, link_with=self._bin_name
+                self._name, inputs=self._arg_types, link_with=self._object_file_name
             )
 
 
@@ -100,11 +140,11 @@ class ExternalFunction(Kernel):
     start of each ``@jit`` call to prevent stale registrations from a previous
     (possibly failed) run.
 
-    Use the base ``Kernel`` class instead when you have a pre-built object file.
+    Use the base :class:`Kernel` class instead when you have a pre-built
+    object file.
     """
 
-    _instances: set  # Registry of all live ExternalFunction instances.
-    _instances = set()
+    _instances: set = set()  # Registry of all live ExternalFunction instances.
 
     def __init__(
         self,
@@ -115,127 +155,67 @@ def __init__(
         arg_types: list[type[np.ndarray] | np.dtype] = [],
         include_dirs: list[str] = [],
         compile_flags: list[str] = [],
-        debug: bool = False,
     ) -> None:
         """
         Args:
-            name: Symbol name of the function as it will appear in the object file.
-            object_file_name: Output object file name.  Defaults to ``<name>.o``.
+            name: Symbol name of the function as it will appear in the object
+                file.
+            object_file_name: Output object file name.  Defaults to
+                ``<name>.o``.
             source_file: Path to a C/C++ source file on disk.  Mutually
                 exclusive with ``source_string``.
             source_string: Inline C/C++ source code.  Mutually exclusive with
                 ``source_file``.
-            arg_types: Type signature of the function arguments.  Defaults to [].
+            arg_types: Type signature of the function arguments.  Defaults to
+                [].
             include_dirs: Additional ``-I`` directories passed to the Peano
                 compiler.  Defaults to [].
             compile_flags: Additional flags passed verbatim to the Peano
                 compiler.  Defaults to [].
-            debug: If True, emit debug log messages during construction.
-                Defaults to False.
         """
         if not object_file_name:
             object_file_name = f"{name}.o"
         super().__init__(name, object_file_name, arg_types)
 
-        self._setup_source(source_file, source_string)
-        self._include_dirs = include_dirs
-        self._compile_flags = compile_flags
-        self._compiled = False
-        self._arg_types = arg_types
-        self._op: FuncOp | None = None
-        self._debug = debug
-
-        if self._debug:
-            logger.debug("Initializing ExternalFunction: %s", name)
-            logger.debug("Source file: %s", source_file)
-            logger.debug("Include dirs: %s", include_dirs)
-            logger.debug("Compile flags: %s", compile_flags)
-
-        # Register this instance so the @jit decorator can compile it.
-        ExternalFunction._instances.add(self)
-
-    def _setup_source(self, source_file: str | None, source_string: str | None) -> None:
-        """Set up the source file for compilation."""
         if source_file is not None:
             self._source_file = source_file
             self._source_string = None
-        else:
-            if source_string is None:
-                raise ValueError("source_file or source_string must be provided")
+        elif source_string is not None:
             self._source_file = None
             self._source_string = source_string
+        else:
+            raise ValueError("source_file or source_string must be provided.")
 
-    def __enter__(self):
-        """Support use as a context manager (``with ExternalFunction(...) as f``)."""
-        return self
-
-    def __exit__(self, exc_type, exc_value, traceback):
-        """No cleanup is performed on exit; the context manager is purely syntactic."""
-        pass
-
-    def tile_size(self, arg_index: int = 0) -> int:
-        """Get the tile size from the specified array argument type.
-
-        Args:
-            arg_index (int): Index of the argument to get tile size from. Defaults to 0.
-
-        Returns:
-            int: The tile size (first dimension) of the specified argument.
-        """
-        if not self._arg_types:
-            raise ValueError("No argument types defined")
-        if arg_index >= len(self._arg_types):
-            raise ValueError(
-                f"Argument index {arg_index} out of range (max: {len(self._arg_types) - 1})"
-            )
-
-        arg = self._arg_types[arg_index]
-
-        # Handle numpy array types like np.ndarray[(16,), np.dtype[np.int32]]
-        if hasattr(arg, "__args__") and len(arg.__args__) > 0:
-            # For types like np.ndarray[(16,), np.dtype[np.int32]], the shape is in __args__[0]
-            shape_arg = arg.__args__[0]
-            if isinstance(shape_arg, tuple) and len(shape_arg) > 0:
-                return shape_arg[0]
-
-        # Handle MLIR types like MemRefType(memref<16xi32>)
-        if (
-            hasattr(arg, "shape")
-            and hasattr(arg.shape, "__len__")
-            and len(arg.shape) > 0
-        ):
-            return arg.shape[0]
-
-        raise ValueError(
-            f"Argument {arg_index} does not have a shape or is not an array type"
-        )
+        self._include_dirs = include_dirs
+        self._compile_flags = compile_flags
+        self._compiled = False
 
-    def arg_types(self) -> list:
-        """Get the argument types of the ExternalFunction."""
-        return self._arg_types.copy()
+        # Register this instance so the @jit decorator can compile it.
+        ExternalFunction._instances.add(self)
 
     def __call__(self, *args, **kwargs):
-        """Call the ExternalFunction with argument validation."""
+        """Call with argument count and type validation before emitting MLIR."""
         if len(args) != len(self._arg_types):
             raise ValueError(
-                f"ExternalFunction '{self._name}' expects {len(self._arg_types)} argument(s), "
-                f"but {len(args)} were provided."
+                f"ExternalFunction '{self._name}' expects "
+                f"{len(self._arg_types)} argument(s), but {len(args)} "
+                f"were provided."
             )
         for i, (arg, expected_ty) in enumerate(zip(args, self._arg_types)):
             self._validate_arg(i, arg, expected_ty)
+        # Delegate to BaseKernel for the actual MLIR func.call emission.
+        # BaseKernel also validates count, which is harmless redundancy but
+        # ensures the check holds even if this override is bypassed.
         super().__call__(*args, **kwargs)
 
     def _validate_arg(self, index: int, arg, expected_ty) -> None:
         """Validate a single argument against its expected type."""
-        # Scalar types (np.int32, np.float32, etc.)
         if isinstance(expected_ty, type) and issubclass(expected_ty, np.generic):
             if not isinstance(arg, (int, float, np.integer, np.floating)):
                 raise ValueError(
                     f"Argument {index}: expected scalar, got {type(arg).__name__}"
                 )
             return
-
-        # Array types - check shape and dtype
         if hasattr(expected_ty, "__args__") and hasattr(arg, "shape"):
             expected_shape = expected_ty.__args__[0]
             expected_dtype = expected_ty.__args__[1].__args__[0]
@@ -246,27 +226,18 @@ def _validate_arg(self, index: int, arg, expected_ty) -> None:
                 )
 
     def __hash__(self):
-        """
-        Compute a hash for the ExternalFunction based on its properties.
-        This allows ExternalFunction instances to be used in cache keys.
-        """
-        # Create a string representation of the function's key properties
+        """Hash based on source content and compiler options for cache keying."""
+        # TODO: extend to cover included headers (issue #2543)
         hash_parts = [
             self._name,
             str(self._arg_types),
             str(sorted(self._include_dirs)),
             str(sorted(self._compile_flags)),
         ]
-
-        # Include source content for uniqueness
-        # TODO: This solution needs to be extended to handle headers. See https://github.com/Xilinx/mlir-aie/issues/2543
         if self._source_string:
             hash_parts.append(self._source_string)
         elif self._source_file:
             with open(self._source_file, "r") as f:
-                file_content = f.read()
-            hash_parts.append(file_content)
-
-        # Create hash from combined string
+                hash_parts.append(f.read())
         combined = "|".join(hash_parts)
         return int(hashlib.sha256(combined.encode("utf-8")).hexdigest()[:8], 16)
diff --git a/python/utils/compile/utils.py b/python/utils/compile/utils.py
index f3795b9da1f..a62aa17cfde 100644
--- a/python/utils/compile/utils.py
+++ b/python/utils/compile/utils.py
@@ -163,7 +163,7 @@ def compile_external_kernel(func, kernel_dir, target_arch):
     """
     Compile an ExternalFunction to an object file in the kernel directory.
 
-    The output file is named ``func.bin_name`` and placed in ``kernel_dir``.
+    The output file is named ``func.object_file_name`` and placed in ``kernel_dir``.
     If the object file already exists in ``kernel_dir``, compilation is skipped.
 
     Args:
@@ -179,7 +179,7 @@ def compile_external_kernel(func, kernel_dir, target_arch):
         return
 
     # Skip if the object file already exists (cache hit).
-    output_file = os.path.join(kernel_dir, func.bin_name)
+    output_file = os.path.join(kernel_dir, func.object_file_name)
     if os.path.exists(output_file):
         return
 
diff --git a/python/utils/jit.py b/python/utils/jit.py
index 73bdae2e813..5a90b0cb6c0 100644
--- a/python/utils/jit.py
+++ b/python/utils/jit.py
@@ -19,7 +19,7 @@
 from .compile.cache.utils import _create_function_cache_key, file_lock
 from .compile import NPU_CACHE_HOME
 from .compile.utils import _cleanup_failed_compilation
-from aie.iron.kernel import ExternalFunction
+from aie.iron.kernel import ExternalFunction, Kernel
 
 # Global cache for compiled kernels at the function level
 # Key: (function_name, args_signature) -> NPUKernel instance
@@ -69,11 +69,13 @@ def decorator(*args, **kwargs):
             tensor_args = _filter_tensor_args(args)
             return cached_kernel(*tensor_args, **kwargs)
 
-        # Collect ExternalFunction instances passed as direct arguments first.
+        # Collect ExternalFunction instances that need JIT compilation.
+        # Note: bare Kernel instances (pre-compiled .o) are intentionally
+        # excluded here — they require no compilation step. Both Kernel and
+        # ExternalFunction are stripped from the tensor args passed to the NPU
+        # kernel (see _filter_tensor_args).
         # ExternalFunction.__init__ registers to _instances at construction time
         # (before this JIT call), so they must be captured before the clear below.
-        # Note: ExternalFunction instances nested inside containers are not
-        # collected here; top-level args cover all known call patterns.
         external_kernels = [
             arg for arg in args if isinstance(arg, ExternalFunction)
         ] + [v for v in kwargs.values() if isinstance(v, ExternalFunction)]
@@ -188,19 +190,19 @@ def _filter_tensor_args(args):
     """
     Filter out non-tensor arguments from args.
 
-    Algorithm functions may include ExternalFunction instances and scalar
+    Algorithm functions may include Kernel/ExternalFunction instances and scalar
     compile-time constants in their Python signature that must not be forwarded
     to the NPU kernel as runtime buffer arguments.
 
     Removes:
-    - ExternalFunction instances (resolved at compile time via link_with)
+    - Kernel and ExternalFunction instances (resolved at compile time via link_with)
     - Scalar values (int, float, np.integer, np.floating) used as MLIR constants
     - Callables (e.g. lambda configuration helpers)
     """
     tensor_args = []
     for arg in args:
-        # Skip ExternalFunction
-        if isinstance(arg, ExternalFunction):
+        # Skip any kernel handle (Kernel, ExternalFunction, or subclasses)
+        if isinstance(arg, Kernel):
             continue
         # Skip scalar types (MLIR constants)
         if isinstance(arg, (int, float, np.integer, np.floating)):
diff --git a/test/python/npu-xrt/test_jit_extern_functions.py b/test/python/npu-xrt/test_jit_extern_functions.py
index 049031eb96d..4dac11ed75b 100644
--- a/test/python/npu-xrt/test_jit_extern_functions.py
+++ b/test/python/npu-xrt/test_jit_extern_functions.py
@@ -272,16 +272,14 @@ def test_include_directories():
         # Create a header file
         header_file = os.path.join(temp_dir, "math_ops.h")
         with open(header_file, "w") as f:
-            f.write(
-                """
+            f.write("""
 #ifndef MATH_OPS_H
 #define MATH_OPS_H
 
 #define ADD_VALUE 42
 
 #endif
-"""
-            )
+""")
 
         # Create input and output tensors
         input_tensor = iron.randint(0, 100, (1024,), dtype=np.int32, device="npu")
@@ -415,18 +413,16 @@ def test_caching_same_source():
     np.testing.assert_array_equal(result1, result2)
 
 
-def test_context_manager():
-    """Test ExternalFunction with context manager syntax."""
-    # Create input and output tensors
+def test_inline_source_string():
+    """Test ExternalFunction constructed inline with a source string."""
     input_tensor = iron.randint(0, 100, (1024,), dtype=np.int32, device="npu")
     output_tensor = iron.zeros((1024,), dtype=np.int32, device="npu")
     initial_tensor = input_tensor.numpy().copy()
 
-    # Create ExternalFunction and use it with context manager
-    with ExternalFunction(
-        "add_one_context",
+    add_one = ExternalFunction(
+        "add_one_inline",
         source_string="""extern "C" {
-            void add_one_context(int* input, int* output, int tile_size) {
+            void add_one_inline(int* input, int* output, int tile_size) {
                 for (int i = 0; i < tile_size; i++) {
                     output[i] = input[i] + 1;
                 }
@@ -437,28 +433,23 @@ def test_context_manager():
             np.ndarray[(16,), np.dtype[np.int32]],
             np.int32,
         ],
-    ) as add_one:
-        # Apply the transform
-        transform(input_tensor, output_tensor, add_one)
+    )
+    transform(input_tensor, output_tensor, add_one)
 
-    # Verify results
     expected = initial_tensor + 1
-    actual = output_tensor.numpy()
-    np.testing.assert_array_equal(actual, expected)
+    np.testing.assert_array_equal(output_tensor.numpy(), expected)
 
 
-def test_context_manager_with_compiler_options():
-    """Test ExternalFunction with context manager and compiler options."""
-    # Create input and output tensors
+def test_inline_source_string_with_compiler_options():
+    """Test ExternalFunction constructed inline with compile flags."""
     input_tensor = iron.randint(0, 100, (1024,), dtype=np.int32, device="npu")
     output_tensor = iron.zeros((1024,), dtype=np.int32, device="npu")
     initial_tensor = input_tensor.numpy().copy()
 
-    # Create ExternalFunction with compiler options using context manager
-    with ExternalFunction(
-        "add_value_context",
+    add_value = ExternalFunction(
+        "add_value_inline",
         source_string="""extern "C" {
-            void add_value_context(int* input, int* output, int tile_size) {
+            void add_value_inline(int* input, int* output, int tile_size) {
                 for (int i = 0; i < tile_size; i++) {
                     output[i] = input[i] + ADD_VALUE;
                 }
@@ -470,14 +461,11 @@ def test_context_manager_with_compiler_options():
             np.int32,
         ],
         compile_flags=["-DADD_VALUE=42"],
-    ) as add_value:
-        # Apply the transform
-        transform(input_tensor, output_tensor, add_value)
+    )
+    transform(input_tensor, output_tensor, add_value)
 
-    # Verify results
     expected = initial_tensor + 42
-    actual = output_tensor.numpy()
-    np.testing.assert_array_equal(actual, expected)
+    np.testing.assert_array_equal(output_tensor.numpy(), expected)
 
 
 def test_source_file():

From 038084ed7515001e25fc233e4be50ce9e9e38141 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Tue, 10 Mar 2026 19:09:25 -0600
Subject: [PATCH 23/28] [fix] Update Python tests broken by Core(link_with=...)
 removal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Core() no longer accepts link_with — four Python FileCheck tests used
the old API and failed in CI with TypeError.

npu.py: move link_with from @core(tile, "file.o") onto each
external_func() declaration for all six cores in my_vector_scalar,
my_matmul, and edge_detect.

core_ext_kernel.py, code_region.py: move link_with from Core/core to
external_func(); update FileCheck patterns to match link_with on
func.func rather than on the core block attributes.

aie_ops.py (coreOpParameters): remove link_with from Core() call and
from the expected CHECK pattern — core-level link_with is no longer
supported via the Python API.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 test/python/aie_ops.py         |  4 ++--
 test/python/code_region.py     | 11 +++++++----
 test/python/core_ext_kernel.py |  7 ++++---
 test/python/npu.py             | 32 ++++++++++++++++++++++----------
 4 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/test/python/aie_ops.py b/test/python/aie_ops.py
index 1ec19ad6f00..d01456cb6be 100644
--- a/test/python/aie_ops.py
+++ b/test/python/aie_ops.py
@@ -62,11 +62,11 @@ def coreOp():
 # CHECK: %[[VAL1:.*]] = aie.tile(1, 1)
 # CHECK: %[[VAL2:.*]] = aie.core(%[[VAL1]]) {
 # CHECK:   aie.end
-# CHECK: } {dynamic_objfifo_lowering = false, link_with = "test.elf", stack_size = 2048 : i32}
+# CHECK: } {dynamic_objfifo_lowering = false, stack_size = 2048 : i32}
 @construct_and_print_module
 def coreOpParameters():
     t = tile(col=1, row=1)
-    c = Core(t, link_with="test.elf", dynamic_objfifo_lowering=False, stack_size=2048)
+    c = Core(t, dynamic_objfifo_lowering=False, stack_size=2048)
     bb = Block.create_at_start(c.body)
     with InsertionPoint(bb):
         end()
diff --git a/test/python/code_region.py b/test/python/code_region.py
index 2a2786e6986..72740978efd 100644
--- a/test/python/code_region.py
+++ b/test/python/code_region.py
@@ -20,7 +20,7 @@
 
 # CHECK:  module {
 # CHECK:    aie.device(xcve2802) {
-# CHECK:      func.func private @test_func(memref<8x8xi32>) -> i32
+# CHECK:      func.func private @test_func(memref<8x8xi32>) -> i32 attributes {link_with = "test.o"}
 # CHECK:      %{{.*}}tile_0_2 = aie.tile(0, 2)
 # CHECK:      %{{.*}}tile_1_2 = aie.tile(1, 2)
 # CHECK:      %{{.*}}tile_3_3 = aie.tile(3, 3)
@@ -38,7 +38,7 @@
 # CHECK:          aie.objectfifo.release @of1(Consume, 1)
 # CHECK:        }
 # CHECK:        aie.end
-# CHECK:      } {link_with = "test.o"}
+# CHECK:      }
 # CHECK:    }
 # CHECK:  }
 @construct_and_print_module
@@ -46,7 +46,10 @@ def codeRegion():
     @device(AIEDevice.xcve2802)
     def device_body():
         test_func = external_func(
-            "test_func", inputs=[T.memref(8, 8, T.i32())], outputs=[np.int32]
+            "test_func",
+            inputs=[T.memref(8, 8, T.i32())],
+            outputs=[np.int32],
+            link_with="test.o",
         )
 
         S = tile(0, 2)
@@ -57,7 +60,7 @@ def device_body():
         of1 = object_fifo("of1", M, N, 2, T.memref(8, 8, T.i32()))
         object_fifo_link(of0, of1)
 
-        @core(N, "test.o")
+        @core(N)
         def core_body():
             for _ in range_(10):
                 elem0 = of1.acquire(ObjectFifoPort.Consume, 1)
diff --git a/test/python/core_ext_kernel.py b/test/python/core_ext_kernel.py
index 231774ea9e0..3b3dc51ade6 100644
--- a/test/python/core_ext_kernel.py
+++ b/test/python/core_ext_kernel.py
@@ -23,7 +23,7 @@
 
 # CHECK:  module {
 # CHECK:    aie.device(xcve2802) {
-# CHECK:      func.func private @test_func(memref<8x8xi32>, i32) -> i32
+# CHECK:      func.func private @test_func(memref<8x8xi32>, i32) -> i32 attributes {link_with = "test.o"}
 # CHECK:      %{{.*}}tile_0_2 = aie.tile(0, 2)
 # CHECK:      %{{.*}}tile_1_2 = aie.tile(1, 2)
 # CHECK:      %{{.*}}tile_3_3 = aie.tile(3, 3)
@@ -42,7 +42,7 @@
 # CHECK:          aie.objectfifo.release @of1(Consume, 1)
 # CHECK:        }
 # CHECK:        aie.end
-# CHECK:      } {link_with = "test.o"}
+# CHECK:      }
 # CHECK:    }
 # CHECK:  }
 @construct_and_print_module
@@ -54,6 +54,7 @@ def core_ext_kernel():
             "test_func",
             inputs=[np.ndarray[(8, 8), np.dtype[np.int32]], np.int32],
             outputs=[T.i32()],
+            link_with="test.o",
         )
 
         S = tile(0, 2)
@@ -64,7 +65,7 @@ def core_ext_kernel():
         of1 = object_fifo("of1", M, N, 2, T.memref(8, 8, T.i32()))
         object_fifo_link(of0, of1)
 
-        C = Core(N, "test.o")
+        C = Core(N)
         bb = Block.create_at_start(C.body)
         with InsertionPoint(bb):
             for _ in range_(10):
diff --git a/test/python/npu.py b/test/python/npu.py
index 666206aa21e..b0e3ac6045a 100644
--- a/test/python/npu.py
+++ b/test/python/npu.py
@@ -46,7 +46,9 @@ def my_vector_scalar(module):
     def device_body():
         n_ty = np.ndarray[(n,), np.dtype[np.int32]]
         N_ty = np.ndarray[(N,), np.dtype[np.int32]]
-        scale_int32 = external_func("scale_int32", inputs=[n_ty, n_ty])
+        scale_int32 = external_func(
+            "scale_int32", inputs=[n_ty, n_ty], link_with="scale.o"
+        )
 
         S = tile(0, 0)
         M = tile(0, 2)
@@ -54,7 +56,7 @@ def device_body():
         of_in = object_fifo("in", S, M, buffer_depth, n_ty)
         of_out = object_fifo("out", M, S, buffer_depth, n_ty)
 
-        @core(M, "scale.o")
+        @core(M)
         def core_body():
             # Effective while(1)
             for _ in range_(0xFFFFFFFF):
@@ -117,7 +119,9 @@ def my_matmul(module):
     def device_body():
         func_type = "" if vectorized else "scalar_"
         zero = external_func(
-            f"zero_{func_type}i16", inputs=[np.ndarray[(m, n), np.dtype[np.int16]]]
+            f"zero_{func_type}i16",
+            inputs=[np.ndarray[(m, n), np.dtype[np.int16]]],
+            link_with="mm.o",
         )
         matmul = external_func(
             f"matmul_{func_type}i16_i16",
@@ -126,6 +130,7 @@ def device_body():
                 np.ndarray[(k, n), np.dtype[np.int16]],
                 np.ndarray[(m, n), np.dtype[np.int16]],
             ],
+            link_with="mm.o",
         )
 
         S = tile(0, 0)
@@ -135,7 +140,7 @@ def device_body():
         of_inB = object_fifo("inB", S, M, 2, np.ndarray[(k, n), np.dtype[np.int16]])
         of_outC = object_fifo("outC", M, S, 2, np.ndarray[(m, n), np.dtype[np.int16]])
 
-        @core(M, "mm.o")
+        @core(M)
         def core_body():
             for _ in range_(0xFFFFFFFF):
                 for _ in range_(tiles):
@@ -211,7 +216,9 @@ def device_body():
         vec64_ty = np.ndarray[(64,), np.dtype[np.uint8]]
         vec256_ty = np.ndarray[(256,), np.dtype[np.uint8]]
         rgba2gray_line = external_func(
-            "rgba2gray_line", inputs=[vec256_ty, vec64_ty, np.int32]
+            "rgba2gray_line",
+            inputs=[vec256_ty, vec64_ty, np.int32],
+            link_with="rgba2gray.cc.o",
         )
         filter2d_line = external_func(
             "filter2d_line",
@@ -223,6 +230,7 @@ def device_body():
                 np.int32,
                 np.ndarray[(3, 3), np.dtype[np.int16]],
             ],
+            link_with="filter2d.cc.o",
         )
         threshold_line = external_func(
             "threshold_line",
@@ -234,9 +242,12 @@ def device_body():
                 np.int16,
                 np.int8,
             ],
+            link_with="threshold.cc.o",
         )
         gray2rgba_line = external_func(
-            "gray2rgba_line", inputs=[vec64_ty, vec256_ty, np.int32]
+            "gray2rgba_line",
+            inputs=[vec64_ty, vec256_ty, np.int32],
+            link_with="combined_gray2rgba_addWeighted.a",
         )
         add_weighted_line = external_func(
             "add_weighted_line",
@@ -249,6 +260,7 @@ def device_body():
                 np.int16,
                 np.int8,
             ],
+            link_with="combined_gray2rgba_addWeighted.a",
         )
 
         S = tile(0, 0)
@@ -271,7 +283,7 @@ def device_body():
         OF_4to5 = object_fifo("OF_4to5", T4, T5, 2, vec64_ty)
         OF_5to5 = object_fifo("OF_5to5", T5, T5, 1, vec256_ty)
 
-        @core(T2, "rgba2gray.cc.o")
+        @core(T2)
         def core_body():
             for _ in range_(36):
                 elem_in = inOF_L2L1.acquire(ObjectFifoPort.Consume, 1)
@@ -282,7 +294,7 @@ def core_body():
                 inOF_L2L1.release(ObjectFifoPort.Consume, 1)
                 OF_2to3.release(ObjectFifoPort.Produce, 1)
 
-        @core(T3, "filter2d.cc.o")
+        @core(T3)
         def core_body():
             kernel = memref.alloc((3, 3), T.i16())
             v0 = 0
@@ -335,7 +347,7 @@ def core_body():
             OF_2to3.release(ObjectFifoPort.Consume, 2)
             OF_3to4.release(ObjectFifoPort.Produce, 1)
 
-        @core(T4, "threshold.cc.o")
+        @core(T4)
         def core_body():
             v_thr = 10
             v_max = 255
@@ -348,7 +360,7 @@ def core_body():
                 OF_3to4.release(ObjectFifoPort.Consume, 1)
                 OF_4to5.release(ObjectFifoPort.Produce, 1)
 
-        @core(T5, "combined_gray2rgba_addWeighted.a")
+        @core(T5)
         def core_body():
             for _ in range_(36):
                 elem_in = OF_4to5.acquire(ObjectFifoPort.Consume, 1)

From a61dee4f0305cc5de1a0384ec26e82aa18092422 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Tue, 10 Mar 2026 19:31:16 -0600
Subject: [PATCH 24/28] [fix] Migrate remaining tests from deprecated
 Core(link_with=...) API

trace_utils.py: missed in the previous CI-fix commit.

test/npu-xrt/*/aie2.py: 9 hardware tests using @core(tile, "archive")
are broken since Core() no longer accepts link_with. Migrate all of them
to link_with on the external_func() declaration.

aie_ops.py: remove fragile inline TypeError assertion; add a standalone
test/python/core_link_with_removed.py that correctly tests the TypeError
using pytest.raises without risking crashing the FileCheck script.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../bd_chain_repeat_on_memtile/aie2.py        |  6 ++--
 .../dynamic_object_fifo/nested_loops/aie2.py  |  6 ++--
 .../dynamic_object_fifo/ping_pong/aie2.py     |  6 ++--
 .../dynamic_object_fifo/reduction/aie2.py     |  6 ++--
 .../sliding_window/aie2.py                    |  6 ++--
 .../sliding_window_conditional/aie2.py        |  6 ++--
 .../two_core_sliding_window/aie2.py           | 12 +++++---
 test/npu-xrt/matrix_transpose/aie2.py         |  6 ++--
 test/npu-xrt/nd_memcpy_transforms/aie2.py     |  3 +-
 test/python/core_link_with_removed.py         | 29 +++++++++++++++++++
 test/python/trace_utils.py                    |  6 ++--
 11 files changed, 71 insertions(+), 21 deletions(-)
 create mode 100644 test/python/core_link_with_removed.py

diff --git a/test/npu-xrt/bd_chain_repeat_on_memtile/aie2.py b/test/npu-xrt/bd_chain_repeat_on_memtile/aie2.py
index 4eaf1db77e6..3dc23da6f0b 100644
--- a/test/npu-xrt/bd_chain_repeat_on_memtile/aie2.py
+++ b/test/npu-xrt/bd_chain_repeat_on_memtile/aie2.py
@@ -60,7 +60,9 @@ def device_body():
 
         # AIE Core Function declarations
         passThroughLine = external_func(
-            "passThroughLine", inputs=[core_chunk_ty, core_chunk_ty, np.int32]
+            "passThroughLine",
+            inputs=[core_chunk_ty, core_chunk_ty, np.int32],
+            link_with="kernel.cc.o",
         )
 
         ShimTile = tile(0, 0)
@@ -120,7 +122,7 @@ def device_body():
         for i, compute_tile in enumerate(compute_tiles):
 
             def make_core_fn(idx):
-                @core(compute_tile, "kernel.cc.o")
+                @core(compute_tile)
                 def core_body():
                     for _ in range_(sys.maxsize):
                         elemOut = of_join[idx].acquire(ObjectFifoPort.Produce, 1)
diff --git a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
index 4fb63781d22..2221a7bf392 100644
--- a/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/nested_loops/aie2.py
@@ -44,11 +44,13 @@ def device_body():
 
             # AIE Core Function declarations
             passthrough_10_i32 = external_func(
-                "passthrough_10_i32", inputs=[tensor_ty, tensor_ty]
+                "passthrough_10_i32",
+                inputs=[tensor_ty, tensor_ty],
+                link_with="kernel.o",
             )
 
             # Set up compute tiles
-            @core(ComputeTile, "kernel.o")
+            @core(ComputeTile)
             def core_body():
                 for _ in range_(5):
                     elemIn = of_in.acquire(ObjectFifoPort.Consume, 1)
diff --git a/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py b/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py
index f79875a017e..299370178c0 100644
--- a/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/ping_pong/aie2.py
@@ -42,11 +42,13 @@ def device_body():
 
             # AIE Core Function declarations
             passthrough_64_i32 = external_func(
-                "passthrough_64_i32", inputs=[tensor_ty, tensor_ty]
+                "passthrough_64_i32",
+                inputs=[tensor_ty, tensor_ty],
+                link_with="kernel.o",
             )
 
             # Set up compute tiles
-            @core(ComputeTile, "kernel.o")
+            @core(ComputeTile)
             def core_body():
                 for _ in range_(sys.maxsize):
                     elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
diff --git a/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py b/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py
index 2b9e77194bf..ffd36c9f6aa 100644
--- a/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/reduction/aie2.py
@@ -43,10 +43,12 @@ def device_body():
             of_out = object_fifo("out", ComputeTile, ShimTile, 2, tile_ty)
 
             # AIE Core Function declarations
-            add_10_i32 = external_func("add_10_i32", inputs=[tile_ty, tile_ty, tile_ty])
+            add_10_i32 = external_func(
+                "add_10_i32", inputs=[tile_ty, tile_ty, tile_ty], link_with="kernel.o"
+            )
 
             # Set up compute tiles
-            @core(ComputeTile, "kernel.o")
+            @core(ComputeTile)
             def core_body():
                 for _ in range_(sys.maxsize):
                     elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
index 68e7f3bcb1e..79709877940 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window/aie2.py
@@ -41,12 +41,14 @@ def device_body():
 
             # AIE Core Function declarations
             add_10_i32 = external_func(
-                "add_10_i32", inputs=[memRef_ty, memRef_ty, memRef_ty]
+                "add_10_i32",
+                inputs=[memRef_ty, memRef_ty, memRef_ty],
+                link_with="kernel.o",
             )
 
             # Set up compute tiles
 
-            @core(ComputeTile, "kernel.o")
+            @core(ComputeTile)
             def core_body():
                 elemOutPre = of_out.acquire(ObjectFifoPort.Produce, 1)
                 elemInPre = of_in.acquire(ObjectFifoPort.Consume, 1)
diff --git a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
index 001752be2ad..0e43a7b0b99 100644
--- a/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/sliding_window_conditional/aie2.py
@@ -45,11 +45,13 @@ def device_body():
 
             # AIE Core Function declarations
             add_10_i32 = external_func(
-                "add_10_i32", inputs=[subtensor_ty, subtensor_ty, subtensor_ty]
+                "add_10_i32",
+                inputs=[subtensor_ty, subtensor_ty, subtensor_ty],
+                link_with="kernel.o",
             )
 
             # Set up compute tiles
-            @core(ComputeTile, "kernel.o")
+            @core(ComputeTile)
             def core_body():
                 for i in range_(10):
                     elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)
diff --git a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
index 5e75e91291f..72fa89fe959 100644
--- a/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
+++ b/test/npu-xrt/dynamic_object_fifo/two_core_sliding_window/aie2.py
@@ -45,15 +45,19 @@ def device_body():
 
             # AIE Core Function declarations
             passthrough_10_i32 = external_func(
-                "passthrough_10_i32", inputs=[subtensor_ty, subtensor_ty]
+                "passthrough_10_i32",
+                inputs=[subtensor_ty, subtensor_ty],
+                link_with="kernel.o",
             )
             add_10_i32 = external_func(
-                "add_10_i32", inputs=[subtensor_ty, subtensor_ty, subtensor_ty]
+                "add_10_i32",
+                inputs=[subtensor_ty, subtensor_ty, subtensor_ty],
+                link_with="kernel.o",
             )
 
             # Set up compute tiles
 
-            @core(ComputeTile, "kernel.o")
+            @core(ComputeTile)
             def core_body():
                 for _ in range_(10):
                     elemOut = of_in2.acquire(ObjectFifoPort.Produce, 1)
@@ -62,7 +66,7 @@ def core_body():
                     of_in.release(ObjectFifoPort.Consume, 1)
                     of_in2.release(ObjectFifoPort.Produce, 1)
 
-            @core(ComputeTile2, "kernel.o")
+            @core(ComputeTile2)
             def core_body():
                 elemOutPre = of_out.acquire(ObjectFifoPort.Produce, 1)
                 elemInPre = of_in2.acquire(ObjectFifoPort.Consume, 1)
diff --git a/test/npu-xrt/matrix_transpose/aie2.py b/test/npu-xrt/matrix_transpose/aie2.py
index 455aa3fb73e..9b10709adb3 100644
--- a/test/npu-xrt/matrix_transpose/aie2.py
+++ b/test/npu-xrt/matrix_transpose/aie2.py
@@ -33,7 +33,9 @@ def device_body():
             matrix_ty = np.ndarray[(matrix_size,), np.dtype[np.int32]]
 
             passthrough_func = external_func(
-                "passthrough", inputs=[matrix_ty, matrix_ty, np.int32]
+                "passthrough",
+                inputs=[matrix_ty, matrix_ty, np.int32],
+                link_with="kernel.o",
             )
 
             # Tile declarations as tile[row][col]
@@ -46,7 +48,7 @@ def device_body():
             fifo_out = object_fifo("fifo_out", tiles[2][0], tiles[0][0], 2, matrix_ty)
 
             # Core
-            @core(tiles[2][0], "kernel.o")
+            @core(tiles[2][0])
             def core_body():
                 for _ in range_(0, 0xFFFFFFFF):
                     elem_in = fifo_in.acquire(ObjectFifoPort.Consume, 1)
diff --git a/test/npu-xrt/nd_memcpy_transforms/aie2.py b/test/npu-xrt/nd_memcpy_transforms/aie2.py
index 462d4a7e7f3..4c7e14142d8 100644
--- a/test/npu-xrt/nd_memcpy_transforms/aie2.py
+++ b/test/npu-xrt/nd_memcpy_transforms/aie2.py
@@ -39,6 +39,7 @@ def device_body():
             concat_func = external_func(
                 "concat",
                 inputs=[a_ty, b_ty, c_ty, np.int32, np.int32, np.int32],
+                link_with="kernel.o",
             )
 
             # Tile declarations as tile[row][col]
@@ -52,7 +53,7 @@ def device_body():
             fifo_c = object_fifo("fifo_c", tiles[2][0], tiles[0][0], 2, c_ty)
 
             # Core
-            @core(tiles[2][0], "kernel.o")
+            @core(tiles[2][0])
             def core_body():
                 for _ in range_(0, 0xFFFFFFFF):
                     elem_c = fifo_c.acquire(ObjectFifoPort.Produce, 1)
diff --git a/test/python/core_link_with_removed.py b/test/python/core_link_with_removed.py
new file mode 100644
index 00000000000..1632bcff1f9
--- /dev/null
+++ b/test/python/core_link_with_removed.py
@@ -0,0 +1,29 @@
+# Copyright (C) 2026, Advanced Micro Devices, Inc.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Verify that Core() raises TypeError when link_with is passed.
+# link_with must be set on external_func() declarations instead.
+
+# RUN: %python %s
+
+import pytest
+from aie.dialects.aie import AIEDevice, Core, Device, end, tile
+from aie.ir import Block, InsertionPoint
+from aie.extras.context import mlir_mod_ctx
+
+
+def _make_core_with_link_with():
+    with mlir_mod_ctx():
+        dev = Device(AIEDevice.npu1_1col)
+        dev_block = Block.create_at_start(dev.body_region)
+        with InsertionPoint(dev_block):
+            t = tile(col=0, row=2)
+            Core(t, link_with="test.o")
+
+
+# Core(link_with=...) must raise TypeError with a message directing users
+# to external_func().
+with pytest.raises(TypeError, match="link_with"):
+    _make_core_with_link_with()
+
+print("PASS: Core(link_with=...) correctly raises TypeError")
diff --git a/test/python/trace_utils.py b/test/python/trace_utils.py
index 33d1c348de6..9b690d32123 100644
--- a/test/python/trace_utils.py
+++ b/test/python/trace_utils.py
@@ -49,7 +49,9 @@ def device_body():
 
             # AIE Core Function declarations
             passThroughLine = external_func(
-                "passThroughLine", inputs=[memRef_ty, memRef_ty, T.i32()]
+                "passThroughLine",
+                inputs=[memRef_ty, memRef_ty, T.i32()],
+                link_with="passThrough.cc.o",
             )
 
             # Tile declarations
@@ -66,7 +68,7 @@ def device_body():
             # Set up compute tiles
 
             # Compute tile 2
-            @core(ComputeTile2, "passThrough.cc.o")
+            @core(ComputeTile2)
             def core_body():
                 for _ in range_(sys.maxsize):
                     elemOut = of_out.acquire(ObjectFifoPort.Produce, 1)

From c9b081b4da9b5dfe7a501022e6768cccf7484eae Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Wed, 11 Mar 2026 09:10:03 -0600
Subject: [PATCH 25/28] [fix] Fix self-loop ObjectFifo in
 test_jit_two_extern_functions

The intermediate 'tmp' ObjectFifo had a producer endpoint (the worker)
but no consumer endpoint registered, causing a ValueError during
placement. AIE hardware also does not support a tile DMA-ing to itself.

Replace the three-fifo pipeline with a two-fifo design: the core
acquires of_in and of_out together, applies add_fn into of_out, then
scale_fn in-place on of_out. The computed result is unchanged:
output[i] = (input[i] + 1) * 2.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .../npu-xrt/test_jit_two_extern_functions.py   | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/test/python/npu-xrt/test_jit_two_extern_functions.py b/test/python/npu-xrt/test_jit_two_extern_functions.py
index 52ab992aa90..6da87b52add 100644
--- a/test/python/npu-xrt/test_jit_two_extern_functions.py
+++ b/test/python/npu-xrt/test_jit_two_extern_functions.py
@@ -40,26 +40,22 @@ def add_then_scale(input, output, add_func, scale_func):
     tile_ty = np.ndarray[(tile_size,), np.dtype[dtype]]
 
     of_in = ObjectFifo(tile_ty, name="in")
-    of_tmp = ObjectFifo(tile_ty, name="tmp")
     of_out = ObjectFifo(tile_ty, name="out")
 
-    def core_body(of_in, of_tmp, of_out, add_fn, scale_fn):
+    def core_body(of_in, of_out, add_fn, scale_fn):
         for _ in range_(num_tiles):
             elem_in = of_in.acquire(1)
-            elem_tmp = of_tmp.acquire(1)
-            add_fn(elem_in, elem_tmp, tile_size)
-            of_in.release(1)
-            of_tmp.release(1)
-
-            elem_tmp2 = of_tmp.acquire(1)
             elem_out = of_out.acquire(1)
-            scale_fn(elem_tmp2, elem_out, tile_size)
-            of_tmp.release(1)
+            # Apply add_fn first, writing result into elem_out as a temporary,
+            # then apply scale_fn in-place on elem_out.
+            add_fn(elem_in, elem_out, tile_size)
+            scale_fn(elem_out, elem_out, tile_size)
+            of_in.release(1)
             of_out.release(1)
 
     worker = Worker(
         core_body,
-        fn_args=[of_in.cons(), of_tmp.prod(), of_out.prod(), add_func, scale_func],
+        fn_args=[of_in.cons(), of_out.prod(), add_func, scale_func],
     )
 
     rt = Runtime()

From 64c8fb6827c3282b68bc69d472476a543ffdc78a Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Wed, 11 Mar 2026 14:02:32 -0600
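Subject: [PATCH 26/28] Update lit_config_helpers.py

Remove the "Krackan 1" model string from the npu2 entry of
LitConfigHelper.NPU_MODELS; "Krackan" remains in the list.
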
---
 python/aie_lit_utils/lit_config_helpers.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/aie_lit_utils/lit_config_helpers.py b/python/aie_lit_utils/lit_config_helpers.py
index 62945e58a4a..98ab9880afb 100644
--- a/python/aie_lit_utils/lit_config_helpers.py
+++ b/python/aie_lit_utils/lit_config_helpers.py
@@ -64,7 +64,7 @@ class LitConfigHelper:
     # Maps generation name to list of model strings that may appear in xrt-smi
     NPU_MODELS = {
         "npu1": ["npu1", "Phoenix"],
-        "npu2": ["npu4", "Strix", "npu5", "Strix Halo", "npu6", "Krackan", "Krackan 1"],
+        "npu2": ["npu4", "Strix", "npu5", "Strix Halo", "npu6", "Krackan"],
     }
 
     @staticmethod

From f98478e26c39ef674442cb30e78db0e576672c1f Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Wed, 11 Mar 2026 15:55:01 -0600
Subject: [PATCH 27/28] [examples] Replace artificial archives with individual
 .o files in vision examples

edge_detect and color_detect were packing multiple kernel object files
into combined archives (combined_gray2rgba_addWeighted.a,
combined_bitwiseOR_gray2rgba_bitwiseAND.a) solely because the old
aie.core link_with attribute accepted only one file per core.

Now that link_with is declared per external_func and
aie-assign-core-link-files aggregates and deduplicates the per-function
object files into a per-core link_files list, each external_func
declaration can point directly to the .o that implements it:

- edge_detect: gray2rgbaLine -> gray2rgba.cc.o
               addWeightedLine -> addWeighted.cc.o
- color_detect: bitwiseORLine -> bitwiseOR.cc.o
                gray2rgbaLine -> gray2rgba.cc.o
                bitwiseANDLine -> bitwiseAND.cc.o

Remove the ar archive build rules from both Makefiles and update the
xclbin dependencies accordingly. Update the Iron Kernel() calls in the
non-placed variants and the edge_detect test in test/python/npu.py.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 programming_examples/vision/color_detect/Makefile           | 6 +-----
 programming_examples/vision/color_detect/color_detect.py    | 6 +++---
 .../vision/color_detect/color_detect_placed.py              | 6 +++---
 programming_examples/vision/edge_detect/Makefile            | 6 +-----
 programming_examples/vision/edge_detect/edge_detect.py      | 4 ++--
 .../vision/edge_detect/edge_detect_placed.py                | 4 ++--
 test/python/npu.py                                          | 4 ++--
 7 files changed, 14 insertions(+), 22 deletions(-)

diff --git a/programming_examples/vision/color_detect/Makefile b/programming_examples/vision/color_detect/Makefile
index 70cd174c1b8..bd507ee9c21 100755
--- a/programming_examples/vision/color_detect/Makefile
+++ b/programming_examples/vision/color_detect/Makefile
@@ -45,15 +45,11 @@ else
 	echo "Device type not supported"
 endif
 
-build/combined_bitwiseOR_gray2rgba_bitwiseAND.a: build/bitwiseOR.cc.o build/gray2rgba.cc.o build/bitwiseAND.cc.o
-	mkdir -p ${@D}
-	ar rvs $@ $< $(word 2,$^) $(word 3,$^)
-
 build/aie2_lineBased_8b_${COLORDETECT_WIDTH}.mlir: ${srcdir}/${aie_py_src}
 	mkdir -p ${@D}
 	python3 $< ${device} ${COLORDETECT_WIDTH} ${COLORDETECT_HEIGHT} > $@
 
-build/final_${COLORDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${COLORDETECT_WIDTH}.mlir build/rgba2hue.cc.o build/threshold.cc.o build/combined_bitwiseOR_gray2rgba_bitwiseAND.a
+build/final_${COLORDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${COLORDETECT_WIDTH}.mlir build/rgba2hue.cc.o build/threshold.cc.o build/bitwiseOR.cc.o build/gray2rgba.cc.o build/bitwiseAND.cc.o
 	mkdir -p ${@D}
 ifeq ($(device),npu)
 	cd ${@D} && aiecc.py --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --alloc-scheme=basic-sequential \
diff --git a/programming_examples/vision/color_detect/color_detect.py b/programming_examples/vision/color_detect/color_detect.py
index f98cd8b59c6..0b462ca7735 100644
--- a/programming_examples/vision/color_detect/color_detect.py
+++ b/programming_examples/vision/color_detect/color_detect.py
@@ -36,17 +36,17 @@ def color_detect(dev, width, height):
     )
     bitwiseORLine = Kernel(
         "bitwiseORLine",
-        "combined_bitwiseOR_gray2rgba_bitwiseAND.a",
+        "bitwiseOR.cc.o",
         [line_ty, line_ty, line_ty, np.int32],
     )
     gray2rgbaLine = Kernel(
         "gray2rgbaLine",
-        "combined_bitwiseOR_gray2rgba_bitwiseAND.a",
+        "gray2rgba.cc.o",
         [line_ty, line_bytes_ty, np.int32],
     )
     bitwiseANDLine = Kernel(
         "bitwiseANDLine",
-        "combined_bitwiseOR_gray2rgba_bitwiseAND.a",
+        "bitwiseAND.cc.o",
         [line_bytes_ty, line_bytes_ty, line_bytes_ty, np.int32],
     )
 
diff --git a/programming_examples/vision/color_detect/color_detect_placed.py b/programming_examples/vision/color_detect/color_detect_placed.py
index 16147571f9e..fb6844e7dc3 100644
--- a/programming_examples/vision/color_detect/color_detect_placed.py
+++ b/programming_examples/vision/color_detect/color_detect_placed.py
@@ -49,17 +49,17 @@ def deviceBody():
         bitwiseORLine = external_func(
             "bitwiseORLine",
             inputs=[line_ty, line_ty, line_ty, np.int32],
-            link_with="combined_bitwiseOR_gray2rgba_bitwiseAND.a",
+            link_with="bitwiseOR.cc.o",
         )
         gray2rgbaLine = external_func(
             "gray2rgbaLine",
             inputs=[line_ty, line_bytes_ty, np.int32],
-            link_with="combined_bitwiseOR_gray2rgba_bitwiseAND.a",
+            link_with="gray2rgba.cc.o",
         )
         bitwiseANDLine = external_func(
             "bitwiseANDLine",
             inputs=[line_bytes_ty, line_bytes_ty, line_bytes_ty, np.int32],
-            link_with="combined_bitwiseOR_gray2rgba_bitwiseAND.a",
+            link_with="bitwiseAND.cc.o",
         )
 
         # Tile declarations
diff --git a/programming_examples/vision/edge_detect/Makefile b/programming_examples/vision/edge_detect/Makefile
index 0729cbb2da8..a62c592e114 100755
--- a/programming_examples/vision/edge_detect/Makefile
+++ b/programming_examples/vision/edge_detect/Makefile
@@ -52,15 +52,11 @@ else
 	echo "Device type not supported"
 endif
 
-build/combined_gray2rgba_addWeighted.a: build/gray2rgba.cc.o build/addWeighted.cc.o
-	mkdir -p ${@D}
-	ar rvs $@ $< $(word 2,$^)
-
 build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir: ${srcdir}/${aie_py_src}
 	mkdir -p ${@D}
 	python3 $< ${device} ${EDGEDETECT_WIDTH} ${EDGEDETECT_HEIGHT} > $@
 
-build/final_${EDGEDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir build/rgba2gray.cc.o build/gray2rgba.cc.o build/filter2d.cc.o build/threshold.cc.o build/addWeighted.cc.o build/combined_gray2rgba_addWeighted.a
+build/final_${EDGEDETECT_WIDTH}.xclbin: build/aie2_lineBased_8b_${EDGEDETECT_WIDTH}.mlir build/rgba2gray.cc.o build/gray2rgba.cc.o build/filter2d.cc.o build/threshold.cc.o build/addWeighted.cc.o
 	mkdir -p ${@D}
 	cd ${@D} && aiecc.py --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --alloc-scheme=basic-sequential \
 		--no-xchesscc --no-xbridge \
diff --git a/programming_examples/vision/edge_detect/edge_detect.py b/programming_examples/vision/edge_detect/edge_detect.py
index d6e2154c4a8..b6d3c189093 100644
--- a/programming_examples/vision/edge_detect/edge_detect.py
+++ b/programming_examples/vision/edge_detect/edge_detect.py
@@ -42,12 +42,12 @@ def edge_detect(dev, width, height):
     )
     gray2rgba_line_kernel = Kernel(
         "gray2rgbaLine",
-        "combined_gray2rgba_addWeighted.a",
+        "gray2rgba.cc.o",
         [line_ty, line_bytes_ty, np.int32],
     )
     add_weighted_line_kernel = Kernel(
         "addWeightedLine",
-        "combined_gray2rgba_addWeighted.a",
+        "addWeighted.cc.o",
         [
             line_bytes_ty,
             line_bytes_ty,
diff --git a/programming_examples/vision/edge_detect/edge_detect_placed.py b/programming_examples/vision/edge_detect/edge_detect_placed.py
index afe074fa2c7..813e9e40fb6 100644
--- a/programming_examples/vision/edge_detect/edge_detect_placed.py
+++ b/programming_examples/vision/edge_detect/edge_detect_placed.py
@@ -51,7 +51,7 @@ def device_body():
         gray2rgba_line = external_func(
             "gray2rgbaLine",
             inputs=[line_ty, line_bytes_ty, np.int32],
-            link_with="combined_gray2rgba_addWeighted.a",
+            link_with="gray2rgba.cc.o",
         )
         add_weighted_line = external_func(
             "addWeightedLine",
@@ -64,7 +64,7 @@ def device_body():
                 np.int16,
                 np.int8,
             ],
-            link_with="combined_gray2rgba_addWeighted.a",
+            link_with="addWeighted.cc.o",
         )
 
         # Tile declarations
diff --git a/test/python/npu.py b/test/python/npu.py
index b0e3ac6045a..32e8d773c49 100644
--- a/test/python/npu.py
+++ b/test/python/npu.py
@@ -247,7 +247,7 @@ def device_body():
         gray2rgba_line = external_func(
             "gray2rgba_line",
             inputs=[vec64_ty, vec256_ty, np.int32],
-            link_with="combined_gray2rgba_addWeighted.a",
+            link_with="gray2rgba.cc.o",
         )
         add_weighted_line = external_func(
             "add_weighted_line",
@@ -260,7 +260,7 @@ def device_body():
                 np.int16,
                 np.int8,
             ],
-            link_with="combined_gray2rgba_addWeighted.a",
+            link_with="addWeighted.cc.o",
         )
 
         S = tile(0, 0)

From ddddb0d07431ea0f5310e237f1043ee964968149 Mon Sep 17 00:00:00 2001
From: Erika Hunhoff <erika.hunhoff@amd.com>
Date: Wed, 11 Mar 2026 16:01:48 -0600
Subject: [PATCH 28/28] [audit] Tighten kernel.py and compile_mlir_module
 comments

kernel.py:
- Remove the "harmless redundancy" comment from
  ExternalFunction.__call__. The repeated arg-count check needs no
  justification: BaseKernel validates the count and ExternalFunction
  additionally validates types, so the super() call naturally
  re-checks the count.

compile/utils.py:
- Clarify compile_mlir_module comment: the MLIR write in the work_dir
  branch is intentional (and idempotent when jit.py pre-writes the same
  file) because the C++ aiecc binary needs the MLIR on disk to resolve
  relative link_with paths against the compilation directory.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 python/iron/kernel.py         |  3 ---
 python/utils/compile/utils.py | 10 ++++++----
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/python/iron/kernel.py b/python/iron/kernel.py
index d22f220c6ad..ec7c8183d69 100644
--- a/python/iron/kernel.py
+++ b/python/iron/kernel.py
@@ -203,9 +203,6 @@ def __call__(self, *args, **kwargs):
             )
         for i, (arg, expected_ty) in enumerate(zip(args, self._arg_types)):
             self._validate_arg(i, arg, expected_ty)
-        # Delegate to BaseKernel for the actual MLIR func.call emission.
-        # BaseKernel also validates count, which is harmless redundancy but
-        # ensures the check holds even if this override is bypassed.
         super().__call__(*args, **kwargs)
 
     def _validate_arg(self, index: int, arg, expected_ty) -> None:
diff --git a/python/utils/compile/utils.py b/python/utils/compile/utils.py
index a62aa17cfde..113567a3666 100644
--- a/python/utils/compile/utils.py
+++ b/python/utils/compile/utils.py
@@ -124,11 +124,13 @@ def compile_mlir_module(
         args.append("--verbose")
     if options:
         args.extend(options)
-    # Write the MLIR to a file co-located with work_dir so that the C++ aiecc
-    # binary resolves relative link_with paths (e.g. "add_one.o") against the
+    # When work_dir is provided, invoke the aiecc binary as a subprocess so
+    # that it resolves relative link_with paths (e.g. "add_one.o") against the
     # same directory where compile_external_kernel placed the compiled objects.
-    # If no work_dir is provided, fall back to the aiecc.run() helper which
-    # writes to a temporary file internally.
+    # The MLIR file is written to work_dir/aie.mlir; callers (e.g. jit.py)
+    # may have already written it there, in which case the write is idempotent.
+    # If no work_dir is provided, fall back to aiecc.run() which writes to a
+    # temporary file internally.
     if work_dir:
         aiecc_bin = shutil.which("aiecc")
         if not aiecc_bin: