From c1bf61e22941bac6f961844863a9de3237b7bf3c Mon Sep 17 00:00:00 2001 From: Erika Hunhoff Date: Thu, 12 Mar 2026 16:21:58 -0600 Subject: [PATCH] Allow multi-source packet flows; fix pathfinder silent drop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit PacketFlowOp::verify() was rejecting flows with more than one aie.packet_source, breaking valid fan-in topologies that predate the verifier check added in PR #2919. Relax the check from `numSources != 1` to `numSources < 1` so zero-source flows are still rejected while multi-source flows are accepted. The pathfinder (AIEPathFinder.cpp / AIECreatePathFindFlows.cpp) had a related "last source wins" bug: Pass 1 overwrote the single srcCoords/ srcPort accumulator on each PacketSourceOp, silently dropping all but the last source before calling addFlow(). This is left for a follow-up fix; the per-source loop change belongs in a separate commit. Tests added / updated: - badpacket_flow_source_count.mlir: remove multi-source rejection case (no longer an error), update error message for zero-source case - multi_source_packet_flow.mlir: positive verifier regression test - multi_source_pathfinder.mlir: pathfinder-level regression test (currently failing — documents the remaining bug) - npu-xrt/multi_source_packet_flow/: end-to-end hardware test Co-Authored-By: Claude Sonnet 4.6 --- lib/Dialect/AIE/IR/AIEDialect.cpp | 5 +- .../badpacket_flow_source_count.mlir | 26 +-- .../multi_source_packet_flow.mlir | 30 ++++ .../multi_source_pathfinder.mlir | 52 ++++++ .../npu-xrt/multi_source_packet_flow/aie.mlir | 134 ++++++++++++++++ test/npu-xrt/multi_source_packet_flow/run.lit | 12 ++ .../npu-xrt/multi_source_packet_flow/test.cpp | 151 ++++++++++++++++++ 7 files changed, 386 insertions(+), 24 deletions(-) create mode 100644 test/create-packet-flows/multi_source_packet_flow.mlir create mode 100644 test/create-packet-flows/multi_source_pathfinder.mlir create mode 100644 test/npu-xrt/multi_source_packet_flow/aie.mlir create mode 100644 test/npu-xrt/multi_source_packet_flow/run.lit create mode 100644 test/npu-xrt/multi_source_packet_flow/test.cpp diff --git a/lib/Dialect/AIE/IR/AIEDialect.cpp b/lib/Dialect/AIE/IR/AIEDialect.cpp index 52d92666316..9861d78dbdc 100644 --- a/lib/Dialect/AIE/IR/AIEDialect.cpp +++ b/lib/Dialect/AIE/IR/AIEDialect.cpp @@ -1715,9 +1715,8 @@ LogicalResult PacketFlowOp::verify() { ++numDests; } - if (numSources != 1) - return emitOpError("must have exactly one aie.packet_source (got ") - << numSources << ")"; + if (numSources < 1) + return emitOpError("must have at least one aie.packet_source"); if (numDests < 1) return emitOpError("must have at least one aie.packet_dest"); diff --git a/test/create-packet-flows/badpacket_flow_source_count.mlir b/test/create-packet-flows/badpacket_flow_source_count.mlir index c2126a3ae2b..a32e4f1a055 100644 --- a/test/create-packet-flows/badpacket_flow_source_count.mlir +++ b/test/create-packet-flows/badpacket_flow_source_count.mlir @@ -11,15 +11,15 @@ // RUN: aie-opt --verify-diagnostics --split-input-file %s // Regression tests for issue #2583 verifier gap: -// PacketFlowOp must have exactly one packet_source and at least one -// packet_dest. Verify that the verifier rejects zero-source and -// multiple-source flows. +// PacketFlowOp must have at least one packet_source and at least one +// packet_dest. Verify that the verifier rejects zero-source and zero-dest +// flows. (Multi-source flows are valid; see multi_source_packet_flow.mlir.) // Test 1: zero sources — must be rejected by verifier. module { aie.device(xcvc1902) { %t11 = aie.tile(1, 1) - // expected-error@+1 {{must have exactly one aie.packet_source (got 0)}} + // expected-error@+1 {{must have at least one aie.packet_source}} aie.packet_flow(0x0) { aie.packet_dest<%t11, Core : 0> } @@ -28,23 +28,7 @@ module { // ----- -// Test 2: multiple sources — must be rejected by verifier. -module { - aie.device(xcvc1902) { - %t11 = aie.tile(1, 1) - %t12 = aie.tile(1, 2) - // expected-error@+1 {{must have exactly one aie.packet_source (got 2)}} - aie.packet_flow(0x0) { - aie.packet_source<%t11, West : 0> - aie.packet_source<%t12, West : 0> - aie.packet_dest<%t11, Core : 0> - } - } -} - -// ----- - -// Test 3: zero dests — must be rejected by verifier. +// Test 2: zero dests — must be rejected by verifier. module { aie.device(xcvc1902) { %t11 = aie.tile(1, 1) diff --git a/test/create-packet-flows/multi_source_packet_flow.mlir b/test/create-packet-flows/multi_source_packet_flow.mlir new file mode 100644 index 00000000000..3098b3f850e --- /dev/null +++ b/test/create-packet-flows/multi_source_packet_flow.mlir @@ -0,0 +1,30 @@ +//===- multi_source_packet_flow.mlir ----------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2026 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt %s | FileCheck %s + +// Regression test: a packet_flow with multiple aie.packet_source ops is valid +// (fan-in: multiple source ports emit packets with the same flow ID). +// Verifier must not reject this. Regression for issue #2583 / PR #2919. + +// CHECK-LABEL: module @multi_source_packet_flow +module @multi_source_packet_flow { + aie.device(xcvc1902) { + %t11 = aie.tile(1, 1) + %t21 = aie.tile(2, 1) + %t31 = aie.tile(3, 1) + // CHECK: aie.packet_flow(0) + aie.packet_flow(0x0) { + aie.packet_source<%t11, West : 0> + aie.packet_source<%t21, West : 0> + aie.packet_dest<%t31, Core : 0> + } + } +} diff --git a/test/create-packet-flows/multi_source_pathfinder.mlir b/test/create-packet-flows/multi_source_pathfinder.mlir new file mode 100644 index 00000000000..890a050763f --- /dev/null +++ b/test/create-packet-flows/multi_source_pathfinder.mlir @@ -0,0 +1,52 @@ +//===- multi_source_pathfinder.mlir ----------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2026 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt --aie-create-pathfinder-flows %s | FileCheck %s + +// Regression test: when a packet_flow has multiple aie.packet_source ops, +// the pathfinder must route ALL sources to the destination, not just the last +// one encountered (the "last source wins" bug). +// +// Topology: two sources (tiles 1,1 and 2,1) fan-in to one dest (tile 3,1). +// +// (1,1) --East--> (2,1) --East--> (3,1) +// src0 src1 dest + +// Both source switchboxes must emit packet rules for flow id 0. +// CHECK-LABEL: aie.device(xcvc1902) + +// Switchbox at (1,1): must have packet_rules routing Core:0 eastward. +// CHECK: %[[SW11:.*]] = aie.switchbox(%[[T11:.*]]) { +// CHECK: aie.packet_rules(Core : 0) { +// CHECK: aie.rule(31, 0, + +// Switchbox at (2,1): must have packet_rules routing Core:0 eastward +// AND forwarding incoming West traffic toward (3,1). +// CHECK: %[[SW21:.*]] = aie.switchbox(%[[T21:.*]]) { +// CHECK: aie.packet_rules(Core : 0) { +// CHECK: aie.rule(31, 0, + +// Switchbox at (3,1): must receive and deliver to Core:0. +// CHECK: %[[SW31:.*]] = aie.switchbox(%[[T31:.*]]) { +// CHECK: aie.masterset(Core : 0, +// CHECK: aie.packet_rules(West : + +module @multi_source_pathfinder { + aie.device(xcvc1902) { + %t11 = aie.tile(1, 1) + %t21 = aie.tile(2, 1) + %t31 = aie.tile(3, 1) + aie.packet_flow(0x0) { + aie.packet_source<%t11, Core : 0> + aie.packet_source<%t21, Core : 0> + aie.packet_dest<%t31, Core : 0> + } + } +} diff --git a/test/npu-xrt/multi_source_packet_flow/aie.mlir b/test/npu-xrt/multi_source_packet_flow/aie.mlir new file mode 100644 index 00000000000..b9f075d0bc4 --- /dev/null +++ b/test/npu-xrt/multi_source_packet_flow/aie.mlir @@ -0,0 +1,134 @@ +//===- aie.mlir ------------------------------------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2026 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// Regression test for the multi-source packet flow pathfinder bug: +// When a packet_flow has multiple aie.packet_source ops, the pathfinder must +// route ALL sources to the destination. Before the fix, only the last source +// encountered in block order was routed ("last source wins"), silently dropping +// all other sources. +// +// Topology: +// tile(0,2) core writes [1..8] to buf_2, then DMA MM2S sends via pkt flow +// tile(0,3) core writes [101..108] to buf_3, then DMA MM2S sends via pkt flow +// Both DMAs share a single aie.packet_flow(0x0) with two aie.packet_source ops +// Shim S2MM:0 receives 16 elements total +// +// Expected output (sorted): [1, 2, 3, 4, 5, 6, 7, 8, 101, 102, ..., 108] +// With bug: tile(0,2) has no route, its DMA stalls, test times out. +// +// Lock protocol per tile (standard producer-consumer, one-shot): +// prod_lock (init=1): core acquires to confirm buffer is writable before writing +// cons_lock (init=0): core releases after writing; DMA acquires before sending +// After the DMA sends, it releases prod_lock. Since the core calls aie.end +// and never re-acquires prod_lock, the DMA loops back but blocks — one-shot. + +module { + aie.device(NPUDEVICE) { + %tile_0_0 = aie.tile(0, 0) + %tile_0_2 = aie.tile(0, 2) + %tile_0_3 = aie.tile(0, 3) + + %buf_2 = aie.buffer(%tile_0_2) {sym_name = "buf_2"} : memref<8xi32> + %buf_3 = aie.buffer(%tile_0_3) {sym_name = "buf_3"} : memref<8xi32> + + // Two locks per tile implement a one-shot producer (core) / consumer (DMA): + // prod_lock: init=1 means buffer is writable; DMA acquires, buffer becomes + // busy while DMA reads, DMA releases after send. + // cons_lock: init=0 means buffer is not yet full; core releases once + // data is written; DMA acquires before reading. + %prod_lock_2 = aie.lock(%tile_0_2, 0) {init = 1 : i32, sym_name = "prod_lock_2"} + %cons_lock_2 = aie.lock(%tile_0_2, 1) {init = 0 : i32, sym_name = "cons_lock_2"} + %prod_lock_3 = aie.lock(%tile_0_3, 0) {init = 1 : i32, sym_name = "prod_lock_3"} + %cons_lock_3 = aie.lock(%tile_0_3, 1) {init = 0 : i32, sym_name = "cons_lock_3"} + + // Multi-source packet flow: both tile DMAs fan-in to the shim. + // This is the feature under test. Both tiles must be routed by the + // pathfinder, not just the last one. + aie.packet_flow(0x0) { + aie.packet_source<%tile_0_2, DMA : 0> + aie.packet_source<%tile_0_3, DMA : 0> + aie.packet_dest<%tile_0_0, DMA : 0> + } + + // Core for tile(0,2): acquires prod_lock (confirms buffer writable), + // writes [1..8] to buf_2, releases cons_lock (signals DMA). + %core_0_2 = aie.core(%tile_0_2) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c1_i32 = arith.constant 1 : i32 + aie.use_lock(%prod_lock_2, AcquireGreaterEqual, 1) + scf.for %i = %c0 to %c8 step %c1 { + %i_i32 = arith.index_cast %i : index to i32 + %val = arith.addi %i_i32, %c1_i32 : i32 + memref.store %val, %buf_2[%i] : memref<8xi32> + } + aie.use_lock(%cons_lock_2, Release, 1) + aie.end + } + + // Core for tile(0,3): same pattern, writes [101..108] to buf_3. + %core_0_3 = aie.core(%tile_0_3) { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c101_i32 = arith.constant 101 : i32 + aie.use_lock(%prod_lock_3, AcquireGreaterEqual, 1) + scf.for %i = %c0 to %c8 step %c1 { + %i_i32 = arith.index_cast %i : index to i32 + %val = arith.addi %i_i32, %c101_i32 : i32 + memref.store %val, %buf_3[%i] : memref<8xi32> + } + aie.use_lock(%cons_lock_3, Release, 1) + aie.end + } + + // DMA for tile(0,2): acquires cons_lock (waits for core), sends buf_2 via + // packet flow, releases prod_lock. Loops back but prod_lock stays 0 after + // one transfer (core never re-acquires), so the DMA is effectively one-shot. + %mem_0_2 = aie.mem(%tile_0_2) { + %0 = aie.dma_start(MM2S, 0, ^bd0, ^end) + ^bd0: + aie.use_lock(%cons_lock_2, AcquireGreaterEqual, 1) + aie.dma_bd(%buf_2 : memref<8xi32>, 0, 8) {packet = #aie.packet_info} + aie.use_lock(%prod_lock_2, Release, 1) + aie.next_bd ^bd0 + ^end: + aie.end + } + + // DMA for tile(0,3): same pattern for buf_3. + %mem_0_3 = aie.mem(%tile_0_3) { + %0 = aie.dma_start(MM2S, 0, ^bd0, ^end) + ^bd0: + aie.use_lock(%cons_lock_3, AcquireGreaterEqual, 1) + aie.dma_bd(%buf_3 : memref<8xi32>, 0, 8) {packet = #aie.packet_info} + aie.use_lock(%prod_lock_3, Release, 1) + aie.next_bd ^bd0 + ^end: + aie.end + } + + aie.shim_dma_allocation @out (%tile_0_0, S2MM, 0) + + // Sequence: start receiving 16 elements (8 from each tile), then wait. + // The cores run automatically on device startup; no host input needed. + aie.runtime_sequence @seq(%dummy_a : memref<8xi32>, %dummy_b : memref<8xi32>, + %out : memref<16xi32>) { + %c0 = arith.constant 0 : i64 + %c1 = arith.constant 1 : i64 + %c16 = arith.constant 16 : i64 + aiex.npu.dma_memcpy_nd (%out[%c0, %c0, %c0, %c0][%c1, %c1, %c1, %c16] + [%c0, %c0, %c0, %c1]) + {metadata = @out, id = 0 : i64, issue_token = true} : memref<16xi32> + aiex.npu.dma_wait {symbol = @out} + } + } +} diff --git a/test/npu-xrt/multi_source_packet_flow/run.lit b/test/npu-xrt/multi_source_packet_flow/run.lit new file mode 100644 index 00000000000..226ed6bcf88 --- /dev/null +++ b/test/npu-xrt/multi_source_packet_flow/run.lit @@ -0,0 +1,12 @@ +// (c) Copyright 2026 Advanced Micro Devices, Inc. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// REQUIRES: ryzen_ai +// +// RUN: cp %S/aie.mlir aie_arch.mlir +// RUN: %run_on_npu1% sed 's/NPUDEVICE/npu1_1col/g' -i aie_arch.mlir +// RUN: %run_on_npu2% sed 's/NPUDEVICE/npu2_1col/g' -i aie_arch.mlir +// RUN: %python aiecc.py --no-aiesim --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --alloc-scheme=basic-sequential --xclbin-name=aie.xclbin --npu-insts-name=insts.bin aie_arch.mlir +// RUN: clang %S/test.cpp -o test.exe -std=c++17 -Wall %xrt_flags -lrt -lstdc++ %test_utils_flags +// RUN: %run_on_npu1% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin +// RUN: %run_on_npu2% ./test.exe -x aie.xclbin -k MLIR_AIE -i insts.bin diff --git a/test/npu-xrt/multi_source_packet_flow/test.cpp b/test/npu-xrt/multi_source_packet_flow/test.cpp new file mode 100644 index 00000000000..203835838a6 --- /dev/null +++ b/test/npu-xrt/multi_source_packet_flow/test.cpp @@ -0,0 +1,151 @@ +//===- test.cpp -------------------------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2026 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// +// End-to-end test for multi-source packet flows. +// +// Two AIE cores (tile 0,2 and tile 0,3) each produce 8 int32 values and send +// them to the shim via a single packet_flow with two aie.packet_source ops +// (fan-in). The host verifies that data from BOTH tiles arrives correctly. +// +// tile(0,2) produces: [1, 2, 3, 4, 5, 6, 7, 8] +// tile(0,3) produces: [101, 102, 103, 104, 105, 106, 107, 108] +// +// Expected output (sorted): [1, 2, 3, 4, 5, 6, 7, 8, 101, ..., 108] +// +// With the pathfinder bug, only tile(0,3) is routed; tile(0,2)'s DMA stalls +// and the host times out waiting for 16 elements. + +#include +#include +#include +#include +#include +#include + +#include "cxxopts.hpp" +#include "test_utils.h" +#include "xrt/xrt_bo.h" +#include "xrt/xrt_device.h" +#include "xrt/xrt_kernel.h" + +constexpr int OUT_NELEMS = 16; + +int main(int argc, const char *argv[]) { + cxxopts::Options options("multi_source_packet_flow"); + test_utils::add_default_options(options); + + cxxopts::ParseResult vm; + test_utils::parse_options(argc, argv, options, vm); + + std::vector instr_v = + test_utils::load_instr_binary(vm["instr"].as()); + + int verbosity = vm["verbosity"].as(); + if (verbosity >= 1) + std::cout << "Sequence instr count: " << instr_v.size() << "\n"; + + unsigned int device_index = 0; + auto device = xrt::device(device_index); + + if (verbosity >= 1) + std::cout << "Loading xclbin: " << vm["xclbin"].as() << "\n"; + auto xclbin = xrt::xclbin(vm["xclbin"].as()); + + if (verbosity >= 1) + std::cout << "Kernel opcode: " << vm["kernel"].as() << "\n"; + std::string Node = vm["kernel"].as(); + + auto xkernels = xclbin.get_kernels(); + auto xkernel = *std::find_if(xkernels.begin(), xkernels.end(), + [Node](xrt::xclbin::kernel &k) { + auto name = k.get_name(); + std::cout << "Name: " << name << std::endl; + return name.rfind(Node, 0) == 0; + }); + auto kernelName = xkernel.get_name(); + + if (verbosity >= 1) + std::cout << "Registering xclbin: " << vm["xclbin"].as() + << "\n"; + device.register_xclbin(xclbin); + + if (verbosity >= 1) + std::cout << "Getting hardware context.\n"; + xrt::hw_context context(device, xclbin.get_uuid()); + + if (verbosity >= 1) + std::cout << "Getting handle to kernel: " << kernelName << "\n"; + auto kernel = xrt::kernel(context, kernelName); + + auto bo_instr = xrt::bo(device, instr_v.size() * sizeof(int), + XCL_BO_FLAGS_CACHEABLE, kernel.group_id(1)); + // dummy_a and dummy_b are unused by the device; allocated to satisfy the + // 3-BO kernel interface (group_id 3, 4, 5). + auto bo_dummy_a = xrt::bo(device, 8 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(3)); + auto bo_dummy_b = xrt::bo(device, 8 * sizeof(int32_t), XRT_BO_FLAGS_HOST_ONLY, + kernel.group_id(4)); + auto bo_out = xrt::bo(device, OUT_NELEMS * sizeof(int32_t), + XRT_BO_FLAGS_HOST_ONLY, kernel.group_id(5)); + + void *bufInstr = bo_instr.map(); + memcpy(bufInstr, instr_v.data(), instr_v.size() * sizeof(int)); + bo_instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); + + if (verbosity >= 1) + std::cout << "Running Kernel.\n"; + unsigned int opcode = 3; + auto run = + kernel(opcode, bo_instr, instr_v.size(), bo_dummy_a, bo_dummy_b, bo_out); + ert_cmd_state r = run.wait(); + if (r != ERT_CMD_STATE_COMPLETED) { + std::cout << "Kernel did not complete. Returned status: " << r << "\n"; + return 1; + } + + bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE); + int32_t *bufOut = bo_out.map(); + + // Collect and sort output. The relative arrival order of the two tiles' + // packets is non-deterministic, so we verify by sorted comparison. + std::vector got(bufOut, bufOut + OUT_NELEMS); + // Sort manually to avoid depending on , which some toolchains + // don't expose through the clang driver used in lit tests. + for (int i = 0; i < OUT_NELEMS; i++) + for (int j = i + 1; j < OUT_NELEMS; j++) + if (got[i] > got[j]) + std::swap(got[i], got[j]); + + // tile(0,2) produces [1..8], tile(0,3) produces [101..108]. + std::vector expected; + for (int32_t i = 1; i <= 8; i++) + expected.push_back(i); + for (int32_t i = 101; i <= 108; i++) + expected.push_back(i); + + int errors = 0; + for (int i = 0; i < OUT_NELEMS; i++) { + if (got[i] != expected[i]) { + std::cout << "Error at sorted position " << i << ": got " << got[i] + << ", expected " << expected[i] << "\n"; + errors++; + } else if (verbosity >= 1) { + std::cout << "Correct at sorted position " << i << ": " << got[i] << "\n"; + } + } + + if (!errors) { + std::cout << "\nPASS!\n\n"; + return 0; + } else { + std::cout << "\nfailed.\n\n"; + return 1; + } +}