From e614691ee852b22da0b669b1295a6ac211236fde Mon Sep 17 00:00:00 2001 From: yenjames Date: Wed, 11 Mar 2026 11:43:36 -0600 Subject: [PATCH 1/3] Add trace flow infrastructure from Jeff's fork. --- .../aie/Dialect/AIE/Transforms/AIEPasses.h | 2 + .../aie/Dialect/AIE/Transforms/AIEPasses.td | 44 ++ .../AIE/Transforms/AIEInsertTraceFlows.cpp | 421 ++++++++++++++++++ .../AIE/Transforms/AIETraceToConfig.cpp | 34 +- lib/Dialect/AIE/Transforms/CMakeLists.txt | 1 + .../basic/event_trace/Makefile | 18 + .../test_insert_trace_flows_multiple.mlir | 55 +++ .../trace/test_insert_trace_flows_simple.mlir | 33 ++ 8 files changed, 606 insertions(+), 2 deletions(-) create mode 100644 lib/Dialect/AIE/Transforms/AIEInsertTraceFlows.cpp create mode 100644 test/dialect/AIE/trace/test_insert_trace_flows_multiple.mlir create mode 100644 test/dialect/AIE/trace/test_insert_trace_flows_simple.mlir diff --git a/include/aie/Dialect/AIE/Transforms/AIEPasses.h b/include/aie/Dialect/AIE/Transforms/AIEPasses.h index 60e56cbfbff..211a029b880 100644 --- a/include/aie/Dialect/AIE/Transforms/AIEPasses.h +++ b/include/aie/Dialect/AIE/Transforms/AIEPasses.h @@ -68,6 +68,8 @@ std::unique_ptr> createAIEAssignTileCtrlIDsPass(); std::unique_ptr> createAIETraceToConfigPass(); std::unique_ptr> createAIETraceRegPackWritesPass(); +std::unique_ptr> +createAIEInsertTraceFlowsPass(); /// Generate the code for registering passes. #define GEN_PASS_REGISTRATION diff --git a/include/aie/Dialect/AIE/Transforms/AIEPasses.td b/include/aie/Dialect/AIE/Transforms/AIEPasses.td index 29e8a0d3b59..f91b42702cb 100644 --- a/include/aie/Dialect/AIE/Transforms/AIEPasses.td +++ b/include/aie/Dialect/AIE/Transforms/AIEPasses.td @@ -424,4 +424,48 @@ def AIETraceRegPackWrites : Pass<"aie-trace-pack-reg-writes", "DeviceOp"> { ]; } +def AIEInsertTraceFlows : Pass<"aie-insert-trace-flows", "DeviceOp"> { + let summary = "Insert packet flows and runtime sequence trace setup"; + let description = [{ + For each aie.trace operation, this pass: + - Creates ONE packet flow from trace port to shim DMA + - Groups traces by target shim (minimizes shim usage, ideally 1) + - Inserts ONE shim buffer descriptor per shim tile for all traces + - Inserts per-tile timer control register writes + - Inserts per-shim broadcast and DMA control setup + + All trace configuration is injected at the beginning of the runtime + sequence, before user data transfer operations. + + Multiple trace streams (from different tiles or different trace units + on the same tile) are routed to the same shim DMA channel and drained + by a single buffer descriptor. + }]; + + let constructor = "xilinx::AIE::createAIEInsertTraceFlowsPass()"; + + let dependentDialects = [ + "xilinx::AIE::AIEDialect", + ]; + + let options = [ + Option<"shimChannel", "shim-channel", "int", "1", + "S2MM DMA channel to use for trace (default: 1)">, + Option<"defaultBdId", "default-bd-id", "int", "15", + "Buffer descriptor ID for trace (default: 15)">, + Option<"packetIdStart", "packet-id-start", "int", "1", + "Starting packet ID for trace flows (default: 1)">, + Option<"traceBufferSize", "trace-buffer-size", "int", "1048576", + "Default trace buffer size in bytes (default: 1MB)">, + Option<"traceBurstLength", "burst-length", "int", "64", + "DMA burst length for trace transfers (default: 64 bytes)">, + Option<"traceArgIdx", "trace-arg-idx", "int", "4", + "Runtime sequence argument index for trace buffer (default: 4)">, + Option<"minimizeShims", "minimize-shims", "bool", "true", + "Minimize number of shim tiles used (prefer one shim for all traces)">, + Option<"preferSameColumn", "prefer-same-column", "bool", "true", + "When choosing shim, prefer same column as trace sources"> + ]; +} + #endif diff --git a/lib/Dialect/AIE/Transforms/AIEInsertTraceFlows.cpp b/lib/Dialect/AIE/Transforms/AIEInsertTraceFlows.cpp new file mode 100644 index 00000000000..a5d02dc4a47 --- /dev/null +++ b/lib/Dialect/AIE/Transforms/AIEInsertTraceFlows.cpp @@ -0,0 +1,421 @@ +//===- AIEInsertTraceFlows.cpp ----------------------------------*- C++ -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Copyright (C) 2025, Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// +// Pass to insert packet flows and runtime sequence trace setup +//===----------------------------------------------------------------------===// + +#include "aie/Dialect/AIE/IR/AIEDialect.h" +#include "aie/Dialect/AIE/Transforms/AIEPasses.h" +#include "aie/Dialect/AIEX/IR/AIEXDialect.h" + +#include "mlir/IR/Attributes.h" +#include "mlir/Pass/Pass.h" + +#include +#include + +using namespace mlir; +using namespace xilinx; +using namespace xilinx::AIE; + +namespace { + +struct TraceInfo { + TraceOp traceOp; + TileOp tile; + int packetId; + TracePacketType packetType; + WireBundle tracePort; // Trace:0 (core) or Trace:1 (mem) + int traceChannel; // Port number (0 for core, 1 for mem) +}; + +struct ShimInfo { + TileOp shimTile; + int channel; // S2MM channel + int bdId; // Buffer descriptor ID + int argIdx; // Runtime sequence argument index + std::vector traceSources; // All traces routed to this shim +}; + +} // namespace + +struct AIEInsertTraceFlowsPass + : AIEInsertTraceFlowsBase { + + void runOnOperation() override { + DeviceOp device = getOperation(); + OpBuilder builder(device); + const auto &targetModel = device.getTargetModel(); + + // Phase 1: Collect all trace operations + SmallVector traces; + device.walk([&](TraceOp trace) { traces.push_back(trace); }); + + if (traces.empty()) + return; + + // Phase 2: Analyze traces and allocate resources + std::vector traceInfos; + std::map usedPacketIds; // col -> next available packet ID + int nextPacketId = packetIdStart; + + for (auto trace : traces) { + auto tile = cast(trace.getTile().getDefiningOp()); + int col = tile.getCol(); + + // Find packet ID and type from trace body + std::optional packetId; + TracePacketType packetType = TracePacketType::Core; // default + for (auto &op : trace.getBody().getOps()) { + if (auto packetOp = dyn_cast(op)) { + packetId = packetOp.getId(); + packetType = packetOp.getType(); + break; + } + } + + // Allocate packet ID if not specified + if (!packetId) { + if (usedPacketIds.find(col) == usedPacketIds.end()) { + usedPacketIds[col] = nextPacketId; + } + packetId = usedPacketIds[col]++; + } + + // Determine trace port based on packet type + WireBundle tracePort = WireBundle::Trace; + int traceChannel = 0; + if (packetType == TracePacketType::Mem) { + traceChannel = 1; // Mem trace uses port 1 + } + + TraceInfo info; + info.traceOp = trace; + info.tile = tile; + info.packetId = *packetId; + info.packetType = packetType; + info.tracePort = tracePort; + info.traceChannel = traceChannel; + traceInfos.push_back(info); + } + + // Phase 2b: Select shim tiles (minimize usage) + std::map shimInfos; // col -> ShimInfo + + if (minimizeShims && preferSameColumn) { + // Strategy: Group all traces by column, use one shim per column + std::map> tracesByCol; + for (auto &info : traceInfos) { + int col = info.tile.getCol(); + tracesByCol[col].push_back(info); + } + + // For each column with traces, allocate a shim + for (auto &[col, colTraces] : tracesByCol) { + // Find shim tile for this column + TileOp shimTile = nullptr; + for (auto tile : device.getOps()) { + if (tile.getCol() == col && tile.getRow() == 0) { + shimTile = tile; + break; + } + } + + if (!shimTile) { + // Create shim tile if it doesn't exist + builder.setInsertionPointToStart(&device.getRegion().front()); + shimTile = builder.create(device.getLoc(), col, 0); + } + + ShimInfo shimInfo; + shimInfo.shimTile = shimTile; + shimInfo.channel = shimChannel; + shimInfo.bdId = defaultBdId; + shimInfo.argIdx = traceArgIdx; + shimInfo.traceSources = colTraces; + shimInfos[col] = shimInfo; + } + } else { + // Fallback: Use one shim for all traces (column 0) + int targetCol = 0; + TileOp shimTile = nullptr; + for (auto tile : device.getOps()) { + if (tile.getCol() == targetCol && tile.getRow() == 0) { + shimTile = tile; + break; + } + } + + if (!shimTile) { + builder.setInsertionPointToStart(&device.getRegion().front()); + shimTile = builder.create(device.getLoc(), targetCol, 0); + } + + ShimInfo shimInfo; + shimInfo.shimTile = shimTile; + shimInfo.channel = shimChannel; + shimInfo.bdId = defaultBdId; + shimInfo.argIdx = traceArgIdx; + shimInfo.traceSources = traceInfos; + shimInfos[targetCol] = shimInfo; + } + + // Phase 3: Insert packet flows + // Insert before the device terminator + Block &deviceBlock = device.getRegion().front(); + builder.setInsertionPoint(deviceBlock.getTerminator()); + + for (auto &info : traceInfos) { + // Find target shim for this trace + int col = info.tile.getCol(); + ShimInfo &shimInfo = shimInfos[col]; + + // Create packet flow + auto packetFlowOp = builder.create( + device.getLoc(), builder.getI8IntegerAttr(info.packetId), nullptr, + nullptr); + + Block *flowBody = new Block(); + packetFlowOp.getPorts().push_back(flowBody); + OpBuilder flowBuilder = OpBuilder::atBlockEnd(flowBody); + + // Add source + flowBuilder.create(device.getLoc(), + Value(info.tile.getResult()), + info.tracePort, info.traceChannel); + + // Add destination + flowBuilder.create(device.getLoc(), + Value(shimInfo.shimTile.getResult()), + WireBundle::DMA, shimInfo.channel); + + // Add terminator + flowBuilder.create(device.getLoc()); + + // Add keep_pkt_header attribute + packetFlowOp->setAttr("keep_pkt_header", builder.getBoolAttr(true)); + } + + // Phase 4: Insert runtime sequence operations + // Find runtime sequence + RuntimeSequenceOp runtimeSeq = nullptr; + device.walk([&](RuntimeSequenceOp seq) { + if (!runtimeSeq) + runtimeSeq = seq; + return WalkResult::advance(); + }); + + if (!runtimeSeq) { + // No runtime sequence found, nothing to insert + return; + } + + // Insert trace infrastructure at the beginning of runtime sequence + // NOTE: trace.start_config insertion is NOT done here. + // The source MLIR should already contain aie.trace.start_config ops, + // and --aie-inline-trace-config will expand them to register writes. + Block &seqBlock = runtimeSeq.getBody().front(); + builder.setInsertionPointToStart(&seqBlock); + + // 4b. Insert per-tile timer controls + std::set> processedTiles; // (col, row) + for (auto &info : traceInfos) { + int col = info.tile.getCol(); + int row = info.tile.getRow(); + + if (processedTiles.find({col, row}) != processedTiles.end()) + continue; + processedTiles.insert({col, row}); + + // Compute timer control address + uint32_t timerCtrlAddr = computeTimerCtrlAddress( + info.tile, targetModel, info.packetType == TracePacketType::Mem); + + // Timer control value: BROADCAST_15 event (122 << 8 = 31232) + uint32_t timerCtrlValue = 31232; // Event 122 (BROADCAST_15) << 8 + + builder.create( + runtimeSeq.getLoc(), timerCtrlAddr, timerCtrlValue, nullptr, + builder.getI32IntegerAttr(col), builder.getI32IntegerAttr(row)); + } + + // 4c-4f. Insert per-shim configurations + for (auto &[col, shimInfo] : shimInfos) { + int shimCol = shimInfo.shimTile.getCol(); + + // 4c. Write buffer descriptor + builder.create( + runtimeSeq.getLoc(), + shimCol, // column + shimInfo.bdId, // bd_id + traceBufferSize, // buffer_length + 0, // buffer_offset + 1, // enable_packet + 0, // out_of_order_id + 0, // packet_id (not used for reception) + 0, // packet_type (not used for reception) + 0, 0, 0, 0, 0, + 0, // d0_size, d0_stride, d1_size, d1_stride, d2_size, d2_stride + 0, 0, 0, // iteration_current, iteration_size, iteration_stride + 0, // next_bd + 0, // row + 0, // use_next_bd + 1, // valid_bd + 0, 0, 0, 0, 0, // lock_rel_val, lock_rel_id, lock_acq_enable, + // lock_acq_val, lock_acq_id + 0, 0, 0, 0, 0, 0, // d0_zero_before, d1_zero_before, d2_zero_before, + // d0_zero_after, d1_zero_after, d2_zero_after + traceBurstLength // burst_length + ); + + // 4d. Address patch + uint32_t bdAddress = computeBDAddress(shimCol, shimInfo.bdId, + shimInfo.shimTile, targetModel); + builder.create( + runtimeSeq.getLoc(), bdAddress, shimInfo.argIdx, 0); + + // 4e. DMA channel configuration + uint32_t ctrlAddr = + computeCtrlAddress(DMAChannelDir::S2MM, shimInfo.channel, + shimInfo.shimTile, targetModel); + builder.create( + runtimeSeq.getLoc(), ctrlAddr, 3840, 7936, // value, mask + nullptr, builder.getI32IntegerAttr(shimCol), + builder.getI32IntegerAttr(0)); + + // Push BD to task queue + uint32_t taskQueueAddr = + computeTaskQueueAddress(DMAChannelDir::S2MM, shimInfo.channel, + shimInfo.shimTile, targetModel); + uint32_t bdIdWithToken = (1U << 31) | shimInfo.bdId; // enable_token = 1 + builder.create( + runtimeSeq.getLoc(), taskQueueAddr, bdIdWithToken, nullptr, + builder.getI32IntegerAttr(shimCol), builder.getI32IntegerAttr(0)); + + // 4f. Shim timer and broadcast control + // Shim timer control (USER_EVENT_1 = 127 << 8 = 32512) + uint32_t shimTimerCtrlAddr = + computeTimerCtrlAddress(shimInfo.shimTile, targetModel, false); + builder.create( + runtimeSeq.getLoc(), shimTimerCtrlAddr, 32512, nullptr, + builder.getI32IntegerAttr(shimCol), builder.getI32IntegerAttr(0)); + + // Trigger broadcast (Event_Broadcast15_A) + const RegisterInfo *broadcast15Reg = targetModel.lookupRegister( + "Event_Broadcast15_A", shimInfo.shimTile.getTileID()); + if (!broadcast15Reg) + llvm::report_fatal_error( + "Failed to lookup Event_Broadcast15_A register"); + builder.create( + runtimeSeq.getLoc(), broadcast15Reg->offset, 127, nullptr, + builder.getI32IntegerAttr(shimCol), builder.getI32IntegerAttr(0)); + + // Generate USER_EVENT_1 + const RegisterInfo *eventGenReg = targetModel.lookupRegister( + "Event_Generate", shimInfo.shimTile.getTileID()); + if (!eventGenReg) + llvm::report_fatal_error("Failed to lookup Event_Generate register"); + builder.create( + runtimeSeq.getLoc(), eventGenReg->offset, 127, nullptr, + builder.getI32IntegerAttr(shimCol), builder.getI32IntegerAttr(0)); + } + + // Phase 4g: Insert trace stop/flush at the END of runtime sequence + // Trace stop must happen AFTER all data DMA tasks complete so that + // the trace captures events during the entire kernel execution. + builder.setInsertionPointToEnd(&seqBlock); + + for (auto &[col, shimInfo] : shimInfos) { + int shimCol = shimInfo.shimTile.getCol(); + + // Configure broadcast 14 to forward USER_EVENT_0 + const RegisterInfo *broadcast14Reg = targetModel.lookupRegister( + "Event_Broadcast14_A", shimInfo.shimTile.getTileID()); + if (!broadcast14Reg) + llvm::report_fatal_error( + "Failed to lookup Event_Broadcast14_A register"); + builder.create( + runtimeSeq.getLoc(), broadcast14Reg->offset, 126, nullptr, + builder.getI32IntegerAttr(shimCol), builder.getI32IntegerAttr(0)); + + // Generate USER_EVENT_0 to trigger broadcast 14 (trace stop event) + const RegisterInfo *stopEventGenReg = targetModel.lookupRegister( + "Event_Generate", shimInfo.shimTile.getTileID()); + if (!stopEventGenReg) + llvm::report_fatal_error("Failed to lookup Event_Generate register"); + builder.create( + runtimeSeq.getLoc(), stopEventGenReg->offset, 126, nullptr, + builder.getI32IntegerAttr(shimCol), builder.getI32IntegerAttr(0)); + } + } + +private: + // Compute buffer descriptor base address for the buffer address field + uint32_t computeBDAddress(int col, int bdId, TileOp shimTile, + const AIETargetModel &tm) { + // Use register database to lookup BD0 address, then add stride * bdId + // The buffer address field is at offset +4 within each BD descriptor + const RegisterInfo *bdReg = + tm.lookupRegister("DMA_BD0_0", shimTile.getTileID()); + if (!bdReg) + llvm::report_fatal_error("Failed to lookup DMA_BD0_0 register"); + const uint32_t BD_STRIDE = 0x20; + const uint32_t BUFFER_ADDR_OFFSET = 4; // buffer address is 2nd word in BD + return (col << tm.getColumnShift()) | + (bdReg->offset + bdId * BD_STRIDE + BUFFER_ADDR_OFFSET); + } + + // Compute DMA task queue address + uint32_t computeTaskQueueAddress(DMAChannelDir dir, int channel, + TileOp shimTile, const AIETargetModel &tm) { + std::string regName; + if (dir == DMAChannelDir::S2MM) { + regName = + (channel == 0) ? "DMA_S2MM_0_Task_Queue" : "DMA_S2MM_1_Task_Queue"; + } else { // MM2S + regName = + (channel == 0) ? "DMA_MM2S_0_Task_Queue" : "DMA_MM2S_1_Task_Queue"; + } + const RegisterInfo *reg = tm.lookupRegister(regName, shimTile.getTileID()); + if (!reg) + llvm::report_fatal_error(llvm::Twine("Failed to lookup ") + regName); + return reg->offset; + } + + // Compute DMA control register address + uint32_t computeCtrlAddress(DMAChannelDir dir, int channel, TileOp shimTile, + const AIETargetModel &tm) { + std::string regName; + if (dir == DMAChannelDir::S2MM) { + regName = (channel == 0) ? "DMA_S2MM_0_Ctrl" : "DMA_S2MM_1_Ctrl"; + } else { // MM2S + regName = (channel == 0) ? "DMA_MM2S_0_Ctrl" : "DMA_MM2S_1_Ctrl"; + } + const RegisterInfo *reg = tm.lookupRegister(regName, shimTile.getTileID()); + if (!reg) + llvm::report_fatal_error(llvm::Twine("Failed to lookup ") + regName); + return reg->offset; + } + + // Compute timer control address based on tile type + uint32_t computeTimerCtrlAddress(TileOp tile, const AIETargetModel &tm, + bool isMemTrace) { + // Use register database to lookup Timer_Control for the appropriate module + const RegisterInfo *reg = + tm.lookupRegister("Timer_Control", tile.getTileID(), isMemTrace); + if (!reg) + llvm::report_fatal_error("Failed to lookup Timer_Control register"); + return reg->offset; + } +}; + +std::unique_ptr> +xilinx::AIE::createAIEInsertTraceFlowsPass() { + return std::make_unique(); +} diff --git a/lib/Dialect/AIE/Transforms/AIETraceToConfig.cpp b/lib/Dialect/AIE/Transforms/AIETraceToConfig.cpp index 8a83333a780..7b778d08ea5 100644 --- a/lib/Dialect/AIE/Transforms/AIETraceToConfig.cpp +++ b/lib/Dialect/AIE/Transforms/AIETraceToConfig.cpp @@ -218,7 +218,22 @@ struct AIETraceToConfigPass if (auto startOp = dyn_cast(op)) { uint32_t startEvent = 0; if (startOp.getBroadcast()) { - startEvent = *startOp.getBroadcast(); + uint32_t broadcastNum = *startOp.getBroadcast(); + // Resolve broadcast channel to hardware event ID + std::string eventName; + if (targetModel.isShimNOCTile(tileID.col, tileID.row) || + targetModel.isShimPLTile(tileID.col, tileID.row)) { + eventName = "BROADCAST_A_" + std::to_string(broadcastNum); + } else { + eventName = "BROADCAST_" + std::to_string(broadcastNum); + } + auto eventNum = targetModel.lookupEvent(eventName, tileID, isMem); + if (eventNum) { + startEvent = *eventNum; + } else { + startOp.emitError("unknown broadcast event '") << eventName << "'"; + return signalPassFailure(); + } } else if (auto eventAttr = startOp.getEvent()) { // Use getEventName() helper and check for enum std::string eventName = eventAttr->getEventName(); @@ -248,7 +263,22 @@ struct AIETraceToConfigPass if (auto stopOp = dyn_cast(op)) { uint32_t stopEvent = 0; if (stopOp.getBroadcast()) { - stopEvent = *stopOp.getBroadcast(); + uint32_t broadcastNum = *stopOp.getBroadcast(); + // Resolve broadcast channel to hardware event ID + std::string eventName; + if (targetModel.isShimNOCTile(tileID.col, tileID.row) || + targetModel.isShimPLTile(tileID.col, tileID.row)) { + eventName = "BROADCAST_A_" + std::to_string(broadcastNum); + } else { + eventName = "BROADCAST_" + std::to_string(broadcastNum); + } + auto eventNum = targetModel.lookupEvent(eventName, tileID, isMem); + if (eventNum) { + stopEvent = *eventNum; + } else { + stopOp.emitError("unknown broadcast event '") << eventName << "'"; + return signalPassFailure(); + } } else if (auto eventAttr = stopOp.getEvent()) { // Use getEventName() helper and check for enum std::string eventName = eventAttr->getEventName(); diff --git a/lib/Dialect/AIE/Transforms/CMakeLists.txt b/lib/Dialect/AIE/Transforms/CMakeLists.txt index cfb2c973d9f..f60ce9a31a9 100644 --- a/lib/Dialect/AIE/Transforms/CMakeLists.txt +++ b/lib/Dialect/AIE/Transforms/CMakeLists.txt @@ -27,6 +27,7 @@ add_mlir_dialect_library( AIELowerCascadeFlows.cpp AIEGenerateColumnControlOverlay.cpp AIETraceToConfig.cpp + AIEInsertTraceFlows.cpp ADDITIONAL_HEADER_DIRS ${AIE_BINARY_DIR}/include diff --git a/programming_examples/basic/event_trace/Makefile b/programming_examples/basic/event_trace/Makefile index 174c7f1f91e..fc24e8c792b 100644 --- a/programming_examples/basic/event_trace/Makefile +++ b/programming_examples/basic/event_trace/Makefile @@ -39,6 +39,16 @@ else echo "Device type not supported" endif +# Preprocess the MLIR: substitute NPUDEVICE with the target device string +build/aie_trace.mlir: ${srcdir}/aie_trace.mlir + mkdir -p ${@D} + sed 's/NPUDEVICE/${devicename}_1col/g' $< > $@ + +# Generate MLIR from Python +build/aie_trace_from_py.mlir: ${srcdir}/aie_trace.py + mkdir -p ${@D} + python3 $< > $@ + # Preprocess the MLIR: substitute NPUDEVICE with the target device string build/aie_trace.mlir: ${srcdir}/aie_trace.mlir mkdir -p ${@D} @@ -81,7 +91,11 @@ endif run_trace: ${targetname}.exe build/final.xclbin build/insts.bin @echo "Running with declarative trace syntax..." ${powershell} ./$< -x build/final.xclbin -i build/insts.bin -k MLIR_AIE -t ${trace_size} +<<<<<<< HEAD ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir.prj/main_physical_with_elfs.mlir --output trace.json +======= + ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir.prj/input_physical_with_elfs.mlir --output trace.json +>>>>>>> fifield/insert-trace-flows ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace.json @echo "" @@ -93,7 +107,11 @@ run_trace: ${targetname}.exe build/final.xclbin build/insts.bin run_trace_py: build/final.xclbin build/insts.bin @echo "Running Python test with declarative trace syntax..." python3 ${srcdir}/test.py --xclbin build/final.xclbin --instr build/insts.bin --kernel MLIR_AIE --verbosity 1 --trace-sz ${trace_size} --trace-file trace.txt +<<<<<<< HEAD ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir.prj/main_physical_with_elfs.mlir --output trace.json +======= + ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir.prj/input_physical_with_elfs.mlir --output trace.json +>>>>>>> fifield/insert-trace-flows ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace.json @echo "" @echo "Generating trace visualization..." diff --git a/test/dialect/AIE/trace/test_insert_trace_flows_multiple.mlir b/test/dialect/AIE/trace/test_insert_trace_flows_multiple.mlir new file mode 100644 index 00000000000..6c7ab00f4b9 --- /dev/null +++ b/test/dialect/AIE/trace/test_insert_trace_flows_multiple.mlir @@ -0,0 +1,55 @@ +//===- test_insert_trace_flows_multiple.mlir ------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt %s -aie-insert-trace-flows | FileCheck %s + +// Test multiple traces from different tiles routed to one shim + +// CHECK-LABEL: module { +module { + aie.device(npu1_1col) { + %tile02 = aie.tile(0, 2) + %tile03 = aie.tile(0, 3) + %tile00 = aie.tile(0, 0) + + // Core trace on tile (0,2) + aie.trace @core_trace_02(%tile02) { + aie.trace.packet id=1 type="core" + aie.trace.event<"INSTR_VECTOR"> + aie.trace.start broadcast=15 + } + + // Mem trace on tile (0,2) + aie.trace @mem_trace_02(%tile02) { + aie.trace.packet id=2 type="mem" + aie.trace.event<"DMA_S2MM_0_START_TASK"> + aie.trace.start broadcast=15 + } + + // Core trace on tile (0,3) + aie.trace @core_trace_03(%tile03) { + aie.trace.packet id=3 type="core" + aie.trace.event<"LOCK_STALL"> + aie.trace.start broadcast=15 + } + + // CHECK: aie.packet_flow(1) + // CHECK: aie.packet_source<%tile_0_2, Trace : 0> + // CHECK: aie.packet_dest<%{{.*}}, DMA : 1> + + // CHECK: aie.packet_flow(2) + // CHECK: aie.packet_source<%tile_0_2, Trace : 1> + // CHECK: aie.packet_dest<%{{.*}}, DMA : 1> + + // CHECK: aie.packet_flow(3) + // CHECK: aie.packet_source<%tile_0_3, Trace : 0> + // CHECK: aie.packet_dest<%{{.*}}, DMA : 1> + } +} diff --git a/test/dialect/AIE/trace/test_insert_trace_flows_simple.mlir b/test/dialect/AIE/trace/test_insert_trace_flows_simple.mlir new file mode 100644 index 00000000000..e4414e28111 --- /dev/null +++ b/test/dialect/AIE/trace/test_insert_trace_flows_simple.mlir @@ -0,0 +1,33 @@ +//===- test_insert_trace_flows_simple.mlir --------------------*- MLIR -*-===// +// +// This file is licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// (c) Copyright 2025 Advanced Micro Devices, Inc. +// +//===----------------------------------------------------------------------===// + +// RUN: aie-opt %s -aie-insert-trace-flows | FileCheck %s + +// CHECK-LABEL: module { +module { + aie.device(npu1_1col) { + %tile02 = aie.tile(0, 2) + %tile00 = aie.tile(0, 0) + + // CHECK: aie.trace @core_trace + aie.trace @core_trace(%tile02) { + aie.trace.mode "Event-Time" + aie.trace.packet id=1 type="core" + aie.trace.event<"INSTR_EVENT_0"> + aie.trace.start broadcast=15 + aie.trace.stop broadcast=14 + } + + // CHECK: aie.packet_flow(1) + // CHECK: aie.packet_source<%tile_0_2, Trace : 0> + // CHECK: aie.packet_dest<%{{.*}}, DMA : 1> + // CHECK: keep_pkt_header = true + } +} From c9b1ad339f02d592f79ad7320051247f393d4164 Mon Sep 17 00:00:00 2001 From: yenjames Date: Thu, 12 Mar 2026 14:50:32 -0600 Subject: [PATCH 2/3] Updated trace examples to use the declarative `aie.trace` ops that are lowered through trace passes. - Added auto-detection of packet type based on tile when not defined in `aie.trace`. - Fixed field mode emission. Only core trace control registers have a mode field. - Updated expected broadcast event values to actual hardware event codes - Update manual `aiex.npu.write32` trace configuration in test/npu-xrt/vec_mul_event_trace/aie.mlir and programming_examples/basic/event_trace/aie_trace.mlir to declarative `aie.trace` ops - Updated examples to feature trace config for all 4 options: coretile, core_mem, memtile, shimtile. - Register -aie-insert-trace-flows to aiecc. - TODO: Future PR to update python to use `aie.trace` bindings in python/utils/trace. --- .../AIE/Transforms/AIEInsertTraceFlows.cpp | 52 +++- .../AIE/Transforms/AIETraceToConfig.cpp | 9 +- .../basic/event_trace/Makefile | 8 - .../basic/event_trace/README.md | 14 +- .../basic/event_trace/aie_trace.mlir | 184 ++++++------- .../basic/event_trace/aie_trace.py | 156 ++++------- .../AIE/combo_edge/test_combo_edge_full.mlir | 4 +- .../AIE/trace/test_trace_port_to_config.mlir | 4 +- .../AIE/trace/test_trace_to_config.mlir | 8 +- test/npu-xrt/vec_mul_event_trace/aie.mlir | 252 ++++++++---------- test/npu-xrt/vec_mul_event_trace/test.py | 79 ++++-- .../vec_mul_event_trace/vector_scalar_mul.cc | 2 +- tools/aiecc/aiecc.cpp | 1 + 13 files changed, 368 insertions(+), 405 deletions(-) diff --git a/lib/Dialect/AIE/Transforms/AIEInsertTraceFlows.cpp b/lib/Dialect/AIE/Transforms/AIEInsertTraceFlows.cpp index a5d02dc4a47..1a0f9851d3c 100644 --- a/lib/Dialect/AIE/Transforms/AIEInsertTraceFlows.cpp +++ b/lib/Dialect/AIE/Transforms/AIEInsertTraceFlows.cpp @@ -20,6 +20,11 @@ #include #include +namespace xilinx::AIE { +#define GEN_PASS_DEF_AIEINSERTTRACEFLOWS +#include "aie/Dialect/AIE/Transforms/AIEPasses.h.inc" +} // namespace xilinx::AIE + using namespace mlir; using namespace xilinx; using namespace xilinx::AIE; @@ -43,10 +48,8 @@ struct ShimInfo { std::vector traceSources; // All traces routed to this shim }; -} // namespace - struct AIEInsertTraceFlowsPass - : AIEInsertTraceFlowsBase { + : xilinx::AIE::impl::AIEInsertTraceFlowsBase { void runOnOperation() override { DeviceOp device = getOperation(); @@ -71,7 +74,7 @@ struct AIEInsertTraceFlowsPass // Find packet ID and type from trace body std::optional packetId; - TracePacketType packetType = TracePacketType::Core; // default + std::optional packetType; for (auto &op : trace.getBody().getOps()) { if (auto packetOp = dyn_cast(op)) { packetId = packetOp.getId(); @@ -80,6 +83,18 @@ struct AIEInsertTraceFlowsPass } } + // Determine packet type from tile type if not specified + if (!packetType) { + if (tile.isShimTile()) { + packetType = TracePacketType::ShimTile; + } else if (tile.isMemTile()) { + packetType = TracePacketType::MemTile; + } else { + // Core tile defaults to core type + packetType = TracePacketType::Core; + } + } + // Allocate packet ID if not specified if (!packetId) { if (usedPacketIds.find(col) == usedPacketIds.end()) { @@ -91,7 +106,7 @@ struct AIEInsertTraceFlowsPass // Determine trace port based on packet type WireBundle tracePort = WireBundle::Trace; int traceChannel = 0; - if (packetType == TracePacketType::Mem) { + if (*packetType == TracePacketType::Mem) { traceChannel = 1; // Mem trace uses port 1 } @@ -99,7 +114,7 @@ struct AIEInsertTraceFlowsPass info.traceOp = trace; info.tile = tile; info.packetId = *packetId; - info.packetType = packetType; + info.packetType = *packetType; info.tracePort = tracePort; info.traceChannel = traceChannel; traceInfos.push_back(info); @@ -216,12 +231,27 @@ struct AIEInsertTraceFlowsPass return; } - // Insert trace infrastructure at the beginning of runtime sequence - // NOTE: trace.start_config insertion is NOT done here. - // The source MLIR should already contain aie.trace.start_config ops, + // Insert trace infrastructure AFTER aie.trace.start_config ops + // NOTE: The source MLIR should already contain aie.trace.start_config ops, // and --aie-inline-trace-config will expand them to register writes. + // We need to insert broadcast start writes AFTER the start_config ops + // so that trace configuration registers are written before trace starts. Block &seqBlock = runtimeSeq.getBody().front(); - builder.setInsertionPointToStart(&seqBlock); + + // Find the last TraceStartConfigOp in the runtime sequence + Operation *lastStartConfig = nullptr; + for (auto &op : seqBlock.getOperations()) { + if (isa(op)) { + lastStartConfig = &op; + } + } + + // Insert after the last start_config op, or at start if none found + if (lastStartConfig) { + builder.setInsertionPointAfter(lastStartConfig); + } else { + builder.setInsertionPointToStart(&seqBlock); + } // 4b. Insert per-tile timer controls std::set> processedTiles; // (col, row) @@ -415,6 +445,8 @@ struct AIEInsertTraceFlowsPass } }; +} // namespace + std::unique_ptr> xilinx::AIE::createAIEInsertTraceFlowsPass() { return std::make_unique(); diff --git a/lib/Dialect/AIE/Transforms/AIETraceToConfig.cpp b/lib/Dialect/AIE/Transforms/AIETraceToConfig.cpp index 7b778d08ea5..ae0015071ea 100644 --- a/lib/Dialect/AIE/Transforms/AIETraceToConfig.cpp +++ b/lib/Dialect/AIE/Transforms/AIETraceToConfig.cpp @@ -231,7 +231,8 @@ struct AIETraceToConfigPass if (eventNum) { startEvent = *eventNum; } else { - startOp.emitError("unknown broadcast event '") << eventName << "'"; + startOp.emitError("unknown broadcast event '") + << eventName << "'"; return signalPassFailure(); } } else if (auto eventAttr = startOp.getEvent()) { @@ -306,8 +307,10 @@ struct AIETraceToConfigPass } // Emit mode if present. - // Memory trace does not expose Trace_Control0.Mode in the register DB. - if (auto modeOp = dyn_cast(op); modeOp && !isMem) { + // Only core traces expose Trace_Control0.Mode in the register DB. + // Memory, memory_tile, and shim modules do not have the Mode field. + bool isCore = (packetType == TracePacketType::Core); + if (auto modeOp = dyn_cast(op); modeOp && isCore) { configBuilder.create( trace.getLoc(), builder.getStringAttr("Trace_Control0"), builder.getStringAttr("Mode"), diff --git a/programming_examples/basic/event_trace/Makefile b/programming_examples/basic/event_trace/Makefile index fc24e8c792b..e7de7e505c1 100644 --- a/programming_examples/basic/event_trace/Makefile +++ b/programming_examples/basic/event_trace/Makefile @@ -91,11 +91,7 @@ endif run_trace: ${targetname}.exe build/final.xclbin build/insts.bin @echo "Running with declarative trace syntax..." ${powershell} ./$< -x build/final.xclbin -i build/insts.bin -k MLIR_AIE -t ${trace_size} -<<<<<<< HEAD ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir.prj/main_physical_with_elfs.mlir --output trace.json -======= - ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir.prj/input_physical_with_elfs.mlir --output trace.json ->>>>>>> fifield/insert-trace-flows ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace.json @echo "" @@ -107,11 +103,7 @@ run_trace: ${targetname}.exe build/final.xclbin build/insts.bin run_trace_py: build/final.xclbin build/insts.bin @echo "Running Python test with declarative trace syntax..." python3 ${srcdir}/test.py --xclbin build/final.xclbin --instr build/insts.bin --kernel MLIR_AIE --verbosity 1 --trace-sz ${trace_size} --trace-file trace.txt -<<<<<<< HEAD ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir.prj/main_physical_with_elfs.mlir --output trace.json -======= - ${srcdir}/../../../python/utils/trace/parse.py --input trace.txt --mlir build/aie_trace.mlir.prj/input_physical_with_elfs.mlir --output trace.json ->>>>>>> fifield/insert-trace-flows ${srcdir}/../../../python/utils/trace/get_trace_summary.py --input trace.json @echo "" @echo "Generating trace visualization..." diff --git a/programming_examples/basic/event_trace/README.md b/programming_examples/basic/event_trace/README.md index e6fde07f332..88a2203d122 100644 --- a/programming_examples/basic/event_trace/README.md +++ b/programming_examples/basic/event_trace/README.md @@ -50,16 +50,18 @@ aie.runtime_sequence(...) { ``` Compiler lowering pipeline for declarative trace: -1. `-aie-trace-to-config` -2. `-aie-trace-pack-reg-writes` -3. `-aie-inline-trace-config` +1. `-aie-insert-trace-flows` +2. `-aie-trace-to-config` +3. `-aie-trace-pack-reg-writes` +4. `-aie-inline-trace-config` Inspect intermediate IR: ```bash -aie-opt -aie-trace-to-config aie_trace.mlir -aie-opt -aie-trace-to-config -aie-trace-pack-reg-writes aie_trace.mlir -aie-opt -aie-trace-to-config -aie-trace-pack-reg-writes -aie-inline-trace-config aie_trace.mlir +aie-opt -aie-insert-trace-flows aie_trace.mlir +aie-opt -aie-insert-trace-flows -aie-trace-to-config aie_trace.mlir +aie-opt -aie-insert-trace-flows -aie-trace-to-config -aie-trace-pack-reg-writes aie_trace.mlir +aie-opt -aie-insert-trace-flows -aie-trace-to-config -aie-trace-pack-reg-writes -aie-inline-trace-config aie_trace.mlir ``` ## Example Visualization diff --git a/programming_examples/basic/event_trace/aie_trace.mlir b/programming_examples/basic/event_trace/aie_trace.mlir index c6bc449243f..e91bc762fc0 100644 --- a/programming_examples/basic/event_trace/aie_trace.mlir +++ b/programming_examples/basic/event_trace/aie_trace.mlir @@ -13,7 +13,7 @@ // - aie.trace.event for specifying events to capture // - aie.trace.start_config in runtime sequence // -// The passes aie-trace-to-config and aie-inline-trace-config will lower this. +// This will be incrementally lowered by trace passes // //===----------------------------------------------------------------------===// @@ -24,12 +24,21 @@ module { // Tile declarations %shim_noc_tile_0_0 = aie.tile(0, 0) + %mem_tile_0_1 = aie.tile(0, 1) %tile_0_2 = aie.tile(0, 2) // ObjectFIFOs for data movement - aie.objectfifo @in(%shim_noc_tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @infactor(%shim_noc_tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @out(%tile_0_2, {%shim_noc_tile_0_0}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @in(%shim_noc_tile_0_0, {%mem_tile_0_1}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @in_fwd(%mem_tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@in] -> [@in_fwd]([] [0]) + + aie.objectfifo @infactor(%shim_noc_tile_0_0, {%mem_tile_0_1}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @infactor_fwd(%mem_tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@infactor] -> [@infactor_fwd]([] [0]) + + aie.objectfifo @out(%tile_0_2, {%mem_tile_0_1}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @out_fwd(%mem_tile_0_1, {%shim_noc_tile_0_0}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@out] -> [@out_fwd]([] [0]) // Core computation %core_0_2 = aie.core(%tile_0_2) { @@ -37,7 +46,7 @@ module { %c9223372036854775807 = arith.constant 9223372036854775807 : index %c1 = arith.constant 1 : index scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @infactor(Consume, 1) : !aie.objectfifosubview> + %0 = aie.objectfifo.acquire @infactor_fwd(Consume, 1) : !aie.objectfifosubview> %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1xi32> %c0_0 = arith.constant 0 : index %c4 = arith.constant 4 : index @@ -45,14 +54,14 @@ module { scf.for %arg1 = %c0_0 to %c4 step %c1_1 { %2 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview> %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<1024xi32> - %4 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview> + %4 = aie.objectfifo.acquire @in_fwd(Consume, 1) : !aie.objectfifosubview> %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<1024xi32> %c1024_i32 = arith.constant 1024 : i32 func.call @vector_scalar_mul_aie_scalar(%5, %3, %1, %c1024_i32) : (memref<1024xi32>, memref<1024xi32>, memref<1xi32>, i32) -> () - aie.objectfifo.release @in(Consume, 1) + aie.objectfifo.release @in_fwd(Consume, 1) aie.objectfifo.release @out(Produce, 1) } - aie.objectfifo.release @infactor(Consume, 1) + aie.objectfifo.release @infactor_fwd(Consume, 1) } aie.end } @@ -63,35 +72,39 @@ module { // Trace configuration for compute tile (0,2) - core events aie.trace @core_trace(%tile_0_2) { - // Set trace mode (Event-Time captures timestamps) + // Core traces have a trace mode (Event-Time captures timestamps) aie.trace.mode "Event-Time" - // Configure packet routing (ID and type for packet-switched routing) + // Packet routing configuration: + // - id is optional; if omitted, auto-allocated by trace pass + // - type is inferred based on tile type except for core tiles, since both + // trace units exist; defaults to type=core for core tiles aie.trace.packet id=1 type=core // Specify which events to capture (up to 8 events) - aie.trace.event<"INSTR_EVENT_0"> // User event 0 (start marker) - aie.trace.event<"INSTR_EVENT_1"> // User event 1 (end marker) - aie.trace.event<"INSTR_VECTOR"> // Vector instructions - aie.trace.event<"MEMORY_STALL"> // Memory access stalls - aie.trace.event<"STREAM_STALL"> // Stream buffer stalls - aie.trace.event<"LOCK_STALL"> // Lock acquisition stalls - aie.trace.event<"PORT_RUNNING_1"> // DMA:0 slave port running - aie.trace.event<"PORT_IDLE_1"> // DMA:1 master port running + aie.trace.event<"INSTR_EVENT_0"> + aie.trace.event<"INSTR_EVENT_1"> + aie.trace.event<"INSTR_VECTOR"> + aie.trace.event<"MEMORY_STALL"> + aie.trace.event<"STREAM_STALL"> + aie.trace.event<"LOCK_STALL"> + aie.trace.event<"PORT_RUNNING_0"> + aie.trace.event<"PORT_RUNNING_1"> + + // PORT_RUNNING/IDLE/STALLED events monitor stream switch ports, but the + // hardware needs to know which physical port to monitor for each slot. + // trace.port maps slot N to a specific port (bundle + channel + direction). aie.trace.port<0> port=DMA channel=0 direction=S2MM aie.trace.port<1> port=DMA channel=0 direction=MM2S // Specify start/stop control (broadcast events) - aie.trace.start event=<"BROADCAST_15"> - aie.trace.stop event=<"BROADCAST_14"> + aie.trace.start broadcast=15 + aie.trace.stop broadcast=14 } // Trace configuration for compute tile (0,2) - memory events aie.trace @mem_trace(%tile_0_2) { - // Set trace mode (Event-Time captures timestamps) - aie.trace.mode "Event-Time" - - // Configure packet routing (ID and type for packet-switched routing) + // For core tiles, type=mem selects the memory module trace unit aie.trace.packet id=3 type=mem // Specify which events to capture (up to 8 events) @@ -109,12 +122,42 @@ module { aie.trace.stop event=<"BROADCAST_14"> } + // Trace configuration for mem tile (0, 1) + aie.trace @memtile_trace(%mem_tile_0_1) { + // For memtiles, type=memtile is inferred but can be explicit + aie.trace.packet id=4 type=memtile + + // Specify which events to capture (up to 8 events) + aie.trace.event<"PORT_RUNNING_0"> + aie.trace.event<"PORT_RUNNING_1"> + aie.trace.event<"PORT_RUNNING_2"> + aie.trace.event<"PORT_RUNNING_3"> + aie.trace.event<"PORT_RUNNING_4"> + aie.trace.event<"PORT_RUNNING_5"> + aie.trace.event<"PORT_RUNNING_6"> + aie.trace.event<"PORT_RUNNING_7"> + + // Map each port event slot to a physical DMA port + aie.trace.port<0> port=DMA channel=0 direction=MM2S + aie.trace.port<1> port=DMA channel=1 direction=MM2S + aie.trace.port<2> port=DMA channel=0 direction=S2MM + aie.trace.port<3> port=DMA channel=1 direction=S2MM + aie.trace.port<4> port=DMA channel=2 direction=S2MM + aie.trace.port<5> port=DMA channel=3 direction=S2MM + aie.trace.port<6> port=DMA channel=4 direction=S2MM + aie.trace.port<7> port=DMA channel=5 direction=S2MM + + // Specify start/stop control (broadcast events) + aie.trace.start broadcast=15 + aie.trace.stop broadcast=14 + } + // Trace configuration for shim tile (0,0) - // Captures DMA activity at the interface to DDR aie.trace @shim_trace(%shim_noc_tile_0_0) { + // For shim tiles, type=shimtile is inferred but can be explicit aie.trace.packet id=2 type=shimtile - // Shim DMA events + // Specify which events to capture (up to 8 events) aie.trace.event<"DMA_S2MM_0_START_TASK"> aie.trace.event<"DMA_S2MM_1_START_TASK"> aie.trace.event<"DMA_MM2S_0_START_TASK"> @@ -124,101 +167,33 @@ module { aie.trace.event<"DMA_S2MM_0_STREAM_STARVATION"> aie.trace.event<"DMA_S2MM_1_STREAM_STARVATION"> + // Specify start/stop control (broadcast events) aie.trace.start event=<"TRUE"> aie.trace.stop event=<"NONE"> } - // Packet flows to route trace data (same as before) - // These define the routing but the trace config is separate - aie.packet_flow(1) { - aie.packet_source<%tile_0_2, Trace : 0> - aie.packet_dest<%shim_noc_tile_0_0, DMA : 1> - } {keep_pkt_header = true} - aie.packet_flow(3) { - aie.packet_source<%tile_0_2, Trace : 1> - aie.packet_dest<%shim_noc_tile_0_0, DMA : 1> - } {keep_pkt_header = true} - - aie.packet_flow(2) { - aie.packet_source<%shim_noc_tile_0_0, Trace : 0> - aie.packet_dest<%shim_noc_tile_0_0, DMA : 1> - } {keep_pkt_header = true} - // ======================================================================== // RUNTIME SEQUENCE WITH TRACE ACTIVATION // ======================================================================== - // Runtime sequence with trace configuration + // Runtime sequence aie.runtime_sequence(%arg0: memref<4096xi32>, %arg1: memref<1xi32>, %arg2: memref<4096xi32>) { // ======================================================================== // TRACE INITIALIZATION // ======================================================================== - // Start trace configuration for core tile - // This will be lowered to the aiex.npu.write32 operations automatically + // Start trace configuration aie.trace.start_config @core_trace aie.trace.start_config @mem_trace - - // Start trace configuration for shim tile + aie.trace.start_config @memtile_trace aie.trace.start_config @shim_trace - // Address 212992 (0x34000): Timer_Control - aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} - - // Configure trace buffer descriptor (still manual for now) - aiex.npu.writebd { - bd_id = 15 : i32, - buffer_length = 8192 : i32, - buffer_offset = 0 : i32, - burst_length = 64 : i32, - column = 0 : i32, - d0_size = 0 : i32, - d0_stride = 0 : i32, - d0_zero_after = 0 : i32, - d0_zero_before = 0 : i32, - d1_size = 0 : i32, - d1_stride = 0 : i32, - d1_zero_after = 0 : i32, - d1_zero_before = 0 : i32, - d2_size = 0 : i32, - d2_stride = 0 : i32, - d2_zero_after = 0 : i32, - d2_zero_before = 0 : i32, - enable_packet = 1 : i32, - iteration_current = 0 : i32, - iteration_size = 0 : i32, - iteration_stride = 0 : i32, - lock_acq_enable = 0 : i32, - lock_acq_id = 0 : i32, - lock_acq_val = 0 : i32, - lock_rel_id = 0 : i32, - lock_rel_val = 0 : i32, - next_bd = 0 : i32, - out_of_order_id = 0 : i32, - packet_id = 0 : i32, - packet_type = 0 : i32, - row = 0 : i32, - use_next_bd = 0 : i32, - valid_bd = 1 : i32 - } - - // Patch trace buffer address - aiex.npu.address_patch {addr = 119268 : ui32, arg_idx = 4 : i32, arg_plus = 0 : i32} - - // Configure DMA channel for trace - aiex.npu.maskwrite32 {address = 119304 : ui32, column = 0 : i32, mask = 7936 : ui32, row = 0 : i32, value = 3840 : ui32} - aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483663 : ui32} - - // Start trace control - aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} - aiex.npu.write32 {address = 213068 : ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} - aiex.npu.write32 {address = 213000 : ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} - // ======================================================================== // DATA TRANSFER CONFIGURATION // ======================================================================== + // Configure DMA tasks for input, factor, and output %0 = aiex.dma_configure_task_for @in { aie.dma_bd(%arg0 : memref<4096xi32>, 0, 4096, [, , , ]) {burst_length = 0 : i32} aie.end @@ -229,24 +204,17 @@ module { aie.end } {issue_token = true} - %2 = aiex.dma_configure_task_for @out { + %2 = aiex.dma_configure_task_for @out_fwd { aie.dma_bd(%arg2 : memref<4096xi32>, 0, 4096, [, , , ]) {burst_length = 0 : i32} aie.end } {issue_token = true} - + aiex.dma_start_task(%0) aiex.dma_start_task(%1) aiex.dma_start_task(%2) aiex.dma_await_task(%0) aiex.dma_await_task(%1) aiex.dma_await_task(%2) - - // ======================================================================== - // TRACE COMPLETION - // ======================================================================== - - aiex.npu.write32 {address = 213064 : ui32, column = 0 : i32, row = 0 : i32, value = 126 : ui32} - aiex.npu.write32 {address = 213000 : ui32, column = 0 : i32, row = 0 : i32, value = 126 : ui32} } } } diff --git a/programming_examples/basic/event_trace/aie_trace.py b/programming_examples/basic/event_trace/aie_trace.py index 3b4bc6f7b9a..a704cf233d6 100644 --- a/programming_examples/basic/event_trace/aie_trace.py +++ b/programming_examples/basic/event_trace/aie_trace.py @@ -13,7 +13,7 @@ # - aie.trace.event for specifying events to capture # - aie.trace.start_config in runtime sequence # -# The passes aie-trace-to-config and aie-inline-trace-config will lower this. +# This will be incrementally lowered by trace passes # # Usage: # python3 aie_trace.py > aie_trace_from_py.mlir @@ -49,25 +49,38 @@ def device_body(): # Tile declarations shim_noc_tile_0_0 = tile(0, 0) + mem_tile_0_1 = tile(0, 1) tile_0_2 = tile(0, 2) - # ObjectFIFOs for data movement - of_in = object_fifo("in", shim_noc_tile_0_0, tile_0_2, 2, tile_ty) - of_factor = object_fifo("infactor", shim_noc_tile_0_0, tile_0_2, 2, scalar_ty) - of_out = object_fifo("out", tile_0_2, shim_noc_tile_0_0, 2, tile_ty) + # ObjectFIFOs for data movement through memtile + of_in = object_fifo("in", shim_noc_tile_0_0, mem_tile_0_1, 2, tile_ty) + of_in_fwd = object_fifo("in_fwd", mem_tile_0_1, tile_0_2, 2, tile_ty) + object_fifo_link(of_in, of_in_fwd) + + of_factor = object_fifo( + "infactor", shim_noc_tile_0_0, mem_tile_0_1, 2, scalar_ty + ) + of_factor_fwd = object_fifo( + "infactor_fwd", mem_tile_0_1, tile_0_2, 2, scalar_ty + ) + object_fifo_link(of_factor, of_factor_fwd) + + of_out = object_fifo("out", tile_0_2, mem_tile_0_1, 2, tile_ty) + of_out_fwd = object_fifo("out_fwd", mem_tile_0_1, shim_noc_tile_0_0, 2, tile_ty) + object_fifo_link(of_out, of_out_fwd) # Core computation @core(tile_0_2) def core_body(): for _ in range_(sys.maxsize): - elem_factor = of_factor.acquire(ObjectFifoPort.Consume, 1) + elem_factor = of_factor_fwd.acquire(ObjectFifoPort.Consume, 1) for _ in range_(num_sub_vectors): elem_out = of_out.acquire(ObjectFifoPort.Produce, 1) - elem_in = of_in.acquire(ObjectFifoPort.Consume, 1) + elem_in = of_in_fwd.acquire(ObjectFifoPort.Consume, 1) scale(elem_in, elem_out, elem_factor, tile_size) - of_in.release(ObjectFifoPort.Consume, 1) + of_in_fwd.release(ObjectFifoPort.Consume, 1) of_out.release(ObjectFifoPort.Produce, 1) - of_factor.release(ObjectFifoPort.Consume, 1) + of_factor_fwd.release(ObjectFifoPort.Consume, 1) # ================================================================== # TRACE CONFIGURATION @@ -84,17 +97,16 @@ def core_trace_body(): trace_event("MEMORY_STALL") trace_event("STREAM_STALL") trace_event("LOCK_STALL") + trace_event("PORT_RUNNING_0") trace_event("PORT_RUNNING_1") - trace_event("PORT_IDLE_1") trace_port(0, WireBundle.DMA, 0, DMAChannelDir.S2MM) trace_port(1, WireBundle.DMA, 0, DMAChannelDir.MM2S) - trace_start(event="BROADCAST_15") - trace_stop(event="BROADCAST_14") + trace_start(broadcast=15) + trace_stop(broadcast=14) # Trace configuration for compute tile (0,2) - memory events @trace(tile_0_2, "mem_trace") def mem_trace_body(): - trace_mode(TraceMode.EventTime) trace_packet(3, TracePacketType.Mem) trace_event("DMA_S2MM_0_START_TASK") trace_event("DMA_S2MM_1_START_TASK") @@ -107,6 +119,29 @@ def mem_trace_body(): trace_start(event="BROADCAST_15") trace_stop(event="BROADCAST_14") + # Trace configuration for mem tile (0,1) + @trace(mem_tile_0_1, "memtile_trace") + def memtile_trace_body(): + trace_packet(4, TracePacketType.MemTile) + trace_event("PORT_RUNNING_0") + trace_event("PORT_RUNNING_1") + trace_event("PORT_RUNNING_2") + trace_event("PORT_RUNNING_3") + trace_event("PORT_RUNNING_4") + trace_event("PORT_RUNNING_5") + trace_event("PORT_RUNNING_6") + trace_event("PORT_RUNNING_7") + trace_port(0, WireBundle.DMA, 0, DMAChannelDir.MM2S) + trace_port(1, WireBundle.DMA, 1, DMAChannelDir.MM2S) + trace_port(2, WireBundle.DMA, 0, DMAChannelDir.S2MM) + trace_port(3, WireBundle.DMA, 1, DMAChannelDir.S2MM) + trace_port(4, WireBundle.DMA, 2, DMAChannelDir.S2MM) + trace_port(5, WireBundle.DMA, 3, DMAChannelDir.S2MM) + trace_port(6, WireBundle.DMA, 4, DMAChannelDir.S2MM) + trace_port(7, WireBundle.DMA, 5, DMAChannelDir.S2MM) + trace_start(broadcast=15) + trace_stop(broadcast=14) + # Trace configuration for shim tile (0,0) @trace(shim_noc_tile_0_0, "shim_trace") def shim_trace_body(): @@ -122,99 +157,19 @@ def shim_trace_body(): trace_start(event="TRUE") trace_stop(event="NONE") - # Packet flows to route trace data - packetflow( - 1, - tile_0_2, - WireBundle.Trace, - 0, - {"dest": shim_noc_tile_0_0, "port": WireBundle.DMA, "channel": 1}, - keep_pkt_header=True, - ) - packetflow( - 3, - tile_0_2, - WireBundle.Trace, - 1, - {"dest": shim_noc_tile_0_0, "port": WireBundle.DMA, "channel": 1}, - keep_pkt_header=True, - ) - packetflow( - 2, - shim_noc_tile_0_0, - WireBundle.Trace, - 0, - {"dest": shim_noc_tile_0_0, "port": WireBundle.DMA, "channel": 1}, - keep_pkt_header=True, - ) - # ================================================================== # RUNTIME SEQUENCE WITH TRACE ACTIVATION # ================================================================== @runtime_sequence(tensor_ty, scalar_ty, tensor_ty) def sequence(A, F, C): - # Trace initialization - applied by lowering passes + # Start trace configuration trace_start_config("core_trace") trace_start_config("mem_trace") + trace_start_config("memtile_trace") trace_start_config("shim_trace") - # Timer_Control (address 0x34000 = 212992) - npu_write32(column=0, row=2, address=212992, value=31232) - - # Configure trace buffer descriptor - npu_writebd( - bd_id=15, - buffer_length=8192, - buffer_offset=0, - burst_length=64, - column=0, - d0_size=0, - d0_stride=0, - d0_zero_after=0, - d0_zero_before=0, - d1_size=0, - d1_stride=0, - d1_zero_after=0, - d1_zero_before=0, - d2_size=0, - d2_stride=0, - d2_zero_after=0, - d2_zero_before=0, - enable_packet=1, - iteration_current=0, - iteration_size=0, - iteration_stride=0, - lock_acq_enable=0, - lock_acq_id=0, - lock_acq_val=0, - lock_rel_id=0, - lock_rel_val=0, - next_bd=0, - out_of_order_id=0, - packet_id=0, - packet_type=0, - row=0, - use_next_bd=0, - valid_bd=1, - ) - - # Patch trace buffer address - npu_address_patch(addr=119268, arg_idx=4, arg_plus=0) - - # Configure DMA channel for trace - npu_maskwrite32(address=119304, column=0, mask=7936, row=0, value=3840) - npu_write32(address=119308, column=0, row=0, value=2147483663) - - # Start trace control - npu_write32(address=212992, column=0, row=0, value=32512) - npu_write32(address=213068, column=0, row=0, value=127) - npu_write32(address=213000, column=0, row=0, value=127) - - # ============================================================== - # DATA TRANSFER CONFIGURATION - # ============================================================== - + # Configure DMA tasks for input, factor, and output in_task = shim_dma_single_bd_task( of_in, A, sizes=[1, 1, 1, tensor_size], issue_token=True ) @@ -222,19 +177,12 @@ def sequence(A, F, C): of_factor, F, sizes=[1, 1, 1, 1], issue_token=True ) out_task = shim_dma_single_bd_task( - of_out, C, sizes=[1, 1, 1, tensor_size], issue_token=True + of_out_fwd, C, sizes=[1, 1, 1, tensor_size], issue_token=True ) dma_start_task(in_task, factor_task, out_task) dma_await_task(in_task, factor_task, out_task) - # ============================================================== - # TRACE COMPLETION - # ============================================================== - - npu_write32(address=213064, column=0, row=0, value=126) - npu_write32(address=213000, column=0, row=0, value=126) - with mlir_mod_ctx() as ctx: build_aie_trace() diff --git a/test/Dialect/AIE/combo_edge/test_combo_edge_full.mlir b/test/Dialect/AIE/combo_edge/test_combo_edge_full.mlir index 5ad4f4c6833..ad48ba0b1d5 100644 --- a/test/Dialect/AIE/combo_edge/test_combo_edge_full.mlir +++ b/test/Dialect/AIE/combo_edge/test_combo_edge_full.mlir @@ -34,8 +34,8 @@ aie.device(npu1_1col) { // Check trace control // CHECK: aie.trace.reg register = "Trace_Control0" field = "Mode" value = 0 // CHECK: aie.trace.reg register = "Trace_Control1" field = "ID" value = 1 - // CHECK: aie.trace.reg register = "Trace_Control0" field = "Trace_Start_Event" value = 15 - // CHECK: aie.trace.reg register = "Trace_Control0" field = "Trace_Stop_Event" value = 14 + // CHECK: aie.trace.reg register = "Trace_Control0" field = "Trace_Start_Event" value = 122 + // CHECK: aie.trace.reg register = "Trace_Control0" field = "Trace_Stop_Event" value = 121 // Check event slots // CHECK: aie.trace.reg register = "Trace_Event0" field = "Trace_Event0" // CHECK: aie.trace.reg register = "Trace_Event0" field = "Trace_Event1" diff --git a/test/Dialect/AIE/trace/test_trace_port_to_config.mlir b/test/Dialect/AIE/trace/test_trace_port_to_config.mlir index 5ee50f21564..4a9ee7be0b4 100644 --- a/test/Dialect/AIE/trace/test_trace_port_to_config.mlir +++ b/test/Dialect/AIE/trace/test_trace_port_to_config.mlir @@ -33,8 +33,8 @@ module { // CHECK: aie.device(npu1_1col) // CHECK: %[[TILE:.*]] = aie.tile(0, 2) // CHECK: aie.trace.config @port_trace_config(%[[TILE]]) -// CHECK-DAG: aie.trace.reg register = "Trace_Control0" field = "Trace_Start_Event" value = 15 -// CHECK-DAG: aie.trace.reg register = "Trace_Control0" field = "Trace_Stop_Event" value = 14 +// CHECK-DAG: aie.trace.reg register = "Trace_Control0" field = "Trace_Start_Event" value = 122 +// CHECK-DAG: aie.trace.reg register = "Trace_Control0" field = "Trace_Stop_Event" value = 121 // CHECK-DAG: aie.trace.reg register = "Trace_Control0" field = "Mode" value = 0 // CHECK-DAG: aie.trace.reg register = "Trace_Control1" field = "ID" value = 1 // CHECK-DAG: aie.trace.reg register = "Trace_Control1" field = "Packet_Type" value = 0 diff --git a/test/dialect/AIE/trace/test_trace_to_config.mlir b/test/dialect/AIE/trace/test_trace_to_config.mlir index 832e16d7833..1ec8faca40d 100644 --- a/test/dialect/AIE/trace/test_trace_to_config.mlir +++ b/test/dialect/AIE/trace/test_trace_to_config.mlir @@ -28,8 +28,8 @@ module { // CHECK-DAG: aie.trace.reg register = "Trace_Control0" field = "Mode" value = 0 // CHECK-DAG: aie.trace.reg register = "Trace_Control1" field = "ID" value = 1 // CHECK-DAG: aie.trace.reg register = "Trace_Control1" field = "Packet_Type" value = 0 - // CHECK-DAG: aie.trace.reg register = "Trace_Control0" field = "Trace_Start_Event" value = 15 - // CHECK-DAG: aie.trace.reg register = "Trace_Control0" field = "Trace_Stop_Event" value = 14 + // CHECK-DAG: aie.trace.reg register = "Trace_Control0" field = "Trace_Start_Event" value = 122 + // CHECK-DAG: aie.trace.reg register = "Trace_Control0" field = "Trace_Stop_Event" value = 121 // CHECK-DAG: aie.trace.reg register = "Trace_Event0" field = "Trace_Event0" value = "INSTR_EVENT_0" // CHECK-DAG: aie.trace.reg register = "Trace_Event0" field = "Trace_Event1" value = "INSTR_VECTOR" // CHECK-DAG: aie.trace.reg register = "Trace_Event0" field = "Trace_Event2" value = "LOCK_STALL" @@ -46,8 +46,8 @@ module { // CHECK-NOT: aie.trace.reg register = "Trace_Control0" field = "Mode" // CHECK-DAG: aie.trace.reg register = "Trace_Control1" field = "ID" value = 3 // CHECK-DAG: aie.trace.reg register = "Trace_Control1" field = "Packet_Type" value = 1 - // CHECK-DAG: aie.trace.reg register = "Trace_Control0" field = "Trace_Start_Event" value = 15 - // CHECK-DAG: aie.trace.reg register = "Trace_Control0" field = "Trace_Stop_Event" value = 14 + // CHECK-DAG: aie.trace.reg register = "Trace_Control0" field = "Trace_Start_Event" value = 122 + // CHECK-DAG: aie.trace.reg register = "Trace_Control0" field = "Trace_Stop_Event" value = 121 // CHECK-DAG: aie.trace.reg register = "Trace_Event0" field = "Trace_Event0" value = "DMA_S2MM_0_START_TASK" } } diff --git a/test/npu-xrt/vec_mul_event_trace/aie.mlir b/test/npu-xrt/vec_mul_event_trace/aie.mlir index bce20dcb959..e6a6cb3c4d5 100644 --- a/test/npu-xrt/vec_mul_event_trace/aie.mlir +++ b/test/npu-xrt/vec_mul_event_trace/aie.mlir @@ -4,33 +4,40 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2025, Advanced Micro Devices, Inc. +// Copyright (C) 2025-2026, Advanced Micro Devices, Inc. // //===----------------------------------------------------------------------===// // -// Vector-scalar multiplication with event trace functionality on NPU. -// This tests basic trace configuration and data capture -// -// trace components: -// 1. aie.packet_flow - Routes trace packets from compute tiles to shim DMA -// 2. aiex.npu.write32 - Configures trace control registers -// 3. aiex.npu.writebd - Sets up buffer descriptor for trace data capture +// This example uses: +// - aie.trace operation for declarative trace configuration +// - aie.trace.event for specifying events to capture +// - aie.trace.start_config in runtime sequence // +// This will be incrementally lowered by trace passes //===----------------------------------------------------------------------===// module { - aie.device(npu1_1col) { + aie.device(npu2_1col) { // External kernel function declaration func.func private @vector_scalar_mul_aie_scalar(memref<1024xi32>, memref<1024xi32>, memref<1xi32>, i32) attributes {link_with = "vector_scalar_mul.o"} // Tile declarations %shim_noc_tile_0_0 = aie.tile(0, 0) + %mem_tile_0_1 = aie.tile(0, 1) %tile_0_2 = aie.tile(0, 2) // ObjectFIFOs for data movement - aie.objectfifo @in(%shim_noc_tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @infactor(%shim_noc_tile_0_0, {%tile_0_2}, 2 : i32) : !aie.objectfifo> - aie.objectfifo @out(%tile_0_2, {%shim_noc_tile_0_0}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @in(%shim_noc_tile_0_0, {%mem_tile_0_1}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @in_fwd(%mem_tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@in] -> [@in_fwd]([] [0]) + + aie.objectfifo @infactor(%shim_noc_tile_0_0, {%mem_tile_0_1}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @infactor_fwd(%mem_tile_0_1, {%tile_0_2}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@infactor] -> [@infactor_fwd]([] [0]) + + aie.objectfifo @out(%tile_0_2, {%mem_tile_0_1}, 2 : i32) : !aie.objectfifo> + aie.objectfifo @out_fwd(%mem_tile_0_1, {%shim_noc_tile_0_0}, 2 : i32) : !aie.objectfifo> + aie.objectfifo.link [@out] -> [@out_fwd]([] [0]) // Core computation %core_0_2 = aie.core(%tile_0_2) { @@ -38,7 +45,7 @@ module { %c9223372036854775807 = arith.constant 9223372036854775807 : index %c1 = arith.constant 1 : index scf.for %arg0 = %c0 to %c9223372036854775807 step %c1 { - %0 = aie.objectfifo.acquire @infactor(Consume, 1) : !aie.objectfifosubview> + %0 = aie.objectfifo.acquire @infactor_fwd(Consume, 1) : !aie.objectfifosubview> %1 = aie.objectfifo.subview.access %0[0] : !aie.objectfifosubview> -> memref<1xi32> %c0_0 = arith.constant 0 : index %c4 = arith.constant 4 : index @@ -46,144 +53,122 @@ module { scf.for %arg1 = %c0_0 to %c4 step %c1_1 { %2 = aie.objectfifo.acquire @out(Produce, 1) : !aie.objectfifosubview> %3 = aie.objectfifo.subview.access %2[0] : !aie.objectfifosubview> -> memref<1024xi32> - %4 = aie.objectfifo.acquire @in(Consume, 1) : !aie.objectfifosubview> + %4 = aie.objectfifo.acquire @in_fwd(Consume, 1) : !aie.objectfifosubview> %5 = aie.objectfifo.subview.access %4[0] : !aie.objectfifosubview> -> memref<1024xi32> %c1024_i32 = arith.constant 1024 : i32 func.call @vector_scalar_mul_aie_scalar(%5, %3, %1, %c1024_i32) : (memref<1024xi32>, memref<1024xi32>, memref<1xi32>, i32) -> () - aie.objectfifo.release @in(Consume, 1) + aie.objectfifo.release @in_fwd(Consume, 1) aie.objectfifo.release @out(Produce, 1) } - aie.objectfifo.release @infactor(Consume, 1) + aie.objectfifo.release @infactor_fwd(Consume, 1) } aie.end } // ======================================================================== - // Trace Packet Flow Configuration + // TRACE CONFIGURATION // ======================================================================== - // Packet flows to route trace data from compute tile to shim DMA - // Flow 1: Route trace with id=1 from compute tile (0,2) to shim tile (0,0) DMA channel 1 - aie.packet_flow(1) { - aie.packet_source<%tile_0_2, Trace : 0> - aie.packet_dest<%shim_noc_tile_0_0, DMA : 1> - } {keep_pkt_header = true} + // Trace configuration for compute tile (0,2) - core events + aie.trace @core_trace(%tile_0_2) { + aie.trace.mode "Event-Time" + aie.trace.packet id=1 type=core + + aie.trace.event<"INSTR_EVENT_0"> + aie.trace.event<"INSTR_EVENT_1"> + aie.trace.event<"INSTR_VECTOR"> + aie.trace.event<"PORT_RUNNING_0"> + aie.trace.event<"PORT_RUNNING_1"> + aie.trace.event<"INSTR_LOCK_ACQUIRE_REQ"> + aie.trace.event<"INSTR_LOCK_RELEASE_REQ"> + aie.trace.event<"LOCK_STALL"> + + aie.trace.port<0> port=DMA channel=0 direction=S2MM + aie.trace.port<1> port=DMA channel=0 direction=MM2S + + aie.trace.start broadcast=15 + aie.trace.stop broadcast=14 + } - // Flow 2: Route trace with id=2 from shim tile itself to DMA channel 1 - aie.packet_flow(2) { - aie.packet_source<%shim_noc_tile_0_0, Trace : 0> - aie.packet_dest<%shim_noc_tile_0_0, DMA : 1> - } {keep_pkt_header = true} + // Trace configuration for compute tile (0,2) - memory events + aie.trace @mem_trace(%tile_0_2) { + aie.trace.packet id=2 type=mem + + aie.trace.event<"DMA_S2MM_0_START_TASK"> + aie.trace.event<"DMA_S2MM_1_START_TASK"> + aie.trace.event<"DMA_MM2S_0_START_TASK"> + aie.trace.event<"DMA_S2MM_0_FINISHED_TASK"> + aie.trace.event<"DMA_S2MM_1_FINISHED_TASK"> + aie.trace.event<"DMA_MM2S_0_FINISHED_TASK"> + aie.trace.event<"DMA_S2MM_0_STREAM_STARVATION"> + aie.trace.event<"DMA_S2MM_1_STREAM_STARVATION"> + + aie.trace.start event=<"BROADCAST_15"> + aie.trace.stop event=<"BROADCAST_14"> + } - // Runtime sequence with trace configuration - aie.runtime_sequence(%arg0: memref<4096xi32>, %arg1: memref<1xi32>, %arg2: memref<4096xi32>) { + // Trace configuration for mem tile (0, 1) + aie.trace @memtile_trace(%mem_tile_0_1) { + aie.trace.packet id=3 type=memtile + + aie.trace.event<"PORT_RUNNING_0"> + aie.trace.event<"PORT_RUNNING_1"> + aie.trace.event<"PORT_RUNNING_2"> + aie.trace.event<"PORT_RUNNING_3"> + aie.trace.event<"PORT_RUNNING_4"> + aie.trace.event<"PORT_RUNNING_5"> + aie.trace.event<"PORT_RUNNING_6"> + aie.trace.event<"PORT_RUNNING_7"> + + aie.trace.port<0> port=DMA channel=0 direction=MM2S + aie.trace.port<1> port=DMA channel=1 direction=MM2S + aie.trace.port<2> port=DMA channel=0 direction=S2MM + aie.trace.port<3> port=DMA channel=1 direction=S2MM + aie.trace.port<4> port=DMA channel=2 direction=S2MM + aie.trace.port<5> port=DMA channel=3 direction=S2MM + aie.trace.port<6> port=DMA channel=4 direction=S2MM + aie.trace.port<7> port=DMA channel=5 direction=S2MM + + aie.trace.start broadcast=15 + aie.trace.stop broadcast=14 + } - // ======================================================================== - // Trace Control Register Configuration - // ======================================================================== + // Trace configuration for shim tile (0,0) + aie.trace @shim_trace(%shim_noc_tile_0_0) { + aie.trace.packet id=4 type=shimtile + + aie.trace.event<"DMA_S2MM_0_START_TASK"> + aie.trace.event<"DMA_S2MM_1_START_TASK"> + aie.trace.event<"DMA_MM2S_0_START_TASK"> + aie.trace.event<"DMA_S2MM_0_FINISHED_TASK"> + aie.trace.event<"DMA_S2MM_1_FINISHED_TASK"> + aie.trace.event<"DMA_MM2S_0_FINISHED_TASK"> + aie.trace.event<"DMA_S2MM_0_STREAM_STARVATION"> + aie.trace.event<"DMA_S2MM_1_STREAM_STARVATION"> + + aie.trace.start event=<"TRUE"> + aie.trace.stop event=<"NONE"> + } + + // ======================================================================== + // RUNTIME SEQUENCE WITH TRACE ACTIVATION + // ======================================================================== - // Configure trace unit for compute tile (0,2) - // Address 213200 (0x340D0): Trace_Control_0 - // Value enables trace with specific event selection - aiex.npu.write32 {address = 213200 : ui32, column = 0 : i32, row = 2 : i32, value = 2038038528 : ui32} - - // Address 213204 (0x340D4): Trace_Control_1 - // Value configures trace mode and packet generation - aiex.npu.write32 {address = 213204 : ui32, column = 0 : i32, row = 2 : i32, value = 1 : ui32} - - // Address 213216 (0x340E0): Trace_Event_0 - // Configures which events to trace (events 0-3) - aiex.npu.write32 {address = 213216 : ui32, column = 0 : i32, row = 2 : i32, value = 1260724769 : ui32} - - // Address 213220 (0x340E4): Trace_Event_1 - // Configures which events to trace (events 4-7) - aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 2 : i32, value = 439168079 : ui32} - - // Address 261888 (0x3FF00): Stream_Switch_Event_Port_Selection_0 - // Select Stream Switch Ports for event generation - aiex.npu.write32 {address = 261888 : ui32, column = 0 : i32, row = 2 : i32, value = 289 : ui32} - // Address 261892 (0x3FF04): Stream_Switch_Event_Port_Selection_1 - aiex.npu.write32 {address = 261892 : ui32, column = 0 : i32, row = 2 : i32, value = 0 : ui32} - - // Address 212992 (0x34000): Timer_Control - aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 2 : i32, value = 31232 : ui32} - - // Configure trace unit for shim tile (0,0) - // Address 213200 (0x340D0): Trace_Control_0 - aiex.npu.write32 {address = 213200 : ui32, column = 0 : i32, row = 0 : i32, value = 2122252288 : ui32} - // Address 213204 (0x340D4): Trace_Control_1 - aiex.npu.write32 {address = 213204 : ui32, column = 0 : i32, row = 0 : i32, value = 8194 : ui32} - // Address 213216 (0x340E0): Trace_Event_0 - aiex.npu.write32 {address = 213216 : ui32, column = 0 : i32, row = 0 : i32, value = 370151182 : ui32} - // Address 213220 (0x340E4): Trace_Event_1 - aiex.npu.write32 {address = 213220 : ui32, column = 0 : i32, row = 0 : i32, value = 522065943 : ui32} - // Address 212992 (0x34000): Timer_Control - aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} + aie.runtime_sequence(%arg0: memref<4096xi32>, %arg1: memref<1xi32>, %arg2: memref<4096xi32>) { // ======================================================================== - // Trace Buffer Descriptor and DMA Configuration + // TRACE INITIALIZATION // ======================================================================== - // Configure buffer descriptor 15 for trace data capture - aiex.npu.writebd { - bd_id = 15 : i32, - buffer_length = 8192 : i32, // 8KB trace buffer - buffer_offset = 0 : i32, - burst_length = 64 : i32, - column = 0 : i32, - d0_size = 0 : i32, - d0_stride = 0 : i32, - d0_zero_after = 0 : i32, - d0_zero_before = 0 : i32, - d1_size = 0 : i32, - d1_stride = 0 : i32, - d1_zero_after = 0 : i32, - d1_zero_before = 0 : i32, - d2_size = 0 : i32, - d2_stride = 0 : i32, - d2_zero_after = 0 : i32, - d2_zero_before = 0 : i32, - enable_packet = 1 : i32, // Enable packet mode for trace - iteration_current = 0 : i32, - iteration_size = 0 : i32, - iteration_stride = 0 : i32, - lock_acq_enable = 0 : i32, - lock_acq_id = 0 : i32, - lock_acq_val = 0 : i32, - lock_rel_id = 0 : i32, - lock_rel_val = 0 : i32, - next_bd = 0 : i32, - out_of_order_id = 0 : i32, - packet_id = 0 : i32, - packet_type = 0 : i32, - row = 0 : i32, - use_next_bd = 0 : i32, - valid_bd = 1 : i32 - } - - // Patch the trace buffer address (arg_idx = 4 corresponds to the 5th XRT buffer) - // Address 119268 (0x1D1E4): Buffer descriptor 15 address field - aiex.npu.address_patch {addr = 119268 : ui32, arg_idx = 4 : i32, arg_plus = 0 : i32} - - // Configure DMA channel 1 for trace data transfer - // Address 119304 (0x1D208): DMA_S2MM_1_Control - aiex.npu.maskwrite32 {address = 119304 : ui32, column = 0 : i32, mask = 7936 : ui32, row = 0 : i32, value = 3840 : ui32} - // Address 119308 (0x1D20C): DMA_S2MM_1_Queue - aiex.npu.write32 {address = 119308 : ui32, column = 0 : i32, row = 0 : i32, value = 2147483663 : ui32} - - // Start trace control - // Address 212992 (0x34000): Timer_Control - aiex.npu.write32 {address = 212992 : ui32, column = 0 : i32, row = 0 : i32, value = 32512 : ui32} - // Address 213068 (0x3404C): Event_Broadcast_15 - aiex.npu.write32 {address = 213068 : ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} - // Address 213000 (0x34008): Event_Generate - aiex.npu.write32 {address = 213000 : ui32, column = 0 : i32, row = 0 : i32, value = 127 : ui32} + aie.trace.start_config @core_trace + aie.trace.start_config @mem_trace + aie.trace.start_config @memtile_trace + aie.trace.start_config @shim_trace // ======================================================================== - // Kernel Data Transfer + // DATA TRANSFER CONFIGURATION // ======================================================================== - // Configure DMA tasks for input, factor, and output %0 = aiex.dma_configure_task_for @in { aie.dma_bd(%arg0 : memref<4096xi32>, 0, 4096, [, , , ]) {burst_length = 0 : i32} aie.end @@ -194,28 +179,17 @@ module { aie.end } {issue_token = true} - %2 = aiex.dma_configure_task_for @out { + %2 = aiex.dma_configure_task_for @out_fwd { aie.dma_bd(%arg2 : memref<4096xi32>, 0, 4096, [, , , ]) {burst_length = 0 : i32} aie.end } {issue_token = true} - // Start and await data transfer tasks aiex.dma_start_task(%0) aiex.dma_start_task(%1) aiex.dma_start_task(%2) aiex.dma_await_task(%0) aiex.dma_await_task(%1) aiex.dma_await_task(%2) - - // ======================================================================== - // Trace Epilogue - // ======================================================================== - - // Flush trace data by writing trace done event - // Address 213064 (0x34048): Event_Broadcast_14 - aiex.npu.write32 {address = 213064 : ui32, column = 0 : i32, row = 0 : i32, value = 126 : ui32} - // Address 213000 (0x34008): Event_Generate - aiex.npu.write32 {address = 213000 : ui32, column = 0 : i32, row = 0 : i32, value = 126 : ui32} } } } diff --git a/test/npu-xrt/vec_mul_event_trace/test.py b/test/npu-xrt/vec_mul_event_trace/test.py index 133743276e8..2ab34f827cc 100644 --- a/test/npu-xrt/vec_mul_event_trace/test.py +++ b/test/npu-xrt/vec_mul_event_trace/test.py @@ -9,15 +9,15 @@ # # ===-----------------------------------------------------------------------===# # -# REQUIRES: ryzen_ai_npu1, xrt_python_bindings +# REQUIRES: ryzen_ai_npu2, xrt_python_bindings # # Build the test -# RUN: xchesscc_wrapper aie2 -I %aietools/include -c %S/vector_scalar_mul.cc -o vector_scalar_mul.o +# RUN: xchesscc_wrapper aie2p -I %aietools/include -c %S/vector_scalar_mul.cc -o vector_scalar_mul.o # RUN: %python aiecc.py --no-aiesim --aie-generate-xclbin --aie-generate-npu-insts --no-compile-host --xclbin-name=final.xclbin --npu-insts-name=insts.bin %S/aie.mlir -# Run the test -# RUN: %run_on_npu1% %python %S/test.py --xclbin final.xclbin --instr insts.bin --kernel MLIR_AIE --trace-sz 8192 --mlir %S/aie.mlir | FileCheck %s +# Run the test (input_with_addresses.mlir contains the lowered npu_write ops) +# RUN: %run_on_npu2% %python %S/test.py --xclbin final.xclbin --instr insts.bin --kernel MLIR_AIE --trace-sz 8192 --mlir aie.mlir.prj/input_with_addresses.mlir | FileCheck %s # CHECK: PASS! import numpy as np import sys @@ -108,31 +108,74 @@ def main(opts): if not trace_events: print("ERROR: Failed to generate trace events (empty or False returned).") errors += 1 - instr_event_0_count = 0 - instr_event_1_count = 0 else: - instr_event_0_count = sum( + # Count events from each trace type + # Core trace: INSTR_EVENT_0, INSTR_EVENT_1 + core_instr_event_0 = sum( 1 for event in trace_events if event.get("name") == "INSTR_EVENT_0" and event.get("ph") == "B" ) - instr_event_1_count = sum( + core_instr_event_1 = sum( 1 for event in trace_events if event.get("name") == "INSTR_EVENT_1" and event.get("ph") == "B" ) - if opts.verbosity >= 1: - print(f"INSTR_EVENT_0 count: {instr_event_0_count}") - print(f"INSTR_EVENT_1 count: {instr_event_1_count}") + # Mem trace (core memory): DMA_S2MM_0_START_TASK + mem_dma_start = sum( + 1 + for event in trace_events + if event.get("name") == "DMA_S2MM_0_START_TASK" and event.get("ph") == "B" + ) - # Verify expected counts. The kernel is expected to generate 4 of each event. - if instr_event_0_count < 4: - print(f"ERROR: Expected 4 INSTR_EVENT_0 events, found {instr_event_0_count}") - errors += 1 - if instr_event_1_count < 4: - print(f"ERROR: Expected 4 INSTR_EVENT_1 events, found {instr_event_1_count}") - errors += 1 + # Memtile trace: PORT_RUNNING_0 + memtile_port_running = sum( + 1 + for event in trace_events + if event.get("name") == "PORT_RUNNING_0" and event.get("ph") == "B" + ) + + # Shim trace: DMA_MM2S_0_START_TASK + shim_dma_start = sum( + 1 + for event in trace_events + if event.get("name") == "DMA_MM2S_0_START_TASK" and event.get("ph") == "B" + ) + + if opts.verbosity >= 1: + print(f"Core trace - INSTR_EVENT_0 count: {core_instr_event_0}") + print(f"Core trace - INSTR_EVENT_1 count: {core_instr_event_1}") + print(f"Mem trace - DMA_S2MM_0_START_TASK count: {mem_dma_start}") + print(f"Memtile trace - PORT_RUNNING_0 count: {memtile_port_running}") + print(f"Shim trace - DMA_MM2S_0_START_TASK count: {shim_dma_start}") + + # Verify expected counts - kernel runs 4 iterations + if core_instr_event_0 < 4: + print( + f"ERROR: Core trace - Expected >= 4 INSTR_EVENT_0, found {core_instr_event_0}" + ) + errors += 1 + if core_instr_event_1 < 4: + print( + f"ERROR: Core trace - Expected >= 4 INSTR_EVENT_1, found {core_instr_event_1}" + ) + errors += 1 + if mem_dma_start < 1: + print( + f"ERROR: Mem trace - Expected >= 1 DMA_S2MM_0_START_TASK, found {mem_dma_start}" + ) + errors += 1 + if memtile_port_running < 1: + print( + f"ERROR: Memtile trace - Expected >= 1 PORT_RUNNING_0, found {memtile_port_running}" + ) + errors += 1 + if shim_dma_start < 1: + print( + f"ERROR: Shim trace - Expected >= 1 DMA_MM2S_0_START_TASK, found {shim_dma_start}" + ) + errors += 1 # Final result if errors == 0: diff --git a/test/npu-xrt/vec_mul_event_trace/vector_scalar_mul.cc b/test/npu-xrt/vec_mul_event_trace/vector_scalar_mul.cc index bd0acbdb99a..11f36b9c32e 100644 --- a/test/npu-xrt/vec_mul_event_trace/vector_scalar_mul.cc +++ b/test/npu-xrt/vec_mul_event_trace/vector_scalar_mul.cc @@ -4,7 +4,7 @@ // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // -// Copyright (C) 2025, Advanced Micro Devices, Inc. +// Copyright (C) 2025-2026, Advanced Micro Devices, Inc. // //===----------------------------------------------------------------------===// diff --git a/tools/aiecc/aiecc.cpp b/tools/aiecc/aiecc.cpp index 48b0a40b0d1..b2d1955daa9 100644 --- a/tools/aiecc/aiecc.cpp +++ b/tools/aiecc/aiecc.cpp @@ -1325,6 +1325,7 @@ static LogicalResult runTraceLoweringPipeline(ModuleOp moduleOp, } OpPassManager &devicePm = pm.nest(); + devicePm.addPass(xilinx::AIE::createAIEInsertTraceFlowsPass()); devicePm.addPass(xilinx::AIE::createAIETraceToConfigPass()); devicePm.addPass(xilinx::AIE::createAIETraceRegPackWritesPass()); devicePm.addPass(xilinx::AIEX::createAIEXInlineTraceConfigPass()); From e9c2a06a729f46fa89346da3122419e62be3ce93 Mon Sep 17 00:00:00 2001 From: yenjames Date: Fri, 13 Mar 2026 09:56:47 -0600 Subject: [PATCH 3/3] Update small format errors. --- include/aie/Dialect/AIE/Transforms/AIEPasses.h | 3 +-- programming_examples/basic/event_trace/Makefile | 10 ---------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/include/aie/Dialect/AIE/Transforms/AIEPasses.h b/include/aie/Dialect/AIE/Transforms/AIEPasses.h index 211a029b880..9aa5d93e16d 100644 --- a/include/aie/Dialect/AIE/Transforms/AIEPasses.h +++ b/include/aie/Dialect/AIE/Transforms/AIEPasses.h @@ -68,8 +68,7 @@ std::unique_ptr> createAIEAssignTileCtrlIDsPass(); std::unique_ptr> createAIETraceToConfigPass(); std::unique_ptr> createAIETraceRegPackWritesPass(); -std::unique_ptr> -createAIEInsertTraceFlowsPass(); +std::unique_ptr> createAIEInsertTraceFlowsPass(); /// Generate the code for registering passes. #define GEN_PASS_REGISTRATION diff --git a/programming_examples/basic/event_trace/Makefile b/programming_examples/basic/event_trace/Makefile index e7de7e505c1..174c7f1f91e 100644 --- a/programming_examples/basic/event_trace/Makefile +++ b/programming_examples/basic/event_trace/Makefile @@ -39,16 +39,6 @@ else echo "Device type not supported" endif -# Preprocess the MLIR: substitute NPUDEVICE with the target device string -build/aie_trace.mlir: ${srcdir}/aie_trace.mlir - mkdir -p ${@D} - sed 's/NPUDEVICE/${devicename}_1col/g' $< > $@ - -# Generate MLIR from Python -build/aie_trace_from_py.mlir: ${srcdir}/aie_trace.py - mkdir -p ${@D} - python3 $< > $@ - # Preprocess the MLIR: substitute NPUDEVICE with the target device string build/aie_trace.mlir: ${srcdir}/aie_trace.mlir mkdir -p ${@D}