Skip to content
Draft
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
4c03a73
Add conv3d programming example with IRON Worker API
jgmelber Mar 6, 2026
01c001b
Fix Conv3D implementation - working 2D convolution per depth plane
jgmelber Mar 7, 2026
6d4022a
Implement true 3D convolution with sliding window
jgmelber Mar 7, 2026
7bc8e3b
Implement vectorized Conv3D kernel with 30x speedup
jgmelber Mar 7, 2026
59884aa
Add vectorized 3D convolution with 30x speedup + multi-core framework
jgmelber Mar 7, 2026
34f96f4
Update README with final performance and multi-core status
jgmelber Mar 7, 2026
a878992
Add working 2-core spatial parallelism - 1.34x speedup
jgmelber Mar 7, 2026
afbccda
Add massively parallel Conv3D design (up to 32 cores + multi-shim DMA)
jgmelber Mar 7, 2026
9d384f1
Add comprehensive CPU vs NPU benchmarking
jgmelber Mar 7, 2026
52cea5d
Update README with CPU vs NPU benchmarking
jgmelber Mar 7, 2026
8680c9b
Add large volume sweep and final benchmark analysis
jgmelber Mar 7, 2026
1998615
Add clean README with concise benchmarks
jgmelber Mar 7, 2026
a665503
Fix clang-format and black formatting issues
jgmelber Mar 7, 2026
e071e3a
Apply clang-format and black formatting
jgmelber Mar 7, 2026
8cf21f4
Remove logs, build artifacts, and unnecessary files
jgmelber Mar 7, 2026
2cce4b3
Remove trace logs and unrelated test files
jgmelber Mar 7, 2026
9bd5495
Remove benchmarking and sweep scripts
jgmelber Mar 7, 2026
3b85c79
Add multi-core build targets to unified Makefile
jgmelber Mar 7, 2026
cbbe8b6
Add lit tests for CI and clarify kernel description in README
jgmelber Mar 11, 2026
e19c680
Clarify kernel sizes in README
jgmelber Mar 11, 2026
50bc808
Clarify video benchmarks use 8 cores (estimated)
jgmelber Mar 11, 2026
1dbbc7c
Add width tiling for large frame conv3d (512x512, 1024x1024)
jgmelber Mar 11, 2026
565fde1
Fix 1024x1024 re-invocation: single-shot cores for split-DMA
jgmelber Mar 12, 2026
a7e2160
Update README with width tiling benchmarks
jgmelber Mar 12, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
321 changes: 321 additions & 0 deletions aie_kernels/aie2/conv3dk3.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,321 @@
//===- conv3dk3.cc ----------------------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2026, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//

#define NOCPP

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#include "../aie_kernel_utils.h"
#include <aie_api/aie.hpp>

#define REL_WRITE 0
#define REL_READ 1

enum region { top_plane, middle_plane, bottom_plane };

#ifdef SCALAR

const int32_t MAX = 255;

extern "C" {

//*****************************************************************************
// conv3d 3x3x3 - scalar
// act: uint8, wts: int8, out: uint8
//
// Processes a single depth plane of output, using three input depth planes
// (z-1, z, z+1) to apply the 3x3x3 convolution kernel.
//
// Input layout: HxWxC (height, width, channels)
// Weight layout: {O/8}{I/8}KDHW{I8}{O8} where KD=3, KH=3, KW=3
// Output layout: HxWxC
//
// For border handling:
// - check==top_plane: skip z-1 plane (use only z and z+1)
// - check==bottom_plane: skip z+1 plane (use only z-1 and z)
// - check==middle_plane: use all three planes
//*****************************************************************************
void conv3dk3_ui8_scalar(uint8_t *plane0, uint8_t *plane1, uint8_t *plane2,
int8_t *wts, uint8_t *output,
const int32_t input_width, const int32_t input_height,
const int32_t input_channels,
const int32_t output_channels,
const int32_t kernel_width, const int32_t kernel_height,
const int32_t kernel_depth, const int32_t check,
const int scale, const int channel_offset) {
event0();

int x, y, kh, kw, kd, ic, oc, ic8, oc8;
int32_t sum;
int sum_srs;
const int plane_size = input_height * input_width * 8;

// Loop over output channel groups (groups of 8)
for (oc = 0; oc < output_channels / 8; oc++) {
int oc_ofst = oc + (channel_offset / 8);

for (oc8 = 0; oc8 < 8; oc8++) {
// Process each row of the output plane
for (y = 0; y < input_height; y++) {
// Left border (x=0)
sum = 0;
sum_srs = 0;
for (ic = 0; ic < input_channels / 8; ic++) {
for (ic8 = 0; ic8 < 8; ic8++) {
for (kd = 0; kd < kernel_depth; kd++) {
for (kh = 0; kh < kernel_height; kh++) {
for (kw = 1; kw < kernel_width; kw++) {
// Weight index for 3D: {O/8}{I/8}KDHW{I8}{O8}
// Formula: (kd*KH*KW*64) + (kh*KW*64) + (kw*64) +
// (ic*KD*KH*KW*64) + (ic8*8) +
// (oc_ofst*(IC/8)*KD*KH*KW*64) + oc8
int wts_indx = (kd * 3 * 3 * 64) + (kh * 3 * 64) +
(kw * 64) +
(ic * 3 * 3 * 3 * 64) + (ic8 * 8) +
(oc_ofst * (input_channels / 8) * 3 * 3 * 3 *
64) +
oc8;

// Input index (with left border replication)
int y_pos = (y - 1 + kh < 0)
? 0
: ((y - 1 + kh >= input_height)
? input_height - 1
: y - 1 + kh);
int x_pos = (kw == 0) ? 0 : kw - 1;
int in_indx =
(y_pos * input_width + x_pos) * 8 + (ic * plane_size) + ic8;

// Accumulate from the three depth planes
if (kd == 0 && check != top_plane)
sum += plane0[in_indx] * wts[wts_indx];
if (kd == 1)
sum += plane1[in_indx] * wts[wts_indx];
if (kd == 2 && check != bottom_plane)
sum += plane2[in_indx] * wts[wts_indx];
}
}
}
}
}
sum_srs = (sum + (1 << (scale - 1))) >> scale;
sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < 0) ? 0 : sum_srs;
output[(oc * input_height * input_width * 8) + (y * input_width * 8) +
oc8] = sum_srs;

// Right border (x=input_width-1)
sum = 0;
sum_srs = 0;
for (ic = 0; ic < input_channels / 8; ic++) {
for (ic8 = 0; ic8 < 8; ic8++) {
for (kd = 0; kd < kernel_depth; kd++) {
for (kh = 0; kh < kernel_height; kh++) {
for (kw = 0; kw < kernel_width - 1; kw++) {
int wts_indx = (kd * 3 * 3 * 64) + (kh * 3 * 64) +
(kw * 64) +
(ic * 3 * 3 * 3 * 64) + (ic8 * 8) +
(oc_ofst * (input_channels / 8) * 3 * 3 * 3 *
64) +
oc8;

int y_pos = (y - 1 + kh < 0)
? 0
: ((y - 1 + kh >= input_height)
? input_height - 1
: y - 1 + kh);
int x_pos = (kw == 2) ? input_width - 1 : input_width - 2 + kw;
int in_indx =
(y_pos * input_width + x_pos) * 8 + (ic * plane_size) + ic8;

if (kd == 0 && check != top_plane)
sum += plane0[in_indx] * wts[wts_indx];
if (kd == 1)
sum += plane1[in_indx] * wts[wts_indx];
if (kd == 2 && check != bottom_plane)
sum += plane2[in_indx] * wts[wts_indx];
}
}
}
}
}
sum_srs = (sum + (1 << (scale - 1))) >> scale;
sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < 0) ? 0 : sum_srs;
output[(oc * input_height * input_width * 8) +
(y * input_width * 8) + (input_width - 1) * 8 + oc8] = sum_srs;

// Middle columns (x=1 to input_width-2)
for (x = 1; x < input_width - 1; x++) {
sum = 0;
sum_srs = 0;
for (ic = 0; ic < input_channels / 8; ic++) {
for (ic8 = 0; ic8 < 8; ic8++) {
for (kd = 0; kd < kernel_depth; kd++) {
for (kh = 0; kh < kernel_height; kh++) {
for (kw = 0; kw < kernel_width; kw++) {
// Weight index for 3D convolution
int wts_indx = (kd * 3 * 3 * 64) + (kh * 3 * 64) +
(kw * 64) +
(ic * 3 * 3 * 3 * 64) + (ic8 * 8) +
(oc_ofst * (input_channels / 8) * 3 * 3 * 3 *
64) +
oc8;

// Input index with height border handling
int y_pos = (y - 1 + kh < 0)
? 0
: ((y - 1 + kh >= input_height)
? input_height - 1
: y - 1 + kh);
int x_pos = x - 1 + kw;
int in_indx = (y_pos * input_width + x_pos) * 8 +
(ic * plane_size) + ic8;

// Accumulate from three depth planes
if (kd == 0 && check != top_plane)
sum += plane0[in_indx] * wts[wts_indx];
if (kd == 1)
sum += plane1[in_indx] * wts[wts_indx];
if (kd == 2 && check != bottom_plane)
sum += plane2[in_indx] * wts[wts_indx];
}
}
}
}
}
sum_srs = (sum + (1 << (scale - 1))) >> scale;
sum_srs = (sum_srs > MAX) ? MAX : (sum_srs < 0) ? 0 : sum_srs;
output[(oc * input_height * input_width * 8) +
(y * input_width * 8) + x * 8 + oc8] = sum_srs;
}
}
}
}

event1();
}

} // extern "C"

#else // Vectorized implementation using AIE intrinsics

extern "C" {

//*****************************************************************************
// conv3d 3x3x3 - vectorized (AIE intrinsics)
// act: uint8, wts: int8, out: uint8
//
// Uses 4x8x8 matrix multiply (4 outputs x 8 channels x 8 weights)
//*****************************************************************************
void conv3dk3_ui8(uint8_t *plane0, uint8_t *plane1, uint8_t *plane2,
int8_t *wts, uint8_t *output, const int32_t input_width,
const int32_t input_height, const int32_t input_channels,
const int32_t output_channels, const int32_t kernel_width,
const int32_t kernel_height, const int32_t kernel_depth,
const int32_t check, const int scale,
const int channel_offset) {
event0();

constexpr int MMUL_M = 4; // Process 4 output pixels at a time
constexpr int MMUL_K = 8; // Channel group size
constexpr int MMUL_N = 8; // Output channel group size
constexpr int MMUL_MK = MMUL_M * MMUL_K;
constexpr int MMUL_KN = MMUL_K * MMUL_N;
constexpr int MMUL_MN = MMUL_M * MMUL_N;

using MMUL = aie::mmul<MMUL_M, MMUL_K, MMUL_N, uint8, int8>;

::aie::set_saturation(aie::saturation_mode::saturate);
::aie::set_rounding(aie::rounding_mode::symmetric_inf);

const int plane_size = input_height * input_width * 8;
const int MAX = 255;

// Process each output channel group
for (int oc = 0; oc < output_channels / 8; oc++) {
int oc_ofst = oc + (channel_offset / 8);

// Process each row
for (int y = 0; y < input_height; y++) {
int y_start = (y - 1 < 0) ? 0 : y - 1;
int y_end = (y + 1 >= input_height) ? input_height - 1 : y + 1;

// Process pixels in groups of 4 for vectorization
for (int x = 0; x < input_width; x += MMUL_M) {
int x_valid = (x + MMUL_M <= input_width) ? MMUL_M : input_width - x;

// Accumulator for this output group
MMUL acc = aie::zeros<acc32, MMUL_MN>();

// Accumulate over all input channel groups
for (int ic = 0; ic < input_channels / 8; ic++) {
// Accumulate over 3x3x3 kernel
for (int kd = 0; kd < kernel_depth; kd++) {
// Skip depth kernel positions based on check
if (kd == 0 && check == top_plane) continue;
if (kd == 2 && check == bottom_plane) continue;

// Select the appropriate plane
uint8_t *plane = (kd == 0) ? plane0 : (kd == 1) ? plane1 : plane2;

for (int kh = 0; kh < kernel_height; kh++) {
int y_pos = y_start + kh;
if (y_pos < 0) y_pos = 0;
if (y_pos >= input_height) y_pos = input_height - 1;

for (int kw = 0; kw < kernel_width; kw++) {
// Load weights for this kernel position
int wts_idx = (kd * 3 * 3 * 64) + (kh * 3 * 64) + (kw * 64) +
(ic * 3 * 3 * 3 * 64) + (oc_ofst * (input_channels / 8) * 3 * 3 * 3 * 64);
aie::vector<int8, MMUL_KN> w = aie::load_v<MMUL_KN>(wts + wts_idx);

// Load activations for this group of pixels
// Handle boundary conditions
alignas(32) uint8 act_buf[MMUL_MK];
for (int xx = 0; xx < x_valid && xx < MMUL_M; xx++) {
int x_pos = x + xx - 1 + kw;
if (x_pos < 0) x_pos = 0;
if (x_pos >= input_width) x_pos = input_width - 1;

int in_idx = (y_pos * input_width + x_pos) * 8 + (ic * plane_size);
for (int ch = 0; ch < 8; ch++) {
act_buf[xx * 8 + ch] = plane[in_idx + ch];
}
}

aie::vector<uint8, MMUL_MK> a = aie::load_v<MMUL_MK>(act_buf);
acc.mac(a, w);
}
}
}
}

// Convert accumulator to output with scaling
aie::vector<uint8, MMUL_MN> o = acc.to_vector<uint8>(scale);

// Store output, handling partial groups
for (int xx = 0; xx < x_valid && x + xx < input_width; xx++) {
for (int ch = 0; ch < 8; ch++) {
int out_idx = (oc * input_height * input_width * 8) +
(y * input_width * 8) + ((x + xx) * 8) + ch;
output[out_idx] = o[xx * 8 + ch];
}
}
}
}
}

event1();
}

} // extern "C"

#endif // SCALAR
43 changes: 43 additions & 0 deletions aie_kernels/aie2/conv3dk3.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
//===- conv3dk3.h -----------------------------------------------*- C++ -*-===//
//
// This file is licensed under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Copyright (C) 2026, Advanced Micro Devices, Inc.
//
//===----------------------------------------------------------------------===//

#ifndef _CONV3DK3_H
#define _CONV3DK3_H

#include <stdint.h>

extern "C" {

#ifdef SCALAR

void conv3dk3_ui8_scalar(uint8_t *plane0, uint8_t *plane1, uint8_t *plane2,
int8_t *wts, uint8_t *output,
const int32_t input_width, const int32_t input_height,
const int32_t input_channels,
const int32_t output_channels,
const int32_t kernel_width, const int32_t kernel_height,
const int32_t kernel_depth, const int32_t check,
const int scale, const int channel_offset);

#else

void conv3dk3_ui8(uint8_t *plane0, uint8_t *plane1, uint8_t *plane2,
int8_t *wts, uint8_t *output, const int32_t input_width,
const int32_t input_height, const int32_t input_channels,
const int32_t output_channels, const int32_t kernel_width,
const int32_t kernel_height, const int32_t kernel_depth,
const int32_t check, const int scale,
const int channel_offset);

#endif

} // extern "C"

#endif // _CONV3DK3_H
Loading
Loading