diff --git a/ggml/include/ggml-tsavorite.h b/ggml/include/ggml-tsavorite.h
index d4baab4836c..2a7a32789c4 100644
--- a/ggml/include/ggml-tsavorite.h
+++ b/ggml/include/ggml-tsavorite.h
@@ -327,7 +327,7 @@ extern void _mlir_ciface_txe_rms_norm_16_host(void *a, void *res, void *buf);
 
 extern void ggml_tsi_log_tensor_data(tensor_log log_data);
 
-#define NUM_OF_TXES 1
+#define NUM_OF_TXES 2
 
 // GGML supports tensors with a maximum rank of 4
 #define MEM_REF_DESCRIPTOR_RANK 4
diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
index 7880e3dd26c..2669983e504 100644
--- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
+++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -23,14 +23,36 @@
 #include
 #include
 #include
-#include
 #include
+#include
 
 #include "ggml-backend-impl.h"
 #include "ggml-impl.h"
 #include "ggml.h"
+#include "tsi-rt/TXEDeviceConfig.h"
+#include "tsi-rt/host/BlobDescriptor.h"
+#include "tsi-rt/queues/Command.h"
 #include "HostShimCAPI.h"
 #include "tsi-rt/utils/Profiler.h"
+
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include <vector>
+
+
+using namespace tsi::runtime;
+
+
+std::vector<std::thread> workers;
+static std::mutex device_mutex;
+static std::condition_variable device_cv;
+
+// device availability table
+static bool device_free[NUM_OF_TXES] = { true, true };
+
+static bool multi_thread_enable = false;
+
 #ifdef TMU_DEBUG_VALIDATE
 // CPU reference GEMM for TMU packed tiles using the SAME MemRefDescriptor
@@ -108,6 +130,52 @@ typedef struct _txe_compute_pipeline_state_t *txe_compute_pipeline_state_s;
 FILE *tsi_op_log_file;
 bool runtime_initialized = false;
 uint64_t num_of_op;
+
+
+//static TSI_DeviceIdType deviceId_1 = 0, deviceId_2 = 1;
+
+static void *loadResult_add[NUM_OF_TXES], *loadResult_mult[NUM_OF_TXES], *loadResult_rms_norm[NUM_OF_TXES];
+static BlobDescriptor *blobDescriptor_add[NUM_OF_TXES], *blobDescriptor_mult[NUM_OF_TXES], *blobDescriptor_rms_norm[NUM_OF_TXES];
+
+void tsi_load_all_blobs() {
+
+for (int i=0; i < NUM_OF_TXES; ++i) {
+printf("\n ANOOP Loading Blobs \n");
+  loadResult_add[i] = tsi_load_blob(
+      i,
+      "txe_add",
+      ("/proj/work/akapoor/llama-cpp-march-30-multi-txe/llama.cpp/ggml-tsi-kernel/fpga-kernel/build-fpga/txe_add/blobs/txe_add"));
+
+  blobDescriptor_add[i] = static_cast<BlobDescriptor *>(loadResult_add[i]);
+
+  loadResult_mult[i] = tsi_load_blob(
+      i,
+      "txe_mult",
+      ("/proj/work/akapoor/llama-cpp-march-30-multi-txe/llama.cpp/ggml-tsi-kernel/fpga-kernel/build-fpga/txe_mult/blobs/txe_mult"));
+
+  blobDescriptor_mult[i] = static_cast<BlobDescriptor *>(loadResult_mult[i]);
+
+  loadResult_rms_norm[i] = tsi_load_blob(
+      i,
+      "txe_rms_norm",
+      ("/proj/work/akapoor/llama-cpp-march-30-multi-txe/llama.cpp/ggml-tsi-kernel/fpga-kernel/build-fpga/txe_rms_norm/blobs/txe_rms_norm"));
+
+  blobDescriptor_rms_norm[i] = static_cast<BlobDescriptor *>(loadResult_rms_norm[i]);
+
+}
+
+  return;
+}
+
+static void tsi_unload_all_blobs() {
+for (int i=0; i < NUM_OF_TXES; ++i) {
+  tsi_unload_blob(blobDescriptor_add[i]);
+  tsi_unload_blob(blobDescriptor_mult[i]);
+  tsi_unload_blob(blobDescriptor_rms_norm[i]);
+}
+  return;
+}
+
 // Centralized TSI runtime initialization - called once globally
 static void ensure_tsi_runtime_initialized() {
@@ -115,6 +183,10 @@ static void ensure_tsi_runtime_initialized() {
     tsirt::utils::TSIProfiler::initialize();
     // TSI Run time Initalization
     tsi_initialize(NUM_OF_TXES, NULL);
+
+    tsi_load_all_blobs();
+
+    workers.reserve(2);
     runtime_initialized = true;
     GGML_TSAVORITE_LOG_INFO("Profiler and TSI runtime initialized early in registration\n");
   }
@@ -491,6 +563,287 @@ static void _mlir_ciface_txe_mult_test (void *src0, void *src1, void *res)
   return;
 }
 
+
+// Packed args layout for 3x memref<?xf32, 1>
+// Per TXE_PackArgsOp: group per-arg as (handle, offset, sizes, strides),
+// and only dynamic metadata is packed. For this type: offset is dynamic, size(0) is dynamic, stride(0)=1 is static.
+// So each arg contributes: (handle, offset, size0) => 3 int64s per arg.
+// Total = 3 args * 3 int64 = 9 int64 = 72 bytes.
+
+
+// ============================================================
+// DEVICE ACQUIRE / RELEASE
+// ============================================================
+
+static inline int acquire_device_blocking() {
+  std::unique_lock lock(device_mutex);
+
+  device_cv.wait(lock, [] {
+    for (int i = 0; i < NUM_OF_TXES; ++i)
+      if (device_free[i])
+        return true;
+    return false;
+  });
+
+  for (int i = 0; i < NUM_OF_TXES; ++i) {
+    if (device_free[i]) {
+      device_free[i] = false;
+      return i;
+    }
+  }
+  return -1;
+}
+
+static inline void release_device(int deviceId) {
+  std::lock_guard lock(device_mutex);
+  device_free[deviceId] = true;
+  device_cv.notify_one();
+}
+
+// ============================================================
+// FINAL JOIN — CALL AT END OF ggml_tsavorite_graph_compute()
+// ============================================================
+
+static inline void join_all_workers() {
+  for (auto &t : workers) {
+    if (t.joinable())
+      t.join();
+  }
+  workers.clear();
+
+  {
+    std::lock_guard lock(device_mutex);
+    for (int i = 0; i < NUM_OF_TXES; ++i)
+      device_free[i] = true;
+  }
+  device_cv.notify_all();
+}
+
+static void tsi_blob_execution_internal(void *commandList) {
+  // Enqueue & run
+  tsi_finalize_command_list(commandList);
+  tsi_wait(commandList);
+  return;
+}
+
+static void *_mlir_ciface_txe_add_host_internal(void *a, void *b, void *res, TSI_DeviceIdType deviceId) {
+  constexpr int64_t kPackedArgsI64 = 9;
+  constexpr int64_t kPackedArgsBytes = kPackedArgsI64 * 8;
+
+  // Create the command list for the blob execute command
+  void *commandList = tsi_create_command_list(deviceId);
+
+  // Allocate packed args buffer in shared DRAM
+  void *packed = tsi_alloc(kPackedArgsBytes, tsi::MemorySpace::SHARED_DRAM_TS);
+  auto *p = static_cast<int64_t *>(packed);
+
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *A = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)a;
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *B = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)b;
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *C = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)res;
+
+  // Pack args strictly as:
+  //   (A handle, A offset, A size0, B handle, B offset, B size0, C handle, C offset, C size0)
+  // NOTE: this is NOT "all handles first".
+  int idx = 0;
+
+  // Arg A
+  p[idx++] = tsi_shmem_handle_from_ptr(A->data);
+  p[idx++] = (int64_t)A->offset;
+  p[idx++] = (int64_t)A->shape[0];
+
+  // Arg B
+  p[idx++] = tsi_shmem_handle_from_ptr(B->data);
+  p[idx++] = (int64_t)B->offset;
+  p[idx++] = (int64_t)B->shape[0];
+
+  // Arg C
+  p[idx++] = tsi_shmem_handle_from_ptr(C->data);
+  p[idx++] = (int64_t)C->offset;
+  p[idx++] = (int64_t)C->shape[0];
+
+  // Sanity: we must have filled exactly kPackedArgsI64 entries
+  // (avoid silent layout drift).
+  if (idx != kPackedArgsI64) {
+    printf("ERROR: packed-args idx=%d expected=%ld\n", idx, (long)kPackedArgsI64);
+    abort();
+  }
+
+  const int64_t packedHandle = tsi_shmem_handle_from_ptr(packed);
+
+  void *blobExecuteCmd = tsi_launch_blob(blobDescriptor_add[deviceId], /*packedArgs*/ packedHandle);
+  tsi_add_command_to_list(commandList, blobExecuteCmd);
+
+  return commandList;
+}
+
+static void _mlir_ciface_txe_add_host_new(void *a, void *b, void *res) {
+  if (!multi_thread_enable) {
+    void *commandList = _mlir_ciface_txe_add_host_internal(a, b, res, 0);
+    tsi_blob_execution_internal(commandList);
+    return;
+  }
+
+  int deviceId = acquire_device_blocking();
+printf("\n ANOOP ADD device ID %d", deviceId);
+
+  workers.emplace_back([=]() {
+    void *commandList = _mlir_ciface_txe_add_host_internal(a, b, res, deviceId);
+    tsi_blob_execution_internal(commandList);
+    release_device(deviceId);
+    printf("\n ANOOP Release ADD device ID %d", deviceId);
+  });
+}
+
+
+static void *_mlir_ciface_txe_mult_host_internal(void *a, void *b, void *res, TSI_DeviceIdType deviceId) {
+  constexpr int64_t kPackedArgsI64 = 9;
+  constexpr int64_t kPackedArgsBytes = kPackedArgsI64 * 8;
+
+  // Create the command list for the blob execute command
+  void *commandList = tsi_create_command_list(deviceId);
+
+  // Allocate packed args buffer in shared DRAM
+  void *packed = tsi_alloc(kPackedArgsBytes, tsi::MemorySpace::SHARED_DRAM_TS);
+  auto *p = static_cast<int64_t *>(packed);
+
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *A = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)a;
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *B = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)b;
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *C = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)res;
+
+  // Pack args strictly as:
+  //   (A handle, A offset, A size0, B handle, B offset, B size0, C handle, C offset, C size0)
+  // NOTE: this is NOT "all handles first".
+  int idx = 0;
+
+  // Arg A
+  p[idx++] = tsi_shmem_handle_from_ptr(A->data);
+  p[idx++] = (int64_t)A->offset;
+  p[idx++] = (int64_t)A->shape[0];
+
+  // Arg B
+  p[idx++] = tsi_shmem_handle_from_ptr(B->data);
+  p[idx++] = (int64_t)B->offset;
+  p[idx++] = (int64_t)B->shape[0];
+
+  // Arg C
+  p[idx++] = tsi_shmem_handle_from_ptr(C->data);
+  p[idx++] = (int64_t)C->offset;
+  p[idx++] = (int64_t)C->shape[0];
+
+  // Sanity: we must have filled exactly kPackedArgsI64 entries
+  // (avoid silent layout drift).
+  if (idx != kPackedArgsI64) {
+    printf("ERROR: packed-args idx=%d expected=%ld\n", idx, (long)kPackedArgsI64);
+    abort();
+  }
+
+  const int64_t packedHandle = tsi_shmem_handle_from_ptr(packed);
+
+  void *blobExecuteCmd = tsi_launch_blob(blobDescriptor_mult[deviceId], /*packedArgs*/ packedHandle);
+  tsi_add_command_to_list(commandList, blobExecuteCmd);
+
+  return commandList;
+}
+
+static void _mlir_ciface_txe_mult_host_new(void *a, void *b, void *res) {
+  if (!multi_thread_enable) {
+    void *commandList = _mlir_ciface_txe_mult_host_internal(a, b, res, 1);
+    tsi_blob_execution_internal(commandList);
+    return;
+  }
+
+  int deviceId = acquire_device_blocking();
+
+printf("\n ANOOP MUL device ID %d", deviceId);
+  workers.emplace_back([=]() {
+    void *commandList = _mlir_ciface_txe_mult_host_internal(a, b, res, deviceId);
+    tsi_blob_execution_internal(commandList);
+    release_device(deviceId);
+    printf("\n ANOOP Release MUL device ID %d", deviceId);
+  });
+}
+
+
+static void *_mlir_ciface_txe_rms_norm_host_internal(void *a, void *b, void *buf, TSI_DeviceIdType deviceId) {
+  constexpr int64_t kPackedArgsI64 = 20;
+  constexpr int64_t kPackedArgsBytes = kPackedArgsI64 * 8;
+
+  // Create the command list for the blob execute command
+  void *commandList = tsi_create_command_list(deviceId);
+
+  // Allocate packed args buffer in shared DRAM
+  void *packed = tsi_alloc(kPackedArgsBytes, tsi::MemorySpace::SHARED_DRAM_TS);
+  auto *p = static_cast<int64_t *>(packed);
+
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *A = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)a;
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *B = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)b;
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *C = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)buf;
+
+  // Pack args strictly as:
+  //   (A handle, A offset, A size0, B handle, B offset, B size0, C handle, C offset, C size0)
+  // NOTE: this is NOT "all handles first".
+  int idx = 0;
+
+  // Arg A
+  p[idx++] = tsi_shmem_handle_from_ptr(A->data);
+  p[idx++] = (int64_t)A->offset;
+  for(int i=0; i <=3; ++i) {
+    p[idx++] = (int64_t)A->shape[i];
+  }
+  for(int i=0; i <=2; ++i) {
+    p[idx++] = (int64_t)A->strides[i];
+  }
+
+  // Arg B
+  p[idx++] = tsi_shmem_handle_from_ptr(B->data);
+  p[idx++] = (int64_t)B->offset;
+
+  for(int i=0; i <=3; ++i) {
+    p[idx++] = (int64_t)B->shape[i];
+  }
+
+  for(int i=0; i <=2; ++i) {
+    p[idx++] = (int64_t)B->strides[i];
+  }
+
+
+  // Arg C
+  p[idx++] = tsi_shmem_handle_from_ptr(C->data);
+  p[idx++] = (int64_t)C->offset;
+
+  // Sanity: we must have filled exactly kPackedArgsI64 entries
+  // (avoid silent layout drift).
+  if (idx != kPackedArgsI64) {
+    printf("ERROR: packed-args idx=%d expected=%ld\n", idx, (long)kPackedArgsI64);
+    abort();
+  }
+
+  const int64_t packedHandle = tsi_shmem_handle_from_ptr(packed);
+
+  void *blobExecuteCmd = tsi_launch_blob(blobDescriptor_rms_norm[deviceId], /*packedArgs*/ packedHandle);
+  tsi_add_command_to_list(commandList, blobExecuteCmd);
+  return commandList;
+}
+
+static void _mlir_ciface_txe_rms_norm_host_new(void *a, void *b, void *buf) {
+  if (!multi_thread_enable) {
+    void *commandList = _mlir_ciface_txe_rms_norm_host_internal(a, b, buf, 0);
+    tsi_blob_execution_internal(commandList);
+    return;
+  }
+
+  int deviceId = acquire_device_blocking();
+printf("\n ANOOP RMS_NORM device ID %d", deviceId);
+
+  workers.emplace_back([=]() {
+    void *commandList = _mlir_ciface_txe_rms_norm_host_internal(a, b, buf, deviceId);
+    tsi_blob_execution_internal(commandList);
+    release_device(deviceId);
+    printf("\n ANOOP RMS_NORM Releasing device ID %d", deviceId);
+  });
+}
+
+
 static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_type kernel_type) {
   txe_compute_pipeline_state_s kernel_pipeline =
       (txe_compute_pipeline_state_s)calloc(1, sizeof(struct _txe_compute_pipeline_state_t));
@@ -507,7 +860,8 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_
     if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU)
       kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_add_test;
     else {
-      kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_add_host;
+      //kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_add_host;
+      kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_add_host_new;
       kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F16_INDEX] = &_mlir_ciface_txe_add_16_host;
     }
     kernel_pipeline->kernel_name = "TXE_ADD";
@@ -523,7 +877,8 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_
     if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU)
       kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_mult_test;
     else {
-      kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_mult_host;
+      //kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_mult_host;
+      kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_mult_host_new;
       kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F16_INDEX] = &_mlir_ciface_txe_mult_16_host;
     }
     kernel_pipeline->kernel_name = "TXE_MULT";
@@ -578,7 +933,8 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_
       flag = true;
       break;
     case GGML_TSAVORITE_KERNEL_TYPE_RMS_NORM:
-      kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_rms_norm_host;
+      //kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_rms_norm_host;
+      kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_rms_norm_host_new;
       kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F16_INDEX] = &_mlir_ciface_txe_rms_norm_16_host;
       kernel_pipeline->kernel_name = "TXE_RMS_NORM";
       flag = true;
@@ -809,6 +1165,7 @@ static void ggml_tsavorite_free(struct ggml_backend_tsavorite_context *ctx) {
   if (runtime_initialized == true) {
     sleep(2);
     runtime_initialized = false;
+    tsi_unload_all_blobs();
     tsi_finalize();
     tsirt::utils::TSIProfiler::finalize();
     sleep(2);
@@ -826,6 +1183,7 @@ tsi_cleanup() {
   if (runtime_initialized != true)
     return;
   runtime_initialized = false;
+  tsi_unload_all_blobs();
   tsi_finalize();
   GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
   tsirt::utils::TSIProfiler::finalize();
@@ -1634,6 +1992,9 @@ static enum ggml_status ggml_tsavorite_run_tmu_mul_mat(
   return GGML_STATUS_SUCCESS;
 }
 
+static void anoop_test() {
+  return;
+}
 // nodes are intermediate which has multiple src tensors & operation
 // Here we create multiple thread
 // Each Thread run the command buffer & pick Tensor and execute and get the result back base on
@@ -1704,6 +2065,20 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
 
   tensor_log log_data;
 
+multi_thread_enable = false;
+if (cgraph->n_nodes == 2) {
+  node = cgraph->nodes[0];
+  if (node->op == GGML_OP_MUL) {
+    multi_thread_enable = true;
+    printf("\n ANOOP Multi-thread-enable for MUL GRAPH EXECUTIOn going to start");
+  }
+  node = cgraph->nodes[1];
+  if (node->op == GGML_OP_MUL) {
+    multi_thread_enable = true;
+    printf("\n ANOOP Multi-thread-enable for MUL GRAPH EXECUTIOn going to start");
+  }
+}
+
   for (int i = 0; i < cgraph->n_nodes; i++) {
     int32_t kernel_sub_type=-1;
 #if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
@@ -1740,7 +2115,6 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
       for(ii=0; ii <= 95; ++ii)
         vall[ii] = 0;
     }
-
     switch (node->op) {
     case GGML_OP_ADD:
       kernel_type = GGML_TSAVORITE_KERNEL_TYPE_ADD;
@@ -2182,7 +2556,14 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
       node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
     }
 #endif /* GGML_PERF-related flags */
-  }
+  } /* this is main for loop */
+
+  if (multi_thread_enable) {
+    printf("\n ANOOP Multi-thread-enable for MUL GRAPH EXECUTIOn Completed");
+    join_all_workers();
+  }
+  anoop_test();
+
   // This this need to implement correctly when we have mixture of CPU and accelerator operation
   // return ggml_graph_compute(cgraph, &cplan);