diff --git a/ggml/include/ggml-tsavorite.h b/ggml/include/ggml-tsavorite.h
index d4baab4836c..2a7a32789c4 100644
--- a/ggml/include/ggml-tsavorite.h
+++ b/ggml/include/ggml-tsavorite.h
@@ -327,7 +327,7 @@ extern void _mlir_ciface_txe_rms_norm_16_host(void *a, void *res, void *buf);
 
 extern void ggml_tsi_log_tensor_data(tensor_log log_data);
 
-#define NUM_OF_TXES 1
+#define NUM_OF_TXES 2
 
 // GGML supports tensors with a maximum rank of 4
 #define MEM_REF_DESCRIPTOR_RANK 4
diff --git a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
index 7880e3dd26c..2669983e504 100644
--- a/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
+++ b/ggml/src/ggml-tsavorite/ggml-tsavorite.cpp
@@ -23,14 +23,36 @@
 #include
 #include
 #include
-#include
 #include
+#include
 
 #include "ggml-backend-impl.h"
 #include "ggml-impl.h"
 #include "ggml.h"
+#include "tsi-rt/TXEDeviceConfig.h"
+#include "tsi-rt/host/BlobDescriptor.h"
+#include "tsi-rt/queues/Command.h"
 #include "HostShimCAPI.h"
 #include "tsi-rt/utils/Profiler.h"
+
+#include <thread>
+#include <mutex>
+#include <condition_variable>
+#include <vector>
+
+
+using namespace tsi::runtime;
+
+
+std::vector<std::thread> workers;
+static std::mutex device_mutex;
+static std::condition_variable device_cv;
+
+// device availability table
+static bool device_free[NUM_OF_TXES] = { true, true };
+
+static bool multi_thread_enable = false;
+
 #ifdef TMU_DEBUG_VALIDATE
 // CPU reference GEMM for TMU packed tiles using the SAME MemRefDescriptor
@@ -108,6 +130,52 @@ typedef struct _txe_compute_pipeline_state_t *txe_compute_pipeline_state_s;
 FILE *tsi_op_log_file;
 bool runtime_initialized = false;
 uint64_t num_of_op;
+
+
+//static TSI_DeviceIdType deviceId_1 = 0, deviceId_2 = 1;
+
+static void *loadResult_add[NUM_OF_TXES], *loadResult_mult[NUM_OF_TXES], *loadResult_rms_norm[NUM_OF_TXES];
+static BlobDescriptor *blobDescriptor_add[NUM_OF_TXES], *blobDescriptor_mult[NUM_OF_TXES], *blobDescriptor_rms_norm[NUM_OF_TXES];
+
+void tsi_load_all_blobs() {
+
+for (int i=0; i < NUM_OF_TXES; ++i) {
+printf("\n ANOOP Loading Blobs \n");
+  loadResult_add[i] = tsi_load_blob(
+      i,
+      "txe_add",
+      ("/proj/work/akapoor/llama-cpp-march-30-multi-txe/llama.cpp/ggml-tsi-kernel/fpga-kernel/build-fpga/txe_add/blobs/txe_add"));
+
+  blobDescriptor_add[i] = static_cast<BlobDescriptor *>(loadResult_add[i]);
+
+  loadResult_mult[i] = tsi_load_blob(
+      i,
+      "txe_mult",
+      ("/proj/work/akapoor/llama-cpp-march-30-multi-txe/llama.cpp/ggml-tsi-kernel/fpga-kernel/build-fpga/txe_mult/blobs/txe_mult"));
+
+  blobDescriptor_mult[i] = static_cast<BlobDescriptor *>(loadResult_mult[i]);
+
+  loadResult_rms_norm[i] = tsi_load_blob(
+      i,
+      "txe_rms_norm",
+      ("/proj/work/akapoor/llama-cpp-march-30-multi-txe/llama.cpp/ggml-tsi-kernel/fpga-kernel/build-fpga/txe_rms_norm/blobs/txe_rms_norm"));
+
+  blobDescriptor_rms_norm[i] = static_cast<BlobDescriptor *>(loadResult_rms_norm[i]);
+
+}
+
+  return;
+}
+
+static void tsi_unload_all_blobs() {
+for (int i=0; i < NUM_OF_TXES; ++i) {
+  tsi_unload_blob(blobDescriptor_add[i]);
+  tsi_unload_blob(blobDescriptor_mult[i]);
+  tsi_unload_blob(blobDescriptor_rms_norm[i]);
+}
+  return;
+}
+
 // Centralized TSI runtime initialization - called once globally
 static void ensure_tsi_runtime_initialized() {
@@ -115,6 +183,10 @@ static void ensure_tsi_runtime_initialized() {
     tsirt::utils::TSIProfiler::initialize();
     // TSI Run time Initalization
     tsi_initialize(NUM_OF_TXES, NULL);
+
+    tsi_load_all_blobs();
+
+    workers.reserve(2);
     runtime_initialized = true;
     GGML_TSAVORITE_LOG_INFO("Profiler and TSI runtime initialized early in registration\n");
   }
@@ -491,6 +563,287 @@ static void _mlir_ciface_txe_mult_test (void *src0, void *src1, void *res)
   return;
 }
 
+
+// Packed args layout for 3x memref<?xf32, 1>
+// Per TXE_PackArgsOp: group per-arg as (handle, offset, sizes, strides),
+// and only dynamic metadata is packed. For this type: offset is dynamic, size(0) is dynamic, stride(0)=1 is static.
+// So each arg contributes: (handle, offset, size0) => 3 int64s per arg.
+// Total = 3 args * 3 int64 = 9 int64 = 72 bytes.
+
+
+// ============================================================
+// DEVICE ACQUIRE / RELEASE
+// ============================================================
+
+static inline int acquire_device_blocking() {
+  std::unique_lock lock(device_mutex);
+
+  device_cv.wait(lock, [] {
+    for (int i = 0; i < NUM_OF_TXES; ++i)
+      if (device_free[i])
+        return true;
+    return false;
+  });
+
+  for (int i = 0; i < NUM_OF_TXES; ++i) {
+    if (device_free[i]) {
+      device_free[i] = false;
+      return i;
+    }
+  }
+  return -1;
+}
+
+static inline void release_device(int deviceId) {
+  std::lock_guard lock(device_mutex);
+  device_free[deviceId] = true;
+  device_cv.notify_one();
+}
+
+// ============================================================
+// FINAL JOIN — CALL AT END OF ggml_tsavorite_graph_compute()
+// ============================================================
+
+static inline void join_all_workers() {
+  for (auto &t : workers) {
+    if (t.joinable())
+      t.join();
+  }
+  workers.clear();
+
+  {
+    std::lock_guard lock(device_mutex);
+    for (int i = 0; i < NUM_OF_TXES; ++i)
+      device_free[i] = true;
+  }
+  device_cv.notify_all();
+}
+
+static void tsi_blob_execution_internal(void *commandList) {
+  // Enqueue & run
+  tsi_finalize_command_list(commandList);
+  tsi_wait(commandList);
+  return;
+}
+
+static void *_mlir_ciface_txe_add_host_internal(void *a, void *b, void *res, TSI_DeviceIdType deviceId) {
+  constexpr int64_t kPackedArgsI64 = 9;
+  constexpr int64_t kPackedArgsBytes = kPackedArgsI64 * 8;
+
+  // Create the command list for the blob execute command
+  void *commandList = tsi_create_command_list(deviceId);
+
+  // Allocate packed args buffer in shared DRAM
+  void *packed = tsi_alloc(kPackedArgsBytes, tsi::MemorySpace::SHARED_DRAM_TS);
+  auto *p = static_cast<int64_t *>(packed);
+
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *A = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)a;
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *B = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)b;
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *C = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)res;
+
+  // Pack args strictly as:
+  //   (A handle, A offset, A size0, B handle, B offset, B size0, C handle, C offset, C size0)
+  // NOTE: this is NOT "all handles first".
+  int idx = 0;
+
+  // Arg A
+  p[idx++] = tsi_shmem_handle_from_ptr(A->data);
+  p[idx++] = (int64_t)A->offset;
+  p[idx++] = (int64_t)A->shape[0];
+
+  // Arg B
+  p[idx++] = tsi_shmem_handle_from_ptr(B->data);
+  p[idx++] = (int64_t)B->offset;
+  p[idx++] = (int64_t)B->shape[0];
+
+  // Arg C
+  p[idx++] = tsi_shmem_handle_from_ptr(C->data);
+  p[idx++] = (int64_t)C->offset;
+  p[idx++] = (int64_t)C->shape[0];
+
+  // Sanity: we must have filled exactly kPackedArgsI64 entries
+  // (avoid silent layout drift).
+  if (idx != kPackedArgsI64) {
+    printf("ERROR: packed-args idx=%d expected=%ld\n", idx, (long)kPackedArgsI64);
+    abort();
+  }
+
+  const int64_t packedHandle = tsi_shmem_handle_from_ptr(packed);
+
+  void *blobExecuteCmd = tsi_launch_blob(blobDescriptor_add[deviceId], /*packedArgs*/ packedHandle);
+  tsi_add_command_to_list(commandList, blobExecuteCmd);
+
+  return commandList;
+}
+
+static void _mlir_ciface_txe_add_host_new(void *a, void *b, void *res) {
+  if (!multi_thread_enable) {
+    void *commandList = _mlir_ciface_txe_add_host_internal(a, b, res, 0);
+    tsi_blob_execution_internal(commandList);
+    return;
+  }
+
+  int deviceId = acquire_device_blocking();
+printf("\n ANOOP ADD device ID %d", deviceId);
+
+  workers.emplace_back([=]() {
+    void *commandList = _mlir_ciface_txe_add_host_internal(a, b, res, deviceId);
+    tsi_blob_execution_internal(commandList);
+    release_device(deviceId);
+    printf("\n ANOOP Release ADD device ID %d", deviceId);
+  });
+}
+
+
+static void *_mlir_ciface_txe_mult_host_internal(void *a, void *b, void *res, TSI_DeviceIdType deviceId) {
+  constexpr int64_t kPackedArgsI64 = 9;
+  constexpr int64_t kPackedArgsBytes = kPackedArgsI64 * 8;
+
+  // Create the command list for the blob execute command
+  void *commandList = tsi_create_command_list(deviceId);
+
+  // Allocate packed args buffer in shared DRAM
+  void *packed = tsi_alloc(kPackedArgsBytes, tsi::MemorySpace::SHARED_DRAM_TS);
+  auto *p = static_cast<int64_t *>(packed);
+
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *A = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)a;
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *B = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)b;
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *C = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)res;
+
+  // Pack args strictly as:
+  //   (A handle, A offset, A size0, B handle, B offset, B size0, C handle, C offset, C size0)
+  // NOTE: this is NOT "all handles first".
+  int idx = 0;
+
+  // Arg A
+  p[idx++] = tsi_shmem_handle_from_ptr(A->data);
+  p[idx++] = (int64_t)A->offset;
+  p[idx++] = (int64_t)A->shape[0];
+
+  // Arg B
+  p[idx++] = tsi_shmem_handle_from_ptr(B->data);
+  p[idx++] = (int64_t)B->offset;
+  p[idx++] = (int64_t)B->shape[0];
+
+  // Arg C
+  p[idx++] = tsi_shmem_handle_from_ptr(C->data);
+  p[idx++] = (int64_t)C->offset;
+  p[idx++] = (int64_t)C->shape[0];
+
+  // Sanity: we must have filled exactly kPackedArgsI64 entries
+  // (avoid silent layout drift).
+  if (idx != kPackedArgsI64) {
+    printf("ERROR: packed-args idx=%d expected=%ld\n", idx, (long)kPackedArgsI64);
+    abort();
+  }
+
+  const int64_t packedHandle = tsi_shmem_handle_from_ptr(packed);
+
+  void *blobExecuteCmd = tsi_launch_blob(blobDescriptor_mult[deviceId], /*packedArgs*/ packedHandle);
+  tsi_add_command_to_list(commandList, blobExecuteCmd);
+
+  return commandList;
+}
+
+static void _mlir_ciface_txe_mult_host_new(void *a, void *b, void *res) {
+  if (!multi_thread_enable) {
+    void *commandList = _mlir_ciface_txe_mult_host_internal(a, b, res, 1);
+    tsi_blob_execution_internal(commandList);
+    return;
+  }
+
+  int deviceId = acquire_device_blocking();
+
+printf("\n ANOOP MUL device ID %d", deviceId);
+  workers.emplace_back([=]() {
+    void *commandList = _mlir_ciface_txe_mult_host_internal(a, b, res, deviceId);
+    tsi_blob_execution_internal(commandList);
+    release_device(deviceId);
+    printf("\n ANOOP Release MUL device ID %d", deviceId);
+  });
+}
+
+
+static void *_mlir_ciface_txe_rms_norm_host_internal(void *a, void *b, void *buf, TSI_DeviceIdType deviceId) {
+  constexpr int64_t kPackedArgsI64 = 20;
+  constexpr int64_t kPackedArgsBytes = kPackedArgsI64 * 8;
+
+  // Create the command list for the blob execute command
+  void *commandList = tsi_create_command_list(deviceId);
+
+  // Allocate packed args buffer in shared DRAM
+  void *packed = tsi_alloc(kPackedArgsBytes, tsi::MemorySpace::SHARED_DRAM_TS);
+  auto *p = static_cast<int64_t *>(packed);
+
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *A = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)a;
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *B = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)b;
+  MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *C = (MemRefDescriptor<MEM_REF_DESCRIPTOR_RANK> *)buf;
+
+  // Pack args strictly as:
+  //   (A handle, A offset, A size0, B handle, B offset, B size0, C handle, C offset, C size0)
+  // NOTE: this is NOT "all handles first".
+  int idx = 0;
+
+  // Arg A
+  p[idx++] = tsi_shmem_handle_from_ptr(A->data);
+  p[idx++] = (int64_t)A->offset;
+  for(int i=0; i <=3; ++i) {
+    p[idx++] = (int64_t)A->shape[i];
+  }
+  for(int i=0; i <=2; ++i) {
+    p[idx++] = (int64_t)A->strides[i];
+  }
+
+  // Arg B
+  p[idx++] = tsi_shmem_handle_from_ptr(B->data);
+  p[idx++] = (int64_t)B->offset;
+
+  for(int i=0; i <=3; ++i) {
+    p[idx++] = (int64_t)B->shape[i];
+  }
+
+  for(int i=0; i <=2; ++i) {
+    p[idx++] = (int64_t)B->strides[i];
+  }
+
+
+  // Arg C
+  p[idx++] = tsi_shmem_handle_from_ptr(C->data);
+  p[idx++] = (int64_t)C->offset;
+
+  // Sanity: we must have filled exactly kPackedArgsI64 entries
+  // (avoid silent layout drift).
+  if (idx != kPackedArgsI64) {
+    printf("ERROR: packed-args idx=%d expected=%ld\n", idx, (long)kPackedArgsI64);
+    abort();
+  }
+
+  const int64_t packedHandle = tsi_shmem_handle_from_ptr(packed);
+
+  void *blobExecuteCmd = tsi_launch_blob(blobDescriptor_rms_norm[deviceId], /*packedArgs*/ packedHandle);
+  tsi_add_command_to_list(commandList, blobExecuteCmd);
+  return commandList;
+}
+
+static void _mlir_ciface_txe_rms_norm_host_new(void *a, void *b, void *buf) {
+  if (!multi_thread_enable) {
+    void *commandList = _mlir_ciface_txe_rms_norm_host_internal(a, b, buf, 0);
+    tsi_blob_execution_internal(commandList);
+    return;
+  }
+
+  int deviceId = acquire_device_blocking();
+printf("\n ANOOP RMS_NORM device ID %d", deviceId);
+
+  workers.emplace_back([=]() {
+    void *commandList = _mlir_ciface_txe_rms_norm_host_internal(a, b, buf, deviceId);
+    tsi_blob_execution_internal(commandList);
+    release_device(deviceId);
+    printf("\n ANOOP RMS_NORM Releasing device ID %d", deviceId);
+  });
+}
+
+
 static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_type kernel_type) {
   txe_compute_pipeline_state_s kernel_pipeline =
       (txe_compute_pipeline_state_s)calloc(1, sizeof(struct _txe_compute_pipeline_state_t));
@@ -507,7 +860,8 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_
     if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU)
       kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_add_test;
     else {
-      kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_add_host;
+      //kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_add_host;
+      kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_add_host_new;
       kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F16_INDEX] = &_mlir_ciface_txe_add_16_host;
     }
     kernel_pipeline->kernel_name = "TXE_ADD";
@@ -523,7 +877,8 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_
     if (ggml_tsavorite_kernel_mode_flag == GGML_TSAVORITE_KERNEL_MODE_CPU)
       kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_mult_test;
     else {
-      kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_mult_host;
+      //kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_mult_host;
+      kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_mult_host_new;
       kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F16_INDEX] = &_mlir_ciface_txe_mult_16_host;
     }
     kernel_pipeline->kernel_name = "TXE_MULT";
@@ -578,7 +933,8 @@ static txe_compute_pipeline_state_s tsi_kernel_setup(enum ggml_tsavorite_kernel_
       flag = true;
       break;
     case GGML_TSAVORITE_KERNEL_TYPE_RMS_NORM:
-      kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_rms_norm_host;
+      //kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_rms_norm_host;
+      kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F32_INDEX] = &_mlir_ciface_txe_rms_norm_host_new;
       kernel_pipeline->_mlir_fptr_2_input[DATA_TYPE_F16_INDEX] = &_mlir_ciface_txe_rms_norm_16_host;
       kernel_pipeline->kernel_name = "TXE_RMS_NORM";
       flag = true;
@@ -809,6 +1165,7 @@ static void ggml_tsavorite_free(struct ggml_backend_tsavorite_context *ctx) {
   if (runtime_initialized == true) {
     sleep(2);
     runtime_initialized = false;
+    tsi_unload_all_blobs();
     tsi_finalize();
     tsirt::utils::TSIProfiler::finalize();
     sleep(2);
@@ -826,6 +1183,7 @@ tsi_cleanup() {
   if (runtime_initialized != true)
     return;
   runtime_initialized = false;
+  tsi_unload_all_blobs();
   tsi_finalize();
   GGML_TSAVORITE_LOG_INFO("Start %s\n", __func__);
   tsirt::utils::TSIProfiler::finalize();
@@ -1634,6 +1992,9 @@ static enum ggml_status ggml_tsavorite_run_tmu_mul_mat(
   return GGML_STATUS_SUCCESS;
 }
 
+static void anoop_test() {
+  return;
+}
 // nodes are intermediate which has multiple src tensors & operation
 // Here we create multiple thread
 // Each Thread run the command buffer & pick Tensor and execute and get the result back base on
@@ -1704,6 +2065,20 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
 
   tensor_log log_data;
 
+multi_thread_enable = false;
+if (cgraph->n_nodes == 2) {
+  node = cgraph->nodes[0];
+  if (node->op == GGML_OP_MUL) {
+    multi_thread_enable = true;
+    printf("\n ANOOP Multi-thread-enable for MUL GRAPH EXECUTIOn going to start");
+  }
+  node = cgraph->nodes[1];
+  if (node->op == GGML_OP_MUL) {
+    multi_thread_enable = true;
+    printf("\n ANOOP Multi-thread-enable for MUL GRAPH EXECUTIOn going to start");
+  }
+}
+
   for (int i = 0; i < cgraph->n_nodes; i++) {
     int32_t kernel_sub_type=-1;
 #if defined(GGML_PERF) || defined(GGML_PERF_RELEASE) || defined(GGML_PERF_DETAIL)
@@ -1740,7 +2115,6 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
       for(ii=0; ii <= 95; ++ii)
         vall[ii] = 0;
     }
-
     switch (node->op) {
     case GGML_OP_ADD:
       kernel_type = GGML_TSAVORITE_KERNEL_TYPE_ADD;
@@ -2182,7 +2556,14 @@ static enum ggml_status ggml_tsavorite_graph_compute(ggml_backend_t backend,
       node->perf_time_us += (INT64_MAX - t_start + t_end + 1);
     }
 #endif /* GGML_PERF-related flags */
-  }
+  } /* this is main for loop */
+
+  if (multi_thread_enable) {
+    printf("\n ANOOP Multi-thread-enable for MUL GRAPH EXECUTIOn Completed");
+    join_all_workers();
+  }
+  anoop_test();
+
   // This this need to implement correctly when we have mixture of CPU and accelerator operation
   // return ggml_graph_compute(cgraph, &cplan);