static const uint64_t IN_TENSOR_COUNT = 14;
static const uint64_t OUT_TENSOR_COUNT = 1;
- static const uint64_t INTERMEDIATE_TENSOR_COUNT = 16;
- static const uint64_t NODE_COUNT = 12;
+ static const uint64_t INTERMEDIATE_TENSOR_COUNT = 17;
+ static const uint64_t NODE_COUNT = 13;

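The intermediate-tensor count goes from 16 to 17 and the node count from 12 to 13 because of the new cast node and its output tensor used below. A minimal sketch of the tensor-id enum this implies, assuming the ids are declared alongside these constants; apart from INTERNAL_CAST_COS_SIN_TABLE and INTERMIDATE_INPUTNORMOUT, the surrounding ids are illustrative:

enum LlamaLayerFusionParallelTensorId {
    IN_HIDDENSTATES = 0,            // first of the 14 input tensors
    // ... remaining input ids ...
    INTERMIDATE_INPUTNORMOUT,       // first of the 17 intermediate tensors
    INTERNAL_CAST_COS_SIN_TABLE,    // new: cast cos/sin table feeding cosSinSplitNode
    // ... remaining intermediate ids and the single output id ...
};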
atb::Status LlamaLayerFusionParallelOperation(const LlamaLayerFusionParallelParam &param,
                                              atb::Operation **operation)
@@ -36,6 +36,7 @@ atb::Status LlamaLayerFusionParallelOperation(const LlamaLayerFusionParallelPara
size_t nodeId = 0;
atb::Node &inputNormNode = opGraph.nodes.at(nodeId++);
atb::Node &mixdQKVLinearNode = opGraph.nodes.at(nodeId++);
+ atb::Node &castInNode = opGraph.nodes.at(nodeId++);
atb::Node &cosSinSplitNode = opGraph.nodes.at(nodeId++);
atb::Node &ropeNode = opGraph.nodes.at(nodeId++);
atb::Node &cacheKVSplitNode = opGraph.nodes.at(nodeId++);
@@ -47,24 +48,45 @@ atb::Status LlamaLayerFusionParallelOperation(const LlamaLayerFusionParallelPara
atb::Node &mlpLinearParallelNode = opGraph.nodes.at(nodeId++);
atb::Node &mlpResidualAddNode = opGraph.nodes.at(nodeId++);

+ // [bs, seq_len, hidden_size]
atb::infer::RmsNormParam inputNormParam;
inputNormParam.layerType = atb::infer::RmsNormParam::RmsNormType::RMS_NORM_NORM;
inputNormParam.normParam.epsilon = param.rmsNormEps;
atb::CreateOperation(inputNormParam, &inputNormNode.operation);
inputNormNode.inTensorIds = {IN_HIDDENSTATES, IN_NORMWEIGHT};
inputNormNode.outTensorIds = {INTERMIDATE_INPUTNORMOUT};
+ inputNormNode.inTensorReshapeFuncs.resize(inputNormNode.inTensorIds.size());
+ inputNormNode.inTensorReshapeFuncs.at(0) = [=](const atb::Dims &oldShape, atb::Dims &newShape) {
+     if (oldShape.dimNum == 3) {
+         newShape = oldShape;
+     } else if (oldShape.dimNum == 2) {
+         newShape.dimNum = 3; // incremental (decode) phase
+         newShape.dims[0] = oldShape.dims[0];
+         newShape.dims[1] = 1;
+         newShape.dims[2] = oldShape.dims[1];
+     }
+ };

+ // [bs, seq_len, hidden_size] * [3 * hidden_size / card_num, hidden_size] -> [bs, seq_len, hidden_size / card_num]
MultiLayerLinearParam multiLayerLinearParam;
multiLayerLinearParam.transpose = param.transpose;
CreateLlamaMultiLayerLinearOperation(multiLayerLinearParam, &mixdQKVLinearNode.operation);
mixdQKVLinearNode.inTensorIds = {INTERMIDATE_INPUTNORMOUT, IN_QKVMIXDWEIGHT};
mixdQKVLinearNode.outTensorIds = {INTERMIDATE_MIXEDQ, INTERMIDATE_MIXEDK, INTERMIDATE_MIXEDV};

+ atb::infer::ElewiseParam castParam;
+ castParam.elewiseType = atb::infer::ElewiseParam::ElewiseType::ELEWISE_CAST;
+ CreateOperation(castParam, &castInNode.operation);
+ castInNode.inTensorIds = {IN_COS_SIN_TABLE};
+ castInNode.outTensorIds = {INTERNAL_CAST_COS_SIN_TABLE};
+
+ // [2, head_dim, 1, seq_len, 1] -> 2 x [1, head_dim, 1, seq_len, 1]
atb::infer::SplitParam splitParam = {0, 2};
atb::CreateOperation(splitParam, &cosSinSplitNode.operation);
- cosSinSplitNode.inTensorIds = {IN_COS_SIN_TABLE};
+ cosSinSplitNode.inTensorIds = {INTERNAL_CAST_COS_SIN_TABLE};
cosSinSplitNode.outTensorIds = {INTERMIDATE_CASTCOS, INTERMIDATE_CASTSIN};

+ // prefill (full) phase:
llamaPositionEmbedding1DSplitFusionParam positionEmbedding1dFusionParam;
positionEmbedding1dFusionParam.headNum = param.headNum;
positionEmbedding1dFusionParam.rotaryCoeff = param.rotaryCoeff;
@@ -96,12 +118,17 @@ atb::Status LlamaLayerFusionParallelOperation(const LlamaLayerFusionParallelPara
    newShape.dims[2] = oldShape.dims[3];
    newShape.dims[3] = oldShape.dims[4];
};
+ ropeNode.inTensorReshapeFuncs.at(5) = [=](const atb::Dims &oldShape, atb::Dims &newShape) {
+     newShape.dimNum = 1; // dimNum: 1
+     newShape.dims[0] = oldShape.dims[0] * oldShape.dims[1];
+ };

+ // [2, 1, head_num / card_num, max_length, head_dim]
atb::infer::SplitParam splitKVParam = {0, 2};
atb::CreateOperation(splitKVParam, &cacheKVSplitNode.operation);
cacheKVSplitNode.inTensorIds = {IN_CACHE_KV};
cacheKVSplitNode.outTensorIds = {INTERMIDATE_CACHEK, INTERMIDATE_CACHEV};
-
+ // prefill example shapes: [1, 1, 4, 128] [1, 1, 4, 128] [1, 1, 4, 128] [1, 4, 2048, 128] [1, 4, 2048, 128] [1, 4, 2048, 2048] [1, 1] [1, 1] [1]
atb::infer::SelfAttentionParam selfAttentionKvCacheParam;
selfAttentionKvCacheParam.headDim = param.headDim;
selfAttentionKvCacheParam.headNum = param.headNum;
@@ -120,40 +147,58 @@ atb::Status LlamaLayerFusionParallelOperation(const LlamaLayerFusionParallelPara
selfAttentionKvCacheNode.inTensorReshapeFuncs.resize(selfAttentionKvCacheNode.inTensorIds.size());
selfAttentionKvCacheNode.inTensorReshapeFuncs.at(0) = [=](const atb::Dims &oldShape, atb::Dims &newShape) {
    newShape.dimNum = 4; // dimNum: 4
-     newShape.dims[0] = oldShape.dims[0];
+     newShape.dims[0] = 1;
    newShape.dims[1] = oldShape.dims[0];
    newShape.dims[2] = param.headNum;
    newShape.dims[3] = oldShape.dims[1] / param.headNum;
};
selfAttentionKvCacheNode.inTensorReshapeFuncs.at(1) = [=](const atb::Dims &oldShape, atb::Dims &newShape) {
    newShape.dimNum = 4; // dimNum: 4
-     newShape.dims[0] = oldShape.dims[0];
+     newShape.dims[0] = 1;
    newShape.dims[1] = oldShape.dims[0];
    newShape.dims[2] = param.headNum;
    newShape.dims[3] = oldShape.dims[1] / param.headNum;
};
selfAttentionKvCacheNode.inTensorReshapeFuncs.at(2) = [=](const atb::Dims &oldShape, atb::Dims &newShape) {
    newShape.dimNum = 4; // dimNum: 4
    newShape.dims[0] = oldShape.dims[0];
-     newShape.dims[1] = oldShape.dims[1];
+     newShape.dims[1] = oldShape.dims[1]; // TODO: how to get seq_len
    newShape.dims[2] = param.headNum;
    newShape.dims[3] = oldShape.dims[2] / param.headNum;
};
selfAttentionKvCacheNode.inTensorReshapeFuncs.at(3) = [=](const atb::Dims &oldShape, atb::Dims &newShape) {
-     newShape.dimNum = 4; // dimNum: 4
-     newShape.dims[0] = oldShape.dims[0] * oldShape.dims[1];
-     newShape.dims[1] = oldShape.dims[2];
-     newShape.dims[2] = oldShape.dims[3];
-     newShape.dims[3] = oldShape.dims[4];
+     // the cache is produced as [1, max_batch_size, head_num, max_len, head_dim]
+     // the acceleration library expects [layer, max_batch_size, max_len, head_size]; strictly this needs a
+     // transpose, but since the cache is only read and written by the acceleration library, a plain reshape
+     // is used to work around it
+     newShape.dimNum = 4; // dimNum: 4
+     newShape.dims[0] = 1;
+     newShape.dims[1] = oldShape.dims[1];
+     newShape.dims[2] = oldShape.dims[3];
+     newShape.dims[3] = oldShape.dims[2] * oldShape.dims[4];
};
selfAttentionKvCacheNode.inTensorReshapeFuncs.at(4) = [=](const atb::Dims &oldShape, atb::Dims &newShape) {
-     newShape.dimNum = 4; // dimNum: 4
+     // the cache is produced as [1, max_batch_size, head_num, max_len, head_dim]
+     // the acceleration library expects [layer, max_batch_size, max_len, head_size]; strictly this needs a
+     // transpose, but since the cache is only read and written by the acceleration library, a plain reshape
+     // is used to work around it
+     newShape.dimNum = 4; // dimNum: 4
+     newShape.dims[0] = 1;
+     newShape.dims[1] = oldShape.dims[1];
+     newShape.dims[2] = oldShape.dims[3];
+     newShape.dims[3] = oldShape.dims[2] * oldShape.dims[4];
+ };
+ selfAttentionKvCacheNode.inTensorReshapeFuncs.at(5) = [=](const atb::Dims &oldShape, atb::Dims &newShape) {
+     newShape.dimNum = 2; // dimNum: 2
+     newShape.dims[0] = oldShape.dims[2];
+     newShape.dims[1] = oldShape.dims[3];
+ };
+ selfAttentionKvCacheNode.inTensorReshapeFuncs.at(6) = [=](const atb::Dims &oldShape, atb::Dims &newShape) {
+     newShape.dimNum = 1; // dimNum: 1
+     newShape.dims[0] = oldShape.dims[0] * oldShape.dims[1];
+ };
+ selfAttentionKvCacheNode.inTensorReshapeFuncs.at(7) = [=](const atb::Dims &oldShape, atb::Dims &newShape) {
+     newShape.dimNum = 1; // dimNum: 1
    newShape.dims[0] = oldShape.dims[0] * oldShape.dims[1];
-     newShape.dims[1] = oldShape.dims[2];
-     newShape.dims[2] = oldShape.dims[3];
-     newShape.dims[3] = oldShape.dims[4];
};
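To make the cache reshape workaround above concrete, here is a worked trace with illustrative sizes (max_batch_size = 1, head_num = 4, max_len = 2048, head_dim = 128, so head_size = 512):

// cache as produced:         [1, max_batch_size, head_num, max_len, head_dim] = [1, 1, 4, 2048, 128]
// reshape at(3)/at(4) yields {1, oldShape.dims[1], oldShape.dims[3], oldShape.dims[2] * oldShape.dims[4]}
//                          = [1, 1, 2048, 4 * 128] = [1, 1, 2048, 512]
// which matches the [layer, max_batch_size, max_len, head_size] layout the acceleration library expects.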

+ // [1, 1, 512] * [512, 4096] -> [1, 1, 4096]
atb::infer::LinearParallelParam selfOutLinearParallelParam;
selfOutLinearParallelParam.transWeight = true;
selfOutLinearParallelParam.rank = param.rank;
@@ -167,18 +212,18 @@ atb::Status LlamaLayerFusionParallelOperation(const LlamaLayerFusionParallelPara
selfOutLinearParallelNode.inTensorIds = {INTERMIDATE_SELFOUT, IN_SELFOUTLINEARWEIGHT};
selfOutLinearParallelNode.outTensorIds = {INTERMIDATE_SELFLINEAROUT};

+ // [bs * seq_len, hidden_size] + [1, 1, 4096]
atb::infer::ElewiseParam selfResidualAddParam;
selfResidualAddParam.elewiseType = atb::infer::ElewiseParam::ElewiseType::ELEWISE_ADD;
atb::CreateOperation(selfResidualAddParam, &selfResidualAddNode.operation);
selfResidualAddNode.inTensorIds = {IN_HIDDENSTATES, INTERMIDATE_SELFLINEAROUT};
selfResidualAddNode.outTensorIds = {INTERMIDATE_SELFRESIDUALADDOUT};
selfResidualAddNode.inTensorReshapeFuncs.resize(selfResidualAddNode.inTensorIds.size());
- selfResidualAddNode.inTensorReshapeFuncs.at(1) = [=](const atb::Dims &oldShape, atb::Dims &newShape) {
-     newShape.dimNum = 3; // dimNum: 3
-     newShape.dims[0] = oldShape.dims[1];
-     newShape.dims[1] = oldShape.dims[0];
-     newShape.dims[2] = oldShape.dims[2];
- };
+ // selfResidualAddNode.inTensorReshapeFuncs.at(1) = [=](const atb::Dims &oldShape, atb::Dims &newShape) {
+ //     newShape.dimNum = 2; // dimNum: 2
+ //     newShape.dims[0] = oldShape.dims[0] * oldShape.dims[1];
+ //     newShape.dims[1] = oldShape.dims[2];
+ // };

atb::infer::RmsNormParam selfNormParam;
selfNormParam.layerType = atb::infer::RmsNormParam::RmsNormType::RMS_NORM_NORM;
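For reference, a minimal usage sketch of the factory function above; only parameter fields that appear in this excerpt are set, and every value is illustrative rather than taken from the source:

LlamaLayerFusionParallelParam layerParam;
layerParam.rmsNormEps = 1e-6;   // illustrative
layerParam.headNum = 4;         // heads per card, illustrative
layerParam.headDim = 128;       // illustrative
layerParam.rotaryCoeff = 2;     // illustrative
layerParam.transpose = true;    // illustrative
layerParam.rank = 0;            // illustrative
atb::Operation *layerOp = nullptr;
atb::Status st = LlamaLayerFusionParallelOperation(layerParam, &layerOp);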