Commit 39904d1

refactor: move draft input preparation of decode batch from worker to batch builder.
1 parent 25e16fa commit 39904d1

14 files changed: +185 -280 lines changed

xllm/core/framework/batch/batch_input_builder.cpp

Lines changed: 28 additions & 11 deletions
@@ -72,24 +72,35 @@ BatchInputBuilder::BatchInputBuilder(
 ForwardInput BatchInputBuilder::build_forward_input(
     uint32_t num_decoding_tokens,
     uint32_t min_decoding_batch_size) {
+  // ForwardInput is not tested with the multithreaded path, so set
+  // thread_pool_ to nullptr.
+  thread_pool_ = nullptr;
   process_sequences();
   padding_decode_batch_size(num_decoding_tokens, min_decoding_batch_size);

   return state_to_forward_input();
 }

 RawForwardInput BatchInputBuilder::build_raw_forward_input() {
-  if (!thread_pool_ || num_sequences_ < thread_pool_->size()) {
-    process_sequences();
-  } else {
-    process_sequences_multithreaded();
-  }
+  process_sequences();
   return state_to_raw_forward_input();
 }

 void BatchInputBuilder::process_sequences() {
-  for (int32_t i = 0; i < num_sequences_; ++i) {
-    process_single_sequence(i);
+  // When speculative decoding is enabled, we also need to build the raw
+  // forward input of the decode batch as the draft input for MTP (EAGLE).
+  is_mtp_decode_ = false;
+  if (state_.batch_forward_type.is_decode() &&
+      FLAGS_num_speculative_tokens > 0) {
+    is_mtp_decode_ = true;
+  }
+
+  if (thread_pool_ && num_sequences_ >= thread_pool_->size()) {
+    process_sequences_multithreaded();
+  } else {
+    for (int32_t i = 0; i < num_sequences_; ++i) {
+      process_single_sequence(i);
+    }
   }
 }

@@ -275,14 +286,15 @@ void BatchInputBuilder::process_single_sequence(
       << allowed_max_tokens_[seq_index];

   // Update state
+  int32_t offset = is_mtp_decode_ ? -1 : 0;
   state.empty_kv_cache = state.empty_kv_cache && (n_kv_cache_tokens == 0);
-  state.max_seq_len = std::max(state.max_seq_len, seq_len);
+  state.max_seq_len = std::max(state.max_seq_len, seq_len + offset);
   state.q_max_seq_len = std::max(state.q_max_seq_len, q_seq_len);
 #if defined(USE_NPU)
-  state.seq_lens.push_back(seq_len);
+  state.seq_lens.push_back(seq_len + offset);
   state.q_seq_lens.push_back(q_seq_len);
 #elif defined(USE_MLU) || defined(USE_CUDA)
-  state.seq_lens.push_back(state.seq_lens.back() + seq_len);
+  state.seq_lens.push_back(state.seq_lens.back() + seq_len + offset);
   state.q_seq_lens.push_back(state.q_seq_lens.back() + q_seq_len);
 #endif
   // Process tokens and positions

@@ -338,7 +350,8 @@ void BatchInputBuilder::extract_tokens_and_positions(Sequence* sequence,
     state.flatten_tokens_vec.push_back(token_ids[j]);

     if (!use_mrope_) {
-      state.flatten_positions_vec.push_back(static_cast<int32_t>(j));
+      int32_t offset = is_mtp_decode_ ? -1 : 0;
+      state.flatten_positions_vec.push_back(static_cast<int32_t>(j + offset));
     }

     // Handle sampling for last tokens

@@ -422,6 +435,9 @@ void BatchInputBuilder::setup_kv_cache_info(
   // update kv cache tokens num
   sequence->kv_state().incr_kv_cache_tokens_num(/*size=*/q_seq_len);

+  int32_t offset = is_mtp_decode_ ? -1 : 0;
+  seq_len += offset;
+  n_kv_cache_tokens += offset;
   const auto blocks = sequence->kv_state().kv_blocks();
   const auto slot_ids =
       sequence->kv_state().kv_cache_slots(n_kv_cache_tokens, seq_len);

@@ -443,6 +459,7 @@
       (seq_len % block_size == 0) ? block_size : seq_len % block_size;
   state.paged_kv_last_page_len.push_back(last_page_len);

+  // calculate the block ids that need to be written
   int32_t kv_cache_block_idx = n_kv_cache_tokens / block_size;
   for (auto iter = block_ids.begin() + kv_cache_block_idx;
        iter != block_ids.end();
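
The -1 offset introduced above reflects how the draft pass of MTP (EAGLE) consumes a decode batch: the draft model re-runs the step for the token the target model just produced, so sequence lengths, positions, and the KV-cache write location are all shifted back by one. A minimal sketch of that shift on a simplified state; the DecodeLengths struct and build_decode_lengths helper are illustrative names, not xllm APIs.

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Illustrative only: a simplified stand-in for the seq_len/position
    // bookkeeping done by BatchInputBuilder.
    struct DecodeLengths {
      std::vector<int32_t> seq_lens;   // per-sequence KV length seen by the kernel
      std::vector<int32_t> positions;  // position of the single decoded token
      int32_t max_seq_len = 0;
    };

    DecodeLengths build_decode_lengths(const std::vector<int32_t>& full_seq_lens,
                                       bool is_mtp_decode) {
      // For the draft (MTP/EAGLE) decode pass, drop the most recent token:
      // every length and position is shifted back by one, mirroring the
      // offset applied by the batch builder above.
      const int32_t offset = is_mtp_decode ? -1 : 0;
      DecodeLengths out;
      for (const int32_t seq_len : full_seq_lens) {
        out.seq_lens.push_back(seq_len + offset);
        out.positions.push_back(seq_len - 1 + offset);
        out.max_seq_len = std::max(out.max_seq_len, seq_len + offset);
      }
      return out;
    }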

xllm/core/framework/batch/batch_input_builder.h

Lines changed: 3 additions & 0 deletions
@@ -161,6 +161,9 @@ class BatchInputBuilder {
   // thread pool for multithreaded processing, not owned
   ThreadPool* thread_pool_ = nullptr;
   uint64_t batch_id_;
+
+  // Whether to prepare the draft input for MTP (EAGLE) at the decode phase.
+  bool is_mtp_decode_ = false;
 };

 }  // namespace xllm

xllm/core/framework/request/sequence.cpp

Lines changed: 0 additions & 1 deletion
@@ -230,7 +230,6 @@ void Sequence::update_embeddings(const torch::Tensor& embeddings) {
     if (output_embedding_.dim() == 1) {
       output_embedding_ = output_embedding_.unsqueeze(0);
     }
-    mm_data_ = MMData(MMType::EMBEDDING, {{"embedding", output_embedding_}});
   }
 }

xllm/core/runtime/acl_graph_executor_impl.cpp

Lines changed: 6 additions & 12 deletions
@@ -160,10 +160,9 @@ void GraphPersistentParam::update(const torch::Tensor& tokens,
   slice_persistent_block_tables.copy_(params.block_tables,
                                       /*non_blocking=*/true);

-  // Update persistent embedding from mm_data if available
-  const auto& embedding_res = params.mm_data.get<torch::Tensor>("embedding");
-  if (embedding_res) {
-    const torch::Tensor& embedding = embedding_res.value();
+  // Update persistent embedding from input_embedding if available
+  const auto& embedding = params.input_embedding;
+  if (embedding.defined()) {
     const int64_t embedding_tokens = embedding.size(0);

     // Initialize persistent_embedding_ if needed and not already initialized

@@ -643,17 +642,12 @@ bool AclGraph::capture(CausalLM* model,
   graph_params.graph_buffer.tiling_data = persistent_param_.tiling_data();

   // Set persistent embedding if available and original input has embedding
-  const auto& original_embedding =
-      params.mm_data.get<torch::Tensor>("embedding");
-  if (original_embedding.has_value()) {
+  const auto& original_embedding = params.input_embedding;
+  if (original_embedding.defined()) {
     torch::Tensor persistent_embedding =
         persistent_param_.persistent_embedding(num_tokens_);
     if (persistent_embedding.numel() > 0) {
-      // graph_params.input_embedding = persistent_embedding;
-      // Replace embedding in mm_data with persistent embedding using update
-      // method
-      graph_params.mm_data.update<torch::Tensor>(
-          MMType::EMBEDDING, "embedding", persistent_embedding);
+      graph_params.input_embedding = persistent_embedding;
     }
   }
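
The change above swaps the optional-returning mm_data lookup for a plain torch::Tensor member, so presence is now signalled by whether the tensor is defined. A minimal sketch of that pattern, assuming only libtorch; the ParamsSketch struct and maybe_copy_embedding helper are illustrative, not xllm types.

    #include <torch/torch.h>

    // Illustrative sketch: a default-constructed torch::Tensor is "undefined",
    // so a plain member can replace an optional-style map lookup.
    struct ParamsSketch {
      torch::Tensor input_embedding;  // stays undefined until a batch provides one
    };

    void maybe_copy_embedding(const ParamsSketch& params,
                              torch::Tensor& persistent_embedding) {
      const torch::Tensor& embedding = params.input_embedding;
      if (!embedding.defined()) {
        return;  // no per-token embeddings were supplied for this batch
      }
      // Copy only the rows that belong to the current batch into the
      // persistent buffer captured by the graph.
      const int64_t embedding_tokens = embedding.size(0);
      persistent_embedding
          .narrow(/*dim=*/0, /*start=*/0, /*length=*/embedding_tokens)
          .copy_(embedding, /*non_blocking=*/true);
    }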

xllm/core/runtime/forward_params.h

Lines changed: 11 additions & 0 deletions
@@ -100,6 +100,17 @@ struct ForwardInput {
     inputs.acc_logprob = safe_to(acc_logprob, device, true);
     return inputs;
   }
+
+  void print() const {
+    LOG(INFO) << " token_ids: " << token_ids << std::endl;
+    LOG(INFO) << " positions: " << positions << std::endl;
+    input_params.print();
+    LOG(INFO) << " params.selected_token_idxes "
+              << sampling_params.selected_token_idxes;
+    LOG(INFO) << " params.sample_idxes " << sampling_params.sample_idxes;
+    LOG(INFO) << " params.do_sample " << sampling_params.do_sample;
+  }
+
   // flatten token ids
   torch::Tensor token_ids;
   // flatten positions
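
The new print() is a debugging aid that dumps the flattened inputs and sampling parameters through LOG(INFO). A possible call site is sketched below; the VLOG guard is an assumption for illustration, not something this commit adds.

    // Hypothetical debugging call site before dispatching a worker step.
    if (VLOG_IS_ON(2)) {
      input.print();  // logs token_ids, positions, input_params and sampling params
    }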

xllm/core/runtime/llm_worker_impl.cpp

Lines changed: 4 additions & 26 deletions
@@ -170,34 +170,12 @@ std::optional<ForwardOutput> LLMWorkerImpl::step(const ForwardInput& input) {
     output.beam_search_output = beam_search_output;
   }

-  // if running in multi_stream_parallel step, all micro batches
-  // should be in same prefill stage, so, to judge empty_kv_cache,
-  // just use micro batch 0 here
-  if (options_.enable_speculative_decode() && !is_spec_draft_) {
-    if (check_is_prefill(input.input_params.q_seq_lens_vec)) {
+  if (options_.enable_speculative_decode()) {
+    if (!input.input_params.batch_forward_type.is_decode() && !is_spec_draft_) {
       output.sample_output.embeddings = hidden_states;
-    } else if (sampling_params.sample_idxes.defined()) {
-      // auto sample_idxes =
-      //     concated_sampling_params.selected_token_idxes.index_select(
-      //         /*dim=*/0, concated_sampling_params.sample_idxes);
+    } else if (sampling_params.selected_token_idxes.defined()) {
       auto embeddings = hidden_states.index_select(
-          /*dim=*/0, sampling_params.sample_idxes);
-      output.sample_output.embeddings = embeddings;
-    }
-  }
-
-  // if running in multi_stream_parallel step, all micro batches
-  // should be in same prefill stage, so, to judge empty_kv_cache,
-  // just use micro batch 0 here
-  if (options_.enable_speculative_decode() && !is_spec_draft_) {
-    if (input.input_params.q_seq_lens_vec[0] > 1) {
-      output.sample_output.embeddings = hidden_states;
-    } else if (sampling_params.sample_idxes.defined()) {
-      // auto sample_idxes =
-      //     concated_sampling_params.selected_token_idxes.index_select(
-      //         /*dim=*/0, concated_sampling_params.sample_idxes);
-      auto embeddings = hidden_states.index_select(
-          /*dim=*/0, sampling_params.sample_idxes);
+          /*dim=*/0, sampling_params.selected_token_idxes);
       output.sample_output.embeddings = embeddings;
     }
   }
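With the simplification above, the worker keeps the full hidden states for non-decode (prefill) batches and, for decode batches, gathers only the rows named by selected_token_idxes so they can be fed to the draft (MTP/EAGLE) model. A standalone libtorch sketch of that gather, with made-up shapes and indices:

    #include <torch/torch.h>

    int main() {
      // [num_tokens, hidden_size] flattened hidden states for the whole batch
      // (shapes here are assumptions for illustration).
      torch::Tensor hidden_states = torch::randn({8, 16});
      // Row indices of the tokens whose logits were actually sampled.
      torch::Tensor selected_token_idxes = torch::tensor({3, 7}, torch::kLong);

      // Same gather as the worker: pick one hidden-state row per sampled token.
      torch::Tensor embeddings =
          hidden_states.index_select(/*dim=*/0, selected_token_idxes);
      // embeddings now has shape [2, 16].
      return 0;
    }
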
xllm/core/runtime/params_utils.cpp

Lines changed: 1 addition & 2 deletions
@@ -231,8 +231,7 @@ void proto_to_forward_input(const proto::ForwardInput* pb_forward_input,
     }
     torch::Tensor embeddings =
         create_2d_tensor(embeddings_vec, torch::kBFloat16);
-    input_params.mm_data =
-        MMData(MMType::EMBEDDING, {{"embedding", embeddings}});
+    input_params.input_embedding = embeddings;
   }

   CHECK_EQ(sampling_params.size(), selected_token_idxes.size());
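
After this change the de-serialized embeddings are assigned straight to input_params.input_embedding instead of being wrapped in an MMData map. A rough stand-in for what create_2d_tensor does, assuming only libtorch; the helper name, the flat float buffer, and the bfloat16 cast are assumptions based on the call above, not the repository's implementation.

    #include <torch/torch.h>
    #include <vector>

    // Illustrative stand-in for create_2d_tensor(): pack a flat float buffer
    // into a [rows, cols] tensor, then cast it to bfloat16.
    torch::Tensor make_embedding_tensor(const std::vector<float>& flat,
                                        int64_t rows, int64_t cols) {
      torch::Tensor t = torch::from_blob(const_cast<float*>(flat.data()),
                                         {rows, cols}, torch::kFloat)
                            .clone();  // own the memory instead of aliasing `flat`
      return t.to(torch::kBFloat16);
    }

    // Hypothetical usage mirroring the diff:
    // input_params.input_embedding =
    //     make_embedding_tensor(embeddings_vec, num_tokens, hidden_size);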

0 commit comments