openvinotoolkit
diff --git a/src/plugins/intel_gpu/src/graph/debug_helper.cpp b/src/plugins/intel_gpu/src/graph/debug_helper.cpp
Lines changed: 24 additions & 0 deletions
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kernels_cache.cpp
Lines changed: 0 additions & 7 deletions
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/paged_attention.cpp
Lines changed: 0 additions & 1177 deletions
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.cpp
Lines changed: 0 additions & 2 deletions
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl/register.hpp
Lines changed: 0 additions & 4 deletions
@@ -466,7 +466,31 @@ NodeDebugHelper::~NodeDebugHelper() {
                                        dump_raw);
                 }
             }
+            for (size_t i = 0; i < m_inst.get_intermediates_memories().size(); i++) {
+                std::string name = get_file_prefix() + "_intermediates_" + std::to_string(i);
+                auto output_mem = m_inst.get_intermediates_memories()[i];
+                if (output_mem == nullptr) {
+                    GPU_DEBUG_COUT << " intermediates_mem is nullptr. Nothing to dump." << std::endl;
+                    continue;
+                }
 
+                auto& output_layout = output_mem->get_layout();
+                if (config.get_dump_tensors_format() == ov::intel_gpu::DumpFormat::binary) {
+                    // Binary dump : raw
+                    auto filename = get_file_path_for_binary_dump(output_layout, name, config.get_dump_tensors_path());
+
+                    mem_lock<char, mem_lock_type::read> lock(output_mem, m_stream);
+                    ov::util::save_binary(filename, lock.data(), output_mem->size());
+                    GPU_DEBUG_COUT << " Dump layer dst : " << layer_name << " to " << filename << std::endl;
+                    debug_str_for_bin_load += (filename + ",");
+                } else {
+                    const bool dump_raw = config.get_dump_tensors_format() == ov::intel_gpu::DumpFormat::text_raw;
+                    GPU_DEBUG_COUT << " Dump " << (dump_raw ? "raw " : "") << name << std::endl;
+                    auto filename = config.get_dump_tensors_path() + get_name_for_dump(name) + ".txt";
+                    // Text dump
+                    log_memory_to_file(output_mem, output_layout, m_stream, filename, dump_raw);
+                }
+            }
             if (config.get_dump_tensors_format() == ov::intel_gpu::DumpFormat::binary && m_inst.is_input()) {
                 debug_str_for_bin_load[debug_str_for_bin_load.size()-1] = '\"';
                 GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl;;
 
@@ -213,13 +213,6 @@ void kernels_cache::get_program_source(const kernels_code& kernels_source_code,
 
             current_batch.has_microkernels |= kernel_string->has_microkernels;
 
-            // TODO: Technically, microkernels doesn't require specific headers, but we don't want to include
-            // some headers to all batches as it may lead to compilation error on some driver versions.
-            // Need to generalize work with headers to include only necessary parts
-            if (current_batch.has_microkernels) {
-                current_batch.source.insert(current_batch.source.begin(), current_batch.micro_headers.begin(), current_batch.micro_headers.end());
-            }
-
             current_batch.source.push_back(std::move(full_code));
             current_batch.kernels_counter++;
         }
 
@@ -35,7 +35,6 @@ void register_implementations() {
     REGISTER_OCL(generate_proposals);
     REGISTER_OCL(grid_sample);
     REGISTER_OCL(kv_cache);
-    REGISTER_OCL(paged_attention);
     REGISTER_OCL(lrn);
     REGISTER_OCL(multiclass_nms);
     REGISTER_OCL(multinomial);
@@ -81,7 +80,6 @@ void register_implementations() {
     REGISTER_OCL(eye);
     REGISTER_OCL(unique_count);
     REGISTER_OCL(unique_gather);
-    REGISTER_OCL(scaled_dot_product_attention);
     REGISTER_OCL(search_sorted);
     REGISTER_OCL(STFT);
     REGISTER_OCL(ISTFT);
 
@@ -62,9 +62,7 @@
 #include "intel_gpu/primitives/non_zero.hpp"
 #include "intel_gpu/primitives/eye.hpp"
 #include "intel_gpu/primitives/unique.hpp"
-#include "intel_gpu/primitives/paged_attention.hpp"
 #include "intel_gpu/primitives/kv_cache.hpp"
-#include "intel_gpu/primitives/scaled_dot_product_attention.hpp"
 
 namespace cldnn {
 namespace ocl {
@@ -106,7 +104,6 @@ REGISTER_OCL(gemm);
 REGISTER_OCL(generate_proposals);
 REGISTER_OCL(grid_sample);
 REGISTER_OCL(kv_cache);
-REGISTER_OCL(paged_attention);
 REGISTER_OCL(lrn);
 REGISTER_OCL(multiclass_nms);
 REGISTER_OCL(multinomial);
@@ -152,7 +149,6 @@ REGISTER_OCL(gather_nonzero);
 REGISTER_OCL(eye);
 REGISTER_OCL(unique_count);
 REGISTER_OCL(unique_gather);
-REGISTER_OCL(scaled_dot_product_attention);
 REGISTER_OCL(search_sorted);
 REGISTER_OCL(STFT);
 REGISTER_OCL(ISTFT);