From 6e98e3f68e40769c8ba5a049a85b483eaac45a66 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 May 2024 04:12:01 -0700 Subject: [PATCH 01/52] not buildable: remove host device from device_impl.* Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/device_impl.cpp | 38 +++++------------------------- sycl/source/detail/device_impl.hpp | 14 ----------- 2 files changed, 6 insertions(+), 46 deletions(-) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 532cffe22500f..d043a59d9cebd 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -17,11 +17,6 @@ namespace sycl { inline namespace _V1 { namespace detail { -device_impl::device_impl() - : MIsHostDevice(true), MPlatform(platform_impl::getHostPlatformImpl()), - // assert is natively supported by host - MIsAssertFailSupported(true) {} - device_impl::device_impl(pi_native_handle InteropDeviceHandle, const PluginPtr &Plugin) : device_impl(InteropDeviceHandle, nullptr, nullptr, Plugin) {} @@ -39,7 +34,7 @@ device_impl::device_impl(sycl::detail::pi::PiDevice Device, device_impl::device_impl(pi_native_handle InteropDeviceHandle, sycl::detail::pi::PiDevice Device, PlatformImplPtr Platform, const PluginPtr &Plugin) - : MDevice(Device), MIsHostDevice(false), + : MDevice(Device), MDeviceHostBaseTime(std::make_pair(0, 0)) { bool InteroperabilityConstructor = false; @@ -84,13 +79,11 @@ device_impl::device_impl(pi_native_handle InteropDeviceHandle, } device_impl::~device_impl() { - if (!MIsHostDevice) { - // TODO catch an exception and put it to list of asynchronous exceptions - const PluginPtr &Plugin = getPlugin(); - sycl::detail::pi::PiResult Err = - Plugin->call_nocheck(MDevice); - __SYCL_CHECK_OCL_CODE_NO_EXC(Err); - } + // TODO catch an exception and put it to list of asynchronous exceptions + const PluginPtr &Plugin = getPlugin(); + sycl::detail::pi::PiResult Err = + Plugin->call_nocheck(MDevice); + __SYCL_CHECK_OCL_CODE_NO_EXC(Err); } bool device_impl::is_affinity_supported( @@ -101,11 +94,6 @@ bool device_impl::is_affinity_supported( } cl_device_id device_impl::get() const { - if (MIsHostDevice) { - throw invalid_object_error( - "This instance of device doesn't support OpenCL interoperability.", - PI_ERROR_INVALID_DEVICE); - } // TODO catch an exception and put it to list of asynchronous exceptions getPlugin()->call(MDevice); return pi::cast(getNative()); @@ -180,9 +168,6 @@ device_impl::get_backend_info() const { } bool device_impl::has_extension(const std::string &ExtensionName) const { - if (MIsHostDevice) - // TODO: implement extension management for host device; - return false; std::string AllExtensionNames = get_device_info_string(PiInfoCode::value); return (AllExtensionNames.find(ExtensionName) != std::string::npos); @@ -224,8 +209,6 @@ device_impl::create_sub_devices(const cl_device_partition_property *Properties, } std::vector device_impl::create_sub_devices(size_t ComputeUnits) const { - assert(!MIsHostDevice && "Partitioning is not supported on host."); - if (!is_partition_supported(info::partition_property::partition_equally)) { throw sycl::feature_not_supported( "Device does not support " @@ -248,8 +231,6 @@ std::vector device_impl::create_sub_devices(size_t ComputeUnits) const { std::vector device_impl::create_sub_devices(const std::vector &Counts) const { - assert(!MIsHostDevice && "Partitioning is not supported on host."); - if (!is_partition_supported(info::partition_property::partition_by_counts)) { throw 
sycl::feature_not_supported( "Device does not support " @@ -291,8 +272,6 @@ device_impl::create_sub_devices(const std::vector &Counts) const { std::vector device_impl::create_sub_devices( info::partition_affinity_domain AffinityDomain) const { - assert(!MIsHostDevice && "Partitioning is not supported on host."); - if (!is_partition_supported( info::partition_property::partition_by_affinity_domain)) { throw sycl::feature_not_supported( @@ -319,8 +298,6 @@ std::vector device_impl::create_sub_devices( } std::vector device_impl::create_sub_devices() const { - assert(!MIsHostDevice && "Partitioning is not supported on host."); - if (!is_partition_supported( info::partition_property::ext_intel_partition_by_cslice)) { throw sycl::feature_not_supported( @@ -789,9 +766,6 @@ uint64_t device_impl::getCurrentDeviceTime() { uint64_t HostTime = duration_cast(steady_clock::now().time_since_epoch()) .count(); - if (MIsHostDevice) { - return HostTime; - } // To account for potential clock drift between host clock and device clock. // The value set is arbitrary: 200 seconds diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 981b1e059a30e..2526647152892 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -65,10 +65,6 @@ class device_impl { /// /// \return non-constant reference to PI device sycl::detail::pi::PiDevice &getHandleRef() { - if (MIsHostDevice) - throw invalid_object_error("This instance of device is a host instance", - PI_ERROR_INVALID_DEVICE); - return MDevice; } @@ -78,18 +74,9 @@ class device_impl { /// /// \return constant reference to PI device const sycl::detail::pi::PiDevice &getHandleRef() const { - if (MIsHostDevice) - throw invalid_object_error("This instance of device is a host instance", - PI_ERROR_INVALID_DEVICE); - return MDevice; } - /// Check if SYCL device is a host device - /// - /// \return true if SYCL device is a host device - bool is_host() const { return MIsHostDevice; } - /// Check if device is a CPU device /// /// \return true if SYCL device is a CPU device @@ -327,7 +314,6 @@ class device_impl { sycl::detail::pi::PiDevice MDevice = 0; sycl::detail::pi::PiDeviceType MType; sycl::detail::pi::PiDevice MRootDevice = nullptr; - bool MIsHostDevice; PlatformImplPtr MPlatform; bool MIsAssertFailSupported = false; mutable std::string MDeviceName; From abe4586ce16a07b69a1d2c662679697754db00a2 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 May 2024 04:13:51 -0700 Subject: [PATCH 02/52] not-buildable: remove getHostPlatformImpl Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/context_impl.cpp | 3 --- sycl/source/detail/device_info.hpp | 4 ---- sycl/source/detail/platform_impl.cpp | 6 ------ sycl/source/detail/platform_impl.hpp | 8 -------- 4 files changed, 21 deletions(-) diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index 388c312305d4a..c2124456dae24 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -177,9 +177,6 @@ uint32_t context_impl::get_info() const { this->getPlugin()); } template <> platform context_impl::get_info() const { - if (is_host()) - return createSyclObjFromImpl( - platform_impl::getHostPlatformImpl()); return createSyclObjFromImpl(MPlatform); } template <> diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index a8769b69e83cc..61cb09e1b0b38 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ 
-1802,10 +1802,6 @@ get_device_info_host() { return {}; } -template <> inline platform get_device_info_host() { - return createSyclObjFromImpl(platform_impl::getHostPlatformImpl()); -} - template <> inline std::string get_device_info_host() { return "SYCL host device"; } diff --git a/sycl/source/detail/platform_impl.cpp b/sycl/source/detail/platform_impl.cpp index 2bdfab26676d9..9700fde466803 100644 --- a/sycl/source/detail/platform_impl.cpp +++ b/sycl/source/detail/platform_impl.cpp @@ -30,12 +30,6 @@ namespace detail { using PlatformImplPtr = std::shared_ptr; -PlatformImplPtr platform_impl::getHostPlatformImpl() { - static PlatformImplPtr HostImpl = std::make_shared(); - - return HostImpl; -} - PlatformImplPtr platform_impl::getOrMakePlatformImpl(sycl::detail::pi::PiPlatform PiPlatform, const PluginPtr &Plugin) { diff --git a/sycl/source/detail/platform_impl.hpp b/sycl/source/detail/platform_impl.hpp index 34537c7191af6..0bb8d1ab77e2f 100644 --- a/sycl/source/detail/platform_impl.hpp +++ b/sycl/source/detail/platform_impl.hpp @@ -192,14 +192,6 @@ class platform_impl { getOrMakeDeviceImpl(sycl::detail::pi::PiDevice PiDevice, const std::shared_ptr &PlatformImpl); - /// Static functions that help maintain platform uniquess and - /// equality of comparison - - /// Returns the host platform impl - /// - /// \return the host platform impl - static std::shared_ptr getHostPlatformImpl(); - /// Queries the cache to see if the specified PiPlatform has been seen /// before. If so, return the cached platform_impl, otherwise create a new /// one and cache it. From 6a0a25005b1b9b831419e94ed56b0bb8f15b4017 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 May 2024 04:18:11 -0700 Subject: [PATCH 03/52] not buildable: remove get_device_info_host Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/device_impl.cpp | 3 - sycl/source/detail/device_info.hpp | 1032 ---------------------------- 2 files changed, 1035 deletions(-) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index d043a59d9cebd..2e87300425c20 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -105,9 +105,6 @@ platform device_impl::get_platform() const { template typename Param::return_type device_impl::get_info() const { - if (is_host()) { - return get_device_info_host(); - } return get_device_info( MPlatform->getOrMakeDeviceImpl(MDevice, MPlatform)); } diff --git a/sycl/source/detail/device_info.hpp b/sycl/source/detail/device_info.hpp index 61cb09e1b0b38..9322b65128652 100644 --- a/sycl/source/detail/device_info.hpp +++ b/sycl/source/detail/device_info.hpp @@ -1272,1038 +1272,6 @@ typename Param::return_type get_device_info(const DeviceImplPtr &Dev) { return get_device_info_impl::get(Dev); } -// SYCL host device information - -// Default template is disabled, all possible instantiations are -// specified explicitly. 
-template -inline typename Param::return_type get_device_info_host() = delete; - -template <> -inline std::vector get_device_info_host() { - return std::vector(); -} - -template <> -inline ext::oneapi::experimental::architecture -get_device_info_host() { - return ext::oneapi::experimental::architecture::x86_64; -} - -template <> -inline info::device_type get_device_info_host() { - return info::device_type::host; -} - -template <> inline uint32_t get_device_info_host() { - return 0x8086; -} - -template <> -inline uint32_t get_device_info_host() { - return std::thread::hardware_concurrency(); -} - -template <> -inline uint32_t get_device_info_host() { - return 3; -} - -template <> -inline range<1> get_device_info_host>() { - // current value is the required minimum - return {1}; -} - -template <> -inline range<2> get_device_info_host>() { - // current value is the required minimum - return {1, 1}; -} - -template <> -inline range<3> get_device_info_host>() { - // current value is the required minimum - return {1, 1, 1}; -} - -template <> -inline constexpr size_t get_device_info_host< - ext::oneapi::experimental::info::device::max_global_work_groups>() { - // See handler.hpp for the maximum value : - return static_cast((std::numeric_limits::max)()); -} - -template <> -inline id<1> get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<1>>() { - // See handler.hpp for the maximum value : - static constexpr size_t Limit = get_device_info_host< - ext::oneapi::experimental::info::device::max_global_work_groups>(); - return {Limit}; -} - -template <> -inline id<2> get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<2>>() { - // See handler.hpp for the maximum value : - static constexpr size_t Limit = get_device_info_host< - ext::oneapi::experimental::info::device::max_global_work_groups>(); - return {Limit, Limit}; -} - -template <> -inline id<3> get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<3>>() { - // See handler.hpp for the maximum value : - static constexpr size_t Limit = get_device_info_host< - ext::oneapi::experimental::info::device::max_global_work_groups>(); - return {Limit, Limit, Limit}; -} - -// TODO:remove with deprecated feature -// device::get_info -template <> -inline constexpr size_t -get_device_info_host() { - return get_device_info_host< - ext::oneapi::experimental::info::device::max_global_work_groups>(); -} - -// TODO:remove with deprecated feature -// device::get_info -template <> -inline id<1> -get_device_info_host() { - - return get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<1>>(); -} - -// TODO:remove with deprecated feature -// device::get_info -template <> -inline id<2> -get_device_info_host() { - return get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<2>>(); -} - -// TODO:remove with deprecated feature -// device::get_info -template <> -inline id<3> -get_device_info_host() { - return get_device_info_host< - ext::oneapi::experimental::info::device::max_work_groups<3>>(); -} - -template <> -inline size_t get_device_info_host() { - // current value is the required minimum - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> 
-inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 1; -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update when appropriate - return 0; -} - -template <> -inline uint32_t get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Char); -} - -template <> -inline uint32_t -get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Short); -} - -template <> -inline uint32_t get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Int); -} - -template <> -inline uint32_t get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Long); -} - -template <> -inline uint32_t -get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Float); -} - -template <> -inline uint32_t -get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Double); -} - -template <> -inline uint32_t get_device_info_host() { - return PlatformUtil::getNativeVectorWidth(PlatformUtil::TypeIndex::Half); -} - -template <> -inline uint32_t get_device_info_host() { - return PlatformUtil::getMaxClockFrequency(); -} - -template <> inline uint32_t get_device_info_host() { - return sizeof(void *) * 8; -} - -template <> -inline uint64_t get_device_info_host() { - return static_cast(OSUtil::getOSMemSize()); -} - -template <> -inline uint64_t get_device_info_host() { - // current value is the required minimum - const uint64_t a = get_device_info_host() / 4; - const uint64_t b = 128ul * 1024 * 1024; - return (a > b) ? a : b; -} - -template <> inline bool get_device_info_host() { - return true; -} - -template <> inline bool get_device_info_host() { - return false; -} - -template <> -inline std::vector -get_device_info_host() { - return {memory_order::relaxed, memory_order::acquire, memory_order::release, - memory_order::acq_rel, memory_order::seq_cst}; -} - -template <> -inline std::vector -get_device_info_host() { - return {memory_order::relaxed, memory_order::acquire, memory_order::release, - memory_order::acq_rel}; -} - -template <> -inline std::vector -get_device_info_host() { - return {memory_scope::work_item, memory_scope::sub_group, - memory_scope::work_group, memory_scope::device, memory_scope::system}; -} - -template <> -inline std::vector -get_device_info_host() { - return {memory_scope::work_item, memory_scope::sub_group, - memory_scope::work_group, memory_scope::device, memory_scope::system}; -} - -template <> -inline bool -get_device_info_host() { - return false; -} - -template <> -inline uint32_t get_device_info_host() { - // current value is the required minimum - return 128; -} - -template <> -inline uint32_t get_device_info_host() { - // current value is the required minimum - return 8; -} - -template <> -inline size_t get_device_info_host() { - // SYCL guarantees at least 8192. Some devices already known to provide more - // than that (i.e. it is 16384 for opencl:gpu), which may create issues during - // image object allocation on host. - // Using any fixed number (i.e. 16384) brings the risk of having similar - // issues on newer devices in future. Thus it does not make sense limiting - // the returned value on host. 
Practially speaking the returned value on host - // depends only on memory required for the image, which also depends on - // the image channel_type and the image height. Both are not known in this - // query, thus it becomes user's responsibility to choose proper image - // parameters depending on similar query to (non-host device) and amount - // of available/allocatable memory. - return std::numeric_limits::max(); -} - -template <> -inline size_t get_device_info_host() { - // SYCL guarantees at least 8192. Some devices already known to provide more - // than that (i.e. it is 16384 for opencl:gpu), which may create issues during - // image object allocation on host. - // Using any fixed number (i.e. 16384) brings the risk of having similar - // issues on newer devices in future. Thus it does not make sense limiting - // the returned value on host. Practially speaking the returned value on host - // depends only on memory required for the image, which also depends on - // the image channel_type and the image width. Both are not known in this - // query, thus it becomes user's responsibility to choose proper image - // parameters depending on similar query to (non-host device) and amount - // of available/allocatable memory. - return std::numeric_limits::max(); -} - -template <> -inline size_t get_device_info_host() { - // SYCL guarantees at least 8192. Some devices already known to provide more - // than that (i.e. it is 16384 for opencl:gpu), which may create issues during - // image object allocation on host. - // Using any fixed number (i.e. 16384) brings the risk of having similar - // issues on newer devices in future. Thus it does not make sense limiting - // the returned value on host. Practially speaking the returned value on host - // depends only on memory required for the image, which also depends on - // the image channel_type and the image height/depth. Both are not known - // in this query, thus it becomes user's responsibility to choose proper image - // parameters depending on similar query to (non-host device) and amount - // of available/allocatable memory. - return std::numeric_limits::max(); -} - -template <> -inline size_t get_device_info_host() { - // SYCL guarantees at least 8192. Some devices already known to provide more - // than that (i.e. it is 16384 for opencl:gpu), which may create issues during - // image object allocation on host. - // Using any fixed number (i.e. 16384) brings the risk of having similar - // issues on newer devices in future. Thus it does not make sense limiting - // the returned value on host. Practially speaking the returned value on host - // depends only on memory required for the image, which also depends on - // the image channel_type and the image width/depth. Both are not known - // in this query, thus it becomes user's responsibility to choose proper image - // parameters depending on similar query to (non-host device) and amount - // of available/allocatable memory. - return std::numeric_limits::max(); -} - -template <> -inline size_t get_device_info_host() { - // SYCL guarantees at least 8192. Some devices already known to provide more - // than that (i.e. it is 16384 for opencl:gpu), which may create issues during - // image object allocation on host. - // Using any fixed number (i.e. 16384) brings the risk of having similar - // issues on newer devices in future. Thus it does not make sense limiting - // the returned value on host. 
Practially speaking the returned value on host - // depends only on memory required for the image, which also depends on - // the image channel_type and the image height/width, which are not known - // in this query, thus it becomes user's responsibility to choose proper image - // parameters depending on similar query to (non-host device) and amount - // of available/allocatable memory. - return std::numeric_limits::max(); -} - -template <> -inline size_t get_device_info_host() { - // Not supported in SYCL - return 0; -} - -template <> -inline size_t get_device_info_host() { - // current value is the required minimum - return 2048; -} - -template <> inline uint32_t get_device_info_host() { - // current value is the required minimum - return 16; -} - -template <> -inline size_t get_device_info_host() { - // current value is the required minimum - return 1024; -} - -template <> -inline uint32_t get_device_info_host() { - return 1024; -} - -template <> -inline std::vector -get_device_info_host() { - // current value is the required minimum - return {}; -} - -template <> -inline std::vector -get_device_info_host() { - // current value is the required minimum - return {info::fp_config::round_to_nearest, info::fp_config::inf_nan}; -} - -template <> -inline std::vector -get_device_info_host() { - // current value is the required minimum - return {info::fp_config::fma, info::fp_config::round_to_nearest, - info::fp_config::round_to_zero, info::fp_config::round_to_inf, - info::fp_config::inf_nan, info::fp_config::denorm}; -} - -template <> -inline info::global_mem_cache_type -get_device_info_host() { - return info::global_mem_cache_type::read_write; -} - -template <> -inline uint32_t -get_device_info_host() { - return PlatformUtil::getMemCacheLineSize(); -} - -template <> -inline uint64_t get_device_info_host() { - return PlatformUtil::getMemCacheSize(); -} - -template <> -inline uint64_t get_device_info_host() { - // current value is the required minimum - return 64 * 1024; -} - -template <> -inline uint32_t get_device_info_host() { - // current value is the required minimum - return 8; -} - -template <> -inline info::local_mem_type -get_device_info_host() { - return info::local_mem_type::global; -} - -template <> -inline uint64_t get_device_info_host() { - // current value is the required minimum - return 32 * 1024; -} - -template <> -inline bool get_device_info_host() { - return false; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline size_t get_device_info_host() { - typedef std::ratio_divide - ns_period; - return ns_period::num / ns_period::den; -} - -template <> inline bool get_device_info_host() { - union { - uint16_t a; - uint8_t b[2]; - } u = {0x0100}; - - return u.b[1]; -} - -template <> inline bool get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline std::vector -get_device_info_host() { - return {info::execution_capability::exec_kernel}; -} - -template <> inline bool get_device_info_host() { - return true; -} - -template <> -inline std::vector -get_device_info_host() { - return {}; -} - -template <> -inline std::vector -get_device_info_host() { - return {}; -} - -template <> inline std::string get_device_info_host() { - return "SYCL host device"; -} - -template <> inline std::string get_device_info_host() { - return ""; -} - -template <> -inline std::string get_device_info_host() { - 
return "1.2"; -} - -template <> inline std::string get_device_info_host() { - return "FULL PROFILE"; -} - -template <> inline std::string get_device_info_host() { - return "1.2"; -} - -template <> -inline std::string get_device_info_host() { - return "not applicable"; -} - -template <> -inline std::vector -get_device_info_host() { - // TODO update when appropriate - return {}; -} - -template <> -inline size_t get_device_info_host() { - // current value is the required minimum - return 1024 * 1024; -} - -template <> -inline bool get_device_info_host() { - return false; -} - -template <> inline device get_device_info_host() { - throw invalid_object_error( - "Partitioning to subdevices of the host device is not implemented", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t -get_device_info_host() { - // TODO update once subdevice creation is enabled - return 1; -} - -template <> -inline std::vector -get_device_info_host() { - // TODO update once subdevice creation is enabled - return {}; -} - -template <> -inline std::vector -get_device_info_host() { - // TODO update once subdevice creation is enabled - return {}; -} - -template <> -inline info::partition_property -get_device_info_host() { - return info::partition_property::no_partition; -} - -template <> -inline info::partition_affinity_domain -get_device_info_host() { - // TODO update once subdevice creation is enabled - return info::partition_affinity_domain::not_applicable; -} - -template <> -inline uint32_t get_device_info_host() { - // TODO update once subdevice creation is enabled - return 1; -} - -template <> -inline uint32_t get_device_info_host() { - // TODO update once subgroups are enabled - throw runtime_error("Sub-group feature is not supported on HOST device.", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline std::vector -get_device_info_host() { - // TODO update once subgroups are enabled - throw runtime_error("Sub-group feature is not supported on HOST device.", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline bool -get_device_info_host() { - // TODO update once subgroups are enabled - throw runtime_error("Sub-group feature is not supported on HOST device.", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline bool get_device_info_host() { - return false; -} - -template <> -inline std::string get_device_info_host() { - throw runtime_error( - "Backend version feature is not supported on HOST device.", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline bool -get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return true; -} - -template <> -inline bool get_device_info_host() { - return false; -} - -// Specializations for intel extensions for Level Zero low-level -// detail device descriptors (not support on host). 
-template <> -inline uint32_t get_device_info_host() { - throw runtime_error("Obtaining the device ID is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline std::string -get_device_info_host() { - throw runtime_error( - "Obtaining the PCI address is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t get_device_info_host() { - throw runtime_error("Obtaining the EU count is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the EU SIMD width is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t get_device_info_host() { - throw runtime_error( - "Obtaining the number of slices is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error("Obtaining the number of subslices per slice is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the EU count per subslice is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the HW threads count per EU is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline uint64_t -get_device_info_host() { - throw runtime_error( - "Obtaining the maximum memory bandwidth is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -template <> -inline detail::uuid_type -get_device_info_host() { - throw runtime_error( - "Obtaining the device uuid is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline std::string get_device_info_host() { - throw runtime_error( - "Obtaining the PCI address is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t get_device_info_host() { - throw runtime_error("Obtaining the EU count is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the EU SIMD width is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t get_device_info_host() { - throw runtime_error( - "Obtaining the number of slices is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error("Obtaining the number of subslices per slice is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the EU count per subslice is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the HW threads count per EU is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO: Remove with deprecated feature -// 
device::get_info() -template <> -inline uint64_t -get_device_info_host() { - throw runtime_error( - "Obtaining the maximum memory bandwidth is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} -// TODO:Move to namespace ext::intel::info::device -template <> inline bool get_device_info_host() { - return false; -} - -// TODO: Remove with deprecated feature -// device::get_info() -template <> -inline detail::uuid_type -get_device_info_host() { - throw runtime_error( - "Obtaining the device uuid is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint64_t get_device_info_host() { - throw runtime_error( - "Obtaining the device free memory is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the device memory clock rate is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t -get_device_info_host() { - throw runtime_error( - "Obtaining the device memory bus width is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline int32_t -get_device_info_host() { - throw runtime_error( - "Obtaining max compute queue indices is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline bool get_device_info_host< - ext::codeplay::experimental::info::device::supports_fusion>() { - // No support for fusion on the host device. - return false; -} - -template <> -inline uint32_t get_device_info_host< - ext::codeplay::experimental::info::device::max_registers_per_work_group>() { - throw runtime_error("Obtaining the maximum number of available registers per " - "work-group is not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t get_device_info_host< - ext::oneapi::experimental::info::device::image_row_pitch_align>() { - throw runtime_error("Obtaining image pitch alignment is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t get_device_info_host< - ext::oneapi::experimental::info::device::max_image_linear_row_pitch>() { - throw runtime_error("Obtaining max image linear pitch is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::matrix_combinations>() { - throw runtime_error("Obtaining matrix combinations is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t get_device_info_host< - ext::oneapi::experimental::info::device::max_image_linear_width>() { - throw runtime_error("Obtaining max image linear width is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline uint32_t get_device_info_host< - ext::oneapi::experimental::info::device::max_image_linear_height>() { - throw runtime_error("Obtaining max image linear height is not " - "supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline float get_device_info_host< - ext::oneapi::experimental::info::device::mipmap_max_anisotropy>() { - throw runtime_error("Bindless image mipaps are not supported on HOST device", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline std::vector get_device_info_host< - ext::oneapi::experimental::info::device::component_devices>() { - throw runtime_error("Host devices cannot be component devices.", - PI_ERROR_INVALID_DEVICE); -} - -template <> -inline sycl::device 
get_device_info_host< - ext::oneapi::experimental::info::device::composite_device>() { - throw runtime_error("Host devices cannot be composite devices.", - PI_ERROR_INVALID_DEVICE); -} - -// Returns the list of all progress guarantees that can be requested for -// work_groups from the coordination level of root_group when using host device. -// First it calls getHostProgressGuarantee to get the strongest guarantee -// available and then calls getProgressGuaranteesUpTo to get a list of all -// guarantees that are either equal to the strongest guarantee or weaker than -// it. The next 5 definitions follow the same model but for different scopes. -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::work_group_progress_capabilities< - ext::oneapi::experimental::execution_scope::root_group>>() { - - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::work_group, - execution_scope::root_group)); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::sub_group_progress_capabilities< - ext::oneapi::experimental::execution_scope::root_group>>() { - - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::sub_group, - execution_scope::root_group)); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::sub_group_progress_capabilities< - ext::oneapi::experimental::execution_scope::work_group>>() { - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::sub_group, - execution_scope::work_group)); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::work_item_progress_capabilities< - ext::oneapi::experimental::execution_scope::root_group>>() { - - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::work_item, - execution_scope::root_group)); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::work_item_progress_capabilities< - ext::oneapi::experimental::execution_scope::work_group>>() { - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::work_item, - execution_scope::work_group)); -} - -template <> -inline std::vector -get_device_info_host< - ext::oneapi::experimental::info::device::work_item_progress_capabilities< - ext::oneapi::experimental::execution_scope::sub_group>>() { - using execution_scope = ext::oneapi::experimental::execution_scope; - using ReturnT = - std::vector; - return device_impl::getProgressGuaranteesUpTo( - device_impl::getHostProgressGuarantee(execution_scope::work_item, - execution_scope::sub_group)); -} - // Returns the list of all progress guarantees that can be requested for // work_groups from the coordination level of root_group when using the device // given by Dev. 
First it calls getProgressGuarantee to get the strongest From 35b682216afe064e98bf8c6f2c45334d99a5120a Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 May 2024 04:23:01 -0700 Subject: [PATCH 04/52] not-buildable: remove is_host from context_impl.* Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/context_impl.cpp | 19 +++++-------------- sycl/source/detail/context_impl.hpp | 1 - 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index c2124456dae24..87663c4e10775 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -34,7 +34,6 @@ context_impl::context_impl(const device &Device, async_handler AsyncHandler, MContext(nullptr), MPlatform(detail::getSyclObjImpl(Device.get_platform())), MPropList(PropList), - MHostContext(detail::getSyclObjImpl(Device)->is_host()), MSupportBufferLocationByDevices(NotChecked) { MKernelProgramCache.setContextPtr(this); } @@ -43,7 +42,7 @@ context_impl::context_impl(const std::vector Devices, async_handler AsyncHandler, const property_list &PropList) : MOwnedByRuntime(true), MAsyncHandler(AsyncHandler), MDevices(Devices), - MContext(nullptr), MPlatform(), MPropList(PropList), MHostContext(false), + MContext(nullptr), MPlatform(), MPropList(PropList), MSupportBufferLocationByDevices(NotChecked) { MPlatform = detail::getSyclObjImpl(MDevices[0].get_platform()); std::vector DeviceIds; @@ -88,7 +87,7 @@ context_impl::context_impl(sycl::detail::pi::PiContext PiContext, bool OwnedByRuntime) : MOwnedByRuntime(OwnedByRuntime), MAsyncHandler(AsyncHandler), MDevices(DeviceList), MContext(PiContext), MPlatform(), - MHostContext(false), MSupportBufferLocationByDevices(NotChecked) { + MSupportBufferLocationByDevices(NotChecked) { if (!MDevices.empty()) { MPlatform = detail::getSyclObjImpl(MDevices[0].get_platform()); } else { @@ -132,18 +131,11 @@ context_impl::context_impl(sycl::detail::pi::PiContext PiContext, } cl_context context_impl::get() const { - if (MHostContext) { - throw invalid_object_error( - "This instance of context doesn't support OpenCL interoperability.", - PI_ERROR_INVALID_CONTEXT); - } // TODO catch an exception and put it to list of asynchronous exceptions getPlugin()->call(MContext); return pi::cast(MContext); } -bool context_impl::is_host() const { return MHostContext; } - context_impl::~context_impl() { // Free all events associated with the initialization of device globals. 
   for (auto &DeviceGlobalInitializer : MDeviceGlobalInitializers)
@@ -159,10 +151,9 @@ context_impl::~context_impl() {
     assert(LibProg.second && "Null program must not be kept in the cache");
     getPlugin()->call(LibProg.second);
   }
-  if (!MHostContext) {
-    // TODO catch an exception and put it to list of asynchronous exceptions
-    getPlugin()->call_nocheck(MContext);
-  }
+
+  // TODO catch an exception and put it to list of asynchronous exceptions
+  getPlugin()->call_nocheck(MContext);
 }
 
 const async_handler &context_impl::get_async_handler() const {
diff --git a/sycl/source/detail/context_impl.hpp b/sycl/source/detail/context_impl.hpp
index a1e383f721e31..af20236fc4b23 100644
--- a/sycl/source/detail/context_impl.hpp
+++ b/sycl/source/detail/context_impl.hpp
@@ -272,7 +272,6 @@ class context_impl {
   sycl::detail::pi::PiContext MContext;
   PlatformImplPtr MPlatform;
   property_list MPropList;
-  bool MHostContext;
   CachedLibProgramsT MCachedLibPrograms;
   std::mutex MCachedLibProgramsMutex;
   mutable KernelProgramCache MKernelProgramCache;

From 77c749c6ea54b35b5324bfe163460279b3039930 Mon Sep 17 00:00:00 2001
From: "Tikhomirova, Kseniya"
Date: Wed, 22 May 2024 04:29:12 -0700
Subject: [PATCH 05/52] not-buildable: remove is_host from event_impl.*

Signed-off-by: Tikhomirova, Kseniya
---
 sycl/source/detail/event_impl.cpp | 91 +++++++++++++------------------
 sycl/source/detail/event_impl.hpp |  3 +-
 2 files changed, 38 insertions(+), 56 deletions(-)

diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp
index 7442cd4ccfe7a..e187be3563f5b 100644
--- a/sycl/source/detail/event_impl.cpp
+++ b/sycl/source/detail/event_impl.cpp
@@ -37,20 +37,9 @@ void event_impl::ensureContextInitialized() {
   if (MIsContextInitialized)
     return;
 
-  if (MHostEvent) {
-    QueueImplPtr HostQueue = Scheduler::getInstance().getDefaultHostQueue();
-    this->setContextImpl(detail::getSyclObjImpl(HostQueue->get_context()));
-  } else {
-    const device SyclDevice;
-    this->setContextImpl(detail::queue_impl::getDefaultOrNew(
-        detail::getSyclObjImpl(SyclDevice)));
-  }
-}
-
-bool event_impl::is_host() {
-  // Treat all devices that don't support interoperability as host devices to
-  // avoid attempts to call method get on such events.
- return MHostEvent; + const device SyclDevice; + this->setContextImpl(detail::queue_impl::getDefaultOrNew( + detail::getSyclObjImpl(SyclDevice))); } event_impl::~event_impl() { @@ -59,7 +48,7 @@ event_impl::~event_impl() { } void event_impl::waitInternal(bool *Success) { - if (!MHostEvent && MEvent) { + if (MEvent) { // Wait for the native event sycl::detail::pi::PiResult Err = getPlugin()->call_nocheck(1, &MEvent); @@ -92,7 +81,7 @@ void event_impl::waitInternal(bool *Success) { } void event_impl::setComplete() { - if (MHostEvent || !MEvent) { + if (!MEvent) { { std::unique_lock lock(MMutex); #ifndef NDEBUG @@ -137,7 +126,6 @@ const PluginPtr &event_impl::getPlugin() { void event_impl::setStateIncomplete() { MState = HES_NotComplete; } void event_impl::setContextImpl(const ContextImplPtr &Context) { - MHostEvent = Context->is_host(); MContext = Context; MIsContextInitialized = true; } @@ -145,7 +133,7 @@ void event_impl::setContextImpl(const ContextImplPtr &Context) { event_impl::event_impl(sycl::detail::pi::PiEvent Event, const context &SyclContext) : MIsContextInitialized(true), MEvent(Event), - MContext(detail::getSyclObjImpl(SyclContext)), MHostEvent(false), + MContext(detail::getSyclObjImpl(SyclContext)), MIsFlushed(true), MState(HES_Complete) { if (MContext->is_host()) { @@ -317,7 +305,7 @@ event_impl::get_profiling_info() { // made by forcing the re-sync of submit time to start time is less than // 0.5ms. These timing values were obtained empirically using an integrated // Intel GPU). - if (MEventFromSubmittedExecCommandBuffer && !MHostEvent && MEvent) { + if (MEventFromSubmittedExecCommandBuffer && MEvent) { uint64_t StartTime = get_event_profiling_info( this->getHandleRef(), this->getPlugin()); @@ -336,20 +324,19 @@ event_impl::get_profiling_info() { if (isNOP() && MSubmitTime) return MSubmitTime; - if (!MHostEvent) { - if (MEvent) { - auto StartTime = - get_event_profiling_info( + if (MEvent) { + auto StartTime = + get_event_profiling_info( + this->getHandleRef(), this->getPlugin()); + if (!MFallbackProfiling) { + return StartTime; + } else { + auto DeviceBaseTime = + get_event_profiling_info( this->getHandleRef(), this->getPlugin()); - if (!MFallbackProfiling) { - return StartTime; - } else { - auto DeviceBaseTime = - get_event_profiling_info( - this->getHandleRef(), this->getPlugin()); - return MHostBaseTime - DeviceBaseTime + StartTime; - } + return MHostBaseTime - DeviceBaseTime + StartTime; } + return 0; } if (!MHostProfilingInfo) @@ -368,19 +355,17 @@ uint64_t event_impl::get_profiling_info() { if (isNOP() && MSubmitTime) return MSubmitTime; - if (!MHostEvent) { - if (MEvent) { - auto EndTime = - get_event_profiling_info( + if (MEvent) { + auto EndTime = + get_event_profiling_info( + this->getHandleRef(), this->getPlugin()); + if (!MFallbackProfiling) { + return EndTime; + } else { + auto DeviceBaseTime = + get_event_profiling_info( this->getHandleRef(), this->getPlugin()); - if (!MFallbackProfiling) { - return EndTime; - } else { - auto DeviceBaseTime = - get_event_profiling_info( - this->getHandleRef(), this->getPlugin()); - return MHostBaseTime - DeviceBaseTime + EndTime; - } + return MHostBaseTime - DeviceBaseTime + EndTime; } return 0; } @@ -393,7 +378,7 @@ uint64_t event_impl::get_profiling_info() { } template <> uint32_t event_impl::get_info() { - if (!MHostEvent && MEvent) { + if (MEvent) { return get_event_info(this->getHandleRef(), this->getPlugin()); } @@ -406,17 +391,15 @@ event_impl::get_info() { if (MState == HES_Discarded) return 
info::event_command_status::ext_oneapi_unknown; - if (!MHostEvent) { - // Command is enqueued and PiEvent is ready - if (MEvent) - return get_event_info( - this->getHandleRef(), this->getPlugin()); - // Command is blocked and not enqueued, PiEvent is not assigned yet - else if (MCommand) - return sycl::info::event_command_status::submitted; - } + // Command is enqueued and PiEvent is ready + if (MEvent) + return get_event_info( + this->getHandleRef(), this->getPlugin()); + // Command is blocked and not enqueued, PiEvent is not assigned yet + else if (MCommand) + return sycl::info::event_command_status::submitted; - return MHostEvent && MState.load() != HES_Complete + return MState.load() != HES_Complete ? sycl::info::event_command_status::submitted : info::event_command_status::complete; } diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index f33c160f9df97..08bb15cff6ff8 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -49,7 +49,7 @@ class event_impl { /// Normally constructs a host event, use std::nullopt to instead instantiate /// a device event. event_impl(std::optional State = HES_Complete) - : MIsInitialized(false), MHostEvent(State), MIsFlushed(true), + : MIsInitialized(false), MIsFlushed(true), MState(State.value_or(HES_Complete)) { // Need to fail in event() constructor if there are problems with the // ONEAPI_DEVICE_SELECTOR. Deferring may lead to conficts with noexcept @@ -364,7 +364,6 @@ class event_impl { uint64_t MSubmitTime = 0; uint64_t MHostBaseTime = 0; ContextImplPtr MContext; - bool MHostEvent = true; std::unique_ptr MHostProfilingInfo; void *MCommand = nullptr; std::weak_ptr MQueue; From 6e7142097db4e014c7a12e576c2af6d124675ed1 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 May 2024 04:31:22 -0700 Subject: [PATCH 06/52] not-buildable: update is_host for API objects to be easily removed Signed-off-by: Tikhomirova, Kseniya --- sycl/source/context.cpp | 5 ++--- sycl/source/device.cpp | 5 ++--- sycl/source/event.cpp | 5 ++--- sycl/source/kernel.cpp | 5 ++--- sycl/source/platform.cpp | 6 ++---- sycl/source/queue.cpp | 5 ++--- 6 files changed, 12 insertions(+), 19 deletions(-) diff --git a/sycl/source/context.cpp b/sycl/source/context.cpp index 3273c4f3056c2..c24a6c1ec2079 100644 --- a/sycl/source/context.cpp +++ b/sycl/source/context.cpp @@ -138,9 +138,8 @@ context::get_backend_info() const { cl_context context::get() const { return impl->get(); } bool context::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && "context::is_host should not be called in implementation."); - return IsHost; + assert(true && "context::is_host should not be called in implementation."); + return false; } backend context::get_backend() const noexcept { return impl->getBackend(); } diff --git a/sycl/source/device.cpp b/sycl/source/device.cpp index 70aa37aad26a2..a3a88ebf6636a 100644 --- a/sycl/source/device.cpp +++ b/sycl/source/device.cpp @@ -71,9 +71,8 @@ std::vector device::get_devices(info::device_type deviceType) { cl_device_id device::get() const { return impl->get(); } bool device::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && "device::is_host should not be called in implementation."); - return IsHost; + assert(true && "device::is_host should not be called in implementation."); + return false; } bool device::is_cpu() const { return impl->is_cpu(); } diff --git a/sycl/source/event.cpp b/sycl/source/event.cpp index a7bae8055c10b..12b4a7e68164e 100644 --- 
a/sycl/source/event.cpp +++ b/sycl/source/event.cpp @@ -38,9 +38,8 @@ bool event::operator==(const event &rhs) const { return rhs.impl == impl; } bool event::operator!=(const event &rhs) const { return !(*this == rhs); } bool event::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && "event::is_host should not be called in implementation."); - return IsHost; + assert(true && "event::is_host should not be called in implementation."); + return false; } void event::wait() { impl->wait(impl); } diff --git a/sycl/source/kernel.cpp b/sycl/source/kernel.cpp index ff14c0a879078..bc842f6e596a5 100644 --- a/sycl/source/kernel.cpp +++ b/sycl/source/kernel.cpp @@ -31,9 +31,8 @@ kernel::kernel(cl_kernel ClKernel, const context &SyclContext) cl_kernel kernel::get() const { return impl->get(); } bool kernel::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && "kernel::is_host should not be called in implementation."); - return IsHost; + assert(true && "kernel::is_host should not be called in implementation."); + return false; } context kernel::get_context() const { diff --git a/sycl/source/platform.cpp b/sycl/source/platform.cpp index a2ee714952be9..9a15943213ec6 100644 --- a/sycl/source/platform.cpp +++ b/sycl/source/platform.cpp @@ -41,10 +41,8 @@ bool platform::has_extension(const std::string &ExtensionName) const { } bool platform::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && - "platform::is_host should not be called in implementation."); - return IsHost; + assert(true && "platform::is_host should not be called in implementation."); + return false; } std::vector platform::get_devices(info::device_type DeviceType) const { diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 15d7f11fcb42d..6a66cce267aa1 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -96,9 +96,8 @@ queue::ext_oneapi_get_graph() const { } bool queue::is_host() const { - bool IsHost = impl->is_host(); - assert(!IsHost && "queue::is_host should not be called in implementation."); - return IsHost; + assert(true && "queue::is_host should not be called in implementation."); + return false; } void queue::throw_asynchronous() { impl->throw_asynchronous(); } From 7e5abe966b8ebbfee9e0adcc7ce935cd864c21b8 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 22 May 2024 08:53:47 -0700 Subject: [PATCH 07/52] not-buildable: update most obvious places Signed-off-by: Tikhomirova, Kseniya --- sycl/source/context.cpp | 37 ++++-------- sycl/source/detail/event_impl.cpp | 27 +++------ sycl/source/detail/event_impl.hpp | 13 ++-- sycl/source/detail/scheduler/commands.cpp | 60 +++---------------- sycl/source/detail/scheduler/commands.hpp | 7 +-- .../source/detail/scheduler/graph_builder.cpp | 4 +- sycl/source/detail/scheduler/scheduler.cpp | 24 +------- sycl/source/detail/scheduler/scheduler.hpp | 8 --- sycl/source/handler.cpp | 9 +-- 9 files changed, 39 insertions(+), 150 deletions(-) diff --git a/sycl/source/context.cpp b/sycl/source/context.cpp index c24a6c1ec2079..70b12836fc297 100644 --- a/sycl/source/context.cpp +++ b/sycl/source/context.cpp @@ -56,31 +56,20 @@ context::context(const std::vector &DeviceList, throw invalid_parameter_error("DeviceList is empty.", PI_ERROR_INVALID_VALUE); } - auto NonHostDeviceIter = std::find_if_not( - DeviceList.begin(), DeviceList.end(), [&](const device &CurrentDevice) { - return detail::getSyclObjImpl(CurrentDevice)->is_host(); - }); - if (NonHostDeviceIter == DeviceList.end()) - impl = 
std::make_shared(DeviceList[0], AsyncHandler, + + const auto &RefPlatform = + detail::getSyclObjImpl(DeviceList[0].get_platform())->getHandleRef(); + if (std::any_of(DeviceList.begin(), DeviceList.end(), + [&](const device &CurrentDevice) { + return (detail::getSyclObjImpl(CurrentDevice.get_platform()) + ->getHandleRef() != RefPlatform); + })) + throw invalid_parameter_error( + "Can't add devices across platforms to a single context.", + PI_ERROR_INVALID_DEVICE); + else + impl = std::make_shared(DeviceList, AsyncHandler, PropList); - else { - const device &NonHostDevice = *NonHostDeviceIter; - const auto &NonHostPlatform = - detail::getSyclObjImpl(NonHostDevice.get_platform())->getHandleRef(); - if (std::any_of(DeviceList.begin(), DeviceList.end(), - [&](const device &CurrentDevice) { - return ( - detail::getSyclObjImpl(CurrentDevice)->is_host() || - (detail::getSyclObjImpl(CurrentDevice.get_platform()) - ->getHandleRef() != NonHostPlatform)); - })) - throw invalid_parameter_error( - "Can't add devices across platforms to a single context.", - PI_ERROR_INVALID_DEVICE); - else - impl = std::make_shared(DeviceList, AsyncHandler, - PropList); - } } context::context(cl_context ClContext, async_handler AsyncHandler) { const auto &Plugin = sycl::detail::pi::getPlugin(); diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index e187be3563f5b..28bb37200392a 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -136,13 +136,6 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event, MContext(detail::getSyclObjImpl(SyclContext)), MIsFlushed(true), MState(HES_Complete) { - if (MContext->is_host()) { - throw sycl::exception(sycl::make_error_code(sycl::errc::invalid), - "The syclContext must match the OpenCL context " - "associated with the clEvent. " + - codeToString(PI_ERROR_INVALID_CONTEXT)); - } - sycl::detail::pi::PiContext TempContext; getPlugin()->call( MEvent, PI_EVENT_INFO_CONTEXT, sizeof(sycl::detail::pi::PiContext), @@ -162,19 +155,8 @@ event_impl::event_impl(const QueueImplPtr &Queue) { void event_impl::associateWithQueue(const QueueImplPtr &Queue) { MQueue = Queue; - MIsProfilingEnabled = Queue->is_host() || Queue->MIsProfilingEnabled; + MIsProfilingEnabled = Queue->MIsProfilingEnabled; MFallbackProfiling = MIsProfilingEnabled && Queue->isProfilingFallback(); - if (Queue->is_host()) { - MState.store(HES_NotComplete); - if (Queue->has_property()) { - MHostProfilingInfo.reset(new HostProfilingInfo()); - if (!MHostProfilingInfo) - throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), - "Out of host memory " + - codeToString(PI_ERROR_OUT_OF_HOST_MEMORY)); - } - return; - } MState.store(HES_Complete); } @@ -578,6 +560,13 @@ bool event_impl::isCompleted() { info::event_command_status::complete; } +void event_impl::setCommand(void *Cmd) { + MCommand = Cmd; + auto TypedCommand = static_cast(Cmd); + if (TypedCommand) + MIsHostTask = TypedCommand->isHostTask(); +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 08bb15cff6ff8..7c1eb99e3b286 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -68,14 +68,6 @@ class event_impl { event_impl(sycl::detail::pi::PiEvent Event, const context &SyclContext); event_impl(const QueueImplPtr &Queue); - /// Checks if this event is a SYCL host event. 
- /// - /// All devices that do not support OpenCL interoperability are treated as - /// host device to avoid attempts to call method get on such events. - // - /// \return true if this event is a SYCL host event. - bool is_host(); - /// Waits for the event. /// /// Self is needed in order to pass shared_ptr to Scheduler. @@ -177,7 +169,7 @@ class event_impl { /// Scheduler mutex must be locked in write mode when this is called. /// /// @param Command is a generic pointer to Command object instance. - void setCommand(void *Command) { MCommand = Command; } + void setCommand(void *Command); /// Returns host profiling information. /// @@ -345,6 +337,8 @@ class event_impl { void setEnqueued() { MIsEnqueued = true; } + bool isHost() { return MIsHostTask; } + protected: // When instrumentation is enabled emits trace event for event wait begin and // returns the telemetry event generated for the wait @@ -412,6 +406,7 @@ class event_impl { std::shared_ptr Context); std::atomic_bool MIsEnqueued{false}; + bool MIsHostTask{false}; }; } // namespace detail diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index bf7e44062cb5e..0739ac77373b7 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -96,9 +96,7 @@ static std::string demangleKernelName(std::string Name) { return Name; } #endif static std::string deviceToString(device Device) { - if (getSyclObjImpl(Device)->is_host()) - return "HOST"; - else if (Device.is_cpu()) + if (Device.is_cpu()) return "CPU"; else if (Device.is_gpu()) return "GPU"; @@ -144,10 +142,7 @@ void applyFuncOnFilteredArgs( #ifdef XPTI_ENABLE_INSTRUMENTATION static size_t deviceToID(const device &Device) { - if (getSyclObjImpl(Device)->is_host()) - return 0; - else - return reinterpret_cast(getSyclObjImpl(Device)->getHandleRef()); + return reinterpret_cast(getSyclObjImpl(Device)->getHandleRef()); } #endif @@ -265,7 +260,7 @@ std::vector Command::getPiEventsBlocking( // (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. // Skip host task and NOP events also. - if (!EventImpl->isContextInitialized() || EventImpl->is_host() || + if (!EventImpl->isContextInitialized() || EventImpl->isHost() || EventImpl->isNOP()) continue; // In this path nullptr native event means that the command has not been @@ -455,40 +450,9 @@ void Command::waitForEvents(QueueImplPtr Queue, std::vector &EventImpls, sycl::detail::pi::PiEvent &Event) { if (!EventImpls.empty()) { - if (Queue->is_host()) { - // Host queue can wait for events from different contexts, i.e. it may - // contain events with different contexts in its MPreparedDepsEvents. - // OpenCL 2.1 spec says that clWaitForEvents will return - // CL_INVALID_CONTEXT if events specified in the list do not belong to - // the same context. Thus we split all the events into per-context map. - // An example. We have two queues for the same CPU device: Q1, Q2. Thus - // we will have two different contexts for the same CPU device: C1, C2. - // Also we have default host queue. This queue is accessible via - // Scheduler. Now, let's assume we have three different events: E1(C1), - // E2(C1), E3(C2). The command's MPreparedDepsEvents will contain all - // three events (E1, E2, E3). Now, if piEventsWait is called for all - // three events we'll experience failure with CL_INVALID_CONTEXT 'cause - // these events refer to different contexts. 
- std::map> - RequiredEventsPerContext; - - for (const EventImplPtr &Event : EventImpls) { - ContextImplPtr Context = Event->getContextImpl(); - assert(Context.get() && - "Only non-host events are expected to be waited for here"); - RequiredEventsPerContext[Context.get()].push_back(Event); - } - - for (auto &CtxWithEvents : RequiredEventsPerContext) { - std::vector RawEvents = - getPiEvents(CtxWithEvents.second); - CtxWithEvents.first->getPlugin()->call( - RawEvents.size(), RawEvents.data()); - } - } else { #ifndef NDEBUG for (const EventImplPtr &Event : EventImpls) - assert(Event->getContextImpl().get() && + assert(!Event->isHost() && "Only non-host events are expected to be waited for here"); #endif @@ -501,7 +465,6 @@ void Command::waitForEvents(QueueImplPtr Queue, MEvent->setHostEnqueueTime(); Plugin->call( Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event); - } } } @@ -714,7 +677,7 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, // enqueued // (e.g. alloca). Note that we can't check the pi event to make that // distinction since the command might still be unenqueued at this point. - bool PiEventExpected = (!DepEvent->is_host() && DepEvent->isInitialized()); + bool PiEventExpected = (!DepEvent->isHost() && DepEvent->isInitialized()); if (auto *DepCmd = static_cast(DepEvent->getCommand())) PiEventExpected &= DepCmd->producesPiEvent(); @@ -885,7 +848,7 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, else { MEvent->setEnqueued(); if (MShouldCompleteEventIfPossible && - (MEvent->is_host() || MEvent->getHandleRef() == nullptr)) + (MEvent->isHost() || MEvent->getHandleRef() == nullptr)) MEvent->setComplete(); // Consider the command is successfully enqueued if return code is @@ -3172,8 +3135,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { std::vector Events = Barrier->MEventsWaitWithBarrier; std::vector PiEvents = getPiEventsBlocking(Events); - if (MQueue->getDeviceImplPtr()->is_host() || PiEvents.empty()) { - // NOP for host device. + if (PiEvents.empty()) { // If Events is empty, then the barrier has no effect. return PI_SUCCESS; } @@ -3244,10 +3206,6 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { } case CG::CGTYPE::SemaphoreWait: { CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get(); - if (MQueue->getDeviceImplPtr()->is_host()) { - // NOP for host device. - return PI_SUCCESS; - } const detail::PluginPtr &Plugin = MQueue->getPlugin(); Plugin->call( @@ -3258,10 +3216,6 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { } case CG::CGTYPE::SemaphoreSignal: { CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get(); - if (MQueue->getDeviceImplPtr()->is_host()) { - // NOP for host device. - return PI_SUCCESS; - } const detail::PluginPtr &Plugin = MQueue->getPlugin(); Plugin->call( diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 8ba0cceee9e6a..89cabd134a7e1 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -377,10 +377,9 @@ class Command { std::string MSubmissionFileName; std::string MSubmissionFunctionName; - // This flag allows to control whether host event should be set complete - // after successfull enqueue of command. Event is considered as host event if - // either it's is_host() return true or there is no backend representation - // of event (i.e. getHandleRef() return reference to nullptr value). 
+ // This flag allows to control whether event should be set complete + // after successfull enqueue of command. Event is considered as "host" event if + // there is no backend representation of event (i.e. getHandleRef() return reference to nullptr value). // By default the flag is set to true due to most of host operations are // synchronous. The only asynchronous operation currently is host-task. bool MShouldCompleteEventIfPossible = true; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index f0c5dc670aa05..196232b95d734 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -50,9 +50,7 @@ static bool doOverlap(const Requirement *LHS, const Requirement *RHS) { } static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) { - // Consider two different host contexts to be the same to avoid additional - // allocation on the host - return LHS == RHS || (LHS->is_host() && RHS->is_host()); + return LHS == RHS; } /// Checks if current requirement is requirement for sub buffer. diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 7b6c837131658..0b061a86dbc62 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -105,14 +105,6 @@ EventImplPtr Scheduler::addCG( auto *CGExecKernelPtr = static_cast(CommandGroup.get()); Streams = CGExecKernelPtr->getStreams(); CGExecKernelPtr->clearStreams(); - // Stream's flush buffer memory is mainly initialized in stream's __init - // method. However, this method is not available on host device. - // Initializing stream's flush buffer on the host side in a separate task. - if (Queue->is_host()) { - for (const StreamImplPtr &Stream : Streams) { - Stream->initStreamHost(Queue); - } - } } std::vector> AuxiliaryResources; AuxiliaryResources = CommandGroup->getAuxiliaryResources(); @@ -394,18 +386,6 @@ void Scheduler::enqueueUnblockedCommands( } } -Scheduler::Scheduler() { - sycl::device HostDevice = - createSyclObjFromImpl(device_impl::getHostDeviceImpl()); - sycl::context HostContext{HostDevice}; - DefaultHostQueue = QueueImplPtr( - new queue_impl(detail::getSyclObjImpl(HostDevice), - detail::getSyclObjImpl(HostContext), /*AsyncHandler=*/{}, - /*PropList=*/{sycl::property::queue::enable_profiling()})); -} - -Scheduler::~Scheduler() { DefaultHostQueue.reset(); } - void Scheduler::releaseResources(BlockingT Blocking) { // There might be some commands scheduled for post enqueue cleanup that // haven't been freed because of the graph mutex being locked at the time, @@ -726,11 +706,11 @@ bool CheckEventReadiness(const ContextImplPtr &Context, // their context, which we wish to avoid as it is expensive. // NOP events also don't represent actual dependencies. if ((!SyclEventImplPtr->isContextInitialized() && - !SyclEventImplPtr->is_host()) || + !SyclEventImplPtr->isHost()) || SyclEventImplPtr->isNOP()) { return true; } - if (SyclEventImplPtr->is_host()) { + if (SyclEventImplPtr->isHost()) { return SyclEventImplPtr->isCompleted(); } // Cross-context dependencies can't be passed to the backend directly. 
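The readiness rule in CheckEventReadiness above has a direct public-API counterpart: an event with no backend handle (for example, one produced by a host task) only counts as a satisfied dependency once it reports completion. A minimal sketch using only standard SYCL 2020 queries; the helper name is illustrative, not part of the patch:

#include <sycl/sycl.hpp>

// Mirrors the scheduler's readiness test: "complete" means the event is safe
// to treat as satisfied, whether or not a backend event object exists.
inline bool isEventReady(const sycl::event &E) {
  return E.get_info<sycl::info::event::command_execution_status>() ==
         sycl::info::event_command_status::complete;
}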
diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 09437928f1d32..6fa95cb4a4a54 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -450,10 +450,6 @@ class Scheduler { /// \return true if an instance of the scheduler object exists. static bool isInstanceAlive(); - QueueImplPtr getDefaultHostQueue() { return DefaultHostQueue; } - - const QueueImplPtr &getDefaultHostQueue() const { return DefaultHostQueue; } - static MemObjRecord *getMemObjRecord(const Requirement *const Req); void deferMemObjRelease(const std::shared_ptr &MemObj); @@ -468,8 +464,6 @@ class Scheduler { bool isInFusionMode(QueueIdT Queue); - Scheduler(); - ~Scheduler(); void releaseResources(BlockingT Blocking = BlockingT::BLOCKING); bool isDeferredMemObjectsEmpty(); @@ -966,8 +960,6 @@ class Scheduler { MAuxiliaryResources; std::mutex MAuxiliaryResourcesMutex; - QueueImplPtr DefaultHostQueue; - friend class Command; friend class DispatchHostTask; friend class queue_impl; diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 8223c9330814e..749ab6750df5e 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -273,12 +273,6 @@ event handler::finalize() { detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_begin, nullptr); #endif - if (MQueue->is_host()) { - MHostKernel->call(MNDRDesc, (NewEvent) - ? NewEvent->getHostProfilingInfo() - : nullptr); - Result = PI_SUCCESS; - } else { if (MQueue->getDeviceImplPtr()->getBackend() == backend::ext_intel_esimd_emulator) { // Capture the host timestamp for profiling (queue time) @@ -313,7 +307,6 @@ event handler::finalize() { MKernelName.c_str(), RawEvents, NewEvent, nullptr, MImpl->MKernelCacheConfig, MImpl->MKernelIsCooperative); } - } #ifdef XPTI_ENABLE_INSTRUMENTATION // Emit signal only when event is created if (NewEvent != nullptr) { @@ -351,7 +344,7 @@ event handler::finalize() { if (PI_SUCCESS != EnqueueKernel()) throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); - else if (NewEvent->is_host() || NewEvent->getHandleRef() == nullptr) + else if (NewEvent->isHost() || NewEvent->getHandleRef() == nullptr) NewEvent->setComplete(); NewEvent->setEnqueued(); From 31a702c1c2ec81aa2430595230761edc75d52dce Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 23 May 2024 06:33:00 -0700 Subject: [PATCH 08/52] not-buildable: remove is_host from obvious places, part2 Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/backend_impl.hpp | 1 - sycl/source/detail/bindless_images.cpp | 4 - sycl/source/detail/context_impl.cpp | 10 --- sycl/source/detail/device_impl.hpp | 6 +- sycl/source/detail/filter_selector_impl.cpp | 3 - sycl/source/detail/helpers.cpp | 4 +- sycl/source/detail/kernel_impl.cpp | 4 +- sycl/source/detail/kernel_impl.hpp | 22 ------ sycl/source/detail/kernel_info.hpp | 73 ------------------- sycl/source/detail/platform_impl.cpp | 17 +---- sycl/source/detail/platform_impl.hpp | 12 --- sycl/source/detail/platform_info.hpp | 30 -------- sycl/source/detail/program_impl.cpp | 46 +++--------- sycl/source/detail/program_impl.hpp | 6 -- sycl/source/detail/queue_impl.cpp | 35 ++++----- sycl/source/detail/queue_impl.hpp | 61 +++++----------- sycl/source/detail/scheduler/commands.cpp | 20 +---- .../source/detail/scheduler/graph_builder.cpp | 2 +- 18 files changed, 56 insertions(+), 300 deletions(-) diff --git a/sycl/source/detail/backend_impl.hpp 
b/sycl/source/detail/backend_impl.hpp index ca23ceb48815c..0c160ed1920c4 100644 --- a/sycl/source/detail/backend_impl.hpp +++ b/sycl/source/detail/backend_impl.hpp @@ -15,7 +15,6 @@ inline namespace _V1 { namespace detail { template backend getImplBackend(const T &Impl) { - assert(!Impl->is_host() && "Cannot get the backend for host."); return Impl->getContextImplPtr()->getBackend(); } diff --git a/sycl/source/detail/bindless_images.cpp b/sycl/source/detail/bindless_images.cpp index 174fe087ede4f..fbf90e692598e 100644 --- a/sycl/source/detail/bindless_images.cpp +++ b/sycl/source/detail/bindless_images.cpp @@ -746,10 +746,6 @@ __SYCL_EXPORT void *pitched_alloc_device(size_t *resultPitch, std::shared_ptr CtxImpl = sycl::detail::getSyclObjImpl(syclContext); - if (CtxImpl->is_host()) { - throw sycl::exception(sycl::make_error_code(sycl::errc::memory_allocation), - "Cannot allocate pitched memory on host!"); - } pi_context PiContext = CtxImpl->getHandleRef(); const sycl::detail::PluginPtr &Plugin = CtxImpl->getPlugin(); diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index 87663c4e10775..0c79ed2f70462 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -162,8 +162,6 @@ const async_handler &context_impl::get_async_handler() const { template <> uint32_t context_impl::get_info() const { - if (is_host()) - return 0; return get_context_info(this->getHandleRef(), this->getPlugin()); } @@ -183,8 +181,6 @@ context_impl::get_info() sycl::memory_order::relaxed, sycl::memory_order::acquire, sycl::memory_order::release, sycl::memory_order::acq_rel, sycl::memory_order::seq_cst}; - if (is_host()) - return CapabilityList; GetCapabilitiesIntersectionSet< sycl::memory_order, info::device::atomic_memory_order_capabilities>( @@ -200,8 +196,6 @@ context_impl::get_info() sycl::memory_scope::work_item, sycl::memory_scope::sub_group, sycl::memory_scope::work_group, sycl::memory_scope::device, sycl::memory_scope::system}; - if (is_host()) - return CapabilityList; GetCapabilitiesIntersectionSet< sycl::memory_scope, info::device::atomic_memory_scope_capabilities>( @@ -216,8 +210,6 @@ context_impl::get_info() const { sycl::memory_order::relaxed, sycl::memory_order::acquire, sycl::memory_order::release, sycl::memory_order::acq_rel, sycl::memory_order::seq_cst}; - if (is_host()) - return CapabilityList; GetCapabilitiesIntersectionSet( @@ -232,8 +224,6 @@ context_impl::get_info() const { sycl::memory_scope::work_item, sycl::memory_scope::sub_group, sycl::memory_scope::work_group, sycl::memory_scope::device, sycl::memory_scope::system}; - if (is_host()) - return CapabilityList; GetCapabilitiesIntersectionSet( diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 2526647152892..efec017d372f5 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -80,18 +80,18 @@ class device_impl { /// Check if device is a CPU device /// /// \return true if SYCL device is a CPU device - bool is_cpu() const { return (!is_host() && (MType == PI_DEVICE_TYPE_CPU)); } + bool is_cpu() const { return MType == PI_DEVICE_TYPE_CPU; } /// Check if device is a GPU device /// /// \return true if SYCL device is a GPU device - bool is_gpu() const { return (!is_host() && (MType == PI_DEVICE_TYPE_GPU)); } + bool is_gpu() const { return MType == PI_DEVICE_TYPE_GPU; } /// Check if device is an accelerator device /// /// \return true if SYCL device is an accelerator device bool is_accelerator() const { - 
return (!is_host() && (MType == PI_DEVICE_TYPE_ACC)); + return MType == PI_DEVICE_TYPE_ACC; } /// Return device type diff --git a/sycl/source/detail/filter_selector_impl.cpp b/sycl/source/detail/filter_selector_impl.cpp index 4b5f8e836ee6d..0043622d62483 100644 --- a/sycl/source/detail/filter_selector_impl.cpp +++ b/sycl/source/detail/filter_selector_impl.cpp @@ -99,9 +99,6 @@ filter_selector_impl::filter_selector_impl(const std::string &Input) } int filter_selector_impl::operator()(const device &Dev) const { - assert(!sycl::detail::getSyclObjImpl(Dev)->is_host() && - "filter_selector_impl should not be used with host."); - int Score = REJECT_DEVICE_SCORE; for (auto &Filter : mFilters) { diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp index 1bdb2ddbd4697..75c6fd72b8fd0 100644 --- a/sycl/source/detail/helpers.cpp +++ b/sycl/source/detail/helpers.cpp @@ -32,7 +32,7 @@ getOrWaitEvents(std::vector DepEvents, ContextImplPtr Context) { // (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. if ((!SyclEventImplPtr->isContextInitialized() && - !SyclEventImplPtr->is_host()) || + !SyclEventImplPtr->isHost()) || SyclEventImplPtr->isNOP()) { continue; } @@ -41,7 +41,7 @@ getOrWaitEvents(std::vector DepEvents, ContextImplPtr Context) { bool NoPiEvent = SyclEventImplPtr->MCommand && !static_cast(SyclEventImplPtr->MCommand)->producesPiEvent(); - if (SyclEventImplPtr->is_host() || + if (SyclEventImplPtr->isHost() || SyclEventImplPtr->getContextImpl() != Context || NoPiEvent) { // Call wait, because the command for the event might not have been // enqueued when kernel fusion is happening. diff --git a/sycl/source/detail/kernel_impl.cpp b/sycl/source/detail/kernel_impl.cpp index 9c5a1851cd3b1..b4ab6b232eef9 100644 --- a/sycl/source/detail/kernel_impl.cpp +++ b/sycl/source/detail/kernel_impl.cpp @@ -76,9 +76,7 @@ kernel_impl::kernel_impl(ContextImplPtr Context, ProgramImplPtr ProgramImpl) kernel_impl::~kernel_impl() { // TODO catch an exception and put it to list of asynchronous exceptions - if (!is_host()) { - getPlugin()->call(MKernel); - } + getPlugin()->call(MKernel); } bool kernel_impl::isCreatedFromSource() const { diff --git a/sycl/source/detail/kernel_impl.hpp b/sycl/source/detail/kernel_impl.hpp index 1e56e6da4dc53..1a1542d0d409b 100644 --- a/sycl/source/detail/kernel_impl.hpp +++ b/sycl/source/detail/kernel_impl.hpp @@ -103,20 +103,10 @@ class kernel_impl { /// /// \return a valid cl_kernel instance cl_kernel get() const { - if (is_host()) { - throw invalid_object_error( - "This instance of kernel doesn't support OpenCL interoperability.", - PI_ERROR_INVALID_KERNEL); - } getPlugin()->call(MKernel); return pi::cast(MKernel); } - /// Check if the associated SYCL context is a SYCL host context. - /// - /// \return true if this SYCL kernel is a host kernel. 
- bool is_host() const { return MContext->is_host(); } - const PluginPtr &getPlugin() const { return MContext->getPlugin(); } /// Query information from the kernel object using the info::kernel_info @@ -217,11 +207,6 @@ template inline typename Param::return_type kernel_impl::get_info() const { static_assert(is_kernel_info_desc::value, "Invalid kernel information descriptor"); - if (is_host()) { - // TODO implement - assert(0 && "Not implemented"); - } - if constexpr (std::is_same_v) checkIfValidForNumArgsInfoQuery(); @@ -248,9 +233,6 @@ kernel_impl::get_info(const device &Device) const { "is a built-in kernel."); } - if (is_host()) { - return get_kernel_device_specific_info_host(Device); - } return get_kernel_device_specific_info( this->getHandleRef(), getSyclObjImpl(Device)->getHandleRef(), getPlugin()); @@ -260,10 +242,6 @@ template inline typename Param::return_type kernel_impl::get_info(const device &Device, const sycl::range<3> &WGSize) const { - if (is_host()) { - throw runtime_error("Sub-group feature is not supported on HOST device.", - PI_ERROR_INVALID_DEVICE); - } return get_kernel_device_specific_info_with_input( this->getHandleRef(), getSyclObjImpl(Device)->getHandleRef(), WGSize, getPlugin()); diff --git a/sycl/source/detail/kernel_info.hpp b/sycl/source/detail/kernel_info.hpp index 12256158eed49..79c0f73c952de 100644 --- a/sycl/source/detail/kernel_info.hpp +++ b/sycl/source/detail/kernel_info.hpp @@ -137,79 +137,6 @@ uint32_t get_kernel_device_specific_info_with_input( return Result; } -template -inline typename Param::return_type -get_kernel_device_specific_info_host(const sycl::device &Device) = delete; - -template <> -inline sycl::range<3> get_kernel_device_specific_info_host< - info::kernel_device_specific::global_work_size>(const sycl::device &) { - throw invalid_object_error("This instance of kernel is a host instance", - PI_ERROR_INVALID_KERNEL); -} - -template <> -inline size_t get_kernel_device_specific_info_host< - info::kernel_device_specific::work_group_size>(const sycl::device &Dev) { - return Dev.get_info(); -} - -template <> -inline sycl::range<3> get_kernel_device_specific_info_host< - info::kernel_device_specific::compile_work_group_size>( - const sycl::device &) { - return {0, 0, 0}; -} - -template <> -inline size_t get_kernel_device_specific_info_host< - info::kernel_device_specific::preferred_work_group_size_multiple>( - const sycl::device &Dev) { - return get_kernel_device_specific_info_host< - info::kernel_device_specific::work_group_size>(Dev); -} - -template <> -inline size_t get_kernel_device_specific_info_host< - info::kernel_device_specific::private_mem_size>(const sycl::device &) { - return 0; -} - -template <> -inline uint32_t get_kernel_device_specific_info_host< - info::kernel_device_specific::ext_codeplay_num_regs>(const sycl::device &) { - return 0; -} - -template <> -inline uint32_t get_kernel_device_specific_info_host< - info::kernel_device_specific::max_num_sub_groups>(const sycl::device &) { - throw invalid_object_error("This instance of kernel is a host instance", - PI_ERROR_INVALID_KERNEL); -} - -template <> -inline uint32_t get_kernel_device_specific_info_host< - info::kernel_device_specific::max_sub_group_size>(const sycl::device &) { - throw invalid_object_error("This instance of kernel is a host instance", - PI_ERROR_INVALID_KERNEL); -} - -template <> -inline uint32_t get_kernel_device_specific_info_host< - info::kernel_device_specific::compile_num_sub_groups>( - const sycl::device &) { - throw invalid_object_error("This instance 
of kernel is a host instance", - PI_ERROR_INVALID_KERNEL); -} - -template <> -inline uint32_t get_kernel_device_specific_info_host< - info::kernel_device_specific::compile_sub_group_size>( - const sycl::device &) { - throw invalid_object_error("This instance of kernel is a host instance", - PI_ERROR_INVALID_KERNEL); -} } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/platform_impl.cpp b/sycl/source/detail/platform_impl.cpp index 9700fde466803..2caf958bb842b 100644 --- a/sycl/source/detail/platform_impl.cpp +++ b/sycl/source/detail/platform_impl.cpp @@ -79,9 +79,6 @@ static bool IsBannedPlatform(platform Platform) { // is disabled as well. // auto IsMatchingOpenCL = [](platform Platform, const std::string_view name) { - if (getSyclObjImpl(Platform)->is_host()) - return false; - const bool HasNameMatch = Platform.get_info().find( name) != std::string::npos; const auto Backend = detail::getSyclObjImpl(Platform)->getBackend(); @@ -466,15 +463,9 @@ platform_impl::get_devices(info::device_type DeviceType) const { ods_target_list *OdsTargetList = SYCLConfig::get(); - if (is_host() && (DeviceType == info::device_type::host || - DeviceType == info::device_type::all)) { - Res.push_back( - createSyclObjFromImpl(device_impl::getHostDeviceImpl())); - } - // If any DeviceType other than host was requested for host platform, // an empty vector will be returned. - if (is_host() || DeviceType == info::device_type::host) + if (DeviceType == info::device_type::host) return Res; pi_uint32 NumDevices = 0; @@ -556,9 +547,6 @@ platform_impl::get_devices(info::device_type DeviceType) const { } bool platform_impl::has_extension(const std::string &ExtensionName) const { - if (is_host()) - return false; - std::string AllExtensionNames = get_platform_info_string_impl( MPlatform, getPlugin(), detail::PiInfoCode::value); @@ -580,9 +568,6 @@ pi_native_handle platform_impl::getNative() const { template typename Param::return_type platform_impl::get_info() const { - if (is_host()) - return get_platform_info_host(); - return get_platform_info(this->getHandleRef(), getPlugin()); } diff --git a/sycl/source/detail/platform_impl.hpp b/sycl/source/detail/platform_impl.hpp index 0bb8d1ab77e2f..e13bd0a3a1b31 100644 --- a/sycl/source/detail/platform_impl.hpp +++ b/sycl/source/detail/platform_impl.hpp @@ -89,9 +89,6 @@ class platform_impl { template typename Param::return_type get_backend_info() const; - /// \return true if this SYCL platform is a host platform. - bool is_host() const { return MHostPlatform; }; - /// Returns the backend of this platform. backend getBackend(void) const { return MBackend; } @@ -107,11 +104,6 @@ class platform_impl { /// \return an instance of OpenCL cl_platform_id. cl_platform_id get() const { - if (is_host()) { - throw invalid_object_error( - "This instance of platform doesn't support OpenCL interoperability.", - PI_ERROR_INVALID_PLATFORM); - } return pi::cast(MPlatform); } @@ -123,10 +115,6 @@ class platform_impl { /// /// \return a raw plug-in platform handle. 
const sycl::detail::pi::PiPlatform &getHandleRef() const { - if (is_host()) - throw invalid_object_error("This instance of platform is a host instance", - PI_ERROR_INVALID_PLATFORM); - return MPlatform; } diff --git a/sycl/source/detail/platform_info.hpp b/sycl/source/detail/platform_info.hpp index 42c41b5063cf5..70bcd626024d9 100644 --- a/sycl/source/detail/platform_info.hpp +++ b/sycl/source/detail/platform_info.hpp @@ -59,36 +59,6 @@ get_platform_info(sycl::detail::pi::PiPlatform Plt, const PluginPtr &Plugin) { return split_string(Result, ' '); } -// Host platform information methods -template -inline typename Param::return_type get_platform_info_host() = delete; - -template <> -inline std::string get_platform_info_host() { - return "FULL PROFILE"; -} - -template <> -inline std::string get_platform_info_host() { - return "1.2"; -} - -template <> inline std::string get_platform_info_host() { - return "SYCL host platform"; -} - -template <> -inline std::string get_platform_info_host() { - return ""; -} - -template <> -inline std::vector -get_platform_info_host() { - // TODO update when appropriate - return {}; -} - } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp index d65f3163b961f..584b2487f5dee 100644 --- a/sycl/source/detail/program_impl.cpp +++ b/sycl/source/detail/program_impl.cpp @@ -72,9 +72,8 @@ program_impl::program_impl( } MDevices = ProgramList[0]->MDevices; std::vector DevicesSorted; - if (!is_host()) { - DevicesSorted = sort_devices_by_cl_device_id(MDevices); - } + DevicesSorted = sort_devices_by_cl_device_id(MDevices); + check_device_feature_support(MDevices); std::list> Locks; for (const auto &Prg : ProgramList) { @@ -85,18 +84,16 @@ program_impl::program_impl( "Not all programs are associated with the same context", PI_ERROR_INVALID_PROGRAM); } - if (!is_host()) { - std::vector PrgDevicesSorted = - sort_devices_by_cl_device_id(Prg->MDevices); - if (PrgDevicesSorted != DevicesSorted) { - throw invalid_object_error( - "Not all programs are associated with the same devices", - PI_ERROR_INVALID_PROGRAM); - } + + std::vector PrgDevicesSorted = + sort_devices_by_cl_device_id(Prg->MDevices); + if (PrgDevicesSorted != DevicesSorted) { + throw invalid_object_error( + "Not all programs are associated with the same devices", + PI_ERROR_INVALID_PROGRAM); } } - if (!is_host()) { std::vector Devices(get_pi_devices()); std::vector Programs; bool NonInterOpToLink = false; @@ -113,7 +110,6 @@ program_impl::program_impl( LinkOptions.c_str(), Programs.size(), Programs.data(), nullptr, nullptr, &MProgram); Plugin->checkPiResult(Err); - } } program_impl::program_impl(ContextImplPtr Context, @@ -208,7 +204,7 @@ program_impl::program_impl(ContextImplPtr Context, program_impl::~program_impl() { // TODO catch an exception and put it to list of asynchronous exceptions - if (!is_host() && MProgram != nullptr) { + if (MProgram != nullptr) { const PluginPtr &Plugin = getPlugin(); Plugin->call(MProgram); } @@ -216,11 +212,6 @@ program_impl::~program_impl() { cl_program program_impl::get() const { throw_if_state_is(program_state::none); - if (is_host()) { - throw invalid_object_error( - "This instance of program doesn't support OpenCL interoperability.", - PI_ERROR_INVALID_PROGRAM); - } getPlugin()->call(MProgram); return pi::cast(MProgram); } @@ -229,19 +220,16 @@ void program_impl::compile_with_kernel_name(std::string KernelName, std::string CompileOptions) { std::lock_guard Lock(MMutex); 
throw_if_state_is_not(program_state::none); - if (!is_host()) { create_pi_program_with_kernel_name( KernelName, /*JITCompilationIsRequired=*/(!CompileOptions.empty())); compile(CompileOptions); - } MState = program_state::compiled; } void program_impl::link(std::string LinkOptions) { std::lock_guard Lock(MMutex); throw_if_state_is_not(program_state::compiled); - if (!is_host()) { check_device_feature_support(MDevices); std::vector Devices(get_pi_devices()); const PluginPtr &Plugin = getPlugin(); @@ -263,16 +251,12 @@ void program_impl::link(std::string LinkOptions) { Plugin->checkPiResult(Err); MLinkOptions = LinkOptions; MBuildOptions = LinkOptions; - } MState = program_state::linked; } bool program_impl::has_kernel(std::string KernelName, bool IsCreatedFromSource) const { throw_if_state_is(program_state::none); - if (is_host()) { - return !IsCreatedFromSource; - } std::vector Devices(get_pi_devices()); pi_uint64 function_ptr; @@ -299,14 +283,6 @@ kernel program_impl::get_kernel(std::string KernelName, std::shared_ptr PtrToSelf, bool IsCreatedFromSource) const { throw_if_state_is(program_state::none); - if (is_host()) { - if (IsCreatedFromSource) - throw invalid_object_error("This instance of program is a host instance", - PI_ERROR_INVALID_PROGRAM); - - return createSyclObjFromImpl( - std::make_shared(MContext, PtrToSelf)); - } auto [Kernel, ArgMask] = get_pi_kernel_arg_mask_pair(KernelName); return createSyclObjFromImpl(std::make_shared( Kernel, MContext, PtrToSelf, IsCreatedFromSource, nullptr, ArgMask)); @@ -314,8 +290,6 @@ kernel program_impl::get_kernel(std::string KernelName, std::vector> program_impl::get_binaries() const { throw_if_state_is(program_state::none); - if (is_host()) - return {}; std::vector> Result; const PluginPtr &Plugin = getPlugin(); diff --git a/sycl/source/detail/program_impl.hpp b/sycl/source/detail/program_impl.hpp index 32a0c7fd38bfe..1fa8767774961 100644 --- a/sycl/source/detail/program_impl.hpp +++ b/sycl/source/detail/program_impl.hpp @@ -134,9 +134,6 @@ class program_impl { /// not retained before return. const sycl::detail::pi::PiProgram &getHandleRef() const { return MProgram; } - /// \return true if this SYCL program is a host program. - bool is_host() const { return MContext->is_host(); } - /// Compiles the SYCL kernel function into the encapsulated raw program. /// /// The kernel function is defined by its name. This member function @@ -215,14 +212,11 @@ class program_impl { /// \return the SYCL context that this program was constructed with. context get_context() const { - if (is_host()) - return context(); return createSyclObjFromImpl(MContext); } /// \return the Plugin associated with the context of this program. 
const PluginPtr &getPlugin() const { - assert(!is_host() && "Plugin is not available for Host."); return MContext->getPlugin(); } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 05c579f78a405..2c7876ea14c08 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -42,10 +42,9 @@ getPIEvents(const std::vector &DepEvents) { template <> uint32_t queue_impl::get_info() const { sycl::detail::pi::PiResult result = PI_SUCCESS; - if (!is_host()) - getPlugin()->call( - MQueues[0], PI_QUEUE_INFO_REFERENCE_COUNT, sizeof(result), &result, - nullptr); + getPlugin()->call( + MQueues[0], PI_QUEUE_INFO_REFERENCE_COUNT, sizeof(result), &result, + nullptr); return result; } @@ -142,8 +141,7 @@ event queue_impl::memset(const std::shared_ptr &Self, SYCL_STREAM_NAME, "memory_transfer_node"); PrepareNotify.addMetadata([&](auto TEvent) { xpti::addMetadata(TEvent, "sycl_device", - reinterpret_cast( - MDevice->is_host() ? 0 : MDevice->getHandleRef())); + reinterpret_cast(MDevice->getHandleRef())); xpti::addMetadata(TEvent, "memory_ptr", reinterpret_cast(Ptr)); xpti::addMetadata(TEvent, "value_set", Value); xpti::addMetadata(TEvent, "memory_size", Count); @@ -190,8 +188,7 @@ event queue_impl::memcpy(const std::shared_ptr &Self, SYCL_STREAM_NAME, "memory_transfer_node"); PrepareNotify.addMetadata([&](auto TEvent) { xpti::addMetadata(TEvent, "sycl_device", - reinterpret_cast( - MDevice->is_host() ? 0 : MDevice->getHandleRef())); + reinterpret_cast(MDevice->getHandleRef())); xpti::addMetadata(TEvent, "src_memory_ptr", reinterpret_cast(Src)); xpti::addMetadata(TEvent, "dest_memory_ptr", reinterpret_cast(Dest)); @@ -430,9 +427,7 @@ void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc, if (WaitEvent) { device D = get_device(); std::string DevStr; - if (getSyclObjImpl(D)->is_host()) - DevStr = "HOST"; - else if (D.is_cpu()) + if (D.is_cpu()) DevStr = "CPU"; else if (D.is_gpu()) DevStr = "GPU"; @@ -588,14 +583,12 @@ bool queue_impl::ext_oneapi_empty() const { } // Check the status of the backend queue if this is not a host queue. - if (!is_host()) { - pi_bool IsReady = false; - getPlugin()->call( - MQueues[0], PI_EXT_ONEAPI_QUEUE_INFO_EMPTY, sizeof(pi_bool), &IsReady, - nullptr); - if (!IsReady) - return false; - } + pi_bool IsReady = false; + getPlugin()->call( + MQueues[0], PI_EXT_ONEAPI_QUEUE_INFO_EMPTY, sizeof(pi_bool), &IsReady, + nullptr); + if (!IsReady) + return false; // We may have events like host tasks which are not submitted to the backend // queue so we need to get their status separately. @@ -609,7 +602,7 @@ bool queue_impl::ext_oneapi_empty() const { EventImplWeakPtrIt != MEventsWeak.end(); ++EventImplWeakPtrIt) if (std::shared_ptr EventImplSharedPtr = EventImplWeakPtrIt->lock()) - if (EventImplSharedPtr->is_host() && + if (EventImplSharedPtr->isHost() && EventImplSharedPtr ->get_info() != info::event_command_status::complete) @@ -641,7 +634,7 @@ void queue_impl::revisitUnenqueuedCommandsState( std::remove_if( Deps.UnenqueuedCmdEvents.begin(), Deps.UnenqueuedCmdEvents.end(), [](const EventImplPtr &CommandEvent) { - return (CommandEvent->is_host() ? CommandEvent->isCompleted() + return (CommandEvent->isHost() ? 
CommandEvent->isCompleted() : CommandEvent->isEnqueued()); }), Deps.UnenqueuedCmdEvents.end()); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index dff24ad1dfec1..c205b5916f302 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -106,13 +106,12 @@ class queue_impl { queue_impl(const DeviceImplPtr &Device, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList) : MDevice(Device), MContext(Context), MAsyncHandler(AsyncHandler), - MPropList(PropList), MHostQueue(MDevice->is_host()), + MPropList(PropList), MIsInorder(has_property()), MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MSupportsDiscardingPiEvents(MDiscardEvents && - (MHostQueue ? true : MIsInorder)), + MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder)), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { if (has_property()) { @@ -124,8 +123,7 @@ class queue_impl { if (MDevice->has(aspect::queue_profiling)) { // When piGetDeviceAndHostTimer is not supported, compute the // profiling time OpenCL version < 2.1 case - if (!getDeviceImplPtr()->is_host() && - !getDeviceImplPtr()->isGetDeviceAndHostTimerSupported()) + if (!getDeviceImplPtr()->isGetDeviceAndHostTimerSupported()) MFallbackProfiling = true; } else { throw sycl::exception(make_error_code(errc::feature_not_supported), @@ -154,7 +152,7 @@ class queue_impl { "Cannot enable fusion if device does not support fusion"); } if (!Context->isDeviceValid(Device)) { - if (!Context->is_host() && Context->getBackend() == backend::opencl) + if (Context->getBackend() == backend::opencl) throw sycl::invalid_object_error( "Queue cannot be constructed with the given context and device " "since the device is not a member of the context (descendants of " @@ -166,13 +164,12 @@ class queue_impl { "descendant of its member.", PI_ERROR_INVALID_DEVICE); } - if (!MHostQueue) { - const QueueOrder QOrder = - MIsInorder ? QueueOrder::Ordered : QueueOrder::OOO; - MQueues.push_back(createQueue(QOrder)); - // This section is the second part of the instrumentation that uses the - // tracepoint information and notifies - } + + const QueueOrder QOrder = + MIsInorder ? QueueOrder::Ordered : QueueOrder::OOO; + MQueues.push_back(createQueue(QOrder)); + // This section is the second part of the instrumentation that uses the + // tracepoint information and notifies // We enable XPTI tracing events using the TLS mechanism; if the code // location data is available, then the tracing data will be rich. @@ -198,13 +195,11 @@ class queue_impl { MDevice->getDeviceName()); xpti::addMetadata( TEvent, "sycl_device", - reinterpret_cast( - MDevice->is_host() ? 0 : MDevice->getHandleRef())); + reinterpret_cast(MDevice->getHandleRef())); } xpti::addMetadata(TEvent, "is_inorder", MIsInorder); xpti::addMetadata(TEvent, "queue_id", MQueueID); - if (!MHostQueue) - xpti::addMetadata(TEvent, "queue_handle", + xpti::addMetadata(TEvent, "queue_handle", reinterpret_cast(getHandleRef())); }); // Also publish to TLS @@ -263,13 +258,11 @@ class queue_impl { MDevice->getDeviceName()); xpti::addMetadata( TEvent, "sycl_device", - reinterpret_cast( - MDevice->is_host() ? 
0 : MDevice->getHandleRef())); + reinterpret_cast(MDevice->getHandleRef())); } xpti::addMetadata(TEvent, "is_inorder", MIsInorder); xpti::addMetadata(TEvent, "queue_id", MQueueID); - if (!MHostQueue) - xpti::addMetadata(TEvent, "queue_handle", getHandleRef()); + xpti::addMetadata(TEvent, "queue_handle", getHandleRef()); }); // Also publish to TLS before notification xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID); @@ -287,13 +280,12 @@ class queue_impl { /// \param AsyncHandler is a SYCL asynchronous exception handler. queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler) - : MContext(Context), MAsyncHandler(AsyncHandler), MHostQueue(false), + : MContext(Context), MAsyncHandler(AsyncHandler), MIsInorder(has_property()), MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MSupportsDiscardingPiEvents(MDiscardEvents && - (MHostQueue ? true : MIsInorder)), + MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder)), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { queue_impl_interop(PiQueue); @@ -309,13 +301,11 @@ class queue_impl { queue_impl(sycl::detail::pi::PiQueue PiQueue, const ContextImplPtr &Context, const async_handler &AsyncHandler, const property_list &PropList) : MContext(Context), MAsyncHandler(AsyncHandler), MPropList(PropList), - MHostQueue(false), MIsInorder(has_property()), MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MSupportsDiscardingPiEvents(MDiscardEvents && - (MHostQueue ? true : MIsInorder)) { + MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder)) { queue_impl_interop(PiQueue); } @@ -336,19 +326,12 @@ class queue_impl { } #endif throw_asynchronous(); - if (!MHostQueue) { - cleanup_fusion_cmd(); - getPlugin()->call(MQueues[0]); - } + cleanup_fusion_cmd(); + getPlugin()->call(MQueues[0]); } /// \return an OpenCL interoperability queue handle. cl_command_queue get() { - if (MHostQueue) { - throw invalid_object_error( - "This instance of queue doesn't support OpenCL interoperability", - PI_ERROR_INVALID_QUEUE); - } getPlugin()->call(MQueues[0]); return pi::cast(MQueues[0]); } @@ -367,9 +350,6 @@ class queue_impl { /// \return an associated SYCL device. device get_device() const { return createSyclObjFromImpl(MDevice); } - /// \return true if this queue is a SYCL host queue. - bool is_host() const { return MHostQueue; } - /// \return true if this queue has discard_events support. bool supportsDiscardingPiEvents() const { return MSupportsDiscardingPiEvents; @@ -859,7 +839,7 @@ class queue_impl { "function objects should use the sycl::handler API instead."); } - handler Handler(Self, PrimaryQueue, SecondaryQueue, MHostQueue); + handler Handler(Self, PrimaryQueue, SecondaryQueue); Handler.saveCodeLoc(Loc); PreventSubmit = true; try { @@ -969,7 +949,6 @@ class queue_impl { /// Iterator through MQueues. size_t MNextQueueIdx = 0; - const bool MHostQueue = false; /// Indicates that a native out-of-order queue could not be created and we /// need to emulate it with multiple native in-order queues. 
bool MEmulateOOO = false; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 0739ac77373b7..d6c41f39e9942 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2246,7 +2246,7 @@ void SetArgBasedOnType( const PluginPtr &Plugin, sycl::detail::pi::PiKernel Kernel, const std::shared_ptr &DeviceImageImpl, const std::function &getMemAllocationFunc, - const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg, + const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex) { switch (Arg.MType) { case kernel_param_kind_t::kind_stream: @@ -2300,13 +2300,6 @@ void SetArgBasedOnType( break; } case kernel_param_kind_t::kind_specialization_constants_buffer: { - if (IsHost) { - throw sycl::exception( - sycl::make_error_code(sycl::errc::feature_not_supported), - "SYCL2020 specialization constants are not yet supported on host " - "device " + - codeToString(PI_ERROR_INVALID_OPERATION)); - } assert(DeviceImageImpl != nullptr); sycl::detail::pi::PiMem SpecConstsBuffer = DeviceImageImpl->get_spec_const_buffer_ref(); @@ -2343,7 +2336,7 @@ static pi_result SetKernelParamsAndLaunch( auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc, &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) { SetArgBasedOnType(Plugin, Kernel, DeviceImageImpl, getMemAllocationFunc, - Queue->get_context(), Queue->is_host(), Arg, + Queue->get_context(), Arg, NextTrueIndex); }; @@ -2940,8 +2933,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { NDRDescT &NDRDesc = ExecKernel->MNDRDesc; std::vector &Args = ExecKernel->MArgs; - if (MQueue->is_host() || (MQueue->getDeviceImplPtr()->getBackend() == - backend::ext_intel_esimd_emulator)) { + if (MQueue->getDeviceImplPtr()->getBackend() == + backend::ext_intel_esimd_emulator) { for (ArgDesc &Arg : Args) if (kernel_param_kind_t::kind_accessor == Arg.MType) { Requirement *Req = (Requirement *)(Arg.MPtr); @@ -2954,10 +2947,6 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { Plugin->call(RawEvents.size(), &RawEvents[0]); } - if (MQueue->is_host()) { - ExecKernel->MHostKernel->call(NDRDesc, - getEvent()->getHostProfilingInfo()); - } else { assert(MQueue->getDeviceImplPtr()->getBackend() == backend::ext_intel_esimd_emulator); if (MEvent != nullptr) @@ -2967,7 +2956,6 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { reinterpret_cast(ExecKernel->MHostKernel->getPtr()), NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], &NDRDesc.LocalSize[0], 0, nullptr, nullptr); - } return PI_SUCCESS; } diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 196232b95d734..d1b57182d78ff 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -678,7 +678,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq( static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) { if (const char *HUMConfig = SYCLConfig::get()) { if (std::strcmp(HUMConfig, "0") == 0) - return Ctx->is_host(); + return false; if (std::strcmp(HUMConfig, "1") == 0) return true; } From fa08c2b3314604af314406fb73bcaf33e669f04a Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 27 May 2024 02:12:53 -0700 Subject: [PATCH 09/52] non-buildable: remove is_host from obvious places Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/context_impl.hpp | 7 +---- sycl/source/detail/device_impl.cpp | 8 ++--- sycl/source/detail/usm/usm_impl.cpp | 47 
----------------------------- 3 files changed, 3 insertions(+), 59 deletions(-) diff --git a/sycl/source/detail/context_impl.hpp b/sycl/source/detail/context_impl.hpp index af20236fc4b23..203242ee40077 100644 --- a/sycl/source/detail/context_impl.hpp +++ b/sycl/source/detail/context_impl.hpp @@ -97,11 +97,6 @@ class context_impl { /// \return an instance of OpenCL cl_context. cl_context get() const; - /// Checks if this context is a host context. - /// - /// \return true if this context is a host context. - bool is_host() const; - /// Gets asynchronous exception handler. /// /// \return an instance of SYCL async_handler. @@ -182,7 +177,7 @@ class context_impl { // OpenCL does not support using descendants of context members within that // context yet. // TODO remove once this limitation is lifted - if (!is_host() && Device->getBackend() == backend::opencl) + if (Device->getBackend() == backend::opencl) return hasDevice(Device); while (!hasDevice(Device)) { diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 2e87300425c20..c677b9165d71f 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -327,8 +327,6 @@ bool device_impl::has(aspect Aspect) const { size_t return_size = 0; switch (Aspect) { - case aspect::host: - return is_host(); case aspect::cpu: return is_cpu(); case aspect::gpu: @@ -369,16 +367,14 @@ bool device_impl::has(aspect Aspect) const { case aspect::ext_intel_mem_channel: return get_info(); case aspect::usm_atomic_host_allocations: - return is_host() || - (get_device_info_impl:: get(MPlatform->getDeviceImpl(MDevice)) & PI_USM_CONCURRENT_ATOMIC_ACCESS); case aspect::usm_shared_allocations: return get_info(); case aspect::usm_atomic_shared_allocations: - return is_host() || - (get_device_info_impl:: get(MPlatform->getDeviceImpl(MDevice)) & PI_USM_CONCURRENT_ATOMIC_ACCESS); diff --git a/sycl/source/detail/usm/usm_impl.cpp b/sycl/source/detail/usm/usm_impl.cpp index ecf63bc63e427..753c27d5f678d 100755 --- a/sycl/source/detail/usm/usm_impl.cpp +++ b/sycl/source/detail/usm/usm_impl.cpp @@ -73,20 +73,6 @@ void *alignedAllocHost(size_t Alignment, size_t Size, const context &Ctxt, return nullptr; std::shared_ptr CtxImpl = detail::getSyclObjImpl(Ctxt); - if (CtxImpl->is_host()) { - if (!Alignment) { - // worst case default - Alignment = 128; - } - - aligned_allocator Alloc(Alignment); - try { - RetVal = Alloc.allocate(Size); - } catch (const std::bad_alloc &) { - // Conform with Specification behavior - RetVal = nullptr; - } - } else { pi_context C = CtxImpl->getHandleRef(); const PluginPtr &Plugin = CtxImpl->getPlugin(); pi_result Error = PI_ERROR_INVALID_VALUE; @@ -128,7 +114,6 @@ void *alignedAllocHost(size_t Alignment, size_t Size, const context &Ctxt, // The spec wants a nullptr returned, not an exception. 
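// Illustrative sketch (public API only, helper name is hypothetical): the
// same "return nullptr, do not throw" contract is what callers of the USM
// allocation entry points should expect.
#include <sycl/sycl.hpp>

void *tryHostAlloc(size_t Bytes, const sycl::context &Ctx) {
  void *Ptr = sycl::malloc_host(Bytes, Ctx); // may be nullptr on failure
  return Ptr;
}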
if (Error != PI_SUCCESS) return nullptr; - } #ifdef XPTI_ENABLE_INSTRUMENTATION xpti::addMetadata(PrepareNotify.traceEvent(), "memory_ptr", reinterpret_cast(RetVal)); @@ -154,24 +139,6 @@ void *alignedAllocInternal(size_t Alignment, size_t Size, if (Size == 0) return nullptr; - if (CtxImpl->is_host()) { - if (Kind == alloc::unknown) { - RetVal = nullptr; - } else { - if (!Alignment) { - // worst case default - Alignment = 128; - } - - aligned_allocator Alloc(Alignment); - try { - RetVal = Alloc.allocate(Size); - } catch (const std::bad_alloc &) { - // Conform with Specification behavior - RetVal = nullptr; - } - } - } else { pi_context C = CtxImpl->getHandleRef(); const PluginPtr &Plugin = CtxImpl->getPlugin(); pi_result Error = PI_ERROR_INVALID_VALUE; @@ -245,7 +212,6 @@ void *alignedAllocInternal(size_t Alignment, size_t Size, // The spec wants a nullptr returned, not an exception. if (Error != PI_SUCCESS) return nullptr; - } return RetVal; } @@ -284,14 +250,9 @@ void *alignedAlloc(size_t Alignment, size_t Size, const context &Ctxt, void freeInternal(void *Ptr, const context_impl *CtxImpl) { if (Ptr == nullptr) return; - if (CtxImpl->is_host()) { - // need to use alignedFree here for Windows - detail::OSUtil::alignedFree(Ptr); - } else { pi_context C = CtxImpl->getHandleRef(); const PluginPtr &Plugin = CtxImpl->getPlugin(); Plugin->call(C, Ptr); - } } void free(void *Ptr, const context &Ctxt, @@ -578,10 +539,6 @@ alloc get_pointer_type(const void *Ptr, const context &Ctxt) { std::shared_ptr CtxImpl = detail::getSyclObjImpl(Ctxt); - // Everything on a host device is just system malloc so call it host - if (CtxImpl->is_host()) - return alloc::host; - pi_context PICtx = CtxImpl->getHandleRef(); pi_usm_type AllocTy; @@ -631,10 +588,6 @@ device get_pointer_device(const void *Ptr, const context &Ctxt) { std::shared_ptr CtxImpl = detail::getSyclObjImpl(Ctxt); - // Just return the host device in the host context - if (CtxImpl->is_host()) - return Ctxt.get_devices()[0]; - // Check if ptr is a host allocation if (get_pointer_type(Ptr, Ctxt) == alloc::host) { auto Devs = CtxImpl->getDevices(); From d021de9af53da859390f6519730dd363b9b2d4bb Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 27 May 2024 06:03:56 -0700 Subject: [PATCH 10/52] not-buildable: remove is_host in simple places Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/buffer_impl.cpp | 3 --- sycl/source/detail/memory_manager.cpp | 27 +-------------------------- sycl/source/detail/memory_manager.hpp | 4 ---- sycl/source/detail/queue_impl.cpp | 11 ++++------- sycl/source/detail/sycl_mem_obj_t.cpp | 23 ++--------------------- 5 files changed, 7 insertions(+), 61 deletions(-) diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp index 835c732a40bf9..d7d77205b162c 100644 --- a/sycl/source/detail/buffer_impl.cpp +++ b/sycl/source/detail/buffer_impl.cpp @@ -25,9 +25,6 @@ void *buffer_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData, bool HostPtrReadOnly = false; BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly); - assert(!(nullptr == HostPtr && BaseT::useHostPtr() && Context->is_host()) && - "Internal error. 
Allocating memory on the host " - "while having use_host_ptr property"); return MemoryManager::allocateMemBuffer( std::move(Context), this, HostPtr, HostPtrReadOnly, BaseT::getSizeInBytes(), BaseT::MInteropEvent, BaseT::MInteropContext, diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 840f95ea7a643..f4e42363cb6e1 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -266,11 +266,6 @@ void MemoryManager::releaseMemObj(ContextImplPtr TargetContext, return; } - if (TargetContext->is_host()) { - MemObj->releaseHostMem(MemAllocation); - return; - } - const PluginPtr &Plugin = TargetContext->getPlugin(); memReleaseHelper(Plugin, pi::cast(MemAllocation)); } @@ -288,20 +283,6 @@ void *MemoryManager::allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, OutEvent); } -void *MemoryManager::allocateHostMemory(SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, size_t Size, - const sycl::property_list &) { - std::ignore = HostPtrReadOnly; - std::ignore = Size; - - // Can return user pointer directly if it is not a nullptr. - if (UserPtr) - return UserPtr; - - return MemObj->allocateHostMem(); - ; -} - void *MemoryManager::allocateInteropMemObject( ContextImplPtr TargetContext, void *UserPtr, const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, @@ -398,10 +379,7 @@ void *MemoryManager::allocateMemBuffer( const ContextImplPtr &InteropContext, const sycl::property_list &PropsList, sycl::detail::pi::PiEvent &OutEventToWait) { void *MemPtr; - if (TargetContext->is_host()) - MemPtr = - allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, PropsList); - else if (UserPtr && InteropContext) + if (UserPtr && InteropContext) MemPtr = allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); @@ -420,9 +398,6 @@ void *MemoryManager::allocateMemImage( const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, const sycl::property_list &PropsList, sycl::detail::pi::PiEvent &OutEventToWait) { - if (TargetContext->is_host()) - return allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, - PropsList); if (UserPtr && InteropContext) return allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index 1d2800bf9dadc..7be17898bc0d9 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -85,10 +85,6 @@ class __SYCL_EXPORT MemoryManager { static void releaseMemObj(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *MemAllocation, void *UserPtr); - static void *allocateHostMemory(SYCLMemObjI *MemObj, void *UserPtr, - bool HostPtrReadOnly, size_t Size, - const sycl::property_list &PropsList); - static void * allocateInteropMemObject(ContextImplPtr TargetContext, void *UserPtr, const EventImplPtr &InteropEvent, diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 2c7876ea14c08..bba423df61b60 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -283,12 +283,12 @@ void queue_impl::addEvent(const event &Event) { // if there is no command on the event, we cannot track it with MEventsWeak // as that will leave it with no owner. Track in MEventsShared only if we're // unable to call piQueueFinish during wait. 
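// Illustrative sketch (public API only, helper name is hypothetical): a host
// task is the typical producer of an event with no backend handle, which is
// why such events still need the queue-side tracking described above.
#include <sycl/sycl.hpp>

sycl::event submitHostWork(sycl::queue &Q) {
  return Q.submit([&](sycl::handler &CGH) {
    CGH.host_task([] { /* runs on a host thread; no backend event is created */ });
  });
}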
- if (is_host() || MEmulateOOO) + if (Event->isHost() || MEmulateOOO) addSharedEvent(Event); } // As long as the queue supports piQueueFinish we only need to store events // for unenqueued commands and host tasks. - else if (is_host() || MEmulateOOO || EImpl->getHandleRef() == nullptr) { + else if (Event->isHost() || MEmulateOOO || EImpl->getHandleRef() == nullptr) { std::weak_ptr EventWeakPtr{EImpl}; std::lock_guard Lock{MMutex}; MEventsWeak.push_back(std::move(EventWeakPtr)); @@ -299,7 +299,7 @@ void queue_impl::addEvent(const event &Event) { /// but some events have no other owner. In this case, /// addSharedEvent will have the queue track the events via a shared pointer. void queue_impl::addSharedEvent(const event &Event) { - assert(is_host() || MEmulateOOO); + assert(MEmulateOOO); std::lock_guard Lock(MMutex); // Events stored in MEventsShared are not released anywhere else aside from // calls to queue::wait/wait_and_throw, which a user application might not @@ -369,9 +369,6 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, MemOpFunc(MemOpArgs..., getPIEvents(ExpandedDepEvents), &EventImpl->getHandleRef(), EventImpl); - if (MContext->is_host()) - return MDiscardEvents ? createDiscardedEvent() : event(); - if (isInOrder()) { auto &EventToStoreIn = MGraph.expired() ? MDefaultGraphDeps.LastEventPtr : MExtGraphDeps.LastEventPtr; @@ -520,7 +517,7 @@ void queue_impl::wait(const detail::code_location &CodeLoc) { // directly. Otherwise, only wait for unenqueued or host task events, starting // from the latest submitted task in order to minimize total amount of calls, // then handle the rest with piQueueFinish. - const bool SupportsPiFinish = !is_host() && !MEmulateOOO; + const bool SupportsPiFinish = !MEmulateOOO; for (auto EventImplWeakPtrIt = WeakEvents.rbegin(); EventImplWeakPtrIt != WeakEvents.rend(); ++EventImplWeakPtrIt) { if (std::shared_ptr EventImplSharedPtr = diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index bb4c5f4e1441d..87f005fe8ca78 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -33,12 +33,6 @@ SYCLMemObjT::SYCLMemObjT(pi_native_handle MemObject, const context &SyclContext, MUserPtr(nullptr), MShadowCopy(nullptr), MUploadDataFunctor(nullptr), MSharedPtrStorage(nullptr), MHostPtrProvided(true), MOwnNativeHandle(OwnNativeHandle) { - if (MInteropContext->is_host()) - throw sycl::invalid_parameter_error( - "Creation of interoperability memory object using host context is " - "not allowed", - PI_ERROR_INVALID_CONTEXT); - sycl::detail::pi::PiContext Context = nullptr; const PluginPtr &Plugin = getPlugin(); @@ -84,12 +78,6 @@ SYCLMemObjT::SYCLMemObjT(pi_native_handle MemObject, const context &SyclContext, MUserPtr(nullptr), MShadowCopy(nullptr), MUploadDataFunctor(nullptr), MSharedPtrStorage(nullptr), MHostPtrProvided(true), MOwnNativeHandle(OwnNativeHandle) { - if (MInteropContext->is_host()) - throw sycl::invalid_parameter_error( - "Creation of interoperability memory object using host context is " - "not allowed", - PI_ERROR_INVALID_CONTEXT); - sycl::detail::pi::PiContext Context = nullptr; const PluginPtr &Plugin = getPlugin(); @@ -191,19 +179,12 @@ void SYCLMemObjT::determineHostPtr(const ContextImplPtr &Context, // The data for the allocation can be provided via either the user pointer // (InitFromUserData, can be read-only) or a runtime-allocated read-write // HostPtr. We can have one of these scenarios: - // 1. The allocation is the first one and on host. 
InitFromUserData == true. - // 2. The allocation is the first one and isn't on host. InitFromUserData + // 1. The allocation is the first one and isn't on host. InitFromUserData // varies based on unified host memory support and whether or not the data can // be discarded. - // 3. The allocation is not the first one and is on host. InitFromUserData == - // false, HostPtr == nullptr. This can only happen if the allocation command - // is not linked since it would be a no-op otherwise. Attempt to reuse the - // user pointer if it's read-write, but do not copy its contents if it's not. - // 4. The allocation is not the first one and not on host. InitFromUserData == + // 2. The allocation is not the first one and not on host. InitFromUserData == // false, HostPtr is provided if the command is linked. The host pointer is // guaranteed to be reused in this case. - if (Context->is_host() && !MOpenCLInterop && !MHostPtrReadOnly) - InitFromUserData = true; if (InitFromUserData) { assert(!HostPtr && "Cannot init from user data and reuse host ptr provided " From 5b60b90c37d2bc388272eaed40f375403a148e80 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Tue, 28 May 2024 04:26:44 -0700 Subject: [PATCH 11/52] draft Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/memory_manager.cpp | 27 ----------- sycl/source/detail/scheduler/commands.cpp | 21 ++++----- .../source/detail/scheduler/graph_builder.cpp | 46 +++++++++---------- sycl/source/detail/scheduler/scheduler.hpp | 32 +++++++++---- 4 files changed, 55 insertions(+), 71 deletions(-) diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index f4e42363cb6e1..792c1c57bd3f1 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -921,9 +921,6 @@ void MemoryManager::copy_usm(const void *SrcMem, QueueImplPtr SrcQueue, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!SrcQueue->getContextImplPtr()->is_host() && - "Host queue not supported in fill_usm."); - if (!Len) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { if (OutEventImpl != nullptr) @@ -962,9 +959,6 @@ void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in fill_usm."); - if (!Length) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { if (OutEventImpl != nullptr) @@ -1000,9 +994,6 @@ void MemoryManager::prefetch_usm( std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in prefetch_usm."); - const PluginPtr &Plugin = Queue->getPlugin(); if (OutEventImpl != nullptr) OutEventImpl->setHostEnqueueTime(); @@ -1024,9 +1015,6 @@ void MemoryManager::advise_usm( std::vector /*DepEvents*/, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in advise_usm."); - const PluginPtr &Plugin = Queue->getPlugin(); if (OutEventImpl != nullptr) OutEventImpl->setHostEnqueueTime(); @@ -1049,9 +1037,6 @@ void MemoryManager::copy_2d_usm( std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - 
assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in copy_2d_usm."); - if (Width == 0 || Height == 0) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -1137,9 +1122,6 @@ void MemoryManager::fill_2d_usm( std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in fill_2d_usm."); - if (Width == 0 || Height == 0) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -1177,9 +1159,6 @@ void MemoryManager::memset_2d_usm( char Value, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in fill_2d_usm."); - if (Width == 0 || Height == 0) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -1714,8 +1693,6 @@ void MemoryManager::ext_oneapi_prefetch_usm_cmd_buffer( sycl::detail::pi::PiExtCommandBuffer CommandBuffer, void *Mem, size_t Length, std::vector Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint) { - assert(!Context->is_host() && "Host queue not supported in prefetch_usm."); - const PluginPtr &Plugin = Context->getPlugin(); Plugin->call( CommandBuffer, Mem, Length, _pi_usm_migration_flags(0), Deps.size(), @@ -1728,8 +1705,6 @@ void MemoryManager::ext_oneapi_advise_usm_cmd_buffer( size_t Length, pi_mem_advice Advice, std::vector Deps, sycl::detail::pi::PiExtSyncPoint *OutSyncPoint) { - assert(!Context->is_host() && "Host queue not supported in advise_usm."); - const PluginPtr &Plugin = Context->getPlugin(); Plugin->call( CommandBuffer, Mem, Length, Advice, Deps.size(), Deps.data(), @@ -1748,8 +1723,6 @@ void MemoryManager::copy_image_bindless( const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { - assert(!Queue->getContextImplPtr()->is_host() && - "Host queue not supported in copy_image_bindless."); assert((Flags == (sycl::detail::pi::PiImageCopyFlags) ext::oneapi::experimental::image_copy_flags::HtoD || Flags == (sycl::detail::pi::PiImageCopyFlags) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index d6c41f39e9942..0a25d7b3ee6c1 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -671,12 +671,9 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, const QueueImplPtr &WorkerQueue = getWorkerQueue(); const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr(); - // 1. Async work is not supported for host device. - // 2. Non-host events can be ignored if they are not fully initialized. - // 3. Some types of commands do not produce PI events after they are - // enqueued - // (e.g. alloca). Note that we can't check the pi event to make that - // distinction since the command might still be unenqueued at this point. + // 1. Non-host events can be ignored if they are not fully initialized. + // 2. Some types of commands do not produce PI events after they are + // enqueued (e.g. alloca). Note that we can't check the pi event to make that distinction since the command might still be unenqueued at this point. 
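// Illustrative example of the two rules above (sketch only, not part of the patch): a
// host-task dependency that has not been enqueued yet fails rule 1, and an ALLOCA
// dependency fails rule 2; in either case no PI event can be expected from the
// dependency, so it cannot simply be passed along in a PI wait list.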
bool PiEventExpected = (!DepEvent->isHost() && DepEvent->isInitialized()); if (auto *DepCmd = static_cast(DepEvent->getCommand())) PiEventExpected &= DepCmd->producesPiEvent(); @@ -692,11 +689,13 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, ContextImplPtr DepEventContext = DepEvent->getContextImpl(); // If contexts don't match we'll connect them using host task - if (DepEventContext != WorkerContext && !WorkerContext->is_host()) { + if (DepEventContext == WorkerContext) + MPreparedDepsEvents.push_back(std::move(DepEvent)); + else + { Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder; ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp); - } else - MPreparedDepsEvents.push_back(std::move(DepEvent)); + } return ConnectionCmd; } @@ -3106,10 +3105,6 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::Barrier: { - if (MQueue->getDeviceImplPtr()->is_host()) { - // NOP for host device. - return PI_SUCCESS; - } const PluginPtr &Plugin = MQueue->getPlugin(); if (MEvent != nullptr) MEvent->setHostEnqueueTime(); diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index d1b57182d78ff..bbb6d8de12f98 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -243,7 +243,7 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord( getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr, ToEnqueue); } else - MemObject->MRecord.reset(new MemObjRecord{Queue->getContextImplPtr(), + MemObject->MRecord.reset(new MemObjRecord{Queue ? Queue->getContextImplPtr() : nullptr, LeafLimit, AllocateDependency}); MMemObjs.push_back(MemObject); @@ -317,7 +317,7 @@ static Command *insertMapUnmapForLinkedCmds(AllocaCommandBase *AllocaCmdSrc, assert(AllocaCmdSrc->MIsActive && "Expected source alloca command to be active"); - if (AllocaCmdSrc->getQueue()->is_host()) { + if (!AllocaCmdSrc->getQueue()) { UnMapMemObject *UnMapCmd = new UnMapMemObject( AllocaCmdDst, *AllocaCmdDst->getRequirement(), &AllocaCmdSrc->MMemAllocation, AllocaCmdDst->getQueue()); @@ -427,7 +427,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( Command *Scheduler::GraphBuilder::remapMemoryObject( MemObjRecord *Record, Requirement *Req, AllocaCommandBase *HostAllocaCmd, std::vector &ToEnqueue) { - assert(HostAllocaCmd->getQueue()->is_host() && + assert(!HostAllocaCmd->getQueue() && "Host alloca command expected"); assert(HostAllocaCmd->MIsActive && "Active alloca command expected"); @@ -525,16 +525,14 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, auto SYCLMemObj = static_cast(Req->MSYCLMemObj); SYCLMemObj->handleWriteAccessorCreation(); } - - const QueueImplPtr &HostQueue = getInstance().getDefaultHostQueue(); - - MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue); + // Host accessor is not attached to any queue so no QueueImplPtr object to be sent to getOrInsertMemObjRecord. 
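// Sketch of the convention adopted here (illustrative only): a null QueueImplPtr now
// stands for "host" wherever a host queue object used to be passed, and it translates
// into a null context inside the record, mirroring the GetContext helper added later in
// this series:
//   ContextImplPtr Ctx = Queue ? Queue->getContextImplPtr() : nullptr;
//   // Ctx == nullptr means the latest copy of the data lives on the host.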
+ MemObjRecord *Record = getOrInsertMemObjRecord(nullptr, Req, ToEnqueue); if (MPrintOptionsArray[BeforeAddHostAcc]) printGraphAsDot("before_addHostAccessor"); markModifiedIfWrite(Record, Req); AllocaCommandBase *HostAllocaCmd = - getOrCreateAllocaForReq(Record, Req, HostQueue, ToEnqueue); + getOrCreateAllocaForReq(Record, Req, nullptr, ToEnqueue); if (sameCtx(HostAllocaCmd->getQueue()->getContextImplPtr(), Record->MCurContext)) { @@ -682,6 +680,10 @@ static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) { if (std::strcmp(HUMConfig, "1") == 0) return true; } + // host task & host accessor is covered with no device context but provide required support. + if (Ctx == nullptr) + return true; + for (const device &Device : Ctx->getDevices()) { if (!Device.get_info()) return false; @@ -696,9 +698,9 @@ static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) { AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( MemObjRecord *Record, const Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { - + auto Context = Queue != nullptr ? Queue->getContextImplPtr() : nullptr; AllocaCommandBase *AllocaCmd = findAllocaForReq( - Record, Req, Queue->getContextImplPtr(), /*AllowConst=*/false); + Record, Req, Context, /*AllowConst=*/false); if (!AllocaCmd) { std::vector ToCleanUp; @@ -729,7 +731,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // the user pointer is read-only is still not handled: it leads to // unnecessary copy on devices with unified host memory support. const bool HostUnifiedMemory = - checkHostUnifiedMemory(Queue->getContextImplPtr()); + checkHostUnifiedMemory(Context); SYCLMemObjI *MemObj = Req->MSYCLMemObj; const bool InitFromUserData = Record->MAllocaCommands.empty() && (HostUnifiedMemory || MemObj->isInterop()); @@ -745,16 +747,14 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // There's no need to make a host allocation if the buffer is not // initialized with user data. if (MemObj->hasUserDataPtr()) { - QueueImplPtr DefaultHostQueue = - Scheduler::getInstance().getDefaultHostQueue(); AllocaCommand *HostAllocaCmd = new AllocaCommand( - DefaultHostQueue, FullReq, true /* InitFromUserData */, + nullptr, FullReq, true /* InitFromUserData */, nullptr /* LinkedAllocaCmd */, MemObj->isHostPointerReadOnly() /* IsConst */); Record->MAllocaCommands.push_back(HostAllocaCmd); Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue); ++(HostAllocaCmd->MLeafCounter); - Record->MCurContext = DefaultHostQueue->getContextImplPtr(); + Record->usedOnHost(); } } } else { @@ -766,7 +766,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // new one. There could be situations when we could setup link with // "not" current allocation, but it will require memory copy. // Can setup link between cl and host allocations only - if (Queue->is_host() != Record->MCurContext->is_host()) { + if ((Context != nullptr) + (Record->MCurContext != nullptr) == 1) { // Linked commands assume that the host allocation is reused by the // plugin runtime and that can lead to unnecessary copy overhead on // devices that do not support host unified memory. Do not link the @@ -778,7 +778,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( bool PinnedHostMemory = MemObj->usesPinnedHostMemory(); bool HostUnifiedMemoryOnNonHostDevice = - Queue->is_host() ? checkHostUnifiedMemory(Record->MCurContext) + Queue == nullptr ? 
checkHostUnifiedMemory(Record->MCurContext) : HostUnifiedMemory; if (PinnedHostMemory || HostUnifiedMemoryOnNonHostDevice) { AllocaCommandBase *LinkedAllocaCmdCand = findAllocaForReq( @@ -818,14 +818,14 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // construction, host allocation doesn't. So, device allocation should // always be active here. Also if the "follower" command is a device one // we have to change current context to the device one. - if (Queue->is_host()) { + if (Queue == nullptr) { AllocaCmd->MIsActive = false; } else { LinkedAllocaCmd->MIsActive = false; Record->MCurContext = Queue->getContextImplPtr(); std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + findDepsForReq(Record, Req, Context); for (Command *Dep : Deps) { Command *ConnCmd = AllocaCmd->addDep( DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp); @@ -1071,7 +1071,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( if (isSameCtx) { // If the memory is already in the required host context, check if the // required access mode is valid, remap if not. - if (Record->MCurContext->is_host() && + if (!Record->MCurContext && !isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) { remapMemoryObject(Record, Req, Req->MIsSubBuffer @@ -1093,7 +1093,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( NeedMemMoveToHost = true; MemMoveTargetQueue = HT.MQueue; } - } else if (!Queue->is_host() && !Record->MCurContext->is_host()) + } else if (Queue && Record->MCurContext) NeedMemMoveToHost = true; if (NeedMemMoveToHost) @@ -1714,12 +1714,12 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( bool NeedMemMoveToHost = false; auto MemMoveTargetQueue = Queue; - if (!Queue->is_host() && !Record->MCurContext->is_host()) + if (Queue && Record->MCurContext) NeedMemMoveToHost = true; if (NeedMemMoveToHost) insertMemoryMove(Record, Req, - Scheduler::getInstance().getDefaultHostQueue(), + nullptr, ToEnqueue); insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); } diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 6fa95cb4a4a54..bcb930bc8194a 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -199,12 +199,12 @@ using FusionMap = std::unordered_map; /// There must be a single MemObjRecord for each SYCL memory object. /// /// \ingroup sycl_graph -struct MemObjRecord { +class MemObjRecord { MemObjRecord(ContextImplPtr Ctx, std::size_t LeafLimit, LeavesCollection::AllocateDependencyF AllocateDependency) : MReadLeaves{this, LeafLimit, AllocateDependency}, - MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx} {} - + MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx}, MCurHostAccess{ MCurContext == nullptr } {} +public: // Contains all allocation commands for the memory object. std::vector MAllocaCommands; @@ -214,16 +214,32 @@ struct MemObjRecord { // Contains latest write commands working with memory object. LeavesCollection MWriteLeaves; + // The flag indicates that the content of the memory object was/will be + // modified. Used while deciding if copy back needed. + bool MMemModified = false; + + void usedOnDevice(ContextImplPtr& NewContext) + { + MCurContext = NewContext; + MCurHostAccess = false; + } + + void usedOnHost() + { + MCurContext = nullptr; + MCurHostAccess = true; + } + + bool usedOnHost() { return MCurHostAccess; } +protected: // The context which has the latest state of the memory object. 
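// With the host device removed, a null MCurContext is how the record says "the latest
// state is on the host"; usedOnHost()/usedOnDevice() above keep MCurHostAccess in sync
// with that convention.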
ContextImplPtr MCurContext; - // The mode this object can be accessed with from the host context. - // Valid only if the current context is host. + // The mode this object can be accessed with from the host (host_accessor). + // Valid only if the current usage is on host. access::mode MHostAccess = access::mode::read_write; - // The flag indicates that the content of the memory object was/will be - // modified. Used while deciding if copy back needed. - bool MMemModified = false; + bool MCurHostAccess = false; }; /// DPC++ graph scheduler class. From 21ed380f362dd560342f75f94a58b84da50edd9c Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 29 May 2024 05:58:36 -0700 Subject: [PATCH 12/52] non-buildable: eliminate getDefaultHostQueue usage Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/memory_manager.cpp | 14 +-- sycl/source/detail/scheduler/commands.cpp | 6 +- .../source/detail/scheduler/graph_builder.cpp | 96 +++++++++---------- sycl/source/detail/scheduler/scheduler.cpp | 4 +- sycl/source/detail/scheduler/scheduler.hpp | 18 ++-- 5 files changed, 65 insertions(+), 73 deletions(-) diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 792c1c57bd3f1..3c0ad08e0763f 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -750,23 +750,23 @@ void MemoryManager::copy(SYCLMemObjI *SYCLMemObj, void *SrcMem, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl) { - if (SrcQueue->is_host()) { - if (TgtQueue->is_host()) - copyH2H(SYCLMemObj, (char *)SrcMem, std::move(SrcQueue), DimSrc, SrcSize, + if (!SrcQueue) { + if (!TgtQueue) + copyH2H(SYCLMemObj, (char *)SrcMem, nullptr, DimSrc, SrcSize, SrcAccessRange, SrcOffset, SrcElemSize, (char *)DstMem, - std::move(TgtQueue), DimDst, DstSize, DstAccessRange, DstOffset, + nullptr, DimDst, DstSize, DstAccessRange, DstOffset, DstElemSize, std::move(DepEvents), OutEvent, OutEventImpl); else - copyH2D(SYCLMemObj, (char *)SrcMem, std::move(SrcQueue), DimSrc, SrcSize, + copyH2D(SYCLMemObj, (char *)SrcMem, nullptr, DimSrc, SrcSize, SrcAccessRange, SrcOffset, SrcElemSize, pi::cast(DstMem), std::move(TgtQueue), DimDst, DstSize, DstAccessRange, DstOffset, DstElemSize, std::move(DepEvents), OutEvent, OutEventImpl); } else { - if (TgtQueue->is_host()) + if (!TgtQueue) copyD2H(SYCLMemObj, pi::cast(SrcMem), std::move(SrcQueue), DimSrc, SrcSize, SrcAccessRange, SrcOffset, - SrcElemSize, (char *)DstMem, std::move(TgtQueue), DimDst, DstSize, + SrcElemSize, (char *)DstMem, nullptr, DimDst, DstSize, DstAccessRange, DstOffset, DstElemSize, std::move(DepEvents), OutEvent, OutEventImpl); else diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 0a25d7b3ee6c1..f0e3471a0f6f6 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2872,7 +2872,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset, Req->MElemSize, Copy->getDst(), - Scheduler::getInstance().getDefaultHostQueue(), Req->MDims, + nullptr, Req->MDims, Req->MAccessRange, Req->MAccessRange, /*DstOffset=*/{0, 0, 0}, Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent); @@ -2883,11 +2883,9 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { Requirement *Req = (Requirement *)(Copy->getDst()); AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); - 
Scheduler::getInstance().getDefaultHostQueue(); - MemoryManager::copy( AllocaCmd->getSYCLMemObj(), Copy->getSrc(), - Scheduler::getInstance().getDefaultHostQueue(), Req->MDims, + nullptr, Req->MDims, Req->MAccessRange, Req->MAccessRange, /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(), MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset, diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index bbb6d8de12f98..6c9244f9ecb2c 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -49,15 +49,16 @@ static bool doOverlap(const Requirement *LHS, const Requirement *RHS) { LHS->MOffsetInBytes); } -static bool sameCtx(const ContextImplPtr &LHS, const ContextImplPtr &RHS) { - return LHS == RHS; -} - /// Checks if current requirement is requirement for sub buffer. static bool IsSuitableSubReq(const Requirement *Req) { return Req->MIsSubBuffer; } +static ContextImplPtr GetContext(const QueueImplPtr& Queue) +{ + return Queue ? Queue->getContextImplPtr() : nullptr; +} + /// Checks if the required access mode is allowed under the current one. static bool isAccessModeAllowed(access::mode Required, access::mode Current) { switch (Current) { @@ -243,7 +244,7 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord( getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr, ToEnqueue); } else - MemObject->MRecord.reset(new MemObjRecord{Queue ? Queue->getContextImplPtr() : nullptr, + MemObject->MRecord.reset(new MemObjRecord{GetContext(Queue), LeafLimit, AllocateDependency}); MMemObjs.push_back(MemObject); @@ -282,8 +283,9 @@ void Scheduler::GraphBuilder::addNodeToLeaves( UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { + auto Context = GetContext(Queue); AllocaCommandBase *AllocaCmd = - findAllocaForReq(Record, Req, Queue->getContextImplPtr()); + findAllocaForReq(Record, Req, Context); assert(AllocaCmd && "There must be alloca for requirement!"); UpdateHostRequirementCommand *UpdateCommand = new UpdateHostRequirementCommand(Queue, *Req, AllocaCmd, &Req->MData); @@ -292,7 +294,7 @@ UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( const Requirement *StoredReq = UpdateCommand->getRequirement(); std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + findDepsForReq(Record, Req, Context); std::vector ToCleanUp; for (Command *Dep : Deps) { Command *ConnCmd = @@ -345,8 +347,9 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( if (!AllocaCmdDst) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); + auto Context = GetContext(Queue); std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + findDepsForReq(Record, Req, Context); Deps.insert(AllocaCmdDst); // Get parent allocation of sub buffer to perform full copy of whole buffer if (IsSuitableSubReq(Req)) { @@ -362,8 +365,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( // current context, need to find a parent alloca command for it (it must be // there) auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) { - bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), - Record->MCurContext) && + bool Res = Record->isSameContext(AllocaCmd->getQueue()) && // Looking for a parent buffer alloca command AllocaCmd->getType() == Command::CommandType::ALLOCA; 
return Res; @@ -398,7 +400,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( if ((Req->MAccessMode == access::mode::discard_write) || (Req->MAccessMode == access::mode::discard_read_write)) { - Record->MCurContext = Queue->getContextImplPtr(); + Record->updateUsage(Context); return nullptr; } else { // Full copy of buffer is needed to avoid loss of data that may be caused @@ -420,7 +422,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( addNodeToLeaves(Record, NewCmd, access::mode::read_write, ToEnqueue); for (Command *Cmd : ToCleanUp) cleanupCommand(Cmd); - Record->MCurContext = Queue->getContextImplPtr(); + Record->updateUsage(Context); return NewCmd; } @@ -474,7 +476,6 @@ Command *Scheduler::GraphBuilder::remapMemoryObject( Command * Scheduler::GraphBuilder::addCopyBack(Requirement *Req, std::vector &ToEnqueue) { - QueueImplPtr HostQueue = Scheduler::getInstance().getDefaultHostQueue(); SYCLMemObjI *MemObj = Req->MSYCLMemObj; MemObjRecord *Record = getMemObjRecord(MemObj); if (Record && MPrintOptionsArray[BeforeAddCopyBack]) @@ -485,13 +486,13 @@ Scheduler::GraphBuilder::addCopyBack(Requirement *Req, return nullptr; std::set Deps = - findDepsForReq(Record, Req, HostQueue->getContextImplPtr()); + findDepsForReq(Record, Req, nullptr); AllocaCommandBase *SrcAllocaCmd = findAllocaForReq(Record, Req, Record->MCurContext); auto MemCpyCmdUniquePtr = std::make_unique( *SrcAllocaCmd->getRequirement(), SrcAllocaCmd, *Req, &Req->MData, - SrcAllocaCmd->getQueue(), std::move(HostQueue)); + SrcAllocaCmd->getQueue(), nullptr); if (!MemCpyCmdUniquePtr) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); @@ -534,8 +535,7 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, AllocaCommandBase *HostAllocaCmd = getOrCreateAllocaForReq(Record, Req, nullptr, ToEnqueue); - if (sameCtx(HostAllocaCmd->getQueue()->getContextImplPtr(), - Record->MCurContext)) { + if (Record->isSameContext(HostAllocaCmd->getQueue())) { if (!isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) { remapMemoryObject(Record, Req, Req->MIsSubBuffer ? 
(static_cast( @@ -545,15 +545,14 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, ToEnqueue); } } else - insertMemoryMove(Record, Req, HostQueue, ToEnqueue); + insertMemoryMove(Record, Req, nullptr, ToEnqueue); Command *UpdateHostAccCmd = - insertUpdateHostReqCmd(Record, Req, HostQueue, ToEnqueue); + insertUpdateHostReqCmd(Record, Req, nullptr, ToEnqueue); // Need empty command to be blocked until host accessor is destructed EmptyCommand *EmptyCmd = - addEmptyCmd(UpdateHostAccCmd, {Req}, HostQueue, - Command::BlockReason::HostAccessor, ToEnqueue); + addEmptyCmd(UpdateHostAccCmd, {Req}, Command::BlockReason::HostAccessor, ToEnqueue); Req->MBlockedCmd = EmptyCmd; @@ -564,14 +563,14 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, } Command *Scheduler::GraphBuilder::addCGUpdateHost( - std::unique_ptr CommandGroup, const QueueImplPtr &HostQueue, + std::unique_ptr CommandGroup, std::vector &ToEnqueue) { auto UpdateHost = static_cast(CommandGroup.get()); Requirement *Req = UpdateHost->getReqToUpdate(); - MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req, ToEnqueue); - return insertMemoryMove(Record, Req, HostQueue, ToEnqueue); + MemObjRecord *Record = getOrInsertMemObjRecord(nullptr, Req, ToEnqueue); + return insertMemoryMove(Record, Req, nullptr, ToEnqueue); } /// Start the search for the record from list of "leaf" commands and check if @@ -618,8 +617,10 @@ Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, // Going through copying memory between contexts is not supported. if (Dep.MDepCommand) - CanBypassDep &= - sameCtx(Context, Dep.MDepCommand->getQueue()->getContextImplPtr()); + { + auto DepQueue = Dep.MDepCommand->getQueue(); + CanBypassDep &= IsOnSameContext(Context, DepQueue); + } if (!CanBypassDep) { RetDeps.insert(DepCmd); @@ -658,7 +659,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq( bool AllowConst) { auto IsSuitableAlloca = [&Context, Req, AllowConst](AllocaCommandBase *AllocaCmd) { - bool Res = sameCtx(AllocaCmd->getQueue()->getContextImplPtr(), Context); + bool Res = IsOnSameContext(Context, AllocaCmd->getQueue()); if (IsSuitableSubReq(Req)) { const Requirement *TmpReq = AllocaCmd->getRequirement(); Res &= AllocaCmd->getType() == Command::CommandType::ALLOCA_SUB_BUF; @@ -698,7 +699,7 @@ static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) { AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( MemObjRecord *Record, const Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { - auto Context = Queue != nullptr ? Queue->getContextImplPtr() : nullptr; + auto Context = GetContext(Queue); AllocaCommandBase *AllocaCmd = findAllocaForReq( Record, Req, Context, /*AllowConst=*/false); @@ -754,7 +755,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( Record->MAllocaCommands.push_back(HostAllocaCmd); Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue); ++(HostAllocaCmd->MLeafCounter); - Record->usedOnHost(); + Record->updateUsage(nullptr); } } } else { @@ -766,7 +767,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // new one. There could be situations when we could setup link with // "not" current allocation, but it will require memory copy. 
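// The linkage check a few lines below is an exclusive-or written arithmetically: a link
// is only created when exactly one side is a real device context and the other is the
// host (null). An equivalent, more explicit spelling would be (sketch, not part of the
// change):
//   bool LinkHostAndDevice = (Context != nullptr) != Record->usedOnDevice();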
// Can setup link between cl and host allocations only - if ((Context != nullptr) + (Record->MCurContext != nullptr) == 1) { + if ((Context != nullptr) + (Record->usedOnDevice()) == 1) { // Linked commands assume that the host allocation is reused by the // plugin runtime and that can lead to unnecessary copy overhead on // devices that do not support host unified memory. Do not link the @@ -822,7 +823,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( AllocaCmd->MIsActive = false; } else { LinkedAllocaCmd->MIsActive = false; - Record->MCurContext = Queue->getContextImplPtr(); + Record->updateUsage(Context); std::set Deps = findDepsForReq(Record, Req, Context); @@ -865,10 +866,9 @@ void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record, EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd( Command *Cmd, const std::vector &Reqs, - const QueueImplPtr &Queue, Command::BlockReason Reason, + Command::BlockReason Reason, std::vector &ToEnqueue, const bool AddDepsToLeaves) { - EmptyCommand *EmptyCmd = - new EmptyCommand(Scheduler::getInstance().getDefaultHostQueue()); + EmptyCommand *EmptyCmd = new EmptyCommand(); if (!EmptyCmd) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); @@ -878,9 +878,9 @@ EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd( EmptyCmd->MBlockReason = Reason; for (Requirement *Req : Reqs) { - MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req, ToEnqueue); + MemObjRecord *Record = getOrInsertMemObjRecord(nullptr, Req, ToEnqueue); AllocaCommandBase *AllocaCmd = - getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue); + getOrCreateAllocaForReq(Record, Req, nullptr, ToEnqueue); EmptyCmd->addRequirement(Cmd, AllocaCmd, Req); } // addRequirement above call addDep that already will add EmptyCmd as user for @@ -1062,8 +1062,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( AllocaCmd = getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue); - isSameCtx = - sameCtx(QueueForAlloca->getContextImplPtr(), Record->MCurContext); + isSameCtx = Record->isSameContext(QueueForAlloca); } // If there is alloca command we need to check if the latest memory is in @@ -1071,7 +1070,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( if (isSameCtx) { // If the memory is already in the required host context, check if the // required access mode is valid, remap if not. 
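// Example of the case handled below (illustrative): the object was last mapped on the
// host for read-only access (Record->MHostAccess == access::mode::read); a new request
// for access::mode::read_write is rejected by isAccessModeAllowed(), so the object has
// to be remapped with write access even though no context change is involved.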
- if (!Record->MCurContext && + if (!Record->usedOnDevice() && !isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) { remapMemoryObject(Record, Req, Req->MIsSubBuffer @@ -1089,21 +1088,20 @@ void Scheduler::GraphBuilder::createGraphForCommand( if (isInteropTask) { const detail::CGHostTask &HT = static_cast(CG); - if (HT.MQueue->getContextImplPtr() != Record->MCurContext) { + if (!(Record->isSameContext(HT.MQueue)) { NeedMemMoveToHost = true; MemMoveTargetQueue = HT.MQueue; } - } else if (Queue && Record->MCurContext) + } else if (Queue && Record->usedOnDevice()) NeedMemMoveToHost = true; if (NeedMemMoveToHost) - insertMemoryMove(Record, Req, - Scheduler::getInstance().getDefaultHostQueue(), - ToEnqueue); + insertMemoryMove(Record, Req, nullptr, ToEnqueue); insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); } + std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + findDepsForReq(Record, Req, GetContext(Queue)); for (Command *Dep : Deps) { if (Dep != NewCmd) { @@ -1343,7 +1341,7 @@ Command *Scheduler::GraphBuilder::connectDepEvent( CG::CodeplayHostTask, /* Payload */ {})); ConnectCmd = new ExecCGCommand( - std::move(ConnectCG), Scheduler::getInstance().getDefaultHostQueue()); + std::move(ConnectCG), Cmd->getQueue()); } catch (const std::bad_alloc &) { throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); } @@ -1705,7 +1703,7 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue); - isSameCtx = sameCtx(Queue->getContextImplPtr(), Record->MCurContext); + isSameCtx = Record->isSameContext(Queue); } if (!isSameCtx) { @@ -1714,7 +1712,7 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( bool NeedMemMoveToHost = false; auto MemMoveTargetQueue = Queue; - if (Queue && Record->MCurContext) + if (Queue && Record->usedOnDevice()) NeedMemMoveToHost = true; if (NeedMemMoveToHost) @@ -1724,7 +1722,7 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); } std::set Deps = - findDepsForReq(Record, Req, Queue->getContextImplPtr()); + findDepsForReq(Record, Req, GetContext(Queue)); for (Command *Dep : Deps) { if (Dep != NewCmd.get()) { diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 0b061a86dbc62..7e5db05daf01a 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -118,12 +118,12 @@ EventImplPtr Scheduler::addCG( switch (Type) { case CG::UpdateHost: NewCmd = MGraphBuilder.addCGUpdateHost(std::move(CommandGroup), - DefaultHostQueue, AuxiliaryCmds); + AuxiliaryCmds); NewEvent = NewCmd->getEvent(); break; case CG::CodeplayHostTask: { auto Result = MGraphBuilder.addCG(std::move(CommandGroup), - DefaultHostQueue, AuxiliaryCmds); + nullptr, AuxiliaryCmds); NewCmd = Result.NewCmd; NewEvent = Result.NewEvent; ShouldEnqueue = Result.ShouldEnqueue; diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index bcb930bc8194a..6a2bcc4e5004a 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -203,7 +203,7 @@ class MemObjRecord { MemObjRecord(ContextImplPtr Ctx, std::size_t LeafLimit, LeavesCollection::AllocateDependencyF AllocateDependency) : MReadLeaves{this, LeafLimit, AllocateDependency}, - MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx}, MCurHostAccess{ MCurContext == nullptr } {} + 
MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx} {} public: // Contains all allocation commands for the memory object. std::vector MAllocaCommands; @@ -218,19 +218,19 @@ class MemObjRecord { // modified. Used while deciding if copy back needed. bool MMemModified = false; - void usedOnDevice(ContextImplPtr& NewContext) + void updateUsage(const ContextImplPtr &NewContext) { MCurContext = NewContext; - } - void usedOnHost() + bool isSameContext(const QueueImplPtr& Queue) const { - MCurContext = nullptr; - MCurHostAccess = true; + // Covers case for host usage (nullptr == nullptr) and existing device contexts comparison. + return MCurContext == (Queue ? Queue->getContextImplPtr() : nullptr); } - bool usedOnHost() { return MCurHostAccess; } + bool usedOnDevice() const { return MCurContext != nullptr; } + protected: // The context which has the latest state of the memory object. ContextImplPtr MCurContext; @@ -238,8 +238,6 @@ class MemObjRecord { // The mode this object can be accessed with from the host (host_accessor). // Valid only if the current usage is on host. access::mode MHostAccess = access::mode::read_write; - - bool MCurHostAccess = false; }; /// DPC++ graph scheduler class. @@ -621,7 +619,6 @@ class Scheduler { /// /// \return a command that represents command group execution. Command *addCGUpdateHost(std::unique_ptr CommandGroup, - const QueueImplPtr &HostQueue, std::vector &ToEnqueue); /// Enqueues a command to update memory to the latest state. @@ -759,7 +756,6 @@ class Scheduler { EmptyCommand *addEmptyCmd(Command *Cmd, const std::vector &Req, - const QueueImplPtr &Queue, Command::BlockReason Reason, std::vector &ToEnqueue, const bool AddDepsToLeaves = true); From c533af788609ed1b86dd27307eb48045f05c7565 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Tue, 4 Jun 2024 03:41:44 -0700 Subject: [PATCH 13/52] non-buildable: cleanup queue usages Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.cpp | 3 +- sycl/source/detail/scheduler/commands.cpp | 208 +++++++++------------- 2 files changed, 88 insertions(+), 123 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index bba423df61b60..c1c1d3835a54d 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -26,7 +26,8 @@ namespace sycl { inline namespace _V1 { namespace detail { -std::atomic queue_impl::MNextAvailableQueueID = 0; +// Treat 0 as reserved for "host" queue +std::atomic queue_impl::MNextAvailableQueueID = 1; static std::vector getPIEvents( const std::vector &DepEvents) { diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index f0e3471a0f6f6..f7962bb7a5d66 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -56,7 +56,7 @@ namespace detail { // Global graph for the application extern xpti::trace_event_data_t *GSYCLGraphEvent; -bool CurrentCodeLocationValid() { +static bool CurrentCodeLocationValid() { detail::tls_code_loc_t Tls; auto CodeLoc = Tls.query(); auto FileName = CodeLoc.fileName(); @@ -65,7 +65,7 @@ bool CurrentCodeLocationValid() { (FunctionName && FunctionName[0] != '\0'); } -void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, +static void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, xpti_td *TraceEvent, uint16_t Type, const void *Addr) { if (!(xptiCheckTraceEnabled(StreamID, Type) && TraceEvent)) return; @@ -74,6 +74,17 @@ void
emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, xptiNotifySubscribers(StreamID, Type, detail::GSYCLGraphEvent, static_cast(TraceEvent), InstanceID, Addr); } + +static addDeviceMetadata(xpti_td* TraceEvent, const QueueImplPtr& Queue) +{ + xpti::addMetadata(TraceEvent, "sycl_device", + Queue ? deviceToID(MQueue->get_device()) : nullptr); + xpti::addMetadata(TraceEvent, "sycl_device_type", + Queue ? deviceToString(MQueue->get_device()) : "host"); + if (Queue) + xpti::addMetadata(TraceEvent, "sycl_device_name", + getSyclObjImpl(MQueue->get_device())->getDeviceName()); +} #endif #ifdef __SYCL_ENABLE_GNU_DEMANGLING @@ -236,9 +247,7 @@ Command::getPiEvents(const std::vector &EventImpls) const { // current one is a host task. In this case we should not skip pi event due // to different sync mechanisms for different task types on in-order queue. const QueueImplPtr &WorkerQueue = getWorkerQueue(); - // MWorkerQueue in command is always not null. So check if - // EventImpl->getWorkerQueue != nullptr is implicit. - if (EventImpl->getWorkerQueue() == WorkerQueue && + if (WorkerQueue && EventImpl->getWorkerQueue() == WorkerQueue && WorkerQueue->isInOrder() && !isHostTask()) continue; @@ -278,9 +287,7 @@ std::vector Command::getPiEventsBlocking( // current one is a host task. In this case we should not skip pi event due // to different sync mechanisms for different task types on in-order queue. const QueueImplPtr &WorkerQueue = getWorkerQueue(); - // MWorkerQueue in command is always not null. So check if - // EventImpl->getWorkerQueue != nullptr is implicit. - if (EventImpl->getWorkerQueue() == WorkerQueue && + if (MWorkerQueue && EventImpl->getWorkerQueue() == WorkerQueue && WorkerQueue->isInOrder() && !isHostTask()) continue; @@ -337,12 +344,10 @@ class DispatchHostTask { PluginWithEvents.first->call(RawEvents.size(), RawEvents.data()); } catch (const sycl::exception &E) { - CGHostTask &HostTask = static_cast(MThisCmd->getCG()); - HostTask.MQueue->reportAsyncException(std::current_exception()); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(std::current_exception()); return (pi_result)E.get_cl_code(); } catch (...) 
{ - CGHostTask &HostTask = static_cast(MThisCmd->getCG()); - HostTask.MQueue->reportAsyncException(std::current_exception()); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(std::current_exception()); return PI_ERROR_UNKNOWN; } } @@ -383,7 +388,7 @@ class DispatchHostTask { std::exception_ptr EPtr = std::make_exception_ptr(sycl::runtime_error( std::string("Couldn't wait for host-task's dependencies"), WaitResult)); - HostTask.MQueue->reportAsyncException(EPtr); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(EPtr); // reset host-task's lambda and quit HostTask.MHostTask.reset(); Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd); @@ -394,7 +399,7 @@ class DispatchHostTask { // we're ready to call the user-defined lambda now if (HostTask.MHostTask->isInteropTask()) { interop_handle IH{MReqToMem, HostTask.MQueue, - HostTask.MQueue->getDeviceImplPtr(), + // HostTask.MQueue->getDeviceImplPtr(), HostTask.MQueue->getContextImplPtr()}; HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo(), IH); @@ -419,7 +424,7 @@ class DispatchHostTask { } } #endif - HostTask.MQueue->reportAsyncException(CurrentException); + MthisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); } HostTask.MHostTask.reset(); @@ -436,7 +441,7 @@ class DispatchHostTask { Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd); } catch (...) { auto CurrentException = std::current_exception(); - HostTask.MQueue->reportAsyncException(CurrentException); + MthisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); } } }; @@ -449,6 +454,7 @@ void Command::waitForPreparedHostEvents() const { void Command::waitForEvents(QueueImplPtr Queue, std::vector &EventImpls, sycl::detail::pi::PiEvent &Event) { + assert(Queue && "Device queue is expected here"); if (!EventImpls.empty()) { #ifndef NDEBUG for (const EventImplPtr &Event : EventImpls) @@ -484,7 +490,7 @@ Command::Command( MEvent->setWorkerQueue(MWorkerQueue); MEvent->setSubmittedQueue(MWorkerQueue); MEvent->setCommand(this); - MEvent->setContextImpl(MQueue->getContextImplPtr()); + MEvent->setContextImpl(MQueue ? MQueue->getContextImplPtr(): nullptr); MEvent->setStateIncomplete(); MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; @@ -669,7 +675,7 @@ void Command::makeTraceEventEpilog() { Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, std::vector &ToCleanUp) { const QueueImplPtr &WorkerQueue = getWorkerQueue(); - const ContextImplPtr &WorkerContext = WorkerQueue->getContextImplPtr(); + const ContextImplPtr &WorkerContext = WorkerQueue ? WorkerQueue->getContextImplPtr() : nullptr; // 1. Non-host events can be ignored if they are not fully initialized. // 2. 
Some types of commands do not produce PI events after they are @@ -701,7 +707,8 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, } const ContextImplPtr &Command::getWorkerContext() const { - return MQueue->getContextImplPtr(); + assert(MWorkerQueue && "MWorkerQueue must not be nullptr"); + return MWorkerQueue->getContextImplPtr(); } const QueueImplPtr &Command::getWorkerQueue() const { @@ -963,16 +970,12 @@ void AllocaCommandBase::emitInstrumentationData() { // Set the relevant meta data properties for this command if (MTraceEvent && MFirstInstance) { xpti_td *TE = static_cast(MTraceEvent); - xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(TE, MQueue); xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); } #endif } @@ -1022,7 +1025,7 @@ pi_int32 AllocaCommand::enqueueImp() { void *HostPtr = nullptr; if (!MIsLeaderAlloca) { - if (MQueue->is_host()) { + if (!MQueue) { // Do not need to make allocation if we have a linked device allocation Command::waitForEvents(MQueue, EventImpls, Event); @@ -1033,7 +1036,7 @@ pi_int32 AllocaCommand::enqueueImp() { // TODO: Check if it is correct to use std::move on stack variable and // delete it RawEvents below. MMemAllocation = MemoryManager::allocate( - MQueue->getContextImplPtr(), getSYCLMemObj(), MInitFromUserData, HostPtr, + MQueue ? MQueue->getContextImplPtr() : nullptr, getSYCLMemObj(), MInitFromUserData, HostPtr, std::move(EventImpls), Event); return PI_SUCCESS; @@ -1043,7 +1046,7 @@ void AllocaCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "ALLOCA ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Link : " << this->MLinkedAllocaCmd << "\\n"; Stream << "\"];" << std::endl; @@ -1092,7 +1095,7 @@ void AllocaSubBufCommand::emitInstrumentationData() { xpti::addMetadata(TE, "access_range_end", this->MRequirement.MAccessRange[1]); xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1102,7 +1105,7 @@ void *AllocaSubBufCommand::getMemAllocation() const { // In some cases parent`s memory allocation might change (e.g., after // map/unmap operations). If parent`s memory allocation changes, sub-buffer // memory allocation should be changed as well. - if (MQueue->is_host()) { + if (!MQueue) { return static_cast( static_cast(MParentAlloca->getMemAllocation()) + MRequirement.MOffsetInBytes); @@ -1116,7 +1119,7 @@ pi_int32 AllocaSubBufCommand::enqueueImp() { sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); MMemAllocation = MemoryManager::allocateMemSubBuffer( - MQueue->getContextImplPtr(), MParentAlloca->getMemAllocation(), + MQueue ? 
MQueue->getContextImplPtr() : nullptr, MParentAlloca->getMemAllocation(), MRequirement.MElemSize, MRequirement.MOffsetInBytes, MRequirement.MAccessRange, std::move(EventImpls), Event); @@ -1129,7 +1132,7 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA SUB BUF ON " << deviceToString(MQueue->get_device()) + Stream << "ALLOCA SUB BUF ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n"; @@ -1163,17 +1166,13 @@ void ReleaseCommand::emitInstrumentationData() { if (MFirstInstance) { xpti_td *TE = static_cast(MTraceEvent); - xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(TE, MQueue); xpti::addMetadata(TE, "allocation_type", commandToName(MAllocaCmd->getType())); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1187,9 +1186,9 @@ pi_int32 ReleaseCommand::enqueueImp() { // On host side we only allocate memory for full buffers. // Thus, deallocating sub buffers leads to double memory freeing. - SkipRelease |= MQueue->is_host() && MAllocaCmd->getType() == ALLOCA_SUB_BUF; + SkipRelease |= !MQueue && MAllocaCmd->getType() == ALLOCA_SUB_BUF; - const bool CurAllocaIsHost = MAllocaCmd->getQueue()->is_host(); + const bool CurAllocaIsHost = !MAllocaCmd->getQueue(); bool NeedUnmap = false; if (MAllocaCmd->MLinkedAllocaCmd) { @@ -1213,7 +1212,7 @@ pi_int32 ReleaseCommand::enqueueImp() { : MAllocaCmd->getQueue(); EventImplPtr UnmapEventImpl(new event_impl(Queue)); - UnmapEventImpl->setContextImpl(Queue->getContextImplPtr()); + UnmapEventImpl->setContextImpl(Queue ? Queue->getContextImplPtr() : nullptr); UnmapEventImpl->setStateIncomplete(); sycl::detail::pi::PiEvent &UnmapEvent = UnmapEventImpl->getHandleRef(); @@ -1237,7 +1236,7 @@ pi_int32 ReleaseCommand::enqueueImp() { Command::waitForEvents(MQueue, EventImpls, Event); else { MemoryManager::release( - MQueue->getContextImplPtr(), MAllocaCmd->getSYCLMemObj(), + MQueue ? MQueue->getContextImplPtr() : nullptr, MAllocaCmd->getSYCLMemObj(), MAllocaCmd->getMemAllocation(), std::move(EventImpls), Event); } return PI_SUCCESS; @@ -1247,7 +1246,7 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "RELEASE ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "RELEASE ON " << MQueue ? 
deviceToString(MQueue->get_device()) : "host" << "\\n"; Stream << " Alloca : " << MAllocaCmd << "\\n"; Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n"; Stream << "\"];" << std::endl; @@ -1287,16 +1286,12 @@ void MapMemObject::emitInstrumentationData() { if (MFirstInstance) { xpti_td *TE = static_cast(MTraceEvent); - xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(TE, MQueue); xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1321,7 +1316,7 @@ void MapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "MAP ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "MAP ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; Stream << "\"];" << std::endl; @@ -1352,16 +1347,12 @@ void UnMapMemObject::emitInstrumentationData() { if (MFirstInstance) { xpti_td *TE = static_cast(MTraceEvent); - xpti::addMetadata(TE, "sycl_device", deviceToID(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(TE, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(TE, MQueue); xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1383,9 +1374,9 @@ bool UnMapMemObject::producesPiEvent() const { // an event waitlist and Level Zero plugin attempts to batch these commands, // so the execution of kernel B starts only on step 4. This workaround // restores the old behavior in this case until this is resolved. - return MQueue->getDeviceImplPtr()->getBackend() != + return MQueue && (MQueue->getDeviceImplPtr()->getBackend() != backend::ext_oneapi_level_zero || - MEvent->getHandleRef() != nullptr; + MEvent->getHandleRef() != nullptr); } pi_int32 UnMapMemObject::enqueueImp() { @@ -1406,7 +1397,7 @@ void UnMapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "UNMAP ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "UNMAP ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; Stream << "\"];" << std::endl; @@ -1428,11 +1419,11 @@ MemCpyCommand::MemCpyCommand(Requirement SrcReq, MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)), MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstAllocaCmd(DstAllocaCmd) { - if (!MSrcQueue->is_host()) { + if (MSrcQueue) { MEvent->setContextImpl(MSrcQueue->getContextImplPtr()); } - MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue; + MWorkerQueue = !MQueue ? 
MSrcQueue : MQueue; MEvent->setWorkerQueue(MWorkerQueue); emitInstrumentationDataProxy(); @@ -1449,24 +1440,19 @@ void MemCpyCommand::emitInstrumentationData() { if (MFirstInstance) { xpti_td *CmdTraceEvent = static_cast(MTraceEvent); - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(CmdTraceEvent, MQueue); xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); xpti::addMetadata(CmdTraceEvent, "copy_from", - reinterpret_cast( - getSyclObjImpl(MSrcQueue->get_device()).get())); + MSrcQueue ? reinterpret_cast( + getSyclObjImpl(MSrcQueue->get_device()).get()) : nullptr); xpti::addMetadata( CmdTraceEvent, "copy_to", - reinterpret_cast(getSyclObjImpl(MQueue->get_device()).get())); + MQueue ? reinterpret_cast(getSyclObjImpl(MQueue->get_device()).get()): nullptr); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1492,7 +1478,7 @@ bool MemCpyCommand::producesPiEvent() const { // an event waitlist and Level Zero plugin attempts to batch these commands, // so the execution of kernel B starts only on step 4. This workaround // restores the old behavior in this case until this is resolved. - return MQueue->is_host() || + return !MQueue || MQueue->getDeviceImplPtr()->getBackend() != backend::ext_oneapi_level_zero || MEvent->getHandleRef() != nullptr; @@ -1521,10 +1507,10 @@ void MemCpyCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\""; Stream << "ID = " << this << " ; "; - Stream << "MEMCPY ON " << deviceToString(MQueue->get_device()) << "\\n"; - Stream << "From: " << MSrcAllocaCmd << " is host: " << MSrcQueue->is_host() + Stream << "MEMCPY ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "From: " << MSrcAllocaCmd << " is host: " << !MSrcQueue << "\\n"; - Stream << "To: " << MDstAllocaCmd << " is host: " << MQueue->is_host() + Stream << "To: " << MDstAllocaCmd << " is host: " << !MQueue << "\\n"; Stream << "\"];" << std::endl; @@ -1579,7 +1565,7 @@ void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "UPDATE REQ ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "UPDATE REQ ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; bool IsReqOnBuffer = MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer; Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n"; @@ -1606,11 +1592,11 @@ MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq, : Command(CommandType::COPY_MEMORY, std::move(DstQueue)), MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)), MSrcAllocaCmd(SrcAllocaCmd), MDstReq(std::move(DstReq)), MDstPtr(DstPtr) { - if (!MSrcQueue->is_host()) { + if (MSrcQueue) { MEvent->setContextImpl(MSrcQueue->getContextImplPtr()); } - MWorkerQueue = MQueue->is_host() ? MSrcQueue : MQueue; + MWorkerQueue = !MQueue ? 
MSrcQueue : MQueue; MEvent->setWorkerQueue(MWorkerQueue); emitInstrumentationDataProxy(); @@ -1627,24 +1613,19 @@ void MemCpyCommandHost::emitInstrumentationData() { if (MFirstInstance) { xpti_td *CmdTraceEvent = static_cast(MTraceEvent); - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(CmdTraceEvent, MQueue); xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); xpti::addMetadata(CmdTraceEvent, "copy_from", reinterpret_cast( - getSyclObjImpl(MSrcQueue->get_device()).get())); + MSrcQueue ? getSyclObjImpl(MSrcQueue->get_device()).get()) : "nullptr"); xpti::addMetadata( CmdTraceEvent, "copy_to", - reinterpret_cast(getSyclObjImpl(MQueue->get_device()).get())); + MQueue ? reinterpret_cast(getSyclObjImpl(MQueue->get_device()).get()) : "nullptr"); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1726,18 +1707,13 @@ void EmptyCommand::emitInstrumentationData() { if (MFirstInstance) { xpti_td *CmdTraceEvent = static_cast(MTraceEvent); - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(CmdTraceEvent, MQueue); xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1766,7 +1742,7 @@ void MemCpyCommandHost::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "MEMCPY HOST ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "MEMCPY HOST ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; Stream << "\"];" << std::endl; @@ -1799,18 +1775,13 @@ void UpdateHostRequirementCommand::emitInstrumentationData() { if (MFirstInstance) { xpti_td *CmdTraceEvent = static_cast(MTraceEvent); - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + addDeviceMetadata(CmdTraceEvent, MQueue); xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? 
MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1960,6 +1931,7 @@ void instrumentationAddExtraKernelMetadata( if (!SyclKernel->isCreatedFromSource()) EliminatedArgMask = SyclKernel->getKernelArgMask(); } else { + assert(Queue && "Queue with submitted kernel could not be on host"); std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) = detail::ProgramManager::getInstance().getOrCreateKernel( Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName); @@ -2024,12 +1996,7 @@ void instrumentationFillCommonData(const std::string &KernelName, if (CGKernelInstanceNo > 1) return; - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(Queue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(Queue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(Queue->get_device())->getDeviceName()); + addDeviceMetadata(CmdTraceEvent, Queue); if (!KernelName.empty()) { xpti::addMetadata(CmdTraceEvent, "kernel_name", KernelName); } @@ -2080,7 +2047,7 @@ std::pair emitKernelInstrumentationData( if (CmdTraceEvent) { // Stash the queue_id mutable metadata in TLS xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - Queue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc, KernelBundleImplPtr, SyclKernelName, @@ -2126,7 +2093,7 @@ void ExecCGCommand::emitInstrumentationData() { if (CmdTraceEvent) { xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); MTraceEvent = static_cast(CmdTraceEvent); if (MCommandGroup->getType() == detail::CG::Kernel) { auto KernelCG = @@ -2149,7 +2116,7 @@ void ExecCGCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "EXEC CG ON " << deviceToString(MQueue->get_device()) << "\\n"; + Stream << "EXEC CG ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; switch (MCommandGroup->getType()) { case detail::CG::Kernel: { @@ -2330,6 +2297,7 @@ static pi_result SetKernelParamsAndLaunch( const KernelArgMask *EliminatedArgMask, const std::function &getMemAllocationFunc, bool IsCooperative) { + assert(Queue && "Queue with submitted kernel could not be on host"); const PluginPtr &Plugin = Queue->getPlugin(); auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc, @@ -2521,7 +2489,7 @@ pi_int32 enqueueImpKernel( const std::function &getMemAllocationFunc, sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig, const bool KernelIsCooperative) { - + assert(Queue && "Queue with submitted kernel could not be on host"); // Run OpenCL kernel auto ContextImpl = Queue->getContextImplPtr(); auto DeviceImpl = Queue->getDeviceImplPtr(); @@ -2636,6 +2604,7 @@ enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, bool blocking, void *ptr, size_t size, std::vector &RawEvents, const detail::EventImplPtr &OutEventImpl, bool read) { + assert(Queue && "Queue with submitted read write host pipe could not be on host"); detail::HostPipeMapEntry *hostPipeEntry = ProgramManager::getInstance().getHostPipeEntry(PipeName); @@ -3309,19 +3278,14 @@ void KernelFusionCommand::emitInstrumentationData() { // This function is called in the constructor of the command. At this point // the kernel fusion list is still empty, so we don't have a terrible lot of // information we could attach to this node here. 
- if (MFirstInstance && CmdTraceEvent) { - xpti::addMetadata(CmdTraceEvent, "sycl_device", - deviceToID(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_type", - deviceToString(MQueue->get_device())); - xpti::addMetadata(CmdTraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); - } + if (MFirstInstance && CmdTraceEvent) + addDeviceMetadata(CmdTraceEVent, MQueue); + if (MFirstInstance) { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue->getQueueID()); + MQueue ? MQueue->getQueueID() : 0); xptiNotifySubscribers(MStreamID, NotificationTraceType, detail::GSYCLGraphEvent, static_cast(MTraceEvent), MInstanceID, @@ -3335,7 +3299,7 @@ void KernelFusionCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "KERNEL FUSION on " << deviceToString(MQueue->get_device()) << "\\n" + Stream << "KERNEL FUSION on " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n" << "FUSION LIST: {"; bool Initial = true; for (auto *Cmd : MFusionList) { From f0868f5ecb17b2886e999e4891725e1695e22c36 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 5 Jun 2024 04:31:26 -0700 Subject: [PATCH 14/52] handle nullptr Queue in commands.* Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.hpp | 6 ++-- sycl/source/detail/scheduler/commands.cpp | 39 ++++++++++++++++------- 2 files changed, 30 insertions(+), 15 deletions(-) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index c205b5916f302..15e19f143f29d 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -670,9 +670,9 @@ class queue_impl { MExceptions.PushBack(ExceptionPtr); } - ThreadPool &getThreadPool() { - return GlobalHandler::instance().getHostTaskThreadPool(); - } + // ThreadPool &getThreadPool() { + // return GlobalHandler::instance().getHostTaskThreadPool(); + // } /// Gets the native handle of the SYCL queue. /// diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index f7962bb7a5d66..55b29ac7dd426 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -87,6 +87,13 @@ static addDeviceMetadata(xpti_td* TraceEvent, const QueueImplPtr& Queue) } #endif +static ContextImplPtr getContext(const QueueImplPtr& Queue) +{ + if (Queue) + return Queue->getContextImplPtr(); + return nullptr; +} + #ifdef __SYCL_ENABLE_GNU_DEMANGLING struct DemangleHandle { char *p; @@ -490,7 +497,8 @@ Command::Command( MEvent->setWorkerQueue(MWorkerQueue); MEvent->setSubmittedQueue(MWorkerQueue); MEvent->setCommand(this); - MEvent->setContextImpl(MQueue ? 
MQueue->getContextImplPtr(): nullptr); + if (MQueue) + MEvent->setContextImpl(MQueue->getContextImplPtr()); MEvent->setStateIncomplete(); MEnqueueStatus = EnqueueResultT::SyclEnqueueReady; @@ -707,12 +715,12 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, } const ContextImplPtr &Command::getWorkerContext() const { - assert(MWorkerQueue && "MWorkerQueue must not be nullptr"); + if (!MWorkerQueue) + return nullptr; return MWorkerQueue->getContextImplPtr(); } const QueueImplPtr &Command::getWorkerQueue() const { - assert(MWorkerQueue && "MWorkerQueue must not be nullptr"); return MWorkerQueue; } @@ -1036,7 +1044,7 @@ pi_int32 AllocaCommand::enqueueImp() { // TODO: Check if it is correct to use std::move on stack variable and // delete it RawEvents below. MMemAllocation = MemoryManager::allocate( - MQueue ? MQueue->getContextImplPtr() : nullptr, getSYCLMemObj(), MInitFromUserData, HostPtr, + getContext(MQueue), getSYCLMemObj(), MInitFromUserData, HostPtr, std::move(EventImpls), Event); return PI_SUCCESS; @@ -1119,7 +1127,7 @@ pi_int32 AllocaSubBufCommand::enqueueImp() { sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); MMemAllocation = MemoryManager::allocateMemSubBuffer( - MQueue ? MQueue->getContextImplPtr() : nullptr, MParentAlloca->getMemAllocation(), + getContext(MQueue), MParentAlloca->getMemAllocation(), MRequirement.MElemSize, MRequirement.MOffsetInBytes, MRequirement.MAccessRange, std::move(EventImpls), Event); @@ -1212,7 +1220,7 @@ pi_int32 ReleaseCommand::enqueueImp() { : MAllocaCmd->getQueue(); EventImplPtr UnmapEventImpl(new event_impl(Queue)); - UnmapEventImpl->setContextImpl(Queue ? Queue->getContextImplPtr() : nullptr); + UnmapEventImpl->setContextImpl(getContext(Queue)); UnmapEventImpl->setStateIncomplete(); sycl::detail::pi::PiEvent &UnmapEvent = UnmapEventImpl->getHandleRef(); @@ -1236,7 +1244,7 @@ pi_int32 ReleaseCommand::enqueueImp() { Command::waitForEvents(MQueue, EventImpls, Event); else { MemoryManager::release( - MQueue ? MQueue->getContextImplPtr() : nullptr, MAllocaCmd->getSYCLMemObj(), + getContext(MQueue), MAllocaCmd->getSYCLMemObj(), MAllocaCmd->getMemAllocation(), std::move(EventImpls), Event); } return PI_SUCCESS; @@ -2654,6 +2662,7 @@ enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, } pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { + assert(MQueue && "Device queue is required for command buffer enqueue"); // Wait on host command dependencies waitForPreparedHostEvents(); @@ -2819,8 +2828,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { auto RawEvents = getPiEvents(EventImpls); flushCrossQueueDeps(EventImpls, getWorkerQueue()); - bool DiscardPiEvent = (MQueue->supportsDiscardingPiEvents() && - MCommandGroup->getRequirements().size() == 0); + bool DiscardPiEvent = MQueue && MQueue->supportsDiscardingPiEvents() && + (MCommandGroup->getRequirements().size() == 0); sycl::detail::pi::PiEvent *Event = DiscardPiEvent ? nullptr : &MEvent->getHandleRef(); detail::EventImplPtr EventImpl = DiscardPiEvent ? 
nullptr : MEvent; @@ -2894,6 +2903,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::Kernel: { + assert(MQueue && "Device queue must be present for kernel command"); CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get(); NDRDescT &NDRDesc = ExecKernel->MNDRDesc; @@ -3039,8 +3049,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { Req->MSYCLMemObj->MRecord->MAllocaCommands; for (AllocaCommandBase *AllocaCmd : AllocaCmds) - if (HostTask->MQueue->getContextImplPtr() == - AllocaCmd->getQueue()->getContextImplPtr()) { + if (getContext(HostTask->MQueue) == + getContext(AllocaCmd->getQueue()) { auto MemArg = reinterpret_cast(AllocaCmd->getMemAllocation()); ReqToMem.emplace_back(std::make_pair(Req, MemArg)); @@ -3064,7 +3074,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { // submitted to report exception origin properly. copySubmissionCodeLocation(); - MQueue->getThreadPool().submit( + getThreadPool().submit( DispatchHostTask(this, std::move(ReqToMem))); MShouldCompleteEventIfPossible = false; @@ -3072,6 +3082,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::Barrier: { + assert(MQueue && "Device queue must be present for barrier command"); const PluginPtr &Plugin = MQueue->getPlugin(); if (MEvent != nullptr) MEvent->setHostEnqueueTime(); @@ -3081,6 +3092,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::BarrierWaitlist: { + assert(MQueue && "Device queue must be present for barrier with wait list command"); CGBarrier *Barrier = static_cast(MCommandGroup.get()); std::vector Events = Barrier->MEventsWaitWithBarrier; std::vector PiEvents = @@ -3132,6 +3144,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { typeSize, RawEvents, EventImpl, read); } case CG::CGTYPE::ExecCommandBuffer: { + assert(MQueue && "Device queue must be present for command buffer enqueue"); CGExecCommandBuffer *CmdBufferCG = static_cast(MCommandGroup.get()); if (MEvent != nullptr) @@ -3155,6 +3168,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::SemaphoreWait: { + assert(MQueue && "Device queue must be present for semaphore wait command"); CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get(); const detail::PluginPtr &Plugin = MQueue->getPlugin(); @@ -3165,6 +3179,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::SemaphoreSignal: { + assert(MQueue && "Device queue must be present for semaphore signal command"); CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get(); const detail::PluginPtr &Plugin = MQueue->getPlugin(); From 3d044e896cc6ff1d851c56268dfeb2dc623b55e9 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 5 Jun 2024 06:04:41 -0700 Subject: [PATCH 15/52] non-buildable: handle nullptr queue in memory_manager.cpp Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.cpp | 12 +++++++----- sycl/source/detail/memory_manager.cpp | 22 ++++++++++++++++++++-- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 28bb37200392a..be32787c0aa4d 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -149,15 +149,16 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event, } event_impl::event_impl(const QueueImplPtr &Queue) { - this->setContextImpl(Queue->getContextImplPtr()); + // Queue == nullptr means that it is a host task event + 
this->setContextImpl(getContext(Queue)); this->associateWithQueue(Queue); } void event_impl::associateWithQueue(const QueueImplPtr &Queue) { MQueue = Queue; - MIsProfilingEnabled = Queue->MIsProfilingEnabled; + MIsProfilingEnabled = Queue && Queue->MIsProfilingEnabled; MFallbackProfiling = MIsProfilingEnabled && Queue->isProfilingFallback(); - MState.store(HES_Complete); + MState.store(Queue ? HES_Complete : HES_NotComplete); } void *event_impl::instrumentationProlog(std::string &Name, int32_t StreamID, @@ -402,8 +403,9 @@ event_impl::get_backend_info() const { ->get_platform() .get_info(); } - return ""; // If the queue has been released, no platform will be associated - // so return empty string + // If the queue has been released, no platform will be associated + // so return empty string. + return ""; } template <> diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 3c0ad08e0763f..30827adb15e8f 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -482,6 +482,7 @@ void copyH2D(SYCLMemObjI *SYCLMemObj, char *SrcMem, QueueImplPtr, const detail::EventImplPtr &OutEventImpl) { (void)SrcAccessRange; assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + assert(TgtQueue && "Destination mem object queue must be not nullptr"); const sycl::detail::pi::PiQueue Queue = TgtQueue->getHandleRef(); const PluginPtr &Plugin = TgtQueue->getPlugin(); @@ -560,6 +561,7 @@ void copyD2H(SYCLMemObjI *SYCLMemObj, sycl::detail::pi::PiMem SrcMem, const detail::EventImplPtr &OutEventImpl) { (void)DstAccessRange; assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + assert(SrcQueue && "Source mem object queue is expected to be not nullptr"); const sycl::detail::pi::PiQueue Queue = SrcQueue->getHandleRef(); const PluginPtr &Plugin = SrcQueue->getPlugin(); @@ -641,6 +643,7 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, sycl::detail::pi::PiMem SrcMem, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl) { assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + assert(SrcQueue && TgtQueue && "Source mem object and target mem object queues are expected to be not nullptr"); const sycl::detail::pi::PiQueue Queue = SrcQueue->getHandleRef(); const PluginPtr &Plugin = SrcQueue->getPlugin(); @@ -804,6 +807,7 @@ void MemoryManager::fill(SYCLMemObjI *SYCLMemObj, void *Mem, QueueImplPtr Queue, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl) { assert(SYCLMemObj && "The SYCLMemObj is nullptr"); + assert(Queue && "Fill should be called only with a valid device queue"); const PluginPtr &Plugin = Queue->getPlugin(); @@ -861,7 +865,7 @@ void *MemoryManager::map(SYCLMemObjI *, void *Mem, QueueImplPtr Queue, unsigned int ElementSize, std::vector DepEvents, sycl::detail::pi::PiEvent &OutEvent) { - if (Queue->is_host()) { + if (!Queue) { throw runtime_error("Not supported configuration of map requested", PI_ERROR_INVALID_OPERATION); } @@ -907,6 +911,10 @@ void MemoryManager::unmap(SYCLMemObjI *, void *Mem, QueueImplPtr Queue, sycl::detail::pi::PiEvent &OutEvent) { // Host queue is not supported here. + if (!Queue) { + throw runtime_error("Not supported configuration of unmap requested", + PI_ERROR_INVALID_OPERATION); + } // All DepEvents are to the same Context. // Using the plugin of the Queue. 
@@ -921,6 +929,7 @@ void MemoryManager::copy_usm(const void *SrcMem, QueueImplPtr SrcQueue, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(SrcQueue && "USM copy must be called with a valid device queue"); if (!Len) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { if (OutEventImpl != nullptr) @@ -959,6 +968,7 @@ void MemoryManager::fill_usm(void *Mem, QueueImplPtr Queue, size_t Length, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(Queue && "USM fill must be called with a valid device queue"); if (!Length) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { if (OutEventImpl != nullptr) @@ -994,6 +1004,7 @@ void MemoryManager::prefetch_usm( std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(Queue && "USM prefetch must be called with a valid device queue"); const PluginPtr &Plugin = Queue->getPlugin(); if (OutEventImpl != nullptr) OutEventImpl->setHostEnqueueTime(); @@ -1015,6 +1026,7 @@ void MemoryManager::advise_usm( std::vector /*DepEvents*/, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(Queue && "USM advise must be called with a valid device queue"); const PluginPtr &Plugin = Queue->getPlugin(); if (OutEventImpl != nullptr) OutEventImpl->setHostEnqueueTime(); @@ -1037,6 +1049,7 @@ void MemoryManager::copy_2d_usm( std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(Queue && "USM copy 2d must be called with a valid device queue"); if (Width == 0 || Height == 0) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -1122,6 +1135,7 @@ void MemoryManager::fill_2d_usm( std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(Queue && "USM fill 2d must be called with a valid device queue"); if (Width == 0 || Height == 0) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -1159,6 +1173,7 @@ void MemoryManager::memset_2d_usm( char Value, std::vector DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(Queue && "USM memset 2d must be called with a valid device queue"); if (Width == 0 || Height == 0) { // no-op, but ensure DepEvents will still be waited on if (!DepEvents.empty()) { @@ -1198,6 +1213,7 @@ memcpyToDeviceGlobalUSM(QueueImplPtr Queue, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { + assert(Queue && "Copy to device global USM must be called with a valid device queue"); // Get or allocate USM memory for the device_global. 
DeviceGlobalUSMMem &DeviceGlobalUSM = DeviceGlobalEntry->getOrAllocateDeviceGlobalUSM(Queue); @@ -1299,6 +1315,7 @@ static void memcpyToDeviceGlobalDirect( size_t NumBytes, size_t Offset, const void *Src, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { + assert(Queue && "Direct copy to device global must be called with a valid device queue"); sycl::detail::pi::PiProgram Program = getOrBuildProgramForDeviceGlobal(Queue, DeviceGlobalEntry); const PluginPtr &Plugin = Queue->getPlugin(); @@ -1313,6 +1330,7 @@ static void memcpyFromDeviceGlobalDirect( size_t NumBytes, size_t Offset, void *Dest, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { + assert(Queue && "Direct copy from device global must be called with a valid device queue"); sycl::detail::pi::PiProgram Program = getOrBuildProgramForDeviceGlobal(Queue, DeviceGlobalEntry); const PluginPtr &Plugin = Queue->getPlugin(); @@ -1722,7 +1740,7 @@ void MemoryManager::copy_image_bindless( sycl::detail::pi::PiImageRegion CopyExtent, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { - + assert(Queue && "Copy image bindless must be called with a valid device queue"); assert((Flags == (sycl::detail::pi::PiImageCopyFlags) ext::oneapi::experimental::image_copy_flags::HtoD || Flags == (sycl::detail::pi::PiImageCopyFlags) From b3161e8bf8b978600e6910e7e8953a530ac26d23 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 5 Jun 2024 06:55:19 -0700 Subject: [PATCH 16/52] non-buildable: build enabling Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.hpp | 6 ++--- .../source/detail/scheduler/graph_builder.cpp | 6 +++++ sycl/source/detail/scheduler/scheduler.hpp | 23 ++++++++----------- sycl/source/handler.cpp | 9 ++++---- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 15e19f143f29d..a3463225a54d1 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -111,7 +111,7 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder)), + MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { if (has_property()) { @@ -285,7 +285,7 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder)), + MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder), MQueueID{ MNextAvailableQueueID.fetch_add(1, std::memory_order_relaxed)} { queue_impl_interop(PiQueue); @@ -305,7 +305,7 @@ class queue_impl { MDiscardEvents( has_property()), MIsProfilingEnabled(has_property()), - MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder)) { + MSupportsDiscardingPiEvents(MDiscardEvents && MIsInorder) { queue_impl_interop(PiQueue); } diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 6c9244f9ecb2c..d9614e9ca9d51 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -59,6 +59,12 @@ static ContextImplPtr GetContext(const QueueImplPtr& Queue) return Queue ? Queue->getContextImplPtr() : nullptr; } +bool MemObjRecord::isSameContext(const QueueImplPtr& Queue) const +{ + // Covers case for host usage (nullptr == nullptr) and existing device contexts comparison. + return MCurContext == (Queue ? 
Queue->getContextImplPtr() : nullptr); +} + /// Checks if the required access mode is allowed under the current one. static bool isAccessModeAllowed(access::mode Required, access::mode Current) { switch (Current) { diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 6a2bcc4e5004a..61f01863c477b 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -218,26 +218,21 @@ class MemObjRecord { // modified. Used while deciding if copy back needed. bool MMemModified = false; - void updateUsage(ContextImplPtr& NewContext) - { - MCurContext = NewContext; - } - - bool isSameContext(const QueueImplPtr& Queue) const - { - // Covers case for host usage (nullptr == nullptr) and existing device contexts comparison. - return LHS == (Queue ? Queue->getContextImplPtr() : nullptr); - } - - bool usedOnDevice( return MCurContext != nullptr; ) - -protected: // The context which has the latest state of the memory object. ContextImplPtr MCurContext; // The mode this object can be accessed with from the host (host_accessor). // Valid only if the current usage is on host. access::mode MHostAccess = access::mode::read_write; + + void updateUsage(ContextImplPtr& NewContext) + { + MCurContext = NewContext; + } + + bool isSameContext(const QueueImplPtr& Queue) const; + + bool usedOnDevice() { return MCurContext != nullptr; } }; /// DPC++ graph scheduler class. diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 749ab6750df5e..c0e0438d9cd2f 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -80,16 +80,15 @@ void *getValueFromDynamicParameter( } // namespace detail -handler::handler(std::shared_ptr Queue, bool IsHost) - : handler(Queue, Queue, nullptr, IsHost) {} +handler::handler(std::shared_ptr Queue) + : handler(Queue, Queue, nullptr) {} handler::handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue, - bool IsHost) + std::shared_ptr SecondaryQueue) : MImpl(std::make_shared(std::move(PrimaryQueue), std::move(SecondaryQueue))), - MQueue(std::move(Queue)), MIsHost(IsHost) {} + MQueue(std::move(Queue)) {} handler::handler( std::shared_ptr Graph) From 2258a1cbb812161a21af5dbb9a38c170a41badc8 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 5 Jun 2024 08:07:45 -0700 Subject: [PATCH 17/52] not-buildable: build enabling 2 Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/buffer_impl.cpp | 9 +- sycl/source/detail/event_impl.cpp | 2 +- sycl/source/detail/memory_manager.cpp | 4 +- sycl/source/detail/queue_impl.cpp | 4 +- sycl/source/detail/queue_impl.hpp | 5 + sycl/source/detail/scheduler/commands.cpp | 136 +++++++++--------- sycl/source/detail/scheduler/commands.hpp | 12 +- .../source/detail/scheduler/graph_builder.cpp | 51 +++---- sycl/source/detail/scheduler/scheduler.hpp | 12 +- 9 files changed, 108 insertions(+), 127 deletions(-) diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp index d7d77205b162c..f13444107e9eb 100644 --- a/sycl/source/detail/buffer_impl.cpp +++ b/sycl/source/detail/buffer_impl.cpp @@ -68,10 +68,13 @@ buffer_impl::getNativeVector(backend BackendName) const { sycl::detail::pi::PiMem NativeMem = pi::cast(Cmd->getMemAllocation()); auto Ctx = Cmd->getWorkerContext(); - auto Platform = Ctx->getPlatformImpl(); // If Host Shared Memory is not supported then there is alloca for host that - // doesn't have platform - if (!Platform || (Platform->getBackend() != BackendName)) + 
// doesn't have context and platform + if (!Ctx) + continue; + PlatformImplPtr Platform = Ctx->getPlatformImpl(); + assert(Platform && "Platform must be present for device context"); + if (Platform->getBackend() != BackendName) continue; auto Plugin = Platform->getPlugin(); diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index be32787c0aa4d..e34597aa008d1 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -150,7 +150,7 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event, event_impl::event_impl(const QueueImplPtr &Queue) { // Queue == nullptr means that it is a host task event - this->setContextImpl(getContext(Queue)); + this->setContextImpl(queue_impl::getContext(Queue)); this->associateWithQueue(Queue); } diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 30827adb15e8f..e2c22f794f587 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -413,7 +413,7 @@ void *MemoryManager::allocateMemSubBuffer(ContextImplPtr TargetContext, waitForEvents(DepEvents); OutEvent = nullptr; - if (TargetContext->is_host()) + if (!TargetContext) return static_cast(static_cast(ParentMemObj) + Offset); size_t SizeInBytes = ElemSize; @@ -643,7 +643,7 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, sycl::detail::pi::PiMem SrcMem, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl) { assert(SYCLMemObj && "The SYCLMemObj is nullptr"); - assert(SrcQueue && TgtQueue && "Source mem object and target mem object queues are expected to be not nullptr"); + assert(SrcQueue && "Source mem object and target mem object queues are expected to be not nullptr"); const sycl::detail::pi::PiQueue Queue = SrcQueue->getHandleRef(); const PluginPtr &Plugin = SrcQueue->getPlugin(); diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index c1c1d3835a54d..ce4dd462eef32 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -284,12 +284,12 @@ void queue_impl::addEvent(const event &Event) { // if there is no command on the event, we cannot track it with MEventsWeak // as that will leave it with no owner. Track in MEventsShared only if we're // unable to call piQueueFinish during wait. - if (Event->isHost() || MEmulateOOO) + if (EImpl->isHost() || MEmulateOOO) addSharedEvent(Event); } // As long as the queue supports piQueueFinish we only need to store events // for unenqueued commands and host tasks. - else if (Event->isHost() || MEmulateOOO || EImpl->getHandleRef() == nullptr) { + else if (EImpl->isHost() || MEmulateOOO || EImpl->getHandleRef() == nullptr) { std::weak_ptr EventWeakPtr{EImpl}; std::lock_guard Lock{MMutex}; MEventsWeak.push_back(std::move(EventWeakPtr)); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index a3463225a54d1..61f34c35c7baf 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -750,6 +750,11 @@ class queue_impl { // tasks and host tasks is applicable for out of order queues only. Not neede // for in order ones. void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask); + + static ContextImplPtr getContext(const QueueImplPtr& Queue) + { + return Queue ? 
Queue->getContextImplPtr() : nullptr; + } protected: event discard_or_return(const event &Event); diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 55b29ac7dd426..05873f23f45a9 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -75,16 +75,32 @@ static void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, static_cast(TraceEvent), InstanceID, Addr); } -static addDeviceMetadata(xpti_td* TraceEvent, const QueueImplPtr& Queue) +static size_t deviceToID(const device &Device) { + return reinterpret_cast(getSyclObjImpl(Device)->getHandleRef()); +} + +static std::string deviceToString(device Device) { + if (Device.is_cpu()) + return "CPU"; + else if (Device.is_gpu()) + return "GPU"; + else if (Device.is_accelerator()) + return "ACCELERATOR"; + else + return "UNKNOWN"; +} + +static void addDeviceMetadata(xpti_td* TraceEvent, const QueueImplPtr& Queue) { xpti::addMetadata(TraceEvent, "sycl_device", - Queue ? deviceToID(MQueue->get_device()) : nullptr); + Queue ? deviceToID(Queue->get_device()) : 0); xpti::addMetadata(TraceEvent, "sycl_device_type", - Queue ? deviceToString(MQueue->get_device()) : "host"); + Queue ? deviceToString(Queue->get_device()) : "host"); if (Queue) xpti::addMetadata(TraceEvent, "sycl_device_name", - getSyclObjImpl(MQueue->get_device())->getDeviceName()); + getSyclObjImpl(Queue->get_device())->getDeviceName()); } + #endif static ContextImplPtr getContext(const QueueImplPtr& Queue) @@ -113,17 +129,6 @@ static std::string demangleKernelName(std::string Name) { static std::string demangleKernelName(std::string Name) { return Name; } #endif -static std::string deviceToString(device Device) { - if (Device.is_cpu()) - return "CPU"; - else if (Device.is_gpu()) - return "GPU"; - else if (Device.is_accelerator()) - return "ACCELERATOR"; - else - return "UNKNOWN"; -} - void applyFuncOnFilteredArgs( const KernelArgMask *EliminatedArgMask, std::vector &Args, std::function Func) { @@ -158,12 +163,6 @@ void applyFuncOnFilteredArgs( } } -#ifdef XPTI_ENABLE_INSTRUMENTATION -static size_t deviceToID(const device &Device) { - return reinterpret_cast(getSyclObjImpl(Device)->getHandleRef()); -} -#endif - static std::string accessModeToString(access::mode Mode) { switch (Mode) { case access::mode::read: @@ -253,9 +252,8 @@ Command::getPiEvents(const std::vector &EventImpls) const { // At this stage dependency is definitely pi task and need to check if // current one is a host task. In this case we should not skip pi event due // to different sync mechanisms for different task types on in-order queue. - const QueueImplPtr &WorkerQueue = getWorkerQueue(); - if (WorkerQueue && EventImpl->getWorkerQueue() == WorkerQueue && - WorkerQueue->isInOrder() && !isHostTask()) + if (MWorkerQueue && EventImpl->getWorkerQueue() == MWorkerQueue && + MWorkerQueue->isInOrder() && !isHostTask()) continue; RetPiEvents.push_back(EventImpl->getHandleRef()); @@ -293,9 +291,8 @@ std::vector Command::getPiEventsBlocking( // At this stage dependency is definitely pi task and need to check if // current one is a host task. In this case we should not skip pi event due // to different sync mechanisms for different task types on in-order queue. 
- const QueueImplPtr &WorkerQueue = getWorkerQueue(); - if (MWorkerQueue && EventImpl->getWorkerQueue() == WorkerQueue && - WorkerQueue->isInOrder() && !isHostTask()) + if (MWorkerQueue && EventImpl->getWorkerQueue() == MWorkerQueue && + MWorkerQueue->isInOrder() && !isHostTask()) continue; RetPiEvents.push_back(EventImpl->getHandleRef()); @@ -431,7 +428,7 @@ class DispatchHostTask { } } #endif - MthisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); } HostTask.MHostTask.reset(); @@ -448,7 +445,7 @@ class DispatchHostTask { Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd); } catch (...) { auto CurrentException = std::current_exception(); - MthisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); } } }; @@ -471,7 +468,7 @@ void Command::waitForEvents(QueueImplPtr Queue, std::vector RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); const PluginPtr &Plugin = Queue->getPlugin(); if (MEvent != nullptr) @@ -682,8 +679,7 @@ void Command::makeTraceEventEpilog() { Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, std::vector &ToCleanUp) { - const QueueImplPtr &WorkerQueue = getWorkerQueue(); - const ContextImplPtr &WorkerContext = WorkerQueue ? WorkerQueue->getContextImplPtr() : nullptr; + const ContextImplPtr &WorkerContext = getWorkerContext(); // 1. Non-host events can be ignored if they are not fully initialized. // 2. Some types of commands do not produce PI events after they are @@ -714,14 +710,10 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, return ConnectionCmd; } -const ContextImplPtr &Command::getWorkerContext() const { - if (!MWorkerQueue) +ContextImplPtr Command::getWorkerContext() const { + if (!MQueue) return nullptr; - return MWorkerQueue->getContextImplPtr(); -} - -const QueueImplPtr &Command::getWorkerQueue() const { - return MWorkerQueue; + return MQueue->getContextImplPtr(); } bool Command::producesPiEvent() const { return true; } @@ -1054,7 +1046,7 @@ void AllocaCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "ALLOCA ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Link : " << this->MLinkedAllocaCmd << "\\n"; Stream << "\"];" << std::endl; @@ -1140,7 +1132,7 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA SUB BUF ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" + Stream << "ALLOCA SUB BUF ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n"; @@ -1254,7 +1246,7 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "RELEASE ON " << MQueue ? 
deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "RELEASE ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << " Alloca : " << MAllocaCmd << "\\n"; Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n"; Stream << "\"];" << std::endl; @@ -1309,7 +1301,7 @@ pi_int32 MapMemObject::enqueueImp() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; std::vector RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); *MDstPtr = MemoryManager::map( @@ -1324,7 +1316,7 @@ void MapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "MAP ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "MAP ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << "\"];" << std::endl; @@ -1391,7 +1383,7 @@ pi_int32 UnMapMemObject::enqueueImp() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; std::vector RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); MemoryManager::unmap(MDstAllocaCmd->getSYCLMemObj(), @@ -1405,7 +1397,7 @@ void UnMapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "UNMAP ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "UNMAP ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << "\"];" << std::endl; @@ -1452,11 +1444,10 @@ void MemCpyCommand::emitInstrumentationData() { xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); xpti::addMetadata(CmdTraceEvent, "copy_from", - MSrcQueue ? reinterpret_cast( - getSyclObjImpl(MSrcQueue->get_device()).get()) : nullptr); + MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0); xpti::addMetadata( CmdTraceEvent, "copy_to", - MQueue ? reinterpret_cast(getSyclObjImpl(MQueue->get_device()).get()): nullptr); + MQueue ? 
deviceToID(MQueue->get_device()): 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, @@ -1466,8 +1457,9 @@ void MemCpyCommand::emitInstrumentationData() { #endif } -const ContextImplPtr &MemCpyCommand::getWorkerContext() const { - return getWorkerQueue()->getContextImplPtr(); +ContextImplPtr MemCpyCommand::getWorkerContext() const { + assert(MWorkerQueue && "Worker queue for mem cpy command must be not nullptr"); + return MWorkerQueue->getContextImplPtr(); } bool MemCpyCommand::producesPiEvent() const { @@ -1499,7 +1491,7 @@ pi_int32 MemCpyCommand::enqueueImp() { sycl::detail::pi::PiEvent &Event = MEvent->getHandleRef(); auto RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); MemoryManager::copy( MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), @@ -1515,7 +1507,7 @@ void MemCpyCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\""; Stream << "ID = " << this << " ; "; - Stream << "MEMCPY ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "MEMCPY ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << "From: " << MSrcAllocaCmd << " is host: " << !MSrcQueue << "\\n"; Stream << "To: " << MDstAllocaCmd << " is host: " << !MQueue @@ -1573,7 +1565,7 @@ void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "UPDATE REQ ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "UPDATE REQ ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; bool IsReqOnBuffer = MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer; Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n"; @@ -1625,11 +1617,10 @@ void MemCpyCommandHost::emitInstrumentationData() { xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); xpti::addMetadata(CmdTraceEvent, "copy_from", - reinterpret_cast( - MSrcQueue ? getSyclObjImpl(MSrcQueue->get_device()).get()) : "nullptr"); + MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0); xpti::addMetadata( CmdTraceEvent, "copy_to", - MQueue ? reinterpret_cast(getSyclObjImpl(MQueue->get_device()).get()) : "nullptr"); + MQueue ? 
deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, @@ -1639,12 +1630,13 @@ void MemCpyCommandHost::emitInstrumentationData() { #endif } -const ContextImplPtr &MemCpyCommandHost::getWorkerContext() const { - return getWorkerQueue()->getContextImplPtr(); +ContextImplPtr MemCpyCommandHost::getWorkerContext() const { + assert(MWorkerQueue && "Worker queue for mem cpy host command must be not nullptr"); + return MWorkerQueue->getContextImplPtr(); } pi_int32 MemCpyCommandHost::enqueueImp() { - const QueueImplPtr &Queue = getWorkerQueue(); + const QueueImplPtr &Queue = MWorkerQueue; waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; std::vector RawEvents = getPiEvents(EventImpls); @@ -1660,7 +1652,7 @@ pi_int32 MemCpyCommandHost::enqueueImp() { return PI_SUCCESS; } - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); MemoryManager::copy( MSrcAllocaCmd->getSYCLMemObj(), MSrcAllocaCmd->getMemAllocation(), MSrcQueue, MSrcReq.MDims, MSrcReq.MMemoryRange, MSrcReq.MAccessRange, @@ -1671,8 +1663,8 @@ pi_int32 MemCpyCommandHost::enqueueImp() { return PI_SUCCESS; } -EmptyCommand::EmptyCommand(QueueImplPtr Queue) - : Command(CommandType::EMPTY_TASK, std::move(Queue)) { +EmptyCommand::EmptyCommand() + : Command(CommandType::EMPTY_TASK, nullptr) { emitInstrumentationDataProxy(); } @@ -1750,7 +1742,7 @@ void MemCpyCommandHost::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "MEMCPY HOST ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "MEMCPY HOST ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << "\"];" << std::endl; @@ -2055,7 +2047,7 @@ std::pair emitKernelInstrumentationData( if (CmdTraceEvent) { // Stash the queue_id mutable metadata in TLS xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + Queue ? Queue->getQueueID() : 0); instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc, KernelBundleImplPtr, SyclKernelName, @@ -2124,7 +2116,7 @@ void ExecCGCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "EXEC CG ON " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n"; + Stream << "EXEC CG ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; switch (MCommandGroup->getType()) { case detail::CG::Kernel: { @@ -2670,7 +2662,7 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { // submissions of the command buffer itself will not receive dependencies on // them, e.g. 
initial copies from host to device std::vector EventImpls = MPreparedDepsEvents; - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); std::vector RawEvents = getPiEvents(EventImpls); if (!RawEvents.empty()) { const PluginPtr &Plugin = MQueue->getPlugin(); @@ -2826,7 +2818,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; auto RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); bool DiscardPiEvent = MQueue && MQueue->supportsDiscardingPiEvents() && (MCommandGroup->getRequirements().size() == 0); @@ -3050,7 +3042,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { for (AllocaCommandBase *AllocaCmd : AllocaCmds) if (getContext(HostTask->MQueue) == - getContext(AllocaCmd->getQueue()) { + getContext(AllocaCmd->getQueue())) { auto MemArg = reinterpret_cast(AllocaCmd->getMemAllocation()); ReqToMem.emplace_back(std::make_pair(Req, MemArg)); @@ -3294,7 +3286,7 @@ void KernelFusionCommand::emitInstrumentationData() { // the kernel fusion list is still empty, so we don't have a terrible lot of // information we could attach to this node here. if (MFirstInstance && CmdTraceEvent) - addDeviceMetadata(CmdTraceEVent, MQueue); + addDeviceMetadata(CmdTraceEvent, MQueue); if (MFirstInstance) { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS @@ -3314,7 +3306,7 @@ void KernelFusionCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "KERNEL FUSION on " << MQueue ? deviceToString(MQueue->get_device()) : "host" << "\\n" + Stream << "KERNEL FUSION on " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n" << "FUSION LIST: {"; bool Initial = true; for (auto *Cmd : MFusionList) { @@ -3354,7 +3346,7 @@ pi_int32 UpdateCommandBufferCommand::enqueueImp() { waitForPreparedHostEvents(); std::vector EventImpls = MPreparedDepsEvents; auto RawEvents = getPiEvents(EventImpls); - flushCrossQueueDeps(EventImpls, getWorkerQueue()); + flushCrossQueueDeps(EventImpls, MWorkerQueue); for (auto &Node : MNodes) { auto CG = static_cast(Node->MCommandGroup.get()); diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 89cabd134a7e1..ea2ba3ea72118 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -223,11 +223,7 @@ class Command { /// Get the context of the queue this command will be submitted to. Could /// differ from the context of MQueue for memory copy commands. - virtual const ContextImplPtr &getWorkerContext() const; - - /// Get the queue this command will be submitted to. Could differ from MQueue - /// for memory copy commands. - const QueueImplPtr &getWorkerQueue() const; + virtual ContextImplPtr getWorkerContext() const; /// Returns true iff the command produces a PI event on non-host devices. virtual bool producesPiEvent() const; @@ -414,7 +410,7 @@ class Command { /// implement lock in the graph, or to merge several nodes into one. 
class EmptyCommand : public Command { public: - EmptyCommand(QueueImplPtr Queue); + EmptyCommand(); void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MRequirements[0]; } @@ -586,7 +582,7 @@ class MemCpyCommand : public Command { void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } void emitInstrumentationData() final; - const ContextImplPtr &getWorkerContext() const final; + ContextImplPtr getWorkerContext() const final; bool producesPiEvent() const final; private: @@ -610,7 +606,7 @@ class MemCpyCommandHost : public Command { void printDot(std::ostream &Stream) const final; const Requirement *getRequirement() const final { return &MDstReq; } void emitInstrumentationData() final; - const ContextImplPtr &getWorkerContext() const final; + ContextImplPtr getWorkerContext() const final; private: pi_int32 enqueueImp() final; diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index d9614e9ca9d51..8778ad6927c3e 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -54,15 +54,10 @@ static bool IsSuitableSubReq(const Requirement *Req) { return Req->MIsSubBuffer; } -static ContextImplPtr GetContext(const QueueImplPtr& Queue) -{ - return Queue ? Queue->getContextImplPtr() : nullptr; -} - -bool MemObjRecord::isSameContext(const QueueImplPtr& Queue) const +static bool isOnSameContext(const ContextImplPtr Context, const QueueImplPtr& Queue) { // Covers case for host usage (nullptr == nullptr) and existing device contexts comparison. - return MCurContext == (Queue ? Queue->getContextImplPtr() : nullptr); + return Context == queue_impl::getContext(Queue); } /// Checks if the required access mode is allowed under the current one. 
@@ -250,7 +245,7 @@ MemObjRecord *Scheduler::GraphBuilder::getOrInsertMemObjRecord( getOrCreateAllocaForReq(MemObject->MRecord.get(), Req, InteropQueuePtr, ToEnqueue); } else - MemObject->MRecord.reset(new MemObjRecord{GetContext(Queue), + MemObject->MRecord.reset(new MemObjRecord{queue_impl::getContext(Queue), LeafLimit, AllocateDependency}); MMemObjs.push_back(MemObject); @@ -289,7 +284,7 @@ void Scheduler::GraphBuilder::addNodeToLeaves( UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { - auto Context = GetContext(Queue); + auto Context = queue_impl::getContext(Queue); AllocaCommandBase *AllocaCmd = findAllocaForReq(Record, Req, Context); assert(AllocaCmd && "There must be alloca for requirement!"); @@ -353,7 +348,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( if (!AllocaCmdDst) throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); - auto Context = GetContext(Queue); + auto Context = queue_impl::getContext(Queue); std::set Deps = findDepsForReq(Record, Req, Context); Deps.insert(AllocaCmdDst); @@ -371,7 +366,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( // current context, need to find a parent alloca command for it (it must be // there) auto IsSuitableAlloca = [Record](AllocaCommandBase *AllocaCmd) { - bool Res = Record->isSameContext(AllocaCmd->getQueue()) && + bool Res = isOnSameContext(Record->MCurContext, AllocaCmd->getQueue()) && // Looking for a parent buffer alloca command AllocaCmd->getType() == Command::CommandType::ALLOCA; return Res; @@ -406,7 +401,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( if ((Req->MAccessMode == access::mode::discard_write) || (Req->MAccessMode == access::mode::discard_read_write)) { - Record->updateUsage(Context); + Record->MCurContext = Context; return nullptr; } else { // Full copy of buffer is needed to avoid loss of data that may be caused @@ -428,7 +423,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( addNodeToLeaves(Record, NewCmd, access::mode::read_write, ToEnqueue); for (Command *Cmd : ToCleanUp) cleanupCommand(Cmd); - Record->updateUsage(Context); + Record->MCurContext = Context; return NewCmd; } @@ -541,7 +536,7 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, AllocaCommandBase *HostAllocaCmd = getOrCreateAllocaForReq(Record, Req, nullptr, ToEnqueue); - if (Record->isSameContext(HostAllocaCmd->getQueue())) { + if (isOnSameContext(Record->MCurContext, HostAllocaCmd->getQueue())) { if (!isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) { remapMemoryObject(Record, Req, Req->MIsSubBuffer ? 
(static_cast( @@ -625,7 +620,7 @@ Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, if (Dep.MDepCommand) { auto DepQueue = Dep.MDepCommand->getQueue(); - CanBypassDep &= IsOnSameContext(Context, DepQueue); + CanBypassDep &= isOnSameContext(Context, DepQueue); } if (!CanBypassDep) { @@ -665,7 +660,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq( bool AllowConst) { auto IsSuitableAlloca = [&Context, Req, AllowConst](AllocaCommandBase *AllocaCmd) { - bool Res = IsOnSameContext(Context, AllocaCmd->getQueue()); + bool Res = isOnSameContext(Context, AllocaCmd->getQueue()); if (IsSuitableSubReq(Req)) { const Requirement *TmpReq = AllocaCmd->getRequirement(); Res &= AllocaCmd->getType() == Command::CommandType::ALLOCA_SUB_BUF; @@ -705,7 +700,7 @@ static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) { AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( MemObjRecord *Record, const Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { - auto Context = GetContext(Queue); + auto Context = queue_impl::getContext(Queue); AllocaCommandBase *AllocaCmd = findAllocaForReq( Record, Req, Context, /*AllowConst=*/false); @@ -761,7 +756,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( Record->MAllocaCommands.push_back(HostAllocaCmd); Record->MWriteLeaves.push_back(HostAllocaCmd, ToEnqueue); ++(HostAllocaCmd->MLeafCounter); - Record->updateUsage(nullptr); + Record->MCurContext = nullptr; } } } else { @@ -773,7 +768,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // new one. There could be situations when we could setup link with // "not" current allocation, but it will require memory copy. // Can setup link between cl and host allocations only - if ((Context != nullptr) + (Record->usedOnDevice()) == 1) { + if ((Context != nullptr) != (Record->MCurContext != nullptr)) { // Linked commands assume that the host allocation is reused by the // plugin runtime and that can lead to unnecessary copy overhead on // devices that do not support host unified memory. Do not link the @@ -829,7 +824,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( AllocaCmd->MIsActive = false; } else { LinkedAllocaCmd->MIsActive = false; - Record->updateUsage(Context); + Record->MCurContext =Context; std::set Deps = findDepsForReq(Record, Req, Context); @@ -1068,7 +1063,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( AllocaCmd = getOrCreateAllocaForReq(Record, Req, QueueForAlloca, ToEnqueue); - isSameCtx = Record->isSameContext(QueueForAlloca); + isSameCtx = isOnSameContext(Record->MCurContext, QueueForAlloca); } // If there is alloca command we need to check if the latest memory is in @@ -1076,7 +1071,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( if (isSameCtx) { // If the memory is already in the required host context, check if the // required access mode is valid, remap if not. 
- if (!Record->usedOnDevice() && + if (!Record->MCurContext && !isAccessModeAllowed(Req->MAccessMode, Record->MHostAccess)) { remapMemoryObject(Record, Req, Req->MIsSubBuffer @@ -1094,11 +1089,11 @@ void Scheduler::GraphBuilder::createGraphForCommand( if (isInteropTask) { const detail::CGHostTask &HT = static_cast(CG); - if (!(Record->isSameContext(HT.MQueue)) { + if (!isOnSameContext(Record->MCurContext, HT.MQueue)) { NeedMemMoveToHost = true; MemMoveTargetQueue = HT.MQueue; } - } else if (Queue && Record->usedOnDevice()) + } else if (Queue && Record->MCurContext) NeedMemMoveToHost = true; if (NeedMemMoveToHost) @@ -1107,7 +1102,7 @@ void Scheduler::GraphBuilder::createGraphForCommand( } std::set Deps = - findDepsForReq(Record, Req, GetContext(Queue)); + findDepsForReq(Record, Req, queue_impl::getContext(Queue)); for (Command *Dep : Deps) { if (Dep != NewCmd) { @@ -1709,7 +1704,7 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( AllocaCmd = getOrCreateAllocaForReq(Record, Req, Queue, ToEnqueue); - isSameCtx = Record->isSameContext(Queue); + isSameCtx = isOnSameContext(Record->MCurContext, Queue); } if (!isSameCtx) { @@ -1718,7 +1713,7 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( bool NeedMemMoveToHost = false; auto MemMoveTargetQueue = Queue; - if (Queue && Record->usedOnDevice()) + if (Queue && Record->MCurContext) NeedMemMoveToHost = true; if (NeedMemMoveToHost) @@ -1728,7 +1723,7 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); } std::set Deps = - findDepsForReq(Record, Req, GetContext(Queue)); + findDepsForReq(Record, Req, queue_impl::getContext(Queue)); for (Command *Dep : Deps) { if (Dep != NewCmd.get()) { diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 61f01863c477b..d3462872c9ddf 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -199,12 +199,11 @@ using FusionMap = std::unordered_map; /// There must be a single MemObjRecord for each SYCL memory object. /// /// \ingroup sycl_graph -class MemObjRecord { +struct MemObjRecord { MemObjRecord(ContextImplPtr Ctx, std::size_t LeafLimit, LeavesCollection::AllocateDependencyF AllocateDependency) : MReadLeaves{this, LeafLimit, AllocateDependency}, MWriteLeaves{this, LeafLimit, AllocateDependency}, MCurContext{Ctx} {} -public: // Contains all allocation commands for the memory object. std::vector MAllocaCommands; @@ -224,15 +223,6 @@ class MemObjRecord { // The mode this object can be accessed with from the host (host_accessor). // Valid only if the current usage is on host. access::mode MHostAccess = access::mode::read_write; - - void updateUsage(ContextImplPtr& NewContext) - { - MCurContext = NewContext; - } - - bool isSameContext(const QueueImplPtr& Queue) const; - - bool usedOnDevice() { return MCurContext != nullptr; } }; /// DPC++ graph scheduler class. 
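Patches 13 through 17 above all implement the same convention: a null queue_impl pointer now denotes host-side work, so the former Queue->is_host() / MQueue->is_host() checks become plain null checks, and anything derived from a queue (context, XPTI device metadata, queue ID) goes through small null-tolerant helpers. The following condensed sketch restates that convention using the helper names these patches introduce (getContext, isOnSameContext, addDeviceMetadata); it is an illustration assembled from the hunks above, not a verbatim copy of the final sources, and it assumes the surrounding SYCL runtime types (QueueImplPtr, ContextImplPtr, xpti_td) and utilities (deviceToID, deviceToString, getSyclObjImpl) are in scope.

    // "nullptr queue means host": derive a context only when a device queue exists.
    static ContextImplPtr getContext(const QueueImplPtr &Queue) {
      return Queue ? Queue->getContextImplPtr() : nullptr;
    }

    // Host-to-host comparison degenerates to nullptr == nullptr; device queues
    // are compared through their underlying contexts.
    static bool isOnSameContext(const ContextImplPtr &Context, const QueueImplPtr &Queue) {
      return Context == getContext(Queue);
    }

    // XPTI device metadata falls back to neutral values for host work.
    static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) {
      xpti::addMetadata(TraceEvent, "sycl_device",
                        Queue ? deviceToID(Queue->get_device()) : 0);
      xpti::addMetadata(TraceEvent, "sycl_device_type",
                        Queue ? deviceToString(Queue->get_device()) : "host");
      if (Queue)
        xpti::addMetadata(TraceEvent, "sycl_device_name",
                          getSyclObjImpl(Queue->get_device())->getDeviceName());
    }

Call sites follow the same shape, e.g. (MQueue ? deviceToString(MQueue->get_device()) : "host") in the printDot methods and MQueue ? MQueue->getQueueID() : 0 for the stashed queue ID, while device-only paths (kernel launch, barriers, semaphores, host pipes, command buffers) assert a non-null queue instead of asserting "not host".
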
From df27615254aff2efd52952930673920c521fd3fb Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 5 Jun 2024 08:49:20 -0700 Subject: [PATCH 18/52] almost buildable: build enabling 3 Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.hpp | 6 +++--- sycl/source/detail/scheduler/commands.cpp | 6 +++--- sycl/source/detail/scheduler/commands.hpp | 3 +-- 3 files changed, 7 insertions(+), 8 deletions(-) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 61f34c35c7baf..3bd7b6ea7ec0a 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -670,9 +670,9 @@ class queue_impl { MExceptions.PushBack(ExceptionPtr); } - // ThreadPool &getThreadPool() { - // return GlobalHandler::instance().getHostTaskThreadPool(); - // } + static ThreadPool &getThreadPool() { + return GlobalHandler::instance().getHostTaskThreadPool(); + } /// Gets the native handle of the SYCL queue. /// diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 05873f23f45a9..d0a790ed97059 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -65,7 +65,7 @@ static bool CurrentCodeLocationValid() { (FunctionName && FunctionName[0] != '\0'); } -static void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, +void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, xpti_td *TraceEvent, uint16_t Type, const void *Addr) { if (!(xptiCheckTraceEnabled(StreamID, Type) && TraceEvent)) @@ -2424,7 +2424,7 @@ pi_int32 enqueueImpCommandBufferKernel( &getMemAllocationFunc](sycl::detail::ArgDesc &Arg, size_t NextTrueIndex) { sycl::detail::SetArgBasedOnType(Plugin, PiKernel, DeviceImageImpl, - getMemAllocationFunc, Ctx, false, Arg, + getMemAllocationFunc, Ctx, Arg, NextTrueIndex); }; // Copy args for modification @@ -3066,7 +3066,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { // submitted to report exception origin properly. 
copySubmissionCodeLocation(); - getThreadPool().submit( + queue_impl::getThreadPool().submit( DispatchHostTask(this, std::move(ReqToMem))); MShouldCompleteEventIfPossible = false; diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index ea2ba3ea72118..628ccdf2593da 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -33,7 +33,6 @@ class node_impl; namespace detail { #ifdef XPTI_ENABLE_INSTRUMENTATION -bool CurrentCodeLocationValid(); void emitInstrumentationGeneral(uint32_t StreamID, uint64_t InstanceID, xpti_td *TraceEvent, uint16_t Type, const void *Addr); @@ -793,7 +792,7 @@ void SetArgBasedOnType( const detail::plugin &Plugin, sycl::detail::pi::PiKernel Kernel, const std::shared_ptr &DeviceImageImpl, const std::function &getMemAllocationFunc, - const sycl::context &Context, bool IsHost, detail::ArgDesc &Arg, + const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex); void applyFuncOnFilteredArgs( From eebc51933df59666baad0bb50100cb02dce4e485 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 5 Jun 2024 09:34:20 -0700 Subject: [PATCH 19/52] almost almost buildable: enable build 4 Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.hpp | 2 +- sycl/source/detail/scheduler/commands.cpp | 3 ++- sycl/source/handler.cpp | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 3bd7b6ea7ec0a..1315d32ecaa4f 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -844,7 +844,7 @@ class queue_impl { "function objects should use the sycl::handler API instead."); } - handler Handler(Self, PrimaryQueue, SecondaryQueue); + handler Handler(Self, PrimaryQueue, SecondaryQueue, false); Handler.saveCodeLoc(Loc); PreventSubmit = true; try { diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index d0a790ed97059..1683b874fba5d 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -402,8 +402,9 @@ class DispatchHostTask { try { // we're ready to call the user-defined lambda now if (HostTask.MHostTask->isInteropTask()) { + assert(HostTask.MQueue && "Submitted queue for host task must be device queue"); interop_handle IH{MReqToMem, HostTask.MQueue, - // HostTask.MQueue->getDeviceImplPtr(), + HostTask.MQueue->getDeviceImplPtr(), HostTask.MQueue->getContextImplPtr()}; HostTask.MHostTask->call(MThisCmd->MEvent->getHostProfilingInfo(), IH); diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index c0e0438d9cd2f..015d690d67e7d 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -80,12 +80,12 @@ void *getValueFromDynamicParameter( } // namespace detail -handler::handler(std::shared_ptr Queue) - : handler(Queue, Queue, nullptr) {} +handler::handler(std::shared_ptr Queue, bool) + : handler(Queue, Queue, nullptr, false) {} handler::handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue) + std::shared_ptr SecondaryQueue, bool) : MImpl(std::make_shared(std::move(PrimaryQueue), std::move(SecondaryQueue))), MQueue(std::move(Queue)) {} From c6fe5c8098daadcde4dd19241be937e146bf9a17 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 6 Jun 2024 10:12:13 -0700 Subject: [PATCH 20/52] buildable Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/device_impl.cpp | 7 ------- 
sycl/source/detail/device_impl.hpp | 5 ----- sycl/source/detail/stream_impl.cpp | 14 +++++--------- sycl/source/detail/stream_impl.hpp | 4 ---- 4 files changed, 5 insertions(+), 25 deletions(-) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index c677b9165d71f..ae3b04486d1ea 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -716,13 +716,6 @@ bool device_impl::has(aspect Aspect) const { PI_ERROR_INVALID_DEVICE); } -std::shared_ptr device_impl::getHostDeviceImpl() { - static std::shared_ptr HostImpl = - std::make_shared(); - - return HostImpl; -} - bool device_impl::isAssertFailSupported() const { return MIsAssertFailSupported; } diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index efec017d372f5..9249bbba59fe8 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -217,11 +217,6 @@ class device_impl { /// \return true if the SYCL device has the given feature. bool has(aspect Aspect) const; - /// Gets the single instance of the Host Device - /// - /// \return the host device_impl singleton - static std::shared_ptr getHostDeviceImpl(); - bool isAssertFailSupported() const; bool isRootDevice() const { return MRootDevice == nullptr; } diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 4550b5cc26629..7268293433e82 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -94,12 +94,12 @@ void stream_impl::initStreamHost(QueueImplPtr Queue) { } void stream_impl::flush(const EventImplPtr &LeadEvent) { + assert(LeadEvent && "LeadEvent is expected to be not nullptr"); // We don't want stream flushing to be blocking operation that is why submit a // host task to print stream buffer. It will fire up as soon as the kernel // finishes execution. - auto Q = detail::createSyclObjFromImpl( - sycl::detail::Scheduler::getInstance().getDefaultHostQueue()); - event Event = Q.submit([&](handler &cgh) { + auto Q = LeadEvent->getSubmittedQueue(); + event Event = detail::createSyclObjFromImpl(Q).submit([&](handler &cgh) { auto BufHostAcc = Buf_.get_access( cgh, range<1>(BufferSize_), id<1>(OffsetSize)); @@ -131,14 +131,10 @@ void stream_impl::flush(const EventImplPtr &LeadEvent) { fflush(stdout); }); }); - if (LeadEvent) { - LeadEvent->attachEventToComplete(detail::getSyclObjImpl(Event)); - LeadEvent->getSubmittedQueue()->registerStreamServiceEvent( - detail::getSyclObjImpl(Event)); - } + LeadEvent->attachEventToComplete(detail::getSyclObjImpl(Event)); + Q->registerStreamServiceEvent(detail::getSyclObjImpl(Event)); } -void stream_impl::flush() { flush(nullptr); } } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/stream_impl.hpp b/sycl/source/detail/stream_impl.hpp index 823653016c162..cd3d503b4b894 100644 --- a/sycl/source/detail/stream_impl.hpp +++ b/sycl/source/detail/stream_impl.hpp @@ -49,10 +49,6 @@ class __SYCL_EXPORT stream_impl { // LeadEvent as well as in queue LeadEvent associated with. 
void flush(const EventImplPtr &LeadEvent); - // Enqueue task to copy stream buffer to the host and print the contents - // Remove during next ABI breaking window - void flush(); - size_t size() const noexcept; size_t get_work_item_buffer_size() const; From 24669e2a82d3765cc08800d4e8691e0c2bc5b28b Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 6 Jun 2024 10:52:53 -0700 Subject: [PATCH 21/52] RT-buildable: enabling UT build Signed-off-by: Tikhomirova, Kseniya --- sycl/unittests/scheduler/AllocaLinking.cpp | 13 +++---------- .../scheduler/CommandsWaitForEvents.cpp | 10 ++-------- .../scheduler/EnqueueWithDependsOnDeps.cpp | 3 +-- sycl/unittests/scheduler/GraphCleanup.cpp | 11 +++-------- sycl/unittests/scheduler/InOrderQueueDeps.cpp | 11 +++-------- sycl/unittests/scheduler/LeafLimit.cpp | 2 -- .../scheduler/LeafLimitDiffContexts.cpp | 2 +- sycl/unittests/scheduler/LeavesCollection.cpp | 9 ++++----- .../scheduler/LinkedAllocaDependencies.cpp | 14 ++++---------- .../scheduler/NoHostUnifiedMemory.cpp | 19 +++++++------------ sycl/unittests/scheduler/QueueFlushing.cpp | 10 +++------- .../scheduler/SchedulerTestUtils.hpp | 3 +-- .../scheduler/StreamInitDependencyOnHost.cpp | 9 +++------ 13 files changed, 35 insertions(+), 81 deletions(-) diff --git a/sycl/unittests/scheduler/AllocaLinking.cpp b/sycl/unittests/scheduler/AllocaLinking.cpp index a77995a203da3..e15cf24761ee1 100644 --- a/sycl/unittests/scheduler/AllocaLinking.cpp +++ b/sycl/unittests/scheduler/AllocaLinking.cpp @@ -47,13 +47,6 @@ static pi_result redefinedDeviceGetInfoAfter(pi_device Device, TEST_F(SchedulerTest, AllocaLinking) { HostUnifiedMemory = false; - // This host device constructor should be placed before Mock.redefine - // because it overrides the real implementation of get_device_info - // which is needed when creating a host device. 
- device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - std::shared_ptr DefaultHostQueue{ - new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; sycl::unittest::PiMock Mock; sycl::queue Q{Mock.getPlatform().get_devices()[0]}; @@ -73,7 +66,7 @@ TEST_F(SchedulerTest, AllocaLinking) { detail::AllocaCommandBase *NonHostAllocaCmd = MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds); detail::AllocaCommandBase *HostAllocaCmd = - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); EXPECT_FALSE(HostAllocaCmd->MLinkedAllocaCmd); EXPECT_FALSE(NonHostAllocaCmd->MLinkedAllocaCmd); @@ -90,7 +83,7 @@ TEST_F(SchedulerTest, AllocaLinking) { detail::AllocaCommandBase *NonHostAllocaCmd = MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds); detail::AllocaCommandBase *HostAllocaCmd = - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); EXPECT_EQ(HostAllocaCmd->MLinkedAllocaCmd, NonHostAllocaCmd); EXPECT_EQ(NonHostAllocaCmd->MLinkedAllocaCmd, HostAllocaCmd); @@ -107,7 +100,7 @@ TEST_F(SchedulerTest, AllocaLinking) { detail::AllocaCommandBase *NonHostAllocaCmd = MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds); detail::AllocaCommandBase *HostAllocaCmd = - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); EXPECT_EQ(HostAllocaCmd->MLinkedAllocaCmd, NonHostAllocaCmd); EXPECT_EQ(NonHostAllocaCmd->MLinkedAllocaCmd, HostAllocaCmd); diff --git a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp index d893c33f5cc26..499a45d0fe70f 100644 --- a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp +++ b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp @@ -219,13 +219,7 @@ TEST_F(SchedulerTest, CommandsWaitForEvents) { std::shared_ptr E2( new detail::event_impl(TestContext->EventCtx2, Q2.get_context())); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - std::shared_ptr DefaultHostQueue(new detail::queue_impl( - detail::getSyclObjImpl(HostDevice), /*AsyncHandler=*/{}, - /*PropList=*/{})); - - MockCommand Cmd(DefaultHostQueue); + MockCommand Cmd(nullptr); std::vector> Events; Events.push_back(E1); @@ -233,7 +227,7 @@ TEST_F(SchedulerTest, CommandsWaitForEvents) { pi_event EventResult = nullptr; - Cmd.waitForEventsCall(DefaultHostQueue, Events, EventResult); + Cmd.waitForEventsCall(nullptr, Events, EventResult); ASSERT_TRUE(TestContext->EventCtx1WasWaited && TestContext->EventCtx2WasWaited) diff --git a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp index fc816d1a4f3af..bd7531c964716 100644 --- a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp +++ b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp @@ -83,7 +83,7 @@ class DependsOnTests : public ::testing::Test { detail::Command *NewCmd = MS.addCG( std::move(CmdGroup), - Type == TestCGType::HOST_TASK ? MS.getDefaultHostQueue() : QueueDevImpl, + Type == TestCGType::HOST_TASK ? 
nullptr : QueueDevImpl, ToEnqueue); EXPECT_EQ(ToEnqueue.size(), 0u); return NewCmd; @@ -167,7 +167,6 @@ class DependsOnTests : public ::testing::Test { TEST_F(DependsOnTests, EnqueueNoMemObjTwoHostTasks) { // Checks enqueue of two dependent host tasks - detail::QueueImplPtr QueueHostImpl = MS.getDefaultHostQueue(); std::vector Events; detail::Command *Cmd1 = diff --git a/sycl/unittests/scheduler/GraphCleanup.cpp b/sycl/unittests/scheduler/GraphCleanup.cpp index 3389769569e5e..e0ec582db065c 100644 --- a/sycl/unittests/scheduler/GraphCleanup.cpp +++ b/sycl/unittests/scheduler/GraphCleanup.cpp @@ -172,7 +172,7 @@ static void checkCleanupOnEnqueue(MockScheduler &MS, } static void checkCleanupOnLeafUpdate( - MockScheduler &MS, detail::QueueImplPtr &QueueImpl, buffer &Buf, + MockScheduler &MS, detail::QueueImplPtr QueueImpl, buffer &Buf, detail::Requirement &MockReq, std::function SchedulerCall) { bool CommandDeleted = false; @@ -247,15 +247,10 @@ TEST_F(SchedulerTest, PostEnqueueCleanup) { checkCleanupOnLeafUpdate( MS, QueueImpl, Buf, MockReq, [&](detail::MemObjRecord *Record) { detail::Command *Leaf = *Record->MWriteLeaves.begin(); - MS.addEmptyCmd(Leaf, {&MockReq}, QueueImpl, - detail::Command::BlockReason::HostTask, ToEnqueue); + MS.addEmptyCmd(Leaf, {&MockReq}, detail::Command::BlockReason::HostTask, ToEnqueue); }); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - detail::QueueImplPtr DefaultHostQueue{ - new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; checkCleanupOnLeafUpdate( - MS, DefaultHostQueue, Buf, MockReq, [&](detail::MemObjRecord *Record) { + MS, nullptr, Buf, MockReq, [&](detail::MemObjRecord *Record) { MS.getOrCreateAllocaForReq(Record, &MockReq, QueueImpl, ToEnqueue); }); // Check cleanup on exceeding leaf limit. diff --git a/sycl/unittests/scheduler/InOrderQueueDeps.cpp b/sycl/unittests/scheduler/InOrderQueueDeps.cpp index 337ef2ef3d403..c19b494f9c484 100644 --- a/sycl/unittests/scheduler/InOrderQueueDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueDeps.cpp @@ -77,11 +77,6 @@ TEST_F(SchedulerTest, InOrderQueueDeps) { sycl::detail::QueueImplPtr InOrderQueueImpl = detail::getSyclObjImpl(InOrderQueue); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - std::shared_ptr DefaultHostQueue{ - new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; - MockScheduler MS; int val; @@ -92,18 +87,18 @@ TEST_F(SchedulerTest, InOrderQueueDeps) { detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(InOrderQueueImpl, &Req, AuxCmds); MS.getOrCreateAllocaForReq(Record, &Req, InOrderQueueImpl, AuxCmds); - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); // Check that sequential memory movements submitted to the same in-order // queue do not depend on each other. 
detail::Command *Cmd = - MS.insertMemoryMove(Record, &Req, DefaultHostQueue, AuxCmds); + MS.insertMemoryMove(Record, &Req, nullptr, AuxCmds); detail::EnqueueResultT Res; auto ReadLock = MS.acquireGraphReadLock(); MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING); Cmd = MS.insertMemoryMove(Record, &Req, InOrderQueueImpl, AuxCmds); MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING); - Cmd = MS.insertMemoryMove(Record, &Req, DefaultHostQueue, AuxCmds); + Cmd = MS.insertMemoryMove(Record, &Req, nullptr, AuxCmds); MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING); } diff --git a/sycl/unittests/scheduler/LeafLimit.cpp b/sycl/unittests/scheduler/LeafLimit.cpp index 36d8f459a324a..f3417b297bc31 100644 --- a/sycl/unittests/scheduler/LeafLimit.cpp +++ b/sycl/unittests/scheduler/LeafLimit.cpp @@ -36,8 +36,6 @@ TEST_F(SchedulerTest, LeafLimit) { unittest::ScopedEnvVar DisabledCleanup{ DisableCleanupName, "1", detail::SYCLConfig::reset}; - sycl::queue HQueue(detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl())); MockScheduler MS; std::vector> LeavesToAdd; std::unique_ptr MockDepCmd; diff --git a/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp b/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp index 38d9ac784c09f..1af882a423af8 100644 --- a/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp +++ b/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp @@ -61,7 +61,7 @@ TEST_F(SchedulerTest, LeafLimitDiffContexts) { AllocaCmd = MS.getOrCreateAllocaForReq( Rec, &MockReq, detail::getSyclObjImpl(Queue), ToEnqueue); std::ignore = MS.getOrCreateAllocaForReq( - Rec, &MockReq, MS.getDefaultHostQueue(), ToEnqueue); + Rec, &MockReq, nullptr, ToEnqueue); DepCmd = std::make_unique(detail::getSyclObjImpl(Queue), MockReq); } diff --git a/sycl/unittests/scheduler/LeavesCollection.cpp b/sycl/unittests/scheduler/LeavesCollection.cpp index ea883041add66..39146ffaa95e8 100644 --- a/sycl/unittests/scheduler/LeavesCollection.cpp +++ b/sycl/unittests/scheduler/LeavesCollection.cpp @@ -37,9 +37,8 @@ createGenericCommand(const std::shared_ptr &Q) { } std::shared_ptr -createEmptyCommand(const std::shared_ptr &Q, - const Requirement &Req) { - EmptyCommand *Cmd = new EmptyCommand(Q); +createEmptyCommand(const Requirement &Req) { + EmptyCommand *Cmd = new EmptyCommand(); Cmd->addRequirement(/* DepCmd = */ nullptr, /* AllocaCmd = */ nullptr, &Req); Cmd->MBlockReason = Command::BlockReason::HostAccessor; return std::shared_ptr{Cmd}; @@ -97,7 +96,7 @@ TEST_F(LeavesCollectionTest, PushBack) { for (size_t Idx = 0; Idx < GenericCmdsCapacity * 4; ++Idx) { auto Cmd = Idx % 2 ? createGenericCommand(getSyclObjImpl(Q)) - : createEmptyCommand(getSyclObjImpl(Q), MockReq); + : createEmptyCommand(MockReq); Cmds.push_back(Cmd); LE.push_back(Cmds.back().get(), ToEnqueue); @@ -137,7 +136,7 @@ TEST_F(LeavesCollectionTest, Remove) { for (size_t Idx = 0; Idx < GenericCmdsCapacity * 4; ++Idx) { auto Cmd = Idx % 2 ? 
createGenericCommand(getSyclObjImpl(Q)) - : createEmptyCommand(getSyclObjImpl(Q), MockReq); + : createEmptyCommand(MockReq); Cmds.push_back(Cmd); if (LE.push_back(Cmds.back().get(), ToEnqueue)) diff --git a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp index 5ab9cfbb43f5a..6ae6b9bfc2344 100644 --- a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp +++ b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp @@ -64,28 +64,22 @@ TEST_F(SchedulerTest, LinkedAllocaDependencies) { sycl::queue Queue1{Dev}; sycl::detail::QueueImplPtr Q1 = sycl::detail::getSyclObjImpl(Queue1); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - std::shared_ptr DefaultHostQueue(new detail::queue_impl( - detail::getSyclObjImpl(HostDevice), /*AsyncHandler=*/{}, - /*PropList=*/{})); - auto AllocaDep = [](sycl::detail::Command *, sycl::detail::Command *, sycl::detail::MemObjRecord *, std::vector &) {}; std::shared_ptr Record{ - new sycl::detail::MemObjRecord(DefaultHostQueue->getContextImplPtr(), 10, + new sycl::detail::MemObjRecord(nullptr, 10, AllocaDep)}; MemObjMock MemObj(Record); Req.MSYCLMemObj = &MemObj; - sycl::detail::AllocaCommand AllocaCmd1(DefaultHostQueue, Req, false); + sycl::detail::AllocaCommand AllocaCmd1(nullptr, Req, false); Record->MAllocaCommands.push_back(&AllocaCmd1); - MockCommand DepCmd(DefaultHostQueue, Req); - MockCommand DepDepCmd(DefaultHostQueue, Req); + MockCommand DepCmd(nullptr, Req); + MockCommand DepDepCmd(nullptr, Req); DepCmd.MDeps.push_back({&DepDepCmd, DepDepCmd.getRequirement(), &AllocaCmd1}); DepDepCmd.MUsers.insert(&DepCmd); std::vector ToEnqueue; diff --git a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp index 635a8e9c3389c..20cf879d53daf 100644 --- a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp +++ b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp @@ -91,11 +91,6 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) { redefinedMemCreateWithNativeHandle); sycl::detail::QueueImplPtr QImpl = detail::getSyclObjImpl(Q); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - std::shared_ptr DefaultHostQueue{ - new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; - MockScheduler MS; // Check non-host alloca with non-discard access mode { @@ -113,10 +108,10 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) { // order to perform a memory move. EXPECT_EQ(Record->MAllocaCommands.size(), 2U); detail::AllocaCommandBase *HostAllocaCmd = Record->MAllocaCommands[0]; - EXPECT_TRUE(HostAllocaCmd->getQueue()->is_host()); + EXPECT_TRUE(HostAllocaCmd->getQueue() == nullptr); EXPECT_TRUE(!HostAllocaCmd->MLinkedAllocaCmd); EXPECT_TRUE(!NonHostAllocaCmd->MLinkedAllocaCmd); - EXPECT_TRUE(Record->MCurContext->is_host()); + EXPECT_TRUE(Record->MCurContext == nullptr); detail::Command *MemoryMove = MS.insertMemoryMove(Record, &Req, QImpl, AuxCmds); @@ -162,9 +157,9 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) { // another and the transfer is done via a write operation. 
std::vector AuxCmds; detail::MemObjRecord *Record = - MS.getOrInsertMemObjRecord(DefaultHostQueue, &Req, AuxCmds); + MS.getOrInsertMemObjRecord(nullptr, &Req, AuxCmds); detail::AllocaCommandBase *HostAllocaCmd = - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); EXPECT_EQ(Record->MAllocaCommands.size(), 1U); detail::AllocaCommandBase *NonHostAllocaCmd = MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds); @@ -190,14 +185,14 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) { detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(QImpl, &Req, AuxCmds); MS.getOrCreateAllocaForReq(Record, &Req, QImpl, AuxCmds); - MS.getOrCreateAllocaForReq(Record, &Req, DefaultHostQueue, AuxCmds); + MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); // Memory movement operations should be omitted for discard access modes. detail::Command *MemoryMove = - MS.insertMemoryMove(Record, &DiscardReq, DefaultHostQueue, AuxCmds); + MS.insertMemoryMove(Record, &DiscardReq, nullptr, AuxCmds); EXPECT_TRUE(MemoryMove == nullptr); // The current context for the record should still be modified. - EXPECT_EQ(Record->MCurContext, DefaultHostQueue->getContextImplPtr()); + EXPECT_EQ(Record->MCurContext, nullptr); } // Check that interoperability memory objects are initialized. { diff --git a/sycl/unittests/scheduler/QueueFlushing.cpp b/sycl/unittests/scheduler/QueueFlushing.cpp index c97428b9d55c6..330ff7e0f02d2 100644 --- a/sycl/unittests/scheduler/QueueFlushing.cpp +++ b/sycl/unittests/scheduler/QueueFlushing.cpp @@ -122,21 +122,17 @@ TEST_F(SchedulerTest, QueueFlushing) { QueueImplA}; testCommandEnqueue(&UnmapCmd, QueueImplB, MockReq); - device HostDevice = detail::createSyclObjFromImpl( - detail::device_impl::getHostDeviceImpl()); - detail::QueueImplPtr DefaultHostQueue{ - new detail::queue_impl(detail::getSyclObjImpl(HostDevice), {}, {})}; detail::AllocaCommand HostAllocaCmd = - detail::AllocaCommand(DefaultHostQueue, MockReq); + detail::AllocaCommand(nullptr, MockReq); detail::MemCpyCommand MemCpyCmd{MockReq, &AllocaCmd, MockReq, &HostAllocaCmd, - QueueImplA, DefaultHostQueue}; + QueueImplA, nullptr}; testCommandEnqueue(&MemCpyCmd, QueueImplB, MockReq); detail::MemCpyCommandHost MemCpyCmdHost{MockReq, &AllocaCmd, MockReq, &MockHostPtr, - QueueImplA, DefaultHostQueue}; + QueueImplA, nullptr}; testCommandEnqueue(&MemCpyCmdHost, QueueImplB, MockReq); std::unique_ptr CG{ diff --git a/sycl/unittests/scheduler/SchedulerTestUtils.hpp b/sycl/unittests/scheduler/SchedulerTestUtils.hpp index 88ced1f25904a..20f82f9165c01 100644 --- a/sycl/unittests/scheduler/SchedulerTestUtils.hpp +++ b/sycl/unittests/scheduler/SchedulerTestUtils.hpp @@ -189,10 +189,9 @@ class MockScheduler : public sycl::detail::Scheduler { sycl::detail::EmptyCommand * addEmptyCmd(sycl::detail::Command *Cmd, const std::vector &Reqs, - const sycl::detail::QueueImplPtr &Queue, sycl::detail::Command::BlockReason Reason, std::vector &ToEnqueue) { - return MGraphBuilder.addEmptyCmd(Cmd, Reqs, Queue, Reason, ToEnqueue); + return MGraphBuilder.addEmptyCmd(Cmd, Reqs, Reason, ToEnqueue); } sycl::detail::Command * diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index 18c0b3e1a8070..838b60809472c 100644 --- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -80,12 +80,9 @@ TEST_F(SchedulerTest, StreamInitDependencyOnHost) { 
unittest::ScopedEnvVar DisabledCleanup{ DisableCleanupName, "1", detail::SYCLConfig::reset}; - std::shared_ptr HQueueImpl(new detail::queue_impl( - detail::device_impl::getHostDeviceImpl(), /*AsyncHandler=*/{}, - /*PropList=*/{})); // Emulating processing of command group function - MockHandlerStreamInit MockCGH(HQueueImpl, true); + MockHandlerStreamInit MockCGH(nullptr, true); MockCGH.setType(detail::CG::Kernel); auto EmptyKernel = [](sycl::nd_item<1>) {}; @@ -114,11 +111,11 @@ TEST_F(SchedulerTest, StreamInitDependencyOnHost) { static_cast(MainCG.get())->getStreams(); ASSERT_EQ(Streams.size(), 1u) << "Invalid number of stream objects"; - Streams[0]->initStreamHost(HQueueImpl); + Streams[0]->initStreamHost(nullptr); MockScheduler MS; std::vector AuxCmds; - detail::Command *NewCmd = MS.addCG(std::move(MainCG), HQueueImpl, AuxCmds); + detail::Command *NewCmd = MS.addCG(std::move(MainCG), nullptr, AuxCmds); ASSERT_TRUE(!!NewCmd) << "Failed to add command group into scheduler"; ASSERT_GT(NewCmd->MDeps.size(), 0u) << "No deps appeared in the new exec kernel command"; From fcc7748699821b8a53db059de50b94dff5f96232 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Fri, 7 Jun 2024 03:42:25 -0700 Subject: [PATCH 22/52] RT-buildable: restore incorrectly deleted code Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/memory_manager.cpp | 28 ++++++++++++++-- sycl/source/detail/memory_manager.hpp | 3 ++ sycl/source/detail/scheduler/commands.cpp | 41 +++++++++++++++++++---- 3 files changed, 63 insertions(+), 9 deletions(-) diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index e2c22f794f587..461cf8b85915c 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -266,6 +266,11 @@ void MemoryManager::releaseMemObj(ContextImplPtr TargetContext, return; } + if (!TargetContext) { + MemObj->releaseHostMem(MemAllocation); + return; + } + const PluginPtr &Plugin = TargetContext->getPlugin(); memReleaseHelper(Plugin, pi::cast(MemAllocation)); } @@ -283,6 +288,19 @@ void *MemoryManager::allocate(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, OutEvent); } +void *MemoryManager::allocateHostMemory(SYCLMemObjI *MemObj, void *UserPtr, + bool HostPtrReadOnly, size_t Size, + const sycl::property_list &) { + std::ignore = HostPtrReadOnly; + std::ignore = Size; + + // Can return user pointer directly if it is not a nullptr. 
+ if (UserPtr) + return UserPtr; + + return MemObj->allocateHostMem(); +} + void *MemoryManager::allocateInteropMemObject( ContextImplPtr TargetContext, void *UserPtr, const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, @@ -379,9 +397,10 @@ void *MemoryManager::allocateMemBuffer( const ContextImplPtr &InteropContext, const sycl::property_list &PropsList, sycl::detail::pi::PiEvent &OutEventToWait) { void *MemPtr; - if (UserPtr && InteropContext) - MemPtr = - allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, + if (!TargetContext) + MemPtr = allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, PropsList); + else if (UserPtr && InteropContext) + MemPtr = allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); else MemPtr = allocateBufferObject(TargetContext, UserPtr, HostPtrReadOnly, Size, @@ -398,6 +417,9 @@ void *MemoryManager::allocateMemImage( const EventImplPtr &InteropEvent, const ContextImplPtr &InteropContext, const sycl::property_list &PropsList, sycl::detail::pi::PiEvent &OutEventToWait) { + if (!TargetContext) + return allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, + PropsList); if (UserPtr && InteropContext) return allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); diff --git a/sycl/source/detail/memory_manager.hpp b/sycl/source/detail/memory_manager.hpp index 7be17898bc0d9..deefda9ccd8ff 100644 --- a/sycl/source/detail/memory_manager.hpp +++ b/sycl/source/detail/memory_manager.hpp @@ -85,6 +85,9 @@ class __SYCL_EXPORT MemoryManager { static void releaseMemObj(ContextImplPtr TargetContext, SYCLMemObjI *MemObj, void *MemAllocation, void *UserPtr); + static void *allocateHostMemory(SYCLMemObjI *MemObj, void *UserPtr, + bool HostPtrReadOnly, size_t Size, + const sycl::property_list &PropsList); static void * allocateInteropMemObject(ContextImplPtr TargetContext, void *UserPtr, const EventImplPtr &InteropEvent, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 1683b874fba5d..b1713473f2de3 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -459,8 +459,38 @@ void Command::waitForPreparedHostEvents() const { void Command::waitForEvents(QueueImplPtr Queue, std::vector &EventImpls, sycl::detail::pi::PiEvent &Event) { - assert(Queue && "Device queue is expected here"); if (!EventImpls.empty()) { + if (!Queue) { + // Host queue can wait for events from different contexts, i.e. it may + // contain events with different contexts in its MPreparedDepsEvents. + // OpenCL 2.1 spec says that clWaitForEvents will return + // CL_INVALID_CONTEXT if events specified in the list do not belong to + // the same context. Thus we split all the events into per-context map. + // An example. We have two queues for the same CPU device: Q1, Q2. Thus + // we will have two different contexts for the same CPU device: C1, C2. + // Also we have default host queue. This queue is accessible via + // Scheduler. Now, let's assume we have three different events: E1(C1), + // E2(C1), E3(C2). The command's MPreparedDepsEvents will contain all + // three events (E1, E2, E3). Now, if piEventsWait is called for all + // three events we'll experience failure with CL_INVALID_CONTEXT 'cause + // these events refer to different contexts. 
+ std::map> + RequiredEventsPerContext; + + for (const EventImplPtr &Event : EventImpls) { + ContextImplPtr Context = Event->getContextImpl(); + assert(Context.get() && + "Only non-host events are expected to be waited for here"); + RequiredEventsPerContext[Context.get()].push_back(Event); + } + + for (auto &CtxWithEvents : RequiredEventsPerContext) { + std::vector RawEvents = + getPiEvents(CtxWithEvents.second); + CtxWithEvents.first->getPlugin()->call( + RawEvents.size(), RawEvents.data()); + } + } else { #ifndef NDEBUG for (const EventImplPtr &Event : EventImpls) assert(!Event->isHost() && @@ -477,6 +507,7 @@ void Command::waitForEvents(QueueImplPtr Queue, Plugin->call( Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event); } + } } /// It is safe to bind MPreparedDepsEvents and MPreparedHostDepsEvents @@ -700,13 +731,11 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, ContextImplPtr DepEventContext = DepEvent->getContextImpl(); // If contexts don't match we'll connect them using host task - if (DepEventContext == WorkerContext) - MPreparedDepsEvents.push_back(std::move(DepEvent)); - else - { + if (DepEventContext != WorkerContext && WorkerContext){ Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder; ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp); - } + } else + MPreparedDepsEvents.push_back(std::move(DepEvent)); return ConnectionCmd; } From 7aa76d9f1e51eb430909125e9c4acc54518c7e81 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Fri, 7 Jun 2024 05:59:28 -0700 Subject: [PATCH 23/52] RT buildable: check-sycl-AccessorTests passed Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.cpp | 2 +- sycl/source/detail/event_impl.hpp | 4 ++-- sycl/source/detail/scheduler/commands.cpp | 17 +++++++++-------- sycl/source/detail/sycl_mem_obj_t.cpp | 2 +- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index e34597aa008d1..e38c15e04879a 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -566,7 +566,7 @@ void event_impl::setCommand(void *Cmd) { MCommand = Cmd; auto TypedCommand = static_cast(Cmd); if (TypedCommand) - MIsHostTask = TypedCommand->isHostTask(); + MIsHostEvent = TypedCommand->getWorkerContext() == nullptr; } } // namespace detail diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 7c1eb99e3b286..237939ea37bd8 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -337,7 +337,7 @@ class event_impl { void setEnqueued() { MIsEnqueued = true; } - bool isHost() { return MIsHostTask; } + bool isHost() { return MIsHostEvent; } protected: // When instrumentation is enabled emits trace event for event wait begin and @@ -406,7 +406,7 @@ class event_impl { std::shared_ptr Context); std::atomic_bool MIsEnqueued{false}; - bool MIsHostTask{false}; + bool MIsHostEvent{false}; }; } // namespace detail diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index b1713473f2de3..f7b9805ff17ec 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -459,6 +459,11 @@ void Command::waitForPreparedHostEvents() const { void Command::waitForEvents(QueueImplPtr Queue, std::vector &EventImpls, sycl::detail::pi::PiEvent &Event) { + #ifndef NDEBUG + for (const EventImplPtr &Event : EventImpls) + assert(!Event->isHost() && + 
"Only non-host events are expected to be waited for here"); +#endif if (!EventImpls.empty()) { if (!Queue) { // Host queue can wait for events from different contexts, i.e. it may @@ -491,12 +496,6 @@ void Command::waitForEvents(QueueImplPtr Queue, RawEvents.size(), RawEvents.data()); } } else { -#ifndef NDEBUG - for (const EventImplPtr &Event : EventImpls) - assert(!Event->isHost() && - "Only non-host events are expected to be waited for here"); -#endif - std::vector RawEvents = getPiEvents(EventImpls); flushCrossQueueDeps(EventImpls, MWorkerQueue); @@ -1488,7 +1487,8 @@ void MemCpyCommand::emitInstrumentationData() { } ContextImplPtr MemCpyCommand::getWorkerContext() const { - assert(MWorkerQueue && "Worker queue for mem cpy command must be not nullptr"); + if (!MWorkerQueue) + return nullptr; return MWorkerQueue->getContextImplPtr(); } @@ -1661,7 +1661,8 @@ void MemCpyCommandHost::emitInstrumentationData() { } ContextImplPtr MemCpyCommandHost::getWorkerContext() const { - assert(MWorkerQueue && "Worker queue for mem cpy host command must be not nullptr"); + if (!MWorkerQueue) + return nullptr; return MWorkerQueue->getContextImplPtr(); } diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index 87f005fe8ca78..a95b9b43d7f5c 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -209,7 +209,7 @@ void SYCLMemObjT::detachMemoryObject( !MOwnNativeHandle || (MInteropContext && !MInteropContext->isOwnedByRuntime()); - if (MRecord && MRecord->MCurContext->isOwnedByRuntime() && + if (MRecord && MRecord->MCurContext && MRecord->MCurContext->isOwnedByRuntime() && !InteropObjectsUsed && (!MHostPtrProvided || MIsInternal)) Scheduler::getInstance().deferMemObjRelease(Self); } From dc4a94ea111456a188ec60eaeef7ff9a053bf3bd Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Fri, 7 Jun 2024 06:28:04 -0700 Subject: [PATCH 24/52] RT-buildable: enable unittests 2 Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.cpp | 3 ++- sycl/source/detail/scheduler/scheduler.cpp | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index e38c15e04879a..8f676a97f187d 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -81,7 +81,7 @@ void event_impl::waitInternal(bool *Success) { } void event_impl::setComplete() { - if (!MEvent) { + if (MIsHostEvent || !MEvent) { { std::unique_lock lock(MMutex); #ifndef NDEBUG @@ -126,6 +126,7 @@ const PluginPtr &event_impl::getPlugin() { void event_impl::setStateIncomplete() { MState = HES_NotComplete; } void event_impl::setContextImpl(const ContextImplPtr &Context) { + MIsHostEvent = Context == nullptr; MContext = Context; MIsContextInitialized = true; } diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 7e5db05daf01a..d3fe7b523e689 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -459,7 +459,8 @@ void Scheduler::NotifyHostTaskCompletion(Command *Cmd) { std::vector ToCleanUp; auto CmdEvent = Cmd->getEvent(); - auto QueueImpl = Cmd->getQueue(); + auto QueueImpl = CmdEvent->getSubmittedQueue(); + assert(QueueImpl && "Submitted queue for host task must not be null"); { ReadLockT Lock = acquireReadLock(); From 8c57888b2a5a733d248322287e599d0f08855444 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Fri, 7 Jun 2024 08:52:24 -0700 Subject: 
[PATCH 25/52] RT-buildable: unittests enabling 3 Signed-off-by: Tikhomirova, Kseniya --- .../source/detail/scheduler/graph_builder.cpp | 2 +- sycl/source/detail/stream_impl.cpp | 70 +++++++++---------- .../scheduler/StreamInitDependencyOnHost.cpp | 10 ++- 3 files changed, 44 insertions(+), 38 deletions(-) diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 8778ad6927c3e..6d3fbdd157618 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1342,7 +1342,7 @@ Command *Scheduler::GraphBuilder::connectDepEvent( CG::CodeplayHostTask, /* Payload */ {})); ConnectCmd = new ExecCGCommand( - std::move(ConnectCG), Cmd->getQueue()); + std::move(ConnectCG), nullptr); } catch (const std::bad_alloc &) { throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); } diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 7268293433e82..cb46510551a30 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -98,41 +98,41 @@ void stream_impl::flush(const EventImplPtr &LeadEvent) { // We don't want stream flushing to be blocking operation that is why submit a // host task to print stream buffer. It will fire up as soon as the kernel // finishes execution. - auto Q = LeadEvent->getSubmittedQueue(); - event Event = detail::createSyclObjFromImpl(Q).submit([&](handler &cgh) { - auto BufHostAcc = - Buf_.get_access( - cgh, range<1>(BufferSize_), id<1>(OffsetSize)); - // Create accessor to the flush buffer even if not using it yet. Otherwise - // kernel will be a leaf for the flush buffer and scheduler will not be able - // to cleanup the kernel. TODO: get rid of finalize method by using host - // accessor to the flush buffer. - auto FlushBufHostAcc = - FlushBuf_ - .get_access( - cgh); - cgh.host_task([=] { - if (!BufHostAcc.empty()) { - // SYCL 2020, 4.16: - // > If the totalBufferSize or workItemBufferSize limits are exceeded, - // > it is implementation-defined whether the streamed characters - // > exceeding the limit are output, or silently ignored/discarded, and - // > if output it is implementation-defined whether those extra - // > characters exceeding the workItemBufferSize limit count toward the - // > totalBufferSize limit. Regardless of this implementation defined - // > behavior of output exceeding the limits, no undefined or erroneous - // > behavior is permitted of an implementation when the limits are - // > exceeded. - // - // Defend against zero-sized buffers (although they'd have no practical - // use). - printf("%s", &(BufHostAcc[0])); - } - fflush(stdout); - }); - }); - LeadEvent->attachEventToComplete(detail::getSyclObjImpl(Event)); - Q->registerStreamServiceEvent(detail::getSyclObjImpl(Event)); + // auto Q = LeadEvent->getSubmittedQueue(); + // event Event = detail::createSyclObjFromImpl(Q).submit([&](handler &cgh) { + // auto BufHostAcc = + // Buf_.get_access( + // cgh, range<1>(BufferSize_), id<1>(OffsetSize)); + // // Create accessor to the flush buffer even if not using it yet. Otherwise + // // kernel will be a leaf for the flush buffer and scheduler will not be able + // // to cleanup the kernel. TODO: get rid of finalize method by using host + // // accessor to the flush buffer. 
+ // auto FlushBufHostAcc = + // FlushBuf_ + // .get_access( + // cgh); + // cgh.host_task([=] { + // if (!BufHostAcc.empty()) { + // // SYCL 2020, 4.16: + // // > If the totalBufferSize or workItemBufferSize limits are exceeded, + // // > it is implementation-defined whether the streamed characters + // // > exceeding the limit are output, or silently ignored/discarded, and + // // > if output it is implementation-defined whether those extra + // // > characters exceeding the workItemBufferSize limit count toward the + // // > totalBufferSize limit. Regardless of this implementation defined + // // > behavior of output exceeding the limits, no undefined or erroneous + // // > behavior is permitted of an implementation when the limits are + // // > exceeded. + // // + // // Defend against zero-sized buffers (although they'd have no practical + // // use). + // printf("%s", &(BufHostAcc[0])); + // } + // fflush(stdout); + // }); + // }); + // LeadEvent->attachEventToComplete(detail::getSyclObjImpl(Event)); + // Q->registerStreamServiceEvent(detail::getSyclObjImpl(Event)); } } // namespace detail diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index 838b60809472c..4b34a1f4d6828 100644 --- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -12,6 +12,7 @@ #include #include #include +#include using namespace sycl; @@ -81,8 +82,13 @@ TEST_F(SchedulerTest, StreamInitDependencyOnHost) { DisableCleanupName, "1", detail::SYCLConfig::reset}; + sycl::unittest::PiMock Mock; + sycl::platform Plt = Mock.getPlatform(); + sycl::queue Q(Plt.get_devices()[0]); + std::shared_ptr QImpl = detail::getSyclObjImpl(Q); + // Emulating processing of command group function - MockHandlerStreamInit MockCGH(nullptr, true); + MockHandlerStreamInit MockCGH(QImpl, true); MockCGH.setType(detail::CG::Kernel); auto EmptyKernel = [](sycl::nd_item<1>) {}; @@ -111,7 +117,7 @@ TEST_F(SchedulerTest, StreamInitDependencyOnHost) { static_cast(MainCG.get())->getStreams(); ASSERT_EQ(Streams.size(), 1u) << "Invalid number of stream objects"; - Streams[0]->initStreamHost(nullptr); + Streams[0]->initStreamHost(QImpl); MockScheduler MS; std::vector AuxCmds; From abfc5bfbdf48b8bfe48cfb17e68d9a91bb64ba9e Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 17 Jun 2024 07:49:32 -0700 Subject: [PATCH 26/52] tiny cleanup Signed-off-by: Tikhomirova, Kseniya --- .../source/detail/scheduler/graph_builder.cpp | 22 +++++++++---------- sycl/source/detail/scheduler/scheduler.hpp | 3 +-- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 6d3fbdd157618..1932f18d697ac 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -868,7 +868,7 @@ void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record, EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd( Command *Cmd, const std::vector &Reqs, Command::BlockReason Reason, - std::vector &ToEnqueue, const bool AddDepsToLeaves) { + std::vector &ToEnqueue) { EmptyCommand *EmptyCmd = new EmptyCommand(); if (!EmptyCmd) @@ -889,19 +889,17 @@ EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd( if (!Reqs.size()) Cmd->addUser(EmptyCmd); - if (AddDepsToLeaves) { - const std::vector &Deps = Cmd->MDeps; - std::vector ToCleanUp; - for (const DepDesc &Dep : Deps) { - const 
Requirement *Req = Dep.MDepRequirement; - MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj); + const std::vector &Deps = Cmd->MDeps; + std::vector ToCleanUp; + for (const DepDesc &Dep : Deps) { + const Requirement *Req = Dep.MDepRequirement; + MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj); - updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp); - addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue); - } - for (Command *Cmd : ToCleanUp) - cleanupCommand(Cmd); + updateLeaves({Cmd}, Record, Req->MAccessMode, ToCleanUp); + addNodeToLeaves(Record, EmptyCmd, Req->MAccessMode, ToEnqueue); } + for (Command *Cmd : ToCleanUp) + cleanupCommand(Cmd); return EmptyCmd; } diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index d3462872c9ddf..4e0bf465d59fd 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -742,8 +742,7 @@ class Scheduler { EmptyCommand *addEmptyCmd(Command *Cmd, const std::vector &Req, Command::BlockReason Reason, - std::vector &ToEnqueue, - const bool AddDepsToLeaves = true); + std::vector &ToEnqueue); void createGraphForCommand(Command *NewCmd, CG &CG, bool isInteropTask, std::vector &Reqs, From 75f6eab8dd7a8f5b008d1b955bad3c3fc36914ba Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 17 Jun 2024 07:21:30 -0700 Subject: [PATCH 27/52] move stream_impl flush Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.cpp | 19 ++++- sycl/source/detail/queue_impl.hpp | 3 +- sycl/source/detail/scheduler/scheduler.cpp | 11 --- sycl/source/detail/stream_impl.cpp | 83 ++++++------------- sycl/source/detail/stream_impl.hpp | 10 +-- .../scheduler/CommandsWaitForEvents.cpp | 2 +- .../scheduler/StreamInitDependencyOnHost.cpp | 62 -------------- 7 files changed, 49 insertions(+), 141 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 298d4078cc922..af7af19ede120 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -361,8 +361,10 @@ event queue_impl::submit_impl(const std::function &CGF, // Host and interop tasks, however, are not submitted to low-level runtimes // and require separate dependency management. const CG::CGTYPE Type = Handler.getType(); - event Event = detail::createSyclObjFromImpl( - std::make_shared()); + event Event = detail::createSyclObjFromImpl(std::make_shared()); + std::vector Streams; + if (Type == CG::Kernel) + Streams = std::move(Handler.MStreamStorage); if (PostProcess) { bool IsKernel = Type == CG::Kernel; @@ -380,6 +382,19 @@ event queue_impl::submit_impl(const std::function &CGF, finalizeHandler(Handler, Event); addEvent(Event); + + auto EventImpl = detail::getSyclObjImpl(Event); + for (auto &Stream : Streams) { + // We don't want stream flushing to be blocking operation that is why submit a + // host task to print stream buffer. It will fire up as soon as the kernel + // finishes execution. 
+ event FlushEvent = submit_impl([&](handler &ServiceCGH) { + Stream->generateFlushCommand(ServiceCGH); + }, Self, PrimaryQueue, SecondaryQueue, Loc, {}); + EventImpl->attachEventToComplete(detail::getSyclObjImpl(FlushEvent)); + registerStreamServiceEvent(detail::getSyclObjImpl(FlushEvent)); + } + return Event; } diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index c3d0c4c5752f8..e72ded829a798 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -13,10 +13,12 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include @@ -26,7 +28,6 @@ #include #include #include -#include #include #include #include diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index d3fe7b523e689..52eb59b225004 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -99,13 +99,6 @@ EventImplPtr Scheduler::addCG( EventImplPtr NewEvent = nullptr; const CG::CGTYPE Type = CommandGroup->getType(); std::vector AuxiliaryCmds; - std::vector Streams; - - if (Type == CG::Kernel) { - auto *CGExecKernelPtr = static_cast(CommandGroup.get()); - Streams = CGExecKernelPtr->getStreams(); - CGExecKernelPtr->clearStreams(); - } std::vector> AuxiliaryResources; AuxiliaryResources = CommandGroup->getAuxiliaryResources(); CommandGroup->clearAuxiliaryResources(); @@ -143,10 +136,6 @@ EventImplPtr Scheduler::addCG( if (ShouldEnqueue) { enqueueCommandForCG(NewEvent, AuxiliaryCmds); - - for (const auto &StreamImplPtr : Streams) { - StreamImplPtr->flush(NewEvent); - } } if (!AuxiliaryResources.empty()) diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index cb46510551a30..7d926fbdb83dd 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -76,65 +76,36 @@ size_t stream_impl::get_size() const { return BufferSize_; } size_t stream_impl::get_max_statement_size() const { return MaxStatementSize_; } -void stream_impl::initStreamHost(QueueImplPtr Queue) { - // Real size of full flush buffer is saved only in buffer_impl field of - // FlushBuf object. - size_t FlushBufSize = getSyclObjImpl(FlushBuf_)->size(); - - auto Q = createSyclObjFromImpl(Queue); - Q.submit([&](handler &cgh) { - auto FlushBufAcc = FlushBuf_.get_access( - cgh, range<1>(1), id<1>(0)); - cgh.host_task([=] { - char *FlushBufPtr = FlushBufAcc.get_pointer(); - std::memset(FlushBufPtr, 0, FlushBufSize); - }); +void stream_impl::generateFlushCommand(handler& cgh) +{ + // Create accessor to the flush buffer even if not using it yet. Otherwise + // kernel will be a leaf for the flush buffer and scheduler will not be able + // to cleanup the kernel. TODO: get rid of finalize method by using host + // accessor to the flush buffer. + host_accessor FlushBuffHostAcc(FlushBuf_, cgh); + host_accessor BufHostAcc (Buf_, cgh, range<1>(BufferSize_), id<1>(OffsetSize)); + + cgh.host_task([=] { + if (!BufHostAcc.empty()) { + // SYCL 2020, 4.16: + // > If the totalBufferSize or workItemBufferSize limits are exceeded, + // > it is implementation-defined whether the streamed characters + // > exceeding the limit are output, or silently ignored/discarded, and + // > if output it is implementation-defined whether those extra + // > characters exceeding the workItemBufferSize limit count toward the + // > totalBufferSize limit. 
Regardless of this implementation defined + // > behavior of output exceeding the limits, no undefined or erroneous + // > behavior is permitted of an implementation when the limits are + // > exceeded. + // + // Defend against zero-sized buffers (although they'd have no practical + // use). + printf("%s", &(BufHostAcc[0])); + } + fflush(stdout); }); } -void stream_impl::flush(const EventImplPtr &LeadEvent) { - assert(LeadEvent && "LeadEvent is expected to be not nullptr"); - // We don't want stream flushing to be blocking operation that is why submit a - // host task to print stream buffer. It will fire up as soon as the kernel - // finishes execution. - // auto Q = LeadEvent->getSubmittedQueue(); - // event Event = detail::createSyclObjFromImpl(Q).submit([&](handler &cgh) { - // auto BufHostAcc = - // Buf_.get_access( - // cgh, range<1>(BufferSize_), id<1>(OffsetSize)); - // // Create accessor to the flush buffer even if not using it yet. Otherwise - // // kernel will be a leaf for the flush buffer and scheduler will not be able - // // to cleanup the kernel. TODO: get rid of finalize method by using host - // // accessor to the flush buffer. - // auto FlushBufHostAcc = - // FlushBuf_ - // .get_access( - // cgh); - // cgh.host_task([=] { - // if (!BufHostAcc.empty()) { - // // SYCL 2020, 4.16: - // // > If the totalBufferSize or workItemBufferSize limits are exceeded, - // // > it is implementation-defined whether the streamed characters - // // > exceeding the limit are output, or silently ignored/discarded, and - // // > if output it is implementation-defined whether those extra - // // > characters exceeding the workItemBufferSize limit count toward the - // // > totalBufferSize limit. Regardless of this implementation defined - // // > behavior of output exceeding the limits, no undefined or erroneous - // // > behavior is permitted of an implementation when the limits are - // // > exceeded. - // // - // // Defend against zero-sized buffers (although they'd have no practical - // // use). - // printf("%s", &(BufHostAcc[0])); - // } - // fflush(stdout); - // }); - // }); - // LeadEvent->attachEventToComplete(detail::getSyclObjImpl(Event)); - // Q->registerStreamServiceEvent(detail::getSyclObjImpl(Event)); -} - } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/stream_impl.hpp b/sycl/source/detail/stream_impl.hpp index cd3d503b4b894..aacb495537943 100644 --- a/sycl/source/detail/stream_impl.hpp +++ b/sycl/source/detail/stream_impl.hpp @@ -41,14 +41,6 @@ class __SYCL_EXPORT stream_impl { // buffer and offset in the flush buffer GlobalOffsetAccessorT accessGlobalOffset(handler &CGH); - // Initialize flush buffers on host. - void initStreamHost(QueueImplPtr Queue); - - // Enqueue task to copy stream buffer to the host and print the contents - // The host task event is then registered for post processing in the - // LeadEvent as well as in queue LeadEvent associated with. 
- void flush(const EventImplPtr &LeadEvent); - size_t size() const noexcept; size_t get_work_item_buffer_size() const; @@ -67,6 +59,8 @@ class __SYCL_EXPORT stream_impl { return PropList_.get_property(); } + void generateFlushCommand(handler& cgh); + private: // Size of the stream buffer size_t BufferSize_; diff --git a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp index 499a45d0fe70f..43aa7a88775d7 100644 --- a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp +++ b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp @@ -163,7 +163,7 @@ TEST_F(SchedulerTest, StreamAUXCmdsWait) { auto EventImplProxy = std::static_pointer_cast(EventImpl); - ASSERT_TRUE(EventImplProxy->MPostCompleteEvents.size() == 1) + ASSERT_EQ(EventImplProxy->MPostCompleteEvents.size(), 1) << "Expected 1 post complete event"; Q.wait(); diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index 4b34a1f4d6828..d1e7f22aa9485 100644 --- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -74,65 +74,3 @@ static bool ValidateDepCommandsTree(const detail::Command *Cmd, return false; } - -TEST_F(SchedulerTest, StreamInitDependencyOnHost) { - // Disable post enqueue cleanup so that it doesn't interfere with dependency - // checks. - unittest::ScopedEnvVar DisabledCleanup{ - DisableCleanupName, "1", - detail::SYCLConfig::reset}; - - sycl::unittest::PiMock Mock; - sycl::platform Plt = Mock.getPlatform(); - sycl::queue Q(Plt.get_devices()[0]); - std::shared_ptr QImpl = detail::getSyclObjImpl(Q); - - // Emulating processing of command group function - MockHandlerStreamInit MockCGH(QImpl, true); - MockCGH.setType(detail::CG::Kernel); - - auto EmptyKernel = [](sycl::nd_item<1>) {}; - MockCGH - .setHostKernel, 1, class Empty>( - EmptyKernel); - MockCGH.setNDRangeDesc( - sycl::nd_range<1>{sycl::range<1>{1}, sycl::range<1>{1}}); - - // Emulating construction of stream object inside command group - detail::StreamImplPtr StreamImpl = - std::make_shared(1024, 200, MockCGH); - detail::GlobalBufAccessorT FlushBufAcc = - StreamImpl->accessGlobalFlushBuf(MockCGH); - MockCGH.addStream(StreamImpl); - - detail::SYCLMemObjI *FlushBufMemObjPtr = - detail::getSyclObjImpl(FlushBufAcc)->MSYCLMemObj; - ASSERT_TRUE(!!FlushBufMemObjPtr) - << "Memory object for stream flush buffer not initialized"; - - std::unique_ptr MainCG = MockCGH.finalize(); - - // Emulate call of Scheduler::addCG - std::vector Streams = - static_cast(MainCG.get())->getStreams(); - ASSERT_EQ(Streams.size(), 1u) << "Invalid number of stream objects"; - - Streams[0]->initStreamHost(QImpl); - - MockScheduler MS; - std::vector AuxCmds; - detail::Command *NewCmd = MS.addCG(std::move(MainCG), nullptr, AuxCmds); - ASSERT_TRUE(!!NewCmd) << "Failed to add command group into scheduler"; - ASSERT_GT(NewCmd->MDeps.size(), 0u) - << "No deps appeared in the new exec kernel command"; - - // Searching in dependencies for CG execution command that initializes flush - // buffer of a stream that is supposed to be used inside NewCmd's CG. 
- // Tree of dependencies should look like: - // [MAIN_CG] -> [EMPTY_NODE {FlushBufMemObj}] -> [FILL_CG {FlushBufMemObj}] -> - // [[ALLOC_TASK {FlushBufMemObj}] - std::vector DepCmdsTypes({CmdTypeTy::RUN_CG, // FILL_CG - CmdTypeTy::ALLOCA}); - ASSERT_TRUE(ValidateDepCommandsTree(NewCmd, DepCmdsTypes, FlushBufMemObjPtr)) - << "Dependency on stream flush buffer initialization not found"; -} From be12c01ecc837de0ff5f7f3c2f17ca34b03d921d Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 19 Jun 2024 04:44:06 -0700 Subject: [PATCH 28/52] test fix Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/device_impl.cpp | 3 ++ sycl/source/detail/event_impl.cpp | 30 +++++++++---------- sycl/source/detail/image_impl.cpp | 2 ++ .../scheduler/CommandsWaitForEvents.cpp | 2 +- 4 files changed, 21 insertions(+), 16 deletions(-) diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index 846972254f7d9..e24b6f6f2510e 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -327,6 +327,9 @@ bool device_impl::has(aspect Aspect) const { size_t return_size = 0; switch (Aspect) { + case aspect::host: + //Deprecated + return false; case aspect::cpu: return is_cpu(); case aspect::gpu: diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 0d2976e7ec271..93dc4b7fca1b1 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -155,15 +155,13 @@ event_impl::event_impl(const QueueImplPtr &Queue) MFallbackProfiling{MIsProfilingEnabled && Queue && Queue->isProfilingFallback()} { if (Queue) this->setContextImpl(Queue->getContextImplPtr()); - if (!Queue) { + else { MState.store(HES_NotComplete); - if (Queue->has_property()) { - MHostProfilingInfo.reset(new HostProfilingInfo()); - if (!MHostProfilingInfo) - throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), - "Out of host memory " + - codeToString(PI_ERROR_OUT_OF_HOST_MEMORY)); - } + MHostProfilingInfo.reset(new HostProfilingInfo()); + if (!MHostProfilingInfo) + throw sycl::exception(sycl::make_error_code(sycl::errc::runtime), + "Out of host memory " + + codeToString(PI_ERROR_OUT_OF_HOST_MEMORY)); return; } MState.store(HES_Complete); @@ -381,13 +379,15 @@ event_impl::get_info() { if (MState == HES_Discarded) return info::event_command_status::ext_oneapi_unknown; - // Command is enqueued and PiEvent is ready - if (MEvent) - return get_event_info( - this->getHandleRef(), this->getPlugin()); - // Command is blocked and not enqueued, PiEvent is not assigned yet - else if (MCommand) - return sycl::info::event_command_status::submitted; + if (!MIsHostEvent) { + // Command is enqueued and PiEvent is ready + if (MEvent) + return get_event_info( + this->getHandleRef(), this->getPlugin()); + // Command is blocked and not enqueued, PiEvent is not assigned yet + else if (MCommand) + return sycl::info::event_command_status::submitted; + } return MState.load() != HES_Complete ? 
sycl::info::event_command_status::submitted diff --git a/sycl/source/detail/image_impl.cpp b/sycl/source/detail/image_impl.cpp index 0b512ae1aedbe..e5bacd33fc70d 100644 --- a/sycl/source/detail/image_impl.cpp +++ b/sycl/source/detail/image_impl.cpp @@ -471,6 +471,8 @@ bool image_impl::checkImageFormat( } std::vector image_impl::getDevices(const ContextImplPtr Context) { + if (!Context) + return {}; return Context->get_info(); } diff --git a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp index 43aa7a88775d7..daf8599947ad2 100644 --- a/sycl/unittests/scheduler/CommandsWaitForEvents.cpp +++ b/sycl/unittests/scheduler/CommandsWaitForEvents.cpp @@ -163,7 +163,7 @@ TEST_F(SchedulerTest, StreamAUXCmdsWait) { auto EventImplProxy = std::static_pointer_cast(EventImpl); - ASSERT_EQ(EventImplProxy->MPostCompleteEvents.size(), 1) + ASSERT_EQ(EventImplProxy->MPostCompleteEvents.size(), 1u) << "Expected 1 post complete event"; Q.wait(); From e043ee01f185cecac5c0cbd2648853ac0ff4c6db Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 19 Jun 2024 05:35:10 -0700 Subject: [PATCH 29/52] restore & update ABI - not breaking Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/stream_impl.cpp | 9 +++++++++ sycl/source/detail/stream_impl.hpp | 9 +++++++++ sycl/test/abi/sycl_symbols_linux.dump | 17 +++++++++-------- 3 files changed, 27 insertions(+), 8 deletions(-) diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 7d926fbdb83dd..75c80745ec71c 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -106,6 +106,15 @@ void stream_impl::generateFlushCommand(handler& cgh) }); } + // ABI break: remove + void stream_impl::initStreamHost(QueueImplPtr ){}; + + // ABI break: remove + void stream_impl::flush(const EventImplPtr &) {}; + + // ABI break: remove + void stream_impl::flush() {}; + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/stream_impl.hpp b/sycl/source/detail/stream_impl.hpp index aacb495537943..4fc1f4b1d5a8a 100644 --- a/sycl/source/detail/stream_impl.hpp +++ b/sycl/source/detail/stream_impl.hpp @@ -41,6 +41,15 @@ class __SYCL_EXPORT stream_impl { // buffer and offset in the flush buffer GlobalOffsetAccessorT accessGlobalOffset(handler &CGH); + // ABI break: remove + void initStreamHost(QueueImplPtr); + + // ABI break: remove + void flush(const EventImplPtr &); + + // ABI break: remove + void flush(); + size_t size() const noexcept; size_t get_work_item_buffer_size() const; diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 0edaaa25b4ba1..c60fdb1318905 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -3119,6 +3119,7 @@ _ZN4sycl3_V15queue10mem_adviseEPKvmiRKSt6vectorINS0_5eventESaIS5_EERKNS0_6detail _ZN4sycl3_V15queue10wait_proxyERKNS0_6detail13code_locationE _ZN4sycl3_V15queue11submit_implESt8functionIFvRNS0_7handlerEEERKNS0_6detail13code_locationE _ZN4sycl3_V15queue11submit_implESt8functionIFvRNS0_7handlerEEES1_RKNS0_6detail13code_locationE +_ZN4sycl3_V15queue15ext_oneapi_prodEv _ZN4sycl3_V15queue17discard_or_returnERKNS0_5eventE _ZN4sycl3_V15queue18throw_asynchronousEv _ZN4sycl3_V15queue20memcpyToDeviceGlobalEPvPKvbmmRKSt6vectorINS0_5eventESaIS6_EE @@ -3230,6 +3231,7 @@ _ZN4sycl3_V16detail11stream_impl14initStreamHostESt10shared_ptrINS1_10queue_impl _ZN4sycl3_V16detail11stream_impl15accessGlobalBufERNS0_7handlerE 
_ZN4sycl3_V16detail11stream_impl18accessGlobalOffsetERNS0_7handlerE _ZN4sycl3_V16detail11stream_impl20accessGlobalFlushBufERNS0_7handlerE +_ZN4sycl3_V16detail11stream_impl20generateFlushCommandERNS0_7handlerE _ZN4sycl3_V16detail11stream_impl5flushERKSt10shared_ptrINS1_10event_implEE _ZN4sycl3_V16detail11stream_impl5flushEv _ZN4sycl3_V16detail11stream_implC1EmmRKNS0_13property_listE @@ -3621,6 +3623,7 @@ _ZN4sycl3_V17handler28memcpyToHostOnlyDeviceGlobalEPKvS3_mbmm _ZN4sycl3_V17handler28setStateExplicitKernelBundleEv _ZN4sycl3_V17handler30memcpyFromHostOnlyDeviceGlobalEPvPKvbmm _ZN4sycl3_V17handler30verifyUsedKernelBundleInternalENS0_6detail11string_viewE +_ZN4sycl3_V17handler32verifyDeviceHasProgressGuaranteeENS0_3ext6oneapi12experimental26forward_progress_guaranteeENS4_15execution_scopeES6_ _ZN4sycl3_V17handler34ext_oneapi_wait_external_semaphoreENS0_3ext6oneapi12experimental24interop_semaphore_handleE _ZN4sycl3_V17handler36ext_oneapi_signal_external_semaphoreENS0_3ext6oneapi12experimental24interop_semaphore_handleE _ZN4sycl3_V17handler6memcpyEPvPKvm @@ -3633,7 +3636,6 @@ _ZN4sycl3_V17handlerC1ESt10shared_ptrINS0_6detail10queue_implEEb _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_3ext6oneapi12experimental6detail10graph_implEE _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEES5_S5_b _ZN4sycl3_V17handlerC2ESt10shared_ptrINS0_6detail10queue_implEEb -_ZN4sycl3_V17handler32verifyDeviceHasProgressGuaranteeENS0_3ext6oneapi12experimental26forward_progress_guaranteeENS4_15execution_scopeES6_ _ZN4sycl3_V17samplerC1ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE _ZN4sycl3_V17samplerC1EP11_cl_samplerRKNS0_7contextE _ZN4sycl3_V17samplerC2ENS0_29coordinate_normalization_modeENS0_15addressing_modeENS0_14filtering_modeERKNS0_13property_listE @@ -3748,7 +3750,6 @@ _ZNK4sycl3_V15queue12has_propertyINS0_8property5queue16enable_profilingEEEbv _ZNK4sycl3_V15queue12has_propertyINS0_8property5queue4cuda18use_default_streamEEEbv _ZNK4sycl3_V15queue12has_propertyINS0_8property5queue8in_orderEEEbv _ZNK4sycl3_V15queue16ext_oneapi_emptyEv -_ZN4sycl3_V15queue15ext_oneapi_prodEv _ZNK4sycl3_V15queue16get_backend_infoINS0_4info6device15backend_versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv _ZNK4sycl3_V15queue16get_backend_infoINS0_4info6device7versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv _ZNK4sycl3_V15queue16get_backend_infoINS0_4info8platform7versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv @@ -3973,6 +3974,12 @@ _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device22m _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device22max_image_linear_widthEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device23max_image_linear_heightEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device26max_image_linear_row_pitchEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv +_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31sub_group_progress_capabilitiesILNS5_15execution_scopeE2EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv 
+_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31sub_group_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv +_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE1EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv +_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE2EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv +_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv +_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device32work_group_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_3ext8codeplay12experimental4info6device15supports_fusionEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_3ext8codeplay12experimental4info6device28max_registers_per_work_groupEEENS0_6detail11ABINeutralTINS9_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_4info6device10extensionsEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv @@ -4084,12 +4091,6 @@ _ZNK4sycl3_V16device13get_info_implINS0_4info6device7versionEEENS0_6detail11ABIN _ZNK4sycl3_V16device13get_info_implINS0_4info6device8atomic64EEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_4info6device8platformEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13get_info_implINS0_4info6device9vendor_idEEENS0_6detail11ABINeutralTINS6_19is_device_info_descIT_E11return_typeEE4typeEv -_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device32work_group_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv -_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31sub_group_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv -_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31sub_group_progress_capabilitiesILNS5_15execution_scopeE2EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv -_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE2EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv -_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE3EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv -_ZNK4sycl3_V16device13get_info_implINS0_3ext6oneapi12experimental4info6device31work_item_progress_capabilitiesILNS5_15execution_scopeE1EEEEENS0_6detail11ABINeutralTINSB_19is_device_info_descIT_E11return_typeEE4typeEv _ZNK4sycl3_V16device13has_extensionERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE 
_ZNK4sycl3_V16device14is_acceleratorEv _ZNK4sycl3_V16device16get_backend_infoINS0_4info6device15backend_versionEEENS0_6detail20is_backend_info_descIT_E11return_typeEv From cea7c7271f0172ea8b45db2b3b221d4d5cb11937 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 19 Jun 2024 05:48:29 -0700 Subject: [PATCH 30/52] clang git-clang-format run on changed files Signed-off-by: Tikhomirova, Kseniya --- sycl/source/context.cpp | 4 +- sycl/source/detail/context_impl.cpp | 3 +- sycl/source/detail/device_impl.cpp | 5 +- sycl/source/detail/device_impl.hpp | 12 +- sycl/source/detail/event_impl.cpp | 18 +- sycl/source/detail/memory_manager.cpp | 28 +-- sycl/source/detail/platform_impl.hpp | 8 +- sycl/source/detail/program_impl.cpp | 22 +-- sycl/source/detail/program_impl.hpp | 4 +- sycl/source/detail/queue_impl.cpp | 15 +- sycl/source/detail/queue_impl.hpp | 19 +- sycl/source/detail/scheduler/commands.cpp | 165 +++++++++--------- sycl/source/detail/scheduler/commands.hpp | 12 +- .../source/detail/scheduler/graph_builder.cpp | 59 +++---- sycl/source/detail/scheduler/scheduler.cpp | 8 +- sycl/source/detail/stream_impl.cpp | 21 +-- sycl/source/detail/stream_impl.hpp | 2 +- sycl/source/detail/sycl_mem_obj_t.cpp | 5 +- sycl/source/detail/usm/usm_impl.cpp | 48 ++--- .../scheduler/EnqueueWithDependsOnDeps.cpp | 3 +- sycl/unittests/scheduler/GraphCleanup.cpp | 3 +- sycl/unittests/scheduler/InOrderQueueDeps.cpp | 3 +- .../scheduler/LeafLimitDiffContexts.cpp | 4 +- sycl/unittests/scheduler/LeavesCollection.cpp | 3 +- .../scheduler/LinkedAllocaDependencies.cpp | 3 +- .../scheduler/NoHostUnifiedMemory.cpp | 3 +- sycl/unittests/scheduler/QueueFlushing.cpp | 10 +- .../scheduler/StreamInitDependencyOnHost.cpp | 2 +- 28 files changed, 239 insertions(+), 253 deletions(-) diff --git a/sycl/source/context.cpp b/sycl/source/context.cpp index 70b12836fc297..1261096b82047 100644 --- a/sycl/source/context.cpp +++ b/sycl/source/context.cpp @@ -56,13 +56,13 @@ context::context(const std::vector &DeviceList, throw invalid_parameter_error("DeviceList is empty.", PI_ERROR_INVALID_VALUE); } - + const auto &RefPlatform = detail::getSyclObjImpl(DeviceList[0].get_platform())->getHandleRef(); if (std::any_of(DeviceList.begin(), DeviceList.end(), [&](const device &CurrentDevice) { return (detail::getSyclObjImpl(CurrentDevice.get_platform()) - ->getHandleRef() != RefPlatform); + ->getHandleRef() != RefPlatform); })) throw invalid_parameter_error( "Can't add devices across platforms to a single context.", diff --git a/sycl/source/detail/context_impl.cpp b/sycl/source/detail/context_impl.cpp index 0c79ed2f70462..8ae13b345b250 100644 --- a/sycl/source/detail/context_impl.cpp +++ b/sycl/source/detail/context_impl.cpp @@ -33,8 +33,7 @@ context_impl::context_impl(const device &Device, async_handler AsyncHandler, : MOwnedByRuntime(true), MAsyncHandler(AsyncHandler), MDevices(1, Device), MContext(nullptr), MPlatform(detail::getSyclObjImpl(Device.get_platform())), - MPropList(PropList), - MSupportBufferLocationByDevices(NotChecked) { + MPropList(PropList), MSupportBufferLocationByDevices(NotChecked) { MKernelProgramCache.setContextPtr(this); } diff --git a/sycl/source/detail/device_impl.cpp b/sycl/source/detail/device_impl.cpp index e24b6f6f2510e..ebad36158cfc6 100644 --- a/sycl/source/detail/device_impl.cpp +++ b/sycl/source/detail/device_impl.cpp @@ -34,8 +34,7 @@ device_impl::device_impl(sycl::detail::pi::PiDevice Device, device_impl::device_impl(pi_native_handle InteropDeviceHandle, sycl::detail::pi::PiDevice Device, 
PlatformImplPtr Platform, const PluginPtr &Plugin) - : MDevice(Device), - MDeviceHostBaseTime(std::make_pair(0, 0)) { + : MDevice(Device), MDeviceHostBaseTime(std::make_pair(0, 0)) { bool InteroperabilityConstructor = false; if (Device == nullptr) { @@ -328,7 +327,7 @@ bool device_impl::has(aspect Aspect) const { switch (Aspect) { case aspect::host: - //Deprecated + // Deprecated return false; case aspect::cpu: return is_cpu(); diff --git a/sycl/source/detail/device_impl.hpp b/sycl/source/detail/device_impl.hpp index 9249bbba59fe8..a3344ecdd3870 100644 --- a/sycl/source/detail/device_impl.hpp +++ b/sycl/source/detail/device_impl.hpp @@ -64,18 +64,14 @@ class device_impl { /// For host device an exception is thrown /// /// \return non-constant reference to PI device - sycl::detail::pi::PiDevice &getHandleRef() { - return MDevice; - } + sycl::detail::pi::PiDevice &getHandleRef() { return MDevice; } /// Get constant reference to PI device /// /// For host device an exception is thrown /// /// \return constant reference to PI device - const sycl::detail::pi::PiDevice &getHandleRef() const { - return MDevice; - } + const sycl::detail::pi::PiDevice &getHandleRef() const { return MDevice; } /// Check if device is a CPU device /// @@ -90,9 +86,7 @@ class device_impl { /// Check if device is an accelerator device /// /// \return true if SYCL device is an accelerator device - bool is_accelerator() const { - return MType == PI_DEVICE_TYPE_ACC; - } + bool is_accelerator() const { return MType == PI_DEVICE_TYPE_ACC; } /// Return device type /// diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 93dc4b7fca1b1..7d91129f25b51 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -38,8 +38,8 @@ void event_impl::ensureContextInitialized() { return; const device SyclDevice; - this->setContextImpl(detail::queue_impl::getDefaultOrNew( - detail::getSyclObjImpl(SyclDevice))); + this->setContextImpl( + detail::queue_impl::getDefaultOrNew(detail::getSyclObjImpl(SyclDevice))); } event_impl::~event_impl() { @@ -134,8 +134,8 @@ void event_impl::setContextImpl(const ContextImplPtr &Context) { event_impl::event_impl(sycl::detail::pi::PiEvent Event, const context &SyclContext) : MIsContextInitialized(true), MEvent(Event), - MContext(detail::getSyclObjImpl(SyclContext)), - MIsFlushed(true), MState(HES_Complete) { + MContext(detail::getSyclObjImpl(SyclContext)), MIsFlushed(true), + MState(HES_Complete) { sycl::detail::pi::PiContext TempContext; getPlugin()->call( @@ -150,9 +150,9 @@ event_impl::event_impl(sycl::detail::pi::PiEvent Event, } event_impl::event_impl(const QueueImplPtr &Queue) - : MQueue{Queue}, - MIsProfilingEnabled{!Queue || Queue->MIsProfilingEnabled}, - MFallbackProfiling{MIsProfilingEnabled && Queue && Queue->isProfilingFallback()} { + : MQueue{Queue}, MIsProfilingEnabled{!Queue || Queue->MIsProfilingEnabled}, + MFallbackProfiling{MIsProfilingEnabled && Queue && + Queue->isProfilingFallback()} { if (Queue) this->setContextImpl(Queue->getContextImplPtr()); else { @@ -412,7 +412,7 @@ event_impl::get_backend_info() const { } // If the queue has been released, no platform will be associated // so return empty string. 
- return ""; + return ""; } template <> @@ -571,7 +571,7 @@ bool event_impl::isCompleted() { void event_impl::setCommand(void *Cmd) { MCommand = Cmd; - auto TypedCommand = static_cast(Cmd); + auto TypedCommand = static_cast(Cmd); if (TypedCommand) MIsHostEvent = TypedCommand->getWorkerContext() == nullptr; } diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 461cf8b85915c..6f30ceef8eb51 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -398,9 +398,11 @@ void *MemoryManager::allocateMemBuffer( sycl::detail::pi::PiEvent &OutEventToWait) { void *MemPtr; if (!TargetContext) - MemPtr = allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, PropsList); + MemPtr = + allocateHostMemory(MemObj, UserPtr, HostPtrReadOnly, Size, PropsList); else if (UserPtr && InteropContext) - MemPtr = allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, + MemPtr = + allocateInteropMemObject(TargetContext, UserPtr, InteropEvent, InteropContext, PropsList, OutEventToWait); else MemPtr = allocateBufferObject(TargetContext, UserPtr, HostPtrReadOnly, Size, @@ -665,7 +667,8 @@ void copyD2D(SYCLMemObjI *SYCLMemObj, sycl::detail::pi::PiMem SrcMem, sycl::detail::pi::PiEvent &OutEvent, const detail::EventImplPtr &OutEventImpl) { assert(SYCLMemObj && "The SYCLMemObj is nullptr"); - assert(SrcQueue && "Source mem object and target mem object queues are expected to be not nullptr"); + assert(SrcQueue && "Source mem object and target mem object queues are " + "expected to be not nullptr"); const sycl::detail::pi::PiQueue Queue = SrcQueue->getHandleRef(); const PluginPtr &Plugin = SrcQueue->getPlugin(); @@ -778,9 +781,9 @@ void MemoryManager::copy(SYCLMemObjI *SYCLMemObj, void *SrcMem, if (!SrcQueue) { if (!TgtQueue) copyH2H(SYCLMemObj, (char *)SrcMem, nullptr, DimSrc, SrcSize, - SrcAccessRange, SrcOffset, SrcElemSize, (char *)DstMem, - nullptr, DimDst, DstSize, DstAccessRange, DstOffset, - DstElemSize, std::move(DepEvents), OutEvent, OutEventImpl); + SrcAccessRange, SrcOffset, SrcElemSize, (char *)DstMem, nullptr, + DimDst, DstSize, DstAccessRange, DstOffset, DstElemSize, + std::move(DepEvents), OutEvent, OutEventImpl); else copyH2D(SYCLMemObj, (char *)SrcMem, nullptr, DimSrc, SrcSize, SrcAccessRange, SrcOffset, SrcElemSize, @@ -1235,7 +1238,8 @@ memcpyToDeviceGlobalUSM(QueueImplPtr Queue, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent, const detail::EventImplPtr &OutEventImpl) { - assert(Queue && "Copy to device global USM must be called with a valid device queue"); + assert(Queue && + "Copy to device global USM must be called with a valid device queue"); // Get or allocate USM memory for the device_global. 
DeviceGlobalUSMMem &DeviceGlobalUSM = DeviceGlobalEntry->getOrAllocateDeviceGlobalUSM(Queue); @@ -1337,7 +1341,9 @@ static void memcpyToDeviceGlobalDirect( size_t NumBytes, size_t Offset, const void *Src, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { - assert(Queue && "Direct copy to device global must be called with a valid device queue"); + assert( + Queue && + "Direct copy to device global must be called with a valid device queue"); sycl::detail::pi::PiProgram Program = getOrBuildProgramForDeviceGlobal(Queue, DeviceGlobalEntry); const PluginPtr &Plugin = Queue->getPlugin(); @@ -1352,7 +1358,8 @@ static void memcpyFromDeviceGlobalDirect( size_t NumBytes, size_t Offset, void *Dest, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { - assert(Queue && "Direct copy from device global must be called with a valid device queue"); + assert(Queue && "Direct copy from device global must be called with a valid " + "device queue"); sycl::detail::pi::PiProgram Program = getOrBuildProgramForDeviceGlobal(Queue, DeviceGlobalEntry); const PluginPtr &Plugin = Queue->getPlugin(); @@ -1762,7 +1769,8 @@ void MemoryManager::copy_image_bindless( sycl::detail::pi::PiImageRegion CopyExtent, const std::vector &DepEvents, sycl::detail::pi::PiEvent *OutEvent) { - assert(Queue && "Copy image bindless must be called with a valid device queue"); + assert(Queue && + "Copy image bindless must be called with a valid device queue"); assert((Flags == (sycl::detail::pi::PiImageCopyFlags) ext::oneapi::experimental::image_copy_flags::HtoD || Flags == (sycl::detail::pi::PiImageCopyFlags) diff --git a/sycl/source/detail/platform_impl.hpp b/sycl/source/detail/platform_impl.hpp index e13bd0a3a1b31..bc6278d54f32c 100644 --- a/sycl/source/detail/platform_impl.hpp +++ b/sycl/source/detail/platform_impl.hpp @@ -103,9 +103,7 @@ class platform_impl { } /// \return an instance of OpenCL cl_platform_id. - cl_platform_id get() const { - return pi::cast(MPlatform); - } + cl_platform_id get() const { return pi::cast(MPlatform); } /// Returns raw underlying plug-in platform handle. /// @@ -114,9 +112,7 @@ class platform_impl { /// is in use. /// /// \return a raw plug-in platform handle. - const sycl::detail::pi::PiPlatform &getHandleRef() const { - return MPlatform; - } + const sycl::detail::pi::PiPlatform &getHandleRef() const { return MPlatform; } /// Returns all available SYCL platforms in the system. 
/// diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp index 584b2487f5dee..df95614d872c3 100644 --- a/sycl/source/detail/program_impl.cpp +++ b/sycl/source/detail/program_impl.cpp @@ -220,22 +220,22 @@ void program_impl::compile_with_kernel_name(std::string KernelName, std::string CompileOptions) { std::lock_guard Lock(MMutex); throw_if_state_is_not(program_state::none); - create_pi_program_with_kernel_name( - KernelName, - /*JITCompilationIsRequired=*/(!CompileOptions.empty())); - compile(CompileOptions); + create_pi_program_with_kernel_name( + KernelName, + /*JITCompilationIsRequired=*/(!CompileOptions.empty())); + compile(CompileOptions); MState = program_state::compiled; } void program_impl::link(std::string LinkOptions) { std::lock_guard Lock(MMutex); throw_if_state_is_not(program_state::compiled); - check_device_feature_support(MDevices); - std::vector Devices(get_pi_devices()); - const PluginPtr &Plugin = getPlugin(); - const char *LinkOpts = SYCLConfig::get(); - if (!LinkOpts) { - LinkOpts = LinkOptions.c_str(); + check_device_feature_support(MDevices); + std::vector Devices(get_pi_devices()); + const PluginPtr &Plugin = getPlugin(); + const char *LinkOpts = SYCLConfig::get(); + if (!LinkOpts) { + LinkOpts = LinkOptions.c_str(); } // Plugin resets MProgram with a new pi_program as a result of the call to @@ -251,7 +251,7 @@ void program_impl::link(std::string LinkOptions) { Plugin->checkPiResult(Err); MLinkOptions = LinkOptions; MBuildOptions = LinkOptions; - MState = program_state::linked; + MState = program_state::linked; } bool program_impl::has_kernel(std::string KernelName, diff --git a/sycl/source/detail/program_impl.hpp b/sycl/source/detail/program_impl.hpp index 1fa8767774961..67c02e95734ab 100644 --- a/sycl/source/detail/program_impl.hpp +++ b/sycl/source/detail/program_impl.hpp @@ -216,9 +216,7 @@ class program_impl { } /// \return the Plugin associated with the context of this program. - const PluginPtr &getPlugin() const { - return MContext->getPlugin(); - } + const PluginPtr &getPlugin() const { return MContext->getPlugin(); } ContextImplPtr getContextImplPtr() const { return MContext; } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index af7af19ede120..83f33688ed0b1 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -361,7 +361,8 @@ event queue_impl::submit_impl(const std::function &CGF, // Host and interop tasks, however, are not submitted to low-level runtimes // and require separate dependency management. const CG::CGTYPE Type = Handler.getType(); - event Event = detail::createSyclObjFromImpl(std::make_shared()); + event Event = detail::createSyclObjFromImpl( + std::make_shared()); std::vector Streams; if (Type == CG::Kernel) Streams = std::move(Handler.MStreamStorage); @@ -385,12 +386,12 @@ event queue_impl::submit_impl(const std::function &CGF, auto EventImpl = detail::getSyclObjImpl(Event); for (auto &Stream : Streams) { - // We don't want stream flushing to be blocking operation that is why submit a - // host task to print stream buffer. It will fire up as soon as the kernel + // We don't want stream flushing to be blocking operation that is why submit + // a host task to print stream buffer. It will fire up as soon as the kernel // finishes execution. 
- event FlushEvent = submit_impl([&](handler &ServiceCGH) { - Stream->generateFlushCommand(ServiceCGH); - }, Self, PrimaryQueue, SecondaryQueue, Loc, {}); + event FlushEvent = submit_impl( + [&](handler &ServiceCGH) { Stream->generateFlushCommand(ServiceCGH); }, + Self, PrimaryQueue, SecondaryQueue, Loc, {}); EventImpl->attachEventToComplete(detail::getSyclObjImpl(FlushEvent)); registerStreamServiceEvent(detail::getSyclObjImpl(FlushEvent)); } @@ -707,7 +708,7 @@ void queue_impl::revisitUnenqueuedCommandsState( Deps.UnenqueuedCmdEvents.begin(), Deps.UnenqueuedCmdEvents.end(), [](const EventImplPtr &CommandEvent) { return (CommandEvent->isHost() ? CommandEvent->isCompleted() - : CommandEvent->isEnqueued()); + : CommandEvent->isEnqueued()); }), Deps.UnenqueuedCmdEvents.end()); } diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index e72ded829a798..d0a74cc80c793 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -13,8 +13,8 @@ #include #include #include -#include #include +#include #include #include #include @@ -194,14 +194,13 @@ class queue_impl { if (MDevice) { xpti::addMetadata(TEvent, "sycl_device_name", MDevice->getDeviceName()); - xpti::addMetadata( - TEvent, "sycl_device", - reinterpret_cast(MDevice->getHandleRef())); + xpti::addMetadata(TEvent, "sycl_device", + reinterpret_cast(MDevice->getHandleRef())); } xpti::addMetadata(TEvent, "is_inorder", MIsInorder); xpti::addMetadata(TEvent, "queue_id", MQueueID); xpti::addMetadata(TEvent, "queue_handle", - reinterpret_cast(getHandleRef())); + reinterpret_cast(getHandleRef())); }); // Also publish to TLS xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, MQueueID); @@ -257,9 +256,8 @@ class queue_impl { if (MDevice) { xpti::addMetadata(TEvent, "sycl_device_name", MDevice->getDeviceName()); - xpti::addMetadata( - TEvent, "sycl_device", - reinterpret_cast(MDevice->getHandleRef())); + xpti::addMetadata(TEvent, "sycl_device", + reinterpret_cast(MDevice->getHandleRef())); } xpti::addMetadata(TEvent, "is_inorder", MIsInorder); xpti::addMetadata(TEvent, "queue_id", MQueueID); @@ -751,9 +749,8 @@ class queue_impl { // tasks and host tasks is applicable for out of order queues only. Not neede // for in order ones. void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask); - - static ContextImplPtr getContext(const QueueImplPtr& Queue) - { + + static ContextImplPtr getContext(const QueueImplPtr &Queue) { return Queue ? Queue->getContextImplPtr() : nullptr; } diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index c751cf7438ae7..3d51fe7a1c12f 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -90,21 +90,19 @@ static std::string deviceToString(device Device) { return "UNKNOWN"; } -static void addDeviceMetadata(xpti_td* TraceEvent, const QueueImplPtr& Queue) -{ - xpti::addMetadata(TraceEvent, "sycl_device", - Queue ? deviceToID(Queue->get_device()) : 0); - xpti::addMetadata(TraceEvent, "sycl_device_type", - Queue ? deviceToString(Queue->get_device()) : "host"); - if (Queue) - xpti::addMetadata(TraceEvent, "sycl_device_name", +static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) { + xpti::addMetadata(TraceEvent, "sycl_device", + Queue ? deviceToID(Queue->get_device()) : 0); + xpti::addMetadata(TraceEvent, "sycl_device_type", + Queue ? 
deviceToString(Queue->get_device()) : "host"); + if (Queue) + xpti::addMetadata(TraceEvent, "sycl_device_name", getSyclObjImpl(Queue->get_device())->getDeviceName()); } #endif -static ContextImplPtr getContext(const QueueImplPtr& Queue) -{ +static ContextImplPtr getContext(const QueueImplPtr &Queue) { if (Queue) return Queue->getContextImplPtr(); return nullptr; @@ -350,10 +348,12 @@ class DispatchHostTask { PluginWithEvents.first->call(RawEvents.size(), RawEvents.data()); } catch (const sycl::exception &E) { - MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(std::current_exception()); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( + std::current_exception()); return (pi_result)E.get_cl_code(); } catch (...) { - MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(std::current_exception()); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( + std::current_exception()); return PI_ERROR_UNKNOWN; } } @@ -404,7 +404,8 @@ class DispatchHostTask { try { // we're ready to call the user-defined lambda now if (HostTask.MHostTask->isInteropTask()) { - assert(HostTask.MQueue && "Submitted queue for host task must be device queue"); + assert(HostTask.MQueue && + "Submitted queue for host task must be device queue"); interop_handle IH{MReqToMem, HostTask.MQueue, HostTask.MQueue->getDeviceImplPtr(), HostTask.MQueue->getContextImplPtr()}; @@ -431,7 +432,8 @@ class DispatchHostTask { } } #endif - MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( + CurrentException); } HostTask.MHostTask.reset(); @@ -448,7 +450,8 @@ class DispatchHostTask { Scheduler::getInstance().NotifyHostTaskCompletion(MThisCmd); } catch (...) { auto CurrentException = std::current_exception(); - MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException(CurrentException); + MThisCmd->MEvent->getSubmittedQueue()->reportAsyncException( + CurrentException); } } }; @@ -461,13 +464,13 @@ void Command::waitForPreparedHostEvents() const { void Command::waitForEvents(QueueImplPtr Queue, std::vector &EventImpls, sycl::detail::pi::PiEvent &Event) { - #ifndef NDEBUG - for (const EventImplPtr &Event : EventImpls) - assert(!Event->isHost() && - "Only non-host events are expected to be waited for here"); +#ifndef NDEBUG + for (const EventImplPtr &Event : EventImpls) + assert(!Event->isHost() && + "Only non-host events are expected to be waited for here"); #endif if (!EventImpls.empty()) { - if (!Queue) { + if (!Queue) { // Host queue can wait for events from different contexts, i.e. it may // contain events with different contexts in its MPreparedDepsEvents. // OpenCL 2.1 spec says that clWaitForEvents will return @@ -507,7 +510,7 @@ void Command::waitForEvents(QueueImplPtr Queue, MEvent->setHostEnqueueTime(); Plugin->call( Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event); - } + } } } @@ -716,7 +719,8 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, // 1. Non-host events can be ignored if they are not fully initialized. // 2. Some types of commands do not produce PI events after they are - // enqueued (e.g. alloca). Note that we can't check the pi event to make that distinction since the command might still be unenqueued at this point. + // enqueued (e.g. alloca). Note that we can't check the pi event to make that + // distinction since the command might still be unenqueued at this point. 
bool PiEventExpected = (!DepEvent->isHost() && DepEvent->isInitialized()); if (auto *DepCmd = static_cast(DepEvent->getCommand())) PiEventExpected &= DepCmd->producesPiEvent(); @@ -732,7 +736,7 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, ContextImplPtr DepEventContext = DepEvent->getContextImpl(); // If contexts don't match we'll connect them using host task - if (DepEventContext != WorkerContext && WorkerContext){ + if (DepEventContext != WorkerContext && WorkerContext) { Scheduler::GraphBuilder &GB = Scheduler::getInstance().MGraphBuilder; ConnectionCmd = GB.connectDepEvent(this, DepEvent, Dep, ToCleanUp); } else @@ -1006,7 +1010,7 @@ void AllocaCommandBase::emitInstrumentationData() { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + MQueue ? MQueue->getQueueID() : 0); } #endif } @@ -1066,9 +1070,9 @@ pi_int32 AllocaCommand::enqueueImp() { } // TODO: Check if it is correct to use std::move on stack variable and // delete it RawEvents below. - MMemAllocation = MemoryManager::allocate( - getContext(MQueue), getSYCLMemObj(), MInitFromUserData, HostPtr, - std::move(EventImpls), Event); + MMemAllocation = MemoryManager::allocate(getContext(MQueue), getSYCLMemObj(), + MInitFromUserData, HostPtr, + std::move(EventImpls), Event); return PI_SUCCESS; } @@ -1077,7 +1081,8 @@ void AllocaCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "ALLOCA ON " + << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Link : " << this->MLinkedAllocaCmd << "\\n"; Stream << "\"];" << std::endl; @@ -1163,8 +1168,8 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA SUB BUF ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") - << "\\n"; + Stream << "ALLOCA SUB BUF ON " + << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n"; Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n"; @@ -1266,9 +1271,9 @@ pi_int32 ReleaseCommand::enqueueImp() { if (SkipRelease) Command::waitForEvents(MQueue, EventImpls, Event); else { - MemoryManager::release( - getContext(MQueue), MAllocaCmd->getSYCLMemObj(), - MAllocaCmd->getMemAllocation(), std::move(EventImpls), Event); + MemoryManager::release(getContext(MQueue), MAllocaCmd->getSYCLMemObj(), + MAllocaCmd->getMemAllocation(), + std::move(EventImpls), Event); } return PI_SUCCESS; } @@ -1277,7 +1282,8 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "RELEASE ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "RELEASE ON " + << (MQueue ? 
deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << " Alloca : " << MAllocaCmd << "\\n"; Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n"; Stream << "\"];" << std::endl; @@ -1347,7 +1353,8 @@ void MapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "MAP ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "MAP ON " + << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << "\"];" << std::endl; @@ -1406,8 +1413,8 @@ bool UnMapMemObject::producesPiEvent() const { // so the execution of kernel B starts only on step 4. This workaround // restores the old behavior in this case until this is resolved. return MQueue && (MQueue->getDeviceImplPtr()->getBackend() != - backend::ext_oneapi_level_zero || - MEvent->getHandleRef() != nullptr); + backend::ext_oneapi_level_zero || + MEvent->getHandleRef() != nullptr); } pi_int32 UnMapMemObject::enqueueImp() { @@ -1428,7 +1435,8 @@ void UnMapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "UNMAP ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "UNMAP ON " + << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << "\"];" << std::endl; @@ -1476,13 +1484,12 @@ void MemCpyCommand::emitInstrumentationData() { reinterpret_cast(MAddress)); xpti::addMetadata(CmdTraceEvent, "copy_from", MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0); - xpti::addMetadata( - CmdTraceEvent, "copy_to", - MQueue ? deviceToID(MQueue->get_device()): 0); + xpti::addMetadata(CmdTraceEvent, "copy_to", + MQueue ? deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1539,11 +1546,10 @@ void MemCpyCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\""; Stream << "ID = " << this << " ; "; - Stream << "MEMCPY ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; - Stream << "From: " << MSrcAllocaCmd << " is host: " << !MSrcQueue - << "\\n"; - Stream << "To: " << MDstAllocaCmd << " is host: " << !MQueue - << "\\n"; + Stream << "MEMCPY ON " + << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "From: " << MSrcAllocaCmd << " is host: " << !MSrcQueue << "\\n"; + Stream << "To: " << MDstAllocaCmd << " is host: " << !MQueue << "\\n"; Stream << "\"];" << std::endl; @@ -1597,7 +1603,8 @@ void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "UPDATE REQ ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "UPDATE REQ ON " + << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; bool IsReqOnBuffer = MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer; Stream << "TYPE: " << (IsReqOnBuffer ? 
"Buffer" : "Image") << "\\n"; @@ -1649,14 +1656,13 @@ void MemCpyCommandHost::emitInstrumentationData() { xpti::addMetadata(CmdTraceEvent, "memory_object", reinterpret_cast(MAddress)); xpti::addMetadata(CmdTraceEvent, "copy_from", - MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0); - xpti::addMetadata( - CmdTraceEvent, "copy_to", - MQueue ? deviceToID(MQueue->get_device()) : 0); + MSrcQueue ? deviceToID(MSrcQueue->get_device()) : 0); + xpti::addMetadata(CmdTraceEvent, "copy_to", + MQueue ? deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1696,8 +1702,7 @@ pi_int32 MemCpyCommandHost::enqueueImp() { return PI_SUCCESS; } -EmptyCommand::EmptyCommand() - : Command(CommandType::EMPTY_TASK, nullptr) { +EmptyCommand::EmptyCommand() : Command(CommandType::EMPTY_TASK, nullptr) { emitInstrumentationDataProxy(); } @@ -1746,7 +1751,7 @@ void EmptyCommand::emitInstrumentationData() { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -1775,7 +1780,8 @@ void MemCpyCommandHost::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "MEMCPY HOST ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "MEMCPY HOST ON " + << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; Stream << "\"];" << std::endl; @@ -1814,7 +1820,7 @@ void UpdateHostRequirementCommand::emitInstrumentationData() { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + MQueue ? MQueue->getQueueID() : 0); makeTraceEventEpilog(); } #endif @@ -2082,7 +2088,7 @@ std::pair emitKernelInstrumentationData( if (CmdTraceEvent) { // Stash the queue_id mutable metadata in TLS xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - Queue ? Queue->getQueueID() : 0); + Queue ? Queue->getQueueID() : 0); instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc, KernelBundleImplPtr, SyclKernelName, @@ -2128,7 +2134,7 @@ void ExecCGCommand::emitInstrumentationData() { if (CmdTraceEvent) { xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + MQueue ? MQueue->getQueueID() : 0); MTraceEvent = static_cast(CmdTraceEvent); if (MCommandGroup->getType() == detail::CG::Kernel) { auto KernelCG = @@ -2151,7 +2157,8 @@ void ExecCGCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "EXEC CG ON " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "EXEC CG ON " + << (MQueue ? 
deviceToString(MQueue->get_device()) : "host") << "\\n"; switch (MCommandGroup->getType()) { case detail::CG::Kernel: { @@ -2247,8 +2254,7 @@ void SetArgBasedOnType( const PluginPtr &Plugin, sycl::detail::pi::PiKernel Kernel, const std::shared_ptr &DeviceImageImpl, const std::function &getMemAllocationFunc, - const sycl::context &Context, detail::ArgDesc &Arg, - size_t NextTrueIndex) { + const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex) { switch (Arg.MType) { case kernel_param_kind_t::kind_stream: break; @@ -2338,8 +2344,7 @@ static pi_result SetKernelParamsAndLaunch( auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc, &Queue](detail::ArgDesc &Arg, size_t NextTrueIndex) { SetArgBasedOnType(Plugin, Kernel, DeviceImageImpl, getMemAllocationFunc, - Queue->get_context(), Arg, - NextTrueIndex); + Queue->get_context(), Arg, NextTrueIndex); }; applyFuncOnFilteredArgs(EliminatedArgMask, Args, setFunc); @@ -2639,7 +2644,8 @@ enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, bool blocking, void *ptr, size_t size, std::vector &RawEvents, const detail::EventImplPtr &OutEventImpl, bool read) { - assert(Queue && "Queue with submitted read write host pipe could not be on host"); + assert(Queue && + "Queue with submitted read write host pipe could not be on host"); detail::HostPipeMapEntry *hostPipeEntry = ProgramManager::getInstance().getHostPipeEntry(PipeName); @@ -2856,7 +2862,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { flushCrossQueueDeps(EventImpls, MWorkerQueue); bool DiscardPiEvent = MQueue && MQueue->supportsDiscardingPiEvents() && - (MCommandGroup->getRequirements().size() == 0); + (MCommandGroup->getRequirements().size() == 0); sycl::detail::pi::PiEvent *Event = DiscardPiEvent ? nullptr : &MEvent->getHandleRef(); detail::EventImplPtr EventImpl = DiscardPiEvent ? 
nullptr : MEvent; @@ -2876,10 +2882,9 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { MemoryManager::copy( AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset, - Req->MElemSize, Copy->getDst(), - nullptr, Req->MDims, - Req->MAccessRange, Req->MAccessRange, /*DstOffset=*/{0, 0, 0}, - Req->MElemSize, std::move(RawEvents), MEvent->getHandleRef(), MEvent); + Req->MElemSize, Copy->getDst(), nullptr, Req->MDims, Req->MAccessRange, + Req->MAccessRange, /*DstOffset=*/{0, 0, 0}, Req->MElemSize, + std::move(RawEvents), MEvent->getHandleRef(), MEvent); return PI_SUCCESS; } @@ -2889,8 +2894,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { AllocaCommandBase *AllocaCmd = getAllocaForReq(Req); MemoryManager::copy( - AllocaCmd->getSYCLMemObj(), Copy->getSrc(), - nullptr, Req->MDims, + AllocaCmd->getSYCLMemObj(), Copy->getSrc(), nullptr, Req->MDims, Req->MAccessRange, Req->MAccessRange, /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, AllocaCmd->getMemAllocation(), MQueue, Req->MDims, Req->MMemoryRange, Req->MAccessRange, Req->MOffset, @@ -2937,7 +2941,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { std::vector &Args = ExecKernel->MArgs; if (MQueue->getDeviceImplPtr()->getBackend() == - backend::ext_intel_esimd_emulator) { + backend::ext_intel_esimd_emulator) { for (ArgDesc &Arg : Args) if (kernel_param_kind_t::kind_accessor == Arg.MType) { Requirement *Req = (Requirement *)(Arg.MPtr); @@ -2959,7 +2963,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { reinterpret_cast(ExecKernel->MHostKernel->getPtr()), NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], &NDRDesc.LocalSize[0], 0, nullptr, nullptr); - return PI_SUCCESS; + return PI_SUCCESS; } auto getMemAllocationFunc = [this](Requirement *Req) { @@ -3119,7 +3123,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::BarrierWaitlist: { - assert(MQueue && "Device queue must be present for barrier with wait list command"); + assert(MQueue && + "Device queue must be present for barrier with wait list command"); CGBarrier *Barrier = static_cast(MCommandGroup.get()); std::vector Events = Barrier->MEventsWaitWithBarrier; std::vector PiEvents = @@ -3224,7 +3229,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::SemaphoreSignal: { - assert(MQueue && "Device queue must be present for semaphore signal command"); + assert(MQueue && + "Device queue must be present for semaphore signal command"); CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get(); const detail::PluginPtr &Plugin = MQueue->getPlugin(); @@ -3348,7 +3354,7 @@ void KernelFusionCommand::emitInstrumentationData() { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + MQueue ? MQueue->getQueueID() : 0); xptiNotifySubscribers(MStreamID, NotificationTraceType, detail::GSYCLGraphEvent, static_cast(MTraceEvent), MInstanceID, @@ -3362,7 +3368,8 @@ void KernelFusionCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "KERNEL FUSION on " << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n" + Stream << "KERNEL FUSION on " + << (MQueue ? 
deviceToString(MQueue->get_device()) : "host") << "\\n" << "FUSION LIST: {"; bool Initial = true; for (auto *Cmd : MFusionList) { diff --git a/sycl/source/detail/scheduler/commands.hpp b/sycl/source/detail/scheduler/commands.hpp index 628ccdf2593da..63fb4853d88e4 100644 --- a/sycl/source/detail/scheduler/commands.hpp +++ b/sycl/source/detail/scheduler/commands.hpp @@ -373,10 +373,11 @@ class Command { std::string MSubmissionFunctionName; // This flag allows to control whether event should be set complete - // after successfull enqueue of command. Event is considered as "host" event if - // there is no backend representation of event (i.e. getHandleRef() return reference to nullptr value). - // By default the flag is set to true due to most of host operations are - // synchronous. The only asynchronous operation currently is host-task. + // after successfull enqueue of command. Event is considered as "host" event + // if there is no backend representation of event (i.e. getHandleRef() return + // reference to nullptr value). By default the flag is set to true due to most + // of host operations are synchronous. The only asynchronous operation + // currently is host-task. bool MShouldCompleteEventIfPossible = true; /// Indicates that the node will be freed by graph cleanup. Such nodes should @@ -792,8 +793,7 @@ void SetArgBasedOnType( const detail::plugin &Plugin, sycl::detail::pi::PiKernel Kernel, const std::shared_ptr &DeviceImageImpl, const std::function &getMemAllocationFunc, - const sycl::context &Context, detail::ArgDesc &Arg, - size_t NextTrueIndex); + const sycl::context &Context, detail::ArgDesc &Arg, size_t NextTrueIndex); void applyFuncOnFilteredArgs( const KernelArgMask *EliminatedArgMask, std::vector &Args, diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 2919932c4e788..2ac97baefb543 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -54,9 +54,10 @@ static bool IsSuitableSubReq(const Requirement *Req) { return Req->MIsSubBuffer; } -static bool isOnSameContext(const ContextImplPtr Context, const QueueImplPtr& Queue) -{ - // Covers case for host usage (nullptr == nullptr) and existing device contexts comparison. +static bool isOnSameContext(const ContextImplPtr Context, + const QueueImplPtr &Queue) { + // Covers case for host usage (nullptr == nullptr) and existing device + // contexts comparison. return Context == queue_impl::getContext(Queue); } @@ -289,8 +290,7 @@ UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( MemObjRecord *Record, Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { auto Context = queue_impl::getContext(Queue); - AllocaCommandBase *AllocaCmd = - findAllocaForReq(Record, Req, Context); + AllocaCommandBase *AllocaCmd = findAllocaForReq(Record, Req, Context); assert(AllocaCmd && "There must be alloca for requirement!"); UpdateHostRequirementCommand *UpdateCommand = new UpdateHostRequirementCommand(Queue, *Req, AllocaCmd, &Req->MData); @@ -298,8 +298,7 @@ UpdateHostRequirementCommand *Scheduler::GraphBuilder::insertUpdateHostReqCmd( // dependencies become invalid if requirement is stored by pointer. 
const Requirement *StoredReq = UpdateCommand->getRequirement(); - std::set Deps = - findDepsForReq(Record, Req, Context); + std::set Deps = findDepsForReq(Record, Req, Context); std::vector ToCleanUp; for (Command *Dep : Deps) { Command *ConnCmd = @@ -353,8 +352,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); auto Context = queue_impl::getContext(Queue); - std::set Deps = - findDepsForReq(Record, Req, Context); + std::set Deps = findDepsForReq(Record, Req, Context); Deps.insert(AllocaCmdDst); // Get parent allocation of sub buffer to perform full copy of whole buffer if (IsSuitableSubReq(Req)) { @@ -434,8 +432,7 @@ Command *Scheduler::GraphBuilder::insertMemoryMove( Command *Scheduler::GraphBuilder::remapMemoryObject( MemObjRecord *Record, Requirement *Req, AllocaCommandBase *HostAllocaCmd, std::vector &ToEnqueue) { - assert(!HostAllocaCmd->getQueue() && - "Host alloca command expected"); + assert(!HostAllocaCmd->getQueue() && "Host alloca command expected"); assert(HostAllocaCmd->MIsActive && "Active alloca command expected"); AllocaCommandBase *LinkedAllocaCmd = HostAllocaCmd->MLinkedAllocaCmd; @@ -490,8 +487,7 @@ Scheduler::GraphBuilder::addCopyBack(Requirement *Req, if (nullptr == Record || !Record->MMemModified) return nullptr; - std::set Deps = - findDepsForReq(Record, Req, nullptr); + std::set Deps = findDepsForReq(Record, Req, nullptr); AllocaCommandBase *SrcAllocaCmd = findAllocaForReq(Record, Req, Record->MCurContext); @@ -531,7 +527,8 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, auto SYCLMemObj = static_cast(Req->MSYCLMemObj); SYCLMemObj->handleWriteAccessorCreation(); } - // Host accessor is not attached to any queue so no QueueImplPtr object to be sent to getOrInsertMemObjRecord. + // Host accessor is not attached to any queue so no QueueImplPtr object to be + // sent to getOrInsertMemObjRecord. MemObjRecord *Record = getOrInsertMemObjRecord(nullptr, Req); if (MPrintOptionsArray[BeforeAddHostAcc]) printGraphAsDot("before_addHostAccessor"); @@ -556,8 +553,8 @@ Scheduler::GraphBuilder::addHostAccessor(Requirement *Req, insertUpdateHostReqCmd(Record, Req, nullptr, ToEnqueue); // Need empty command to be blocked until host accessor is destructed - EmptyCommand *EmptyCmd = - addEmptyCmd(UpdateHostAccCmd, {Req}, Command::BlockReason::HostAccessor, ToEnqueue); + EmptyCommand *EmptyCmd = addEmptyCmd( + UpdateHostAccCmd, {Req}, Command::BlockReason::HostAccessor, ToEnqueue); Req->MBlockedCmd = EmptyCmd; @@ -621,8 +618,7 @@ Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, CanBypassDep |= !doOverlap(Dep.MDepRequirement, Req); // Going through copying memory between contexts is not supported. - if (Dep.MDepCommand) - { + if (Dep.MDepCommand) { auto DepQueue = Dep.MDepCommand->getQueue(); CanBypassDep &= isOnSameContext(Context, DepQueue); } @@ -686,7 +682,8 @@ static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) { if (std::strcmp(HUMConfig, "1") == 0) return true; } - // host task & host accessor is covered with no device context but provide required support. + // host task & host accessor is covered with no device context but provide + // required support. 
if (Ctx == nullptr) return true; @@ -705,8 +702,8 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( MemObjRecord *Record, const Requirement *Req, const QueueImplPtr &Queue, std::vector &ToEnqueue) { auto Context = queue_impl::getContext(Queue); - AllocaCommandBase *AllocaCmd = findAllocaForReq( - Record, Req, Context, /*AllowConst=*/false); + AllocaCommandBase *AllocaCmd = + findAllocaForReq(Record, Req, Context, /*AllowConst=*/false); if (!AllocaCmd) { std::vector ToCleanUp; @@ -736,8 +733,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // TODO the case where the first alloca is made with a discard mode and // the user pointer is read-only is still not handled: it leads to // unnecessary copy on devices with unified host memory support. - const bool HostUnifiedMemory = - checkHostUnifiedMemory(Context); + const bool HostUnifiedMemory = checkHostUnifiedMemory(Context); SYCLMemObjI *MemObj = Req->MSYCLMemObj; const bool InitFromUserData = Record->MAllocaCommands.empty() && (HostUnifiedMemory || MemObj->isInterop()); @@ -828,10 +824,9 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( AllocaCmd->MIsActive = false; } else { LinkedAllocaCmd->MIsActive = false; - Record->MCurContext =Context; + Record->MCurContext = Context; - std::set Deps = - findDepsForReq(Record, Req, Context); + std::set Deps = findDepsForReq(Record, Req, Context); for (Command *Dep : Deps) { Command *ConnCmd = AllocaCmd->addDep( DepDesc{Dep, Req, LinkedAllocaCmd}, ToCleanUp); @@ -871,8 +866,7 @@ void Scheduler::GraphBuilder::markModifiedIfWrite(MemObjRecord *Record, EmptyCommand *Scheduler::GraphBuilder::addEmptyCmd( Command *Cmd, const std::vector &Reqs, - Command::BlockReason Reason, - std::vector &ToEnqueue) { + Command::BlockReason Reason, std::vector &ToEnqueue) { EmptyCommand *EmptyCmd = new EmptyCommand(); if (!EmptyCmd) @@ -1343,8 +1337,7 @@ Command *Scheduler::GraphBuilder::connectDepEvent( /* DepEvents = */ {DepEvent}), CG::CodeplayHostTask, /* Payload */ {})); - ConnectCmd = new ExecCGCommand( - std::move(ConnectCG), nullptr); + ConnectCmd = new ExecCGCommand(std::move(ConnectCG), nullptr); } catch (const std::bad_alloc &) { throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); } @@ -1719,13 +1712,11 @@ Command *Scheduler::GraphBuilder::addCommandGraphUpdate( NeedMemMoveToHost = true; if (NeedMemMoveToHost) - insertMemoryMove(Record, Req, - nullptr, - ToEnqueue); + insertMemoryMove(Record, Req, nullptr, ToEnqueue); insertMemoryMove(Record, Req, MemMoveTargetQueue, ToEnqueue); } std::set Deps = - findDepsForReq(Record, Req, queue_impl::getContext(Queue)); + findDepsForReq(Record, Req, queue_impl::getContext(Queue)); for (Command *Dep : Deps) { if (Dep != NewCmd.get()) { diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 52eb59b225004..4d26c2a822457 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -110,13 +110,13 @@ EventImplPtr Scheduler::addCG( Command *NewCmd = nullptr; switch (Type) { case CG::UpdateHost: - NewCmd = MGraphBuilder.addCGUpdateHost(std::move(CommandGroup), - AuxiliaryCmds); + NewCmd = + MGraphBuilder.addCGUpdateHost(std::move(CommandGroup), AuxiliaryCmds); NewEvent = NewCmd->getEvent(); break; case CG::CodeplayHostTask: { - auto Result = MGraphBuilder.addCG(std::move(CommandGroup), - nullptr, AuxiliaryCmds); + auto Result = + MGraphBuilder.addCG(std::move(CommandGroup), nullptr, AuxiliaryCmds); 
NewCmd = Result.NewCmd; NewEvent = Result.NewEvent; ShouldEnqueue = Result.ShouldEnqueue; diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 75c80745ec71c..7e81e964bdc17 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -76,14 +76,15 @@ size_t stream_impl::get_size() const { return BufferSize_; } size_t stream_impl::get_max_statement_size() const { return MaxStatementSize_; } -void stream_impl::generateFlushCommand(handler& cgh) -{ +void stream_impl::generateFlushCommand(handler &cgh) { // Create accessor to the flush buffer even if not using it yet. Otherwise // kernel will be a leaf for the flush buffer and scheduler will not be able // to cleanup the kernel. TODO: get rid of finalize method by using host // accessor to the flush buffer. - host_accessor FlushBuffHostAcc(FlushBuf_, cgh); - host_accessor BufHostAcc (Buf_, cgh, range<1>(BufferSize_), id<1>(OffsetSize)); + host_accessor FlushBuffHostAcc(FlushBuf_, + cgh); + host_accessor BufHostAcc( + Buf_, cgh, range<1>(BufferSize_), id<1>(OffsetSize)); cgh.host_task([=] { if (!BufHostAcc.empty()) { @@ -106,14 +107,14 @@ void stream_impl::generateFlushCommand(handler& cgh) }); } - // ABI break: remove - void stream_impl::initStreamHost(QueueImplPtr ){}; +// ABI break: remove +void stream_impl::initStreamHost(QueueImplPtr){}; - // ABI break: remove - void stream_impl::flush(const EventImplPtr &) {}; +// ABI break: remove +void stream_impl::flush(const EventImplPtr &) {}; - // ABI break: remove - void stream_impl::flush() {}; +// ABI break: remove +void stream_impl::flush() {}; } // namespace detail } // namespace _V1 diff --git a/sycl/source/detail/stream_impl.hpp b/sycl/source/detail/stream_impl.hpp index 4fc1f4b1d5a8a..670931c815185 100644 --- a/sycl/source/detail/stream_impl.hpp +++ b/sycl/source/detail/stream_impl.hpp @@ -68,7 +68,7 @@ class __SYCL_EXPORT stream_impl { return PropList_.get_property(); } - void generateFlushCommand(handler& cgh); + void generateFlushCommand(handler &cgh); private: // Size of the stream buffer diff --git a/sycl/source/detail/sycl_mem_obj_t.cpp b/sycl/source/detail/sycl_mem_obj_t.cpp index 7440a3b816ce2..68207bec67d53 100644 --- a/sycl/source/detail/sycl_mem_obj_t.cpp +++ b/sycl/source/detail/sycl_mem_obj_t.cpp @@ -209,8 +209,9 @@ void SYCLMemObjT::detachMemoryObject( !MOwnNativeHandle || (MInteropContext && !MInteropContext->isOwnedByRuntime()); - if (MRecord && MRecord->MCurContext && MRecord->MCurContext->isOwnedByRuntime() && - !InteropObjectsUsed && (!MHostPtrProvided || MIsInternal)) { + if (MRecord && MRecord->MCurContext && + MRecord->MCurContext->isOwnedByRuntime() && !InteropObjectsUsed && + (!MHostPtrProvided || MIsInternal)) { bool okToDefer = GlobalHandler::instance().isOkToDefer(); if (okToDefer) Scheduler::getInstance().deferMemObjRelease(Self); diff --git a/sycl/source/detail/usm/usm_impl.cpp b/sycl/source/detail/usm/usm_impl.cpp index 753c27d5f678d..57c54275069e6 100755 --- a/sycl/source/detail/usm/usm_impl.cpp +++ b/sycl/source/detail/usm/usm_impl.cpp @@ -73,33 +73,33 @@ void *alignedAllocHost(size_t Alignment, size_t Size, const context &Ctxt, return nullptr; std::shared_ptr CtxImpl = detail::getSyclObjImpl(Ctxt); - pi_context C = CtxImpl->getHandleRef(); - const PluginPtr &Plugin = CtxImpl->getPlugin(); - pi_result Error = PI_ERROR_INVALID_VALUE; - - switch (Kind) { - case alloc::host: { - std::array Props; - auto PropsIter = Props.begin(); - - if (PropList.has_property() && - 
Ctxt.get_platform().has_extension( - "cl_intel_mem_alloc_buffer_location")) { - *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; - *PropsIter++ = PropList - .get_property() - .get_buffer_location(); - } + pi_context C = CtxImpl->getHandleRef(); + const PluginPtr &Plugin = CtxImpl->getPlugin(); + pi_result Error = PI_ERROR_INVALID_VALUE; + + switch (Kind) { + case alloc::host: { + std::array Props; + auto PropsIter = Props.begin(); + + if (PropList.has_property< + sycl::ext::intel::experimental::property::usm::buffer_location>() && + Ctxt.get_platform().has_extension( + "cl_intel_mem_alloc_buffer_location")) { + *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; + *PropsIter++ = PropList + .get_property() + .get_buffer_location(); + } - assert(PropsIter >= Props.begin() && PropsIter < Props.end()); - *PropsIter++ = 0; // null-terminate property list + assert(PropsIter >= Props.begin() && PropsIter < Props.end()); + *PropsIter++ = 0; // null-terminate property list - Error = Plugin->call_nocheck( - &RetVal, C, Props.data(), Size, Alignment); + Error = Plugin->call_nocheck( + &RetVal, C, Props.data(), Size, Alignment); - break; + break; } case alloc::device: case alloc::shared: diff --git a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp index 1947e31b7daaa..e1bc8c894f311 100644 --- a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp +++ b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp @@ -83,8 +83,7 @@ class DependsOnTests : public ::testing::Test { detail::Command *NewCmd = MS.addCG( std::move(CmdGroup), - Type == TestCGType::HOST_TASK ? nullptr : QueueDevImpl, - ToEnqueue); + Type == TestCGType::HOST_TASK ? nullptr : QueueDevImpl, ToEnqueue); EXPECT_EQ(ToEnqueue.size(), 0u); return NewCmd; } diff --git a/sycl/unittests/scheduler/GraphCleanup.cpp b/sycl/unittests/scheduler/GraphCleanup.cpp index 437f98b1579a6..c3681bfc07a3b 100644 --- a/sycl/unittests/scheduler/GraphCleanup.cpp +++ b/sycl/unittests/scheduler/GraphCleanup.cpp @@ -245,7 +245,8 @@ TEST_F(SchedulerTest, PostEnqueueCleanup) { checkCleanupOnLeafUpdate( MS, QueueImpl, Buf, MockReq, [&](detail::MemObjRecord *Record) { detail::Command *Leaf = *Record->MWriteLeaves.begin(); - MS.addEmptyCmd(Leaf, {&MockReq}, detail::Command::BlockReason::HostTask, ToEnqueue); + MS.addEmptyCmd(Leaf, {&MockReq}, detail::Command::BlockReason::HostTask, + ToEnqueue); }); checkCleanupOnLeafUpdate( MS, nullptr, Buf, MockReq, [&](detail::MemObjRecord *Record) { diff --git a/sycl/unittests/scheduler/InOrderQueueDeps.cpp b/sycl/unittests/scheduler/InOrderQueueDeps.cpp index bffdf6af4afe2..9ce9a1f944349 100644 --- a/sycl/unittests/scheduler/InOrderQueueDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueDeps.cpp @@ -91,8 +91,7 @@ TEST_F(SchedulerTest, InOrderQueueDeps) { // Check that sequential memory movements submitted to the same in-order // queue do not depend on each other. 
- detail::Command *Cmd = - MS.insertMemoryMove(Record, &Req, nullptr, AuxCmds); + detail::Command *Cmd = MS.insertMemoryMove(Record, &Req, nullptr, AuxCmds); detail::EnqueueResultT Res; auto ReadLock = MS.acquireGraphReadLock(); MockScheduler::enqueueCommand(Cmd, Res, detail::NON_BLOCKING); diff --git a/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp b/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp index 71f30f91117a0..565c3b2a2314c 100644 --- a/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp +++ b/sycl/unittests/scheduler/LeafLimitDiffContexts.cpp @@ -60,8 +60,8 @@ TEST_F(SchedulerTest, LeafLimitDiffContexts) { std::vector ToEnqueue; AllocaCmd = MS.getOrCreateAllocaForReq( Rec, &MockReq, detail::getSyclObjImpl(Queue), ToEnqueue); - std::ignore = MS.getOrCreateAllocaForReq( - Rec, &MockReq, nullptr, ToEnqueue); + std::ignore = + MS.getOrCreateAllocaForReq(Rec, &MockReq, nullptr, ToEnqueue); DepCmd = std::make_unique(detail::getSyclObjImpl(Queue), MockReq); } diff --git a/sycl/unittests/scheduler/LeavesCollection.cpp b/sycl/unittests/scheduler/LeavesCollection.cpp index 39146ffaa95e8..e0732926537b0 100644 --- a/sycl/unittests/scheduler/LeavesCollection.cpp +++ b/sycl/unittests/scheduler/LeavesCollection.cpp @@ -36,8 +36,7 @@ createGenericCommand(const std::shared_ptr &Q) { return std::shared_ptr{new MockCommand(Q, Command::RUN_CG)}; } -std::shared_ptr -createEmptyCommand(const Requirement &Req) { +std::shared_ptr createEmptyCommand(const Requirement &Req) { EmptyCommand *Cmd = new EmptyCommand(); Cmd->addRequirement(/* DepCmd = */ nullptr, /* AllocaCmd = */ nullptr, &Req); Cmd->MBlockReason = Command::BlockReason::HostAccessor; diff --git a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp index 6ae6b9bfc2344..b08b211d1e2dc 100644 --- a/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp +++ b/sycl/unittests/scheduler/LinkedAllocaDependencies.cpp @@ -69,8 +69,7 @@ TEST_F(SchedulerTest, LinkedAllocaDependencies) { std::vector &) {}; std::shared_ptr Record{ - new sycl::detail::MemObjRecord(nullptr, 10, - AllocaDep)}; + new sycl::detail::MemObjRecord(nullptr, 10, AllocaDep)}; MemObjMock MemObj(Record); Req.MSYCLMemObj = &MemObj; diff --git a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp index 83a0702861141..24a19977844fb 100644 --- a/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp +++ b/sycl/unittests/scheduler/NoHostUnifiedMemory.cpp @@ -152,8 +152,7 @@ TEST_F(SchedulerTest, NoHostUnifiedMemory) { // No special handling required: alloca commands are created one after // another and the transfer is done via a write operation. 
- detail::MemObjRecord *Record = - MS.getOrInsertMemObjRecord(nullptr, &Req); + detail::MemObjRecord *Record = MS.getOrInsertMemObjRecord(nullptr, &Req); std::vector AuxCmds; detail::AllocaCommandBase *HostAllocaCmd = MS.getOrCreateAllocaForReq(Record, &Req, nullptr, AuxCmds); diff --git a/sycl/unittests/scheduler/QueueFlushing.cpp b/sycl/unittests/scheduler/QueueFlushing.cpp index 330ff7e0f02d2..c90db25fc019a 100644 --- a/sycl/unittests/scheduler/QueueFlushing.cpp +++ b/sycl/unittests/scheduler/QueueFlushing.cpp @@ -125,14 +125,12 @@ TEST_F(SchedulerTest, QueueFlushing) { detail::AllocaCommand HostAllocaCmd = detail::AllocaCommand(nullptr, MockReq); - detail::MemCpyCommand MemCpyCmd{MockReq, &AllocaCmd, - MockReq, &HostAllocaCmd, - QueueImplA, nullptr}; + detail::MemCpyCommand MemCpyCmd{MockReq, &AllocaCmd, MockReq, + &HostAllocaCmd, QueueImplA, nullptr}; testCommandEnqueue(&MemCpyCmd, QueueImplB, MockReq); - detail::MemCpyCommandHost MemCpyCmdHost{MockReq, &AllocaCmd, - MockReq, &MockHostPtr, - QueueImplA, nullptr}; + detail::MemCpyCommandHost MemCpyCmdHost{MockReq, &AllocaCmd, MockReq, + &MockHostPtr, QueueImplA, nullptr}; testCommandEnqueue(&MemCpyCmdHost, QueueImplB, MockReq); std::unique_ptr CG{ diff --git a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp index d1e7f22aa9485..789961b081da8 100644 --- a/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp +++ b/sycl/unittests/scheduler/StreamInitDependencyOnHost.cpp @@ -11,8 +11,8 @@ #include #include -#include #include +#include using namespace sycl; From c76484daf99edc74b77d6722fdbb4d62b707df56 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 19 Jun 2024 05:56:31 -0700 Subject: [PATCH 31/52] fix clang-format Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/program_impl.cpp | 93 ++++++++-------- sycl/source/detail/usm/usm_impl.cpp | 160 ++++++++++++++-------------- 2 files changed, 126 insertions(+), 127 deletions(-) mode change 100755 => 100644 sycl/source/detail/usm/usm_impl.cpp diff --git a/sycl/source/detail/program_impl.cpp b/sycl/source/detail/program_impl.cpp index df95614d872c3..f3ac2185627f9 100644 --- a/sycl/source/detail/program_impl.cpp +++ b/sycl/source/detail/program_impl.cpp @@ -94,22 +94,22 @@ program_impl::program_impl( } } - std::vector Devices(get_pi_devices()); - std::vector Programs; - bool NonInterOpToLink = false; - for (const auto &Prg : ProgramList) { - if (!Prg->MLinkable && NonInterOpToLink) - continue; - NonInterOpToLink |= !Prg->MLinkable; - Programs.push_back(Prg->MProgram); - } - const PluginPtr &Plugin = getPlugin(); - sycl::detail::pi::PiResult Err = - Plugin->call_nocheck( - MContext->getHandleRef(), Devices.size(), Devices.data(), - LinkOptions.c_str(), Programs.size(), Programs.data(), nullptr, - nullptr, &MProgram); - Plugin->checkPiResult(Err); + std::vector Devices(get_pi_devices()); + std::vector Programs; + bool NonInterOpToLink = false; + for (const auto &Prg : ProgramList) { + if (!Prg->MLinkable && NonInterOpToLink) + continue; + NonInterOpToLink |= !Prg->MLinkable; + Programs.push_back(Prg->MProgram); + } + const PluginPtr &Plugin = getPlugin(); + sycl::detail::pi::PiResult Err = + Plugin->call_nocheck( + MContext->getHandleRef(), Devices.size(), Devices.data(), + LinkOptions.c_str(), Programs.size(), Programs.data(), nullptr, + nullptr, &MProgram); + Plugin->checkPiResult(Err); } program_impl::program_impl(ContextImplPtr Context, @@ -236,22 +236,22 @@ void program_impl::link(std::string 
LinkOptions) { const char *LinkOpts = SYCLConfig::get(); if (!LinkOpts) { LinkOpts = LinkOptions.c_str(); - } + } - // Plugin resets MProgram with a new pi_program as a result of the call to - // "piProgramLink". Thus, we need to release MProgram before the call to - // piProgramLink. - if (MProgram != nullptr) - Plugin->call(MProgram); - - sycl::detail::pi::PiResult Err = - Plugin->call_nocheck( - MContext->getHandleRef(), Devices.size(), Devices.data(), LinkOpts, - /*num_input_programs*/ 1, &MProgram, nullptr, nullptr, &MProgram); - Plugin->checkPiResult(Err); - MLinkOptions = LinkOptions; - MBuildOptions = LinkOptions; - MState = program_state::linked; + // Plugin resets MProgram with a new pi_program as a result of the call to + // "piProgramLink". Thus, we need to release MProgram before the call to + // piProgramLink. + if (MProgram != nullptr) + Plugin->call(MProgram); + + sycl::detail::pi::PiResult Err = + Plugin->call_nocheck( + MContext->getHandleRef(), Devices.size(), Devices.data(), LinkOpts, + /*num_input_programs*/ 1, &MProgram, nullptr, nullptr, &MProgram); + Plugin->checkPiResult(Err); + MLinkOptions = LinkOptions; + MBuildOptions = LinkOptions; + MState = program_state::linked; } bool program_impl::has_kernel(std::string KernelName, @@ -363,24 +363,23 @@ std::pair program_impl::get_pi_kernel_arg_mask_pair(const std::string &KernelName) const { std::pair Result; - const PluginPtr &Plugin = getPlugin(); - sycl::detail::pi::PiResult Err = - Plugin->call_nocheck( - MProgram, KernelName.c_str(), &Result.first); - if (Err == PI_ERROR_INVALID_KERNEL_NAME) { - throw invalid_object_error( - "This instance of program does not contain the kernel requested", - Err); - } - Plugin->checkPiResult(Err); + const PluginPtr &Plugin = getPlugin(); + sycl::detail::pi::PiResult Err = + Plugin->call_nocheck( + MProgram, KernelName.c_str(), &Result.first); + if (Err == PI_ERROR_INVALID_KERNEL_NAME) { + throw invalid_object_error( + "This instance of program does not contain the kernel requested", Err); + } + Plugin->checkPiResult(Err); - // Some PI Plugins (like OpenCL) require this call to enable USM - // For others, PI will turn this into a NOP. - if (getContextImplPtr()->getPlatformImpl()->supports_usm()) - Plugin->call( - Result.first, PI_USM_INDIRECT_ACCESS, sizeof(pi_bool), &PI_TRUE); + // Some PI Plugins (like OpenCL) require this call to enable USM + // For others, PI will turn this into a NOP. + if (getContextImplPtr()->getPlatformImpl()->supports_usm()) + Plugin->call( + Result.first, PI_USM_INDIRECT_ACCESS, sizeof(pi_bool), &PI_TRUE); - return Result; + return Result; } std::vector diff --git a/sycl/source/detail/usm/usm_impl.cpp b/sycl/source/detail/usm/usm_impl.cpp old mode 100755 new mode 100644 index 57c54275069e6..7237e88be440f --- a/sycl/source/detail/usm/usm_impl.cpp +++ b/sycl/source/detail/usm/usm_impl.cpp @@ -100,20 +100,20 @@ void *alignedAllocHost(size_t Alignment, size_t Size, const context &Ctxt, &RetVal, C, Props.data(), Size, Alignment); break; - } - case alloc::device: - case alloc::shared: - case alloc::unknown: { - RetVal = nullptr; - Error = PI_ERROR_INVALID_VALUE; - break; - } - } + } + case alloc::device: + case alloc::shared: + case alloc::unknown: { + RetVal = nullptr; + Error = PI_ERROR_INVALID_VALUE; + break; + } + } - // Error is for debugging purposes. - // The spec wants a nullptr returned, not an exception. - if (Error != PI_SUCCESS) - return nullptr; + // Error is for debugging purposes. + // The spec wants a nullptr returned, not an exception. 
+ if (Error != PI_SUCCESS) + return nullptr; #ifdef XPTI_ENABLE_INSTRUMENTATION xpti::addMetadata(PrepareNotify.traceEvent(), "memory_ptr", reinterpret_cast(RetVal)); @@ -139,79 +139,79 @@ void *alignedAllocInternal(size_t Alignment, size_t Size, if (Size == 0) return nullptr; - pi_context C = CtxImpl->getHandleRef(); - const PluginPtr &Plugin = CtxImpl->getPlugin(); - pi_result Error = PI_ERROR_INVALID_VALUE; - pi_device Id; + pi_context C = CtxImpl->getHandleRef(); + const PluginPtr &Plugin = CtxImpl->getPlugin(); + pi_result Error = PI_ERROR_INVALID_VALUE; + pi_device Id; - switch (Kind) { - case alloc::device: { - Id = DevImpl->getHandleRef(); + switch (Kind) { + case alloc::device: { + Id = DevImpl->getHandleRef(); - std::array Props; - auto PropsIter = Props.begin(); + std::array Props; + auto PropsIter = Props.begin(); - // Buffer location is only supported on FPGA devices - if (PropList.has_property() && - DevImpl->has_extension("cl_intel_mem_alloc_buffer_location")) { - *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; - *PropsIter++ = PropList - .get_property() - .get_buffer_location(); - } + // Buffer location is only supported on FPGA devices + if (PropList.has_property< + sycl::ext::intel::experimental::property::usm::buffer_location>() && + DevImpl->has_extension("cl_intel_mem_alloc_buffer_location")) { + *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; + *PropsIter++ = PropList + .get_property() + .get_buffer_location(); + } - assert(PropsIter >= Props.begin() && PropsIter < Props.end()); - *PropsIter++ = 0; // null-terminate property list + assert(PropsIter >= Props.begin() && PropsIter < Props.end()); + *PropsIter++ = 0; // null-terminate property list - Error = Plugin->call_nocheck( - &RetVal, C, Id, Props.data(), Size, Alignment); + Error = Plugin->call_nocheck( + &RetVal, C, Id, Props.data(), Size, Alignment); - break; - } - case alloc::shared: { - Id = DevImpl->getHandleRef(); - - std::array Props; - auto PropsIter = Props.begin(); - - if (PropList.has_property< - sycl::ext::oneapi::property::usm::device_read_only>()) { - *PropsIter++ = PI_MEM_ALLOC_FLAGS; - *PropsIter++ = PI_MEM_ALLOC_DEVICE_READ_ONLY; - } - - if (PropList.has_property() && - DevImpl->has_extension("cl_intel_mem_alloc_buffer_location")) { - *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; - *PropsIter++ = PropList - .get_property() - .get_buffer_location(); - } - - assert(PropsIter >= Props.begin() && PropsIter < Props.end()); - *PropsIter++ = 0; // null-terminate property list - - Error = Plugin->call_nocheck( - &RetVal, C, Id, Props.data(), Size, Alignment); - - break; - } - case alloc::host: - case alloc::unknown: { - RetVal = nullptr; - Error = PI_ERROR_INVALID_VALUE; - break; + break; + } + case alloc::shared: { + Id = DevImpl->getHandleRef(); + + std::array Props; + auto PropsIter = Props.begin(); + + if (PropList.has_property< + sycl::ext::oneapi::property::usm::device_read_only>()) { + *PropsIter++ = PI_MEM_ALLOC_FLAGS; + *PropsIter++ = PI_MEM_ALLOC_DEVICE_READ_ONLY; } + + if (PropList.has_property< + sycl::ext::intel::experimental::property::usm::buffer_location>() && + DevImpl->has_extension("cl_intel_mem_alloc_buffer_location")) { + *PropsIter++ = PI_MEM_USM_ALLOC_BUFFER_LOCATION; + *PropsIter++ = PropList + .get_property() + .get_buffer_location(); } - // Error is for debugging purposes. - // The spec wants a nullptr returned, not an exception. 
- if (Error != PI_SUCCESS) - return nullptr; + assert(PropsIter >= Props.begin() && PropsIter < Props.end()); + *PropsIter++ = 0; // null-terminate property list + + Error = Plugin->call_nocheck( + &RetVal, C, Id, Props.data(), Size, Alignment); + + break; + } + case alloc::host: + case alloc::unknown: { + RetVal = nullptr; + Error = PI_ERROR_INVALID_VALUE; + break; + } + } + + // Error is for debugging purposes. + // The spec wants a nullptr returned, not an exception. + if (Error != PI_SUCCESS) + return nullptr; return RetVal; } @@ -250,9 +250,9 @@ void *alignedAlloc(size_t Alignment, size_t Size, const context &Ctxt, void freeInternal(void *Ptr, const context_impl *CtxImpl) { if (Ptr == nullptr) return; - pi_context C = CtxImpl->getHandleRef(); - const PluginPtr &Plugin = CtxImpl->getPlugin(); - Plugin->call(C, Ptr); + pi_context C = CtxImpl->getHandleRef(); + const PluginPtr &Plugin = CtxImpl->getPlugin(); + Plugin->call(C, Ptr); } void free(void *Ptr, const context &Ctxt, From 61d1c6208e4ef52c3b72908b9f904ba9869ffdb5 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 19 Jun 2024 08:52:31 -0700 Subject: [PATCH 32/52] fix connect task queue Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/scheduler/graph_builder.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 2ac97baefb543..7cfc0446fdd69 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1330,7 +1330,8 @@ Command *Scheduler::GraphBuilder::connectDepEvent( try { std::unique_ptr HT(new detail::HostTask); std::unique_ptr ConnectCG(new detail::CGHostTask( - std::move(HT), /* Queue = */ {}, /* Context = */ {}, /* Args = */ {}, + std::move(HT), /* Queue = */ Cmd->getQueue(), /* Context = */ {}, + /* Args = */ {}, detail::CG::StorageInitHelper( /* ArgsStorage = */ {}, /* AccStorage = */ {}, /* SharedPtrStorage = */ {}, /* Requirements = */ {}, From 5814e466577f0b99d6d6095d3e0d68a25452203c Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 20 Jun 2024 06:30:09 -0700 Subject: [PATCH 33/52] fix bugs Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.cpp | 11 +++++++++-- sycl/source/detail/queue_impl.cpp | 4 ++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 7d91129f25b51..a270867f6b637 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -262,7 +262,8 @@ void event_impl::wait_and_throw( void event_impl::checkProfilingPreconditions() const { std::weak_ptr EmptyPtr; - if (!EmptyPtr.owner_before(MQueue) && !MQueue.owner_before(EmptyPtr)) { + if (!MIsHostEvent && !EmptyPtr.owner_before(MQueue) && + !MQueue.owner_before(EmptyPtr)) { throw sycl::exception(make_error_code(sycl::errc::invalid), "Profiling information is unavailable as the event " "has no associated queue."); @@ -300,7 +301,7 @@ event_impl::get_profiling_info() { // made by forcing the re-sync of submit time to start time is less than // 0.5ms. These timing values were obtained empirically using an integrated // Intel GPU). 
- if (MEventFromSubmittedExecCommandBuffer && MEvent) { + if (MEventFromSubmittedExecCommandBuffer && !MIsHostEvent && MEvent) { uint64_t StartTime = get_event_profiling_info( this->getHandleRef(), this->getPlugin()); @@ -546,6 +547,12 @@ void event_impl::setSubmissionTime() { e.what()); std::rethrow_exception(std::current_exception()); } + } else { + // Returning host time + using namespace std::chrono; + MSubmitTime = + duration_cast(steady_clock::now().time_since_epoch()) + .count(); } } else { // Capture the host timestamp for a return value of function call diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 83f33688ed0b1..572b0b8cf568a 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -299,12 +299,12 @@ void queue_impl::addEvent(const event &Event) { // if there is no command on the event, we cannot track it with MEventsWeak // as that will leave it with no owner. Track in MEventsShared only if we're // unable to call piQueueFinish during wait. - if (EImpl->isHost() || MEmulateOOO) + if (MEmulateOOO) addSharedEvent(Event); } // As long as the queue supports piQueueFinish we only need to store events // for unenqueued commands and host tasks. - else if (EImpl->isHost() || MEmulateOOO || EImpl->getHandleRef() == nullptr) { + else if (MEmulateOOO || EImpl->getHandleRef() == nullptr) { std::weak_ptr EventWeakPtr{EImpl}; std::lock_guard Lock{MMutex}; MEventsWeak.push_back(std::move(EventWeakPtr)); From a03468173acf6f9c58593685069d030955a4782c Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 20 Jun 2024 09:43:06 -0700 Subject: [PATCH 34/52] fix work with graph Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.cpp | 4 ++-- sycl/source/detail/queue_impl.cpp | 21 ++++++++++++++++----- sycl/source/detail/queue_impl.hpp | 16 +++++++++++++--- sycl/source/detail/scheduler/commands.cpp | 20 ++++++++++---------- 4 files changed, 41 insertions(+), 20 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index a270867f6b637..e203924d2d612 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -48,7 +48,7 @@ event_impl::~event_impl() { } void event_impl::waitInternal(bool *Success) { - if (MEvent) { + if (!MIsHostEvent && MEvent) { // Wait for the native event sycl::detail::pi::PiResult Err = getPlugin()->call_nocheck(1, &MEvent); @@ -390,7 +390,7 @@ event_impl::get_info() { return sycl::info::event_command_status::submitted; } - return MState.load() != HES_Complete + return MIsHostEvent && MState.load() != HES_Complete ? 
sycl::info::event_command_status::submitted : info::event_command_status::complete; } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 572b0b8cf568a..a5f9ae9964ac6 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -696,6 +696,19 @@ void queue_impl::revisitUnenqueuedCommandsState( const EventImplPtr &CompletedHostTask) { if (MIsInorder) return; + + std::unique_lock Lock{MMutex, std::try_to_lock}; + if (Lock.owns_lock()) + doUnenqueuedCommandCleanup(CompletedHostTask->getCommandGraph()); + else { + std::lock_guard RequestLock(MMissedCleanupRequestsMtx); + MMissedCleanupRequests.push_back(CompletedHostTask->getCommandGraph()); + } +} + +void queue_impl::doUnenqueuedCommandCleanup( + const std::shared_ptr + &Graph) { auto tryToCleanup = [](DependencyTrackingItems &Deps) { if (Deps.LastBarrier && Deps.LastBarrier->isEnqueued()) { Deps.LastBarrier = nullptr; @@ -713,14 +726,12 @@ void queue_impl::revisitUnenqueuedCommandsState( Deps.UnenqueuedCmdEvents.end()); } }; - std::lock_guard Lock{MMutex}; // Barrier enqueue could be significantly postponed due to host task // dependency if any. No guarantee that it will happen while same graph deps // are still recording. - if (auto Graph = CompletedHostTask->getCommandGraph()) { - if (Graph == getCommandGraph()) - tryToCleanup(MExtGraphDeps); - } else + if (Graph && Graph == getCommandGraph()) + tryToCleanup(MExtGraphDeps); + else tryToCleanup(MDefaultGraphDeps); } diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index d0a74cc80c793..aa3dd9fc780bf 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -93,7 +93,7 @@ class queue_impl { /// \param PropList is a list of properties to use for queue construction. queue_impl(const DeviceImplPtr &Device, const async_handler &AsyncHandler, const property_list &PropList) - : queue_impl(Device, getDefaultOrNew(Device), AsyncHandler, PropList){}; + : queue_impl(Device, getDefaultOrNew(Device), AsyncHandler, PropList) {}; /// Constructs a SYCL queue with an async_handler and property_list provided /// form a device and a context. @@ -749,6 +749,9 @@ class queue_impl { // tasks and host tasks is applicable for out of order queues only. Not neede // for in order ones. void revisitUnenqueuedCommandsState(const EventImplPtr &CompletedHostTask); + void doUnenqueuedCommandCleanup( + const std::shared_ptr + &Graph); static ContextImplPtr getContext(const QueueImplPtr &Queue) { return Queue ? Queue->getContextImplPtr() : nullptr; @@ -790,13 +793,12 @@ class queue_impl { EventToBuildDeps = getSyclObjImpl(EventRet); } else { const CG::CGTYPE Type = Handler.getType(); - + std::lock_guard Lock{MMutex}; // The following code supports barrier synchronization if host task is // involved in the scenario. Native barriers cannot handle host task // dependency so in the case where some commands were not enqueued // (blocked), we track them to prevent barrier from being enqueued // earlier. - std::lock_guard Lock{MMutex}; auto &Deps = MGraph.expired() ? 
MDefaultGraphDeps : MExtGraphDeps; if (Type == CG::Barrier && !Deps.UnenqueuedCmdEvents.empty()) { Handler.depends_on(Deps.UnenqueuedCmdEvents); @@ -814,6 +816,10 @@ class queue_impl { } else Deps.UnenqueuedCmdEvents.push_back(EventRetImpl); } + std::lock_guard RequestLock(MMissedCleanupRequestsMtx); + for (auto &UpdatedGraph : MMissedCleanupRequests) + doUnenqueuedCommandCleanup(UpdatedGraph); + MMissedCleanupRequests.clear(); } } @@ -966,6 +972,10 @@ class queue_impl { unsigned long long MQueueID; static std::atomic MNextAvailableQueueID; + std::deque> + MMissedCleanupRequests; + std::mutex MMissedCleanupRequestsMtx; + friend class sycl::ext::oneapi::experimental::detail::node_impl; }; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 3d51fe7a1c12f..6322b904fd6bc 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -2954,16 +2954,16 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { Plugin->call(RawEvents.size(), &RawEvents[0]); } - assert(MQueue->getDeviceImplPtr()->getBackend() == - backend::ext_intel_esimd_emulator); - if (MEvent != nullptr) - MEvent->setHostEnqueueTime(); - MQueue->getPlugin()->call( - nullptr, - reinterpret_cast(ExecKernel->MHostKernel->getPtr()), - NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], - &NDRDesc.LocalSize[0], 0, nullptr, nullptr); - return PI_SUCCESS; + assert(MQueue->getDeviceImplPtr()->getBackend() == + backend::ext_intel_esimd_emulator); + if (MEvent != nullptr) + MEvent->setHostEnqueueTime(); + MQueue->getPlugin()->call( + nullptr, + reinterpret_cast(ExecKernel->MHostKernel->getPtr()), + NDRDesc.Dims, &NDRDesc.GlobalOffset[0], &NDRDesc.GlobalSize[0], + &NDRDesc.LocalSize[0], 0, nullptr, nullptr); + return PI_SUCCESS; } auto getMemAllocationFunc = [this](Requirement *Req) { From c274c5ec74a0e92306824194a7f5ef9509c83df2 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 20 Jun 2024 10:14:54 -0700 Subject: [PATCH 35/52] fix tracing tests Signed-off-by: Tikhomirova, Kseniya --- .../Tracing/code_location_queue_submit.cpp | 13 +++---------- sycl/test-e2e/Tracing/task_execution.cpp | 18 ++++++------------ .../Tracing/task_execution_handler.cpp | 4 ++-- 3 files changed, 11 insertions(+), 24 deletions(-) diff --git a/sycl/test-e2e/Tracing/code_location_queue_submit.cpp b/sycl/test-e2e/Tracing/code_location_queue_submit.cpp index 6ebfe43e936e5..ce780f5e81725 100644 --- a/sycl/test-e2e/Tracing/code_location_queue_submit.cpp +++ b/sycl/test-e2e/Tracing/code_location_queue_submit.cpp @@ -5,8 +5,7 @@ // Test tracing of the code location data for queue.submit in case of failure // (exception generation) -// First queue creation (id = 0) is queue created on line 15. -// The second queue is a host queue created on first scheduler usage. +// First queue creation (id = 0) is queue created on line 17. 
#include #include @@ -19,16 +18,10 @@ int main() { unsigned char *HostAllocDst = NULL; // CHECK: [SYCL] Queue create: // CHECK-DAG: queue_handle : {{.*}} - // CHECK-DAG: queue_id : 0 - // CHECK-DAG: is_inorder : false - // CHECK-DAG: sycl_device : {{.*}} - // CHECK-DAG: sycl_device_name : {{.*}} - // CHECK-DAG: sycl_context : {{.*}} - // CHECK-NEXT: [SYCL] Queue create: // CHECK-DAG: queue_id : 1 // CHECK-DAG: is_inorder : false // CHECK-DAG: sycl_device : {{.*}} - // CHECK-DAG: sycl_device_name : SYCL host device + // CHECK-DAG: sycl_device_name : {{.*}} // CHECK-DAG: sycl_context : {{.*}} // CHECK: [SYCL] Runtime reports: // CHECK-NEXT: what: NULL pointer argument in memory copy operation. -30 (PI_ERROR_INVALID_VALUE) @@ -44,6 +37,6 @@ int main() { sycl::free(HostAllocSrc, Q); } // CHECK-NEXT: [SYCL] Queue destroy: - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 return !ExceptionCaught; } diff --git a/sycl/test-e2e/Tracing/task_execution.cpp b/sycl/test-e2e/Tracing/task_execution.cpp index d591c20b8f6c0..b4932df0eda55 100644 --- a/sycl/test-e2e/Tracing/task_execution.cpp +++ b/sycl/test-e2e/Tracing/task_execution.cpp @@ -15,38 +15,32 @@ int main() { Q.copy(AllocDst, AllocSrc, 1).wait(); // CHECK: [SYCL] Queue create: // CHECK-DAG: queue_handle : {{.*}} - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: is_inorder : false // CHECK-DAG: sycl_device : {{.*}} // CHECK-DAG: sycl_device_name : {{.*}} // CHECK-DAG: sycl_context : {{.*}} // CHECK-NEXT: [SYCL] Task begin (event={{.*}},instanceID={{.*}}) - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: memory_size : 1 // CHECK-DAG: value_set : 0 // CHECK-DAG: memory_ptr : {{.*}} // CHECK-DAG: sycl_device : {{.*}} // CHECK-NEXT: [SYCL] Task end (event={{.*}},instanceID={{.*}}) // CHECK-NEXT: [SYCL] Task begin (event={{.*}},instanceID={{.*}}) - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: memory_size : 1 // CHECK-DAG: dest_memory_ptr : {{.*}} // CHECK-DAG: src_memory_ptr : {{.*}} // CHECK-DAG: sycl_device : {{.*}} // CHECK-NEXT: [SYCL] Task end (event={{.*}},instanceID={{.*}}) - // CHECK-NEXT: [SYCL] Queue create: - // CHECK-DAG: queue_id : 1 - // CHECK-DAG: is_inorder : false - // CHECK-DAG: sycl_device : {{.*}} - // CHECK-DAG: sycl_device_name : SYCL host device - // CHECK-DAG: sycl_context : {{.*}} Q.single_task([]() {}).wait(); // CHECK-NEXT: [SYCL] Task begin (event={{.*}},instanceID={{.*}}) // CHECK-DAG: enqueue_kernel_data : {{.*}} // CHECK-DAG: sym_column_no : {{.*}} - // CHECK-DAG: sym_line_no : 43 + // CHECK-DAG: sym_line_no : 37 // CHECK-DAG: sym_source_file_name : {{.*}}task_execution.cpp - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: sym_function_name : typeinfo name for main::E2ETestKernel // CHECK-DAG: from_source : {{.*}} // CHECK-DAG: sycl_device_name : {{.*}} @@ -55,7 +49,7 @@ int main() { // CHECK-DAG: sycl_device : {{.*}} // CHECK-NEXT: [SYCL] Task end (event={{.*}},instanceID={{.*}}) // CHECK-NEXT: [SYCL] Queue destroy: - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 sycl::free(AllocSrc, Q); sycl::free(AllocDst, Q); } diff --git a/sycl/test-e2e/Tracing/task_execution_handler.cpp b/sycl/test-e2e/Tracing/task_execution_handler.cpp index 0563275f81312..a208fe6655bda 100644 --- a/sycl/test-e2e/Tracing/task_execution_handler.cpp +++ b/sycl/test-e2e/Tracing/task_execution_handler.cpp @@ -16,7 +16,7 @@ int main() { { cgh.memset(AllocSrc, 0, 1); }) .wait(); // CHECK: [SYCL] Task begin (event={{.*}},instanceID={{.*}}) - // 
CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: sym_column_no : {{.*}} // CHECK-DAG: sym_function_name : {{.*}} // CHECK-DAG: kernel_name : {{.*}} @@ -27,7 +27,7 @@ int main() { // CHECK-DAG: sycl_device : {{.*}} // CHECK-NEXT: [SYCL] Task end (event={{.*}},instanceID={{.*}}) // CHECK-NEXT: [SYCL] Task begin (event={{.*}},instanceID={{.*}}) - // CHECK-DAG: queue_id : 0 + // CHECK-DAG: queue_id : 1 // CHECK-DAG: sym_column_no : {{.*}} // CHECK-DAG: sym_function_name : {{.*}} // CHECK-DAG: kernel_name : {{.*}} From f50526bf29351cbc0d897ae6a59c699aca910522 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Fri, 21 Jun 2024 04:23:03 -0700 Subject: [PATCH 36/52] fix test Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/scheduler/scheduler.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 4d26c2a822457..905ca889aaf0d 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -207,7 +207,7 @@ EventImplPtr Scheduler::addCopyBack(Requirement *Req) { { WriteLockT Lock = acquireWriteLock(); NewCmd = MGraphBuilder.addCopyBack(Req, AuxiliaryCmds); - // Command was not creted because there were no operations with + // Command was not created because there were no operations with // buffer. if (!NewCmd) return nullptr; @@ -232,7 +232,9 @@ EventImplPtr Scheduler::addCopyBack(Requirement *Req) { throw runtime_error("Enqueue process failed.", PI_ERROR_INVALID_OPERATION); } catch (...) { - NewCmd->getQueue()->reportAsyncException(std::current_exception()); + auto WorkerQueue = NewCmd->getEvent()->getWorkerQueue(); + assert(WorkerQueue && "WorkerQueue for CopyBack command must be not null"); + WorkerQueue->reportAsyncException(std::current_exception()); } EventImplPtr NewEvent = NewCmd->getEvent(); cleanupCommands(ToCleanUp); From 2bd06e3a3ab0170ce0dfef9ace4ae16573ce7c69 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 24 Jun 2024 04:17:25 -0700 Subject: [PATCH 37/52] update win symbols Signed-off-by: Tikhomirova, Kseniya --- sycl/test/abi/sycl_symbols_windows.dump | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index e8610211e8572..c091a7751a0cc 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -41,18 +41,12 @@ ??$get_info@U?$max_work_groups@$00@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$id@$00@23@XZ ??$get_info@U?$max_work_groups@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$id@$01@23@XZ ??$get_info@U?$max_work_groups@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$id@$02@23@XZ +??$get_info@U?$sub_group_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ ??$get_info@U?$sub_group_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ 
-??$get_info@U?$work_item_progress_capabilities@$00@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ ??$get_info@U?$work_group_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info_impl@U?$work_item_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +??$get_info@U?$work_item_progress_capabilities@$00@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ ??$get_info@U?$work_item_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info_impl@U?$work_item_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ ??$get_info@U?$work_item_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info_impl@U?$sub_group_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info_impl@U?$work_item_progress_capabilities@$00@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info@U?$sub_group_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info_impl@U?$work_group_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ -??$get_info_impl@U?$sub_group_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ 
??$get_info@Uarchitecture@device@info@experimental@oneapi@ext@_V1@sycl@@@device_impl@detail@_V1@sycl@@QEBA?AW4architecture@experimental@oneapi@ext@23@XZ ??$get_info@Uatomic_fence_order_capabilities@context@info@_V1@sycl@@@context@_V1@sycl@@QEBA?AV?$vector@W4memory_order@_V1@sycl@@V?$allocator@W4memory_order@_V1@sycl@@@std@@@std@@XZ ??$get_info@Uatomic_fence_scope_capabilities@context@info@_V1@sycl@@@context@_V1@sycl@@QEBA?AV?$vector@W4memory_scope@_V1@sycl@@V?$allocator@W4memory_scope@_V1@sycl@@@std@@@std@@XZ @@ -108,6 +102,12 @@ ??$get_info_impl@U?$max_work_item_sizes@$00@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$range@$00@12@XZ ??$get_info_impl@U?$max_work_item_sizes@$01@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$range@$01@12@XZ ??$get_info_impl@U?$max_work_item_sizes@$02@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$range@$02@12@XZ +??$get_info_impl@U?$sub_group_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +??$get_info_impl@U?$sub_group_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +??$get_info_impl@U?$work_group_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +??$get_info_impl@U?$work_item_progress_capabilities@$00@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +??$get_info_impl@U?$work_item_progress_capabilities@$01@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ +??$get_info_impl@U?$work_item_progress_capabilities@$02@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@V?$allocator@W4forward_progress_guarantee@experimental@oneapi@ext@_V1@sycl@@@std@@@std@@XZ ??$get_info_impl@Uaddress_bits@device@info@_V1@sycl@@@device@_V1@sycl@@AEBAIXZ ??$get_info_impl@Uarchitecture@device@info@experimental@oneapi@ext@_V1@sycl@@@device@_V1@sycl@@AEBA?AW4architecture@experimental@oneapi@ext@12@XZ ??$get_info_impl@Uaspects@device@info@_V1@sycl@@@device@_V1@sycl@@AEBA?AV?$vector@W4aspect@_V1@sycl@@V?$allocator@W4aspect@_V1@sycl@@@std@@@std@@XZ @@ -4080,7 +4080,6 @@ ?ext_intel_read_host_pipe@handler@_V1@sycl@@AEAAXVstring_view@detail@23@PEAX_K_N@Z ?ext_intel_write_host_pipe@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@PEAX_K_N@Z ?ext_intel_write_host_pipe@handler@_V1@sycl@@AEAAXVstring_view@detail@23@PEAX_K_N@Z -?verifyDeviceHasProgressGuarantee@handler@_V1@sycl@@AEAAXW4forward_progress_guarantee@experimental@oneapi@ext@23@W4execution_scope@56723@1@Z 
?ext_oneapi_advise_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEBX_KW4_pi_mem_advice@@V?$vector@IV?$allocator@I@std@@@6@PEAI@Z ?ext_oneapi_architecture_is@device@_V1@sycl@@QEAA_NW4arch_category@experimental@oneapi@ext@23@@Z ?ext_oneapi_architecture_is@device@_V1@sycl@@QEAA_NW4architecture@experimental@oneapi@ext@23@@Z @@ -4096,7 +4095,6 @@ ?ext_oneapi_copy@handler@_V1@sycl@@QEAAXUimage_mem_handle@experimental@oneapi@ext@23@0AEBUimage_descriptor@56723@@Z ?ext_oneapi_copy@handler@_V1@sycl@@QEAAXUimage_mem_handle@experimental@oneapi@ext@23@PEAXAEBUimage_descriptor@56723@@Z ?ext_oneapi_copy@handler@_V1@sycl@@QEAAXUimage_mem_handle@experimental@oneapi@ext@23@V?$range@$02@23@AEBUimage_descriptor@56723@PEAX111@Z -?ext_oneapi_prod@queue@_V1@sycl@@QEAAXXZ ?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@PEAX0AEBUimage_descriptor@experimental@oneapi@ext@23@_KAEBUcode_location@detail@23@@Z ?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@PEAX0AEBUimage_descriptor@experimental@oneapi@ext@23@_KAEBV?$vector@Vevent@_V1@sycl@@V?$allocator@Vevent@_V1@sycl@@@std@@@std@@AEBUcode_location@detail@23@@Z ?ext_oneapi_copy@queue@_V1@sycl@@QEAA?AVevent@23@PEAX0AEBUimage_descriptor@experimental@oneapi@ext@23@_KV423@AEBUcode_location@detail@23@@Z @@ -4158,6 +4156,7 @@ ?ext_oneapi_owner_before@?$OwnerLessBase@Vstream@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBV?$weak_object_base@Vstream@_V1@sycl@@@2oneapi@ext@34@@Z ?ext_oneapi_owner_before@?$OwnerLessBase@Vstream@_V1@sycl@@@detail@_V1@sycl@@QEBA_NAEBVstream@34@@Z ?ext_oneapi_prefetch_usm_cmd_buffer@MemoryManager@detail@_V1@sycl@@SAXV?$shared_ptr@Vcontext_impl@detail@_V1@sycl@@@std@@PEAU_pi_ext_command_buffer@@PEAX_KV?$vector@IV?$allocator@I@std@@@6@PEAI@Z +?ext_oneapi_prod@queue@_V1@sycl@@QEAAXXZ ?ext_oneapi_set_external_event@queue@_V1@sycl@@QEAAXAEBVevent@23@@Z ?ext_oneapi_signal_external_semaphore@handler@_V1@sycl@@QEAAXUinterop_semaphore_handle@experimental@oneapi@ext@23@@Z ?ext_oneapi_signal_external_semaphore@handler@_V1@sycl@@QEAAXUinterop_semaphore_handle@experimental@oneapi@ext@23@_K@Z @@ -4205,6 +4204,7 @@ ?frexp_impl@detail@_V1@sycl@@YA?AVhalf@half_impl@123@V45123@PEAH@Z ?frexp_impl@detail@_V1@sycl@@YAMMPEAH@Z ?frexp_impl@detail@_V1@sycl@@YANNPEAH@Z +?generateFlushCommand@stream_impl@detail@_V1@sycl@@QEAAXAEAVhandler@34@@Z ?get@context@_V1@sycl@@QEBAPEAU_cl_context@@XZ ?get@device@_V1@sycl@@QEBAPEAU_cl_device_id@@XZ ?get@kernel@_V1@sycl@@QEBAPEAU_cl_kernel@@XZ @@ -4655,6 +4655,7 @@ ?useHostPtr@SYCLMemObjT@detail@_V1@sycl@@QEAA_NXZ ?use_kernel_bundle@handler@_V1@sycl@@QEAAXAEBV?$kernel_bundle@$01@23@@Z ?usesPinnedHostMemory@SYCLMemObjT@detail@_V1@sycl@@UEBA_NXZ +?verifyDeviceHasProgressGuarantee@handler@_V1@sycl@@AEAAXW4forward_progress_guarantee@experimental@oneapi@ext@23@W4execution_scope@56723@1@Z ?verifyKernelInvoc@handler@_V1@sycl@@AEAAXAEBVkernel@23@@Z ?verifyUsedKernelBundle@handler@_V1@sycl@@AEAAXAEBV?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z ?verifyUsedKernelBundleInternal@handler@_V1@sycl@@AEAAXVstring_view@detail@23@@Z From 5fbcb1ead2551a055366f906a093c9267ccaf978 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 24 Jun 2024 05:17:33 -0700 Subject: [PATCH 38/52] fix format Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/stream_impl.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 7e81e964bdc17..1ba09ed36369c 100644 --- 
a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -108,13 +108,13 @@ void stream_impl::generateFlushCommand(handler &cgh) { } // ABI break: remove -void stream_impl::initStreamHost(QueueImplPtr){}; +void stream_impl::initStreamHost(QueueImplPtr){} // ABI break: remove -void stream_impl::flush(const EventImplPtr &) {}; +void stream_impl::flush(const EventImplPtr &) {} // ABI break: remove -void stream_impl::flush() {}; +void stream_impl::flush() {} } // namespace detail } // namespace _V1 From d5d15bf8f4b4317e3a9f43ce179a65f7a195f849 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 24 Jun 2024 08:28:19 -0700 Subject: [PATCH 39/52] fix formatting Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/stream_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/stream_impl.cpp b/sycl/source/detail/stream_impl.cpp index 1ba09ed36369c..b9f70581ac7a8 100644 --- a/sycl/source/detail/stream_impl.cpp +++ b/sycl/source/detail/stream_impl.cpp @@ -108,7 +108,7 @@ void stream_impl::generateFlushCommand(handler &cgh) { } // ABI break: remove -void stream_impl::initStreamHost(QueueImplPtr){} +void stream_impl::initStreamHost(QueueImplPtr) {} // ABI break: remove void stream_impl::flush(const EventImplPtr &) {} From e185cbcca90a9d76827c95fe211aace1c7284f95 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Tue, 25 Jun 2024 08:25:30 -0700 Subject: [PATCH 40/52] self review comments fix Signed-off-by: Tikhomirova, Kseniya --- sycl/source/context.cpp | 2 +- sycl/source/detail/buffer_impl.cpp | 4 +- sycl/source/detail/event_impl.cpp | 2 +- sycl/source/detail/event_impl.hpp | 2 +- sycl/source/detail/memory_manager.cpp | 2 +- sycl/source/detail/platform_impl.hpp | 6 -- sycl/source/detail/queue_impl.cpp | 2 +- sycl/source/detail/scheduler/commands.cpp | 84 ++++++++----------- sycl/source/detail/scheduler/scheduler.hpp | 10 +-- sycl/source/device.cpp | 2 +- sycl/source/event.cpp | 2 +- sycl/source/kernel.cpp | 2 +- sycl/source/platform.cpp | 2 +- sycl/source/queue.cpp | 2 +- .../test-e2e/SubGroup/sub_groups_sycl2020.cpp | 4 - 15 files changed, 52 insertions(+), 76 deletions(-) diff --git a/sycl/source/context.cpp b/sycl/source/context.cpp index 1261096b82047..e4c7404c7b078 100644 --- a/sycl/source/context.cpp +++ b/sycl/source/context.cpp @@ -127,7 +127,7 @@ context::get_backend_info() const { cl_context context::get() const { return impl->get(); } bool context::is_host() const { - assert(true && "context::is_host should not be called in implementation."); + assert(false && "context::is_host should not be called in implementation."); return false; } diff --git a/sycl/source/detail/buffer_impl.cpp b/sycl/source/detail/buffer_impl.cpp index f13444107e9eb..1795992594078 100644 --- a/sycl/source/detail/buffer_impl.cpp +++ b/sycl/source/detail/buffer_impl.cpp @@ -24,7 +24,9 @@ void *buffer_impl::allocateMem(ContextImplPtr Context, bool InitFromUserData, sycl::detail::pi::PiEvent &OutEventToWait) { bool HostPtrReadOnly = false; BaseT::determineHostPtr(Context, InitFromUserData, HostPtr, HostPtrReadOnly); - + assert(!(nullptr == HostPtr && BaseT::useHostPtr() && !Context) && + "Internal error. 
Allocating memory on the host " + "while having use_host_ptr property"); return MemoryManager::allocateMemBuffer( std::move(Context), this, HostPtr, HostPtrReadOnly, BaseT::getSizeInBytes(), BaseT::MInteropEvent, BaseT::MInteropContext, diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index e203924d2d612..f4ad52221ed37 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -367,7 +367,7 @@ uint64_t event_impl::get_profiling_info() { } template <> uint32_t event_impl::get_info() { - if (MEvent) { + if (!MIsHostEvent && MEvent) { return get_event_info(this->getHandleRef(), this->getPlugin()); } diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 8b46e715cd13e..12b58d25ab3cd 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -49,7 +49,7 @@ class event_impl { /// Normally constructs a host event, use std::nullopt to instead instantiate /// a device event. event_impl(std::optional State = HES_Complete) - : MIsInitialized(false), MIsFlushed(true), + : MIsInitialized(false), MIsHostEvent(State), MIsFlushed(true), MState(State.value_or(HES_Complete)) { // Need to fail in event() constructor if there are problems with the // ONEAPI_DEVICE_SELECTOR. Deferring may lead to conficts with noexcept diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp index 6f30ceef8eb51..97615960877ff 100644 --- a/sycl/source/detail/memory_manager.cpp +++ b/sycl/source/detail/memory_manager.cpp @@ -935,7 +935,7 @@ void MemoryManager::unmap(SYCLMemObjI *, void *Mem, QueueImplPtr Queue, std::vector DepEvents, sycl::detail::pi::PiEvent &OutEvent) { - // Host queue is not supported here. + // Execution on host is not supported here. if (!Queue) { throw runtime_error("Not supported configuration of unmap requested", PI_ERROR_INVALID_OPERATION); diff --git a/sycl/source/detail/platform_impl.hpp b/sycl/source/detail/platform_impl.hpp index bc6278d54f32c..0a926712eb806 100644 --- a/sycl/source/detail/platform_impl.hpp +++ b/sycl/source/detail/platform_impl.hpp @@ -32,9 +32,6 @@ class device_impl; // TODO: implement parameters treatment for host device class platform_impl { public: - /// Constructs platform_impl for a SYCL host platform. - platform_impl() : MHostPlatform(true) {} - /// Constructs platform_impl from a plug-in interoperability platform /// handle. /// @@ -125,7 +122,6 @@ class platform_impl { // \return the Plugin associated with this platform. 
const PluginPtr &getPlugin() const { - assert(!MHostPlatform && "Plugin is not available for Host."); return MPlugin; } @@ -134,7 +130,6 @@ class platform_impl { /// \param PluginPtr is a pointer to a plugin instance /// \param Backend is the backend that we want this platform to use void setPlugin(PluginPtr &PluginPtr, backend Backend) { - assert(!MHostPlatform && "Plugin is not available for Host"); MPlugin = PluginPtr; MBackend = Backend; } @@ -214,7 +209,6 @@ class platform_impl { filterDeviceFilter(std::vector &PiDevices, ListT *FilterList) const; - bool MHostPlatform = false; sycl::detail::pi::PiPlatform MPlatform = 0; backend MBackend; diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index a5f9ae9964ac6..ae59239664327 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -655,7 +655,7 @@ bool queue_impl::ext_oneapi_empty() const { info::event_command_status::complete; } - // Check the status of the backend queue if this is not a host queue. + // Check the status of the backend queue. pi_bool IsReady = false; getPlugin()->call( MQueues[0], PI_EXT_ONEAPI_QUEUE_INFO_EMPTY, sizeof(pi_bool), &IsReady, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 6322b904fd6bc..d52fb0da025f3 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -79,7 +79,10 @@ static size_t deviceToID(const device &Device) { return reinterpret_cast(getSyclObjImpl(Device)->getHandleRef()); } -static std::string deviceToString(device Device) { +static std::string queueDeviceToString(const QueueImplPtr &Queue) { + if (!Queue) + return "host"; + auto Device = Queue->get_device(); if (Device.is_cpu()) return "CPU"; else if (Device.is_gpu()) @@ -91,15 +94,19 @@ static std::string deviceToString(device Device) { } static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) { - xpti::addMetadata(TraceEvent, "sycl_device", - Queue ? deviceToID(Queue->get_device()) : 0); - xpti::addMetadata(TraceEvent, "sycl_device_type", - Queue ? deviceToString(Queue->get_device()) : "host"); + xpti::addMetadata(TraceEvent, "sycl_device_type", queueDeviceToString(Queue)); if (Queue) + { + xpti::addMetadata(TraceEvent, "sycl_device", deviceToID(Queue->get_device())); xpti::addMetadata(TraceEvent, "sycl_device_name", getSyclObjImpl(Queue->get_device())->getDeviceName()); + } } +static unsigned long long getQueueID(const QueueImplPtr& Queue) +{ + return Queue ? Queue->getQueueID() : 0; +} #endif static ContextImplPtr getContext(const QueueImplPtr &Queue) { @@ -1009,8 +1016,7 @@ void AllocaCommandBase::emitInstrumentationData() { xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); } #endif } @@ -1081,8 +1087,7 @@ void AllocaCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA ON " - << (MQueue ? 
deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "ALLOCA ON " << queueDeviceToString(MQueue) << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Link : " << this->MLinkedAllocaCmd << "\\n"; Stream << "\"];" << std::endl; @@ -1130,8 +1135,7 @@ void AllocaSubBufCommand::emitInstrumentationData() { this->MRequirement.MAccessRange[0]); xpti::addMetadata(TE, "access_range_end", this->MRequirement.MAccessRange[1]); - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1168,8 +1172,7 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA SUB BUF ON " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "ALLOCA SUB BUF ON " << queueDeviceToString(MQueue)<< "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n"; Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n"; @@ -1207,8 +1210,7 @@ void ReleaseCommand::emitInstrumentationData() { commandToName(MAllocaCmd->getType())); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1282,8 +1284,7 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "RELEASE ON " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "RELEASE ON " << queueDeviceToString(MQueue) << "\\n"; Stream << " Alloca : " << MAllocaCmd << "\\n"; Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n"; Stream << "\"];" << std::endl; @@ -1327,8 +1328,7 @@ void MapMemObject::emitInstrumentationData() { xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1353,8 +1353,7 @@ void MapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "MAP ON " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "MAP ON " << queueDeviceToString(MQueue) : "host") << "\\n"; Stream << "\"];" << std::endl; @@ -1389,8 +1388,7 @@ void UnMapMemObject::emitInstrumentationData() { xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? 
MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1435,8 +1433,7 @@ void UnMapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "UNMAP ON " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "UNMAP ON " << queueDeviceToString(MQueue) << "\\n"; Stream << "\"];" << std::endl; @@ -1488,8 +1485,7 @@ void MemCpyCommand::emitInstrumentationData() { MQueue ? deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1546,8 +1542,7 @@ void MemCpyCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\""; Stream << "ID = " << this << " ; "; - Stream << "MEMCPY ON " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "MEMCPY ON " << queueDeviceToString(MQueue) << "\\n"; Stream << "From: " << MSrcAllocaCmd << " is host: " << !MSrcQueue << "\\n"; Stream << "To: " << MDstAllocaCmd << " is host: " << !MQueue << "\\n"; @@ -1603,8 +1598,7 @@ void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "UPDATE REQ ON " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "UPDATE REQ ON " << queueDeviceToString(MQueue) << "\\n"; bool IsReqOnBuffer = MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer; Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n"; @@ -1661,8 +1655,7 @@ void MemCpyCommandHost::emitInstrumentationData() { MQueue ? deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1750,8 +1743,7 @@ void EmptyCommand::emitInstrumentationData() { reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1780,8 +1772,7 @@ void MemCpyCommandHost::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "MEMCPY HOST ON " - << (MQueue ? 
deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "MEMCPY HOST ON " << queueDeviceToString(MQueue) << "\\n"; Stream << "\"];" << std::endl; @@ -1819,8 +1810,7 @@ void UpdateHostRequirementCommand::emitInstrumentationData() { reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -2087,9 +2077,7 @@ std::pair emitKernelInstrumentationData( if (CmdTraceEvent) { // Stash the queue_id mutable metadata in TLS - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - Queue ? Queue->getQueueID() : 0); - + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(Queue)); instrumentationAddExtraKernelMetadata(CmdTraceEvent, NDRDesc, KernelBundleImplPtr, SyclKernelName, SyclKernel, Queue, CGArgs); @@ -2133,8 +2121,7 @@ void ExecCGCommand::emitInstrumentationData() { CmdTraceEvent); if (CmdTraceEvent) { - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); MTraceEvent = static_cast(CmdTraceEvent); if (MCommandGroup->getType() == detail::CG::Kernel) { auto KernelCG = @@ -2157,8 +2144,7 @@ void ExecCGCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "EXEC CG ON " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n"; + Stream << "EXEC CG ON " << queueDeviceToString(MQueue) << "\\n"; switch (MCommandGroup->getType()) { case detail::CG::Kernel: { @@ -3353,8 +3339,7 @@ void KernelFusionCommand::emitInstrumentationData() { if (MFirstInstance) { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, - MQueue ? MQueue->getQueueID() : 0); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); xptiNotifySubscribers(MStreamID, NotificationTraceType, detail::GSYCLGraphEvent, static_cast(MTraceEvent), MInstanceID, @@ -3368,8 +3353,7 @@ void KernelFusionCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "KERNEL FUSION on " - << (MQueue ? deviceToString(MQueue->get_device()) : "host") << "\\n" + Stream << "KERNEL FUSION on " << queueDeviceToString(MQueue) << "\\n" << "FUSION LIST: {"; bool Initial = true; for (auto *Cmd : MFusionList) { diff --git a/sycl/source/detail/scheduler/scheduler.hpp b/sycl/source/detail/scheduler/scheduler.hpp index 03372fc0b7a8f..cd5ae6bd0e0fe 100644 --- a/sycl/source/detail/scheduler/scheduler.hpp +++ b/sycl/source/detail/scheduler/scheduler.hpp @@ -213,16 +213,16 @@ struct MemObjRecord { // Contains latest write commands working with memory object. LeavesCollection MWriteLeaves; - // The flag indicates that the content of the memory object was/will be - // modified. Used while deciding if copy back needed. - bool MMemModified = false; - // The context which has the latest state of the memory object. 
ContextImplPtr MCurContext; - // The mode this object can be accessed with from the host (host_accessor). + // The mode this object can be accessed from the host (host_accessor). // Valid only if the current usage is on host. access::mode MHostAccess = access::mode::read_write; + + // The flag indicates that the content of the memory object was/will be + // modified. Used while deciding if copy back needed. + bool MMemModified = false; }; /// DPC++ graph scheduler class. diff --git a/sycl/source/device.cpp b/sycl/source/device.cpp index a3a88ebf6636a..18b9cf4036cda 100644 --- a/sycl/source/device.cpp +++ b/sycl/source/device.cpp @@ -71,7 +71,7 @@ std::vector device::get_devices(info::device_type deviceType) { cl_device_id device::get() const { return impl->get(); } bool device::is_host() const { - assert(true && "device::is_host should not be called in implementation."); + assert(false && "device::is_host should not be called in implementation."); return false; } diff --git a/sycl/source/event.cpp b/sycl/source/event.cpp index 12b4a7e68164e..69d62f354ea4c 100644 --- a/sycl/source/event.cpp +++ b/sycl/source/event.cpp @@ -38,7 +38,7 @@ bool event::operator==(const event &rhs) const { return rhs.impl == impl; } bool event::operator!=(const event &rhs) const { return !(*this == rhs); } bool event::is_host() const { - assert(true && "event::is_host should not be called in implementation."); + assert(false && "event::is_host should not be called in implementation."); return false; } diff --git a/sycl/source/kernel.cpp b/sycl/source/kernel.cpp index bc842f6e596a5..625eb995c47d3 100644 --- a/sycl/source/kernel.cpp +++ b/sycl/source/kernel.cpp @@ -31,7 +31,7 @@ kernel::kernel(cl_kernel ClKernel, const context &SyclContext) cl_kernel kernel::get() const { return impl->get(); } bool kernel::is_host() const { - assert(true && "kernel::is_host should not be called in implementation."); + assert(false && "kernel::is_host should not be called in implementation."); return false; } diff --git a/sycl/source/platform.cpp b/sycl/source/platform.cpp index 9a15943213ec6..179c8c09d0825 100644 --- a/sycl/source/platform.cpp +++ b/sycl/source/platform.cpp @@ -41,7 +41,7 @@ bool platform::has_extension(const std::string &ExtensionName) const { } bool platform::is_host() const { - assert(true && "platform::is_host should not be called in implementation."); + assert(false && "platform::is_host should not be called in implementation."); return false; } diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 174d1f9197af1..5cd0bd3449095 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -96,7 +96,7 @@ queue::ext_oneapi_get_graph() const { } bool queue::is_host() const { - assert(true && "queue::is_host should not be called in implementation."); + assert(false && "queue::is_host should not be called in implementation."); return false; } diff --git a/sycl/test-e2e/SubGroup/sub_groups_sycl2020.cpp b/sycl/test-e2e/SubGroup/sub_groups_sycl2020.cpp index 5b71a60a54051..a7d4c6493b8b5 100644 --- a/sycl/test-e2e/SubGroup/sub_groups_sycl2020.cpp +++ b/sycl/test-e2e/SubGroup/sub_groups_sycl2020.cpp @@ -1,9 +1,5 @@ // RUN: %{build} -o %t.out // RUN: %{run} %t.out -// -// Assertion `!MHostPlatform && "Plugin is not available for Host."' failed on -// Nvidia. 
-// XFAIL: hip_nvidia #include From a87b32817a46d1dfdba9205163106f2af565ea6c Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 26 Jun 2024 04:35:59 -0700 Subject: [PATCH 41/52] fix Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.hpp | 4 ++-- sycl/source/detail/scheduler/commands.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 12b58d25ab3cd..f609bd96b7189 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -49,8 +49,8 @@ class event_impl { /// Normally constructs a host event, use std::nullopt to instead instantiate /// a device event. event_impl(std::optional State = HES_Complete) - : MIsInitialized(false), MIsHostEvent(State), MIsFlushed(true), - MState(State.value_or(HES_Complete)) { + : MIsInitialized(false), MIsFlushed(true), + MState(State.value_or(HES_Complete)), MIsHostEvent(State) { // Need to fail in event() constructor if there are problems with the // ONEAPI_DEVICE_SELECTOR. Deferring may lead to conficts with noexcept // event methods. This ::get() call uses static vars to read and parse the diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index d52fb0da025f3..9d9315652ed55 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -1353,7 +1353,7 @@ void MapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "MAP ON " << queueDeviceToString(MQueue) : "host") << "\\n"; + Stream << "MAP ON " << queueDeviceToString(MQueue) << "\\n"; Stream << "\"];" << std::endl; From 0a5a7583eef8f597c8b82c70a8671aeb1f45097c Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 26 Jun 2024 07:18:55 -0700 Subject: [PATCH 42/52] Update isCOntextInitialized stuff Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.cpp | 27 +++++----- sycl/source/detail/event_impl.hpp | 39 ++++++++------- sycl/source/detail/helpers.cpp | 4 +- sycl/source/detail/scheduler/commands.cpp | 49 ++++++++++++------- sycl/source/detail/scheduler/scheduler.cpp | 4 +- sycl/source/queue.cpp | 2 +- sycl/unittests/buffer/BufferReleaseBase.hpp | 4 -- sycl/unittests/pi/PiMock.cpp | 4 -- .../scheduler/EnqueueWithDependsOnDeps.cpp | 4 -- .../scheduler/InOrderQueueHostTaskDeps.cpp | 4 -- sycl/unittests/scheduler/KernelFusion.cpp | 4 -- 11 files changed, 66 insertions(+), 79 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index f4ad52221ed37..58a52230f1269 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -33,8 +33,8 @@ extern xpti::trace_event_data_t *GSYCLGraphEvent; #endif // If we do not yet have a context, use the default one. 
-void event_impl::ensureContextInitialized() { - if (MIsContextInitialized) +void event_impl::tryToInitContext() { + if (MContext || !MIsDefaultConstructed) return; const device SyclDevice; @@ -114,12 +114,12 @@ const sycl::detail::pi::PiEvent &event_impl::getHandleRef() const { sycl::detail::pi::PiEvent &event_impl::getHandleRef() { return MEvent; } const ContextImplPtr &event_impl::getContextImpl() { - ensureContextInitialized(); + tryToInitContext(); return MContext; } const PluginPtr &event_impl::getPlugin() { - ensureContextInitialized(); + tryToInitContext(); return MContext->getPlugin(); } @@ -128,14 +128,12 @@ void event_impl::setStateIncomplete() { MState = HES_NotComplete; } void event_impl::setContextImpl(const ContextImplPtr &Context) { MIsHostEvent = Context == nullptr; MContext = Context; - MIsContextInitialized = true; } event_impl::event_impl(sycl::detail::pi::PiEvent Event, const context &SyclContext) - : MIsContextInitialized(true), MEvent(Event), - MContext(detail::getSyclObjImpl(SyclContext)), MIsFlushed(true), - MState(HES_Complete) { + : MEvent(Event), MContext(detail::getSyclObjImpl(SyclContext)), + MIsFlushed(true), MState(HES_Complete) { sycl::detail::pi::PiContext TempContext; getPlugin()->call( @@ -398,7 +396,7 @@ event_impl::get_info() { template <> typename info::platform::version::return_type event_impl::get_backend_info() const { - if (!MIsContextInitialized) { + if (!MContext) { return "Context not initialized, no backend info available"; } if (MContext->getBackend() != backend::opencl) { @@ -419,7 +417,7 @@ event_impl::get_backend_info() const { template <> typename info::device::version::return_type event_impl::get_backend_info() const { - if (!MIsContextInitialized) { + if (!MContext) { return "Context not initialized, no backend info available"; } if (MContext->getBackend() != backend::opencl) { @@ -437,7 +435,7 @@ event_impl::get_backend_info() const { template <> typename info::device::backend_version::return_type event_impl::get_backend_info() const { - if (!MIsContextInitialized) { + if (!MContext) { return "Context not initialized, no backend info available"; } if (MContext->getBackend() != backend::ext_oneapi_level_zero) { @@ -456,11 +454,12 @@ void HostProfilingInfo::start() { StartTime = getTimestamp(); } void HostProfilingInfo::end() { EndTime = getTimestamp(); } pi_native_handle event_impl::getNative() { - ensureContextInitialized(); + if (isHost()) + return {}; + tryToInitContext(); auto Plugin = getPlugin(); - if (!MIsInitialized) { - MIsInitialized = true; + if (MIsDefaultConstructed && !MEvent) { auto TempContext = MContext.get()->getHandleRef(); Plugin->call(TempContext, &MEvent); } diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index f609bd96b7189..f4c2ac2e90a86 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -49,8 +49,8 @@ class event_impl { /// Normally constructs a host event, use std::nullopt to instead instantiate /// a device event. event_impl(std::optional State = HES_Complete) - : MIsInitialized(false), MIsFlushed(true), - MState(State.value_or(HES_Complete)), MIsHostEvent(State) { + : MIsFlushed(true), MState(State.value_or(HES_Complete)), + MIsDefaultConstructed(!State), MIsHostEvent(State) { // Need to fail in event() constructor if there are problems with the // ONEAPI_DEVICE_SELECTOR. Deferring may lead to conficts with noexcept // event methods. 
This ::get() call uses static vars to read and parse the @@ -255,15 +255,6 @@ class event_impl { QueueImplPtr getSubmittedQueue() const { return MSubmittedQueue.lock(); }; - /// Checks if an event is in a fully intialized state. Default-constructed - /// events will return true only after having initialized its native event, - /// while other events will assume that they are fully initialized at - /// construction, relying on external sources to supply member data. - /// - /// \return true if the event is considered to be in a fully initialized - /// state. - bool isInitialized() const noexcept { return MIsInitialized; } - /// Checks if this event is complete. /// /// \return true if this event is complete. @@ -279,10 +270,11 @@ class event_impl { MPostCompleteEvents.push_back(Event); } - bool isContextInitialized() const noexcept { return MIsContextInitialized; } + bool isDefaultConstructed() const noexcept { return MIsDefaultConstructed; } ContextImplPtr getContextImplPtr() { - ensureContextInitialized(); + if (MIsDefaultConstructed) + tryToInitContext(); return MContext; } @@ -347,11 +339,7 @@ class event_impl { void instrumentationEpilog(void *TelementryEvent, const std::string &Name, int32_t StreamID, uint64_t IId) const; void checkProfilingPreconditions() const; - // Events constructed without a context will lazily use the default context - // when needed. - void ensureContextInitialized(); - bool MIsInitialized = true; - bool MIsContextInitialized = false; + sycl::detail::pi::PiEvent MEvent = nullptr; // Stores submission time of command associated with event uint64_t MSubmitTime = 0; @@ -409,7 +397,20 @@ class event_impl { std::shared_ptr Context); std::atomic_bool MIsEnqueued{false}; - bool MIsHostEvent{false}; + + // Events constructed without a context will lazily use the default context + // when needed. + void tryToInitContext(); + // Event class represents 3 different kinds of operations: + // | type | has PI event | MContext | MIsHostTask | MIsDefaultConstructed | + // | dev | true | !nullptr | false | false | + // | host | false | nullptr | true | false | + // |default| * | * | false | true | + // Default constructed event is created with empty ctor in host code, MContext + // is lazily initialized with default device context on first context query. + // MEvent is lazily created in first pi handle query. + bool MIsDefaultConstructed = false; + bool MIsHostEvent = false; }; } // namespace detail diff --git a/sycl/source/detail/helpers.cpp b/sycl/source/detail/helpers.cpp index 75c6fd72b8fd0..901fd34b4cce8 100644 --- a/sycl/source/detail/helpers.cpp +++ b/sycl/source/detail/helpers.cpp @@ -31,9 +31,7 @@ getOrWaitEvents(std::vector DepEvents, ContextImplPtr Context) { // throwaway events created with empty constructor will not have a context // (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. 
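// [Editor's note: illustrative sketch, not part of the patch.] The comment above
// describes default-constructed ("throwaway") events: they are complete on
// creation, own no backend event, and only pick up a context lazily, so skipping
// them here avoids a needless and relatively expensive context query. Using only
// the public SYCL API (the function name below is invented for the example):
#include <sycl/sycl.hpp>
void exampleSkipDefaultConstructedDependency() {
  sycl::event Empty;                 // default-constructed, already complete
  Empty.wait();                      // returns immediately, no backend event used
  sycl::queue Q;
  Q.submit([&](sycl::handler &CGH) {
    CGH.depends_on(Empty);           // this dependency is skipped by getOrWaitEvents()
    CGH.single_task([] {});          // trivial kernel just to form a command group
  });
  Q.wait();
}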
- if ((!SyclEventImplPtr->isContextInitialized() && - !SyclEventImplPtr->isHost()) || - SyclEventImplPtr->isNOP()) { + if (SyclEventImplPtr->isDefaultConstructed() || SyclEventImplPtr->isNOP()) { continue; } // The fusion command and its event are associated with a non-host context, diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 9d9315652ed55..1b9aea1c10f02 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -95,16 +95,15 @@ static std::string queueDeviceToString(const QueueImplPtr &Queue) { static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) { xpti::addMetadata(TraceEvent, "sycl_device_type", queueDeviceToString(Queue)); - if (Queue) - { - xpti::addMetadata(TraceEvent, "sycl_device", deviceToID(Queue->get_device())); + if (Queue) { + xpti::addMetadata(TraceEvent, "sycl_device", + deviceToID(Queue->get_device())); xpti::addMetadata(TraceEvent, "sycl_device_name", getSyclObjImpl(Queue->get_device())->getDeviceName()); } } -static unsigned long long getQueueID(const QueueImplPtr& Queue) -{ +static unsigned long long getQueueID(const QueueImplPtr &Queue) { return Queue ? Queue->getQueueID() : 0; } #endif @@ -279,7 +278,7 @@ std::vector Command::getPiEventsBlocking( // (which is set lazily) calling getContextImpl() would set that // context, which we wish to avoid as it is expensive. // Skip host task and NOP events also. - if (!EventImpl->isContextInitialized() || EventImpl->isHost() || + if (EventImpl->isDefaultConstructed() || EventImpl->isHost() || EventImpl->isNOP()) continue; // In this path nullptr native event means that the command has not been @@ -728,7 +727,8 @@ Command *Command::processDepEvent(EventImplPtr DepEvent, const DepDesc &Dep, // 2. Some types of commands do not produce PI events after they are // enqueued (e.g. alloca). Note that we can't check the pi event to make that // distinction since the command might still be unenqueued at this point. 
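// [Editor's note: summary added for readability, not part of the patch.] Taken
// together with the event_impl kind table earlier in this series, the dependency
// cases handled here are:
//   - host-task events:            never carry a PI event;
//   - default-constructed events:  carry a PI event only once it is lazily created;
//   - events of commands whose producesPiEvent() returns false (e.g. alloca):
//     enqueued without producing a PI event;
//   - remaining device events:     a PI event is expected once the command is enqueued.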
- bool PiEventExpected = (!DepEvent->isHost() && DepEvent->isInitialized()); + bool PiEventExpected = + (!DepEvent->isHost() && !DepEvent->isDefaultConstructed()); if (auto *DepCmd = static_cast(DepEvent->getCommand())) PiEventExpected &= DepCmd->producesPiEvent(); @@ -1016,7 +1016,8 @@ void AllocaCommandBase::emitInstrumentationData() { xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); } #endif } @@ -1135,7 +1136,8 @@ void AllocaSubBufCommand::emitInstrumentationData() { this->MRequirement.MAccessRange[0]); xpti::addMetadata(TE, "access_range_end", this->MRequirement.MAccessRange[1]); - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1172,7 +1174,7 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA SUB BUF ON " << queueDeviceToString(MQueue)<< "\\n"; + Stream << "ALLOCA SUB BUF ON " << queueDeviceToString(MQueue) << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n"; Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n"; @@ -1210,7 +1212,8 @@ void ReleaseCommand::emitInstrumentationData() { commandToName(MAllocaCmd->getType())); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1328,7 +1331,8 @@ void MapMemObject::emitInstrumentationData() { xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1388,7 +1392,8 @@ void UnMapMemObject::emitInstrumentationData() { xpti::addMetadata(TE, "memory_object", reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1485,7 +1490,8 @@ void MemCpyCommand::emitInstrumentationData() { MQueue ? 
deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1655,7 +1661,8 @@ void MemCpyCommandHost::emitInstrumentationData() { MQueue ? deviceToID(MQueue->get_device()) : 0); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1743,7 +1750,8 @@ void EmptyCommand::emitInstrumentationData() { reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -1810,7 +1818,8 @@ void UpdateHostRequirementCommand::emitInstrumentationData() { reinterpret_cast(MAddress)); // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); makeTraceEventEpilog(); } #endif @@ -2121,7 +2130,8 @@ void ExecCGCommand::emitInstrumentationData() { CmdTraceEvent); if (CmdTraceEvent) { - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); MTraceEvent = static_cast(CmdTraceEvent); if (MCommandGroup->getType() == detail::CG::Kernel) { auto KernelCG = @@ -3339,7 +3349,8 @@ void KernelFusionCommand::emitInstrumentationData() { if (MFirstInstance) { // Since we do NOT add queue_id value to metadata, we are stashing it to TLS // as this data is mutable and the metadata is supposed to be invariant - xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, getQueueID(MQueue)); + xpti::framework::stash_tuple(XPTI_QUEUE_INSTANCE_ID_KEY, + getQueueID(MQueue)); xptiNotifySubscribers(MStreamID, NotificationTraceType, detail::GSYCLGraphEvent, static_cast(MTraceEvent), MInstanceID, diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 905ca889aaf0d..4acc5b6c3a6a4 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -697,9 +697,7 @@ bool CheckEventReadiness(const ContextImplPtr &Context, // don't represent actual dependencies. Calling getContextImpl() would set // their context, which we wish to avoid as it is expensive. // NOP events also don't represent actual dependencies. 
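// [Editor's note, not part of the patch.] Same early-out reasoning as in
// getOrWaitEvents() earlier in this series: default-constructed and NOP events are
// reported ready without touching the backend, while host-task events are handled
// by the separate isHost() branch just below rather than via a PI event query.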
- if ((!SyclEventImplPtr->isContextInitialized() && - !SyclEventImplPtr->isHost()) || - SyclEventImplPtr->isNOP()) { + if ((SyclEventImplPtr->isDefaultConstructed()) || SyclEventImplPtr->isNOP()) { return true; } if (SyclEventImplPtr->isHost()) { diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index 5cd0bd3449095..9c807f90061b5 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -244,7 +244,7 @@ event queue::ext_oneapi_submit_barrier(const std::vector &WaitList, bool AllEventsEmptyOrNop = std::all_of( begin(WaitList), end(WaitList), [&](const event &Event) -> bool { auto EventImpl = detail::getSyclObjImpl(Event); - return !EventImpl->isContextInitialized() || EventImpl->isNOP(); + return EventImpl->isDefaultConstructed() || EventImpl->isNOP(); }); if (is_in_order() && !impl->getCommandGraph() && !impl->MIsProfilingEnabled && AllEventsEmptyOrNop) diff --git a/sycl/unittests/buffer/BufferReleaseBase.hpp b/sycl/unittests/buffer/BufferReleaseBase.hpp index b35d73cb3909c..bfcc4fb8369ed 100644 --- a/sycl/unittests/buffer/BufferReleaseBase.hpp +++ b/sycl/unittests/buffer/BufferReleaseBase.hpp @@ -43,10 +43,6 @@ class BufferDestructionCheckCommon : public ::testing::Test { protected: void SetUp() override { - if (Plt.is_host()) { - std::cout << "Not run due to host-only environment\n"; - GTEST_SKIP(); - } MockSchedulerPtr = new MockScheduler(); sycl::detail::GlobalHandler::instance().attachScheduler( dynamic_cast(MockSchedulerPtr)); diff --git a/sycl/unittests/pi/PiMock.cpp b/sycl/unittests/pi/PiMock.cpp index c7014162f9cf8..02044d9631376 100644 --- a/sycl/unittests/pi/PiMock.cpp +++ b/sycl/unittests/pi/PiMock.cpp @@ -56,10 +56,6 @@ TEST(PiMockTest, ConstructFromQueue) { sycl::unittest::PiMock Mock; queue MockQ{Mock.getPlatform().get_devices()[0]}; queue NormalQ; - if (NormalQ.is_host()) { - std::cerr << "Not run due to host-only environment\n"; - return; - } const auto &NormalPiPlugin = detail::getSyclObjImpl(NormalQ)->getPlugin()->getPiPlugin(); diff --git a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp index e1bc8c894f311..08f03420ac54e 100644 --- a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp +++ b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp @@ -26,10 +26,6 @@ constexpr auto DisableCleanupName = "SYCL_DISABLE_EXECUTION_GRAPH_CLEANUP"; std::vector> PassedNumEvents; bool CheckTestExecutionRequirements(const platform &plt) { - if (plt.is_host()) { - std::cout << "Not run due to host-only environment\n"; - return false; - } // This test only contains device image for SPIR-V capable devices. 
if (plt.get_backend() != sycl::backend::opencl && plt.get_backend() != sycl::backend::ext_oneapi_level_zero) { diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index 8693ff5e4c52b..929f8735bc85f 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -130,10 +130,6 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { customextUSMEnqueueMemset); sycl::platform Plt = Mock.getPlatform(); - if (Plt.is_host()) { - std::cout << "Not run due to host-only environment\n"; - GTEST_SKIP(); - } context Ctx{Plt}; queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; diff --git a/sycl/unittests/scheduler/KernelFusion.cpp b/sycl/unittests/scheduler/KernelFusion.cpp index 8b45c03e37f1f..5a86636b13c09 100644 --- a/sycl/unittests/scheduler/KernelFusion.cpp +++ b/sycl/unittests/scheduler/KernelFusion.cpp @@ -42,10 +42,6 @@ detail::Command *CreateTaskCommand(MockScheduler &MS, } bool CheckTestExecRequirements(const platform &plt) { - if (plt.is_host()) { - std::cout << "Not run due to host-only environment\n"; - return false; - } // This test only contains device image for SPIR-V capable devices. if (plt.get_backend() != sycl::backend::opencl && plt.get_backend() != sycl::backend::ext_oneapi_level_zero) { From 97c4ce548c894ab94b223fd66d1d18f7a97f7d78 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 26 Jun 2024 12:00:51 -0700 Subject: [PATCH 43/52] prepare removal from handler Signed-off-by: Tikhomirova, Kseniya --- sycl/include/sycl/handler.hpp | 69 +++++++++------------------- sycl/source/detail/platform_impl.hpp | 4 +- 2 files changed, 23 insertions(+), 50 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index a71f5400a813d..19d0c5ac1e85e 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -178,22 +178,22 @@ template -static Arg member_ptr_helper(RetType (Func::*)(Arg) const); +static Arg member_ptr_helper(RetType (Func:: *)(Arg) const); // Non-const version of the above template to match functors whose 'operator()' // is declared w/o the 'const' qualifier. template -static Arg member_ptr_helper(RetType (Func::*)(Arg)); +static Arg member_ptr_helper(RetType (Func:: *)(Arg)); // Version with two arguments to handle the case when kernel_handler is passed // to a lambda template -static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2) const); +static Arg1 member_ptr_helper(RetType (Func:: *)(Arg1, Arg2) const); // Non-const version of the above template to match functors whose 'operator()' // is declared w/o the 'const' qualifier. template -static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2)); +static Arg1 member_ptr_helper(RetType (Func:: *)(Arg1, Arg2)); template decltype(member_ptr_helper(&F::operator())) argument_helper(int); @@ -464,8 +464,8 @@ class __SYCL_EXPORT handler { /// Constructs SYCL handler from queue. /// /// \param Queue is a SYCL queue. - /// \param IsHost indicates if this handler is created for SYCL host device. - handler(std::shared_ptr Queue, bool IsHost); + handler(std::shared_ptr Queue, + bool /*ABI Break: to remove */); /// Constructs SYCL handler from the associated queue and the submission's /// primary and secondary queue. @@ -475,10 +475,10 @@ class __SYCL_EXPORT handler { /// \param PrimaryQueue is the primary SYCL queue of the submission. 
/// \param SecondaryQueue is the secondary SYCL queue of the submission. This /// is null if no secondary queue is associated with the submission. - /// \param IsHost indicates if this handler is created for SYCL host device. handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue, bool IsHost); + std::shared_ptr SecondaryQueue, + bool /*ABI Break: to remove */); /// Constructs SYCL handler from Graph. /// @@ -609,7 +609,7 @@ class __SYCL_EXPORT handler { ~handler() = default; // TODO: Private and unusued. Remove when ABI break is allowed. - bool is_host() { return MIsHost; } + bool is_host() { return false; } #ifdef __SYCL_DEVICE_ONLY__ // In device compilation accessor isn't inherited from host base classes, so @@ -888,12 +888,6 @@ class __SYCL_EXPORT handler { detail::KernelLambdaHasKernelHandlerArgT::value; - if (IsCallableWithKernelHandler && MIsHost) { - throw sycl::feature_not_supported( - "kernel_handler is not yet supported by host device.", - PI_ERROR_INVALID_OPERATION); - } - KernelType *KernelPtr = ResetHostKernel(KernelFunc); @@ -1042,8 +1036,7 @@ class __SYCL_EXPORT handler { std::enable_if_t<(DimSrc > 0) && (DimDst > 0), bool> copyAccToAccHelper(accessor Src, accessor Dst) { - if (!MIsHost && - IsCopyingRectRegionAvailable(Src.get_range(), Dst.get_range())) + if (IsCopyingRectRegionAvailable(Src.get_range(), Dst.get_range())) return false; range<1> LinearizedRange(Src.size()); @@ -1065,6 +1058,7 @@ class __SYCL_EXPORT handler { /// /// \param Src is a source SYCL accessor. /// \param Dst is a destination SYCL accessor. + // ABI break: to remove whole method template copyAccToAccHelper(accessor Src, accessor Dst) { - if (!MIsHost) - return false; - - single_task<__copyAcc2Acc>( - [=]() { *(Dst.get_pointer()) = *(Src.get_pointer()); }); - return true; + return false; } #ifndef __SYCL_DEVICE_ONLY__ + // ABI break: to remove whole method /// Copies the content of memory object accessed by Src into the memory /// pointed by Dst. /// @@ -1101,6 +1090,7 @@ class __SYCL_EXPORT handler { }); } + // ABI break: to remove whole method /// Copies 1 element accessed by 0-dimensional accessor Src into the memory /// pointed by Dst. /// @@ -1118,6 +1108,7 @@ class __SYCL_EXPORT handler { }); } + // ABI break: to remove whole method /// Copies the memory pointed by Src into the memory accessed by Dst. /// /// \param Src is a pointer to source memory. @@ -1135,6 +1126,7 @@ class __SYCL_EXPORT handler { }); } + // ABI break: to remove whole method /// Copies 1 element pointed by Src to memory accessed by 0-dimensional /// accessor Dst. 
/// @@ -2245,7 +2237,7 @@ class __SYCL_EXPORT handler { MNDRDesc.set(range<1>{1}); MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); - if (!MIsHost && !lambdaAndKernelHaveEqualName()) { + if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); } else @@ -2282,7 +2274,7 @@ class __SYCL_EXPORT handler { MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); setNDRangeUsed(false); - if (!MIsHost && !lambdaAndKernelHaveEqualName()) { + if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); } else @@ -2322,7 +2314,7 @@ class __SYCL_EXPORT handler { MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); setNDRangeUsed(false); - if (!MIsHost && !lambdaAndKernelHaveEqualName()) { + if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); } else @@ -2361,7 +2353,7 @@ class __SYCL_EXPORT handler { MKernel = detail::getSyclObjImpl(std::move(Kernel)); setType(detail::CG::Kernel); setNDRangeUsed(true); - if (!MIsHost && !lambdaAndKernelHaveEqualName()) { + if (!lambdaAndKernelHaveEqualName()) { extractArgsAndReqs(); MKernelName = getKernelName(); } else @@ -2688,14 +2680,6 @@ class __SYCL_EXPORT handler { "Invalid accessor target for the copy method."); static_assert(isValidModeForSourceAccessor(AccessMode), "Invalid accessor mode for the copy method."); -#ifndef __SYCL_DEVICE_ONLY__ - if (MIsHost) { - // TODO: Temporary implementation for host. Should be handled by memory - // manager. - copyAccToPtrHost(Src, Dst); - return; - } -#endif setType(detail::CG::CopyAccToPtr); detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Src; @@ -2732,14 +2716,7 @@ class __SYCL_EXPORT handler { "Invalid accessor mode for the copy method."); // TODO: Add static_assert with is_device_copyable when vec is // device-copyable. -#ifndef __SYCL_DEVICE_ONLY__ - if (MIsHost) { - // TODO: Temporary implementation for host. Should be handled by memory - // manager. - copyPtrToAccHost(Src, Dst); - return; - } -#endif + setType(detail::CG::CopyPtrToAcc); detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst; @@ -2853,8 +2830,6 @@ class __SYCL_EXPORT handler { fill(accessor Dst, const T &Pattern) { - assert(!MIsHost && "fill() should no longer be callable on a host device."); - if (Dst.is_placeholder()) checkIfPlaceholderIsBoundToHandler(Dst); @@ -3392,7 +3367,7 @@ class __SYCL_EXPORT handler { /// Storage for the CG created when handling graph nodes added explicitly. std::unique_ptr MGraphNodeCG; - bool MIsHost = false; + bool MIsHost = false; // ABI break: to remove detail::code_location MCodeLoc = {}; bool MIsFinalized = false; diff --git a/sycl/source/detail/platform_impl.hpp b/sycl/source/detail/platform_impl.hpp index 0a926712eb806..dfb2597bf417b 100644 --- a/sycl/source/detail/platform_impl.hpp +++ b/sycl/source/detail/platform_impl.hpp @@ -121,9 +121,7 @@ class platform_impl { static std::vector get_platforms(); // \return the Plugin associated with this platform. - const PluginPtr &getPlugin() const { - return MPlugin; - } + const PluginPtr &getPlugin() const { return MPlugin; } /// Sets the platform implementation to use another plugin. 
/// From 6cf3171d7d43021fd668789e5b83d12331d41858 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 26 Jun 2024 12:05:12 -0700 Subject: [PATCH 44/52] fix test Signed-off-by: Tikhomirova, Kseniya --- sycl/test-e2e/Config/allowlist.cpp | 58 +++++++++++++----------------- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/sycl/test-e2e/Config/allowlist.cpp b/sycl/test-e2e/Config/allowlist.cpp index 121e911c0474c..7bfb16ca687d0 100644 --- a/sycl/test-e2e/Config/allowlist.cpp +++ b/sycl/test-e2e/Config/allowlist.cpp @@ -35,61 +35,51 @@ int main() { // Expected that the allowlist filter is not set if (getenv("PRINT_PLATFORM_INFO")) { for (const sycl::platform &Platform : sycl::platform::get_platforms()) - if (!Platform.is_host()) { + std::string Name = Platform.get_info(); + std::string Ver = Platform.get_info(); + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. + replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); - std::string Name = Platform.get_info(); - std::string Ver = Platform.get_info(); - // As a string will be used as regexp pattern, we need to get rid of - // symbols that can be treated in a special way. - replaceSpecialCharacters(Name); - replaceSpecialCharacters(Ver); + std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name + << "}},PlatformVersion:{{" << Ver << "}}"; - std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name - << "}},PlatformVersion:{{" << Ver << "}}"; - - return 0; - } - throw std::runtime_error("Non host device is not found"); + return 0; } // Expected that the allowlist filter is not set if (getenv("PRINT_DEVICE_INFO")) { for (const sycl::platform &Platform : sycl::platform::get_platforms()) - if (!Platform.is_host()) { - const sycl::device Dev = Platform.get_devices().at(0); - std::string Name = Dev.get_info(); - std::string Ver = Dev.get_info(); + const sycl::device Dev = Platform.get_devices().at(0); + std::string Name = Dev.get_info(); + std::string Ver = Dev.get_info(); - // As a string will be used as regexp pattern, we need to get rid of - // symbols that can be treated in a special way. - replaceSpecialCharacters(Name); - replaceSpecialCharacters(Ver); + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. 
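// [Editor's note: illustrative sketch, not part of the patch.] The test's real
// replaceSpecialCharacters() helper is not shown in this part of the diff; a
// hypothetical stand-in with the behaviour the comment describes (escaping regex
// metacharacters in place so the value can sit inside the {{...}} pattern of
// SYCL_DEVICE_ALLOWLIST) could look like this:
#include <regex>
#include <string>
inline void replaceSpecialCharactersSketch(std::string &Str) {
  // Prefix every ECMAScript regex metacharacter with a backslash.
  static const std::regex Metacharacters{R"([.^$|()\[\]{}*+?\\])"};
  Str = std::regex_replace(Str, Metacharacters, R"(\$&)");
}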
+ replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); - std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name - << "}},DriverVersion:{{" << Ver << "}}"; + std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name + << "}},DriverVersion:{{" << Ver << "}}"; - return 0; - } - throw std::runtime_error("Non host device is not found"); + return 0; } // Expected the allowlist to be set with the "PRINT_DEVICE_INFO" run result if (getenv("TEST_DEVICE_AVAILABLE")) { for (const sycl::platform &Platform : sycl::platform::get_platforms()) - if (!Platform.is_host()) { - if (Platform.get_devices().size() != 1) - throw std::runtime_error("Expected only one non host device."); + if (Platform.get_devices().size() != 1) + throw std::runtime_error("Expected only one device."); - return 0; - } - throw std::runtime_error("Non host device is not found"); + return 0; + } } // Expected the allowlist to be set but empty if (getenv("TEST_DEVICE_IS_NOT_AVAILABLE")) { for (const sycl::platform &Platform : sycl::platform::get_platforms()) - if (!Platform.is_host()) - throw std::runtime_error("Expected no non host device is available"); + throw std::runtime_error("Expected no device is available"); return 0; } From 989557abba027be8a90c106ac69bac046016565d Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Wed, 26 Jun 2024 12:22:56 -0700 Subject: [PATCH 45/52] fix clang-format Signed-off-by: Tikhomirova, Kseniya --- sycl/include/sycl/handler.hpp | 8 +++--- sycl/test-e2e/Config/allowlist.cpp | 40 +++++++++++++++--------------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 19d0c5ac1e85e..6df476e2d2d96 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -178,22 +178,22 @@ template -static Arg member_ptr_helper(RetType (Func:: *)(Arg) const); +static Arg member_ptr_helper(RetType (Func::*)(Arg) const); // Non-const version of the above template to match functors whose 'operator()' // is declared w/o the 'const' qualifier. template -static Arg member_ptr_helper(RetType (Func:: *)(Arg)); +static Arg member_ptr_helper(RetType (Func::*)(Arg)); // Version with two arguments to handle the case when kernel_handler is passed // to a lambda template -static Arg1 member_ptr_helper(RetType (Func:: *)(Arg1, Arg2) const); +static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2) const); // Non-const version of the above template to match functors whose 'operator()' // is declared w/o the 'const' qualifier. template -static Arg1 member_ptr_helper(RetType (Func:: *)(Arg1, Arg2)); +static Arg1 member_ptr_helper(RetType (Func::*)(Arg1, Arg2)); template decltype(member_ptr_helper(&F::operator())) argument_helper(int); diff --git a/sycl/test-e2e/Config/allowlist.cpp b/sycl/test-e2e/Config/allowlist.cpp index 7bfb16ca687d0..7891088db5abb 100644 --- a/sycl/test-e2e/Config/allowlist.cpp +++ b/sycl/test-e2e/Config/allowlist.cpp @@ -36,34 +36,34 @@ int main() { if (getenv("PRINT_PLATFORM_INFO")) { for (const sycl::platform &Platform : sycl::platform::get_platforms()) std::string Name = Platform.get_info(); - std::string Ver = Platform.get_info(); - // As a string will be used as regexp pattern, we need to get rid of - // symbols that can be treated in a special way. - replaceSpecialCharacters(Name); - replaceSpecialCharacters(Ver); + std::string Ver = Platform.get_info(); + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. 
+ replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); - std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name - << "}},PlatformVersion:{{" << Ver << "}}"; + std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name + << "}},PlatformVersion:{{" << Ver << "}}"; - return 0; + return 0; } // Expected that the allowlist filter is not set if (getenv("PRINT_DEVICE_INFO")) { for (const sycl::platform &Platform : sycl::platform::get_platforms()) const sycl::device Dev = Platform.get_devices().at(0); - std::string Name = Dev.get_info(); - std::string Ver = Dev.get_info(); + std::string Name = Dev.get_info(); + std::string Ver = Dev.get_info(); - // As a string will be used as regexp pattern, we need to get rid of - // symbols that can be treated in a special way. - replaceSpecialCharacters(Name); - replaceSpecialCharacters(Ver); + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. + replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); - std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name - << "}},DriverVersion:{{" << Ver << "}}"; + std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name + << "}},DriverVersion:{{" << Ver << "}}"; - return 0; + return 0; } // Expected the allowlist to be set with the "PRINT_DEVICE_INFO" run result @@ -72,14 +72,14 @@ int main() { if (Platform.get_devices().size() != 1) throw std::runtime_error("Expected only one device."); - return 0; - } + return 0; + } } // Expected the allowlist to be set but empty if (getenv("TEST_DEVICE_IS_NOT_AVAILABLE")) { for (const sycl::platform &Platform : sycl::platform::get_platforms()) - throw std::runtime_error("Expected no device is available"); + throw std::runtime_error("Expected no device is available"); return 0; } From 1a139752d02529ac27903be31b1e772e994aeb34 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 27 Jun 2024 03:41:00 -0700 Subject: [PATCH 46/52] fix warning Signed-off-by: Tikhomirova, Kseniya --- sycl/include/sycl/handler.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 6df476e2d2d96..a536d41f329e0 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -1064,8 +1064,8 @@ class __SYCL_EXPORT handler { access::mode ModeDst, access::target TargetDst, access::placeholder IsPHSrc, access::placeholder IsPHDst> std::enable_if_t - copyAccToAccHelper(accessor Src, - accessor Dst) { + copyAccToAccHelper(accessor, + accessor) { return false; } From e9fffb6419638e729ca7a9da32bd054b50a1dc37 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Thu, 27 Jun 2024 03:48:10 -0700 Subject: [PATCH 47/52] fix allowlist test cherry-pick issues Signed-off-by: Tikhomirova, Kseniya --- sycl/test-e2e/Config/allowlist.cpp | 49 ++++++++++++++++-------------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/sycl/test-e2e/Config/allowlist.cpp b/sycl/test-e2e/Config/allowlist.cpp index 7891088db5abb..393326cb76283 100644 --- a/sycl/test-e2e/Config/allowlist.cpp +++ b/sycl/test-e2e/Config/allowlist.cpp @@ -34,46 +34,51 @@ int main() { // Expected that the allowlist filter is not set if (getenv("PRINT_PLATFORM_INFO")) { - for (const sycl::platform &Platform : sycl::platform::get_platforms()) + for (const sycl::platform &Platform : sycl::platform::get_platforms()) { std::string Name = Platform.get_info(); - std::string Ver = Platform.get_info(); - // As a string will be used as regexp 
pattern, we need to get rid of - // symbols that can be treated in a special way. - replaceSpecialCharacters(Name); - replaceSpecialCharacters(Ver); + std::string Ver = Platform.get_info(); + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. + replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); - std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name - << "}},PlatformVersion:{{" << Ver << "}}"; + std::cout << "SYCL_DEVICE_ALLOWLIST=PlatformName:{{" << Name + << "}},PlatformVersion:{{" << Ver << "}}"; - return 0; + return 0; + } + throw std::runtime_error("No device is found"); } // Expected that the allowlist filter is not set if (getenv("PRINT_DEVICE_INFO")) { - for (const sycl::platform &Platform : sycl::platform::get_platforms()) + for (const sycl::platform &Platform : sycl::platform::get_platforms()) { const sycl::device Dev = Platform.get_devices().at(0); - std::string Name = Dev.get_info(); - std::string Ver = Dev.get_info(); + std::string Name = Dev.get_info(); + std::string Ver = Dev.get_info(); - // As a string will be used as regexp pattern, we need to get rid of - // symbols that can be treated in a special way. - replaceSpecialCharacters(Name); - replaceSpecialCharacters(Ver); + // As a string will be used as regexp pattern, we need to get rid of + // symbols that can be treated in a special way. + replaceSpecialCharacters(Name); + replaceSpecialCharacters(Ver); - std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name - << "}},DriverVersion:{{" << Ver << "}}"; + std::cout << "SYCL_DEVICE_ALLOWLIST=DeviceName:{{" << Name + << "}},DriverVersion:{{" << Ver << "}}"; - return 0; + return 0; + } + throw std::runtime_error("No device is found"); } // Expected the allowlist to be set with the "PRINT_DEVICE_INFO" run result if (getenv("TEST_DEVICE_AVAILABLE")) { - for (const sycl::platform &Platform : sycl::platform::get_platforms()) + for (const sycl::platform &Platform : sycl::platform::get_platforms()) { if (Platform.get_devices().size() != 1) throw std::runtime_error("Expected only one device."); - return 0; - } + return 0; + } + throw std::runtime_error("No device is found"); } // Expected the allowlist to be set but empty From 6ec2b63ecaedf8476d8a7dab3ce1bcc7b6e5963d Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 1 Jul 2024 05:06:17 -0700 Subject: [PATCH 48/52] fix code review comments Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.cpp | 14 +---- sycl/source/detail/scheduler/commands.cpp | 60 +++++++------------ .../source/detail/scheduler/graph_builder.cpp | 4 +- sycl/source/detail/scheduler/scheduler.cpp | 2 +- sycl/source/detail/xpti_registry.cpp | 15 +++++ sycl/source/detail/xpti_registry.hpp | 3 + sycl/test-e2e/Config/allowlist.cpp | 2 +- 7 files changed, 47 insertions(+), 53 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 0ec8f57abb596..6f6e72fbd2af9 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -26,7 +26,7 @@ namespace sycl { inline namespace _V1 { namespace detail { -// Treat 0 as reserved for "host" queue +// Treat 0 as reserved for host task traces std::atomic queue_impl::MNextAvailableQueueID = 1; thread_local bool NestedCallsDetector = false; @@ -498,17 +498,7 @@ void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc, xpti_at::active, &QWaitInstanceNo); IId = QWaitInstanceNo; if (WaitEvent) { - device D = get_device(); 
- std::string DevStr; - if (D.is_cpu()) - DevStr = "CPU"; - else if (D.is_gpu()) - DevStr = "GPU"; - else if (D.is_accelerator()) - DevStr = "ACCELERATOR"; - else - DevStr = "UNKNOWN"; - xpti::addMetadata(WaitEvent, "sycl_device_type", DevStr); + xpti::addMetadata(WaitEvent, "sycl_device_type", queueDeviceToString(this)); if (HasSourceInfo) { xpti::addMetadata(WaitEvent, "sym_function_name", CodeLoc.functionName()); xpti::addMetadata(WaitEvent, "sym_source_file_name", CodeLoc.fileName()); diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 2ab4663c5db20..9ea45424f0ce5 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -79,22 +79,8 @@ static size_t deviceToID(const device &Device) { return reinterpret_cast(getSyclObjImpl(Device)->getHandleRef()); } -static std::string queueDeviceToString(const QueueImplPtr &Queue) { - if (!Queue) - return "host"; - auto Device = Queue->get_device(); - if (Device.is_cpu()) - return "CPU"; - else if (Device.is_gpu()) - return "GPU"; - else if (Device.is_accelerator()) - return "ACCELERATOR"; - else - return "UNKNOWN"; -} - static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) { - xpti::addMetadata(TraceEvent, "sycl_device_type", queueDeviceToString(Queue)); + xpti::addMetadata(TraceEvent, "sycl_device_type", queueDeviceToString(Queue.get())); if (Queue) { xpti::addMetadata(TraceEvent, "sycl_device", deviceToID(Queue->get_device())); @@ -411,7 +397,7 @@ class DispatchHostTask { // we're ready to call the user-defined lambda now if (HostTask.MHostTask->isInteropTask()) { assert(HostTask.MQueue && - "Submitted queue for host task must be device queue"); + "Host task submissions should have an associated queue"); interop_handle IH{MReqToMem, HostTask.MQueue, HostTask.MQueue->getDeviceImplPtr(), HostTask.MQueue->getContextImplPtr()}; @@ -1088,7 +1074,7 @@ void AllocaCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "ALLOCA ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Link : " << this->MLinkedAllocaCmd << "\\n"; Stream << "\"];" << std::endl; @@ -1174,7 +1160,7 @@ void AllocaSubBufCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FFD28A\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "ALLOCA SUB BUF ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "ALLOCA SUB BUF ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << " MemObj : " << this->MRequirement.MSYCLMemObj << "\\n"; Stream << " Offset : " << this->MRequirement.MOffsetInBytes << "\\n"; Stream << " Access range : " << this->MRequirement.MAccessRange[0] << "\\n"; @@ -1287,7 +1273,7 @@ void ReleaseCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#FF827A\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "RELEASE ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "RELEASE ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << " Alloca : " << MAllocaCmd << "\\n"; Stream << " MemObj : " << MAllocaCmd->getSYCLMemObj() << "\\n"; Stream << "\"];" << std::endl; @@ -1357,7 +1343,7 @@ void MapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" 
<< this << "\" [style=filled, fillcolor=\"#77AFFF\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "MAP ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "MAP ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << "\"];" << std::endl; @@ -1438,7 +1424,7 @@ void UnMapMemObject::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#EBC40F\", label=\""; Stream << "ID = " << this << " ; "; - Stream << "UNMAP ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "UNMAP ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << "\"];" << std::endl; @@ -1548,7 +1534,7 @@ void MemCpyCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#C7EB15\" label=\""; Stream << "ID = " << this << " ; "; - Stream << "MEMCPY ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "MEMCPY ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << "From: " << MSrcAllocaCmd << " is host: " << !MSrcQueue << "\\n"; Stream << "To: " << MDstAllocaCmd << " is host: " << !MQueue << "\\n"; @@ -1604,7 +1590,7 @@ void UpdateHostRequirementCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#f1337f\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "UPDATE REQ ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "UPDATE REQ ON " << queueDeviceToString(MQueue.get()) << "\\n"; bool IsReqOnBuffer = MDstReq.MSYCLMemObj->getType() == SYCLMemObjI::MemObjType::Buffer; Stream << "TYPE: " << (IsReqOnBuffer ? "Buffer" : "Image") << "\\n"; @@ -1780,7 +1766,7 @@ void MemCpyCommandHost::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#B6A2EB\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "MEMCPY HOST ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "MEMCPY HOST ON " << queueDeviceToString(MQueue.get()) << "\\n"; Stream << "\"];" << std::endl; @@ -1971,7 +1957,7 @@ void instrumentationAddExtraKernelMetadata( if (!SyclKernel->isCreatedFromSource()) EliminatedArgMask = SyclKernel->getKernelArgMask(); } else { - assert(Queue && "Queue with submitted kernel could not be on host"); + assert(Queue && "Kernel submissions should have an associated queue"); std::tie(Kernel, KernelMutex, EliminatedArgMask, Program) = detail::ProgramManager::getInstance().getOrCreateKernel( Queue->getContextImplPtr(), Queue->getDeviceImplPtr(), KernelName); @@ -2154,7 +2140,7 @@ void ExecCGCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "EXEC CG ON " << queueDeviceToString(MQueue) << "\\n"; + Stream << "EXEC CG ON " << queueDeviceToString(MQueue.get()) << "\\n"; switch (MCommandGroup->getType()) { case detail::CG::Kernel: { @@ -2345,7 +2331,7 @@ static pi_result SetKernelParamsAndLaunch( const KernelArgMask *EliminatedArgMask, const std::function &getMemAllocationFunc, bool IsCooperative) { - assert(Queue && "Queue with submitted kernel could not be on host"); + assert(Queue && "Kernel submissions should have an associated queue"); const PluginPtr &Plugin = Queue->getPlugin(); auto setFunc = [&Plugin, Kernel, &DeviceImageImpl, &getMemAllocationFunc, @@ -2536,7 +2522,7 @@ pi_int32 enqueueImpKernel( const std::function &getMemAllocationFunc, sycl::detail::pi::PiKernelCacheConfig KernelCacheConfig, const bool KernelIsCooperative) { - assert(Queue && "Queue with submitted 
kernel could not be on host"); + assert(Queue && "Kernel submissions should have an associated queue"); // Run OpenCL kernel auto ContextImpl = Queue->getContextImplPtr(); auto DeviceImpl = Queue->getDeviceImplPtr(); @@ -2652,7 +2638,7 @@ enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, std::vector &RawEvents, const detail::EventImplPtr &OutEventImpl, bool read) { assert(Queue && - "Queue with submitted read write host pipe could not be on host"); + "ReadWrite host pipe submissions should have an associated queue"); detail::HostPipeMapEntry *hostPipeEntry = ProgramManager::getInstance().getHostPipeEntry(PipeName); @@ -2702,7 +2688,7 @@ enqueueReadWriteHostPipe(const QueueImplPtr &Queue, const std::string &PipeName, } pi_int32 ExecCGCommand::enqueueImpCommandBuffer() { - assert(MQueue && "Device queue is required for command buffer enqueue"); + assert(MQueue && "Command buffer enqueue should have an associated queue"); // Wait on host command dependencies waitForPreparedHostEvents(); @@ -2941,7 +2927,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::Kernel: { - assert(MQueue && "Device queue must be present for kernel command"); + assert(MQueue && "Kernel submissions should have an associated queue"); CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get(); NDRDescT &NDRDesc = ExecKernel->MNDRDesc; @@ -3094,7 +3080,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::Barrier: { - assert(MQueue && "Device queue must be present for barrier command"); + assert(MQueue && "Barrier submission should have an associated queue"); const PluginPtr &Plugin = MQueue->getPlugin(); if (MEvent != nullptr) MEvent->setHostEnqueueTime(); @@ -3105,7 +3091,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { } case CG::CGTYPE::BarrierWaitlist: { assert(MQueue && - "Device queue must be present for barrier with wait list command"); + "Barrier submission should have an associated queue"); CGBarrier *Barrier = static_cast(MCommandGroup.get()); std::vector Events = Barrier->MEventsWaitWithBarrier; std::vector PiEvents = @@ -3173,7 +3159,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { typeSize, RawEvents, EventImpl, read); } case CG::CGTYPE::ExecCommandBuffer: { - assert(MQueue && "Device queue must be present for command buffer enqueue"); + assert(MQueue && "Command buffer submissions should have an associated queue"); CGExecCommandBuffer *CmdBufferCG = static_cast(MCommandGroup.get()); if (MEvent != nullptr) @@ -3197,7 +3183,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::SemaphoreWait: { - assert(MQueue && "Device queue must be present for semaphore wait command"); + assert(MQueue && "Semaphore wait submissions should have an associated queue"); CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get(); const detail::PluginPtr &Plugin = MQueue->getPlugin(); @@ -3211,7 +3197,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { } case CG::CGTYPE::SemaphoreSignal: { assert(MQueue && - "Device queue must be present for semaphore signal command"); + "Semaphore signal submissions should have an associated queue"); CGSemaphoreSignal *SemSignal = (CGSemaphoreSignal *)MCommandGroup.get(); const detail::PluginPtr &Plugin = MQueue->getPlugin(); @@ -3349,7 +3335,7 @@ void KernelFusionCommand::printDot(std::ostream &Stream) const { Stream << "\"" << this << "\" [style=filled, fillcolor=\"#AFFF82\", label=\""; Stream << "ID = " << this << "\\n"; - Stream << "KERNEL FUSION on " << 
queueDeviceToString(MQueue) << "\\n" + Stream << "KERNEL FUSION on " << queueDeviceToString(MQueue.get()) << "\\n" << "FUSION LIST: {"; bool Initial = true; for (auto *Cmd : MFusionList) { diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index 7cfc0446fdd69..284985b2f9c16 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -678,7 +678,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::findAllocaForReq( static bool checkHostUnifiedMemory(const ContextImplPtr &Ctx) { if (const char *HUMConfig = SYCLConfig::get()) { if (std::strcmp(HUMConfig, "0") == 0) - return false; + return Ctx == nullptr; if (std::strcmp(HUMConfig, "1") == 0) return true; } @@ -768,7 +768,7 @@ AllocaCommandBase *Scheduler::GraphBuilder::getOrCreateAllocaForReq( // new one. There could be situations when we could setup link with // "not" current allocation, but it will require memory copy. // Can setup link between cl and host allocations only - if ((Context != nullptr) != (Record->MCurContext != nullptr)) { + if ((Context == nullptr) != (Record->MCurContext == nullptr)) { // Linked commands assume that the host allocation is reused by the // plugin runtime and that can lead to unnecessary copy overhead on // devices that do not support host unified memory. Do not link the diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index 4acc5b6c3a6a4..a14af63b1a2a0 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -697,7 +697,7 @@ bool CheckEventReadiness(const ContextImplPtr &Context, // don't represent actual dependencies. Calling getContextImpl() would set // their context, which we wish to avoid as it is expensive. // NOP events also don't represent actual dependencies. 
- if ((SyclEventImplPtr->isDefaultConstructed()) || SyclEventImplPtr->isNOP()) { + if (SyclEventImplPtr->isDefaultConstructed() || SyclEventImplPtr->isNOP()) { return true; } if (SyclEventImplPtr->isHost()) { diff --git a/sycl/source/detail/xpti_registry.cpp b/sycl/source/detail/xpti_registry.cpp index c08e620b0583d..ed629b39b9be0 100644 --- a/sycl/source/detail/xpti_registry.cpp +++ b/sycl/source/detail/xpti_registry.cpp @@ -8,6 +8,7 @@ #include #include +#include #ifdef XPTI_ENABLE_INSTRUMENTATION #include "xpti/xpti_trace_framework.hpp" @@ -362,6 +363,20 @@ void XPTIRegistry::sampledImageHostAccessorNotification( #endif } +std::string queueDeviceToString(const queue_impl* const &Queue) { + if (!Queue) + return "HOST"; + auto Device = Queue->get_device(); + if (Device.is_cpu()) + return "CPU"; + else if (Device.is_gpu()) + return "GPU"; + else if (Device.is_accelerator()) + return "ACCELERATOR"; + else + return "UNKNOWN"; +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/xpti_registry.hpp b/sycl/source/detail/xpti_registry.hpp index 681e2841c027b..a66ac46a0cd34 100644 --- a/sycl/source/detail/xpti_registry.hpp +++ b/sycl/source/detail/xpti_registry.hpp @@ -319,6 +319,9 @@ class XPTIScope { }; // class XPTIScope #endif +class queue_impl; +std::string queueDeviceToString(const detail::queue_impl* const &Queue); + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/Config/allowlist.cpp b/sycl/test-e2e/Config/allowlist.cpp index 393326cb76283..063ebabc1aba5 100644 --- a/sycl/test-e2e/Config/allowlist.cpp +++ b/sycl/test-e2e/Config/allowlist.cpp @@ -83,7 +83,7 @@ int main() { // Expected the allowlist to be set but empty if (getenv("TEST_DEVICE_IS_NOT_AVAILABLE")) { - for (const sycl::platform &Platform : sycl::platform::get_platforms()) + if (!sycl::platform::get_platforms().empty()) throw std::runtime_error("Expected no device is available"); return 0; } From 954ba8b77e99d017fdaac40417b75da7419a0d11 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 1 Jul 2024 05:22:06 -0700 Subject: [PATCH 49/52] extra code review changes Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/event_impl.cpp | 8 ++++---- sycl/source/detail/event_impl.hpp | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index 58a52230f1269..85afb56fcaf9b 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -33,7 +33,7 @@ extern xpti::trace_event_data_t *GSYCLGraphEvent; #endif // If we do not yet have a context, use the default one. 
-void event_impl::tryToInitContext() { +void event_impl::initContextIfNeeded() { if (MContext || !MIsDefaultConstructed) return; @@ -114,12 +114,12 @@ const sycl::detail::pi::PiEvent &event_impl::getHandleRef() const { sycl::detail::pi::PiEvent &event_impl::getHandleRef() { return MEvent; } const ContextImplPtr &event_impl::getContextImpl() { - tryToInitContext(); + initContextIfNeeded(); return MContext; } const PluginPtr &event_impl::getPlugin() { - tryToInitContext(); + initContextIfNeeded(); return MContext->getPlugin(); } @@ -456,7 +456,7 @@ void HostProfilingInfo::end() { EndTime = getTimestamp(); } pi_native_handle event_impl::getNative() { if (isHost()) return {}; - tryToInitContext(); + initContextIfNeeded(); auto Plugin = getPlugin(); if (MIsDefaultConstructed && !MEvent) { diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index f4c2ac2e90a86..e52ac40ad78d7 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -274,7 +274,7 @@ class event_impl { ContextImplPtr getContextImplPtr() { if (MIsDefaultConstructed) - tryToInitContext(); + initContextIfNeeded(); return MContext; } @@ -400,7 +400,7 @@ class event_impl { // Events constructed without a context will lazily use the default context // when needed. - void tryToInitContext(); + void initContextIfNeeded(); // Event class represents 3 different kinds of operations: // | type | has PI event | MContext | MIsHostTask | MIsDefaultConstructed | // | dev | true | !nullptr | false | false | From 3fb26e0fdc88ee470b6a360f0fda3f3a35137b9c Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 1 Jul 2024 05:35:49 -0700 Subject: [PATCH 50/52] fix format Signed-off-by: Tikhomirova, Kseniya --- sycl/include/sycl/handler.hpp | 8 ++++---- sycl/source/detail/queue_impl.cpp | 3 +-- sycl/source/detail/scheduler/commands.cpp | 12 +++++++----- sycl/source/detail/scheduler/graph_builder.cpp | 5 ++--- sycl/source/detail/scheduler/scheduler.cpp | 5 ++--- sycl/source/detail/xpti_registry.cpp | 4 ++-- sycl/source/detail/xpti_registry.hpp | 2 +- sycl/source/handler.cpp | 15 +++++++-------- sycl/test-e2e/Config/allowlist.cpp | 2 +- .../scheduler/EnqueueWithDependsOnDeps.cpp | 8 ++++---- 10 files changed, 31 insertions(+), 33 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index ec59dc8aece7c..61b23ffd707d5 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -488,8 +488,8 @@ class __SYCL_EXPORT handler { /// \param IsHost indicates if this handler is created for SYCL host device. /// \param CallerNeedsEvent indicates if the event resulting from this handler /// is needed by the caller. - handler(std::shared_ptr Queue, bool /* ABI break: remove */, - bool CallerNeedsEvent); + handler(std::shared_ptr Queue, + bool /* ABI break: remove */, bool CallerNeedsEvent); /// Constructs SYCL handler from the associated queue and the submission's /// primary and secondary queue. @@ -504,8 +504,8 @@ class __SYCL_EXPORT handler { /// is needed by the caller. handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue, bool /* ABI break: remove */, - bool CallerNeedsEvent); + std::shared_ptr SecondaryQueue, + bool /* ABI break: remove */, bool CallerNeedsEvent); /// Constructs SYCL handler from Graph. 
/// diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 45ca3aa0b2291..588254743701f 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -354,8 +354,7 @@ event queue_impl::submit_impl(const std::function &CGF, bool CallerNeedsEvent, const detail::code_location &Loc, const SubmitPostProcessF *PostProcess) { - handler Handler(Self, PrimaryQueue, SecondaryQueue, false, - CallerNeedsEvent); + handler Handler(Self, PrimaryQueue, SecondaryQueue, false, CallerNeedsEvent); Handler.saveCodeLoc(Loc); { diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 0b7f38d6e429d..38aa77e0c92ed 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -80,7 +80,8 @@ static size_t deviceToID(const device &Device) { } static void addDeviceMetadata(xpti_td *TraceEvent, const QueueImplPtr &Queue) { - xpti::addMetadata(TraceEvent, "sycl_device_type", queueDeviceToString(Queue.get())); + xpti::addMetadata(TraceEvent, "sycl_device_type", + queueDeviceToString(Queue.get())); if (Queue) { xpti::addMetadata(TraceEvent, "sycl_device", deviceToID(Queue->get_device())); @@ -3099,8 +3100,7 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::BarrierWaitlist: { - assert(MQueue && - "Barrier submission should have an associated queue"); + assert(MQueue && "Barrier submission should have an associated queue"); CGBarrier *Barrier = static_cast(MCommandGroup.get()); std::vector Events = Barrier->MEventsWaitWithBarrier; std::vector PiEvents = @@ -3168,7 +3168,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { typeSize, RawEvents, EventImpl, read); } case CG::CGTYPE::ExecCommandBuffer: { - assert(MQueue && "Command buffer submissions should have an associated queue"); + assert(MQueue && + "Command buffer submissions should have an associated queue"); CGExecCommandBuffer *CmdBufferCG = static_cast(MCommandGroup.get()); if (MEvent != nullptr) @@ -3192,7 +3193,8 @@ pi_int32 ExecCGCommand::enqueueImpQueue() { return PI_SUCCESS; } case CG::CGTYPE::SemaphoreWait: { - assert(MQueue && "Semaphore wait submissions should have an associated queue"); + assert(MQueue && + "Semaphore wait submissions should have an associated queue"); CGSemaphoreWait *SemWait = (CGSemaphoreWait *)MCommandGroup.get(); const detail::PluginPtr &Plugin = MQueue->getPlugin(); diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp index dcd4a0aa96dce..f8397016fce41 100644 --- a/sycl/source/detail/scheduler/graph_builder.cpp +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -1339,9 +1339,8 @@ Command *Scheduler::GraphBuilder::connectDepEvent( /* DepEvents = */ {DepEvent}), CG::CodeplayHostTask, /* Payload */ {})); - ConnectCmd = new ExecCGCommand( - std::move(ConnectCG), nullptr, - /*EventNeeded=*/true); + ConnectCmd = new ExecCGCommand(std::move(ConnectCG), nullptr, + /*EventNeeded=*/true); } catch (const std::bad_alloc &) { throw runtime_error("Out of host memory", PI_ERROR_OUT_OF_HOST_MEMORY); } diff --git a/sycl/source/detail/scheduler/scheduler.cpp b/sycl/source/detail/scheduler/scheduler.cpp index cea700a311b7d..fbea6f14dea3d 100644 --- a/sycl/source/detail/scheduler/scheduler.cpp +++ b/sycl/source/detail/scheduler/scheduler.cpp @@ -115,9 +115,8 @@ EventImplPtr Scheduler::addCG( NewEvent = NewCmd->getEvent(); break; case CG::CodeplayHostTask: { - auto Result = - 
MGraphBuilder.addCG(std::move(CommandGroup), nullptr, - AuxiliaryCmds, EventNeeded); + auto Result = MGraphBuilder.addCG(std::move(CommandGroup), nullptr, + AuxiliaryCmds, EventNeeded); NewCmd = Result.NewCmd; NewEvent = Result.NewEvent; ShouldEnqueue = Result.ShouldEnqueue; diff --git a/sycl/source/detail/xpti_registry.cpp b/sycl/source/detail/xpti_registry.cpp index ed629b39b9be0..1884f5cd34265 100644 --- a/sycl/source/detail/xpti_registry.cpp +++ b/sycl/source/detail/xpti_registry.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include -#include #include +#include #ifdef XPTI_ENABLE_INSTRUMENTATION #include "xpti/xpti_trace_framework.hpp" @@ -363,7 +363,7 @@ void XPTIRegistry::sampledImageHostAccessorNotification( #endif } -std::string queueDeviceToString(const queue_impl* const &Queue) { +std::string queueDeviceToString(const queue_impl *const &Queue) { if (!Queue) return "HOST"; auto Device = Queue->get_device(); diff --git a/sycl/source/detail/xpti_registry.hpp b/sycl/source/detail/xpti_registry.hpp index a66ac46a0cd34..356679a75c2fb 100644 --- a/sycl/source/detail/xpti_registry.hpp +++ b/sycl/source/detail/xpti_registry.hpp @@ -320,7 +320,7 @@ class XPTIScope { #endif class queue_impl; -std::string queueDeviceToString(const detail::queue_impl* const &Queue); +std::string queueDeviceToString(const detail::queue_impl *const &Queue); } // namespace detail } // namespace _V1 diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 011d3c4efce22..72277bb39ed31 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -87,8 +87,7 @@ handler::handler(std::shared_ptr Queue, bool) /// TODO: Unused. Remove with ABI break. handler::handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue, - bool) + std::shared_ptr SecondaryQueue, bool) : handler(Queue, PrimaryQueue, SecondaryQueue, false, /*CallerNeedsEvent=*/true) {} @@ -98,8 +97,8 @@ handler::handler(std::shared_ptr Queue, bool, handler::handler(std::shared_ptr Queue, std::shared_ptr PrimaryQueue, - std::shared_ptr SecondaryQueue, - bool, bool CallerNeedsEvent) + std::shared_ptr SecondaryQueue, bool, + bool CallerNeedsEvent) : MImpl(std::make_shared(std::move(PrimaryQueue), std::move(SecondaryQueue), CallerNeedsEvent)), @@ -287,10 +286,10 @@ event handler::finalize() { detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_begin, nullptr); #endif - Result = enqueueImpKernel( - MQueue, MNDRDesc, MArgs, KernelBundleImpPtr, MKernel, - MKernelName.c_str(), RawEvents, NewEvent, nullptr, - MImpl->MKernelCacheConfig, MImpl->MKernelIsCooperative); + Result = enqueueImpKernel(MQueue, MNDRDesc, MArgs, KernelBundleImpPtr, + MKernel, MKernelName.c_str(), RawEvents, + NewEvent, nullptr, MImpl->MKernelCacheConfig, + MImpl->MKernelIsCooperative); #ifdef XPTI_ENABLE_INSTRUMENTATION // Emit signal only when event is created if (NewEvent != nullptr) { diff --git a/sycl/test-e2e/Config/allowlist.cpp b/sycl/test-e2e/Config/allowlist.cpp index 063ebabc1aba5..56dfbc081fb06 100644 --- a/sycl/test-e2e/Config/allowlist.cpp +++ b/sycl/test-e2e/Config/allowlist.cpp @@ -83,7 +83,7 @@ int main() { // Expected the allowlist to be set but empty if (getenv("TEST_DEVICE_IS_NOT_AVAILABLE")) { - if (!sycl::platform::get_platforms().empty()) + if (!sycl::platform::get_platforms().empty()) throw std::runtime_error("Expected no device is available"); return 0; } diff --git a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp 
b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp index 5ad8a17af15d9..31d4e92bf89a8 100644 --- a/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp +++ b/sycl/unittests/scheduler/EnqueueWithDependsOnDeps.cpp @@ -78,10 +78,10 @@ class DependsOnTests : public ::testing::Test { std::unique_ptr CmdGroup = MockCGH.finalize(); - detail::Command *NewCmd = MS.addCG( - std::move(CmdGroup), - Type == TestCGType::HOST_TASK ? nullptr : QueueDevImpl, - ToEnqueue, /*EventNeeded=*/true); + detail::Command *NewCmd = + MS.addCG(std::move(CmdGroup), + Type == TestCGType::HOST_TASK ? nullptr : QueueDevImpl, + ToEnqueue, /*EventNeeded=*/true); EXPECT_EQ(ToEnqueue.size(), 0u); return NewCmd; } From 67a546270431a328f5920883732bce9820c394df Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 1 Jul 2024 05:42:16 -0700 Subject: [PATCH 51/52] fix format 2 Signed-off-by: Tikhomirova, Kseniya --- sycl/source/detail/queue_impl.hpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 4e9936fe042fb..123efc3d87af6 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -350,9 +350,7 @@ class queue_impl { bool hasDiscardEventsProperty() const { return MDiscardEvents; } /// \return true if this queue allows for discarded events. - bool supportsDiscardingPiEvents() const { - return MIsInorder; - } + bool supportsDiscardingPiEvents() const { return MIsInorder; } bool isInOrder() const { return MIsInorder; } From 76a073c7d04b31c7952d1ce3f6e9dda37f36e800 Mon Sep 17 00:00:00 2001 From: "Tikhomirova, Kseniya" Date: Mon, 1 Jul 2024 10:09:15 -0700 Subject: [PATCH 52/52] update win symbols Signed-off-by: Tikhomirova, Kseniya --- sycl/test/abi/sycl_symbols_windows.dump | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index 54c7a77403c92..d02be89140c5a 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -569,10 +569,10 @@ ??0half@host_half_impl@detail@_V1@sycl@@QEAA@AEBM@Z ??0half@host_half_impl@detail@_V1@sycl@@QEAA@G@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vgraph_impl@detail@experimental@oneapi@ext@_V1@sycl@@@std@@@Z -??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N1@Z -??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_N@Z +??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@00_N@Z ??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_N1@Z +??0handler@_V1@sycl@@AEAA@V?$shared_ptr@Vqueue_impl@detail@_V1@sycl@@@std@@_N@Z ??0host_selector@_V1@sycl@@QEAA@$$QEAV012@@Z ??0host_selector@_V1@sycl@@QEAA@AEBV012@@Z ??0host_selector@_V1@sycl@@QEAA@XZ @@ -4084,7 +4084,6 @@ ?frexp_impl@detail@_V1@sycl@@YA?AVhalf@half_impl@123@V45123@PEAH@Z ?frexp_impl@detail@_V1@sycl@@YAMMPEAH@Z ?frexp_impl@detail@_V1@sycl@@YANNPEAH@Z -?generateFlushCommand@stream_impl@detail@_V1@sycl@@QEAAXAEAVhandler@34@@Z ?get@context@_V1@sycl@@QEBAPEAU_cl_context@@XZ ?get@device@_V1@sycl@@QEBAPEAU_cl_device_id@@XZ ?get@kernel@_V1@sycl@@QEBAPEAU_cl_kernel@@XZ
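
Note (illustrative only, not part of the patch series): the series centralizes device-type reporting in the new queueDeviceToString() helper in xpti_registry, which maps a possibly-null queue pointer (a host task) to "HOST" and otherwise reports the device type. Below is a minimal standalone C++ sketch of that mapping. DeviceKind, FakeQueue, and describeQueueDevice are hypothetical stand-ins for illustration; they are not the runtime's queue_impl or device API.

#include <iostream>
#include <string>

// Hypothetical stand-in for the device type a queue was created for.
enum class DeviceKind { Cpu, Gpu, Accelerator, Other };

// Hypothetical stand-in for detail::queue_impl; only carries the device kind.
struct FakeQueue {
  DeviceKind Kind;
};

// Mirrors the mapping used by the new helper: a null queue (host task)
// reports "HOST", otherwise the device type string.
std::string describeQueueDevice(const FakeQueue *Queue) {
  if (!Queue)
    return "HOST";
  switch (Queue->Kind) {
  case DeviceKind::Cpu:
    return "CPU";
  case DeviceKind::Gpu:
    return "GPU";
  case DeviceKind::Accelerator:
    return "ACCELERATOR";
  default:
    return "UNKNOWN";
  }
}

int main() {
  FakeQueue GpuQueue{DeviceKind::Gpu};
  std::cout << describeQueueDevice(&GpuQueue) << "\n"; // prints: GPU
  std::cout << describeQueueDevice(nullptr) << "\n";   // prints: HOST
  return 0;
}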