From 400c42a69d406ab572efea3d0002460d8a641d58 Mon Sep 17 00:00:00 2001
From: vromanov
Date: Tue, 9 Apr 2019 15:40:46 +0300
Subject: [PATCH 1/6] [SYCL] Added MemoryManager.

The MemoryManager is supposed to be the single point for all allocations in
the SYCL RT.

Signed-off-by: Vlad Romanov
---
 sycl/CMakeLists.txt                           |   1 +
 .../include/CL/sycl/detail/memory_manager.hpp |  91 +++++
 sycl/source/detail/memory_manager.cpp         | 356 ++++++++++++++++++
 3 files changed, 448 insertions(+)
 create mode 100644 sycl/include/CL/sycl/detail/memory_manager.hpp
 create mode 100644 sycl/source/detail/memory_manager.cpp

diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt
index 1682233684dd..ecd204f3c95e 100644
--- a/sycl/CMakeLists.txt
+++ b/sycl/CMakeLists.txt
@@ -128,6 +128,7 @@ add_library("${SYCLLibrary}" SHARED
   "${sourceRootPath}/detail/helpers.cpp"
   "${sourceRootPath}/detail/kernel_impl.cpp"
   "${sourceRootPath}/detail/kernel_info.cpp"
+  "${sourceRootPath}/detail/memory_manager.cpp"
   "${sourceRootPath}/detail/platform_host.cpp"
   "${sourceRootPath}/detail/platform_opencl.cpp"
   "${sourceRootPath}/detail/platform_info.cpp"
diff --git a/sycl/include/CL/sycl/detail/memory_manager.hpp b/sycl/include/CL/sycl/detail/memory_manager.hpp
new file mode 100644
index 000000000000..8a79d2dab139
--- /dev/null
+++ b/sycl/include/CL/sycl/detail/memory_manager.hpp
@@ -0,0 +1,91 @@
+//==-------------- memory_manager.hpp - SYCL standard header file ----------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <CL/sycl/access/access.hpp>
+#include <CL/sycl/detail/sycl_mem_obj.hpp>
+#include <CL/sycl/id.hpp>
+#include <CL/sycl/range.hpp>
+
+#include <memory>
+#include <vector>
+
+namespace cl {
+namespace sycl {
+namespace detail {
+
+class queue_impl;
+class event_impl;
+class context_impl;
+
+using QueueImplPtr = std::shared_ptr<queue_impl>;
+using EventImplPtr = std::shared_ptr<event_impl>;
+using ContextImplPtr = std::shared_ptr<context_impl>;
+
+// The class contains methods that work with memory. All operations on
+// device memory should go through the MemoryManager.
+
+class MemoryManager {
+public:
+  // Releases the memory allocation of a memory object. Depending on the
+  // context, the memory is released on the host or on a device.
+  static void release(ContextImplPtr TargetContext, SYCLMemObjT *MemObj,
+                      void *MemAllocation, std::vector<cl_event> DepEvents,
+                      cl_event &OutEvent);
+
+  // Allocates memory for a memory object. Depending on the context, the
+  // memory is allocated on the host or on a device.
+  static void *allocate(ContextImplPtr TargetContext, SYCLMemObjT *MemObj,
+                        bool InitFromUserData, std::vector<cl_event> DepEvents,
+                        cl_event &OutEvent);
+
+  // Allocates a buffer in the specified context, taking into account cases
+  // such as a host pointer or a cl_mem provided by the user. TargetContext
+  // should be a device context (not the host one).
+  static void *allocateMemBuffer(ContextImplPtr TargetContext,
+                                 SYCLMemObjT *MemObj, void *UserPtr,
+                                 bool HostPtrReadOnly, size_t Size,
+                                 const EventImplPtr &InteropEvent,
+                                 const ContextImplPtr &InteropContext,
+                                 cl_event &OutEventToWait);
+
+  // Releases a buffer. TargetContext should be a device context (not the
+  // host one).
+  static void releaseMemBuf(ContextImplPtr TargetContext, SYCLMemObjT *MemObj,
+                            void *MemAllocation, void *UserPtr);
+
+  // Copies memory between host and device, host and host, or device and
+  // device, if the memory objects are bound to the same context.
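+  // Dispatch summary (illustrative, matching the implementation in
+  // memory_manager.cpp): with H = host queue and D = device queue, one of
+  // four helpers is selected:
+  //
+  //   SrcQueue  TgtQueue  helper    underlying operation
+  //   H         H         copyH2H   std::memcpy
+  //   H         D         copyH2D   clEnqueueWriteBuffer[Rect]
+  //   D         H         copyD2H   clEnqueueReadBuffer[Rect]
+  //   D         D         copyD2D   clEnqueueCopyBuffer[Rect]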
+  static void copy(SYCLMemObjT *SYCLMemObj, void *SrcMem, QueueImplPtr SrcQueue,
+                   unsigned int DimSrc, sycl::range<3> SrcSize,
+                   sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset,
+                   unsigned int SrcElemSize, void *DstMem,
+                   QueueImplPtr TgtQueue, unsigned int DimDst,
+                   sycl::range<3> DstSize, sycl::range<3> DstAccessRange,
+                   sycl::id<3> DstOffset, unsigned int DstElemSize,
+                   std::vector<cl_event> DepEvents, cl_event &OutEvent);
+
+  static void fill(SYCLMemObjT *SYCLMemObj, void *Mem, QueueImplPtr Queue,
+                   size_t PatternSize, const char *Pattern, unsigned int Dim,
+                   sycl::range<3> Size, sycl::range<3> AccessRange,
+                   sycl::id<3> AccessOffset, unsigned int ElementSize,
+                   std::vector<cl_event> DepEvents, cl_event &OutEvent);
+
+  static void *map(SYCLMemObjT *SYCLMemObj, void *Mem, QueueImplPtr Queue,
+                   access::mode AccessMode, unsigned int Dim,
+                   sycl::range<3> Size, sycl::range<3> AccessRange,
+                   sycl::id<3> AccessOffset, unsigned int ElementSize,
+                   std::vector<cl_event> DepEvents, cl_event &OutEvent);
+
+  static void unmap(SYCLMemObjT *SYCLMemObj, void *Mem, QueueImplPtr Queue,
+                    void *MappedPtr, std::vector<cl_event> DepEvents,
+                    cl_event &OutEvent);
+};
+} // namespace detail
+} // namespace sycl
+} // namespace cl
diff --git a/sycl/source/detail/memory_manager.cpp b/sycl/source/detail/memory_manager.cpp
new file mode 100644
index 000000000000..75104e6e0842
--- /dev/null
+++ b/sycl/source/detail/memory_manager.cpp
@@ -0,0 +1,356 @@
+//==-------------- memory_manager.cpp --------------------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <CL/sycl/detail/context_impl.hpp>
+#include <CL/sycl/detail/event_impl.hpp>
+#include <CL/sycl/detail/memory_manager.hpp>
+#include <CL/sycl/detail/queue_impl.hpp>
+
+#include <algorithm>
+#include <cassert>
+#include <cstring>
+#include <vector>
+
+namespace cl {
+namespace sycl {
+namespace detail {
+
+static void waitForEvents(const std::vector<cl_event> &Events) {
+  if (!Events.empty())
+    CHECK_OCL_CODE(clWaitForEvents(Events.size(), &Events[0]));
+}
+
+void MemoryManager::release(ContextImplPtr TargetContext, SYCLMemObjT *MemObj,
+                            void *MemAllocation,
+                            std::vector<cl_event> DepEvents,
+                            cl_event &OutEvent) {
+  // There is no asynchronous API for releasing memory. Explicitly wait for
+  // all dependency events and return an empty event.
+  waitForEvents(DepEvents);
+  OutEvent = nullptr;
+  MemObj->releaseMem(TargetContext, MemAllocation);
+}
+
+void MemoryManager::releaseMemBuf(ContextImplPtr TargetContext,
+                                  SYCLMemObjT *MemObj, void *MemAllocation,
+                                  void *UserPtr) {
+  if (UserPtr == MemAllocation) {
+    // Do nothing as it's user provided memory.
+    return;
+  }
+
+  if (TargetContext->is_host()) {
+    MemObj->releaseHostMem(MemAllocation);
+    return;
+  }
+
+  CHECK_OCL_CODE(clReleaseMemObject((cl_mem)MemAllocation));
+}
+
+void *MemoryManager::allocate(ContextImplPtr TargetContext,
+                              SYCLMemObjT *MemObj, bool InitFromUserData,
+                              std::vector<cl_event> DepEvents,
+                              cl_event &OutEvent) {
+  // There is no asynchronous API for memory allocation. Explicitly wait for
+  // all dependency events and return an empty event.
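+  // The allocation itself is delegated to the memory object (see
+  // SYCLMemObjT::allocateMem below), which returns either a host pointer or
+  // a cl_mem depending on TargetContext; the caller is expected to hand the
+  // same pointer back to MemoryManager::release later.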
+ waitForEvents(DepEvents); + OutEvent = nullptr; + + return MemObj->allocateMem(TargetContext, InitFromUserData, OutEvent); +} + +void *MemoryManager::allocateMemBuffer(ContextImplPtr TargetContext, + SYCLMemObjT *MemObj, void *UserPtr, + bool HostPtrReadOnly, size_t Size, + const EventImplPtr &InteropEvent, + const ContextImplPtr &InteropContext, + cl_event &OutEventToWait) { + if (TargetContext->is_host()) { + // Can return user pointer directly if it points to writable memory. + if (UserPtr && HostPtrReadOnly == false) + return UserPtr; + + void *NewMem = MemObj->allocateHostMem(); + + // Need to initialize new memory if user provides pointer to read only + // memory. + if (UserPtr && HostPtrReadOnly == true) + std::memcpy((char *)NewMem, (char *)UserPtr, Size); + return NewMem; + } + + // If memory object is created with interop c'tor. + if (UserPtr && InteropContext) { + // Return cl_mem as is if contexts match. + if (TargetContext == InteropContext) { + OutEventToWait = InteropEvent->getHandleRef(); + return UserPtr; + } + // Allocate new cl_mem and initialize from user provided one. + assert(!"Not implemented"); + return nullptr; + } + + // Create read_write mem object by default to handle arbitrary uses. + cl_mem_flags CreationFlags = CL_MEM_READ_WRITE; + + if (UserPtr) + CreationFlags |= + HostPtrReadOnly ? CL_MEM_COPY_HOST_PTR : CL_MEM_USE_HOST_PTR; + cl_int Error = CL_SUCCESS; + cl_mem NewMem = clCreateBuffer(TargetContext->getHandleRef(), CreationFlags, + Size, UserPtr, &Error); + CHECK_OCL_CODE(Error); + return NewMem; +} + +void copyH2D(SYCLMemObjT *SYCLMemObj, char *SrcMem, QueueImplPtr SrcQueue, + unsigned int DimSrc, sycl::range<3> SrcSize, + sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset, + unsigned int SrcElemSize, cl_mem DstMem, QueueImplPtr TgtQueue, + unsigned int DimDst, sycl::range<3> DstSize, + sycl::range<3> DstAccessRange, sycl::id<3> DstOffset, + unsigned int DstElemSize, std::vector DepEvents, + cl_event &OutEvent) { + // TODO: Handle images. + + // Adjust first dimension of copy range and offset as OpenCL expects size in + // bytes. + DstOffset[0] *= DstElemSize; + SrcOffset[0] *= SrcElemSize; + SrcAccessRange[0] *= SrcElemSize; + DstAccessRange[0] *= DstElemSize; + SrcSize[0] *= SrcElemSize; + DstSize[0] *= DstElemSize; + + if (1 == DimDst && 1 == DimSrc) { + CHECK_OCL_CODE(clEnqueueWriteBuffer( + TgtQueue->getHandleRef(), DstMem, + /*blocking_write=*/CL_FALSE, DstOffset[0], DstAccessRange[0], + SrcMem + DstOffset[0], DepEvents.size(), &DepEvents[0], &OutEvent)); + } else { + size_t BufferRowPitch = (1 == DimSrc) ? 0 : SrcSize[0]; + size_t BufferSlicePitch = (3 == DimSrc) ? SrcSize[0] * SrcSize[1] : 0; + + size_t HostRowPitch = (1 == DimDst) ? 0 : DstSize[0]; + size_t HostSlicePitch = (3 == DimDst) ? 
DstSize[0] * DstSize[1] : 0; + CHECK_OCL_CODE(clEnqueueWriteBufferRect( + TgtQueue->getHandleRef(), DstMem, + /*blocking_write=*/CL_FALSE, &DstOffset[0], &SrcOffset[0], + &DstAccessRange[0], BufferRowPitch, BufferSlicePitch, HostRowPitch, + HostSlicePitch, SrcMem, DepEvents.size(), &DepEvents[0], &OutEvent)); + } +} + +void copyD2H(SYCLMemObjT *SYCLMemObj, cl_mem SrcMem, QueueImplPtr SrcQueue, + unsigned int DimSrc, sycl::range<3> SrcSize, + sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset, + unsigned int SrcElemSize, char *DstMem, QueueImplPtr TgtQueue, + unsigned int DimDst, sycl::range<3> DstSize, + sycl::range<3> DstAccessRange, sycl::id<3> DstOffset, + unsigned int DstElemSize, std::vector DepEvents, + cl_event &OutEvent) { + // TODO: Handle images. + + // Adjust sizes of 1 dimensions as OpenCL expects size in bytes. + DstOffset[0] *= DstElemSize; + SrcOffset[0] *= SrcElemSize; + SrcAccessRange[0] *= SrcElemSize; + DstAccessRange[0] *= DstElemSize; + SrcSize[0] *= SrcElemSize; + DstSize[0] *= DstElemSize; + + if (1 == DimDst && 1 == DimSrc) { + CHECK_OCL_CODE(clEnqueueReadBuffer( + SrcQueue->getHandleRef(), SrcMem, + /*blocking_read=*/CL_FALSE, DstOffset[0], DstAccessRange[0], + DstMem + DstOffset[0], DepEvents.size(), &DepEvents[0], &OutEvent)); + } else { + size_t BufferRowPitch = (1 == DimSrc) ? 0 : SrcSize[0]; + size_t BufferSlicePitch = (3 == DimSrc) ? SrcSize[0] * SrcSize[1] : 0; + + size_t HostRowPitch = (1 == DimDst) ? 0 : DstSize[0]; + size_t HostSlicePitch = (3 == DimDst) ? DstSize[0] * DstSize[1] : 0; + CHECK_OCL_CODE(clEnqueueReadBufferRect( + SrcQueue->getHandleRef(), SrcMem, + /*blocking_read=*/CL_FALSE, &SrcOffset[0], &DstOffset[0], + &SrcAccessRange[0], BufferRowPitch, BufferSlicePitch, HostRowPitch, + HostSlicePitch, DstMem, DepEvents.size(), &DepEvents[0], &OutEvent)); + } +} + +void copyD2D(SYCLMemObjT *SYCLMemObj, cl_mem SrcMem, QueueImplPtr SrcQueue, + unsigned int DimSrc, sycl::range<3> SrcSize, + sycl::range<3> SrcAccessRange, sycl::id<3> SrcOffset, + unsigned int SrcElemSize, cl_mem DstMem, QueueImplPtr TgtQueue, + unsigned int DimDst, sycl::range<3> DstSize, + sycl::range<3> DstAccessRange, sycl::id<3> DstOffset, + unsigned int DstElemSize, std::vector DepEvents, + cl_event &OutEvent) { + // TODO: Handle images. + + // Adjust sizes of 1 dimensions as OpenCL expects size in bytes. + DstOffset[0] *= DstElemSize; + SrcOffset[0] *= SrcElemSize; + SrcAccessRange[0] *= SrcElemSize; + SrcSize[0] *= SrcElemSize; + DstSize[0] *= DstElemSize; + + if (1 == DimDst && 1 == DimSrc) { + CHECK_OCL_CODE(clEnqueueCopyBuffer( + SrcQueue->getHandleRef(), SrcMem, DstMem, SrcOffset[0], DstOffset[0], + SrcAccessRange[0], DepEvents.size(), &DepEvents[0], &OutEvent)); + } else { + size_t BufferRowPitch = (1 == DimSrc) ? 0 : SrcSize[0]; + size_t BufferSlicePitch = (3 == DimSrc) ? SrcSize[0] * SrcSize[1] : 0; + + size_t HostRowPitch = (1 == DimDst) ? 0 : DstSize[0]; + size_t HostSlicePitch = (3 == DimDst) ? 
DstSize[0] * DstSize[1] : 0; + + CHECK_OCL_CODE(clEnqueueCopyBufferRect( + SrcQueue->getHandleRef(), SrcMem, DstMem, &SrcOffset[0], &DstOffset[0], + &SrcAccessRange[0], BufferRowPitch, BufferSlicePitch, HostRowPitch, + HostSlicePitch, DepEvents.size(), &DepEvents[0], &OutEvent)); + } +} + +static void copyH2H(SYCLMemObjT *SYCLMemObj, char *SrcMem, + QueueImplPtr SrcQueue, unsigned int DimSrc, + sycl::range<3> SrcSize, sycl::range<3> SrcAccessRange, + sycl::id<3> SrcOffset, unsigned int SrcElemSize, + char *DstMem, QueueImplPtr TgtQueue, unsigned int DimDst, + sycl::range<3> DstSize, sycl::range<3> DstAccessRange, + sycl::id<3> DstOffset, unsigned int DstElemSize, + std::vector DepEvents, cl_event &OutEvent) { + if ((DimSrc != 1 || DimDst != 1) && + (SrcOffset != id<3>{0, 0, 0} || DstOffset != id<3>{0, 0, 0} || + SrcSize != SrcAccessRange || DstSize != DstAccessRange)) { + assert(!"Not supported configuration of memcpy requested"); + throw runtime_error("Not supported configuration of memcpy requested"); + } + + DstOffset[0] *= DstElemSize; + SrcOffset[0] *= SrcElemSize; + + size_t BytesToCopy = + SrcAccessRange[0] * SrcElemSize * SrcAccessRange[1] * SrcAccessRange[2]; + + std::memcpy(DstMem + DstOffset[0], SrcMem + SrcOffset[0], BytesToCopy); +} + +// Copies memory between: host and device, host and host, +// device and device if memory objects bound to the one context. +void MemoryManager::copy(SYCLMemObjT *SYCLMemObj, void *SrcMem, + QueueImplPtr SrcQueue, unsigned int DimSrc, + sycl::range<3> SrcSize, sycl::range<3> SrcAccessRange, + sycl::id<3> SrcOffset, unsigned int SrcElemSize, + void *DstMem, QueueImplPtr TgtQueue, + unsigned int DimDst, sycl::range<3> DstSize, + sycl::range<3> DstAccessRange, sycl::id<3> DstOffset, + unsigned int DstElemSize, + std::vector DepEvents, cl_event &OutEvent) { + + if (SrcQueue->is_host()) { + if (TgtQueue->is_host()) + copyH2H(SYCLMemObj, (char *)SrcMem, std::move(SrcQueue), DimSrc, SrcSize, + SrcAccessRange, SrcOffset, SrcElemSize, (char *)DstMem, + std::move(TgtQueue), DimDst, DstSize, DstAccessRange, DstOffset, + DstElemSize, std::move(DepEvents), OutEvent); + + else + copyH2D(SYCLMemObj, (char *)SrcMem, std::move(SrcQueue), DimSrc, SrcSize, + SrcAccessRange, SrcOffset, SrcElemSize, (cl_mem)DstMem, + std::move(TgtQueue), DimDst, DstSize, DstAccessRange, DstOffset, + DstElemSize, std::move(DepEvents), OutEvent); + } else { + if (TgtQueue->is_host()) + copyD2H(SYCLMemObj, (cl_mem)SrcMem, std::move(SrcQueue), DimSrc, SrcSize, + SrcAccessRange, SrcOffset, SrcElemSize, (char *)DstMem, + std::move(TgtQueue), DimDst, DstSize, DstAccessRange, DstOffset, + DstElemSize, std::move(DepEvents), OutEvent); + else + copyD2D(SYCLMemObj, (cl_mem)SrcMem, std::move(SrcQueue), DimSrc, SrcSize, + SrcAccessRange, SrcOffset, SrcElemSize, (cl_mem)DstMem, + std::move(TgtQueue), DimDst, DstSize, DstAccessRange, DstOffset, + DstElemSize, std::move(DepEvents), OutEvent); + } +} + +void MemoryManager::fill(SYCLMemObjT *SYCLMemObj, void *Mem, QueueImplPtr Queue, + size_t PatternSize, const char *Pattern, + unsigned int Dim, sycl::range<3> Size, + sycl::range<3> Range, sycl::id<3> Offset, + unsigned int ElementSize, + std::vector DepEvents, cl_event &OutEvent) { + // TODO: Handle images. 
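+  // Note: per the OpenCL 1.2 spec for clEnqueueFillBuffer, PatternSize must
+  // be a power of two between 1 and 128 bytes, and both the offset and the
+  // fill size must be multiples of PatternSize; the 1D path below inherits
+  // these requirements from the underlying API.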
+
+  if (Dim == 1) {
+    CHECK_OCL_CODE(clEnqueueFillBuffer(
+        Queue->getHandleRef(), (cl_mem)Mem, Pattern, PatternSize, Offset[0],
+        Range[0] * ElementSize, DepEvents.size(),
+        DepEvents.empty() ? nullptr : &DepEvents[0], &OutEvent));
+    return;
+  }
+
+  assert(!"Not supported configuration of fill requested");
+  throw runtime_error("Not supported configuration of fill requested");
+}
+
+void *MemoryManager::map(SYCLMemObjT *SYCLMemObj, void *Mem, QueueImplPtr Queue,
+                         access::mode AccessMode, unsigned int Dim,
+                         sycl::range<3> Size, sycl::range<3> AccessRange,
+                         sycl::id<3> AccessOffset, unsigned int ElementSize,
+                         std::vector<cl_event> DepEvents, cl_event &OutEvent) {
+  if (Queue->is_host() || Dim != 1) {
+    assert(!"Not supported configuration of map requested");
+    throw runtime_error("Not supported configuration of map requested");
+  }
+
+  cl_map_flags Flags = 0;
+
+  switch (AccessMode) {
+  case access::mode::read:
+    Flags |= CL_MAP_READ;
+    break;
+  case access::mode::write:
+    Flags |= CL_MAP_WRITE;
+    break;
+  case access::mode::read_write:
+  case access::mode::atomic:
+    Flags = CL_MAP_WRITE | CL_MAP_READ;
+    break;
+  case access::mode::discard_write:
+  case access::mode::discard_read_write:
+    Flags |= CL_MAP_WRITE_INVALIDATE_REGION;
+    break;
+  }
+
+  AccessOffset[0] *= ElementSize;
+  AccessRange[0] *= ElementSize;
+
+  cl_int Error = CL_SUCCESS;
+  void *MappedPtr = clEnqueueMapBuffer(
+      Queue->getHandleRef(), (cl_mem)Mem, CL_FALSE, Flags, AccessOffset[0],
+      AccessRange[0], DepEvents.size(),
+      DepEvents.empty() ? nullptr : &DepEvents[0], &OutEvent, &Error);
+  CHECK_OCL_CODE(Error);
+  return MappedPtr;
+}
+
+void MemoryManager::unmap(SYCLMemObjT *SYCLMemObj, void *Mem,
+                          QueueImplPtr Queue, void *MappedPtr,
+                          std::vector<cl_event> DepEvents, cl_event &OutEvent) {
+  cl_int Error = CL_SUCCESS;
+  Error = clEnqueueUnmapMemObject(
+      Queue->getHandleRef(), (cl_mem)Mem, MappedPtr, DepEvents.size(),
+      DepEvents.empty() ? nullptr : &DepEvents[0], &OutEvent);
+  CHECK_OCL_CODE(Error);
+}
+
+} // namespace detail
+} // namespace sycl
+} // namespace cl

From f28f5b7a64069f381b33e92491bd91cdd96f9818 Mon Sep 17 00:00:00 2001
From: vromanov
Date: Tue, 9 Apr 2019 16:10:16 +0300
Subject: [PATCH 2/6] [SYCL] Redesign memory allocation in buffer_impl class.

Do not allocate memory on the host in buffer_impl constructors when a user
pointer is provided. This is possible because ownership of that memory is
given to the constructed SYCL buffer for the duration of its lifetime. Memory
still needs to be allocated in the constructors that take iterators, since
iterators can point to non-contiguous memory, while contiguous memory is
needed to interact with OpenCL. This may improve performance, as several
memory allocations and copies are avoided.

As required by the specification, host memory is now always allocated using
the allocator provided by the user.

Moved the methods that allocate, fill and copy memory to a separate class.
The copy method was improved to support partial (aka CopyRect) versions of
device-to-host and host-to-device copies. As a result, the buffer_impl class
is now decoupled from OpenCL.

Also fixed a possible bug: the constructor taking a shared_ptr now stores a
copy of the shared_ptr in a lambda, to avoid deallocation of the memory until
the buffer is destructed or set_final_data is invoked to override the
destination for updating host memory (aka copy back).
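To illustrate the ownership semantics this change relies on, here is a
minimal sketch using the standard SYCL 1.2.1 API (the buffer size and kernel
name are made up for the example):

    #include <CL/sycl.hpp>
    #include <vector>
    using namespace cl::sycl;

    const size_t N = 8;
    std::vector<int> Data(N, 0);
    {
      // The buffer owns Data's memory for its whole lifetime, so this c'tor
      // no longer needs a host-side shadow allocation and copy.
      buffer<int, 1> Buf(Data.data(), range<1>(N));
      queue Q;
      Q.submit([&](handler &CGH) {
        auto Acc = Buf.get_access<access::mode::write>(CGH);
        CGH.parallel_for<class InitKernel>(range<1>(N),
                                           [=](id<1> I) { Acc[I] = I[0]; });
      });
    } // The destructor writes the data back to Data (copy back).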
Signed-off-by: Vlad Romanov --- sycl/include/CL/sycl/buffer.hpp | 6 +- sycl/include/CL/sycl/detail/buffer_impl.hpp | 7 + sycl/include/CL/sycl/detail/buffer_impl2.hpp | 386 +++++++++++++++++++ sycl/include/CL/sycl/detail/sycl_mem_obj.hpp | 60 +++ 4 files changed, 458 insertions(+), 1 deletion(-) create mode 100644 sycl/include/CL/sycl/detail/buffer_impl2.hpp create mode 100644 sycl/include/CL/sycl/detail/sycl_mem_obj.hpp diff --git a/sycl/include/CL/sycl/buffer.hpp b/sycl/include/CL/sycl/buffer.hpp index 0359213e16b6..a604701d7312 100644 --- a/sycl/include/CL/sycl/buffer.hpp +++ b/sycl/include/CL/sycl/buffer.hpp @@ -7,7 +7,11 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#ifdef SCHEDULER_20 + #include +#else + #include +#endif // SCHEDULER_20 #include #include #include diff --git a/sycl/include/CL/sycl/detail/buffer_impl.hpp b/sycl/include/CL/sycl/detail/buffer_impl.hpp index ebdd3448c0e7..7cbef555f6e2 100644 --- a/sycl/include/CL/sycl/detail/buffer_impl.hpp +++ b/sycl/include/CL/sycl/detail/buffer_impl.hpp @@ -6,6 +6,12 @@ // //===----------------------------------------------------------------------===// +#ifdef SCHEDULER_20 + +#include + +#else + #pragma once #include @@ -608,3 +614,4 @@ cl_mem buffer_impl::getOpenCLMem() const { } // namespace detail } // namespace sycl } // namespace cl +#endif // SCHEDULER_20 diff --git a/sycl/include/CL/sycl/detail/buffer_impl2.hpp b/sycl/include/CL/sycl/detail/buffer_impl2.hpp new file mode 100644 index 000000000000..6088268ac4f0 --- /dev/null +++ b/sycl/include/CL/sycl/detail/buffer_impl2.hpp @@ -0,0 +1,386 @@ +//==----------------- buffer_impl.hpp - SYCL standard header file ----------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cl { +namespace sycl { +// Forward declarations +template +class accessor; +template class buffer; +class handler; + +using buffer_allocator = aligned_allocator; + +namespace detail { +using EventImplPtr = std::shared_ptr; +using ContextImplPtr = std::shared_ptr; + +using cl::sycl::detail::SYCLMemObjT; + +using cl::sycl::detail::MemoryManager; + +template class buffer_impl : public SYCLMemObjT { +public: + buffer_impl(size_t SizeInBytes, const property_list &PropList, + AllocatorT Allocator = AllocatorT()) + : buffer_impl((void *)nullptr, SizeInBytes, PropList, Allocator) {} + + buffer_impl(void *HostData, size_t SizeInBytes, const property_list &Props, + AllocatorT Allocator = AllocatorT()) + : MSizeInBytes(SizeInBytes), MProps(Props), MAllocator(Allocator) { + + if (!HostData) + return; + + set_final_data(reinterpret_cast(HostData)); + if (MProps.has_property()) { + MUserPtr = HostData; + return; + } + + // TODO: Reuse user's pointer if it has sufficient alignment. 
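+    // A possible shape of that check (sketch only; `RequiredAlignment` is a
+    // placeholder this patch does not define):
+    //   if (reinterpret_cast<std::uintptr_t>(HostData) % RequiredAlignment == 0) {
+    //     MUserPtr = HostData;
+    //     return;
+    //   }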
+    MShadowCopy = allocateHostMem();
+    MUserPtr = MShadowCopy;
+    std::memcpy(MUserPtr, HostData, SizeInBytes);
+  }
+
+  buffer_impl(const void *HostData, size_t SizeInBytes,
+              const property_list &Props, AllocatorT Allocator = AllocatorT())
+      : buffer_impl(const_cast<void *>(HostData), SizeInBytes, Props,
+                    Allocator) {
+    MHostPtrReadOnly = true;
+  }
+
+  template <typename T>
+  buffer_impl(const shared_ptr_class<T> &HostData, const size_t SizeInBytes,
+              const property_list &Props, AllocatorT Allocator = AllocatorT())
+      : MSizeInBytes(SizeInBytes), MProps(Props), MAllocator(Allocator) {
+    // HostData can be destructed by the user, so we need to make a copy.
+    MUserPtr = MShadowCopy = allocateHostMem();
+
+    std::copy(HostData.get(), HostData.get() + SizeInBytes / sizeof(T),
+              (T *)MUserPtr);
+
+    set_final_data(weak_ptr_class<T>(HostData));
+  }
+
+  template <class Iterator> struct is_const_iterator {
+    using pointer = typename std::iterator_traits<Iterator>::pointer;
+    static constexpr bool value =
+        std::is_const<typename std::remove_pointer<pointer>::type>::value;
+  };
+
+  template <typename Iterator>
+  using EnableIfConstIterator =
+      typename std::enable_if<is_const_iterator<Iterator>::value,
+                              Iterator>::type;
+
+  template <typename Iterator>
+  using EnableIfNotConstIterator =
+      typename std::enable_if<!is_const_iterator<Iterator>::value,
+                              Iterator>::type;
+
+  template <class InputIterator>
+  buffer_impl(EnableIfNotConstIterator<InputIterator> First,
+              InputIterator Last, const size_t SizeInBytes,
+              const property_list &Props, AllocatorT Allocator = AllocatorT())
+      : MSizeInBytes(SizeInBytes), MProps(Props), MAllocator(Allocator) {
+
+    // TODO: There is a contradiction in the spec. It says the SYCL RT must
+    // not allocate additional memory on the host if the use_host_ptr property
+    // was passed. On the other hand, it says that the SYCL RT should allocate
+    // temporary memory in this c'tor.
+
+    if (0) {
+      MUserPtr = MShadowCopy = allocateHostMem();
+    } else {
+      size_t AllocatorValueSize = sizeof(typename AllocatorT::value_type);
+      size_t AllocationSize = get_size() / AllocatorValueSize;
+      AllocationSize += (get_size() % AllocatorValueSize) ? 1 : 0;
+      MUserPtr = MShadowCopy = MAllocator.allocate(AllocationSize);
+    }
+
+    // We need to cast MUserPtr to a pointer to the iteration type to get the
+    // correct offset in std::copy when it increments the destination pointer.
+    auto *Ptr =
+        reinterpret_cast<typename std::iterator_traits<InputIterator>::pointer>(
+            MUserPtr);
+    std::copy(First, Last, Ptr);
+
+    // TODO: There is a contradiction in the spec: one place says the data is
+    // not copied back at all if the buffer is constructed using this c'tor,
+    // while another section says that the data will be copied back if the
+    // iterators passed are not const.
+    set_final_data(First);
+  }
+
+  template <class InputIterator>
+  buffer_impl(EnableIfConstIterator<InputIterator> First, InputIterator Last,
+              const size_t SizeInBytes, const property_list &Props,
+              AllocatorT Allocator = AllocatorT())
+      : MSizeInBytes(SizeInBytes), MProps(Props), MAllocator(Allocator) {
+
+    // TODO: There is a contradiction in the spec. It says the SYCL RT must
+    // not allocate additional memory on the host if the use_host_ptr property
+    // was passed. On the other hand, it says that the SYCL RT should allocate
+    // temporary memory in this c'tor.
+
+    if (0) {
+      MUserPtr = MShadowCopy = allocateHostMem();
+    } else {
+      size_t AllocatorValueSize = sizeof(typename AllocatorT::value_type);
+      size_t AllocationSize = get_size() / AllocatorValueSize;
+      AllocationSize += (get_size() % AllocatorValueSize) ? 1 : 0;
+      MUserPtr = MShadowCopy = MAllocator.allocate(AllocationSize);
+    }
+
+    // We need to cast MUserPtr to a pointer to the iteration type to get the
+    // correct offset in std::copy when it increments the destination pointer.
+    using value = typename std::iterator_traits<InputIterator>::value_type;
+    auto *Ptr = reinterpret_cast<typename std::add_pointer<
+        typename std::remove_const<value>::type>::type>(MUserPtr);
+    std::copy(First, Last, Ptr);
+  }
+
+  buffer_impl(cl_mem MemObject, const context &SyclContext,
+              const size_t SizeInBytes, event AvailableEvent = {})
+      : MInteropMemObject(MemObject), MOpenCLInterop(true),
+        MSizeInBytes(SizeInBytes),
+        MInteropEvent(detail::getSyclObjImpl(std::move(AvailableEvent))),
+        MInteropContext(detail::getSyclObjImpl(SyclContext)) {
+
+    if (MInteropContext->is_host())
+      throw cl::sycl::invalid_parameter_error(
+          "Creation of interoperability buffer using host context is not "
+          "allowed");
+
+    cl_context Context = nullptr;
+    CHECK_OCL_CODE(clGetMemObjectInfo(MInteropMemObject, CL_MEM_CONTEXT,
+                                      sizeof(Context), &Context, nullptr));
+    if (MInteropContext->getHandleRef() != Context)
+      throw cl::sycl::invalid_parameter_error(
+          "Input context must be the same as the context of cl_mem");
+    CHECK_OCL_CODE(clRetainMemObject(MInteropMemObject));
+  }
+
+  size_t get_size() const { return MSizeInBytes; }
+
+  void set_write_back(bool flag) { MNeedWriteBack = flag; }
+
+  AllocatorT get_allocator() const { return MAllocator; }
+
+  ~buffer_impl() {
+    if (MUploadDataFn != nullptr && MNeedWriteBack) {
+      MUploadDataFn();
+    }
+
+    Scheduler::getInstance().removeMemoryObject(this);
+    releaseHostMem(MShadowCopy);
+
+    if (MOpenCLInterop)
+      CHECK_OCL_CODE_NO_EXC(clReleaseMemObject(MInteropMemObject));
+  }
+
+  void set_final_data(std::nullptr_t) { MUploadDataFn = nullptr; }
+
+  template <typename T> void set_final_data(weak_ptr_class<T> FinalData) {
+    MUploadDataFn = [this, FinalData]() {
+      if (auto finalData = FinalData.lock()) {
+        void *TempPtr = finalData.get();
+        detail::Requirement AccImpl({0, 0, 0}, {MSizeInBytes, 1, 1},
+                                    {MSizeInBytes, 1, 1}, access::mode::read,
+                                    this, 1, sizeof(char));
+        AccImpl.MData = TempPtr;
+
+        detail::EventImplPtr Event =
+            Scheduler::getInstance().addCopyBack(&AccImpl);
+        if (Event)
+          Event->wait(Event);
+      }
+    };
+  }
+
+  template <typename Destination>
+  void set_final_data(
+      Destination FinalData,
+      typename std::enable_if<std::is_pointer<Destination>::value>::type * =
+          0) {
+    static_assert(!std::is_const<Destination>::value,
+                  "Cannot write to a constant Destination. Destination should "
+                  "not be const.");
+    MUploadDataFn = [this, FinalData]() mutable {
+
+      detail::Requirement AccImpl({0, 0, 0}, {MSizeInBytes, 1, 1},
+                                  {MSizeInBytes, 1, 1}, access::mode::read,
+                                  this, 1, sizeof(char));
+      AccImpl.MData = FinalData;
+
+      detail::EventImplPtr Event =
+          Scheduler::getInstance().addCopyBack(&AccImpl);
+      if (Event)
+        Event->wait(Event);
+    };
+  }
+
+  template <typename Destination>
+  void set_final_data(
+      Destination FinalData,
+      typename std::enable_if<!std::is_pointer<Destination>::value>::type * =
+          0) {
+    static_assert(!std::is_const<Destination>::value,
+                  "Cannot write to a constant Destination. Destination should "
+                  "not be const.");
+    MUploadDataFn = [this, FinalData]() mutable {
+      using FinalDataType =
+          typename std::iterator_traits<Destination>::value_type;
+
+      // The addCopyBack method expects contiguous memory, while the iterator
+      // passed can point to non-contiguous memory.
+      // This can be optimized if the iterator passed is a contiguous one.
+ std::vector TempBuffer(MSizeInBytes / + sizeof(FinalDataType)); + void *TempPtr = TempBuffer.data(); + + detail::Requirement AccImpl({0, 0, 0}, {MSizeInBytes, 1, 1}, + {MSizeInBytes, 1, 1}, access::mode::read, + this, 1, sizeof(char)); + AccImpl.MData = TempPtr; + + detail::EventImplPtr Event = + Scheduler::getInstance().addCopyBack(&AccImpl); + if (Event) { + Event->wait(Event); + std::copy(TempBuffer.begin(), TempBuffer.end(), FinalData); + } + }; + } + + template + accessor + get_access(buffer &Buffer, + handler &CommandGroupHandler) { + return accessor( + Buffer, CommandGroupHandler); + } + + template + accessor + get_access(buffer &Buffer) { + return accessor(Buffer); + } + + template + accessor + get_access(buffer &Buffer, + handler &commandGroupHandler, range accessRange, + id accessOffset) { + return accessor( + Buffer, commandGroupHandler, accessRange, accessOffset); + } + + template + accessor + get_access(buffer &Buffer, + range accessRange, id accessOffset) { + return accessor(Buffer, accessRange, + accessOffset); + } + + void *allocateHostMem() override { + assert( + !MProps.has_property() && + "Cannot allocate additional memory if use_host_ptr property is set."); + size_t AllocatorValueSize = sizeof(typename AllocatorT::value_type); + size_t AllocationSize = get_size() / AllocatorValueSize; + AllocationSize += (get_size() % AllocatorValueSize) ? 1 : 0; + return MAllocator.allocate(AllocationSize); + } + + void *allocateMem(ContextImplPtr Context, bool InitFromUserData, + cl_event &OutEventToWait) override { + + void *UserPtr = InitFromUserData ? getUserPtr() : nullptr; + + return MemoryManager::allocateMemBuffer( + std::move(Context), this, UserPtr, MHostPtrReadOnly, get_size(), + MInteropEvent, MInteropContext, OutEventToWait); + } + + MemObjType getType() const override { return MemObjType::BUFFER; } + + void releaseHostMem(void *Ptr) override { + MAllocator.deallocate((typename AllocatorT::pointer)Ptr, get_size()); + } + + void releaseMem(ContextImplPtr Context, void *MemAllocation) override { + return MemoryManager::releaseMemBuf(Context, this, MemAllocation, + getUserPtr()); + } + + void *getUserPtr() const { + return MOpenCLInterop ? (void *)MInteropMemObject : MUserPtr; + } + + template bool has_property() const { + return MProps.has_property(); + } + + template propertyT get_property() const { + return MProps.get_property(); + } + +private: + bool MOpenCLInterop = false; + bool MHostPtrReadOnly = false; + + bool MNeedWriteBack = true; + + EventImplPtr MInteropEvent; + ContextImplPtr MInteropContext; + cl_mem MInteropMemObject = nullptr; + + void *MUserPtr = nullptr; + void *MShadowCopy = nullptr; + size_t MSizeInBytes = 0; + + property_list MProps; + std::function MUploadDataFn = nullptr; + AllocatorT MAllocator; +}; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/sycl_mem_obj.hpp b/sycl/include/CL/sycl/detail/sycl_mem_obj.hpp new file mode 100644 index 000000000000..25a2bd4e2b3a --- /dev/null +++ b/sycl/include/CL/sycl/detail/sycl_mem_obj.hpp @@ -0,0 +1,60 @@ +//==------------ sycl_mem_obj.hpp - SYCL standard header file --------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <CL/sycl/detail/common.hpp>
+
+#include <memory>
+
+namespace cl {
+namespace sycl {
+
+namespace detail {
+
+class event_impl;
+class context_impl;
+
+using EventImplPtr = std::shared_ptr<event_impl>;
+using ContextImplPtr = std::shared_ptr<context_impl>;
+
+// The class serves as a base for all SYCL memory objects.
+class SYCLMemObjT {
+public:
+  enum MemObjType { BUFFER, IMAGE };
+
+  virtual MemObjType getType() const = 0;
+
+  // The method allocates memory for the SYCL memory object. The size of the
+  // allocation is taken from the size of the SYCL memory object.
+  // If the returned memory cannot be used right away, InteropEvent will
+  // point to an event that should be waited on before using the memory.
+  // InitFromUserData indicates that the returned memory should be initialized
+  // with the data provided by the user (if any). Usually this should happen
+  // on the first allocation of memory for the buffer.
+  // The method returns a pointer to a host allocation if Context is a host
+  // one, and a cl_mem object otherwise.
+  virtual void *allocateMem(ContextImplPtr Context, bool InitFromUserData,
+                            cl_event &InteropEvent) = 0;
+
+  // Should be used for memory objects created without the use_host_ptr
+  // property.
+  virtual void *allocateHostMem() = 0;
+
+  // Ptr must be a pointer returned by allocateMem for the same context.
+  // If Context is a device context and Ptr is a host pointer, an exception
+  // will be thrown. It is undefined behaviour if Context is a host context
+  // and Ptr is a device pointer.
+  virtual void releaseMem(ContextImplPtr Context, void *Ptr) = 0;
+
+  // Ptr must be a pointer returned by allocateHostMem.
+  virtual void releaseHostMem(void *Ptr) = 0;
+};
+
+} // namespace detail
+} // namespace sycl
+} // namespace cl

From d559d11b991691804415706bec6d204639d62f83 Mon Sep 17 00:00:00 2001
From: vromanov
Date: Tue, 9 Apr 2019 17:24:41 +0300
Subject: [PATCH 3/6] [SYCL] Refactoring of handler class.

Modified the handler class so that it creates a special CG class. The CG
class represents the different types of command groups, such as kernel
execution, fill and so on. This is needed to decouple the Scheduler from the
template parameters introduced by lambdas, various ranges and arguments.

Signed-off-by: Vlad Romanov
---
 sycl/include/CL/sycl.hpp           |   2 +-
 sycl/include/CL/sycl/detail/cg.hpp | 347 ++++++++++
 sycl/include/CL/sycl/handler.hpp   |   6 +
 sycl/include/CL/sycl/handler2.hpp  | 985 +++++++++++++++++++++++++++++
 4 files changed, 1339 insertions(+), 1 deletion(-)
 create mode 100644 sycl/include/CL/sycl/detail/cg.hpp
 create mode 100644 sycl/include/CL/sycl/handler2.hpp

diff --git a/sycl/include/CL/sycl.hpp b/sycl/include/CL/sycl.hpp
index ded4c64a1bb2..bfc88a1ae4bc 100644
--- a/sycl/include/CL/sycl.hpp
+++ b/sycl/include/CL/sycl.hpp
@@ -10,8 +10,8 @@
 
 #include
 #include
-#include
 #include
+#include
 #include
 #include
 #include
diff --git a/sycl/include/CL/sycl/detail/cg.hpp b/sycl/include/CL/sycl/detail/cg.hpp
new file mode 100644
index 000000000000..0bb42df3020c
--- /dev/null
+++ b/sycl/include/CL/sycl/detail/cg.hpp
@@ -0,0 +1,347 @@
+//==-------------- CG.hpp - SYCL standard header file ----------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { + +using namespace cl; + +// The structure represents kernel argument. +class ArgDesc { +public: + ArgDesc(sycl::detail::kernel_param_kind_t Type, void *Ptr, int Size, + int Index) + : MType(Type), MPtr(Ptr), MSize(Size), MIndex(Index) {} + + sycl::detail::kernel_param_kind_t MType; + void *MPtr; + int MSize; + int MIndex; +}; + +// The structure represents NDRange - global, local sizes, global offset and +// number of dimensions. +class NDRDescT { + // The method initializes all sizes for dimensions greater than the passed one + // to the default values, so they will not affect execution. + template void setNDRangeLeftover() { + for (int I = Dims_; I < 3; ++I) { + GlobalSize[I] = 1; + LocalSize[I] = 1; + GlobalOffset[I] = 0; + } + } + +public: + NDRDescT() = default; + + template void set(sycl::range NumWorkItems) { + for (int I = 0; I < Dims_; ++I) { + GlobalSize[I] = NumWorkItems[I]; + LocalSize[I] = 1; + GlobalOffset[I] = 0; + } + + setNDRangeLeftover(); + Dims = Dims_; + } + + template void set(sycl::nd_range ExecutionRange) { + for (int I = 0; I < Dims_; ++I) { + GlobalSize[I] = ExecutionRange.get_global_range()[I]; + LocalSize[I] = ExecutionRange.get_local_range()[I]; + GlobalOffset[I] = ExecutionRange.get_offset()[I]; + } + setNDRangeLeftover(); + Dims = Dims_; + } + + sycl::range<3> GlobalSize; + sycl::range<3> LocalSize; + sycl::id<3> GlobalOffset; + size_t Dims; +}; + +// The pure virtual class aimed to store lambda/functors of any type. +class HostKernelBase { +public: + // The method executes lambda stored using NDRange passed. + virtual void call(const NDRDescT &NDRDesc) = 0; + // Return pointer to the lambda object. + // Used to extract captured variables. + virtual char *getPtr() = 0; + virtual ~HostKernelBase() = default; +}; + +// Class which stores specific lambda object. 
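+// Illustrative use of this type erasure (hypothetical caller; `Lambda` and
+// `NDRDesc` are assumptions for the example, and the template argument order
+// is assumed to be kernel type, kernel argument type, dimensions):
+//
+//   std::unique_ptr<HostKernelBase> K(
+//       new HostKernel<decltype(Lambda), sycl::id<1>, 1>(Lambda));
+//   K->call(NDRDesc); // dispatches to the runOnHost overload for id<1>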
+template +class HostKernel : public HostKernelBase { + using IDBuilder = sycl::detail::Builder; + KernelType MKernel; + +public: + HostKernel(KernelType Kernel) : MKernel(Kernel) {} + void call(const NDRDescT &NDRDesc) override { runOnHost(NDRDesc); } + + char *getPtr() override { return reinterpret_cast(&MKernel); } + + template + typename std::enable_if::value>::type + runOnHost(const NDRDescT &NDRDesc) { + MKernel(); + } + + template + typename std::enable_if>::value>::type + runOnHost(const NDRDescT &NDRDesc) { + size_t XYZ[3] = {0}; + sycl::id ID; + for (; XYZ[2] < NDRDesc.GlobalSize[2]; ++XYZ[2]) { + XYZ[1] = 0; + for (; XYZ[1] < NDRDesc.GlobalSize[1]; ++XYZ[1]) { + XYZ[0] = 0; + for (; XYZ[0] < NDRDesc.GlobalSize[0]; ++XYZ[0]) { + for (int I = 0; I < Dims; ++I) + ID[I] = XYZ[I]; + MKernel(ID); + } + } + } + } + + template + typename std::enable_if< + (std::is_same>::value || + std::is_same>::value)>::type + runOnHost(const NDRDescT &NDRDesc) { + size_t XYZ[3] = {0}; + sycl::id ID; + sycl::range Range; + for (int I = 0; I < Dims; ++I) + Range[I] = NDRDesc.GlobalSize[I]; + + for (; XYZ[2] < NDRDesc.GlobalSize[2]; ++XYZ[2]) { + XYZ[1] = 0; + for (; XYZ[1] < NDRDesc.GlobalSize[1]; ++XYZ[1]) { + XYZ[0] = 0; + for (; XYZ[0] < NDRDesc.GlobalSize[0]; ++XYZ[0]) { + for (int I = 0; I < Dims; ++I) + ID[I] = XYZ[I]; + + sycl::item Item = + IDBuilder::createItem(Range, ID); + MKernel(Item); + } + } + } + } + + template + typename std::enable_if>::value>::type + runOnHost(const NDRDescT &NDRDesc) { + // TODO add offset logic + + sycl::id<3> GroupSize; + for (int I = 0; I < 3; ++I) { + GroupSize[I] = NDRDesc.GlobalSize[I] / NDRDesc.LocalSize[I]; + } + + sycl::range GlobalSize; + sycl::range LocalSize; + sycl::id GlobalOffset; + for (int I = 0; I < Dims; ++I) { + GlobalOffset[I] = NDRDesc.GlobalOffset[I]; + LocalSize[I] = NDRDesc.LocalSize[I]; + GlobalSize[I] = NDRDesc.GlobalSize[I]; + } + + sycl::id GlobalID; + sycl::id LocalID; + + size_t GroupXYZ[3] = {0}; + sycl::id GroupID; + for (; GroupXYZ[2] < GroupSize[2]; ++GroupXYZ[2]) { + GroupXYZ[1] = 0; + for (; GroupXYZ[1] < GroupSize[1]; ++GroupXYZ[1]) { + GroupXYZ[0] = 0; + for (; GroupXYZ[0] < GroupSize[0]; ++GroupXYZ[0]) { + for (int I = 0; I < Dims; ++I) + GroupID[I] = GroupXYZ[I]; + + sycl::group Group = + IDBuilder::createGroup(GlobalSize, LocalSize, GroupID); + size_t LocalXYZ[3] = {0}; + for (; LocalXYZ[2] < NDRDesc.LocalSize[2]; ++LocalXYZ[2]) { + LocalXYZ[1] = 0; + for (; LocalXYZ[1] < NDRDesc.LocalSize[1]; ++LocalXYZ[1]) { + LocalXYZ[0] = 0; + for (; LocalXYZ[0] < NDRDesc.LocalSize[0]; ++LocalXYZ[0]) { + + for (int I = 0; I < Dims; ++I) { + GlobalID[I] = GroupXYZ[I] * LocalSize[I] + LocalXYZ[I]; + LocalID[I] = LocalXYZ[I]; + } + const sycl::item GlobalItem = + IDBuilder::createItem(GlobalSize, GlobalID, + GlobalOffset); + const sycl::item LocalItem = + IDBuilder::createItem(LocalSize, LocalID); + const sycl::nd_item NDItem = + IDBuilder::createNDItem(GlobalItem, LocalItem, Group); + MKernel(NDItem); + } + } + } + } + } + } + } + ~HostKernel() = default; +}; + +// The base class for all types of command groups. +class CG { +public: + // Type of the command group. 
+ enum CGTYPE { + KERNEL, + COPY_ACC_TO_PTR, + COPY_PTR_TO_ACC, + COPY_ACC_TO_ACC, + FILL, + UPDATE_HOST + }; + + CG(CGTYPE Type, std::vector> ArgsStorage, + std::vector AccStorage, + std::vector> SharedPtrStorage, + std::vector Requirements) + : MType(Type), MArgsStorage(std::move(ArgsStorage)), + MAccStorage(std::move(AccStorage)), + MSharedPtrStorage(std::move(SharedPtrStorage)), + MRequirements(std::move(Requirements)) {} + + CG(CG &&CommandGroup) = default; + + std::vector getRequirements() const { return MRequirements; } + + CGTYPE getType() { return MType; } + +private: + CGTYPE MType; + // The following storages needed to ensure that arguments won't die while + // we are using them. + // Storage for standard layout arguments. + std::vector> MArgsStorage; + // Storage for accessors. + std::vector MAccStorage; + // Storage for shared_ptrs. + std::vector> MSharedPtrStorage; + // List of requirements that specify which memory is needed for the command + // group to be executed. + std::vector MRequirements; +}; + +// The class which represents "execute kernel" command group. +class CGExecKernel : public CG { +public: + NDRDescT MNDRDesc; + std::unique_ptr MHostKernel; + std::shared_ptr MSyclKernel; + std::vector MArgs; + std::string MKernelName; + detail::OSModuleHandle MOSModuleHandle; + + CGExecKernel(NDRDescT NDRDesc, std::unique_ptr HKernel, + std::shared_ptr SyclKernel, + std::vector> ArgsStorage, + std::vector AccStorage, + std::vector> SharedPtrStorage, + std::vector Requirements, + std::vector Args, std::string KernelName, + detail::OSModuleHandle OSModuleHandle) + : CG(KERNEL, std::move(ArgsStorage), std::move(AccStorage), + std::move(SharedPtrStorage), std::move(Requirements)), + MNDRDesc(std::move(NDRDesc)), MHostKernel(std::move(HKernel)), + MSyclKernel(std::move(SyclKernel)), MArgs(std::move(Args)), + MKernelName(std::move(KernelName)), MOSModuleHandle(OSModuleHandle) {} + + std::vector getArguments() const { return MArgs; } + std::string getKernelName() const { return MKernelName; } +}; + +// The class which represents "copy" command group. +class CGCopy : public CG { + void *MSrc; + void *MDst; + +public: + CGCopy(CGTYPE CopyType, void *Src, void *Dst, + std::vector> ArgsStorage, + std::vector AccStorage, + std::vector> SharedPtrStorage, + std::vector Requirements) + : CG(CopyType, std::move(ArgsStorage), std::move(AccStorage), + std::move(SharedPtrStorage), std::move(Requirements)), + MSrc(Src), MDst(Dst) {} + void *getSrc() { return MSrc; } + void *getDst() { return MDst; } +}; + +// The class which represents "fill" command group. +class CGFill : public CG { +public: + std::vector MPattern; + Requirement *MPtr; + + CGFill(std::vector Pattern, void *Ptr, + std::vector> ArgsStorage, + std::vector AccStorage, + std::vector> SharedPtrStorage, + std::vector Requirements) + : CG(FILL, std::move(ArgsStorage), std::move(AccStorage), + std::move(SharedPtrStorage), std::move(Requirements)), + MPattern(std::move(Pattern)), MPtr((Requirement *)Ptr) {} + Requirement *getReqToFill() { return MPtr; } +}; + +// The class which represents "update host" command group. 
+class CGUpdateHost : public CG { + Requirement *MPtr; + +public: + CGUpdateHost(void *Ptr, std::vector> ArgsStorage, + std::vector AccStorage, + std::vector> SharedPtrStorage, + std::vector Requirements) + : CG(UPDATE_HOST, std::move(ArgsStorage), std::move(AccStorage), + std::move(SharedPtrStorage), std::move(Requirements)), + MPtr((Requirement *)Ptr) {} + + Requirement *getReqToUpdate() { return MPtr; } +}; + +} // namespace cl +} // namespace sycl +} // namespace detail diff --git a/sycl/include/CL/sycl/handler.hpp b/sycl/include/CL/sycl/handler.hpp index 92bac7d8e0f1..1d1c9437e890 100644 --- a/sycl/include/CL/sycl/handler.hpp +++ b/sycl/include/CL/sycl/handler.hpp @@ -6,6 +6,11 @@ // //===----------------------------------------------------------------------===// +#ifdef SCHEDULER_20 +#include + +#else + #pragma once #include @@ -728,3 +733,4 @@ class handler { }; } // namespace sycl } // namespace cl +#endif // SCHEDULER_20 diff --git a/sycl/include/CL/sycl/handler2.hpp b/sycl/include/CL/sycl/handler2.hpp new file mode 100644 index 000000000000..2dfc88875741 --- /dev/null +++ b/sycl/include/CL/sycl/handler2.hpp @@ -0,0 +1,985 @@ +//==-------- handler.hpp --- SYCL command group handler --------*- C++ -*---==// +// +// Copyright (C) 2018 Intel Corporation. All rights reserved. +// +// The information and source code contained herein is the exclusive property +// of Intel Corporation and may not be disclosed, examined or reproduced in +// whole or in part without explicit written authorization from the company. +// +// ===--------------------------------------------------------------------=== // + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +template +class __fill; + +template +class __copyAcc2Ptr; + +template +class __copyPtr2Acc; + +template +class __copyAcc2Acc; + +namespace cl { +namespace sycl { + +namespace csd = cl::sycl::detail; + +// Forward declaration + +template class buffer; +namespace detail { + +#ifdef __SYCL_DEVICE_ONLY__ + +#define DEFINE_INIT_SIZES(POSTFIX) \ + \ + template struct InitSizesST##POSTFIX; \ + \ + template struct InitSizesST##POSTFIX<1, DstT> { \ + static void initSize(DstT &Dst) { \ + Dst[0] = cl::__spirv::get##POSTFIX<0>(); \ + } \ + }; \ + \ + template struct InitSizesST##POSTFIX<2, DstT> { \ + static void initSize(DstT &Dst) { \ + Dst[1] = cl::__spirv::get##POSTFIX<1>(); \ + InitSizesST##POSTFIX<1, DstT>::initSize(Dst); \ + } \ + }; \ + \ + template struct InitSizesST##POSTFIX<3, DstT> { \ + static void initSize(DstT &Dst) { \ + Dst[2] = cl::__spirv::get##POSTFIX<2>(); \ + InitSizesST##POSTFIX<2, DstT>::initSize(Dst); \ + } \ + }; \ + \ + template static void init##POSTFIX(DstT &Dst) { \ + InitSizesST##POSTFIX::initSize(Dst); \ + } + +DEFINE_INIT_SIZES(GlobalSize); +DEFINE_INIT_SIZES(GlobalInvocationId) +DEFINE_INIT_SIZES(WorkgroupSize) +DEFINE_INIT_SIZES(LocalInvocationId) +DEFINE_INIT_SIZES(WorkgroupId) +DEFINE_INIT_SIZES(GlobalOffset) + +#undef DEFINE_INIT_SIZES + +#endif //__SYCL_DEVICE_ONLY__ + +class queue_impl; +template +static Arg member_ptr_helper(RetType (Func::*)(Arg) const); + +// Non-const version of the above template to match functors whose 'operator()' +// is declared w/o the 'const' qualifier. 
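+// For instance, given `auto L = [](cl::sycl::id<1> I) { /* ... */ };`, the
+// alias `lambda_arg_type<decltype(L)>` defined below deduces to
+// `cl::sycl::id<1>`; this is what lets the runtime pick the matching
+// kernel_parallel_for overload from the lambda's signature alone.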
+template +static Arg member_ptr_helper(RetType (Func::*)(Arg)); + +//template +//static void member_ptr_helper(RetType (Func::*)() const); + +//template +//static void member_ptr_helper(RetType (Func::*)()); + +template +decltype(member_ptr_helper(&F::operator())) argument_helper(F); + +template +using lambda_arg_type = decltype(argument_helper(std::declval())); +} // namespace detail + +// Objects of the handler class collect information about command group, such as +// kernel, requirements to the memory, arguments for the kernel. +// +// sycl::queue::submit([](handler &CGH){ +// CGH.require(Accessor1); // Adds a requirement to the memory object. +// CGH.setArg(0, Accessor2); // Registers accessor given as an argument to the +// // kernel + adds a requirement to the memory +// // object. +// CGH.setArg(1, N); // Registers value given as an argument to the +// // kernel. +// // The following registers KernelFunctor to be a kernel that will be +// // executed in case of queue is bound to the host device, SyclKernel - for +// // an OpenCL device. This function clearly indicates that command group +// // represents kernel execution. +// CGH.parallel_for(KernelFunctor, SyclKernel); +// }); +// +// The command group can represent absolutely different operations. Depending +// on the operation we need to store different data. But, in most cases, it's +// impossible to say what kind of operation we need to perform until the very +// end. So, handler class contains all fields simultaneously, then during +// "finalization" it constructs CG object, that represents specific operation, +// passing fields that are required only. + +// 4.8.3 Command group handler class +class handler { + std::shared_ptr MQueue; + // The storage for the arguments passed. + // We need to store a copy of values that are passed explicitly through + // set_arg, require and so on, because we need them to be alive after + // we exit the method they are passed in. + std::vector> MArgsStorage; + std::vector MAccStorage; + std::vector> MSharedPtrStorage; + // The list of arguments for the kernel. + std::vector MArgs; + // The list of requirements to the memory objects for the scheduling. + std::vector MRequirements; + // Struct that encodes global size, local size, ... + detail::NDRDescT MNDRDesc; + std::string MKernelName; + // Storage for a sycl::kernel object. + std::shared_ptr MSyclKernel; + // Type of the command group, e.g. kernel, fill. + detail::CG::CGTYPE MCGType; + // Pointer to the source host memory or accessor(depending on command type). + void *MSrcPtr = nullptr; + // Pointer to the dest host memory or accessor(depends on command type). + void *MDstPtr = nullptr; + // Pattern that is used to fill memory object in case command type is fill. + std::vector MPattern; + // Storage for a lambda or function object. + std::unique_ptr MHostKernel; + detail::OSModuleHandle MOSModuleHandle; + + bool MIsHost = false; + +private: + handler(std::shared_ptr Queue, bool IsHost) + : MQueue(std::move(Queue)), MIsHost(IsHost) {} + + // Method stores copy of Arg passed to the MArgsStorage. + template ::type> + F *storePlainArg(T &&Arg) { + MArgsStorage.emplace_back(sizeof(T)); + F *Storage = (F *)MArgsStorage.back().data(); + *Storage = Arg; + return Storage; + } + + // Method extracts kernel arguments and requirements from the lambda using + // integration header. 
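+  // The integration header produced by the device compiler describes each
+  // captured kernel argument; conceptually (an illustrative shape, not the
+  // verbatim generated header) it provides entries such as:
+  //
+  //   { kernel_param_kind_t::kind_std_layout, /*info=size*/ 4, /*offset*/ 0 },
+  //   { kernel_param_kind_t::kind_accessor, /*info=target*/ Tgt, /*offset*/ 8 },
+  //
+  // which is what the KernelArgsNum/KernelArgs parameters below iterate over.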
+ void + extractArgsAndReqsFromLambda(char *LambdaPtr, size_t KernelArgsNum, + const detail::kernel_param_desc_t *KernelArgs) { + unsigned NextArgId = 0; + for (unsigned I = 0; I < KernelArgsNum; ++I, ++NextArgId) { + void *Ptr = LambdaPtr + KernelArgs[I].offset; + const detail::kernel_param_kind_t &Kind = KernelArgs[I].kind; + + switch (Kind) { + case detail::kernel_param_kind_t::kind_std_layout: { + const size_t Size = KernelArgs[I].info; + MArgs.emplace_back(detail::ArgDesc(Kind, Ptr, Size, NextArgId)); + break; + } + case detail::kernel_param_kind_t::kind_accessor: { + + const int AccTarget = KernelArgs[I].info & 0x7ff; + switch (static_cast(AccTarget)) { + case access::target::global_buffer: + case access::target::constant_buffer: { + + detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)Ptr; + + detail::Requirement *AccImpl = detail::getSyclObjImpl(*AccBase).get(); + + MArgs.emplace_back( + detail::ArgDesc(Kind, AccImpl, /*Size=*/0, NextArgId)); + MRequirements.emplace_back(AccImpl); + + MArgs.emplace_back( + detail::ArgDesc(detail::kernel_param_kind_t::kind_std_layout, + &(AccImpl->MRange[0]), + sizeof(size_t) * AccImpl->MDims, NextArgId + 1)); + MArgs.emplace_back( + detail::ArgDesc(detail::kernel_param_kind_t::kind_std_layout, + &AccImpl->MOrigRange[0], + sizeof(size_t) * AccImpl->MDims, NextArgId + 2)); + MArgs.emplace_back( + detail::ArgDesc(detail::kernel_param_kind_t::kind_std_layout, + &AccImpl->MOffset[0], + sizeof(size_t) * AccImpl->MDims, NextArgId + 3)); + NextArgId += 3; + break; + } + case access::target::local: { + + detail::LocalAccessorBaseHost *LAcc = + (detail::LocalAccessorBaseHost *)Ptr; + range<3> &Size = LAcc->getSize(); + const int Dims = LAcc->getNumOfDims(); + int SizeInBytes = LAcc->getElementSize(); + for (int I = 0; I < Dims; ++I) + SizeInBytes *= Size[I]; + MArgs.emplace_back( + detail::ArgDesc(detail::kernel_param_kind_t::kind_std_layout, + nullptr, SizeInBytes, NextArgId)); + + MArgs.emplace_back( + detail::ArgDesc(detail::kernel_param_kind_t::kind_std_layout, + &Size, Dims * sizeof(Size[0]), NextArgId + 1)); + MArgs.emplace_back( + detail::ArgDesc(detail::kernel_param_kind_t::kind_std_layout, + &Size, Dims * sizeof(Size[0]), NextArgId + 2)); + MArgs.emplace_back( + detail::ArgDesc(detail::kernel_param_kind_t::kind_std_layout, + &Size, Dims * sizeof(Size[0]), NextArgId + 3)); + NextArgId += 3; + break; + } + + case access::target::image: + case access::target::host_buffer: + case access::target::host_image: + case access::target::image_array: { + assert(0); + break; + } + } + break; + } + case detail::kernel_param_kind_t::kind_sampler: { + + sampler *SamplerPtr = (sampler *)Ptr; + MArgs.emplace_back( + detail::ArgDesc(detail::kernel_param_kind_t::kind_sampler, Ptr, + sizeof(sampler), NextArgId)); + NextArgId++; + break; + } + } + } + } + + // The method constructs CG object of specific type, pass it to Scheduler and + // returns sycl::event object representing the command group. + // It's expected that the method is the latest method executed before + // object destruction. 
+ event finalize() { + sycl::event EventRet; + std::unique_ptr CommandGroup; + switch (MCGType) { + case detail::CG::KERNEL: + CommandGroup.reset(new detail::CGExecKernel( + std::move(MNDRDesc), std::move(MHostKernel), std::move(MSyclKernel), + std::move(MArgsStorage), std::move(MAccStorage), + std::move(MSharedPtrStorage), std::move(MRequirements), + std::move(MArgs), std::move(MKernelName), + std::move(MOSModuleHandle))); + break; + case detail::CG::COPY_ACC_TO_PTR: + case detail::CG::COPY_PTR_TO_ACC: + case detail::CG::COPY_ACC_TO_ACC: + CommandGroup.reset(new detail::CGCopy( + MCGType, MSrcPtr, MDstPtr, std::move(MArgsStorage), + std::move(MAccStorage), std::move(MSharedPtrStorage), + std::move(MRequirements))); + break; + case detail::CG::FILL: + CommandGroup.reset(new detail::CGFill( + std::move(MPattern), MDstPtr, std::move(MArgsStorage), + std::move(MAccStorage), std::move(MSharedPtrStorage), + std::move(MRequirements))); + break; + case detail::CG::UPDATE_HOST: + CommandGroup.reset(new detail::CGUpdateHost( + MDstPtr, std::move(MArgsStorage), std::move(MAccStorage), + std::move(MSharedPtrStorage), std::move(MRequirements))); + break; + default: + throw runtime_error("Unhandled type of command group"); + } + + detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG( + std::move(CommandGroup), std::move(MQueue)); + + EventRet = detail::createSyclObjFromImpl(Event); + return EventRet; + } + + ~handler() = default; + + bool is_host() { return MIsHost; } + + // Recursively calls itself until arguments pack is fully processed. + // The version for regular(standard layout) argument. + template + void setArgsHelper(int ArgIndex, T &&Arg, Ts &&... Args) { + set_arg(ArgIndex, std::move(Arg)); + setArgsHelper(++ArgIndex, std::move(Args)...); + } + + void setArgsHelper(int ArgIndex) {} + + // setArgHelper version for accessor argument. + template + void setArgHelper( + int ArgIndex, + accessor &&Arg) { + // TODO: Handle local accessor in separate method. + + detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Arg; + detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase); + // Add accessor to the list of arguments. + MRequirements.push_back(AccImpl.get()); + MArgs.emplace_back(detail::kernel_param_kind_t::kind_accessor, + AccImpl.get(), + /*size=*/0, ArgIndex); + // TODO: offset, ranges... + + // Store copy of the accessor. + MAccStorage.push_back(std::move(AccImpl)); + } + + template void setArgHelper(int ArgIndex, T &&Arg) { + void *StoredArg = (void *)storePlainArg(Arg); + MArgs.emplace_back(detail::kernel_param_kind_t::kind_std_layout, StoredArg, + sizeof(T), ArgIndex); + } + + // TODO: implement when sampler class is ready + // void setArgHelper(int argIndex, sampler &&arg) {} + + void verifySyclKernelInvoc(const kernel &SyclKernel) { + if (is_host()) { + throw invalid_object_error( + "This kernel invocation method cannot be used on the host"); + } + if (SyclKernel.is_host()) { + throw invalid_object_error("Invalid kernel type, OpenCL expected"); + } + } + + // Make queue_impl class friend to be able to call finalize method. + friend class detail::queue_impl; + +public: + handler(const handler &) = delete; + handler(handler &&) = delete; + handler &operator=(const handler &) = delete; + handler &operator=(handler &&) = delete; + + // The method registers requirement to the memory. So, the command group has a + // requirement to gain access to the given memory object before executing. 
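+  // Typical use from user code (a sketch; `Acc` is a placeholder accessor
+  // created beforehand):
+  //
+  //   Queue.submit([&](handler &CGH) {
+  //     CGH.require(Acc); // Acc's memory object becomes a dependency of the CG
+  //     CGH.single_task<class UseAcc>([=]() { /* may use Acc */ });
+  //   });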
+ template + void + require(accessor + Acc) { + detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc; + detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase); + // Add accessor to the list of requirements. + MRequirements.emplace_back(AccImpl.get()); + // Store copy of the accessor. + MAccStorage.push_back(std::move(AccImpl)); + } + + // OpenCL interoperability interface + // Registers Arg passed as argument # ArgIndex. + template void set_arg(int ArgIndex, T &&Arg) { + setArgHelper(ArgIndex, std::move(Arg)); + } + + // Registers pack of arguments(Args) with indexes starting from 0. + template void set_args(Ts &&... Args) { + setArgsHelper(0, std::move(Args)...); + } + +#ifdef __SYCL_DEVICE_ONLY__ + template + __attribute__((sycl_kernel)) void kernel_single_task(KernelType KernelFunc) { + KernelFunc(); + } + + template + __attribute__((sycl_kernel)) void kernel_parallel_for( + typename std::enable_if, + id>::value && + (dimensions > 0 && dimensions < 4), + KernelType>::type KernelFunc) { + id global_id; + + detail::initGlobalInvocationId(global_id); + + KernelFunc(global_id); + } + + template + __attribute__((sycl_kernel)) void kernel_parallel_for( + typename std::enable_if, + item>::value && + (dimensions > 0 && dimensions < 4), + KernelType>::type KernelFunc) { + id global_id; + range global_size; + + detail::initGlobalInvocationId(global_id); + detail::initGlobalSize(global_size); + + item Item = + detail::Builder::createItem(global_size, global_id); + KernelFunc(Item); + } + + template + __attribute__((sycl_kernel)) void kernel_parallel_for( + typename std::enable_if, + nd_item>::value && + (dimensions > 0 && dimensions < 4), + KernelType>::type KernelFunc) { + range global_size; + range local_size; + id group_id; + id global_id; + id local_id; + id global_offset; + + detail::initGlobalSize(global_size); + detail::initWorkgroupSize(local_size); + detail::initWorkgroupId(group_id); + detail::initGlobalInvocationId(global_id); + detail::initLocalInvocationId(local_id); + detail::initGlobalOffset(global_offset); + + group Group = detail::Builder::createGroup( + global_size, local_size, group_id); + item globalItem = + detail::Builder::createItem(global_size, global_id, + global_offset); + item localItem = + detail::Builder::createItem(local_size, local_id); + nd_item Nd_item = + detail::Builder::createNDItem(globalItem, localItem, Group); + + KernelFunc(Nd_item); + } +#endif + + // The method stores lambda to the template-free object and initializes + // kernel name, list of arguments and requirements using information from + // integration header. + template > + void StoreLambda(KernelType KernelFunc) { + MHostKernel.reset( + new detail::HostKernel(KernelFunc)); + + using KI = sycl::detail::KernelInfo; + // Empty name indicates that the compilation happens without integration + // header, so don't perform things that require it. + MArgs.clear(); + extractArgsAndReqsFromLambda(MHostKernel->getPtr(), KI::getNumParams(), + &KI::getParamDesc(0)); + MKernelName = KI::getName(); + MOSModuleHandle = csd::OSUtil::getOSModuleHandle(KI::getName()); + } + + // single_task version with a kernel represented as a lambda. + template + void single_task(KernelType KernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_single_task(KernelFunc); +#else + MNDRDesc.set(range<1>{1}); + + StoreLambda(KernelFunc); + MCGType = detail::CG::KERNEL; +#endif + } + + // single_task version with a kernel represented as a functor. 
Simply redirect + // to the lambda-based form of invocation, setting kernel name type to the + // functor type. + template + void single_task(KernelFunctorType KernelFunctor) { + single_task(KernelFunctor); + } + + // parallel_for version with a kernel represented as a lambda + range that + // specifies global size only. + template + void parallel_for(range NumWorkItems, KernelType KernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(KernelFunc); +#else + MNDRDesc.set(std::move(NumWorkItems)); + StoreLambda(std::move(KernelFunc)); + MCGType = detail::CG::KERNEL; +#endif + } + + // parallel_for version with a kernel represented as a functor + range that + // specifies global size only. Simply redirect to the lambda-based form of + // invocation, setting kernel name type to the functor type. + template + void parallel_for(range NumWorkItems, KernelType KernelFunc) { + parallel_for(NumWorkItems, KernelFunc); + } + + // parallel_for version with a kernel represented as a lambda + range and + // offset that specify global size and global offset correspondingly. + template + void parallel_for(range NumWorkItems, id WorkItemOffset, + KernelType KernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(KernelFunc); +#else + MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset)); + StoreLambda(std::move(KernelFunc)); + MCGType = detail::CG::KERNEL; +#endif + } + + // parallel_for version with a kernel represented as a lambda + nd_range that + // specifies global, local sizes and offset. + template + void parallel_for(nd_range ExecutionRange, KernelType KernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(KernelFunc); +#else + MNDRDesc.set(std::move(ExecutionRange)); + StoreLambda(std::move(KernelFunc)); + MCGType = detail::CG::KERNEL; +#endif + } + + // parallel_for version with a kernel represented as a functor + nd_range that + // specifies global, local sizes and offset. Simply redirect to the + // lambda-based form of invocation, setting kernel name type to the functor + // type. + template + void parallel_for(nd_range ExecutionRange, KernelType KernelFunc) { + parallel_for(ExecutionRange, KernelFunc); + } + + // template + // void parallel_for_work_group(range numWorkGroups, + // WorkgroupFunctionType KernelFunc); + + // template + // void parallel_for_work_group(range numWorkGroups, + // range workGroupSize, + // WorkgroupFunctionType KernelFunc); + + // single_task version with a kernel represented as a sycl::kernel. + // The kernel invocation method has no functors and cannot be called on host. + void single_task(kernel SyclKernel) { + verifySyclKernelInvoc(SyclKernel); + MNDRDesc.set(range<1>{1}); + MSyclKernel = detail::getSyclObjImpl(std::move(SyclKernel)); + MCGType = detail::CG::KERNEL; + } + + // parallel_for version with a kernel represented as a sycl::kernel + range + // that specifies global size only. The kernel invocation method has no + // functors and cannot be called on host. + template + void parallel_for(range NumWorkItems, kernel SyclKernel) { + verifySyclKernelInvoc(SyclKernel); + MSyclKernel = detail::getSyclObjImpl(std::move(SyclKernel)); + MNDRDesc.set(std::move(NumWorkItems)); + MCGType = detail::CG::KERNEL; + } + + // parallel_for version with a kernel represented as a sycl::kernel + range + // and offset that specify global size and global offset correspondingly. + // The kernel invocation method has no functors and cannot be called on host. 
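  // Illustrative sketch, not part of this patch (the kernel source and names
  // are hypothetical; program/build_with_source/get_kernel are the SYCL 1.2.1
  // interop API): driving the sycl::kernel-based overloads below.
  //
  //   program Prog(Queue.get_context());
  //   Prog.build_with_source(
  //       "__kernel void incr(__global int *P) { P[get_global_id(0)] += 1; }");
  //   kernel K = Prog.get_kernel("incr");
  //   Queue.submit([&](handler &CGH) {
  //     CGH.set_args(Acc);                  // OpenCL-style argument binding
  //     CGH.parallel_for(range<1>{N}, K);   // no functor; device-only path
  //   });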
+ template + void parallel_for(range NumWorkItems, id workItemOffset, + kernel SyclKernel) { + verifySyclKernelInvoc(SyclKernel); + MSyclKernel = detail::getSyclObjImpl(std::move(SyclKernel)); + MNDRDesc.set(std::move(NumWorkItems), std::move(workItemOffset)); + MCGType = detail::CG::KERNEL; + } + + // parallel_for version with a kernel represented as a sycl::kernel + nd_range + // that specifies global, local sizes and offset. The kernel invocation + // method has no functors and cannot be called on host. + template + void parallel_for(nd_range NDRange, kernel SyclKernel) { + verifySyclKernelInvoc(SyclKernel); + MSyclKernel = detail::getSyclObjImpl(std::move(SyclKernel)); + MNDRDesc.set(std::move(NDRange)); + MCGType = detail::CG::KERNEL; + } + + // Note: the kernel invocation methods below are only planned to be added + // to the spec as of v1.2.1 rev. 3, despite already being present in SYCL + // conformance tests. + + // single_task version which takes two "kernels". One is a lambda which is + // used if device, queue is bound to, is host device. Second is a sycl::kernel + // which is used otherwise. + template + void single_task(kernel SyclKernel, KernelType KernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_single_task(KernelFunc); +#else + MNDRDesc.set(range<1>{1}); + MSyclKernel = detail::getSyclObjImpl(std::move(SyclKernel)); + StoreLambda( + std::move(KernelFunc)); + MCGType = detail::CG::KERNEL; +#endif + } + + // single_task version which takes two "kernels". One is a functor which is + // used if device, queue is bound to, is host device. Second is a sycl::kernel + // which is used otherwise. Simply redirect to the lambda-based form of + // invocation, setting kernel name type to the functor type. + template + void single_task(kernel SyclKernel, KernelType KernelFunc) { + single_task(SyclKernel, KernelFunc); + } + + // parallel_for version which takes two "kernels". One is a lambda which is + // used if device, queue is bound to, is host device. Second is a sycl::kernel + // which is used otherwise. range argument specifies global size. + template + void parallel_for(range NumWorkItems, kernel SyclKernel, + KernelType KernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(KernelFunc); +#else + MNDRDesc.set(std::move(NumWorkItems)); + MSyclKernel = detail::getSyclObjImpl(std::move(SyclKernel)); + StoreLambda(std::move(KernelFunc)); + MCGType = detail::CG::KERNEL; +#endif + } + + // parallel_for version which takes two "kernels". One is a functor which is + // used if device, queue is bound to, is host device. Second is a sycl::kernel + // which is used otherwise. range argument specifies global size. Simply + // redirect to the lambda-based form of invocation, setting kernel name type + // to the functor type. + template + void parallel_for(range NumWorkItems, kernel SyclKernel, + KernelType KernelFunc) { + parallel_for(NumWorkItems, SyclKernel, + KernelFunc); + } + + // parallel_for version which takes two "kernels". One is a lambda which is + // used if device, queue is bound to, is host device. Second is a sycl::kernel + // which is used otherwise. range and id specify global size and offset. 
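  // Illustrative sketch, not part of this patch (names hypothetical): the
  // two-"kernel" overloads pair a pre-built sycl::kernel with a functor so
  // that one submit works everywhere. On a host queue the functor runs;
  // otherwise the sycl::kernel does.
  //
  //   Queue.submit([&](handler &CGH) {
  //     CGH.parallel_for<class Incr>(range<1>{N}, id<1>{0}, PrebuiltKernel,
  //                                  [=](id<1> I) { Acc[I] += 1; });
  //   });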
+ template + void parallel_for(range NumWorkItems, id WorkItemOffset, + kernel SyclKernel, KernelType KernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(KernelFunc); +#else + MNDRDesc.set(std::move(NumWorkItems), std::move(WorkItemOffset)); + MSyclKernel = detail::getSyclObjImpl(std::move(SyclKernel)); + StoreLambda(std::move(KernelFunc)); + MCGType = detail::CG::KERNEL; +#endif + } + + // parallel_for version which takes two "kernels". One is a lambda which is + // used if device, queue is bound to, is host device. Second is a sycl::kernel + // which is used otherwise. nd_range specifies global, local size and offset. + template + void parallel_for(nd_range NDRange, kernel SyclKernel, + KernelType KernelFunc) { +#ifdef __SYCL_DEVICE_ONLY__ + kernel_parallel_for(KernelFunc); +#else + MNDRDesc.set(std::move(NDRange)); + MSyclKernel = detail::getSyclObjImpl(std::move(SyclKernel)); + StoreLambda(std::move(KernelFunc)); + MCGType = detail::CG::KERNEL; +#endif + } + + // parallel_for version which takes two "kernels". One is a lambda which is + // used if device, queue is bound to, is host device. Second is a sycl::kernel + // which is used otherwise. nd_range specifies global, local size and offset. + // Simply redirects to the lambda-based form of invocation, setting kernel + // name type to the functor type. + template + void parallel_for(nd_range NDRange, kernel SyclKernel, + KernelType KernelFunc) { + parallel_for(NDRange, SyclKernel, KernelFunc); + } + + // template + // void parallel_for_work_group(range num_work_groups, kernel + // SyclKernel, WorkgroupFunctionType KernelFunc); + + // template + // void parallel_for_work_group(range num_work_groups, + // range work_group_size, kernel SyclKernel, WorkgroupFunctionType + // KernelFunc); + + // Explicit copy operations API + + // copy memory pointed by accessor to host memory pointed by shared_ptr + template + typename std::enable_if<(AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer), + void>::type + copy(accessor Src, + shared_ptr_class Dst) { + // Make sure data shared_ptr points to is not released until we finish + // work with it. + MSharedPtrStorage.push_back(Dst); + T_Dst *RawDstPtr = Dst.get(); + copy(Src, RawDstPtr); + } + + // copy memory pointer by shared_ptr to host memory pointed by accessor + template + typename std::enable_if<(AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer), + void>::type + copy(shared_ptr_class Src, + accessor Dst) { + // Make sure data shared_ptr points to is not released until we finish + // work with it. + MSharedPtrStorage.push_back(Src); + T_Dst *RawSrcPtr = Src.get(); + copy(RawSrcPtr, Dst); + } + + // copy memory pointed by accessor to host memory pointed by raw pointer + template + typename std::enable_if<(AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer), + void>::type + copy(accessor Src, + T_Dst *Dst) { + if (MIsHost) { + // TODO: Temporary implementation for host. Should be handled by memory + // manger. 
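    // Editorial note, not part of this patch: the host fallback below
    // linearizes as Index[0] + Sum_{I>=1}(Range[I] * Index[I]); e.g. with
    // Range = {4, 3} and Index = {1, 2} it yields 1 + 3 * 2 = 7. The
    // device-side accessor::getLinearIndex instead folds row-major:
    // Result = Result * Range[I] + Offset[I] + Id[I].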
+ range Range = Src.get_range(); + parallel_for< class __copyAcc2Ptr< T_Src, T_Dst, Dims, AccessMode, + AccessTarget, IsPlaceholder>> + (Range, [=](id Index) { + size_t LinearIndex = Index[0]; + for (int I = 1; I < Dims; ++I) + LinearIndex += Range[I] * Index[I]; + ((T_Src *)Dst)[LinearIndex] = Src[Index]; + }); + + return; + } + MCGType = detail::CG::COPY_ACC_TO_PTR; + + detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Src; + detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase); + + MRequirements.push_back(AccImpl.get()); + MSrcPtr = (void *)AccImpl.get(); + MDstPtr = (void *)Dst; + // Store copy of accessor to the local storage to make sure it is alive + // until we finish + MAccStorage.push_back(std::move(AccImpl)); + } + + // copy memory pointed by raw pointer to host memory pointed by accessor + template + typename std::enable_if<(AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer), + void>::type + copy(const T_Src *Src, + accessor Dst) { + + if (MIsHost) { + // TODO: Temporary implementation for host. Should be handled by memory + // manger. + range Range = Dst.get_range(); + parallel_for< class __copyPtr2Acc< T_Src, T_Dst, Dims, AccessMode, + AccessTarget, IsPlaceholder>> + (Range, [=](id Index) { + size_t LinearIndex = Index[0]; + for (int I = 1; I < Dims; ++I) + LinearIndex += Range[I] * Index[I]; + + Dst[Index] = ((T_Src *)Src)[LinearIndex]; + }); + + return; + } + MCGType = detail::CG::COPY_PTR_TO_ACC; + + detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst; + detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase); + + MRequirements.push_back(AccImpl.get()); + MSrcPtr = (void *)Src; + MDstPtr = (void *)AccImpl.get(); + // Store copy of accessor to the local storage to make sure it is alive + // until we finish + MAccStorage.push_back(std::move(AccImpl)); + } + + template + constexpr static bool isConstOrGlobal() { + return AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer; + } + + // copy memory pointed by accessor to the memory pointed by another accessor + template < + typename T_Src, int Dims_Src, access::mode AccessMode_Src, + access::target AccessTarget_Src, typename T_Dst, int Dims_Dst, + access::mode AccessMode_Dst, access::target AccessTarget_Dst, + access::placeholder IsPlaceholder_Src = access::placeholder::false_t, + access::placeholder IsPlaceholder_Dst = access::placeholder::false_t> + + typename std::enable_if<(isConstOrGlobal() || + isConstOrGlobal()), + void>::type + copy(accessor Src, + accessor Dst) { + + if (MIsHost) { + range Range = Dst.get_range(); + parallel_for< class __copyAcc2Acc< T_Src, Dims_Src, AccessMode_Src, + AccessTarget_Src, T_Dst, Dims_Dst, + AccessMode_Dst, AccessTarget_Dst, + IsPlaceholder_Src, + IsPlaceholder_Dst>> + (Range, [=](id Index) { + Dst[Index] = Src[Index]; + }); + + return; + } + MCGType = detail::CG::COPY_ACC_TO_ACC; + + detail::AccessorBaseHost *AccBaseSrc = (detail::AccessorBaseHost *)&Src; + detail::AccessorImplPtr AccImplSrc = detail::getSyclObjImpl(*AccBaseSrc); + + detail::AccessorBaseHost *AccBaseDst = (detail::AccessorBaseHost *)&Dst; + detail::AccessorImplPtr AccImplDst = detail::getSyclObjImpl(*AccBaseDst); + + MRequirements.push_back(AccImplSrc.get()); + MRequirements.push_back(AccImplDst.get()); + MSrcPtr = AccImplSrc.get(); + MDstPtr = AccImplDst.get(); + // Store copy of accessor to the local storage to make sure it is alive + // until we finish + 
MAccStorage.push_back(std::move(AccImplSrc));
+    MAccStorage.push_back(std::move(AccImplDst));
+  }
+
+  template <typename T, int Dims, access::mode AccessMode,
+            access::target AccessTarget,
+            access::placeholder IsPlaceholder = access::placeholder::false_t>
+  typename std::enable_if<(AccessTarget == access::target::global_buffer ||
+                           AccessTarget == access::target::constant_buffer),
+                          void>::type
+  update_host(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder> Acc) {
+    MCGType = detail::CG::UPDATE_HOST;
+
+    detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Acc;
+    detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
+
+    MDstPtr = (void *)AccImpl.get();
+    MRequirements.push_back(AccImpl.get());
+    MAccStorage.push_back(std::move(AccImpl));
+  }
+
+  // Fills the memory pointed to by the accessor with the given pattern.
+  // If the operation is submitted to a queue associated with an OpenCL device
+  // and the accessor points to a one-dimensional memory object, use the
+  // special FILL command group type. Otherwise fill using a regular kernel.
+  template <typename T, int Dims, access::mode AccessMode,
+            access::target AccessTarget,
+            access::placeholder IsPlaceholder = access::placeholder::false_t>
+  typename std::enable_if<(AccessTarget == access::target::global_buffer ||
+                           AccessTarget == access::target::constant_buffer),
+                          void>::type
+  fill(accessor<T, Dims, AccessMode, AccessTarget, IsPlaceholder> Dst,
+       const T &Pattern) {
+    // TODO: add a check that T is an integral scalar value or a SYCL vector
+    // type.
+    if (!MIsHost && Dims == 1) {
+      MCGType = detail::CG::FILL;
+
+      detail::AccessorBaseHost *AccBase = (detail::AccessorBaseHost *)&Dst;
+      detail::AccessorImplPtr AccImpl = detail::getSyclObjImpl(*AccBase);
+
+      MDstPtr = (void *)AccImpl.get();
+      MRequirements.push_back(AccImpl.get());
+      MAccStorage.push_back(std::move(AccImpl));
+
+      MPattern.resize(sizeof(T));
+      T *PatternPtr = (T *)MPattern.data();
+      *PatternPtr = Pattern;
+    } else {
+
+      // TODO: Temporary implementation for host. Should be handled by memory
+      // manager.
+      range<Dims> Range = Dst.get_range();
+      parallel_for<class __fill<T, Dims, AccessMode, AccessTarget,
+                                IsPlaceholder>>(Range, [=](id<Dims> Index) {
+        Dst[Index] = Pattern;
+      });
+    }
+  }
+};
+} // namespace sycl
+} // namespace cl
From 0885a04009076645a9c7ff1392331b3cb208cc33 Mon Sep 17 00:00:00 2001
From: vromanov
Date: Tue, 9 Apr 2019 18:35:22 +0300
Subject: [PATCH 4/6] [SYCL] Refactoring of accessor class + introduction of
 Requirement class.

The Requirement class is added as a base non-templated class for accessor.
This class describes a command group's requirement to access a memory object
and is used to build the dependency graph in the Scheduler.
Signed-off-by: Vlad Romanov --- sycl/include/CL/sycl/access/access.hpp | 15 + sycl/include/CL/sycl/accessor.hpp | 7 + sycl/include/CL/sycl/accessor2.hpp | 809 ++++++++++++++++++ sycl/include/CL/sycl/detail/accessor_impl.hpp | 147 ++++ sycl/include/CL/sycl/detail/event_impl.hpp | 2 + sycl/include/CL/sycl/multi_ptr.hpp | 5 +- sycl/source/detail/event_impl.cpp | 4 + 7 files changed, 987 insertions(+), 2 deletions(-) create mode 100644 sycl/include/CL/sycl/accessor2.hpp create mode 100644 sycl/include/CL/sycl/detail/accessor_impl.hpp diff --git a/sycl/include/CL/sycl/access/access.hpp b/sycl/include/CL/sycl/access/access.hpp index ae3944ca427a..a117f744d578 100644 --- a/sycl/include/CL/sycl/access/access.hpp +++ b/sycl/include/CL/sycl/access/access.hpp @@ -97,6 +97,21 @@ struct DeviceValueType { using type = dataT; }; +template struct TargetToAS { + constexpr static access::address_space AS = + access::address_space::global_space; +}; + +template <> struct TargetToAS { + constexpr static access::address_space AS = + access::address_space::local_space; +}; + +template <> struct TargetToAS { + constexpr static access::address_space AS = + access::address_space::constant_space; +}; + template struct PtrValueType; diff --git a/sycl/include/CL/sycl/accessor.hpp b/sycl/include/CL/sycl/accessor.hpp index 6ccd0f36b993..7b249c84ec1f 100644 --- a/sycl/include/CL/sycl/accessor.hpp +++ b/sycl/include/CL/sycl/accessor.hpp @@ -6,6 +6,12 @@ // //===----------------------------------------------------------------------===// +#ifdef SCHEDULER_20 + +#include + +#else + #pragma once #include @@ -1028,3 +1034,4 @@ struct hash +#include +#include +#include +#include +#include +#include + +// The file contains implementations of accessor class. Objects of accessor +// class define a requirement to access some SYCL memory object or local memory +// of the device. +// +// Basically there are 3 distinct types of accessors. +// +// One of them is an accessor to a SYCL buffer object(Buffer accessor) which has +// the richest interface. It supports things like accessing only a part of +// buffer, multidimensional access using sycl::id, conversions to various +// multi_ptr and atomic classes. +// +// Second type is an accessor to a SYCL image object(Image accessor) which has +// "image" specific methods for reading and writing. +// +// Finally, accessor to local memory(Local accessor) doesn't require access to +// any SYCL memory object, but asks for some local memory on device to be +// available. Some methods overlap with ones that "Buffer accessor" provides. +// +// Buffer and Image accessors create the requirement to access some SYCL memory +// object(or part of it). SYCL RT must detect when two kernels want to access +// the same memory objects and make sure they are executed in correct order. +// +// "accessor_common" class that contains several common methods between Buffer +// and Local accessors. +// +// Accessors have different representation on host and on device. On host they +// have non-templated base class, that is needed to safely work with any +// accessor type. Furhermore on host we need some additional fields in order +// to implement functionality required by Specification, for example during +// lifetime of a host accessor other operations with memory object the accessor +// refers to should be blocked and when all references to the host accessor are +// desctructed, the memory this host accessor refers to should be "written +// back". 
+// +// The scheme of inheritance for host side: +// +// +------------------+ +-----------------+ +-----------------------+ +// | | | | | | +// | AccessorBaseHost | | accessor_common | | LocalAccessorBaseHost | +// | | | | | | +// +------------------+ +-----+-----------+ +--------+--------------+ +// | | | | | +// | +-----------+ +----+ +---------+ +------+ +// | | | | | +// v v v v v +// +----------------+ +-----------------+ +-------------+ +// | | | accessor(1) | | accessor(3) | +// | image_accessor | +-----------------| +-------------+ +// | | | for targets: | | for target: | +// +---+---+---+----+ | | | | +// | | | | host_buffer | | local | +// | | | | global_buffer | +-------------+ +// | | | | constant_buffer | +// | | | +-----------------+ +// | | | +// | | +------------------------------------+ +// | | | +// | +----------------------+ | +// v v v +// +-----------------+ +--------------+ +-------------+ +// | acessor(2) | | accessor(4) | | accessor(5) | +// +-----------------+ +--------------+ +-------------+ +// | for targets: | | for targets: | | for target: | +// | | | | | | +// | host_image | | image | | image_array | +// +-----------------+ +--------------+ +-------------+ +// +// For host side AccessorBaseHost/LocalAccessorBaseHost contains shared_ptr +// which points to AccessorImplHost/LocalAccessorImplHost object. +// +// The scheme of inheritance for device side: +// +// +-----------------+ +// | | +// | accessor_common | +// | | +// +-----+-------+---+ +// | | +// +----+ +-----+ +// | | +// v v +// +----------------+ +-----------------+ +-------------+ +// | | | accessor(1) | | accessor(3) | +// | image_accessor | +-----------------| +-------------+ +// | | | for targets: | | for target: | +// +---+---+---+----+ | | | | +// | | | | host_buffer | | local | +// | | | | global_buffer | +-------------+ +// | | | | constant_buffer | +// | | | +-----------------+ +// | | | +// | | +------------------------------------+ +// | | | +// | +----------------------+ | +// v v v +// +-----------------+ +--------------+ +-------------+ +// | acessor(2) | | accessor(4) | | accessor(5) | +// +-----------------+ +--------------+ +-------------+ +// | for targets: | | for targets: | | for target: | +// | | | | | | +// | host_image | | image | | image_array | +// +-----------------+ +--------------+ +-------------+ +// +// For device side AccessorImplHost/LocalAccessorImplHost are fileds of +// accessor(1) and accessor(3). +// +// accessor(1) declares accessor as a template class and implements accessor +// class for access targets: host_buffer, global_buffer and constant_buffer. +// +// accessor(3) specializes accessor(1) for the local access target. +// +// image_accessor contains implements interfaces for access targets: host_image, +// image and image_array. But there are three distinct specializations of the +// accessor(1) (accessor(2), accessor(4), accessor(5)) that are just inherited +// from image_accessor. +// +// accessor_common contains several helpers common for both accessor(1) and +// accessor(3) + +namespace cl { +namespace sycl { + +template +class accessor; + +namespace detail { + +// The function extends or truncates number of dimensions of objects of id +// or ranges classes. When extending the new values are filled with +// DefaultValue, truncation just removes extra values. +template class T, int OldDim> +static T convertToArrayOfN(T OldObj) { + T NewObj; + const int CopyDims = NewDim > OldDim ? 
OldDim : NewDim; + for (int I = 0; I < CopyDims; ++I) + NewObj[I] = OldObj[I]; + for (int I = CopyDims; I < NewDim; ++I) + NewObj[I] = DefaultValue; + return NewObj; +} + +template +class accessor_common { +protected: + constexpr static bool IsPlaceH = IsPlaceholder == access::placeholder::true_t; + constexpr static access::address_space AS = TargetToAS::AS; + + constexpr static bool IsHostBuf = AccessTarget == access::target::host_buffer; + + constexpr static bool IsGlobalBuf = + AccessTarget == access::target::global_buffer; + + constexpr static bool IsConstantBuf = + AccessTarget == access::target::constant_buffer; + + constexpr static bool IsAccessAnyWrite = + AccessMode == access::mode::write || + AccessMode == access::mode::read_write || + AccessMode == access::mode::discard_write || + AccessMode == access::mode::discard_read_write; + + constexpr static bool IsAccessReadOnly = AccessMode == access::mode::read; + + constexpr static bool IsAccessReadWrite = + AccessMode == access::mode::read_write; + + using RefType = typename detail::PtrValueType::type &; + using PtrType = typename detail::PtrValueType::type *; + + using AccType = + accessor; + + // The class which allows to access value of N dimensional accessor using N + // subscript operators, e.g. accessor[2][2][3] + template class AccessorSubscript { + static constexpr int Dims = Dimensions; + + template + using enable_if_t = typename std::enable_if::type; + + mutable id MIDs; + AccType MAccessor; + + public: + AccessorSubscript(AccType Accessor, id IDs) + : MAccessor(Accessor), MIDs(IDs) {} + + // Only accessor class is supposed to use this c'tor for the first + // operator[]. + AccessorSubscript(AccType Accessor, size_t Index) : MAccessor(Accessor) { + MIDs[0] = Index; + } + + template > + AccessorSubscript operator[](size_t Index) { + MIDs[Dims - CurDims] = Index; + return AccessorSubscript(MAccessor, MIDs); + } + + template > + RefType operator[](size_t Index) const { + MIDs[Dims - CurDims] = Index; + return MAccessor[MIDs]; + } + + template > + DataT operator[](size_t Index) const { + MIDs[Dims - SubDims] = Index; + return MAccessor[MIDs]; + } + }; +}; + +} // namespace detail + +template +class accessor : +#ifndef __SYCL_DEVICE_ONLY__ + public detail::AccessorBaseHost, +#endif + public detail::accessor_common { + + static_assert((AccessTarget == access::target::global_buffer || + AccessTarget == access::target::constant_buffer || + AccessTarget == access::target::host_buffer), + "Expected buffer type"); + + template + using enable_if_t = typename std::enable_if::type; + + using AccessorCommonT = detail::accessor_common; + + constexpr static int AdjustedDim = Dimensions == 0 ? 
1 : Dimensions; + + using AccessorCommonT::AS; + using AccessorCommonT::IsAccessAnyWrite; + using AccessorCommonT::IsAccessReadOnly; + using AccessorCommonT::IsConstantBuf; + using AccessorCommonT::IsGlobalBuf; + using AccessorCommonT::IsHostBuf; + using AccessorCommonT::IsPlaceH; + template + using AccessorSubscript = + typename AccessorCommonT::template AccessorSubscript; + + using RefType = typename detail::PtrValueType::type &; + using PtrType = typename detail::PtrValueType::type *; + + template size_t getLinearIndex(id Id) const { + size_t Result = 0; + for (int I = 0; I < Dims; ++I) + Result = Result * getOrigRange()[I] + get_offset()[I] + Id[I]; + return Result; + } + +#ifdef __SYCL_DEVICE_ONLY__ + + id &getOffset() { return impl.Offset; } + range &getRange() { return impl.AccessRange; } + range &getOrigRange() { return impl.MemRange; } + + const id &getOffset() const { return impl.Offset; } + const range &getRange() const { return impl.AccessRange; } + const range &getOrigRange() const { return impl.MemRange; } + + detail::AccessorImplDevice impl; + + PtrType MData; + + void __init(PtrType Ptr, range AccessRange, + range MemRange, id Offset) { + MData = Ptr; + for (int I = 0; I < AdjustedDim; ++I) { + getOffset()[I] = Offset[I]; + getRange()[I] = AccessRange[I]; + getOrigRange()[I] = MemRange[I]; + } + } + + void *getPtr() { return MData; } + + PtrType getQualifiedPtr() const { return MData; } +#else + + using AccessorBaseHost::getRange; + using AccessorBaseHost::getOffset; + using AccessorBaseHost::getOrigRange; + + char padding[sizeof(detail::AccessorImplDevice) + + sizeof(PtrType) - sizeof(detail::AccessorBaseHost)]; + + PtrType getQualifiedPtr() const { + return reinterpret_cast(AccessorBaseHost::getPtr()); + } + +#endif // __SYCL_DEVICE_ONLY__ + +public: + using value_type = DataT; + using reference = typename detail::PtrValueType::type &; + using const_reference = const reference; + + template + accessor(buffer &BufferRef, + enable_if_t<((!IsPlaceH && IsHostBuf) || + (IsPlaceH && (IsGlobalBuf || IsConstantBuf))) && + Dims == 0>) +#ifdef __SYCL_DEVICE_ONLY__ + : impl(id(), BufferRef.get_range(), BufferRef.MemRange) + { +#else + : AccessorBaseHost( + /*Offset=*/{0, 0, 0}, + detail::convertToArrayOfN<3, 1>(BufferRef.get_range()), + detail::convertToArrayOfN<3, 1>(BufferRef.MemRange), AccessMode, + detail::getSyclObjImpl(BufferRef).get(), AdjustedDim, + sizeof(DataT)) { + detail::EventImplPtr Event = + detail::Scheduler::getInstance().addHostAccessor(this); + Event->wait(Event); +#endif + } + + template + accessor( + buffer &BufferRef, handler &CommandGroupHandler, + enable_if_t<(!IsPlaceH && (IsGlobalBuf || IsConstantBuf)) && Dims == 0>) +#ifdef __SYCL_DEVICE_ONLY__ + : impl(id(), BufferRef.get_range(), BufferRef.MemRange) { + } +#else + : AccessorBaseHost( + /*Offset=*/{0, 0, 0}, + detail::convertToArrayOfN<3, 1>(BufferRef.get_range()), + detail::convertToArrayOfN<3, 1>(BufferRef.MemRange), AccessMode, + detail::getSyclObjImpl(BufferRef).get(), Dimensions, + sizeof(DataT)) { + CommandGroupHandler.set_arg(/*Index=*/0, *this); + } +#endif + + template < + int Dims = Dimensions, + typename = enable_if_t<((!IsPlaceH && IsHostBuf) || + (IsPlaceH && (IsGlobalBuf || IsConstantBuf))) && + (Dims > 0)>> + accessor(buffer &BufferRef) +#ifdef __SYCL_DEVICE_ONLY__ + : impl(id(), BufferRef.get_range(), BufferRef.MemRange) { + } +#else + : AccessorBaseHost( + /*Offset=*/{0, 0, 0}, + detail::convertToArrayOfN<3, 1>(BufferRef.get_range()), + detail::convertToArrayOfN<3, 1>(BufferRef.MemRange), 
AccessMode, + detail::getSyclObjImpl(BufferRef).get(), Dimensions, + sizeof(DataT)) { + detail::EventImplPtr Event = + detail::Scheduler::getInstance().addHostAccessor( + AccessorBaseHost::impl.get()); + Event->wait(Event); + } +#endif + + template 0)>> + accessor(buffer &BufferRef, handler &CommandGroupHandler) +#ifdef __SYCL_DEVICE_ONLY__ + : impl(id(), BufferRef.get_range(), BufferRef.MemRange) { + } +#else + : AccessorBaseHost( + /*Offset=*/{0, 0, 0}, + detail::convertToArrayOfN<3, 1>(BufferRef.get_range()), + detail::convertToArrayOfN<3, 1>(BufferRef.MemRange), AccessMode, + detail::getSyclObjImpl(BufferRef).get(), Dimensions, + sizeof(DataT)) { + CommandGroupHandler.set_arg(/*Index=*/0, *this); + } +#endif + + template < + int Dims = Dimensions, + typename = enable_if_t<((!IsPlaceH && IsHostBuf) || + (IsPlaceH && (IsGlobalBuf || IsConstantBuf))) && + (Dims > 0)>> + accessor(buffer &BufferRef, range AccessRange, + id AccessOffset = {}) +#ifdef __SYCL_DEVICE_ONLY__ + : impl(AccessOffset, AccessRange, BufferRef.MemRange) { + } +#else + : AccessorBaseHost(detail::convertToArrayOfN<3, 0>(AccessOffset), + detail::convertToArrayOfN<3, 1>(AccessRange), + detail::convertToArrayOfN<3, 1>(BufferRef.MemRange), + AccessMode, detail::getSyclObjImpl(BufferRef).get(), + Dimensions, sizeof(DataT)) { + detail::EventImplPtr Event = + detail::Scheduler::getInstance().addHostAccessor( + AccessorBaseHost::impl.get()); + Event->wait(Event); + } +#endif + + template 0)>> + accessor(buffer &BufferRef, handler &CommandGroupHandler, + range AccessRange, id AccessOffset = {}) +#ifdef __SYCL_DEVICE_ONLY__ + : impl(AccessOffset, AccessRange, BufferRef.MemRange) { + } +#else + : AccessorBaseHost(detail::convertToArrayOfN<3, 0>(AccessOffset), + detail::convertToArrayOfN<3, 1>(AccessRange), + detail::convertToArrayOfN<3, 1>(BufferRef.MemRange), + AccessMode, detail::getSyclObjImpl(BufferRef).get(), + Dimensions, sizeof(DataT)) { + CommandGroupHandler.set_arg(/*Index=*/0, *this); + } +#endif + + constexpr bool is_placeholder() const { return IsPlaceH; } + + size_t get_size() const { return getRange().size() * sizeof(DataT); } + + size_t get_count() const { return getRange().size(); } + + template 0)>> + range get_range() const { + return detail::convertToArrayOfN(getRange()); + } + + template 0)>> + id get_offset() const { + return detail::convertToArrayOfN(getOffset()); + } + + template > + operator RefType() const { + return *(getQualifiedPtr() + get_offset()[0]); + } + + template 0)>> + RefType operator[](id Index) const { + const size_t LinearIndex = getLinearIndex(Index); + return getQualifiedPtr()[LinearIndex]; + } + + template > + RefType operator[](size_t Index) const { + return getQualifiedPtr()[Index + get_offset()[0]]; + } + + template > + operator DataT() const { + return *(getQualifiedPtr() + get_offset()[0]); + } + + template 0)>> + DataT operator[](id Index) const { + const size_t LinearIndex = getLinearIndex(Index); + return getQualifiedPtr()[LinearIndex]; + } + + template > + DataT operator[](size_t Index) const { + return getQualifiedPtr()[Index + get_offset()[0]]; + } + + template < + int Dims = Dimensions, + typename = enable_if_t> + operator atomic() const { + return atomic(multi_ptr(getQualifiedPtr())); + } + + template < + int Dims = Dimensions, + typename = enable_if_t 0)>> + atomic operator[](id Index) const { + const size_t LinearIndex = getLinearIndex(Index); + return atomic( + multi_ptr(getQualifiedPtr() + LinearIndex)); + } + + template < + int Dims = Dimensions, + typename = enable_if_t> 
+ atomic operator[](size_t Index) const { + return atomic( + multi_ptr(getQualifiedPtr() + Index + get_offset()[0])); + } + + template 1)>> + typename AccessorCommonT::template AccessorSubscript + operator[](size_t Index) const { + return AccessorSubscript(*this, Index + get_offset()[0]); + } + + template < + access::target AccessTarget_ = AccessTarget, + typename = enable_if_t> + DataT *get_pointer() const { + const size_t LinearIndex = getLinearIndex(id()); + return getQualifiedPtr() + LinearIndex; + } + + template < + access::target AccessTarget_ = AccessTarget, + typename = enable_if_t> + global_ptr get_pointer() const { + const size_t LinearIndex = getLinearIndex(id()); + return global_ptr(getQualifiedPtr() + LinearIndex); + } + + template < + access::target AccessTarget_ = AccessTarget, + typename = enable_if_t> + constant_ptr get_pointer() const { + const size_t LinearIndex = getLinearIndex(id()); + return constant_ptr(getQualifiedPtr() + LinearIndex); + } +}; + +// Local accessor +template +class accessor : +#ifndef __SYCL_DEVICE_ONLY__ + public detail::LocalAccessorBaseHost, +#endif + public detail::accessor_common { + + constexpr static int AdjustedDim = Dimensions == 0 ? 1 : Dimensions; + + using AccessorCommonT = + detail::accessor_common; + + using AccessorCommonT::AS; + using AccessorCommonT::IsAccessAnyWrite; + template + using AccessorSubscript = + typename AccessorCommonT::template AccessorSubscript; + + using RefType = typename detail::PtrValueType::type &; + using PtrType = typename detail::PtrValueType::type *; + + template + using enable_if_t = typename std::enable_if::type; + +#ifdef __SYCL_DEVICE_ONLY__ + detail::LocalAccessorBaseDevice impl; + + sycl::range &getSize() { return impl.AccessRange; } + const sycl::range &getSize() const { return impl.AccessRange; } + + void __init(PtrType Ptr, range AccessRange, + range MemRange, id Offset) { + MData = Ptr; + for (int I = 0; I < AdjustedDim; ++I) + getSize()[I] = AccessRange[I]; + } + + PtrType getQualifiedPtr() const { return MData; } + + PtrType MData; + +#else + + char padding[sizeof(detail::LocalAccessorBaseDevice) + + sizeof(PtrType) - sizeof(detail::LocalAccessorBaseHost)]; + using detail::LocalAccessorBaseHost::getSize; + + PtrType getQualifiedPtr() const { + return reinterpret_cast(LocalAccessorBaseHost::getPtr()); + } + +#endif // __SYCL_DEVICE_ONLY__ + + // Method which calculates linear offset for the ID using Range and Offset. 
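  // Editorial note, not part of this patch: the fold below uses only the
  // size, with no offset term; e.g. for a local accessor sized {4, 3, 2} and
  // Id {1, 2, 1} it yields ((1 * 3) + 2) * 2 + 1 = 11. Size[0] never enters
  // the product, it only bounds Id[0].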
+ template size_t getLinearIndex(id Id) const { + size_t Result = 0; + for (int I = 0; I < Dims; ++I) + Result = Result * getSize()[I] + Id[I]; + return Result; + } + +public: + using value_type = DataT; + using reference = DataT &; + using const_reference = const DataT &; + + template > + accessor(handler &CommandGroupHandler) +#ifdef __SYCL_DEVICE_ONLY__ + : impl(range{1}) { + } +#else + : LocalAccessorBaseHost(range<3>{1, 1, 1}, AdjustedDim, sizeof(DataT)) { + } +#endif + + template 0)>> + accessor(range AllocationSize, handler &CommandGroupHandler) +#ifdef __SYCL_DEVICE_ONLY__ + : impl(AllocationSize) { + } +#else + : LocalAccessorBaseHost(detail::convertToArrayOfN<3, 1>(AllocationSize), + AdjustedDim, sizeof(DataT)) { + } +#endif + + size_t get_size() const { return getSize().size() * sizeof(DataT); } + + size_t get_count() const { return getSize().size(); } + + template > + operator RefType() const { + return *getQualifiedPtr(); + } + + template 0)>> + RefType operator[](id Index) const { + const size_t LinearIndex = getLinearIndex(Index); + return getQualifiedPtr()[LinearIndex]; + } + + template > + RefType operator[](size_t Index) const { + return getQualifiedPtr()[Index]; + } + + template < + int Dims = Dimensions, + typename = enable_if_t> + operator atomic() const { + return atomic(multi_ptr(getQualifiedPtr())); + } + + template < + int Dims = Dimensions, + typename = enable_if_t 0)>> + atomic operator[](id Index) const { + const size_t LinearIndex = getLinearIndex(Index); + return atomic( + multi_ptr(getQualifiedPtr() + LinearIndex)); + } + + template < + int Dims = Dimensions, + typename = enable_if_t> + atomic operator[](size_t Index) const { + return atomic(multi_ptr(getQualifiedPtr() + Index)); + } + + template 1)>> + typename AccessorCommonT::template AccessorSubscript + operator[](size_t Index) const { + return AccessorSubscript(*this, Index); + } + + local_ptr get_pointer() const { + return local_ptr(getQualifiedPtr()); + } +}; + +// Image accessor +template +class image_accessor { + static_assert(AccessTarget == access::target::image || + AccessTarget == access::target::host_image || + AccessTarget == access::target::image_array, + "Expected image type"); + // TODO: Check if placeholder is applicable here. 
+public: + using value_type = DataT; + using reference = DataT &; + using const_reference = const DataT &; + + /* Available only when: accessTarget == access::target::host_image */ + // template + // accessor(image &imageRef); + /* Available only when: accessTarget == access::target::image */ + // template + // accessor(image &imageRef, + // handler &commandGroupHandlerRef); + + /* Available only when: accessTarget == access::target::image_array && + dimensions < 3 */ + // template + // accessor(image &imageRef, + // handler &commandGroupHandlerRef); + + /* TODO -- common interface members -- */ + // size_t get_size() const; + + // size_t get_count() const; + + /* Available only when: (accessTarget == access::target::image || accessTarget + == access::target::host_image) && accessMode == access::mode::read */ + // template dataT read(const coordT &coords) const; + + /* Available only when: (accessTarget == access::target::image || accessTarget + == access::target::host_image) && accessMode == access::mode::read */ + // template + // dataT read(const coordT &coords, const sampler &smpl) const; + + /* Available only when: (accessTarget == access::target::image || accessTarget + == access::target::host_image) && accessMode == access::mode::write || + accessMode == access::mode::discard_write */ + // template + // void write(const coordT &coords, const dataT &color) const; + + /* Available only when: accessTarget == access::target::image_array && + dimensions < 3 */ + //__image_array_slice__ operator[](size_t index) const; +}; + +// Image accessors +template +class accessor + : public image_accessor {}; + +template +class accessor + : public image_accessor {}; + +template +class accessor + : public image_accessor {}; + +} // namespace sycl +} // namespace cl + + + +namespace std { +template +struct hash> { + using AccType = cl::sycl::accessor; + + size_t operator()(const AccType &A) const { +#ifdef __SYCL_DEVICE_ONLY__ + // Hash is not supported on DEVICE. Just return 0 here. + return 0; +#else + std::shared_ptr AccBaseImplPtr = + cl::sycl::detail::getSyclObjImpl(A); + return hash()(AccBaseImplPtr); +#endif + } +}; + +} // namespace std diff --git a/sycl/include/CL/sycl/detail/accessor_impl.hpp b/sycl/include/CL/sycl/detail/accessor_impl.hpp new file mode 100644 index 000000000000..5f96c782dbef --- /dev/null +++ b/sycl/include/CL/sycl/detail/accessor_impl.hpp @@ -0,0 +1,147 @@ +//==------------ accessor_impl.hpp - SYCL standard header file -------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include + +#include + +namespace cl { +namespace sycl { +namespace detail { + +// The class describes a requirement to access a SYCL memory object such as +// sycl::buffer and sycl::image. For example, each accessor used in a kernel, +// except one with access target "local", adds such requirement for the command +// group. 
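// Illustrative sketch, not part of this patch: two command groups whose
// requirements refer to the same memory object, with a write involved, must
// be ordered by the scheduler.
//
//   Queue.submit([&](handler &CGH) {  // CG1: writes Buf
//     auto A = Buf.get_access<access::mode::write>(CGH);
//     ...
//   });
//   Queue.submit([&](handler &CGH) {  // CG2: reads Buf
//     auto A = Buf.get_access<access::mode::read>(CGH);
//     ...                             // the graph gets a CG1 -> CG2 edge
//   });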
+ +template class AccessorImplDevice { +public: + AccessorImplDevice(id Offset, range Range, range OrigRange) + : Offset(Offset), AccessRange(Range), MemRange(OrigRange) {} + + id Offset; + range AccessRange; + range MemRange; +}; + +template class LocalAccessorBaseDevice { +public: + LocalAccessorBaseDevice(sycl::range Size) : AccessRange(Size) {} + // TODO: Actually we need only one field here, but currently compiler requires + // all of them. + range AccessRange; + range MemRange; + id Offset; +}; + +class AccessorImplHost { +public: + AccessorImplHost(id<3> Offset, range<3> Range, range<3> OrigRange, + access::mode AccessMode, detail::SYCLMemObjT *SYCLMemObject, + int Dims, int ElemSize) + : MOffset(Offset), MRange(Range), MOrigRange(OrigRange), + MAccessMode(AccessMode), MSYCLMemObj(SYCLMemObject), MDims(Dims), + MElemSize(ElemSize) {} + + ~AccessorImplHost() { + if (BlockingEvent) + BlockingEvent->setComplete(); + } + AccessorImplHost(const AccessorImplHost &Other) + : MOffset(Other.MOffset), MRange(Other.MRange), + MOrigRange(Other.MOrigRange), MAccessMode(Other.MAccessMode), + MSYCLMemObj(Other.MSYCLMemObj), MDims(Other.MDims), + MElemSize(Other.MElemSize) {} + + id<3> MOffset; + // The size of accessing region. + range<3> MRange; + // The size of memory object this requirement is created for. + range<3> MOrigRange; + access::mode MAccessMode; + + detail::SYCLMemObjT *MSYCLMemObj; + + unsigned int MDims; + unsigned int MElemSize; + + void *MData = nullptr; + + EventImplPtr BlockingEvent; +}; + +using AccessorImplPtr = std::shared_ptr; + +class AccessorBaseHost { +public: + AccessorBaseHost(id<3> Offset, range<3> Range, range<3> OrigRange, + access::mode AccessMode, detail::SYCLMemObjT *SYCLMemObject, + int Dims, int ElemSize) { + impl = std::make_shared( + Offset, Range, OrigRange, AccessMode, SYCLMemObject, Dims, ElemSize); + } + +protected: + id<3> &getOffset() { return impl->MOffset; } + range<3> &getRange() { return impl->MRange; } + range<3> &getOrigRange() { return impl->MOrigRange; } + void *getPtr() { return impl->MData; } + + const id<3> &getOffset() const { return impl->MOffset; } + const range<3> &getRange() const { return impl->MRange; } + const range<3> &getOrigRange() const { return impl->MOrigRange; } + void *getPtr() const { return const_cast(impl->MData); } + + template + friend decltype(Obj::impl) detail::getSyclObjImpl(const Obj &SyclObject); + + AccessorImplPtr impl; +}; + +class LocalAccessorImplHost { +public: + LocalAccessorImplHost(sycl::range<3> Size, int Dims, int ElemSize) + : MSize(Size), MDims(Dims), MElemSize(ElemSize), + MMem(Size[0] * Size[1] * Size[2] * ElemSize) {} + + sycl::range<3> MSize; + int MDims; + int MElemSize; + std::vector MMem; +}; + +class LocalAccessorBaseHost { +public: + LocalAccessorBaseHost(sycl::range<3> Size, int Dims, int ElemSize) { + impl = std::make_shared(Size, Dims, ElemSize); + } + sycl::range<3> &getSize() { return impl->MSize; } + const sycl::range<3> &getSize() const { return impl->MSize; } + void *getPtr() { return impl->MMem.data(); } + void *getPtr() const { + return const_cast(reinterpret_cast(impl->MMem.data())); + } + + int getNumOfDims() { return impl->MDims; } + int getElementSize() { return impl->MElemSize; } +protected: + + std::shared_ptr impl; +}; + +using Requirement = AccessorImplHost; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/event_impl.hpp b/sycl/include/CL/sycl/detail/event_impl.hpp index e79df7b78a38..8b26730e1e88 100644 --- 
a/sycl/include/CL/sycl/detail/event_impl.hpp +++ b/sycl/include/CL/sycl/detail/event_impl.hpp @@ -48,6 +48,8 @@ class event_impl { void waitInternal() const; + void setComplete(); + // Warning. Returned reference will be invalid if event_impl was destroyed. cl_event &getHandleRef(); diff --git a/sycl/include/CL/sycl/multi_ptr.hpp b/sycl/include/CL/sycl/multi_ptr.hpp index 436cb0d2d4bd..9b84d50de88c 100644 --- a/sycl/include/CL/sycl/multi_ptr.hpp +++ b/sycl/include/CL/sycl/multi_ptr.hpp @@ -92,8 +92,9 @@ template class multi_ptr { Space == access::address_space::global_space>::type> multi_ptr(accessor - Accessor) - : multi_ptr(Accessor.get_pointer()) {} + Accessor) { + m_Pointer = reinterpret_cast(Accessor.get_pointer().m_Pointer); + } // Only if Space == local_space template < diff --git a/sycl/source/detail/event_impl.cpp b/sycl/source/detail/event_impl.cpp index f767ce1b4701..6505ac373136 100644 --- a/sycl/source/detail/event_impl.cpp +++ b/sycl/source/detail/event_impl.cpp @@ -33,6 +33,10 @@ event_impl::~event_impl() { } } +void event_impl::setComplete() { + CHECK_OCL_CODE(clSetUserEventStatus(m_Event, CL_COMPLETE)); +} + void event_impl::waitInternal() const { if (!m_HostEvent) { CHECK_OCL_CODE(clWaitForEvents(1, &m_Event)); From dd1ac8c38e370521df902f9261408890b276841e Mon Sep 17 00:00:00 2001 From: vromanov Date: Wed, 10 Apr 2019 14:39:57 +0300 Subject: [PATCH 5/6] [SYCL] Introduction of new scheduler. This patch introduces new scheduler which is enabled when SCHEDULER_20 macro is set(set by default). The new scheduler is based on accessor rather on buffer, so it will support images. Also now there are two commands that implements moving memory to other contexts instead of one - alloca memory and memcpy. There is new command - release memory which will be ran during the sycl::buffer or sycl::image object destruction and releases memory instances for the memory object. 
Signed-off-by: Vlad Romanov --- sycl/CMakeLists.txt | 4 + sycl/include/CL/sycl.hpp | 6 + sycl/include/CL/sycl/detail/event_impl.hpp | 5 + sycl/include/CL/sycl/detail/kernel_impl.hpp | 2 + .../CL/sycl/detail/scheduler/commands.hpp | 239 +++++++++++ .../CL/sycl/detail/scheduler/scheduler.h | 7 + .../CL/sycl/detail/scheduler/scheduler.hpp | 175 ++++++++ sycl/include/CL/sycl/sampler.hpp | 1 - sycl/source/detail/scheduler/commands2.cpp | 385 ++++++++++++++++++ .../source/detail/scheduler/graph_builder.cpp | 377 +++++++++++++++++ .../detail/scheduler/graph_processor.cpp | 62 +++ sycl/source/detail/scheduler/scheduler2.cpp | 141 +++++++ sycl/test/basic_tests/access_to_subset.cpp | 5 + sycl/test/basic_tests/accessor/accessor.cpp | 2 - .../basic_tests/accessor_static_check.cpp | 99 ----- .../test/basic_tests/accessor_syntax_only.cpp | 197 --------- sycl/test/basic_tests/buffer/reinterpret.cpp | 2 + sycl/test/basic_tests/buffer/subbuffer.cpp | 7 + sycl/test/fpga_tests/fpga_queue.cpp | 1 + sycl/test/scheduler/Dump.cpp | 38 -- sycl/test/scheduler/parallelReadOpt.cpp | 92 ----- sycl/test/sub_group/barrier.cpp | 12 +- sycl/test/sub_group/broadcast.cpp | 11 +- sycl/test/sub_group/common_ocl.cpp | 2 +- sycl/test/sub_group/load_store.cpp | 20 +- sycl/test/sub_group/reduce.cpp | 2 +- sycl/test/sub_group/scan.cpp | 2 +- sycl/test/sub_group/vote.cpp | 11 +- 28 files changed, 1465 insertions(+), 442 deletions(-) create mode 100644 sycl/include/CL/sycl/detail/scheduler/commands.hpp create mode 100644 sycl/include/CL/sycl/detail/scheduler/scheduler.hpp create mode 100644 sycl/source/detail/scheduler/commands2.cpp create mode 100644 sycl/source/detail/scheduler/graph_builder.cpp create mode 100644 sycl/source/detail/scheduler/graph_processor.cpp create mode 100644 sycl/source/detail/scheduler/scheduler2.cpp delete mode 100644 sycl/test/basic_tests/accessor_static_check.cpp delete mode 100644 sycl/test/basic_tests/accessor_syntax_only.cpp delete mode 100644 sycl/test/scheduler/Dump.cpp delete mode 100644 sycl/test/scheduler/parallelReadOpt.cpp diff --git a/sycl/CMakeLists.txt b/sycl/CMakeLists.txt index ecd204f3c95e..1aaff8ce5a6a 100644 --- a/sycl/CMakeLists.txt +++ b/sycl/CMakeLists.txt @@ -138,8 +138,12 @@ add_library("${SYCLLibrary}" SHARED "${sourceRootPath}/detail/os_util.cpp" "${sourceRootPath}/detail/sampler_impl.cpp" "${sourceRootPath}/detail/scheduler/commands.cpp" + "${sourceRootPath}/detail/scheduler/commands2.cpp" "${sourceRootPath}/detail/scheduler/printers.cpp" "${sourceRootPath}/detail/scheduler/scheduler.cpp" + "${sourceRootPath}/detail/scheduler/graph_processor.cpp" + "${sourceRootPath}/detail/scheduler/graph_builder.cpp" + "${sourceRootPath}/detail/scheduler/scheduler2.cpp" "${sourceRootPath}/detail/util.cpp" "${sourceRootPath}/context.cpp" "${sourceRootPath}/device.cpp" diff --git a/sycl/include/CL/sycl.hpp b/sycl/include/CL/sycl.hpp index bfc88a1ae4bc..f5d872489ab8 100644 --- a/sycl/include/CL/sycl.hpp +++ b/sycl/include/CL/sycl.hpp @@ -8,6 +8,10 @@ #pragma once +#ifndef SCHEDULER_10 + #define SCHEDULER_20 +#endif + #include #include #include @@ -37,6 +41,7 @@ #include #include +#ifndef SCHEDULER_20 // Do not include RT only function implementations for device code as it leads // to problem. Should be finally fixed when we introduce library. 
#ifndef __SYCL_DEVICE_ONLY__ @@ -46,3 +51,4 @@ #include #include #endif //__SYCL_DEVICE_ONLY__ +#endif // !SCHEDULER_20 diff --git a/sycl/include/CL/sycl/detail/event_impl.hpp b/sycl/include/CL/sycl/detail/event_impl.hpp index 8b26730e1e88..37b28bc78570 100644 --- a/sycl/include/CL/sycl/detail/event_impl.hpp +++ b/sycl/include/CL/sycl/detail/event_impl.hpp @@ -59,11 +59,16 @@ class event_impl { // with the cl_event object stored in this class void setContextImpl(const ContextImplPtr &Context); + void *getCommand() { return m_Command; } + + void setCommand(void *Command) { m_Command = Command; } + private: cl_event m_Event = nullptr; ContextImplPtr m_Context; bool m_OpenCLInterop = false; bool m_HostEvent = true; + void *m_Command = nullptr; }; } // namespace detail diff --git a/sycl/include/CL/sycl/detail/kernel_impl.hpp b/sycl/include/CL/sycl/detail/kernel_impl.hpp index db5a6af110cd..64158cffd331 100644 --- a/sycl/include/CL/sycl/detail/kernel_impl.hpp +++ b/sycl/include/CL/sycl/detail/kernel_impl.hpp @@ -112,6 +112,8 @@ class kernel_impl { Device.get(), Value); } + cl_kernel &getHandleRef() { return ClKernel; } + private: cl_kernel ClKernel; context Context; diff --git a/sycl/include/CL/sycl/detail/scheduler/commands.hpp b/sycl/include/CL/sycl/detail/scheduler/commands.hpp new file mode 100644 index 000000000000..1446a816b9dd --- /dev/null +++ b/sycl/include/CL/sycl/detail/scheduler/commands.hpp @@ -0,0 +1,239 @@ +//==-------------- commands.hpp - SYCL standard header file ----------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include +#include + +namespace cl { +namespace sycl { +namespace detail { + +class queue_impl; +class event_impl; +class context_impl; + +using QueueImplPtr = std::shared_ptr; +using EventImplPtr = std::shared_ptr; +using ContextImplPtr = std::shared_ptr; + +class Command; +class AllocaCommand; +class ReleaseCommand; + +// DepDesc represents dependency between two commands +struct DepDesc { + DepDesc(Command *DepCommand, Requirement *Req, AllocaCommand *AllocaCmd) + : MDepCommand(DepCommand), MReq(Req), MAllocaCmd(AllocaCmd) {} + + friend bool operator<(const DepDesc &Lhs, const DepDesc &Rhs) { + return std::tie(Lhs.MReq, Lhs.MDepCommand) < + std::tie(Rhs.MReq, Rhs.MDepCommand); + } + + // The actual dependency command. + Command *MDepCommand = nullptr; + // Requirement for the dependency. + Requirement *MReq = nullptr; + // Allocation command for the memory object we have requirement for. + // Used to simplify searching for memory handle. + AllocaCommand *MAllocaCmd = nullptr; +}; + +// The Command represents some action that needs to be performed on one or more +// memory objects. The command has vector of Depdesc objects that represent +// dependencies of the command. It has vector of pointer to commands that depend +// on the command. It has pointer to sycl::queue object. And has event that is +// associated with the command. 
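// Illustrative sketch, not part of this patch (pointer names hypothetical):
// how the dependency fields are wired when command B depends on command A
// for requirement R whose memory is allocated by AllocaCmd.
//
//   B->addDep(DepDesc(A, R, AllocaCmd));  // records the dep and A's event
//   A->addUser(B);                        // reverse edge for graph walks
//
// enqueue() then makes sure the events collected in MDepsEvents are honored
// before the derived class's enqueueImp() does the actual work.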
+class Command { +public: + enum CommandType { + RUN_CG, + COPY_MEMORY, + ALLOCA, + RELEASE, + MAP_MEM_OBJ, + UNMAP_MEM_OBJ + }; + + Command(CommandType Type, QueueImplPtr Queue); + + void addDep(DepDesc NewDep) { + if (NewDep.MDepCommand) + MDepsEvents.push_back(NewDep.MDepCommand->getEvent()); + MDeps.push_back(NewDep); + } + + void addDep(EventImplPtr Event) { MDepsEvents.push_back(std::move(Event)); } + + void addUser(Command *NewUser) { MUsers.push_back(NewUser); } + + // Return type of the command, e.g. Allocate, MemoryCopy. + CommandType getType() const { return MType; } + + // The method checks if the command is enqueued, call enqueueImp if not and + // returns CL_SUCCESS on success. + cl_int enqueue(); + + bool isFinished(); + + bool isEnqueued() const { return MEnqueued; } + + std::shared_ptr getQueue() const { return MQueue; } + + std::shared_ptr getEvent() const { return MEvent; } + +protected: + EventImplPtr MEvent; + QueueImplPtr MQueue; + std::vector MDepsEvents; + + std::vector prepareEvents(ContextImplPtr Context); + + // Private interface. Derived classes should implement this method. + virtual cl_int enqueueImp() = 0; + +public: + std::vector MDeps; + std::vector MUsers; + +private: + CommandType MType; + std::atomic MEnqueued; +}; + +// The command enqueues release instance of memory allocated on Host or +// underlying framework. +class ReleaseCommand : public Command { +public: + ReleaseCommand(QueueImplPtr Queue, AllocaCommand *AllocaCmd) + : Command(CommandType::RELEASE, std::move(Queue)), MAllocaCmd(AllocaCmd) { + } +private: + cl_int enqueueImp() override; + + AllocaCommand *MAllocaCmd = nullptr; +}; + +// The command enqueues allocation of instance of memory object on Host or +// underlying framework. +class AllocaCommand : public Command { +public: + AllocaCommand(QueueImplPtr Queue, Requirement Req, + bool InitFromUserData = true) + : Command(CommandType::ALLOCA, Queue), MReleaseCmd(Queue, this), + MInitFromUserData(InitFromUserData), MReq(std::move(Req)) { + addDep(DepDesc(nullptr, &MReq, this)); + } + ReleaseCommand *getReleaseCmd() { return &MReleaseCmd; } + + SYCLMemObjT *getSYCLMemObj() const { return MReq.MSYCLMemObj; } + + void *getMemAllocation() const { return MMemAllocation; } + + Requirement *getAllocationReq() { return &MReq; } + +private: + cl_int enqueueImp() override; + + ReleaseCommand MReleaseCmd; + void *MMemAllocation = nullptr; + bool MInitFromUserData = false; + Requirement MReq; +}; + +class MapMemObject : public Command { +public: + MapMemObject(Requirement SrcReq, AllocaCommand *SrcAlloca, + Requirement *DstAcc, QueueImplPtr Queue); + + Requirement MSrcReq; + AllocaCommand *MSrcAlloca = nullptr; + Requirement *MDstAcc = nullptr; + Requirement MDstReq; + +private: + cl_int enqueueImp() override; +}; + +class UnMapMemObject : public Command { +public: + UnMapMemObject(Requirement SrcReq, AllocaCommand *SrcAlloca, + Requirement *DstAcc, QueueImplPtr Queue); + +private: + cl_int enqueueImp() override; + + Requirement MSrcReq; + AllocaCommand *MSrcAlloca = nullptr; + Requirement *MDstAcc = nullptr; +}; + +// The command enqueues memory copy between two instances of memory object. 
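// Editorial note, not part of this patch: MemCpyCommand is one half of what
// replaced the old single "move memory" operation; the graph builder first
// adds an AllocaCommand in the destination context, then a MemCpyCommand
// from the context where the latest copy of the data lives. MSrcQueue is
// stored because source and destination belong to different queues/contexts.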
+class MemCpyCommand : public Command { +public: + MemCpyCommand(Requirement SrcReq, AllocaCommand *SrcAlloca, + Requirement DstReq, AllocaCommand *DstAlloca, + QueueImplPtr SrcQueue, QueueImplPtr DstQueue); + + QueueImplPtr MSrcQueue; + Requirement MSrcReq; + AllocaCommand *MSrcAlloca = nullptr; + Requirement MDstReq; + AllocaCommand *MDstAlloca = nullptr; + Requirement *MAccToUpdate = nullptr; + + void setAccessorToUpdate(Requirement *AccToUpdate) { + MAccToUpdate = AccToUpdate; + } + +private: + cl_int enqueueImp() override; +}; + +// The command enqueues memory copy between two instances of memory object. +class MemCpyCommandHost : public Command { +public: + MemCpyCommandHost(Requirement SrcReq, AllocaCommand *SrcAlloca, + Requirement *DstAcc, QueueImplPtr SrcQueue, + QueueImplPtr DstQueue); + + QueueImplPtr MSrcQueue; + Requirement MSrcReq; + AllocaCommand *MSrcAlloca = nullptr; + Requirement MDstReq; + Requirement *MDstAcc = nullptr; + +private: + cl_int enqueueImp() override; +}; + +// The command enqueues execution of kernel or explicit memory operation. +class ExecCGCommand : public Command { +public: + ExecCGCommand(std::unique_ptr CommandGroup, QueueImplPtr Queue) + : Command(CommandType::RUN_CG, std::move(Queue)), + MCommandGroup(std::move(CommandGroup)) {} + +private: + // Implementation of enqueueing of ExecCGCommand. + cl_int enqueueImp() override; + + AllocaCommand *getAllocaForReq(Requirement *Req); + + std::unique_ptr MCommandGroup; +}; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/detail/scheduler/scheduler.h b/sycl/include/CL/sycl/detail/scheduler/scheduler.h index 8e93cb90a0f2..9270a34f3e43 100644 --- a/sycl/include/CL/sycl/detail/scheduler/scheduler.h +++ b/sycl/include/CL/sycl/detail/scheduler/scheduler.h @@ -6,6 +6,12 @@ // //===----------------------------------------------------------------------===// +#ifdef SCHEDULER_20 + +#include + +#else + #pragma once #include @@ -240,3 +246,4 @@ class Scheduler { } // namespace simple_scheduler } // namespace sycl } // namespace cl +#endif // SCHEDULER_20 diff --git a/sycl/include/CL/sycl/detail/scheduler/scheduler.hpp b/sycl/include/CL/sycl/detail/scheduler/scheduler.hpp new file mode 100644 index 000000000000..4da5618815a1 --- /dev/null +++ b/sycl/include/CL/sycl/detail/scheduler/scheduler.hpp @@ -0,0 +1,175 @@ +//==-------------- scheduler.hpp - SYCL standard header file ---------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include + +#include +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { + +class queue_impl; +class event_impl; +class context_impl; + +using QueueImplPtr = std::shared_ptr; +using EventImplPtr = std::shared_ptr; +using ContextImplPtr = std::shared_ptr; + +class Scheduler { +public: + // Registers command group, adds it to the dependency graph and returns an + // event object that can be used for waiting later. It's called by SYCL's + // queue.submit. + EventImplPtr addCG(std::unique_ptr CommandGroup, + QueueImplPtr Queue); + + EventImplPtr addCopyBack(Requirement *Req); + + // Blocking call that waits for the event passed. For the eager execution mode + // this method invokes corresponding function of device API. 
In the lazy
+  // execution mode the method may enqueue the command associated with the
+  // event passed, and its dependencies, before calling the device API.
+  void waitForEvent(EventImplPtr Event);
+
+  // Removes the buffer pointed to by MemObj from the graph: ensures that all
+  // commands accessing the memory object are executed and triggers
+  // deallocation of all memory assigned to the memory object. It's called
+  // from the sycl::buffer and sycl::image destructors.
+  void removeMemoryObject(detail::SYCLMemObjT *MemObj);
+
+  EventImplPtr addHostAccessor(Requirement *Req);
+
+  // Returns an instance of the scheduler object.
+  static Scheduler &getInstance();
+
+  // Returns a list of "immediate" dependencies for the Event given.
+  std::vector<EventImplPtr> getWaitList(EventImplPtr Event);
+
+  QueueImplPtr getDefaultHostQueue() { return DefaultHostQueue; }
+
+private:
+  Scheduler();
+  ~Scheduler();
+
+  // The graph builder provides interfaces to modify an already existing
+  // graph (e.g. add/remove edges/nodes).
+  class GraphBuilder {
+  public:
+    // Registers a command group, adds it to the dependency graph and returns
+    // a command that represents the command group execution. It's called by
+    // SYCL's queue::submit.
+    Command *addCG(std::unique_ptr<CG> CommandGroup, QueueImplPtr Queue);
+
+    Command *addCGUpdateHost(std::unique_ptr<CG> CommandGroup,
+                             QueueImplPtr HostQueue);
+
+    Command *addCopyBack(Requirement *Req);
+    Command *addHostAccessor(Requirement *Req, EventImplPtr &RetEvent);
+
+    // [Provisional] Optimizes the whole graph.
+    void optimize();
+
+    // [Provisional] Optimizes the subgraph that consists of the command
+    // associated with the Event passed and its dependencies.
+    void optimize(EventImplPtr Event);
+
+    // Removes unneeded commands from the graph.
+    void cleanupCommands(bool CleanupReleaseCommands = false);
+
+    // Reschedules the command passed using the Queue provided. This can lead
+    // to rescheduling of all dependent commands. This can be used when the
+    // user provides a "secondary" queue to the submit method, which may be
+    // used when a command fails to enqueue/execute in the primary queue.
+    void rescheduleCommand(Command *Cmd, QueueImplPtr Queue);
+
+    // A MemObjRecord is created for each memory object used in command
+    // groups. There should be only one MemObjRecord for each SYCL memory
+    // object.
+    struct MemObjRecord {
+      // Used to distinguish one memory object from another.
+      detail::SYCLMemObjT *MMemObj;
+
+      // Contains all allocation commands for the memory object.
+      std::vector<AllocaCommand *> MAllocaCommands;
+
+      // Contains all commands accessing the buffer that have no users, i.e.
+      // no other commands (except release commands) depend on them.
+      std::vector<Command *> MLeafs;
+
+      // The flag indicates that the content of the memory object was/will be
+      // modified. Used while deciding if a copy back is needed.
+      bool MMemModified;
+    };
+
+    // Returns a pointer to the MemObjRecord for the memory object passed.
+    // Returns nullptr if the record is not found.
+    MemObjRecord *getMemObjRecord(SYCLMemObjT *MemObject);
+
+    MemObjRecord *getOrInsertMemObjRecord(const QueueImplPtr &Queue,
+                                          Requirement *Req);
+
+    // Removes the MemObjRecord for the memory object passed.
+    void removeRecordForMemObj(SYCLMemObjT *MemObject);
+
+    std::vector<MemObjRecord> MMemObjRecords;
+
+  private:
+    // The method inserts a memory copy operation from the context where the
+    // memory currently lives to the context bound to Queue.
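+    // For example (illustration only): if the latest copy of a buffer lives
+    // on the host and a device queue requires it, the builder would call
+    //
+    //   MemCpyCommand *Cpy = insertMemCpyCmd(Record, Req, DeviceQueue);
+    //
+    // making the copy command the new leaf of the record.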
+ MemCpyCommand *insertMemCpyCmd(MemObjRecord *Record, Requirement *Req, + const QueueImplPtr &Queue); + + std::set findDepsForReq(MemObjRecord *Record, Requirement *Req, + QueueImplPtr Context); + + AllocaCommand *findAllocaForReq(MemObjRecord *Record, Requirement *Req, + QueueImplPtr Queue); + + void markModifiedIfWrite(GraphBuilder::MemObjRecord *Record, + Requirement *Req); + }; + + // The class that provides interfaces for enqueueing command and its + // dependencies to the underlying runtime. Methods of this class must not + // modify the graph. + class GraphProcessor { + public: + // Returns a list of events that represent immediate dependencies of the + // command associated with Event passed. + static std::vector getWaitList(EventImplPtr Event); + + // Wait for the command, associated with Event passed, is completed. + static void waitForEvent(EventImplPtr Event); + + // Enqueue the command passed to the underlying device. + // Returns pointer to command which failed to enqueue, so this command + // with all commands that depend on it can be rescheduled. + static Command *enqueueCommand(Command *Cmd); + }; + + void waitForRecordToFinish(GraphBuilder::MemObjRecord *Record); + + GraphBuilder MGraphBuilder; + // Use read-write mutex in future. + std::mutex MGraphLock; + + QueueImplPtr DefaultHostQueue; +}; + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/include/CL/sycl/sampler.hpp b/sycl/include/CL/sycl/sampler.hpp index f8fd7aff786b..05507f808744 100644 --- a/sycl/include/CL/sycl/sampler.hpp +++ b/sycl/include/CL/sycl/sampler.hpp @@ -11,7 +11,6 @@ #include #include #include -#include namespace cl { namespace sycl { diff --git a/sycl/source/detail/scheduler/commands2.cpp b/sycl/source/detail/scheduler/commands2.cpp new file mode 100644 index 000000000000..5b1bb547aa05 --- /dev/null +++ b/sycl/source/detail/scheduler/commands2.cpp @@ -0,0 +1,385 @@ +//===----------- commands.cpp - SYCL commands -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "CL/sycl/access/access.hpp" +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace cl { +namespace sycl { +namespace detail { + +void EventCompletionClbk(cl_event, cl_int, void *data) { + // TODO: Handle return values. Store errors to async handler. + clSetUserEventStatus((cl_event)data, CL_COMPLETE); +} + +// Method prepares cl_event's from list sycl::event's +std::vector Command::prepareEvents(ContextImplPtr Context) { + std::vector Result; + std::vector GlueEvents; + for (EventImplPtr &Event : MDepsEvents) { + // Async work is not supported for host device. + if (Event->getContextImpl()->is_host()) { + Event->waitInternal(); + continue; + } + // The event handle can be null in case of, for example, alloca command, + // which is currently synchrounious, so don't generate OpenCL event. 
+ if (Event->getHandleRef() == nullptr) { + continue; + } + ContextImplPtr EventContext = Event->getContextImpl(); + + // If contexts don't match - connect them using user event + if (EventContext != Context && !Context->is_host()) { + cl_int Error = CL_SUCCESS; + + EventImplPtr GlueEvent(new detail::event_impl()); + GlueEvent->setContextImpl(Context); + + cl_event &GlueEventHandle = GlueEvent->getHandleRef(); + GlueEventHandle = clCreateUserEvent(Context->getHandleRef(), &Error); + CHECK_OCL_CODE(Error); + + Error = clSetEventCallback(Event->getHandleRef(), CL_COMPLETE, + EventCompletionClbk, /*data=*/GlueEventHandle); + CHECK_OCL_CODE(Error); + GlueEvents.push_back(std::move(GlueEvent)); + Result.push_back(GlueEventHandle); + continue; + } + Result.push_back(Event->getHandleRef()); + } + MDepsEvents.insert(MDepsEvents.end(), GlueEvents.begin(), GlueEvents.end()); + return Result; +} + +Command::Command(CommandType Type, QueueImplPtr Queue) + : MQueue(std::move(Queue)), MType(Type), MEnqueued(false) { + MEvent.reset(new detail::event_impl()); + MEvent->setCommand(this); + MEvent->setContextImpl(detail::getSyclObjImpl(MQueue->get_context())); +} + +cl_int Command::enqueue() { + bool Expected = false; + if (MEnqueued.compare_exchange_strong(Expected, true)) + return enqueueImp(); + return CL_SUCCESS; +} + +cl_int AllocaCommand::enqueueImp() { + std::vector RawEvents = + Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context())); + + cl_event &Event = MEvent->getHandleRef(); + MMemAllocation = MemoryManager::allocate( + detail::getSyclObjImpl(MQueue->get_context()), getSYCLMemObj(), + MInitFromUserData, std::move(RawEvents), Event); + return CL_SUCCESS; +} + +cl_int ReleaseCommand::enqueueImp() { + std::vector RawEvents = + Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context())); + + cl_event &Event = MEvent->getHandleRef(); + MemoryManager::release(detail::getSyclObjImpl(MQueue->get_context()), + MAllocaCmd->getSYCLMemObj(), + MAllocaCmd->getMemAllocation(), std::move(RawEvents), + Event); + return CL_SUCCESS; +} + +MapMemObject::MapMemObject(Requirement SrcReq, AllocaCommand *SrcAlloca, + Requirement *DstAcc, QueueImplPtr Queue) + : Command(CommandType::MAP_MEM_OBJ, std::move(Queue)), + MSrcReq(std::move(SrcReq)), MSrcAlloca(SrcAlloca), MDstAcc(DstAcc), + MDstReq(*DstAcc) {} + +cl_int MapMemObject::enqueueImp() { + std::vector RawEvents = + Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context())); + assert(MDstReq.getNumOfDims() == 1); + + cl_event &Event = MEvent->getHandleRef(); + void *MappedPtr = MemoryManager::map( + MSrcAlloca->getSYCLMemObj(), MSrcAlloca->getMemAllocation(), MQueue, + MDstReq.MAccessMode, MDstReq.MDims, MDstReq.MOrigRange, MDstReq.MRange, + MDstReq.MOffset, MDstReq.MElemSize, std::move(RawEvents), Event); + MDstAcc->MData = MappedPtr; + return CL_SUCCESS; +} + +UnMapMemObject::UnMapMemObject(Requirement SrcReq, AllocaCommand *SrcAlloca, + Requirement *DstAcc, QueueImplPtr Queue) + : Command(CommandType::UNMAP_MEM_OBJ, std::move(Queue)), + MSrcReq(std::move(SrcReq)), MSrcAlloca(SrcAlloca), MDstAcc(DstAcc) {} + +cl_int UnMapMemObject::enqueueImp() { + std::vector RawEvents = + Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context())); + + cl_event &Event = MEvent->getHandleRef(); + MemoryManager::unmap(MSrcAlloca->getSYCLMemObj(), + MSrcAlloca->getMemAllocation(), MQueue, MDstAcc->MData, + std::move(RawEvents), Event); + return CL_SUCCESS; +} + +MemCpyCommand::MemCpyCommand(Requirement SrcReq, AllocaCommand *SrcAlloca, 
+ Requirement DstReq, AllocaCommand *DstAlloca, + QueueImplPtr SrcQueue, QueueImplPtr DstQueue) + : Command(CommandType::COPY_MEMORY, std::move(DstQueue)), + MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)), MSrcAlloca(SrcAlloca), + MDstReq(std::move(DstReq)), MDstAlloca(DstAlloca) { + if (!MSrcQueue->is_host()) + MEvent->setContextImpl(detail::getSyclObjImpl(MSrcQueue->get_context())); +} + +cl_int MemCpyCommand::enqueueImp() { + std::vector RawEvents; + QueueImplPtr Queue = MQueue->is_host() ? MSrcQueue : MQueue; + RawEvents = + Command::prepareEvents(detail::getSyclObjImpl(Queue->get_context())); + + cl_event &Event = MEvent->getHandleRef(); + + // Omit copying if mode is discard one. + // TODO: Handle this at the graph building time by, for example, creating + // empty node instead of memcpy. + if (MDstReq.MAccessMode == access::mode::discard_read_write || + MDstReq.MAccessMode == access::mode::discard_write || + MSrcAlloca->getMemAllocation() == MDstAlloca->getMemAllocation()) { + + if (!RawEvents.empty()) { + if (Queue->is_host()) { + CHECK_OCL_CODE(clWaitForEvents(RawEvents.size(), &RawEvents[0])); + } else { + CHECK_OCL_CODE(clEnqueueMarkerWithWaitList( + Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event)); + } + } + } else { + MemoryManager::copy( + MSrcAlloca->getSYCLMemObj(), MSrcAlloca->getMemAllocation(), MSrcQueue, + MSrcReq.MDims, MSrcReq.MOrigRange, MSrcReq.MRange, MSrcReq.MOffset, + MSrcReq.MElemSize, MDstAlloca->getMemAllocation(), MQueue, + MDstReq.MDims, MDstReq.MOrigRange, MDstReq.MRange, MDstReq.MOffset, + MDstReq.MElemSize, std::move(RawEvents), Event); + } + + if (MAccToUpdate) + MAccToUpdate->MData = MDstAlloca->getMemAllocation(); + return CL_SUCCESS; +} + +AllocaCommand *ExecCGCommand::getAllocaForReq(Requirement *Req) { + for (const DepDesc &Dep : MDeps) { + if (Dep.MReq == Req) + return Dep.MAllocaCmd; + } + throw runtime_error("Alloca for command not found"); +} + +MemCpyCommandHost::MemCpyCommandHost(Requirement SrcReq, + AllocaCommand *SrcAlloca, + Requirement *DstAcc, QueueImplPtr SrcQueue, + QueueImplPtr DstQueue) + : Command(CommandType::COPY_MEMORY, std::move(DstQueue)), + MSrcQueue(SrcQueue), MSrcReq(std::move(SrcReq)), MSrcAlloca(SrcAlloca), + MDstReq(*DstAcc), MDstAcc(DstAcc) { + if (!MSrcQueue->is_host()) + MEvent->setContextImpl(detail::getSyclObjImpl(MSrcQueue->get_context())); +} + +cl_int MemCpyCommandHost::enqueueImp() { + QueueImplPtr Queue = MQueue->is_host() ? MSrcQueue : MQueue; + std::vector RawEvents = + Command::prepareEvents(detail::getSyclObjImpl(Queue->get_context())); + + cl_event &Event = MEvent->getHandleRef(); + // Omit copying if mode is discard one. + // TODO: Handle this at the graph building time by, for example, creating + // empty node instead of memcpy. 
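+  // (A discard access mode, e.g. access::mode::discard_write, promises that
+  // the previous contents are not needed, so the actual copy can be skipped;
+  // only the dependency events still have to be honored, which is what the
+  // marker/wait below implements.)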
+ if (MDstReq.MAccessMode == access::mode::discard_read_write || + MDstReq.MAccessMode == access::mode::discard_write) { + + if (!RawEvents.empty()) { + if (Queue->is_host()) { + CHECK_OCL_CODE(clWaitForEvents(RawEvents.size(), &RawEvents[0])); + } else { + CHECK_OCL_CODE(clEnqueueMarkerWithWaitList( + Queue->getHandleRef(), RawEvents.size(), &RawEvents[0], &Event)); + } + } + return CL_SUCCESS; + } + + MemoryManager::copy(MSrcAlloca->getSYCLMemObj(), + MSrcAlloca->getMemAllocation(), MSrcQueue, MSrcReq.MDims, + MSrcReq.MOrigRange, MSrcReq.MRange, MSrcReq.MOffset, + MSrcReq.MElemSize, MDstAcc->MData, MQueue, MDstReq.MDims, + MDstReq.MOrigRange, MDstReq.MRange, MDstReq.MOffset, + MDstReq.MElemSize, std::move(RawEvents), Event); + return CL_SUCCESS; +} + +cl_int ExecCGCommand::enqueueImp() { + std::vector RawEvents = + Command::prepareEvents(detail::getSyclObjImpl(MQueue->get_context())); + + cl_event &Event = MEvent->getHandleRef(); + + switch (MCommandGroup->getType()) { + + case CG::CGTYPE::UPDATE_HOST: { + assert(!"Update host should be handled by the Scheduler."); + throw runtime_error("Update host should be handled by the Scheduler."); + } + case CG::CGTYPE::COPY_ACC_TO_PTR: { + CGCopy *Copy = (CGCopy *)MCommandGroup.get(); + Requirement *Req = (Requirement *)Copy->getSrc(); + AllocaCommand *AllocaCmd = getAllocaForReq(Req); + + MemoryManager::copy( + AllocaCmd->getSYCLMemObj(), AllocaCmd->getMemAllocation(), MQueue, + Req->MDims, Req->MOrigRange, Req->MRange, Req->MOffset, Req->MElemSize, + Copy->getDst(), Scheduler::getInstance().getDefaultHostQueue(), + Req->MDims, Req->MRange, Req->MRange, + /*DstOffset=*/{0, 0, 0}, Req->MElemSize, std::move(RawEvents), Event); + return CL_SUCCESS; + } + case CG::CGTYPE::COPY_PTR_TO_ACC: { + CGCopy *Copy = (CGCopy *)MCommandGroup.get(); + Requirement *Req = (Requirement *)(Copy->getDst()); + AllocaCommand *AllocaCmd = getAllocaForReq(Req); + + Scheduler::getInstance().getDefaultHostQueue(); + + MemoryManager::copy( + AllocaCmd->getSYCLMemObj(), Copy->getSrc(), + Scheduler::getInstance().getDefaultHostQueue(), Req->MDims, Req->MRange, + Req->MRange, /*SrcOffset*/ {0, 0, 0}, Req->MElemSize, + AllocaCmd->getMemAllocation(), MQueue, Req->MDims, Req->MOrigRange, + Req->MRange, Req->MOffset, Req->MElemSize, std::move(RawEvents), Event); + + return CL_SUCCESS; + } + case CG::CGTYPE::COPY_ACC_TO_ACC: { + CGCopy *Copy = (CGCopy *)MCommandGroup.get(); + Requirement *ReqSrc = (Requirement *)(Copy->getSrc()); + Requirement *ReqDst = (Requirement *)(Copy->getDst()); + + AllocaCommand *AllocaCmdSrc = getAllocaForReq(ReqSrc); + AllocaCommand *AllocaCmdDst = getAllocaForReq(ReqDst); + + MemoryManager::copy( + AllocaCmdSrc->getSYCLMemObj(), AllocaCmdSrc->getMemAllocation(), MQueue, + ReqSrc->MDims, ReqSrc->MOrigRange, ReqSrc->MRange, ReqSrc->MOffset, + ReqSrc->MElemSize, AllocaCmdDst->getMemAllocation(), MQueue, + ReqDst->MDims, ReqDst->MOrigRange, ReqDst->MRange, ReqDst->MOffset, + ReqDst->MElemSize, std::move(RawEvents), Event); + return CL_SUCCESS; + } + case CG::CGTYPE::FILL: { + CGFill *Fill = (CGFill *)MCommandGroup.get(); + Requirement *Req = (Requirement *)(Fill->getReqToFill()); + AllocaCommand *AllocaCmd = getAllocaForReq(Req); + + MemoryManager::fill(AllocaCmd->getSYCLMemObj(), + AllocaCmd->getMemAllocation(), MQueue, + Fill->MPattern.size(), Fill->MPattern.data(), + Req->MDims, Req->MOrigRange, Req->MRange, Req->MOffset, + Req->MElemSize, std::move(RawEvents), Event); + return CL_SUCCESS; + } + case CG::CGTYPE::KERNEL: { + CGExecKernel *ExecKernel = 
(CGExecKernel *)MCommandGroup.get(); + + NDRDescT &NDRDesc = ExecKernel->MNDRDesc; + + if (MQueue->is_host()) { + for (ArgDesc &Arg : ExecKernel->MArgs) + if (kernel_param_kind_t::kind_accessor == Arg.MType) { + Requirement *Req = (Requirement *)(Arg.MPtr); + AllocaCommand *AllocaCmd = getAllocaForReq(Req); + Req->MData = AllocaCmd->getMemAllocation(); + } + if (!RawEvents.empty()) + CHECK_OCL_CODE(clWaitForEvents(RawEvents.size(), &RawEvents[0])); + ExecKernel->MHostKernel->call(NDRDesc); + return CL_SUCCESS; + } + + // Run OpenCL kernel + sycl::context Context = MQueue->get_context(); + cl_kernel Kernel = nullptr; + + if (nullptr != ExecKernel->MSyclKernel) { + assert(ExecKernel->MSyclKernel->get_context() == Context); + Kernel = ExecKernel->MSyclKernel->getHandleRef(); + } else + Kernel = detail::ProgramManager::getInstance().getOrCreateKernel( + ExecKernel->MOSModuleHandle, Context, ExecKernel->MKernelName); + + for (ArgDesc &Arg : ExecKernel->MArgs) { + switch (Arg.MType) { + case kernel_param_kind_t::kind_accessor: { + Requirement *Req = (Requirement *)(Arg.MPtr); + AllocaCommand *AllocaCmd = getAllocaForReq(Req); + cl_mem MemArg = (cl_mem)AllocaCmd->getMemAllocation(); + + CHECK_OCL_CODE( + clSetKernelArg(Kernel, Arg.MIndex, sizeof(cl_mem), &MemArg)); + break; + } + case kernel_param_kind_t::kind_std_layout: { + CHECK_OCL_CODE(clSetKernelArg(Kernel, Arg.MIndex, Arg.MSize, Arg.MPtr)); + break; + } + case kernel_param_kind_t::kind_sampler: { + sampler *SamplerPtr = (sampler *)Arg.MPtr; + cl_sampler CLSampler = + detail::getSyclObjImpl(*SamplerPtr)->getOrCreateSampler(Context); + CHECK_OCL_CODE( + clSetKernelArg(Kernel, Arg.MIndex, sizeof(cl_sampler), &CLSampler)); + break; + } + default: + assert(!"Unhandled"); + } + } + + cl_int Error = CL_SUCCESS; + Error = clEnqueueNDRangeKernel( + MQueue->getHandleRef(), Kernel, NDRDesc.Dims, &NDRDesc.GlobalOffset[0], + &NDRDesc.GlobalSize[0], &NDRDesc.LocalSize[0], RawEvents.size(), + RawEvents.empty() ? nullptr : &RawEvents[0], &Event); + CHECK_OCL_CODE(Error); + return CL_SUCCESS; + } + } + + assert(!"CG type not implemented"); + throw runtime_error("CG type not implemented."); +} + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/source/detail/scheduler/graph_builder.cpp b/sycl/source/detail/scheduler/graph_builder.cpp new file mode 100644 index 000000000000..5149c49f0099 --- /dev/null +++ b/sycl/source/detail/scheduler/graph_builder.cpp @@ -0,0 +1,377 @@ +//===-- graph_builder.cpp - SYCL Graph Builder ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cl { +namespace sycl { +namespace detail { + +// The function check whether two requirements overlaps or not. This +// information can be used to prove that executing two kernels that +// work on different parts of the memory object in parallel is legal. +static bool doOverlap(const Requirement *LHS, const Requirement *RHS) { + // TODO: Implement check for one dimensional case only. It will be + // enough for most of the cases because 2d and 3d sub-buffers cannot + // be mapped to OpenCL's ones. 
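+  // A minimal one-dimensional check could look like this (sketch, assuming
+  // offsets and ranges are expressed in elements of the same memory object):
+  //
+  //   const size_t LhsBeg = LHS->MOffset[0];
+  //   const size_t LhsEnd = LhsBeg + LHS->MRange[0];
+  //   const size_t RhsBeg = RHS->MOffset[0];
+  //   const size_t RhsEnd = RhsBeg + RHS->MRange[0];
+  //   return LhsBeg < RhsEnd && RhsBeg < LhsEnd;
+  //
+  // Until such a check is implemented, conservatively report an overlap: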
+  return true;
+}
+
+// Returns the record for the memory object passed; nullptr if it doesn't
+// exist.
+Scheduler::GraphBuilder::MemObjRecord *
+Scheduler::GraphBuilder::getMemObjRecord(SYCLMemObjT *MemObject) {
+  const auto It = std::find_if(MMemObjRecords.begin(), MMemObjRecords.end(),
+                               [MemObject](const MemObjRecord &Record) {
+                                 return Record.MMemObj == MemObject;
+                               });
+  return (MMemObjRecords.end() != It) ? &*It : nullptr;
+}
+
+// Returns the record the requirement refers to; if it doesn't exist, creates
+// a new one and populates it with an initial alloca command.
+Scheduler::GraphBuilder::MemObjRecord *
+Scheduler::GraphBuilder::getOrInsertMemObjRecord(const QueueImplPtr &Queue,
+                                                 Requirement *Req) {
+  SYCLMemObjT *MemObject = Req->MSYCLMemObj;
+  Scheduler::GraphBuilder::MemObjRecord *Record = getMemObjRecord(MemObject);
+  if (nullptr != Record)
+    return Record;
+
+  // Construct a requirement that describes the full buffer, because we
+  // allocate only full-sized memory objects.
+  Requirement AllocaReq(/*Offset*/ {0, 0, 0}, Req->MOrigRange, Req->MOrigRange,
+                        access::mode::discard_write, MemObject, Req->MDims,
+                        Req->MElemSize);
+
+  AllocaCommand *AllocaCmd = new AllocaCommand(Queue, std::move(AllocaReq));
+  MemObjRecord NewRecord{MemObject, {AllocaCmd}, {AllocaCmd}, false};
+
+  MMemObjRecords.push_back(std::move(NewRecord));
+  return &MMemObjRecords.back();
+}
+
+// Helper function that removes all commands in Cmds from Leafs.
+static void UpdateLeafs(const std::set<Command *> &Cmds,
+                        std::vector<Command *> &Leafs) {
+  for (const Command *Cmd : Cmds) {
+    auto NewEnd = std::remove(Leafs.begin(), Leafs.end(), Cmd);
+    Leafs.resize(std::distance(Leafs.begin(), NewEnd));
+  }
+}
+
+MemCpyCommand *
+Scheduler::GraphBuilder::insertMemCpyCmd(MemObjRecord *Record, Requirement *Req,
+                                         const QueueImplPtr &Queue) {
+
+  Requirement FullReq(/*Offset*/ {0, 0, 0}, Req->MOrigRange, Req->MOrigRange,
+                      access::mode::read_write, Req->MSYCLMemObj, Req->MDims,
+                      Req->MElemSize);
+
+  std::set<Command *> Deps = findDepsForReq(Record, &FullReq, Queue);
+  QueueImplPtr SrcQueue = (*Deps.begin())->getQueue();
+  AllocaCommand *AllocaCmdDst = findAllocaForReq(Record, &FullReq, Queue);
+
+  if (!AllocaCmdDst) {
+    std::unique_ptr<AllocaCommand> AllocaCmdUniquePtr(
+        new AllocaCommand(Queue, FullReq));
+
+    if (!AllocaCmdUniquePtr)
+      throw runtime_error("Out of host memory");
+
+    Record->MAllocaCommands.push_back(AllocaCmdUniquePtr.get());
+    AllocaCmdDst = AllocaCmdUniquePtr.release();
+    Deps.insert(AllocaCmdDst);
+  }
+
+  AllocaCommand *AllocaCmdSrc = findAllocaForReq(Record, Req, SrcQueue);
+
+  MemCpyCommand *MemCpyCmd = new MemCpyCommand(
+      *AllocaCmdSrc->getAllocationReq(), AllocaCmdSrc, *Req, AllocaCmdDst,
+      AllocaCmdSrc->getQueue(), AllocaCmdDst->getQueue());
+
+  for (Command *Dep : Deps) {
+    MemCpyCmd->addDep(DepDesc{Dep, &MemCpyCmd->MDstReq, AllocaCmdDst});
+    Dep->addUser(MemCpyCmd);
+  }
+  UpdateLeafs(Deps, Record->MLeafs);
+  Record->MLeafs.push_back(MemCpyCmd);
+  return MemCpyCmd;
+}
+
+// The function adds a copy operation from the most up-to-date memory to the
+// memory pointed to by Req.
+Command *Scheduler::GraphBuilder::addCopyBack(Requirement *Req) {
+
+  QueueImplPtr HostQueue = Scheduler::getInstance().getDefaultHostQueue();
+  SYCLMemObjT *MemObj = Req->MSYCLMemObj;
+  Scheduler::GraphBuilder::MemObjRecord *Record = getMemObjRecord(MemObj);
+
+  // Do nothing if there were no operations, or only read operations, with the
+  // memory object.
+  if (nullptr == Record || !Record->MMemModified)
+    return nullptr;
+
+  std::set<Command *> Deps = findDepsForReq(Record, Req, HostQueue);
+  QueueImplPtr SrcQueue = (*Deps.begin())->getQueue();
+  AllocaCommand *SrcAllocaCmd = findAllocaForReq(Record, Req, SrcQueue);
+
+  std::unique_ptr<MemCpyCommandHost> MemCpyCmdUniquePtr(
+      new MemCpyCommandHost(*SrcAllocaCmd->getAllocationReq(), SrcAllocaCmd,
+                            Req, std::move(SrcQueue), std::move(HostQueue)));
+
+  if (!MemCpyCmdUniquePtr)
+    throw runtime_error("Out of host memory");
+
+  MemCpyCommandHost *MemCpyCmd = MemCpyCmdUniquePtr.release();
+  for (Command *Dep : Deps) {
+    MemCpyCmd->addDep(DepDesc{Dep, &MemCpyCmd->MDstReq, SrcAllocaCmd});
+    Dep->addUser(MemCpyCmd);
+  }
+
+  UpdateLeafs(Deps, Record->MLeafs);
+  Record->MLeafs.push_back(MemCpyCmd);
+  return MemCpyCmd;
+}
+
+// The function implements the SYCL host accessor logic: the host accessor
+// should provide access to the buffer in user space; during destruction the
+// memory is written back (if the access mode is not read-only) to the memory
+// object. No operations with the buffer are allowed during the host
+// accessor's lifetime.
+Command *Scheduler::GraphBuilder::addHostAccessor(Requirement *Req,
+                                                  EventImplPtr &RetEvent) {
+  QueueImplPtr HostQueue = Scheduler::getInstance().getDefaultHostQueue();
+  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req);
+  markModifiedIfWrite(Record, Req);
+
+  std::set<Command *> Deps = findDepsForReq(Record, Req, HostQueue);
+  QueueImplPtr SrcQueue = (*Deps.begin())->getQueue();
+
+  AllocaCommand *SrcAllocaCmd = findAllocaForReq(Record, Req, SrcQueue);
+  Requirement *SrcReq = SrcAllocaCmd->getAllocationReq();
+
+  if (SrcQueue->is_host()) {
+    MemCpyCommand *DevToHostCmd = insertMemCpyCmd(Record, Req, HostQueue);
+    DevToHostCmd->setAccessorToUpdate(Req);
+    RetEvent = DevToHostCmd->getEvent();
+    return DevToHostCmd;
+  }
+
+  // Prepare a "user" event that blocks the second operation (unmap or copy
+  // back) until the host accessor is destroyed.
+  ContextImplPtr SrcContext = detail::getSyclObjImpl(SrcQueue->get_context());
+  Req->BlockingEvent.reset(new detail::event_impl());
+  Req->BlockingEvent->setContextImpl(SrcContext);
+  cl_event &CLEvent = Req->BlockingEvent->getHandleRef();
+  cl_int Error = CL_SUCCESS;
+  CLEvent = clCreateUserEvent(SrcContext->getHandleRef(), &Error);
+  CHECK_OCL_CODE(Error);
+
+  // If the memory is one-dimensional and located on an OpenCL device, we can
+  // use map/unmap operations.
+  if (!SrcQueue->is_host() && Req->MDims == 1 &&
+      Req->MRange == Req->MOrigRange) {
+
+    std::unique_ptr<MapMemObject> MapCmdUniquePtr(
+        new MapMemObject(*SrcReq, SrcAllocaCmd, Req, SrcQueue));
+    std::unique_ptr<UnMapMemObject> UnMapCmdUniquePtr(
+        new UnMapMemObject(*SrcReq, SrcAllocaCmd, Req, SrcQueue));
+
+    if (!MapCmdUniquePtr || !UnMapCmdUniquePtr)
+      throw runtime_error("Out of host memory");
+
+    MapMemObject *MapCmd = MapCmdUniquePtr.release();
+    for (Command *Dep : Deps) {
+      MapCmd->addDep(DepDesc{Dep, &MapCmd->MDstReq, SrcAllocaCmd});
+      Dep->addUser(MapCmd);
+    }
+
+    Command *UnMapCmd = UnMapCmdUniquePtr.release();
+    UnMapCmd->addDep(DepDesc{MapCmd, &MapCmd->MDstReq, SrcAllocaCmd});
+    MapCmd->addUser(UnMapCmd);
+
+    UpdateLeafs(Deps, Record->MLeafs);
+    Record->MLeafs.push_back(UnMapCmd);
+    UnMapCmd->addDep(Req->BlockingEvent);
+
+    RetEvent = MapCmd->getEvent();
+    return UnMapCmd;
+  }
+
+  // In other cases, insert two memory copy operations.
+  MemCpyCommand *DevToHostCmd = insertMemCpyCmd(Record, Req, HostQueue);
+  DevToHostCmd->setAccessorToUpdate(Req);
+  Command *HostToDevCmd = insertMemCpyCmd(Record, Req, SrcQueue);
+  HostToDevCmd->addDep(Req->BlockingEvent);
+
+  RetEvent = DevToHostCmd->getEvent();
+  return HostToDevCmd;
+}
+
+Command *Scheduler::GraphBuilder::addCGUpdateHost(
+    std::unique_ptr<CG> CommandGroup, QueueImplPtr HostQueue) {
+  // Dummy implementation of the update-host logic: just copy memory to the
+  // host device. We could avoid copying if there is no allocation of host
+  // memory.
+
+  CGUpdateHost *UpdateHost = (CGUpdateHost *)CommandGroup.get();
+  Requirement *Req = UpdateHost->getReqToUpdate();
+
+  MemObjRecord *Record = getOrInsertMemObjRecord(HostQueue, Req);
+  return insertMemCpyCmd(Record, Req, HostQueue);
+}
+
+// The function finds dependencies for the requirement. It starts searching
+// from the list of "leaf" commands for the record and checks whether the
+// examined command can be executed in parallel with the new one with regard
+// to the memory object. If it can, the search continues through the
+// dependencies of that command. Several rules are used:
+//
+// 1. New and examined commands only read -> can bypass
+// 2. New and examined commands have non-overlapping requirements -> can bypass
+// 3. New and examined commands have different contexts -> cannot bypass
+std::set<Command *>
+Scheduler::GraphBuilder::findDepsForReq(MemObjRecord *Record, Requirement *Req,
+                                        QueueImplPtr Queue) {
+  sycl::context Context = Queue->get_context();
+  std::set<Command *> RetDeps;
+  std::vector<Command *> ToAnalyze = Record->MLeafs;
+  std::set<Command *> Visited;
+  const bool ReadOnlyReq = Req->MAccessMode == access::mode::read;
+
+  while (!ToAnalyze.empty()) {
+    Command *DepCmd = ToAnalyze.back();
+    ToAnalyze.pop_back();
+
+    std::vector<Command *> NewAnalyze;
+
+    for (const DepDesc &Dep : DepCmd->MDeps) {
+      if (Dep.MReq->MSYCLMemObj != Req->MSYCLMemObj)
+        continue;
+
+      bool CanBypassDep = false;
+      // Both commands only read.
+      CanBypassDep |=
+          Dep.MReq->MAccessMode == access::mode::read && ReadOnlyReq;
+
+      // The requirements do not overlap.
+      CanBypassDep |= !doOverlap(Dep.MReq, Req);
+
+      // Going through copying memory between contexts is not supported.
+      if (Dep.MDepCommand)
+        CanBypassDep &= Context == Dep.MDepCommand->getQueue()->get_context();
+
+      if (!CanBypassDep) {
+        RetDeps.insert(DepCmd);
+        // No need to analyze the deps of the examined command, as it is a
+        // dependency itself.
+        NewAnalyze.clear();
+        break;
+      }
+
+      if (Visited.insert(Dep.MDepCommand).second)
+        NewAnalyze.push_back(Dep.MDepCommand);
+    }
+    ToAnalyze.insert(ToAnalyze.end(), NewAnalyze.begin(), NewAnalyze.end());
+  }
+  return RetDeps;
+}
+
+// The function searches for an alloca command matching the context and
+// requirement.
+AllocaCommand *Scheduler::GraphBuilder::findAllocaForReq(MemObjRecord *Record,
+                                                         Requirement *Req,
+                                                         QueueImplPtr Queue) {
+  auto IsSuitableAlloca = [&Queue](const AllocaCommand *AllocaCmd) {
+    return AllocaCmd->getQueue()->get_context() == Queue->get_context();
+  };
+  const auto It = std::find_if(Record->MAllocaCommands.begin(),
+                               Record->MAllocaCommands.end(), IsSuitableAlloca);
+  return (Record->MAllocaCommands.end() != It) ? *It : nullptr;
+}
+
+// The function sets the MemModified flag in the record if the requirement has
+// write access.
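+// (E.g. a command group whose accessor uses access::mode::read_write marks
+// the record, so a copy back to user memory is scheduled when the buffer is
+// destroyed; a read-only requirement leaves the flag untouched.)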
+void Scheduler::GraphBuilder::markModifiedIfWrite(
+    GraphBuilder::MemObjRecord *Record, Requirement *Req) {
+  switch (Req->MAccessMode) {
+  case access::mode::write:
+  case access::mode::read_write:
+  case access::mode::discard_write:
+  case access::mode::discard_read_write:
+  case access::mode::atomic:
+    Record->MMemModified = true;
+    break;
+  case access::mode::read:
+    break;
+  }
+}
+
+Command *
+Scheduler::GraphBuilder::addCG(std::unique_ptr<CG> CommandGroup,
+                               QueueImplPtr Queue) {
+  std::vector<Requirement *> Reqs = CommandGroup->getRequirements();
+  std::unique_ptr<ExecCGCommand> NewCmd(
+      new ExecCGCommand(std::move(CommandGroup), Queue));
+  if (!NewCmd)
+    throw runtime_error("Out of host memory");
+
+  for (Requirement *Req : Reqs) {
+    MemObjRecord *Record = getOrInsertMemObjRecord(Queue, Req);
+    markModifiedIfWrite(Record, Req);
+    std::set<Command *> Deps = findDepsForReq(Record, Req, Queue);
+
+    // If the contexts of a dependency and the new command don't match, insert
+    // a memcpy command.
+    for (const Command *Dep : Deps)
+      if (Dep->getQueue()->get_context() != Queue->get_context()) {
+        // Cannot directly copy memory from OpenCL device to OpenCL device -
+        // create two copies: device->host and host->device.
+        if (!Dep->getQueue()->is_host() && !Queue->is_host())
+          insertMemCpyCmd(Record, Req,
+                          Scheduler::getInstance().getDefaultHostQueue());
+        insertMemCpyCmd(Record, Req, Queue);
+        // Need to search for dependencies again as we modified the graph.
+        Deps = findDepsForReq(Record, Req, Queue);
+        break;
+      }
+    AllocaCommand *AllocaCmd = findAllocaForReq(Record, Req, Queue);
+    UpdateLeafs(Deps, Record->MLeafs);
+
+    for (Command *Dep : Deps) {
+      NewCmd->addDep(DepDesc{Dep, Req, AllocaCmd});
+      Dep->addUser(NewCmd.get());
+    }
+  }
+
+  for (Requirement *Req : Reqs) {
+    MemObjRecord *Record = getMemObjRecord(Req->MSYCLMemObj);
+    Record->MLeafs.push_back(NewCmd.get());
+  }
+  return NewCmd.release();
+}
+
+void Scheduler::GraphBuilder::cleanupCommands(bool CleanupReleaseCommands) {
+  // TODO: Implement.
+}
+
+void Scheduler::GraphBuilder::removeRecordForMemObj(SYCLMemObjT *MemObject) {
+  const auto It = std::find_if(MMemObjRecords.begin(), MMemObjRecords.end(),
+                               [MemObject](const MemObjRecord &Record) {
+                                 return Record.MMemObj == MemObject;
+                               });
+  MMemObjRecords.erase(It);
+}
+
+} // namespace detail
+} // namespace sycl
+} // namespace cl
diff --git a/sycl/source/detail/scheduler/graph_processor.cpp b/sycl/source/detail/scheduler/graph_processor.cpp
new file mode 100644
index 000000000000..f999c047ddd5
--- /dev/null
+++ b/sycl/source/detail/scheduler/graph_processor.cpp
@@ -0,0 +1,62 @@
+//===-- graph_processor.cpp - SYCL Graph Processor --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+#include
+#include
+
+#include
+#include
+
+namespace cl {
+namespace sycl {
+namespace detail {
+
+static Command *getCommand(const EventImplPtr &Event) {
+  return (Command *)Event->getCommand();
+}
+
+std::vector<EventImplPtr>
+Scheduler::GraphProcessor::getWaitList(EventImplPtr Event) {
+  std::vector<EventImplPtr> Result;
+  Command *Cmd = getCommand(Event);
+  for (const DepDesc &Dep : Cmd->MDeps)
+    Result.push_back(Dep.MDepCommand->getEvent());
+  return Result;
+}
+
+void Scheduler::GraphProcessor::waitForEvent(EventImplPtr Event) {
+  Command *Cmd = getCommand(Event);
+  assert(Cmd && "Event has no associated command?");
+  Command *FailedCommand = enqueueCommand(Cmd);
+  if (FailedCommand)
+    // TODO: Reschedule commands.
+    throw runtime_error("Enqueue process failed.");
+
+  cl_event &CLEvent = Cmd->getEvent()->getHandleRef();
+  if (CLEvent)
+    CHECK_OCL_CODE(clWaitForEvents(1, &CLEvent));
+}
+
+Command *Scheduler::GraphProcessor::enqueueCommand(Command *Cmd) {
+  if (!Cmd || Cmd->isEnqueued())
+    return nullptr;
+
+  for (DepDesc &Dep : Cmd->MDeps) {
+    Command *FailedCommand = enqueueCommand(Dep.MDepCommand);
+    if (FailedCommand)
+      return FailedCommand;
+  }
+
+  cl_int Result = Cmd->enqueue();
+  return CL_SUCCESS == Result ? nullptr : Cmd;
+}
+
+} // namespace detail
+} // namespace sycl
+} // namespace cl
diff --git a/sycl/source/detail/scheduler/scheduler2.cpp b/sycl/source/detail/scheduler/scheduler2.cpp
new file mode 100644
index 000000000000..57386cfb090f
--- /dev/null
+++ b/sycl/source/detail/scheduler/scheduler2.cpp
@@ -0,0 +1,141 @@
+//===-- scheduler.cpp - SYCL Scheduler --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "CL/sycl/detail/sycl_mem_obj.hpp"
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+namespace cl {
+namespace sycl {
+namespace detail {
+
+void Scheduler::waitForRecordToFinish(GraphBuilder::MemObjRecord *Record) {
+  for (Command *Cmd : Record->MLeafs) {
+    Command *FailedCommand = GraphProcessor::enqueueCommand(Cmd);
+    if (FailedCommand) {
+      // TODO: What is the best way to handle a failed command in the
+      // Scheduler d'tor?
+      assert(!FailedCommand && "Command failed to enqueue");
+      throw runtime_error("Enqueue process failed.");
+    }
+    GraphProcessor::waitForEvent(Cmd->getEvent());
+  }
+  for (AllocaCommand *AllocaCmd : Record->MAllocaCommands) {
+    Command *ReleaseCmd = AllocaCmd->getReleaseCmd();
+    Command *FailedCommand = GraphProcessor::enqueueCommand(ReleaseCmd);
+    if (FailedCommand) {
+      // TODO: What is the best way to handle a failed command in the d'tor?
+      assert(!FailedCommand && "Command failed to enqueue");
+      throw runtime_error("Enqueue process failed.");
+    }
+    GraphProcessor::waitForEvent(ReleaseCmd->getEvent());
+  }
+}
+
+EventImplPtr Scheduler::addCG(std::unique_ptr<CG> CommandGroup,
+                              QueueImplPtr Queue) {
+  std::lock_guard<std::mutex> lock(MGraphLock);
+
+  Command *NewCmd = nullptr;
+  switch (CommandGroup->getType()) {
+  case CG::UPDATE_HOST:
+    NewCmd = MGraphBuilder.addCGUpdateHost(std::move(CommandGroup),
+                                           DefaultHostQueue);
+    break;
+  default:
+    NewCmd = MGraphBuilder.addCG(std::move(CommandGroup), std::move(Queue));
+  }
+
+  // TODO: Check if lazy mode.
+  Command *FailedCommand = GraphProcessor::enqueueCommand(NewCmd);
+  MGraphBuilder.cleanupCommands();
+  if (FailedCommand)
+    // TODO: Reschedule commands.
+    throw runtime_error("Enqueue process failed.");
+  return NewCmd->getEvent();
+}
+
+EventImplPtr Scheduler::addCopyBack(Requirement *Req) {
+  Command *NewCmd = MGraphBuilder.addCopyBack(Req);
+  // The command was not created because there were no operations with the
+  // buffer.
+  if (!NewCmd)
+    return nullptr;
+  Command *FailedCommand = GraphProcessor::enqueueCommand(NewCmd);
+  if (FailedCommand)
+    // TODO: Reschedule commands.
+    throw runtime_error("Enqueue process failed.");
+  return NewCmd->getEvent();
+}
+
+Scheduler::~Scheduler() {
+  // TODO: Make running wait and release on destruction configurable?
+  // TODO: Process release commands only?
+  //std::lock_guard<std::mutex> lock(MGraphLock);
+  //for (GraphBuilder::MemObjRecord &Record : MGraphBuilder.MMemObjRecords)
+    //waitForRecordToFinish(&Record);
+  //MGraphBuilder.cleanupCommands(/*CleanupReleaseCommands = */ true);
+}
+
+Scheduler &Scheduler::getInstance() {
+  static Scheduler instance;
+  return instance;
+}
+
+std::vector<EventImplPtr> Scheduler::getWaitList(EventImplPtr Event) {
+  std::lock_guard<std::mutex> lock(MGraphLock);
+  return GraphProcessor::getWaitList(std::move(Event));
+}
+
+void Scheduler::waitForEvent(EventImplPtr Event) {
+  std::lock_guard<std::mutex> lock(MGraphLock);
+  GraphProcessor::waitForEvent(std::move(Event));
+}
+
+void Scheduler::removeMemoryObject(detail::SYCLMemObjT *MemObj) {
+  std::lock_guard<std::mutex> lock(MGraphLock);
+
+  GraphBuilder::MemObjRecord *Record = MGraphBuilder.getMemObjRecord(MemObj);
+  if (!Record) {
+    // No operations were performed on the memory object; nothing to wait for
+    // or to release.
+    return;
+  }
+  waitForRecordToFinish(Record);
+  MGraphBuilder.cleanupCommands(/*CleanupReleaseCommands = */ true);
+  MGraphBuilder.removeRecordForMemObj(MemObj);
+}
+
+EventImplPtr Scheduler::addHostAccessor(Requirement *Req) {
+  std::lock_guard<std::mutex> lock(MGraphLock);
+
+  EventImplPtr RetEvent;
+  Command *NewCmd = MGraphBuilder.addHostAccessor(Req, RetEvent);
+
+  if (!NewCmd)
+    return nullptr;
+  Command *FailedCommand = GraphProcessor::enqueueCommand(NewCmd);
+  if (FailedCommand)
+    // TODO: Reschedule commands.
+ throw runtime_error("Enqueue process failed."); + return RetEvent; +} + +Scheduler::Scheduler() { + sycl::device HostDevice; + DefaultHostQueue = QueueImplPtr( + new queue_impl(HostDevice, /*AsyncHandler=*/{}, /*PropList=*/{})); +} + +} // namespace detail +} // namespace sycl +} // namespace cl diff --git a/sycl/test/basic_tests/access_to_subset.cpp b/sycl/test/basic_tests/access_to_subset.cpp index 3ba2eb8a77dc..7a1babbf764b 100644 --- a/sycl/test/basic_tests/access_to_subset.cpp +++ b/sycl/test/basic_tests/access_to_subset.cpp @@ -29,6 +29,11 @@ int main() { auto offset = id<2>(1, 1); auto subRange = range<2>(M - 2, N - 2); queue myQueue; + myQueue.submit([&](handler &cgh) { + acc_w B(Buffer, cgh); + cgh.parallel_for( + origRange, [=](id<2> index) { B[index] = 0; }); + }); myQueue.submit([&](handler &cgh) { acc_w B(Buffer, cgh, subRange, offset); cgh.parallel_for( diff --git a/sycl/test/basic_tests/accessor/accessor.cpp b/sycl/test/basic_tests/accessor/accessor.cpp index 13291a8510fa..d64e2740ed2d 100644 --- a/sycl/test/basic_tests/accessor/accessor.cpp +++ b/sycl/test/basic_tests/accessor/accessor.cpp @@ -80,7 +80,6 @@ int main() { assert(acc_src.get_size() == sizeof(src)); assert(acc_src.get_count() == 2); assert(acc_src.get_range() == sycl::range<1>(2)); - assert(acc_src.get_pointer() == src); // Make sure that operator[] is defined for both size_t and id<1>. // Implicit conversion from IdxSzT to size_t guarantees that no @@ -105,7 +104,6 @@ int main() { assert(acc.get_size() == sizeof(data)); assert(acc.get_count() == 24); assert(acc.get_range() == sycl::range<3>(2, 3, 4)); - assert(acc.get_pointer() != data); for (int i = 0; i < 2; ++i) for (int j = 0; j < 3; ++j) diff --git a/sycl/test/basic_tests/accessor_static_check.cpp b/sycl/test/basic_tests/accessor_static_check.cpp deleted file mode 100644 index d4f38d080fae..000000000000 --- a/sycl/test/basic_tests/accessor_static_check.cpp +++ /dev/null @@ -1,99 +0,0 @@ -// RUN: %clang -std=c++11 -fsyntax-only %s - -// Check that the test can be compiled with device compiler as well. -// RUN: %clang --sycl -fsyntax-only %s -//==--- accessor_static_check.cpp - Static checks for SYCL accessors -------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -#include - -namespace sycl { - using namespace cl::sycl; -} - -struct SomeStructure { - char a; - float b; - union { - int x; - double y; - } v; -}; - -// Check that accessor_impl is the only data field in accessor class, -// and that the accessor is a standard-layout structure. A pointer to -// a standard-layout class may be converted (with reinterpret_cast) to -// a pointer to its first non-static data member and vice versa. -// Along the way, many specializations of accessor are instantiated. 
- -#define CHECK_ACCESSOR_SIZEOF(DataT, Dimensions, AccessMode, AccessTarget, \ - IsPlaceholder) \ - static_assert( \ - std::is_standard_layout>::value, \ - "accessor is not a standard-layout structure"); \ - static_assert( \ - sizeof(sycl::accessor) == \ - sizeof(sycl::detail::accessor_impl< \ - typename sycl::detail::DeviceValueType::type, \ - Dimensions, AccessMode, AccessTarget, IsPlaceholder>), \ - "accessor_impl is not the only data field in accessor class"); - -#define CHECK_ACCESSOR_SIZEOF_PH(DataT, Dimensions, AccessMode, AccessTarget) \ - CHECK_ACCESSOR_SIZEOF(DataT, Dimensions, AccessMode, AccessTarget, \ - sycl::access::placeholder::true_t); \ - CHECK_ACCESSOR_SIZEOF(DataT, Dimensions, AccessMode, AccessTarget, \ - sycl::access::placeholder::false_t); - -#define CHECK_ACCESSOR_SIZEOF_AT(DataT, Dimensions, AccessMode) \ - CHECK_ACCESSOR_SIZEOF_PH(DataT, Dimensions, AccessMode, \ - sycl::access::target::global_buffer); \ - CHECK_ACCESSOR_SIZEOF_PH(DataT, Dimensions, AccessMode, \ - sycl::access::target::constant_buffer); \ - CHECK_ACCESSOR_SIZEOF_PH(DataT, Dimensions, AccessMode, \ - sycl::access::target::local); \ - CHECK_ACCESSOR_SIZEOF_PH(DataT, Dimensions, AccessMode, \ - sycl::access::target::host_buffer); - -#if 0 -// TODO: -// The following checks should be enabled after the corresponding -// access::targets are supported by DeviceValueType metafunction. - CHECK_ACCESSOR_SIZEOF_PH(DataT, Dimensions, AccessMode, \ - sycl::access::target::image); \ - CHECK_ACCESSOR_SIZEOF_PH(DataT, Dimensions, AccessMode, \ - sycl::access::target::host_image); \ - CHECK_ACCESSOR_SIZEOF_PH(DataT, Dimensions, AccessMode, \ - sycl::access::target::image_array); -#endif - -#define CHECK_ACCESSOR_SIZEOF_AM(DataT, Dimensions) \ - CHECK_ACCESSOR_SIZEOF_AT(DataT, Dimensions, sycl::access::mode::read); \ - CHECK_ACCESSOR_SIZEOF_AT(DataT, Dimensions, sycl::access::mode::write); \ - CHECK_ACCESSOR_SIZEOF_AT(DataT, Dimensions, sycl::access::mode::read_write); \ - CHECK_ACCESSOR_SIZEOF_AT(DataT, Dimensions, \ - sycl::access::mode::discard_write); \ - CHECK_ACCESSOR_SIZEOF_AT(DataT, Dimensions, \ - sycl::access::mode::discard_read_write); \ - CHECK_ACCESSOR_SIZEOF_AT(DataT, Dimensions, sycl::access::mode::atomic); - -#define CHECK_ACCESSOR_SIZEOF_DIM(DataT) \ - CHECK_ACCESSOR_SIZEOF_AM(DataT, 0); \ - CHECK_ACCESSOR_SIZEOF_AM(DataT, 1); \ - CHECK_ACCESSOR_SIZEOF_AM(DataT, 2); \ - CHECK_ACCESSOR_SIZEOF_AM(DataT, 3); - -#define CHECK_ACCESSOR_SIZEOF_ALL \ - CHECK_ACCESSOR_SIZEOF_DIM(char); \ - CHECK_ACCESSOR_SIZEOF_DIM(unsigned); \ - CHECK_ACCESSOR_SIZEOF_DIM(long long); \ - CHECK_ACCESSOR_SIZEOF_DIM(double); \ - CHECK_ACCESSOR_SIZEOF_DIM(SomeStructure); - -CHECK_ACCESSOR_SIZEOF_ALL diff --git a/sycl/test/basic_tests/accessor_syntax_only.cpp b/sycl/test/basic_tests/accessor_syntax_only.cpp deleted file mode 100644 index b37c56baeb6e..000000000000 --- a/sycl/test/basic_tests/accessor_syntax_only.cpp +++ /dev/null @@ -1,197 +0,0 @@ -//==--- accessor_syntax_only.cpp - Syntax checks for SYCL accessors --------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// This test is supposed to check that interface of sycl::accessor -// conforms to the specification. It checks that valid code can be -// compiled and invalid code causes compilation errors. 
- -// RUN: %clang -std=c++11 -fsyntax-only -Xclang -verify %s - -// Check that the test an be compiled with device compiler as well. -// RUN: %clang --sycl -fsyntax-only -Xclang -verify %s - -#include - -namespace sycl { - using namespace cl::sycl; - using namespace cl::sycl::access; -} - -struct IdxSz { - operator size_t() { return 1; } -}; - -struct IdxId1 { - operator sycl::id<1>() { return sycl::id<1>(1); } -}; - -struct IdxId2 { - operator sycl::id<2>() { return sycl::id<2>(1, 1); } -}; - -struct IdxId3 { - operator sycl::id<3>() { return sycl::id<3>(1, 1, 1); } -}; - -struct IdxIdAny { - operator sycl::id<1>() { return sycl::id<1>(1); } - operator sycl::id<2>() { return sycl::id<2>(1, 1); } - operator sycl::id<3>() { return sycl::id<3>(1, 1, 1); } -}; - -struct IdxIdSz { - operator size_t() { return 1; } - operator sycl::id<1>() { return sycl::id<1>(1); } - operator sycl::id<2>() { return sycl::id<2>(1, 1); } - operator sycl::id<3>() { return sycl::id<3>(1, 1, 1); } -}; - -template -using acc_t = sycl::accessor; - -// Check that operator dataT is defined only if (dimensions == 0). -void test1() { - int data = 5; - sycl::buffer buf(&data, 1); - auto acc = buf.get_access(); - (int) acc; // expected-error {{cannot convert}} -} - -// Check that operator dataT returns by value in case of read-accessor -// and by reference in case of write-accessor. -void test2(acc_t<0, sycl::mode::read, sycl::target::host_buffer> acc0, - acc_t<0, sycl::mode::write, sycl::target::global_buffer> acc1, - acc_t<0, sycl::mode::read_write, sycl::target::constant_buffer> acc2, - acc_t<0, sycl::mode::discard_write, sycl::target::local> acc3) { - int val0 = acc0; - int &val0_r = acc0; // expected-error {{cannot bind}} - - int val1 = acc1; - int &val1_r = acc1; - - int val2 = acc2; - int &val2_r = acc2; - - int val3 = acc3; - int &val3_r = acc3; -} - -// Check that operator[](size_t) is defined according to spec. -void test3(acc_t<0, sycl::mode::discard_read_write, sycl::target::host_buffer> acc0, - acc_t<1, sycl::mode::write, sycl::target::global_buffer> acc1, - acc_t<2, sycl::mode::read, sycl::target::constant_buffer> acc2, - acc_t<3, sycl::mode::read_write, sycl::target::local> acc3) { - IdxSz idx; - acc0[idx]; // expected-error {{does not provide a subscript operator}} - acc1[idx]; - acc1[idx] = 1; - acc2[idx][idx]; - acc2[idx][idx] = 2; // expected-error {{expression is not assignable}} - acc3[idx][idx][idx]; - acc3[idx][idx][idx] = 3; -} - -// Check that operator[](id) is not defined if (dimensions == 0 || dimensions != n). 
-void test4(acc_t<0, sycl::mode::read_write, sycl::target::local> acc0, - acc_t<1, sycl::mode::read, sycl::target::host_buffer> acc1, - acc_t<2, sycl::mode::write, sycl::target::global_buffer> acc2, - acc_t<3, sycl::mode::discard_write, sycl::target::constant_buffer> acc3) { - IdxIdAny idx; - acc0[idx]; // expected-error {{does not provide a subscript operator}} - acc1[idx]; - acc2[idx]; - acc3[idx]; - - IdxId1 idx1; - IdxId2 idx2; - IdxId3 idx3; - - acc1[idx1]; - acc1[idx2]; // expected-error {{no viable overloaded operator[]}} - // expected-note@* {{no known conversion from 'IdxId2' to 'id<1>'}} - // expected-note@* {{no known conversion from 'IdxId2' to 'size_t'}} - acc1[idx3]; // expected-error {{no viable overloaded operator[]}} - // expected-note@* {{no known conversion from 'IdxId3' to 'id<1>'}} - // expected-note@* {{no known conversion from 'IdxId3' to 'size_t'}} - - acc2[idx1]; // expected-error {{no viable overloaded operator[]}} - // expected-note@* {{no known conversion from 'IdxId1' to 'id<2>'}} - acc2[idx2]; - acc2[idx3]; // expected-error {{no viable overloaded operator[]}} - // expected-note@* {{no known conversion from 'IdxId3' to 'id<2>'}} - - acc3[idx1]; // expected-error {{no viable overloaded operator[]}} - // expected-note@* {{no known conversion from 'IdxId1' to 'id<3>'}} - acc3[idx2]; // expected-error {{no viable overloaded operator[]}} - // expected-note@* {{no known conversion from 'IdxId2' to 'id<3>'}} - acc3[idx3]; -} - -// Check that operator[] returns values by value if accessMode == mode::read, -// and by reference otherwise. -void test5(acc_t<1, sycl::mode::read, sycl::target::global_buffer> acc1, - acc_t<2, sycl::mode::write, sycl::target::host_buffer> acc2, - acc_t<3, sycl::mode::read_write, sycl::target::local> acc3) { - IdxIdAny idx; - - int val1 = acc1[idx]; - int &val1_r = acc1[idx]; // expected-error {{cannot bind}} - - int val2 = acc2[idx]; - int &val2_r = acc2[idx]; - - int val3 = acc3[idx]; - int &val3_r = acc3[idx]; -} - -// Check get_pointer() method. -void test6(acc_t<1, sycl::mode::read, sycl::target::host_buffer> acc1, - acc_t<2, sycl::mode::write, sycl::target::global_buffer> acc2, - acc_t<3, sycl::mode::read_write, sycl::target::constant_buffer> acc3) { - int *val = acc1.get_pointer(); - acc2.get_pointer(); - acc3.get_pointer(); -} - -// Check that there are two different versions of operator[] if -// (dimensions == 1) and only one if (dimensions > 1). -void test7(acc_t<1, sycl::mode::read_write, sycl::target::host_buffer> acc1, - acc_t<2, sycl::mode::write, sycl::target::global_buffer> acc2, - acc_t<3, sycl::mode::read, sycl::target::constant_buffer> acc3) { - IdxIdSz idx; - acc1[idx]; // expected-error {{use of overloaded operator '[]' is ambiguous}} - // expected-note@* {{candidate function}} - // expected-note@* {{candidate function}} - // expected-note@* {{candidate function}} - // expected-note@* {{candidate function}} - acc2[idx][idx]; // expected-error {{use of overloaded operator '[]' is ambiguous}} - // expected-note@* {{candidate function}} - // expected-note@* {{candidate function}} - // expected-note@* {{candidate function}} - acc3[idx][idx][idx]; // expected-error {{use of overloaded operator '[]' is ambiguous}} - // expected-note@* {{candidate function}} - // expected-note@* {{candidate function}} - // expected-note@* {{candidate function}} - -} - -// Check that there is no operator[] if (dimensions == 0). 
-struct A { - int operator[](size_t x); -}; -template -struct X : acc_t<0, sycl::mode::read, Target>, A {}; -void test8(X acc1, - X acc2, - X acc3, - X acc4) { - acc1[42]; - acc2[42]; - acc3[42]; - acc4[42]; -}; diff --git a/sycl/test/basic_tests/buffer/reinterpret.cpp b/sycl/test/basic_tests/buffer/reinterpret.cpp index 7ffe633bb3cb..2c3fd7213bc6 100644 --- a/sycl/test/basic_tests/buffer/reinterpret.cpp +++ b/sycl/test/basic_tests/buffer/reinterpret.cpp @@ -12,6 +12,8 @@ #include +#include + // This tests verifies basic cases of using cl::sycl::buffer::reinterpret // functionality - changing buffer type and range. This test checks that // original buffer updates when we write to reinterpreted buffer and also checks diff --git a/sycl/test/basic_tests/buffer/subbuffer.cpp b/sycl/test/basic_tests/buffer/subbuffer.cpp index 92b091313b13..df8d02036b2f 100644 --- a/sycl/test/basic_tests/buffer/subbuffer.cpp +++ b/sycl/test/basic_tests/buffer/subbuffer.cpp @@ -29,6 +29,13 @@ int main() { auto Offset = id<2>(1, 1); auto SubRange = range<2>(M - 2, N - 2); queue MyQueue; + + MyQueue.submit([&](handler &cgh) { + auto B = Buffer.get_access(cgh); + cgh.parallel_for( + OrigRange, [=](id<2> Index) { B[Index] = 0; }); + }); + buffer SubBuffer(Buffer, Offset, SubRange); MyQueue.submit([&](handler &cgh) { auto B = SubBuffer.get_access(cgh); diff --git a/sycl/test/fpga_tests/fpga_queue.cpp b/sycl/test/fpga_tests/fpga_queue.cpp index 5c8b86abb96b..60458729e393 100644 --- a/sycl/test/fpga_tests/fpga_queue.cpp +++ b/sycl/test/fpga_tests/fpga_queue.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include #include +#include using namespace cl::sycl; diff --git a/sycl/test/scheduler/Dump.cpp b/sycl/test/scheduler/Dump.cpp deleted file mode 100644 index f372aac545a3..000000000000 --- a/sycl/test/scheduler/Dump.cpp +++ /dev/null @@ -1,38 +0,0 @@ -//==--------------- Dump.cpp - Test SYCL scheduler graph dumping -----------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// RUN: %clang -std=c++11 -g %s -o %t.out -lstdc++ -lOpenCL -lsycl -// RUN: %t.out -// RUN: env SS_DUMP_TEXT=1 %t.out -// RUN: env SS_DUMP_WHOLE_GRAPH=1 %t.out -// RUN: env SS_DUMP_RUN_GRAPH=1 %t.out - -#include - -#include -#include - -using namespace cl::sycl::simple_scheduler; - -int main() { - const bool TextFlag = Scheduler::getInstance().getDumpFlagValue( - Scheduler::DumpOptions::Text); - const bool TextEnv = std::getenv("SS_DUMP_TEXT"); - assert(TextFlag == TextEnv); - - const bool WholeGraphFlag = Scheduler::getInstance().getDumpFlagValue( - Scheduler::DumpOptions::WholeGraph); - const bool WholeGraphEnv = std::getenv("SS_DUMP_WHOLE_GRAPH"); - assert(WholeGraphFlag == WholeGraphEnv); - - const bool RunGraphFlag = Scheduler::getInstance().getDumpFlagValue( - Scheduler::DumpOptions::RunGraph); - const bool RunGraphEnv = std::getenv("SS_DUMP_RUN_GRAPH"); - assert(RunGraphFlag == RunGraphEnv); - - return 0; -} diff --git a/sycl/test/scheduler/parallelReadOpt.cpp b/sycl/test/scheduler/parallelReadOpt.cpp deleted file mode 100644 index e7cfa04b52a5..000000000000 --- a/sycl/test/scheduler/parallelReadOpt.cpp +++ /dev/null @@ -1,92 +0,0 @@ -// RUN: %clang -std=c++11 -g %s -o %t.out -lstdc++ -lOpenCL -lsycl -// RUN: env SYCL_DEVICE_TYPE=HOST %t.out | FileCheck %s -// -// CHECK:Buffer A [[A:.*]] -// CHECK:Evolution of buffer [[A]] -// CHECK-NEXT:ID = [[ALLOCA_A:[0-9]+]] ; ALLOCA ON [[DEVICE_TYPE:.*]] -// CHECK-NEXT: Buf : [[A]] Access : read_write -// CHECK-NEXT: Dependency: -// CHECK-NEXT:ID = [[INIT:[0-9]+]] ; RUN_KERNEL init_kernel ON [[DEVICE_TYPE]] -// CHECK-NEXT: Dependency: -// CHECK-NEXT: Dep on buf [[A]] write from Command ID = {{[0-9]+}} -// CHECK-NEXT:ID = [[READ1:[0-9]+]] ; RUN_KERNEL read1 ON [[DEVICE_TYPE]] -// CHECK-NEXT: Dependency: -// CHECK-DAG: Dep on buf [[B:.*]] write from Command ID = {{[0-9]+}} -// CHECK-DAG: Dep on buf [[A]] read from Command ID = [[INIT]] -// CHECK-NEXT:ID = [[READ2:[0-9]+]] ; RUN_KERNEL read2 ON [[DEVICE_TYPE]] -// CHECK-NEXT: Dependency: -// CHECK-NEXT: Dep on buf [[C:.*]] write from Command ID = {{[0-9]+}} -// CHECK-NEXT: Dep on buf [[A]] read from Command ID = [[INIT]] - -//==---- parallelReadOpt.cpp - SYCL scheduler parallel read test -----------==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-// XFAIL: *
-
-#include "CL/sycl.hpp"
-#include "CL/sycl/detail/scheduler/scheduler.h"
-
-using namespace cl::sycl;
-static constexpr const detail::kernel_param_desc_t kernel_signatures[] = {
-    //--- init_kernel
-    {detail::kernel_param_kind_t::kind_accessor, 2014, 0},
-    //--- read1 and read2
-    {detail::kernel_param_kind_t::kind_accessor, 2014, 0},
-    {detail::kernel_param_kind_t::kind_accessor, 2014, 192}};
-
-int main() {
-  auto M = detail::OSUtil::ExeModuleHandle;
-
-  queue Queue;
-  auto QueueImpl = detail::getSyclObjImpl(Queue);
-  const size_t N = 10;
-
-  buffer A(range<1>{N});
-  buffer B(range<1>{N});
-  buffer C(range<1>{N});
-
-  { // Adding node that requires write access to A
-    simple_scheduler::Node InitNode(QueueImpl);
-    InitNode.template addBufRequirement(
-        *detail::getSyclObjImpl(A));
-    InitNode.addKernel(M, "init_kernel", 1, kernel_signatures, []() {});
-    simple_scheduler::Scheduler::getInstance().addNode(std::move(InitNode));
-  }
-
-  { // Adding node that requires read access to A, write to B
-    simple_scheduler::Node ReadNode1(QueueImpl);
-    ReadNode1.template addBufRequirement(
-        *detail::getSyclObjImpl(A));
-    ReadNode1.template addBufRequirement(
-        *detail::getSyclObjImpl(B));
-    ReadNode1.addKernel(M, "read1", 2, kernel_signatures + 2, []() {});
-    simple_scheduler::Scheduler::getInstance().addNode(std::move(ReadNode1));
-  }
-
-  { // Adding node that requires read access to A, write to C
-    simple_scheduler::Node ReadNode2(QueueImpl);
-    ReadNode2.template addBufRequirement(
-        *detail::getSyclObjImpl(A));
-    ReadNode2.template addBufRequirement(
-        *detail::getSyclObjImpl(C));
-    ReadNode2.addKernel(M, "read2", 2, kernel_signatures + 2, []() {});
-    simple_scheduler::Scheduler::getInstance().addNode(std::move(ReadNode2));
-  }
-
-  std::cout << "Buffer A " << detail::getSyclObjImpl(A).get() << std::endl;
-
-  // Expected that read2 kernel doesn't depend on read1.
-  simple_scheduler::Scheduler::getInstance().parallelReadOpt();
-
-  simple_scheduler::Scheduler::getInstance().dump();
-}
diff --git a/sycl/test/sub_group/barrier.cpp b/sycl/test/sub_group/barrier.cpp
index 3f7d8bd85587..473ff23b74fa 100644
--- a/sycl/test/sub_group/barrier.cpp
+++ b/sycl/test/sub_group/barrier.cpp
@@ -44,10 +44,16 @@ template void check(queue &Queue, size_t G = 240, size_t L = 60) {
       sgsizeacc[0] = SG.get_max_local_range()[0];
     });
   });
-  auto addacc = addbuf.template get_access();
-  auto sgsizeacc = sgsizebuf.get_access();
-  size_t sg_size = sgsizeacc[0];
 
+  // Temporary workaround to avoid waiting for the task while a
+  // still-blocked task is located in the same queue.
+  size_t sg_size = 0;
+  {
+    auto sgsizeacc = sgsizebuf.get_access();
+    sg_size = sgsizeacc[0];
+  }
+
+  auto addacc = addbuf.template get_access();
   int WGid = -1, SGid = 0;
   T add = 0;
   for (int j = 0; j < G; j++) {
diff --git a/sycl/test/sub_group/broadcast.cpp b/sycl/test/sub_group/broadcast.cpp
index f6f8465bb25d..7ba5da112139 100644
--- a/sycl/test/sub_group/broadcast.cpp
+++ b/sycl/test/sub_group/broadcast.cpp
@@ -33,9 +33,16 @@ template void check(queue &Queue) {
       sgsizeacc[0] = SG.get_max_local_range()[0];
     });
   });
+
+  // Temporary workaround to avoid waiting for the task while a
+  // still-blocked task is located in the same queue.
+  size_t sg_size = 0;
+  {
+    auto sgsizeacc = sgsizebuf.get_access();
+    sg_size = sgsizeacc[0];
+  }
+
   auto syclacc = syclbuf.template get_access();
-  auto sgsizeacc = sgsizebuf.get_access();
-  size_t sg_size = sgsizeacc[0];
   if (sg_size == 0)
     sg_size = L;
   int WGid = -1, SGid = 0;
diff --git a/sycl/test/sub_group/common_ocl.cpp b/sycl/test/sub_group/common_ocl.cpp
index 62caa29d0fdb..e1507b901c1c 100644
--- a/sycl/test/sub_group/common_ocl.cpp
+++ b/sycl/test/sub_group/common_ocl.cpp
@@ -3,7 +3,7 @@
 // RUN: %clang -std=c++11 -fsycl %s -o %t.out -lstdc++ -lOpenCL -lsycl
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out %T/kernel_ocl.spv
-// RUN: %GPU_RUN_PLACEHOLDER %t.out %T/kernel_ocl.spv
+// RUNx: %GPU_RUN_PLACEHOLDER %t.out %T/kernel_ocl.spv
 // RUN: %ACC_RUN_PLACEHOLDER %t.out %T/kernel_ocl.spv
 //==--- common_ocl.cpp - basic SG methods in SYCL vs OpenCL ---*- C++ -*---==//
 //
diff --git a/sycl/test/sub_group/load_store.cpp b/sycl/test/sub_group/load_store.cpp
index 3f1e4748173f..ae91bc753dea 100644
--- a/sycl/test/sub_group/load_store.cpp
+++ b/sycl/test/sub_group/load_store.cpp
@@ -57,9 +57,16 @@ template void check(queue &Queue) {
       sgsizeacc[0] = SG.get_max_local_range()[0];
     });
   });
+
+  // Temporary workaround to avoid waiting for the task while a
+  // still-blocked task is located in the same queue.
+  size_t sg_size = 0;
+  {
+    auto sgsizeacc = sgsizebuf.get_access();
+    sg_size = sgsizeacc[0];
+  }
+
   auto acc = syclbuf.template get_access();
-  auto sgsizeacc = sgsizebuf.get_access();
-  size_t sg_size = sgsizeacc[0];
   int WGid = -1, SGid = 0;
   for (int j = 0; j < (G - (sg_size * N)); j++) {
     if (j % L % sg_size == 0) {
@@ -122,9 +129,14 @@ template void check(queue &Queue) {
       SG.store(mp, s);
     });
   });
+  // Temporary workaround to avoid waiting for the task while a
+  // still-blocked task is located in the same queue.
+  size_t sg_size = 0;
+  {
+    auto sgsizeacc = sgsizebuf.get_access();
+    sg_size = sgsizeacc[0];
+  }
   auto acc = syclbuf.template get_access();
-  auto sgsizeacc = sgsizebuf.get_access();
-  size_t sg_size = sgsizeacc[0];
   int WGid = -1, SGid = 0;
   for (int j = 0; j < G; j++) {
     if (j % L % sg_size == 0) {
diff --git a/sycl/test/sub_group/reduce.cpp b/sycl/test/sub_group/reduce.cpp
index 5d78586910d9..bb0ec49f517a 100644
--- a/sycl/test/sub_group/reduce.cpp
+++ b/sycl/test/sub_group/reduce.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang -std=c++11 -fsycl %s -o %t.out -lstdc++ -lOpenCL -lsycl
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUNx: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 //==--------------- reduce.cpp - SYCL sub_group reduce test ----*- C++ -*---==//
 //
diff --git a/sycl/test/sub_group/scan.cpp b/sycl/test/sub_group/scan.cpp
index 683733d7ab27..3c6865e5f596 100644
--- a/sycl/test/sub_group/scan.cpp
+++ b/sycl/test/sub_group/scan.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang -std=c++11 -fsycl %s -o %t.out -lstdc++ -lOpenCL -lsycl
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUNx: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 //==--------------- scan.cpp - SYCL sub_group scan test --------*- C++ -*---==//
 //
diff --git a/sycl/test/sub_group/vote.cpp b/sycl/test/sub_group/vote.cpp
index f3113deed98b..19ad64534811 100644
--- a/sycl/test/sub_group/vote.cpp
+++ b/sycl/test/sub_group/vote.cpp
@@ -1,7 +1,7 @@
 // RUN: %clang -std=c++11 -fsycl %s -o %t.out -lstdc++ -lOpenCL -lsycl
 // RUN: env SYCL_DEVICE_TYPE=HOST %t.out
 // RUN: %CPU_RUN_PLACEHOLDER %t.out
-// RUN: %GPU_RUN_PLACEHOLDER %t.out
+// RUNx: %GPU_RUN_PLACEHOLDER %t.out
 // RUN: %ACC_RUN_PLACEHOLDER %t.out
 //==--------------- vote.cpp - SYCL sub_group vote test --*- C++ -*---------==//
 //
@@ -35,6 +35,15 @@ void check(queue Queue, const int G, const int L, const int D, const int R) {
     });
   });
 
+  Queue.submit([&](handler &cgh) {
+    auto sganyacc = sganybuf.get_access(cgh);
+    auto sgallacc = sgallbuf.get_access(cgh);
+    cgh.parallel_for(NdRange, [=](nd_item<1> NdItem) {
+      sganyacc[NdItem.get_global_id()] = 0;
+      sgallacc[NdItem.get_global_id()] = 0;
+    });
+  });
+
   Queue.submit([&](handler &cgh) {
     auto sganyacc = sganybuf.get_access(cgh);
     auto sgallacc = sgallbuf.get_access(cgh);

From d75ca0104dc02ba73e38b7ce36fb277632599d01 Mon Sep 17 00:00:00 2001
From: vromanov
Date: Tue, 16 Apr 2019 11:12:55 +0300
Subject: [PATCH 6/6] [SYCL] Fix compilation of a SYCL application with the default compiler.

This patch fixes compilation of a SYCL application with the default
compiler (without specifying -fsycl). To do this, a default KernelInfo
implementation is added, plus a check that skips integration header
processing if the KernelInfo specialization (which is supposed to come
from the integration header) is not available.
Signed-off-by: Vlad Romanov
---
 sycl/include/CL/sycl/detail/kernel_desc.hpp | 9 ++++++++-
 sycl/include/CL/sycl/handler2.hpp           | 12 +++++++-----
 2 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/sycl/include/CL/sycl/detail/kernel_desc.hpp b/sycl/include/CL/sycl/detail/kernel_desc.hpp
index 54a76b6773db..c887fea8d844 100644
--- a/sycl/include/CL/sycl/detail/kernel_desc.hpp
+++ b/sycl/include/CL/sycl/detail/kernel_desc.hpp
@@ -36,7 +36,14 @@ struct kernel_param_desc_t {
   int offset;
 };
 
-template <class KernelName> struct KernelInfo;
+template <class KernelName> struct KernelInfo {
+  static constexpr unsigned getNumParams() { return 0; }
+  static const kernel_param_desc_t &getParamDesc(int Idx) {
+    static kernel_param_desc_t Dummy;
+    return Dummy;
+  }
+  static constexpr const char *getName() { return ""; }
+};
 
 } // namespace detail
 } // namespace sycl
diff --git a/sycl/include/CL/sycl/handler2.hpp b/sycl/include/CL/sycl/handler2.hpp
index 2dfc88875741..d7a99bbd1e5b 100644
--- a/sycl/include/CL/sycl/handler2.hpp
+++ b/sycl/include/CL/sycl/handler2.hpp
@@ -513,11 +513,13 @@ class handler {
     using KI = sycl::detail::KernelInfo<KernelName>;
     // Empty name indicates that the compilation happens without integration
     // header, so don't perform things that require it.
-    MArgs.clear();
-    extractArgsAndReqsFromLambda(MHostKernel->getPtr(), KI::getNumParams(),
-                                 &KI::getParamDesc(0));
-    MKernelName = KI::getName();
-    MOSModuleHandle = csd::OSUtil::getOSModuleHandle(KI::getName());
+    if (KI::getName() != "") {
+      MArgs.clear();
+      extractArgsAndReqsFromLambda(MHostKernel->getPtr(), KI::getNumParams(),
+                                   &KI::getParamDesc(0));
+      MKernelName = KI::getName();
+      MOSModuleHandle = csd::OSUtil::getOSModuleHandle(KI::getName());
+    }
   }
 
   // single_task version with a kernel represented as a lambda.
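
For reference, a minimal sketch of how the new default KernelInfo interacts
with the integration header. This is not part of the patch; the kernel name
MyKernel and the specialization body are hypothetical illustrations only.

#include <CL/sycl/detail/kernel_desc.hpp>

class MyKernel; // hypothetical kernel name type used by an application

// When the application is built with -fsycl, the generated integration
// header provides a specialization along these lines, letting the handler
// recover the kernel name and its parameter descriptors:
namespace cl {
namespace sycl {
namespace detail {
template <> struct KernelInfo<MyKernel> {
  static constexpr unsigned getNumParams() { return 1; }
  static const kernel_param_desc_t &getParamDesc(int Idx) {
    // One accessor parameter, in the same aggregate form the tests use.
    static kernel_param_desc_t Desc = {kernel_param_kind_t::kind_accessor,
                                       2014, 0};
    return Desc;
  }
  static constexpr const char *getName() { return "MyKernel"; }
};
} // namespace detail
} // namespace sycl
} // namespace cl

// When the application is built with the default compiler (no -fsycl), no
// specialization exists, so the primary template added by this patch is
// picked instead: getName() returns "" and the handler skips the
// integration-header-only steps, allowing the host-only build to compile.

One caveat worth noting: KI::getName() != "" compares const char * values,
so it relies on the two empty string literals sharing an address (common in
practice, but not guaranteed by the standard); checking
*KI::getName() != '\0' would express the emptiness test more directly.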