From b704d08734298fda4c00ecdac3cca6b8605c4a8d Mon Sep 17 00:00:00 2001
From: "Sarnie, Nick" <nick.sarnie@intel.com>
Date: Tue, 21 Mar 2023 16:03:31 -0400
Subject: [PATCH] [SYCL][ESIMD] Add InlineAsm tests

Signed-off-by: Sarnie, Nick <nick.sarnie@intel.com>
---
 SYCL/ESIMD/InlineAsm/asm_glb.cpp       | 107 ++++++++++++++++++++++
 SYCL/ESIMD/InlineAsm/asm_simd_mask.cpp | 119 +++++++++++++++++++++++++
 SYCL/ESIMD/InlineAsm/asm_simd_view.cpp | 119 +++++++++++++++++++++++++
 SYCL/ESIMD/InlineAsm/asm_vadd.cpp      | 106 ++++++++++++++++++++++
 4 files changed, 451 insertions(+)
 create mode 100644 SYCL/ESIMD/InlineAsm/asm_glb.cpp
 create mode 100644 SYCL/ESIMD/InlineAsm/asm_simd_mask.cpp
 create mode 100644 SYCL/ESIMD/InlineAsm/asm_simd_view.cpp
 create mode 100644 SYCL/ESIMD/InlineAsm/asm_vadd.cpp
diff --git a/SYCL/ESIMD/InlineAsm/asm_glb.cpp b/SYCL/ESIMD/InlineAsm/asm_glb.cpp
new file mode 100644
index 0000000000..37f893260c
--- /dev/null
+++ b/SYCL/ESIMD/InlineAsm/asm_glb.cpp
@@ -0,0 +1,107 @@
+//==---------------- asm_glb.cpp  - DPC++ ESIMD on-device test -------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu
+// UNSUPPORTED: cuda || hip
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+#include "../esimd_test_utils.hpp"
+
+#include <iostream>
+#include <sycl/ext/intel/esimd.hpp>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+using namespace sycl::ext::intel::esimd;
+
+ESIMD_PRIVATE ESIMD_REGISTER(0) simd<float, 16> va;
+
+int main(void) {
+  constexpr unsigned Size = 1024 * 128;
+  constexpr unsigned VL = 16;
+
+  float *A = new float[Size];
+  float *B = new float[Size];
+  float *C = new float[Size];
+
+  for (unsigned i = 0; i < Size; ++i) {
+    A[i] = B[i] = i;
+    C[i] = 0.0f;
+  }
+
+  try {
+    buffer<float, 1> bufa(A, range<1>(Size));
+    buffer<float, 1> bufb(B, range<1>(Size));
+    buffer<float, 1> bufc(C, range<1>(Size));
+
+    // We need that many workgroups
+    range<1> GlobalRange{Size / VL};
+
+    // We need that many threads in each group
+    range<1> LocalRange{1};
+
+    queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+
+    auto dev = q.get_device();
+    std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+
+    auto e = q.submit([&](handler &cgh) {
+      auto PA = bufa.get_access<access::mode::read>(cgh);
+      auto PB = bufb.get_access<access::mode::read>(cgh);
+      auto PC = bufc.get_access<access::mode::write>(cgh);
+      cgh.parallel_for<class Test>(
+          GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
+            using namespace sycl::ext::intel::esimd;
+            unsigned int offset = i * VL * sizeof(float);
+            va.copy_from(PA, offset);
+            simd<float, VL> vb;
+            vb.copy_from(PB, offset);
+            simd<float, VL> vc;
+#ifdef __SYCL_DEVICE_ONLY__
+            __asm__("add (M1, 16) %0 %1 %2"
+                    : "=rw"(vc.data_ref())
+                    : "rw"(va.data()), "rw"(vb.data()));
+#else
+                    vc = va+vb;
+#endif
+            vc.copy_to(PC, offset);
+          });
+    });
+    e.wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+
+    delete[] A;
+    delete[] B;
+    delete[] C;
+    return 1;
+  }
+
+  int err_cnt = 0;
+
+  for (unsigned i = 0; i < Size; ++i) {
+    if (A[i] + B[i] != C[i]) {
+      if (++err_cnt < 10) {
+        std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
+                  << " + " << B[i] << "\n";
+      }
+    }
+  }
+  if (err_cnt > 0) {
+    std::cout << "  pass rate: "
+              << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
+              << (Size - err_cnt) << "/" << Size << ")\n";
+  }
+
+  delete[] A;
+  delete[] B;
+  delete[] C;
+
+  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
+  return err_cnt > 0 ? 1 : 0;
+}
diff --git a/SYCL/ESIMD/InlineAsm/asm_simd_mask.cpp b/SYCL/ESIMD/InlineAsm/asm_simd_mask.cpp
new file mode 100644
index 0000000000..f80ca880bb
--- /dev/null
+++ b/SYCL/ESIMD/InlineAsm/asm_simd_mask.cpp
@@ -0,0 +1,119 @@
+//==---------------- asm_simd_mask.cpp  - DPC++ ESIMD on-device test
+//-------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu
+// UNSUPPORTED: cuda || hip
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+#include "../esimd_test_utils.hpp"
+
+#include <iostream>
+#include <sycl/ext/intel/esimd.hpp>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main(void) {
+  constexpr unsigned Size = 1024 * 128;
+  constexpr unsigned VL = 8;
+
+  float *A = new float[Size];
+  float *B = new float[Size];
+  float *C = new float[Size];
+
+  for (unsigned i = 0; i < Size; ++i) {
+    A[i] = B[i] = i;
+    C[i] = 0.0f;
+  }
+
+  try {
+    buffer<float, 1> bufa(A, range<1>(Size));
+    buffer<float, 1> bufb(B, range<1>(Size));
+    buffer<float, 1> bufc(C, range<1>(Size));
+
+    // We need that many workgroups
+    range<1> GlobalRange{Size / VL};
+
+    // We need that many threads in each group
+    range<1> LocalRange{1};
+
+    queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+
+    auto dev = q.get_device();
+    std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+
+    auto e = q.submit([&](handler &cgh) {
+      auto PA = bufa.get_access<access::mode::read>(cgh);
+      auto PB = bufb.get_access<access::mode::read>(cgh);
+      auto PC = bufc.get_access<access::mode::write>(cgh);
+      cgh.parallel_for<class Test>(
+          GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
+            using namespace sycl::ext::intel::esimd;
+            unsigned int offset = i * VL * sizeof(float);
+            simd<float, VL> va;
+            va.copy_from(PA, offset);
+            simd<float, VL> vb;
+            vb.copy_from(PB, offset);
+            simd<float, VL> vc;
+#ifdef __SYCL_DEVICE_ONLY__
+            simd_mask<VL> m;
+            __asm__("mov (M1, 8) %0 0x1010101:v" : "=rw"(m.data_ref()));
+            __asm__("{\n"
+                    ".decl P1 v_type=P num_elts=8\n"
+                    "mov (M1, 8) %0 0x1:ud\n"
+                    "setp (M1, 8) P1 %3\n"
+                    "(P1) add (M1, 8) %0 %1 %2\n"
+                    "}"
+                    : "=rw"(vc.data_ref())
+                    : "rw"(va.data()), "rw"(vb.data()), "rw"(m.data()));
+#else
+            simd_mask<VL> m({1,0,1,0,1,0,1,0});
+            vc = va+vb;
+            vc.merge(1, !m);
+#endif
+            vc.copy_to(PC, offset);
+          });
+    });
+    e.wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+
+    delete[] A;
+    delete[] B;
+    delete[] C;
+    return 1;
+  }
+
+  int err_cnt = 0;
+
+  for (unsigned i = 0; i < Size; ++i) {
+    if ((i % 2 == 0) && (A[i] + B[i] != C[i])) {
+      if (++err_cnt < 10) {
+        std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
+                  << " + " << B[i] << "\n";
+      }
+    } else if ((i % 2 == 1) && (C[i] != 1)) {
+      if (++err_cnt < 10) {
+        std::cout << "failed at index " << i << ", " << C[i] << " != 1\n";
+      }
+    }
+  }
+  if (err_cnt > 0) {
+    std::cout << "  pass rate: "
+              << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
+              << (Size - err_cnt) << "/" << Size << ")\n";
+  }
+
+  delete[] A;
+  delete[] B;
+  delete[] C;
+
+  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
+  return err_cnt > 0 ? 1 : 0;
+}
diff --git a/SYCL/ESIMD/InlineAsm/asm_simd_view.cpp b/SYCL/ESIMD/InlineAsm/asm_simd_view.cpp
new file mode 100644
index 0000000000..bd480b903e
--- /dev/null
+++ b/SYCL/ESIMD/InlineAsm/asm_simd_view.cpp
@@ -0,0 +1,119 @@
+//==---------------- asm_simd_view.cpp  - DPC++ ESIMD on-device test
+//-------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu
+// UNSUPPORTED: cuda || hip
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+#include "../esimd_test_utils.hpp"
+
+#include <iostream>
+#include <sycl/ext/intel/esimd.hpp>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main(void) {
+  constexpr unsigned Size = 1024 * 128;
+  constexpr unsigned VL = 16;
+
+  float *A = new float[Size];
+  float *B = new float[Size];
+  float *C = new float[Size];
+
+  for (unsigned i = 0; i < Size; ++i) {
+    A[i] = B[i] = i;
+    C[i] = 0.0f;
+  }
+
+  try {
+    buffer<float, 1> bufa(A, range<1>(Size));
+    buffer<float, 1> bufb(B, range<1>(Size));
+    buffer<float, 1> bufc(C, range<1>(Size));
+
+    // We need that many workgroups
+    range<1> GlobalRange{Size / VL};
+
+    // We need that many threads in each group
+    range<1> LocalRange{1};
+
+    queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+
+    auto dev = q.get_device();
+    std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+
+    auto e = q.submit([&](handler &cgh) {
+      auto PA = bufa.get_access<access::mode::read>(cgh);
+      auto PB = bufb.get_access<access::mode::read>(cgh);
+      auto PC = bufc.get_access<access::mode::write>(cgh);
+      cgh.parallel_for<class Test>(
+          GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
+            using namespace sycl::ext::intel::esimd;
+            unsigned int offset = i * VL * sizeof(float);
+            simd<float, VL> va;
+            va.copy_from(PA, offset);
+            simd<float, VL> vb;
+            vb.copy_from(PB, offset);
+#ifdef __SYCL_DEVICE_ONLY__
+            auto va_half1 = va.select<8, 1>();
+            auto va_half2 = va.select<8, 1>(8);
+            auto vb_half1 = vb.select<8, 1>();
+            auto vb_half2 = vb.select<8, 1>(8);
+            simd<float, VL / 2> out1;
+            simd<float, VL / 2> out2;
+            // simd_view is not supported in l-value context in inline asm, so
+            // use simd to store the result
+            __asm__("add (M1, 8) %0 %1 %2"
+                    : "=rw"(out1.data_ref())
+                    : "rw"(va_half1.data()), "rw"(vb_half1.data()));
+            __asm__("add (M1, 8) %0 %1 %2"
+                    : "=rw"(out2.data_ref())
+                    : "rw"(va_half2.data()), "rw"(vb_half2.data()));
+            out1.copy_to(PC, offset);
+            out2.copy_to(PC, offset + ((VL / 2) * sizeof(float)));
+#else
+             simd<float, VL> vc;
+             vc = va+vb;
+             vc.copy_to(PC, offset);
+#endif
+          });
+    });
+    e.wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+
+    delete[] A;
+    delete[] B;
+    delete[] C;
+    return 1;
+  }
+
+  int err_cnt = 0;
+
+  for (unsigned i = 0; i < Size; ++i) {
+    if (A[i] + B[i] != C[i]) {
+      if (++err_cnt < 10) {
+        std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
+                  << " + " << B[i] << "\n";
+      }
+    }
+  }
+  if (err_cnt > 0) {
+    std::cout << "  pass rate: "
+              << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
+              << (Size - err_cnt) << "/" << Size << ")\n";
+  }
+
+  delete[] A;
+  delete[] B;
+  delete[] C;
+
+  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
+  return err_cnt > 0 ? 1 : 0;
+}
diff --git a/SYCL/ESIMD/InlineAsm/asm_vadd.cpp b/SYCL/ESIMD/InlineAsm/asm_vadd.cpp
new file mode 100644
index 0000000000..1516a1d4d1
--- /dev/null
+++ b/SYCL/ESIMD/InlineAsm/asm_vadd.cpp
@@ -0,0 +1,106 @@
+//==---------------- asm_vadd.cpp  - DPC++ ESIMD on-device test
+//-------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// REQUIRES: gpu
+// UNSUPPORTED: cuda || hip
+// RUN: %clangxx -fsycl %s -o %t.out
+// RUN: %GPU_RUN_PLACEHOLDER %t.out
+
+#include "../esimd_test_utils.hpp"
+
+#include <iostream>
+#include <sycl/ext/intel/esimd.hpp>
+#include <sycl/sycl.hpp>
+
+using namespace sycl;
+
+int main(void) {
+  constexpr unsigned Size = 1024 * 128;
+  constexpr unsigned VL = 16;
+
+  float *A = new float[Size];
+  float *B = new float[Size];
+  float *C = new float[Size];
+
+  for (unsigned i = 0; i < Size; ++i) {
+    A[i] = B[i] = i;
+    C[i] = 0.0f;
+  }
+
+  try {
+    buffer<float, 1> bufa(A, range<1>(Size));
+    buffer<float, 1> bufb(B, range<1>(Size));
+    buffer<float, 1> bufc(C, range<1>(Size));
+
+    // We need that many workgroups
+    range<1> GlobalRange{Size / VL};
+
+    // We need that many threads in each group
+    range<1> LocalRange{1};
+
+    queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler());
+
+    auto dev = q.get_device();
+    std::cout << "Running on " << dev.get_info<info::device::name>() << "\n";
+
+    auto e = q.submit([&](handler &cgh) {
+      auto PA = bufa.get_access<access::mode::read>(cgh);
+      auto PB = bufb.get_access<access::mode::read>(cgh);
+      auto PC = bufc.get_access<access::mode::write>(cgh);
+      cgh.parallel_for<class Test>(
+          GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL {
+            using namespace sycl::ext::intel::esimd;
+            unsigned int offset = i * VL * sizeof(float);
+            simd<float, VL> va;
+            va.copy_from(PA, offset);
+            simd<float, VL> vb;
+            vb.copy_from(PB, offset);
+            simd<float, VL> vc;
+#ifdef __SYCL_DEVICE_ONLY__
+            __asm__("add (M1, 16) %0 %1 %2"
+                    : "=rw"(vc.data_ref())
+                    : "rw"(va.data()), "rw"(vb.data()));
+#else
+                    vc = va+vb;
+#endif
+            vc.copy_to(PC, offset);
+          });
+    });
+    e.wait();
+  } catch (sycl::exception const &e) {
+    std::cout << "SYCL exception caught: " << e.what() << '\n';
+
+    delete[] A;
+    delete[] B;
+    delete[] C;
+    return 1;
+  }
+
+  int err_cnt = 0;
+
+  for (unsigned i = 0; i < Size; ++i) {
+    if (A[i] + B[i] != C[i]) {
+      if (++err_cnt < 10) {
+        std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i]
+                  << " + " << B[i] << "\n";
+      }
+    }
+  }
+  if (err_cnt > 0) {
+    std::cout << "  pass rate: "
+              << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% ("
+              << (Size - err_cnt) << "/" << Size << ")\n";
+  }
+
+  delete[] A;
+  delete[] B;
+  delete[] C;
+
+  std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n");
+  return err_cnt > 0 ? 1 : 0;
+}