From b704d08734298fda4c00ecdac3cca6b8605c4a8d Mon Sep 17 00:00:00 2001 From: "Sarnie, Nick" Date: Tue, 21 Mar 2023 16:03:31 -0400 Subject: [PATCH] [SYCL][ESIMD] Add InlineAsm tests Signed-off-by: Sarnie, Nick --- SYCL/ESIMD/InlineAsm/asm_glb.cpp | 107 ++++++++++++++++++++++ SYCL/ESIMD/InlineAsm/asm_simd_mask.cpp | 119 +++++++++++++++++++++++++ SYCL/ESIMD/InlineAsm/asm_simd_view.cpp | 119 +++++++++++++++++++++++++ SYCL/ESIMD/InlineAsm/asm_vadd.cpp | 106 ++++++++++++++++++++++ 4 files changed, 451 insertions(+) create mode 100644 SYCL/ESIMD/InlineAsm/asm_glb.cpp create mode 100644 SYCL/ESIMD/InlineAsm/asm_simd_mask.cpp create mode 100644 SYCL/ESIMD/InlineAsm/asm_simd_view.cpp create mode 100644 SYCL/ESIMD/InlineAsm/asm_vadd.cpp diff --git a/SYCL/ESIMD/InlineAsm/asm_glb.cpp b/SYCL/ESIMD/InlineAsm/asm_glb.cpp new file mode 100644 index 0000000000..37f893260c --- /dev/null +++ b/SYCL/ESIMD/InlineAsm/asm_glb.cpp @@ -0,0 +1,107 @@ +//==---------------- asm_glb.cpp - DPC++ ESIMD on-device test -------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu +// UNSUPPORTED: cuda || hip +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include "../esimd_test_utils.hpp" + +#include +#include +#include + +using namespace sycl; +using namespace sycl::ext::intel::esimd; + +ESIMD_PRIVATE ESIMD_REGISTER(0) simd va; + +int main(void) { + constexpr unsigned Size = 1024 * 128; + constexpr unsigned VL = 16; + + float *A = new float[Size]; + float *B = new float[Size]; + float *C = new float[Size]; + + for (unsigned i = 0; i < Size; ++i) { + A[i] = B[i] = i; + C[i] = 0.0f; + } + + try { + buffer bufa(A, range<1>(Size)); + buffer bufb(B, range<1>(Size)); + buffer bufc(C, range<1>(Size)); + + // We need that many workgroups + range<1> GlobalRange{Size / VL}; + + // We need that many threads in each group + range<1> LocalRange{1}; + + queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + + auto e = q.submit([&](handler &cgh) { + auto PA = bufa.get_access(cgh); + auto PB = bufb.get_access(cgh); + auto PC = bufc.get_access(cgh); + cgh.parallel_for( + GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL { + using namespace sycl::ext::intel::esimd; + unsigned int offset = i * VL * sizeof(float); + va.copy_from(PA, offset); + simd vb; + vb.copy_from(PB, offset); + simd vc; +#ifdef __SYCL_DEVICE_ONLY__ + __asm__("add (M1, 16) %0 %1 %2" + : "=rw"(vc.data_ref()) + : "rw"(va.data()), "rw"(vb.data())); +#else + vc = va+vb; +#endif + vc.copy_to(PC, offset); + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + + delete[] A; + delete[] B; + delete[] C; + return 1; + } + + int err_cnt = 0; + + for (unsigned i = 0; i < Size; ++i) { + if (A[i] + B[i] != C[i]) { + if (++err_cnt < 10) { + std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i] + << " + " << B[i] << "\n"; + } + } + } + if (err_cnt > 0) { + std::cout << " pass rate: " + << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% (" + << (Size - err_cnt) << "/" << Size << ")\n"; + } + + delete[] A; + delete[] B; + delete[] C; + + std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); + return err_cnt > 0 ? 1 : 0; +} diff --git a/SYCL/ESIMD/InlineAsm/asm_simd_mask.cpp b/SYCL/ESIMD/InlineAsm/asm_simd_mask.cpp new file mode 100644 index 0000000000..f80ca880bb --- /dev/null +++ b/SYCL/ESIMD/InlineAsm/asm_simd_mask.cpp @@ -0,0 +1,119 @@ +//==---------------- asm_simd_mask.cpp - DPC++ ESIMD on-device test +//-------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu +// UNSUPPORTED: cuda || hip +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include "../esimd_test_utils.hpp" + +#include +#include +#include + +using namespace sycl; + +int main(void) { + constexpr unsigned Size = 1024 * 128; + constexpr unsigned VL = 8; + + float *A = new float[Size]; + float *B = new float[Size]; + float *C = new float[Size]; + + for (unsigned i = 0; i < Size; ++i) { + A[i] = B[i] = i; + C[i] = 0.0f; + } + + try { + buffer bufa(A, range<1>(Size)); + buffer bufb(B, range<1>(Size)); + buffer bufc(C, range<1>(Size)); + + // We need that many workgroups + range<1> GlobalRange{Size / VL}; + + // We need that many threads in each group + range<1> LocalRange{1}; + + queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + + auto e = q.submit([&](handler &cgh) { + auto PA = bufa.get_access(cgh); + auto PB = bufb.get_access(cgh); + auto PC = bufc.get_access(cgh); + cgh.parallel_for( + GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL { + using namespace sycl::ext::intel::esimd; + unsigned int offset = i * VL * sizeof(float); + simd va; + va.copy_from(PA, offset); + simd vb; + vb.copy_from(PB, offset); + simd vc; +#ifdef __SYCL_DEVICE_ONLY__ + simd_mask m; + __asm__("mov (M1, 8) %0 0x1010101:v" : "=rw"(m.data_ref())); + __asm__("{\n" + ".decl P1 v_type=P num_elts=8\n" + "mov (M1, 8) %0 0x1:ud\n" + "setp (M1, 8) P1 %3\n" + "(P1) add (M1, 8) %0 %1 %2\n" + "}" + : "=rw"(vc.data_ref()) + : "rw"(va.data()), "rw"(vb.data()), "rw"(m.data())); +#else + simd_mask m({1,0,1,0,1,0,1,0}); + vc = va+vb; + vc.merge(1, !m); +#endif + vc.copy_to(PC, offset); + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + + delete[] A; + delete[] B; + delete[] C; + return 1; + } + + int err_cnt = 0; + + for (unsigned i = 0; i < Size; ++i) { + if ((i % 2 == 0) && (A[i] + B[i] != C[i])) { + if (++err_cnt < 10) { + std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i] + << " + " << B[i] << "\n"; + } + } else if ((i % 2 == 1) && (C[i] != 1)) { + if (++err_cnt < 10) { + std::cout << "failed at index " << i << ", " << C[i] << " != 1\n"; + } + } + } + if (err_cnt > 0) { + std::cout << " pass rate: " + << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% (" + << (Size - err_cnt) << "/" << Size << ")\n"; + } + + delete[] A; + delete[] B; + delete[] C; + + std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); + return err_cnt > 0 ? 1 : 0; +} diff --git a/SYCL/ESIMD/InlineAsm/asm_simd_view.cpp b/SYCL/ESIMD/InlineAsm/asm_simd_view.cpp new file mode 100644 index 0000000000..bd480b903e --- /dev/null +++ b/SYCL/ESIMD/InlineAsm/asm_simd_view.cpp @@ -0,0 +1,119 @@ +//==---------------- asm_simd_view.cpp - DPC++ ESIMD on-device test +//-------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu +// UNSUPPORTED: cuda || hip +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include "../esimd_test_utils.hpp" + +#include +#include +#include + +using namespace sycl; + +int main(void) { + constexpr unsigned Size = 1024 * 128; + constexpr unsigned VL = 16; + + float *A = new float[Size]; + float *B = new float[Size]; + float *C = new float[Size]; + + for (unsigned i = 0; i < Size; ++i) { + A[i] = B[i] = i; + C[i] = 0.0f; + } + + try { + buffer bufa(A, range<1>(Size)); + buffer bufb(B, range<1>(Size)); + buffer bufc(C, range<1>(Size)); + + // We need that many workgroups + range<1> GlobalRange{Size / VL}; + + // We need that many threads in each group + range<1> LocalRange{1}; + + queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + + auto e = q.submit([&](handler &cgh) { + auto PA = bufa.get_access(cgh); + auto PB = bufb.get_access(cgh); + auto PC = bufc.get_access(cgh); + cgh.parallel_for( + GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL { + using namespace sycl::ext::intel::esimd; + unsigned int offset = i * VL * sizeof(float); + simd va; + va.copy_from(PA, offset); + simd vb; + vb.copy_from(PB, offset); +#ifdef __SYCL_DEVICE_ONLY__ + auto va_half1 = va.select<8, 1>(); + auto va_half2 = va.select<8, 1>(8); + auto vb_half1 = vb.select<8, 1>(); + auto vb_half2 = vb.select<8, 1>(8); + simd out1; + simd out2; + // simd_view is not supported in l-value context in inline asm, so + // use simd to store the result + __asm__("add (M1, 8) %0 %1 %2" + : "=rw"(out1.data_ref()) + : "rw"(va_half1.data()), "rw"(vb_half1.data())); + __asm__("add (M1, 8) %0 %1 %2" + : "=rw"(out2.data_ref()) + : "rw"(va_half2.data()), "rw"(vb_half2.data())); + out1.copy_to(PC, offset); + out2.copy_to(PC, offset + ((VL / 2) * sizeof(float))); +#else + simd vc; + vc = va+vb; + vc.copy_to(PC, offset); +#endif + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + + delete[] A; + delete[] B; + delete[] C; + return 1; + } + + int err_cnt = 0; + + for (unsigned i = 0; i < Size; ++i) { + if (A[i] + B[i] != C[i]) { + if (++err_cnt < 10) { + std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i] + << " + " << B[i] << "\n"; + } + } + } + if (err_cnt > 0) { + std::cout << " pass rate: " + << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% (" + << (Size - err_cnt) << "/" << Size << ")\n"; + } + + delete[] A; + delete[] B; + delete[] C; + + std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); + return err_cnt > 0 ? 1 : 0; +} diff --git a/SYCL/ESIMD/InlineAsm/asm_vadd.cpp b/SYCL/ESIMD/InlineAsm/asm_vadd.cpp new file mode 100644 index 0000000000..1516a1d4d1 --- /dev/null +++ b/SYCL/ESIMD/InlineAsm/asm_vadd.cpp @@ -0,0 +1,106 @@ +//==---------------- asm_vadd.cpp - DPC++ ESIMD on-device test +//-------------==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: gpu +// UNSUPPORTED: cuda || hip +// RUN: %clangxx -fsycl %s -o %t.out +// RUN: %GPU_RUN_PLACEHOLDER %t.out + +#include "../esimd_test_utils.hpp" + +#include +#include +#include + +using namespace sycl; + +int main(void) { + constexpr unsigned Size = 1024 * 128; + constexpr unsigned VL = 16; + + float *A = new float[Size]; + float *B = new float[Size]; + float *C = new float[Size]; + + for (unsigned i = 0; i < Size; ++i) { + A[i] = B[i] = i; + C[i] = 0.0f; + } + + try { + buffer bufa(A, range<1>(Size)); + buffer bufb(B, range<1>(Size)); + buffer bufc(C, range<1>(Size)); + + // We need that many workgroups + range<1> GlobalRange{Size / VL}; + + // We need that many threads in each group + range<1> LocalRange{1}; + + queue q(esimd_test::ESIMDSelector, esimd_test::createExceptionHandler()); + + auto dev = q.get_device(); + std::cout << "Running on " << dev.get_info() << "\n"; + + auto e = q.submit([&](handler &cgh) { + auto PA = bufa.get_access(cgh); + auto PB = bufb.get_access(cgh); + auto PC = bufc.get_access(cgh); + cgh.parallel_for( + GlobalRange * LocalRange, [=](id<1> i) SYCL_ESIMD_KERNEL { + using namespace sycl::ext::intel::esimd; + unsigned int offset = i * VL * sizeof(float); + simd va; + va.copy_from(PA, offset); + simd vb; + vb.copy_from(PB, offset); + simd vc; +#ifdef __SYCL_DEVICE_ONLY__ + __asm__("add (M1, 16) %0 %1 %2" + : "=rw"(vc.data_ref()) + : "rw"(va.data()), "rw"(vb.data())); +#else + vc = va+vb; +#endif + vc.copy_to(PC, offset); + }); + }); + e.wait(); + } catch (sycl::exception const &e) { + std::cout << "SYCL exception caught: " << e.what() << '\n'; + + delete[] A; + delete[] B; + delete[] C; + return 1; + } + + int err_cnt = 0; + + for (unsigned i = 0; i < Size; ++i) { + if (A[i] + B[i] != C[i]) { + if (++err_cnt < 10) { + std::cout << "failed at index " << i << ", " << C[i] << " != " << A[i] + << " + " << B[i] << "\n"; + } + } + } + if (err_cnt > 0) { + std::cout << " pass rate: " + << ((float)(Size - err_cnt) / (float)Size) * 100.0f << "% (" + << (Size - err_cnt) << "/" << Size << ")\n"; + } + + delete[] A; + delete[] B; + delete[] C; + + std::cout << (err_cnt > 0 ? "FAILED\n" : "Passed\n"); + return err_cnt > 0 ? 1 : 0; +}