From a48f153c72eb24f57434a13adbcb7690f3d3a776 Mon Sep 17 00:00:00 2001
From: Alex McCaskey <amccaskey@nvidia.com>
Date: Fri, 21 Apr 2023 07:55:56 -0400
Subject: [PATCH] Remove qpud (#91)

* Remove qpud
---------

Signed-off-by: Alex McCaskey <amccaskey@nvidia.com>
---
 .gitmodules                                   |   3 -
 CMakeLists.txt                                |   8 -
 NOTICE                                        |   9 -
 Overview.md                                   |  16 -
 docs/sphinx/using/advanced/cudaq_ir.rst       |   2 +-
 python/utils/LinkedLibraryHolder.cpp          |   2 -
 runtime/CMakeLists.txt                        |   1 -
 runtime/cudaq.h                               |   3 +-
 runtime/cudaq/platform/CMakeLists.txt         |   1 -
 .../default/DefaultQuantumPlatform.cpp        |   4 +-
 .../platform/default/rest/RemoteRESTQPU.cpp   |   4 +-
 runtime/cudaq/platform/qpud/CMakeLists.txt    |  26 -
 .../qpud/DefaultQPUDQuantumPlatform.cpp       | 132 ----
 runtime/cudaq/platform/quantum_platform.h     |   2 +-
 runtime/qpud-client/CMakeLists.txt            |  26 -
 runtime/qpud-client/qpud_client.cpp           | 454 -------------
 runtime/qpud-client/qpud_client.h             | 337 ----------
 test/NVQPP/auto_kernel.cpp                    |   3 +-
 test/NVQPP/testIQPE_Conditionals.cpp          |   1 -
 test/NVQPP/testQPUDObserve.cpp                |  76 ---
 test/NVQPP/testQPUDSample.cpp                 |  31 -
 tools/CMakeLists.txt                          |   3 -
 tools/nvqpp/nvq++.in                          |   6 +-
 tools/qpud/CMakeLists.txt                     |  63 --
 tools/qpud/backends/CMakeLists.txt            |  10 -
 tools/qpud/backends/TargetBackend.cpp         | 224 -------
 tools/qpud/backends/TargetBackend.h           | 139 ----
 tools/qpud/backends/default/CMakeLists.txt    |   9 -
 .../qpud/backends/default/DefaultBackend.cpp  | 258 --------
 tools/qpud/jit/CMakeLists.txt                 |   9 -
 tools/qpud/jit/KernelJIT.cpp                  |  69 --
 tools/qpud/jit/KernelJIT.h                    |  77 ---
 tools/qpud/qpud.cpp                           | 419 ------------
 tools/qpud/utils/CMakeLists.txt               |  17 -
 tools/qpud/utils/FakeNvidiaPlatformHelper.cpp |  17 -
 tools/qpud/utils/NvidiaPlatformHelper.cu      |  40 --
 tools/qpud/utils/NvidiaPlatformHelper.h       |  33 -
 tpls/rpclib                                   |   1 -
 unittests/CMakeLists.txt                      |  12 -
 unittests/Optimizer/CMakeLists.txt            |   1 -
 unittests/Optimizer/QuakeSynthTester.cpp      |   2 +-
 unittests/qpud_client/QPUDClientTester.cpp    | 594 ------------------
 42 files changed, 9 insertions(+), 3135 deletions(-)
 delete mode 100644 runtime/cudaq/platform/qpud/CMakeLists.txt
 delete mode 100644 runtime/cudaq/platform/qpud/DefaultQPUDQuantumPlatform.cpp
 delete mode 100644 runtime/qpud-client/CMakeLists.txt
 delete mode 100644 runtime/qpud-client/qpud_client.cpp
 delete mode 100644 runtime/qpud-client/qpud_client.h
 delete mode 100644 test/NVQPP/testQPUDObserve.cpp
 delete mode 100644 test/NVQPP/testQPUDSample.cpp
 delete mode 100644 tools/qpud/CMakeLists.txt
 delete mode 100644 tools/qpud/backends/CMakeLists.txt
 delete mode 100644 tools/qpud/backends/TargetBackend.cpp
 delete mode 100644 tools/qpud/backends/TargetBackend.h
 delete mode 100644 tools/qpud/backends/default/CMakeLists.txt
 delete mode 100644 tools/qpud/backends/default/DefaultBackend.cpp
 delete mode 100644 tools/qpud/jit/CMakeLists.txt
 delete mode 100644 tools/qpud/jit/KernelJIT.cpp
 delete mode 100644 tools/qpud/jit/KernelJIT.h
 delete mode 100644 tools/qpud/qpud.cpp
 delete mode 100644 tools/qpud/utils/CMakeLists.txt
 delete mode 100644 tools/qpud/utils/FakeNvidiaPlatformHelper.cpp
 delete mode 100644 tools/qpud/utils/NvidiaPlatformHelper.cu
 delete mode 100644 tools/qpud/utils/NvidiaPlatformHelper.h
 delete mode 160000 tpls/rpclib
 delete mode 100644 unittests/qpud_client/QPUDClientTester.cpp

diff --git a/.gitmodules b/.gitmodules
index effaefb20b..c834f0ceaf 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,9 +4,6 @@
 [submodule "tpls/googletest-src"]
 	path = tpls/googletest-src
 	url = https://github.com/google/googletest
-[submodule "tpls/rpclib"]
-	path = tpls/rpclib
-	url = https://github.com/rpclib/rpclib
 [submodule "tpls/llvm"]
 	path = tpls/llvm
 	url = https://github.com/llvm/llvm-project.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
index cc78a4e4bc..e586759666 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -312,14 +312,6 @@ add_subdirectory(tpls/spdlog)
 set_property(TARGET spdlog APPEND PROPERTY INTERFACE_SYSTEM_INCLUDE_DIRECTORIES)
 target_compile_options(spdlog PRIVATE -Wno-covered-switch-default)
 
-# May want to provide a hook to this
-if (CUDAQ_ENABLE_RPC_LOGGING)
-  set(RPCLIB_ENABLE_LOGGING TRUE)
-endif()
-set(RPCLIB_EXTRA_BUILD_FLAGS "-Wno-suggest-override;-Wno-documentation;-Wno-unused-variable;-Wno-unused-but-set-variable;-Wno-unsafe-buffer-usage")
-add_subdirectory(tpls/rpclib EXCLUDE_FROM_ALL)
-set_property(TARGET rpc APPEND PROPERTY INTERFACE_SYSTEM_INCLUDE_DIRECTORIES)
-
 # Check for CUDA Support
 # ==============================================================================
 include(CheckLanguage)
diff --git a/NOTICE b/NOTICE
index a6702177bb..6f4f98b29e 100644
--- a/NOTICE
+++ b/NOTICE
@@ -70,15 +70,6 @@ License at <https://github.com/fmtlib/fmt/blob/master/LICENSE.rst>
 
 ----------------------------------------------------------------
 
-RPCLib - MIT License
-<https://github.com/rpclib/rpclib>
-
-Originally developed by Tamas Szelei.
-The incorporated source code and its license can be found as a submodule on the CUDA Quantum repository.
-License at <https://github.com/rpclib/rpclib/blob/master/LICENSE.md>
-
-----------------------------------------------------------------
-
 SPDLog - MIT License
 <https://github.com/gabime/spdlog>
 
diff --git a/Overview.md b/Overview.md
index 5d4ea10d63..1dab1fad93 100644
--- a/Overview.md
+++ b/Overview.md
@@ -128,22 +128,6 @@ product terms.
 This library defines the `quantum_platform` architecture, enabling CUDA Quantum
 to target both simulated and physical quantum computing architectures.
 
-### `runtime/qpud_client`
-
-This folder contains a client library for interacting with the remote `qpud`
-process daemon.
-
-### `tools/qpud`
-
-This folder implements the `qpud` executable. `qpud` is meant to serve as a
-separate daemon process that emulates the true host, classical driver cpu,
-quantum register architectural separation. It implements a client/server model
-and accepts Quake code, JIT compiles it, and enables its execution on local
-emulators, or physical remote vendor quantum computers.
-
-This folder contains a TargetBackend type with specializations that target the
-nvqir, quantinuum, and rigetti backends.
-
 ### `tools/cudaq-quake`
 
 This folder contains the implementation of the `cudaq-quake` tool. This tool
diff --git a/docs/sphinx/using/advanced/cudaq_ir.rst b/docs/sphinx/using/advanced/cudaq_ir.rst
index 9eea844e6d..754c17906c 100644
--- a/docs/sphinx/using/advanced/cudaq_ir.rst
+++ b/docs/sphinx/using/advanced/cudaq_ir.rst
@@ -32,7 +32,7 @@ saved to file :code:`simple.cpp`, we see the following output from :code:`nvq++`
   llc --relocation-model=pic --filetype=obj -O2 simple.ll.p3De4L -o simple.qke.o
   llc --relocation-model=pic --filetype=obj -O2 simple.ll -o simple.classic.o
   clang++ -L/usr/lib/gcc/x86_64-linux-gnu/11 -L/usr/lib64 -L/lib/x86_64-linux-gnu -L/lib64 -L/usr/lib/x86_64-linux-gnu -L/lib -L/usr/lib -L/usr/local/cuda/lib64/stubs -r simple.qke.o simple.classic.o -o simple.o
-  clang++ -Wl,-rpath,lib -Llib -L/usr/lib/gcc/x86_64-linux-gnu/11 -L/usr/lib64 -L/lib/x86_64-linux-gnu -L/lib64 -L/usr/lib/x86_64-linux-gnu -L/lib -L/usr/lib -L/usr/local/cuda/lib64/stubs simple.o -lcudaq -lcudaq-common -lcudaq-mlir-runtime -lcudaq-builder -lcudaq-ensmallen -lcudaq-nlopt -lcudaq-spin -lcudaq-qpud-client -lcudaq-em-qir -lcudaq-platform-default -lnvqir -lnvqir-qpp
+  clang++ -Wl,-rpath,lib -Llib -L/usr/lib/gcc/x86_64-linux-gnu/11 -L/usr/lib64 -L/lib/x86_64-linux-gnu -L/lib64 -L/usr/lib/x86_64-linux-gnu -L/lib -L/usr/lib -L/usr/local/cuda/lib64/stubs simple.o -lcudaq -lcudaq-common -lcudaq-mlir-runtime -lcudaq-builder -lcudaq-ensmallen -lcudaq-nlopt -lcudaq-spin -lcudaq-em-qir -lcudaq-platform-default -lnvqir -lnvqir-qpp
 
 The workflow orchestrated above is best visualized in the following figure. 
 
diff --git a/python/utils/LinkedLibraryHolder.cpp b/python/utils/LinkedLibraryHolder.cpp
index b522e55add..099aeb9109 100644
--- a/python/utils/LinkedLibraryHolder.cpp
+++ b/python/utils/LinkedLibraryHolder.cpp
@@ -223,8 +223,6 @@ void LinkedLibraryHolder::setPlatform(
     const std::string &name, std::map<std::string, std::string> config) {
 
   std::string mutableName = name;
-  if (name == "qpud")
-    mutableName = "default-qpud";
 
   // need to set qpu to cuquantum for mqpu
   if (name == "mqpu")
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 9dc8a032b1..af84b93ce5 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -11,7 +11,6 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib)
 add_subdirectory(common)
 add_subdirectory(nvqir)
 add_subdirectory(cudaq)
-add_subdirectory(qpud-client)
 
 # Install
 # ==============================================================================
diff --git a/runtime/cudaq.h b/runtime/cudaq.h
index 2b501e30b0..994a5ca816 100644
--- a/runtime/cudaq.h
+++ b/runtime/cudaq.h
@@ -176,8 +176,7 @@ KernelArgsCreator getArgsCreator(const std::string &kernelName);
 /// @return
 bool kernelHasConditionalFeedback(const std::string &kernelName);
 
-/// @brief Provide a hook to set the remote qpud target backend.
-/// @param backend
+/// @brief Provide a hook to set the target backend.
 void set_qpu_backend(const char *backend);
 
 /// @brief Utility function for setting the shots on the platform
diff --git a/runtime/cudaq/platform/CMakeLists.txt b/runtime/cudaq/platform/CMakeLists.txt
index 804ac4b37a..02e034a90d 100644
--- a/runtime/cudaq/platform/CMakeLists.txt
+++ b/runtime/cudaq/platform/CMakeLists.txt
@@ -7,7 +7,6 @@
 # ============================================================================ #
 
 add_subdirectory(default)
-add_subdirectory(qpud)
 if (CUDA_FOUND AND CUSTATEVEC_ROOT)
   add_subdirectory(mqpu)
 endif()
\ No newline at end of file
diff --git a/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp b/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp
index c99d167f7c..e12610fc1d 100644
--- a/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp
+++ b/runtime/cudaq/platform/default/DefaultQuantumPlatform.cpp
@@ -23,9 +23,7 @@ LLVM_INSTANTIATE_REGISTRY(cudaq::QPU::RegistryType)
 
 namespace {
 /// The DefaultQPU models a simulated QPU by specifically
-/// targeting the QIS ExecutionManager. This QPU is meant
-/// to be used in Library Mode (no qpud daemon or remote
-/// physical QPU invocation)
+/// targeting the QIS ExecutionManager.
 class DefaultQPU : public cudaq::QPU {
 public:
   DefaultQPU() = default;
diff --git a/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp b/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp
index b95688ec4f..f3f782cfaf 100644
--- a/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp
+++ b/runtime/cudaq/platform/default/rest/RemoteRESTQPU.cpp
@@ -116,11 +116,11 @@ class RemoteRESTQPU : public cudaq::QPU {
     execution_queue->enqueue(task);
   }
 
-  /// @brief Ask qpud if the current backend is a simulator
+  /// @brief Return true if the current backend is a simulator
   /// @return
   bool isSimulator() override { return false; }
 
-  /// @brief Ask qpud if the current backend supports conditional feedback
+  /// @brief Return true if the current backend supports conditional feedback
   bool supportsConditionalFeedback() override { return false; }
 
   /// Provide the number of shots
diff --git a/runtime/cudaq/platform/qpud/CMakeLists.txt b/runtime/cudaq/platform/qpud/CMakeLists.txt
deleted file mode 100644
index 1e737e0a27..0000000000
--- a/runtime/cudaq/platform/qpud/CMakeLists.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under     #
-# the terms of the Apache License 2.0 which accompanies this distribution.     #
-# ============================================================================ #
-
-set(LIBRARY_NAME cudaq-platform-default-qpud)
-set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ctad-maybe-unsupported")
-set(INTERFACE_POSITION_INDEPENDENT_CODE ON)
-
-set(CUDAQ_DEFAULTPLATFORM_SRC
-  DefaultQPUDQuantumPlatform.cpp
-  ../common/QuantumExecutionQueue.cpp
-)
-
-add_library(${LIBRARY_NAME} SHARED ${CUDAQ_DEFAULTPLATFORM_SRC})
-target_include_directories(${LIBRARY_NAME}
-  PUBLIC . ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/runtime )
-target_link_libraries(${LIBRARY_NAME}
-  PUBLIC fmt::fmt-header-only cudaq-qpud-client PRIVATE fmt::fmt-header-only rpc LLVMSupport)
-
-cudaq_library_set_rpath(${LIBRARY_NAME})
-
-install(TARGETS ${LIBRARY_NAME} DESTINATION lib)
diff --git a/runtime/cudaq/platform/qpud/DefaultQPUDQuantumPlatform.cpp b/runtime/cudaq/platform/qpud/DefaultQPUDQuantumPlatform.cpp
deleted file mode 100644
index 7618b84e08..0000000000
--- a/runtime/cudaq/platform/qpud/DefaultQPUDQuantumPlatform.cpp
+++ /dev/null
@@ -1,132 +0,0 @@
-/*************************************************************** -*- C++ -*- ***
- * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- *******************************************************************************/
-
-#ifdef __GNUC__
-#pragma GCC diagnostic ignored "-Wsuggest-override"
-#endif
-#include "common/ExecutionContext.h"
-#include "cudaq/platform/qpu.h"
-#include "cudaq/platform/quantum_platform.h"
-#include "nvqpp_config.h"
-#include "qpud_client.h"
-#include "rpc/client.h"
-#include "llvm/Support/Program.h"
-#include <cudaq/spin_op.h>
-#include <fmt/core.h>
-#include <iostream>
-#include <netinet/in.h>
-#include <sys/socket.h>
-#include <sys/types.h>
-
-/// This file defines the qpud default platform. It is meant to be
-/// used in conjuction with the --enable-mlir flag of nvq++. It takes
-/// kernel invocations which invoke altLaunchKernel and forwards them to
-/// the remote qpud daemon for execution.
-
-namespace {
-
-/// The QPUD QPU is a QPU that enables kernel invocation via
-/// remote process calls to the qpud daemon. It's job is to connect to
-/// the remote daemon (or start it if one is not specified), and forward
-/// all calls to launchKernel to the daemon via the rpc client.
-class QpudQPU : public cudaq::QPU {
-protected:
-  /// @brief The QPUD Client, enables kernel launches
-  cudaq::qpud_client qpudClient;
-
-  /// The number of shots
-  std::optional<int> nShots;
-
-public:
-  QpudQPU() : QPU() {}
-  QpudQPU(QpudQPU &&) = delete;
-  virtual ~QpudQPU() = default;
-
-  void enqueue(cudaq::QuantumTask &task) override {
-    execution_queue->enqueue(task);
-  }
-
-  /// @brief Ask qpud if the current backend is a simulator
-  /// @return
-  bool isSimulator() override { return qpudClient.is_simulator(); }
-
-  /// @brief Ask qpud if the current backend supports conditional feedback
-  bool supportsConditionalFeedback() override {
-    return qpudClient.supports_conditional_feedback();
-  }
-
-  /// Provide the number of shots
-  void setShots(int _nShots) override { nShots = _nShots; }
-
-  /// Clear the number of shots
-  void clearShots() override { nShots = std::nullopt; }
-
-  /// Store the execution context for launchKernel
-  void setExecutionContext(cudaq::ExecutionContext *context) override {
-    executionContext = context;
-  }
-
-  /// Reset the execution context
-  void resetExecutionContext() override {
-    // do nothing here
-    executionContext = nullptr;
-  }
-
-  void setTargetBackend(const std::string &backend) override {
-    qpudClient.set_backend(backend);
-  }
-
-  /// Launch the kernel with given name and runtime arguments.
-  void launchKernel(const std::string &kernelName, void (*kernelFunc)(void *),
-                    void *args, std::uint64_t voidStarSize,
-                    std::uint64_t resultOffset) override {
-    // Execute based on the context...
-    if (executionContext &&
-        executionContext->name.find("sample") != std::string::npos) {
-      // Sample the state generated by the quake code
-      executionContext->result = qpudClient.sample(
-          kernelName, nShots.value_or(1000), args, voidStarSize);
-    } else if (executionContext && executionContext->name == "observe") {
-      // Observe the state with respect to the given operator
-      if (!executionContext->spin.has_value())
-        throw std::runtime_error(
-            "Observe ExecutionContext specified without a cudaq::spin_op.");
-      auto H = *executionContext->spin.value();
-      auto res = qpudClient.observe(kernelName, H, args, voidStarSize,
-                                    (std::size_t)nShots.value_or(0));
-      executionContext->expectationValue = res.exp_val_z();
-      executionContext->result = res.raw_data();
-    } else {
-      // Just execute the kernel
-      qpudClient.execute(kernelName, args, voidStarSize, resultOffset);
-    }
-  }
-};
-
-class DefaultQPUDQuantumPlatform : public cudaq::quantum_platform {
-public:
-  DefaultQPUDQuantumPlatform() : quantum_platform() {
-    // Populate the information and add the QPUs
-    platformQPUs.emplace_back(std::make_unique<QpudQPU>());
-    platformNumQPUs = platformQPUs.size();
-  }
-
-  /// @brief Set the target backend on the remote qpud process.
-  /// @param backend
-  void setTargetBackend(const std::string &backend) override {
-    platformQPUs.front()->setTargetBackend(backend);
-  }
-
-  void set_shots(int numShots) override {
-    cudaq::quantum_platform::set_shots(numShots);
-    platformQPUs.back()->setShots(numShots);
-  }
-};
-} // namespace
-
-CUDAQ_REGISTER_PLATFORM(DefaultQPUDQuantumPlatform, qpud)
diff --git a/runtime/cudaq/platform/quantum_platform.h b/runtime/cudaq/platform/quantum_platform.h
index c8f078e793..9d4c9b0a55 100644
--- a/runtime/cudaq/platform/quantum_platform.h
+++ b/runtime/cudaq/platform/quantum_platform.h
@@ -104,7 +104,7 @@ class quantum_platform {
   // enqueueAsyncObserveTask(const std::size_t qpu_id, ObserveTask &t);
 
   // This method is the hook for the kernel rewrites to invoke
-  // quantum kernels on the asynchronously executing qpud daemon process.
+  // quantum kernels.
   void launchKernel(std::string kernelName, void (*kernelFunc)(void *),
                     void *args, std::uint64_t voidStarSize,
                     std::uint64_t resultOffset);
diff --git a/runtime/qpud-client/CMakeLists.txt b/runtime/qpud-client/CMakeLists.txt
deleted file mode 100644
index cf704ed5c8..0000000000
--- a/runtime/qpud-client/CMakeLists.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under     #
-# the terms of the Apache License 2.0 which accompanies this distribution.     #
-# ============================================================================ #
-
-set(LIBRARY_NAME cudaq-qpud-client)
-set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ctad-maybe-unsupported")
-set(INTERFACE_POSITION_INDEPENDENT_CODE ON)
-
-set(SRC
-  qpud_client.cpp
-)
-
-add_library(${LIBRARY_NAME} SHARED ${SRC})
-target_include_directories(${LIBRARY_NAME}
-  PUBLIC . ${CMAKE_BINARY_DIR} ${CMAKE_SOURCE_DIR}/runtime )
-target_link_libraries(${LIBRARY_NAME}
-  PUBLIC fmt::fmt-header-only rpc LLVMSupport cudaq)
-
-cudaq_library_set_rpath(${LIBRARY_NAME})
-
-install (FILES qpud_client.h DESTINATION include/cudaq)
-install(TARGETS ${LIBRARY_NAME} DESTINATION lib)
diff --git a/runtime/qpud-client/qpud_client.cpp b/runtime/qpud-client/qpud_client.cpp
deleted file mode 100644
index 82e92a6807..0000000000
--- a/runtime/qpud-client/qpud_client.cpp
+++ /dev/null
@@ -1,454 +0,0 @@
-/*************************************************************** -*- C++ -*- ***
- * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- *******************************************************************************/
-
-#ifdef __GNUC__
-#pragma GCC diagnostic ignored "-Wsuggest-override"
-#endif
-#include "qpud_client.h"
-#include "nlohmann/json.hpp"
-#include "rpc/client.h"
-#include "rpc/rpc_error.h"
-#include "llvm/Support/Program.h"
-
-#include <fmt/core.h>
-#include <fstream>
-#include <iostream>
-#include <random>
-
-#if defined(__APPLE__) && defined(__MACH__)
-#include <mach-o/dyld.h>
-#else
-#include <link.h>
-#endif
-
-namespace cudaq {
-std::string get_quake_by_name(const std::string &);
-static std::unique_ptr<qpud_client> qpudClient = nullptr;
-
-qpud_client &get_qpud_client() {
-  if (!qpudClient)
-    qpudClient = std::make_unique<qpud_client>();
-  return *qpudClient.get();
-}
-
-/// @brief Simple struct for extracting the library
-/// names this executable is linked to
-struct NVQIRLibraryData {
-  std::string path;
-};
-
-// We need to get the NVQIR Backend Library that we are linked to
-// so we can give that information to the qpud process.
-#if defined(__APPLE__) && defined(__MACH__)
-// https://stackoverflow.com/questions/10009043/dl-iterate-phdr-equivalent-on-mac
-static void getNVQIRLibraryPath(NVQIRLibraryData *data) {
-  auto nLibs = _dyld_image_count();
-  for (uint32_t i = 0; i < nLibs; i++) {
-    auto ptr = _dyld_get_image_name(i);
-    std::string libName(ptr);
-    if (libName.find("nvqir-") != std::string::npos) {
-      auto casted = static_cast<NVQIRLibraryData *>(data);
-      casted->path = std::string(ptr);
-    }
-  }
-}
-#else
-/// @brief Extract the NVQIR backend library path
-static int getNVQIRLibraryPath(struct dl_phdr_info *info, size_t size,
-                               void *data) {
-  std::string libraryName(info->dlpi_name);
-  if (libraryName.find("nvqir-") != std::string::npos) {
-    auto casted = static_cast<NVQIRLibraryData *>(data);
-    casted->path = std::string(info->dlpi_name);
-  }
-  return 0;
-}
-#endif
-
-/// @brief Pointer to captured runtime arguments
-std::unique_ptr<std::vector<std::string>> capturedArgs;
-/// @brief Capture any user-provided runtime args so we can forward to QPUD
-/// @param argc
-/// @param argv
-/// @param
-void captureHostCommandLineArgs(int argc, char **argv, char **) {
-  capturedArgs =
-      std::make_unique<std::vector<std::string>>(argv + 1, argv + argc);
-}
-
-#if defined(__APPLE__) && defined(__MACH__)
-#define INIT_ARRAY section("__DATA, __mod_init_func")
-#else
-#define INIT_ARRAY section(".init_array")
-#endif
-
-[[maybe_unused]] __attribute__((INIT_ARRAY)) typeof(captureHostCommandLineArgs)
-    *__captureHostCommandLineArgs = captureHostCommandLineArgs;
-
-#undef INIT_ARRAY
-
-/// @brief Invoke the call, catch any exceptions, stop the server if we hit an
-/// error
-template <typename... Args>
-auto invokeCall(rpc::client *client, const std::string &functionName,
-                Args &...args) {
-  try {
-    // Load the quake code to the QPU
-    return client->call(functionName, args...);
-  } catch (rpc::rpc_error &e) {
-    client->call("stopServer");
-    std::string msg = "[qpud::" + e.get_function_name() + "] " +
-                      e.get_error().as<std::string>();
-    throw std::runtime_error(msg);
-  }
-}
-
-llvm::sys::ProcessInfo qpud_client::startDaemon() {
-  // We need to know what NVQIR backend we were compiled
-  // with. Here we loop over all linked libraries to get the nvqir backend
-  // library
-  NVQIRLibraryData data;
-#if defined(__APPLE__) && defined(__MACH__)
-  getNVQIRLibraryPath(&data);
-#else
-  dl_iterate_phdr(getNVQIRLibraryPath, &data);
-#endif
-  qpudJITExtraLibraries.push_back(data.path);
-
-  std::random_device rd;
-  std::mt19937 mt(rd());
-  std::uniform_int_distribution<int> dist(10000, 65534);
-  port = dist(mt);
-
-  std::filesystem::path nvqirPath{data.path};
-  auto installLibPath = nvqirPath.parent_path();
-  auto installPath = installLibPath.parent_path();
-  auto qpudExePath = installPath / "bin" / "qpud";
-
-  std::string qpudError = "";
-
-  std::vector<llvm::StringRef> qpudArgs{qpudExePath.string(), "--port",
-                                        std::to_string(port), "--qpu",
-                                        std::to_string(qpu_id)};
-  // Forward the captured args.
-  if (capturedArgs)
-    for (auto &arg : *capturedArgs) {
-      qpudArgs.push_back(arg);
-    }
-
-  bool execFailed = false;
-  auto qpudProcInfo =
-      llvm::sys::ExecuteNoWait(qpudExePath.string(), qpudArgs, std::nullopt, {},
-                               0, &qpudError, &execFailed);
-  if (execFailed) {
-    std::cerr << "Failed to launch qpud process on port " << port << ":\n"
-              << qpudError << "\n";
-    throw std::runtime_error("Could not create qpud process.");
-  }
-
-  return qpudProcInfo;
-}
-
-rpc::client *qpud_client::getClient(bool startServer) {
-  if (!rpcClient && startServer) {
-    auto qpudProcInfo = startDaemon();
-    // Since the client is starting the server and connecting to it.  We might
-    // run into the problem that when we try to connect to it, the server didn't
-    // had enough time to initialize. So here, we try to connect 10 times with
-    // a waiting time between each try.
-    rpc::client::connection_state state;
-    for (auto i = 0; i < 10; ++i) {
-      rpcClient = std::make_unique<rpc::client>(url, port);
-      state = rpcClient->get_connection_state();
-      // Upon construction, the client is at state `initial` and tries to
-      // connect to the server using an asynchronous call.  This is basically
-      // a spin lock to wait for the return of this call, which should be
-      // either a `connected` or `disconnected` state.
-      while (state == rpc::client::connection_state::initial) {
-        state = rpcClient->get_connection_state();
-      };
-      if (state == rpc::client::connection_state::connected)
-        break;
-      std::this_thread::sleep_for(std::chrono::milliseconds(500));
-    }
-    if (state == rpc::client::connection_state::disconnected) {
-      if (qpudProcInfo.Pid) {
-        // If the QPU daemon stated, but we were not able to connect to it, we
-        // kill it. (This call needs a SecondsToWait > 0 to kill the process)
-        llvm::sys::Wait(qpudProcInfo, /*SecondsToWait=*/1, nullptr, nullptr);
-      }
-      throw std::runtime_error(fmt::format(
-          "Could not connect to remote qpud process at {}:{}", url, port));
-    }
-  }
-
-  return rpcClient.get();
-}
-
-void qpud_client::jitQuakeIfUnseen(const std::string &kernelName) {
-  rpc::client *client = getClient();
-  // Upload the kernel mlir code if we haven't already.
-  if (std::find(launchedKernels.begin(), launchedKernels.end(), kernelName) ==
-      std::end(launchedKernels)) {
-    auto quakeCode = cudaq::get_quake_by_name(kernelName);
-    invokeCall(client, "loadQuakeCode", kernelName, quakeCode,
-               qpudJITExtraLibraries);
-    launchedKernels.push_back(kernelName);
-  }
-}
-
-/// By default, let startDaemon create the qpud proc
-/// This will also allocate a random port
-qpud_client::qpud_client() {}
-
-void qpud_client::set_backend(const std::string &backend) {
-  // Get the client, if not set, create it
-  rpc::client *client = getClient();
-
-  // invoke the setTargetBackend function
-  invokeCall(client, "setTargetBackend", backend);
-}
-
-bool qpud_client::is_simulator() {
-  // Get the client, if not set, create it
-  rpc::client *client = getClient();
-  return invokeCall(client, "getIsSimulator").as<bool>();
-}
-
-bool qpud_client::supports_conditional_feedback() {
-  rpc::client *client = getClient();
-  return invokeCall(client, "getSupportsConditionalFeedback").as<bool>();
-}
-
-void qpud_client::execute(const std::string &kernelName, void *runtimeArgs,
-                          std::uint64_t argsSize, std::uint64_t resultOff) {
-  rpc::client *client = getClient();
-
-  // Tell the QPUD to JIT compile the code
-  jitQuakeIfUnseen(kernelName);
-
-  // Map the runtime args to a vector<uint8_t>
-  uint8_t *buf = (uint8_t *)runtimeArgs;
-  std::vector<uint8_t> vec_buf(buf, buf + argsSize);
-
-  // No context has been set, just calling base execute
-  auto updatedArgs = invokeCall(client, "executeKernel", kernelName, vec_buf)
-                         .as<std::vector<uint8_t>>();
-
-  if (updatedArgs.size() > argsSize) {
-    assert(resultOff != NoResultOffset && "result offset must be given");
-    // The return buffer is longer than the argument buffer, therefore the
-    // return buffer includes dynamically allocated result values. Return these
-    // in a new heap allocated buffer.
-    const std::uint64_t dynResSize = updatedArgs.size() - argsSize;
-    char *resBuff = reinterpret_cast<char *>(std::malloc(dynResSize));
-    std::memcpy(resBuff, &updatedArgs[argsSize], dynResSize);
-    // Update the pointer to the new buffer in updatedArgs before copying it.
-    void **resultPtr = reinterpret_cast<void **>(&updatedArgs[resultOff]);
-    *resultPtr = resBuff;
-    assert(dynResSize == *(reinterpret_cast<uint64_t *>(
-                             &updatedArgs[resultOff + sizeof(void *)])));
-  }
-
-  // If this function has a return type, it has been
-  // set as part of the void* args, set it here.
-  std::memcpy(runtimeArgs, updatedArgs.data(), argsSize);
-}
-
-sample_result qpud_client::sample(const std::string &kernelName,
-                                  const std::size_t shots, void *runtimeArgs,
-                                  std::size_t argsSize) {
-  using ResultType = std::vector<std::size_t>;
-  rpc::client *client = getClient();
-
-  // Tell the QPUD to JIT compile the code
-  jitQuakeIfUnseen(kernelName);
-
-  // Map the runtime args to a vector<uint8_t>
-  uint8_t *buf = (uint8_t *)runtimeArgs;
-  std::vector<uint8_t> vec_buf(buf, buf + argsSize);
-
-  // Tell teh QPUD to sample
-  auto countsData =
-      invokeCall(client, "sampleKernel", kernelName, shots, vec_buf)
-          .as<ResultType>();
-
-  // Deserialize the result and return
-  sample_result counts;
-  counts.deserialize(countsData);
-  return counts;
-}
-
-detached_job qpud_client::sample_detach(const std::string &kernelName,
-                                        const std::size_t shots,
-                                        void *runtimeArgs,
-                                        std::size_t argsSize) {
-  using ResultType = std::tuple<std::string, std::string>;
-
-  rpc::client *client = getClient();
-
-  // Tell the QPUD to JIT compile the code
-  jitQuakeIfUnseen(kernelName);
-
-  // Map the runtime args to a vector<uint8_t>
-  uint8_t *buf = (uint8_t *)runtimeArgs;
-  std::vector<uint8_t> vec_buf(buf, buf + argsSize);
-
-  using ResultType = std::tuple<std::string, std::string>;
-  // Invoke the sample function and detach
-  auto retJob =
-      invokeCall(client, "sampleKernelDetach", kernelName, shots, vec_buf)
-          .as<ResultType>();
-
-  detached_job job;
-  auto id = std::get<0>(retJob);
-  auto name = std::get<1>(retJob);
-  job.emplace_back(name, id);
-  return job;
-}
-
-sample_result qpud_client::sample(detached_job &job) {
-  using ResultType = std::vector<std::size_t>;
-  rpc::client *client = getClient();
-  auto countsData =
-      invokeCall(client, "sampleKernelFromJobId", job[0].id).as<ResultType>();
-  sample_result counts;
-  counts.deserialize(countsData);
-  return counts;
-}
-
-observe_result qpud_client::observe(const std::string &kernelName,
-                                    cudaq::spin_op &spinOp, void *runtimeArgs,
-                                    std::size_t argsSize, std::size_t shots) {
-  using ResultType = std::tuple<double, std::vector<std::size_t>>;
-  rpc::client *client = getClient();
-
-  // Tell the QPUD to JIT compile the code
-  jitQuakeIfUnseen(kernelName);
-
-  // Serialize the spin op
-  std::vector<double> H_data = spinOp.getDataRepresentation();
-
-  // Map the runtime args to a vector<uint8_t>
-  uint8_t *buf = (uint8_t *)runtimeArgs;
-  std::vector<uint8_t> vec_buf(buf, buf + argsSize);
-
-  // Invoke the observation function
-  auto result =
-      invokeCall(client, "observeKernel", kernelName, H_data, shots, vec_buf)
-          .as<ResultType>();
-  // Handle counts
-  sample_result data;
-  data.deserialize(std::get<1>(result));
-  return observe_result(std::get<0>(result), spinOp, data);
-}
-
-detached_job qpud_client::observe_detach(const std::string &kernelName,
-                                         cudaq::spin_op &spinOp,
-                                         void *runtimeArgs,
-                                         std::size_t argsSize,
-                                         std::size_t shots) {
-  using ResultType =
-      std::tuple<std::vector<std::string>, std::vector<std::string>>;
-
-  rpc::client *client = getClient();
-
-  // Tell the QPUD to JIT compile the code
-  jitQuakeIfUnseen(kernelName);
-
-  // Serialize the spin op
-  std::vector<double> H_data = spinOp.getDataRepresentation();
-
-  // Map the runtime args to a vector<uint8_t>
-  uint8_t *buf = (uint8_t *)runtimeArgs;
-  std::vector<uint8_t> vec_buf(buf, buf + argsSize);
-
-  // Invoke the observation function and detach
-  auto retJobs = invokeCall(client, "observeKernelDetach", kernelName, H_data,
-                            shots, vec_buf)
-                     .as<ResultType>();
-
-  detached_job jobs;
-  auto ids = std::get<0>(retJobs);
-  for (auto [i, id] : cudaq::enumerate(ids)) {
-    jobs.emplace_back(std::get<1>(retJobs)[i], id);
-  }
-  return jobs;
-}
-
-observe_result qpud_client::observe(cudaq::spin_op &spinOp, detached_job &job) {
-  using ResultType = std::tuple<double, std::vector<std::size_t>>;
-  rpc::client *client = getClient();
-  int counter = 0;
-  double sum = 0.0;
-  sample_result global;
-  std::vector<ExecutionResult> sampleResults;
-  for (std::size_t i = 0; i < spinOp.n_terms(); i++) {
-    auto term = spinOp[i];
-    auto realCoeff = term.get_term_coefficient(0).real();
-    if (term.is_identity())
-      sum += realCoeff;
-    else {
-      auto result =
-          invokeCall(client, "observeKernelFromJobId", job[counter].id)
-              .as<ResultType>();
-      // Handle counts
-      sample_result m;
-      m.deserialize(std::get<1>(result));
-
-      sampleResults.emplace_back(m.to_map(), term.to_string(false));
-
-      sum += realCoeff * std::get<0>(result);
-      counter++;
-    }
-  }
-  sample_result m(sampleResults);
-  return observe_result(sum, spinOp, m);
-}
-
-void qpud_client::stop_qpud() {
-  if (!stopRequested &&
-      (rpcClient && rpcClient->get_connection_state() ==
-                        rpc::client::connection_state::connected)) {
-    stopRequested = true;
-    rpcClient->call("stopServer");
-  }
-}
-
-qpud_client::~qpud_client() {
-  // Upon destruction, stop the server
-  stop_qpud();
-}
-
-void detached_job::serialize(const std::string &fileName,
-                             const std::vector<double> params) {
-  std::ofstream out(fileName);
-  nlohmann::ordered_json j;
-  if (!params.empty())
-    j["parameters"] = params;
-
-  for (auto &job : *this) {
-    j[job.name] = job.id;
-  }
-  out << j.dump(4);
-  out.close();
-}
-
-void detached_job::deserialize(const std::string &fileName) {
-  std::ifstream i(fileName);
-  nlohmann::ordered_json j;
-  i >> j;
-
-  for (auto &element : j.items()) {
-    if (element.key() == "parameters")
-      params = element.value().get<std::vector<double>>();
-    else
-      emplace_back(element.key(), element.value().get<std::string>());
-  }
-}
-} // namespace cudaq
diff --git a/runtime/qpud-client/qpud_client.h b/runtime/qpud-client/qpud_client.h
deleted file mode 100644
index 1f9194c337..0000000000
--- a/runtime/qpud-client/qpud_client.h
+++ /dev/null
@@ -1,337 +0,0 @@
-/*************************************************************** -*- C++ -*- ***
- * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- *******************************************************************************/
-
-#pragma once
-
-#include "common/ObserveResult.h"
-#include "cudaq/spin_op.h"
-
-#include <memory>
-#include <stack>
-
-// Forward declare the RPC client type
-namespace rpc {
-class client;
-}
-
-namespace llvm::sys {
-struct ProcessInfo;
-}
-
-namespace cudaq {
-
-static constexpr std::size_t NoResultOffset = ~0u >> 1;
-
-/// @brief An observe_job contains metadata describing
-/// a detached cudaq"::"observe invocation. By detached, we mean
-/// this job has been submitted to a remote queuing system. This type
-/// describes the Job Ids for each term evaluation.
-class detached_job {
-private:
-  std::vector<double> params;
-
-  /// @brief A job has a name and an ID
-  struct TermJob {
-    TermJob(const std::string &n, const std::string &i) : name(n), id(i) {}
-    std::string name;
-    std::string id;
-  };
-
-  /// @brief For a SpinOp, we'll have N ansatz+measure evaluations
-  /// each of these is a single job on the remote processor
-  std::vector<TermJob> jobs;
-
-public:
-  /// Thin wrapper around vector"::"emplace_back
-  template <typename... Args>
-  void emplace_back(Args &&...args) {
-    jobs.emplace_back(args...);
-  }
-
-  std::string id_from_name(const std::string &name) {
-    for (auto &job : jobs) {
-      if (job.name == name)
-        return job.id;
-    }
-    throw std::runtime_error(
-        "Invalid name, no job ID corresponding to that name.");
-  }
-
-  /// @brief  Thin wrapper around vector"::"operator[]
-  TermJob &operator[](const std::size_t idx) { return jobs[idx]; }
-
-  /// @brief Enable range based iteration
-  auto begin() { return jobs.begin(); }
-
-  /// @brief Enable range-based iteration
-  auto end() { return jobs.end(); }
-
-  /// @brief Return the parameters the ansatz was evaluated at
-  std::vector<double> parameters() { return params; }
-
-  /// @brief Serialize this observe_job to file, can optionally provide
-  /// the parameters used to evaluate the ansatz
-  void serialize(const std::string &fileName,
-                 const std::vector<double> params = {});
-
-  /// @brief Read in this observe_job from file
-  void deserialize(const std::string &fileName);
-};
-
-/// Typedef the KernelArgs Creator Function
-typedef std::size_t (*Creator)(void **, void **);
-
-/// Retrieve the kernel args creator function for the kernel name
-Creator getArgsCreator(const std::string &);
-/// @brief Utility function for mapping variadic args to qpud required void*,
-/// size_t. Note clients of this function own the allocated rawArgs.
-template <typename... Args>
-std::pair<void *, std::size_t> mapToRawArgs(const std::string &kernelName,
-                                            Args &&...args) {
-  void *rawArgs = nullptr;
-  auto argsCreator = getArgsCreator(kernelName);
-  void *argPointers[sizeof...(Args)] = {&args...};
-  auto argsSize = argsCreator(argPointers, &rawArgs);
-  return std::make_pair(rawArgs, argsSize);
-}
-
-/// @brief The QPUD Client provides a high-level API for interacting
-/// with a external qpud process.
-class qpud_client {
-protected:
-  /// Observed kernel names, stored here so we don't
-  /// call loadQuakeCode more than once.
-  std::vector<std::string> launchedKernels;
-
-  // The RPC Client, which submits function invocations
-  // to the remote qpud server
-  std::unique_ptr<rpc::client> rpcClient;
-
-  // Extra libraries that our qpud JIT engine will need
-  std::vector<std::string> qpudJITExtraLibraries;
-
-  /// @brief The url of the remote qpud proc
-  const std::string url = "127.0.0.1";
-
-  /// @brief the port for the remote qpud proc
-  int port = 0;
-
-  /// @brief The QPU that we are targeting on the remote qpud proc
-  int qpu_id = 0;
-
-  /// @brief Bool indicating if a stop of qpud has been requested
-  bool stopRequested = false;
-
-  /// @brief Return a raw pointer to the rpc client.
-  /// @param connectClient
-  /// @return
-  rpc::client *getClient(bool connectClient = true);
-
-  /// @brief Utility function for starting the qpud proc
-  llvm::sys::ProcessInfo startDaemon();
-
-  /// @brief Utility function for JIT compiling the quakeCode once
-  /// @param kernelName
-  void jitQuakeIfUnseen(const std::string &kernelName);
-
-public:
-  /// @brief The constructor
-  qpud_client();
-
-  /// @brief The constructor, does not create the qpud proc but
-  /// instead connects to an existing one
-  qpud_client(const std::string &qpudUrl, const int qpudPort);
-
-  /// @brief Set the qpud proc target backend
-  void set_backend(const std::string &backend);
-
-  /// @brief Return true if the current backend is a simulator
-  bool is_simulator();
-
-  /// @brief Return true if the current backend supports conditional feedback
-  bool supports_conditional_feedback();
-
-  /// Execute a circuit and return the results
-  void execute(const std::string &kernelName, void *runtimeArgs,
-               std::uint64_t argsSize, std::uint64_t resultOffset);
-
-  /// @brief Execute a circuit and return the results, automate the args
-  /// processing
-  template <typename ArgsType>
-  void execute(const std::string &kernelName, ArgsType &argsTypeInstance) {
-    auto [rawArgs, size, resultOff] = process_args(argsTypeInstance);
-    return execute(kernelName, rawArgs, size, resultOff);
-  }
-
-  /// @brief Sample the circuit generated by the quakeCode for the given kernel
-  /// name.
-  sample_result sample(const std::string &kernelName, const std::size_t shots,
-                       void *runtimeArgs, std::size_t argsSize);
-
-  /// @brief Sample the circuit generated by the quakeCode for the given kernel
-  /// name. Automate the args processing
-  template <typename ArgsType>
-  sample_result sample(const std::string &kernelName, const std::size_t shots,
-                       ArgsType &argsTypeInstance) {
-    auto [rawArgs, size, resultOff] = process_args(argsTypeInstance);
-    return sample(kernelName, shots, rawArgs, size);
-  }
-
-  /// @brief Sample the circuit generated by the given quantum kernel
-  template <typename QuantumKernel, typename... Args,
-            typename R = typename std::invoke_result_t<QuantumKernel, Args...>,
-            typename = std::enable_if_t<std::is_void_v<R>>>
-  sample_result sample(QuantumKernel &&kernel, const std::size_t shots,
-                       Args &&...args) {
-    auto kernelName = cudaq::getKernelName(kernel);
-    auto [rawArgs, size] = mapToRawArgs(kernelName, args...);
-    return sample(kernelName, shots, rawArgs, size);
-  }
-
-  /// @brief Launch a sampling job and detach, returning a unique job id.
-  detached_job sample_detach(const std::string &kernelName,
-                             const std::size_t shots, void *runtimeArgs,
-                             std::size_t argsSize);
-  /// @brief Launch a sampling job and detach, returning a job id. Automate the
-  /// args processing.
-  template <typename ArgsType>
-  detached_job sample_detach(const std::string &kernelName,
-                             const std::size_t shots,
-                             ArgsType &argsTypeInstance) {
-    auto [rawArgs, size, resultOff] = process_args(argsTypeInstance);
-    return sample_detach(kernelName, shots, rawArgs, size);
-  }
-
-  /// @brief Return the measure count result from a detached sample job.
-  sample_result sample(detached_job &job);
-
-  /// @brief Observe the state generated by the quakeCode at the given kernel
-  /// name with respect to the given spin op.
-  observe_result observe(const std::string &kernelName, cudaq::spin_op &spinOp,
-                         void *runtimeArgs, std::size_t argsSize,
-                         std::size_t shots = 0);
-
-  /// @brief Observe the state generated by the quakeCode at the given kernel
-  /// name with respect to the given spin op. Automate the args processing
-  template <typename ArgsType>
-  observe_result observe(const std::string &kernelName, cudaq::spin_op &spinOp,
-                         ArgsType &argsTypeInstance) {
-    auto [rawArgs, size, resultOff] = process_args(argsTypeInstance);
-    return observe(kernelName, spinOp, rawArgs, size);
-  }
-
-  /// @brief Observe the state generated by the given kernel
-  /// with respect to the given spin op. Automate the args processing
-  template <
-      typename QuantumKernel, typename... Args,
-      typename = std::enable_if_t<std::is_invocable_v<QuantumKernel, Args...>>>
-  observe_result observe(QuantumKernel &&kernel, cudaq::spin_op &spinOp,
-                         Args &&...args) {
-    auto kernelName = cudaq::getKernelName(kernel);
-    auto [rawArgs, size] = mapToRawArgs(kernelName, args...);
-    return observe(kernelName, spinOp, rawArgs, size);
-  }
-
-  /// @brief Observe the state generated by the given kernel
-  /// with respect to the given spin op. Automate the args processing
-  template <
-      typename QuantumKernel, typename... Args,
-      typename = std::enable_if_t<std::is_invocable_v<QuantumKernel, Args...>>>
-  observe_result observe(QuantumKernel &&kernel, std::size_t shots,
-                         cudaq::spin_op &spinOp, Args &&...args) {
-    auto kernelName = cudaq::getKernelName(kernel);
-    auto [rawArgs, size] = mapToRawArgs(kernelName, args...);
-    return observe(kernelName, spinOp, rawArgs, size, shots);
-  }
-
-  /// @brief Invoke an observe task, but detach and return the job id
-  detached_job observe_detach(const std::string &kernelName,
-                              cudaq::spin_op &spinOp, void *runtimeArgs,
-                              std::size_t argsSize, std::size_t shots = 0);
-
-  /// @brief Invoke an observe task, but detach and return the job id.
-  /// Automate the args processing
-  template <typename ArgsType>
-  detached_job observe_detach(const std::string &kernelName,
-                              cudaq::spin_op &spinOp,
-                              ArgsType &argsTypeInstance) {
-    auto [rawArgs, size, resultOff] = process_args(argsTypeInstance);
-    return observe_detach(kernelName, spinOp, rawArgs, size);
-  }
-
-  /// @brief Return the observe result based on a detached job
-  observe_result observe(cudaq::spin_op &spinOp, detached_job &job);
-
-  /// @brief Convert a user specified kernel argument struct to a raw void
-  /// pointer and its associated size.
-  template <typename ArgsType>
-  std::tuple<void *, std::uint64_t, std::uint64_t>
-  process_args(ArgsType &argsTypeInstance) {
-    return std::make_tuple(reinterpret_cast<void *>(&argsTypeInstance),
-                           sizeof(ArgsType), NoResultOffset);
-  }
-
-  /// @brief Manually stop the qpud proc. If called, one must create a new
-  /// qpud_client to continue interacting with a qpud proc
-  void stop_qpud();
-
-  /// The destructor, will automatically stop the qpud proc
-  ~qpud_client();
-};
-
-qpud_client &get_qpud_client();
-
-/// @brief Launch a sampling job and detach, returning the job id. Takes the
-/// kernel runtime arguments as a variadic parameter pack.
-template <typename... Args>
-detached_job sample_detach(const std::string &kernelName,
-                           const std::size_t shots, Args &&...args) {
-  auto [rawArgs, argsSize] = mapToRawArgs(kernelName, args...);
-  auto &client = get_qpud_client();
-  auto job = client.sample_detach(kernelName, shots, rawArgs, argsSize);
-  std::free(rawArgs);
-  return job;
-}
-
-/// @brief Return the sample result for the given detached job id.
-/// @param job
-/// @return
-sample_result sample(detached_job &job) {
-  auto &client = get_qpud_client();
-  return client.sample(job);
-}
-
-/// @brief Observe the state generated by the kernel with given spinOp
-/// asynchronously, detach and return the job id.
-template <typename... Args>
-detached_job observe_detach(const std::string &kernelName, spin_op &spinOp,
-                            Args &&...args) {
-  auto [rawArgs, argsSize] = mapToRawArgs(kernelName, args...);
-  auto &client = get_qpud_client();
-  auto job = client.observe_detach(kernelName, spinOp, rawArgs, argsSize);
-  std::free(rawArgs);
-  return job;
-}
-
-/// @brief Observe the state generated by the kernel with given spinOp
-/// asynchronously, detach and return the job id. Set the number of shots
-/// explicitly
-template <typename... Args>
-detached_job observe_detach(const std::string &kernelName, std::size_t shots,
-                            spin_op &spinOp, Args &&...args) {
-  auto [rawArgs, argsSize] = mapToRawArgs(kernelName, args...);
-  auto &client = get_qpud_client();
-  return client.observe_detach(kernelName, spinOp, rawArgs, argsSize, shots);
-}
-
-/// @brief Return the observe result for the given detached job
-observe_result observe(spin_op &spinOp, detached_job &detachedJob) {
-  auto &client = get_qpud_client();
-  return client.observe(spinOp, detachedJob);
-}
-} // namespace cudaq
diff --git a/test/NVQPP/auto_kernel.cpp b/test/NVQPP/auto_kernel.cpp
index 528f12ac44..6f01ab1088 100644
--- a/test/NVQPP/auto_kernel.cpp
+++ b/test/NVQPP/auto_kernel.cpp
@@ -6,8 +6,7 @@
  * the terms of the Apache License 2.0 which accompanies this distribution.    *
  *******************************************************************************/
 
-// RUN: nvq++ -v --nvqir-simulator qpp --enable-mlir %s -o out_auto_kernel.x && ./out_auto_kernel.x | FileCheck %s
-// RUN: nvq++ -v --nvqir-simulator qpp --enable-mlir --platform default-qpud %s -o out_auto_kernel.x && ./out_auto_kernel.x | FileCheck %s
+// RUN: nvq++ -v %s -o out_auto_kernel.x && ./out_auto_kernel.x | FileCheck %s
 
 #include <cudaq.h>
 
diff --git a/test/NVQPP/testIQPE_Conditionals.cpp b/test/NVQPP/testIQPE_Conditionals.cpp
index e407d667e5..eaea35c710 100644
--- a/test/NVQPP/testIQPE_Conditionals.cpp
+++ b/test/NVQPP/testIQPE_Conditionals.cpp
@@ -7,7 +7,6 @@
  *******************************************************************************/
 
 // RUN: nvq++ %s -o out_testifstmts_iqpe.x && ./out_testifstmts_iqpe.x | FileCheck %s && rm out_testifstmts_iqpe.x
-// RUN: nvq++ --platform default-qpud %s -o out_testifstmts_iqpe2.x && ./out_testifstmts_iqpe2.x | FileCheck %s && rm out_testifstmts_iqpe2.x
 
 #include <cudaq.h>
 
diff --git a/test/NVQPP/testQPUDObserve.cpp b/test/NVQPP/testQPUDObserve.cpp
deleted file mode 100644
index 97691ae217..0000000000
--- a/test/NVQPP/testQPUDObserve.cpp
+++ /dev/null
@@ -1,76 +0,0 @@
-/*************************************************************** -*- C++ -*- ***
- * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- *******************************************************************************/
-
-// RUN: nvq++ --enable-mlir --platform default-qpud %s -o out_testqpudobserve.x && ./out_testqpudobserve.x | FileCheck %s && rm out_testqpudobserve.x
-
-#include <cudaq.h>
-#include <cudaq/algorithm.h>
-
-// The example here shows a simple use case for the cudaq::observe()
-// function in computing expected values of provided spin_ops.
-
-// CHECK: Energy is -1.7487
-// CHECK: Energy with shots is -1.
-
-struct ansatz {
-  auto operator()(double theta) __qpu__ {
-    cudaq::qreg q(2);
-    x(q[0]);
-    ry(theta, q[1]);
-    x<cudaq::ctrl>(q[1], q[0]);
-  }
-};
-
-struct ansatzVector {
-  auto operator()(std::vector<double> theta) __qpu__ {
-    cudaq::qreg q(2);
-    x(q[0]);
-    ry(theta[0], q[1]);
-    x<cudaq::ctrl>(q[1], q[0]);
-  }
-};
-int main() {
-
-  // Build up your spin op algebraically
-  using namespace cudaq::spin;
-  cudaq::spin_op h = 5.907 - 2.1433 * x(0) * x(1) - 2.1433 * y(0) * y(1) +
-                    .21829 * z(0) - 6.125 * z(1);
-
-  {
-    // Observe takes the kernel, the spin_op, and the concrete params for the
-    // kernel
-    double energy = cudaq::observe(ansatz{}, h, .59);
-    printf("Energy is %lf\n", energy);
-
-    // Set shots high enough that we're accurate to -1.7
-    cudaq::set_shots(10000);
-    auto result = cudaq::observe(ansatz{}, h, .59);
-    printf("Energy with shots is %lf\n", result.exp_val_z());
-
-    auto z1Counts = result.counts(z(1));
-    assert(z1Counts.size() == 2);
-    assert(z1Counts.count("0") && z1Counts.count("1"));
-  }
-
-  {
-    // Observe takes the kernel, the spin_op, and the concrete params for the
-    // kernel
-    double energy = cudaq::observe(ansatzVector{}, h, std::vector<double>{.59});
-    printf("Energy is %lf\n", energy);
-
-    // Set shots high enough that we're accurate to -1.7
-    cudaq::set_shots(10000);
-    auto result = cudaq::observe(ansatzVector{}, h, std::vector<double>{.59});
-    printf("Energy with shots is %lf\n", result.exp_val_z());
-
-    auto z1Counts = result.counts(z(1));
-    assert(z1Counts.size() == 2);
-    assert(z1Counts.count("0") && z1Counts.count("1"));
-  }
-  return 0;
-}
diff --git a/test/NVQPP/testQPUDSample.cpp b/test/NVQPP/testQPUDSample.cpp
deleted file mode 100644
index 89fad3169e..0000000000
--- a/test/NVQPP/testQPUDSample.cpp
+++ /dev/null
@@ -1,31 +0,0 @@
-/*************************************************************** -*- C++ -*- ***
- * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- *******************************************************************************/
-
-// RUN: nvq++ --enable-mlir --platform default-qpud %s -o out_testqpudsample.x && ./out_testqpudsample.x | FileCheck %s && rm out_testqpudsample.x
-
-#include <cudaq.h>
-
-// CHECK: { [[B0:.*]]:[[C0:.*]] [[B1:.*]]:[[C1:.*]] }
-
-// Define a quantum kernel
-struct ghz {
-  auto operator()(const int N) __qpu__ {
-    cudaq::qreg q(N);
-    h(q[0]);
-    for (int i = 0; i < N - 1; i++) {
-      x<cudaq::ctrl>(q[i], q[i + 1]);
-    }
-    mz(q);
-  }
-};
-
-int main() {
-  auto counts = cudaq::sample(ghz{}, 3);
-  counts.dump();
-  return 0;
-}
diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt
index cde6ca8414..2e4a1e3fee 100644
--- a/tools/CMakeLists.txt
+++ b/tools/CMakeLists.txt
@@ -10,7 +10,4 @@ add_subdirectory(nvqpp)
 add_subdirectory(cudaq-lsp-server)
 add_subdirectory(cudaq-opt)
 add_subdirectory(cudaq-quake)
-if (NOT CUDAQ_DISABLE_RUNTIME)
-  add_subdirectory(qpud)
-endif()
 add_subdirectory(cudaq-translate)
diff --git a/tools/nvqpp/nvq++.in b/tools/nvqpp/nvq++.in
index 727113810e..32ce397e92 100755
--- a/tools/nvqpp/nvq++.in
+++ b/tools/nvqpp/nvq++.in
@@ -180,7 +180,7 @@ LIBRARY_MODE_EXECUTION_MANAGER="qir"
 PLATFORM_LIBRARY="default"
 LLVM_QUANTUM_TARGET="qir"
 LINKDIRS="-L${install_dir}/lib @CUDAQ_CXX_NVQPP_LINK_STR@"
-LINKLIBS="-lcudaq -lcudaq-common -lcudaq-mlir-runtime -lcudaq-builder -lcudaq-ensmallen -lcudaq-nlopt -lcudaq-spin -lcudaq-qpud-client"
+LINKLIBS="-lcudaq -lcudaq-common -lcudaq-mlir-runtime -lcudaq-builder -lcudaq-ensmallen -lcudaq-nlopt -lcudaq-spin"
 
 # Provide a default backend, user can override
 NVQIR_SIMULATION_BACKEND="qpp"
@@ -348,10 +348,6 @@ if ${SHOW_HELP}; then
 	show_help
 fi
 
-if [[ "qpud" == "$PLATFORM_LIBRARY" ]]; then
-	PLATFORM_LIBRARY="default-qpud"
-fi
-
 TMPFILES=
 function delete_temp_files {
 	if ${DELETE_TEMPS}; then
diff --git a/tools/qpud/CMakeLists.txt b/tools/qpud/CMakeLists.txt
deleted file mode 100644
index ccd4afad4f..0000000000
--- a/tools/qpud/CMakeLists.txt
+++ /dev/null
@@ -1,63 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under     #
-# the terms of the Apache License 2.0 which accompanies this distribution.     #
-# ============================================================================ #
-
-set(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_PREFIX}/lib")
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin )
-
-# Create the qpud target
-add_executable(qpud qpud.cpp)
-
-# Add source for Kernel JIT compilation
-add_subdirectory(jit)
-
-# Add utility sources
-add_subdirectory(utils)
-
-# Add the backends 
-add_subdirectory(backends)
-
-# Add some include paths
-target_include_directories(qpud SYSTEM PRIVATE
-  ${CMAKE_SOURCE_DIR}/tpls/json/nlohmann)
-target_include_directories(qpud PRIVATE . 
-  backends/ 
-  backends/rest 
-  utils
-  jit
-  ${LLVM_INCLUDE_DIRS}
-)
-
-# Link with rpc, nvqir, cudaq and MLIR libs
-get_property(dialect_libs GLOBAL PROPERTY MLIR_DIALECT_LIBS)
-get_property(conversion_libs GLOBAL PROPERTY MLIR_CONVERSION_LIBS)
-target_link_libraries(qpud PRIVATE fmt::fmt-header-only
-  rpc 
-  nvqir
-  ${dialect_libs}
-  ${conversion_libs}
-  MLIRIR
-  MLIRParser
-  MLIRPass
-  MLIRTranslateLib
-  MLIRSupport
-  MLIROptLib
-  MLIRExecutionEngine
-  MLIRTransforms
-  MLIRTargetLLVMIRExport
-  MLIRLLVMCommonConversion
-  MLIRLLVMToLLVMIRTranslation
-  
-  CCDialect
-  OptCodeGen
-  OptTransforms
-  QuakeDialect
-  cudaq
-)
-
-# Install the target
-install(TARGETS qpud DESTINATION bin)
diff --git a/tools/qpud/backends/CMakeLists.txt b/tools/qpud/backends/CMakeLists.txt
deleted file mode 100644
index ca8e57cafe..0000000000
--- a/tools/qpud/backends/CMakeLists.txt
+++ /dev/null
@@ -1,10 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under     #
-# the terms of the Apache License 2.0 which accompanies this distribution.     #
-# ============================================================================ #
-
-target_sources(qpud PRIVATE TargetBackend.cpp)
-add_subdirectory(default)
\ No newline at end of file
diff --git a/tools/qpud/backends/TargetBackend.cpp b/tools/qpud/backends/TargetBackend.cpp
deleted file mode 100644
index f5d7b66636..0000000000
--- a/tools/qpud/backends/TargetBackend.cpp
+++ /dev/null
@@ -1,224 +0,0 @@
-/*************************************************************** -*- C++ -*- ***
- * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- *******************************************************************************/
-
-#include "TargetBackend.h"
-#include "cudaq/Optimizer/CodeGen/Passes.h"
-#include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
-#include "cudaq/Optimizer/Transforms/Passes.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/MC/SubtargetFeature.h"
-#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/Host.h"
-#include "llvm/Target/TargetMachine.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/ExecutionEngine/OptUtils.h"
-#include "mlir/IR/BuiltinDialect.h"
-#include "mlir/IR/BuiltinOps.h"
-#include "mlir/IR/MLIRContext.h"
-#include "mlir/InitAllPasses.h"
-#include "mlir/Parser/Parser.h"
-#include "mlir/Pass/Pass.h"
-#include "mlir/Pass/PassManager.h"
-#include "mlir/Support/LogicalResult.h"
-#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
-#include "mlir/Target/LLVMIR/Export.h"
-#include <iostream>
-
-using namespace mlir;
-
-namespace cudaq {
-
-const static std::string BOLD = "\033[1m";
-const static std::string RED = "\033[91m";
-const static std::string BLUE = "\033[94m";
-const static std::string CLEAR = "\033[0m";
-
-bool TargetBackend::setupTargetTriple(llvm::Module *llvmModule) {
-  // Setup the machine properties from the current architecture.
-  auto targetTriple = llvm::sys::getDefaultTargetTriple();
-  std::string errorMessage;
-  const auto *target =
-      llvm::TargetRegistry::lookupTarget(targetTriple, errorMessage);
-  if (!target) {
-    llvm::errs() << "NO target: " << errorMessage << "\n";
-    return false;
-  }
-
-  std::string cpu(llvm::sys::getHostCPUName());
-  llvm::SubtargetFeatures features;
-  llvm::StringMap<bool> hostFeatures;
-
-  if (llvm::sys::getHostCPUFeatures(hostFeatures))
-    for (auto &f : hostFeatures)
-      features.AddFeature(f.first(), f.second);
-
-  std::unique_ptr<llvm::TargetMachine> machine(target->createTargetMachine(
-      targetTriple, cpu, features.getString(), {}, {}));
-  if (!machine) {
-    llvm::errs() << "Unable to create target machine\n";
-    return false;
-  }
-  llvmModule->setDataLayout(machine->createDataLayout());
-  llvmModule->setTargetTriple(targetTriple);
-
-  return true;
-}
-
-std::unique_ptr<llvm::Module>
-TargetBackend::compile(MLIRContext &context, const std::string_view quakeCode) {
-  auto m_module = parseSourceString<ModuleOp>(quakeCode, &context);
-  DiagnosticEngine &engine = context.getDiagEngine();
-  engine.registerHandler([&](Diagnostic &diag) -> LogicalResult {
-    std::cout << BOLD << RED << "[nvqpp-mlir] Dumping Module after error.\n"
-              << CLEAR;
-    for (auto &n : diag.getNotes()) {
-      std::string s;
-      llvm::raw_string_ostream os(s);
-      n.print(os);
-      os.flush();
-      std::cout << BOLD << RED << "[nvqpp-mlir] Reported Error: " << s << "\n"
-                << CLEAR;
-    }
-    bool should_propagate_diagnostic = true;
-    return failure(should_propagate_diagnostic);
-  });
-
-  PassManager pm(&context);
-  // Apply any generic pass manager command line options and run the pipeline.
-  applyPassManagerCLOptions(pm);
-  pm.addPass(createInlinerPass());
-
-  pm.addPass(createCanonicalizerPass());
-  pm.addPass(cudaq::opt::createExpandMeasurementsPass());
-  pm.addNestedPass<func::FuncOp>(cudaq::opt::createQuakeAddDeallocs());
-  auto loop_unroller = createLoopUnrollPass(
-      /*unrollFactor*/ -1, /*unrollUpToFactor*/ false, /*unrollFull*/ true);
-  pm.addNestedPass<func::FuncOp>(std::move(loop_unroller));
-  pm.addNestedPass<func::FuncOp>(cudaq::opt::createLowerToCFGPass());
-  pm.addPass(createCanonicalizerPass());
-  pm.addPass(cudaq::opt::createConvertToQIRPass());
-
-  if (failed(pm.run(*m_module))) {
-    llvm::errs() << "MLIRRuntime pass pipeline failed!\n";
-    return nullptr;
-  }
-
-  // Convert the module to LLVM IR in a new LLVM IR context.
-  llvmContext.setOpaquePointers(false);
-  auto llvmModule = translateModuleToLLVMIR(m_module.get(), llvmContext);
-  if (!llvmModule) {
-    llvm::errs() << "Failed to translate module to LLVM IR\n";
-    return nullptr;
-  }
-
-  // Initialize LLVM targets.
-  /// Optionally run an optimization pipeline over the llvm module.
-  bool enableOpt = true;
-  auto optPipeline = makeOptimizingTransformer(
-      /*optLevel=*/enableOpt ? 3 : 0, /*sizeLevel=*/0,
-      /*targetMachine=*/nullptr);
-  if (auto err = optPipeline(llvmModule.get()))
-    llvm::errs() << "Failed to optimize LLVM IR " << err << "\n";
-
-  if (!setupTargetTriple(llvmModule.get())) {
-    llvm::errs() << "Failed to setup the llvm module target triple.\n";
-    return nullptr;
-  }
-
-  return llvmModule;
-}
-
-/// Pointer to the global MLIR Context
-extern std::unique_ptr<MLIRContext> mlirContext;
-
-std::unique_ptr<llvm::Module>
-TargetBackend::lowerQuakeToBaseProfile(Kernel &thunk,
-                                       llvm::LLVMContext &localLLVMContext,
-                                       cudaq::spin_op *term, void *kernelArgs) {
-  MLIRContext *localContext = mlirContext.get();
-  auto quakeCode = thunk.getQuakeCode();
-
-  auto m_module = parseSourceString<ModuleOp>(quakeCode, localContext);
-  DiagnosticEngine &engine = localContext->getDiagEngine();
-  engine.registerHandler([&](Diagnostic &diag) -> LogicalResult {
-    std::cout << "[qpud-mlir] Dumping Module after error.\n";
-    for (auto &n : diag.getNotes()) {
-      std::string s;
-      llvm::raw_string_ostream os(s);
-      n.print(os);
-      os.flush();
-      std::cout << "[qpud-mlir] Reported Error: " << s << "\n";
-    }
-    bool should_propagate_diagnostic = true;
-    return failure(should_propagate_diagnostic);
-  });
-
-  // remove the thunk function
-  std::string kernelName(thunk.name());
-  auto thunkFunction =
-      m_module->lookupSymbol<func::FuncOp>(kernelName + ".thunk");
-  if (thunkFunction)
-    thunkFunction->erase();
-
-  // Synthesize the Quake code and lower to QIR
-  PassManager pm(localContext);
-  // Only synthesize if we have runtime args
-  if (kernelArgs)
-    pm.addPass(cudaq::opt::createQuakeSynthesizer(kernelName, kernelArgs));
-  pm.addPass(createInlinerPass());
-  applyNativeGateSetPasses(pm);
-
-  pm.addPass(createCanonicalizerPass());
-  pm.addPass(cudaq::opt::createExpandMeasurementsPass());
-  pm.addNestedPass<func::FuncOp>(cudaq::opt::createLowerToCFGPass());
-  pm.addPass(createCanonicalizerPass());
-
-  pm.addNestedPass<func::FuncOp>(createLoopUnrollPass(-1, false, true));
-  if (term) {
-    // add a pass that adds measures.
-    auto binarySymplecticForm = term->get_bsf()[0];
-    pm.addNestedPass<func::FuncOp>(
-        cudaq::opt::createQuakeObserveAnsatzPass(binarySymplecticForm));
-    pm.addPass(createCanonicalizerPass());
-  }
-  pm.addPass(cudaq::opt::createConvertToQIRPass());
-  cudaq::opt::addBaseProfilePipeline(pm);
-
-  if (failed(pm.run(*m_module))) {
-    llvm::errs() << "Failed to run the MLIR Pass Manager.\n";
-    return nullptr;
-  }
-
-  auto llvmModule = translateModuleToLLVMIR(m_module.get(), localLLVMContext);
-  if (!llvmModule) {
-    llvm::errs() << "Failed to translate module to LLVM IR\n";
-    return nullptr;
-  }
-
-  // Initialize LLVM targets.
-  /// Optionally run an optimization pipeline over the llvm module.
-  bool enableOpt = true;
-  auto optPipeline = makeOptimizingTransformer(
-      /*optLevel=*/enableOpt ? 3 : 0, /*sizeLevel=*/0,
-      /*targetMachine=*/nullptr);
-  if (auto err = optPipeline(llvmModule.get())) {
-    llvm::errs() << "Failed to optimize LLVM IR " << err << "\n";
-    return nullptr;
-  }
-
-  if (!setupTargetTriple(llvmModule.get())) {
-    llvm::errs() << "Failed to setup the llvm module target triple.\n";
-    return nullptr;
-  }
-
-  return llvmModule;
-}
-} // namespace cudaq
diff --git a/tools/qpud/backends/TargetBackend.h b/tools/qpud/backends/TargetBackend.h
deleted file mode 100644
index 4416da1f40..0000000000
--- a/tools/qpud/backends/TargetBackend.h
+++ /dev/null
@@ -1,139 +0,0 @@
-/*************************************************************** -*- C++ -*- ***
- * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- *******************************************************************************/
-
-#pragma once
-
-#include "common/Registry.h"
-#include "cudaq/spin_op.h"
-#include "llvm/IR/Module.h"
-#include "mlir/IR/MLIRContext.h"
-#include "mlir/Pass/PassManager.h"
-
-#include <deque>
-
-namespace cudaq {
-
-struct DynamicResult {
-  char *ptr;
-  std::uint64_t len;
-};
-
-// Typedef for the KERNEL.thunk() function
-using ThunkFunction = DynamicResult (*)(void *, bool);
-class Kernel {
-protected:
-  ThunkFunction thunk;
-  std::string kernelName;
-  std::string qirCode;
-  std::string quakeCode;
-
-public:
-  Kernel(ThunkFunction tf, const std::string name, const std::string qirC,
-         const std::string qC)
-      : thunk(tf), kernelName(name), qirCode(qirC), quakeCode(qC) {}
-  std::string_view getQuakeCode() { return quakeCode; }
-  std::string_view getQIRCode() { return qirCode; }
-  std::string_view name() const { return kernelName; }
-  DynamicResult operator()(void *args, bool isCS) { return thunk(args, isCS); }
-};
-
-/// The TargetBackend class provides an extension point for the
-/// invocation of auto-generated thunk functions under a variety of
-/// execution contexts. Specifically, thunk functions can be invoked for
-/// final state sampling (produce counts dictionary), spin_op observation
-/// (produce expectation value <psi | H | psi>), and base execution (just
-/// invoke the function and get the return value).
-class TargetBackend : public cudaq::registry::RegisteredType<TargetBackend> {
-protected:
-  llvm::LLVMContext llvmContext;
-
-  bool setupTargetTriple(llvm::Module *llvmModule);
-
-  /// @brief Given the kernel, extract the Quake code, synthesize with the
-  /// kernel
-  /// args, and lower to the Base Profile QIR
-  /// @param thunk
-  /// @param localLLVMContext
-  /// @param kernelArgs
-  /// @return
-  std::unique_ptr<llvm::Module>
-  lowerQuakeToBaseProfile(Kernel &thunk, llvm::LLVMContext &localLLVMContext,
-                          cudaq::spin_op *term, void *kernelArgs);
-
-  virtual void applyNativeGateSetPasses(mlir::PassManager &) {}
-
-public:
-  /// Compile the quakeCode further to an LLVM Module that can be JIT executed.
-  /// Default implementation here lowers the quake code to dynamic full QIR.
-  virtual std::unique_ptr<llvm::Module>
-  compile(mlir::MLIRContext &context, const std::string_view quakeCode);
-
-  /// Initialize the backend, one time execution
-  virtual void initialize() = 0;
-  virtual bool isInitialized() { return true; }
-
-  /// @brief Return if this backend is a QPU simulator
-  virtual bool isSimulator() { return true; }
-
-  /// @brief Return true if this backend supports conditional feedback
-  virtual bool supportsConditionalFeedback() { return true; }
-
-  /// Execute the ThunkFunction with the given args. The return value
-  /// (if kernel does not define a void return type) will be packed into
-  /// the kernelArgs opaque pointer.
-  virtual DynamicResult baseExecute(Kernel &thunk, void *kernelArgs,
-                                    bool isClientServer = true) = 0;
-
-  /// Execute the ThunkFunction with the given args, return the histogram
-  /// of observed bit strings to number of times observed (the counts
-  /// dictionary). Sample the final state `shots` times, return a vector<size_t>
-  /// of size 3 * n_unique_bit_strings of the format [bitStringAsLong,
-  /// N_MeasuredBitsInString, Count, ... repeat ...]
-  virtual std::vector<std::size_t> sample(Kernel &thunk, std::size_t shots,
-                                          void *kernelArgs) = 0;
-
-  /// Execute the ThunkFunction with the given args, return the expected value
-  /// of the provided spin_op (represented as a vector<double>, with format
-  /// [OP0 OP1 OP2 ... COEFF_REAL COEFF_IMAG | OP0 OP1 OP2 ... COEFF_REAL
-  /// COEFF_IMAG | ... | NTERMS]).
-  virtual std::tuple<double, std::vector<std::size_t>>
-  observe(Kernel &thunk, std::vector<double> &spin_op_data,
-          const std::size_t shots, void *kernelArgs) = 0;
-
-  /// Execute an Observe task with the given ansatz and spin op data, but
-  /// detach and return the job ids and job names for the given task.
-  virtual std::tuple<std::vector<std::string>, std::vector<std::string>>
-  observeDetach(Kernel &thunk, std::vector<double> &spin_op_data,
-                const std::size_t shots, void *kernelArgs) {
-    throw std::runtime_error("observeDetach not supported for this backend.");
-  }
-
-  /// Given a valid job id, return the observe result.
-  virtual std::tuple<double, std::vector<std::size_t>>
-  observeFromJobId(const std::string &jobId) {
-    throw std::runtime_error(
-        "observeFromJobId not supported for this backend.");
-  }
-
-  virtual std::tuple<std::string, std::string>
-  sampleDetach(Kernel &thunk, const std::size_t shots, void *kernelArgs) {
-    throw std::runtime_error("sampleDetach not supported for this backend.");
-  }
-
-  /// Given a valid job id, return the sample result.
-  virtual std::vector<std::size_t> sampleFromJobId(const std::string &jobId) {
-    throw std::runtime_error("sampleFromJobId not supported for this backend.");
-  }
-
-  /// @brief Provide a hook for specifying a specific platform backend (e.g.
-  /// Quantinuum H1-2)
-  virtual void setSpecificBackend(const std::string &backend) {}
-
-  virtual ~TargetBackend() {}
-};
-} // namespace cudaq
diff --git a/tools/qpud/backends/default/CMakeLists.txt b/tools/qpud/backends/default/CMakeLists.txt
deleted file mode 100644
index 199cbe8900..0000000000
--- a/tools/qpud/backends/default/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under     #
-# the terms of the Apache License 2.0 which accompanies this distribution.     #
-# ============================================================================ #
-
-target_sources(qpud PRIVATE DefaultBackend.cpp)
\ No newline at end of file
diff --git a/tools/qpud/backends/default/DefaultBackend.cpp b/tools/qpud/backends/default/DefaultBackend.cpp
deleted file mode 100644
index 4376383ad4..0000000000
--- a/tools/qpud/backends/default/DefaultBackend.cpp
+++ /dev/null
@@ -1,258 +0,0 @@
-/*************************************************************** -*- C++ -*- ***
- * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- *******************************************************************************/
-
-#include "TargetBackend.h"
-#include "common/ExecutionContext.h"
-#include "cudaq/utils/registry.h"
-#include <cudaq/spin_op.h>
-
-// Instantiate the registry for all backends
-LLVM_INSTANTIATE_REGISTRY(cudaq::TargetBackend::RegistryType);
-
-using TuplePtr = int8_t *;
-struct Array;
-
-extern "C" {
-void __quantum__rt__setExecutionContext(cudaq::ExecutionContext *);
-void __quantum__rt__resetExecutionContext();
-int8_t *__quantum__rt__array_get_element_ptr_1d(Array *q, uint64_t);
-int64_t __quantum__rt__array_get_size_1d(Array *);
-Array *__quantum__rt__array_create_1d(int, int64_t);
-void __quantum__qis__exp__body(Array *paulis, double angle, Array *qubits);
-void __quantum__qis__measure__body(Array *, Array *);
-}
-
-using namespace cudaq;
-
-namespace cudaq {
-bool kernelHasConditionalFeedback(const std::string &);
-}
-namespace {
-
-Array *spinToArray(cudaq::spin_op &op) {
-  // How to pack the data:
-  // add all term data as correct pointer to double for x,y,z,or I.
-  // After each term add a pointer to real part of term coeff,
-  // add imag part of coeff.
-  // End the data array with the number of terms in the list
-  // x0 y1 - y0 x1 would be
-  // 1 3 coeff.real coeff.imag 3 1 coeff.real coeff.imag NTERMS
-  auto n_qubits = op.n_qubits();
-  auto n_terms = op.n_terms();
-
-  auto arr = __quantum__rt__array_create_1d(
-      sizeof(double), n_qubits * n_terms + 2 * n_terms + 1);
-  auto data = op.get_bsf();
-
-  for (std::size_t i = 0; i < n_terms; i++) {
-    auto term_data = data[i];
-    std::size_t row_size = n_qubits + 2;
-    for (std::size_t j = 0; j < row_size; j++) {
-      int8_t *ptr =
-          __quantum__rt__array_get_element_ptr_1d(arr, i * row_size + j);
-      auto ptr_el = reinterpret_cast<double *>(ptr);
-      if (j == n_qubits) {
-        *ptr_el = op.get_term_coefficient(i).real();
-        continue;
-      }
-      if (j == n_qubits + 1) {
-        *ptr_el = op.get_term_coefficient(i).imag();
-        break;
-      }
-
-      if (term_data[j] && term_data[j + n_qubits]) {
-        // Y term
-        *ptr_el = 3.0; // new double(3);
-      } else if (term_data[j]) {
-        // X term
-        *ptr_el = 1.0; // new double(1.0);
-      } else if (term_data[j + n_qubits]) {
-        // Z term
-        *ptr_el = 2.0; // new double(2);
-      } else {
-        *ptr_el = 0.0; // new double(0);
-      }
-    }
-  }
-
-  int8_t *ptr = __quantum__rt__array_get_element_ptr_1d(
-      arr, n_qubits * n_terms + 2 * n_terms);
-  auto ptr_el = reinterpret_cast<double *>(ptr);
-  *ptr_el = n_terms;
-  // cached_internal_data_rep = arr;
-  return arr;
-}
-
-double measure(cudaq::spin_op &term, ExecutionContext *ctx) {
-  Array *term_arr = spinToArray(term);
-  __quantum__qis__measure__body(term_arr, nullptr);
-  auto exp = ctx->expectationValue;
-  return exp.value();
-}
-
-// The DefaultBackend subclasses TargetBackend to provide
-// thunk function execution for base, sampling, and observation
-// by delegating directly to QIR calls. This type relies on
-// libnvqir being provided at link time.
-class DefaultBackend : public TargetBackend {
-public:
-  void initialize() override { return; }
-  DynamicResult baseExecute(Kernel &thunk, void *kernelArgs,
-                            bool isClientServer) override {
-    return thunk(kernelArgs, isClientServer);
-  }
-  std::vector<std::size_t> sample(Kernel &thunk, std::size_t shots,
-                                  void *kernelArgs) override {
-    cudaq::ExecutionContext ctx("sample", shots);
-
-    // First see if this Quake representation has conditional feedbackF
-    auto quakeCode = thunk.getQuakeCode();
-    ctx.hasConditionalsOnMeasureResults =
-        !quakeCode.empty() &&
-        quakeCode.find("qubitMeasurementFeedback = true") != std::string::npos;
-
-    __quantum__rt__setExecutionContext(&ctx);
-
-    sample_result counts;
-    if (ctx.hasConditionalsOnMeasureResults) {
-      // If it has conditionals, loop over individual circuit executions
-      for (auto &i : cudaq::range(shots)) {
-        // Run the kernel
-        thunk(kernelArgs, /*isClientServer=*/false);
-        // Reset the context and get the single measure result,
-        // add it to the sample_result and clear the context result
-        __quantum__rt__resetExecutionContext();
-        counts += ctx.result;
-        ctx.result.clear();
-        // Reset the context for the next round
-        if (i < (unsigned)shots)
-          __quantum__rt__setExecutionContext(&ctx);
-      }
-      return counts.serialize();
-    }
-
-    // Just run the kernel, context will get the sampling results
-    thunk(kernelArgs, /*isClientServer=*/false);
-    __quantum__rt__resetExecutionContext();
-    return ctx.result.serialize();
-  }
-  std::tuple<double, std::vector<std::size_t>>
-  observe(Kernel &thunk, std::vector<double> &spin_op_data,
-          const std::size_t shots, void *kernelArgs) override {
-
-    ExecutionContext ctx("observe");
-    ctx.shots = shots == 0 ? -1 : shots;
-
-    __quantum__rt__setExecutionContext(&ctx);
-    // default to never using CircuitSimulator::observe()
-    ctx.canHandleObserve = false;
-
-    auto n_terms = (int)spin_op_data.back();
-    auto nQubits = (spin_op_data.size() - 2 * n_terms) / n_terms;
-    cudaq::spin_op H(spin_op_data, nQubits);
-    double sum = 0.0;
-    thunk(kernelArgs, /*isClientServer=*/false);
-
-    std::vector<ExecutionResult> results;
-    for (std::size_t i = 0; i < H.n_terms(); i++) {
-      auto term = H[i];
-      if (!term.is_identity()) {
-        auto exp = measure(term, &ctx);
-        results.emplace_back(ctx.result.to_map(), term.to_string(false));
-        sum += term.get_term_coefficient(0).real() * exp;
-      } else {
-        sum += term.get_term_coefficient(0).real();
-      }
-    }
-    __quantum__rt__resetExecutionContext();
-    sample_result counts(results);
-    return std::make_tuple(sum, counts.serialize());
-  }
-
-  std::string genRandomString() {
-    const int len = 32;
-    static const char alphanum[] = "0123456789"
-                                   "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
-                                   "abcdefghijklmnopqrstuvwxyz";
-    std::string tmp_s;
-    tmp_s.reserve(len);
-
-    for (int i = 0; i < len; ++i) {
-      tmp_s += alphanum[rand() % (sizeof(alphanum) - 1)];
-    }
-
-    return tmp_s;
-  }
-
-  std::tuple<std::string, std::string>
-  sampleDetach(Kernel &thunk, std::size_t shots, void *kernelArgs) override {
-    cudaq::ExecutionContext ctx("sample", shots);
-    __quantum__rt__setExecutionContext(&ctx);
-    thunk(kernelArgs, /*isClientServer=*/false);
-    __quantum__rt__resetExecutionContext();
-    auto jobId = genRandomString();
-    detachedSampleResults.insert({jobId, ctx.result});
-    return std::make_tuple(jobId,
-                           std::string(thunk.name()) + std::string(".sample"));
-  }
-
-  std::tuple<std::vector<std::string>, std::vector<std::string>>
-  observeDetach(Kernel &thunk, std::vector<double> &spin_op_data,
-                const std::size_t shots, void *kernelArgs) override {
-    // Local declarations
-    ExecutionContext ctx("observe");
-    ctx.shots = shots == 0 ? -1 : shots;
-
-    __quantum__rt__setExecutionContext(&ctx);
-    auto n_terms = (int)spin_op_data.back();
-    auto nQubits = (spin_op_data.size() - 2 * n_terms) / n_terms;
-
-    cudaq::spin_op H(spin_op_data, nQubits);
-    thunk(kernelArgs, /*isClientServer=*/false);
-
-    std::vector<std::string> jobIds, jobNames;
-    for (std::size_t i = 0; i < H.n_terms(); i++) {
-      auto term = H[i];
-      if (!term.is_identity()) {
-        auto jobId = genRandomString();
-        auto exp = measure(term, &ctx);
-        detachedObserveResults.insert(
-            {jobId, std::make_pair(exp, ctx.result.serialize())});
-        jobIds.push_back(jobId);
-        jobNames.push_back(term.to_string(false));
-      }
-    }
-    __quantum__rt__resetExecutionContext();
-
-    return std::make_tuple(jobIds, jobNames);
-  }
-
-  std::tuple<double, std::vector<std::size_t>>
-  observeFromJobId(const std::string &jobId) override {
-    return detachedObserveResults[jobId];
-  }
-
-  std::vector<std::size_t> sampleFromJobId(const std::string &jobId) override {
-    return detachedSampleResults[jobId].serialize();
-  }
-
-  virtual ~DefaultBackend() = default;
-
-protected:
-  /// @brief For each detached sample task, store jobId -> results
-  std::map<std::string, sample_result> detachedSampleResults;
-
-  /// @brief For each detached observe task, store jobId -> tuple(ExpVal,
-  /// Serialized Counts)
-  std::map<std::string, std::tuple<double, std::vector<std::size_t>>>
-      detachedObserveResults;
-};
-
-} // namespace
-
-CUDAQ_REGISTER_TYPE(cudaq::TargetBackend, DefaultBackend, default);
diff --git a/tools/qpud/jit/CMakeLists.txt b/tools/qpud/jit/CMakeLists.txt
deleted file mode 100644
index ef2f70dba8..0000000000
--- a/tools/qpud/jit/CMakeLists.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under     #
-# the terms of the Apache License 2.0 which accompanies this distribution.     #
-# ============================================================================ #
-
-target_sources(qpud PRIVATE KernelJIT.cpp)
\ No newline at end of file
diff --git a/tools/qpud/jit/KernelJIT.cpp b/tools/qpud/jit/KernelJIT.cpp
deleted file mode 100644
index 909f88360e..0000000000
--- a/tools/qpud/jit/KernelJIT.cpp
+++ /dev/null
@@ -1,69 +0,0 @@
-/*************************************************************** -*- C++ -*- ***
- * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- *******************************************************************************/
-
-#include "KernelJIT.h"
-
-using namespace llvm;
-using namespace llvm::orc;
-
-namespace cudaq {
-KernelJIT::KernelJIT(std::unique_ptr<ExecutionSession> ES,
-                     JITTargetMachineBuilder JTMB, DataLayout DL,
-                     const std::vector<std::string> &extraLibraries,
-                     std::unique_ptr<LLVMContext> ctx)
-    : ES(std::move(ES)),
-      ObjectLayer(*this->ES,
-                  []() { return std::make_unique<SectionMemoryManager>(); }),
-      CompileLayer(*this->ES, ObjectLayer,
-                   std::make_unique<ConcurrentIRCompiler>(std::move(JTMB))),
-      DL(std::move(DL)), Ctx(std::move(ctx)),
-      MainJD(this->ES->createBareJITDylib("<main>")) {
-  MainJD.addGenerator(
-      cantFail(DynamicLibrarySearchGenerator::GetForCurrentProcess(
-          DL.getGlobalPrefix())));
-  for (auto &library : extraLibraries) {
-    MainJD.addGenerator(cantFail(DynamicLibrarySearchGenerator::Load(
-        library.data(), DL.getGlobalPrefix())));
-  }
-}
-
-KernelJIT::~KernelJIT() {
-  // End the session.
-  if (auto Err = ES->endSession()) {
-    ES->reportError(std::move(Err));
-  }
-}
-
-Expected<std::unique_ptr<KernelJIT>>
-KernelJIT::Create(const std::vector<std::string> &extraLibraries) {
-  auto EPC = SelfExecutorProcessControl::Create();
-  if (!EPC)
-    return EPC.takeError();
-
-  auto ES = std::make_unique<ExecutionSession>(std::move(*EPC));
-  JITTargetMachineBuilder JTMB(
-      ES->getExecutorProcessControl().getTargetTriple());
-
-  auto DL = JTMB.getDefaultDataLayoutForTarget();
-  if (!DL)
-    return DL.takeError();
-
-  return std::make_unique<KernelJIT>(std::move(ES), std::move(JTMB),
-                                     std::move(*DL), extraLibraries);
-}
-
-Error KernelJIT::addModule(std::unique_ptr<llvm::Module> M,
-                           std::vector<std::string> extra_paths) {
-  auto rt = MainJD.getDefaultResourceTracker();
-  return CompileLayer.add(rt, ThreadSafeModule(std::move(M), Ctx));
-}
-
-Expected<JITEvaluatedSymbol> KernelJIT::lookup(StringRef Name) {
-  return ES->lookup({&MainJD}, Name.str());
-}
-} // namespace cudaq
diff --git a/tools/qpud/jit/KernelJIT.h b/tools/qpud/jit/KernelJIT.h
deleted file mode 100644
index 1b32fa04e1..0000000000
--- a/tools/qpud/jit/KernelJIT.h
+++ /dev/null
@@ -1,77 +0,0 @@
-/*************************************************************** -*- C++ -*- ***
- * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- *******************************************************************************/
-
-#pragma once
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ExecutionEngine/JITSymbol.h"
-#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
-#include "llvm/ExecutionEngine/Orc/Core.h"
-#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
-#include "llvm/ExecutionEngine/Orc/ExecutorProcessControl.h"
-#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
-#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
-#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
-#include "llvm/ExecutionEngine/SectionMemoryManager.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/LLVMContext.h"
-#include <memory>
-
-using namespace llvm;
-using namespace llvm::orc;
-
-namespace cudaq {
-
-// The KernelJIT class wraps the LLVM JIT utility types to
-// take as input a llvm::Module and enable one to extract
-// a function pointer for the contained llvm::Functions.
-class KernelJIT {
-private:
-  // The LLVM ExecutionSession representing the JIT program
-  std::unique_ptr<ExecutionSession> ES;
-
-  // LLVM helper for object linking
-  RTDyldObjectLinkingLayer ObjectLayer;
-
-  // LLVM helper for compiling Modules
-  IRCompileLayer CompileLayer;
-
-  // Representation of the target triple
-  DataLayout DL;
-
-  // Thread-safe LLVM Context
-  ThreadSafeContext Ctx;
-
-  // LLVM Helper representing JIT dynamic library
-  JITDylib &MainJD;
-
-public:
-  // The constructor, not meant to be used publicly, see KernelJIT::Create()
-  KernelJIT(std::unique_ptr<ExecutionSession> ES, JITTargetMachineBuilder JTMB,
-            DataLayout DL, const std::vector<std::string> &extraLibraries,
-            std::unique_ptr<LLVMContext> ctx = std::make_unique<LLVMContext>());
-
-  // The destructor
-  ~KernelJIT();
-
-  // Static creation method for the KernelJIT
-  static Expected<std::unique_ptr<KernelJIT>>
-  Create(const std::vector<std::string> &extraLibraries);
-
-  // Add an LLVM Module to be JIT compiled, optionally
-  // provide extra linker paths to search.
-  Error addModule(std::unique_ptr<llvm::Module> M,
-                  std::vector<std::string> extra_paths = {});
-
-  // Lookup and return a symbol JIT compiled from the Module
-  // i.e. get a handle to a specific compiled function
-  // and cast to a function pointer to invoke.
-  Expected<JITEvaluatedSymbol> lookup(StringRef Name);
-};
-
-} // namespace cudaq
diff --git a/tools/qpud/qpud.cpp b/tools/qpud/qpud.cpp
deleted file mode 100644
index cdeaaf6888..0000000000
--- a/tools/qpud/qpud.cpp
+++ /dev/null
@@ -1,419 +0,0 @@
-/*************************************************************** -*- C++ -*- ***
- * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- *******************************************************************************/
-
-#include "KernelJIT.h"
-#include "NvidiaPlatformHelper.h"
-#include "TargetBackend.h"
-#include "common/Logger.h"
-#include "cudaq/Optimizer/Dialect/CC/CCDialect.h"
-#include "cudaq/Optimizer/Dialect/Quake/QuakeDialect.h"
-#include "cudaq/utils/cudaq_utils.h"
-#include "nvqpp_config.h"
-#include "rpc/server.h"
-#include "rpc/this_handler.h"
-#include "llvm/Support/TargetSelect.h"
-#include "mlir/Dialect/Affine/IR/AffineOps.h"
-#include "mlir/Dialect/Arith/IR/Arith.h"
-#include "mlir/Dialect/Func/IR/FuncOps.h"
-#include "mlir/Dialect/LLVMIR/LLVMDialect.h"
-#include "mlir/Dialect/Math/IR/Math.h"
-#include "mlir/Dialect/MemRef/IR/MemRef.h"
-#include "mlir/Dialect/SCF/IR/SCF.h"
-#include "mlir/IR/MLIRContext.h"
-#include "mlir/InitAllPasses.h"
-#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
-#include <charconv>
-#include <dlfcn.h>
-#include <filesystem>
-#include <fstream>
-#include <sstream>
-#include <thread>
-#include <unordered_map>
-
-namespace cudaq {
-/// Flag that stops the server and exits the qpud process
-static std::atomic<bool> _stopServer = false;
-
-/// Storage for loaded Kernels
-static std::unordered_map<std::string, Kernel> loadedThunkSymbols;
-
-/// Storage for the JIT engine used for each kernel
-std::unordered_map<std::string, std::unique_ptr<KernelJIT>> jitStorage;
-
-/// Pointer to the targeted QPUD backend
-std::unique_ptr<TargetBackend> backend = nullptr;
-
-/// Pointer to the global MLIR context
-std::unique_ptr<mlir::MLIRContext> mlirContext;
-
-/// @brief Respond to the client with an error and
-/// return from the calling function.
-template <typename RetType>
-RetType returnWithError(const std::string &errorMsg) {
-  rpc::this_handler().respond_error(errorMsg);
-  if constexpr (std::is_same_v<void, RetType>)
-    return;
-  else
-    return RetType{};
-}
-
-/// @brief Utility function that will let us wrap our
-/// backend invocations in a try-catch to better report errors to the client.
-template <typename Functor, typename R = typename std::invoke_result_t<Functor>>
-auto backendInvokeHandleErrors(Functor &&functor, const std::string &errorMsg) {
-  try {
-    return functor();
-  } catch (std::exception &e) {
-    auto msg = errorMsg + " " + std::string(e.what());
-    return returnWithError<R>(msg);
-  }
-}
-
-/// @brief Stop the server.
-void stopServer() { _stopServer = true; }
-
-/// @brief Reset the backend to the given target backend
-/// @param backend
-void setTargetBackend(const std::string &backend) {
-  cudaq::info("Setting qpud backend to {}", backend);
-  std::string mutableName = backend, subBackend = "";
-  auto split = cudaq::split(backend, ':');
-  if (split.size() > 1) {
-    mutableName = split[0];
-    subBackend = split[1];
-  }
-
-  // Set the backend, check that it is valid
-  cudaq::backend = cudaq::registry::get<cudaq::TargetBackend>(mutableName);
-  if (!cudaq::backend)
-    returnWithError<void>("Invalid target backend. (" + backend + ")");
-
-  // Set the sub backend if we have one
-  if (!subBackend.empty())
-    cudaq::backend->setSpecificBackend(subBackend);
-
-  return;
-}
-
-bool getIsSimulator() { return cudaq::backend->isSimulator(); }
-bool getSupportsConditionalFeedback() {
-  return cudaq::backend->supportsConditionalFeedback();
-}
-
-/// @brief If it has not been loaded, JIT the provided quakeCode to LLVM.
-/// @param kernelName
-/// @param quakeCode
-void loadQuakeCode(const std::string &kernelName, const std::string &quakeCode,
-                   const std::vector<std::string> &extraLibraries) {
-  cudaq::ScopedTrace trace("qpud::loadQuakeCode", kernelName, extraLibraries);
-
-  if (!cudaq::backend->isInitialized())
-    cudaq::backend->initialize();
-
-  // Will need to JIT the quakeCode to LLVM only
-  // Load as MLIR Module, run the PassManager to lower to LLVM Dialect
-  // Translate to LLVM Module and use MLIR ExecutionEngine
-  // add to loadedThunkSymbols
-  if (!loadedThunkSymbols.count(kernelName)) {
-    // Get the LLVM Module from the backend compile phase
-    // Default will lower quake to QIR llvm, others may
-    // lower to the QIR base profile.
-    auto llvmModule = cudaq::backend->compile(*mlirContext.get(), quakeCode);
-    if (!llvmModule)
-      returnWithError<void>(
-          "[qpud::loadQuake] Failed to lower quake code to LLVM IR: " +
-          kernelName);
-
-    std::string qirCode;
-    llvm::raw_string_ostream os(qirCode);
-    llvmModule->print(os, nullptr);
-    os.flush();
-    // Create and store the KernelJIT instance, get pointer to it
-    auto result = jitStorage.insert(
-        {kernelName, cantFail(KernelJIT::Create(extraLibraries))});
-    auto kernelJIT = result.first->second.get();
-
-    // Add the LLVM Module, Get the KERNEL.thunk function pointer
-    cantFail(kernelJIT->addModule(std::move(llvmModule)),
-             "Could not load the llvm::Module for thunk JIT.");
-
-    // Ensure we have the thunk symbol
-    std::string symbolName = kernelName + ".thunk";
-    if (quakeCode.find(symbolName) == std::string::npos) {
-      return returnWithError<void>(
-          symbolName +
-          " symbol not available. Please compile with --enable-mlir.");
-    }
-
-    // Apple for some reason prepends a "_"
-#if defined(__APPLE__) && defined(__MACH__)
-    symbolName = "_" + symbolName;
-#endif
-
-    // Get the thunk symbol
-    auto symbol =
-        cantFail(kernelJIT->lookup(symbolName), "Could not find the symbol");
-    auto *thunkFunctor =
-        reinterpret_cast<DynamicResult (*)(void *, bool)>(symbol.getAddress());
-
-    // Store the thunk pointer.
-    loadedThunkSymbols.insert(
-        {kernelName, Kernel(thunkFunctor, kernelName, qirCode, quakeCode)});
-  }
-}
-
-/// @brief Direct the server to execute the kernel with given name and provided
-/// runtime arguments.
-/// @param kernelName name of the kernel to execute
-/// @param args vector<uint8_t> representation of the void* kernelArgs.
-/// @return The modified kernelArgs (the return value is in there)
-std::vector<uint8_t> executeKernel(const std::string &kernelName,
-                                   std::vector<uint8_t> args) {
-  cudaq::ScopedTrace trace("qpud::executeKernel", kernelName);
-
-  if (!cudaq::backend->isInitialized())
-    cudaq::backend->initialize();
-
-  auto f_iter = loadedThunkSymbols.find(kernelName);
-  if (f_iter == loadedThunkSymbols.end())
-    returnWithError<std::vector<uint8_t>>(
-        "[qpud::base_exec] Invalid CUDA Quantum kernel name: " + kernelName);
-
-  auto function = f_iter->second;
-  auto raw_args = static_cast<void *>(args.data());
-
-  return backendInvokeHandleErrors(
-      [&]() -> std::vector<uint8_t> {
-        auto res = cudaq::backend->baseExecute(function, raw_args,
-                                               /*isClientServer=*/true);
-        if (!res.ptr)
-          return args;
-        return {&res.ptr[0], &res.ptr[res.len]};
-      },
-      "Error in base execute.");
-}
-
-/// @brief Sample the state generated by the kernel with given name
-/// @param kernelName name of the kernel to execute
-/// @param shots
-/// @param args vector<uint8_t> representation of the void* kernelArgs.
-/// @return The modified kernelArgs (the return value is in there)
-std::vector<std::size_t> sampleKernel(const std::string &kernelName,
-                                      std::size_t shots,
-                                      std::vector<uint8_t> args) {
-  cudaq::ScopedTrace trace("qpud::sampleKernel", kernelName, shots);
-
-  if (!cudaq::backend->isInitialized())
-    cudaq::backend->initialize();
-
-  auto f_iter = loadedThunkSymbols.find(kernelName);
-  if (f_iter == loadedThunkSymbols.end())
-    returnWithError<std::vector<std::size_t>>(
-        "[qpud::sample] Invalid CUDA Quantum kernel name: " + kernelName);
-
-  auto function = f_iter->second;
-  auto raw_args = static_cast<void *>(args.data());
-  return backendInvokeHandleErrors(
-      [&]() { return backend->sample(function, shots, raw_args); },
-      "Error in sample.");
-}
-
-/// @brief Observe the state generated by the kernel with the given spin
-/// operator
-/// @param kernelName name of the kernel to execute
-/// @param spin_op_data The operator representation <kernel|H|kernel, H.
-/// @param args vector<uint8_t> representation of the void* kernelArgs.
-/// @return
-std::tuple<double, std::vector<std::size_t>>
-observeKernel(const std::string &kernelName, std::vector<double> spin_op_data,
-              const std::size_t shots, std::vector<uint8_t> &args) {
-  cudaq::ScopedTrace trace("qpud::observeKernel", kernelName, shots);
-  if (!cudaq::backend->isInitialized())
-    cudaq::backend->initialize();
-
-  auto f_iter = loadedThunkSymbols.find(kernelName);
-  if (f_iter == loadedThunkSymbols.end())
-    returnWithError<double>(
-        "[qpud::observe] Invalid CUDA Quantum kernel name: " + kernelName);
-
-  auto function = f_iter->second;
-  auto raw_args = static_cast<void *>(args.data());
-  return backendInvokeHandleErrors(
-      [&]() {
-        return backend->observe(function, spin_op_data, shots, raw_args);
-      },
-      "Error in observe.");
-}
-
-/// @brief Observe the state generated by the kernel with the given spin
-/// operator, but immediately return with the Job ID information.
-/// @param kernelName name of the kernel to execute
-/// @param spin_op_data The operator representation <kernel|H|kernel, H.
-/// @param args vector<uint8_t> representation of the void* kernelArgs.
-/// @return
-std::tuple<std::vector<std::string>, std::vector<std::string>>
-observeKernelDetach(const std::string &kernelName,
-                    std::vector<double> spin_op_data, const std::size_t shots,
-                    std::vector<uint8_t> &args) {
-  cudaq::ScopedTrace trace("qpud::observeKernelDetach", kernelName, shots);
-
-  if (!cudaq::backend->isInitialized())
-    backend->initialize();
-
-  auto f_iter = loadedThunkSymbols.find(kernelName);
-  if (f_iter == loadedThunkSymbols.end())
-    returnWithError<double>(
-        "[qpud::observe] Invalid CUDA Quantum kernel name: " + kernelName);
-
-  auto function = f_iter->second;
-  auto raw_args = static_cast<void *>(args.data());
-  return backendInvokeHandleErrors(
-      [&]() {
-        return backend->observeDetach(function, spin_op_data, shots, raw_args);
-      },
-      "Error in detached observe.");
-}
-
-/// @brief Produce the result from a detached observe job
-/// @param jobId
-/// @return
-std::tuple<double, std::vector<std::size_t>>
-observeKernelFromJobId(const std::string &jobId) {
-  cudaq::ScopedTrace trace("qpud::observeKernelFromJobId", jobId);
-
-  if (!cudaq::backend->isInitialized())
-    backend->initialize();
-  return backendInvokeHandleErrors(
-      [&]() { return backend->observeFromJobId(jobId); },
-      "Error in observe from Job ID.");
-}
-
-/// @brief Sample the given kernel asynchronously, detach and return the job
-/// information
-std::tuple<std::string, std::string>
-sampleKernelDetach(const std::string &kernelName, const std::size_t shots,
-                   std::vector<uint8_t> &args) {
-  cudaq::ScopedTrace trace("qpud::sampleKernelDetach", kernelName, shots);
-  if (!cudaq::backend->isInitialized())
-    backend->initialize();
-
-  auto f_iter = loadedThunkSymbols.find(kernelName);
-  if (f_iter == loadedThunkSymbols.end())
-    returnWithError<double>(
-        "[qpud::sampleDetach] Invalid CUDA Quantum kernel name: " + kernelName);
-
-  auto function = f_iter->second;
-  auto raw_args = static_cast<void *>(args.data());
-  return backendInvokeHandleErrors(
-      [&]() { return backend->sampleDetach(function, shots, raw_args); },
-      "Error in detached sample.");
-}
-
-/// @brief Produce the result from a detached observe job
-/// @param jobId
-/// @return
-std::vector<std::size_t> sampleKernelFromJobId(const std::string &jobId) {
-  cudaq::ScopedTrace trace("qpud::sampleKernelFromJobId", jobId);
-
-  if (!cudaq::backend->isInitialized())
-    backend->initialize();
-  return backendInvokeHandleErrors(
-      [&]() { return backend->sampleFromJobId(jobId); },
-      "Error in sample from Job ID.");
-}
-
-} // namespace cudaq
-
-int main(int argc, char **argv) {
-  int qpu_id = 0, port = 8888;
-  std::vector<std::string> args(&argv[0], &argv[0] + argc);
-  for (std::size_t i = 0; i < args.size(); i++) {
-    if (args[i] == "--qpu") {
-      if (i == args.size() - 1) {
-        llvm::errs() << "--qpu specified but no qpu id provided.\n";
-        return -1;
-      }
-      std::string arg = args[i + 1];
-      auto [ptr, ec] =
-          std::from_chars(arg.data(), arg.data() + arg.size(), qpu_id);
-      if (ec == std::errc::invalid_argument) {
-        llvm::errs() << "[qpud] Invalid QPU ID (" << arg
-                     << "). Provide an integer [0,N_QPUS).\n";
-        return -1;
-      }
-    }
-
-    if (args[i] == "--port") {
-      if (i == args.size() - 1) {
-        llvm::errs() << "--port specified but no port provided.\n";
-        return -1;
-      }
-      std::string arg = args[i + 1];
-      auto [ptr, ec] =
-          std::from_chars(arg.data(), arg.data() + arg.size(), port);
-      if (ec == std::errc::invalid_argument) {
-        llvm::errs() << "[qpud] Invalid Port (" << arg << ").\n";
-        return -1;
-      }
-    }
-  }
-
-  // One time initialization of LLVM
-  llvm::InitializeNativeTarget();
-  llvm::InitializeNativeTargetAsmPrinter();
-  mlir::registerAllPasses();
-
-  // One time initialization of MLIR
-  mlir::DialectRegistry registry;
-  registry.insert<cudaq::cc::CCDialect, quake::QuakeDialect,
-                  mlir::math::MathDialect, mlir::scf::SCFDialect,
-                  mlir::LLVM::LLVMDialect, mlir::arith::ArithDialect,
-                  mlir::AffineDialect, mlir::memref::MemRefDialect,
-                  mlir::func::FuncDialect>();
-  cudaq::mlirContext = std::make_unique<mlir::MLIRContext>(registry);
-  cudaq::mlirContext->loadAllAvailableDialects();
-  // Register the translation to LLVM IR with the MLIR context.
-  mlir::registerLLVMDialectTranslation(*cudaq::mlirContext.get());
-
-  // Create the Default Target Backend.
-  cudaq::backend = cudaq::registry::get<cudaq::TargetBackend>("default");
-
-  // Create the server and bind the functions
-  std::unique_ptr<rpc::server> server;
-  try {
-    server = std::make_unique<rpc::server>(port);
-    server->bind("loadQuakeCode", &cudaq::loadQuakeCode);
-    server->bind("executeKernel", &cudaq::executeKernel);
-    server->bind("sampleKernel", &cudaq::sampleKernel);
-    server->bind("sampleKernelDetach", &cudaq::sampleKernelDetach);
-    server->bind("sampleKernelFromJobId", &cudaq::sampleKernelFromJobId);
-    server->bind("observeKernel", &cudaq::observeKernel);
-    server->bind("observeKernelFromJobId", &cudaq::observeKernelFromJobId);
-    server->bind("observeKernelDetach", &::cudaq::observeKernelDetach);
-    server->bind("setTargetBackend", &cudaq::setTargetBackend);
-    server->bind("getIsSimulator", &cudaq::getIsSimulator);
-    server->bind("getSupportsConditionalFeedback",
-                 &cudaq::getSupportsConditionalFeedback);
-    server->bind("stopServer", &cudaq::stopServer);
-
-    cudaq::NvidiaPlatformHelper helper;
-    helper.setQPU(qpu_id);
-
-    server->async_run();
-  } catch (std::exception &e) {
-    printf("%s\n", e.what());
-    return -1;
-  }
-
-  while (true) {
-    std::this_thread::sleep_for(std::chrono::milliseconds(100));
-    if (cudaq::_stopServer)
-      break;
-  }
-  return 0;
-}
diff --git a/tools/qpud/utils/CMakeLists.txt b/tools/qpud/utils/CMakeLists.txt
deleted file mode 100644
index e2605bbe32..0000000000
--- a/tools/qpud/utils/CMakeLists.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under     #
-# the terms of the Apache License 2.0 which accompanies this distribution.     #
-# ============================================================================ #
-
-# If we have CUDA, then we want to be able to 
-# call cudaSetDevice.
-if(CUDA_FOUND)
-  target_sources(qpud PRIVATE NvidiaPlatformHelper.cu)
-  target_link_libraries(qpud PRIVATE ${CUDA_LIBRARIES})
-else()
-  message(STATUS "CUDA Not Found, QPUD using FakeNvidiaPlatformHelper")
-  target_sources(qpud PRIVATE FakeNvidiaPlatformHelper.cpp)
-endif()
\ No newline at end of file
diff --git a/tools/qpud/utils/FakeNvidiaPlatformHelper.cpp b/tools/qpud/utils/FakeNvidiaPlatformHelper.cpp
deleted file mode 100644
index c7589caa91..0000000000
--- a/tools/qpud/utils/FakeNvidiaPlatformHelper.cpp
+++ /dev/null
@@ -1,17 +0,0 @@
-/*************************************************************** -*- C++ -*- ***
- * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- *******************************************************************************/
-
-#include "NvidiaPlatformHelper.h"
-
-void cudaq::NvidiaPlatformHelper::createLogicalToPhysicalDeviceMap() {}
-
-std::size_t cudaq::NvidiaPlatformHelper::setQPU(const std::size_t deviceID) {
-  return 0;
-}
-
-int cudaq::NvidiaPlatformHelper::getNumQPUs() { return 0; }
diff --git a/tools/qpud/utils/NvidiaPlatformHelper.cu b/tools/qpud/utils/NvidiaPlatformHelper.cu
deleted file mode 100644
index 664c3672d7..0000000000
--- a/tools/qpud/utils/NvidiaPlatformHelper.cu
+++ /dev/null
@@ -1,40 +0,0 @@
-/*************************************************************** -*- C++ -*- ***
- * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- *******************************************************************************/
-
-#include <sys/wait.h>
-#include <unistd.h>
-
-#include <map>
-#include <string>
-#include <utility>
-
-#include "NvidiaPlatformHelper.h"
-#include "cuda_runtime_api.h"
-namespace cudaq {
-void NvidiaPlatformHelper::createLogicalToPhysicalDeviceMap() {
-  int nDevices;
-  cudaGetDeviceCount(&nDevices);
-  int counter = 0;
-  for (int i = 0; i < nDevices; i++) {
-    cudaDeviceProp prop;
-    cudaGetDeviceProperties(&prop, i);
-    if (std::string(prop.name).find("Display") == std::string::npos) {
-      logical_to_physical_device_id.insert({counter, i});
-      counter++;
-    }
-  }
-}
-std::size_t NvidiaPlatformHelper::setQPU(const std::size_t deviceID) {
-  int currentQPU = logical_to_physical_device_id[deviceID];
-  cudaSetDevice(currentQPU);
-  return currentQPU;
-}
-int NvidiaPlatformHelper::getNumQPUs() {
-  return logical_to_physical_device_id.size();
-}
-} // namespace cudaq
diff --git a/tools/qpud/utils/NvidiaPlatformHelper.h b/tools/qpud/utils/NvidiaPlatformHelper.h
deleted file mode 100644
index 1f4b3aea96..0000000000
--- a/tools/qpud/utils/NvidiaPlatformHelper.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/*************************************************************** -*- C++ -*- ***
- * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- *******************************************************************************/
-
-#pragma once
-#include <map>
-
-namespace cudaq {
-
-/// This handles how the DGX QunatumPlatform interfaces with the GPU(s).
-class NvidiaPlatformHelper {
-public:
-  NvidiaPlatformHelper() = default;
-  ~NvidiaPlatformHelper() = default;
-
-  /// Initialize the device map.
-  void createLogicalToPhysicalDeviceMap();
-
-  /// Set the GPU corresponding to the QPU deviceID.
-  std::size_t setQPU(const std::size_t deviceID);
-
-  /// Get the number of "QPUs" (GPUs)
-  int getNumQPUs();
-
-private:
-  /// This maps the IDs of the QPUs to the GPU devices.
-  std::map<std::size_t, std::size_t> logical_to_physical_device_id;
-};
-} // namespace cudaq
diff --git a/tpls/rpclib b/tpls/rpclib
deleted file mode 160000
index 08e4171956..0000000000
--- a/tpls/rpclib
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit 08e4171956e4749e2f7188e4181793df959a5bda
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index 9e73ae113c..722a1ad1e5 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -103,18 +103,6 @@ target_link_libraries(test_spin
   gtest_main)
 gtest_discover_tests(test_spin)
 
-add_executable(test_qpud_client main.cpp qpud_client/QPUDClientTester.cpp)
-# Need to force the link to nvqir-qpp here if gcc.
-if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT APPLE)
-  target_link_options(test_qpud_client PRIVATE -Wl,--no-as-needed)
-endif()
-target_link_libraries(test_qpud_client
-  PRIVATE 
-  cudaq-qpud-client
-  nvqir-qpp
-  gtest_main)
-gtest_discover_tests(test_qpud_client)
-
 add_subdirectory(backends)
 add_subdirectory(pass)
 add_subdirectory(Optimizer)
diff --git a/unittests/Optimizer/CMakeLists.txt b/unittests/Optimizer/CMakeLists.txt
index 931c5d5dc1..a6f6901062 100644
--- a/unittests/Optimizer/CMakeLists.txt
+++ b/unittests/Optimizer/CMakeLists.txt
@@ -29,7 +29,6 @@ target_link_libraries(test_quake_synth
   cudaq-mlir-runtime
   cudaq-builder
   cudaq-common
-  cudaq-qpud-client
   cudaq-platform-default
   nvqir-qpp 
   nvqir
diff --git a/unittests/Optimizer/QuakeSynthTester.cpp b/unittests/Optimizer/QuakeSynthTester.cpp
index e9e5995b98..642805761e 100644
--- a/unittests/Optimizer/QuakeSynthTester.cpp
+++ b/unittests/Optimizer/QuakeSynthTester.cpp
@@ -34,7 +34,7 @@ typedef std::size_t (*Creator)(void **, void **);
 /// Retrieve the kernel args creator function for the kernel name
 Creator getArgsCreator(const std::string &);
 
-/// @brief Utility function for mapping variadic args to qpud required void*,
+/// @brief Utility function for mapping variadic args to required void*,
 /// size_t. Note clients of this function own the allocated rawArgs.
 template <typename... Args>
 std::pair<void *, std::size_t> mapToRawArgs(const std::string &kernelName,
diff --git a/unittests/qpud_client/QPUDClientTester.cpp b/unittests/qpud_client/QPUDClientTester.cpp
deleted file mode 100644
index 52f05cb86f..0000000000
--- a/unittests/qpud_client/QPUDClientTester.cpp
+++ /dev/null
@@ -1,594 +0,0 @@
-/*************************************************************** -*- C++ -*- ***
- * Copyright (c) 2022 - 2023 NVIDIA Corporation & Affiliates.                  *
- * All rights reserved.                                                        *
- *                                                                             *
- * This source code and the accompanying materials are made available under    *
- * the terms of the Apache License 2.0 which accompanies this distribution.    *
- *******************************************************************************/
-
-#include <gtest/gtest.h>
-
-#include "cudaq/utils/registry.h"
-
-#include <cudaq.h>
-
-#include "qpud_client.h"
-
-TEST(QPUDClientTester, checkSample) {
-
-  const std::string_view quakeCode =
-      R"#(module attributes {qtx.mangled_name_map = {__nvqpp__mlirgen__ghz = "_ZN3ghzclEi"}} {
-  func.func @__nvqpp__mlirgen__ghz(%arg0: i32) {
-    %c0_i64 = arith.constant 0 : i64
-    %c1_i32 = arith.constant 1 : i32
-    %c0_i32 = arith.constant 0 : i32
-    %0 = memref.alloca() : memref<i32>
-    memref.store %arg0, %0[] : memref<i32>
-    %1 = memref.load %0[] : memref<i32>
-    %2 = arith.extsi %1 : i32 to i64
-    %3 = quake.alloca(%2 : i64) : !quake.qvec<?>
-    %4 = quake.qextract %3[%c0_i64] : !quake.qvec<?>[i64]  -> !quake.qref
-    quake.h (%4)
-    cc.scope {
-      %9 = memref.alloca() : memref<i32>
-      memref.store %c0_i32, %9[] : memref<i32>
-      cc.loop while {
-        %10 = memref.load %9[] : memref<i32>
-        %11 = memref.load %0[] : memref<i32>
-        %12 = arith.subi %11, %c1_i32 : i32
-        %13 = arith.cmpi slt, %10, %12 : i32
-        cc.condition %13
-      } do {
-        cc.scope {
-          %10 = memref.load %9[] : memref<i32>
-          %11 = arith.extsi %10 : i32 to i64
-          %12 = quake.qextract %3[%11] : !quake.qvec<?>[i64] -> !quake.qref
-          %13 = memref.load %9[] : memref<i32>
-          %14 = arith.addi %13, %c1_i32 : i32
-          %15 = arith.extsi %14 : i32 to i64
-          %16 = quake.qextract %3[%15] : !quake.qvec<?>[i64] -> !quake.qref
-          quake.x [%12 : !quake.qref] (%16)
-        }
-        cc.continue
-      } step {
-        %10 = memref.load %9[] : memref<i32>
-        %11 = arith.addi %10, %c1_i32 : i32
-        memref.store %11, %9[] : memref<i32>
-      }
-    }
-    %5 = quake.qvec_size %3 : (!quake.qvec<?>) -> i64
-    %6 = arith.index_cast %5 : i64 to index
-    %7 = llvm.alloca %5 x i1 : (i64) -> !llvm.ptr<i1>
-    affine.for %arg1 = 0 to %6 {
-      %9 = quake.qextract %3[%arg1] : !quake.qvec<?>[index] -> !quake.qref
-      %10 = quake.mz(%9 : !quake.qref) : i1
-      %11 = arith.index_cast %arg1 : index to i64
-      %12 = llvm.getelementptr %7[%11] : (!llvm.ptr<i1>, i64) -> !llvm.ptr<i1>
-      llvm.store %10, %12 : !llvm.ptr<i1>
-    }
-    %8 = cc.stdvec_init %7, %5 : (!llvm.ptr<i1>, i64) -> !cc.stdvec<i1>
-    return
-  }
-  func.func private @__nvqpp_zeroDynamicResult() -> !llvm.struct<(ptr<i8>, i64)> {
-    %c0_i64 = arith.constant 0 : i64
-    %0 = llvm.inttoptr %c0_i64 : i64 to !llvm.ptr<i8>
-    %1 = llvm.mlir.undef : !llvm.struct<(ptr<i8>, i64)>
-    %2 = llvm.insertvalue %0, %1[0] : !llvm.struct<(ptr<i8>, i64)> 
-    %3 = llvm.insertvalue %c0_i64, %2[1] : !llvm.struct<(ptr<i8>, i64)> 
-    return %3 : !llvm.struct<(ptr<i8>, i64)>
-  }
-  func.func @ghz.thunk(%arg0: !llvm.ptr<i8>, %arg1: i1) -> !llvm.struct<(ptr<i8>, i64)> {
-    %0 = llvm.bitcast %arg0 : !llvm.ptr<i8> to !llvm.ptr<struct<(i32)>>
-    %1 = llvm.load %0 : !llvm.ptr<struct<(i32)>>
-    %2 = llvm.mlir.constant(0 : i64) : i64
-    %3 = llvm.inttoptr %2 : i64 to !llvm.ptr<struct<(i32)>>
-    %4 = llvm.getelementptr %3[1] : (!llvm.ptr<struct<(i32)>>) -> !llvm.ptr<struct<(i32)>>
-    %5 = llvm.ptrtoint %4 : !llvm.ptr<struct<(i32)>> to i64
-    %6 = llvm.getelementptr %arg0[%5] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
-    %7 = llvm.extractvalue %1[0] : !llvm.struct<(i32)> 
-    call @__nvqpp__mlirgen__ghz(%7) : (i32) -> ()
-    %nil = call @__nvqpp_zeroDynamicResult() : () -> !llvm.struct<(ptr<i8>, i64)>
-    return %nil : !llvm.struct<(ptr<i8>, i64)>
-  }
-})#";
-
-  std::size_t shots = 500;
-  cudaq::registry::deviceCodeHolderAdd("ghz", quakeCode.data());
-
-  // Here is the main qpud_client sampling workflow
-
-  // Create the client
-  cudaq::qpud_client client;
-
-  // Create a struct defining the runtime args for the kernel
-  struct KernelArgs {
-    int N = 5;
-  } args;
-
-  // Map those args to a void pointer and its associated size
-  auto [rawArgs, size, resultOff] = client.process_args(args);
-
-  // Invoke the sampling workflow, get the MeasureCounts
-  auto counts = client.sample("ghz", shots, rawArgs, size);
-
-  // Test the results.
-  int counter = 0;
-  for (auto &[bits, count] : counts) {
-    counter += count;
-    EXPECT_TRUE(bits == "00000" || bits == "11111");
-  }
-  EXPECT_EQ(counter, shots);
-
-  counts.dump();
-
-  // Try it again with the simpler API
-  counts = client.sample("ghz", shots, args);
-  counter = 0;
-  for (auto &[bits, count] : counts) {
-    counter += count;
-    EXPECT_TRUE(bits == "00000" || bits == "11111");
-  }
-  EXPECT_EQ(counter, shots);
-}
-
-TEST(QPUDClientTester, checkObserve) {
-
-  const std::string_view quakeCode =
-      R"#(module attributes {qtx.mangled_name_map = {__nvqpp__mlirgen__ansatz = "_ZN6ansatzclEd"}} {
-  func.func @__nvqpp__mlirgen__ansatz(%arg0: f64) {
-    %c0_i64 = arith.constant 0 : i64
-    %c1_i64 = arith.constant 1 : i64
-    %0 = memref.alloca() : memref<f64>
-    memref.store %arg0, %0[] : memref<f64>
-    %1 = quake.alloca : !quake.qvec<2>
-    %2 = quake.qextract %1[%c0_i64] : !quake.qvec<2>[i64] -> !quake.qref
-    quake.x (%2)
-    %3 = memref.load %0[] : memref<f64>
-    %4 = quake.qextract %1[%c1_i64] : !quake.qvec<2>[i64] -> !quake.qref
-    quake.ry |%3 : f64|(%4)
-    %5 = quake.qextract %1[%c1_i64] : !quake.qvec<2>[i64] -> !quake.qref
-    %6 = quake.qextract %1[%c0_i64] : !quake.qvec<2>[i64] -> !quake.qref
-    quake.x [%5 : !quake.qref] (%6)
-    return
-  }
-  func.func private @__nvqpp_zeroDynamicResult() -> !llvm.struct<(ptr<i8>, i64)> {
-    %c0_i64 = arith.constant 0 : i64
-    %0 = llvm.inttoptr %c0_i64 : i64 to !llvm.ptr<i8>
-    %1 = llvm.mlir.undef : !llvm.struct<(ptr<i8>, i64)>
-    %2 = llvm.insertvalue %0, %1[0] : !llvm.struct<(ptr<i8>, i64)> 
-    %3 = llvm.insertvalue %c0_i64, %2[1] : !llvm.struct<(ptr<i8>, i64)> 
-    return %3 : !llvm.struct<(ptr<i8>, i64)>
-  }
-  func.func @ansatz.thunk(%arg0: !llvm.ptr<i8>, %arg1: i1) -> !llvm.struct<(ptr<i8>, i64)> {
-    %0 = llvm.bitcast %arg0 : !llvm.ptr<i8> to !llvm.ptr<struct<(f64)>>
-    %1 = llvm.load %0 : !llvm.ptr<struct<(f64)>>
-    %2 = llvm.mlir.constant(0 : i64) : i64
-    %3 = llvm.inttoptr %2 : i64 to !llvm.ptr<struct<(f64)>>
-    %4 = llvm.getelementptr %3[1] : (!llvm.ptr<struct<(f64)>>) -> !llvm.ptr<struct<(f64)>>
-    %5 = llvm.ptrtoint %4 : !llvm.ptr<struct<(f64)>> to i64
-    %6 = llvm.getelementptr %arg0[%5] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
-    %7 = llvm.extractvalue %1[0] : !llvm.struct<(f64)> 
-    call @__nvqpp__mlirgen__ansatz(%7) : (f64) -> ()
-    %nil = call @__nvqpp_zeroDynamicResult() : () -> !llvm.struct<(ptr<i8>, i64)>
-    return %nil : !llvm.struct<(ptr<i8>, i64)>
-  }
-})#";
-
-  cudaq::registry::deviceCodeHolderAdd("ansatz", quakeCode.data());
-
-  // Here is the main qpud_client sampling workflow
-
-  // Create the client
-  cudaq::qpud_client client;
-
-  // Create a struct defining the runtime args for the kernel
-  struct KernelArgs {
-    double theta = 0.59;
-  } args;
-
-  // Map those args to a void pointer and its associated size
-  auto [rawArgs, size, resultOff] = client.process_args(args);
-
-  using namespace cudaq::spin;
-  cudaq::spin_op h = 5.907 - 2.1433 * x(0) * x(1) - 2.1433 * y(0) * y(1) +
-                     .21829 * z(0) - 6.125 * z(1);
-  double expVal = client.observe("ansatz", h, rawArgs, size);
-
-  EXPECT_NEAR(expVal, -1.74, 1e-2);
-
-  // Try it again with the simpler API
-  expVal = client.observe("ansatz", h, args);
-  EXPECT_NEAR(expVal, -1.74, 1e-2);
-}
-
-TEST(QPUDClientTester, checkExecute) {
-
-  const std::string_view quakeCode =
-      R"#(module attributes {qtx.mangled_name_map = {__nvqpp__mlirgen__super = "_ZN5superclEd"}} {
-  func.func @__nvqpp__mlirgen__super(%arg0: f64) -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"} {
-    %alloca = memref.alloca() : memref<f64>
-    memref.store %arg0, %alloca[] : memref<f64>
-    %0 = quake.alloca : !quake.qref
-    %1 = memref.load %alloca[] : memref<f64>
-    quake.rx |%1 : f64|(%0)
-    %2 = memref.load %alloca[] : memref<f64>
-    %cst = arith.constant 2.000000e+00 : f64
-    %3 = arith.divf %2, %cst : f64
-    quake.ry |%3 : f64|(%0)
-    %4 = quake.mz(%0 : !quake.qref) : i1
-    return %4 : i1
-  }
-  func.func private @__nvqpp_zeroDynamicResult() -> !llvm.struct<(ptr<i8>, i64)> {
-    %c0_i64 = arith.constant 0 : i64
-    %0 = llvm.inttoptr %c0_i64 : i64 to !llvm.ptr<i8>
-    %1 = llvm.mlir.undef : !llvm.struct<(ptr<i8>, i64)>
-    %2 = llvm.insertvalue %0, %1[0] : !llvm.struct<(ptr<i8>, i64)> 
-    %3 = llvm.insertvalue %c0_i64, %2[1] : !llvm.struct<(ptr<i8>, i64)> 
-    return %3 : !llvm.struct<(ptr<i8>, i64)>
-  }
-  func.func @super.thunk(%arg0: !llvm.ptr<i8>, %arg1: i1) -> !llvm.struct<(ptr<i8>, i64)> {
-    %0 = llvm.bitcast %arg0 : !llvm.ptr<i8> to !llvm.ptr<struct<(f64, i1)>>
-    %1 = llvm.load %0 : !llvm.ptr<struct<(f64, i1)>>
-    %2 = llvm.mlir.constant(0 : i64) : i64
-    %3 = llvm.inttoptr %2 : i64 to !llvm.ptr<struct<(f64, i1)>>
-    %4 = llvm.getelementptr %3[1] : (!llvm.ptr<struct<(f64, i1)>>) -> !llvm.ptr<struct<(f64, i1)>>
-    %5 = llvm.ptrtoint %4 : !llvm.ptr<struct<(f64, i1)>> to i64
-    %6 = llvm.getelementptr %arg0[%5] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
-    %7 = llvm.extractvalue %1[0] : !llvm.struct<(f64, i1)> 
-    %8 = call @__nvqpp__mlirgen__super(%7) : (f64) -> i1
-    %9 = llvm.getelementptr %0[0, 1] : (!llvm.ptr<struct<(f64, i1)>>) -> !llvm.ptr<i1>
-    llvm.store %8, %9 : !llvm.ptr<i1>
-    %10 = call @__nvqpp_zeroDynamicResult() : () -> !llvm.struct<(ptr<i8>, i64)>
-    return %10 : !llvm.struct<(ptr<i8>, i64)>
-  }
-})#";
-
-  cudaq::registry::deviceCodeHolderAdd("super", quakeCode.data());
-
-  // Here is the main qpud_client sampling workflow
-
-  // Create the client
-  cudaq::qpud_client client;
-
-  // Create a struct defining the runtime args for the kernel
-  struct KernelArgs {
-    double theta = M_PI;
-    bool retVal;
-  } args;
-
-  // Map those args to a void pointer and its associated size
-  auto [rawArgs, size, resultOff] = client.process_args(args);
-
-  for (std::size_t i = 0; i < 10; i++) {
-    client.execute("super", rawArgs, size, resultOff);
-    auto retVal = args.retVal;
-    EXPECT_TRUE(retVal == 0 || retVal == 1);
-  }
-}
-
-std::size_t ghzArgsCreator(void **packedArgs, void **argMem) {
-  struct KernelArgs {
-    int N;
-  };
-
-  // This is freed by sample_detach...
-  KernelArgs *args = new KernelArgs();
-  args->N = *reinterpret_cast<int *>(packedArgs[0]);
-  *argMem = args;
-  return sizeof(KernelArgs);
-}
-
-TEST(QPUDClientTester, checkSampleDetached) {
-
-  const std::string_view quakeCode =
-      R"#(module attributes {qtx.mangled_name_map = {__nvqpp__mlirgen__ghz = "_ZN3ghzclEi"}} {
-  func.func @__nvqpp__mlirgen__ghz(%arg0: i32) {
-    %c0_i64 = arith.constant 0 : i64
-    %c1_i32 = arith.constant 1 : i32
-    %c0_i32 = arith.constant 0 : i32
-    %0 = memref.alloca() : memref<i32>
-    memref.store %arg0, %0[] : memref<i32>
-    %1 = memref.load %0[] : memref<i32>
-    %2 = arith.extsi %1 : i32 to i64
-    %3 = quake.alloca(%2 : i64) : !quake.qvec<?>
-    %4 = quake.qextract %3[%c0_i64] : !quake.qvec<?>[i64] -> !quake.qref
-    quake.h (%4)
-    cc.scope {
-      %9 = memref.alloca() : memref<i32>
-      memref.store %c0_i32, %9[] : memref<i32>
-      cc.loop while {
-        %10 = memref.load %9[] : memref<i32>
-        %11 = memref.load %0[] : memref<i32>
-        %12 = arith.subi %11, %c1_i32 : i32
-        %13 = arith.cmpi slt, %10, %12 : i32
-        cc.condition %13
-      } do {
-        cc.scope {
-          %10 = memref.load %9[] : memref<i32>
-          %11 = arith.extsi %10 : i32 to i64
-          %12 = quake.qextract %3[%11] : !quake.qvec<?>[i64] -> !quake.qref
-          %13 = memref.load %9[] : memref<i32>
-          %14 = arith.addi %13, %c1_i32 : i32
-          %15 = arith.extsi %14 : i32 to i64
-          %16 = quake.qextract %3[%15] : !quake.qvec<?>[i64] -> !quake.qref
-          quake.x [%12 : !quake.qref] (%16)
-        }
-        cc.continue
-      } step {
-        %10 = memref.load %9[] : memref<i32>
-        %11 = arith.addi %10, %c1_i32 : i32
-        memref.store %11, %9[] : memref<i32>
-      }
-    }
-    %5 = quake.qvec_size %3 : (!quake.qvec<?>) -> i64
-    %6 = arith.index_cast %5 : i64 to index
-    %7 = llvm.alloca %5 x i1 : (i64) -> !llvm.ptr<i1>
-    affine.for %arg1 = 0 to %6 {
-      %9 = quake.qextract %3[%arg1] : !quake.qvec<?>[index] -> !quake.qref
-      %10 = quake.mz(%9 : !quake.qref) : i1
-      %11 = arith.index_cast %arg1 : index to i64
-      %12 = llvm.getelementptr %7[%11] : (!llvm.ptr<i1>, i64) -> !llvm.ptr<i1>
-      llvm.store %10, %12 : !llvm.ptr<i1>
-    }
-    %8 = cc.stdvec_init %7, %5 : (!llvm.ptr<i1>, i64) -> !cc.stdvec<i1>
-    return
-  }
-  func.func private @__nvqpp_zeroDynamicResult() -> !llvm.struct<(ptr<i8>, i64)> {
-    %c0_i64 = arith.constant 0 : i64
-    %0 = llvm.inttoptr %c0_i64 : i64 to !llvm.ptr<i8>
-    %1 = llvm.mlir.undef : !llvm.struct<(ptr<i8>, i64)>
-    %2 = llvm.insertvalue %0, %1[0] : !llvm.struct<(ptr<i8>, i64)> 
-    %3 = llvm.insertvalue %c0_i64, %2[1] : !llvm.struct<(ptr<i8>, i64)> 
-    return %3 : !llvm.struct<(ptr<i8>, i64)>
-  }
-  func.func @ghz.thunk(%arg0: !llvm.ptr<i8>, %arg1: i1) -> !llvm.struct<(ptr<i8>, i64)> {
-    %0 = llvm.bitcast %arg0 : !llvm.ptr<i8> to !llvm.ptr<struct<(i32)>>
-    %1 = llvm.load %0 : !llvm.ptr<struct<(i32)>>
-    %2 = llvm.mlir.constant(0 : i64) : i64
-    %3 = llvm.inttoptr %2 : i64 to !llvm.ptr<struct<(i32)>>
-    %4 = llvm.getelementptr %3[1] : (!llvm.ptr<struct<(i32)>>) -> !llvm.ptr<struct<(i32)>>
-    %5 = llvm.ptrtoint %4 : !llvm.ptr<struct<(i32)>> to i64
-    %6 = llvm.getelementptr %arg0[%5] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
-    %7 = llvm.extractvalue %1[0] : !llvm.struct<(i32)> 
-    call @__nvqpp__mlirgen__ghz(%7) : (i32) -> ()
-    %nil = call @__nvqpp_zeroDynamicResult() : () -> !llvm.struct<(ptr<i8>, i64)>
-    return %nil : !llvm.struct<(ptr<i8>, i64)>
-  }
-})#";
-
-  std::size_t shots = 500;
-  cudaq::registry::deviceCodeHolderAdd("ghz", quakeCode.data());
-  std::size_t (*ptr)(void **, void **);
-  ptr = ghzArgsCreator;
-  cudaq::registry::cudaqRegisterArgsCreator("ghz",
-                                            reinterpret_cast<char *>(ptr));
-
-  // Here is the main qpud_client sampling workflow
-
-  // Create the client
-  auto job = cudaq::sample_detach("ghz", shots, 5);
-  std::cout << job[0].id << "\n";
-
-  auto counts = cudaq::sample(job);
-  int counter = 0;
-  for (auto &[bits, count] : counts) {
-    counter += count;
-    EXPECT_TRUE(bits == "00000" || bits == "11111");
-  }
-  EXPECT_EQ(counter, shots);
-
-  counts.dump();
-}
-
-std::size_t ansatzArgsCreator(void **packedArgs, void **argMem) {
-  struct KernelArgs {
-    double theta;
-  };
-
-  KernelArgs *args = new KernelArgs();
-  args->theta = *reinterpret_cast<double *>(packedArgs[0]);
-  *argMem = args;
-
-  return sizeof(KernelArgs);
-}
-
-TEST(QPUDClientTester, checkObserveDetached) {
-
-  const std::string_view quakeCode =
-      R"#(module attributes {qtx.mangled_name_map = {__nvqpp__mlirgen__ansatz = "_ZN6ansatzclEd"}} {
-  func.func @__nvqpp__mlirgen__ansatz(%arg0: f64) {
-    %c0_i64 = arith.constant 0 : i64
-    %c1_i64 = arith.constant 1 : i64
-    %0 = memref.alloca() : memref<f64>
-    memref.store %arg0, %0[] : memref<f64>
-    %1 = quake.alloca : !quake.qvec<2>
-    %2 = quake.qextract %1[%c0_i64] : !quake.qvec<2>[i64] -> !quake.qref
-    quake.x (%2)
-    %3 = memref.load %0[] : memref<f64>
-    %4 = quake.qextract %1[%c1_i64] : !quake.qvec<2>[i64] -> !quake.qref
-    quake.ry |%3 : f64|(%4)
-    %5 = quake.qextract %1[%c1_i64] : !quake.qvec<2>[i64] -> !quake.qref
-    %6 = quake.qextract %1[%c0_i64] : !quake.qvec<2>[i64] -> !quake.qref
-    quake.x [%5 : !quake.qref] (%6)
-    return
-  }
-  func.func private @__nvqpp_zeroDynamicResult() -> !llvm.struct<(ptr<i8>, i64)> {
-    %c0_i64 = arith.constant 0 : i64
-    %0 = llvm.inttoptr %c0_i64 : i64 to !llvm.ptr<i8>
-    %1 = llvm.mlir.undef : !llvm.struct<(ptr<i8>, i64)>
-    %2 = llvm.insertvalue %0, %1[0] : !llvm.struct<(ptr<i8>, i64)> 
-    %3 = llvm.insertvalue %c0_i64, %2[1] : !llvm.struct<(ptr<i8>, i64)> 
-    return %3 : !llvm.struct<(ptr<i8>, i64)>
-  }
-  func.func @ansatz.thunk(%arg0: !llvm.ptr<i8>, %arg1: i1) -> !llvm.struct<(ptr<i8>, i64)> {
-    %0 = llvm.bitcast %arg0 : !llvm.ptr<i8> to !llvm.ptr<struct<(f64)>>
-    %1 = llvm.load %0 : !llvm.ptr<struct<(f64)>>
-    %2 = llvm.mlir.constant(0 : i64) : i64
-    %3 = llvm.inttoptr %2 : i64 to !llvm.ptr<struct<(f64)>>
-    %4 = llvm.getelementptr %3[1] : (!llvm.ptr<struct<(f64)>>) -> !llvm.ptr<struct<(f64)>>
-    %5 = llvm.ptrtoint %4 : !llvm.ptr<struct<(f64)>> to i64
-    %6 = llvm.getelementptr %arg0[%5] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
-    %7 = llvm.extractvalue %1[0] : !llvm.struct<(f64)> 
-    call @__nvqpp__mlirgen__ansatz(%7) : (f64) -> ()
-    %nil = call @__nvqpp_zeroDynamicResult() : () -> !llvm.struct<(ptr<i8>, i64)>
-    return %nil : !llvm.struct<(ptr<i8>, i64)>
-  }
-})#";
-
-  cudaq::registry::deviceCodeHolderAdd("ansatz", quakeCode.data());
-  std::size_t (*ptr)(void **, void **);
-  ptr = ansatzArgsCreator;
-  cudaq::registry::cudaqRegisterArgsCreator("ansatz",
-                                            reinterpret_cast<char *>(ptr));
-
-  using namespace cudaq::spin;
-  cudaq::spin_op h = 5.907 - 2.1433 * x(0) * x(1) - 2.1433 * y(0) * y(1) +
-                     .21829 * z(0) - 6.125 * z(1);
-  {
-    auto jobs = cudaq::observe_detach("ansatz", h, 0.59);
-    for (auto &j : jobs) {
-      printf("%s %s\n", j.id.data(), j.name.data());
-    }
-    double res = cudaq::observe(h, jobs);
-
-    EXPECT_NEAR(res, -1.74, 1e-2);
-  }
-
-  // Observe with shots
-  {
-    std::size_t shots = 10000;
-    auto jobs = cudaq::observe_detach("ansatz", shots, h, 0.59);
-    for (auto &j : jobs) {
-      printf("%s %s\n", j.id.data(), j.name.data());
-    }
-    auto res = cudaq::observe(h, jobs);
-
-    EXPECT_NEAR(res.exp_val_z(), -1.74, 1e-1);
-    res.dump();
-    auto x0x1Counts = res.counts(x(0) * x(1));
-    x0x1Counts.dump();
-    EXPECT_TRUE(x0x1Counts.size() == 4);
-
-    auto z1Counts = res.counts(z(1));
-    z1Counts.dump();
-
-    EXPECT_EQ(2, z1Counts.size());
-  }
-}
-
-std::size_t ansatzArgsCreatorVector(void **packedArgs, void **argMem) {
-  struct KernelArgs {
-    std::size_t size;
-    double data;
-  };
-
-  // This is effectively what the automated code is doing.
-  auto vector = *reinterpret_cast<std::vector<double> *>(packedArgs[0]);
-  KernelArgs *args = new KernelArgs();
-  args->size = sizeof(double);
-  args->data = vector[0];
-  *argMem = args;
-
-  return sizeof(std::size_t) + sizeof(double);
-}
-
-TEST(QPUDClientTester, checkObserveDetachedWithVector) {
-
-  const std::string_view quakeCode =
-      R"#(module attributes {qtx.mangled_name_map = {__nvqpp__mlirgen__ansatzVec = "_ZN6ansatzVecclESt6vectorIdSaIdEE"}} {
-  func.func @__nvqpp__mlirgen__ansatzVec(%arg0: !cc.stdvec<f64>) attributes {"cudaq-entrypoint"} {
-    %c2_i32 = arith.constant 2 : i32
-    %0 = arith.extsi %c2_i32 : i32 to i64
-    %1 = quake.alloca(%0 : i64) : !quake.qvec<?>
-    %c0_i32 = arith.constant 0 : i32
-    %2 = arith.extsi %c0_i32 : i32 to i64
-    %3 = quake.qextract %1[%2] : !quake.qvec<?>[i64] -> !quake.qref
-    quake.x (%3)
-    %c0_i32_0 = arith.constant 0 : i32
-    %4 = arith.extsi %c0_i32_0 : i32 to i64
-    %5 = cc.stdvec_data %arg0 : (!cc.stdvec<f64>) -> !llvm.ptr<f64>
-    %6 = llvm.getelementptr %5[%4] : (!llvm.ptr<f64>, i64) -> !llvm.ptr<f64>
-    %7 = llvm.load %6 : !llvm.ptr<f64>
-    %c1_i32 = arith.constant 1 : i32
-    %8 = arith.extsi %c1_i32 : i32 to i64
-    %9 = quake.qextract %1[%8] : !quake.qvec<?>[i64] -> !quake.qref
-    quake.ry |%7 : f64|(%9)
-    %c1_i32_1 = arith.constant 1 : i32
-    %10 = arith.extsi %c1_i32_1 : i32 to i64
-    %11 = quake.qextract %1[%10] : !quake.qvec<?>[i64] -> !quake.qref
-    %c0_i32_2 = arith.constant 0 : i32
-    %12 = arith.extsi %c0_i32_2 : i32 to i64
-    %13 = quake.qextract %1[%12] : !quake.qvec<?>[i64] -> !quake.qref
-    quake.x [%11 : !quake.qref] (%13)
-    return
-  }
-  func.func private @__nvqpp_zeroDynamicResult() -> !llvm.struct<(ptr<i8>, i64)> {
-    %c0_i64 = arith.constant 0 : i64
-    %0 = llvm.inttoptr %c0_i64 : i64 to !llvm.ptr<i8>
-    %1 = llvm.mlir.undef : !llvm.struct<(ptr<i8>, i64)>
-    %2 = llvm.insertvalue %0, %1[0] : !llvm.struct<(ptr<i8>, i64)> 
-    %3 = llvm.insertvalue %c0_i64, %2[1] : !llvm.struct<(ptr<i8>, i64)> 
-    return %3 : !llvm.struct<(ptr<i8>, i64)>
-  }
-  func.func @ansatzVec.thunk(%arg0: !llvm.ptr<i8>, %arg1: i1) -> !llvm.struct<(ptr<i8>, i64)> {
-    %0 = llvm.bitcast %arg0 : !llvm.ptr<i8> to !llvm.ptr<struct<(i64)>>
-    %1 = llvm.load %0 : !llvm.ptr<struct<(i64)>>
-    %2 = llvm.mlir.constant(0 : i64) : i64
-    %3 = llvm.inttoptr %2 : i64 to !llvm.ptr<struct<(i64)>>
-    %4 = llvm.getelementptr %3[1] : (!llvm.ptr<struct<(i64)>>) -> !llvm.ptr<struct<(i64)>>
-    %5 = llvm.ptrtoint %4 : !llvm.ptr<struct<(i64)>> to i64
-    %6 = llvm.getelementptr %arg0[%5] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
-    %7 = llvm.extractvalue %1[0] : !llvm.struct<(i64)> 
-    %8 = llvm.mlir.constant(8 : i64) : i64
-    %9 = llvm.sdiv %7, %8  : i64
-    %10 = llvm.bitcast %6 : !llvm.ptr<i8> to !llvm.ptr<f64>
-    %11 = cc.stdvec_init %10, %9 : (!llvm.ptr<f64>, i64) -> !cc.stdvec<f64>
-    %12 = llvm.getelementptr %6[%7] : (!llvm.ptr<i8>, i64) -> !llvm.ptr<i8>
-    call @__nvqpp__mlirgen__ansatzVec(%11) : (!cc.stdvec<f64>) -> ()
-    %nil = call @__nvqpp_zeroDynamicResult() : () -> !llvm.struct<(ptr<i8>, i64)>
-    return %nil : !llvm.struct<(ptr<i8>, i64)>
-  }
-})#";
-
-  cudaq::registry::deviceCodeHolderAdd("ansatzVec", quakeCode.data());
-  std::size_t (*ptr)(void **, void **);
-  ptr = ansatzArgsCreatorVector;
-  cudaq::registry::cudaqRegisterArgsCreator("ansatzVec",
-                                            reinterpret_cast<char *>(ptr));
-
-  using namespace cudaq::spin;
-  cudaq::spin_op h = 5.907 - 2.1433 * x(0) * x(1) - 2.1433 * y(0) * y(1) +
-                     .21829 * z(0) - 6.125 * z(1);
-  {
-    auto jobs =
-        cudaq::observe_detach("ansatzVec", h, std::vector<double>{0.59});
-    for (auto &j : jobs) {
-      printf("%s %s\n", j.id.data(), j.name.data());
-    }
-    double res = cudaq::observe(h, jobs);
-
-    EXPECT_NEAR(res, -1.74, 1e-2);
-  }
-
-  // Observe with shots
-  {
-    std::size_t shots = 10000;
-    auto jobs =
-        cudaq::observe_detach("ansatzVec", shots, h, std::vector<double>{0.59});
-    for (auto &j : jobs) {
-      printf("%s %s\n", j.id.data(), j.name.data());
-    }
-    auto res = cudaq::observe(h, jobs);
-
-    EXPECT_NEAR(res.exp_val_z(), -1.74, 1e-1);
-    res.dump();
-    auto x0x1Counts = res.counts(x(0) * x(1));
-    x0x1Counts.dump();
-    EXPECT_TRUE(x0x1Counts.size() == 4);
-
-    auto z1Counts = res.counts(z(1));
-    z1Counts.dump();
-
-    EXPECT_EQ(2, z1Counts.size());
-  }
-}