NVIDIA · 1tnguyen · Mar 21, 2024 · Dec 5, 2023 · Dec 5, 2023 · Jan 27, 2024
diff --git a/runtime/common/ObserveResult.h b/runtime/common/ObserveResult.h
@@ -35,16 +35,13 @@ class observe_result {
 
   /// @brief Constructor, takes the precomputed expectation value for
   /// <psi(x) | H | psi(x)> for the given spin_op.
-  observe_result(double &e, const spin_op &H) : expVal(e), spinOp(H) {}
+  observe_result(double e, const spin_op &H) : expVal(e), spinOp(H) {}
 
   /// @brief Constructor, takes the precomputed expectation value for
   /// <psi(x) | H | psi(x)> for the given spin_op. If this execution
   /// was shots based, also provide the sample_result data containing counts
   /// for each term in H.
-  observe_result(double &e, const spin_op &H, sample_result counts)
-      : expVal(e), spinOp(H), data(counts) {}
-
-  observe_result(double &&e, const spin_op &H, sample_result counts)
+  observe_result(double e, const spin_op &H, sample_result counts)
       : expVal(e), spinOp(H), data(counts) {}
 
   /// @brief Return the raw counts data for all terms

diff --git a/runtime/cudaq/platform/qpu.h b/runtime/cudaq/platform/qpu.h
@@ -65,9 +65,8 @@ class QPU : public registry::RegisteredType<QPU> {
       // and computing <ZZ..ZZZ>
       if (localContext->canHandleObserve) {
         auto [exp, data] = cudaq::measure(H);
-        results.emplace_back(data.to_map(), H.to_string(false), exp);
         localContext->expectationValue = exp;
-        localContext->result = cudaq::sample_result(results);
+        localContext->result = data;
       } else {
 
         // Loop over each term and compute coeff * <term>

diff --git a/runtime/cudaq/qis/managers/default/DefaultExecutionManager.cpp b/runtime/cudaq/qis/managers/default/DefaultExecutionManager.cpp
@@ -162,8 +162,8 @@ class DefaultExecutionManager : public cudaq::BasicExecutionManager {
 
     if (executionContext->canHandleObserve) {
       auto result = simulator()->observe(*executionContext->spin.value());
-      executionContext->expectationValue = result.expectationValue;
-      executionContext->result = cudaq::sample_result(result);
+      executionContext->expectationValue = result.expectation();
+      executionContext->result = result.raw_data();
       return;
     }
 

diff --git a/runtime/nvqir/CircuitSimulator.h b/runtime/nvqir/CircuitSimulator.h
@@ -126,7 +126,7 @@ class CircuitSimulator {
 
   /// @brief Compute the expected value of the given spin op
   /// with respect to the current state, <psi | H | psi>.
-  virtual cudaq::ExecutionResult observe(const cudaq::spin_op &term) = 0;
+  virtual cudaq::observe_result observe(const cudaq::spin_op &term) = 0;
 
   /// @brief Allocate a single qubit, return the qubit as a logical index
   virtual std::size_t allocateQubit() = 0;
@@ -661,16 +661,23 @@ class CircuitSimulatorBase : public CircuitSimulator {
 
   /// @brief Return true if expectation values should be computed from
   /// sampling + parity of bit strings.
-  bool shouldObserveFromSampling() {
+  /// Default is to enable observe from sampling, i.e., simulating the
+  /// change-of-basis circuit for each term.
+  ///
+  /// The environment variable "CUDAQ_OBSERVE_FROM_SAMPLING" can be used to turn
+  /// on or off this setting.
+  bool shouldObserveFromSampling(bool defaultConfig = true) {
     if (auto envVar = std::getenv(observeSamplingEnvVar); envVar) {
       std::string asString = envVar;
       std::transform(asString.begin(), asString.end(), asString.begin(),
                      [](auto c) { return std::tolower(c); });
       if (asString == "false" || asString == "off" || asString == "0")
         return false;
+      if (asString == "true" || asString == "on" || asString == "1")
+        return true;
     }
 
-    return true;
+    return defaultConfig;
   }
 
 public:
@@ -690,7 +697,7 @@ class CircuitSimulatorBase : public CircuitSimulator {
 
   /// @brief Compute the expected value of the given spin op
   /// with respect to the current state, <psi | H | psi>.
-  cudaq::ExecutionResult observe(const cudaq::spin_op &term) override {
+  cudaq::observe_result observe(const cudaq::spin_op &term) override {
     throw std::runtime_error("This CircuitSimulator does not implement "
                              "observe(const cudaq::spin_op &).");
   }

diff --git a/runtime/nvqir/NVQIR.cpp b/runtime/nvqir/NVQIR.cpp
@@ -420,8 +420,8 @@ Result *__quantum__qis__measure__body(Array *pauli_arr, Array *qubits) {
   if (currentContext->canHandleObserve) {
     circuitSimulator->flushGateQueue();
     auto result = circuitSimulator->observe(*currentContext->spin.value());
-    currentContext->expectationValue = result.expectationValue;
-    currentContext->result = cudaq::sample_result(result);
+    currentContext->expectationValue = result.expectation();
+    currentContext->result = result.raw_data();
     return ResultZero;
   }
 

diff --git a/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu b/runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu
@@ -188,10 +188,10 @@ protected:
       controls32.push_back((int)c);
     custatevecPauli_t pauli[] = {pauliStringToEnum(gate.name())};
     int targets[] = {(int)qubitIdx};
-    custatevecApplyPauliRotation(handle, deviceStateVector,
-                                 cuStateVecCudaDataType, nQubitsAllocated,
-                                 -0.5 * angle, pauli, targets, 1,
-                                 controls32.data(), nullptr, controls32.size());
+    HANDLE_ERROR(custatevecApplyPauliRotation(
+        handle, deviceStateVector, cuStateVecCudaDataType, nQubitsAllocated,
+        -0.5 * angle, pauli, targets, 1, controls32.data(), nullptr,
+        controls32.size()));
   }
 
   /// @brief Increase the state size by the given number of qubits.
@@ -439,39 +439,102 @@ public:
   /// via sampling
   bool canHandleObserve() override {
     // Do not compute <H> from matrix if shots based sampling requested
-    if (executionContext &&
+    // i.e., a valid shots count value was set.
+    // Note: -1 is also used to denote non-sampling execution. Hence, we need to
+    // check for this particular -1 value as being casted to an unsigned type.
+    if (executionContext && executionContext->shots > 0 &&
         executionContext->shots != static_cast<std::size_t>(-1)) {
       return false;
     }
 
-    /// Seems that FP32 is faster with
-    /// custatevecComputeExpectationsOnPauliBasis
-    if constexpr (std::is_same_v<ScalarType, float>) {
-      return false;
-    }
-
-    return !shouldObserveFromSampling();
+    // If no shots requested (exact expectation calulation), don't use
+    // term-by-term observe as the default since
+    // `CuStateVecCircuitSimulator::observe` will do a batched expectation value
+    // calculation to compute all expectation values for all terms at once.
+    return !shouldObserveFromSampling(/*defaultConfig=*/false);
   }
 
   /// @brief Compute the expected value from the observable matrix.
-  cudaq::ExecutionResult observe(const cudaq::spin_op &op) override {
+  cudaq::observe_result observe(const cudaq::spin_op &op) override {
+    {
+      cudaq::ScopedTrace trace(
+          "CuStateVecCircuitSimulator::observe - flushGateQueue");
+      flushGateQueue();
+    }
 
-    flushGateQueue();
+    // Use batched custatevecComputeExpectationsOnPauliBasis to compute all term
+    // expectation values in one go
+    uint32_t nPauliOperatorArrays = op.num_terms();
+    // Stable holders of vectors since we need to send vectors of pointers to
+    // custatevec
+    std::deque<std::vector<custatevecPauli_t>> pauliOperatorsArrayHolder;
+    std::deque<std::vector<int32_t>> basisBitsArrayHolder;
+    std::vector<const custatevecPauli_t *> pauliOperatorsArray;
+    std::vector<const int32_t *> basisBitsArray;
+    std::vector<std::complex<double>> coeffs;
+    std::vector<uint32_t> nBasisBitsArray;
+    pauliOperatorsArray.reserve(nPauliOperatorArrays);
+    basisBitsArray.reserve(nPauliOperatorArrays);
+    coeffs.reserve(nPauliOperatorArrays);
+    nBasisBitsArray.reserve(nPauliOperatorArrays);
+    // Helper to convert Pauli enums
+    const auto cudaqToCustateVec = [](cudaq::pauli pauli) -> custatevecPauli_t {
+      switch (pauli) {
+      case cudaq::pauli::I:
+        return CUSTATEVEC_PAULI_I;
+      case cudaq::pauli::X:
+        return CUSTATEVEC_PAULI_X;
+      case cudaq::pauli::Y:
+        return CUSTATEVEC_PAULI_Y;
+      case cudaq::pauli::Z:
+        return CUSTATEVEC_PAULI_Z;
+      }
+      __builtin_unreachable();
+    };
 
-    // The op is on the following target bits.
-    std::set<std::size_t> targets;
+    // Contruct data to send on to custatevec
+    std::vector<std::string> termStrs;
+    termStrs.reserve(nPauliOperatorArrays);
     op.for_each_term([&](cudaq::spin_op &term) {
-      term.for_each_pauli(
-          [&](cudaq::pauli p, std::size_t idx) { targets.insert(idx); });
+      coeffs.emplace_back(term.get_coefficient());
+      std::vector<custatevecPauli_t> paulis;
+      std::vector<int32_t> idxs;
+      paulis.reserve(term.num_qubits());
+      idxs.reserve(term.num_qubits());
+      term.for_each_pauli([&](cudaq::pauli p, std::size_t idx) {
+        if (p != cudaq::pauli::I) {
+          paulis.emplace_back(cudaqToCustateVec(p));
+          idxs.emplace_back(idx);
+        }
+      });
+      pauliOperatorsArrayHolder.emplace_back(std::move(paulis));
+      basisBitsArrayHolder.emplace_back(std::move(idxs));
+      pauliOperatorsArray.emplace_back(pauliOperatorsArrayHolder.back().data());
+      basisBitsArray.emplace_back(basisBitsArrayHolder.back().data());
+      nBasisBitsArray.emplace_back(pauliOperatorsArrayHolder.back().size());
+      termStrs.emplace_back(term.to_string(false));
     });
-
-    std::vector<std::size_t> targetsVec(targets.begin(), targets.end());
-
-    // Get the matrix
-    auto matrix = op.to_matrix();
-    /// Compute the expectation value.
-    auto ee = getExpectationFromOperatorMatrix(matrix.data(), targetsVec);
-    return cudaq::ExecutionResult({}, ee);
+    std::vector<double> expectationValues(nPauliOperatorArrays);
+    {
+      cudaq::ScopedTrace trace("CuStateVecCircuitSimulator::observe - "
+                               "custatevecComputeExpectationsOnPauliBasis");
+      HANDLE_ERROR(custatevecComputeExpectationsOnPauliBasis(
+          handle, deviceStateVector, cuStateVecCudaDataType, nQubitsAllocated,
+          expectationValues.data(), pauliOperatorsArray.data(),
+          nPauliOperatorArrays, basisBitsArray.data(), nBasisBitsArray.data()));
+    }
+    std::complex<double> expVal = 0.0;
+    std::vector<cudaq::ExecutionResult> results;
+    results.reserve(nPauliOperatorArrays);
+    for (uint32_t i = 0; i < nPauliOperatorArrays; ++i) {
+      expVal += coeffs[i] * expectationValues[i];
+      results.emplace_back(
+          cudaq::ExecutionResult({}, termStrs[i], expectationValues[i]));
+    }
+    cudaq::sample_result perTermData(static_cast<double>(expVal.real()),
+                                     results);
+    return cudaq::observe_result(static_cast<double>(expVal.real()), op,
+                                 perTermData);
   }
 
   /// @brief Sample the multi-qubit state.

diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.cpp b/runtime/nvqir/cutensornet/simulator_cutensornet.cpp
@@ -186,7 +186,7 @@ SimulatorTensorNetBase::sample(const std::vector<std::size_t> &measuredBits,
       allZTerm.at(m_state->getNumQubits() + m) = 1;
     cudaq::spin_op allZ(allZTerm, 1.0);
     // Just compute the expected value on <Z...Z>
-    return observe(allZ);
+    return cudaq::ExecutionResult({}, observe(allZ).expectation());
   }
 
   prepareQubitTensorState();
@@ -224,7 +224,7 @@ static nvqir::CutensornetExecutor *getPluginInstance() {
   return fcn();
 }
 /// @brief Evaluate the expectation value of a given observable
-cudaq::ExecutionResult
+cudaq::observe_result
 SimulatorTensorNetBase::observe(const cudaq::spin_op &ham) {
   LOG_API_TIME();
   prepareQubitTensorState();
@@ -238,13 +238,13 @@ SimulatorTensorNetBase::observe(const cudaq::spin_op &ham) {
     for (std::size_t i = 0; i < terms.size(); ++i) {
       expVal += (coeffs[i] * termExpVals[i]);
     }
-    return cudaq::ExecutionResult({}, expVal.real());
+    return cudaq::observe_result(expVal.real(), ham);
   } else {
     TensorNetworkSpinOp spinOp(ham, m_cutnHandle);
     std::complex<double> expVal =
         m_state->computeExpVal(spinOp.getNetworkOperator());
     expVal += spinOp.getIdentityTermOffset();
-    return cudaq::ExecutionResult({}, expVal.real());
+    return cudaq::observe_result(expVal.real(), ham);
   }
 }
 

diff --git a/runtime/nvqir/cutensornet/simulator_cutensornet.h b/runtime/nvqir/cutensornet/simulator_cutensornet.h
@@ -49,7 +49,7 @@ class SimulatorTensorNetBase : public nvqir::CircuitSimulatorBase<double> {
          const int shots) override;
 
   /// @brief Evaluate the expectation value of a given observable
-  virtual cudaq::ExecutionResult observe(const cudaq::spin_op &op) override;
+  virtual cudaq::observe_result observe(const cudaq::spin_op &op) override;
 
   /// @brief Add qubits to the underlying quantum state
   virtual void addQubitsToState(std::size_t count) override;

diff --git a/runtime/nvqir/qpp/QppCircuitSimulator.cpp b/runtime/nvqir/qpp/QppCircuitSimulator.cpp
@@ -181,7 +181,7 @@ class QppCircuitSimulator : public nvqir::CircuitSimulatorBase<double> {
     return !shouldObserveFromSampling();
   }
 
-  cudaq::ExecutionResult observe(const cudaq::spin_op &op) override {
+  cudaq::observe_result observe(const cudaq::spin_op &op) override {
 
     flushGateQueue();
 
@@ -212,7 +212,7 @@ class QppCircuitSimulator : public nvqir::CircuitSimulatorBase<double> {
       ee = qpp::apply(asEigen, state, targets).trace().real();
     }
 
-    return cudaq::ExecutionResult({}, ee);
+    return cudaq::observe_result(ee, op);
   }
 
   /// @brief Reset the qubit

diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
@@ -120,7 +120,30 @@ if (CUSTATEVEC_ROOT AND CUDA_FOUND)
     cudaq-platform-mqpu
     nvqir-custatevec-fp64
     gtest_main)
-    gtest_discover_tests(test_mqpu PROPERTIES LABELS "gpu_required")
+  gtest_discover_tests(test_mqpu PROPERTIES LABELS "gpu_required")
+
+  # Test CUDAQ_OBSERVE_FROM_SAMPLING=ON mode.
+  # (term-by-term expectation value calculation by applying change-of-basis gates then reverting them)
+  add_executable(test_custatevec_observe_from_sampling 
+    integration/builder_tester.cpp  
+    integration/deuteron_variational_tester.cpp
+    integration/gradient_tester.cpp
+    integration/nlopt_tester.cpp
+  )
+  target_include_directories(test_custatevec_observe_from_sampling PRIVATE .)
+  target_compile_definitions(test_custatevec_observe_from_sampling PRIVATE -DNVQIR_BACKEND_NAME=custatevec_fp32)
+  if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT APPLE)
+    target_link_options(test_custatevec_observe_from_sampling PRIVATE -Wl,--no-as-needed)
+  endif()
+  target_link_libraries(test_custatevec_observe_from_sampling
+    PRIVATE 
+    cudaq
+    cudaq-builder
+    cudaq-platform-default
+    nvqir-custatevec-fp32
+    gtest_main)
+  # Run this test with "CUDAQ_OBSERVE_FROM_SAMPLING=1"
+  gtest_discover_tests(test_custatevec_observe_from_sampling TEST_SUFFIX _DirectObserve PROPERTIES ENVIRONMENT "CUDAQ_OBSERVE_FROM_SAMPLING=1" PROPERTIES LABELS "gpu_required")
 
     if (MPI_CXX_FOUND)
       # Count the number of GPUs

diff --git a/unittests/backends/qpp_observe/QPPObserveBackend.cpp b/unittests/backends/qpp_observe/QPPObserveBackend.cpp
@@ -19,7 +19,7 @@ class QppObserveTester : public nvqir::QppCircuitSimulator<qpp::ket> {
 public:
   NVQIR_SIMULATOR_CLONE_IMPL(QppObserveTester)
   bool canHandleObserve() override { return true; }
-  cudaq::ExecutionResult observe(const cudaq::spin_op &op) override {
+  cudaq::observe_result observe(const cudaq::spin_op &op) override {
     flushGateQueue();
 
     ::qpp::cmat X = ::qpp::Gates::get_instance().X;
@@ -51,7 +51,7 @@ class QppObserveTester : public nvqir::QppCircuitSimulator<qpp::ket> {
       }
     }
 
-    return cudaq::ExecutionResult({}, sum);
+    return cudaq::observe_result(sum, op);
   }
 
   std::string name() const override { return "qpp-test"; }

diff --git a/unittests/backends/qpp_observe/QppObserveTester.cpp b/unittests/backends/qpp_observe/QppObserveTester.cpp
@@ -27,7 +27,7 @@ CUDAQ_TEST(QPPBackendTester, checkBackendObserve) {
                      .21829 * z(0) - 6.125 * z(1);
 
   auto expVal = qpp.observe(h);
-  EXPECT_NEAR(expVal.expectationValue.value(), -1.74, 1e-2);
+  EXPECT_NEAR(expVal.expectation(), -1.74, 1e-2);
 
   struct ansatzTest {
     auto operator()(double theta) __qpu__ {