Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Perf. improvement for CuStateVecCircuitSimulator::observe #1002

Merged
merged 11 commits into from
Mar 21, 2024
7 changes: 2 additions & 5 deletions runtime/common/ObserveResult.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,16 +35,13 @@ class observe_result {

/// @brief Constructor, takes the precomputed expectation value for
/// <psi(x) | H | psi(x)> for the given spin_op.
observe_result(double &e, const spin_op &H) : expVal(e), spinOp(H) {}
observe_result(double e, const spin_op &H) : expVal(e), spinOp(H) {}

/// @brief Constructor, takes the precomputed expectation value for
/// <psi(x) | H | psi(x)> for the given spin_op. If this execution
/// was shots based, also provide the sample_result data containing counts
/// for each term in H.
observe_result(double &e, const spin_op &H, sample_result counts)
: expVal(e), spinOp(H), data(counts) {}

observe_result(double &&e, const spin_op &H, sample_result counts)
observe_result(double e, const spin_op &H, sample_result counts)
: expVal(e), spinOp(H), data(counts) {}

/// @brief Return the raw counts data for all terms
Expand Down
3 changes: 1 addition & 2 deletions runtime/cudaq/platform/qpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,8 @@ class QPU : public registry::RegisteredType<QPU> {
// and computing <ZZ..ZZZ>
if (localContext->canHandleObserve) {
auto [exp, data] = cudaq::measure(H);
results.emplace_back(data.to_map(), H.to_string(false), exp);
localContext->expectationValue = exp;
localContext->result = cudaq::sample_result(results);
localContext->result = data;
} else {

// Loop over each term and compute coeff * <term>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -162,8 +162,8 @@ class DefaultExecutionManager : public cudaq::BasicExecutionManager {

if (executionContext->canHandleObserve) {
auto result = simulator()->observe(*executionContext->spin.value());
executionContext->expectationValue = result.expectationValue;
executionContext->result = cudaq::sample_result(result);
executionContext->expectationValue = result.expectation();
executionContext->result = result.raw_data();
return;
}

Expand Down
15 changes: 11 additions & 4 deletions runtime/nvqir/CircuitSimulator.h
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ class CircuitSimulator {

/// @brief Compute the expected value of the given spin op
/// with respect to the current state, <psi | H | psi>.
virtual cudaq::ExecutionResult observe(const cudaq::spin_op &term) = 0;
virtual cudaq::observe_result observe(const cudaq::spin_op &term) = 0;

/// @brief Allocate a single qubit, return the qubit as a logical index
virtual std::size_t allocateQubit() = 0;
Expand Down Expand Up @@ -661,16 +661,23 @@ class CircuitSimulatorBase : public CircuitSimulator {

/// @brief Return true if expectation values should be computed from
/// sampling + parity of bit strings.
bool shouldObserveFromSampling() {
/// Default is to enable observe from sampling, i.e., simulating the
/// change-of-basis circuit for each term.
///
/// The environment variable "CUDAQ_OBSERVE_FROM_SAMPLING" can be used to turn
/// on or off this setting.
bool shouldObserveFromSampling(bool defaultConfig = true) {
if (auto envVar = std::getenv(observeSamplingEnvVar); envVar) {
std::string asString = envVar;
std::transform(asString.begin(), asString.end(), asString.begin(),
[](auto c) { return std::tolower(c); });
if (asString == "false" || asString == "off" || asString == "0")
return false;
if (asString == "true" || asString == "on" || asString == "1")
return true;
}

return true;
return defaultConfig;
}

public:
Expand All @@ -690,7 +697,7 @@ class CircuitSimulatorBase : public CircuitSimulator {

/// @brief Compute the expected value of the given spin op
/// with respect to the current state, <psi | H | psi>.
cudaq::ExecutionResult observe(const cudaq::spin_op &term) override {
cudaq::observe_result observe(const cudaq::spin_op &term) override {
throw std::runtime_error("This CircuitSimulator does not implement "
"observe(const cudaq::spin_op &).");
}
Expand Down
4 changes: 2 additions & 2 deletions runtime/nvqir/NVQIR.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -420,8 +420,8 @@ Result *__quantum__qis__measure__body(Array *pauli_arr, Array *qubits) {
if (currentContext->canHandleObserve) {
circuitSimulator->flushGateQueue();
auto result = circuitSimulator->observe(*currentContext->spin.value());
currentContext->expectationValue = result.expectationValue;
currentContext->result = cudaq::sample_result(result);
currentContext->expectationValue = result.expectation();
currentContext->result = result.raw_data();
return ResultZero;
}

Expand Down
115 changes: 89 additions & 26 deletions runtime/nvqir/custatevec/CuStateVecCircuitSimulator.cu
Original file line number Diff line number Diff line change
Expand Up @@ -188,10 +188,10 @@ protected:
controls32.push_back((int)c);
custatevecPauli_t pauli[] = {pauliStringToEnum(gate.name())};
int targets[] = {(int)qubitIdx};
custatevecApplyPauliRotation(handle, deviceStateVector,
cuStateVecCudaDataType, nQubitsAllocated,
-0.5 * angle, pauli, targets, 1,
controls32.data(), nullptr, controls32.size());
HANDLE_ERROR(custatevecApplyPauliRotation(
handle, deviceStateVector, cuStateVecCudaDataType, nQubitsAllocated,
-0.5 * angle, pauli, targets, 1, controls32.data(), nullptr,
controls32.size()));
}

/// @brief Increase the state size by the given number of qubits.
Expand Down Expand Up @@ -439,39 +439,102 @@ public:
/// via sampling
bool canHandleObserve() override {
// Do not compute <H> from matrix if shots based sampling requested
if (executionContext &&
// i.e., a valid shots count value was set.
// Note: -1 is also used to denote non-sampling execution. Hence, we need to
// check for this particular -1 value as being casted to an unsigned type.
if (executionContext && executionContext->shots > 0 &&
executionContext->shots != static_cast<std::size_t>(-1)) {
return false;
}

/// Seems that FP32 is faster with
/// custatevecComputeExpectationsOnPauliBasis
if constexpr (std::is_same_v<ScalarType, float>) {
return false;
}

return !shouldObserveFromSampling();
// If no shots requested (exact expectation calulation), don't use
// term-by-term observe as the default since
// `CuStateVecCircuitSimulator::observe` will do a batched expectation value
// calculation to compute all expectation values for all terms at once.
return !shouldObserveFromSampling(/*defaultConfig=*/false);
}

/// @brief Compute the expected value from the observable matrix.
cudaq::ExecutionResult observe(const cudaq::spin_op &op) override {
cudaq::observe_result observe(const cudaq::spin_op &op) override {
{
cudaq::ScopedTrace trace(
"CuStateVecCircuitSimulator::observe - flushGateQueue");
flushGateQueue();
}

flushGateQueue();
// Use batched custatevecComputeExpectationsOnPauliBasis to compute all term
// expectation values in one go
uint32_t nPauliOperatorArrays = op.num_terms();
// Stable holders of vectors since we need to send vectors of pointers to
// custatevec
std::deque<std::vector<custatevecPauli_t>> pauliOperatorsArrayHolder;
std::deque<std::vector<int32_t>> basisBitsArrayHolder;
1tnguyen marked this conversation as resolved.
Show resolved Hide resolved
std::vector<const custatevecPauli_t *> pauliOperatorsArray;
std::vector<const int32_t *> basisBitsArray;
std::vector<std::complex<double>> coeffs;
std::vector<uint32_t> nBasisBitsArray;
pauliOperatorsArray.reserve(nPauliOperatorArrays);
basisBitsArray.reserve(nPauliOperatorArrays);
coeffs.reserve(nPauliOperatorArrays);
nBasisBitsArray.reserve(nPauliOperatorArrays);
// Helper to convert Pauli enums
const auto cudaqToCustateVec = [](cudaq::pauli pauli) -> custatevecPauli_t {
switch (pauli) {
case cudaq::pauli::I:
return CUSTATEVEC_PAULI_I;
case cudaq::pauli::X:
return CUSTATEVEC_PAULI_X;
case cudaq::pauli::Y:
return CUSTATEVEC_PAULI_Y;
case cudaq::pauli::Z:
return CUSTATEVEC_PAULI_Z;
}
__builtin_unreachable();
};

// The op is on the following target bits.
std::set<std::size_t> targets;
// Contruct data to send on to custatevec
std::vector<std::string> termStrs;
termStrs.reserve(nPauliOperatorArrays);
op.for_each_term([&](cudaq::spin_op &term) {
term.for_each_pauli(
[&](cudaq::pauli p, std::size_t idx) { targets.insert(idx); });
coeffs.emplace_back(term.get_coefficient());
std::vector<custatevecPauli_t> paulis;
std::vector<int32_t> idxs;
paulis.reserve(term.num_qubits());
idxs.reserve(term.num_qubits());
term.for_each_pauli([&](cudaq::pauli p, std::size_t idx) {
if (p != cudaq::pauli::I) {
paulis.emplace_back(cudaqToCustateVec(p));
idxs.emplace_back(idx);
}
});
pauliOperatorsArrayHolder.emplace_back(std::move(paulis));
basisBitsArrayHolder.emplace_back(std::move(idxs));
pauliOperatorsArray.emplace_back(pauliOperatorsArrayHolder.back().data());
basisBitsArray.emplace_back(basisBitsArrayHolder.back().data());
nBasisBitsArray.emplace_back(pauliOperatorsArrayHolder.back().size());
termStrs.emplace_back(term.to_string(false));
});

std::vector<std::size_t> targetsVec(targets.begin(), targets.end());

// Get the matrix
auto matrix = op.to_matrix();
/// Compute the expectation value.
auto ee = getExpectationFromOperatorMatrix(matrix.data(), targetsVec);
return cudaq::ExecutionResult({}, ee);
std::vector<double> expectationValues(nPauliOperatorArrays);
{
cudaq::ScopedTrace trace("CuStateVecCircuitSimulator::observe - "
"custatevecComputeExpectationsOnPauliBasis");
HANDLE_ERROR(custatevecComputeExpectationsOnPauliBasis(
handle, deviceStateVector, cuStateVecCudaDataType, nQubitsAllocated,
expectationValues.data(), pauliOperatorsArray.data(),
nPauliOperatorArrays, basisBitsArray.data(), nBasisBitsArray.data()));
}
std::complex<double> expVal = 0.0;
std::vector<cudaq::ExecutionResult> results;
results.reserve(nPauliOperatorArrays);
for (uint32_t i = 0; i < nPauliOperatorArrays; ++i) {
expVal += coeffs[i] * expectationValues[i];
results.emplace_back(
cudaq::ExecutionResult({}, termStrs[i], expectationValues[i]));
}
cudaq::sample_result perTermData(static_cast<double>(expVal.real()),
results);
return cudaq::observe_result(static_cast<double>(expVal.real()), op,
perTermData);
}

/// @brief Sample the multi-qubit state.
Expand Down
8 changes: 4 additions & 4 deletions runtime/nvqir/cutensornet/simulator_cutensornet.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ SimulatorTensorNetBase::sample(const std::vector<std::size_t> &measuredBits,
allZTerm.at(m_state->getNumQubits() + m) = 1;
cudaq::spin_op allZ(allZTerm, 1.0);
// Just compute the expected value on <Z...Z>
return observe(allZ);
return cudaq::ExecutionResult({}, observe(allZ).expectation());
}

prepareQubitTensorState();
Expand Down Expand Up @@ -224,7 +224,7 @@ static nvqir::CutensornetExecutor *getPluginInstance() {
return fcn();
}
/// @brief Evaluate the expectation value of a given observable
cudaq::ExecutionResult
cudaq::observe_result
SimulatorTensorNetBase::observe(const cudaq::spin_op &ham) {
LOG_API_TIME();
prepareQubitTensorState();
Expand All @@ -238,13 +238,13 @@ SimulatorTensorNetBase::observe(const cudaq::spin_op &ham) {
for (std::size_t i = 0; i < terms.size(); ++i) {
expVal += (coeffs[i] * termExpVals[i]);
}
return cudaq::ExecutionResult({}, expVal.real());
return cudaq::observe_result(expVal.real(), ham);
} else {
TensorNetworkSpinOp spinOp(ham, m_cutnHandle);
std::complex<double> expVal =
m_state->computeExpVal(spinOp.getNetworkOperator());
expVal += spinOp.getIdentityTermOffset();
return cudaq::ExecutionResult({}, expVal.real());
return cudaq::observe_result(expVal.real(), ham);
}
}

Expand Down
2 changes: 1 addition & 1 deletion runtime/nvqir/cutensornet/simulator_cutensornet.h
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ class SimulatorTensorNetBase : public nvqir::CircuitSimulatorBase<double> {
const int shots) override;

/// @brief Evaluate the expectation value of a given observable
virtual cudaq::ExecutionResult observe(const cudaq::spin_op &op) override;
virtual cudaq::observe_result observe(const cudaq::spin_op &op) override;

/// @brief Add qubits to the underlying quantum state
virtual void addQubitsToState(std::size_t count) override;
Expand Down
4 changes: 2 additions & 2 deletions runtime/nvqir/qpp/QppCircuitSimulator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ class QppCircuitSimulator : public nvqir::CircuitSimulatorBase<double> {
return !shouldObserveFromSampling();
}

cudaq::ExecutionResult observe(const cudaq::spin_op &op) override {
cudaq::observe_result observe(const cudaq::spin_op &op) override {

flushGateQueue();

Expand Down Expand Up @@ -212,7 +212,7 @@ class QppCircuitSimulator : public nvqir::CircuitSimulatorBase<double> {
ee = qpp::apply(asEigen, state, targets).trace().real();
}

return cudaq::ExecutionResult({}, ee);
return cudaq::observe_result(ee, op);
}

/// @brief Reset the qubit
Expand Down
25 changes: 24 additions & 1 deletion unittests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,30 @@ if (CUSTATEVEC_ROOT AND CUDA_FOUND)
cudaq-platform-mqpu
nvqir-custatevec-fp64
gtest_main)
gtest_discover_tests(test_mqpu PROPERTIES LABELS "gpu_required")
gtest_discover_tests(test_mqpu PROPERTIES LABELS "gpu_required")

# Test CUDAQ_OBSERVE_FROM_SAMPLING=ON mode.
bmhowe23 marked this conversation as resolved.
Show resolved Hide resolved
# (term-by-term expectation value calculation by applying change-of-basis gates then reverting them)
add_executable(test_custatevec_observe_from_sampling
integration/builder_tester.cpp
integration/deuteron_variational_tester.cpp
integration/gradient_tester.cpp
integration/nlopt_tester.cpp
)
target_include_directories(test_custatevec_observe_from_sampling PRIVATE .)
target_compile_definitions(test_custatevec_observe_from_sampling PRIVATE -DNVQIR_BACKEND_NAME=custatevec_fp32)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT APPLE)
target_link_options(test_custatevec_observe_from_sampling PRIVATE -Wl,--no-as-needed)
endif()
target_link_libraries(test_custatevec_observe_from_sampling
PRIVATE
cudaq
cudaq-builder
cudaq-platform-default
nvqir-custatevec-fp32
gtest_main)
# Run this test with "CUDAQ_OBSERVE_FROM_SAMPLING=1"
gtest_discover_tests(test_custatevec_observe_from_sampling TEST_SUFFIX _DirectObserve PROPERTIES ENVIRONMENT "CUDAQ_OBSERVE_FROM_SAMPLING=1" PROPERTIES LABELS "gpu_required")

if (MPI_CXX_FOUND)
# Count the number of GPUs
Expand Down
4 changes: 2 additions & 2 deletions unittests/backends/qpp_observe/QPPObserveBackend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ class QppObserveTester : public nvqir::QppCircuitSimulator<qpp::ket> {
public:
NVQIR_SIMULATOR_CLONE_IMPL(QppObserveTester)
bool canHandleObserve() override { return true; }
cudaq::ExecutionResult observe(const cudaq::spin_op &op) override {
cudaq::observe_result observe(const cudaq::spin_op &op) override {
flushGateQueue();

::qpp::cmat X = ::qpp::Gates::get_instance().X;
Expand Down Expand Up @@ -51,7 +51,7 @@ class QppObserveTester : public nvqir::QppCircuitSimulator<qpp::ket> {
}
}

return cudaq::ExecutionResult({}, sum);
return cudaq::observe_result(sum, op);
}

std::string name() const override { return "qpp-test"; }
Expand Down
2 changes: 1 addition & 1 deletion unittests/backends/qpp_observe/QppObserveTester.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ CUDAQ_TEST(QPPBackendTester, checkBackendObserve) {
.21829 * z(0) - 6.125 * z(1);

auto expVal = qpp.observe(h);
EXPECT_NEAR(expVal.expectationValue.value(), -1.74, 1e-2);
EXPECT_NEAR(expVal.expectation(), -1.74, 1e-2);
bmhowe23 marked this conversation as resolved.
Show resolved Hide resolved

struct ansatzTest {
auto operator()(double theta) __qpu__ {
Expand Down
Loading