Don't implicitly handle a density matrix as a flattened vector (#6)

1tnguyen · web-flow · commit dcb00446cd4b · 2025-11-19T13:39:13.000-08:00
Signed-off-by: Thien Nguyen <thiennguyen@nvidia.com>
diff --git a/runtime/common/ExecutionContext.h b/runtime/common/ExecutionContext.h
@@ -11,7 +11,6 @@
 #include "Future.h"
 #include "NoiseModel.h"
 #include "SampleResult.h"
-#include "SimulationState.h"
 #include "Trace.h"
 #include "cudaq/algorithms/optimizer.h"
 #include "cudaq/operators.h"
@@ -20,6 +19,8 @@
 
 namespace cudaq {
 
+class SimulationState;
+
 /// The ExecutionContext is an abstraction to indicate how a CUDA-Q kernel
 /// should be executed.
 class ExecutionContext {
diff --git a/runtime/common/SimulationState.h b/runtime/common/SimulationState.h
@@ -8,7 +8,10 @@
 
 #pragma once
 
+#include "cudaq/utils/cudaq_utils.h"
+#include "cudaq/utils/matrix.h"
 #include <algorithm>
+#include <bitset>
 #include <complex>
 #include <memory>
 #include <optional>
@@ -28,10 +31,11 @@ using TensorStateData =
 /// @brief state_data is a variant type
 /// encoding different forms of user state vector data
 /// we support.
-using state_data = std::variant<
-    std::vector<std::complex<double>>, std::vector<std::complex<float>>,
-    std::pair<std::complex<double> *, std::size_t>,
-    std::pair<std::complex<float> *, std::size_t>, TensorStateData>;
+using state_data = std::variant<std::vector<std::complex<double>>,
+                                std::vector<std::complex<float>>,
+                                std::pair<std::complex<double> *, std::size_t>,
+                                std::pair<std::complex<float> *, std::size_t>,
+                                complex_matrix, TensorStateData>;
 
 /// @brief The `SimulationState` interface provides and extension point
 /// for concrete circuit simulation sub-types to describe their
@@ -71,16 +75,41 @@ class SimulationState {
   auto getSizeAndPtr(const state_data &data) {
     auto type = data.index();
     std::tuple<std::size_t, void *> sizeAndPtr;
-    if (type == 0)
+    if (type ==
+        cudaq::detail::variant_index<cudaq::state_data,
+                                     std::vector<std::complex<double>>>())
       sizeAndPtr = getSizeAndPtrFromVec<double, ScalarType>(data);
-    else if (type == 1)
+    else if (type ==
+             cudaq::detail::variant_index<cudaq::state_data,
+                                          std::vector<std::complex<float>>>())
       sizeAndPtr = getSizeAndPtrFromVec<float, ScalarType>(data);
-    else if (type == 2)
+    else if (type == cudaq::detail::variant_index<
+                         cudaq::state_data,
+                         std::pair<std::complex<double> *, std::size_t>>())
       sizeAndPtr = getSizeAndPtrFromPair<double, ScalarType>(data);
-    else if (type == 3)
+    else if (type == cudaq::detail::variant_index<
+                         cudaq::state_data,
+                         std::pair<std::complex<float> *, std::size_t>>())
       sizeAndPtr = getSizeAndPtrFromPair<float, ScalarType>(data);
-    else
-      throw std::runtime_error("unsupported data type for state.");
+    else if (type == cudaq::detail::variant_index<cudaq::state_data,
+                                                  complex_matrix>()) {
+      // Complex matrix is double precision only
+      if constexpr (!std::is_same_v<double, ScalarType>)
+        throw std::runtime_error("[sim-state] invalid data precision.");
+      auto &cMat = std::get<complex_matrix>(data);
+      if (cMat.rows() != cMat.cols())
+        throw std::runtime_error(
+            "[sim-state] complex matrix must be square for density matrix.");
+      // Check that it must be a power of 2
+      if (std::bitset<64>(cMat.rows()).count() != 1)
+        throw std::runtime_error("[sim-state] complex matrix size must be a "
+                                 "power of 2 for density matrix.");
+      return std::make_tuple(
+          cMat.size(),
+          reinterpret_cast<void *>(const_cast<complex_matrix &>(cMat).get_data(
+              complex_matrix::order::row_major)));
+    } else
+      throw std::runtime_error("unsupported data type for state vector.");
 
     return sizeAndPtr;
   }
diff --git a/runtime/cudaq/qis/managers/photonics/CMakeLists.txt b/runtime/cudaq/qis/managers/photonics/CMakeLists.txt
@@ -17,7 +17,7 @@ target_include_directories(${LIBRARY_NAME}
        $<INSTALL_INTERFACE:include>)
 
 set (PHOTONICS_DEPENDENCIES "")
-list(APPEND PHOTONICS_DEPENDENCIES cudaq-common libqpp fmt::fmt-header-only)
+list(APPEND PHOTONICS_DEPENDENCIES cudaq cudaq-common libqpp fmt::fmt-header-only)
 add_openmp_configurations(${LIBRARY_NAME} PHOTONICS_DEPENDENCIES)
 
 target_link_libraries(${LIBRARY_NAME}
diff --git a/runtime/nvqir/cutensornet/simulator_mps.h b/runtime/nvqir/cutensornet/simulator_mps.h
@@ -57,8 +57,28 @@ class SimulatorMPS : public SimulatorTensorNetBase<ScalarType> {
       throw std::invalid_argument(
           "[SimulatorMPS simulator] Incompatible state input");
     if (!m_state) {
+      std::vector<MPSTensor> copiedTensors;
+      copiedTensors.reserve(casted->getMpsTensors().size());
+      for (const auto &mpsTensor : casted->getMpsTensors()) {
+        std::vector<int64_t> extents = mpsTensor.extents;
+        const auto numElements =
+            std::reduce(extents.begin(), extents.end(), 1, std::multiplies());
+        const auto tensorSizeBytes =
+            sizeof(std::complex<ScalarType>) * numElements;
+        void *mpsTensorCopy{nullptr};
+        HANDLE_CUDA_ERROR(cudaMalloc(&mpsTensorCopy, tensorSizeBytes));
+        HANDLE_CUDA_ERROR(cudaMemcpy(mpsTensorCopy, mpsTensor.deviceData,
+                                     tensorSizeBytes, cudaMemcpyDefault));
+        copiedTensors.emplace_back(MPSTensor(mpsTensorCopy, extents));
+      }
+
       m_state = TensorNetState<ScalarType>::createFromMpsTensors(
-          casted->getMpsTensors(), scratchPad, m_cutnHandle, m_randomEngine);
+          copiedTensors, scratchPad, m_cutnHandle, m_randomEngine);
+      for (const auto &mpsTensor : copiedTensors) {
+        m_state->m_tempDevicePtrs.emplace_back(
+            mpsTensor.deviceData,
+            typename TensorNetState<ScalarType>::TempDevicePtrDeleter{});
+      }
     } else {
       // Expand an existing state: Append MPS tensors
       // Factor the existing state
diff --git a/runtime/nvqir/cutensornet/simulator_tensornet.h b/runtime/nvqir/cutensornet/simulator_tensornet.h
@@ -123,6 +123,9 @@ class SimulatorTensorNet : public SimulatorTensorNetBase<ScalarType> {
       m_state = TensorNetState<ScalarType>::createFromOpTensors(
           in_state.getNumQubits(), casted->getAppliedTensors(), scratchPad,
           m_cutnHandle, m_randomEngine);
+      // Need to extend lifetime of all the device pointers stored in the input
+      // state.
+      m_state->m_tempDevicePtrs = casted->m_state->m_tempDevicePtrs;
     } else {
       // Expand an existing state:
       //  (1) Create a blank tensor network with combined number of qubits
@@ -149,6 +152,11 @@ class SimulatorTensorNet : public SimulatorTensorNetBase<ScalarType> {
           m_state->applyQubitProjector(op.deviceData,
                                        mapQubitIdxs(op.targetQubitIds));
       }
+      // Append the temp. pointer
+      m_state->m_tempDevicePtrs.insert(
+          m_state->m_tempDevicePtrs.end(),
+          casted->m_state->m_tempDevicePtrs.begin(),
+          casted->m_state->m_tempDevicePtrs.end());
     }
   }
   bool requireCacheWorkspace() const override { return true; }
diff --git a/runtime/nvqir/cutensornet/tensornet_state.h b/runtime/nvqir/cutensornet/tensornet_state.h
@@ -77,8 +77,15 @@ class TensorNetState {
   cutensornetState_t m_quantumState;
   /// Track id of gate tensors that are applied to the state tensors.
   std::int64_t m_tensorId = InvalidTensorIndexValue;
+  struct TempDevicePtrDeleter {
+    void operator()(void *ptr) const {
+      if (ptr)
+        cudaFree(ptr);
+    }
+  };
+
   // Device memory pointers to be cleaned up.
-  std::vector<void *> m_tempDevicePtrs;
+  std::vector<std::shared_ptr<void>> m_tempDevicePtrs;
   // Tensor ops that have been applied to the state.
   std::vector<AppliedTensorOp> m_tensorOps;
   ScratchDeviceMem &scratchPad;
@@ -233,6 +240,8 @@ class TensorNetState {
   template <typename ScalarTy>
   friend class SimulatorMPS;
   template <typename ScalarTy>
+  friend class SimulatorTensorNet;
+  template <typename ScalarTy>
   friend class TensorNetSimulationState;
   /// Internal method to contract the tensor network.
   /// Returns device memory pointer and size (number of elements).
diff --git a/runtime/nvqir/cutensornet/tensornet_state.inc b/runtime/nvqir/cutensornet/tensornet_state.inc
@@ -73,7 +73,7 @@ TensorNetState<ScalarType>::TensorNetState(const std::vector<int> &basisState,
   HANDLE_CUDA_ERROR(cudaMalloc(&d_gate, sizeBytes));
   HANDLE_CUDA_ERROR(
       cudaMemcpy(d_gate, h_xGate, sizeBytes, cudaMemcpyHostToDevice));
-  m_tempDevicePtrs.emplace_back(d_gate);
+  m_tempDevicePtrs.emplace_back(d_gate, TempDevicePtrDeleter{});
   for (int32_t qId = 0; const auto &bit : basisState) {
     if (bit == 1) {
       applyGate({}, {qId}, d_gate);
@@ -251,7 +251,7 @@ void TensorNetState<ScalarType>::addQubits(
 
   // Project the state of those new qubits to the input state.
   applyQubitProjector(d_proj, qubitIdx);
-  m_tempDevicePtrs.emplace_back(d_proj);
+  m_tempDevicePtrs.emplace_back(d_proj, TempDevicePtrDeleter{});
 }
 
 template <typename ScalarType>
@@ -1214,16 +1214,15 @@ TensorNetState<ScalarType>::createFromStateVector(
   std::iota(qubitIdx.begin(), qubitIdx.end(), 0);
   // Project the state to the input state.
   state->applyQubitProjector(d_proj, qubitIdx);
-  state->m_tempDevicePtrs.emplace_back(d_proj);
+  state->m_tempDevicePtrs.emplace_back(d_proj, TempDevicePtrDeleter{});
   return state;
 }
 
 template <typename ScalarType>
 TensorNetState<ScalarType>::~TensorNetState() {
   // Destroy the quantum circuit state
   HANDLE_CUTN_ERROR(cutensornetDestroyState(m_quantumState));
-  for (auto *ptr : m_tempDevicePtrs)
-    HANDLE_CUDA_ERROR(cudaFree(ptr));
+  m_tempDevicePtrs.clear();
 }
 
 } // namespace nvqir
diff --git a/runtime/nvqir/cutensornet/tn_simulation_state.h b/runtime/nvqir/cutensornet/tn_simulation_state.h
@@ -81,6 +81,9 @@ class TensorNetSimulationState : public cudaq::SimulationState {
     return m_state->m_tensorOps;
   }
 
+  template <typename ScalarTy> 
+  friend class SimulatorTensorNet;
+
 protected:
   std::unique_ptr<TensorNetState<ScalarType>> m_state;
   ScratchDeviceMem &scratchPad;
diff --git a/runtime/nvqir/cutensornet/tn_simulation_state.inc b/runtime/nvqir/cutensornet/tn_simulation_state.inc
@@ -305,7 +305,7 @@ TensorNetSimulationState<ScalarType>::createFromSizeAndPtr(
 
 template <typename ScalarType>
 void TensorNetSimulationState<ScalarType>::destroyState() {
-  CUDAQ_INFO("mps-state destroying state vector handle.");
+  CUDAQ_INFO("tn-state destroying state vector handle.");
   m_state.reset();
 }
 
diff --git a/runtime/nvqir/qpp/QppDMCircuitSimulator.cpp b/runtime/nvqir/qpp/QppDMCircuitSimulator.cpp
@@ -109,10 +109,21 @@ struct QppDmState : public cudaq::SimulationState {
   }
 
   std::unique_ptr<SimulationState>
-  createFromSizeAndPtr(std::size_t size, void *ptr, std::size_t) override {
-    return std::make_unique<QppDmState>(
-        Eigen::Map<qpp::cmat>(reinterpret_cast<std::complex<double> *>(ptr),
-                              std::sqrt(size), std::sqrt(size)));
+  createFromSizeAndPtr(std::size_t size, void *ptr, std::size_t type) override {
+    const bool isMatrixData =
+        type == cudaq::detail::variant_index<cudaq::state_data,
+                                             cudaq::complex_matrix>();
+
+    if (isMatrixData)
+      return std::make_unique<QppDmState>(
+          Eigen::Map<qpp::cmat>(reinterpret_cast<std::complex<double> *>(ptr),
+                                std::sqrt(size), std::sqrt(size)));
+    // This is state vector data, convert it to density matrix: rho = |psi><psi|
+    auto *stateData =
+        reinterpret_cast<std::complex<double> *>(const_cast<void *>(ptr));
+    qpp::ket psi = qpp::ket::Map(stateData, size);
+    qpp::cmat dm = psi * psi.adjoint();
+    return std::make_unique<QppDmState>(std::move(dm));
   }
 
   void dump(std::ostream &os) const override { os << state << "\n"; }
diff --git a/runtime/test/CMakeLists.txt b/runtime/test/CMakeLists.txt
@@ -12,7 +12,7 @@ set(TEST_NAME test_argument_conversion)
 
 add_llvm_executable(${TEST_NAME} test_argument_conversion.cpp)
 
-target_compile_options(${TEST_NAME} PUBLIC -Wno-type-limits -fexceptions)
+target_compile_options(${TEST_NAME} PUBLIC -Wno-type-limits -fexceptions -DCUDAQ_RTTI_DISABLED)
 
 target_include_directories(${TEST_NAME}
   PUBLIC
diff --git a/unittests/integration/qubit_allocation.cpp b/unittests/integration/qubit_allocation.cpp
@@ -14,7 +14,7 @@
 #ifndef CUDAQ_BACKEND_STIM
 
 std::vector<cudaq::complex> randomState(int numQubits) {
-  std::vector<cudaq::complex> stateVec(2 * numQubits);
+  std::vector<cudaq::complex> stateVec(1 << numQubits);
   std::generate(stateVec.begin(), stateVec.end(), []() -> cudaq::complex {
     thread_local std::default_random_engine
         generator; // thread_local so we don't have to do any locking

Original file line number	Diff line number	Diff line change
`@@ -81,6 +81,9 @@ class TensorNetSimulationState : public cudaq::SimulationState {`
`81`	`81`	`return m_state->m_tensorOps;`
`82`	`82`	`}`
`83`	`83`
	`84`	`+ template <typename ScalarTy>`
	`85`	`+ friend class SimulatorTensorNet;`
	`86`	`+`
`84`	`87`	`protected:`
`85`	`88`	`std::unique_ptr<TensorNetState<ScalarType>> m_state;`
`86`	`89`	`ScratchDeviceMem &scratchPad;`
Original file line number	Diff line number	Diff line change
`@@ -305,7 +305,7 @@ TensorNetSimulationState<ScalarType>::createFromSizeAndPtr(`
`305`	`305`
`306`	`306`	`template <typename ScalarType>`
`307`	`307`	`void TensorNetSimulationState<ScalarType>::destroyState() {`
`308`		`- CUDAQ_INFO("mps-state destroying state vector handle.");`
	`308`	`+ CUDAQ_INFO("tn-state destroying state vector handle.");`
`309`	`309`	`m_state.reset();`
`310`	`310`	`}`
`311`	`311`