diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 9fb984055d..8536275b04 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -241,9 +241,9 @@ "C_Cpp.inlayHints.referenceOperator.enabled": false, "C_Cpp.doxygen.generateOnType": false, "C_Cpp.default.cStandard": "c17", - "C_Cpp.default.cppStandard": "c++20", + "C_Cpp.default.cppStandard": "c++17", "C_Cpp.default.configurationProvider": "ms-vscode.cmake-tools", - "C_Cpp.default.compileCommands": "${workspaceFolder}/build/compile_commands.json", + "C_Cpp.default.compileCommands": "${workspaceFolder}/build/debug/compile_commands.json", "C_Cpp.default.includePath": [ "${workspaceFolder}/**", "${CUDAQ_INSTALL_PREFIX}/**", diff --git a/.github/workflows/publishing.yml b/.github/workflows/publishing.yml index 675a0f96da..9e7ca45624 100644 --- a/.github/workflows/publishing.yml +++ b/.github/workflows/publishing.yml @@ -839,7 +839,7 @@ jobs: fi; \ done` - rm -rf examples applications targets && mv github-repo/docs/sphinx/examples examples && mv github-repo/docs/sphinx/applications applications && mv github-repo/docs/sphinx/targets targets + rm -rf examples applications targets snippets && mv github-repo/docs/sphinx/examples examples && mv github-repo/docs/sphinx/applications applications && mv github-repo/docs/sphinx/targets targets && mv github-repo/docs/sphinx/snippets/python snippets mv github-repo/docs/notebook_validation.py . GITHUB_STEP_SUMMARY=$GITHUB_STEP_SUMMARY \ bash github-repo/scripts/validate_container.sh $backends_to_test | tee /tmp/validation.out diff --git a/CMakeLists.txt b/CMakeLists.txt index 5b9a0acaaa..1a44cf7494 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -151,6 +151,9 @@ endif() if(NOT CUTENSORNET_ROOT) SET(CUTENSORNET_ROOT "$ENV{CUQUANTUM_INSTALL_PREFIX}") endif() +if(NOT CUDENSITYMAT_ROOT) + SET(CUDENSITYMAT_ROOT "$ENV{CUQUANTUM_INSTALL_PREFIX}") +endif() if(NOT CUTENSOR_ROOT) SET(CUTENSOR_ROOT "$ENV{CUTENSOR_INSTALL_PREFIX}") endif() diff --git a/cmake/Modules/CMakeLists.txt b/cmake/Modules/CMakeLists.txt index b894b3cc26..8c26addf34 100644 --- a/cmake/Modules/CMakeLists.txt +++ b/cmake/Modules/CMakeLists.txt @@ -11,6 +11,7 @@ set(CONFIG_FILES CUDAQEmDefaultConfig.cmake CUDAQNloptConfig.cmake CUDAQSpinConfig.cmake + CUDAQOperatorConfig.cmake CUDAQConfig.cmake CUDAQEnsmallenConfig.cmake CUDAQPlatformDefaultConfig.cmake diff --git a/cmake/Modules/CUDAQConfig.cmake b/cmake/Modules/CUDAQConfig.cmake index d27e9d57ac..1d29e8bef5 100644 --- a/cmake/Modules/CUDAQConfig.cmake +++ b/cmake/Modules/CUDAQConfig.cmake @@ -14,6 +14,9 @@ list(APPEND CMAKE_MODULE_PATH "${CUDAQ_CMAKE_DIR}") set (CUDAQSpin_DIR "${CUDAQ_CMAKE_DIR}") find_dependency(CUDAQSpin REQUIRED) +set (CUDAQOperator_DIR "${CUDAQ_CMAKE_DIR}") +find_dependency(CUDAQOperator REQUIRED) + set (CUDAQCommon_DIR "${CUDAQ_CMAKE_DIR}") find_dependency(CUDAQCommon REQUIRED) diff --git a/cmake/Modules/CUDAQEmDefaultConfig.cmake b/cmake/Modules/CUDAQEmDefaultConfig.cmake index 663334cf3d..a591cd002d 100644 --- a/cmake/Modules/CUDAQEmDefaultConfig.cmake +++ b/cmake/Modules/CUDAQEmDefaultConfig.cmake @@ -11,6 +11,9 @@ get_filename_component(CUDAQ_EM_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) set (CUDAQSpin_DIR "${CUDAQ_EM_CMAKE_DIR}") find_dependency(CUDAQSpin REQUIRED) +set (CUDAQOperator_DIR "${CUDAQ_EM_CMAKE_DIR}") +find_dependency(CUDAQOperator REQUIRED) + set (CUDAQCommon_DIR "${CUDAQ_EM_CMAKE_DIR}") find_dependency(CUDAQCommon REQUIRED) diff --git a/cmake/Modules/CUDAQOperatorConfig.cmake 
b/cmake/Modules/CUDAQOperatorConfig.cmake new file mode 100644 index 0000000000..c54ec8e1fe --- /dev/null +++ b/cmake/Modules/CUDAQOperatorConfig.cmake @@ -0,0 +1,13 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +get_filename_component(CUDAQ_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + +if(NOT TARGET cudaq::cudaq-operator) + include("${CUDAQ_CMAKE_DIR}/CUDAQOperatorTargets.cmake") +endif() diff --git a/cmake/Modules/CUDAQPlatformDefaultConfig.cmake b/cmake/Modules/CUDAQPlatformDefaultConfig.cmake index 213804e52a..7731250fac 100644 --- a/cmake/Modules/CUDAQPlatformDefaultConfig.cmake +++ b/cmake/Modules/CUDAQPlatformDefaultConfig.cmake @@ -11,6 +11,9 @@ get_filename_component(CUDAQ_CMAKE_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) set (CUDAQEmDefault_DIR "${CUDAQ_CMAKE_DIR}") find_dependency(CUDAQEmDefault REQUIRED) +set (CUDAQOperator_DIR "${CUDAQ_CMAKE_DIR}") +find_dependency(CUDAQOperator REQUIRED) + set (CUDAQSpin_DIR "${CUDAQ_CMAKE_DIR}") find_dependency(CUDAQSpin REQUIRED) diff --git a/cmake/Modules/NVQIRConfig.cmake.in b/cmake/Modules/NVQIRConfig.cmake.in index 0406c339ea..17cd691cbc 100644 --- a/cmake/Modules/NVQIRConfig.cmake.in +++ b/cmake/Modules/NVQIRConfig.cmake.in @@ -12,6 +12,7 @@ include(CMakeFindDependencyMacro) get_filename_component(PARENT_DIRECTORY ${NVQIR_CMAKE_DIR} DIRECTORY) find_dependency(CUDAQSpin REQUIRED HINTS "${PARENT_DIRECTORY}/cudaq") +find_dependency(CUDAQOperator REQUIRED HINTS "${PARENT_DIRECTORY}/cudaq") find_dependency(CUDAQCommon REQUIRED HINTS "${PARENT_DIRECTORY}/cudaq") find_package(fmt QUIET) if (NOT fmt_FOUND) diff --git a/docker/build/devdeps.ext.Dockerfile b/docker/build/devdeps.ext.Dockerfile index 343ffa3476..a04f82ddfa 100644 --- a/docker/build/devdeps.ext.Dockerfile +++ b/docker/build/devdeps.ext.Dockerfile @@ -181,7 +181,3 @@ ENV CUTENSOR_INSTALL_PREFIX="$CUTENSOR_INSTALL_PREFIX" ENV CUTENSOR_ROOT="$CUTENSOR_INSTALL_PREFIX" ENV LD_LIBRARY_PATH="$CUTENSOR_INSTALL_PREFIX/lib:$LD_LIBRARY_PATH" ENV CPATH="$CUTENSOR_INSTALL_PREFIX/include:$CPATH" - -# Active MPI support for the cuTensorNet library -RUN cd "$CUQUANTUM_INSTALL_PREFIX/distributed_interfaces/" && source activate_mpi_cutn.sh -ENV CUTENSORNET_COMM_LIB="$CUQUANTUM_INSTALL_PREFIX/distributed_interfaces/libcutensornet_distributed_interface_mpi.so" diff --git a/docs/sphinx/examples/cpp/dynamics/cavity_qed.cpp b/docs/sphinx/examples/cpp/dynamics/cavity_qed.cpp new file mode 100644 index 0000000000..9228514393 --- /dev/null +++ b/docs/sphinx/examples/cpp/dynamics/cavity_qed.cpp @@ -0,0 +1,142 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#include "cudaq/algorithms/evolve.h" +#include "cudaq/dynamics_integrators.h" +#include "cudaq/evolution.h" +#include "cudaq/operators.h" +#include "cudaq/schedule.h" +#include "matplotlibcpp.h" +#include <iostream> + +namespace plt = matplotlibcpp; + +int main() { + + cudaq::set_target_backend("dynamics"); + + // System dimensions: + // subsystem 0 (atom) has 2 levels + // subsystem 1 (cavity) has 10 levels + std::map<int, int> dimensions{{0, 2}, {1, 10}}; + + // For the cavity subsystem 1 + auto a = cudaq::matrix_operator::annihilate(1); + auto a_dag = cudaq::matrix_operator::create(1); + + // For the atom subsystem 0 + auto sm = cudaq::matrix_operator::annihilate(0); + auto sm_dag = cudaq::matrix_operator::create(0); + + cudaq::product_operator<cudaq::matrix_operator> atom_occ_op_t = + cudaq::matrix_operator::number(0); + cudaq::operator_sum<cudaq::matrix_operator> atom_occ_op(atom_occ_op_t); + + cudaq::product_operator<cudaq::matrix_operator> cavity_occ_op_t = + cudaq::matrix_operator::number(1); + cudaq::operator_sum<cudaq::matrix_operator> cavity_occ_op(cavity_occ_op_t); + + auto hamiltonian = 2 * M_PI * atom_occ_op + 2 * M_PI * cavity_occ_op + + 2 * M_PI * 0.25 * (sm * a_dag + sm_dag * a); + + // Build the initial state + // For the atom, the density matrix in the ground state + cudaq::matrix_2 qubit_state({1.0, 0.0}, {0.0, 0.0}); + + // For the cavity, a 10x10 matrix with a single photon number state at |5> + cudaq::matrix_2 cavity_state = cudaq::matrix_2(10, 10); + cavity_state[{5, 5}] = 1.0; + + // Compute the tensor (kronecker) product of the atom and cavity states + cudaq::matrix_2 rho = qubit_state.kronecker_inplace(cavity_state); + + // Flatten the matrix + std::vector<std::complex<double>> flat_rho; + for (size_t j = 0; j < rho.get_columns(); ++j) { + for (size_t i = 0; i < rho.get_rows(); ++i) { + flat_rho.push_back(rho[{i, j}]); + } + } + + cudaq::state_data rho_data = flat_rho; + + // Create a CUDA quantum state from a density matrix + auto rho0 = cudaq::state::from_data(rho_data); + + // Create time steps between 0 and 10 + const int num_steps = 201; + const double t0 = 0.0, t1 = 10.0; + std::vector<double> steps(num_steps); + double dt = (t1 - t0) / (num_steps - 1); + for (int i = 0; i < num_steps; ++i) { + steps[i] = t0 + i * dt; + } + + // Create a schedule for the time steps and a label for the time parameter + std::vector<std::string> labels = {"time"}; + cudaq::Schedule schedule(steps, labels); + + cudaq::runge_kutta integrator; + integrator.dt = 0.001; + integrator.order = 1; + + // Evolve without collapse operators, observing the cavity and atom + // occupation numbers (the plot below reads out two expectation values). + cudaq::evolve_result evolve_result = + cudaq::evolve(hamiltonian, dimensions, schedule, rho0, integrator, {}, + {cavity_occ_op, atom_occ_op}, true); + + constexpr double decay_rate = 0.1; + auto collapse_operator = std::sqrt(decay_rate) * a; + // Evolve with collapse operators + cudaq::evolve_result evolve_result_decay = + cudaq::evolve(hamiltonian, dimensions, schedule, rho0, integrator, + {collapse_operator}, {cavity_occ_op, atom_occ_op}, true); + + // Lambda to extract expectation values for a given observable index + auto get_expectation = [](int idx, + const auto &result) -> std::vector<double> { + std::vector<double> expectations; + + auto all_exps = result.get_expectation_values().value(); + for (const auto &exp_vals : all_exps) { + expectations.push_back((double)exp_vals[idx]); + } + return expectations; + }; + + auto ideal_result0 = get_expectation(0, evolve_result); + auto ideal_result1 = get_expectation(1, evolve_result); + auto decay_result0 = get_expectation(0, evolve_result_decay); + auto decay_result1 = get_expectation(1, evolve_result_decay); + + //
Plot the results + plt::figure_size(1000, 600); + + // Subplot 1: No decay + plt::subplot(1, 2, 1); + plt::plot(steps, ideal_result0, {{"label", "Cavity Photon Number"}}); + plt::plot(steps, ideal_result1, {{"label", "Atom Excitation Probability"}}); + plt::xlabel("Time"); + plt::ylabel("Expectation value"); + plt::legend(); + plt::title("No decay"); + + // Subplot 2: With decay + plt::subplot(1, 2, 2); + plt::plot(steps, decay_result0, {{"label", "Cavity Photon Number"}}); + plt::plot(steps, decay_result1, {{"label", "Atom Excitation Probability"}}); + plt::xlabel("Time"); + plt::ylabel("Expectation value"); + plt::legend(); + plt::title("With decay"); + + plt::save("cavity_qed.png"); + + std::cout << "Simulation complete. Plot saved to cavity_qed.png" << std::endl; + return 0; +} \ No newline at end of file diff --git a/docs/sphinx/examples/cpp/dynamics/cross_resonance.cpp b/docs/sphinx/examples/cpp/dynamics/cross_resonance.cpp new file mode 100644 index 0000000000..84d09eaba9 --- /dev/null +++ b/docs/sphinx/examples/cpp/dynamics/cross_resonance.cpp @@ -0,0 +1,138 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "cudaq/algorithms/evolve.h" +#include "cudaq/dynamics_integrators.h" +#include "cudaq/evolution.h" +#include "cudaq/operators.h" +#include "cudaq/schedule.h" +#include "matplotlibcpp.h" +#include <iostream> + +namespace plt = matplotlibcpp; + +int main() { + + cudaq::set_target_backend("dynamics"); + + // Detuning between two qubits + double delta = 100 * 2 * M_PI; + // Static coupling between qubits + double J = 7 * 2 * M_PI; + // Spurious electromagnetic crosstalk + double m_12 = 0.2; + // Drive strength + double Omega = 20 * 2 * M_PI; + + auto hamiltonian = + (delta / 2.0) * cudaq::spin_operator::z(0) + + J * (cudaq::spin_operator::minus(1) * cudaq::spin_operator::plus(0) + + cudaq::spin_operator::plus(1) * cudaq::spin_operator::minus(0)) + + Omega * cudaq::spin_operator::x(0) + + m_12 * Omega * cudaq::spin_operator::x(1); + + std::map<int, int> dimensions{{0, 2}, {1, 2}}; + + // Build the initial state + cudaq::matrix_2 rho_mat({1.0, 0.0}, {0.0, 0.0}); + + // Flatten the matrix + std::vector<std::complex<double>> flat_rho; + for (size_t j = 0; j < rho_mat.get_columns(); ++j) { + for (size_t i = 0; i < rho_mat.get_rows(); ++i) { + flat_rho.push_back(rho_mat[{i, j}]); + } + } + + cudaq::state_data rho_data = flat_rho; + auto rho0 = cudaq::state::from_data(rho_data); + + // Two initial state vectors for the 2-qubit system (dimension 4) + // psi_00 corresponds to |00> and psi_10 corresponds to |10> + auto psi_00 = cudaq::state::from_data({1.0, 0.0, 0.0, 0.0}); + auto psi_10 = cudaq::state::from_data({0.0, 0.0, 1.0, 0.0}); + + // Create a schedule of time steps + const int num_steps = 1001; + const double t0 = 0.0, t1 = 1.0; + std::vector<double> steps(num_steps); + double dt = (t1 - t0) / (num_steps - 1); + for (int i = 0; i < num_steps; ++i) { + steps[i] = t0 + i * dt; + } + std::vector<std::string> labels = {"time"}; + cudaq::Schedule schedule(steps, labels); + + cudaq::runge_kutta integrator; + integrator.dt = 0.001; + integrator.order = 1; + + std::vector<cudaq::product_operator<cudaq::spin_operator>> observables = { + cudaq::spin_operator::x(0), cudaq::spin_operator::y(0), + cudaq::spin_operator::z(0),
cudaq::spin_operator::x(1), + cudaq::spin_operator::y(1), cudaq::spin_operator::z(1)}; + + // Evolution for initial state |00> + auto evolution_result_00 = + cudaq::evolve(hamiltonian, dimensions, schedule, psi_00, observables, {}, + true, integrator); + + // Evolution for initial state |10> + auto evolution_result_10 = + cudaq::evolve(hamiltonian, dimensions, schedule, psi_10, observables, {}, + true, integrator); + + auto get_result = [](int idx, const auto &result) -> std::vector<double> { + std::vector<double> expectations; + auto all_exps = result.get_expectation_values().value(); + for (const auto &exp_vals : all_exps) { + expectations.push_back((double)exp_vals[idx]); + } + return expectations; + }; + + // For the two evolutions, extract the six observable trajectories. + auto result_00_0 = get_result(0, evolution_result_00); + auto result_00_1 = get_result(1, evolution_result_00); + auto result_00_2 = get_result(2, evolution_result_00); + auto result_00_3 = get_result(3, evolution_result_00); + auto result_00_4 = get_result(4, evolution_result_00); + auto result_00_5 = get_result(5, evolution_result_00); + + auto result_10_0 = get_result(0, evolution_result_10); + auto result_10_1 = get_result(1, evolution_result_10); + auto result_10_2 = get_result(2, evolution_result_10); + auto result_10_3 = get_result(3, evolution_result_10); + auto result_10_4 = get_result(4, evolution_result_10); + auto result_10_5 = get_result(5, evolution_result_10); + + // Plot the results + plt::figure_size(1000, 600); + + // Subplot 1: Z expectation value for qubit 1. + plt::subplot(1, 2, 1); + plt::plot(steps, result_00_5, {{"label", "$|\\psi_0\\rangle=|00\\rangle$"}}); + plt::plot(steps, result_10_5, {{"label", "$|\\psi_0\\rangle=|10\\rangle$"}}); + plt::xlabel("Time"); + plt::ylabel("$\\langle Z_2 \\rangle$"); + plt::legend(); + + // Subplot 2: Y expectation value for qubit 1. + plt::subplot(1, 2, 2); + plt::plot(steps, result_00_4, {{"label", "$|\\psi_0\\rangle=|00\\rangle$"}}); + plt::plot(steps, result_10_4, {{"label", "$|\\psi_0\\rangle=|10\\rangle$"}}); + plt::xlabel("Time"); + plt::ylabel("$\\langle Y_2 \\rangle$"); + plt::legend(); + + plt::save("cross_resonance.png"); + + std::cout << "Simulation complete. Plot saved to cross_resonance.png" + << std::endl; + return 0; +} \ No newline at end of file diff --git a/docs/sphinx/using/backends/sims/tnsims.rst b/docs/sphinx/using/backends/sims/tnsims.rst index 24c6fbd76a..2a2f56d11b 100644 --- a/docs/sphinx/using/backends/sims/tnsims.rst +++ b/docs/sphinx/using/backends/sims/tnsims.rst @@ -70,11 +70,12 @@ Use the following commands to enable distribution across multiple GPUs (adjust t mpiexec -np 2 ./program.x .. note:: + MPI parallelization on the :code:`tensornet` backend requires CUDA-Q's MPI support. + Please refer to the instructions on how to :ref:`enable MPI parallelization ` within CUDA-Q. + CUDA-Q containers are shipped with a pre-built MPI plugin; hence no additional setup is needed. - If the `CUTENSORNET_COMM_LIB` environment variable is not set, MPI parallelization on the :code:`tensornet` backend may fail. - If you are using a CUDA-Q container, this variable is pre-configured and no additional setup is needed. If you are customizing your installation or have built CUDA-Q from source, please follow the instructions for `activating the distributed interface `__ for the `cuTensorNet` library. This requires
This requires - :ref:`installing CUDA development dependencies `, and setting the `CUTENSORNET_COMM_LIB` - environment variable to the newly built `libcutensornet_distributed_interface_mpi.so` library. +.. note:: + If the `CUTENSORNET_COMM_LIB` environment variable is set following the activation procedure described in the `cuTensorNet documentation `__, the cuTensorNet MPI plugin will take precedence over the builtin support from CUDA-Q. Specific aspects of the simulation can be configured by setting the following of environment variables: diff --git a/include/cudaq/Optimizer/Dialect/CC/CCOps.td b/include/cudaq/Optimizer/Dialect/CC/CCOps.td index 8e059b41ed..9b1c3d7116 100644 --- a/include/cudaq/Optimizer/Dialect/CC/CCOps.td +++ b/include/cudaq/Optimizer/Dialect/CC/CCOps.td @@ -1459,8 +1459,8 @@ def cc_CallCallableOp : CCOp<"call_callable", [CallOpInterface]> { }]; } -def cc_CallIndirectCallableOp : CCOp<"call_indirect_callable", - [CallOpInterface]> { +def cc_CallIndirectCallableOp : + CCOp<"call_indirect_callable", [CallOpInterface]> { let summary = "Call a C++ callable, unresolved, at run-time."; let description = [{ This effectively connects a call from one kernel to another kernel, which @@ -1649,6 +1649,43 @@ def cc_CallableClosureOp : CCOp<"callable_closure", [Pure]> { }]; } +def cc_VarargCallOp : + CCOp<"call_vararg", [CallOpInterface, SymbolUserOpInterface]> { + let summary = "Create a call to an llvm.func with variadic arguments."; + let description = [{ + This operation lets us create a call to an LLVMIR FuncOp with variadic + arguments without the restriction that all the arguments have to be + converted to LLVMIR types first. These conversions are just code bloat and + make the code harder to read. + }]; + + let arguments = (ins + FlatSymbolRefAttr:$callee, + Variadic:$args + ); + let results = (outs Variadic); + + let assemblyFormat = [{ + $callee `(` $args `)` `:` functional-type(operands, results) attr-dict + }]; + + let extraClassDeclaration = [{ + operand_range getArgOperands() { + return {arg_operand_begin(), arg_operand_end()}; + } + + operand_iterator arg_operand_begin() { return operand_begin(); } + operand_iterator arg_operand_end() { return operand_end(); } + + /// Return the callee of this operation. 
+ mlir::CallInterfaceCallable getCallableForCallee() { + return getCalleeAttr(); + } + + mlir::LogicalResult verifySymbolUses(mlir::SymbolTableCollection &); + }]; +} + def cc_CreateStringLiteralOp : CCOp<"string_literal"> { let summary = "Create a constant string literal."; let description = [{ diff --git a/include/cudaq/Optimizer/InitAllPasses.h b/include/cudaq/Optimizer/InitAllPasses.h index cda83274dd..91724d36a4 100644 --- a/include/cudaq/Optimizer/InitAllPasses.h +++ b/include/cudaq/Optimizer/InitAllPasses.h @@ -22,6 +22,7 @@ inline void registerCudaqPassesAndPipelines() { // CUDA-Q pipelines opt::registerAggressiveEarlyInliningPipeline(); opt::registerUnrollingPipeline(); + opt::registerClassicalOptimizationPipeline(); opt::registerToExecutionManagerCCPipeline(); opt::registerToQIRAPIPipeline(); opt::registerTargetPipelines(); diff --git a/include/cudaq/Optimizer/Transforms/Passes.h b/include/cudaq/Optimizer/Transforms/Passes.h index 4bfddf6101..bd9ef52b47 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.h +++ b/include/cudaq/Optimizer/Transforms/Passes.h @@ -25,6 +25,7 @@ void addAggressiveEarlyInlining(mlir::OpPassManager &pm); void registerAggressiveEarlyInliningPipeline(); void registerUnrollingPipeline(); +void registerClassicalOptimizationPipeline(); void registerMappingPipeline(); std::unique_ptr<mlir::Pass> createApplyOpSpecializationPass(); diff --git a/include/cudaq/Optimizer/Transforms/Passes.td b/include/cudaq/Optimizer/Transforms/Passes.td index 1ccec4a427..8b9b75b89e 100644 --- a/include/cudaq/Optimizer/Transforms/Passes.td +++ b/include/cudaq/Optimizer/Transforms/Passes.td @@ -141,9 +141,36 @@ def CheckKernelCalls : Pass<"check-kernel-calls", "mlir::func::FuncOp"> { }]; } +def ClassicalOptimization : Pass<"classical-optimization"> { + let summary = "Perform classical optimizations until a fixed point is reached."; + let description = [{ + Performs a number of classical optimizations greedily until a fixed point + is reached: + - canonicalization + - simplify regions + - write-after-write-elimination + - lift-array-alloc + - cc-loop-normalize + - cc-loop-unroll + }]; + + let dependentDialects = ["mlir::arith::ArithDialect", + "mlir::cf::ControlFlowDialect", + "cudaq::cc::CCDialect"]; + + let options = [ + Option<"threshold", "maximum-iterations", "unsigned", /*default=*/"50", + "Maximum iterations to unroll.">, + Option<"allowClosedInterval", "allow-closed-interval", "bool", + /*default=*/"true", "Allow loop iterations on a closed interval.">, + Option<"allowBreak", "allow-early-exit", "bool", /*default=*/"false", + "Allow unrolling of loop with early exit (i.e.
break statement)."> ]; } def CombineMeasurements : Pass<"combine-measurements", "mlir::func::FuncOp"> { - let summary = "Extends mesurements on subveqs adds output names"; + let summary = "Extends measurements on subveqs and adds output names"; let description = [{ Replace a pattern such as: ``` diff --git a/lib/Optimizer/CodeGen/CCToLLVM.cpp b/lib/Optimizer/CodeGen/CCToLLVM.cpp index 5d8881ce44..4f6dd6fbf3 100644 --- a/lib/Optimizer/CodeGen/CCToLLVM.cpp +++ b/lib/Optimizer/CodeGen/CCToLLVM.cpp @@ -710,18 +710,33 @@ class UndefOpPattern : public ConvertOpToLLVMPattern<cudaq::cc::UndefOp> { return success(); } }; + +class VarargCallPattern + : public ConvertOpToLLVMPattern<cudaq::cc::VarargCallOp> { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + LogicalResult + matchAndRewrite(cudaq::cc::VarargCallOp vcall, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + SmallVector<Type> types; + for (auto ty : vcall.getResultTypes()) + types.push_back(getTypeConverter()->convertType(ty)); + rewriter.replaceOpWithNewOp<LLVM::CallOp>(vcall, types, vcall.getCallee(), + adaptor.getArgs()); + return success(); + } +}; } // namespace void cudaq::opt::populateCCToLLVMPatterns(LLVMTypeConverter &typeConverter, RewritePatternSet &patterns) { - patterns.insert<AddressOfOpPattern, AllocaOpPattern, CallableClosureOpPattern, - CallableFuncOpPattern, CallCallableOpPattern, - CallIndirectCallableOpPattern, CastOpPattern, ComputePtrOpPattern, - CreateStringLiteralOpPattern, ExtractValueOpPattern, FuncToPtrOpPattern, - GlobalOpPattern, InsertValueOpPattern, InstantiateCallableOpPattern, - LoadOpPattern, OffsetOfOpPattern, PoisonOpPattern, SizeOfOpPattern, - StdvecDataOpPattern, StdvecInitOpPattern, StdvecSizeOpPattern, - StoreOpPattern, UndefOpPattern>( - typeConverter); + patterns.insert< + AddressOfOpPattern, AllocaOpPattern, CallableClosureOpPattern, + CallableFuncOpPattern, CallCallableOpPattern, + CallIndirectCallableOpPattern, CastOpPattern, ComputePtrOpPattern, + CreateStringLiteralOpPattern, ExtractValueOpPattern, FuncToPtrOpPattern, + GlobalOpPattern, InsertValueOpPattern, InstantiateCallableOpPattern, + LoadOpPattern, OffsetOfOpPattern, PoisonOpPattern, SizeOfOpPattern, + StdvecDataOpPattern, StdvecInitOpPattern, StdvecSizeOpPattern, + StoreOpPattern, UndefOpPattern, VarargCallPattern>(typeConverter); } diff --git a/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp b/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp index fb48d7c753..6f8818855a 100644 --- a/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp +++ b/lib/Optimizer/CodeGen/ConvertToQIRAPI.cpp @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2022 - 2024 NVIDIA Corporation & Affiliates. * + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * * All rights reserved. * * * * This source code and the accompanying materials are made available under * @@ -41,6 +41,9 @@ namespace cudaq::opt { using namespace mlir; +// Attribute name used to mark kernels that have been processed. +static constexpr const char FuncIsQIRAPI[] = "qir-api"; + //===----------------------------------------------------------------------===// static std::string getGateName(Operation *op) { @@ -1150,8 +1153,8 @@ struct QuantumGatePattern : public OpConversionPattern { args.append(opTargs.begin(), opTargs.end()); // Call the generalized version of the gate invocation. - rewriter.create<func::CallOp>(loc, TypeRange{}, - cudaq::opt::NVQIRGeneralizedInvokeAny, args); + rewriter.create<cudaq::cc::VarargCallOp>( + loc, TypeRange{}, cudaq::opt::NVQIRGeneralizedInvokeAny, args); return forwardOrEraseOp(); } @@ -1228,17 +1231,30 @@ struct FuncSignaturePattern : public OpConversionPattern<func::FuncOp> { auto funcTy = func.getFunctionType(); auto newFuncTy = cast<FunctionType>(getTypeConverter()->convertType(funcTy)); - if (funcTy == newFuncTy) - return failure(); - if (funcTy.getNumInputs() && !func.getBody().empty()) { - // Replace the block argument types.
- for (auto [blockArg, argTy] : llvm::zip( - func.getBody().front().getArguments(), newFuncTy.getInputs())) - blockArg.setType(argTy); + if (funcTy != newFuncTy) { + // Convert the entry block to the new argument types. + if (funcTy.getNumInputs() && !func.getBody().empty()) { + // Replace the block argument types. + for (auto [blockArg, argTy] : llvm::zip( + func.getBody().front().getArguments(), newFuncTy.getInputs())) + blockArg.setType(argTy); + } + } + // Convert any other blocks, as needed. + for (auto &block : func.getBody().getBlocks()) { + if (&block == &func.getBody().front()) + continue; + SmallVector<Type> newTypes; + for (auto blockArg : block.getArguments()) + newTypes.push_back(getTypeConverter()->convertType(blockArg.getType())); + for (auto [blockArg, newTy] : llvm::zip(block.getArguments(), newTypes)) + blockArg.setType(newTy); } // Replace the signature. - rewriter.updateRootInPlace(func, - [&]() { func.setFunctionType(newFuncTy); }); + rewriter.updateRootInPlace(func, [&]() { + func.setFunctionType(newFuncTy); + func->setAttr(FuncIsQIRAPI, rewriter.getUnitAttr()); + }); return success(); } }; @@ -1359,10 +1375,36 @@ using CallIndirectCallableOpPattern = using CallOpPattern = CallOpInterfacePattern<func::CallOp>; using CallIndirectOpPattern = CallOpInterfacePattern<func::CallIndirectOp>; +using CallVarargOpPattern = CallOpInterfacePattern<cudaq::cc::VarargCallOp>; using CallCallableOpPattern = CallOpInterfacePattern<cudaq::cc::CallCallableOp>; using CallIndirectCallableOpPattern = CallOpInterfacePattern<cudaq::cc::CallIndirectCallableOp>; + +struct BranchOpPattern : public OpConversionPattern<cf::BranchOp> { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(cf::BranchOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp<cf::BranchOp>(op, adaptor.getDestOperands(), + op.getDest()); + return success(); + } +}; + +struct CondBranchOpPattern : public OpConversionPattern<cf::CondBranchOp> { + using OpConversionPattern::OpConversionPattern; + + LogicalResult + matchAndRewrite(cf::CondBranchOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + rewriter.replaceOpWithNewOp<cf::CondBranchOp>( + op, adaptor.getCondition(), adaptor.getTrueDestOperands(), + adaptor.getFalseDestOperands(), op.getTrueDest(), op.getFalseDest()); + return success(); + } +}; + //===----------------------------------------------------------------------===// // Patterns that are common to all QIR conversions.
//===----------------------------------------------------------------------===// @@ -1370,9 +1412,10 @@ static void commonClassicalHandlingPatterns(RewritePatternSet &patterns, TypeConverter &typeConverter, MLIRContext *ctx) { - patterns.insert(typeConverter, ctx); @@ -1590,8 +1633,10 @@ struct QuakeToQIRAPIPass target.addIllegalDialect<quake::QuakeDialect>(); target.addLegalOp(); - target.addDynamicallyLegalOp<func::FuncOp>( - [&](func::FuncOp fn) { return !hasQuakeType(fn.getFunctionType()); }); + target.addDynamicallyLegalOp<func::FuncOp>([&](func::FuncOp fn) { + return !hasQuakeType(fn.getFunctionType()) && + (!fn->hasAttr(cudaq::kernelAttrName) || fn->hasAttr(FuncIsQIRAPI)); + }); target.addDynamicallyLegalOp<func::ConstantOp>([&](func::ConstantOp fn) { return !hasQuakeType(fn.getResult().getType()); }); @@ -1615,19 +1660,26 @@ struct QuakeToQIRAPIPass [&](cudaq::cc::AllocaOp op) { return !hasQuakeType(op.getElementType()); }); - target.addDynamicallyLegalOp< - func::CallOp, func::CallIndirectOp, cudaq::cc::CallCallableOp, - cudaq::cc::CallIndirectCallableOp, cudaq::cc::CastOp, - cudaq::cc::FuncToPtrOp, cudaq::cc::StoreOp, cudaq::cc::LoadOp>( + target.addDynamicallyLegalOp<cf::BranchOp, cf::CondBranchOp>( [&](Operation *op) { for (auto opnd : op->getOperands()) if (hasQuakeType(opnd.getType())) return false; - for (auto res : op->getResults()) - if (hasQuakeType(res.getType())) - return false; return true; }); + target.addDynamicallyLegalOp< + func::CallOp, func::CallIndirectOp, cudaq::cc::VarargCallOp, + cudaq::cc::CallCallableOp, cudaq::cc::CallIndirectCallableOp, + cudaq::cc::CastOp, cudaq::cc::FuncToPtrOp, cudaq::cc::StoreOp, + cudaq::cc::LoadOp>([&](Operation *op) { + for (auto opnd : op->getOperands()) + if (hasQuakeType(opnd.getType())) + return false; + for (auto res : op->getResults()) + if (hasQuakeType(res.getType())) + return false; + return true; + }); target.markUnknownOpDynamicallyLegal([](Operation *) { return true; }); if (failed(applyPartialConversion(op, target, std::move(patterns)))) signalPassFailure(); diff --git a/lib/Optimizer/Dialect/CC/CCOps.cpp b/lib/Optimizer/Dialect/CC/CCOps.cpp index 823c92de99..8a1cb39271 100644 --- a/lib/Optimizer/Dialect/CC/CCOps.cpp +++ b/lib/Optimizer/Dialect/CC/CCOps.cpp @@ -2347,6 +2347,50 @@ LogicalResult cudaq::cc::UnwindReturnOp::verify() { return success(); } +//===----------------------------------------------------------------------===// +// VarargCallOp +//===----------------------------------------------------------------------===// + +LogicalResult +cudaq::cc::VarargCallOp::verifySymbolUses(SymbolTableCollection &symbolTable) { + // Check that the callee attribute was specified. + auto fnAttr = (*this)->getAttrOfType<FlatSymbolRefAttr>("callee"); + if (!fnAttr) + return emitOpError("requires a 'callee' symbol reference attribute"); + LLVM::LLVMFuncOp fn = + symbolTable.lookupNearestSymbolFrom<LLVM::LLVMFuncOp>(*this, fnAttr); + if (!fn) + return emitOpError() << "'" << fnAttr.getValue() + << "' does not reference a valid LLVM function"; + + // Verify that the operand and result types match the callee.
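+ // As an illustrative sketch (not part of this change): given a callee
+ // declared as `llvm.func @printf(!llvm.ptr<i8>, ...) -> i32`, a call
+ // `cc.call_vararg @printf(%fmt, %n) : (!cc.ptr<i8>, i64) -> i32` passes
+ // this verifier. The one declared parameter type must match, any extra
+ // operands are accepted as varargs, and the single result must match the
+ // callee's return type.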
+ auto fnType = fn.getFunctionType(); + if (fnType.getNumParams() > getNumOperands()) + return emitOpError("incorrect number of operands for callee"); + + for (unsigned i = 0, e = fnType.getNumParams(); i != e; ++i) + if (getOperand(i).getType() != fnType.getParams()[i]) { + return emitOpError("operand type mismatch: expected operand type ") + << fnType.getParams()[i] << ", but provided " + << getOperand(i).getType() << " for operand number " << i; + } + + if (fnType.getReturnType() == LLVM::LLVMVoidType::get(getContext()) && + getNumResults() == 0) + return success(); + + if (getNumResults() > 1) + return emitOpError("wrong number of result types: ") << getNumResults(); + + if (getResult(0).getType() != fnType.getReturnType()) { + auto diag = emitOpError("result type mismatch "); + diag.attachNote() << " op result types: " << getResultTypes(); + diag.attachNote() << "function result types: " << fnType.getReturnType(); + return diag; + } + return success(); +} + //===----------------------------------------------------------------------===// // Generated logic //===----------------------------------------------------------------------===// diff --git a/lib/Optimizer/Transforms/CMakeLists.txt b/lib/Optimizer/Transforms/CMakeLists.txt index d0a6b85799..55f20cc858 100644 --- a/lib/Optimizer/Transforms/CMakeLists.txt +++ b/lib/Optimizer/Transforms/CMakeLists.txt @@ -17,6 +17,7 @@ add_cudaq_library(OptTransforms ApplyOpSpecialization.cpp ArgumentSynthesis.cpp BasisConversion.cpp + ClassicalOptimization.cpp CombineMeasurements.cpp CombineQuantumAlloc.cpp ConstPropComplex.cpp @@ -54,6 +55,7 @@ add_cudaq_library(OptTransforms RegToMem.cpp StatePreparation.cpp UnitarySynthesis.cpp + UpdateRegisterNames.cpp WiresToWiresets.cpp WriteAfterWriteElimination.cpp diff --git a/lib/Optimizer/Transforms/ClassicalOptimization.cpp b/lib/Optimizer/Transforms/ClassicalOptimization.cpp new file mode 100644 index 0000000000..695efd37e0 --- /dev/null +++ b/lib/Optimizer/Transforms/ClassicalOptimization.cpp @@ -0,0 +1,152 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "LoopAnalysis.h" +#include "PassDetails.h" +#include "cudaq/Optimizer/Transforms/Passes.h" +#include "mlir/IR/Dominance.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" +#include "mlir/Transforms/Passes.h" +#include "mlir/Transforms/RegionUtils.h" + +namespace cudaq::opt { +#define GEN_PASS_DEF_CLASSICALOPTIMIZATION +#include "cudaq/Optimizer/Transforms/Passes.h.inc" +} // namespace cudaq::opt + +#define DEBUG_TYPE "classical-optimizations" + +using namespace mlir; + +#include "LiftArrayAllocPatterns.inc" +#include "LoopNormalizePatterns.inc" +#include "LoopUnrollPatterns.inc" +#include "LowerToCFGPatterns.inc" +#include "WriteAfterWriteEliminationPatterns.inc" + +namespace { + +/// The classical optimization pass performs a number of classical +/// optimizations greedily until no more changes can be made.
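+/// +/// As a usage sketch (the driver name is an assumption, not part of this +/// patch): once the pipeline below is registered, it can be invoked from an +/// MLIR opt-style tool, e.g. `cudaq-opt --classical-optimization-pipeline`.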
+class ClassicalOptimizationPass + : public cudaq::opt::impl::ClassicalOptimizationBase< + ClassicalOptimizationPass> { +public: + using ClassicalOptimizationBase::ClassicalOptimizationBase; + + void runOnOperation() override { + auto *ctx = &getContext(); + auto *op = getOperation(); + DominanceInfo domInfo(op); + auto func = dyn_cast<func::FuncOp>(op); + auto numLoops = countLoopOps(op); + unsigned progress = 0; + + RewritePatternSet patterns(ctx); + for (auto *dialect : ctx->getLoadedDialects()) + dialect->getCanonicalizationPatterns(patterns); + for (RegisteredOperationName op : ctx->getRegisteredOperations()) + op.getCanonicalizationPatterns(patterns, ctx); + + // Add patterns that help const prop loop boundaries computed + // in conditional statements, other loops, or arrays. + patterns.insert<RewriteIf>(ctx, /*rewriteOnlyIfConst=*/true); + patterns.insert<LoopPat>(ctx, allowClosedInterval, allowBreak); + patterns.insert<AllocaPattern>( + ctx, domInfo, func == nullptr ? "unknown" : func.getName()); + if (numLoops) + patterns.insert<UnrollCountedLoop>(ctx, threshold, + /*signalFailure=*/false, allowBreak, + progress); + + FrozenRewritePatternSet frozen(std::move(patterns)); + // Iterate over the loops until a fixed-point is reached. Some loops can + // only be unrolled if other loops are unrolled first and the constants + // iteratively propagated. + do { + // Remove overridden writes. + auto analysis = SimplifyWritesAnalysis(domInfo, op); + analysis.removeOverriddenStores(); + // Clean up dead code. + { + auto builder = OpBuilder(op); + IRRewriter rewriter(builder); + [[maybe_unused]] auto unused = + simplifyRegions(rewriter, op->getRegions()); + } + progress = 0; + (void)applyPatternsAndFoldGreedily(op, frozen); + } while (progress); + } + + static unsigned countLoopOps(Operation *op) { + unsigned result = 0; + op->walk([&](cudaq::cc::LoopOp loop) { result++; }); + LLVM_DEBUG(llvm::dbgs() << "Total number of loops: " << result << '\n'); + return result; + } +}; + +/// Classical optimization pipeline command-line options. These options are +/// similar to the ClassicalOptimization pass options, but have different +/// default settings. +struct ClassicalOptimizationPipelineOptions + : public PassPipelineOptions<ClassicalOptimizationPipelineOptions> { + PassOptions::Option<unsigned> threshold{ + *this, "threshold", + llvm::cl::desc("Maximum iterations to unroll. (default: 1024)"), + llvm::cl::init(1024)}; + PassOptions::Option<bool> allowBreak{ + *this, "allow-early-exit", + llvm::cl::desc("Allow unrolling of loop with early exit (i.e. break " + "statement). (default: true)"), + llvm::cl::init(true)}; + PassOptions::Option<bool> allowClosedInterval{ + *this, "allow-closed-interval", + llvm::cl::desc("Allow unrolling of loop with a closed interval form. " + "(default: true)"), + llvm::cl::init(true)}; +}; +} // namespace + +/// Add a pass pipeline to apply the requisite passes to optimize classical +/// code. When converting to a quantum circuit, the static control program is +/// fully expanded to eliminate control flow. +static void createClassicalOptimizationPipeline(OpPassManager &pm, + unsigned threshold, + bool allowBreak, + bool allowClosedInterval) { + pm.addNestedPass<func::FuncOp>(createCanonicalizerPass()); + pm.addNestedPass<func::FuncOp>(createCSEPass()); + pm.addNestedPass<func::FuncOp>(cudaq::opt::createClassicalMemToReg()); + + // Run classical optimization twice with a cse in between to optimize more + // code. + // TODO: run cse as a part of classical-optimization when we update the llvm + // version.
+ cudaq::opt::ClassicalOptimizationOptions options{ + threshold, allowClosedInterval, allowBreak}; + pm.addNestedPass<func::FuncOp>( + cudaq::opt::createClassicalOptimization(options)); + pm.addNestedPass<func::FuncOp>(createCSEPass()); + pm.addNestedPass<func::FuncOp>( + cudaq::opt::createClassicalOptimization(options)); + pm.addNestedPass<func::FuncOp>(cudaq::opt::createUpdateRegisterNames()); +} + +void cudaq::opt::registerClassicalOptimizationPipeline() { + PassPipelineRegistration<ClassicalOptimizationPipelineOptions>( + "classical-optimization-pipeline", "Fully optimize classical code.", + [](OpPassManager &pm, + const ClassicalOptimizationPipelineOptions &options) { + createClassicalOptimizationPipeline(pm, options.threshold, + options.allowBreak, + options.allowClosedInterval); + }); +} diff --git a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp index 0f7647b579..70113cf3e9 100644 --- a/lib/Optimizer/Transforms/LiftArrayAlloc.cpp +++ b/lib/Optimizer/Transforms/LiftArrayAlloc.cpp @@ -27,263 +27,9 @@ namespace cudaq::opt { using namespace mlir; -namespace { -class AllocaPattern : public OpRewritePattern { -public: - explicit AllocaPattern(MLIRContext *ctx, DominanceInfo &di, StringRef fn) - : OpRewritePattern(ctx), dom(di), funcName(fn) {} - - LogicalResult matchAndRewrite(cudaq::cc::AllocaOp alloc, - PatternRewriter &rewriter) const override { - SmallVector stores; - if (!isGoodCandidate(alloc, stores, dom)) - return failure(); - - LLVM_DEBUG(llvm::dbgs() << "Candidate was found\n"); - auto allocTy = alloc.getElementType(); - auto arrTy = cast(allocTy); - auto eleTy = arrTy.getElementType(); - - SmallVector values; - - // Every element of `stores` must be a cc::StoreOp with a ConstantOp as the - // value argument. Build the array attr to attach to a cc.const_array. - for (auto *op : stores) { - auto store = cast(op); - auto *valOp = store.getValue().getDefiningOp(); - if (auto con = dyn_cast(valOp)) - values.push_back(con.getValueAttr()); - else if (auto con = dyn_cast(valOp)) - values.push_back(con.getValueAttr()); - else - return alloc.emitOpError("could not fold"); - } - - // Create the cc.const_array. - auto valuesAttr = rewriter.getArrayAttr(values); - auto loc = alloc.getLoc(); - Value conArr = - rewriter.create(loc, arrTy, valuesAttr); - - assert(conArr && "must have created the constant array"); - LLVM_DEBUG(llvm::dbgs() << "constant array is:\n" << conArr << '\n'); - bool cannotEraseAlloc = false; - - // Collect all the stores, casts, and compute_ptr to be erased safely and in - // topological order. - SmallVector opsToErase; - auto insertOpToErase = [&](Operation *op) { - auto iter = std::find(opsToErase.begin(), opsToErase.end(), op); - if (iter == opsToErase.end()) - opsToErase.push_back(op); - }; - - // Rewalk all the uses of alloc, u, which must be cc.cast or cc.compute_ptr. - // For each u remove a store and replace a load with a cc.extract_value.
- for (auto *user : alloc->getUsers()) { - if (!user) - continue; - std::int32_t offset = 0; - if (auto cptr = dyn_cast(user)) - offset = cptr.getRawConstantIndices()[0]; - bool isLive = false; - if (!isa(user)) { - cannotEraseAlloc = isLive = true; - } else { - for (auto *useuser : user->getUsers()) { - if (!useuser) - continue; - if (auto load = dyn_cast(useuser)) { - rewriter.setInsertionPointAfter(useuser); - LLVM_DEBUG(llvm::dbgs() << "replaced load\n"); - rewriter.replaceOpWithNewOp( - load, eleTy, conArr, - ArrayRef{offset}); - continue; - } - if (isa(useuser)) { - insertOpToErase(useuser); - continue; - } - LLVM_DEBUG(llvm::dbgs() << "alloc is live\n"); - cannotEraseAlloc = isLive = true; - } - } - if (!isLive) - insertOpToErase(user); - } - - for (auto *e : opsToErase) - rewriter.eraseOp(e); - - if (cannotEraseAlloc) { - rewriter.setInsertionPointAfter(alloc); - rewriter.create(loc, conArr, alloc); - return success(); - } - rewriter.eraseOp(alloc); - return success(); - } - - // Determine if \p alloc is a legit candidate for promotion to a constant - // array value. \p scoreboard is a vector of store operations. Each element of - // the allocated array must be written to exactly 1 time, and the scoreboard - // is used to track these stores. \p dom is the dominance info for this - // function (to ensure the stores happen before uses). - static bool isGoodCandidate(cudaq::cc::AllocaOp alloc, - SmallVectorImpl &scoreboard, - DominanceInfo &dom) { - LLVM_DEBUG(llvm::dbgs() << "checking candidate\n"); - if (alloc.getSeqSize()) - return false; - auto arrTy = dyn_cast(alloc.getElementType()); - if (!arrTy || arrTy.isUnknownSize()) - return false; - auto arrEleTy = arrTy.getElementType(); - if (!isa(arrEleTy)) - return false; - - // There must be at least `size` uses to initialize the entire array. - auto size = arrTy.getSize(); - if (std::distance(alloc->getUses().begin(), alloc->getUses().end()) < size) - return false; - - // Keep a scoreboard for every element in the array. Every element *must* be - // stored to with a constant exactly one time. - scoreboard.resize(size); - for (int i = 0; i < size; i++) - scoreboard[i] = nullptr; - - SmallVector toGlobalUses; - SmallVector> loadSets(size); - - auto getWriteOp = [&](auto op, std::int32_t index) -> Operation * { - Operation *theStore = nullptr; - for (auto &use : op->getUses()) { - Operation *u = use.getOwner(); - if (!u) - return nullptr; - if (auto store = dyn_cast(u)) { - if (op.getOperation() == store.getPtrvalue().getDefiningOp()) { - if (theStore) { - LLVM_DEBUG(llvm::dbgs() - << "more than 1 store to element of array\n"); - return nullptr; - } - LLVM_DEBUG(llvm::dbgs() << "found store: " << store << "\n"); - theStore = u; - } - continue; - } - if (isa(u)) { - toGlobalUses.push_back(u); - continue; - } - if (isa(u)) { - loadSets[index].insert(u); - continue; - } - return nullptr; - } - return theStore && - isa_and_present( - dyn_cast(theStore) - .getValue() - .getDefiningOp()) - ? theStore - : nullptr; - }; - - auto unsizedArrTy = cudaq::cc::ArrayType::get(arrEleTy); - auto ptrUnsizedArrTy = cudaq::cc::PointerType::get(unsizedArrTy); - auto ptrArrEleTy = cudaq::cc::PointerType::get(arrEleTy); - for (auto &use : alloc->getUses()) { - // All uses *must* be a degenerate cc.cast, cc.compute_ptr, or - // cc.init_state. 
- auto *op = use.getOwner(); - if (!op) { - LLVM_DEBUG(llvm::dbgs() << "use was not an op\n"); - return false; - } - if (auto cptr = dyn_cast(op)) { - if (auto index = cptr.getConstantIndex(0)) - if (auto w = getWriteOp(cptr, *index)) - if (!scoreboard[*index]) { - scoreboard[*index] = w; - continue; - } - return false; - } - if (auto cast = dyn_cast(op)) { - // Process casts that are used in store ops. - if (cast.getType() == ptrArrEleTy) { - if (auto w = getWriteOp(cast, 0)) - if (!scoreboard[0]) { - scoreboard[0] = w; - continue; - } - return false; - } - // Process casts that are used in quake.init_state. - if (cast.getType() == ptrUnsizedArrTy) { - if (cast->hasOneUse()) { - auto &use = *cast->getUses().begin(); - Operation *u = use.getOwner(); - if (isa_and_present(u)) { - toGlobalUses.push_back(op); - continue; - } - } - return false; - } - LLVM_DEBUG(llvm::dbgs() << "unexpected cast: " << *op << '\n'); - toGlobalUses.push_back(op); - continue; - } - if (isa(op)) { - toGlobalUses.push_back(op); - continue; - } - LLVM_DEBUG(llvm::dbgs() << "unexpected use: " << *op << '\n'); - toGlobalUses.push_back(op); - } - - bool ok = std::all_of(scoreboard.begin(), scoreboard.end(), - [](bool b) { return b; }); - LLVM_DEBUG(llvm::dbgs() << "all elements of array are set: " << ok << '\n'); - if (ok) { - // Verify dominance relations. - - // For all stores, the store of an element $e$ must dominate all loads of - // $e$. - for (int i = 0; i < size; ++i) { - for (auto *load : loadSets[i]) - if (!dom.dominates(scoreboard[i], load)) { - LLVM_DEBUG(llvm::dbgs() - << "store " << scoreboard[i] - << " doesn't dominate load: " << *load << '\n'); - return false; - } - } - - // For all global uses, all of the stores must dominate every use. - for (auto *glob : toGlobalUses) { - for (auto *store : scoreboard) - if (!dom.dominates(store, glob)) { - LLVM_DEBUG(llvm::dbgs() - << "store " << store << " doesn't dominate op: " << *glob - << '\n'); - return false; - } - } - } - return ok; - } - - DominanceInfo &dom; - StringRef funcName; -}; +#include "LiftArrayAllocPatterns.inc" namespace { class LiftArrayAllocPass : public cudaq::opt::impl::LiftArrayAllocBase<LiftArrayAllocPass> { public: diff --git a/lib/Optimizer/Transforms/LiftArrayAllocPatterns.inc b/lib/Optimizer/Transforms/LiftArrayAllocPatterns.inc new file mode 100644 index 0000000000..d87995ffb8 --- /dev/null +++ b/lib/Optimizer/Transforms/LiftArrayAllocPatterns.inc @@ -0,0 +1,276 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +// These patterns are used by the lift-array-alloc and classical-optimization +// passes. + +// This file must be included after a `using namespace mlir;` as it uses bare +// identifiers from that namespace.
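+// For example, LiftArrayAlloc.cpp and ClassicalOptimization.cpp in this +// patch include it as: +// +// using namespace mlir; +// #include "LiftArrayAllocPatterns.inc"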
+ +namespace { +class AllocaPattern : public OpRewritePattern<cudaq::cc::AllocaOp> { +public: + explicit AllocaPattern(MLIRContext *ctx, DominanceInfo &di, StringRef fn) + : OpRewritePattern(ctx), dom(di), funcName(fn) {} + + LogicalResult matchAndRewrite(cudaq::cc::AllocaOp alloc, + PatternRewriter &rewriter) const override { + SmallVector<Operation *> stores; + if (!isGoodCandidate(alloc, stores, dom)) + return failure(); + + LLVM_DEBUG(llvm::dbgs() << "Candidate was found\n"); + auto allocTy = alloc.getElementType(); + auto arrTy = cast<cudaq::cc::ArrayType>(allocTy); + auto eleTy = arrTy.getElementType(); + + SmallVector<Attribute> values; + + // Every element of `stores` must be a cc::StoreOp with a ConstantOp as the + // value argument. Build the array attr to attach to a cc.const_array. + for (auto *op : stores) { + auto store = cast<cudaq::cc::StoreOp>(op); + auto *valOp = store.getValue().getDefiningOp(); + if (auto con = dyn_cast<arith::ConstantOp>(valOp)) + values.push_back(con.getValueAttr()); + else if (auto con = dyn_cast<complex::ConstantOp>(valOp)) + values.push_back(con.getValueAttr()); + else + return alloc.emitOpError("could not fold"); + } + + // Create the cc.const_array. + auto valuesAttr = rewriter.getArrayAttr(values); + auto loc = alloc.getLoc(); + Value conArr = + rewriter.create<cudaq::cc::ConstantArrayOp>(loc, arrTy, valuesAttr); + + assert(conArr && "must have created the constant array"); + LLVM_DEBUG(llvm::dbgs() << "constant array is:\n" << conArr << '\n'); + bool cannotEraseAlloc = false; + + // Collect all the stores, casts, and compute_ptr to be erased safely and in + // topological order. + SmallVector<Operation *> opsToErase; + auto insertOpToErase = [&](Operation *op) { + auto iter = std::find(opsToErase.begin(), opsToErase.end(), op); + if (iter == opsToErase.end()) + opsToErase.push_back(op); + }; + + // Rewalk all the uses of alloc, u, which must be cc.cast or cc.compute_ptr. + // For each u remove a store and replace a load with a cc.extract_value. + for (auto *user : alloc->getUsers()) { + if (!user) + continue; + std::int32_t offset = 0; + if (auto cptr = dyn_cast<cudaq::cc::ComputePtrOp>(user)) + offset = cptr.getRawConstantIndices()[0]; + bool isLive = false; + if (!isa<cudaq::cc::CastOp, cudaq::cc::ComputePtrOp>(user)) { + cannotEraseAlloc = isLive = true; + } else { + for (auto *useuser : user->getUsers()) { + if (!useuser) + continue; + if (auto load = dyn_cast<cudaq::cc::LoadOp>(useuser)) { + rewriter.setInsertionPointAfter(useuser); + LLVM_DEBUG(llvm::dbgs() << "replaced load\n"); + auto extractValue = rewriter.create<cudaq::cc::ExtractValueOp>( + loc, eleTy, conArr, + ArrayRef<std::int32_t>{offset}); + rewriter.replaceAllUsesWith(load, extractValue); + insertOpToErase(load); + continue; + } + if (isa<cudaq::cc::StoreOp>(useuser)) { + insertOpToErase(useuser); + continue; + } + LLVM_DEBUG(llvm::dbgs() << "alloc is live\n"); + cannotEraseAlloc = isLive = true; + } + } + if (!isLive) + insertOpToErase(user); + } + + for (auto *e : opsToErase) + rewriter.eraseOp(e); + + if (cannotEraseAlloc) { + rewriter.setInsertionPointAfter(alloc); + rewriter.create<cudaq::cc::StoreOp>(loc, conArr, alloc); + return success(); + } + rewriter.eraseOp(alloc); + return success(); + } + + // Determine if \p alloc is a legit candidate for promotion to a constant + // array value. \p scoreboard is a vector of store operations. Each element of + // the allocated array must be written to exactly 1 time, and the scoreboard + // is used to track these stores. \p dom is the dominance info for this + // function (to ensure the stores happen before uses).
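+ // + // As a rough, illustrative sketch (not verbatim IR from this patch), a + // good candidate looks like: + // %a = cc.alloca !cc.array<i64 x 2> + // ... exactly one constant store to element 0 and to element 1 ... + // ... followed only by loads of those elements ... + // after which the loads become cc.extract_value on a cc.const_array and + // the alloca can be erased.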
+ static bool isGoodCandidate(cudaq::cc::AllocaOp alloc, + SmallVectorImpl<Operation *> &scoreboard, + DominanceInfo &dom) { + if (alloc.getSeqSize()) + return false; + auto arrTy = dyn_cast<cudaq::cc::ArrayType>(alloc.getElementType()); + if (!arrTy || arrTy.isUnknownSize()) + return false; + auto arrEleTy = arrTy.getElementType(); + if (!isa(arrEleTy)) + return false; + + // There must be at least `size` uses to initialize the entire array. + auto size = arrTy.getSize(); + if (std::distance(alloc->getUses().begin(), alloc->getUses().end()) < size) + return false; + + // Keep a scoreboard for every element in the array. Every element *must* be + // stored to with a constant exactly one time. + scoreboard.resize(size); + for (int i = 0; i < size; i++) + scoreboard[i] = nullptr; + + SmallVector<Operation *> toGlobalUses; + SmallVector<SmallPtrSet<Operation *, 2>> loadSets(size); + + auto getWriteOp = [&](auto op, std::int32_t index) -> Operation * { + Operation *theStore = nullptr; + for (auto &use : op->getUses()) { + Operation *u = use.getOwner(); + if (!u) + return nullptr; + + if (auto store = dyn_cast<cudaq::cc::StoreOp>(u)) { + if (op.getOperation() == store.getPtrvalue().getDefiningOp()) { + if (theStore) { + LLVM_DEBUG(llvm::dbgs() + << "more than 1 store to element of array\n"); + return nullptr; + } + LLVM_DEBUG(llvm::dbgs() << "found store: " << store << "\n"); + theStore = u; + } + continue; + } + if (isa<quake::InitializeStateOp>(u)) { + toGlobalUses.push_back(u); + continue; + } + if (isa<cudaq::cc::LoadOp>(u)) { + loadSets[index].insert(u); + continue; + } + return nullptr; + } + return theStore && + isa_and_present<arith::ConstantOp, complex::ConstantOp>( + dyn_cast<cudaq::cc::StoreOp>(theStore) + .getValue() + .getDefiningOp()) + ? theStore + : nullptr; + }; + + auto unsizedArrTy = cudaq::cc::ArrayType::get(arrEleTy); + auto ptrUnsizedArrTy = cudaq::cc::PointerType::get(unsizedArrTy); + auto ptrArrEleTy = cudaq::cc::PointerType::get(arrEleTy); + for (auto &use : alloc->getUses()) { + // All uses *must* be a degenerate cc.cast, cc.compute_ptr, or + // cc.init_state. + auto *op = use.getOwner(); + if (!op) { + LLVM_DEBUG(llvm::dbgs() << "use was not an op\n"); + return false; + } + if (auto cptr = dyn_cast<cudaq::cc::ComputePtrOp>(op)) { + if (auto index = cptr.getConstantIndex(0)) + if (auto w = getWriteOp(cptr, *index)) + if (!scoreboard[*index]) { + scoreboard[*index] = w; + continue; + } + return false; + } + if (auto cast = dyn_cast<cudaq::cc::CastOp>(op)) { + // Process casts that are used in store ops. + if (cast.getType() == ptrArrEleTy) { + if (auto w = getWriteOp(cast, 0)) + if (!scoreboard[0]) { + scoreboard[0] = w; + continue; + } + return false; + } + // Process casts that are used in quake.init_state. + if (cast.getType() == ptrUnsizedArrTy) { + if (cast->hasOneUse()) { + auto &use = *cast->getUses().begin(); + Operation *u = use.getOwner(); + if (isa_and_present<quake::InitializeStateOp>(u)) { + toGlobalUses.push_back(op); + continue; + } + } + return false; + } + LLVM_DEBUG(llvm::dbgs() << "unexpected cast: " << *op << '\n'); + toGlobalUses.push_back(op); + continue; + } + if (isa(op)) { + toGlobalUses.push_back(op); + continue; + } + LLVM_DEBUG(llvm::dbgs() << "unexpected use: " << *op << '\n'); + toGlobalUses.push_back(op); + } + + bool ok = std::all_of(scoreboard.begin(), scoreboard.end(), + [](bool b) { return b; }); + LLVM_DEBUG(llvm::dbgs() << "all elements of array are set: " << ok << '\n'); + if (ok) { + // Verify dominance relations. + + // For all stores, the store of an element $e$ must dominate all loads of + // $e$.
+ for (int i = 0; i < size; ++i) { + for (auto *load : loadSets[i]) + if (!dom.dominates(scoreboard[i], load)) { + LLVM_DEBUG(llvm::dbgs() + << "store " << scoreboard[i] + << " doesn't dominate load: " << *load << '\n'); + return false; + } + } + + // For all global uses, all of the stores must dominate every use. + for (auto *glob : toGlobalUses) { + for (auto *store : scoreboard) + if (!dom.dominates(store, glob)) { + LLVM_DEBUG(llvm::dbgs() + << "store " << store << " doesn't dominate op: " << *glob + << '\n'); + return false; + } + } + } + return ok; + } + + DominanceInfo &dom; + StringRef funcName; +}; +} // namespace diff --git a/lib/Optimizer/Transforms/LoopAnalysis.cpp b/lib/Optimizer/Transforms/LoopAnalysis.cpp index c40e2b7e30..d069c6eca0 100644 --- a/lib/Optimizer/Transforms/LoopAnalysis.cpp +++ b/lib/Optimizer/Transforms/LoopAnalysis.cpp @@ -7,6 +7,7 @@ ******************************************************************************/ #include "LoopAnalysis.h" +#include "cudaq/Optimizer/Builder/Factory.h" #include "mlir/IR/Dominance.h" using namespace mlir; @@ -210,6 +211,10 @@ static BlockArgument getLinearExpr(Value expr, return scaledIteration(expr); } +static unsigned bitWidth(Value val) { + return cast<IntegerType>(val.getType()).getWidth(); +} + namespace cudaq { bool opt::isSemiOpenPredicate(arith::CmpIPredicate p) { @@ -223,6 +228,11 @@ bool opt::isUnsignedPredicate(arith::CmpIPredicate p) { p == arith::CmpIPredicate::ugt || p == arith::CmpIPredicate::uge; } +bool opt::isSignedPredicate(arith::CmpIPredicate p) { + return p == arith::CmpIPredicate::slt || p == arith::CmpIPredicate::sle || + p == arith::CmpIPredicate::sgt || p == arith::CmpIPredicate::sge; +} + // We expect the loop control value to have the following form. // // %final = cc.loop while ((%iter = %initial) -> (iN)) { @@ -314,26 +324,296 @@ bool opt::isaConstantUpperBoundLoop(cc::LoopOp loop, bool allowClosedInterval) { isaConstant(c.compareValue); } -Value opt::LoopComponents::getCompareInduction() { +Value opt::LoopComponents::getCompareInduction() const { auto cmpOp = cast<arith::CmpIOp>(compareOp); return cmpOp.getLhs() == compareValue ? cmpOp.getRhs() : cmpOp.getLhs(); } -bool opt::LoopComponents::stepIsAnAddOp() { return isa(stepOp); } +bool opt::LoopComponents::stepIsAnAddOp() const { + return isa<arith::AddIOp>(stepOp); +} -bool opt::LoopComponents::shouldCommuteStepOp() { +bool opt::LoopComponents::shouldCommuteStepOp() const { if (auto addOp = dyn_cast_or_null<arith::AddIOp>(stepOp)) return addOp.getRhs() == stepRegion->front().getArgument(induction); // Note: we don't allow induction on lhs of subtraction.
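+ // For example, a step computed as `%c + %i` is commuted and handled like + // `%i + %c`, while a step of the form `%c - %i` has no such rewrite here.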
return false; } -bool opt::LoopComponents::isClosedIntervalForm() { +bool opt::LoopComponents::isClosedIntervalForm() const { auto cmp = cast<arith::CmpIOp>(compareOp); return ::isClosedIntervalForm(cmp.getPredicate()); } -bool opt::LoopComponents::isLinearExpr() { return addendValue || scaleValue; } +bool opt::LoopComponents::isLinearExpr() const { + return addendValue || scaleValue; +} + +std::int64_t opt::LoopComponents::extendValue(unsigned width, + std::size_t val) const { + const bool signExt = + isSignedPredicate(cast<arith::CmpIOp>(compareOp).getPredicate()); + std::int64_t result = val; + switch (width) { + case 8: + if (signExt) { + std::int8_t v = val & 0xFF; + result = v; + } else { + std::uint8_t v = val & 0xFF; + result = v; + } + break; + case 16: + if (signExt) { + std::int16_t v = val & 0xFFFF; + result = v; + } else { + std::uint16_t v = val & 0xFFFF; + result = v; + } + break; + case 32: + if (signExt) { + std::int32_t v = val & 0xFFFFFFFF; + result = v; + } else { + std::uint32_t v = val & 0xFFFFFFFF; + result = v; + } + break; + default: + break; + } + return result; +} + +bool opt::LoopComponents::hasAlwaysTrueCondition() const { + auto cmpValOpt = factory::maybeValueOfIntConstant(compareValue); + if (!cmpValOpt) + return false; + auto width = bitWidth(compareValue); + std::int64_t cmpVal = *cmpValOpt; + auto pred = cast<arith::CmpIOp>(compareOp).getPredicate(); + switch (width) { + case 8: { + switch (pred) { + case arith::CmpIPredicate::sge: + return static_cast<std::int8_t>(cmpVal) == + std::numeric_limits<std::int8_t>::min(); + case arith::CmpIPredicate::sle: + return static_cast<std::int8_t>(cmpVal) == + std::numeric_limits<std::int8_t>::max(); + case arith::CmpIPredicate::uge: + return static_cast<std::uint8_t>(cmpVal) == + std::numeric_limits<std::uint8_t>::min(); + case arith::CmpIPredicate::ule: + return static_cast<std::uint8_t>(cmpVal) == + std::numeric_limits<std::uint8_t>::max(); + default: + break; + } + } break; + case 16: { + switch (pred) { + case arith::CmpIPredicate::sge: + return static_cast<std::int16_t>(cmpVal) == + std::numeric_limits<std::int16_t>::min(); + case arith::CmpIPredicate::sle: + return static_cast<std::int16_t>(cmpVal) == + std::numeric_limits<std::int16_t>::max(); + case arith::CmpIPredicate::uge: + return static_cast<std::uint16_t>(cmpVal) == + std::numeric_limits<std::uint16_t>::min(); + case arith::CmpIPredicate::ule: + return static_cast<std::uint16_t>(cmpVal) == + std::numeric_limits<std::uint16_t>::max(); + default: + break; + } + } break; + case 32: { + switch (pred) { + case arith::CmpIPredicate::sge: + return static_cast<std::int32_t>(cmpVal) == + std::numeric_limits<std::int32_t>::min(); + case arith::CmpIPredicate::sle: + return static_cast<std::int32_t>(cmpVal) == + std::numeric_limits<std::int32_t>::max(); + case arith::CmpIPredicate::uge: + return static_cast<std::uint32_t>(cmpVal) == + std::numeric_limits<std::uint32_t>::min(); + case arith::CmpIPredicate::ule: + return static_cast<std::uint32_t>(cmpVal) == + std::numeric_limits<std::uint32_t>::max(); + default: + break; + } + } break; + case 64: { + switch (pred) { + case arith::CmpIPredicate::sge: + return static_cast<std::int64_t>(cmpVal) == + std::numeric_limits<std::int64_t>::min(); + case arith::CmpIPredicate::sle: + return static_cast<std::int64_t>(cmpVal) == + std::numeric_limits<std::int64_t>::max(); + case arith::CmpIPredicate::uge: + return static_cast<std::uint64_t>(cmpVal) == + std::numeric_limits<std::uint64_t>::min(); + case arith::CmpIPredicate::ule: + return static_cast<std::uint64_t>(cmpVal) == + std::numeric_limits<std::uint64_t>::max(); + default: + break; + } + } break; + default: + break; + } + return false; +} + +bool opt::LoopComponents::hasAlwaysFalseCondition() const { + auto cmpValOpt = factory::maybeValueOfIntConstant(compareValue); + if (!cmpValOpt) + return false; + auto width = bitWidth(compareValue); + std::int64_t cmpVal = *cmpValOpt; + auto pred = cast<arith::CmpIOp>(compareOp).getPredicate(); + switch (width) { + case 8: { + switch (pred)
+
+bool opt::LoopComponents::hasAlwaysFalseCondition() const {
+  auto cmpValOpt = factory::maybeValueOfIntConstant(compareValue);
+  if (!cmpValOpt)
+    return false;
+  auto width = bitWidth(compareValue);
+  std::int64_t cmpVal = *cmpValOpt;
+  auto pred = cast<arith::CmpIOp>(compareOp).getPredicate();
+  switch (width) {
+  case 8: {
+    switch (pred) {
+    case arith::CmpIPredicate::slt:
+      return static_cast<std::int8_t>(cmpVal) ==
+             std::numeric_limits<std::int8_t>::min();
+    case arith::CmpIPredicate::sgt:
+      return static_cast<std::int8_t>(cmpVal) ==
+             std::numeric_limits<std::int8_t>::max();
+    case arith::CmpIPredicate::ult:
+      return static_cast<std::uint8_t>(cmpVal) ==
+             std::numeric_limits<std::uint8_t>::min();
+    case arith::CmpIPredicate::ugt:
+      return static_cast<std::uint8_t>(cmpVal) ==
+             std::numeric_limits<std::uint8_t>::max();
+    default:
+      break;
+    }
+  } break;
+  case 16: {
+    switch (pred) {
+    case arith::CmpIPredicate::slt:
+      return static_cast<std::int16_t>(cmpVal) ==
+             std::numeric_limits<std::int16_t>::min();
+    case arith::CmpIPredicate::sgt:
+      return static_cast<std::int16_t>(cmpVal) ==
+             std::numeric_limits<std::int16_t>::max();
+    case arith::CmpIPredicate::ult:
+      return static_cast<std::uint16_t>(cmpVal) ==
+             std::numeric_limits<std::uint16_t>::min();
+    case arith::CmpIPredicate::ugt:
+      return static_cast<std::uint16_t>(cmpVal) ==
+             std::numeric_limits<std::uint16_t>::max();
+    default:
+      break;
+    }
+  } break;
+  case 32: {
+    switch (pred) {
+    case arith::CmpIPredicate::slt:
+      return static_cast<std::int32_t>(cmpVal) ==
+             std::numeric_limits<std::int32_t>::min();
+    case arith::CmpIPredicate::sgt:
+      return static_cast<std::int32_t>(cmpVal) ==
+             std::numeric_limits<std::int32_t>::max();
+    case arith::CmpIPredicate::ult:
+      return static_cast<std::uint32_t>(cmpVal) ==
+             std::numeric_limits<std::uint32_t>::min();
+    case arith::CmpIPredicate::ugt:
+      return static_cast<std::uint32_t>(cmpVal) ==
+             std::numeric_limits<std::uint32_t>::max();
+    default:
+      break;
+    }
+  } break;
+  case 64: {
+    switch (pred) {
+    case arith::CmpIPredicate::slt:
+      return static_cast<std::int64_t>(cmpVal) ==
+             std::numeric_limits<std::int64_t>::min();
+    case arith::CmpIPredicate::sgt:
+      return static_cast<std::int64_t>(cmpVal) ==
+             std::numeric_limits<std::int64_t>::max();
+    case arith::CmpIPredicate::ult:
+      return static_cast<std::uint64_t>(cmpVal) ==
+             std::numeric_limits<std::uint64_t>::min();
+    case arith::CmpIPredicate::ugt:
+      return static_cast<std::uint64_t>(cmpVal) ==
+             std::numeric_limits<std::uint64_t>::max();
+    default:
+      break;
+    }
+  } break;
+  default:
+    break;
+  }
+  return false;
+}
+
+std::optional<std::int64_t>
+opt::LoopComponents::getIterationsConstant() const {
+  auto initValOpt = factory::maybeValueOfIntConstant(initialValue);
+  if (!initValOpt)
+    return std::nullopt;
+  std::int64_t initVal = extendValue(bitWidth(initialValue), *initValOpt);
+  auto endValOpt = factory::maybeValueOfIntConstant(compareValue);
+  if (!endValOpt)
+    return std::nullopt;
+  std::int64_t endVal = extendValue(bitWidth(compareValue), *endValOpt);
+  auto stepValOpt = factory::maybeValueOfIntConstant(stepValue);
+  if (!stepValOpt)
+    return std::nullopt;
+  std::int64_t stepVal = extendValue(bitWidth(stepValue), *stepValOpt);
+  if (!stepIsAnAddOp())
+    stepVal = -stepVal;
+  if (isLinearExpr()) {
+    if (addendValue) {
+      auto addendOpt = factory::maybeValueOfIntConstant(addendValue);
+      if (!addendOpt)
+        return std::nullopt;
+      std::int64_t addend = extendValue(bitWidth(addendValue), *addendOpt);
+      if (negatedAddend)
+        endVal += addend;
+      else
+        endVal -= addend;
+    }
+    if (minusOneMult) {
+      initVal = -initVal;
+      stepVal = -stepVal;
+    }
+    if (scaleValue) {
+      auto scaleValOpt = factory::maybeValueOfIntConstant(scaleValue);
+      if (!scaleValOpt)
+        return std::nullopt;
+      std::int64_t scaleVal = extendValue(bitWidth(scaleValue), *scaleValOpt);
+      if (reciprocalScale) {
+        endVal *= scaleVal;
+      } else {
+        endVal *= scaleVal;
+        stepVal *= scaleVal;
+      }
+    }
+  }
+  if (!isClosedIntervalForm()) {
+    if (stepVal < 0)
+      endVal += 1;
+    else
+      endVal -= 1;
+  }
+  std::int64_t result = (endVal - initVal + stepVal) / stepVal;
+  if (result < 0)
+    result = 0;
+  return {result};
+}
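+
+// A worked example, assuming the common semi-open form: for
+// `for (i = 0; i < 10; i += 2)` the bound is adjusted to `endVal = 9`,
+// giving `iterations = (9 - 0 + 2) / 2 = 5`.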
 
 template constexpr int computeArgsOffset() {
@@ -350,7 +630,9 @@ std::optional<opt::LoopComponents> opt::getLoopComponents(cc::LoopOp loop) {
     auto &whileEntry = whileRegion.front();
     auto condOp = cast<cudaq::cc::ConditionOp>(whileRegion.back().back());
     result.compareOp = condOp.getCondition().getDefiningOp();
-    auto cmpOp = cast<arith::CmpIOp>(result.compareOp);
+    auto cmpOp = dyn_cast<arith::CmpIOp>(result.compareOp);
+    if (!cmpOp)
+      return {};
     auto argumentToCompare = [&](unsigned idx) -> bool {
       return (getLinearExpr(cmpOp.getLhs(), result, loop) ==
diff --git a/lib/Optimizer/Transforms/LoopAnalysis.h b/lib/Optimizer/Transforms/LoopAnalysis.h
index 1d2f6181f0..12f7310655 100644
--- a/lib/Optimizer/Transforms/LoopAnalysis.h
+++ b/lib/Optimizer/Transforms/LoopAnalysis.h
@@ -14,17 +14,29 @@ namespace cudaq::opt {
 
 // Loops that are transformed into normal form have this attribute.
 static constexpr char NormalizedLoopAttr[] = "normalized";
+static constexpr char DeadLoopAttr[] = "dead";
 
 struct LoopComponents {
   LoopComponents() = default;
 
   // Get the induction expression of the comparison.
-  mlir::Value getCompareInduction();
+  mlir::Value getCompareInduction() const;
 
-  bool stepIsAnAddOp();
-  bool shouldCommuteStepOp();
-  bool isClosedIntervalForm();
-  bool isLinearExpr();
+  bool stepIsAnAddOp() const;
+  bool shouldCommuteStepOp() const;
+  bool isClosedIntervalForm() const;
+  bool isLinearExpr() const;
+  std::optional<std::int64_t> getIterationsConstant() const;
+
+  // Determine if the condition is always true. e.g., `x uge 0`.
+  bool hasAlwaysTrueCondition() const;
+  // Determine if the condition is always false. e.g., `x ult 0`.
+  bool hasAlwaysFalseCondition() const;
+  bool hasInvariantCondition() const {
+    return hasAlwaysTrueCondition() || hasAlwaysFalseCondition();
+  }
+
+  std::int64_t extendValue(unsigned width, std::size_t val) const;
 
   unsigned induction = 0;
   mlir::Value initialValue;
@@ -50,6 +62,7 @@ struct LoopComponents {
 /// Does boundary test defines a semi-open interval?
 bool isSemiOpenPredicate(mlir::arith::CmpIPredicate p);
 bool isUnsignedPredicate(mlir::arith::CmpIPredicate p);
+bool isSignedPredicate(mlir::arith::CmpIPredicate p);
 
 /// A counted loop is defined to be a loop that will execute some compile-time
 /// constant number of iterations. We recognize a normalized, semi-open interval
diff --git a/lib/Optimizer/Transforms/LoopNormalize.cpp b/lib/Optimizer/Transforms/LoopNormalize.cpp
index faf0b3ea64..2e15bbb069 100644
--- a/lib/Optimizer/Transforms/LoopNormalize.cpp
+++ b/lib/Optimizer/Transforms/LoopNormalize.cpp
@@ -23,145 +23,9 @@ namespace cudaq::opt {
 
 using namespace mlir;
 
-// Return true if \p loop is not monotonic or it is an invariant loop.
-// Normalization is to be done on any loop that is monotonic and not
-// invariant (which includes loops that are already in counted form).
-static bool isNotMonotonicOrInvariant(cudaq::cc::LoopOp loop,
-                                      bool allowClosedInterval,
-                                      bool allowEarlyExit) {
-  cudaq::opt::LoopComponents c;
-  return !cudaq::opt::isaMonotonicLoop(loop, allowEarlyExit, &c) ||
-         (cudaq::opt::isaInvariantLoop(c, allowClosedInterval) &&
-          !c.isLinearExpr());
-}
+#include "LoopNormalizePatterns.inc"
 
 namespace {
-class LoopPat : public OpRewritePattern<cudaq::cc::LoopOp> {
-public:
-  explicit LoopPat(MLIRContext *ctx, bool aci, bool ab)
-      : OpRewritePattern(ctx), allowClosedInterval(aci), allowEarlyExit(ab) {}
-
-  LogicalResult matchAndRewrite(cudaq::cc::LoopOp loop,
-                                PatternRewriter &rewriter) const override {
-    if (loop->hasAttr(cudaq::opt::NormalizedLoopAttr))
-      return failure();
-    if (isNotMonotonicOrInvariant(loop, allowClosedInterval, allowEarlyExit))
-      return failure();
-
-    // loop is monotonic but not invariant.
-    LLVM_DEBUG(llvm::dbgs() << "loop before normalization: " << loop << '\n');
-    auto componentsOpt = cudaq::opt::getLoopComponents(loop);
-    assert(componentsOpt && "loop must have components");
-    auto c = *componentsOpt;
-    auto loc = loop.getLoc();
-
-    // 1) Set initial value to 0.
-    auto ty = c.initialValue.getType();
-    rewriter.startRootUpdate(loop);
-    auto createConstantOp = [&](std::int64_t val) -> Value {
-      return rewriter.create<arith::ConstantIntOp>(loc, val, ty);
-    };
-    auto zero = createConstantOp(0);
-    loop->setOperand(c.induction, zero);
-
-    // 2) Compute the number of iterations as an invariant. `iterations = max(0,
-    // (upper - lower + step) / step)`.
-    Value upper = c.compareValue;
-    auto one = createConstantOp(1);
-    Value step = c.stepValue;
-    Value lower = c.initialValue;
-    if (!c.stepIsAnAddOp())
-      step = rewriter.create<arith::SubIOp>(loc, zero, step);
-    if (c.isLinearExpr()) {
-      // Induction is part of a linear expression. Deal with the terms of the
-      // equation. `m` scales the step. `b` is an addend to the lower bound.
-      if (c.addendValue) {
-        if (c.negatedAddend) {
-          // `m * i - b`, u += `b`.
-          upper = rewriter.create<arith::AddIOp>(loc, upper, c.addendValue);
-        } else {
-          // `m * i + b`, u -= `b`.
-          upper = rewriter.create<arith::SubIOp>(loc, upper, c.addendValue);
-        }
-      }
-      if (c.minusOneMult) {
-        // `b - m * i` (b eliminated), multiply lower and step by `-1` (`m`
-        // follows).
-        auto negOne = createConstantOp(-1);
-        lower = rewriter.create<arith::MulIOp>(loc, lower, negOne);
-        step = rewriter.create<arith::MulIOp>(loc, step, negOne);
-      }
-      if (c.scaleValue) {
-        if (c.reciprocalScale) {
-          // `1/m * i + b` (b eliminated), multiply upper by `m`.
-          upper = rewriter.create<arith::MulIOp>(loc, upper, c.scaleValue);
-        } else {
-          // `m * i + b` (b eliminated), multiple lower and step by `m`.
-          lower = rewriter.create<arith::MulIOp>(loc, lower, c.scaleValue);
-          step = rewriter.create<arith::MulIOp>(loc, step, c.scaleValue);
-        }
-      }
-    }
-    if (!c.isClosedIntervalForm()) {
-      // Note: treating the step as a signed value to process countdown loops as
-      // well as countup loops.
-      Value negStepCond = rewriter.create<arith::CmpIOp>(
-          loc, arith::CmpIPredicate::slt, step, zero);
-      auto negOne = createConstantOp(-1);
-      Value adj =
-          rewriter.create<arith::SelectOp>(loc, ty, negStepCond, negOne, one);
-      upper = rewriter.create<arith::AddIOp>(loc, upper, adj);
-    }
-    Value diff = rewriter.create<arith::SubIOp>(loc, upper, lower);
-    Value disp = rewriter.create<arith::AddIOp>(loc, diff, step);
-    auto cmpOp = cast<arith::CmpIOp>(c.compareOp);
-    Value up1 = rewriter.create<arith::DivSIOp>(loc, disp, step);
-    Value noLoopCond = rewriter.create<arith::CmpIOp>(
-        loc, arith::CmpIPredicate::sgt, up1, zero);
-    Value newUpper =
-        rewriter.create<arith::SelectOp>(loc, ty, noLoopCond, up1, zero);
-
-    // 3) Rewrite the comparison (!=) and step operations (+1).
-    Value v1 = c.getCompareInduction();
-    rewriter.setInsertionPoint(cmpOp);
-    Value newCmp = rewriter.create<arith::CmpIOp>(
-        cmpOp.getLoc(), arith::CmpIPredicate::ne, v1, newUpper);
-    cmpOp->replaceAllUsesWith(ValueRange{newCmp});
-    auto v2 = c.stepOp->getOperand(
-        c.stepIsAnAddOp() && c.shouldCommuteStepOp() ? 1 : 0);
-    rewriter.setInsertionPoint(c.stepOp);
-    auto newStep = rewriter.create<arith::AddIOp>(c.stepOp->getLoc(), v2, one);
-    c.stepOp->replaceAllUsesWith(ValueRange{newStep.getResult()});
-
-    // 4) Compute original induction value as a loop variant and replace the
-    // uses. `lower + step * i`. Careful to not replace the new induction.
-    if (!loop.getBodyRegion().empty()) {
-      Block *entry = &loop.getBodyRegion().front();
-      rewriter.setInsertionPointToStart(entry);
-      Value induct = entry->getArgument(c.induction);
-      auto mul = rewriter.create<arith::MulIOp>(loc, induct, c.stepValue);
-      Value newInd;
-      if (c.stepIsAnAddOp())
-        newInd = rewriter.create<arith::AddIOp>(loc, c.initialValue, mul);
-      else
-        newInd = rewriter.create<arith::SubIOp>(loc, c.initialValue, mul);
-      induct.replaceUsesWithIf(newInd, [&](OpOperand &opnd) {
-        auto *op = opnd.getOwner();
-        return op != newStep.getOperation() && op != mul &&
-               !isa<cudaq::cc::ContinueOp>(op);
-      });
-    }
-    loop->setAttr(cudaq::opt::NormalizedLoopAttr, rewriter.getUnitAttr());
-
-    rewriter.finalizeRootUpdate(loop);
-    LLVM_DEBUG(llvm::dbgs() << "loop after normalization: " << loop << '\n');
-    return success();
-  }
-
-  bool allowClosedInterval;
-  bool allowEarlyExit;
-};
-
 class LoopNormalizePass
     : public cudaq::opt::impl::LoopNormalizeBase<LoopNormalizePass> {
 public:
diff --git a/lib/Optimizer/Transforms/LoopNormalizePatterns.inc b/lib/Optimizer/Transforms/LoopNormalizePatterns.inc
new file mode 100644
index 0000000000..a147c78bd3
--- /dev/null
+++ b/lib/Optimizer/Transforms/LoopNormalizePatterns.inc
@@ -0,0 +1,169 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 *
+ * All rights reserved.                                                       *
+ *                                                                            *
+ * This source code and the accompanying materials are made available under  *
+ * the terms of the Apache License 2.0 which accompanies this distribution.  *
+ ******************************************************************************/
+
+// These loop normalization patterns are used by the cc-loop-normalize pass
+// and cc-loop-unroll pass.
+
+// This file must be included after a `using namespace mlir;` as it uses bare
+// identifiers from that namespace.
+
+// Return true if \p loop is not monotonic or it is an invariant loop.
+// Normalization is to be done on any loop that is monotonic and not invariant
+// (which includes loops that are already in counted form).
+static bool isNotMonotonicOrInvariant(cudaq::cc::LoopOp loop,
+                                      bool allowClosedInterval,
+                                      bool allowEarlyExit) {
+  cudaq::opt::LoopComponents c;
+  return !cudaq::opt::isaMonotonicLoop(loop, allowEarlyExit, &c) ||
+         (cudaq::opt::isaInvariantLoop(c, allowClosedInterval) &&
+          !c.isLinearExpr());
+}
+
+namespace {
+class LoopPat : public OpRewritePattern<cudaq::cc::LoopOp> {
+public:
+  explicit LoopPat(MLIRContext *ctx, bool aci, bool ab)
+      : OpRewritePattern(ctx), allowClosedInterval(aci), allowEarlyExit(ab) {}
+
+  LogicalResult matchAndRewrite(cudaq::cc::LoopOp loop,
+                                PatternRewriter &rewriter) const override {
+    if (loop->hasAttr(cudaq::opt::NormalizedLoopAttr) ||
+        loop->hasAttr(cudaq::opt::DeadLoopAttr))
+      return failure();
+    if (isNotMonotonicOrInvariant(loop, allowClosedInterval, allowEarlyExit))
+      return failure();
+
+    // loop is monotonic but not invariant.
+    LLVM_DEBUG(llvm::dbgs() << "loop before normalization: " << loop << '\n');
+    auto componentsOpt = cudaq::opt::getLoopComponents(loop);
+    assert(componentsOpt && "loop must have components");
+    auto c = *componentsOpt;
+    if (c.hasAlwaysTrueCondition()) {
+      loop->emitWarning("Loop condition is always true. This loop is not "
+                        "supported in a kernel.");
+      return failure();
+    }
+
+    if (c.hasAlwaysFalseCondition()) {
+      rewriter.startRootUpdate(loop);
+      rewriter.replaceOpWithNewOp<arith::ConstantIntOp>(c.compareOp, 0, 1);
+      loop->setAttr(cudaq::opt::DeadLoopAttr, rewriter.getUnitAttr());
+      rewriter.finalizeRootUpdate(loop);
+      return success();
+    }
+    auto loc = loop.getLoc();
+
+    // 1) Set initial value to 0.
+    auto ty = c.initialValue.getType();
+    rewriter.startRootUpdate(loop);
+    auto createConstantOp = [&](std::int64_t val) -> Value {
+      return rewriter.create<arith::ConstantIntOp>(loc, val, ty);
+    };
+    auto zero = createConstantOp(0);
+    loop->setOperand(c.induction, zero);
+
+    // 2) Compute the number of iterations as an invariant. `iterations = max(0,
+    // (upper - lower + step) / step)`.
+    Value upper = c.compareValue;
+    auto one = createConstantOp(1);
+    Value step = c.stepValue;
+    Value lower = c.initialValue;
+    if (!c.stepIsAnAddOp())
+      step = rewriter.create<arith::SubIOp>(loc, zero, step);
+    if (c.isLinearExpr()) {
+      // Induction is part of a linear expression. Deal with the terms of the
+      // equation. `m` scales the step. `b` is an addend to the lower bound.
+      if (c.addendValue) {
+        if (c.negatedAddend) {
+          // `m * i - b`, u += `b`.
+          upper = rewriter.create<arith::AddIOp>(loc, upper, c.addendValue);
+        } else {
+          // `m * i + b`, u -= `b`.
+          upper = rewriter.create<arith::SubIOp>(loc, upper, c.addendValue);
+        }
+      }
+      if (c.minusOneMult) {
+        // `b - m * i` (b eliminated), multiply lower and step by `-1` (`m`
+        // follows).
+        auto negOne = createConstantOp(-1);
+        lower = rewriter.create<arith::MulIOp>(loc, lower, negOne);
+        step = rewriter.create<arith::MulIOp>(loc, step, negOne);
+      }
+      if (c.scaleValue) {
+        if (c.reciprocalScale) {
+          // `1/m * i + b` (b eliminated), multiply upper by `m`.
+          upper = rewriter.create<arith::MulIOp>(loc, upper, c.scaleValue);
+        } else {
+          // `m * i + b` (b eliminated), multiply lower and step by `m`.
+          lower = rewriter.create<arith::MulIOp>(loc, lower, c.scaleValue);
+          step = rewriter.create<arith::MulIOp>(loc, step, c.scaleValue);
+        }
+      }
+    }
+    if (!c.isClosedIntervalForm()) {
+      // Note: treating the step as a signed value to process countdown loops as
+      // well as countup loops.
+      Value negStepCond = rewriter.create<arith::CmpIOp>(
+          loc, arith::CmpIPredicate::slt, step, zero);
+      auto negOne = createConstantOp(-1);
+      Value adj =
+          rewriter.create<arith::SelectOp>(loc, ty, negStepCond, negOne, one);
+      upper = rewriter.create<arith::AddIOp>(loc, upper, adj);
+    }
+    Value diff = rewriter.create<arith::SubIOp>(loc, upper, lower);
+    Value disp = rewriter.create<arith::AddIOp>(loc, diff, step);
+    auto cmpOp = cast<arith::CmpIOp>(c.compareOp);
+    Value newUpper = rewriter.create<arith::DivSIOp>(loc, disp, step);
+    if (cudaq::opt::isSignedPredicate(cmpOp.getPredicate())) {
+      Value noLoopCond = rewriter.create<arith::CmpIOp>(
+          loc, arith::CmpIPredicate::sgt, newUpper, zero);
+      newUpper =
+          rewriter.create<arith::SelectOp>(loc, ty, noLoopCond, newUpper, zero);
+    }
+
+    // 3) Rewrite the comparison (!=) and step operations (+1).
+    Value v1 = c.getCompareInduction();
+    rewriter.setInsertionPoint(cmpOp);
+    Value newCmp = rewriter.create<arith::CmpIOp>(
+        cmpOp.getLoc(), arith::CmpIPredicate::ne, v1, newUpper);
+    cmpOp->replaceAllUsesWith(ValueRange{newCmp});
+    auto v2 = c.stepOp->getOperand(
+        c.stepIsAnAddOp() && c.shouldCommuteStepOp() ? 1 : 0);
+    rewriter.setInsertionPoint(c.stepOp);
+    auto newStep = rewriter.create<arith::AddIOp>(c.stepOp->getLoc(), v2, one);
+    c.stepOp->replaceAllUsesWith(ValueRange{newStep.getResult()});
+
+    // 4) Compute original induction value as a loop variant and replace the
+    // uses. `lower + step * i`. Careful not to replace the new induction.
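+    // For example, `for (i = 1; i < 8; i += 2)` is normalized to
+    // `for (j = 0; j != 4; ++j)`, and uses of `i` in the body are rebuilt
+    // here as `i = 1 + 2 * j`.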
+    if (!loop.getBodyRegion().empty()) {
+      Block *entry = &loop.getBodyRegion().front();
+      rewriter.setInsertionPointToStart(entry);
+      Value induct = entry->getArgument(c.induction);
+      auto mul = rewriter.create<arith::MulIOp>(loc, induct, c.stepValue);
+      Value newInd;
+      if (c.stepIsAnAddOp())
+        newInd = rewriter.create<arith::AddIOp>(loc, c.initialValue, mul);
+      else
+        newInd = rewriter.create<arith::SubIOp>(loc, c.initialValue, mul);
+      induct.replaceUsesWithIf(newInd, [&](OpOperand &opnd) {
+        auto *op = opnd.getOwner();
+        return op != newStep.getOperation() && op != mul &&
+               !isa<cudaq::cc::ContinueOp>(op);
+      });
+    }
+    loop->setAttr(cudaq::opt::NormalizedLoopAttr, rewriter.getUnitAttr());
+
+    rewriter.finalizeRootUpdate(loop);
+    LLVM_DEBUG(llvm::dbgs() << "loop after normalization: " << loop << '\n');
+    return success();
+  }
+
+  bool allowClosedInterval;
+  bool allowEarlyExit;
+};
+} // namespace
diff --git a/lib/Optimizer/Transforms/LoopUnroll.cpp b/lib/Optimizer/Transforms/LoopUnroll.cpp
index b7f0053f26..b9ba7f137c 100644
--- a/lib/Optimizer/Transforms/LoopUnroll.cpp
+++ b/lib/Optimizer/Transforms/LoopUnroll.cpp
@@ -9,13 +9,14 @@
 #include "LoopAnalysis.h"
 #include "PassDetails.h"
 #include "cudaq/Optimizer/Transforms/Passes.h"
+#include "mlir/IR/Dominance.h"
 #include "mlir/IR/PatternMatch.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "mlir/Transforms/Passes.h"
+#include "mlir/Transforms/RegionUtils.h"
 
 namespace cudaq::opt {
 #define GEN_PASS_DEF_LOOPUNROLL
-#define GEN_PASS_DEF_UPDATEREGISTERNAMES
 #include "cudaq/Optimizer/Transforms/Passes.h.inc"
 } // namespace cudaq::opt
 
@@ -23,202 +24,9 @@ namespace cudaq::opt {
 
 using namespace mlir;
 
-inline std::pair<Block *, Block *> findCloneRange(Block *first, Block *last) {
-  return {first->getNextNode(), last->getPrevNode()};
-}
-
-static std::size_t
-unrollLoopByValue(cudaq::cc::LoopOp loop,
-                  const cudaq::opt::LoopComponents &components) {
-  auto c = components.compareValue.getDefiningOp<arith::ConstantOp>();
-  return cast<IntegerAttr>(c.getValue()).getInt();
-}
-
-static std::size_t unrollLoopByValue(cudaq::cc::LoopOp loop) {
-  auto components = cudaq::opt::getLoopComponents(loop);
-  return unrollLoopByValue(loop, *components);
-}
-
-static bool exceedsThresholdValue(cudaq::cc::LoopOp loop,
-                                  std::size_t threshold) {
-  auto upperBound = unrollLoopByValue(loop);
-  return upperBound >= threshold;
-}
+#include "LoopUnrollPatterns.inc"
 
 namespace {
-
-/// We fully unroll a counted loop (so marked with the counted attribute) as
-/// long as the number of iterations is constant and that constant is less than
-/// the threshold value.
-///
-/// Assumptions are made that the counted loop has a particular structural
-/// layout as is consistent with the factory producing the counted loop.
-///
-/// After this pass, all loops marked counted will be unrolled or marked
-/// invariant. An invariant loop means the loop must execute exactly some
-/// specific number of times, even if that number is only known at runtime.
-struct UnrollCountedLoop : public OpRewritePattern<cudaq::cc::LoopOp> {
-  explicit UnrollCountedLoop(MLIRContext *ctx, std::size_t t, bool sf, bool ab,
-                             unsigned &p)
-      : OpRewritePattern(ctx), threshold(t), signalFailure(sf), allowBreak(ab),
-        progress(p) {}
-
-  LogicalResult matchAndRewrite(cudaq::cc::LoopOp loop,
-                                PatternRewriter &rewriter) const override {
-    // When the signalFailure flag is set, all loops are matched since that flag
-    // requires that all LoopOp operations be rewritten. Despite the setting of
-    // this flag, it may not be possible to fully unroll every LoopOp anyway.
-    // Check for cases that are clearly not going to be unrolled.
-    if (!allowBreak && !cudaq::opt::isaCountedLoop(loop)) {
-      if (signalFailure)
-        loop.emitOpError("not a simple counted loop");
-      return failure();
-    }
-    if (allowBreak && !cudaq::opt::isaConstantUpperBoundLoop(loop)) {
-      if (signalFailure)
-        loop.emitOpError("not a constant upper bound loop");
-      return failure();
-    }
-    if (exceedsThresholdValue(loop, threshold)) {
-      if (signalFailure)
-        loop.emitOpError("loop bounds exceed iteration threshold");
-      return failure();
-    }
-
-    // At this point, we're ready to unroll the loop and replace it with a
-    // sequence of blocks. Each block will receive a block argument that is the
-    // iteration number. The original cc.loop will be replaced by a constant,
-    // the total number of iterations.
-    // TODO: Allow the threading of other block arguments to the result.
-    auto components = cudaq::opt::getLoopComponents(loop);
-    assert(components && "counted loop must have components");
-    auto unrollBy = unrollLoopByValue(loop, *components);
-    if (components->isClosedIntervalForm())
-      ++unrollBy;
-    Type inductionTy = loop.getOperands()[components->induction].getType();
-    LLVM_DEBUG(llvm::dbgs()
-               << "unrolling loop by " << unrollBy << " iterations\n");
-    auto loc = loop.getLoc();
-    // Split the basic block in which this cc.loop appears.
-    auto *insBlock = rewriter.getInsertionBlock();
-    auto insPos = rewriter.getInsertionPoint();
-    auto *endBlock = rewriter.splitBlock(insBlock, insPos);
-    auto argTys = loop.getResultTypes();
-    SmallVector<Location> argLocs(argTys.size(), loop.getLoc());
-    endBlock->addArguments(argTys, argLocs);
-    rewriter.setInsertionPointToEnd(insBlock);
-    Value iterCount = getIntegerConstant(loc, inductionTy, 0, rewriter);
-    SmallVector<Location> locsRange(loop.getNumResults(), loc);
-    auto &bodyRegion = loop.getBodyRegion();
-    SmallVector<Value> iterationOpers = loop.getOperands();
-    auto setIterationOpers = [&](auto from) {
-      assert(iterationOpers.size() == from.size());
-      for (auto i : llvm::enumerate(from))
-        iterationOpers[i.index()] = i.value();
-    };
-
-    // Make a constant number of copies of the body.
-    Block *contBlock = nullptr;
-    Value nextIterCount;
-    for (std::size_t i = 0u; i < unrollBy; ++i) {
-      // 1. Clone the while region.
-      rewriter.cloneRegionBefore(loop.getWhileRegion(), endBlock);
-      Block *whileBlock = insBlock->getNextNode();
-      // 2. Clone the body region.
-      rewriter.cloneRegionBefore(bodyRegion, endBlock);
-      // Replace the ConditionOp in the while region clone with a direct branch.
-      // This makes the comparison there dead. DCE will delete any unneeded code
-      // associated with it.
-      auto cond = cast<cudaq::cc::ConditionOp>(whileBlock->getTerminator());
-      rewriter.setInsertionPoint(cond);
-      rewriter.replaceOpWithNewOp<cf::BranchOp>(cond, whileBlock->getNextNode(),
-                                                cond.getResults());
-      auto cloneRange = findCloneRange(insBlock, endBlock);
-      // 3. If the loop has a step region, clone it as well. Otherwise create an
-      // empty block to target as the next "continue" block.
-      if (loop.hasStep()) {
-        contBlock = endBlock->getPrevNode();
-        rewriter.cloneRegionBefore(loop.getStepRegion(), endBlock);
-        contBlock = contBlock->getNextNode();
-      } else {
-        contBlock = rewriter.createBlock(endBlock, argTys, argLocs);
-      }
-      // Replace any continue and (possibly) break ops in the body region. They
-      // are repalced with branches to the continue block or exit block, resp.
-      for (Block *b = cloneRange.first; b != contBlock; b = b->getNextNode()) {
-        auto *term = b->getTerminator();
-        if (auto cont = dyn_cast<cudaq::cc::ContinueOp>(term)) {
-          auto termOpers = cont.getOperands();
-          rewriter.setInsertionPoint(cont);
-          rewriter.replaceOpWithNewOp<cf::BranchOp>(cont, contBlock, termOpers);
-        }
-        if (allowBreak) {
-          if (auto brk = dyn_cast<cudaq::cc::BreakOp>(term)) {
-            auto termOpers = brk.getOperands();
-            rewriter.setInsertionPoint(brk);
-            rewriter.replaceOpWithNewOp<cf::BranchOp>(brk, endBlock, termOpers);
-          }
-        }
-      }
-      // If there was a step region, its entry block is the continue block.
-      // However, it may have multiple exit blocks. Thread each of these to a
-      // merge block. The continue block is updated to this new empty merge
-      // block.
-      if (loop.hasStep()) {
-        Block *mergeBlock = rewriter.createBlock(endBlock, argTys, argLocs);
-        for (Block *b = contBlock; b != mergeBlock; b = b->getNextNode())
-          if (auto cont = dyn_cast<cudaq::cc::ContinueOp>(b->getTerminator())) {
-            auto termOpers = cont.getOperands();
-            rewriter.setInsertionPoint(cont);
-            rewriter.replaceOpWithNewOp<cf::BranchOp>(cont, mergeBlock,
-                                                      termOpers);
-          }
-        contBlock = mergeBlock;
-      }
-      // At this point, the continue block is a new, empty block. Generate the
-      // next iteration number in this continue block.
-      rewriter.setInsertionPointToEnd(contBlock);
-      nextIterCount = getIntegerConstant(loc, inductionTy, i + 1, rewriter);
-      rewriter.setInsertionPointToEnd(insBlock);
-      // Propagate the previous iteration number into the new block. This makes
-      // any unneeded computation dead. DCE will clean that up as well.
-      iterationOpers[components->induction] = iterCount;
-      rewriter.create<cf::BranchOp>(loc, cloneRange.first, iterationOpers);
-      // Bookkeeping for the next iteration, which uses the new continue block,
-      // `conBlock`, and its arguments.
-      setIterationOpers(contBlock->getArguments());
-      iterCount = nextIterCount;
-      insBlock = contBlock;
-    }
-
-    // Finish up the last block.
-    rewriter.setInsertionPointToEnd(insBlock);
-    if (contBlock) {
-      iterationOpers[components->induction] = nextIterCount;
-      setIterationOpers(contBlock->getArguments());
-    }
-    [[maybe_unused]] auto lastBranch =
-        rewriter.create<cf::BranchOp>(loc, endBlock, iterationOpers);
-    rewriter.replaceOp(loop, endBlock->getArguments());
-
-    LLVM_DEBUG(llvm::dbgs() << "after unrolling a loop:\n";
-               lastBranch->getParentOfType<func::FuncOp>().dump());
-    progress++;
-    return success();
-  }
-
-  static Value getIntegerConstant(Location loc, Type ty, std::int64_t val,
-                                  PatternRewriter &rewriter) {
-    auto attr = rewriter.getIntegerAttr(ty, val);
-    return rewriter.create<arith::ConstantOp>(loc, ty, attr);
-  }
-
-  std::size_t threshold;
-  bool signalFailure;
-  bool allowBreak;
-  unsigned &progress;
-};
-
 /// The loop unrolling pass will fully unroll a `cc::LoopOp` when the loop is
 /// known to always execute a constant number of iterations. That is, the loop
 /// is a counted loop. (A threshold value can be used to bound the legal range
@@ -262,53 +70,15 @@ class LoopUnrollPass : public cudaq::opt::impl::LoopUnrollBase<LoopUnrollPass> {
 
   static unsigned countLoopOps(Operation *op) {
     unsigned result = 0;
-    op->walk([&](cudaq::cc::LoopOp loop) { result++; });
+    op->walk([&](cudaq::cc::LoopOp loop) {
+      if (!loop->hasAttr(cudaq::opt::DeadLoopAttr))
+        result++;
+    });
     LLVM_DEBUG(llvm::dbgs() << "Total number of loops: " << result << '\n');
     return result;
  }
 };
 
-/// After unrolling the loops, there may be duplicate registerName attributes in
-/// use. This pass will assign them unique names by appending a counter.
-class UpdateRegisterNamesPass
-    : public cudaq::opt::impl::UpdateRegisterNamesBase<
-          UpdateRegisterNamesPass> {
-public:
-  using UpdateRegisterNamesBase::UpdateRegisterNamesBase;
-
-  void runOnOperation() override {
-    auto *mod = getOperation();
-
-    // First save the op's that contain a registerName attribute
-    DenseMap<StringRef, SmallVector<Operation *>> regOps;
-    mod->walk([&](mlir::Operation *walkOp) {
-      if (auto prevAttr = walkOp->getAttr("registerName")) {
-        auto registerName = prevAttr.cast<StringAttr>().getValue();
-        regOps[registerName].push_back(walkOp);
-      }
-      return WalkResult::advance();
-    });
-
-    // Now apply new labels, appending a counter if necessary
-    for (auto &[registerName, opVec] : regOps) {
-      if (opVec.size() == 1)
-        continue; // don't rename individual qubit measurements
-      auto strLen = std::to_string(opVec.size()).size();
-      int bit = 0;
-      for (auto &regOp : opVec)
-        if (auto prevAttr = regOp->getAttr("registerName")) {
-          auto suffix = std::to_string(bit++);
-          if (suffix.size() < strLen)
-            suffix = std::string(strLen - suffix.size(), '0') + suffix;
-          // Note Quantinuum can't support a ":" delimiter, so use '%'
-          auto newAttr = OpBuilder(&getContext())
-                             .getStringAttr(registerName + "%" + suffix);
-          regOp->setAttr("registerName", newAttr);
-        }
-    }
-  }
-};
-
 /// Unrolling pass pipeline command-line options. These options are similar to
 /// the LoopUnroll pass options, but have different default settings.
 struct UnrollPipelineOptions
diff --git a/lib/Optimizer/Transforms/LoopUnrollPatterns.inc b/lib/Optimizer/Transforms/LoopUnrollPatterns.inc
new file mode 100644
index 0000000000..0db404d050
--- /dev/null
+++ b/lib/Optimizer/Transforms/LoopUnrollPatterns.inc
@@ -0,0 +1,217 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 *
+ * All rights reserved.                                                       *
+ *                                                                            *
+ * This source code and the accompanying materials are made available under  *
+ * the terms of the Apache License 2.0 which accompanies this distribution.  *
+ ******************************************************************************/
+
+// These patterns are used by the classical-optimization and cc-loop-unroll
+// passes.
+
+// This file must be included after a `using namespace mlir;` as it uses bare
+// identifiers from that namespace.
+
+inline std::pair<Block *, Block *> findCloneRange(Block *first, Block *last) {
+  return {first->getNextNode(), last->getPrevNode()};
+}
+
+static std::size_t
+unrollLoopByValue(cudaq::cc::LoopOp loop,
+                  const cudaq::opt::LoopComponents &components) {
+  auto c = components.compareValue.getDefiningOp<arith::ConstantOp>();
+  if (loop->hasAttr(cudaq::opt::NormalizedLoopAttr))
+    return cast<IntegerAttr>(c.getValue()).getInt();
+  if (components.hasAlwaysFalseCondition())
+    return 0;
+  auto resultOpt = components.getIterationsConstant();
+  assert(resultOpt.has_value() && "must be counted loop");
+  return *resultOpt;
+}
+
+static bool exceedsThresholdValue(cudaq::cc::LoopOp loop,
+                                  std::size_t threshold) {
+  auto components = cudaq::opt::getLoopComponents(loop);
+  if (components->hasAlwaysTrueCondition()) {
+    loop->emitWarning("Loop condition is always true. This loop is not "
+                      "supported in a kernel.");
+    return true;
+  }
+  auto upperBound = unrollLoopByValue(loop, *components);
+  return upperBound >= threshold;
+}
+
+namespace {
+
+/// We fully unroll a counted loop (so marked with the counted attribute) as
+/// long as the number of iterations is constant and that constant is less than
+/// the threshold value.
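+///
+/// For example, a `cc.loop` known to execute 5 times is replaced by 5 clones
+/// of its body, threaded together with `cf.br` branches.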
+///
+/// Assumptions are made that the counted loop has a particular structural
+/// layout as is consistent with the factory producing the counted loop.
+///
+/// After this pass, all loops marked counted will be unrolled or marked
+/// invariant. An invariant loop means the loop must execute exactly some
+/// specific number of times, even if that number is only known at runtime.
+struct UnrollCountedLoop : public OpRewritePattern<cudaq::cc::LoopOp> {
+  explicit UnrollCountedLoop(MLIRContext *ctx, std::size_t t, bool sf, bool ab,
+                             unsigned &p)
+      : OpRewritePattern(ctx), threshold(t), signalFailure(sf), allowBreak(ab),
+        progress(p) {}
+
+  LogicalResult matchAndRewrite(cudaq::cc::LoopOp loop,
+                                PatternRewriter &rewriter) const override {
+    // When the signalFailure flag is set, all loops are matched since that flag
+    // requires that all LoopOp operations be rewritten. Despite the setting of
+    // this flag, it may not be possible to fully unroll every LoopOp anyway.
+    // Check for cases that are clearly not going to be unrolled.
+    if (loop->hasAttr(cudaq::opt::DeadLoopAttr))
+      return failure();
+    if (!allowBreak && !cudaq::opt::isaCountedLoop(loop)) {
+      if (signalFailure)
+        loop.emitOpError("not a simple counted loop");
+      return failure();
+    }
+    if (allowBreak && !cudaq::opt::isaConstantUpperBoundLoop(loop)) {
+      if (signalFailure)
+        loop.emitOpError("not a constant upper bound loop");
+      return failure();
+    }
+    if (exceedsThresholdValue(loop, threshold)) {
+      if (signalFailure)
+        loop.emitOpError("loop bounds exceed iteration threshold");
+      return failure();
+    }
+
+    // At this point, we're ready to unroll the loop and replace it with a
+    // sequence of blocks. Each block will receive a block argument that is the
+    // iteration number. The original cc.loop will be replaced by a constant,
+    // the total number of iterations.
+    // TODO: Allow the threading of other block arguments to the result.
+    auto components = cudaq::opt::getLoopComponents(loop);
+    assert(components && "counted loop must have components");
+    auto unrollBy = unrollLoopByValue(loop, *components);
+    Type inductionTy = loop.getOperands()[components->induction].getType();
+    LLVM_DEBUG(llvm::dbgs()
+               << "unrolling loop by " << unrollBy << " iterations\n");
+    auto loc = loop.getLoc();
+    // Split the basic block in which this cc.loop appears.
+    auto *insBlock = rewriter.getInsertionBlock();
+    auto insPos = rewriter.getInsertionPoint();
+    auto *endBlock = rewriter.splitBlock(insBlock, insPos);
+    auto argTys = loop.getResultTypes();
+    SmallVector<Location> argLocs(argTys.size(), loop.getLoc());
+    endBlock->addArguments(argTys, argLocs);
+    rewriter.setInsertionPointToEnd(insBlock);
+    Value iterCount = getIntegerConstant(loc, inductionTy, 0, rewriter);
+    SmallVector<Location> locsRange(loop.getNumResults(), loc);
+    auto &bodyRegion = loop.getBodyRegion();
+    SmallVector<Value> iterationOpers = loop.getOperands();
+    auto setIterationOpers = [&](auto from) {
+      assert(iterationOpers.size() == from.size());
+      for (auto i : llvm::enumerate(from))
+        iterationOpers[i.index()] = i.value();
+    };
+
+    // Make a constant number of copies of the body.
+    Block *contBlock = nullptr;
+    Value nextIterCount;
+    for (std::size_t i = 0u; i < unrollBy; ++i) {
+      // 1. Clone the while region.
+      rewriter.cloneRegionBefore(loop.getWhileRegion(), endBlock);
+      Block *whileBlock = insBlock->getNextNode();
+      // 2. Clone the body region.
+      rewriter.cloneRegionBefore(bodyRegion, endBlock);
+      // Replace the ConditionOp in the while region clone with a direct branch.
+      // This makes the comparison there dead. DCE will delete any unneeded code
+      // associated with it.
+      auto cond = cast<cudaq::cc::ConditionOp>(whileBlock->getTerminator());
+      rewriter.setInsertionPoint(cond);
+      rewriter.replaceOpWithNewOp<cf::BranchOp>(cond, whileBlock->getNextNode(),
+                                                cond.getResults());
+      auto cloneRange = findCloneRange(insBlock, endBlock);
+      // 3. If the loop has a step region, clone it as well. Otherwise create an
+      // empty block to target as the next "continue" block.
+      if (loop.hasStep()) {
+        contBlock = endBlock->getPrevNode();
+        rewriter.cloneRegionBefore(loop.getStepRegion(), endBlock);
+        contBlock = contBlock->getNextNode();
+      } else {
+        contBlock = rewriter.createBlock(endBlock, argTys, argLocs);
+      }
+      // Replace any continue and (possibly) break ops in the body region. They
+      // are replaced with branches to the continue block or exit block, resp.
+      for (Block *b = cloneRange.first; b != contBlock; b = b->getNextNode()) {
+        auto *term = b->getTerminator();
+        if (auto cont = dyn_cast<cudaq::cc::ContinueOp>(term)) {
+          auto termOpers = cont.getOperands();
+          rewriter.setInsertionPoint(cont);
+          rewriter.replaceOpWithNewOp<cf::BranchOp>(cont, contBlock, termOpers);
+        }
+        if (allowBreak) {
+          if (auto brk = dyn_cast<cudaq::cc::BreakOp>(term)) {
+            auto termOpers = brk.getOperands();
+            rewriter.setInsertionPoint(brk);
+            rewriter.replaceOpWithNewOp<cf::BranchOp>(brk, endBlock, termOpers);
+          }
+        }
+      }
+      // If there was a step region, its entry block is the continue block.
+      // However, it may have multiple exit blocks. Thread each of these to a
+      // merge block. The continue block is updated to this new empty merge
+      // block.
+      if (loop.hasStep()) {
+        Block *mergeBlock = rewriter.createBlock(endBlock, argTys, argLocs);
+        for (Block *b = contBlock; b != mergeBlock; b = b->getNextNode())
+          if (auto cont = dyn_cast<cudaq::cc::ContinueOp>(b->getTerminator())) {
+            auto termOpers = cont.getOperands();
+            rewriter.setInsertionPoint(cont);
+            rewriter.replaceOpWithNewOp<cf::BranchOp>(cont, mergeBlock,
+                                                      termOpers);
+          }
+        contBlock = mergeBlock;
+      }
+      // At this point, the continue block is a new, empty block. Generate the
+      // next iteration number in this continue block.
+      rewriter.setInsertionPointToEnd(contBlock);
+      nextIterCount = getIntegerConstant(loc, inductionTy, i + 1, rewriter);
+      rewriter.setInsertionPointToEnd(insBlock);
+      // Propagate the previous iteration number into the new block. This makes
+      // any unneeded computation dead. DCE will clean that up as well.
+      iterationOpers[components->induction] = iterCount;
+      rewriter.create<cf::BranchOp>(loc, cloneRange.first, iterationOpers);
+      // Bookkeeping for the next iteration, which uses the new continue block,
+      // `contBlock`, and its arguments.
+      setIterationOpers(contBlock->getArguments());
+      iterCount = nextIterCount;
+      insBlock = contBlock;
+    }
+
+    // Finish up the last block.
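+    // The last continue block has no terminator yet: thread the final
+    // iteration operands into `endBlock`, which takes over the loop's results.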
+    rewriter.setInsertionPointToEnd(insBlock);
+    if (contBlock) {
+      iterationOpers[components->induction] = nextIterCount;
+      setIterationOpers(contBlock->getArguments());
+    }
+    [[maybe_unused]] auto lastBranch =
+        rewriter.create<cf::BranchOp>(loc, endBlock, iterationOpers);
+    rewriter.replaceOp(loop, endBlock->getArguments());
+
+    LLVM_DEBUG(llvm::dbgs() << "after unrolling a loop:\n";
+               lastBranch->getParentOfType<func::FuncOp>().dump());
+    progress++;
+    return success();
+  }
+
+  static Value getIntegerConstant(Location loc, Type ty, std::int64_t val,
+                                  PatternRewriter &rewriter) {
+    auto attr = rewriter.getIntegerAttr(ty, val);
+    return rewriter.create<arith::ConstantOp>(loc, ty, attr);
+  }
+
+  std::size_t threshold;
+  bool signalFailure;
+  bool allowBreak;
+  unsigned &progress;
+};
+} // namespace
diff --git a/lib/Optimizer/Transforms/LowerToCFG.cpp b/lib/Optimizer/Transforms/LowerToCFG.cpp
index fb050fabff..6431153542 100644
--- a/lib/Optimizer/Transforms/LowerToCFG.cpp
+++ b/lib/Optimizer/Transforms/LowerToCFG.cpp
@@ -20,6 +20,8 @@
 
 using namespace mlir;
 
+#include "LowerToCFGPatterns.inc"
+
 namespace {
 class RewriteScope : public OpRewritePattern<cudaq::cc::ScopeOp> {
 public:
@@ -278,80 +280,6 @@ class RewriteLoop : public OpRewritePattern<cudaq::cc::LoopOp> {
   }
 };
 
-class RewriteIf : public OpRewritePattern<cudaq::cc::IfOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  /// Rewrites an if construct like
-  /// ```mlir
-  ///   (0)
-  ///   quake.if %cond {
-  ///     (1)
-  ///   } else {
-  ///     (2)
-  ///   }
-  ///   (3)
-  /// ```
-  /// to a CFG like
-  /// ```mlir
-  ///   (0)
-  ///   cf.cond_br %cond, ^bb1, ^bb2
-  /// ^bb1:
-  ///   (1)
-  ///   cf.br ^bb3
-  /// ^bb2:
-  ///   (2)
-  ///   cf.br ^bb3
-  /// ^bb3:
-  ///   (3)
-  /// ```
-  LogicalResult matchAndRewrite(cudaq::cc::IfOp ifOp,
-                                PatternRewriter &rewriter) const override {
-    auto loc = ifOp.getLoc();
-    auto *initBlock = rewriter.getInsertionBlock();
-    auto initPos = rewriter.getInsertionPoint();
-    auto *endBlock = rewriter.splitBlock(initBlock, initPos);
-    if (ifOp.getNumResults() != 0) {
-      Block *continueBlock = rewriter.createBlock(
-          endBlock, ifOp.getResultTypes(),
-          SmallVector<Location>(ifOp.getNumResults(), loc));
-      rewriter.create<cf::BranchOp>(loc, endBlock);
-      endBlock = continueBlock;
-    }
-    auto *thenBlock = &ifOp.getThenRegion().front();
-    bool hasElse = !ifOp.getElseRegion().empty();
-    auto *elseBlock = hasElse ? &ifOp.getElseRegion().front() : endBlock;
-    updateBodyBranches(&ifOp.getThenRegion(), rewriter, endBlock);
-    updateBodyBranches(&ifOp.getElseRegion(), rewriter, endBlock);
-    rewriter.inlineRegionBefore(ifOp.getThenRegion(), endBlock);
-    if (hasElse)
-      rewriter.inlineRegionBefore(ifOp.getElseRegion(), endBlock);
-    rewriter.setInsertionPointToEnd(initBlock);
-    rewriter.create<cf::CondBranchOp>(loc, ifOp.getCondition(), thenBlock,
-                                      ifOp.getLinearArgs(), elseBlock,
-                                      ifOp.getLinearArgs());
-    rewriter.replaceOp(ifOp, endBlock->getArguments());
-    return success();
-  }
-
-  // Replace all the ContinueOp in the body region with branches to the correct
-  // basic blocks.
-  void updateBodyBranches(Region *bodyRegion, PatternRewriter &rewriter,
-                          Block *continueBlock) const {
-    // Walk body region and replace all continue and break ops.
-    for (Block &block : *bodyRegion) {
-      auto *terminator = block.getTerminator();
-      if (auto cont = dyn_cast<cudaq::cc::ContinueOp>(terminator)) {
-        rewriter.setInsertionPointToEnd(&block);
-        LLVM_DEBUG(llvm::dbgs() << "replacing " << *terminator << '\n');
-        rewriter.replaceOpWithNewOp<cf::BranchOp>(cont, continueBlock,
-                                                  cont.getOperands());
-      }
-      // Other ad-hoc control flow in the region need not be rewritten.
-    }
-  }
-};
-
 class RewriteReturn : public OpRewritePattern {
 public:
   using OpRewritePattern::OpRewritePattern;
diff --git a/lib/Optimizer/Transforms/LowerToCFGPatterns.inc b/lib/Optimizer/Transforms/LowerToCFGPatterns.inc
new file mode 100644
index 0000000000..a449d6df69
--- /dev/null
+++ b/lib/Optimizer/Transforms/LowerToCFGPatterns.inc
@@ -0,0 +1,104 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 *
+ * All rights reserved.                                                       *
+ *                                                                            *
+ * This source code and the accompanying materials are made available under  *
+ * the terms of the Apache License 2.0 which accompanies this distribution.  *
+ ******************************************************************************/
+
+// These patterns are used by the lower-to-cfg pass and cc-loop-unroll pass.
+
+// This file must be included after a `using namespace mlir;` as it uses bare
+// identifiers from that namespace.
+
+namespace {
+class RewriteIf : public OpRewritePattern<cudaq::cc::IfOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  explicit RewriteIf(MLIRContext *ctx)
+      : OpRewritePattern(ctx), rewriteOnlyIfConst(false) {}
+
+  RewriteIf(MLIRContext *ctx, bool rewriteOnlyIfConst)
+      : OpRewritePattern(ctx), rewriteOnlyIfConst(rewriteOnlyIfConst) {}
+
+  /// Rewrites an if construct like
+  /// ```mlir
+  ///   (0)
+  ///   quake.if %cond {
+  ///     (1)
+  ///   } else {
+  ///     (2)
+  ///   }
+  ///   (3)
+  /// ```
+  /// to a CFG like
+  /// ```mlir
+  ///   (0)
+  ///   cf.cond_br %cond, ^bb1, ^bb2
+  /// ^bb1:
+  ///   (1)
+  ///   cf.br ^bb3
+  /// ^bb2:
+  ///   (2)
+  ///   cf.br ^bb3
+  /// ^bb3:
+  ///   (3)
+  /// ```
+  LogicalResult matchAndRewrite(cudaq::cc::IfOp ifOp,
+                                PatternRewriter &rewriter) const override {
+    // Bail out on non-constant conditions if we just need to
+    // const-prop if($const).
+    if (rewriteOnlyIfConst) {
+      auto cond = ifOp.getCondition();
+      if (!isa_and_present<arith::ConstantOp>(cond.getDefiningOp()))
+        return failure();
+    }
+
+    auto loc = ifOp.getLoc();
+    auto *initBlock = rewriter.getInsertionBlock();
+    auto initPos = rewriter.getInsertionPoint();
+    auto *endBlock = rewriter.splitBlock(initBlock, initPos);
+    if (ifOp.getNumResults() != 0) {
+      Block *continueBlock = rewriter.createBlock(
+          endBlock, ifOp.getResultTypes(),
+          SmallVector<Location>(ifOp.getNumResults(), loc));
+      rewriter.create<cf::BranchOp>(loc, endBlock);
+      endBlock = continueBlock;
+    }
+    auto *thenBlock = &ifOp.getThenRegion().front();
+    bool hasElse = !ifOp.getElseRegion().empty();
+    auto *elseBlock = hasElse ? &ifOp.getElseRegion().front() : endBlock;
+    updateBodyBranches(&ifOp.getThenRegion(), rewriter, endBlock);
+    updateBodyBranches(&ifOp.getElseRegion(), rewriter, endBlock);
+    rewriter.inlineRegionBefore(ifOp.getThenRegion(), endBlock);
+    if (hasElse)
+      rewriter.inlineRegionBefore(ifOp.getElseRegion(), endBlock);
+    rewriter.setInsertionPointToEnd(initBlock);
+    rewriter.create<cf::CondBranchOp>(loc, ifOp.getCondition(), thenBlock,
+                                      ifOp.getLinearArgs(), elseBlock,
+                                      ifOp.getLinearArgs());
+    rewriter.replaceOp(ifOp, endBlock->getArguments());
+    return success();
+  }
+
+  // Replace all the ContinueOp in the body region with branches to the correct
+  // basic blocks.
+  void updateBodyBranches(Region *bodyRegion, PatternRewriter &rewriter,
+                          Block *continueBlock) const {
+    // Walk body region and replace all continue and break ops.
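+    // For example, a `cc.continue` terminator in the then/else region becomes
+    // a `cf.br` to the continue block, carrying the same operands.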
+    for (Block &block : *bodyRegion) {
+      auto *terminator = block.getTerminator();
+      if (auto cont = dyn_cast<cudaq::cc::ContinueOp>(terminator)) {
+        rewriter.setInsertionPointToEnd(&block);
+        LLVM_DEBUG(llvm::dbgs() << "replacing " << *terminator << '\n');
+        rewriter.replaceOpWithNewOp<cf::BranchOp>(cont, continueBlock,
+                                                  cont.getOperands());
+      }
+      // Other ad-hoc control flow in the region need not be rewritten.
+    }
+  }
+
+private:
+  bool rewriteOnlyIfConst = false;
+};
+} // namespace
diff --git a/lib/Optimizer/Transforms/UpdateRegisterNames.cpp b/lib/Optimizer/Transforms/UpdateRegisterNames.cpp
new file mode 100644
index 0000000000..5bf00d241f
--- /dev/null
+++ b/lib/Optimizer/Transforms/UpdateRegisterNames.cpp
@@ -0,0 +1,68 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 *
+ * All rights reserved.                                                       *
+ *                                                                            *
+ * This source code and the accompanying materials are made available under  *
+ * the terms of the Apache License 2.0 which accompanies this distribution.  *
+ ******************************************************************************/
+
+#include "LoopAnalysis.h"
+#include "PassDetails.h"
+#include "cudaq/Optimizer/Transforms/Passes.h"
+#include "mlir/IR/Dominance.h"
+#include "mlir/IR/PatternMatch.h"
+#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
+#include "mlir/Transforms/Passes.h"
+#include "mlir/Transforms/RegionUtils.h"
+
+namespace cudaq::opt {
+#define GEN_PASS_DEF_UPDATEREGISTERNAMES
+#include "cudaq/Optimizer/Transforms/Passes.h.inc"
+} // namespace cudaq::opt
+
+#define DEBUG_TYPE "update-register-names"
+
+using namespace mlir;
+
+namespace {
+/// After unrolling the loops, there may be duplicate registerName attributes in
+/// use. This pass will assign them unique names by appending a counter.
+class UpdateRegisterNamesPass
+    : public cudaq::opt::impl::UpdateRegisterNamesBase<
+          UpdateRegisterNamesPass> {
+public:
+  using UpdateRegisterNamesBase::UpdateRegisterNamesBase;
+
+  void runOnOperation() override {
+    auto *mod = getOperation();
+
+    // First save the op's that contain a registerName attribute
+    DenseMap<StringRef, SmallVector<Operation *>> regOps;
+    mod->walk([&](mlir::Operation *walkOp) {
+      if (auto prevAttr = walkOp->getAttr("registerName")) {
+        auto registerName = prevAttr.cast<StringAttr>().getValue();
+        regOps[registerName].push_back(walkOp);
+      }
+      return WalkResult::advance();
+    });
+
+    // Now apply new labels, appending a counter if necessary
+    for (auto &[registerName, opVec] : regOps) {
+      if (opVec.size() == 1)
+        continue; // don't rename individual qubit measurements
+      auto strLen = std::to_string(opVec.size()).size();
+      int bit = 0;
+      for (auto &regOp : opVec)
+        if (auto prevAttr = regOp->getAttr("registerName")) {
+          auto suffix = std::to_string(bit++);
+          if (suffix.size() < strLen)
+            suffix = std::string(strLen - suffix.size(), '0') + suffix;
+          // Note Quantinuum can't support a ":" delimiter, so use '%'
+          auto newAttr = OpBuilder(&getContext())
+                             .getStringAttr(registerName + "%" + suffix);
+          regOp->setAttr("registerName", newAttr);
+        }
+    }
+  }
+};
+} // namespace
diff --git a/lib/Optimizer/Transforms/WriteAfterWriteElimination.cpp b/lib/Optimizer/Transforms/WriteAfterWriteElimination.cpp
index 6ab5f2272a..9f9b1d571d 100644
--- a/lib/Optimizer/Transforms/WriteAfterWriteElimination.cpp
+++ b/lib/Optimizer/Transforms/WriteAfterWriteElimination.cpp
@@ -27,121 +27,9 @@ namespace cudaq::opt {
 
 using namespace mlir;
 
-namespace {
-/// Remove stores followed by a store to the same pointer
-/// if the pointer is not used in between.
-/// ```
-/// cc.store %c0_i64, %1 : !cc.ptr<i64>
-/// // no use of %1 until next line
-/// cc.store %0, %1 : !cc.ptr<i64>
-/// ───────────────────────────────────────────
-/// cc.store %0, %1 : !cc.ptr<i64>
-/// ```
-class SimplifyWritesAnalysis {
-public:
-  SimplifyWritesAnalysis(DominanceInfo &di, Operation *op) : dom(di) {
-    for (auto &region : op->getRegions())
-      for (auto &b : region)
-        collectBlockInfo(&b);
-  }
-
-  /// Remove stores followed by a store to the same pointer if the pointer is
-  /// not used in between, using collected block info.
-  void removeOverriddenStores() {
-    SmallVector<Operation *> toErase;
-
-    for (const auto &[block, ptrToStores] : blockInfo) {
-      for (const auto &[ptr, stores] : ptrToStores) {
-        if (stores.size() > 1) {
-          auto replacement = stores.back();
-          for (auto *store : stores) {
-            if (isReplacement(ptr, store, replacement)) {
-              LLVM_DEBUG(llvm::dbgs() << "replacing store " << *store
-                                      << " by: " << *replacement << '\n');
-              toErase.push_back(store);
-            }
-          }
-        }
-      }
-    }
-
-    for (auto *op : toErase)
-      op->erase();
-  }
-
-private:
-  /// Detect if value is used in the op or its nested blocks.
-  bool isReplacement(Operation *ptr, Operation *store,
-                     Operation *replacement) const {
-    if (store == replacement)
-      return false;
-
-    // Check that there are no non-store uses dominated by the store and
-    // not dominated by the replacement, i.e. only uses between the two
-    // stores are other stores to the same pointer.
-    for (auto *user : ptr->getUsers()) {
-      if (user != store && user != replacement) {
-        if (!isStoreToPtr(user, ptr) && dom.dominates(store, user) &&
-            !dom.dominates(replacement, user)) {
-          LLVM_DEBUG(llvm::dbgs() << "store " << replacement
-                                  << " is used before: " << store << '\n');
-          return false;
-        }
-      }
-    }
-    return true;
-  }
-
-  /// Detects a store to the pointer.
-  static bool isStoreToPtr(Operation *op, Operation *ptr) {
-    return isa_and_present<cudaq::cc::StoreOp>(op) &&
-           (dyn_cast<cudaq::cc::StoreOp>(op).getPtrvalue().getDefiningOp() ==
-            ptr);
-  }
-
-  /// Collect all stores to a pointer for a block.
-  void collectBlockInfo(Block *block) {
-    for (auto &op : *block) {
-      for (auto &region : op.getRegions())
-        for (auto &b : region)
-          collectBlockInfo(&b);
-
-      if (auto store = dyn_cast<cudaq::cc::StoreOp>(&op)) {
-        auto ptr = store.getPtrvalue().getDefiningOp();
-        if (isStoreToStack(store)) {
-          auto &[b, ptrToStores] = blockInfo.FindAndConstruct(block);
-          auto &[p, stores] = ptrToStores.FindAndConstruct(ptr);
-          stores.push_back(&op);
-        }
-      }
-    }
-  }
-
-  /// Detect stores to stack locations, for example:
-  /// ```
-  /// %1 = cc.alloca !cc.array<i64 x 2>
-  ///
-  /// %2 = cc.cast %1 : (!cc.ptr<!cc.array<i64 x 2>>) -> !cc.ptr<i64>
-  /// cc.store %c0_i64, %2 : !cc.ptr<i64>
-  ///
-  /// %3 = cc.compute_ptr %1[1] : (!cc.ptr<!cc.array<i64 x 2>>) -> !cc.ptr<i64>
-  /// cc.store %c0_i64, %3 : !cc.ptr<i64>
-  /// ```
-  static bool isStoreToStack(cudaq::cc::StoreOp store) {
-    auto ptrOp = store.getPtrvalue();
-    if (auto cast = ptrOp.getDefiningOp<cudaq::cc::CastOp>())
-      ptrOp = cast.getOperand();
-
-    if (auto computePtr = ptrOp.getDefiningOp<cudaq::cc::ComputePtrOp>())
-      ptrOp = computePtr.getBase();
-
-    return isa_and_present<cudaq::cc::AllocaOp>(ptrOp.getDefiningOp());
-  }
-
-  DominanceInfo &dom;
-  DenseMap<Block *, DenseMap<Operation *, SmallVector<Operation *>>> blockInfo;
-};
+#include "WriteAfterWriteEliminationPatterns.inc"
 
+namespace {
 class WriteAfterWriteEliminationPass
     : public cudaq::opt::impl::WriteAfterWriteEliminationBase<
          WriteAfterWriteEliminationPass> {
diff --git a/lib/Optimizer/Transforms/WriteAfterWriteEliminationPatterns.inc b/lib/Optimizer/Transforms/WriteAfterWriteEliminationPatterns.inc
new file mode 100644
index 0000000000..f6fbf30d3f
--- /dev/null
+++ b/lib/Optimizer/Transforms/WriteAfterWriteEliminationPatterns.inc
@@ -0,0 +1,129 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 *
+ * All rights reserved.                                                       *
+ *                                                                            *
+ * This source code and the accompanying materials are made available under  *
+ * the terms of the Apache License 2.0 which accompanies this distribution.  *
+ ******************************************************************************/
+
+// These patterns are used by the write-after-write-elimination and
+// cc-loop-unroll passes.
+
+// This file must be included after a `using namespace mlir;` as it uses bare
+// identifiers from that namespace.
+
+namespace {
+/// Remove stores followed by a store to the same pointer
+/// if the pointer is not used in between.
+/// ```
+/// cc.store %c0_i64, %1 : !cc.ptr<i64>
+/// // no use of %1 until next line
+/// cc.store %0, %1 : !cc.ptr<i64>
+/// ───────────────────────────────────────────
+/// cc.store %0, %1 : !cc.ptr<i64>
+/// ```
+class SimplifyWritesAnalysis {
+public:
+  SimplifyWritesAnalysis(DominanceInfo &di, Operation *op) : dom(di) {
+    for (auto &region : op->getRegions())
+      for (auto &b : region)
+        collectBlockInfo(&b);
+  }
+
+  /// Remove stores followed by a store to the same pointer if the pointer is
+  /// not used in between, using collected block info.
+  void removeOverriddenStores() {
+    SmallVector<Operation *> toErase;
+
+    for (const auto &[block, ptrToStores] : blockInfo) {
+      for (const auto &[ptr, stores] : ptrToStores) {
+        if (stores.size() > 1) {
+          auto replacement = stores.back();
+          for (auto *store : stores) {
+            if (isReplacement(ptr, store, replacement)) {
+              LLVM_DEBUG(llvm::dbgs() << "replacing store " << *store
+                                      << " by: " << *replacement << '\n');
+              toErase.push_back(store);
+            }
+          }
+        }
+      }
+    }
+
+    for (auto *op : toErase)
+      op->erase();
+  }
+
+private:
+  /// Determine whether \p store is overridden by \p replacement: between the
+  /// two stores, \p ptr must have no uses other than stores to \p ptr itself.
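+  /// For example, given `cc.store %c0, %1` followed by `cc.store %0, %1` with
+  /// no load of `%1` in between, the first store is dead and can be erased.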
+  bool isReplacement(Operation *ptr, Operation *store,
+                     Operation *replacement) const {
+    if (store == replacement)
+      return false;
+
+    // Check that there are no non-store uses dominated by the store and
+    // not dominated by the replacement, i.e. only uses between the two
+    // stores are other stores to the same pointer.
+    for (auto *user : ptr->getUsers()) {
+      if (user != store && user != replacement) {
+        if (!isStoreToPtr(user, ptr) && dom.dominates(store, user) &&
+            !dom.dominates(replacement, user)) {
+          LLVM_DEBUG(llvm::dbgs() << "store " << replacement
+                                  << " is used before: " << store << '\n');
+          return false;
+        }
+      }
+    }
+    return true;
+  }
+
+  /// Detects a store to the pointer.
+  static bool isStoreToPtr(Operation *op, Operation *ptr) {
+    return isa_and_present<cudaq::cc::StoreOp>(op) &&
+           (dyn_cast<cudaq::cc::StoreOp>(op).getPtrvalue().getDefiningOp() ==
+            ptr);
+  }
+
+  /// Collect all stores to a pointer for a block.
+  void collectBlockInfo(Block *block) {
+    for (auto &op : *block) {
+      for (auto &region : op.getRegions())
+        for (auto &b : region)
+          collectBlockInfo(&b);
+
+      if (auto store = dyn_cast<cudaq::cc::StoreOp>(&op)) {
+        auto ptr = store.getPtrvalue().getDefiningOp();
+        if (isStoreToStack(store)) {
+          auto &[b, ptrToStores] = blockInfo.FindAndConstruct(block);
+          auto &[p, stores] = ptrToStores.FindAndConstruct(ptr);
+          stores.push_back(&op);
+        }
+      }
+    }
+  }
+
+  /// Detect stores to stack locations, for example:
+  /// ```
+  /// %1 = cc.alloca !cc.array<i64 x 2>
+  ///
+  /// %2 = cc.cast %1 : (!cc.ptr<!cc.array<i64 x 2>>) -> !cc.ptr<i64>
+  /// cc.store %c0_i64, %2 : !cc.ptr<i64>
+  ///
+  /// %3 = cc.compute_ptr %1[1] : (!cc.ptr<!cc.array<i64 x 2>>) -> !cc.ptr<i64>
+  /// cc.store %c0_i64, %3 : !cc.ptr<i64>
+  /// ```
+  static bool isStoreToStack(cudaq::cc::StoreOp store) {
+    auto ptrOp = store.getPtrvalue();
+    if (auto cast = ptrOp.getDefiningOp<cudaq::cc::CastOp>())
+      ptrOp = cast.getOperand();
+
+    if (auto computePtr = ptrOp.getDefiningOp<cudaq::cc::ComputePtrOp>())
+      ptrOp = computePtr.getBase();
+
+    return isa_and_present<cudaq::cc::AllocaOp>(ptrOp.getDefiningOp());
+  }
+
+  DominanceInfo &dom;
+  DenseMap<Block *, DenseMap<Operation *, SmallVector<Operation *>>> blockInfo;
+};
+} // namespace
diff --git a/python/cudaq/kernel/analysis.py b/python/cudaq/kernel/analysis.py
index c68e47c7ab..3f8f86c145 100644
--- a/python/cudaq/kernel/analysis.py
+++ b/python/cudaq/kernel/analysis.py
@@ -177,7 +177,7 @@ def visit_Call(self, node):
 
             if name not in globalAstRegistry:
                 raise RuntimeError(
-                    f"{name} is not a valid kernel to call ({'.'.join(moduleNames)})."
+                    f"{name} is not a valid kernel to call ({'.'.join(moduleNames)}). Registry: {globalAstRegistry}"
                 )
             self.depKernels[name] = globalAstRegistry[name]
diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index 3463b4a1e0..499d49e8d6 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -658,6 +658,73 @@ def __insertDbgStmt(self, value, dbgStmt):
         func.CallOp(printFunc, [strLit, value])
         return
 
+    def __get_vector_size(self, vector):
+        """
+        Get the size of a vector or array type.
+
+        Args:
+            vector: MLIR Value of vector/array type
+
+        Returns:
+            MLIR Value containing the size as an integer
+        """
+        if cc.StdvecType.isinstance(vector.type):
+            return cc.StdvecSizeOp(self.getIntegerType(), vector).result
+        return self.getConstantInt(
+            cc.ArrayType.getSize(cc.PointerType.getElementType(vector.type)))
+
+    def __load_vector_element(self, vector, index):
+        """
+        Load an element from a vector or array at the given index.
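+        Handles both `cc.stdvec` values and pointers to `cc.array` storage.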
+
+        Args:
+            vector: MLIR Value of vector/array type
+            index: MLIR Value containing integer index
+
+        Returns:
+            MLIR Value containing the loaded element
+        """
+        if cc.StdvecType.isinstance(vector.type):
+            data_ptr = cc.StdvecDataOp(
+                cc.PointerType.get(
+                    self.ctx,
+                    cc.ArrayType.get(self.ctx,
+                                     cc.StdvecType.getElementType(
+                                         vector.type))), vector).result
+            return cc.LoadOp(
+                cc.ComputePtrOp(
+                    cc.PointerType.get(
+                        self.ctx,
+                        cc.StdvecType.getElementType(vector.type)), data_ptr,
+                    [index], DenseI32ArrayAttr.get([kDynamicPtrIndex]))).result
+        return cc.LoadOp(
+            cc.ComputePtrOp(
+                cc.PointerType.get(
+                    self.ctx,
+                    cc.ArrayType.getElementType(
+                        cc.PointerType.getElementType(vector.type))), vector,
+                [index], DenseI32ArrayAttr.get([kDynamicPtrIndex]))).result
+
+    def __get_superior_type(self, a, b):
+        """
+        Get the superior numeric type between two MLIR Values.
+        F64 > F32 > Integer, with integers promoting to the wider width.
+
+        Args:
+            a: First MLIR Value
+            b: Second MLIR Value
+
+        Returns:
+            MLIR Type representing the superior type
+        """
+        if F64Type.isinstance(a.type) or F64Type.isinstance(b.type):
+            return F64Type.get()
+        if F32Type.isinstance(a.type) or F32Type.isinstance(b.type):
+            return F32Type.get()
+        return self.getIntegerType(
+            max(IntegerType(a.type).width,
+                IntegerType(b.type).width))
+
     def convertArithmeticToSuperiorType(self, values, type):
         """
         Assuming all values provided are arithmetic, convert each one to the
@@ -3536,6 +3603,50 @@ def visit_Compare(self, node):
                             right).result)
             return
 
+        if isinstance(op, (ast.In, ast.NotIn)):
+            right_val = right
+            left_val = left
+
+            # Type validation and vector initialization
+            if not (cc.StdvecType.isinstance(right_val.type) or
+                    cc.ArrayType.isinstance(right_val.type)):
+                self.emitFatalError(
+                    "Right operand must be a list/vector for 'in' comparison")
+
+            # Loop setup
+            i1_type = self.getIntegerType(1)
+            accumulator = cc.AllocaOp(cc.PointerType.get(self.ctx, i1_type),
+                                      TypeAttr.get(i1_type)).result
+            cc.StoreOp(self.getConstantInt(0, 1), accumulator)
+
+            # Element comparison loop
+            def check_element(idx):
+                element = self.__load_vector_element(right_val, idx)
+                promoted_left, promoted_element = self.convertArithmeticToSuperiorType(
+                    [left_val, element],
+                    self.__get_superior_type(left_val, element))
+
+                # Predicate 0 is `eq` for arith.cmpi; for arith.cmpf the
+                # equality predicate `oeq` is 1 (0 is the always-false
+                # predicate, which would make float membership tests fail).
+                iCondPred = IntegerAttr.get(self.getIntegerType(), 0)
+                fCondPred = IntegerAttr.get(self.getIntegerType(), 1)
+                cmp_result = (
+                    arith.CmpIOp(iCondPred, promoted_left, promoted_element)
+                    if IntegerType.isinstance(promoted_left.type) else
+                    arith.CmpFOp(fCondPred, promoted_left, promoted_element))
+
+                current = cc.LoadOp(accumulator).result
+                cc.StoreOp(arith.OrIOp(current, cmp_result.result), accumulator)
+
+            self.createInvariantForLoop(self.__get_vector_size(right_val),
+                                        check_element)
+
+            final_result = cc.LoadOp(accumulator).result
+            if isinstance(op, ast.NotIn):
+                final_result = arith.XOrIOp(final_result,
+                                            self.getConstantInt(1, 1)).result
+            self.pushValue(final_result)
+
+            return
+
     def visit_AugAssign(self, node):
         """
         Visit augment-assign operations (e.g. +=).
diff --git a/python/cudaq/kernels/uccsd.py b/python/cudaq/kernels/uccsd.py index d8754226ea..b6de4247a2 100644 --- a/python/cudaq/kernels/uccsd.py +++ b/python/cudaq/kernels/uccsd.py @@ -350,29 +350,37 @@ def uccsd_odd_electrons(qubits: cudaq.qview, thetas: list[float], lenVirtA = len(virtual_alpha_indices) lenVirtB = len(virtual_beta_indices) - singles_a = [[0, 0] for k in range(lenOccA * lenVirtA)] + singles_a0 = [0 for k in range(lenOccA * lenVirtA)] + singles_a1 = [0 for k in range(lenOccA * lenVirtA)] counter = 0 for p in occupied_alpha_indices: for q in virtual_alpha_indices: - singles_a[counter] = [p, q] + singles_a0[counter] = p + singles_a1[counter] = q counter = counter + 1 counter = 0 - singles_b = [[0, 0] for k in range(lenOccB * lenVirtB)] + singles_b0 = [0 for k in range(lenOccB * lenVirtB)] + singles_b1 = [0 for k in range(lenOccB * lenVirtB)] for p in occupied_beta_indices: for q in virtual_beta_indices: - singles_b[counter] = [p, q] + singles_b0[counter] = p + singles_b1[counter] = q counter = counter + 1 counter = 0 - doubles_m = [ - [0, 0, 0, 0] for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA) - ] + doubles_m0 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] + doubles_m1 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] + doubles_m2 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] + doubles_m3 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] for p in occupied_alpha_indices: for q in occupied_beta_indices: for r in virtual_beta_indices: for s in virtual_alpha_indices: - doubles_m[counter] = [p, q, r, s] + doubles_m0[counter] = p + doubles_m1[counter] = q + doubles_m2[counter] = r + doubles_m3[counter] = s counter = counter + 1 counter = 0 @@ -384,13 +392,18 @@ def uccsd_odd_electrons(qubits: cudaq.qview, thetas: list[float], nEle = nEle + 1 counter = 0 - doubles_a = [[0, 0, 0, 0] for k in range(nEle)] + doubles_a0 = [0 for k in range(nEle)] + doubles_a1 = [0 for k in range(nEle)] + doubles_a2 = [0 for k in range(nEle)] + doubles_a3 = [0 for k in range(nEle)] for p in range(lenOccA - 1): for q in range(p + 1, lenOccA): for r in range(lenVirtA - 1): for s in range(r + 1, lenVirtA): - doubles_a[counter] = [occupied_alpha_indices[p],occupied_alpha_indices[q],\ - virtual_alpha_indices[r],virtual_alpha_indices[s]] + doubles_a0[counter] = occupied_alpha_indices[p] + doubles_a1[counter] = occupied_alpha_indices[q] + doubles_a2[counter] = virtual_alpha_indices[r] + doubles_a3[counter] = virtual_alpha_indices[s] counter = counter + 1 counter = 0 @@ -400,47 +413,53 @@ def uccsd_odd_electrons(qubits: cudaq.qview, thetas: list[float], for r in range(lenVirtB - 1): for s in range(r + 1, lenVirtB): nEle = nEle + 1 - doubles_b = [[0, 0, 0, 0] for k in range(nEle)] + + doubles_b0 = [0 for k in range(nEle)] + doubles_b1 = [0 for k in range(nEle)] + doubles_b2 = [0 for k in range(nEle)] + doubles_b3 = [0 for k in range(nEle)] for p in range(lenOccB - 1): for q in range(p + 1, lenOccB): for r in range(lenVirtB - 1): for s in range(r + 1, lenVirtB): - doubles_b[counter] = [occupied_beta_indices[p],occupied_beta_indices[q],\ - virtual_beta_indices[r],virtual_beta_indices[s]] + doubles_b0[counter] = occupied_beta_indices[p] + doubles_b1[counter] = occupied_beta_indices[q] + doubles_b2[counter] = virtual_beta_indices[r] + doubles_b3[counter] = virtual_beta_indices[s] counter = counter + 1 - n_alpha_singles = len(singles_a) - n_beta_singles = len(singles_b) - n_mixed_doubles = len(doubles_m) - n_alpha_doubles = len(doubles_a) - 
n_beta_doubles = len(doubles_b) + n_alpha_singles = len(singles_a0) + n_beta_singles = len(singles_b0) + n_mixed_doubles = len(doubles_m0) + n_alpha_doubles = len(doubles_a0) + n_beta_doubles = len(doubles_b0) thetaCounter = 0 for i in range(n_alpha_singles): - single_excitation(qubits, singles_a[i][0], singles_a[i][1], + single_excitation(qubits, singles_a0[i], singles_a1[i], thetas[thetaCounter]) thetaCounter += 1 for i in range(n_beta_singles): - single_excitation(qubits, singles_b[i][0], singles_b[i][1], + single_excitation(qubits, singles_b0[i], singles_b1[i], thetas[thetaCounter]) thetaCounter += 1 for i in range(n_mixed_doubles): - double_excitation_opt(qubits, doubles_m[i][0], doubles_m[i][1], - doubles_m[i][2], doubles_m[i][3], + double_excitation_opt(qubits, doubles_m0[i], doubles_m1[i], + doubles_m2[i], doubles_m3[i], thetas[thetaCounter]) thetaCounter += 1 for i in range(n_alpha_doubles): - double_excitation_opt(qubits, doubles_a[i][0], doubles_a[i][1], - doubles_a[i][2], doubles_a[i][3], + double_excitation_opt(qubits, doubles_a0[i], doubles_a1[i], + doubles_a2[i], doubles_a3[i], thetas[thetaCounter]) thetaCounter += 1 for i in range(n_beta_doubles): - double_excitation_opt(qubits, doubles_b[i][0], doubles_b[i][1], - doubles_b[i][2], doubles_b[i][3], + double_excitation_opt(qubits, doubles_b0[i], doubles_b1[i], + doubles_b2[i], doubles_b3[i], thetas[thetaCounter]) thetaCounter += 1 @@ -463,29 +482,37 @@ def uccsd_even_electrons(qubits: cudaq.qview, thetas: list[float], lenVirtA = len(virtual_alpha_indices) lenVirtB = len(virtual_beta_indices) - singles_a = [[0, 0] for k in range(lenOccA * lenVirtA)] + singles_a0 = [0 for k in range(lenOccA * lenVirtA)] + singles_a1 = [0 for k in range(lenOccA * lenVirtA)] counter = 0 for p in occupied_alpha_indices: for q in virtual_alpha_indices: - singles_a[counter] = [p, q] + singles_a0[counter] = p + singles_a1[counter] = q counter = counter + 1 counter = 0 - singles_b = [[0, 0] for k in range(lenOccB * lenVirtB)] + singles_b0 = [0 for k in range(lenOccB * lenVirtB)] + singles_b1 = [0 for k in range(lenOccB * lenVirtB)] for p in occupied_beta_indices: for q in virtual_beta_indices: - singles_b[counter] = [p, q] + singles_b0[counter] = p + singles_b1[counter] = q counter = counter + 1 counter = 0 - doubles_m = [ - [0, 0, 0, 0] for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA) - ] + doubles_m0 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] + doubles_m1 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] + doubles_m2 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] + doubles_m3 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] for p in occupied_alpha_indices: for q in occupied_beta_indices: for r in virtual_beta_indices: for s in virtual_alpha_indices: - doubles_m[counter] = [p, q, r, s] + doubles_m0[counter] = p + doubles_m1[counter] = q + doubles_m2[counter] = r + doubles_m3[counter] = s counter = counter + 1 counter = 0 @@ -497,13 +524,18 @@ def uccsd_even_electrons(qubits: cudaq.qview, thetas: list[float], nEle = nEle + 1 counter = 0 - doubles_a = [[0, 0, 0, 0] for k in range(nEle)] + doubles_a0 = [0 for k in range(nEle)] + doubles_a1 = [0 for k in range(nEle)] + doubles_a2 = [0 for k in range(nEle)] + doubles_a3 = [0 for k in range(nEle)] for p in range(lenOccA - 1): for q in range(p + 1, lenOccA): for r in range(lenVirtA - 1): for s in range(r + 1, lenVirtA): - doubles_a[counter] = [occupied_alpha_indices[p],occupied_alpha_indices[q],\ - 
virtual_alpha_indices[r],virtual_alpha_indices[s]] + doubles_a0[counter] = occupied_alpha_indices[p] + doubles_a1[counter] = occupied_alpha_indices[q] + doubles_a2[counter] = virtual_alpha_indices[r] + doubles_a3[counter] = virtual_alpha_indices[s] counter = counter + 1 counter = 0 @@ -513,47 +545,53 @@ def uccsd_even_electrons(qubits: cudaq.qview, thetas: list[float], for r in range(lenVirtB - 1): for s in range(r + 1, lenVirtB): nEle = nEle + 1 - doubles_b = [[0, 0, 0, 0] for k in range(nEle)] + + doubles_b0 = [0 for k in range(nEle)] + doubles_b1 = [0 for k in range(nEle)] + doubles_b2 = [0 for k in range(nEle)] + doubles_b3 = [0 for k in range(nEle)] for p in range(lenOccB - 1): for q in range(p + 1, lenOccB): for r in range(lenVirtB - 1): for s in range(r + 1, lenVirtB): - doubles_b[counter] = [occupied_beta_indices[p],occupied_beta_indices[q],\ - virtual_beta_indices[r],virtual_beta_indices[s]] + doubles_b0[counter] = occupied_beta_indices[p] + doubles_b1[counter] = occupied_beta_indices[q] + doubles_b2[counter] = virtual_beta_indices[r] + doubles_b3[counter] = virtual_beta_indices[s] counter = counter + 1 - n_alpha_singles = len(singles_a) - n_beta_singles = len(singles_b) - n_mixed_doubles = len(doubles_m) - n_alpha_doubles = len(doubles_a) - n_beta_doubles = len(doubles_b) + n_alpha_singles = len(singles_a0) + n_beta_singles = len(singles_b0) + n_mixed_doubles = len(doubles_m0) + n_alpha_doubles = len(doubles_a0) + n_beta_doubles = len(doubles_b0) thetaCounter = 0 for i in range(n_alpha_singles): - single_excitation(qubits, singles_a[i][0], singles_a[i][1], + single_excitation(qubits, singles_a0[i], singles_a1[i], thetas[thetaCounter]) thetaCounter += 1 for i in range(n_beta_singles): - single_excitation(qubits, singles_b[i][0], singles_b[i][1], + single_excitation(qubits, singles_b0[i], singles_b1[i], thetas[thetaCounter]) thetaCounter += 1 for i in range(n_mixed_doubles): - double_excitation_opt(qubits, doubles_m[i][0], doubles_m[i][1], - doubles_m[i][2], doubles_m[i][3], + double_excitation_opt(qubits, doubles_m0[i], doubles_m1[i], + doubles_m2[i], doubles_m3[i], thetas[thetaCounter]) thetaCounter += 1 for i in range(n_alpha_doubles): - double_excitation_opt(qubits, doubles_a[i][0], doubles_a[i][1], - doubles_a[i][2], doubles_a[i][3], + double_excitation_opt(qubits, doubles_a0[i], doubles_a1[i], + doubles_a2[i], doubles_a3[i], thetas[thetaCounter]) thetaCounter += 1 for i in range(n_beta_doubles): - double_excitation_opt(qubits, doubles_b[i][0], doubles_b[i][1], - doubles_b[i][2], doubles_b[i][3], + double_excitation_opt(qubits, doubles_b0[i], doubles_b1[i], + doubles_b2[i], doubles_b3[i], thetas[thetaCounter]) thetaCounter += 1 diff --git a/python/cudaq/operator/definitions.py b/python/cudaq/operator/definitions.py index 25bdcfacb4..2ae89d9c69 100644 --- a/python/cudaq/operator/definitions.py +++ b/python/cudaq/operator/definitions.py @@ -16,7 +16,7 @@ # Operators as defined here (watch out of differences in convention): -# https://www.dynamiqs.org/python_api/utils/operators/sigmay.html +# https://www.dynamiqs.org/stable/python_api/utils/operators/create.html class operators: class matrices: diff --git a/python/cudaq/operator/dynamics.yml b/python/cudaq/operator/dynamics.yml index 537d00bc21..e18a127ddc 100644 --- a/python/cudaq/operator/dynamics.yml +++ b/python/cudaq/operator/dynamics.yml @@ -11,3 +11,5 @@ description: "Dynamics simulation backend" gpu-requirements: true config: nvqir-simulation-backend: dynamics + preprocessor-defines: ["-D CUDAQ_DYNAMICS_TARGET"] + 
library-mode: true diff --git a/python/extension/CMakeLists.txt b/python/extension/CMakeLists.txt index fe8431828e..6c245960c9 100644 --- a/python/extension/CMakeLists.txt +++ b/python/extension/CMakeLists.txt @@ -39,6 +39,9 @@ declare_mlir_dialect_python_bindings( dialects/cc.py DIALECT_NAME cc) +# FIXME: remove this flag +set (CMAKE_CXX_FLAGS "-fPIC ${CMAKE_CXX_FLAGS} -Wno-suggest-override") + declare_mlir_python_extension(CUDAQuantumPythonSources.Extension MODULE_NAME _quakeDialects ADD_TO_PARENT CUDAQuantumPythonSources diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index 3232e26b11..b9d21ded78 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -106,6 +106,14 @@ jitAndCreateArgs(const std::string &name, MlirModule module, pm.addPass(createSymbolDCEPass()); cudaq::opt::addPipelineConvertToQIR(pm); + auto enablePrintMLIREachPass = + getEnvBool("CUDAQ_MLIR_PRINT_EACH_PASS", false); + + if (enablePrintMLIREachPass) { + cloned.getContext()->disableMultithreading(); + pm.enableIRPrinting(); + } + DefaultTimingManager tm; tm.setEnabled(cudaq::isTimingTagEnabled(cudaq::TIMING_JIT_PASSES)); auto timingScope = tm.getRootScope(); // starts the timer diff --git a/python/tests/backends/test_IQM.py b/python/tests/backends/test_IQM.py index 718d5ae3e9..5747fa1a5c 100644 --- a/python/tests/backends/test_IQM.py +++ b/python/tests/backends/test_IQM.py @@ -65,6 +65,7 @@ def startUpMockServer(): pytest.exit("Mock server did not start in time, skipping tests.", returncode=1) + cudaq.set_random_seed(13) # Set the targeted QPU os.environ["IQM_TOKENS_FILE"] = tmp_tokens_file.name kwargs = {"qpu-architecture": "Apollo"} @@ -227,8 +228,7 @@ def basic_x(): custom_x(qubit) counts = cudaq.sample(basic_x) - counts.dump() - # Gives result like { 1:999 0:0 } + # Gives result like { 0:0 1:1000 } assert counts['0'] == 0 @cudaq.kernel @@ -237,8 +237,8 @@ def basic_h(): custom_h(qubit) counts = cudaq.sample(basic_h) - counts.dump() - assert "0" in counts and "1" in counts + # Gives result like { 0:500 1:500 } + assert counts['0'] > 0 and counts['1'] > 0 @cudaq.kernel def bell(): @@ -247,7 +247,7 @@ def bell(): custom_x.ctrl(qubits[0], qubits[1]) counts = cudaq.sample(bell) - # Gives result like { 11:499 10:0 01:0 00:499 } + # Gives result like { 00:500 01:0 10:0 11:500 } assert counts['01'] == 0 and counts['10'] == 0 @@ -264,7 +264,7 @@ def bell_pair(): custom_cnot(qubits[0], qubits[1]) counts = cudaq.sample(bell_pair) - # Gives result like { 11:499 10:0 01:0 00:499 } + # Gives result like { 00:500 01:0 10:0 11:500 } assert counts['01'] == 0 and counts['10'] == 0 cudaq.register_operation( @@ -281,7 +281,7 @@ def ctrl_z_kernel(): x(controls) counts = cudaq.sample(ctrl_z_kernel) - assert counts["0010011"] == 999 + assert counts["0010011"] == 1000 # leave for gdb debugging diff --git a/python/tests/backends/test_Ionq_LocalEmulation_kernel.py b/python/tests/backends/test_Ionq_LocalEmulation_kernel.py new file mode 100644 index 0000000000..84a7b2bb07 --- /dev/null +++ b/python/tests/backends/test_Ionq_LocalEmulation_kernel.py @@ -0,0 +1,62 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. 
# +# ============================================================================ # + +import cudaq +import cudaq.kernels +import pytest +import os +from typing import List + + +@pytest.fixture(scope="function", autouse=True) +def configureTarget(): + # Set the targeted QPU + cudaq.set_target('ionq', emulate='true') + + yield "Running the tests." + + cudaq.reset_target() + + +def test_Ionq_cudaq_uccsd(): + + num_electrons = 2 + num_qubits = 8 + + thetas = [ + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558, + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558, + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558, + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558, + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558, + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558, + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558, + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558 + ] + + @cudaq.kernel + def kernel(): + qubits = cudaq.qvector(num_qubits) + for i in range(num_electrons): + x(qubits[i]) + cudaq.kernels.uccsd(qubits, thetas, num_electrons, num_qubits) + + counts = cudaq.sample(kernel, shots_count=1000) + assert len(counts) == 6 + assert '00000011' in counts + assert '00000110' in counts + assert '00010010' in counts + assert '01000010' in counts + assert '10000001' in counts + assert '11000000' in counts + + +# leave for gdb debugging +if __name__ == "__main__": + loc = os.path.abspath(__file__) + pytest.main([loc, "-s"]) diff --git a/python/tests/backends/test_braket.py b/python/tests/backends/test_braket.py index 504be54910..ec005b52a3 100644 --- a/python/tests/backends/test_braket.py +++ b/python/tests/backends/test_braket.py @@ -26,6 +26,10 @@ def do_something(): cudaq.reset_target() +def assert_close(got) -> bool: + return got < -1.5 and got > -1.9 + + def test_simple_kernel(): @cudaq.kernel @@ -249,8 +253,23 @@ def ansatz(theta: float): hamiltonian = 5.907 - 2.1433 * spin.x(0) * spin.x(1) - 2.1433 * spin.y( 0) * spin.y(1) + .21829 * spin.z(0) - 6.125 * spin.z(1) - res = cudaq.observe(ansatz, hamiltonian, .59, shots_count=1) + res = cudaq.observe(ansatz, hamiltonian, .59, shots_count=2000) print(res.expectation()) + assert assert_close(res.expectation()) + + +def test_observe_async(): + + @cudaq.kernel + def kernel(): + qubits = cudaq.qvector(2) + x(qubits[0]) + + hamiltonian = spin.z(0) * spin.z(1) + future = cudaq.observe_async(kernel, hamiltonian, shots_count=1) + result = future.get() + print(result.expectation()) + assert result.expectation() == -1.0 def test_custom_operations(): diff --git a/python/tests/kernel/test_kernel_features.py b/python/tests/kernel/test_kernel_features.py index c581b741ba..69271c511d 100644 --- a/python/tests/kernel/test_kernel_features.py +++ b/python/tests/kernel/test_kernel_features.py @@ -594,13 +594,13 @@ def kernel3(): @cudaq.kernel def kernel4(): qubits = cudaq.qvector(4) - r = [i * 2 + 1 for i in range(-1)] + r = [i * 2 + 1 for i in range(1)] for i in r: x(qubits[i]) counts = cudaq.sample(kernel4) assert len(counts) == 1 - assert '0000' in counts + assert '0100' in counts @cudaq.kernel def kernel5(): @@ -625,6 +625,25 @@ def kernel6(): assert '0101' in counts +def test_array_value_assignment(): + + @cudaq.kernel() + def foo(): + a = [1, 1] + b = [0, 0] + b[0] = a[0] + b[1] = a[1] + q0 = cudaq.qubit() + q1 = cudaq.qubit() + if (b[0]): + x(q0) + if (b[1]): + x(q1) + + counts = cudaq.sample(foo) + assert "11" 
in counts + + def test_control_operations(): @cudaq.kernel @@ -1995,6 +2014,41 @@ def invalid_unsupported(): with pytest.raises(RuntimeError): cudaq.sample(invalid_unsupported) +def test_in_comparator(): + + @cudaq.kernel + def kernel(ind: int): + q = cudaq.qubit() + if ind in [6, 13, 20, 27, 34]: + x(q) + + c = cudaq.sample(kernel, 1) + assert len(c) == 1 and '0' in c + c = cudaq.sample(kernel, 20) + assert len(c) == 1 and '1' in c + c = cudaq.sample(kernel, 14) + assert len(c) == 1 and '0' in c + c = cudaq.sample(kernel, 13) + assert len(c) == 1 and '1' in c + c = cudaq.sample(kernel, 26) + assert len(c) == 1 and '0' in c + c = cudaq.sample(kernel, 27) + assert len(c) == 1 and '1' in c + c = cudaq.sample(kernel, 34) + assert len(c) == 1 and '1' in c + c = cudaq.sample(kernel, 36) + assert len(c) == 1 and '0' in c + + @cudaq.kernel + def kernel(ind: int): + q = cudaq.qubit() + if ind not in [6, 13, 20, 27, 34]: + x(q) + + c = cudaq.sample(kernel, 1) + assert len(c) == 1 and '1' in c + c = cudaq.sample(kernel, 20) + assert len(c) == 1 and '0' in c # leave for gdb debugging if __name__ == "__main__": diff --git a/python/tests/kernel/test_kernel_uccsd.py b/python/tests/kernel/test_kernel_uccsd.py new file mode 100644 index 0000000000..4b608a9ecc --- /dev/null +++ b/python/tests/kernel/test_kernel_uccsd.py @@ -0,0 +1,553 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +import pytest +import cudaq + + +# Use a snapshot of the uccsd.py to make sure we can compile +# complex code. Importing uccsd from cudaq.kernels fails due +# clearing the caches in the tests. +# Issue: https://github.com/NVIDIA/cuda-quantum/issues/1954 +def test_cudaq_uccsd1(): + + @cudaq.kernel + def single_excitation1(qubits: cudaq.qview, p_occ: int, q_virt: int, + theta: float): + + # Y_p X_q + rx(np.pi / 2.0, qubits[p_occ]) + h(qubits[q_virt]) + + for i in range(p_occ, q_virt): + x.ctrl(qubits[i], qubits[i + 1]) + + rz(0.5 * theta, qubits[q_virt]) + + for i in range(q_virt, p_occ, -1): + x.ctrl(qubits[i - 1], qubits[i]) + + h(qubits[q_virt]) + rx(-np.pi / 2.0, qubits[p_occ]) + + # -X_p Y_q + h(qubits[p_occ]) + rx(np.pi / 2.0, qubits[q_virt]) + + for i in range(p_occ, q_virt): + x.ctrl(qubits[i], qubits[i + 1]) + + rz(-0.5 * theta, qubits[q_virt]) + + for i in range(q_virt, p_occ, -1): + x.ctrl(qubits[i - 1], qubits[i]) + + rx(-np.pi / 2.0, qubits[q_virt]) + h(qubits[p_occ]) + + @cudaq.kernel + def double_excitation_opt1(qubits: cudaq.qview, p_occ: int, q_occ: int, + r_virt: int, s_virt: int, theta: float): + + i_occ = 0 + j_occ = 0 + a_virt = 0 + b_virt = 0 + if (p_occ < q_occ) and (r_virt < s_virt): + i_occ = p_occ + j_occ = q_occ + a_virt = r_virt + b_virt = s_virt + + elif (p_occ > q_occ) and (r_virt > s_virt): + i_occ = q_occ + j_occ = p_occ + a_virt = s_virt + b_virt = r_virt + + elif (p_occ < q_occ) and (r_virt > s_virt): + i_occ = p_occ + j_occ = q_occ + a_virt = s_virt + b_virt = r_virt + # theta *= -1.0 FIXME + theta *= -1. 
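+            # Flipping theta compensates for swapping the order of the
+            # virtual-orbital pair: the excitation is antisymmetric under
+            # this reordering (assumed rationale).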
+ + elif (p_occ > q_occ) and (r_virt < s_virt): + i_occ = q_occ + j_occ = p_occ + a_virt = r_virt + b_virt = s_virt + theta *= -1.0 + #Block I: x_i x_j x_a y_b + x_i x_j y_a x_b + x_i y_i y_a y_b - x_i y_j x_a x_b + #Block II: - y_i x_j x_a x_b +y_i x_j y_a y_b - y_i x_j x_a x_b - y_i y_j y_a x_b + + h(qubits[i_occ]) + h(qubits[j_occ]) + h(qubits[a_virt]) + rx(np.pi / 2.0, qubits[b_virt]) + + for i in range(i_occ, j_occ): + x.ctrl(qubits[i], qubits[i + 1]) + x.ctrl(qubits[j_occ], qubits[a_virt]) + for i in range(a_virt, b_virt): + x.ctrl(qubits[i], qubits[i + 1]) + + rz(0.125 * theta, qubits[b_virt]) + + for i in range(b_virt, a_virt, -1): + x.ctrl(qubits[i - 1], qubits[i]) + x.ctrl(qubits[j_occ], qubits[a_virt]) + + rx(-np.pi / 2.0, qubits[b_virt]) + h(qubits[a_virt]) + + rx(np.pi / 2.0, qubits[a_virt]) + h(qubits[b_virt]) + + x.ctrl(qubits[j_occ], qubits[a_virt]) + for i in range(a_virt, b_virt): + x.ctrl(qubits[i], qubits[i + 1]) + + rz(0.125 * theta, qubits[b_virt]) + + for i in range(b_virt, a_virt, -1): + x.ctrl(qubits[i - 1], qubits[i]) + x.ctrl(qubits[j_occ], qubits[a_virt]) + for i in range(j_occ, i_occ, -1): + x.ctrl(qubits[i - 1], qubits[i]) + + rx(-np.pi / 2.0, qubits[a_virt]) + h(qubits[j_occ]) + + rx(np.pi / 2.0, qubits[j_occ]) + h(qubits[a_virt]) + + for i in range(i_occ, j_occ): + x.ctrl(qubits[i], qubits[i + 1]) + x.ctrl(qubits[j_occ], qubits[a_virt]) + for i in range(a_virt, b_virt): + x.ctrl(qubits[i], qubits[i + 1]) + + rz(-0.125 * theta, qubits[b_virt]) + + for i in range(b_virt, a_virt, -1): + x.ctrl(qubits[i - 1], qubits[i]) + x.ctrl(qubits[j_occ], qubits[a_virt]) + + h(qubits[b_virt]) + h(qubits[a_virt]) + + rx(np.pi / 2.0, qubits[a_virt]) + rx(np.pi / 2.0, qubits[b_virt]) + + x.ctrl(qubits[j_occ], qubits[a_virt]) + for i in range(a_virt, b_virt): + x.ctrl(qubits[i], qubits[i + 1]) + + rz(0.125 * theta, qubits[b_virt]) + + for i in range(b_virt, a_virt, -1): + x.ctrl(qubits[i - 1], qubits[i]) + x.ctrl(qubits[j_occ], qubits[a_virt]) + for i in range(j_occ, i_occ, -1): + x.ctrl(qubits[i - 1], qubits[i]) + + rx(-np.pi / 2.0, qubits[j_occ]) + h(qubits[i_occ]) + + rx(np.pi / 2.0, qubits[i_occ]) + h(qubits[j_occ]) + + for i in range(i_occ, j_occ): + x.ctrl(qubits[i], qubits[i + 1]) + x.ctrl(qubits[j_occ], qubits[a_virt]) + for i in range(a_virt, b_virt): + x.ctrl(qubits[i], qubits[i + 1]) + + rz(0.125 * theta, qubits[b_virt]) + + for i in range(b_virt, a_virt, -1): + x.ctrl(qubits[i - 1], qubits[i]) + x.ctrl(qubits[j_occ], qubits[a_virt]) + + rx(-np.pi / 2.0, qubits[b_virt]) + rx(-np.pi / 2.0, qubits[a_virt]) + + h(qubits[a_virt]) + h(qubits[b_virt]) + + x.ctrl(qubits[j_occ], qubits[a_virt]) + for i in range(a_virt, b_virt): + x.ctrl(qubits[i], qubits[i + 1]) + + rz(-0.125 * theta, qubits[b_virt]) + + for i in range(b_virt, a_virt, -1): + x.ctrl(qubits[i - 1], qubits[i]) + x.ctrl(qubits[j_occ], qubits[a_virt]) + for i in range(j_occ, i_occ, -1): + x.ctrl(qubits[i - 1], qubits[i]) + + h(qubits[b_virt]) + h(qubits[j_occ]) + + rx(np.pi / 2.0, qubits[j_occ]) + rx(np.pi / 2.0, qubits[b_virt]) + + for i in range(i_occ, j_occ): + x.ctrl(qubits[i], qubits[i + 1]) + x.ctrl(qubits[j_occ], qubits[a_virt]) + for i in range(a_virt, b_virt): + x.ctrl(qubits[i], qubits[i + 1]) + + rz(-0.125 * theta, qubits[b_virt]) + + for i in range(b_virt, a_virt, -1): + x.ctrl(qubits[i - 1], qubits[i]) + x.ctrl(qubits[j_occ], qubits[a_virt]) + + rx(-np.pi / 2.0, qubits[b_virt]) + h(qubits[a_virt]) + + rx(np.pi / 2.0, qubits[a_virt]) + h(qubits[b_virt]) + + x.ctrl(qubits[j_occ], qubits[a_virt]) + for 
i in range(a_virt, b_virt): + x.ctrl(qubits[i], qubits[i + 1]) + + rz(-0.125 * theta, qubits[b_virt]) + + for i in range(b_virt, a_virt, -1): + x.ctrl(qubits[i - 1], qubits[i]) + x.ctrl(qubits[j_occ], qubits[a_virt]) + for i in range(j_occ, i_occ, -1): + x.ctrl(qubits[i - 1], qubits[i]) + + h(qubits[b_virt]) + rx(-np.pi / 2.0, qubits[a_virt]) + rx(-np.pi / 2.0, qubits[j_occ]) + rx(-np.pi / 2.0, qubits[i_occ]) + + @cudaq.kernel + def uccsd1_odd_electrons(qubits: cudaq.qview, thetas: list[float], + n_electrons: int, n_qubits: int): + n_spatial_orbitals = n_qubits // 2 + n_occupied = int(np.ceil(n_electrons / 2)) + n_virtual = n_spatial_orbitals - n_occupied + + occupied_alpha_indices = [i * 2 for i in range(n_occupied)] + virtual_alpha_indices = [ + i * 2 + n_electrons + 1 for i in range(n_virtual) + ] + + occupied_beta_indices = [i * 2 + 1 for i in range(n_occupied - 1)] + virtual_beta_indices = [0 for k in range(n_virtual + 1)] + virtual_beta_indices[0] = 2 * n_occupied - 1 + for i in range(n_virtual): + virtual_beta_indices[i + 1] = i * 2 + 1 + n_electrons + + lenOccA = len(occupied_alpha_indices) + lenOccB = len(occupied_beta_indices) + lenVirtA = len(virtual_alpha_indices) + lenVirtB = len(virtual_beta_indices) + + singles_a0 = [0 for k in range(lenOccA * lenVirtA)] + singles_a1 = [0 for k in range(lenOccA * lenVirtA)] + counter = 0 + for p in occupied_alpha_indices: + for q in virtual_alpha_indices: + singles_a0[counter] = p + singles_a1[counter] = q + counter = counter + 1 + + counter = 0 + singles_b0 = [0 for k in range(lenOccB * lenVirtB)] + singles_b1 = [0 for k in range(lenOccB * lenVirtB)] + for p in occupied_beta_indices: + for q in virtual_beta_indices: + singles_b0[counter] = p + singles_b1[counter] = q + counter = counter + 1 + + counter = 0 + doubles_m0 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] + doubles_m1 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] + doubles_m2 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] + doubles_m3 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] + for p in occupied_alpha_indices: + for q in occupied_beta_indices: + for r in virtual_beta_indices: + for s in virtual_alpha_indices: + doubles_m0[counter] = p + doubles_m1[counter] = q + doubles_m2[counter] = r + doubles_m3[counter] = s + counter = counter + 1 + + counter = 0 + nEle = 0 + for p in range(lenOccA - 1): + for q in range(p + 1, lenOccA): + for r in range(lenVirtA - 1): + for s in range(r + 1, lenVirtA): + nEle = nEle + 1 + + counter = 0 + doubles_a0 = [0 for k in range(nEle)] + doubles_a1 = [0 for k in range(nEle)] + doubles_a2 = [0 for k in range(nEle)] + doubles_a3 = [0 for k in range(nEle)] + for p in range(lenOccA - 1): + for q in range(p + 1, lenOccA): + for r in range(lenVirtA - 1): + for s in range(r + 1, lenVirtA): + doubles_a0[counter] = occupied_alpha_indices[p] + doubles_a1[counter] = occupied_alpha_indices[q] + doubles_a2[counter] = virtual_alpha_indices[r] + doubles_a3[counter] = virtual_alpha_indices[s] + counter = counter + 1 + + counter = 0 + nEle = 0 + for p in range(lenOccB - 1): + for q in range(p + 1, lenOccB): + for r in range(lenVirtB - 1): + for s in range(r + 1, lenVirtB): + nEle = nEle + 1 + + doubles_b0 = [0 for k in range(nEle)] + doubles_b1 = [0 for k in range(nEle)] + doubles_b2 = [0 for k in range(nEle)] + doubles_b3 = [0 for k in range(nEle)] + for p in range(lenOccB - 1): + for q in range(p + 1, lenOccB): + for r in range(lenVirtB - 1): + for s in range(r + 1, lenVirtB): + doubles_b0[counter] = 
occupied_beta_indices[p] + doubles_b1[counter] = occupied_beta_indices[q] + doubles_b2[counter] = virtual_beta_indices[r] + doubles_b3[counter] = virtual_beta_indices[s] + counter = counter + 1 + + n_alpha_singles = len(singles_a0) + n_beta_singles = len(singles_b0) + n_mixed_doubles = len(doubles_m0) + n_alpha_doubles = len(doubles_a0) + n_beta_doubles = len(doubles_b0) + + thetaCounter = 0 + for i in range(n_alpha_singles): + single_excitation1(qubits, singles_a0[i], singles_a1[i], + thetas[thetaCounter]) + thetaCounter += 1 + + for i in range(n_beta_singles): + single_excitation1(qubits, singles_b0[i], singles_b1[i], + thetas[thetaCounter]) + thetaCounter += 1 + + for i in range(n_mixed_doubles): + double_excitation_opt1(qubits, doubles_m0[i], doubles_m1[i], + doubles_m2[i], doubles_m3[i], + thetas[thetaCounter]) + thetaCounter += 1 + + for i in range(n_alpha_doubles): + double_excitation_opt1(qubits, doubles_a0[i], doubles_a1[i], + doubles_a2[i], doubles_a3[i], + thetas[thetaCounter]) + thetaCounter += 1 + + for i in range(n_beta_doubles): + double_excitation_opt1(qubits, doubles_b0[i], doubles_b1[i], + doubles_b2[i], doubles_b3[i], + thetas[thetaCounter]) + thetaCounter += 1 + + @cudaq.kernel + def uccsd1_even_electrons(qubits: cudaq.qview, thetas: list[float], + n_electrons: int, n_qubits: int): + n_spatial_orbitals = n_qubits // 2 + n_occupied = int(np.ceil(n_electrons / 2)) + n_virtual = n_spatial_orbitals - n_occupied + + occupied_alpha_indices = [i * 2 for i in range(n_occupied)] + virtual_alpha_indices = [i * 2 + n_electrons for i in range(n_virtual)] + + occupied_beta_indices = [i * 2 + 1 for i in range(n_occupied)] + virtual_beta_indices = [ + i * 2 + 1 + n_electrons for i in range(n_virtual) + ] + + lenOccA = len(occupied_alpha_indices) + lenOccB = len(occupied_beta_indices) + lenVirtA = len(virtual_alpha_indices) + lenVirtB = len(virtual_beta_indices) + + singles_a0 = [0 for k in range(lenOccA * lenVirtA)] + singles_a1 = [0 for k in range(lenOccA * lenVirtA)] + counter = 0 + for p in occupied_alpha_indices: + for q in virtual_alpha_indices: + singles_a0[counter] = p + singles_a1[counter] = q + counter = counter + 1 + + counter = 0 + singles_b0 = [0 for k in range(lenOccB * lenVirtB)] + singles_b1 = [0 for k in range(lenOccB * lenVirtB)] + for p in occupied_beta_indices: + for q in virtual_beta_indices: + singles_b0[counter] = p + singles_b1[counter] = q + counter = counter + 1 + + counter = 0 + doubles_m0 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] + doubles_m1 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] + doubles_m2 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] + doubles_m3 = [0 for k in range(lenOccB * lenVirtB * lenOccA * lenVirtA)] + for p in occupied_alpha_indices: + for q in occupied_beta_indices: + for r in virtual_beta_indices: + for s in virtual_alpha_indices: + doubles_m0[counter] = p + doubles_m1[counter] = q + doubles_m2[counter] = r + doubles_m3[counter] = s + counter = counter + 1 + + counter = 0 + nEle = 0 + for p in range(lenOccA - 1): + for q in range(p + 1, lenOccA): + for r in range(lenVirtA - 1): + for s in range(r + 1, lenVirtA): + nEle = nEle + 1 + + counter = 0 + doubles_a0 = [0 for k in range(nEle)] + doubles_a1 = [0 for k in range(nEle)] + doubles_a2 = [0 for k in range(nEle)] + doubles_a3 = [0 for k in range(nEle)] + for p in range(lenOccA - 1): + for q in range(p + 1, lenOccA): + for r in range(lenVirtA - 1): + for s in range(r + 1, lenVirtA): + doubles_a0[counter] = occupied_alpha_indices[p] + 
doubles_a1[counter] = occupied_alpha_indices[q] + doubles_a2[counter] = virtual_alpha_indices[r] + doubles_a3[counter] = virtual_alpha_indices[s] + counter = counter + 1 + + counter = 0 + nEle = 0 + for p in range(lenOccB - 1): + for q in range(p + 1, lenOccB): + for r in range(lenVirtB - 1): + for s in range(r + 1, lenVirtB): + nEle = nEle + 1 + + doubles_b0 = [0 for k in range(nEle)] + doubles_b1 = [0 for k in range(nEle)] + doubles_b2 = [0 for k in range(nEle)] + doubles_b3 = [0 for k in range(nEle)] + for p in range(lenOccB - 1): + for q in range(p + 1, lenOccB): + for r in range(lenVirtB - 1): + for s in range(r + 1, lenVirtB): + doubles_b0[counter] = occupied_beta_indices[p] + doubles_b1[counter] = occupied_beta_indices[q] + doubles_b2[counter] = virtual_beta_indices[r] + doubles_b3[counter] = virtual_beta_indices[s] + counter = counter + 1 + + n_alpha_singles = len(singles_a0) + n_beta_singles = len(singles_b0) + n_mixed_doubles = len(doubles_m0) + n_alpha_doubles = len(doubles_a0) + n_beta_doubles = len(doubles_b0) + + thetaCounter = 0 + for i in range(n_alpha_singles): + single_excitation1(qubits, singles_a0[i], singles_a1[i], + thetas[thetaCounter]) + thetaCounter += 1 + + for i in range(n_beta_singles): + single_excitation1(qubits, singles_b0[i], singles_b1[i], + thetas[thetaCounter]) + thetaCounter += 1 + + for i in range(n_mixed_doubles): + double_excitation_opt1(qubits, doubles_m0[i], doubles_m1[i], + doubles_m2[i], doubles_m3[i], + thetas[thetaCounter]) + thetaCounter += 1 + + for i in range(n_alpha_doubles): + double_excitation_opt1(qubits, doubles_a0[i], doubles_a1[i], + doubles_a2[i], doubles_a3[i], + thetas[thetaCounter]) + thetaCounter += 1 + + for i in range(n_beta_doubles): + double_excitation_opt1(qubits, doubles_b0[i], doubles_b1[i], + doubles_b2[i], doubles_b3[i], + thetas[thetaCounter]) + thetaCounter += 1 + + @cudaq.kernel + def uccsd1(qubits: cudaq.qview, thetas: list[float], n_electrons: int, + n_qubits: int): + """ + Generate the unitary coupled cluster singlet doublet CUDA-Q kernel. 
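+        Dispatches to `uccsd1_even_electrons` or `uccsd1_odd_electrons`
+        based on the parity of `n_electrons`.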
+ + Args: + qubits (:class:`qview`): Pre-allocated qubits + thetas (list[float]): List of parameters + n_electrons (int): Number of electrons + n_qubits (int): Number of qubits + """ + + if n_electrons % 2 == 0: + uccsd1_even_electrons(qubits, thetas, n_electrons, n_qubits) + else: + uccsd1_odd_electrons(qubits, thetas, n_electrons, n_qubits) + + num_electrons = 2 + num_qubits = 8 + + thetas = [ + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558, + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558, + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558, + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558, + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558, + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558, + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558, + -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558 + ] + + @cudaq.kernel + def kernel(): + qubits = cudaq.qvector(num_qubits) + for i in range(num_electrons): + x(qubits[i]) + uccsd1(qubits, thetas, num_electrons, num_qubits) + + counts = cudaq.sample(kernel, shots_count=1000) + assert len(counts) == 6 + assert '00000011' in counts + assert '00000110' in counts + assert '00010010' in counts + assert '01000010' in counts + assert '10000001' in counts + assert '11000000' in counts diff --git a/runtime/common/BaseRemoteRESTQPU.h b/runtime/common/BaseRemoteRESTQPU.h index bdf74957eb..90a9836102 100644 --- a/runtime/common/BaseRemoteRESTQPU.h +++ b/runtime/common/BaseRemoteRESTQPU.h @@ -293,8 +293,11 @@ class BaseRemoteRESTQPU : public cudaq::QPU { } std::string allowEarlyExitSetting = (codegenTranslation == "qir-adaptive") ? "1" : "0"; - passPipelineConfig = std::string("cc-loop-unroll{allow-early-exit=") + - allowEarlyExitSetting + "}," + passPipelineConfig; + + passPipelineConfig = + std::string( + "func.func(memtoreg{quantum=0},cc-loop-unroll{allow-early-exit=") + + allowEarlyExitSetting + "})," + passPipelineConfig; auto disableQM = backendConfig.find("disable_qubit_mapping"); if (disableQM != backendConfig.end() && disableQM->second == "true") { diff --git a/runtime/common/EvolveResult.h b/runtime/common/EvolveResult.h index 06382f4ff6..c254e3af18 100644 --- a/runtime/common/EvolveResult.h +++ b/runtime/common/EvolveResult.h @@ -108,8 +108,8 @@ class evolve_result { return final_expectation_values; } - std::optional>> - get_expectation_values() { + const std::optional>> & + get_expectation_values() const { return expectation_values; } diff --git a/runtime/cudaq/CMakeLists.txt b/runtime/cudaq/CMakeLists.txt index a4eac4f89d..d4115de94b 100644 --- a/runtime/cudaq/CMakeLists.txt +++ b/runtime/cudaq/CMakeLists.txt @@ -40,7 +40,7 @@ if (CUDA_FOUND) PRIVATE .) target_link_libraries(${LIBRARY_NAME} - PUBLIC dl cudaq-spin cudaq-common cudaq-nlopt cudaq-ensmallen + PUBLIC dl cudaq-spin cudaq-operator cudaq-common cudaq-nlopt cudaq-ensmallen PRIVATE nvqir fmt::fmt-header-only CUDA::cudart_static) target_compile_definitions(${LIBRARY_NAME} PRIVATE CUDAQ_HAS_CUDA) @@ -52,7 +52,7 @@ else() PRIVATE .) 
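+  # cudaq-operator is the new operator library built from
+  # runtime/cudaq/dynamics; it is linked in both CUDA and non-CUDA builds.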
target_link_libraries(${LIBRARY_NAME} - PUBLIC dl cudaq-spin cudaq-common cudaq-nlopt cudaq-ensmallen + PUBLIC dl cudaq-spin cudaq-operator cudaq-common cudaq-nlopt cudaq-ensmallen PRIVATE nvqir fmt::fmt-header-only) endif() @@ -61,6 +61,7 @@ add_subdirectory(algorithms) add_subdirectory(platform) add_subdirectory(builder) add_subdirectory(domains) +add_subdirectory(dynamics) install(TARGETS ${LIBRARY_NAME} EXPORT cudaq-targets DESTINATION lib) diff --git a/runtime/cudaq/algorithms/evolve.h b/runtime/cudaq/algorithms/evolve.h index 38ee16e5a4..f293d17b67 100644 --- a/runtime/cudaq/algorithms/evolve.h +++ b/runtime/cudaq/algorithms/evolve.h @@ -11,9 +11,13 @@ #include "common/EvolveResult.h" #include "common/KernelWrapper.h" #include "cudaq/algorithms/get_state.h" +#include "cudaq/base_integrator.h" +#include "cudaq/evolution.h" #include "cudaq/host_config.h" +#include "cudaq/operators.h" #include "cudaq/platform.h" #include "cudaq/platform/QuantumExecutionQueue.h" +#include "cudaq/schedule.h" namespace cudaq { @@ -152,4 +156,103 @@ evolve_async(std::function evolveFunctor, return f; } } // namespace __internal__ + +inline evolve_result +evolve(const cudaq::operator_sum &hamiltonian, + const std::map &dimensions, const Schedule &schedule, + const state &initial_state, + std::shared_ptr integrator = {}, + const std::vector> + &collapse_operators = {}, + const std::vector> + &observables = {}, + bool store_intermediate_results = false, + std::optional shots_count = std::nullopt) { +#if defined(CUDAQ_DYNAMICS_TARGET) + return evolve_single(hamiltonian, dimensions, schedule, initial_state, + *integrator, collapse_operators, observables, + store_intermediate_results); + +#else + throw std::runtime_error( + "cudaq::evolve is only supported on the 'dynamics' target. 
Please " + "recompile your application with '--target dynamics' flag."); +#endif +} + +inline evolve_result +evolve(const cudaq::product_operator &hamiltonian, + const std::map &dimensions, const Schedule &schedule, + const state &initial_state, + std::shared_ptr integrator = {}, + const std::vector> + &collapse_operators = {}, + const std::vector> + &observables = {}, + bool store_intermediate_results = false, + std::optional shots_count = std::nullopt) { + std::vector> convertedCollapseOps; + for (const auto &cOp : collapse_operators) { + convertedCollapseOps.emplace_back(cOp); + } + std::vector> convertedObserveOps; + for (const auto &obsOp : observables) { + convertedObserveOps.emplace_back(obsOp); + } + cudaq::operator_sum convertedHam(hamiltonian); + return evolve(convertedHam, dimensions, schedule, initial_state, integrator, + convertedCollapseOps, convertedObserveOps, + store_intermediate_results, shots_count); +} + +inline evolve_result +evolve(const cudaq::product_operator &hamiltonian, + const std::map &dimensions, const Schedule &schedule, + const state &initial_state, + std::shared_ptr integrator = {}, + const std::vector> + &collapse_operators = {}, + const std::vector> + &observables = {}, + bool store_intermediate_results = false, + std::optional shots_count = std::nullopt) { + std::vector> + convertedCollapseOps; + for (const auto &cOp : collapse_operators) { + convertedCollapseOps.emplace_back(cOp); + } + std::vector> + convertedObserveOps; + for (const auto &obsOp : observables) { + convertedObserveOps.emplace_back(obsOp); + } + cudaq::product_operator convertedHam(hamiltonian); + return evolve(convertedHam, dimensions, schedule, initial_state, integrator, + convertedCollapseOps, convertedObserveOps, + store_intermediate_results, shots_count); +} + +// template , +// cudaq::operator_sum> || +// std::is_constructible_v< +// cudaq::operator_sum, OperatorTy>>> +// // Multiple input state +// std::vector +// evolve(const OperatorTy &hamiltonian, const std::map &dimensions, +// const Schedule &schedule, const std::vector &initial_states, +// std::shared_ptr integrator = {}, +// const std::vector &collapse_operators = {}, +// const std::vector &observables = {}, +// bool store_intermediate_results = false, +// std::optional shots_count = std::nullopt) { +// std::vector results; +// for (const auto &initial_state : initial_states) +// results.emplace_back(evolve(hamiltonian, dimensions, schedule, +// initial_states, integrator, +// collapse_operators, observables, +// store_intermediate_results, shots_count)); +// return results; +// } } // namespace cudaq diff --git a/runtime/cudaq/base_integrator.h b/runtime/cudaq/base_integrator.h new file mode 100644 index 0000000000..91d6311718 --- /dev/null +++ b/runtime/cudaq/base_integrator.h @@ -0,0 +1,34 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#pragma once + +#include "operators.h" +#include "schedule.h" +#include +#include +#include + +namespace cudaq { +class BaseIntegrator { +public: + /// @brief Default constructor + BaseIntegrator() = default; + + virtual ~BaseIntegrator() = default; + + /// @brief Set the initial state and time + virtual void set_state(cudaq::state initial_state, double t0) = 0; + + /// @brief Perform integration to the target time. + virtual void integrate(double target_time) = 0; + + /// @brief Get the current time and state. + virtual std::pair get_state() = 0; +}; +} // namespace cudaq diff --git a/runtime/cudaq/base_time_stepper.h b/runtime/cudaq/base_time_stepper.h new file mode 100644 index 0000000000..2742b2018d --- /dev/null +++ b/runtime/cudaq/base_time_stepper.h @@ -0,0 +1,22 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once +#include "cudaq/qis/state.h" + +namespace cudaq { +class TimeStepper { +public: + virtual ~TimeStepper() = default; + + virtual state + compute(const state &inputState, double t, double step_size, + const std::unordered_map> + ¶meters) = 0; +}; +} // namespace cudaq \ No newline at end of file diff --git a/runtime/cudaq/dynamics/CMakeLists.txt b/runtime/cudaq/dynamics/CMakeLists.txt new file mode 100644 index 0000000000..e741bd804a --- /dev/null +++ b/runtime/cudaq/dynamics/CMakeLists.txt @@ -0,0 +1,49 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +set(LIBRARY_NAME cudaq-operator) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-ctad-maybe-unsupported") +set(INTERFACE_POSITION_INDEPENDENT_CODE ON) + +set(CUDAQ_OPS_SRC + callback.cpp + scalar_operators.cpp + spin_operators.cpp + boson_operators.cpp + matrix_operators.cpp + product_operators.cpp + operator_sum.cpp + schedule.cpp + manipulation.cpp + helpers.cpp +) + +add_library(${LIBRARY_NAME} SHARED ${CUDAQ_OPS_SRC}) +set_property(GLOBAL APPEND PROPERTY CUDAQ_RUNTIME_LIBS ${LIBRARY_NAME}) +target_compile_definitions(${LIBRARY_NAME} PRIVATE -DCUDAQ_INSTANTIATE_TEMPLATES) + +target_include_directories(${LIBRARY_NAME} + PUBLIC + $ + $ + $ + $ + PRIVATE .) 
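+
+# Link dependencies are collected in a list so the helper below can extend
+# it (add_openmp_configurations is assumed to append the OpenMP-enabled
+# variants when OpenMP is available).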
+ +set (OPERATOR_DEPENDENCIES "") +list(APPEND OPERATOR_DEPENDENCIES fmt::fmt-header-only) +add_openmp_configurations(${LIBRARY_NAME} OPERATOR_DEPENDENCIES) + +target_link_libraries(${LIBRARY_NAME} PRIVATE ${OPERATOR_DEPENDENCIES}) + +install(TARGETS ${LIBRARY_NAME} EXPORT cudaq-operator-targets DESTINATION lib) + +install(EXPORT cudaq-operator-targets + FILE CUDAQOperatorTargets.cmake + NAMESPACE cudaq:: + DESTINATION lib/cmake/cudaq) diff --git a/runtime/cudaq/dynamics/boson_operators.cpp b/runtime/cudaq/dynamics/boson_operators.cpp new file mode 100644 index 0000000000..0df91c04b3 --- /dev/null +++ b/runtime/cudaq/dynamics/boson_operators.cpp @@ -0,0 +1,179 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include +#include +#include +#include + +#include "boson_operators.h" +#include "cudaq/operators.h" +#include "cudaq/utils/tensor.h" + +namespace cudaq { + +// private helpers + +std::string boson_operator::op_code_to_string() const { + if (this->additional_terms == 0 && this->number_offsets.size() == 0) return "I"; + std::string str; + for (auto offset : this->number_offsets) { + if (offset == 0) str += "N"; + else if (offset > 0) str += "(N+" + std::to_string(offset) + ")"; + else str += "(N" + std::to_string(offset) + ")"; + } + for (auto i = 0; i < this->additional_terms; ++i) + str += "Ad"; + for (auto i = 0; i > this->additional_terms; --i) + str += "A"; + return std::move(str); +} + +void boson_operator::inplace_mult(const boson_operator &other) { + this->number_offsets.reserve(this->number_offsets.size() + other.number_offsets.size()); + + // first permute all number operators of RHS to the left; for x = # permutations, + // if we have "unpaired" creation operators, the number operator becomes (N + x), + // if we have "unpaired" annihilation operators, the number operator becomes (N - x). 
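+  // Worked example: multiplying a (additional_terms = -1, no offsets) by
+  // N (a single offset of 0) pushes back the offset 0 - (-1) = 1, so
+  // a * N = (N + 1) * a, matching a ad a = (a ad) a = (N + 1) a.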
+  for (auto offset : other.number_offsets)
+    this->number_offsets.push_back(offset - this->additional_terms);
+
+  // now we can combine the creation and annihilation operators;
+  if (this->additional_terms > 0) { // we have "unpaired" creation operators
+    // using ad*a = N and ad*N = (N - 1)*ad, each created number operator has
+    // an offset of -(x - 1 - i), where x is the number of creation operators,
+    // and i is the number of creation operators we already combined
+    for (auto i = 1;
+         i <= this->additional_terms && i <= -other.additional_terms; ++i)
+      this->number_offsets.push_back(i - this->additional_terms);
+  } else if (this->additional_terms < 0) { // we have "unpaired" annihilation
+                                           // operators
+    // using a*ad = (N + 1) and a*N = (N + 1)*a, each created number operator
+    // has an offset of (x - i), where x is the number of annihilation
+    // operators, and i is the number of annihilation operators we already
+    // combined
+    for (auto i = 0;
+         i > this->additional_terms && i > -other.additional_terms; --i)
+      this->number_offsets.push_back(i - this->additional_terms);
+  }
+
+  // finally, we update the number of remaining unpaired operators
+  this->additional_terms += other.additional_terms;
+}
+
+// read-only properties
+
+std::string boson_operator::unique_id() const {
+  return this->op_code_to_string() + std::to_string(target);
+}
+
+std::vector<int> boson_operator::degrees() const { return {this->target}; }
+
+// constructors
+
+boson_operator::boson_operator(int target)
+    : target(target), additional_terms(0) {}
+
+boson_operator::boson_operator(int target, int op_id)
+    : target(target), additional_terms(0) {
+  // note: `0 <= op_id < 4` would parse as `(0 <= op_id) < 4` and always hold
+  assert(0 <= op_id && op_id < 4);
+  if (op_id == 1) // create
+    this->additional_terms = 1;
+  else if (op_id == 2) // annihilate
+    this->additional_terms = -1;
+  else if (op_id == 3) // number
+    this->number_offsets.push_back(0);
+}
+
+// evaluations
+
+matrix_2 boson_operator::to_matrix(
+    std::unordered_map<int, int> &dimensions,
+    const std::unordered_map<std::string, std::complex<double>> &parameters)
+    const {
+  auto it = dimensions.find(this->target);
+  if (it == dimensions.end())
+    throw std::runtime_error("missing dimension for degree " +
+                             std::to_string(this->target));
+  auto dim = it->second;
+
+  auto mat = matrix_2(dim, dim);
+  if (this->additional_terms > 0) {
+    for (std::size_t column = 0; column + this->additional_terms < dim;
+         column++) {
+      auto row = column + this->additional_terms;
+      mat[{row, column}] = 1.;
+      for (auto offset : this->number_offsets)
+        mat[{row, column}] *= (row + offset);
+      for (auto offset = this->additional_terms; offset > 0; --offset)
+        mat[{row, column}] *= std::sqrt(column + offset);
+    }
+  } else if (this->additional_terms < 0) {
+    for (std::size_t row = 0; row - this->additional_terms < dim; row++) {
+      auto column = row - this->additional_terms;
+      mat[{row, column}] = 1.;
+      for (auto offset : this->number_offsets)
+        mat[{row, column}] *= (row + offset);
+      for (auto offset = -this->additional_terms; offset > 0; --offset)
+        mat[{row, column}] *= std::sqrt(row + offset);
+    }
+  } else {
+    for (std::size_t i = 0; i < dim; i++) {
+      mat[{i, i}] = 1.;
+      for (auto offset : this->number_offsets)
+        mat[{i, i}] *= (i + offset);
+    }
+  }
+  return mat;
+}
+
+std::string boson_operator::to_string(bool include_degrees) const {
+  if (include_degrees)
+    return this->op_code_to_string() + "(" + std::to_string(target) + ")";
+  else
+    return this->op_code_to_string();
+}
+
+// comparisons
+
+bool boson_operator::operator==(const boson_operator &other) const {
+  return this->additional_terms ==
other.additional_terms && + this->number_offsets == other.number_offsets && + this->target == other.target; +} + +// defined operators + +operator_sum boson_operator::empty() { + return operator_handler::empty(); +} + +product_operator boson_operator::identity() { + return operator_handler::identity(); +} + +product_operator boson_operator::identity(int degree) { + return product_operator(boson_operator(degree)); +} + +product_operator boson_operator::create(int degree) { + return product_operator(boson_operator(degree, 1)); +} + +product_operator boson_operator::annihilate(int degree) { + return product_operator(boson_operator(degree, 2)); +} + +product_operator boson_operator::number(int degree) { + return product_operator(boson_operator(degree, 3)); +} + +operator_sum boson_operator::position(int degree) { + return 0.5 * (boson_operator::create(degree) + boson_operator::annihilate(degree)); +} + +operator_sum boson_operator::momentum(int degree) { + return 0.5j * (boson_operator::create(degree) - boson_operator::annihilate(degree)); +} + +} // namespace cudaq \ No newline at end of file diff --git a/runtime/cudaq/dynamics/boson_operators.h b/runtime/cudaq/dynamics/boson_operators.h new file mode 100644 index 0000000000..cd76b52e93 --- /dev/null +++ b/runtime/cudaq/dynamics/boson_operators.h @@ -0,0 +1,98 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include +#include +#include + +#include "cudaq/operators.h" +#include "cudaq/utils/tensor.h" + +namespace cudaq { + +template +class product_operator; + +template +class operator_sum; + +// FIXME: rename? +class boson_operator : public operator_handler { + template + friend class product_operator; + +private: + + // Each boson operator is represented as number operators along with an + // offset to add to each number operator, as well as an integer indicating + // how many creation or annihilation terms follow the number operators. + // See the implementation of the in-place multiplication to understand + // the meaning and purpose of this representation. In short, this + // representation allows us to perform a perfect in-place multiplication. + int additional_terms; + std::vector number_offsets; + int target; + + // 0 = I, ad = 1, a = 2, ada = 3 + boson_operator(int target, int op_code); + + std::string op_code_to_string() const; + + void inplace_mult(const boson_operator &other); + +public: + + // read-only properties + + virtual std::string unique_id() const; + + /// @brief The degrees of freedom that the operator acts on in canonical + /// order. + virtual std::vector degrees() const; + + // constructors and destructors + + boson_operator(int target); + + ~boson_operator() = default; + + // evaluations + + /// @brief Return the `matrix_operator` as a matrix. + /// @arg `dimensions` : A map specifying the number of levels, + /// that is, the dimension of each degree of freedom + /// that the operator acts on. Example for two, 2-level + /// degrees of freedom: `{0 : 2, 1 : 2}`. 
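+  /// @arg `parameters` : A map of parameter names to concrete, complex
+  /// values; unused by bosonic elementary operators but part of the
+  /// common `operator_handler` evaluation interface.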
+ virtual matrix_2 + to_matrix(std::unordered_map &dimensions, + const std::unordered_map> + ¶meters = {}) const; + + virtual std::string to_string(bool include_degrees) const; + + // comparisons + + bool operator==(const boson_operator &other) const; + + // defined operators + + static operator_sum empty(); + static product_operator identity(); + + static product_operator identity(int degree); + static product_operator create(int degree); + static product_operator annihilate(int degree); + static product_operator number(int degree); + + static operator_sum position(int degree); + static operator_sum momentum(int degree); +}; + +} // namespace cudaq \ No newline at end of file diff --git a/runtime/cudaq/dynamics/callback.cpp b/runtime/cudaq/dynamics/callback.cpp new file mode 100644 index 0000000000..4427b30bfa --- /dev/null +++ b/runtime/cudaq/dynamics/callback.cpp @@ -0,0 +1,106 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "callback.h" + +#include +#include +#include +#include +#include + +namespace cudaq { + +// ScalarCallbackFunction + +ScalarCallbackFunction::ScalarCallbackFunction( + const ScalarCallbackFunction &other) { + _callback_func = other._callback_func; +} + +ScalarCallbackFunction::ScalarCallbackFunction(ScalarCallbackFunction &&other) { + _callback_func = std::move(other._callback_func); +} + +ScalarCallbackFunction & +ScalarCallbackFunction::operator=(const ScalarCallbackFunction &other) { + if (this != &other) { + _callback_func = other._callback_func; + } + return *this; +} + +ScalarCallbackFunction & +ScalarCallbackFunction::operator=(ScalarCallbackFunction &&other) { + if (this != &other) { + _callback_func = std::move(other._callback_func); + } + return *this; +} + +std::complex ScalarCallbackFunction::operator()( + const std::unordered_map> ¶meters) + const { + return _callback_func(parameters); +} + +// MatrixCallbackFunction + +MatrixCallbackFunction::MatrixCallbackFunction( + const MatrixCallbackFunction &other) { + _callback_func = other._callback_func; +} + +MatrixCallbackFunction::MatrixCallbackFunction(MatrixCallbackFunction &&other) { + _callback_func = std::move(other._callback_func); +} + +MatrixCallbackFunction & +MatrixCallbackFunction::operator=(const MatrixCallbackFunction &other) { + if (this != &other) { + _callback_func = other._callback_func; + } + return *this; +} + +MatrixCallbackFunction & +MatrixCallbackFunction::operator=(MatrixCallbackFunction &&other) { + if (this != &other) { + _callback_func = std::move(other._callback_func); + } + return *this; +} + +matrix_2 MatrixCallbackFunction::operator()( + const std::vector &relevant_dimensions, + const std::unordered_map> ¶meters) + const { + return _callback_func(relevant_dimensions, parameters); +} + +// Definition + +Definition::Definition(std::string operator_id, + const std::vector &expected_dimensions, + MatrixCallbackFunction &&create) + : id(operator_id), generator(std::move(create)), + m_expected_dimensions(expected_dimensions) {} + +Definition::Definition(Definition &&def) + : id(def.id), generator(std::move(def.generator)), + m_expected_dimensions(std::move(def.m_expected_dimensions)) {} + +matrix_2 
Definition::generate_matrix( + const std::vector &relevant_dimensions, + const std::unordered_map> ¶meters) + const { + return generator(relevant_dimensions, parameters); +} + +Definition::~Definition() = default; +} // namespace cudaq diff --git a/runtime/cudaq/dynamics/callback.h b/runtime/cudaq/dynamics/callback.h new file mode 100644 index 0000000000..07c1a8fcad --- /dev/null +++ b/runtime/cudaq/dynamics/callback.h @@ -0,0 +1,123 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include "cudaq/qis/state.h" +#include "cudaq/utils/tensor.h" + +#include +#include +#include +#include +#include +#include + +namespace cudaq { + +class ScalarCallbackFunction { +private: + // The user provided callback function that takes a map of complex + // parameters. + std::function( + const std::unordered_map> &)> + _callback_func; + +public: + template + ScalarCallbackFunction(Callable &&callable) { + static_assert( + std::is_invocable_r_v< + std::complex, Callable, + const std::unordered_map> &>, + "Invalid callback function. Must have signature std::complex(" + "const std::unordered_map>&)"); + _callback_func = std::forward(callable); + } + + // copy constructor + ScalarCallbackFunction(const ScalarCallbackFunction &other); + + // move constructor. + ScalarCallbackFunction(ScalarCallbackFunction &&other); + + // assignment operator + ScalarCallbackFunction &operator=(const ScalarCallbackFunction &other); + + // move assignment operator + ScalarCallbackFunction &operator=(ScalarCallbackFunction &&other); + + std::complex operator()( + const std::unordered_map> ¶meters) + const; +}; + +class MatrixCallbackFunction { +private: + // The user provided callback function that takes a vector defining the + // dimension for each degree of freedom it acts on, and a map of complex + // parameters. + std::function &, + const std::unordered_map> &)> + _callback_func; + +public: + template + MatrixCallbackFunction(Callable &&callable) { + static_assert( + std::is_invocable_r_v< + matrix_2, Callable, const std::vector &, + const std::unordered_map> &>, + "Invalid callback function. Must have signature " + "matrix_2(const std::vector&, const " + "std::unordered_map>&)"); + _callback_func = std::forward(callable); + } + + // copy constructor + MatrixCallbackFunction(const MatrixCallbackFunction &other); + + // move constructor. + MatrixCallbackFunction(MatrixCallbackFunction &&other); + + // assignment operator + MatrixCallbackFunction &operator=(const MatrixCallbackFunction &other); + + // move assignment operator + MatrixCallbackFunction &operator=(MatrixCallbackFunction &&other); + + matrix_2 + operator()(const std::vector &relevant_dimensions, + const std::unordered_map> + ¶meters) const; +}; + +/// @brief Object used to store the definition of a custom matrix operator. 
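+/// Illustrative sketch of how such a definition is built and evaluated; the
+/// id "scaled_id" and the parameter name "scale" are invented for the
+/// example:
+///
+///   cudaq::Definition def(
+///       "scaled_id", {-1}, // negative dimension: any number of levels
+///       cudaq::MatrixCallbackFunction(
+///           [](const std::vector<int> &dims,
+///              const std::unordered_map<std::string, std::complex<double>>
+///                  &params) {
+///             std::size_t dim = dims[0];
+///             auto mat = cudaq::matrix_2(dim, dim);
+///             for (std::size_t i = 0; i < dim; i++)
+///               mat[{i, i}] = params.at("scale");
+///             return mat;
+///           }));
+///   auto mat = def.generate_matrix({3}, {{"scale", 2.0}});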
+class Definition { +private: + std::string id; + MatrixCallbackFunction generator; + std::vector m_expected_dimensions; + +public: + const std::vector &expected_dimensions = this->m_expected_dimensions; + + Definition(std::string operator_id, + const std::vector &expected_dimensions, + MatrixCallbackFunction &&create); + Definition(Definition &&def); + ~Definition(); + + // To call the generator function + matrix_2 + generate_matrix(const std::vector &relevant_dimensions, + const std::unordered_map> + ¶meters) const; +}; +} // namespace cudaq diff --git a/runtime/cudaq/dynamics/helpers.cpp b/runtime/cudaq/dynamics/helpers.cpp new file mode 100644 index 0000000000..e9a82881a0 --- /dev/null +++ b/runtime/cudaq/dynamics/helpers.cpp @@ -0,0 +1,64 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "helpers.h" +#include + +namespace cudaq { +namespace detail { + +std::vector +generate_all_states(const std::vector °rees, + const std::unordered_map &dimensions) { + if (degrees.size() == 0) + return {}; + + std::vector states; + auto entry = dimensions.find(degrees[0]); + assert(entry != dimensions.end()); + for (auto state = 0; state < entry->second; state++) { + states.push_back(std::to_string(state)); + } + + for (auto idx = 1; idx < degrees.size(); ++idx) { + auto entry = dimensions.find(degrees[idx]); + assert(entry != dimensions.end()); + std::vector result; + for (auto current : states) { + for (auto state = 0; state < entry->second; state++) { + result.push_back(current + std::to_string(state)); + } + } + states = result; + } + + return states; +} + +void permute_matrix(cudaq::matrix_2 &matrix, + const std::vector &permutation) { + std::vector> sorted_values; + for (std::size_t permuted : permutation) { + for (std::size_t permuted_again : permutation) { + sorted_values.push_back(matrix[{permuted, permuted_again}]); + } + } + int idx = 0; + for (std::size_t row = 0; row < matrix.get_rows(); row++) { + for (std::size_t col = 0; col < matrix.get_columns(); col++) { + matrix[{row, col}] = sorted_values[idx]; + idx++; + } + } +} + +void canonicalize_degrees(std::vector °rees) { + std::sort(degrees.begin(), degrees.end(), std::greater()); +} +} // namespace detail +} // namespace cudaq \ No newline at end of file diff --git a/runtime/cudaq/dynamics/helpers.h b/runtime/cudaq/dynamics/helpers.h new file mode 100644 index 0000000000..c123c0695e --- /dev/null +++ b/runtime/cudaq/dynamics/helpers.h @@ -0,0 +1,33 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "cudaq/utils/tensor.h" +#include +#include + +namespace cudaq { +namespace detail { + +/// Generates all possible states for the given dimensions ordered according +/// to the sequence of degrees (ordering is relevant if dimensions differ). 
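+/// Example (illustration): for degrees {1, 0} and dimensions {{0, 2}, {1, 2}},
+/// this produces {"00", "01", "10", "11"}, where the first digit of each
+/// state is the state of degree 1 and the second digit is the state of
+/// degree 0.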
+std::vector<std::string>
+generate_all_states(const std::vector<int> &degrees,
+                    const std::unordered_map<int, int> &dimensions);
+
+// Permutes the given matrix according to the given permutation.
+// If states is the current order of vector entries on which the given matrix
+// acts, and permuted_states is the desired order of an array on which the
+// permuted matrix should act, then the permutation is defined such that
+// [states[i] for i in permutation] produces permuted_states.
+void permute_matrix(cudaq::matrix_2 &matrix,
+                    const std::vector<int> &permutation);
+
+// Sorts the degrees into canonical order (in place).
+void canonicalize_degrees(std::vector<int> &degrees);
+} // namespace detail
+} // namespace cudaq
diff --git a/runtime/cudaq/dynamics/manipulation.cpp b/runtime/cudaq/dynamics/manipulation.cpp
new file mode 100644
index 0000000000..fa9387cff0
--- /dev/null
+++ b/runtime/cudaq/dynamics/manipulation.cpp
@@ -0,0 +1,136 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include <algorithm>
+#include <cassert>
+#include <set>
+
+#include "helpers.h"
+#include "manipulation.h"
+
+namespace cudaq {
+
+// EvaluatedMatrix class
+
+const std::vector<int> &EvaluatedMatrix::degrees() const {
+  return this->targets;
+}
+
+const matrix_2 &EvaluatedMatrix::matrix() const { return this->value; }
+
+EvaluatedMatrix::EvaluatedMatrix(const std::vector<int> &degrees,
+                                 const matrix_2 &matrix)
+    : targets(degrees), value(matrix) {
+#if !defined(NDEBUG)
+  std::set<int> unique_degrees;
+  for (auto d : degrees)
+    unique_degrees.insert(d);
+  assert(unique_degrees.size() == degrees.size());
+#endif
+}
+
+EvaluatedMatrix::EvaluatedMatrix(EvaluatedMatrix &&other)
+    : targets(std::move(other.targets)), value(std::move(other.value)) {}
+
+EvaluatedMatrix &EvaluatedMatrix::operator=(EvaluatedMatrix &&other) {
+  if (this != &other) {
+    this->targets = std::move(other.targets);
+    this->value = std::move(other.value);
+  }
+  return *this;
+}
+
+// MatrixArithmetics
+
+MatrixArithmetics::MatrixArithmetics(
+    std::unordered_map<int, int> &dimensions,
+    const std::unordered_map<std::string, std::complex<double>> &parameters)
+    : m_dimensions(dimensions), m_parameters(parameters) {}
+
+std::vector<int>
+MatrixArithmetics::compute_permutation(const std::vector<int> &op_degrees,
+                                       const std::vector<int> &canon_degrees) {
+  assert(op_degrees.size() == canon_degrees.size());
+  auto states =
+      cudaq::detail::generate_all_states(canon_degrees, m_dimensions);
+
+  std::vector<int> reordering;
+  for (auto degree : op_degrees) {
+    auto it = std::find(canon_degrees.cbegin(), canon_degrees.cend(), degree);
+    reordering.push_back(it - canon_degrees.cbegin());
+  }
+
+  std::vector<std::string> op_states =
+      cudaq::detail::generate_all_states(op_degrees, m_dimensions);
+
+  std::vector<int> permutation;
+  for (auto state : states) {
+    std::string term;
+    for (auto i : reordering) {
+      term += state[i];
+    }
+    auto it = std::find(op_states.cbegin(), op_states.cend(), term);
+    permutation.push_back(it - op_states.cbegin());
+  }
+
+  return permutation;
+}
+
+// Given a matrix representation that acts on the given degrees of freedom,
+// sorts the degrees and permutes the matrix to match that canonical order.
+// Both the matrix and the degrees are updated in place.
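+// Example (illustration): canonical order is descending (see
+// canonicalize_degrees), so a matrix acting on degrees {0, 1} is reordered
+// to act on {1, 0}; the basis states "01" and "10" trade places, and the
+// corresponding rows and columns of the matrix are permuted accordingly.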
+void MatrixArithmetics::canonicalize(matrix_2 &matrix, + std::vector °rees) { + auto current_degrees = degrees; + cudaq::detail::canonicalize_degrees(degrees); + if (current_degrees != degrees) { + auto permutation = this->compute_permutation(current_degrees, degrees); + cudaq::detail::permute_matrix(matrix, permutation); + } +} + +EvaluatedMatrix MatrixArithmetics::tensor(EvaluatedMatrix op1, + EvaluatedMatrix op2) { + std::vector degrees; + auto op1_degrees = op1.degrees(); + auto op2_degrees = op2.degrees(); + degrees.reserve(op1_degrees.size() + op2_degrees.size()); + for (auto d : op1_degrees) + degrees.push_back(d); + for (auto d : op2_degrees) { + assert(std::find(degrees.cbegin(), degrees.cend(), d) == degrees.cend()); + degrees.push_back(d); + } + auto matrix = cudaq::kronecker(op1.matrix(), op2.matrix()); + this->canonicalize(matrix, degrees); + return EvaluatedMatrix(std::move(degrees), std::move(matrix)); +} + +EvaluatedMatrix MatrixArithmetics::mul(EvaluatedMatrix op1, + EvaluatedMatrix op2) { + // Elementary operators have sorted degrees such that we have a unique + // convention for how to define the matrix. Tensor products permute the + // computed matrix if necessary to guarantee that all operators always have + // sorted degrees. + auto degrees = op1.degrees(); + assert(degrees == op2.degrees()); + return EvaluatedMatrix(std::move(degrees), (op1.matrix() * op2.matrix())); +} + +EvaluatedMatrix MatrixArithmetics::add(EvaluatedMatrix op1, + EvaluatedMatrix op2) { + // Elementary operators have sorted degrees such that we have a unique + // convention for how to define the matrix. Tensor products permute the + // computed matrix if necessary to guarantee that all operators always have + // sorted degrees. + auto degrees = op1.degrees(); + assert(degrees == op2.degrees()); + return EvaluatedMatrix(std::move(degrees), op1.matrix() + op2.matrix()); +} + +} // namespace cudaq \ No newline at end of file diff --git a/runtime/cudaq/dynamics/manipulation.h b/runtime/cudaq/dynamics/manipulation.h new file mode 100644 index 0000000000..65c3f4188a --- /dev/null +++ b/runtime/cudaq/dynamics/manipulation.h @@ -0,0 +1,84 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include "cudaq/utils/tensor.h" +#include +#include + +namespace cudaq { + +template +class OperatorArithmetics { +public: + /// @brief Accesses the relevant data to evaluate an operator expression + /// in the leaf nodes, that is in elementary and scalar operators. + // template + // TEval evaluate(HandlerTy &op); + + /// @brief Adds two operators that act on the same degrees of freedom. + TEval add(TEval val1, TEval val2); + + /// @brief Multiplies two operators that act on the same degrees of freedom. + TEval mul(TEval val1, TEval val2); + + /// @brief Computes the tensor product of two operators that act on different + /// degrees of freedom. 
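+  /// For the MatrixArithmetics specialization below, this is the Kronecker
+  /// product of the two matrices, permuted into canonical degree order.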
+ TEval tensor(TEval val1, TEval val2); +}; + +class EvaluatedMatrix { +private: + std::vector targets; + matrix_2 value; + +public: + const std::vector °rees() const; + + const matrix_2 &matrix() const; + + EvaluatedMatrix(const std::vector °rees, const matrix_2 &matrix); + EvaluatedMatrix(EvaluatedMatrix &&other); + + // delete copy constructor and copy assignment to avoid unnecessary copies + EvaluatedMatrix(const EvaluatedMatrix &other) = delete; + EvaluatedMatrix &operator=(const EvaluatedMatrix &other) = delete; + + EvaluatedMatrix &operator=(EvaluatedMatrix &&other); +}; + +/// Encapsulates the functions needed to compute the matrix representation +/// of an operator expression. +class MatrixArithmetics : public OperatorArithmetics { +private: + std::vector compute_permutation(const std::vector &op_degrees, + const std::vector &canon_degrees); + + void canonicalize(matrix_2 &op_matrix, std::vector &op_degrees); + +public: + std::unordered_map m_dimensions; // may be updated during evaluation + const std::unordered_map> m_parameters; + + MatrixArithmetics( + std::unordered_map &dimensions, + const std::unordered_map> ¶meters); + + // Computes the tensor product of two evaluate operators that act on + // different degrees of freedom using the kronecker product. + EvaluatedMatrix tensor(EvaluatedMatrix op1, EvaluatedMatrix op2); + // Multiplies two evaluated operators that act on the same degrees + // of freedom. + EvaluatedMatrix mul(EvaluatedMatrix op1, EvaluatedMatrix op2); + // Adds two evaluated operators that act on the same degrees + // of freedom. + EvaluatedMatrix add(EvaluatedMatrix op1, EvaluatedMatrix op2); +}; + +} // namespace cudaq \ No newline at end of file diff --git a/runtime/cudaq/dynamics/matrix_operators.cpp b/runtime/cudaq/dynamics/matrix_operators.cpp new file mode 100644 index 0000000000..42a9bb2efe --- /dev/null +++ b/runtime/cudaq/dynamics/matrix_operators.cpp @@ -0,0 +1,397 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
*
+ ******************************************************************************/
+
+#include <cmath>
+#include <complex>
+#include <typeinfo>
+
+#include "boson_operators.h"
+#include "cudaq/operators.h"
+#include "cudaq/utils/tensor.h"
+#include "matrix_operators.h"
+#include "spin_operators.h"
+
+namespace cudaq {
+
+#if !defined(NDEBUG)
+bool matrix_operator::can_be_canonicalized = false;
+#endif
+
+// tools for custom operators
+
+std::unordered_map<std::string, Definition> matrix_operator::m_ops = {};
+
+template <typename T>
+std::string matrix_operator::type_prefix() {
+  return typeid(T).name();
+}
+
+// no need to prefix the operator id and op code with the type name for these
+// (same names mean the same thing)
+template <>
+std::string matrix_operator::type_prefix<spin_operator>() {
+  return "";
+}
+template <>
+std::string matrix_operator::type_prefix<boson_operator>() {
+  return "";
+}
+
+void matrix_operator::define(std::string operator_id,
+                             std::vector<int> expected_dimensions,
+                             MatrixCallbackFunction &&create) {
+  auto defn = Definition(operator_id, expected_dimensions,
+                         std::forward<MatrixCallbackFunction>(create));
+  auto result = matrix_operator::m_ops.insert({operator_id, std::move(defn)});
+  if (!result.second) {
+    throw std::runtime_error("a matrix operator with name " + operator_id +
+                             " is already defined");
+  }
+}
+
+product_operator<matrix_operator>
+matrix_operator::instantiate(std::string operator_id,
+                             const std::vector<int> &degrees) {
+  auto it = matrix_operator::m_ops.find(operator_id);
+  if (it == matrix_operator::m_ops.end())
+    throw std::range_error("no matrix operator with the name '" + operator_id +
+                           "' has been defined");
+  return product_operator<matrix_operator>(
+      matrix_operator(operator_id, degrees));
+}
+
+product_operator<matrix_operator>
+matrix_operator::instantiate(std::string operator_id,
+                             std::vector<int> &&degrees) {
+  auto it = matrix_operator::m_ops.find(operator_id);
+  if (it == matrix_operator::m_ops.end())
+    throw std::range_error("no matrix operator with the name '" + operator_id +
+                           "' has been defined");
+  return product_operator<matrix_operator>(
+      matrix_operator(operator_id, std::move(degrees)));
+}
+
+// read-only properties
+
+std::string matrix_operator::unique_id() const {
+  auto it = this->targets.cbegin();
+  auto str = this->op_code + std::to_string(*it);
+  while (++it != this->targets.cend())
+    str += "." + std::to_string(*it);
+  return std::move(str);
+}
+
+std::vector<int> matrix_operator::degrees() const { return this->targets; }
+
+// constructors
+
+matrix_operator::matrix_operator(int degree) {
+  std::string op_code = "I";
+  if (matrix_operator::m_ops.find(op_code) == matrix_operator::m_ops.end()) {
+    auto func =
+        [](const std::vector<int> &dimensions,
+           const std::unordered_map<std::string, std::complex<double>>
+               &_none) {
+          std::size_t dimension = dimensions[0];
+          auto mat = matrix_2(dimension, dimension);
+
+          // Build up the identity matrix.
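+          // (The operator is registered below with expected dimension -1,
+          // i.e. it is defined for any number of levels; for dimension 3
+          // this loop produces diag(1, 1, 1).)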
+ for (std::size_t i = 0; i < dimension; i++) { + mat[{i, i}] = 1.0 + 0.0j; + } + return mat; + }; + matrix_operator::define(op_code, {-1}, std::move(func)); + } + this->op_code = op_code; + this->targets.push_back(degree); +} + +matrix_operator::matrix_operator(std::string operator_id, + const std::vector °rees) + : op_code(operator_id), targets(degrees) { + assert(this->targets.size() > 0); +} + +matrix_operator::matrix_operator(std::string operator_id, + std::vector &°rees) + : op_code(operator_id), targets(std::move(degrees)) { + assert(this->targets.size() > 0); +} + +template , bool>> +matrix_operator::matrix_operator(const T &other) { + this->targets = other.degrees(); + this->op_code = matrix_operator::type_prefix() + other.to_string(false); + if (matrix_operator::m_ops.find(this->op_code) == + matrix_operator::m_ops.end()) { + auto func = [targets = other.degrees(), other]( + const std::vector &dimensions, + const std::unordered_map> + &_none) { + std::unordered_map dims; + for (auto i = 0; i < dimensions.size(); ++i) + dims[targets[i]] = dimensions[i]; + return other.to_matrix(dims, std::move(_none)); + }; + // the to_matrix method on the spin op will check the dimensions, so we + // allow arbitrary here + std::vector required_dimensions(this->targets.size(), -1); + matrix_operator::define(this->op_code, std::move(required_dimensions), + func); + } +} + +template matrix_operator::matrix_operator(const spin_operator &other); +template matrix_operator::matrix_operator(const boson_operator &other); + +matrix_operator::matrix_operator(const matrix_operator &other) + : targets(other.targets), op_code(other.op_code) {} + +matrix_operator::matrix_operator(matrix_operator &&other) + : targets(std::move(other.targets)), op_code(other.op_code) {} + +// assignments + +matrix_operator &matrix_operator::operator=(const matrix_operator &other) { + if (this != &other) { + this->targets = other.targets; + this->op_code = other.op_code; + } + return *this; +} + +template ::value && + std::is_base_of_v, + bool>> +matrix_operator &matrix_operator::operator=(const T &other) { + *this = matrix_operator(other); + return *this; +} + +template matrix_operator & +matrix_operator::operator=(const spin_operator &other); +template matrix_operator & +matrix_operator::operator=(const boson_operator &other); + +matrix_operator &matrix_operator::operator=(matrix_operator &&other) { + if (this != &other) { + this->targets = std::move(other.targets); + this->op_code = other.op_code; + } + return *this; +} + +// evaluations + +matrix_2 matrix_operator::to_matrix( + std::unordered_map &dimensions, + const std::unordered_map> ¶meters) + const { + auto it = matrix_operator::m_ops.find(this->op_code); + if (it == matrix_operator::m_ops.end()) + throw std::range_error("unable to find operator"); + + std::vector relevant_dimensions; + relevant_dimensions.reserve(this->targets.size()); + for (auto i = 0; i < this->targets.size(); ++i) { + auto entry = dimensions.find(this->targets[i]); + auto expected_dim = it->second.expected_dimensions[i]; + if (expected_dim <= 0) { + if (entry == dimensions.end()) + throw std::runtime_error("missing dimension for degree " + + std::to_string(this->targets[i])); + relevant_dimensions.push_back(entry->second); + } else { + if (entry == dimensions.end()) + dimensions[this->targets[i]] = expected_dim; + else if (entry->second != expected_dim) + throw std::runtime_error( + "invalid dimension for degree " + std::to_string(this->targets[i]) + + ", expected dimension is " + 
std::to_string(expected_dim)); + relevant_dimensions.push_back(expected_dim); + } + } + + return it->second.generate_matrix(relevant_dimensions, parameters); +} + +std::string matrix_operator::to_string(bool include_degrees) const { + if (!include_degrees) + return this->op_code; + else if (this->targets.size() == 0) + return this->op_code + "()"; + auto it = this->targets.cbegin(); + std::string str = this->op_code + "(" + std::to_string(*it); + while (++it != this->targets.cend()) + str += ", " + std::to_string(*it); + return str + ")"; +} + +// comparisons + +bool matrix_operator::operator==(const matrix_operator &other) const { + return this->op_code == other.op_code && this->targets == other.targets; +} + +// predefined operators + +operator_sum matrix_operator::empty() { + return operator_handler::empty(); +} + +product_operator matrix_operator::identity() { + return operator_handler::identity(); +} + +product_operator matrix_operator::identity(int degree) { + return product_operator(matrix_operator(degree)); +} + +product_operator matrix_operator::number(int degree) { + std::string op_code = "number"; + if (matrix_operator::m_ops.find(op_code) == matrix_operator::m_ops.end()) { + auto func = [](const std::vector &dimensions, + const std::unordered_map> &_none) { + std::size_t dimension = dimensions[0]; + auto mat = matrix_2(dimension, dimension); + for (std::size_t i = 0; i < dimension; i++) { + mat[{i, i}] = static_cast(i) + 0.0j; + } + return mat; + }; + matrix_operator::define(op_code, {-1}, func); + } + auto op = matrix_operator(op_code, {degree}); + return product_operator(std::move(op)); +} + +product_operator matrix_operator::parity(int degree) { + std::string op_code = "parity"; + if (matrix_operator::m_ops.find(op_code) == matrix_operator::m_ops.end()) { + auto func = [](const std::vector &dimensions, + const std::unordered_map> &_none) { + std::size_t dimension = dimensions[0]; + auto mat = matrix_2(dimension, dimension); + for (std::size_t i = 0; i < dimension; i++) { + mat[{i, i}] = std::pow(-1., static_cast(i)) + 0.0j; + } + return mat; + }; + matrix_operator::define(op_code, {-1}, func); + } + auto op = matrix_operator(op_code, {degree}); + return product_operator(std::move(op)); +} + +product_operator matrix_operator::position(int degree) { + std::string op_code = "position"; + if (matrix_operator::m_ops.find(op_code) == matrix_operator::m_ops.end()) { + auto func = + [](const std::vector &dimensions, + const std::unordered_map> &_none) { + std::size_t dimension = dimensions[0]; + auto mat = matrix_2(dimension, dimension); + // position = 0.5 * (create + annihilate) + for (std::size_t i = 0; i + 1 < dimension; i++) { + mat[{i + 1, i}] = + 0.5 * std::sqrt(static_cast(i + 1)) + 0.0 * 'j'; + mat[{i, i + 1}] = + 0.5 * std::sqrt(static_cast(i + 1)) + 0.0 * 'j'; + } + return mat; + }; + matrix_operator::define(op_code, {-1}, func); + } + auto op = matrix_operator(op_code, {degree}); + return product_operator(std::move(op)); +} + +product_operator matrix_operator::momentum(int degree) { + std::string op_code = "momentum"; + if (matrix_operator::m_ops.find(op_code) == matrix_operator::m_ops.end()) { + auto func = + [](const std::vector &dimensions, + const std::unordered_map> &_none) { + std::size_t dimension = dimensions[0]; + auto mat = matrix_2(dimension, dimension); + // momentum = 0.5j * (create - annihilate) + for (std::size_t i = 0; i + 1 < dimension; i++) { + mat[{i + 1, i}] = + (0.5j) * std::sqrt(static_cast(i + 1)) + 0.0 * 'j'; + mat[{i, i + 1}] = + -1. 
* (0.5j) * std::sqrt(static_cast(i + 1)) + + 0.0 * 'j'; + } + return mat; + }; + matrix_operator::define(op_code, {-1}, func); + } + auto op = matrix_operator(op_code, {degree}); + return product_operator(std::move(op)); +} + +product_operator matrix_operator::displace(int degree) { + std::string op_code = "displace"; + if (matrix_operator::m_ops.find(op_code) == matrix_operator::m_ops.end()) { + auto func = [](const std::vector &dimensions, + const std::unordered_map> + ¶meters) { + std::size_t dimension = dimensions[0]; + auto entry = parameters.find("displacement"); + if (entry == parameters.end()) + throw std::runtime_error("missing value for parameter 'displacement'"); + auto displacement_amplitude = entry->second; + auto create = matrix_2(dimension, dimension); + auto annihilate = matrix_2(dimension, dimension); + for (std::size_t i = 0; i + 1 < dimension; i++) { + create[{i + 1, i}] = std::sqrt(static_cast(i + 1)) + 0.0 * 'j'; + annihilate[{i, i + 1}] = + std::sqrt(static_cast(i + 1)) + 0.0 * 'j'; + } + auto term1 = displacement_amplitude * create; + auto term2 = std::conj(displacement_amplitude) * annihilate; + return (term1 - term2).exponential(); + }; + matrix_operator::define(op_code, {-1}, func); + } + auto op = matrix_operator(op_code, {degree}); + return product_operator(std::move(op)); +} + +product_operator matrix_operator::squeeze(int degree) { + std::string op_code = "squeeze"; + if (matrix_operator::m_ops.find(op_code) == matrix_operator::m_ops.end()) { + auto func = [](const std::vector &dimensions, + const std::unordered_map> + ¶meters) { + std::size_t dimension = dimensions[0]; + auto entry = parameters.find("squeezing"); + if (entry == parameters.end()) + throw std::runtime_error("missing value for parameter 'squeezing'"); + auto squeezing = entry->second; + auto create = matrix_2(dimension, dimension); + auto annihilate = matrix_2(dimension, dimension); + for (std::size_t i = 0; i + 1 < dimension; i++) { + create[{i + 1, i}] = std::sqrt(static_cast(i + 1)) + 0.0 * 'j'; + annihilate[{i, i + 1}] = + std::sqrt(static_cast(i + 1)) + 0.0 * 'j'; + } + auto term1 = std::conj(squeezing) * annihilate.power(2); + auto term2 = squeezing * create.power(2); + auto difference = 0.5 * (term1 - term2); + return difference.exponential(); + }; + matrix_operator::define(op_code, {-1}, func); + } + auto op = matrix_operator(op_code, {degree}); + return product_operator(std::move(op)); +} + +// tools for custom operators + +} // namespace cudaq \ No newline at end of file diff --git a/runtime/cudaq/dynamics/matrix_operators.h b/runtime/cudaq/dynamics/matrix_operators.h new file mode 100644 index 0000000000..86bf4e88de --- /dev/null +++ b/runtime/cudaq/dynamics/matrix_operators.h @@ -0,0 +1,165 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#pragma once + +#include "cudaq/operators.h" +#include "cudaq/utils/tensor.h" +#include "templates.h" +#include +#include +#include + +namespace cudaq { + +template +class product_operator; + +class matrix_operator : public operator_handler { + +private: + static std::unordered_map m_ops; + + // used when converting other operators to matrix operators + template + static std::string type_prefix(); + +protected: + std::vector targets; + std::string op_code; + + matrix_operator(std::string operator_id, const std::vector °rees); + matrix_operator(std::string operator_id, std::vector &°rees); + +public: +#if !defined(NDEBUG) + static bool + can_be_canonicalized; // needs to be false; no canonical order can be + // defined for matrix operator expressions +#endif + + // tools for custom operators + + /// @brief Adds the definition of an elementary operator with the given id to + /// the class. After definition, an the defined elementary operator can be + /// instantiated by providing the operator id as well as the degree(s) of + /// freedom that it acts on. An elementary operator is a parameterized object + /// acting on certain degrees of freedom. To evaluate an operator, for example + /// to compute its matrix, the level, that is the dimension, for each degree + /// of freedom it acts on must be provided, as well as all additional + /// parameters. Additional parameters must be provided in the form of keyword + /// arguments. Note: The dimensions passed during operator evaluation are + /// automatically validated against the expected dimensions specified during + /// definition - the `create` function does not need to do this. + /// @arg operator_id : A string that uniquely identifies the defined operator. + /// @arg expected_dimensions : Defines the number of levels, that is the + /// dimension, + /// for each degree of freedom in canonical (that is sorted) order. A + /// negative or zero value for one (or more) of the expected dimensions + /// indicates that the operator is defined for any dimension of the + /// corresponding degree of freedom. + /// @arg create : Takes any number of complex-valued arguments and returns the + /// matrix representing the operator in canonical order. If the matrix + /// can be defined for any number of levels for one or more degree of + /// freedom, the `create` function must take an argument called + /// `dimension` (or `dim` for short), if the operator acts on a single + /// degree of freedom, and an argument called `dimensions` (or `dims` for + /// short), if the operator acts + /// on multiple degrees of freedom. + static void define(std::string operator_id, + std::vector expected_dimensions, + MatrixCallbackFunction &&create); + + /// @brief Instantiates a custom operator. + /// @arg operator_id : The ID of the operator as specified when it was + /// defined. + /// @arg degrees : the degrees of freedom that the operator acts upon. + static product_operator + instantiate(std::string operator_id, const std::vector °rees); + + /// @brief Instantiates a custom operator. + /// @arg operator_id : The ID of the operator as specified when it was + /// defined. + /// @arg degrees : the degrees of freedom that the operator acts upon. + static product_operator + instantiate(std::string operator_id, std::vector &°rees); + + // read-only properties + + virtual std::string unique_id() const; + + /// @brief The degrees of freedom that the operator acts on in canonical + /// order. 
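+  /// Example of defining and then instantiating a custom operator (a sketch
+  /// for the `define`/`instantiate` tools documented above; the operator id
+  /// "flip" is invented):
+  ///
+  ///   cudaq::matrix_operator::define(
+  ///       "flip", {2},
+  ///       [](const std::vector<int> &dims,
+  ///          const std::unordered_map<std::string, std::complex<double>>
+  ///              &params) {
+  ///         auto mat = cudaq::matrix_2(2, 2);
+  ///         mat[{0, 1}] = 1.0;
+  ///         mat[{1, 0}] = 1.0;
+  ///         return mat;
+  ///       });
+  ///   auto op = cudaq::matrix_operator::instantiate("flip", {0});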
+ virtual std::vector degrees() const; + + // constructors and destructors + + matrix_operator(int target); + + template , + bool> = true> + matrix_operator(const T &other); + + // copy constructor + matrix_operator(const matrix_operator &other); + + // move constructor + matrix_operator(matrix_operator &&other); + + ~matrix_operator() = default; + + // assignments + + template ::value && + std::is_base_of_v, + bool> = true> + matrix_operator &operator=(const T &other); + + // assignment operator + matrix_operator &operator=(const matrix_operator &other); + + // move assignment operator + matrix_operator &operator=(matrix_operator &&other); + + // evaluations + + /// @brief Return the `matrix_operator` as a matrix. + /// @arg `dimensions` : A map specifying the number of levels, + /// that is, the dimension of each degree of freedom + /// that the operator acts on. Example for two, 2-level + /// degrees of freedom: `{0 : 2, 1 : 2}`. + virtual matrix_2 + to_matrix(std::unordered_map &dimensions, + const std::unordered_map> + ¶meters = {}) const; + + virtual std::string to_string(bool include_degrees) const; + + // comparisons + + /// @brief True, if the other value is an elementary operator with the same id + /// acting on the same degrees of freedom, and False otherwise. + bool operator==(const matrix_operator &other) const; + + // predefined operators + + static operator_sum empty(); + static product_operator identity(); + + static product_operator identity(int degree); + static product_operator number(int degree); + static product_operator parity(int degree); + static product_operator position(int degree); + static product_operator momentum(int degree); + /// Operators that accept parameters at runtime. + static product_operator squeeze(int degree); + static product_operator displace(int degree); +}; + +} // namespace cudaq \ No newline at end of file diff --git a/runtime/cudaq/dynamics/operator_leafs.h b/runtime/cudaq/dynamics/operator_leafs.h new file mode 100644 index 0000000000..cd0cc39561 --- /dev/null +++ b/runtime/cudaq/dynamics/operator_leafs.h @@ -0,0 +1,188 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include +#include +#include +#include +#include + +#include "callback.h" +#include "cudaq/utils/tensor.h" + +namespace cudaq { + +class scalar_operator { + +private: + // If someone gave us a constant value, we will just return that + // directly to them when they call `evaluate`. + std::variant, ScalarCallbackFunction> value; + +public: + // constructors and destructors + + scalar_operator(double value); + + bool is_constant() const; + + /// @brief Constructor that just takes and returns a complex double value. + /// @NOTE: This replicates the behavior of the python `scalar_operator::const` + /// without the need for an extra member function. + scalar_operator(std::complex value); + + scalar_operator(const ScalarCallbackFunction &create); + + /// @brief Constructor that just takes a callback function with no + /// arguments. 
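+  /// Example (a sketch; the parameter name "t" is invented):
+  ///
+  ///   cudaq::scalar_operator s(
+  ///       [](const std::unordered_map<std::string, std::complex<double>>
+  ///              &params) { return 2.0 * params.at("t"); });
+  ///   auto value = s.evaluate({{"t", 0.5}}); // yields 1.0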
+ scalar_operator(ScalarCallbackFunction &&create); + + // copy constructor + scalar_operator(const scalar_operator &other); + + // move constructor + scalar_operator(scalar_operator &&other); + + ~scalar_operator() = default; + + // assignments + + // assignment operator + scalar_operator &operator=(const scalar_operator &other); + + // move assignment operator + scalar_operator &operator=(scalar_operator &&other); + + // evaluations + + /// @brief Return the scalar operator as a concrete complex value. + std::complex + evaluate(const std::unordered_map> + ¶meters = {}) const; + + // Return the scalar operator as a 1x1 matrix. This is needed for + // compatibility with the other inherited classes. + matrix_2 to_matrix(const std::unordered_map> + ¶meters = {}) const; + + std::string to_string() const; + + // comparisons + + bool operator==(scalar_operator other) const; + + // unary operators + + scalar_operator operator-() const &; + scalar_operator operator-() &&; + scalar_operator operator+() const &; + scalar_operator operator+() &&; + + // right-hand arithmetics + + scalar_operator operator*(double other) const &; + scalar_operator operator*(double other) &&; + scalar_operator operator/(double other) const &; + scalar_operator operator/(double other) &&; + scalar_operator operator+(double other) const &; + scalar_operator operator+(double other) &&; + scalar_operator operator-(double other) const &; + scalar_operator operator-(double other) &&; + scalar_operator& operator*=(double other); + scalar_operator& operator/=(double other); + scalar_operator& operator+=(double other); + scalar_operator& operator-=(double other); + scalar_operator operator*(std::complex other) const &; + scalar_operator operator*(std::complex other) &&; + scalar_operator operator/(std::complex other) const &; + scalar_operator operator/(std::complex other) &&; + scalar_operator operator+(std::complex other) const &; + scalar_operator operator+(std::complex other) &&; + scalar_operator operator-(std::complex other) const &; + scalar_operator operator-(std::complex other) &&; + scalar_operator& operator*=(std::complex other); + scalar_operator& operator/=(std::complex other); + scalar_operator& operator+=(std::complex other); + scalar_operator& operator-=(std::complex other); + scalar_operator operator*(const scalar_operator &other) const &; + // scalar_operator operator*(const scalar_operator &other) &&; + scalar_operator operator/(const scalar_operator &other) const &; + scalar_operator operator/(const scalar_operator &other) &&; + scalar_operator operator+(const scalar_operator &other) const &; + scalar_operator operator+(const scalar_operator &other) &&; + scalar_operator operator-(const scalar_operator &other) const &; + scalar_operator operator-(const scalar_operator &other) &&; + scalar_operator& operator*=(const scalar_operator &other); + scalar_operator& operator/=(const scalar_operator &other); + scalar_operator& operator+=(const scalar_operator &other); + scalar_operator& operator-=(const scalar_operator &other); + + // left-hand arithmetics + + friend scalar_operator operator*(double other, const scalar_operator &self); + friend scalar_operator operator*(double other, scalar_operator &&self); + friend scalar_operator operator/(double other, const scalar_operator &self); + friend scalar_operator operator/(double other, scalar_operator &&self); + friend scalar_operator operator+(double other, const scalar_operator &self); + friend scalar_operator operator+(double other, scalar_operator &&self); + friend 
scalar_operator operator-(double other, const scalar_operator &self); + friend scalar_operator operator-(double other, scalar_operator &&self); + friend scalar_operator operator*(std::complex other, const scalar_operator &self); + friend scalar_operator operator*(std::complex other, scalar_operator &&self); + friend scalar_operator operator/(std::complex other, const scalar_operator &self); + friend scalar_operator operator/(std::complex other, scalar_operator &&self); + friend scalar_operator operator+(std::complex other, const scalar_operator &self); + friend scalar_operator operator+(std::complex other, scalar_operator &&self); + friend scalar_operator operator-(std::complex other, const scalar_operator &self); + friend scalar_operator operator-(std::complex other, scalar_operator &&self); +}; + +template +class product_operator; + +template +class operator_sum; + +class operator_handler { +public: +#if !defined(NDEBUG) + static bool can_be_canonicalized; // whether a canonical order can be defined + // for operator expressions +#endif + + virtual ~operator_handler() = default; + + virtual std::string unique_id() const = 0; + + virtual std::vector degrees() const = 0; + + /// @brief Return the `matrix_operator` as a matrix. + /// @arg `dimensions` : A map specifying the number of levels, + /// that is, the dimension of each degree of freedom + /// that the operator acts on. Example for two, 2-level + /// degrees of freedom: `{0 : 2, 1 : 2}`. + virtual matrix_2 + to_matrix(std::unordered_map &dimensions, + const std::unordered_map> + ¶meters = {}) const = 0; + + virtual std::string to_string(bool include_degrees = true) const = 0; + + template + static operator_sum empty(); + + template < + typename HandlerTy, typename... Args, + std::enable_if_t...>::value, + bool> = true> + static product_operator identity(Args... targets); +}; + +} // namespace cudaq \ No newline at end of file diff --git a/runtime/cudaq/dynamics/operator_sum.cpp b/runtime/cudaq/dynamics/operator_sum.cpp new file mode 100644 index 0000000000..e67ddcbeb5 --- /dev/null +++ b/runtime/cudaq/dynamics/operator_sum.cpp @@ -0,0 +1,1085 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#include +#include +#include +#include + +#include "boson_operators.h" +#include "cudaq/operators.h" +#include "helpers.h" +#include "manipulation.h" +#include "matrix_operators.h" +#include "spin_operators.h" + +namespace cudaq { + +// private methods + +template +void operator_sum::insert(const product_operator &other) { + auto term_id = other.get_term_id(); + auto it = this->term_map.find(term_id); + if (it == this->term_map.cend()) { + this->coefficients.push_back(other.coefficient); + this->term_map.insert( + it, std::make_pair(std::move(term_id), this->terms.size())); + this->terms.push_back(other.operators); + } else { + this->coefficients[it->second] += other.coefficient; + } +} + +template +void operator_sum::insert(product_operator &&other) { + auto term_id = other.get_term_id(); + auto it = this->term_map.find(term_id); + if (it == this->term_map.cend()) { + this->coefficients.push_back(std::move(other.coefficient)); + this->term_map.insert( + it, std::make_pair(std::move(term_id), this->terms.size())); + this->terms.push_back(std::move(other.operators)); + } else { + this->coefficients[it->second] += other.coefficient; + } +} + +template +void operator_sum::aggregate_terms() {} + +template +template +void operator_sum::aggregate_terms( + product_operator &&head, Args &&...args) { + this->insert(std::forward>(head)); + aggregate_terms(std::forward(args)...); +} + +template +EvaluatedMatrix +operator_sum::m_evaluate(MatrixArithmetics arithmetics, + bool pad_terms) const { + + auto terms = this->get_terms(); + auto degrees = this->degrees(); + + // We need to make sure all matrices are of the same size to sum them up. + auto paddedTerm = [&arithmetics, °rees = std::as_const(degrees)]( + product_operator &&term) { + std::vector prod_ops; + prod_ops.reserve(degrees.size()); + auto term_degrees = term.degrees(); + for (auto degree : degrees) { + auto it = std::find(term_degrees.begin(), term_degrees.end(), degree); + if (it == term_degrees.end()) { + HandlerTy identity(degree); + prod_ops.push_back(std::move(identity)); + } + } + product_operator prod(1, std::move(prod_ops)); + prod *= term; // ensures canonical ordering + return prod; + }; + + if (pad_terms) { + auto padded_term = paddedTerm(std::move(terms[0])); + EvaluatedMatrix sum = padded_term.m_evaluate(arithmetics, true); + for (auto term_idx = 1; term_idx < terms.size(); ++term_idx) { + padded_term = paddedTerm(std::move(terms[term_idx])); + auto term_eval = padded_term.m_evaluate(arithmetics, true); + sum = arithmetics.add(std::move(sum), std::move(term_eval)); + } + return sum; + } else { + EvaluatedMatrix sum = terms[0].m_evaluate(arithmetics, false); + for (auto term_idx = 1; term_idx < terms.size(); ++term_idx) { + auto term_eval = terms[term_idx].m_evaluate(arithmetics, false); + sum = arithmetics.add(std::move(sum), std::move(term_eval)); + } + return sum; + } +} + +#define INSTANTIATE_SUM_PRIVATE_METHODS(HandlerTy) \ + \ + template void operator_sum::aggregate_terms( \ + product_operator &&item2); \ + \ + template void operator_sum::aggregate_terms( \ + product_operator &&item1, \ + product_operator &&item2); \ + \ + template void operator_sum::aggregate_terms( \ + product_operator &&item1, \ + product_operator &&item2, \ + product_operator &&item3); \ + \ + template EvaluatedMatrix operator_sum::m_evaluate( \ + MatrixArithmetics arithmetics, bool pad_terms) const; + +INSTANTIATE_SUM_PRIVATE_METHODS(matrix_operator); 
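+// The member templates above are defined in this source file rather than in
+// the header, so they are explicitly instantiated for each supported handler
+// type; the same pattern repeats after each section of this file.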
+INSTANTIATE_SUM_PRIVATE_METHODS(spin_operator); +INSTANTIATE_SUM_PRIVATE_METHODS(boson_operator); + +// read-only properties + +template +std::vector operator_sum::degrees() const { + std::set unsorted_degrees; + for (const std::vector &term : this->terms) { + for (const HandlerTy &op : term) { + auto op_degrees = op.degrees(); + unsorted_degrees.insert(op_degrees.cbegin(), op_degrees.cend()); + } + } + auto degrees = + std::vector(unsorted_degrees.cbegin(), unsorted_degrees.cend()); + cudaq::detail::canonicalize_degrees(degrees); + return std::move(degrees); +} + +template +int operator_sum::num_terms() const { + return this->terms.size(); +} + +template +std::vector> +operator_sum::get_terms() const { + std::vector> prods; + prods.reserve(this->terms.size()); + for (size_t i = 0; i < this->terms.size(); ++i) { + prods.push_back( + product_operator(this->coefficients[i], this->terms[i])); + } + return std::move(prods); +} + +#define INSTANTIATE_SUM_PROPERTIES(HandlerTy) \ + \ + template std::vector operator_sum::degrees() const; \ + \ + template int operator_sum::num_terms() const; \ + \ + template std::vector> \ + operator_sum::get_terms() const; + +INSTANTIATE_SUM_PROPERTIES(matrix_operator); +INSTANTIATE_SUM_PROPERTIES(spin_operator); +INSTANTIATE_SUM_PROPERTIES(boson_operator); + +// constructors + +template +operator_sum::operator_sum(const product_operator &prod) { + this->insert(prod); +} + +template +template , Args>...>::value, + bool>> +operator_sum::operator_sum(Args &&...args) { + this->coefficients.reserve(sizeof...(Args)); + this->term_map.reserve(sizeof...(Args)); + this->terms.reserve(sizeof...(Args)); + aggregate_terms(std::forward &&>(args)...); +} + +template +template ::value && + std::is_constructible::value, + bool>> +operator_sum::operator_sum(const operator_sum &other) + : coefficients(other.coefficients) { + this->term_map.reserve(other.terms.size()); + this->terms.reserve(other.terms.size()); + for (const auto &operators : other.terms) { + product_operator term( + product_operator(1., operators)); // coefficient does not matter + this->term_map.insert( + this->term_map.cend(), + std::make_pair(term.get_term_id(), this->terms.size())); + this->terms.push_back(std::move(term.operators)); + } +} + +template +operator_sum::operator_sum(const operator_sum &other, + int size) { + if (size <= 0) { + this->coefficients = other.coefficients; + this->term_map = other.term_map; + this->terms = other.terms; + } else { + this->coefficients.reserve(size); + this->term_map.reserve(size); + this->terms.reserve(size); + for (const auto &coeff : other.coefficients) + this->coefficients.push_back(coeff); + for (const auto &entry : other.term_map) + this->term_map.insert(this->term_map.cend(), entry); + for (const auto &term : other.terms) + this->terms.push_back(term); + } +} + +template +operator_sum::operator_sum(operator_sum &&other, int size) + : coefficients(std::move(other.coefficients)), + term_map(std::move(other.term_map)), terms(std::move(other.terms)) { + if (size > 0) { + this->coefficients.reserve(size); + this->term_map.reserve(size); + this->terms.reserve(size); + } +} + +#define INSTANTIATE_SUM_CONSTRUCTORS(HandlerTy) \ + \ + template operator_sum::operator_sum( \ + const product_operator &item2); \ + \ + template operator_sum::operator_sum( \ + product_operator &&item2); \ + \ + template operator_sum::operator_sum( \ + product_operator &&item1, \ + product_operator &&item2); \ + \ + template operator_sum::operator_sum( \ + product_operator &&item1, \ + 
product_operator &&item2, \ + product_operator &&item3); \ + \ + template operator_sum::operator_sum( \ + const operator_sum &other, int size); \ + \ + template operator_sum::operator_sum( \ + operator_sum &&other, int size); + +template operator_sum::operator_sum( + const operator_sum &other); +template operator_sum::operator_sum( + const operator_sum &other); + +INSTANTIATE_SUM_CONSTRUCTORS(matrix_operator); +INSTANTIATE_SUM_CONSTRUCTORS(spin_operator); +INSTANTIATE_SUM_CONSTRUCTORS(boson_operator); + +// assignments + +template +template ::value && + std::is_constructible::value, + bool>> +operator_sum & +operator_sum::operator=(const product_operator &other) { + *this = product_operator(other); + return *this; +} + +template +operator_sum & +operator_sum::operator=(const product_operator &other) { + this->coefficients.clear(); + this->term_map.clear(); + this->terms.clear(); + this->coefficients.push_back(other.coefficient); + this->term_map.insert(this->term_map.cend(), + std::make_pair(other.get_term_id(), 0)); + this->terms.push_back(other.operators); + return *this; +} + +template +operator_sum & +operator_sum::operator=(product_operator &&other) { + this->coefficients.clear(); + this->term_map.clear(); + this->terms.clear(); + this->coefficients.push_back(std::move(other.coefficient)); + this->term_map.insert(this->term_map.cend(), + std::make_pair(other.get_term_id(), 0)); + this->terms.push_back(std::move(other.operators)); + return *this; +} + +template +template ::value && + std::is_constructible::value, + bool>> +operator_sum & +operator_sum::operator=(const operator_sum &other) { + *this = operator_sum(other); + return *this; +} + +template +operator_sum & +operator_sum::operator=(const operator_sum &other) { + if (this != &other) { + this->coefficients = other.coefficients; + this->term_map = other.term_map; + this->terms = other.terms; + } + return *this; +} + +template +operator_sum & +operator_sum::operator=(operator_sum &&other) { + if (this != &other) { + this->coefficients = std::move(other.coefficients); + this->term_map = std::move(other.term_map); + this->terms = std::move(other.terms); + } + return *this; +} + +#define INSTANTIATE_SUM_ASSIGNMENTS(HandlerTy) \ + \ + template operator_sum &operator_sum::operator=( \ + product_operator &&other); \ + \ + template operator_sum &operator_sum::operator=( \ + const product_operator &other); \ + \ + template operator_sum &operator_sum::operator=( \ + const operator_sum &other); \ + \ + template operator_sum &operator_sum::operator=( \ + operator_sum &&other); + +template operator_sum & +operator_sum::operator=( + const product_operator &other); +template operator_sum & +operator_sum::operator=( + const product_operator &other); +template operator_sum & +operator_sum::operator=( + const operator_sum &other); +template operator_sum & +operator_sum::operator=( + const operator_sum &other); + +INSTANTIATE_SUM_ASSIGNMENTS(matrix_operator); +INSTANTIATE_SUM_ASSIGNMENTS(spin_operator); +INSTANTIATE_SUM_ASSIGNMENTS(boson_operator); + +// evaluations + +template +std::string operator_sum::to_string() const { + auto prods = this->get_terms(); + auto it = prods.cbegin(); + std::string str = it->to_string(); + while (++it != prods.cend()) + str += " + " + it->to_string(); + return std::move(str); +} + +template +matrix_2 operator_sum::to_matrix( + std::unordered_map dimensions, + const std::unordered_map> ¶meters) + const { + return std::move( + m_evaluate(MatrixArithmetics(dimensions, parameters)).matrix()); +} + +#define 
INSTANTIATE_SUM_EVALUATIONS(HandlerTy) \ + \ + template std::string operator_sum::to_string() const; \ + \ + template matrix_2 operator_sum::to_matrix( \ + std::unordered_map dimensions, \ + const std::unordered_map> ¶ms) \ + const; + +INSTANTIATE_SUM_EVALUATIONS(matrix_operator); +INSTANTIATE_SUM_EVALUATIONS(spin_operator); +INSTANTIATE_SUM_EVALUATIONS(boson_operator); + +// unary operators + +template +operator_sum operator_sum::operator-() const & { + operator_sum sum; + sum.coefficients.reserve(this->coefficients.size()); + sum.term_map = this->term_map; + sum.terms = this->terms; + for (auto &coeff : this->coefficients) + sum.coefficients.push_back(-1. * coeff); + return std::move(sum); +} + +template +operator_sum operator_sum::operator-() && { + for (auto &coeff : this->coefficients) + coeff *= -1.; + return std::move(*this); +} + +template +operator_sum operator_sum::operator+() const & { + return *this; +} + +template +operator_sum operator_sum::operator+() && { + return std::move(*this); +} + +#define INSTANTIATE_SUM_UNARY_OPS(HandlerTy) \ + \ + template operator_sum operator_sum::operator-() \ + const &; \ + \ + template operator_sum operator_sum::operator-() &&; \ + \ + template operator_sum operator_sum::operator+() \ + const &; \ + \ + template operator_sum operator_sum::operator+() &&; + +INSTANTIATE_SUM_UNARY_OPS(matrix_operator); +INSTANTIATE_SUM_UNARY_OPS(spin_operator); +INSTANTIATE_SUM_UNARY_OPS(boson_operator); + +// right-hand arithmetics + +#define SUM_MULTIPLICATION(otherTy) \ + \ + template \ + operator_sum operator_sum::operator*(otherTy other) \ + const & { \ + operator_sum sum; \ + sum.coefficients.reserve(this->coefficients.size()); \ + sum.term_map = this->term_map; \ + sum.terms = this->terms; \ + for (const auto &coeff : this->coefficients) \ + sum.coefficients.push_back(coeff *other); \ + return std::move(sum); \ + } \ + \ + template \ + operator_sum operator_sum::operator*( \ + otherTy other) && { \ + for (auto &coeff : this->coefficients) \ + coeff *= other; \ + return std::move(*this); \ + } + +SUM_MULTIPLICATION(double); +SUM_MULTIPLICATION(std::complex); +SUM_MULTIPLICATION(const scalar_operator &); + +#define SUM_ADDITION(otherTy, op) \ + \ + template \ + operator_sum operator_sum::operator op(otherTy other) \ + const & { \ + operator_sum sum(*this, this->terms.size() + 1); \ + sum.insert(product_operator(op other)); \ + return std::move(sum); \ + } \ + \ + template \ + operator_sum operator_sum::operator op( \ + otherTy other) && { \ + this->insert(product_operator(op other)); \ + return std::move(*this); \ + } + +SUM_ADDITION(double, +); +SUM_ADDITION(double, -); +SUM_ADDITION(std::complex, +); +SUM_ADDITION(std::complex, -); +SUM_ADDITION(const scalar_operator &, +); +SUM_ADDITION(const scalar_operator &, -); + +#define INSTANTIATE_SUM_RHSIMPLE_OPS(HandlerTy) \ + \ + template operator_sum operator_sum::operator*( \ + double other) const &; \ + template operator_sum operator_sum::operator*( \ + double other) &&; \ + template operator_sum operator_sum::operator+( \ + double other) const &; \ + template operator_sum operator_sum::operator+( \ + double other) &&; \ + template operator_sum operator_sum::operator-( \ + double other) const &; \ + template operator_sum operator_sum::operator-( \ + double other) &&; \ + template operator_sum operator_sum::operator*( \ + std::complex other) const &; \ + template operator_sum operator_sum::operator*( \ + std::complex other) &&; \ + template operator_sum operator_sum::operator+( \ + std::complex other) const 
&; \ + template operator_sum operator_sum::operator+( \ + std::complex other) &&; \ + template operator_sum operator_sum::operator-( \ + std::complex other) const &; \ + template operator_sum operator_sum::operator-( \ + std::complex other) &&; \ + template operator_sum operator_sum::operator*( \ + const scalar_operator &other) const &; \ + template operator_sum operator_sum::operator*( \ + const scalar_operator &other) &&; \ + template operator_sum operator_sum::operator+( \ + const scalar_operator &other) const &; \ + template operator_sum operator_sum::operator+( \ + const scalar_operator &other) &&; \ + template operator_sum operator_sum::operator-( \ + const scalar_operator &other) const &; \ + template operator_sum operator_sum::operator-( \ + const scalar_operator &other) &&; + +INSTANTIATE_SUM_RHSIMPLE_OPS(matrix_operator); +INSTANTIATE_SUM_RHSIMPLE_OPS(spin_operator); +INSTANTIATE_SUM_RHSIMPLE_OPS(boson_operator); + +template +operator_sum operator_sum::operator*( + const product_operator &other) const { + operator_sum sum; // the entire sum needs to be rebuilt + sum.coefficients.reserve(this->coefficients.size()); + sum.term_map.reserve(this->terms.size()); + sum.terms.reserve(this->terms.size()); + for (auto i = 0; i < this->terms.size(); ++i) { + auto max_size = this->terms[i].size() + other.operators.size(); + product_operator prod(this->coefficients[i] * other.coefficient, + this->terms[i], max_size); + for (HandlerTy op : other.operators) + prod.insert(std::move(op)); + sum.insert(std::move(prod)); + } + return std::move(sum); +} + +#define SUM_ADDITION_PRODUCT(op) \ + \ + template \ + operator_sum operator_sum::operator op( \ + const product_operator &other) const & { \ + operator_sum sum(*this, this->terms.size() + 1); \ + sum.insert(op other); \ + return std::move(sum); \ + } \ + \ + template \ + operator_sum operator_sum::operator op( \ + const product_operator &other) && { \ + this->insert(op other); \ + return std::move(*this); \ + } \ + \ + template \ + operator_sum operator_sum::operator op( \ + product_operator &&other) const & { \ + operator_sum sum(*this, this->terms.size() + 1); \ + sum.insert(op std::move(other)); \ + return std::move(sum); \ + } \ + \ + template \ + operator_sum operator_sum::operator op( \ + product_operator &&other) && { \ + this->insert(op std::move(other)); \ + return std::move(*this); \ + } + +SUM_ADDITION_PRODUCT(+) +SUM_ADDITION_PRODUCT(-) + +template +operator_sum +operator_sum::operator*(const operator_sum &other) const { + operator_sum sum; // the entire sum needs to be rebuilt + auto max_size = this->terms.size() * other.terms.size(); + sum.coefficients.reserve(max_size); + sum.term_map.reserve(max_size); + sum.terms.reserve(max_size); + for (auto i = 0; i < this->terms.size(); ++i) { + for (auto j = 0; j < other.terms.size(); ++j) { + auto max_size = this->terms[i].size() + other.terms[j].size(); + product_operator prod(this->coefficients[i] * + other.coefficients[j], + this->terms[i], max_size); + for (HandlerTy op : other.terms[j]) + prod.insert(std::move(op)); + sum.insert(std::move(prod)); + } + } + return std::move(sum); +} + +#define SUM_ADDITION_SUM(op) \ + \ + template \ + operator_sum operator_sum::operator op( \ + const operator_sum &other) const & { \ + operator_sum sum(*this, \ + this->terms.size() + other.terms.size()); \ + for (auto i = 0; i < other.terms.size(); ++i) { \ + product_operator prod(op other.coefficients[i], \ + other.terms[i]); \ + sum.insert(std::move(prod)); \ + } \ + return std::move(sum); \ + } \ + \ 
+ template \ + operator_sum operator_sum::operator op( \ + const operator_sum &other) && { \ + auto max_size = this->terms.size() + other.terms.size(); \ + this->coefficients.reserve(max_size); \ + this->term_map.reserve(max_size); \ + this->terms.reserve(max_size); \ + for (auto i = 0; i < other.terms.size(); ++i) \ + this->insert(product_operator(op other.coefficients[i], \ + other.terms[i])); \ + return std::move(*this); \ + } \ + \ + template \ + operator_sum operator_sum::operator op( \ + operator_sum &&other) const & { \ + operator_sum sum(*this, \ + this->terms.size() + other.terms.size()); \ + for (auto i = 0; i < other.terms.size(); ++i) { \ + product_operator prod(op std::move(other.coefficients[i]), \ + std::move(other.terms[i])); \ + sum.insert(std::move(prod)); \ + } \ + return std::move(sum); \ + } \ + \ + template \ + operator_sum operator_sum::operator op( \ + operator_sum &&other) && { \ + auto max_size = this->terms.size() + other.terms.size(); \ + this->coefficients.reserve(max_size); \ + this->term_map.reserve(max_size); \ + this->terms.reserve(max_size); \ + for (auto i = 0; i < other.terms.size(); ++i) \ + this->insert(product_operator( \ + op std::move(other.coefficients[i]), std::move(other.terms[i]))); \ + return std::move(*this); \ + } + +SUM_ADDITION_SUM(+); +SUM_ADDITION_SUM(-); + +#define INSTANTIATE_SUM_RHCOMPOSITE_OPS(HandlerTy) \ + \ + template operator_sum operator_sum::operator*( \ + const product_operator &other) const; \ + template operator_sum operator_sum::operator+( \ + const product_operator &other) const &; \ + template operator_sum operator_sum::operator+( \ + const product_operator &other) &&; \ + template operator_sum operator_sum::operator+( \ + product_operator &&other) const &; \ + template operator_sum operator_sum::operator+( \ + product_operator &&other) &&; \ + template operator_sum operator_sum::operator-( \ + const product_operator &other) const &; \ + template operator_sum operator_sum::operator-( \ + const product_operator &other) &&; \ + template operator_sum operator_sum::operator-( \ + product_operator &&other) const &; \ + template operator_sum operator_sum::operator-( \ + product_operator &&other) &&; \ + template operator_sum operator_sum::operator*( \ + const operator_sum &other) const; \ + template operator_sum operator_sum::operator+( \ + const operator_sum &other) const &; \ + template operator_sum operator_sum::operator+( \ + const operator_sum &other) &&; \ + template operator_sum operator_sum::operator+( \ + operator_sum &&other) const &; \ + template operator_sum operator_sum::operator+( \ + operator_sum &&other) &&; \ + template operator_sum operator_sum::operator-( \ + const operator_sum &other) const &; \ + template operator_sum operator_sum::operator-( \ + const operator_sum &other) &&; \ + template operator_sum operator_sum::operator-( \ + operator_sum &&other) const &; \ + template operator_sum operator_sum::operator-( \ + operator_sum &&other) &&; + +INSTANTIATE_SUM_RHCOMPOSITE_OPS(matrix_operator); +INSTANTIATE_SUM_RHCOMPOSITE_OPS(spin_operator); +INSTANTIATE_SUM_RHCOMPOSITE_OPS(boson_operator); + +#define SUM_MULTIPLICATION_ASSIGNMENT(otherTy) \ + template \ + operator_sum &operator_sum::operator*=( \ + otherTy other) { \ + for (auto &coeff : this->coefficients) \ + coeff *= other; \ + return *this; \ + } + +SUM_MULTIPLICATION_ASSIGNMENT(double); +SUM_MULTIPLICATION_ASSIGNMENT(std::complex); +SUM_MULTIPLICATION_ASSIGNMENT(const scalar_operator &); + +#define SUM_ADDITION_ASSIGNMENT(otherTy, op) \ + template \ + 
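+  /* Example (editor's sketch): `op##=` token-pastes into `+=` or `-=`, so
+     each SUM_ADDITION_ASSIGNMENT(otherTy, op) expansion defines one compound
+     assignment that wraps the scalar in a product term and inserts it:
+
+       auto sum = cudaq::spin_operator::z(0) + cudaq::spin_operator::x(1);
+       sum += 2.0;                              // adds the term (+2) * I
+       sum -= std::complex<double>(0.0, 1.0);   // adds the term (-i) * I
+  */ \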
operator_sum &operator_sum::operator op##=( \ + otherTy other) { \ + this->insert(product_operator(op other)); \ + return *this; \ + } + +SUM_ADDITION_ASSIGNMENT(double, +); +SUM_ADDITION_ASSIGNMENT(double, -); +SUM_ADDITION_ASSIGNMENT(std::complex, +); +SUM_ADDITION_ASSIGNMENT(std::complex, -); +SUM_ADDITION_ASSIGNMENT(const scalar_operator &, +); +SUM_ADDITION_ASSIGNMENT(const scalar_operator &, -); + +template +operator_sum & +operator_sum::operator*=(const product_operator &other) { + operator_sum sum; + sum.coefficients.reserve(this->coefficients.size()); + sum.term_map.reserve(this->terms.size()); + sum.terms.reserve(this->terms.size()); + for (auto i = 0; i < this->terms.size(); ++i) { + auto max_size = this->terms[i].size() + other.operators.size(); + product_operator prod(this->coefficients[i] * other.coefficient, + this->terms[i], max_size); + for (HandlerTy op : other.operators) + prod.insert(std::move(op)); + sum.insert(std::move(prod)); + } + *this = std::move(sum); + return *this; +} + +#define SUM_ADDITION_PRODUCT_ASSIGNMENT(op) \ + \ + template \ + operator_sum &operator_sum::operator op##=( \ + const product_operator &other) { \ + this->insert(op other); \ + return *this; \ + } \ + \ + template \ + operator_sum &operator_sum::operator op##=( \ + product_operator &&other) { \ + this->insert(op std::move(other)); \ + return *this; \ + } + +SUM_ADDITION_PRODUCT_ASSIGNMENT(+) +SUM_ADDITION_PRODUCT_ASSIGNMENT(-) + +template +operator_sum & +operator_sum::operator*=(const operator_sum &other) { + operator_sum sum; // the entire sum needs to be rebuilt + auto max_size = this->terms.size() * other.terms.size(); + sum.coefficients.reserve(max_size); + sum.term_map.reserve(max_size); + sum.terms.reserve(max_size); + for (auto i = 0; i < this->terms.size(); ++i) { + for (auto j = 0; j < other.terms.size(); ++j) { + auto max_size = this->terms[i].size() + other.terms[j].size(); + product_operator prod(this->coefficients[i] * + other.coefficients[j], + this->terms[i], max_size); + for (HandlerTy op : other.terms[j]) + prod.insert(std::move(op)); + sum.insert(std::move(prod)); + } + } + *this = std::move(sum); + return *this; +} + +#define SUM_ADDITION_SUM_ASSIGNMENT(op) \ + \ + template \ + operator_sum &operator_sum::operator op##=( \ + const operator_sum &other) { \ + auto max_size = this->terms.size() + other.terms.size(); \ + this->coefficients.reserve(max_size); \ + this->term_map.reserve(max_size); \ + this->terms.reserve(max_size); \ + for (auto i = 0; i < other.terms.size(); ++i) \ + this->insert(product_operator(op other.coefficients[i], \ + other.terms[i])); \ + return *this; \ + } \ + \ + template \ + operator_sum &operator_sum::operator op##=( \ + operator_sum &&other) { \ + auto max_size = this->terms.size() + other.terms.size(); \ + this->coefficients.reserve(max_size); \ + this->term_map.reserve(max_size); \ + this->terms.reserve(max_size); \ + for (auto i = 0; i < other.terms.size(); ++i) \ + this->insert(product_operator( \ + op std::move(other.coefficients[i]), std::move(other.terms[i]))); \ + return *this; \ + } + +SUM_ADDITION_SUM_ASSIGNMENT(+); +SUM_ADDITION_SUM_ASSIGNMENT(-); + +#define INSTANTIATE_SUM_OPASSIGNMENTS(HandlerTy) \ + \ + template operator_sum &operator_sum::operator*=( \ + double other); \ + template operator_sum &operator_sum::operator+=( \ + double other); \ + template operator_sum &operator_sum::operator-=( \ + double other); \ + template operator_sum &operator_sum::operator*=( \ + std::complex other); \ + template operator_sum 
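+  /* Editor's note (sketch): this instantiation macro force-emits every
+     compound assignment for one handler type, which is what lets the
+     definitions above live in this .cpp file instead of the header. Note the
+     asymmetry they implement: += and -= append terms in place, while *= must
+     rebuild the whole sum because multiplication distributes over it:
+
+       (c1*T1 + c2*T2) *= P   ==>   c1*(T1*P) + c2*(T2*P)
+  */ \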
&operator_sum::operator+=( \ + std::complex other); \ + template operator_sum &operator_sum::operator-=( \ + std::complex other); \ + template operator_sum &operator_sum::operator*=( \ + const scalar_operator &other); \ + template operator_sum &operator_sum::operator+=( \ + const scalar_operator &other); \ + template operator_sum &operator_sum::operator-=( \ + const scalar_operator &other); \ + template operator_sum &operator_sum::operator*=( \ + const product_operator &other); \ + template operator_sum &operator_sum::operator+=( \ + const product_operator &other); \ + template operator_sum &operator_sum::operator+=( \ + product_operator &&other); \ + template operator_sum &operator_sum::operator-=( \ + const product_operator &other); \ + template operator_sum &operator_sum::operator-=( \ + product_operator &&other); \ + template operator_sum &operator_sum::operator*=( \ + const operator_sum &other); \ + template operator_sum &operator_sum::operator+=( \ + const operator_sum &other); \ + template operator_sum &operator_sum::operator+=( \ + operator_sum &&other); \ + template operator_sum &operator_sum::operator-=( \ + const operator_sum &other); \ + template operator_sum &operator_sum::operator-=( \ + operator_sum &&other); + +INSTANTIATE_SUM_OPASSIGNMENTS(matrix_operator); +INSTANTIATE_SUM_OPASSIGNMENTS(spin_operator); +INSTANTIATE_SUM_OPASSIGNMENTS(boson_operator); + +// left-hand arithmetics + +#define SUM_MULTIPLICATION_REVERSE(otherTy) \ + \ + template \ + operator_sum operator*(otherTy other, \ + const operator_sum &self) { \ + operator_sum sum; \ + sum.coefficients.reserve(self.coefficients.size()); \ + sum.terms = self.terms; \ + sum.term_map = self.term_map; \ + for (const auto &coeff : self.coefficients) \ + sum.coefficients.push_back(coeff *other); \ + return std::move(sum); \ + } \ + \ + template \ + operator_sum operator*(otherTy other, \ + operator_sum &&self) { \ + for (auto &&coeff : self.coefficients) \ + coeff *= other; \ + return std::move(self); \ + } + +SUM_MULTIPLICATION_REVERSE(double); +SUM_MULTIPLICATION_REVERSE(std::complex); +SUM_MULTIPLICATION_REVERSE(const scalar_operator &); + +#define SUM_ADDITION_REVERSE(otherTy, op) \ + \ + template \ + operator_sum operator op(otherTy other, \ + const operator_sum &self) { \ + operator_sum sum(op self); \ + sum.insert(product_operator(other)); \ + return std::move(sum); \ + } \ + \ + template \ + operator_sum operator op(otherTy other, \ + operator_sum &&self) { \ + for (auto &&coeff : self.coefficients) \ + coeff = std::move(op coeff); \ + self.insert(product_operator(other)); \ + return std::move(self); \ + } + +SUM_ADDITION_REVERSE(double, +); +SUM_ADDITION_REVERSE(double, -); +SUM_ADDITION_REVERSE(std::complex, +); +SUM_ADDITION_REVERSE(std::complex, -); +SUM_ADDITION_REVERSE(const scalar_operator &, +); +SUM_ADDITION_REVERSE(const scalar_operator &, -); + +#define INSTANTIATE_SUM_LHCOMPOSITE_OPS(HandlerTy) \ + \ + template operator_sum operator*( \ + double other, const operator_sum &self); \ + template operator_sum operator*(double other, \ + operator_sum &&self); \ + template operator_sum operator+( \ + double other, const operator_sum &self); \ + template operator_sum operator+(double other, \ + operator_sum &&self); \ + template operator_sum operator-( \ + double other, const operator_sum &self); \ + template operator_sum operator-(double other, \ + operator_sum &&self); \ + template operator_sum operator*( \ + std::complex other, const operator_sum &self); \ + template operator_sum operator*(std::complex other, \ 
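+  /* Example (editor's sketch): these free functions cover a scalar on the
+     left. Their rvalue overloads scale the moved-in sum's coefficients in
+     place, avoiding the coefficient-vector copy the const& versions make:
+
+       auto s = cudaq::spin_operator::x(0) + cudaq::spin_operator::y(1);
+       auto t = 2.0 * std::move(s);   // reuses s's storage
+  */ \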
+ operator_sum &&self); \ + template operator_sum operator+( \ + std::complex other, const operator_sum &self); \ + template operator_sum operator+(std::complex other, \ + operator_sum &&self); \ + template operator_sum operator-( \ + std::complex other, const operator_sum &self); \ + template operator_sum operator-(std::complex other, \ + operator_sum &&self); \ + template operator_sum operator*( \ + const scalar_operator &other, const operator_sum &self); \ + template operator_sum operator*(const scalar_operator &other, \ + operator_sum &&self); \ + template operator_sum operator+( \ + const scalar_operator &other, const operator_sum &self); \ + template operator_sum operator+(const scalar_operator &other, \ + operator_sum &&self); \ + template operator_sum operator-( \ + const scalar_operator &other, const operator_sum &self); \ + template operator_sum operator-(const scalar_operator &other, \ + operator_sum &&self); + +INSTANTIATE_SUM_LHCOMPOSITE_OPS(matrix_operator); +INSTANTIATE_SUM_LHCOMPOSITE_OPS(spin_operator); +INSTANTIATE_SUM_LHCOMPOSITE_OPS(boson_operator); + +// arithmetics that require conversions + +#define SUM_CONVERSIONS_OPS(op) \ + \ + template \ + operator_sum operator op( \ + const operator_sum &other, \ + const product_operator &self) { \ + return operator_sum(other) op self; \ + } \ + \ + template \ + operator_sum operator op( \ + const product_operator &other, \ + const operator_sum &self) { \ + return product_operator(other) op self; \ + } \ + \ + template \ + operator_sum operator op( \ + const operator_sum &other, const operator_sum &self) { \ + return operator_sum(other) op self; \ + } + +SUM_CONVERSIONS_OPS(*); +SUM_CONVERSIONS_OPS(+); +SUM_CONVERSIONS_OPS(-); + +#define INSTANTIATE_SUM_CONVERSION_OPS(op) \ + \ + template operator_sum operator op( \ + const operator_sum &other, \ + const product_operator &self); \ + template operator_sum operator op( \ + const operator_sum &other, \ + const product_operator &self); \ + template operator_sum operator op( \ + const operator_sum &other, \ + const product_operator &self); \ + template operator_sum operator op( \ + const operator_sum &other, \ + const product_operator &self); \ + \ + template operator_sum operator op( \ + const product_operator &other, \ + const operator_sum &self); \ + template operator_sum operator op( \ + const product_operator &other, \ + const operator_sum &self); \ + template operator_sum operator op( \ + const product_operator &other, \ + const operator_sum &self); \ + template operator_sum operator op( \ + const product_operator &other, \ + const operator_sum &self); \ + \ + template operator_sum operator op( \ + const operator_sum &other, \ + const operator_sum &self); \ + template operator_sum operator op( \ + const operator_sum &other, \ + const operator_sum &self); \ + template operator_sum operator op( \ + const operator_sum &other, \ + const operator_sum &self); \ + template operator_sum operator op( \ + const operator_sum &other, \ + const operator_sum &self); + +INSTANTIATE_SUM_CONVERSION_OPS(*); +INSTANTIATE_SUM_CONVERSION_OPS(+); +INSTANTIATE_SUM_CONVERSION_OPS(-); + +// common operators + +template +operator_sum operator_handler::empty() { + return operator_sum(); +} + +template operator_sum operator_handler::empty(); +template operator_sum operator_handler::empty(); +template operator_sum operator_handler::empty(); + +#ifdef CUDAQ_INSTANTIATE_TEMPLATES +template class operator_sum; +template class operator_sum; +#endif + +} // namespace cudaq \ No newline at end of file diff --git 
a/runtime/cudaq/dynamics/product_operators.cpp b/runtime/cudaq/dynamics/product_operators.cpp new file mode 100644 index 0000000000..8820c54621 --- /dev/null +++ b/runtime/cudaq/dynamics/product_operators.cpp @@ -0,0 +1,1000 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include +#include +#include +#include +#include + +#include "boson_operators.h" +#include "cudaq/operators.h" +#include "helpers.h" +#include "manipulation.h" +#include "matrix_operators.h" +#include "spin_operators.h" + +namespace cudaq { + +// private methods + +#if !defined(NDEBUG) +// check canonicalization by default, individual handlers can set it to false to +// disable the check +bool operator_handler::can_be_canonicalized = true; + +// returns true if and only if applying the operators in sequence acts only once +// on each degree of freedom and in canonical order +template +bool product_operator::is_canonicalized() const { + auto canon_degrees = this->degrees(); + std::vector degrees; + degrees.reserve(canon_degrees.size()); + for (const auto &op : this->operators) { + for (auto d : op.degrees()) + degrees.push_back(d); + } + return degrees == canon_degrees; +} +#endif + +template +std::vector::const_iterator +product_operator::find_insert_at(const HandlerTy &other) const { + // the logic below just ensures that terms are fully or partially ordered in + // canonical order - a best effort is made to order terms, but a full + // canonical ordering is not possible for certain handlers + return std::find_if( + this->operators.crbegin(), this->operators.crend(), + [other_target = other.target](const HandlerTy &self_op) { + return other_target <= + self_op.target; // FIXME: relies on canonical order + }) + .base(); // base causes insert after for reverse iterator +} + +template<> +std::vector::const_iterator product_operator::find_insert_at(const matrix_operator &other) const { + // the logic below just ensures that terms are fully or partially ordered in canonical order - + // a best effort is made to order terms, but a full canonical ordering is not possible for certain handlers + return std::find_if(this->operators.crbegin(), this->operators.crend(), + [&other_degrees = static_cast&>(other.degrees())] + (const matrix_operator& self_op) { + const std::vector &self_op_degrees = self_op.degrees(); + for (auto other_degree : other_degrees) { + auto item_it = std::find_if(self_op_degrees.crbegin(), self_op_degrees.crend(), + [other_degree](int self_degree) { return other_degree <= self_degree; }); // FIXME: depends on canonical order (otherwise matrix computation is rather inefficient) + if (item_it != self_op_degrees.crend()) return true; + } + return false; + }).base(); // base causes insert after for reverse iterator +} + +template +template::value && + !product_operator::supports_inplace_mult, std::false_type>> +void product_operator::insert(T &&other) { + auto pos = this->find_insert_at(other); + this->operators.insert(pos, other); +} + +template +template::value && + product_operator::supports_inplace_mult, std::true_type>> +void product_operator::insert(T &&other) { + auto pos = this->find_insert_at(other); + if (pos != 
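+      /* Editor's note: find_insert_at scans backwards for the last operator
+         whose target precedes or equals other's target and returns .base(),
+         i.e. the slot just after it. The condition continued below then asks
+         whether the element before that slot acts on the same degree; if so,
+         the two handlers are fused via inplace_mult instead of growing the
+         product. */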
this->operators.begin() && (pos - 1)->target == other.target) { + auto it = this->operators.erase(pos - 1, pos - 1); // erase: constant time conversion to non-const iterator + it->inplace_mult(other); + } + else this->operators.insert(pos, std::move(other)); +} + +template<> +template::value && + product_operator::supports_inplace_mult, std::true_type>> +void product_operator::insert(T &&other) { + auto pos = this->find_insert_at(other); + if (pos != this->operators.begin() && (pos - 1)->target == other.target) { + auto it = this->operators.erase(pos - 1, pos - 1); // erase: constant time conversion to non-const iterator + this->coefficient *= it->inplace_mult(other); + } + else this->operators.insert(pos, std::move(other)); +} + +template +std::string product_operator::get_term_id() const { + std::string term_id; + for (const auto &op : this->operators) + term_id += op.unique_id(); + return std::move(term_id); +} + +template +void product_operator::aggregate_terms() {} + +template +template +void product_operator::aggregate_terms(HandlerTy &&head, + Args &&...args) { + this->insert(std::forward(head)); + aggregate_terms(std::forward(args)...); +} + +// FIXME: EVALUATE IS NOT SUPPOSED TO RETURN A MATRIX - +// IT SUPPOSED TO TAKE A TRANSFORMATION (ANY OPERATOR ARITHMETICS) AND APPLY IT +template +EvaluatedMatrix +product_operator::m_evaluate(MatrixArithmetics arithmetics, + bool pad_terms) const { + auto degrees = this->degrees(); + cudaq::matrix_2 result; + + auto padded_op = [&arithmetics, + °rees = std::as_const(degrees)](const HandlerTy &op) { + std::vector padded; + auto op_degrees = op.degrees(); + for (const auto °ree : degrees) { + if (std::find(op_degrees.cbegin(), op_degrees.cend(), degree) == + op_degrees.cend()) { + auto identity = HandlerTy(degree); + padded.push_back(EvaluatedMatrix( + identity.degrees(), identity.to_matrix(arithmetics.m_dimensions))); + } + } + /// Creating the tensor product with op being last is most efficient. 
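+    // Editor's note: an operator that acts on only a subset of the term's
+    // degrees is first padded with explicit identities on each missing
+    // degree, e.g. for degrees {0, 1} an op A acting only on degree 1 is
+    // lifted to I(0) (x) A(1), so every factor in the product shares one
+    // matrix dimension before being multiplied.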
+ if (padded.size() == 0) + return EvaluatedMatrix( + op_degrees, + op.to_matrix(arithmetics.m_dimensions, arithmetics.m_parameters)); + EvaluatedMatrix ids(std::move(padded[0])); + for (auto i = 1; i < padded.size(); ++i) + ids = arithmetics.tensor(std::move(ids), std::move(padded[i])); + return arithmetics.tensor( + std::move(ids), + EvaluatedMatrix(op_degrees, op.to_matrix(arithmetics.m_dimensions, + arithmetics.m_parameters))); + }; + + auto coefficient = this->coefficient.evaluate(arithmetics.m_parameters); + if (this->operators.size() > 0) { + if (pad_terms) { + EvaluatedMatrix prod = padded_op(this->operators[0]); + for (auto op_idx = 1; op_idx < this->operators.size(); ++op_idx) { + auto op_degrees = this->operators[op_idx].degrees(); + if (op_degrees.size() != 1 || + this->operators[op_idx] != HandlerTy(op_degrees[0])) + prod = arithmetics.mul(std::move(prod), + padded_op(this->operators[op_idx])); + } + return EvaluatedMatrix(std::move(prod.degrees()), + coefficient * prod.matrix()); + } else { + EvaluatedMatrix prod( + this->operators[0].degrees(), + this->operators[0].to_matrix(arithmetics.m_dimensions, + arithmetics.m_parameters)); + for (auto op_idx = 1; op_idx < this->operators.size(); ++op_idx) { + auto mat = this->operators[op_idx].to_matrix(arithmetics.m_dimensions, + arithmetics.m_parameters); + prod = arithmetics.mul( + std::move(prod), + EvaluatedMatrix(this->operators[op_idx].degrees(), mat)); + } + return EvaluatedMatrix(std::move(prod.degrees()), + coefficient * prod.matrix()); + } + } else { + assert(degrees.size() == 0); // degrees are stored with each term + return EvaluatedMatrix({}, coefficient * cudaq::matrix_2::identity(1)); + } +} + +#define INSTANTIATE_PRODUCT_PRIVATE_METHODS(HandlerTy) \ + \ + template void product_operator::aggregate_terms( \ + HandlerTy &&item1, HandlerTy &&item2); \ + \ + template void product_operator::aggregate_terms( \ + HandlerTy &&item1, HandlerTy &&item2, HandlerTy &&item3); \ + \ + template EvaluatedMatrix product_operator::m_evaluate( \ + MatrixArithmetics arithmetics, bool pad_terms) const; + +INSTANTIATE_PRODUCT_PRIVATE_METHODS(matrix_operator); +INSTANTIATE_PRODUCT_PRIVATE_METHODS(spin_operator); +INSTANTIATE_PRODUCT_PRIVATE_METHODS(boson_operator); + +// read-only properties + +template +std::vector product_operator::degrees() const { + std::set unsorted_degrees; + for (const HandlerTy &term : this->operators) { + auto term_degrees = term.degrees(); + unsorted_degrees.insert(term_degrees.cbegin(), term_degrees.cend()); + } + auto degrees = + std::vector(unsorted_degrees.cbegin(), unsorted_degrees.cend()); + cudaq::detail::canonicalize_degrees(degrees); + return std::move(degrees); +} + +template +int product_operator::num_terms() const { + return this->operators.size(); +} + +template +const std::vector &product_operator::get_terms() const { + return this->operators; +} + +template +scalar_operator product_operator::get_coefficient() const { + return this->coefficient; +} + +#define INSTANTIATE_PRODUCT_PROPERTIES(HandlerTy) \ + \ + template std::vector product_operator::degrees() const; \ + \ + template int product_operator::num_terms() const; \ + \ + template const std::vector & \ + product_operator::get_terms() const; \ + \ + template scalar_operator product_operator::get_coefficient() const; + +INSTANTIATE_PRODUCT_PROPERTIES(matrix_operator); +INSTANTIATE_PRODUCT_PROPERTIES(spin_operator); +INSTANTIATE_PRODUCT_PROPERTIES(boson_operator); + +// constructors + +template +product_operator::product_operator(double 
coefficient) + : coefficient(coefficient) {} + +template +product_operator::product_operator(HandlerTy &&atomic) + : coefficient(1.) { + this->operators.push_back(std::move(atomic)); + assert(!HandlerTy::can_be_canonicalized || + this->is_canonicalized()); // relevant for custom matrix operators + // acting on multiple degrees of freedom +} + +template +template ...>::value, bool>> +product_operator::product_operator(scalar_operator coefficient, + Args &&...args) + : coefficient(std::move(coefficient)) { + this->operators.reserve(sizeof...(Args)); + aggregate_terms(std::forward(args)...); + assert(!HandlerTy::can_be_canonicalized || this->is_canonicalized()); +} + +// assumes canonical ordering (if possible) +template +product_operator::product_operator( + scalar_operator coefficient, const std::vector &atomic_operators, + int size) + : coefficient(std::move(coefficient)) { + if (size <= 0) + this->operators = atomic_operators; + else { + this->operators.reserve(size); + for (const auto &op : atomic_operators) + this->operators.push_back(op); + } + assert(!HandlerTy::can_be_canonicalized || this->is_canonicalized()); +} + +// assumes canonical ordering (if possible) +template +product_operator::product_operator( + scalar_operator coefficient, std::vector &&atomic_operators, + int size) + : coefficient(std::move(coefficient)), + operators(std::move(atomic_operators)) { + if (size > 0) + this->operators.reserve(size); + assert(!HandlerTy::can_be_canonicalized || this->is_canonicalized()); +} + +template +template ::value && + std::is_constructible::value, + bool>> +product_operator::product_operator(const product_operator &other) + : coefficient(other.coefficient) { + this->operators.reserve(other.operators.size()); + for (const T &other_op : other.operators) { + HandlerTy op(other_op); + this->operators.push_back(op); + } +} + +template +product_operator::product_operator( + const product_operator &other, int size) + : coefficient(other.coefficient) { + if (size <= 0) + this->operators = other.operators; + else { + this->operators.reserve(size); + for (const auto &op : other.operators) + this->operators.push_back(op); + } +} + +template +product_operator::product_operator( + product_operator &&other, int size) + : coefficient(std::move(other.coefficient)), + operators(std::move(other.operators)) { + if (size > 0) + this->operators.reserve(size); +} + +#define INSTANTIATE_PRODUCT_CONSTRUCTORS(HandlerTy) \ + \ + template product_operator::product_operator(double coefficient); \ + \ + template product_operator::product_operator( \ + scalar_operator coefficient); \ + \ + template product_operator::product_operator(HandlerTy &&atomic); \ + \ + template product_operator::product_operator( \ + scalar_operator coefficient, HandlerTy &&atomic1); \ + \ + template product_operator::product_operator( \ + scalar_operator coefficient, HandlerTy &&atomic1, HandlerTy &&atomic2); \ + \ + template product_operator::product_operator( \ + scalar_operator coefficient, HandlerTy &&atomic1, HandlerTy &&atomic2, \ + HandlerTy &&atomic3); \ + \ + template product_operator::product_operator( \ + scalar_operator coefficient, \ + const std::vector &atomic_operators, int size); \ + \ + template product_operator::product_operator( \ + scalar_operator coefficient, std::vector &&atomic_operators, \ + int size); \ + \ + template product_operator::product_operator( \ + const product_operator &other, int size); \ + \ + template product_operator::product_operator( \ + product_operator &&other, int size); + +template 
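+/* Example (editor's sketch): the two explicit instantiations continued below
+   are the converting constructor, which widens a product over a narrower
+   handler (spin or boson, by the pattern of this file) into a matrix_operator
+   product by converting each handler element-wise:
+
+     auto sp = cudaq::spin_operator::x(0) * cudaq::spin_operator::z(1);
+     cudaq::product_operator<cudaq::matrix_operator> mp(sp);
+*/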
product_operator::product_operator( + const product_operator &other); +template product_operator::product_operator( + const product_operator &other); + +INSTANTIATE_PRODUCT_CONSTRUCTORS(matrix_operator); +INSTANTIATE_PRODUCT_CONSTRUCTORS(spin_operator); +INSTANTIATE_PRODUCT_CONSTRUCTORS(boson_operator); + +// assignments + +template +template ::value && + std::is_constructible::value, + bool>> +product_operator & +product_operator::operator=(const product_operator &other) { + *this = product_operator(other); + return *this; +} + +template +product_operator &product_operator::operator=( + const product_operator &other) { + if (this != &other) { + this->coefficient = other.coefficient; + this->operators = other.operators; + } + return *this; +} + +template +product_operator & +product_operator::operator=(product_operator &&other) { + if (this != &other) { + this->coefficient = std::move(other.coefficient); + this->operators = std::move(other.operators); + } + return *this; +} + +#define INSTANTIATE_PRODUCT_ASSIGNMENTS(HandlerTy) \ + \ + template product_operator & \ + product_operator::operator=( \ + const product_operator &other); \ + \ + template product_operator & \ + product_operator::operator=(product_operator &&other); + +template product_operator & +product_operator::operator=( + const product_operator &other); +template product_operator & +product_operator::operator=( + const product_operator &other); + +INSTANTIATE_PRODUCT_ASSIGNMENTS(matrix_operator); +INSTANTIATE_PRODUCT_ASSIGNMENTS(spin_operator); +INSTANTIATE_PRODUCT_ASSIGNMENTS(boson_operator); + +// evaluations + +template +std::string product_operator::to_string() const { + auto str = "(" + this->coefficient.to_string() + ") * "; + for (const auto &op : this->operators) + str += op.to_string(true); + return std::move(str); +} + +template +matrix_2 product_operator::to_matrix( + std::unordered_map dimensions, + const std::unordered_map> ¶meters) + const { + return std::move( + this->m_evaluate(MatrixArithmetics(dimensions, parameters)).matrix()); +} + +#define INSTANTIATE_PRODUCT_EVALUATIONS(HandlerTy) \ + \ + template std::string product_operator::to_string() const; \ + \ + template matrix_2 product_operator::to_matrix( \ + std::unordered_map dimensions, \ + const std::unordered_map> ¶meters) \ + const; + +INSTANTIATE_PRODUCT_EVALUATIONS(matrix_operator); +INSTANTIATE_PRODUCT_EVALUATIONS(spin_operator); +INSTANTIATE_PRODUCT_EVALUATIONS(boson_operator); + +// comparisons + +template +bool product_operator::operator==( + const product_operator &other) const { + return this->coefficient == other.coefficient && + this->get_term_id() == other.get_term_id(); +} + +#define INSTANTIATE_PRODUCT_COMPARISONS(HandlerTy) \ + \ + template bool product_operator::operator==( \ + const product_operator &other) const; + +INSTANTIATE_PRODUCT_COMPARISONS(matrix_operator); +INSTANTIATE_PRODUCT_COMPARISONS(spin_operator); +INSTANTIATE_PRODUCT_COMPARISONS(boson_operator); + +// unary operators + +template +product_operator product_operator::operator-() const & { + return product_operator(-1. 
* this->coefficient, this->operators); +} + +template +product_operator product_operator::operator-() && { + this->coefficient *= -1.; + return std::move(*this); +} + +template +product_operator product_operator::operator+() const & { + return *this; +} + +template +product_operator product_operator::operator+() && { + return std::move(*this); +} + +#define INSTANTIATE_PRODUCT_UNARY_OPS(HandlerTy) \ + template product_operator \ + product_operator::operator-() const &; \ + template product_operator \ + product_operator::operator-() &&; \ + template product_operator \ + product_operator::operator+() const &; \ + template product_operator \ + product_operator::operator+() &&; + +INSTANTIATE_PRODUCT_UNARY_OPS(matrix_operator); +INSTANTIATE_PRODUCT_UNARY_OPS(spin_operator); +INSTANTIATE_PRODUCT_UNARY_OPS(boson_operator); + +// right-hand arithmetics + +#define PRODUCT_MULTIPLICATION(otherTy) \ + \ + template \ + product_operator product_operator::operator*( \ + otherTy other) const & { \ + return product_operator(other * this->coefficient, \ + this->operators); \ + } \ + \ + template \ + product_operator product_operator::operator*( \ + otherTy other) && { \ + this->coefficient *= other; \ + return std::move(*this); \ + } + +PRODUCT_MULTIPLICATION(double); +PRODUCT_MULTIPLICATION(std::complex); +PRODUCT_MULTIPLICATION(const scalar_operator &); + +#define PRODUCT_ADDITION(otherTy, op) \ + \ + template \ + operator_sum product_operator::operator op( \ + otherTy other) const & { \ + return operator_sum(product_operator(op other), \ + product_operator(*this)); \ + } \ + \ + template \ + operator_sum product_operator::operator op( \ + otherTy other) && { \ + return operator_sum(product_operator(op other), \ + std::move(*this)); \ + } + +PRODUCT_ADDITION(double, +); +PRODUCT_ADDITION(double, -); +PRODUCT_ADDITION(std::complex, +); +PRODUCT_ADDITION(std::complex, -); +PRODUCT_ADDITION(const scalar_operator &, +); +PRODUCT_ADDITION(const scalar_operator &, -); + +#define INSTANTIATE_PRODUCT_RHSIMPLE_OPS(HandlerTy) \ + \ + template product_operator product_operator::operator*( \ + double other) const &; \ + template product_operator product_operator::operator*( \ + double other) &&; \ + template operator_sum product_operator::operator+( \ + double other) const &; \ + template operator_sum product_operator::operator+( \ + double other) &&; \ + template operator_sum product_operator::operator-( \ + double other) const &; \ + template operator_sum product_operator::operator-( \ + double other) &&; \ + template product_operator product_operator::operator*( \ + std::complex other) const &; \ + template product_operator product_operator::operator*( \ + std::complex other) &&; \ + template operator_sum product_operator::operator+( \ + std::complex other) const &; \ + template operator_sum product_operator::operator+( \ + std::complex other) &&; \ + template operator_sum product_operator::operator-( \ + std::complex other) const &; \ + template operator_sum product_operator::operator-( \ + std::complex other) &&; \ + template product_operator product_operator::operator*( \ + const scalar_operator &other) const &; \ + template product_operator product_operator::operator*( \ + const scalar_operator &other) &&; \ + template operator_sum product_operator::operator+( \ + const scalar_operator &other) const &; \ + template operator_sum product_operator::operator+( \ + const scalar_operator &other) &&; \ + template operator_sum product_operator::operator-( \ + const scalar_operator &other) const &; \ + template 
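+  /* Example (editor's sketch): for products, scalar multiplication only
+     rescales the coefficient and stays a product_operator, while + and -
+     promote the result to an operator_sum with two terms:
+
+       auto p = cudaq::spin_operator::x(0);
+       auto q = p * 2.0;   // still a product: (2) * X(0)
+       auto s = p + 1.0;   // operator_sum: (1) * I  +  (1) * X(0)
+  */ \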
operator_sum product_operator::operator-( \ + const scalar_operator &other) &&; + +INSTANTIATE_PRODUCT_RHSIMPLE_OPS(matrix_operator); +INSTANTIATE_PRODUCT_RHSIMPLE_OPS(spin_operator); +INSTANTIATE_PRODUCT_RHSIMPLE_OPS(boson_operator); + +template +product_operator product_operator::operator*( + const product_operator &other) const & { + product_operator prod( + this->coefficient * other.coefficient, this->operators, + this->operators.size() + other.operators.size()); + for (HandlerTy op : other.operators) + prod.insert(std::move(op)); + return std::move(prod); +} + +template +product_operator product_operator::operator*( + const product_operator &other) && { + this->coefficient *= other.coefficient; + this->operators.reserve(this->operators.size() + other.operators.size()); + for (HandlerTy op : other.operators) + this->insert(std::move(op)); + return std::move(*this); +} + +template +product_operator product_operator::operator*( + product_operator &&other) const & { + product_operator prod( + this->coefficient * std::move(other.coefficient), this->operators, + this->operators.size() + other.operators.size()); + for (auto &&op : other.operators) + prod.insert(std::move(op)); + return std::move(prod); +} + +template +product_operator +product_operator::operator*(product_operator &&other) && { + this->coefficient *= std::move(other.coefficient); + this->operators.reserve(this->operators.size() + other.operators.size()); + for (auto &&op : other.operators) + this->insert(std::move(op)); + return std::move(*this); +} + +#define PRODUCT_ADDITION_PRODUCT(op) \ + \ + template \ + operator_sum product_operator::operator op( \ + const product_operator &other) const & { \ + return operator_sum(product_operator(*this), \ + op other); \ + } \ + \ + template \ + operator_sum product_operator::operator op( \ + const product_operator &other) && { \ + return operator_sum(std::move(*this), op other); \ + } \ + \ + template \ + operator_sum product_operator::operator op( \ + product_operator &&other) const & { \ + return operator_sum(product_operator(*this), \ + op std::move(other)); \ + } \ + \ + template \ + operator_sum product_operator::operator op( \ + product_operator &&other) && { \ + return operator_sum(std::move(*this), op std::move(other)); \ + } + +PRODUCT_ADDITION_PRODUCT(+) +PRODUCT_ADDITION_PRODUCT(-) + +template +operator_sum product_operator::operator*( + const operator_sum &other) const { + operator_sum + sum; // everything needs to be updated, so creating a new sum makes sense + sum.coefficients.reserve(other.coefficients.size()); + sum.term_map.reserve(other.terms.size()); + sum.terms.reserve(other.terms.size()); + for (auto i = 0; i < other.terms.size(); ++i) { + auto prod = *this * product_operator(other.coefficients[i], + other.terms[i]); + sum.insert(std::move(prod)); + } + return std::move(sum); +} + +#define PRODUCT_ADDITION_SUM(op) \ + \ + template \ + operator_sum product_operator::operator op( \ + const operator_sum &other) const & { \ + operator_sum sum; \ + sum.coefficients.reserve(other.coefficients.size() + 1); \ + sum.term_map = other.term_map; \ + sum.terms = other.terms; \ + for (auto &coeff : other.coefficients) \ + sum.coefficients.push_back(op coeff); \ + sum.insert(*this); \ + return std::move(sum); \ + } \ + \ + template \ + operator_sum product_operator::operator op( \ + const operator_sum &other) && { \ + operator_sum sum; \ + sum.coefficients.reserve(other.coefficients.size() + 1); \ + sum.term_map = other.term_map; \ + sum.terms = other.terms; \ + for (auto &coeff 
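+      /* Example (editor's sketch): in the product*product overloads above,
+         each handler from the right factor is inserted into the left factor's
+         canonically ordered list, and handlers on the same target are fused
+         by inplace_mult, e.g. for spin handlers:
+
+           X(0) * Y(0)  ==>  i * Z(0)   // one handler, coefficient i
+      */ \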
: other.coefficients) \ + sum.coefficients.push_back(op coeff); \ + sum.insert(std::move(*this)); \ + return std::move(sum); \ + } \ + template \ + operator_sum product_operator::operator op( \ + operator_sum &&other) const & { \ + operator_sum sum(op std::move(other)); \ + sum.insert(*this); \ + return std::move(sum); \ + } \ + \ + template \ + operator_sum product_operator::operator op( \ + operator_sum &&other) && { \ + operator_sum sum(op std::move(other)); \ + sum.insert(std::move(*this)); \ + return std::move(sum); \ + } + +PRODUCT_ADDITION_SUM(+) +PRODUCT_ADDITION_SUM(-) + +#define INSTANTIATE_PRODUCT_RHCOMPOSITE_OPS(HandlerTy) \ + \ + template product_operator product_operator::operator*( \ + const product_operator &other) const &; \ + template product_operator product_operator::operator*( \ + const product_operator &other) &&; \ + template operator_sum product_operator::operator+( \ + const product_operator &other) const &; \ + template operator_sum product_operator::operator+( \ + const product_operator &other) &&; \ + template operator_sum product_operator::operator+( \ + product_operator &&other) const &; \ + template operator_sum product_operator::operator+( \ + product_operator &&other) &&; \ + template operator_sum product_operator::operator-( \ + const product_operator &other) const &; \ + template operator_sum product_operator::operator-( \ + const product_operator &other) &&; \ + template operator_sum product_operator::operator-( \ + product_operator &&other) const &; \ + template operator_sum product_operator::operator-( \ + product_operator &&other) &&; \ + template operator_sum product_operator::operator*( \ + const operator_sum &other) const; \ + template operator_sum product_operator::operator+( \ + const operator_sum &other) const &; \ + template operator_sum product_operator::operator+( \ + const operator_sum &other) &&; \ + template operator_sum product_operator::operator+( \ + operator_sum &&other) const &; \ + template operator_sum product_operator::operator+( \ + operator_sum &&other) &&; \ + template operator_sum product_operator::operator-( \ + const operator_sum &other) const &; \ + template operator_sum product_operator::operator-( \ + const operator_sum &other) &&; \ + template operator_sum product_operator::operator-( \ + operator_sum &&other) const &; \ + template operator_sum product_operator::operator-( \ + operator_sum &&other) &&; + +INSTANTIATE_PRODUCT_RHCOMPOSITE_OPS(matrix_operator); +INSTANTIATE_PRODUCT_RHCOMPOSITE_OPS(spin_operator); +INSTANTIATE_PRODUCT_RHCOMPOSITE_OPS(boson_operator); + +#define PRODUCT_MULTIPLICATION_ASSIGNMENT(otherTy) \ + template \ + product_operator &product_operator::operator*=( \ + otherTy other) { \ + this->coefficient *= other; \ + return *this; \ + } + +PRODUCT_MULTIPLICATION_ASSIGNMENT(double); +PRODUCT_MULTIPLICATION_ASSIGNMENT(std::complex); +PRODUCT_MULTIPLICATION_ASSIGNMENT(const scalar_operator &); + +template +product_operator &product_operator::operator*=( + const product_operator &other) { + this->coefficient *= other.coefficient; + this->operators.reserve(this->operators.size() + other.operators.size()); + for (HandlerTy op : other.operators) + this->insert(std::move(op)); + return *this; +} + +template +product_operator & +product_operator::operator*=(product_operator &&other) { + this->coefficient *= std::move(other.coefficient); + this->operators.reserve(this->operators.size() + other.operators.size()); + for (auto &&op : other.operators) + this->insert(std::move(op)); + return *this; +} + +#define 
INSTANTIATE_PRODUCT_OPASSIGNMENTS(HandlerTy) \ + \ + template product_operator & \ + product_operator::operator*=(double other); \ + template product_operator & \ + product_operator::operator*=(std::complex other); \ + template product_operator & \ + product_operator::operator*=(const scalar_operator &other); \ + template product_operator & \ + product_operator::operator*=( \ + const product_operator &other); \ + template product_operator & \ + product_operator::operator*=( \ + product_operator &&other); + +INSTANTIATE_PRODUCT_OPASSIGNMENTS(matrix_operator); +INSTANTIATE_PRODUCT_OPASSIGNMENTS(spin_operator); +INSTANTIATE_PRODUCT_OPASSIGNMENTS(boson_operator); + +// left-hand arithmetics + +#define PRODUCT_MULTIPLICATION_REVERSE(otherTy) \ + \ + template \ + product_operator operator*( \ + otherTy other, const product_operator &self) { \ + return product_operator(other * self.coefficient, \ + self.operators); \ + } \ + \ + template \ + product_operator operator*(otherTy other, \ + product_operator &&self) { \ + self.coefficient *= other; \ + return std::move(self); \ + } + +PRODUCT_MULTIPLICATION_REVERSE(double); +PRODUCT_MULTIPLICATION_REVERSE(std::complex); +PRODUCT_MULTIPLICATION_REVERSE(const scalar_operator &); + +#define PRODUCT_ADDITION_REVERSE(otherTy, op) \ + \ + template \ + operator_sum operator op( \ + otherTy other, const product_operator &self) { \ + return operator_sum(product_operator(other), \ + op self); \ + } \ + \ + template \ + operator_sum operator op(otherTy other, \ + product_operator &&self) { \ + return operator_sum(product_operator(other), \ + op std::move(self)); \ + } + +PRODUCT_ADDITION_REVERSE(double, +); +PRODUCT_ADDITION_REVERSE(double, -); +PRODUCT_ADDITION_REVERSE(std::complex, +); +PRODUCT_ADDITION_REVERSE(std::complex, -); +PRODUCT_ADDITION_REVERSE(const scalar_operator &, +); +PRODUCT_ADDITION_REVERSE(const scalar_operator &, -); + +#define INSTANTIATE_PRODUCT_LHCOMPOSITE_OPS(HandlerTy) \ + \ + template product_operator operator*( \ + double other, const product_operator &self); \ + template product_operator operator*( \ + double other, product_operator &&self); \ + template operator_sum operator+( \ + double other, const product_operator &self); \ + template operator_sum operator+( \ + double other, product_operator &&self); \ + template operator_sum operator-( \ + double other, const product_operator &self); \ + template operator_sum operator-( \ + double other, product_operator &&self); \ + template product_operator operator*( \ + std::complex other, const product_operator &self); \ + template product_operator operator*( \ + std::complex other, product_operator &&self); \ + template operator_sum operator+( \ + std::complex other, const product_operator &self); \ + template operator_sum operator+( \ + std::complex other, product_operator &&self); \ + template operator_sum operator-( \ + std::complex other, const product_operator &self); \ + template operator_sum operator-( \ + std::complex other, product_operator &&self); \ + template product_operator operator*( \ + const scalar_operator &other, const product_operator &self); \ + template product_operator operator*( \ + const scalar_operator &other, product_operator &&self); \ + template operator_sum operator+( \ + const scalar_operator &other, const product_operator &self); \ + template operator_sum operator+( \ + const scalar_operator &other, product_operator &&self); \ + template operator_sum operator-( \ + const scalar_operator &other, const product_operator &self); \ + template operator_sum 
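+  /* Example (editor's sketch): the left-hand overloads instantiated here
+     mirror the member versions, with the rvalue forms scaling the coefficient
+     of the moved-in product in place:
+
+       auto p = cudaq::spin_operator::x(0) * cudaq::spin_operator::y(1);
+       auto q = 2.0 * std::move(p);   // (2) * X(0)Y(1), storage reused
+  */ \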
operator-( \ + const scalar_operator &other, product_operator &&self); + +INSTANTIATE_PRODUCT_LHCOMPOSITE_OPS(matrix_operator); +INSTANTIATE_PRODUCT_LHCOMPOSITE_OPS(spin_operator); +INSTANTIATE_PRODUCT_LHCOMPOSITE_OPS(boson_operator); + +// arithmetics that require conversions + +#define PRODUCT_CONVERSIONS_OPS(op, returnTy) \ + template \ + returnTy operator op( \ + const product_operator &other, \ + const product_operator &self) { \ + return product_operator(other) op self; \ + } + +PRODUCT_CONVERSIONS_OPS(*, product_operator); +PRODUCT_CONVERSIONS_OPS(+, operator_sum); +PRODUCT_CONVERSIONS_OPS(-, operator_sum); + +#define INSTANTIATE_PRODUCT_CONVERSION_OPS(op, returnTy) \ + \ + template returnTy operator op( \ + const product_operator &other, \ + const product_operator &self); \ + template returnTy operator op( \ + const product_operator &other, \ + const product_operator &self); \ + template returnTy operator op( \ + const product_operator &other, \ + const product_operator &self); \ + template returnTy operator op( \ + const product_operator &other, \ + const product_operator &self); + +INSTANTIATE_PRODUCT_CONVERSION_OPS(*, product_operator); +INSTANTIATE_PRODUCT_CONVERSION_OPS(+, operator_sum); +INSTANTIATE_PRODUCT_CONVERSION_OPS(-, operator_sum); + +// common operators + +template ...>::value, + bool> = true> +product_operator operator_handler::identity(Args... targets) { + static_assert( + std::is_constructible_v, + "operator handlers must have a constructor that take a single degree of " + "freedom and returns the identity operator on that degree."); + return product_operator(1.0, HandlerTy(targets)...); +} + +template product_operator operator_handler::identity(); +template product_operator operator_handler::identity(); +template product_operator operator_handler::identity(); + +template product_operator +operator_handler::identity(int target); +template product_operator operator_handler::identity(int target); +template product_operator +operator_handler::identity(int target); + +#ifdef CUDAQ_INSTANTIATE_TEMPLATES +template class product_operator; +template class product_operator; +#endif + +} // namespace cudaq \ No newline at end of file diff --git a/runtime/cudaq/dynamics/rydberg_hamiltonian.cpp b/runtime/cudaq/dynamics/rydberg_hamiltonian.cpp new file mode 100644 index 0000000000..3d8b125ad3 --- /dev/null +++ b/runtime/cudaq/dynamics/rydberg_hamiltonian.cpp @@ -0,0 +1,55 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
*
+ ******************************************************************************/
+
+#include "cudaq/operators.h"
+#include <cmath>
+#include <stdexcept>
+
+namespace cudaq {
+rydberg_hamiltonian::rydberg_hamiltonian(
+    const std::vector<coordinate> &atom_sites,
+    const scalar_operator &amplitude, const scalar_operator &phase,
+    const scalar_operator &delta_global, const std::vector<int> &atom_filling,
+    const std::optional<std::pair<scalar_operator, std::vector<double>>>
+        &delta_local)
+    : atom_sites(atom_sites), amplitude(amplitude), phase(phase),
+      delta_global(delta_global), delta_local(delta_local) {
+  if (atom_filling.empty()) {
+    this->atom_filling = std::vector<int>(atom_sites.size(), 1);
+  } else if (atom_sites.size() != atom_filling.size()) {
+    throw std::invalid_argument(
+        "Size of `atom_sites` and `atom_filling` must be equal.");
+  } else {
+    this->atom_filling = atom_filling;
+  }
+
+  if (delta_local.has_value()) {
+    throw std::runtime_error(
+        "Local detuning is an experimental feature not yet supported.");
+  }
+}
+
+const std::vector<rydberg_hamiltonian::coordinate> &
+rydberg_hamiltonian::get_atom_sites() const {
+  return atom_sites;
+}
+
+const std::vector<int> &rydberg_hamiltonian::get_atom_filling() const {
+  return atom_filling;
+}
+
+const scalar_operator &rydberg_hamiltonian::get_amplitude() const {
+  return amplitude;
+}
+
+const scalar_operator &rydberg_hamiltonian::get_phase() const { return phase; }
+
+const scalar_operator &rydberg_hamiltonian::get_delta_global() const {
+  return delta_global;
+}
+} // namespace cudaq
\ No newline at end of file
diff --git a/runtime/cudaq/dynamics/scalar_operators.cpp b/runtime/cudaq/dynamics/scalar_operators.cpp
new file mode 100644
index 0000000000..055b00143e
--- /dev/null
+++ b/runtime/cudaq/dynamics/scalar_operators.cpp
@@ -0,0 +1,267 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.
* + ******************************************************************************/ + +#include "cudaq/operators.h" + +#include +#include + +namespace cudaq { + +// constructors and destructors +bool scalar_operator::is_constant() const { + return std::holds_alternative>(value); +} + +scalar_operator::scalar_operator(double value) + : value(std::variant, ScalarCallbackFunction>( + std::complex(value))) {} + +scalar_operator::scalar_operator(std::complex value) + : value(std::variant, ScalarCallbackFunction>(value)) { +} + +scalar_operator::scalar_operator(const ScalarCallbackFunction &create) + : value( + std::variant, ScalarCallbackFunction>(create)) {} + +scalar_operator::scalar_operator(ScalarCallbackFunction &&create) + : value(std::variant, ScalarCallbackFunction>( + std::move(create))) {} + +scalar_operator::scalar_operator(const scalar_operator &other) + : value(other.value) {} + +scalar_operator::scalar_operator(scalar_operator &&other) + : value(std::move(other.value)) {} + +// assignments + +scalar_operator &scalar_operator::operator=(const scalar_operator &other) { + if (this != &other) + this->value = other.value; + return *this; +} + +scalar_operator &scalar_operator::operator=(scalar_operator &&other) { + if (this != &other) + this->value = std::move(other.value); + return *this; +} + +// evaluations + +std::complex scalar_operator::evaluate( + const std::unordered_map> ¶meters) + const { + if (std::holds_alternative(this->value)) + return std::get(this->value)(parameters); + return std::get>(this->value); +} + +matrix_2 scalar_operator::to_matrix( + const std::unordered_map> ¶meters) + const { + auto returnOperator = matrix_2(1, 1); + returnOperator[{0, 0}] = evaluate(parameters); + return returnOperator; +} + +std::string scalar_operator::to_string() const { + if (std::holds_alternative>(this->value)) { + auto value = std::get>(this->value); + return "(" + std::to_string(value.real()) + "+" + + std::to_string(value.imag()) + "i)"; + } + return "scalar"; +} + +// comparison + +bool scalar_operator::operator==(scalar_operator other) const { + if (std::holds_alternative(this->value)) { + return std::holds_alternative(other.value) && + &std::get(this->value) == + &std::get(other.value); + } else { + return std::holds_alternative>(this->value) && + std::get>(this->value) == + std::get>(other.value); + } +} + +// unary operators + +scalar_operator scalar_operator::operator-() const & { + return *this * (-1.); +} + +scalar_operator scalar_operator::operator-() && { + *this *= -1.; + return std::move(*this); +} + +scalar_operator scalar_operator::operator+() const & { + return *this; +} + +scalar_operator scalar_operator::operator+() && { + return std::move(*this); +} + +// right-hand arithmetics + +#define ARITHMETIC_OPERATIONS(op, otherTy) \ + scalar_operator scalar_operator::operator op(otherTy other) const & { \ + if (std::holds_alternative>(this->value)) { \ + return scalar_operator( \ + std::get>(this->value) op other); \ + } \ + auto newGenerator = \ + [other, generator = std::get(this->value)]( \ + const std::unordered_map> ¶meters) { \ + return generator(parameters) op other; \ + }; \ + return scalar_operator(std::move(newGenerator)); \ + } + +ARITHMETIC_OPERATIONS(*, double); +ARITHMETIC_OPERATIONS(/, double); +ARITHMETIC_OPERATIONS(+, double); +ARITHMETIC_OPERATIONS(-, double); +ARITHMETIC_OPERATIONS(*, std::complex); +ARITHMETIC_OPERATIONS(/, std::complex); +ARITHMETIC_OPERATIONS(+, std::complex); +ARITHMETIC_OPERATIONS(-, std::complex); + +#define 
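+/* Example (editor's sketch): scalar arithmetic folds eagerly when both
+   operands hold constants; otherwise it captures both sides in a fresh
+   generator and defers all work to evaluate(). The parameter name "t" below
+   is an assumption for illustration:
+
+     cudaq::scalar_operator a(2.0);
+     cudaq::scalar_operator b(
+         [](const std::unordered_map<std::string, std::complex<double>> &p) {
+           return p.at("t");
+         });
+     auto c = a * b;                             // still callback-backed
+     auto v = c.evaluate({{"t", {3.0, 0.0}}});   // 6 + 0i
+*/ \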
ARITHMETIC_OPERATIONS_SCALAR_OPS(op) \ + scalar_operator scalar_operator::operator op( \ + const scalar_operator &other) const & { \ + if (std::holds_alternative>(this->value) && \ + std::holds_alternative>(other.value)) { \ + return scalar_operator( \ + std::get>(this->value) op \ + std::get>(other.value)); \ + } \ + auto newGenerator = \ + [other, *this]( \ + const std::unordered_map> ¶meters) { \ + return this->evaluate(parameters) op other.evaluate(parameters); \ + }; \ + return scalar_operator(std::move(newGenerator)); \ + } + +ARITHMETIC_OPERATIONS_SCALAR_OPS(*); +ARITHMETIC_OPERATIONS_SCALAR_OPS(/); +ARITHMETIC_OPERATIONS_SCALAR_OPS(+); +ARITHMETIC_OPERATIONS_SCALAR_OPS(-); + +#define ARITHMETIC_OPERATIONS_ASSIGNMENT(op, otherTy) \ + scalar_operator& scalar_operator::operator op##=(otherTy other) { \ + if (std::holds_alternative>(this->value)) { \ + this->value = std::get>(this->value) op other; \ + return *this; \ + } \ + auto newGenerator = \ + [other, generator = std::move(std::get(this->value))]( \ + const std::unordered_map> ¶meters) { \ + return generator(parameters) op##= other; \ + }; \ + this->value = std::move(newGenerator); \ + return *this; \ + } + +ARITHMETIC_OPERATIONS_ASSIGNMENT(*, double); +ARITHMETIC_OPERATIONS_ASSIGNMENT(/, double); +ARITHMETIC_OPERATIONS_ASSIGNMENT(+, double); +ARITHMETIC_OPERATIONS_ASSIGNMENT(-, double); +ARITHMETIC_OPERATIONS_ASSIGNMENT(*, std::complex); +ARITHMETIC_OPERATIONS_ASSIGNMENT(/, std::complex); +ARITHMETIC_OPERATIONS_ASSIGNMENT(+, std::complex); +ARITHMETIC_OPERATIONS_ASSIGNMENT(-, std::complex); + +#define ARITHMETIC_OPERATIONS_SCALAR_OPS_ASSIGNMENT(op) \ + scalar_operator& scalar_operator::operator op##=( \ + const scalar_operator &other) { \ + if (std::holds_alternative>(this->value) && \ + std::holds_alternative>(other.value)) { \ + this->value = \ + std::get>(this->value) op \ + std::get>(other.value); \ + return *this; \ + } \ + auto newGenerator = \ + [other, *this]( \ + const std::unordered_map> ¶meters) { \ + return this->evaluate(parameters) op##= other.evaluate(parameters); \ + }; \ + this->value = std::move(newGenerator); \ + return *this; \ + } + +ARITHMETIC_OPERATIONS_SCALAR_OPS_ASSIGNMENT(*); +ARITHMETIC_OPERATIONS_SCALAR_OPS_ASSIGNMENT(/); +ARITHMETIC_OPERATIONS_SCALAR_OPS_ASSIGNMENT(+); +ARITHMETIC_OPERATIONS_SCALAR_OPS_ASSIGNMENT(-); + +#define ARITHMETIC_OPERATIONS_RVALUE(op, otherTy) \ + scalar_operator scalar_operator::operator op(otherTy other) && { \ + *this op##= other; \ + return std::move(*this); \ + } + +ARITHMETIC_OPERATIONS_RVALUE(*, double); +ARITHMETIC_OPERATIONS_RVALUE(/, double); +ARITHMETIC_OPERATIONS_RVALUE(+, double); +ARITHMETIC_OPERATIONS_RVALUE(-, double); +ARITHMETIC_OPERATIONS_RVALUE(*, std::complex); +ARITHMETIC_OPERATIONS_RVALUE(/, std::complex); +ARITHMETIC_OPERATIONS_RVALUE(+, std::complex); +ARITHMETIC_OPERATIONS_RVALUE(-, std::complex); + +// left-hand arithmetics + +#define ARITHMETIC_OPERATIONS_REVERSE(op, otherTy) \ + \ + scalar_operator operator op(otherTy other, const scalar_operator &self) { \ + if (std::holds_alternative>(self.value)) { \ + return scalar_operator( \ + other op std::get>(self.value)); \ + } \ + auto newGenerator = \ + [other, generator = std::get(self.value)]( \ + const std::unordered_map> ¶meters) { \ + return other op generator(parameters); \ + }; \ + return scalar_operator(std::move(newGenerator)); \ + } \ + \ + scalar_operator operator op(otherTy other, scalar_operator &&self) { \ + if (std::holds_alternative>(self.value)) { \ + return scalar_operator( \ + other 
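+          /* Example (editor's sketch): this rvalue overload folds two
+             constants right here; otherwise (below) it steals self's
+             generator rather than copying it, so chained constant
+             expressions never allocate a callback:
+
+               auto s = 1.0 / (2.0 * cudaq::scalar_operator(4.0));  // 0.125
+          */ \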
op std::get>(self.value)); \ + } \ + auto newGenerator = \ + [other, generator = std::move(std::get(self.value))]( \ + const std::unordered_map> ¶meters) { \ + return other op generator(parameters); \ + }; \ + self.value = std::move(newGenerator); \ + return std::move(self); \ + } \ + +ARITHMETIC_OPERATIONS_REVERSE(*, double); +ARITHMETIC_OPERATIONS_REVERSE(/, double); +ARITHMETIC_OPERATIONS_REVERSE(+, double); +ARITHMETIC_OPERATIONS_REVERSE(-, double); +ARITHMETIC_OPERATIONS_REVERSE(*, std::complex); +ARITHMETIC_OPERATIONS_REVERSE(/, std::complex); +ARITHMETIC_OPERATIONS_REVERSE(+, std::complex); +ARITHMETIC_OPERATIONS_REVERSE(-, std::complex); + +} // namespace cudaq \ No newline at end of file diff --git a/runtime/cudaq/dynamics/schedule.cpp b/runtime/cudaq/dynamics/schedule.cpp new file mode 100644 index 0000000000..85773e8017 --- /dev/null +++ b/runtime/cudaq/dynamics/schedule.cpp @@ -0,0 +1,33 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "cudaq/schedule.h" +#include +#include + +namespace cudaq { + +// Constructor +Schedule::Schedule( + const std::vector &steps, + const std::vector ¶meters, + std::function(const std::string &, double)> + value_function) + : _steps(steps), _parameters(parameters), _value_function(value_function) { + if (!_value_function) { + _value_function = [&](const std::string ¶mName, + double value) -> std::complex { + if (std::find(_parameters.begin(), _parameters.end(), paramName) == + _parameters.end()) + throw std::runtime_error("Unknown parameter named " + paramName); + + return value; + }; + } +} +} // namespace cudaq diff --git a/runtime/cudaq/dynamics/spin_operators.cpp b/runtime/cudaq/dynamics/spin_operators.cpp new file mode 100644 index 0000000000..b7add7efc2 --- /dev/null +++ b/runtime/cudaq/dynamics/spin_operators.cpp @@ -0,0 +1,131 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
diff --git a/runtime/cudaq/dynamics/spin_operators.cpp b/runtime/cudaq/dynamics/spin_operators.cpp
new file mode 100644
index 0000000000..b7add7efc2
--- /dev/null
+++ b/runtime/cudaq/dynamics/spin_operators.cpp
@@ -0,0 +1,131 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#include <cassert>
+#include <complex>
+#include <stdexcept>
+#include <string>
+
+#include "cudaq/utils/tensor.h"
+#include "spin_operators.h"
+
+namespace cudaq {
+
+// private helpers
+
+std::string spin_operator::op_code_to_string() const {
+  if (this->op_code == 1)
+    return "Z";
+  else if (this->op_code == 2)
+    return "X";
+  else if (this->op_code == 3)
+    return "Y";
+  else
+    return "I";
+}
+
+std::complex<double> spin_operator::inplace_mult(const spin_operator &other) {
+  assert(this->target == other.target);
+  std::complex<double> factor;
+  if (this->op_code == 0 || other.op_code == 0 ||
+      this->op_code == other.op_code)
+    factor = 1.0;
+  else if (this->op_code + 1 == other.op_code ||
+           this->op_code - 2 == other.op_code)
+    factor = std::complex<double>(0., 1.);
+  else
+    factor = std::complex<double>(0., -1.);
+  this->op_code ^= other.op_code;
+  return factor;
+}
+
+// read-only properties
+
+std::string spin_operator::unique_id() const {
+  return this->op_code_to_string() + std::to_string(target);
+}
+
+std::vector<int> spin_operator::degrees() const { return {this->target}; }
+
+// constructors
+
+spin_operator::spin_operator(int target) : op_code(0), target(target) {}
+
+spin_operator::spin_operator(int target, int op_id)
+    : op_code(op_id), target(target) {
+  assert(0 <= op_id && op_id < 4);
+}
+
+// evaluations
+
+matrix_2 spin_operator::to_matrix(
+    std::unordered_map<int, int> &dimensions,
+    const std::unordered_map<std::string, std::complex<double>> &parameters)
+    const {
+  auto it = dimensions.find(this->target);
+  if (it == dimensions.end())
+    dimensions[this->target] = 2;
+  else if (it->second != 2)
+    throw std::runtime_error("dimension for spin operator must be 2");
+
+  auto mat = matrix_2(2, 2);
+  if (this->op_code == 1) { // Z
+    mat[{0, 0}] = 1.0;
+    mat[{1, 1}] = -1.0;
+  } else if (this->op_code == 2) { // X
+    mat[{0, 1}] = 1.0;
+    mat[{1, 0}] = 1.0;
+  } else if (this->op_code == 3) { // Y
+    mat[{0, 1}] = std::complex<double>(0., -1.);
+    mat[{1, 0}] = std::complex<double>(0., 1.);
+  } else { // I
+    mat[{0, 0}] = 1.0;
+    mat[{1, 1}] = 1.0;
+  }
+  return mat;
+}
+
+std::string spin_operator::to_string(bool include_degrees) const {
+  if (include_degrees)
+    return this->op_code_to_string() + "(" + std::to_string(target) + ")";
+  else
+    return this->op_code_to_string();
+}
+
+// comparisons
+
+bool spin_operator::operator==(const spin_operator &other) const {
+  return this->op_code == other.op_code && this->target == other.target;
+}
+
+// defined operators
+
+operator_sum<spin_operator> spin_operator::empty() {
+  return operator_handler::empty<spin_operator>();
+}
+
+product_operator<spin_operator> spin_operator::identity() {
+  return operator_handler::identity<spin_operator>();
+}
+
+product_operator<spin_operator> spin_operator::i(int degree) {
+  return product_operator<spin_operator>(spin_operator(degree));
+}
+
+product_operator<spin_operator> spin_operator::z(int degree) {
+  return product_operator<spin_operator>(spin_operator(degree, 1));
+}
+
+product_operator<spin_operator> spin_operator::x(int degree) {
+  return product_operator<spin_operator>(spin_operator(degree, 2));
+}
+
+product_operator<spin_operator> spin_operator::y(int degree) {
+  return product_operator<spin_operator>(spin_operator(degree, 3));
+}
+
+} // namespace cudaq
\ No newline at end of file
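The XOR encoding in inplace_mult reproduces the Pauli multiplication table together with its phases, e.g. op codes 1 (Z) ^ 2 (X) = 3 (Y) with factor i, matching Z * X = iY. A small sketch exercising only functions defined in this file (the dimension map is filled in by to_matrix):

  auto zx = cudaq::spin_operator::z(0) * cudaq::spin_operator::x(0);
  std::unordered_map<int, int> dims;
  auto mat = zx.to_matrix(dims); // dims becomes {0: 2}; mat represents i * Y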
diff --git a/runtime/cudaq/dynamics/spin_operators.h b/runtime/cudaq/dynamics/spin_operators.h
new file mode 100644
index 0000000000..7fe850ef7b
--- /dev/null
+++ b/runtime/cudaq/dynamics/spin_operators.h
@@ -0,0 +1,85 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.    *
+ ******************************************************************************/
+
+#pragma once
+
+#include <complex>
+#include <unordered_map>
+#include <vector>
+
+#include "cudaq/operators.h"
+#include "cudaq/utils/tensor.h"
+
+namespace cudaq {
+
+template <typename T>
+class product_operator;
+
+// FIXME: rename to spin ...
+class spin_operator : public operator_handler {
+  template <typename T>
+  friend class product_operator;
+
+private:
+  // I = 0, Z = 1, X = 2, Y = 3
+  int op_code;
+  int target;
+
+  spin_operator(int target, int op_code);
+
+  // private helpers
+
+  std::string op_code_to_string() const;
+
+  std::complex<double> inplace_mult(const spin_operator &other);
+
+public:
+  // read-only properties
+
+  virtual std::string unique_id() const;
+
+  /// @brief The degrees of freedom that the operator acts on in canonical
+  /// order.
+  virtual std::vector<int> degrees() const;
+
+  // constructors and destructors
+
+  spin_operator(int target);
+
+  ~spin_operator() = default;
+
+  // evaluations
+
+  /// @brief Return the `spin_operator` as a matrix.
+  /// @arg `dimensions` : A map specifying the number of levels,
+  ///      that is, the dimension of each degree of freedom
+  ///      that the operator acts on. Example for two, 2-level
+  ///      degrees of freedom: `{0 : 2, 1 : 2}`.
+  virtual matrix_2
+  to_matrix(std::unordered_map<int, int> &dimensions,
+            const std::unordered_map<std::string, std::complex<double>>
+                &parameters = {}) const;
+
+  virtual std::string to_string(bool include_degrees) const;
+
+  // comparisons
+
+  bool operator==(const spin_operator &other) const;
+
+  // defined operators
+
+  static operator_sum<spin_operator> empty();
+  static product_operator<spin_operator> identity();
+
+  static product_operator<spin_operator> i(int degree);
+  static product_operator<spin_operator> z(int degree);
+  static product_operator<spin_operator> x(int degree);
+  static product_operator<spin_operator> y(int degree);
+};
+
+} // namespace cudaq
\ No newline at end of file
diff --git a/runtime/cudaq/dynamics/templates.h b/runtime/cudaq/dynamics/templates.h
new file mode 100644
index 0000000000..a2a7218dd3
--- /dev/null
+++ b/runtime/cudaq/dynamics/templates.h
@@ -0,0 +1,369 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.
* + ******************************************************************************/ + +#pragma once + +#include +#include + +#include "boson_operators.h" +#include "matrix_operators.h" +#include "operator_leafs.h" +#include "spin_operators.h" + +namespace cudaq { + +template +class product_operator; + +template +class operator_sum; + +#define TYPE_CONVERSION_CONSTRAINT(LHtype, RHtype) \ + std::enable_if_t::value && \ + !std::is_same::value && \ + std::is_base_of::value && \ + std::is_base_of::value, \ + bool> + +template +product_operator operator*(double other, + const product_operator &self); +template +product_operator operator*(double other, + product_operator &&self); +template +operator_sum operator+(double other, + const product_operator &self); +template +operator_sum operator+(double other, + product_operator &&self); +template +operator_sum operator-(double other, + const product_operator &self); +template +operator_sum operator-(double other, + product_operator &&self); +template +product_operator operator*(std::complex other, + const product_operator &self); +template +product_operator operator*(std::complex other, + product_operator &&self); +template +operator_sum operator+(std::complex other, + const product_operator &self); +template +operator_sum operator+(std::complex other, + product_operator &&self); +template +operator_sum operator-(std::complex other, + const product_operator &self); +template +operator_sum operator-(std::complex other, + product_operator &&self); +template +product_operator operator*(const scalar_operator &other, + const product_operator &self); +template +product_operator operator*(const scalar_operator &other, + product_operator &&self); +template +operator_sum operator+(const scalar_operator &other, + const product_operator &self); +template +operator_sum operator+(const scalar_operator &other, + product_operator &&self); +template +operator_sum operator-(const scalar_operator &other, + const product_operator &self); +template +operator_sum operator-(const scalar_operator &other, + product_operator &&self); + +template +product_operator +operator*(const product_operator &other, + const product_operator &self); +template +operator_sum operator+(const product_operator &other, + const product_operator &self); +template +operator_sum operator-(const product_operator &other, + const product_operator &self); + +template +operator_sum operator*(double other, + const operator_sum &self); +template +operator_sum operator*(double other, operator_sum &&self); +template +operator_sum operator+(double other, + const operator_sum &self); +template +operator_sum operator+(double other, operator_sum &&self); +template +operator_sum operator-(double other, + const operator_sum &self); +template +operator_sum operator-(double other, operator_sum &&self); +template +operator_sum operator*(std::complex other, + const operator_sum &self); +template +operator_sum operator*(std::complex other, + operator_sum &&self); +template +operator_sum operator+(std::complex other, + const operator_sum &self); +template +operator_sum operator+(std::complex other, + operator_sum &&self); +template +operator_sum operator-(std::complex other, + const operator_sum &self); +template +operator_sum operator-(std::complex other, + operator_sum &&self); +template +operator_sum operator*(const scalar_operator &other, + const operator_sum &self); +template +operator_sum operator*(const scalar_operator &other, + operator_sum &&self); +template +operator_sum operator+(const scalar_operator 
&other, + const operator_sum &self); +template +operator_sum operator+(const scalar_operator &other, + operator_sum &&self); +template +operator_sum operator-(const scalar_operator &other, + const operator_sum &self); +template +operator_sum operator-(const scalar_operator &other, + operator_sum &&self); + +template +operator_sum operator*(const operator_sum &other, + const product_operator &self); +template +operator_sum operator+(const operator_sum &other, + const product_operator &self); +template +operator_sum operator-(const operator_sum &other, + const product_operator &self); +template +operator_sum operator*(const product_operator &other, + const operator_sum &self); +template +operator_sum operator+(const product_operator &other, + const operator_sum &self); +template +operator_sum operator-(const product_operator &other, + const operator_sum &self); +template +operator_sum operator*(const operator_sum &other, + const operator_sum &self); +template +operator_sum operator+(const operator_sum &other, + const operator_sum &self); +template +operator_sum operator-(const operator_sum &other, + const operator_sum &self); + +template +operator_sum operator*(const operator_sum &other, + const product_operator &self); +template +operator_sum operator+(const operator_sum &other, + const product_operator &self); +template +operator_sum operator-(const operator_sum &other, + const product_operator &self); +template +operator_sum operator*(const product_operator &other, + const operator_sum &self); +template +operator_sum operator+(const product_operator &other, + const operator_sum &self); +template +operator_sum operator-(const product_operator &other, + const operator_sum &self); +template +operator_sum operator*(const operator_sum &other, + const operator_sum &self); +template +operator_sum operator+(const operator_sum &other, + const operator_sum &self); +template +operator_sum operator-(const operator_sum &other, + const operator_sum &self); + +#ifndef CUDAQ_INSTANTIATE_TEMPLATES +#define EXTERN_TEMPLATE_SPECIALIZATIONS(HandlerTy) \ + \ + extern template product_operator operator*( \ + double other, const product_operator &self); \ + extern template product_operator operator*( \ + double other, product_operator &&self); \ + extern template operator_sum operator+( \ + double other, const product_operator &self); \ + extern template operator_sum operator+( \ + double other, product_operator &&self); \ + extern template operator_sum operator-( \ + double other, const product_operator &self); \ + extern template operator_sum operator-( \ + double other, product_operator &&self); \ + extern template product_operator operator*( \ + std::complex other, const product_operator &self); \ + extern template product_operator operator*( \ + std::complex other, product_operator &&self); \ + extern template operator_sum operator+( \ + std::complex other, const product_operator &self); \ + extern template operator_sum operator+( \ + std::complex other, product_operator &&self); \ + extern template operator_sum operator-( \ + std::complex other, const product_operator &self); \ + extern template operator_sum operator-( \ + std::complex other, product_operator &&self); \ + extern template product_operator operator*( \ + const scalar_operator &other, const product_operator &self); \ + extern template product_operator operator*( \ + const scalar_operator &other, product_operator &&self); \ + extern template operator_sum operator+( \ + const scalar_operator &other, const product_operator &self); \ + extern 
template operator_sum operator+( \ + const scalar_operator &other, product_operator &&self); \ + extern template operator_sum operator-( \ + const scalar_operator &other, const product_operator &self); \ + extern template operator_sum operator-( \ + const scalar_operator &other, product_operator &&self); \ + \ + extern template operator_sum operator*( \ + double other, const operator_sum &self); \ + extern template operator_sum operator*( \ + double other, operator_sum &&self); \ + extern template operator_sum operator+( \ + double other, const operator_sum &self); \ + extern template operator_sum operator+( \ + double other, operator_sum &&self); \ + extern template operator_sum operator-( \ + double other, const operator_sum &self); \ + extern template operator_sum operator-( \ + double other, operator_sum &&self); \ + extern template operator_sum operator*( \ + std::complex other, const operator_sum &self); \ + extern template operator_sum operator*( \ + std::complex other, operator_sum &&self); \ + extern template operator_sum operator+( \ + std::complex other, const operator_sum &self); \ + extern template operator_sum operator+( \ + std::complex other, operator_sum &&self); \ + extern template operator_sum operator-( \ + std::complex other, const operator_sum &self); \ + extern template operator_sum operator-( \ + std::complex other, operator_sum &&self); \ + extern template operator_sum operator*( \ + const scalar_operator &other, const operator_sum &self); \ + extern template operator_sum operator*( \ + const scalar_operator &other, operator_sum &&self); \ + extern template operator_sum operator+( \ + const scalar_operator &other, const operator_sum &self); \ + extern template operator_sum operator+( \ + const scalar_operator &other, operator_sum &&self); \ + extern template operator_sum operator-( \ + const scalar_operator &other, const operator_sum &self); \ + extern template operator_sum operator-( \ + const scalar_operator &other, operator_sum &&self); + +EXTERN_TEMPLATE_SPECIALIZATIONS(matrix_operator); +EXTERN_TEMPLATE_SPECIALIZATIONS(spin_operator); +EXTERN_TEMPLATE_SPECIALIZATIONS(boson_operator); + +#define EXTERN_CONVERSION_TEMPLATE_SPECIALIZATIONS(op, returnTy) \ + \ + extern template returnTy operator op( \ + const product_operator &other, \ + const product_operator &self); \ + extern template returnTy operator op( \ + const product_operator &other, \ + const product_operator &self); \ + extern template returnTy operator op( \ + const product_operator &other, \ + const product_operator &self); \ + extern template returnTy operator op( \ + const product_operator &other, \ + const product_operator &self); \ + \ + extern template operator_sum operator op( \ + const operator_sum &other, \ + const product_operator &self); \ + extern template operator_sum operator op( \ + const operator_sum &other, \ + const product_operator &self); \ + extern template operator_sum operator op( \ + const operator_sum &other, \ + const product_operator &self); \ + extern template operator_sum operator op( \ + const operator_sum &other, \ + const product_operator &self); \ + \ + extern template operator_sum operator op( \ + const product_operator &other, \ + const operator_sum &self); \ + extern template operator_sum operator op( \ + const product_operator &other, \ + const operator_sum &self); \ + extern template operator_sum operator op( \ + const product_operator &other, \ + const operator_sum &self); \ + extern template operator_sum operator op( \ + const product_operator &other, \ + const 
operator_sum &self); \ + \ + extern template operator_sum operator op( \ + const operator_sum &other, \ + const operator_sum &self); \ + extern template operator_sum operator op( \ + const operator_sum &other, \ + const operator_sum &self); \ + extern template operator_sum operator op( \ + const operator_sum &other, \ + const operator_sum &self); \ + extern template operator_sum operator op( \ + const operator_sum &other, \ + const operator_sum &self); + +EXTERN_CONVERSION_TEMPLATE_SPECIALIZATIONS(*, product_operator); +EXTERN_CONVERSION_TEMPLATE_SPECIALIZATIONS(+, operator_sum); +EXTERN_CONVERSION_TEMPLATE_SPECIALIZATIONS(-, operator_sum); +#endif + +} // namespace cudaq \ No newline at end of file diff --git a/runtime/cudaq/dynamics_integrators.h b/runtime/cudaq/dynamics_integrators.h new file mode 100644 index 0000000000..c606f16472 --- /dev/null +++ b/runtime/cudaq/dynamics_integrators.h @@ -0,0 +1,45 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include "cudaq/base_integrator.h" +#include "cudaq/base_time_stepper.h" +#include "cudaq/operators.h" +#include + +namespace cudaq { +struct SystemDynamics { + operator_sum *hamiltonian = nullptr; + std::vector> collapseOps; + std::vector modeExtents; + std::unordered_map> parameters; +}; + +class runge_kutta : public BaseIntegrator { + +public: + std::optional order; + std::optional dt; + +public: + runge_kutta() = default; + void integrate(double target_time) override; + void set_state(cudaq::state initial_state, double t0) override; + std::pair get_state() override; + void set_system(const SystemDynamics &system, + const cudaq::Schedule &schedule); + +private: + double m_t; + std::shared_ptr m_state; + SystemDynamics m_system; + std::unique_ptr m_stepper; + cudaq::Schedule m_schedule; +}; +} // namespace cudaq diff --git a/runtime/cudaq/evolution.h b/runtime/cudaq/evolution.h new file mode 100644 index 0000000000..54d1042086 --- /dev/null +++ b/runtime/cudaq/evolution.h @@ -0,0 +1,93 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include "common/EvolveResult.h" +#include "cudaq/base_integrator.h" +#include "cudaq/operators.h" +#include "cudaq/schedule.h" +#include "cudaq/utils/tensor.h" + +#include +#include +#include +#include + +namespace cudaq { + +evolve_result evolve_single( + const operator_sum &hamiltonian, + const std::map &dimensions, const Schedule &schedule, + const state &initial_state, BaseIntegrator &integrator, + const std::vector> + &collapse_operators = {}, + const std::vector> &observables = {}, + bool store_intermediate_results = false, + std::optional shots_count = std::nullopt); +// class Evolution { +// public: +// /// Computes the Taylor series expansion of the matrix exponential. 
+// static matrix_2 taylor_series_expm(const matrix_2 &op_matrix, int order = +// 20); + +// /// Computes the evolution step matrix +// static matrix_2 compute_step_matrix( +// const operator_sum &hamiltonian, const std::map &dimensions, +// const std::map> ¶meters, double +// dt, bool use_gpu = false); + +// /// Adds noise channels based on collapse operators. +// static void add_noise_channel_for_step( +// const std::string &step_kernel_name, cudaq::noise_model &noise_model, +// const std::vector &collapse_operators, +// const std::map &dimensions, +// const std::map> ¶meters, double +// dt); + +// /// Launches an analog Hamiltonian kernel for quantum simulations. +// static evolve_result launch_analog_hamiltonian_kernel( +// const std::string &target_name, const rydberg_hamiltonian &hamiltonian, +// const Schedule &schedule, int shots_count, bool is_async = false); + +// /// Generates evolution kernels for the simulation. +// static std::vector evolution_kernel( +// int num_qubits, +// const std::function< +// matrix_2(const std::map> &, +// double)> &compute_step_matrix, +// const std::vector tlist, +// const std::vector>> +// &schedule_parameters); + +// /// Evolves a single quantum state under a given `hamiltonian`. +// static evolve_result +// evolve_single(const operator_sum &hamiltonian, +// const std::map &dimensions, +// const std::shared_ptr &schedule, state +// initial_state, const std::vector +// &collapse_operators = {}, const std::vector +// &observables = {}, bool store_intermediate_results = false, +// std::shared_ptr> integrator = nullptr, +// std::optional shots_count = std::nullopt); + +// /// Evolves a single or multiple quantum states under a given +// `hamiltonian`. +// /// Run only for dynamics target else throw error +// static std::vector +// evolve(const operator_sum &hamiltonian, const std::map +// &dimensions, +// const std::shared_ptr &schedule, +// const std::vector &initial_states, +// const std::vector &collapse_operators = {}, +// const std::vector &observables = {}, +// bool store_intermediate_results = false, +// std::shared_ptr> integrator = nullptr, +// std::optional shots_count = std::nullopt); +// }; +} // namespace cudaq \ No newline at end of file diff --git a/runtime/cudaq/operators.h b/runtime/cudaq/operators.h new file mode 100644 index 0000000000..06f12fd982 --- /dev/null +++ b/runtime/cudaq/operators.h @@ -0,0 +1,700 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include +#include +#include + +#include "dynamics/operator_leafs.h" +#include "dynamics/templates.h" +#include "utils/tensor.h" + +namespace cudaq { + +class MatrixArithmetics; +class EvaluatedMatrix; + +/// @brief Represents an operator expression consisting of a sum of terms, where +/// each term is a product of elementary and scalar operators. Operator +/// expressions cannot be used within quantum kernels, but they provide methods +/// to convert them to data types that can. 
+template +class operator_sum { + template + friend class operator_sum; + template + friend class product_operator; + +private: + // inserts a new term combining it with an existing one if possible + void insert(product_operator &&other); + void insert(const product_operator &other); + + void aggregate_terms(); + + template + void aggregate_terms(product_operator &&head, Args &&...args); + + EvaluatedMatrix m_evaluate(MatrixArithmetics arithmetics, + bool pad_terms = true) const; + +protected: + std::unordered_map + term_map; // quick access to term index given its id (used for aggregating + // terms) + std::vector> terms; + std::vector coefficients; + + template , Args>...>::value, + bool> = true> + operator_sum(Args &&...args); + +public: + // read-only properties + + /// @brief The degrees of freedom that the operator acts on in canonical + /// order. + std::vector degrees() const; + + /// @brief Return the number of operator terms that make up this operator sum. + int num_terms() const; + + /// FIXME: GET RID OF THIS (MAKE ITERABLE INSTEAD) + std::vector> get_terms() const; + + // constructors and destructors + + operator_sum(const product_operator &other); + + template ::value && + std::is_constructible::value, + bool> = true> + operator_sum(const operator_sum &other); + + // copy constructor + operator_sum(const operator_sum &other, int size = 0); + + // move constructor + operator_sum(operator_sum &&other, int size = 0); + + ~operator_sum() = default; + + // assignments + + template ::value && + std::is_constructible::value, + bool> = true> + operator_sum &operator=(const product_operator &other); + + operator_sum &operator=(const product_operator &other); + + operator_sum &operator=(product_operator &&other); + + template ::value && + std::is_constructible::value, + bool> = true> + operator_sum &operator=(const operator_sum &other); + + // assignment operator + operator_sum &operator=(const operator_sum &other); + + // move assignment operator + operator_sum &operator=(operator_sum &&other); + + // evaluations + + /// @brief Return the operator_sum as a string. + std::string to_string() const; + + /// @brief Return the `operator_sum` as a matrix. + /// @arg `dimensions` : A mapping that specifies the number of levels, + /// that is, the dimension of each degree of freedom + /// that the operator acts on. Example for two, 2-level + /// degrees of freedom: `{0:2, 1:2}`. + /// @arg `parameters` : A map of the parameter names to their concrete, + /// complex values. 
+ matrix_2 to_matrix(std::unordered_map dimensions = {}, + const std::unordered_map> + ¶meters = {}) const; + + // unary operators + + operator_sum operator-() const &; + operator_sum operator-() &&; + operator_sum operator+() const &; + operator_sum operator+() &&; + + // right-hand arithmetics + + operator_sum operator*(double other) const &; + operator_sum operator*(double other) &&; + operator_sum operator+(double other) const &; + operator_sum operator+(double other) &&; + operator_sum operator-(double other) const &; + operator_sum operator-(double other) &&; + operator_sum operator*(std::complex other) const &; + operator_sum operator*(std::complex other) &&; + operator_sum operator+(std::complex other) const &; + operator_sum operator+(std::complex other) &&; + operator_sum operator-(std::complex other) const &; + operator_sum operator-(std::complex other) &&; + operator_sum operator*(const scalar_operator &other) const &; + operator_sum operator*(const scalar_operator &other) &&; + operator_sum operator+(const scalar_operator &other) const &; + operator_sum operator+(const scalar_operator &other) &&; + operator_sum operator-(const scalar_operator &other) const &; + operator_sum operator-(const scalar_operator &other) &&; + operator_sum + operator*(const product_operator &other) const; + operator_sum + operator+(const product_operator &other) const &; + operator_sum + operator+(const product_operator &other) &&; + operator_sum + operator+(product_operator &&other) const &; + operator_sum operator+(product_operator &&other) &&; + operator_sum + operator-(const product_operator &other) const &; + operator_sum + operator-(const product_operator &other) &&; + operator_sum + operator-(product_operator &&other) const &; + operator_sum operator-(product_operator &&other) &&; + operator_sum operator*(const operator_sum &other) const; + operator_sum + operator+(const operator_sum &other) const &; + operator_sum operator+(const operator_sum &other) &&; + operator_sum operator+(operator_sum &&other) const &; + operator_sum operator+(operator_sum &&other) &&; + operator_sum + operator-(const operator_sum &other) const &; + operator_sum operator-(const operator_sum &other) &&; + operator_sum operator-(operator_sum &&other) const &; + operator_sum operator-(operator_sum &&other) &&; + + operator_sum &operator*=(double other); + operator_sum &operator+=(double other); + operator_sum &operator-=(double other); + operator_sum &operator*=(std::complex other); + operator_sum &operator+=(std::complex other); + operator_sum &operator-=(std::complex other); + operator_sum &operator*=(const scalar_operator &other); + operator_sum &operator+=(const scalar_operator &other); + operator_sum &operator-=(const scalar_operator &other); + operator_sum &operator*=(const product_operator &other); + operator_sum &operator+=(const product_operator &other); + operator_sum &operator+=(product_operator &&other); + operator_sum &operator-=(const product_operator &other); + operator_sum &operator-=(product_operator &&other); + operator_sum &operator*=(const operator_sum &other); + operator_sum &operator+=(const operator_sum &other); + operator_sum &operator+=(operator_sum &&other); + operator_sum &operator-=(const operator_sum &other); + operator_sum &operator-=(operator_sum &&other); + + // left-hand arithmetics + + // Being a bit permissive here, since otherwise the explicit template + // instantiation is a nightmare. 
+ template + friend operator_sum operator*(double other, const operator_sum &self); + template + friend operator_sum operator*(double other, operator_sum &&self); + template + friend operator_sum operator+(double other, const operator_sum &self); + template + friend operator_sum operator+(double other, operator_sum &&self); + template + friend operator_sum operator-(double other, const operator_sum &self); + template + friend operator_sum operator-(double other, operator_sum &&self); + template + friend operator_sum operator*(std::complex other, + const operator_sum &self); + template + friend operator_sum operator*(std::complex other, + operator_sum &&self); + template + friend operator_sum operator+(std::complex other, + const operator_sum &self); + template + friend operator_sum operator+(std::complex other, + operator_sum &&self); + template + friend operator_sum operator-(std::complex other, + const operator_sum &self); + template + friend operator_sum operator-(std::complex other, + operator_sum &&self); + template + friend operator_sum operator*(const scalar_operator &other, + const operator_sum &self); + template + friend operator_sum operator*(const scalar_operator &other, + operator_sum &&self); + template + friend operator_sum operator+(const scalar_operator &other, + const operator_sum &self); + template + friend operator_sum operator+(const scalar_operator &other, + operator_sum &&self); + template + friend operator_sum operator-(const scalar_operator &other, + const operator_sum &self); + template + friend operator_sum operator-(const scalar_operator &other, + operator_sum &&self); + + template + friend operator_sum operator+(double other, + const product_operator &self); + template + friend operator_sum operator+(double other, product_operator &&self); + template + friend operator_sum operator-(double other, + const product_operator &self); + template + friend operator_sum operator-(double other, product_operator &&self); + template + friend operator_sum operator+(std::complex other, + const product_operator &self); + template + friend operator_sum operator+(std::complex other, + product_operator &&self); + template + friend operator_sum operator-(std::complex other, + const product_operator &self); + template + friend operator_sum operator-(std::complex other, + product_operator &&self); + template + friend operator_sum operator+(const scalar_operator &other, + const product_operator &self); + template + friend operator_sum operator+(const scalar_operator &other, + product_operator &&self); + template + friend operator_sum operator-(const scalar_operator &other, + const product_operator &self); + template + friend operator_sum operator-(const scalar_operator &other, + product_operator &&self); + + // common operators + + template + friend operator_sum operator_handler::empty(); +}; + +/// @brief Represents an operator expression consisting of a product of +/// elementary and scalar operators. Operator expressions cannot be used within +/// quantum kernels, but they provide methods to convert them to data types +/// that can. 
+template +class product_operator { + template + friend class product_operator; + template + friend class operator_sum; + +private: + // template defined as long as T implements an in-place multiplication - + // won't work if the in-place multiplication was inherited from a base class + template + static decltype(std::declval().inplace_mult(std::declval())) + handler_mult(int); + template + static std::false_type handler_mult( + ...); // ellipsis ensures the template above is picked if it exists + static constexpr bool supports_inplace_mult = + !std::is_same(0)), + std::false_type>::value; + +#if !defined(NDEBUG) + bool is_canonicalized() const; +#endif + + typename std::vector::const_iterator + find_insert_at(const HandlerTy &other) const; + + template ::value && + !product_operator::supports_inplace_mult, + std::false_type> = std::false_type()> + void insert(T &&other); + + template ::value && + product_operator::supports_inplace_mult, + std::true_type> = std::true_type()> + void insert(T &&other); + + std::string get_term_id() const; + + void aggregate_terms(); + + template + void aggregate_terms(HandlerTy &&head, Args &&...args); + + EvaluatedMatrix m_evaluate(MatrixArithmetics arithmetics, + bool pad_terms = true) const; + +protected: + std::vector operators; + scalar_operator coefficient; + + template ...>::value, + bool> = true> + product_operator(scalar_operator coefficient, Args &&...args); + + // keep this constructor protected (otherwise it needs to ensure canonical + // order) + product_operator(scalar_operator coefficient, + const std::vector &atomic_operators, + int size = 0); + + // keep this constructor protected (otherwise it needs to ensure canonical + // order) + product_operator(scalar_operator coefficient, + std::vector &&atomic_operators, int size = 0); + +public: + // read-only properties + + /// @brief The degrees of freedom that the operator acts on in canonical + /// order. + std::vector degrees() const; + + /// @brief Return the number of operator terms that make up this product + /// operator. + int num_terms() const; + + /// FIXME: GET RID OF THIS (MAKE ITERABLE INSTEAD) + const std::vector &get_terms() const; + + scalar_operator get_coefficient() const; + + // constructors and destructors + + product_operator(double coefficient); + + product_operator(HandlerTy &&atomic); + + template ::value && + std::is_constructible::value, + bool> = true> + product_operator(const product_operator &other); + + // copy constructor + product_operator(const product_operator &other, int size = 0); + + // move constructor + product_operator(product_operator &&other, int size = 0); + + ~product_operator() = default; + + // assignments + + template ::value && + std::is_constructible::value, + bool> = true> + product_operator &operator=(const product_operator &other); + + // assignment operator + product_operator & + operator=(const product_operator &other); + + // move assignment operator + product_operator &operator=(product_operator &&other); + + // evaluations + + /// @brief Return the `product_operator` as a string. + std::string to_string() const; + + /// @brief Return the `operator_sum` as a matrix. + /// @arg `dimensions` : A mapping that specifies the number of levels, + /// that is, the dimension of each degree of freedom + /// that the operator acts on. Example for two, 2-level + /// degrees of freedom: `{0:2, 1:2}`. + /// @arg `parameters` : A map of the parameter names to their concrete, + /// complex values. 
+ matrix_2 to_matrix(std::unordered_map dimensions = {}, + const std::unordered_map> + ¶meters = {}) const; + + // comparisons + + /// @brief True, if the other value is an operator_sum with + /// equivalent terms, + /// and False otherwise. The equality takes into account that operator + /// addition is commutative, as is the product of two operators if they + /// act on different degrees of freedom. + /// The equality comparison does *not* take commutation relations into + /// account, and does not try to reorder terms `blockwise`; it may hence + /// evaluate to False, even if two operators in reality are the same. + /// If the equality evaluates to True, on the other hand, the operators + /// are guaranteed to represent the same transformation for all arguments. + bool operator==(const product_operator &other) const; + + // unary operators + + product_operator operator-() const &; + product_operator operator-() &&; + product_operator operator+() const &; + product_operator operator+() &&; + + // right-hand arithmetics + + product_operator operator*(double other) const &; + product_operator operator*(double other) &&; + operator_sum operator+(double other) const &; + operator_sum operator+(double other) &&; + operator_sum operator-(double other) const &; + operator_sum operator-(double other) &&; + product_operator operator*(std::complex other) const &; + product_operator operator*(std::complex other) &&; + operator_sum operator+(std::complex other) const &; + operator_sum operator+(std::complex other) &&; + operator_sum operator-(std::complex other) const &; + operator_sum operator-(std::complex other) &&; + product_operator operator*(const scalar_operator &other) const &; + product_operator operator*(const scalar_operator &other) &&; + operator_sum operator+(const scalar_operator &other) const &; + operator_sum operator+(const scalar_operator &other) &&; + operator_sum operator-(const scalar_operator &other) const &; + operator_sum operator-(const scalar_operator &other) &&; + product_operator + operator*(const product_operator &other) const &; + product_operator + operator*(const product_operator &other) &&; + product_operator + operator*(product_operator &&other) const &; + product_operator operator*(product_operator &&other) &&; + operator_sum + operator+(const product_operator &other) const &; + operator_sum + operator+(const product_operator &other) &&; + operator_sum + operator+(product_operator &&other) const &; + operator_sum operator+(product_operator &&other) &&; + operator_sum + operator-(const product_operator &other) const &; + operator_sum + operator-(const product_operator &other) &&; + operator_sum + operator-(product_operator &&other) const &; + operator_sum operator-(product_operator &&other) &&; + operator_sum operator*(const operator_sum &other) const; + operator_sum + operator+(const operator_sum &other) const &; + operator_sum operator+(const operator_sum &other) &&; + operator_sum operator+(operator_sum &&other) const &; + operator_sum operator+(operator_sum &&other) &&; + operator_sum + operator-(const operator_sum &other) const &; + operator_sum operator-(const operator_sum &other) &&; + operator_sum operator-(operator_sum &&other) const &; + operator_sum operator-(operator_sum &&other) &&; + + product_operator &operator*=(double other); + product_operator &operator*=(std::complex other); + product_operator &operator*=(const scalar_operator &other); + product_operator & + operator*=(const product_operator &other); + product_operator &operator*=(product_operator 
&&other); + + // left-hand arithmetics + + // Being a bit permissive here, since otherwise the explicit template + // instantiation is a nightmare. + template + friend product_operator operator*(double other, + const product_operator &self); + template + friend product_operator operator*(double other, + product_operator &&self); + template + friend operator_sum operator+(double other, + const product_operator &self); + template + friend operator_sum operator+(double other, product_operator &&self); + template + friend operator_sum operator-(double other, + const product_operator &self); + template + friend operator_sum operator-(double other, product_operator &&self); + template + friend product_operator operator*(std::complex other, + const product_operator &self); + template + friend product_operator operator*(std::complex other, + product_operator &&self); + template + friend operator_sum operator+(std::complex other, + const product_operator &self); + template + friend operator_sum operator+(std::complex other, + product_operator &&self); + template + friend operator_sum operator-(std::complex other, + const product_operator &self); + template + friend operator_sum operator-(std::complex other, + product_operator &&self); + template + friend product_operator operator*(const scalar_operator &other, + const product_operator &self); + template + friend product_operator operator*(const scalar_operator &other, + product_operator &&self); + template + friend operator_sum operator+(const scalar_operator &other, + const product_operator &self); + template + friend operator_sum operator+(const scalar_operator &other, + product_operator &&self); + template + friend operator_sum operator-(const scalar_operator &other, + const product_operator &self); + template + friend operator_sum operator-(const scalar_operator &other, + product_operator &&self); + + template + friend operator_sum operator*(double other, const operator_sum &self); + template + friend operator_sum operator*(double other, operator_sum &&self); + template + friend operator_sum operator+(double other, const operator_sum &self); + template + friend operator_sum operator+(double other, operator_sum &&self); + template + friend operator_sum operator-(double other, const operator_sum &self); + template + friend operator_sum operator-(double other, operator_sum &&self); + template + friend operator_sum operator*(std::complex other, + const operator_sum &self); + template + friend operator_sum operator*(std::complex other, + operator_sum &&self); + template + friend operator_sum operator+(std::complex other, + const operator_sum &self); + template + friend operator_sum operator+(std::complex other, + operator_sum &&self); + template + friend operator_sum operator-(std::complex other, + const operator_sum &self); + template + friend operator_sum operator-(std::complex other, + operator_sum &&self); + template + friend operator_sum operator*(const scalar_operator &other, + const operator_sum &self); + template + friend operator_sum operator*(const scalar_operator &other, + operator_sum &&self); + template + friend operator_sum operator+(const scalar_operator &other, + const operator_sum &self); + template + friend operator_sum operator+(const scalar_operator &other, + operator_sum &&self); + template + friend operator_sum operator-(const scalar_operator &other, + const operator_sum &self); + template + friend operator_sum operator-(const scalar_operator &other, + operator_sum &&self); + + // common operators + + template ...>::value, 
bool>>
+  friend product_operator<T> operator_handler::identity(Args... targets);
+};
+
+#ifndef CUDAQ_INSTANTIATE_TEMPLATES
+extern template class product_operator<matrix_operator>;
+extern template class product_operator<spin_operator>;
+extern template class product_operator<boson_operator>;
+
+extern template class operator_sum<matrix_operator>;
+extern template class operator_sum<spin_operator>;
+extern template class operator_sum<boson_operator>;
+#endif
+
+/// @brief Representation of a time-dependent Hamiltonian for Rydberg system
+class rydberg_hamiltonian {
+public:
+  using Coordinate = std::pair<double, double>;
+
+  /// @brief Constructor.
+  /// @param atom_sites List of 2D coordinates for trap sites.
+  /// @param amplitude Time-dependent driving amplitude, Omega(t).
+  /// @param phase Time-dependent driving phase, phi(t).
+  /// @param delta_global Time-dependent driving detuning, Delta_global(t).
+  /// @param atom_filling Optional. Marks occupied trap sites (1) and empty
+  /// sites (0). Defaults to all sites occupied.
+  /// @param delta_local Optional. A tuple of Delta_local(t) and site-dependent
+  /// local detuning factors.
+  rydberg_hamiltonian(
+      const std::vector<Coordinate> &atom_sites,
+      const scalar_operator &amplitude, const scalar_operator &phase,
+      const scalar_operator &delta_global,
+      const std::vector<int> &atom_filling = {},
+      const std::optional<std::pair<scalar_operator, std::vector<double>>>
+          &delta_local = std::nullopt);
+
+  /// @brief Get atom sites.
+  const std::vector<Coordinate> &get_atom_sites() const;
+
+  /// @brief Get atom filling.
+  const std::vector<int> &get_atom_filling() const;
+
+  /// @brief Get amplitude operator.
+  const scalar_operator &get_amplitude() const;
+
+  /// @brief Get phase operator.
+  const scalar_operator &get_phase() const;
+
+  /// @brief Get global detuning operator.
+  const scalar_operator &get_delta_global() const;
+
+private:
+  std::vector<Coordinate> atom_sites;
+  std::vector<int> atom_filling;
+  scalar_operator amplitude;
+  scalar_operator phase;
+  scalar_operator delta_global;
+  std::optional<std::pair<scalar_operator, std::vector<double>>> delta_local;
+};
+
+#ifdef CUDAQ_INSTANTIATE_TEMPLATES
+template class product_operator<matrix_operator>;
+template class operator_sum<matrix_operator>;
+#else
+extern template class product_operator<matrix_operator>;
+extern template class operator_sum<matrix_operator>;
+#endif
+
+} // namespace cudaq
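A construction sketch for the rydberg_hamiltonian declared above (the coordinates and waveform values are illustrative, and a scalar_operator constructor taking a double is assumed, consistent with its arithmetic overloads):

  std::vector<cudaq::rydberg_hamiltonian::Coordinate> sites = {
      {0.0, 0.0}, {0.0, 5.0e-6}};
  cudaq::scalar_operator omega(1.0), phi(0.0), delta(-2.0);
  cudaq::rydberg_hamiltonian ham(sites, omega, phi, delta);
  // atom_filling defaults to all sites occupied; delta_local stays unset.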
"classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,classical-optimization-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),anyon-%Q_GATE%-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce" # Tell the rest-qpu that we are generating Adaptive QIR. codegen-emission: qir-adaptive # Library mode is only for simulators, physical backends must turn this off diff --git a/runtime/cudaq/platform/default/rest/helpers/braket/BraketExecutor.cpp b/runtime/cudaq/platform/default/rest/helpers/braket/BraketExecutor.cpp index 2ea8eea8ba..66b376d283 100644 --- a/runtime/cudaq/platform/default/rest/helpers/braket/BraketExecutor.cpp +++ b/runtime/cudaq/platform/default/rest/helpers/braket/BraketExecutor.cpp @@ -210,7 +210,7 @@ BraketExecutor::execute(std::vector &codesToExecute, return std::async( std::launch::async, - [this, codesToExecute]( + [this, codesToExecute, isObserve]( std::vector createOutcomes) { std::vector results; @@ -271,9 +271,16 @@ BraketExecutor::execute(std::vector &codesToExecute, auto c = serverHelper->processResults(resultsJson, taskArn); - for (auto ®Name : c.register_names()) { - results.emplace_back(c.to_map(regName), regName); - results.back().sequentialData = c.sequential_data(regName); + if (isObserve) { + // Use the job name instead of the global register. + results.emplace_back(c.to_map(), codesToExecute[i].name); + results.back().sequentialData = c.sequential_data(); + } else { + // For each register, add the results into result. 
+ for (auto ®Name : c.register_names()) { + results.emplace_back(c.to_map(regName), regName); + results.back().sequentialData = c.sequential_data(regName); + } } i++; } diff --git a/runtime/cudaq/platform/default/rest/helpers/braket/braket.yml b/runtime/cudaq/platform/default/rest/helpers/braket/braket.yml index 0c0e3784a7..7e2b573f65 100644 --- a/runtime/cudaq/platform/default/rest/helpers/braket/braket.yml +++ b/runtime/cudaq/platform/default/rest/helpers/braket/braket.yml @@ -17,7 +17,7 @@ config: # Tell NVQ++ to generate glue code to set the target backend name gen-target-backend: true # Define the lowering pipeline - platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,unrolling-pipeline,func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),decomposition{enable-patterns=SToR1,TToR1,R1ToU3,U3ToRotations,CHToCX,CCZToCX,CRzToCX,CRyToCX,CRxToCX,CR1ToCX,RxAdjToRx,RyAdjToRy,RzAdjToRz},quake-to-cc-prep,func.func(expand-control-veqs,combine-quantum-alloc,canonicalize,combine-measurements),symbol-dce" + platform-lowering-config: "classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,classical-optimization-pipeline,func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),decomposition{enable-patterns=SToR1,TToR1,R1ToU3,U3ToRotations,CHToCX,CCZToCX,CRzToCX,CRyToCX,CRxToCX,CR1ToCX,RxAdjToRx,RyAdjToRy,RzAdjToRz},quake-to-cc-prep,func.func(expand-control-veqs,combine-quantum-alloc,canonicalize,combine-measurements),symbol-dce" # Tell the rest-qpu that we are generating OpenQASM 2.0. codegen-emission: qasm2 # Library mode is only for simulators, physical backends must turn this off diff --git a/runtime/cudaq/platform/default/rest/helpers/infleqtion/infleqtion.yml b/runtime/cudaq/platform/default/rest/helpers/infleqtion/infleqtion.yml index 81d151a6f1..a9fa948452 100644 --- a/runtime/cudaq/platform/default/rest/helpers/infleqtion/infleqtion.yml +++ b/runtime/cudaq/platform/default/rest/helpers/infleqtion/infleqtion.yml @@ -17,7 +17,7 @@ config: # Tell NVQ++ to generate glue code to set the target backend name gen-target-backend: true # Define the lowering pipeline - platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,unrolling-pipeline,func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),decomposition{enable-patterns=SToR1,TToR1,CCZToCX,CRyToCX,CRxToCX,R1AdjToR1,RxAdjToRx,RyAdjToRy,RzAdjToRz},quake-to-cc-prep,func.func(memtoreg{quantum=0}),symbol-dce" + platform-lowering-config: "classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,classical-optimization-pipeline,func.func(lower-to-cfg),canonicalize,func.func(multicontrol-decomposition),decomposition{enable-patterns=SToR1,TToR1,CCZToCX,CRyToCX,CRxToCX,R1AdjToR1,RxAdjToRx,RyAdjToRy,RzAdjToRz},quake-to-cc-prep,func.func(memtoreg{quantum=0}),symbol-dce" # Tell the rest-qpu that we are generating OpenQASM 2.0. 
codegen-emission: qasm2 # Library mode is only for simulators, physical backends must turn this off diff --git a/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml b/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml index 72ce7f2d65..23e29a15d1 100644 --- a/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml +++ b/runtime/cudaq/platform/default/rest/helpers/ionq/ionq.yml @@ -16,7 +16,7 @@ config: # Add the rest-qpu library to the link list link-libs: ["-lcudaq-rest-qpu"] # Define the lowering pipeline - platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),ionq-gate-set-mapping" + platform-lowering-config: "classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,classical-optimization-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),ionq-gate-set-mapping" # Tell the rest-qpu that we are generating QIR. codegen-emission: qir-base # Additional passes to run after lowering to QIR diff --git a/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml b/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml index 546327f7bb..67aa5ab451 100644 --- a/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml +++ b/runtime/cudaq/platform/default/rest/helpers/iqm/iqm.yml @@ -16,7 +16,7 @@ config: # Add the rest-qpu library to the link list link-libs: ["-lcudaq-rest-qpu"] # Define the lowering pipeline - platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(delay-measurements,regtomem),symbol-dce,iqm-gate-set-mapping" + platform-lowering-config: "classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,classical-optimization-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(delay-measurements,regtomem),symbol-dce,iqm-gate-set-mapping" # Tell the rest-qpu that we are generating IQM JSON. 
codegen-emission: iqm # Library mode is only for simulators, physical backends must turn this off diff --git a/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml b/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml index 3ba8809a8a..73bb0aec7a 100644 --- a/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml +++ b/runtime/cudaq/platform/default/rest/helpers/oqc/oqc.yml @@ -16,7 +16,7 @@ config: # Add the rest-qpu library to the link list link-libs: ["-lcudaq-rest-qpu"] # Define the lowering pipeline - platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce" + platform-lowering-config: "classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,classical-optimization-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce" # Tell the rest-qpu that we are generating QIR. codegen-emission: qir-base # Library mode is only for simulators, physical backends must turn this off diff --git a/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml b/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml index dbacbbfbf8..440a3ab297 100644 --- a/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml +++ b/runtime/cudaq/platform/default/rest/helpers/quantinuum/quantinuum.yml @@ -16,7 +16,7 @@ config: # Add the rest-qpu library to the link list link-libs: ["-lcudaq-rest-qpu"] # Define the lowering pipeline - platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),quantinuum-gate-set-mapping" + platform-lowering-config: "classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,classical-optimization-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),quantinuum-gate-set-mapping" # Tell the rest-qpu that we are generating Adaptive QIR. 
codegen-emission: qir-adaptive
# Library mode is only for simulators, physical backends must turn this off
diff --git a/runtime/cudaq/platform/fermioniq/fermioniq.yml b/runtime/cudaq/platform/fermioniq/fermioniq.yml
index 5f122d71f3..ec87efd03f 100644
--- a/runtime/cudaq/platform/fermioniq/fermioniq.yml
+++ b/runtime/cudaq/platform/fermioniq/fermioniq.yml
@@ -18,7 +18,7 @@ config:
   # Library mode is only for simulators, physical backends must turn this off
   library-mode: false
   # lowering config
-  platform-lowering-config: "func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),fermioniq-gate-set-mapping"
+  platform-lowering-config: "classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,classical-optimization-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),fermioniq-gate-set-mapping"
   # Tell the rest-qpu that we are generating QIR.
   codegen-emission: qir-base
diff --git a/runtime/cudaq/qis/state.cpp b/runtime/cudaq/qis/state.cpp
index d5f9240119..717ca94272 100644
--- a/runtime/cudaq/qis/state.cpp
+++ b/runtime/cudaq/qis/state.cpp
@@ -119,7 +119,7 @@ state::~state() {
   // Current use count is 1, so the
   // shared_ptr is about to go out of scope,
   // there are no users. Delete the state data.
-  if (internal.use_count() == 1)
+  if (internal && internal.use_count() == 1)
     internal->destroyState();
 }
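// Note on the strengthened guard in state::~state() above: a cudaq::state
// that has been moved from (or default constructed) holds a null `internal`
// shared_ptr. Calling use_count() on an empty shared_ptr is well defined (it
// returns 0), but the subsequent internal->destroyState() would dereference
// a null pointer, so the destructor now checks `internal` before touching it.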
diff --git a/runtime/cudaq/schedule.h b/runtime/cudaq/schedule.h
new file mode 100644
index 0000000000..5b47ec33f9
--- /dev/null
+++ b/runtime/cudaq/schedule.h
@@ -0,0 +1,77 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.   *
+ ******************************************************************************/
+
+#pragma once
+#include <complex>
+#include <functional>
+#include <iterator>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace cudaq {
+
+/// @brief Create a schedule for evaluating an operator expression at different
+/// steps.
+class Schedule {
+private:
+  std::vector<std::complex<double>> _steps;
+  std::vector<std::string> _parameters;
+  std::function<std::complex<double>(const std::string &, double)>
+      _value_function;
+
+public:
+  /// @brief Range-based iterator begin function
+  /// @return
+  std::vector<std::complex<double>>::iterator begin() {
+    return _steps.begin();
+  }
+
+  /// @brief Range-based iterator end function
+  /// @return
+  std::vector<std::complex<double>>::iterator end() { return _steps.end(); }
+
+  /// @brief Range-based constant iterator begin function
+  /// @return
+  std::vector<std::complex<double>>::const_iterator cbegin() const {
+    return _steps.cbegin();
+  }
+
+  /// @brief Range-based constant iterator end function
+  /// @return
+  std::vector<std::complex<double>>::const_iterator cend() const {
+    return _steps.cend();
+  }
+
+  /// @brief Range-based constant iterator begin function
+  /// @return
+  std::vector<std::complex<double>>::const_iterator begin() const {
+    return cbegin();
+  }
+
+  /// @brief Range-based constant iterator end function
+  /// @return
+  std::vector<std::complex<double>>::const_iterator end() const {
+    return cend();
+  }
+
+  const std::vector<std::string> &parameters() const { return _parameters; }
+
+  std::function<std::complex<double>(const std::string &, double)>
+  value_function() const {
+    return _value_function;
+  }
+  /// @brief Constructor.
+  /// @arg steps: The sequence of steps in the schedule. Restricted to a vector
+  /// of complex values.
+  /// @arg parameters: A sequence of strings representing the parameter names of
+  /// an operator expression.
+  /// @arg value_function: A function that takes the name of a parameter as well
+  /// as an additional value ("step") of arbitrary type as argument and returns
+  /// the complex value for that parameter at the given step.
+  /// @details current_idx: Initializes the current index (_current_idx) to -1 to
+  /// indicate that iteration has not yet begun. Once iteration starts,
+  /// _current_idx will be used to track the position in the sequence of steps.
+  Schedule(const std::vector<std::complex<double>> &steps,
+           const std::vector<std::string> &parameters = {},
+           std::function<std::complex<double>(const std::string &, double)>
+               value_function = {});
+  Schedule() = default;
+};
+} // namespace cudaq
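// Illustrative only -- a minimal sketch of driving the Schedule API above
// (not part of this patch; the surrounding dynamics code is assumed to
// consume the schedule when evaluating time-dependent coefficients):
//
//   std::vector<std::complex<double>> steps;
//   for (int i = 0; i <= 10; ++i)
//     steps.emplace_back(0.1 * i, 0.0); // t = 0.0, 0.1, ..., 1.0
//   cudaq::Schedule schedule(
//       steps, {"t"},
//       [](const std::string &name, double step) -> std::complex<double> {
//         return {step, 0.0}; // parameter "t" simply takes the step value
//       });
//   for (const auto &step : schedule) { /* evaluate operators at `step` */ }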
diff --git a/runtime/cudaq/utils/tensor.cpp b/runtime/cudaq/utils/tensor.cpp
index f37f959090..f556c04357 100644
--- a/runtime/cudaq/utils/tensor.cpp
+++ b/runtime/cudaq/utils/tensor.cpp
@@ -7,6 +7,7 @@
  ******************************************************************************/
 
 #include "cudaq/utils/tensor.h"
+#include <cmath>
 #include <complex>
 
 inline std::complex<double> &access(std::complex<double> *p,
@@ -63,6 +64,23 @@ cudaq::matrix_2 &cudaq::matrix_2::operator-=(const cudaq::matrix_2 &right) {
   return *this;
 }
 
+bool cudaq::operator==(const cudaq::matrix_2 &lhs, const cudaq::matrix_2 &rhs) {
+  if (lhs.get_rows() != rhs.get_rows() ||
+      lhs.get_columns() != rhs.get_columns()) {
+    return false;
+  }
+
+  for (std::size_t i = 0; i < lhs.get_rows(); i++) {
+    for (std::size_t j = 0; j < lhs.get_columns(); j++) {
+      if (lhs[{i, j}] != rhs[{i, j}]) {
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
 cudaq::matrix_2 &
 cudaq::matrix_2::kronecker_inplace(const cudaq::matrix_2 &right) {
   Dimensions new_dim{get_rows() * right.get_rows(),
@@ -127,3 +145,70 @@ std::string cudaq::matrix_2::dump() const {
   out << '}';
   return out.str();
 }
+
+double _factorial(std::size_t value) {
+  if (value <= 1)
+    return 1;
+  return value * std::tgamma(value);
+}
+
+// Calculate the power of a given matrix, `powers` times.
+cudaq::matrix_2 cudaq::matrix_2::power(int powers) {
+  // Initialize as identity.
+  std::size_t rows = get_rows();
+  std::size_t columns = get_columns();
+  if (rows != columns)
+    throw std::runtime_error("Matrix power expects a square matrix.");
+  auto result = cudaq::matrix_2(rows, columns);
+  for (std::size_t i = 0; i < rows; i++) {
+    result[{i, i}] = 1.0 + 0.0j;
+  }
+
+  // Calculate the matrix power iteratively.
+  for (std::size_t i = 0; i < powers; i++) {
+    result = result * *this;
+  }
+  return result;
+}
+
+// Calculate the Taylor approximation to the exponential of the given matrix.
+cudaq::matrix_2 cudaq::matrix_2::exponential() {
+  std::size_t rows = get_rows();
+  std::size_t columns = get_columns();
+  if (rows != columns)
+    throw std::runtime_error("Matrix exponential expects a square matrix.");
+  auto result = cudaq::matrix_2(rows, columns);
+  // Taylor Series Approximation, fixed at 20 steps.
+  std::size_t taylor_steps = 20;
+  for (std::size_t step = 0; step < taylor_steps; step++) {
+    auto term = this->power(step);
+    for (std::size_t i = 0; i < rows; i++) {
+      for (std::size_t j = 0; j < columns; j++) {
+        result[{i, j}] += term[{i, j}] / _factorial(step);
+      }
+    }
+  }
+  return result;
+}
+
+cudaq::matrix_2 cudaq::matrix_2::identity(const std::size_t rows) {
+  auto result = cudaq::matrix_2(rows, rows);
+  for (std::size_t i = 0; i < rows; i++)
+    result[{i, i}] = 1. + 0.0j;
+  return result;
+}
+
+// Transpose + Conjugate
+cudaq::matrix_2 cudaq::matrix_2::adjoint(const matrix_2 &matrix) {
+  std::size_t rows = matrix.get_rows();
+  std::size_t cols = matrix.get_columns();
+  matrix_2 result(cols, rows);
+
+  for (std::size_t i = 0; i < rows; i++) {
+    for (std::size_t j = 0; j < cols; j++) {
+      result[{j, i}] = std::conj(matrix[{i, j}]);
+    }
+  }
+
+  return result;
+}
\ No newline at end of file
diff --git a/runtime/cudaq/utils/tensor.h b/runtime/cudaq/utils/tensor.h
index d9f9099264..801f54ab5b 100644
--- a/runtime/cudaq/utils/tensor.h
+++ b/runtime/cudaq/utils/tensor.h
@@ -27,6 +27,8 @@ matrix_2 kronecker(const matrix_2 &, const matrix_2 &);
 template <typename Iterable,
           typename T = typename std::iterator_traits<Iterable>::value_type>
 matrix_2 kronecker(Iterable begin, Iterable end);
+// Equality comparison operator.
+bool operator==(const matrix_2 &, const matrix_2 &);
 
 //===----------------------------------------------------------------------===//
 
@@ -38,6 +40,9 @@ class matrix_2 {
   using Dimensions = std::pair<std::size_t, std::size_t>;
 
   matrix_2() = default;
+  matrix_2(std::size_t rows, std::size_t cols)
+      : dimensions(std::make_pair(rows, cols)),
+        data{new std::complex<double>[rows * cols]} {}
   matrix_2(const matrix_2 &other)
       : dimensions{other.dimensions},
         data{new std::complex<double>[get_size(other.dimensions)]} {
@@ -96,6 +101,18 @@ class matrix_2 {
   friend matrix_2 kronecker(const matrix_2 &, const matrix_2 &);
   matrix_2 &kronecker_inplace(const matrix_2 &);
 
+  /// Matrix exponential, uses 20 terms of Taylor Series approximation.
+  matrix_2 exponential();
+
+  /// Matrix power.
+  matrix_2 power(int powers);
+
+  /// Returns the conjugate transpose of a matrix.
+  static matrix_2 adjoint(const matrix_2 &matrix);
+
+  /// Return a square identity matrix for the given size.
+  static matrix_2 identity(const std::size_t rows);
+
   /// Kronecker a list of matrices. The list can be any container that has
   /// iterators defined.
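// Illustrative only -- how the new matrix_2 helpers compose (a sketch, not
// part of this patch; it assumes the new (rows, cols) constructor yields
// zero-initialized storage, since new std::complex<double>[n] default-
// constructs each element to 0):
//
//   cudaq::matrix_2 sx(2, 2); // Pauli-X
//   sx[{0, 1}] = 1.0 + 0.0j;
//   sx[{1, 0}] = 1.0 + 0.0j;
//   auto id = cudaq::matrix_2::identity(2);
//   bool involutory = (sx.power(2) == id);                  // true: X*X == I
//   bool hermitian = (cudaq::matrix_2::adjoint(sx) == sx);  // true
//   auto ex = sx.exponential(); // 20-term Taylor series; approximates
//                               // cosh(1) * I + sinh(1) * X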
template diff --git a/runtime/nvqir/cudensitymat/CMakeLists.txt b/runtime/nvqir/cudensitymat/CMakeLists.txt index 20611b2d6a..b5f5e20330 100644 --- a/runtime/nvqir/cudensitymat/CMakeLists.txt +++ b/runtime/nvqir/cudensitymat/CMakeLists.txt @@ -25,8 +25,19 @@ find_file(CUDENSITYMAT_INC message(STATUS "cudensitymat header: ${CUDENSITYMAT_INC}") get_filename_component(CUDENSITYMAT_INCLUDE_DIR ${CUDENSITYMAT_INC} DIRECTORY) -add_library(${LIBRARY_NAME} SHARED CuDensityMatSim.cpp mpi_support.cpp) +add_library(${LIBRARY_NAME} SHARED + CuDensityMatSim.cpp + mpi_support.cpp + cudm_time_stepper.cpp + runge_kutta_integrator.cpp + cudm_expectation.cpp + cudm_evolution.cpp + CuDensityMatState.cpp + CuDensityMatContext.cpp + CuDensityMatOpConverter.cpp +) +message("CUDAToolkit_INCLUDE_DIRS = ${CUDAToolkit_INCLUDE_DIRS}") target_include_directories(${LIBRARY_NAME} PRIVATE . .. diff --git a/runtime/nvqir/cudensitymat/CuDensityMatContext.cpp b/runtime/nvqir/cudensitymat/CuDensityMatContext.cpp new file mode 100644 index 0000000000..4da9e8ce86 --- /dev/null +++ b/runtime/nvqir/cudensitymat/CuDensityMatContext.cpp @@ -0,0 +1,78 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "CuDensityMatContext.h" +#include "common/Logger.h" +#include "cudm_error_handling.h" +#include +#include + +namespace { +static std::unordered_map> + g_contexts; +static std::mutex g_contextMutex; +} // namespace + +namespace cudaq { +namespace dynamics { +Context *Context::getCurrentContext() { + int currentDevice = -1; + HANDLE_CUDA_ERROR(cudaGetDevice(¤tDevice)); + std::lock_guard guard(g_contextMutex); + const auto iter = g_contexts.find(currentDevice); + if (iter == g_contexts.end()) { + cudaq::info("Create cudensitymat context for device Id {}", currentDevice); + const auto [insertedIter, success] = g_contexts.emplace(std::make_pair( + currentDevice, std::unique_ptr(new Context(currentDevice)))); + if (!success) + throw std::runtime_error("Failed to create cudensitymat context"); + return insertedIter->second.get(); + } + + return iter->second.get(); +} + +void *Context::getScratchSpace(std::size_t minSizeBytes) { + if (minSizeBytes > m_scratchSpaceSizeBytes) { + // Realloc + if (m_scratchSpace) + HANDLE_CUDA_ERROR(cudaFree(m_scratchSpace)); + + cudaq::info("Allocate scratch buffer of size {} bytes on device {}", + minSizeBytes, m_deviceId); + + HANDLE_CUDA_ERROR(cudaMalloc(&m_scratchSpace, minSizeBytes)); + m_scratchSpaceSizeBytes = minSizeBytes; + } + + return m_scratchSpace; +} + +std::size_t Context::getRecommendedWorkSpaceLimit() { + std::size_t freeMem = 0, totalMem = 0; + HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeMem, &totalMem)); + // Take 80% of free memory + freeMem = static_cast(static_cast(freeMem) * 0.80); + return freeMem; +} + +Context::Context(int deviceId) : m_deviceId(deviceId) { + HANDLE_CUDA_ERROR(cudaSetDevice(deviceId)); + HANDLE_CUDM_ERROR(cudensitymatCreate(&m_cudmHandle)); + m_opConverter = std::make_unique(m_cudmHandle); +} + +Context::~Context() { + m_opConverter.reset(); + cudensitymatDestroy(m_cudmHandle); + if (m_scratchSpaceSizeBytes > 0) + cudaFree(m_scratchSpace); +} +} // namespace dynamics +// namespace dynamics +} // 
namespace cudaq diff --git a/runtime/nvqir/cudensitymat/CuDensityMatContext.h b/runtime/nvqir/cudensitymat/CuDensityMatContext.h new file mode 100644 index 0000000000..c4fdf35903 --- /dev/null +++ b/runtime/nvqir/cudensitymat/CuDensityMatContext.h @@ -0,0 +1,36 @@ +/*************************************************************** -*- C++ -*- *** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once +#include "CuDensityMatOpConverter.h" +#include +#include +namespace cudaq { +namespace dynamics { +class Context { +public: + Context(Context const &) = delete; + Context &operator=(Context const &) = delete; + ~Context(); + + cudensitymatHandle_t getHandle() const { return m_cudmHandle; } + OpConverter &getOpConverter() { return *m_opConverter; } + static Context *getCurrentContext(); + void *getScratchSpace(std::size_t minSizeBytes); + static std::size_t getRecommendedWorkSpaceLimit(); + +private: + Context(int deviceId); + cudensitymatHandle_t m_cudmHandle; + std::unique_ptr m_opConverter; + int m_deviceId; + void *m_scratchSpace{nullptr}; + std::size_t m_scratchSpaceSizeBytes{0}; +}; +} // namespace dynamics +} // namespace cudaq diff --git a/runtime/nvqir/cudensitymat/CuDensityMatOpConverter.cpp b/runtime/nvqir/cudensitymat/CuDensityMatOpConverter.cpp new file mode 100644 index 0000000000..2be84f6e31 --- /dev/null +++ b/runtime/nvqir/cudensitymat/CuDensityMatOpConverter.cpp @@ -0,0 +1,672 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#include "CuDensityMatOpConverter.h" +#include "common/Logger.h" +#include "cudm_error_handling.h" +#include + +namespace { +std::vector getSubspaceExtents(const std::vector &modeExtents, + const std::vector °rees) { + std::vector subspaceExtents; + + for (int degree : degrees) { + if (degree >= modeExtents.size()) + throw std::out_of_range("Degree exceeds modeExtents size."); + + subspaceExtents.push_back(modeExtents[degree]); + } + + return subspaceExtents; +} + +std::unordered_map +convertDimensions(const std::vector &modeExtents) { + + std::unordered_map dimensions; + for (size_t i = 0; i < modeExtents.size(); ++i) + dimensions[static_cast(i)] = static_cast(modeExtents[i]); + + return dimensions; +} + +// Function to flatten a matrix into a 1D array (column major) +std::vector> +flattenMatrixColumnMajor(const cudaq::matrix_2 &matrix) { + std::vector> flatMatrix; + flatMatrix.reserve(matrix.get_size()); + for (size_t col = 0; col < matrix.get_columns(); col++) { + for (size_t row = 0; row < matrix.get_rows(); row++) { + flatMatrix.push_back(matrix[{row, col}]); + } + } + + return flatMatrix; +} +void *createArrayGpu(const std::vector> &cpuArray) { + void *gpuArray{nullptr}; + const std::size_t array_size = cpuArray.size() * sizeof(std::complex); + if (array_size > 0) { + HANDLE_CUDA_ERROR(cudaMalloc(&gpuArray, array_size)); + HANDLE_CUDA_ERROR(cudaMemcpy(gpuArray, + static_cast(cpuArray.data()), + array_size, cudaMemcpyHostToDevice)); + } + return gpuArray; +} + +// Function to destroy a previously created array copy in GPU memory +void destroyArrayGpu(void *gpuArray) { + if (gpuArray) + HANDLE_CUDA_ERROR(cudaFree(gpuArray)); +} + +cudaq::product_operator +computeDagger(const cudaq::matrix_operator &op) { + const std::string daggerOpName = op.to_string(false) + "_dagger"; + try { + auto func = [op](const std::vector &dimensions, + const std::unordered_map> + ¶ms) { + std::unordered_map dims; + if (dimensions.size() != op.degrees().size()) + throw std::runtime_error("Dimension mismatched"); + + for (int i = 0; i < dimensions.size(); ++i) { + dims[op.degrees()[i]] = dimensions[i]; + } + auto originalMat = op.to_matrix(dims, params); + return cudaq::matrix_2::adjoint(originalMat); + }; + cudaq::matrix_operator::define(daggerOpName, {-1}, std::move(func)); + } catch (...) 
{
+    // Nothing to do: this operator has already been defined.
+  }
+  return cudaq::matrix_operator::instantiate(daggerOpName, op.degrees());
+}
+
+cudaq::scalar_operator computeDagger(const cudaq::scalar_operator &scalar) {
+  if (scalar.is_constant()) {
+    return cudaq::scalar_operator(std::conj(scalar.evaluate()));
+  } else {
+    return cudaq::scalar_operator(
+        [scalar](
+            const std::unordered_map<std::string, std::complex<double>> &params)
+            -> std::complex<double> {
+          return std::conj(scalar.evaluate(params));
+        });
+  }
+}
+
+cudaq::product_operator<cudaq::matrix_operator> computeDagger(
+    const cudaq::product_operator<cudaq::matrix_operator> &productOp) {
+  std::vector<cudaq::product_operator<cudaq::matrix_operator>> daggerOps;
+  for (const auto &component : productOp.get_terms()) {
+    if (const auto *elemOp =
+            dynamic_cast<const cudaq::matrix_operator *>(&component)) {
+      daggerOps.emplace_back(computeDagger(*elemOp));
+    } else {
+      throw std::runtime_error("Unhandled type!");
+    }
+  }
+  std::reverse(daggerOps.begin(), daggerOps.end());
+
+  if (daggerOps.empty()) {
+    throw std::runtime_error("Empty product operator");
+  }
+  cudaq::product_operator<cudaq::matrix_operator> daggerProduct = daggerOps[0];
+  for (std::size_t i = 1; i < daggerOps.size(); ++i) {
+    daggerProduct *= daggerOps[i];
+  }
+  daggerProduct *= computeDagger(productOp.get_coefficient());
+  return daggerProduct;
+}
+} // namespace
+
+cudensitymatOperator_t cudaq::dynamics::OpConverter::constructLiouvillian(
+    const operator_sum<cudaq::matrix_operator> &ham,
+    const std::vector<operator_sum<cudaq::matrix_operator>> &collapseOperators,
+    const std::vector<int64_t> &modeExtents,
+    const std::unordered_map<std::string, std::complex<double>> &parameters,
+    bool isMasterEquation) {
+  if (!isMasterEquation && collapseOperators.empty()) {
+    cudaq::info("Construct state vector Liouvillian");
+    auto liouvillian = ham * std::complex<double>(0.0, -1.0);
+    return convertToCudensitymatOperator(parameters, liouvillian, modeExtents);
+  } else {
+    cudaq::info("Construct density matrix Liouvillian");
+    cudensitymatOperator_t liouvillian;
+    HANDLE_CUDM_ERROR(cudensitymatCreateOperator(
+        m_handle, static_cast<int32_t>(modeExtents.size()), modeExtents.data(),
+        &liouvillian));
+    // Append an operator term to the operator (super-operator)
+    // Handle the Hamiltonian
+    const std::map<std::string, std::complex<double>> sortedParameters(
+        parameters.begin(), parameters.end());
+    auto ks = std::views::keys(sortedParameters);
+    const std::vector<std::string> keys{ks.begin(), ks.end()};
+    for (auto &[coeff, term] :
+         convertToCudensitymat(ham, parameters, modeExtents)) {
+      cudensitymatWrappedScalarCallback_t wrappedCallback = {nullptr, nullptr};
+      if (coeff.is_constant()) {
+        const auto coeffVal = coeff.evaluate();
+        const auto leftCoeff = std::complex<double>(0.0, -1.0) * coeffVal;
+        // -i constant (left multiplication)
+        HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm(
+            m_handle, liouvillian, term, 0,
+            make_cuDoubleComplex(leftCoeff.real(), leftCoeff.imag()),
+            wrappedCallback));
+
+        // +i constant (right multiplication, i.e., dual)
+        const auto rightCoeff = std::complex<double>(0.0, 1.0) * coeffVal;
+        HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm(
+            m_handle, liouvillian, term, 1,
+            make_cuDoubleComplex(rightCoeff.real(), rightCoeff.imag()),
+            wrappedCallback));
+      } else {
+        wrappedCallback = wrapScalarCallback(coeff, keys);
+        // -i constant (left multiplication)
+        HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm(
+            m_handle, liouvillian, term, 0, make_cuDoubleComplex(0.0, -1.0),
+            wrappedCallback));
+
+        // +i constant (right multiplication, i.e., dual)
+        HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm(
+            m_handle, liouvillian, term, 1, make_cuDoubleComplex(0.0, 1.0),
+            wrappedCallback));
+      }
+    }
+
+    // Handle collapse operators
+    for (auto &collapseOperator : collapseOperators) {
+      for (auto &[coeff, term] :
computeLindbladTerms(collapseOperator, modeExtents, parameters)) { + cudensitymatWrappedScalarCallback_t wrappedCallback = {nullptr, + nullptr}; + if (coeff.is_constant()) { + const auto coeffVal = coeff.evaluate(); + HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm( + m_handle, liouvillian, term, 0, + make_cuDoubleComplex(coeffVal.real(), coeffVal.imag()), + wrappedCallback)); + } else { + wrappedCallback = wrapScalarCallback(coeff, keys); + HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm( + m_handle, liouvillian, term, 0, make_cuDoubleComplex(1.0, 0.0), + wrappedCallback)); + } + } + } + + return liouvillian; + } +} + +cudaq::dynamics::OpConverter::~OpConverter() { + for (auto term : m_operatorTerms) + cudensitymatDestroyOperatorTerm(term); + + for (auto op : m_elementaryOperators) + cudensitymatDestroyElementaryOperator(op); + + for (auto *buffer : m_deviceBuffers) + cudaFree(buffer); +} + +cudensitymatElementaryOperator_t +cudaq::dynamics::OpConverter::createElementaryOperator( + const cudaq::matrix_operator &elemOp, + const std::unordered_map> ¶meters, + const std::vector &modeExtents) { + auto subspaceExtents = getSubspaceExtents(modeExtents, elemOp.degrees()); + std::unordered_map dimensions = convertDimensions(modeExtents); + cudensitymatWrappedTensorCallback_t wrappedTensorCallback = {nullptr, + nullptr}; + // This is a callback + if (!parameters.empty()) { + const std::map> sortedParameters( + parameters.begin(), parameters.end()); + auto ks = std::views::keys(sortedParameters); + const std::vector keys{ks.begin(), ks.end()}; + wrappedTensorCallback = wrapTensorCallback(elemOp, keys); + } + + auto flatMatrix = + flattenMatrixColumnMajor(elemOp.to_matrix(dimensions, parameters)); + + if (flatMatrix.empty()) { + throw std::invalid_argument("Input matrix (flat matrix) cannot be empty."); + } + + if (subspaceExtents.empty()) { + throw std::invalid_argument("subspaceExtents cannot be empty."); + } + + auto *elementaryMat_d = createArrayGpu(flatMatrix); + cudensitymatElementaryOperator_t cudmElemOp = nullptr; + + HANDLE_CUDM_ERROR(cudensitymatCreateElementaryOperator( + m_handle, static_cast(subspaceExtents.size()), + subspaceExtents.data(), CUDENSITYMAT_OPERATOR_SPARSITY_NONE, 0, nullptr, + CUDA_C_64F, elementaryMat_d, wrappedTensorCallback, &cudmElemOp)); + + if (!cudmElemOp) { + std::cerr << "[ERROR] cudmElemOp is NULL in createElementaryOperator !" 
+ << std::endl; + destroyArrayGpu(elementaryMat_d); + throw std::runtime_error("Failed to create elementary operator."); + } + m_elementaryOperators.emplace(cudmElemOp); + m_deviceBuffers.emplace(elementaryMat_d); + return cudmElemOp; +} + +cudensitymatOperatorTerm_t +cudaq::dynamics::OpConverter::createProductOperatorTerm( + const std::vector &elemOps, + const std::vector &modeExtents, + const std::vector> °rees, + const std::vector> &dualModalities) { + + cudensitymatOperatorTerm_t term; + HANDLE_CUDM_ERROR(cudensitymatCreateOperatorTerm( + m_handle, static_cast(modeExtents.size()), modeExtents.data(), + &term)); + m_operatorTerms.emplace(term); + if (degrees.empty()) { + throw std::invalid_argument("Degrees vector cannot be empty."); + } + + if (elemOps.empty()) { + throw std::invalid_argument("elemOps cannot be null."); + } + + if (degrees.size() != elemOps.size()) { + throw std::invalid_argument("elemOps and degrees must have the same size."); + } + + const bool hasDualModalities = !dualModalities.empty(); + + if (hasDualModalities && degrees.size() != dualModalities.size()) { + throw std::invalid_argument( + "degrees and dualModalities must have the same size."); + } + + std::vector allDegrees; + std::vector allModeActionDuality; + for (size_t i = 0; i < degrees.size(); i++) { + const auto &sub_degrees = degrees[i]; + const auto &modalities = hasDualModalities + ? dualModalities[i] + : std::vector(sub_degrees.size(), 0); + + if (sub_degrees.size() != modalities.size()) { + throw std::runtime_error( + "Mismatch between degrees and modalities sizes."); + } + if (sub_degrees.size() != 1) { + throw std::runtime_error( + "Elementary operator must act on a single degree."); + } + + for (size_t j = 0; j < sub_degrees.size(); j++) { + int degree = sub_degrees[j]; + int modality = modalities[j]; + + if (sub_degrees[i] < 0) { + throw std::out_of_range("Degree cannot be negative!"); + } + allDegrees.emplace_back(degree); + allModeActionDuality.emplace_back(modality); + } + } + + assert(elemOps.size() == degrees.size()); + HANDLE_CUDM_ERROR(cudensitymatOperatorTermAppendElementaryProduct( + m_handle, term, static_cast(elemOps.size()), elemOps.data(), + allDegrees.data(), allModeActionDuality.data(), + make_cuDoubleComplex(1.0, 0.0), {nullptr, nullptr})); + return term; +} + +cudensitymatOperator_t +cudaq::dynamics::OpConverter::convertToCudensitymatOperator( + const std::unordered_map> ¶meters, + const operator_sum &op, + const std::vector &modeExtents) { + if (op.get_terms().empty()) { + throw std::invalid_argument("Operator sum cannot be empty."); + } + + cudensitymatOperator_t cudmOperator; + HANDLE_CUDM_ERROR(cudensitymatCreateOperator( + m_handle, static_cast(modeExtents.size()), modeExtents.data(), + &cudmOperator)); + + const std::map> sortedParameters( + parameters.begin(), parameters.end()); + auto ks = std::views::keys(sortedParameters); + const std::vector keys{ks.begin(), ks.end()}; + for (auto &[coeff, term] : + convertToCudensitymat(op, parameters, modeExtents)) { + cudensitymatWrappedScalarCallback_t wrappedCallback = {nullptr, nullptr}; + + if (coeff.is_constant()) { + const auto coeffVal = coeff.evaluate(); + HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm( + m_handle, cudmOperator, term, 0, + make_cuDoubleComplex(coeffVal.real(), coeffVal.imag()), + wrappedCallback)); + } else { + wrappedCallback = wrapScalarCallback(coeff, keys); + HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm( + m_handle, cudmOperator, term, 0, make_cuDoubleComplex(1.0, 0.0), + wrappedCallback)); + } + } 
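// At this point each product term of the operator sum has been appended to
// `cudmOperator` in one of two ways: a constant coefficient is folded
// directly into the cuDoubleComplex prefactor, while a parameterized
// coefficient is appended with a unit prefactor plus a wrapped scalar
// callback that re-evaluates it at every time step. For example
// (illustrative only), a coefficient created as
//   cudaq::scalar_operator(
//       [](const std::unordered_map<std::string, std::complex<double>> &p) {
//         return p.at("t");
//       })
// reports is_constant() == false and is therefore routed through
// wrapScalarCallback above.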
+ + return cudmOperator; +} + +std::vector> +cudaq::dynamics::OpConverter::convertToCudensitymat( + const operator_sum &op, + const std::unordered_map> ¶meters, + const std::vector &modeExtents) { + if (op.get_terms().empty()) { + throw std::invalid_argument("Operator sum cannot be empty."); + } + + std::vector> + result; + + for (const auto &productOp : op.get_terms()) { + std::vector elemOps; + std::vector> allDegrees; + for (const auto &component : productOp.get_terms()) { + // No need to check type + // just call to_matrix on it + if (const auto *elemOp = + dynamic_cast(&component)) { + auto cudmElemOp = + createElementaryOperator(*elemOp, parameters, modeExtents); + elemOps.emplace_back(cudmElemOp); + allDegrees.emplace_back(elemOp->degrees()); + } else { + // Catch anything that we don't know + throw std::runtime_error("Unhandled type!"); + } + } + // Note: the order of operator application is the opposite of the writing: + // i.e., ABC means C to be applied first. + std::reverse(elemOps.begin(), elemOps.end()); + std::reverse(allDegrees.begin(), allDegrees.end()); + result.emplace_back(std::make_pair( + productOp.get_coefficient(), + createProductOperatorTerm(elemOps, modeExtents, allDegrees, {}))); + } + return result; +} + +std::vector> +cudaq::dynamics::OpConverter::computeLindbladTerms( + const operator_sum &collapseOp, + const std::vector &modeExtents, + const std::unordered_map> ¶meters) { + std::vector> + lindbladTerms; + for (const product_operator &l_op : collapseOp.get_terms()) { + for (const product_operator &r_op : + collapseOp.get_terms()) { + scalar_operator coeff = + l_op.get_coefficient() * computeDagger(r_op.get_coefficient()); + auto ldag = computeDagger(r_op); + { + // L * rho * L_dag + std::vector elemOps; + std::vector> allDegrees; + std::vector> all_action_dual_modalities; + + for (const auto &component : l_op.get_terms()) { + if (const auto *elemOp = + dynamic_cast(&component)) { + auto cudmElemOp = + createElementaryOperator(*elemOp, parameters, modeExtents); + elemOps.emplace_back(cudmElemOp); + allDegrees.emplace_back(elemOp->degrees()); + all_action_dual_modalities.emplace_back( + std::vector(elemOp->degrees().size(), 0)); + } else { + // Catch anything that we don't know + throw std::runtime_error("Unhandled type!"); + } + } + + for (const auto &component : ldag.get_terms()) { + if (const auto *elemOp = + dynamic_cast(&component)) { + auto cudmElemOp = + createElementaryOperator(*elemOp, parameters, modeExtents); + elemOps.emplace_back(cudmElemOp); + allDegrees.emplace_back(elemOp->degrees()); + all_action_dual_modalities.emplace_back( + std::vector(elemOp->degrees().size(), 1)); + } else { + // Catch anything that we don't know + throw std::runtime_error("Unhandled type!"); + } + } + + cudensitymatOperatorTerm_t D1_term = createProductOperatorTerm( + elemOps, modeExtents, allDegrees, all_action_dual_modalities); + lindbladTerms.emplace_back(std::make_pair(coeff, D1_term)); + } + + product_operator L_daggerTimesL = -0.5 * ldag * l_op; + { + std::vector elemOps; + std::vector> allDegrees; + std::vector> all_action_dual_modalities_left; + std::vector> all_action_dual_modalities_right; + for (const auto &component : L_daggerTimesL.get_terms()) { + if (const auto *elemOp = + dynamic_cast(&component)) { + auto cudmElemOp = + createElementaryOperator(*elemOp, parameters, modeExtents); + elemOps.emplace_back(cudmElemOp); + allDegrees.emplace_back(elemOp->degrees()); + all_action_dual_modalities_left.emplace_back( + std::vector(elemOp->degrees().size(), 0)); + 
all_action_dual_modalities_right.emplace_back( + std::vector(elemOp->degrees().size(), 1)); + } else { + // Catch anything that we don't know + throw std::runtime_error("Unhandled type!"); + } + } + { + + // For left side, we need to reverse the order + std::vector d2Ops(elemOps); + std::reverse(d2Ops.begin(), d2Ops.end()); + std::vector> d2Degrees(allDegrees); + std::reverse(d2Degrees.begin(), d2Degrees.end()); + cudensitymatOperatorTerm_t D2_term = createProductOperatorTerm( + d2Ops, modeExtents, d2Degrees, all_action_dual_modalities_left); + lindbladTerms.emplace_back( + std::make_pair(L_daggerTimesL.get_coefficient(), D2_term)); + } + { + cudensitymatOperatorTerm_t D3_term = + createProductOperatorTerm(elemOps, modeExtents, allDegrees, + all_action_dual_modalities_right); + lindbladTerms.emplace_back( + std::make_pair(L_daggerTimesL.get_coefficient(), D3_term)); + } + } + } + } + return lindbladTerms; +} + +cudensitymatWrappedScalarCallback_t +cudaq::dynamics::OpConverter::wrapScalarCallback( + const scalar_operator &scalarOp, + const std::vector ¶mNames) { + if (scalarOp.is_constant()) { + throw std::runtime_error( + "scalar_operator does not have a valid generator function."); + } + + m_scalarCallbacks.push_back(ScalarCallBackContext(scalarOp, paramNames)); + ScalarCallBackContext *storedCallbackContext = &m_scalarCallbacks.back(); + using WrapperFuncType = + int32_t (*)(cudensitymatScalarCallback_t, double, int32_t, const double[], + cudaDataType_t, void *); + + auto wrapper = [](cudensitymatScalarCallback_t callback, double time, + int32_t numParams, const double params[], + cudaDataType_t dataType, void *scalarStorage) -> int32_t { + try { + ScalarCallBackContext *context = + reinterpret_cast(callback); + scalar_operator &storedOp = context->scalarOp; + if (numParams != 2 * context->paramNames.size()) + throw std::runtime_error( + fmt::format("[Internal Error] Invalid number of callback " + "parameters encountered. 
Expected {} double params " + "representing {} complex values but received {}.", + 2 * context->paramNames.size(), + context->paramNames.size(), numParams)); + + std::unordered_map> param_map; + for (size_t i = 0; i < context->paramNames.size(); ++i) { + param_map[context->paramNames[i]] = + std::complex(params[2 * i], params[2 * i + 1]); + cudaq::debug("Callback param name {}, value {}", context->paramNames[i], + param_map[context->paramNames[i]]); + } + + std::complex result = storedOp.evaluate(param_map); + cudaq::debug("Scalar callback evaluated result = {}", result); + auto *tdCoef = static_cast *>(scalarStorage); + *tdCoef = result; + return CUDENSITYMAT_STATUS_SUCCESS; + } catch (const std::exception &e) { + std::cerr << "Error in scalar callback: " << e.what() << std::endl; + return CUDENSITYMAT_STATUS_INTERNAL_ERROR; + } + }; + + cudensitymatWrappedScalarCallback_t wrappedCallback; + wrappedCallback.callback = + reinterpret_cast(storedCallbackContext); + wrappedCallback.wrapper = + reinterpret_cast(static_cast(wrapper)); + return wrappedCallback; +} + +cudensitymatWrappedTensorCallback_t +cudaq::dynamics::OpConverter::wrapTensorCallback( + const matrix_operator &matrixOp, + const std::vector ¶mNames) { + m_tensorCallbacks.push_back(TensorCallBackContext(matrixOp, paramNames)); + TensorCallBackContext *storedCallbackContext = &m_tensorCallbacks.back(); + using WrapperFuncType = int32_t (*)( + cudensitymatTensorCallback_t, cudensitymatElementaryOperatorSparsity_t, + int32_t, const int64_t[], const int32_t[], double, int32_t, + const double[], cudaDataType_t, void *, cudaStream_t); + + auto wrapper = [](cudensitymatTensorCallback_t callback, + cudensitymatElementaryOperatorSparsity_t sparsity, + int32_t num_modes, const int64_t modeExtents[], + const int32_t diagonal_offsets[], double time, + int32_t num_params, const double params[], + cudaDataType_t data_type, void *tensor_storage, + cudaStream_t stream) -> int32_t { + try { + auto *context = reinterpret_cast(callback); + matrix_operator &storedOp = context->tensorOp; + + if (num_modes <= 0) { + std::cerr << "num_modes is invalid: " << num_modes << std::endl; + return CUDENSITYMAT_STATUS_INVALID_VALUE; + } + + if (num_params != 2 * context->paramNames.size()) + throw std::runtime_error( + fmt::format("[Internal Error] Invalid number of tensor callback " + "parameters. Expected {} double values " + "representing {} complex parameters but received " + "{}.", + std::to_string(2 * context->paramNames.size()), + std::to_string(context->paramNames.size()), + std::to_string(num_params))); + + std::unordered_map> param_map; + for (size_t i = 0; i < context->paramNames.size(); ++i) { + param_map[context->paramNames[i]] = + std::complex(params[2 * i], params[2 * i + 1]); + cudaq::debug("Tensor callback param name {}, value {}", + context->paramNames[i], param_map[context->paramNames[i]]); + } + + std::unordered_map dimensions; + for (int i = 0; i < num_modes; ++i) { + dimensions[i] = static_cast(modeExtents[i]); + } + + if (dimensions.empty()) { + std::cerr << "Dimension map is empty!" 
<< std::endl; + return CUDENSITYMAT_STATUS_INVALID_VALUE; + } + + matrix_2 matrix_data = storedOp.to_matrix(dimensions, param_map); + + std::size_t rows = matrix_data.get_rows(); + std::size_t cols = matrix_data.get_columns(); + + if (rows != cols) { + std::cerr << "Non-square matrix encountered: " << rows << "x" << cols + << std::endl; + return CUDENSITYMAT_STATUS_INVALID_VALUE; + } + + const std::vector> flatMatrix = + flattenMatrixColumnMajor(matrix_data); + + if (data_type == CUDA_C_64F) { + memcpy(tensor_storage, flatMatrix.data(), + flatMatrix.size() * sizeof(cuDoubleComplex)); + } else if (data_type == CUDA_C_32F) { + std::vector> flatMatrix_float(flatMatrix.begin(), + flatMatrix.end()); + + memcpy(tensor_storage, flatMatrix_float.data(), + flatMatrix_float.size() * sizeof(cuFloatComplex)); + } else { + std::cerr << "Invalid CUDA data type: " << data_type << std::endl; + return CUDENSITYMAT_STATUS_INVALID_VALUE; + } + + return CUDENSITYMAT_STATUS_SUCCESS; + } catch (const std::exception &e) { + std::cerr << "Error in tensor callback: " << e.what() << std::endl; + return CUDENSITYMAT_STATUS_INTERNAL_ERROR; + } + }; + + cudensitymatWrappedTensorCallback_t wrappedCallback; + wrappedCallback.callback = + reinterpret_cast(storedCallbackContext); + wrappedCallback.wrapper = + reinterpret_cast(static_cast(wrapper)); + + return wrappedCallback; +} diff --git a/runtime/nvqir/cudensitymat/CuDensityMatOpConverter.h b/runtime/nvqir/cudensitymat/CuDensityMatOpConverter.h new file mode 100644 index 0000000000..08d1cc4178 --- /dev/null +++ b/runtime/nvqir/cudensitymat/CuDensityMatOpConverter.h @@ -0,0 +1,95 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#pragma once + +#include "cudaq/operators.h" +#include "cudaq/utils/tensor.h" +#include +#include +#include + +namespace cudaq { +namespace dynamics { +class OpConverter { +public: + OpConverter(cudensitymatHandle_t handle) : m_handle(handle){}; + + cudensitymatOperator_t convertToCudensitymatOperator( + const std::unordered_map> ¶meters, + const operator_sum &op, + const std::vector &modeExtents); + // Construct Liouvillian + cudensitymatOperator_t constructLiouvillian( + const operator_sum &ham, + const std::vector> + &collapseOperators, + const std::vector &modeExtents, + const std::unordered_map> ¶meters, + bool isMasterEquation); + + ~OpConverter(); + +private: + std::vector> + convertToCudensitymat( + const operator_sum &op, + const std::unordered_map> ¶meters, + const std::vector &modeExtents); + cudensitymatElementaryOperator_t createElementaryOperator( + const cudaq::matrix_operator &elemOp, + const std::unordered_map> ¶meters, + const std::vector &modeExtents); + cudensitymatOperatorTerm_t createProductOperatorTerm( + const std::vector &elemOps, + const std::vector &modeExtents, + const std::vector> °rees, + const std::vector> &dualModalities); + + std::vector> + computeLindbladTerms( + const operator_sum &collapseOp, + const std::vector &modeExtents, + const std::unordered_map> ¶meters); + + struct ScalarCallBackContext { + scalar_operator scalarOp; + std::vector paramNames; + ScalarCallBackContext(const scalar_operator &scalar_op, + const std::vector ¶mNames) + : scalarOp(scalar_op), paramNames(paramNames){}; + }; + + struct TensorCallBackContext { + matrix_operator tensorOp; + std::vector paramNames; + + TensorCallBackContext(const matrix_operator &tensor_op, + const std::vector ¶m_names) + : tensorOp(tensor_op), paramNames(param_names){}; + }; + + cudensitymatWrappedScalarCallback_t + wrapScalarCallback(const scalar_operator &scalarOp, + const std::vector ¶mNames); + cudensitymatWrappedTensorCallback_t + wrapTensorCallback(const matrix_operator &matrixOp, + const std::vector ¶mNames); + +private: + cudensitymatHandle_t m_handle; + // Things that we create that need to be cleaned up. + // Use a set so that it's safe to push pointer multiple times. + std::unordered_set m_deviceBuffers; + std::unordered_set m_elementaryOperators; + std::unordered_set m_operatorTerms; + std::deque m_scalarCallbacks; + std::deque m_tensorCallbacks; +}; +} // namespace dynamics +} // namespace cudaq diff --git a/runtime/nvqir/cudensitymat/CuDensityMatSim.cpp b/runtime/nvqir/cudensitymat/CuDensityMatSim.cpp index ba56d6fdbf..890c8c6d12 100644 --- a/runtime/nvqir/cudensitymat/CuDensityMatSim.cpp +++ b/runtime/nvqir/cudensitymat/CuDensityMatSim.cpp @@ -1,4 +1,4 @@ -/*************************************************************** -*- C++ -*- *** +/******************************************************************************* * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * * All rights reserved. * * * @@ -10,6 +10,7 @@ #include "CuDensityMatState.h" #include "cudaq.h" #include "cudaq/distributed/mpi_plugin.h" +#include "cudm_error_handling.h" namespace { // Hook to query this shared lib file location at runtime. 
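// Illustrative only -- how the dynamics Context and OpConverter defined above
// are typically wired together (a sketch, not part of this patch;
// `hamiltonian`, `collapseOps`, and `parameters` stand in for user inputs):
//
//   auto *ctx = cudaq::dynamics::Context::getCurrentContext();
//   auto &converter = ctx->getOpConverter();
//   // One qubit coupled to a 10-level cavity.
//   std::vector<int64_t> modeExtents{2, 10};
//   cudensitymatOperator_t liouvillian = converter.constructLiouvillian(
//       hamiltonian, collapseOps, modeExtents, parameters,
//       /*isMasterEquation=*/true);
//   // The library handle and a shared scratch buffer come from the same
//   // per-device context:
//   cudensitymatHandle_t handle = ctx->getHandle();
//   void *scratch = ctx->getScratchSpace(
//       cudaq::dynamics::Context::getRecommendedWorkSpaceLimit());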
@@ -88,8 +89,7 @@ class CuDensityMatSim : public nvqir::CircuitSimulatorBase { } /// The destructor - virtual ~CuDensityMatSim() = default; - + virtual ~CuDensityMatSim() {} std::unique_ptr getSimulationState() override { return std::make_unique(); } diff --git a/runtime/nvqir/cudensitymat/CuDensityMatState.cpp b/runtime/nvqir/cudensitymat/CuDensityMatState.cpp new file mode 100644 index 0000000000..f4bc653348 --- /dev/null +++ b/runtime/nvqir/cudensitymat/CuDensityMatState.cpp @@ -0,0 +1,623 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ +#include "CuDensityMatState.h" +#include "common/EigenDense.h" +#include "common/Logger.h" +#include "cudaq/utils/cudaq_utils.h" +#include "cudm_error_handling.h" + +namespace cudaq { + +std::complex +CuDensityMatState::overlap(const cudaq::SimulationState &other) { + if (getTensor().extents != other.getTensor().extents) + throw std::runtime_error("[CuDensityMatState] overlap error - other state " + "dimension not equal to this state dimension."); + + if (other.getPrecision() != getPrecision()) { + throw std::runtime_error( + "[CuDensityMatState] overlap error - precision mismatch."); + } + + if (!isDensityMatrix) { + Eigen::VectorXcd state(dimension); + const auto size = dimension; + HANDLE_CUDA_ERROR(cudaMemcpy(state.data(), devicePtr, + size * sizeof(std::complex), + cudaMemcpyDeviceToHost)); + + Eigen::VectorXcd otherState(dimension); + HANDLE_CUDA_ERROR(cudaMemcpy(otherState.data(), other.getTensor().data, + size * sizeof(std::complex), + cudaMemcpyDeviceToHost)); + return std::abs(std::inner_product( + state.begin(), state.end(), otherState.begin(), + std::complex{0., 0.}, [](auto a, auto b) { return a + b; }, + [](auto a, auto b) { return a * std::conj(b); })); + } + + // FIXME: implement this in GPU memory + Eigen::MatrixXcd state(dimension, dimension); + const auto size = dimension * dimension; + HANDLE_CUDA_ERROR(cudaMemcpy(state.data(), devicePtr, + size * sizeof(std::complex), + cudaMemcpyDeviceToHost)); + + Eigen::MatrixXcd otherState(dimension, dimension); + HANDLE_CUDA_ERROR(cudaMemcpy(otherState.data(), other.getTensor().data, + size * sizeof(std::complex), + cudaMemcpyDeviceToHost)); + + return (state.adjoint() * otherState).trace(); +} + +std::complex +CuDensityMatState::getAmplitude(const std::vector &basisState) { + throw std::runtime_error( + "[CuDensityMatState] getAmplitude by basis states is not supported. " + "Please use direct indexing access instead."); +} + +// Dump the state to the given output stream +void CuDensityMatState::dump(std::ostream &os) const { + // get state data from device to print + Eigen::MatrixXcd state(dimension, isDensityMatrix ? dimension : 1); + const auto size = isDensityMatrix ? 
dimension * dimension : dimension; + HANDLE_CUDA_ERROR(cudaMemcpy(state.data(), devicePtr, + size * sizeof(std::complex), + cudaMemcpyDeviceToHost)); + os << state << std::endl; +} + +std::unique_ptr +CuDensityMatState::createFromSizeAndPtr(std::size_t size, void *dataPtr, + std::size_t type) { + bool isDm = false; + if (type == cudaq::detail::variant_index()) { + if (size != 1) + throw std::runtime_error("[CuDensityMatState]: createFromSizeAndPtr " + "expects a single tensor"); + auto *casted = + reinterpret_cast(dataPtr); + + auto [ptr, extents] = casted[0]; + if (extents.size() > 2) + throw std::runtime_error("[CuDensityMatState]: createFromSizeAndPtr only " + "accept 1D or 2D arrays"); + + isDm = extents.size() == 2; + size = std::reduce(extents.begin(), extents.end(), 1, std::multiplies()); + dataPtr = const_cast(ptr); + } + + std::complex *devicePtr = nullptr; + + HANDLE_CUDA_ERROR( + cudaMalloc((void **)&devicePtr, size * sizeof(std::complex))); + HANDLE_CUDA_ERROR(cudaMemcpy(devicePtr, dataPtr, + size * sizeof(std::complex), + cudaMemcpyDefault)); + // printf("Created CuDensityMatState ptr %p\n", devicePtr); + return std::make_unique(size, devicePtr, isDm); +} + +// Return the tensor at the given index. Throws +// for an invalid tensor index. +cudaq::SimulationState::Tensor +CuDensityMatState::getTensor(std::size_t tensorIdx) const { + if (tensorIdx != 0) { + throw std::runtime_error( + "CuDensityMatState state only supports a single tensor"); + } + const std::vector extents = + isDensityMatrix ? std::vector{dimension, dimension} + : std::vector{dimension}; + return Tensor{devicePtr, extents, precision::fp64}; +} + +std::complex +CuDensityMatState::operator()(std::size_t tensorIdx, + const std::vector &indices) { + const auto extractValue = [&](std::size_t idx) { + std::complex value; + HANDLE_CUDA_ERROR(cudaMemcpy( + &value, reinterpret_cast *>(devicePtr) + idx, + sizeof(std::complex), cudaMemcpyDeviceToHost)); + return value; + }; + + if (tensorIdx != 0) + throw std::runtime_error( + "CuDensityMatState state only supports a single tensor"); + if (isDensityMatrix) { + if (indices.size() != 2) + throw std::runtime_error("CuDensityMatState holding a density matrix " + "supports only 2-dimensional indices"); + if (indices[0] >= dimension || indices[1] >= dimension) + throw std::runtime_error("CuDensityMatState indices out of range"); + return extractValue(indices[0] * dimension + indices[1]); + } + if (indices.size() != 1) + throw std::runtime_error( + "CuDensityMatState holding a state vector supports " + "only 1-dimensional indices"); + if (indices[0] >= dimension) + throw std::runtime_error("Index out of bounds"); + return extractValue(indices[0]); +} + +// Copy the state device data to the user-provided host data pointer. +void CuDensityMatState::toHost(std::complex *userData, + std::size_t numElements) const { + if (numElements != dimension * (isDensityMatrix ? dimension : 1)) { + throw std::runtime_error("Number of elements in user data does not match " + "the size of the state"); + } + HANDLE_CUDA_ERROR(cudaMemcpy(userData, devicePtr, + numElements * sizeof(std::complex), + cudaMemcpyDeviceToHost)); +} + +// Copy the state device data to the user-provided host data pointer. +void CuDensityMatState::toHost(std::complex *userData, + std::size_t numElements) const { + throw std::runtime_error( + "CuDensityMatState: Data type mismatches - expecting " + "double-precision array."); +} + +// Free the device data. 
+void CuDensityMatState::destroyState() {
+  if (cudmState) {
+    cudensitymatDestroyState(cudmState);
+    cudmState = nullptr;
+  }
+  if (devicePtr != nullptr) {
+    HANDLE_CUDA_ERROR(cudaFree(devicePtr));
+    devicePtr = nullptr;
+    dimension = 0;
+    isDensityMatrix = false;
+  }
+}
+
+static size_t
+calculate_state_vector_size(const std::vector<int64_t> &hilbertSpaceDims) {
+  return std::accumulate(hilbertSpaceDims.begin(), hilbertSpaceDims.end(), 1,
+                         std::multiplies<>());
+}
+
+static size_t
+calculate_density_matrix_size(const std::vector<int64_t> &hilbertSpaceDims) {
+  size_t vectorSize = calculate_state_vector_size(hilbertSpaceDims);
+  return vectorSize * vectorSize;
+}
+
+CuDensityMatState::CuDensityMatState(
+    cudensitymatHandle_t handle,
+    const std::vector<std::complex<double>> &rawData,
+    const std::vector<int64_t> &dims)
+    : cudmHandle(handle), dimension(rawData.size()), cudmState(nullptr),
+      hilbertSpaceDims(dims) {
+  if (rawData.empty()) {
+    throw std::invalid_argument("Raw data cannot be empty.");
+  }
+  // Allocate device memory
+  size_t dataSize = rawData.size() * sizeof(std::complex<double>);
+  HANDLE_CUDA_ERROR(
+      cudaMalloc(reinterpret_cast<void **>(&devicePtr), dataSize));
+
+  // Copy data from host to device
+  HANDLE_CUDA_ERROR(
+      cudaMemcpy(devicePtr, rawData.data(), dataSize, cudaMemcpyHostToDevice));
+
+  // Determine if this is a density matrix or state vector
+  size_t rawDataSize = rawData.size();
+  size_t expectedDensityMatrixSize =
+      calculate_density_matrix_size(hilbertSpaceDims);
+  size_t expectedStateVectorSize =
+      calculate_state_vector_size(hilbertSpaceDims);
+
+  if (rawDataSize != expectedDensityMatrixSize &&
+      rawDataSize != expectedStateVectorSize) {
+    throw std::invalid_argument(
+        "Invalid rawData size for the given Hilbert space dimensions.");
+  }
+
+  // The size check above guarantees exactly one of the two cases, so purity
+  // is always initialized.
+  const cudensitymatStatePurity_t purity =
+      (rawDataSize == expectedDensityMatrixSize)
+          ? CUDENSITYMAT_STATE_PURITY_MIXED
+          : CUDENSITYMAT_STATE_PURITY_PURE;
+
+  HANDLE_CUDM_ERROR(cudensitymatCreateState(
+      cudmHandle, purity, static_cast<int32_t>(hilbertSpaceDims.size()),
+      hilbertSpaceDims.data(), 1, CUDA_C_64F, &cudmState));
+
+  // Retrieve the number of state components
+  int32_t numStateComponents;
+  HANDLE_CUDM_ERROR(cudensitymatStateGetNumComponents(cudmHandle, cudmState,
+                                                      &numStateComponents));
+
+  // Retrieve the storage size for each component
+  std::vector<std::size_t> componentBufferSizes(numStateComponents);
+  HANDLE_CUDM_ERROR(cudensitymatStateGetComponentStorageSize(
+      cudmHandle, cudmState, numStateComponents, componentBufferSizes.data()));
+
+  // Validate device memory
+  size_t totalSize = std::accumulate(componentBufferSizes.begin(),
+                                     componentBufferSizes.end(),
+                                     std::size_t{0});
+  if (totalSize > rawData.size() * sizeof(std::complex<double>)) {
+    throw std::invalid_argument(
+        "Device memory size is insufficient to cover all components.");
+  }
+
+  // Attach storage for using device memory (devicePtr)
+  std::vector<void *> componentBuffers(numStateComponents);
+  size_t offset = 0;
+  for (int32_t i = 0; i < numStateComponents; i++) {
+    componentBuffers[i] = static_cast<void *>(
+        static_cast<std::complex<double> *>(devicePtr) + offset);
+    offset += componentBufferSizes[i] / sizeof(std::complex<double>);
+  }
+
+  HANDLE_CUDM_ERROR(cudensitymatStateAttachComponentStorage(
+      cudmHandle, cudmState, numStateComponents, componentBuffers.data(),
+      componentBufferSizes.data()));
+}
+
+CuDensityMatState::CuDensityMatState(cudensitymatHandle_t handle,
+                                     const CuDensityMatState &simState,
+                                     const std::vector<int64_t> &dims)
+    : cudmHandle(handle), hilbertSpaceDims(dims) {
+
+  const bool
isDensityMat = + simState.dimension == calculate_density_matrix_size(hilbertSpaceDims); + dimension = simState.dimension; + + const size_t dataSize = dimension * sizeof(std::complex); + HANDLE_CUDA_ERROR( + cudaMalloc(reinterpret_cast(&devicePtr), dataSize)); + + HANDLE_CUDA_ERROR( + cudaMemcpy(devicePtr, simState.devicePtr, dataSize, cudaMemcpyDefault)); + + const cudensitymatStatePurity_t purity = isDensityMat + ? CUDENSITYMAT_STATE_PURITY_MIXED + : CUDENSITYMAT_STATE_PURITY_PURE; + HANDLE_CUDM_ERROR(cudensitymatCreateState( + cudmHandle, purity, static_cast(hilbertSpaceDims.size()), + hilbertSpaceDims.data(), 1, CUDA_C_64F, &cudmState)); + + // Query the size of the quantum state storage + std::size_t storageSize{0}; // only one storage component (tensor) is needed + HANDLE_CUDM_ERROR(cudensitymatStateGetComponentStorageSize( + cudmHandle, cudmState, + 1, // only one storage component + &storageSize)); // storage size in bytes + const std::size_t stateVolume = + storageSize / sizeof(std::complex); // quantum state tensor volume + // (number of elements) + assert(stateVolume == dimension); + // std::cout << "Quantum state storage size (bytes) = " << storageSize + // << std::endl; + + // Attach initialized GPU storage to the input quantum state + HANDLE_CUDM_ERROR(cudensitymatStateAttachComponentStorage( + cudmHandle, cudmState, + 1, // only one storage component (tensor) + std::vector({devicePtr}) + .data(), // pointer to the GPU storage for the quantum state + std::vector({storageSize}) + .data())); // size of the GPU storage for the quantum state +} + +CuDensityMatState CuDensityMatState::zero_like(const CuDensityMatState &other) { + CuDensityMatState state; + state.cudmHandle = other.cudmHandle; + state.hilbertSpaceDims = other.hilbertSpaceDims; + state.dimension = other.dimension; + const size_t dataSize = state.dimension * sizeof(std::complex); + HANDLE_CUDA_ERROR( + cudaMalloc(reinterpret_cast(&state.devicePtr), dataSize)); + HANDLE_CUDA_ERROR(cudaMemset(state.devicePtr, 0, dataSize)); + + const size_t expectedDensityMatrixSize = + calculate_density_matrix_size(state.hilbertSpaceDims); + const bool isDensityMat = expectedDensityMatrixSize == state.dimension; + const cudensitymatStatePurity_t purity = isDensityMat + ? 
CUDENSITYMAT_STATE_PURITY_MIXED + : CUDENSITYMAT_STATE_PURITY_PURE; + HANDLE_CUDM_ERROR(cudensitymatCreateState( + state.cudmHandle, purity, + static_cast<int32_t>(state.hilbertSpaceDims.size()), + state.hilbertSpaceDims.data(), 1, CUDA_C_64F, &state.cudmState)); + + // Query the size of the quantum state storage + std::size_t storageSize{0}; // only one storage component (tensor) is needed + HANDLE_CUDM_ERROR(cudensitymatStateGetComponentStorageSize( + state.cudmHandle, state.cudmState, + 1, // only one storage component + &storageSize)); // storage size in bytes + const std::size_t stateVolume = + storageSize / sizeof(std::complex<double>); // quantum state tensor volume + // (number of elements) + assert(stateVolume == state.dimension); + // std::cout << "Quantum state storage size (bytes) = " << storageSize + // << std::endl; + + // Attach initialized GPU storage to the input quantum state + HANDLE_CUDM_ERROR(cudensitymatStateAttachComponentStorage( + state.cudmHandle, state.cudmState, + 1, // only one storage component (tensor) + std::vector<void *>({state.devicePtr}) + .data(), // pointer to the GPU storage for the quantum state + std::vector<std::size_t>({storageSize}) + .data())); // size of the GPU storage for the quantum state + return state; +} + +CuDensityMatState CuDensityMatState::clone(const CuDensityMatState &other) { + CuDensityMatState state; + state.cudmHandle = other.cudmHandle; + state.hilbertSpaceDims = other.hilbertSpaceDims; + state.dimension = other.dimension; + const size_t dataSize = state.dimension * sizeof(std::complex<double>); + HANDLE_CUDA_ERROR( + cudaMalloc(reinterpret_cast<void **>(&state.devicePtr), dataSize)); + HANDLE_CUDA_ERROR(cudaMemcpy(state.devicePtr, other.devicePtr, dataSize, + cudaMemcpyDefault)); + + const size_t expectedDensityMatrixSize = + calculate_density_matrix_size(state.hilbertSpaceDims); + const bool isDensityMat = expectedDensityMatrixSize == state.dimension; + const cudensitymatStatePurity_t purity = isDensityMat + ?
CUDENSITYMAT_STATE_PURITY_MIXED + : CUDENSITYMAT_STATE_PURITY_PURE; + HANDLE_CUDM_ERROR(cudensitymatCreateState( + state.cudmHandle, purity, + static_cast<int32_t>(state.hilbertSpaceDims.size()), + state.hilbertSpaceDims.data(), 1, CUDA_C_64F, &state.cudmState)); + + // Query the size of the quantum state storage + std::size_t storageSize{0}; // only one storage component (tensor) is needed + HANDLE_CUDM_ERROR(cudensitymatStateGetComponentStorageSize( + state.cudmHandle, state.cudmState, + 1, // only one storage component + &storageSize)); // storage size in bytes + const std::size_t stateVolume = + storageSize / sizeof(std::complex<double>); // quantum state tensor volume + // (number of elements) + assert(stateVolume == state.dimension); + // std::cout << "Quantum state storage size (bytes) = " << storageSize + // << std::endl; + + // Attach initialized GPU storage to the input quantum state + HANDLE_CUDM_ERROR(cudensitymatStateAttachComponentStorage( + state.cudmHandle, state.cudmState, + 1, // only one storage component (tensor) + std::vector<void *>({state.devicePtr}) + .data(), // pointer to the GPU storage for the quantum state + std::vector<std::size_t>({storageSize}) + .data())); // size of the GPU storage for the quantum state + return state; +} + +CuDensityMatState::CuDensityMatState(CuDensityMatState &&other) noexcept + : isDensityMatrix(other.isDensityMatrix), dimension(other.dimension), + devicePtr(other.devicePtr), cudmState(other.cudmState), + cudmHandle(other.cudmHandle), hilbertSpaceDims(other.hilbertSpaceDims) { + other.isDensityMatrix = false; + other.dimension = 0; + other.devicePtr = nullptr; + + other.cudmState = nullptr; + other.cudmHandle = nullptr; + other.hilbertSpaceDims.clear(); +} + +CuDensityMatState & +CuDensityMatState::operator=(CuDensityMatState &&other) noexcept { + if (this != &other) { + // Free existing resources + if (cudmState) { + cudensitymatDestroyState(cudmState); + } + if (devicePtr) { + cudaFree(devicePtr); + } + + // Move data from other + isDensityMatrix = other.isDensityMatrix; + dimension = other.dimension; + devicePtr = other.devicePtr; + cudmState = other.cudmState; + cudmHandle = other.cudmHandle; + hilbertSpaceDims = std::move(other.hilbertSpaceDims); + + // Nullify other + other.isDensityMatrix = false; + other.dimension = 0; + other.devicePtr = nullptr; + + other.cudmState = nullptr; + } + return *this; +} + +CuDensityMatState::~CuDensityMatState() { destroyState(); } + +bool CuDensityMatState::is_initialized() const { return cudmState != nullptr; } + +bool cudaq::CuDensityMatState::is_density_matrix() const { + if (!is_initialized()) { + return false; + } + + return dimension == calculate_density_matrix_size(hilbertSpaceDims); +} + +CuDensityMatState cudaq::CuDensityMatState::to_density_matrix() const { + if (!is_initialized()) { + throw std::runtime_error("State is not initialized."); + } + + if (is_density_matrix()) { + throw std::runtime_error("State is already a density matrix."); + } + + size_t vectorSize = calculate_state_vector_size(hilbertSpaceDims); + std::vector<std::complex<double>> stateVecData(vectorSize); + HANDLE_CUDA_ERROR(cudaMemcpy(stateVecData.data(), devicePtr, + dimension * sizeof(std::complex<double>), + cudaMemcpyDeviceToHost)); + size_t expectedDensityMatrixSize = vectorSize * vectorSize; + std::vector<std::complex<double>> densityMatrix(expectedDensityMatrixSize); + + for (size_t i = 0; i < vectorSize; i++) { + for (size_t j = 0; j < vectorSize; j++) { + densityMatrix[i * vectorSize + j] = + stateVecData[i] * std::conj(stateVecData[j]); + } + } + + return CuDensityMatState(cudmHandle,
densityMatrix, hilbertSpaceDims); +} + +cudensitymatState_t cudaq::CuDensityMatState::get_impl() const { + return cudmState; +} + +void *cudaq::CuDensityMatState::get_device_pointer() const { return devicePtr; } + +std::vector<int64_t> cudaq::CuDensityMatState::get_hilbert_space_dims() const { + return hilbertSpaceDims; +} + +cudensitymatHandle_t cudaq::CuDensityMatState::get_handle() const { + return cudmHandle; +} + +void CuDensityMatState::initialize_cudm(cudensitymatHandle_t handleToSet, + const std::vector<int64_t> &dims) { + cudmHandle = handleToSet; + hilbertSpaceDims = dims; + size_t expectedDensityMatrixSize = + calculate_density_matrix_size(hilbertSpaceDims); + size_t expectedStateVectorSize = + calculate_state_vector_size(hilbertSpaceDims); + + if (dimension != expectedDensityMatrixSize && + dimension != expectedStateVectorSize) { + throw std::invalid_argument("Invalid hilbertSpaceDims for the state data"); + } + + const cudensitymatStatePurity_t purity = + dimension == expectedDensityMatrixSize ? CUDENSITYMAT_STATE_PURITY_MIXED + : CUDENSITYMAT_STATE_PURITY_PURE; + + HANDLE_CUDM_ERROR(cudensitymatCreateState( + cudmHandle, purity, static_cast<int32_t>(hilbertSpaceDims.size()), + hilbertSpaceDims.data(), 1, CUDA_C_64F, &cudmState)); + + std::size_t storageSize; + HANDLE_CUDM_ERROR(cudensitymatStateGetComponentStorageSize( + cudmHandle, cudmState, + 1, // only one storage component + &storageSize)); // storage size in bytes + // Attach initialized GPU storage to the input quantum state + HANDLE_CUDM_ERROR(cudensitymatStateAttachComponentStorage( + cudmHandle, cudmState, + 1, // only one storage component (tensor) + std::vector<void *>({devicePtr}) + .data(), // pointer to the GPU storage for the quantum state + std::vector<std::size_t>({storageSize}) + .data())); // size of the GPU storage for the quantum state +} + +CuDensityMatState +cudaq::CuDensityMatState::operator+(const CuDensityMatState &other) const { + if (dimension != other.dimension) { + throw std::invalid_argument("State size mismatch for addition."); + } + + CuDensityMatState result = CuDensityMatState::clone(*this); + + double scalingFactor = 1.0; + double *gpuScalingFactor; + HANDLE_CUDA_ERROR( + cudaMalloc(reinterpret_cast<void **>(&gpuScalingFactor), sizeof(double))); + HANDLE_CUDA_ERROR(cudaMemcpy(gpuScalingFactor, &scalingFactor, sizeof(double), + cudaMemcpyHostToDevice)); + + HANDLE_CUDM_ERROR(cudensitymatStateComputeAccumulation( + cudmHandle, other.get_impl(), result.get_impl(), gpuScalingFactor, 0)); + + HANDLE_CUDA_ERROR(cudaFree(gpuScalingFactor)); + + return result; +} + +CuDensityMatState & +cudaq::CuDensityMatState::operator+=(const CuDensityMatState &other) { + if (dimension != other.dimension) { + throw std::invalid_argument( + fmt::format("State size mismatch for addition ({} vs {}).", dimension, + other.dimension)); + } + + double scalingFactor = 1.0; + double *gpuScalingFactor; + cudaMalloc(reinterpret_cast<void **>(&gpuScalingFactor), sizeof(double)); + cudaMemcpy(gpuScalingFactor, &scalingFactor, sizeof(double), + cudaMemcpyHostToDevice); + + HANDLE_CUDM_ERROR(cudensitymatStateComputeAccumulation( + cudmHandle, other.get_impl(), cudmState, gpuScalingFactor, 0)); + + cudaFree(gpuScalingFactor); + + return *this; +} + +CuDensityMatState & +cudaq::CuDensityMatState::operator*=(const std::complex<double> &scalar) { + void *gpuScalar; + HANDLE_CUDA_ERROR(cudaMalloc(&gpuScalar, sizeof(std::complex<double>))); + HANDLE_CUDA_ERROR(cudaMemcpy(gpuScalar, &scalar, sizeof(std::complex<double>), + cudaMemcpyHostToDevice)); + + HANDLE_CUDM_ERROR( + cudensitymatStateComputeScaling(cudmHandle,
cudmState, gpuScalar, 0)); + + HANDLE_CUDA_ERROR(cudaFree(gpuScalar)); + + return *this; +} + +CuDensityMatState cudaq::CuDensityMatState::operator*(double scalar) const { + void *gpuScalar; + HANDLE_CUDA_ERROR(cudaMalloc(&gpuScalar, sizeof(std::complex))); + + std::complex complexScalar(scalar, 0.0); + HANDLE_CUDA_ERROR(cudaMemcpy(gpuScalar, &complexScalar, + sizeof(std::complex), + cudaMemcpyHostToDevice)); + + CuDensityMatState result = CuDensityMatState::clone(*this); + + HANDLE_CUDM_ERROR(cudensitymatStateComputeScaling( + cudmHandle, result.cudmState, gpuScalar, 0)); + + HANDLE_CUDA_ERROR(cudaFree(gpuScalar)); + + return result; +} +} // namespace cudaq diff --git a/runtime/nvqir/cudensitymat/CuDensityMatState.h b/runtime/nvqir/cudensitymat/CuDensityMatState.h index 5c0b195d4d..630663b4ce 100644 --- a/runtime/nvqir/cudensitymat/CuDensityMatState.h +++ b/runtime/nvqir/cudensitymat/CuDensityMatState.h @@ -7,21 +7,8 @@ ******************************************************************************/ #pragma once -#include "common/EigenDense.h" -#include "common/Logger.h" #include "common/SimulationState.h" -#include "cudaq/utils/cudaq_utils.h" -#include - -#define HANDLE_CUDA_ERROR(x) \ - { \ - const auto err = x; \ - if (err != cudaSuccess) { \ - throw std::runtime_error( \ - fmt::format("[CuDensityMatState] %{} in {} (line {})", \ - cudaGetErrorString(err), __FUNCTION__, __LINE__)); \ - } \ - }; +#include namespace cudaq { /// @cond @@ -34,6 +21,10 @@ class CuDensityMatState : public cudaq::SimulationState { // State device data pointer. void *devicePtr = nullptr; + cudensitymatState_t cudmState = nullptr; + cudensitymatHandle_t cudmHandle = nullptr; + std::vector hilbertSpaceDims; + public: CuDensityMatState(std::size_t s, void *ptr, bool isDm) : isDensityMatrix(isDm), devicePtr(ptr), @@ -43,66 +34,13 @@ class CuDensityMatState : public cudaq::SimulationState { std::size_t getNumQubits() const override { return std::log2(dimension); } - std::complex overlap(const cudaq::SimulationState &other) override { - if (getTensor().extents != other.getTensor().extents) - throw std::runtime_error( - "[CuDensityMatState] overlap error - other state " - "dimension not equal to this state dimension."); - - if (other.getPrecision() != getPrecision()) { - throw std::runtime_error( - "[CuDensityMatState] overlap error - precision mismatch."); - } - - if (!isDensityMatrix) { - Eigen::VectorXcd state(dimension); - const auto size = dimension; - HANDLE_CUDA_ERROR(cudaMemcpy(state.data(), devicePtr, - size * sizeof(std::complex), - cudaMemcpyDeviceToHost)); - - Eigen::VectorXcd otherState(dimension); - HANDLE_CUDA_ERROR(cudaMemcpy(otherState.data(), other.getTensor().data, - size * sizeof(std::complex), - cudaMemcpyDeviceToHost)); - return std::abs(std::inner_product( - state.begin(), state.end(), otherState.begin(), - std::complex{0., 0.}, [](auto a, auto b) { return a + b; }, - [](auto a, auto b) { return a * std::conj(b); })); - } - - // FIXME: implement this in GPU memory - Eigen::MatrixXcd state(dimension, dimension); - const auto size = dimension * dimension; - HANDLE_CUDA_ERROR(cudaMemcpy(state.data(), devicePtr, - size * sizeof(std::complex), - cudaMemcpyDeviceToHost)); - - Eigen::MatrixXcd otherState(dimension, dimension); - HANDLE_CUDA_ERROR(cudaMemcpy(otherState.data(), other.getTensor().data, - size * sizeof(std::complex), - cudaMemcpyDeviceToHost)); - - return (state.adjoint() * otherState).trace(); - } + std::complex overlap(const cudaq::SimulationState &other) override; std::complex - 
getAmplitude(const std::vector &basisState) override { - throw std::runtime_error( - "[CuDensityMatState] getAmplitude by basis states is not supported. " - "Please use direct indexing access instead."); - } + getAmplitude(const std::vector &basisState) override; // Dump the state to the given output stream - void dump(std::ostream &os) const override { - // get state data from device to print - Eigen::MatrixXcd state(dimension, isDensityMatrix ? dimension : 1); - const auto size = isDensityMatrix ? dimension * dimension : dimension; - HANDLE_CUDA_ERROR(cudaMemcpy(state.data(), devicePtr, - size * sizeof(std::complex), - cudaMemcpyDeviceToHost)); - os << state << std::endl; - } + void dump(std::ostream &os) const override; // This state is GPU device data, always return true. bool isDeviceData() const override { return true; } @@ -116,50 +54,11 @@ class CuDensityMatState : public cudaq::SimulationState { std::unique_ptr createFromSizeAndPtr(std::size_t size, void *dataPtr, - std::size_t type) override { - bool isDm = false; - if (type == cudaq::detail::variant_index()) { - if (size != 1) - throw std::runtime_error("[CuDensityMatState]: createFromSizeAndPtr " - "expects a single tensor"); - auto *casted = - reinterpret_cast(dataPtr); - - auto [ptr, extents] = casted[0]; - if (extents.size() > 2) - throw std::runtime_error( - "[CuDensityMatState]: createFromSizeAndPtr only " - "accept 1D or 2D arrays"); - - isDm = extents.size() == 2; - size = std::reduce(extents.begin(), extents.end(), 1, std::multiplies()); - dataPtr = const_cast(ptr); - } - - std::complex *devicePtr = nullptr; - - HANDLE_CUDA_ERROR( - cudaMalloc((void **)&devicePtr, size * sizeof(std::complex))); - HANDLE_CUDA_ERROR(cudaMemcpy(devicePtr, dataPtr, - size * sizeof(std::complex), - cudaMemcpyDefault)); - // printf("Created CuDensityMatState ptr %p\n", devicePtr); - return std::make_unique(size, devicePtr, isDm); - } + std::size_t type) override; // Return the tensor at the given index. Throws // for an invalid tensor index. - Tensor getTensor(std::size_t tensorIdx = 0) const override { - if (tensorIdx != 0) { - throw std::runtime_error( - "CuDensityMatState state only supports a single tensor"); - } - const std::vector extents = - isDensityMatrix ? 
std::vector{dimension, dimension} - : std::vector{dimension}; - return Tensor{devicePtr, extents, precision::fp64}; - } + Tensor getTensor(std::size_t tensorIdx = 0) const override; // Return all tensors that represent this state std::vector getTensors() const override { return {getTensor()}; } @@ -169,64 +68,84 @@ class CuDensityMatState : public cudaq::SimulationState { std::complex operator()(std::size_t tensorIdx, - const std::vector &indices) override { - const auto extractValue = [&](std::size_t idx) { - std::complex value; - HANDLE_CUDA_ERROR(cudaMemcpy( - &value, reinterpret_cast *>(devicePtr) + idx, - sizeof(std::complex), cudaMemcpyDeviceToHost)); - return value; - }; - - if (tensorIdx != 0) - throw std::runtime_error( - "CuDensityMatState state only supports a single tensor"); - if (isDensityMatrix) { - if (indices.size() != 2) - throw std::runtime_error("CuDensityMatState holding a density matrix " - "supports only 2-dimensional indices"); - if (indices[0] >= dimension || indices[1] >= dimension) - throw std::runtime_error("CuDensityMatState indices out of range"); - return extractValue(indices[0] * dimension + indices[1]); - } - if (indices.size() != 1) - throw std::runtime_error( - "CuDensityMatState holding a state vector supports " - "only 1-dimensional indices"); - if (indices[0] >= dimension) - throw std::runtime_error("Index out of bounds"); - return extractValue(indices[0]); - } + const std::vector &indices) override; // Copy the state device data to the user-provided host data pointer. void toHost(std::complex *userData, - std::size_t numElements) const override { - if (numElements != dimension * (isDensityMatrix ? dimension : 1)) { - throw std::runtime_error("Number of elements in user data does not match " - "the size of the state"); - } - HANDLE_CUDA_ERROR(cudaMemcpy(userData, devicePtr, - numElements * sizeof(std::complex), - cudaMemcpyDeviceToHost)); - } + std::size_t numElements) const override; // Copy the state device data to the user-provided host data pointer. void toHost(std::complex *userData, - std::size_t numElements) const override { - throw std::runtime_error( - "CuDensityMatState: Data type mismatches - expecting " - "double-precision array."); - } - + std::size_t numElements) const override; // Free the device data. - void destroyState() override { - if (devicePtr != nullptr) { - HANDLE_CUDA_ERROR(cudaFree(devicePtr)); - devicePtr = nullptr; - dimension = 0; - isDensityMatrix = false; - } - } + void destroyState() override; + + // TODO: Tidy this up, remove unnecessary methods + /// @brief To initialize state with raw data. + explicit CuDensityMatState(cudensitymatHandle_t handle, + const std::vector> &rawData, + const std::vector &hilbertSpaceDims); + /// @brief To initialize state from a `cudaq::state` + explicit CuDensityMatState(cudensitymatHandle_t handle, + const CuDensityMatState &simState, + const std::vector &hilbertSpaceDims); + // @brief Create a zero state + static CuDensityMatState zero_like(const CuDensityMatState &other); + static CuDensityMatState clone(const CuDensityMatState &other); + // Prevent copies (avoids double free issues) + CuDensityMatState(const CuDensityMatState &) = delete; + CuDensityMatState &operator=(const CuDensityMatState &) = delete; + + // Allow move semantics + CuDensityMatState(CuDensityMatState &&other) noexcept; + CuDensityMatState &operator=(CuDensityMatState &&other) noexcept; + + /// @brief Destructor to clean up resources + ~CuDensityMatState(); + + /// @brief Check if the state is initialized. 
+ /// @return True if the state is initialized, false otherwise. + bool is_initialized() const; + + /// @brief Check if the state is a density matrix. + /// @return True if the state is a density matrix, false otherwise. + bool is_density_matrix() const; + + /// @brief Convert the state vector to a density matrix. + /// @return A new CuDensityMatState representing the density matrix. + CuDensityMatState to_density_matrix() const; + + /// @brief Get the underlying implementation (if any). + /// @return The underlying state implementation. + cudensitymatState_t get_impl() const; + + /// @brief Get the pointer to device memory buffer storing the state. + /// @return GPU device pointer + void *get_device_pointer() const; + + /// @brief Get a copy of the hilbert space dimensions for the quantum state. + /// @return A copy of the hilbert space dimensions of a vector of integers. + std::vector get_hilbert_space_dims() const; + + /// @brief Returns the handle + /// @return The handle associated with the state + cudensitymatHandle_t get_handle() const; + + void initialize_cudm(cudensitymatHandle_t handleToSet, + const std::vector &hilbertSpaceDims); + /// @brief Addition operator (element-wise) + /// @return The new state after the summation of two states. + CuDensityMatState operator+(const CuDensityMatState &other) const; + + /// @brief Accumulation operator + /// @return Accumulates the summation of two states. + CuDensityMatState &operator+=(const CuDensityMatState &other); + + /// @brief Scalar multiplication operator + /// @return The new state after multiplying scalar with the current state. + CuDensityMatState &operator*=(const std::complex &scalar); + + CuDensityMatState operator*(double scalar) const; }; /// @endcond } // namespace cudaq diff --git a/runtime/nvqir/cudensitymat/CuDensityMatUtils.h b/runtime/nvqir/cudensitymat/CuDensityMatUtils.h new file mode 100644 index 0000000000..255d2cf77a --- /dev/null +++ b/runtime/nvqir/cudensitymat/CuDensityMatUtils.h @@ -0,0 +1,35 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once +#include "cudm_error_handling.h" +#include +#include +#include + +namespace cudaq { +namespace dynamics { +// GPU memory management +template +void *createArrayGpu(const std::vector> &cpuArray) { + void *gpuArray{nullptr}; + const std::size_t arraySizeBytes = cpuArray.size() * sizeof(std::complex); + if (arraySizeBytes > 0) { + HANDLE_CUDA_ERROR(cudaMalloc(&gpuArray, arraySizeBytes)); + HANDLE_CUDA_ERROR(cudaMemcpy(gpuArray, + static_cast(cpuArray.data()), + arraySizeBytes, cudaMemcpyHostToDevice)); + } + return gpuArray; +} +inline void destroyArrayGpu(void *gpuArray) { + if (gpuArray) + HANDLE_CUDA_ERROR(cudaFree(gpuArray)); +} +} // namespace dynamics +} // namespace cudaq diff --git a/runtime/nvqir/cudensitymat/cudm_error_handling.h b/runtime/nvqir/cudensitymat/cudm_error_handling.h new file mode 100644 index 0000000000..d72be33906 --- /dev/null +++ b/runtime/nvqir/cudensitymat/cudm_error_handling.h @@ -0,0 +1,30 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. 
* + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once +#include <cudensitymat.h> +#include <fmt/core.h> +#include <stdexcept> + +#define HANDLE_CUDM_ERROR(x)                                                   \ + {                                                                            \ + const auto err = x;                                                        \ + if (err != CUDENSITYMAT_STATUS_SUCCESS) {                                  \ + throw std::runtime_error(fmt::format("[cudaq] %{} in {} (line {})", err, \ + __FUNCTION__, __LINE__));            \ + }                                                                          \ + } + +#define HANDLE_CUDA_ERROR(x)                                                   \ + {                                                                            \ + const auto err = x;                                                        \ + if (err != cudaSuccess) {                                                  \ + throw std::runtime_error(fmt::format("[cuda] %{} in {} (line {})", err,  \ + __FUNCTION__, __LINE__));            \ + }                                                                          \ + } diff --git a/runtime/nvqir/cudensitymat/cudm_evolution.cpp b/runtime/nvqir/cudensitymat/cudm_evolution.cpp new file mode 100644 index 0000000000..2d7f5fdf50 --- /dev/null +++ b/runtime/nvqir/cudensitymat/cudm_evolution.cpp @@ -0,0 +1,99 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "CuDensityMatContext.h" +#include "CuDensityMatState.h" +#include "cudaq/dynamics_integrators.h" +#include "cudaq/evolution.h" +#include "cudm_error_handling.h" +#include "cudm_expectation.h" +#include "cudm_time_stepper.h" +#include +#include +namespace cudaq { +evolve_result evolve_single( + const operator_sum<cudaq::matrix_operator> &hamiltonian, + const std::map<int, int> &dimensions, const Schedule &schedule, + const state &initialState, BaseIntegrator &in_integrator, + const std::vector<operator_sum<cudaq::matrix_operator>> &collapse_operators, + const std::vector<operator_sum<cudaq::matrix_operator>> &observables, + bool store_intermediate_results, std::optional<int> shots_count) { + cudensitymatHandle_t handle = + dynamics::Context::getCurrentContext()->getHandle(); + std::vector<int64_t> dims; + for (const auto &[id, dim] : dimensions) + dims.emplace_back(dim); + const auto asCudmState = [](cudaq::state &cudaqState) -> CuDensityMatState * { + auto *simState = cudaq::state_helper::getSimulationState(&cudaqState); + auto *castSimState = dynamic_cast<CuDensityMatState *>(simState); + if (!castSimState) + throw std::runtime_error("Invalid state."); + return castSimState; + }; + + auto *cudmState = asCudmState(const_cast<state &>(initialState)); + cudmState->initialize_cudm(handle, dims); + + state initial_state = [&]() { + if (!collapse_operators.empty() && !cudmState->is_density_matrix()) { + return state(new CuDensityMatState(cudmState->to_density_matrix())); + } + return initialState; + }(); + + runge_kutta &integrator = dynamic_cast<runge_kutta &>(in_integrator); + SystemDynamics system; + system.hamiltonian = + const_cast<operator_sum<cudaq::matrix_operator> *>(&hamiltonian); + system.collapseOps = collapse_operators; + system.modeExtents = dims; + integrator.set_system(system, schedule); + integrator.set_state(initial_state, 0.0); + std::vector<cudm_expectation> expectations; + for (auto &obs : observables) + expectations.emplace_back(cudm_expectation( + handle, cudaq::dynamics::Context::getCurrentContext() + ->getOpConverter() + .convertToCudensitymatOperator({}, obs, dims))); + + std::vector<std::vector<double>> expectationVals; + std::vector<state> intermediateStates; + for (const auto &step : schedule) { + integrator.integrate(step); + auto [t, currentState] = integrator.get_state(); + if (store_intermediate_results) { + std::vector<double> expVals; + 
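+      // Note (added commentary): for each requested observable, the loop below re-prepares its cudensitymat expectation descriptor against the current state and evaluates it at this schedule step; only the real part is recorded, which assumes Hermitian observables.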
+ for (auto &expectation : expectations) { + auto *cudmState = asCudmState(currentState); + expectation.prepare(cudmState->get_impl()); + const auto expVal = expectation.compute(cudmState->get_impl(), step); + expVals.emplace_back(expVal.real()); + } + expectationVals.emplace_back(std::move(expVals)); + intermediateStates.emplace_back(currentState); + } + } + + if (store_intermediate_results) { + return evolve_result(intermediateStates, expectationVals); + } else { + // Only final state is needed + auto [finalTime, finalState] = integrator.get_state(); + std::vector expVals; + auto *cudmState = asCudmState(finalState); + for (auto &expectation : expectations) { + expectation.prepare(cudmState->get_impl()); + const auto expVal = expectation.compute(cudmState->get_impl(), finalTime); + expVals.emplace_back(expVal.real()); + } + return evolve_result(finalState, expVals); + } +} + +} // namespace cudaq \ No newline at end of file diff --git a/runtime/nvqir/cudensitymat/cudm_expectation.cpp b/runtime/nvqir/cudensitymat/cudm_expectation.cpp new file mode 100644 index 0000000000..418d297875 --- /dev/null +++ b/runtime/nvqir/cudensitymat/cudm_expectation.cpp @@ -0,0 +1,68 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "cudm_expectation.h" +#include "CuDensityMatContext.h" +#include "common/Logger.h" +#include "cudm_error_handling.h" +#include "CuDensityMatUtils.h" +namespace cudaq { +cudm_expectation::cudm_expectation(cudensitymatHandle_t handle, + cudensitymatOperator_t op) + : m_handle(handle), m_hamOp(op) { + HANDLE_CUDM_ERROR( + cudensitymatCreateExpectation(m_handle, m_hamOp, &m_expectation)); + HANDLE_CUDM_ERROR(cudensitymatCreateWorkspace(m_handle, &m_workspace)); +} + +cudm_expectation::~cudm_expectation() { + if (m_workspace) + cudensitymatDestroyWorkspace(m_workspace); + if (m_expectation) + cudensitymatDestroyExpectation(m_expectation); +} + +void cudm_expectation::prepare(cudensitymatState_t state) { + HANDLE_CUDM_ERROR(cudensitymatExpectationPrepare( + m_handle, m_expectation, state, CUDENSITYMAT_COMPUTE_64F, + dynamics::Context::getRecommendedWorkSpaceLimit(), m_workspace, 0x0)); +} +std::complex cudm_expectation::compute(cudensitymatState_t state, + double time) { + std::size_t requiredBufferSize = 0; + HANDLE_CUDM_ERROR(cudensitymatWorkspaceGetMemorySize( + m_handle, m_workspace, CUDENSITYMAT_MEMSPACE_DEVICE, + CUDENSITYMAT_WORKSPACE_SCRATCH, &requiredBufferSize)); + + void *workspaceBuffer = nullptr; + if (requiredBufferSize > 0) { + cudaq::info("Required buffer size for expectation compute: {}", + requiredBufferSize); + + workspaceBuffer = dynamics::Context::getCurrentContext()->getScratchSpace( + requiredBufferSize); + + // Attach workspace buffer + HANDLE_CUDM_ERROR(cudensitymatWorkspaceSetMemory( + m_handle, m_workspace, CUDENSITYMAT_MEMSPACE_DEVICE, + CUDENSITYMAT_WORKSPACE_SCRATCH, workspaceBuffer, requiredBufferSize)); + } + + auto *expectationValue_d = cudaq::dynamics::createArrayGpu( + std::vector>(1, {0.0, 0.0})); + HANDLE_CUDM_ERROR(cudensitymatExpectationCompute( + m_handle, m_expectation, time, 0, nullptr, state, expectationValue_d, + m_workspace, 0x0)); + std::complex result; + 
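+  // Added commentary: cudensitymatExpectationCompute above accumulated the scalar expectation value into device memory (expectationValue_d); the copy below brings it back to the host before the temporary GPU buffer is released. +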
HANDLE_CUDA_ERROR(cudaMemcpy(&result, expectationValue_d, + sizeof(std::complex), + cudaMemcpyDefault)); + cudaq::dynamics::destroyArrayGpu(expectationValue_d); + return result; +} +} // namespace cudaq diff --git a/runtime/nvqir/cudensitymat/cudm_expectation.h b/runtime/nvqir/cudensitymat/cudm_expectation.h new file mode 100644 index 0000000000..0dfe3a60ec --- /dev/null +++ b/runtime/nvqir/cudensitymat/cudm_expectation.h @@ -0,0 +1,35 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once +#include +#include +namespace cudaq { + +class cudm_expectation { + cudensitymatHandle_t m_handle{nullptr}; + cudensitymatOperator_t m_hamOp{nullptr}; + cudensitymatExpectation_t m_expectation{nullptr}; + cudensitymatWorkspaceDescriptor_t m_workspace{nullptr}; + +public: + cudm_expectation(cudensitymatHandle_t handle, cudensitymatOperator_t op); + cudm_expectation(const cudm_expectation &) = delete; + cudm_expectation &operator=(const cudm_expectation &) = delete; + cudm_expectation(cudm_expectation &&src) { + std::swap(m_handle, src.m_handle); + std::swap(m_hamOp, src.m_hamOp); + std::swap(m_expectation, src.m_expectation); + std::swap(m_workspace, src.m_workspace); + } + ~cudm_expectation(); + void prepare(cudensitymatState_t state); + std::complex compute(cudensitymatState_t state, double time); +}; + +} // namespace cudaq diff --git a/runtime/nvqir/cudensitymat/cudm_op_conversion.cpp b/runtime/nvqir/cudensitymat/cudm_op_conversion.cpp new file mode 100644 index 0000000000..126419be18 --- /dev/null +++ b/runtime/nvqir/cudensitymat/cudm_op_conversion.cpp @@ -0,0 +1,297 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +#include "cudm_op_conversion.h" +#include "cudm_error_handling.h" +#include +#include +#include + +using namespace cudaq; + +namespace cudaq { +// cudm_op_conversion::cudm_op_conversion(const cudensitymatHandle_t handle, +// const std::map &dimensions, +// std::shared_ptr schedule) +// : handle_(handle), dimensions_(dimensions), schedule_(schedule) { +// if (handle_ == nullptr) { +// throw std::runtime_error("Handle cannot be null."); +// } + +// if (dimensions_.empty()) { +// throw std::invalid_argument("Dimensions map must not be empty."); +// } +// } + +// std::vector> cudm_op_conversion::get_identity_matrix() { +// size_t dim = 1; +// for (const auto &entry : dimensions_) { +// dim *= entry.second; +// } + +// std::vector> identity_matrix(dim * dim, {0.0, 0.0}); +// for (size_t i = 0; i < dim; i++) { +// identity_matrix[i * dim + i] = {1.0, 0.0}; +// } + +// return identity_matrix; +// } + +// std::vector cudm_op_conversion::get_space_mode_extents() { +// std::vector space_mode_extents; +// for (const auto &dim : dimensions_) { +// space_mode_extents.push_back(dim.second); +// } + +// return space_mode_extents; +// } + +// cudensitymatOperatorTerm_t cudm_op_conversion::_scalar_to_op( +// const cudensitymatWrappedScalarCallback_t &scalar) { +// std::vector space_mode_extents = get_space_mode_extents(); + +// cudensitymatOperatorTerm_t op_term; +// HANDLE_CUDM_ERROR(cudensitymatCreateOperatorTerm( +// handle_, dimensions_.size(), space_mode_extents.data(), &op_term)); + +// void *tensor_data = create_array_gpu(get_identity_matrix()); +// if (!tensor_data) { +// throw std::runtime_error("Failed to allocate GPU memory for +// tensor_data."); +// } + +// std::vector mode_action_duality(dimensions_.size(), +// CUDENSITYMAT_OPERATOR_SPARSITY_NONE); + +// cudensitymatElementaryOperator_t identity; +// HANDLE_CUDM_ERROR(cudensitymatCreateElementaryOperator( +// handle_, dimensions_.size(), space_mode_extents.data(), +// CUDENSITYMAT_OPERATOR_SPARSITY_NONE, 0, mode_action_duality.data(), +// CUDA_C_64F, tensor_data, {nullptr, nullptr}, &identity)); + +// std::vector states_modes_acted_on(dimensions_.size()); +// std::iota(states_modes_acted_on.begin(), states_modes_acted_on.end(), 0); + +// HANDLE_CUDM_ERROR(cudensitymatOperatorTermAppendElementaryProduct( +// handle_, op_term, 1, &identity, states_modes_acted_on.data(), +// mode_action_duality.data(), {1.0, 0.0}, scalar)); + +// return op_term; +// } + +// cudensitymatOperator_t cudm_op_conversion::_callback_mult_op( +// const cudensitymatWrappedScalarCallback_t &scalar, +// const cudensitymatOperatorTerm_t &op) { +// if (!op) { +// throw std::invalid_argument("Invalid operator term (nullptr)."); +// } + +// std::vector space_mode_extents = get_space_mode_extents(); + +// cudensitymatOperatorTerm_t scalar_op = _scalar_to_op(scalar); + +// if (!scalar_op) { +// throw std::runtime_error("scalar_op is NULL."); +// } + +// cudensitymatOperator_t new_op; +// HANDLE_CUDM_ERROR(cudensitymatCreateOperator( +// handle_, static_cast(dimensions_.size()), +// space_mode_extents.data(), &new_op)); + +// std::vector mode_action_duality(dimensions_.size(), +// CUDENSITYMAT_OPERATOR_SPARSITY_NONE); + +// HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm(handle_, new_op, +// scalar_op, +// mode_action_duality.size(), +// {1.0, 0.0}, scalar)); + +// HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm( +// handle_, new_op, op, mode_action_duality.size(), {1.0, 0.0}, +// {nullptr, 
nullptr})); + +// return new_op; +// } + +// std::variant> +// cudm_op_conversion::tensor( +// const std::variant> &op1, +// const std::variant> &op2) { +// if (std::holds_alternative>(op1) && +// std::holds_alternative>(op2)) { +// return std::get>(op1) * +// std::get>(op2); +// } + +// if (std::holds_alternative>(op1)) { +// return _callback_mult_op( +// _wrap_callback(scalar_operator(std::get>(op1))), +// std::get(op2)); +// } + +// if (std::holds_alternative>(op2)) { +// return _callback_mult_op( +// _wrap_callback(scalar_operator(std::get>(op2))), +// std::get(op1)); +// } + +// if (std::holds_alternative(op1)) { +// return tensor( +// _scalar_to_op(std::get(op1)), +// std::get(op2)); +// } + +// if (std::holds_alternative(op2)) { +// return tensor( +// _scalar_to_op(std::get(op2)), +// std::get(op1)); +// } + +// std::vector space_mode_extents = get_space_mode_extents(); + +// cudensitymatOperator_t result; +// HANDLE_CUDM_ERROR(cudensitymatCreateOperator( +// handle_, dimensions_.size(), space_mode_extents.data(), &result)); + +// HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm( +// handle_, result, std::get(op1), 0, {1.0, +// 0.0}, {nullptr, nullptr})); +// HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm( +// handle_, result, std::get(op2), 0, {1.0, +// 0.0}, {nullptr, nullptr})); + +// return result; +// } + +// std::variant> +// cudm_op_conversion::mul(const std::variant> &op1, +// const std::variant> &op2) { +// return tensor(op1, op2); +// } + +// std::variant> +// cudm_op_conversion::add(const std::variant> &op1, +// const std::variant> &op2) { +// if (std::holds_alternative>(op1) && +// std::holds_alternative>(op2)) { +// return std::get>(op1) + +// std::get>(op2); +// } + +// if (std::holds_alternative>(op1)) { +// return _callback_mult_op( +// _wrap_callback(scalar_operator(std::get>(op1))), +// std::get(op2)); +// } + +// if (std::holds_alternative>(op2)) { +// return _callback_mult_op( +// _wrap_callback(scalar_operator(std::get>(op2))), +// std::get(op1)); +// } + +// // FIXME: Need to check later +// int32_t num_space_modes = +// std::max(static_cast(dimensions_.size()), 1); +// std::vector space_mode_extents = get_space_mode_extents(); + +// cudensitymatOperator_t result; +// HANDLE_CUDM_ERROR(cudensitymatCreateOperator( +// handle_, num_space_modes, space_mode_extents.data(), &result)); + +// HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm( +// handle_, result, std::get(op1), 0, {1.0, +// 0.0}, {nullptr, nullptr})); +// HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm( +// handle_, result, std::get(op2), 0, {1.0, +// 0.0}, {nullptr, nullptr})); + +// return result; +// } + +// std::variant> +// cudm_op_conversion::evaluate( +// const std::variant> &op) { +// if (std::holds_alternative(op)) { +// const scalar_operator &scalar_op = std::get(op); + +// ScalarCallbackFunction generator = scalar_op.get_generator(); + +// if (!generator) { +// return scalar_op.evaluate({}); +// } else { +// return _wrap_callback(scalar_op); +// } +// } + +// if (std::holds_alternative(op)) { +// const matrix_operator &mat_op = std::get(op); + +// std::vector space_mode_extents = get_space_mode_extents(); + +// cudensitymatOperatorTerm_t opterm; +// HANDLE_CUDM_ERROR(cudensitymatCreateOperatorTerm( +// handle_, dimensions_.size(), space_mode_extents.data(), &opterm)); + +// cudensitymatElementaryOperator_t elem_op; +// // Need to check if it is a static, use nullptr +// // or a callback and then only use callback +// cudensitymatWrappedTensorCallback_t callback = +// 
_wrap_callback_tensor(mat_op); + +// auto flat_matrix = flatten_matrix(mat_op.to_matrix(dimensions_, {})); + +// void *tensor_data = create_array_gpu(flat_matrix); +// if (!tensor_data) { +// throw std::runtime_error( +// "Failed to allocate GPU memory for tensor_data."); +// } + +// std::vector mode_action_duality( +// mat_op.degrees.size(), CUDENSITYMAT_OPERATOR_SPARSITY_NONE); + +// HANDLE_CUDM_ERROR(cudensitymatCreateElementaryOperator( +// handle_, mat_op.degrees.size(), space_mode_extents.data(), +// CUDENSITYMAT_OPERATOR_SPARSITY_NONE, 0, mode_action_duality.data(), +// CUDA_C_64F, tensor_data, callback, &elem_op)); + +// HANDLE_CUDM_ERROR(cudensitymatOperatorTermAppendElementaryProduct( +// handle_, opterm, 1, &elem_op, mat_op.degrees.data(), +// mode_action_duality.data(), {1.0, 0.0}, {nullptr, nullptr})); + +// return opterm; +// } + +// if (std::holds_alternative>(op)) { +// throw std::runtime_error( +// "Handling of product_operator is not implemented."); +// } + +// throw std::runtime_error( +// "Unknown operator type in cudm_op_conversion::evaluate."); +// } + +} // namespace cudaq \ No newline at end of file diff --git a/runtime/nvqir/cudensitymat/cudm_op_conversion.h b/runtime/nvqir/cudensitymat/cudm_op_conversion.h new file mode 100644 index 0000000000..bce97a9bb9 --- /dev/null +++ b/runtime/nvqir/cudensitymat/cudm_op_conversion.h @@ -0,0 +1,78 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include "cudaq/operators.h" +#include "cudaq/schedule.h" +#include "cudm_helpers.h" +#include +#include + +namespace cudaq { +class cudm_op_conversion { +public: + cudm_op_conversion(const cudensitymatHandle_t handle, + const std::map &dimensions, + const std::shared_ptr schedule = nullptr); + + // Tensor product of two operator terms + std::variant> + tensor(const std::variant> &op1, + const std::variant> &op2); + + // Multiplication of two operator terms + std::variant> + mul(const std::variant> &op1, + const std::variant> &op2); + + // Addition of two operator terms + std::variant> + add(const std::variant> &op1, + const std::variant> &op2); + + // Evaluate an operator and convert it to cudensitymatOperatorTerm_t + std::variant> + evaluate(const std::variant> &op); + +private: + cudensitymatHandle_t handle_; + std::map dimensions_; + std::shared_ptr schedule_; + + cudensitymatOperatorTerm_t + _callback_mult_op(const cudensitymatWrappedScalarCallback_t &scalar, + const cudensitymatOperatorTerm_t &op); + cudensitymatOperatorTerm_t + _scalar_to_op(const cudensitymatWrappedScalarCallback_t &scalar); + // cudensitymatWrappedScalarCallback_t _wrap_callback(const scalar_operator + // &op); cudensitymatWrappedTensorCallback_t _wrap_callback_tensor(const + // matrix_operator &op); + + std::vector> get_identity_matrix(); + + std::vector get_space_mode_extents(); +}; +} // namespace cudaq \ No newline at end of file diff --git a/runtime/nvqir/cudensitymat/cudm_time_stepper.cpp b/runtime/nvqir/cudensitymat/cudm_time_stepper.cpp new file mode 100644 index 0000000000..0bc4fd10d5 --- /dev/null +++ b/runtime/nvqir/cudensitymat/cudm_time_stepper.cpp @@ -0,0 +1,91 @@ 
+/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "cudm_time_stepper.h" +#include "CuDensityMatContext.h" +#include "cudm_error_handling.h" +namespace cudaq { +cudmStepper::cudmStepper(cudensitymatHandle_t handle, + cudensitymatOperator_t liouvillian) + : m_handle(handle), m_liouvillian(liouvillian){}; + +state cudmStepper::compute( + const state &inputState, double t, double step_size, + const std::unordered_map<std::string, std::complex<double>> &parameters) { + if (step_size == 0.0) { + throw std::runtime_error("Step size cannot be zero."); + } + + auto *simState = + cudaq::state_helper::getSimulationState(const_cast<state *>(&inputState)); + auto *castSimState = dynamic_cast<CuDensityMatState *>(simState); + if (!castSimState) + throw std::runtime_error("Invalid state."); + CuDensityMatState &state = *castSimState; + // Prepare workspace + cudensitymatWorkspaceDescriptor_t workspace; + HANDLE_CUDM_ERROR(cudensitymatCreateWorkspace(m_handle, &workspace)); + + // Create a new state for the next step + auto next_state = CuDensityMatState::zero_like(state); + + if (!next_state.is_initialized()) { + throw std::runtime_error("Next state failed to initialize."); + } + + if (state.get_hilbert_space_dims() != next_state.get_hilbert_space_dims()) { + throw std::runtime_error("As the dimensions of the old and the new " + "state do not match, the " + "operator cannot act on the states."); + } + + // Prepare the operator for action + HANDLE_CUDM_ERROR(cudensitymatOperatorPrepareAction( + m_handle, m_liouvillian, state.get_impl(), next_state.get_impl(), + CUDENSITYMAT_COMPUTE_64F, + dynamics::Context::getRecommendedWorkSpaceLimit(), workspace, 0x0)); + + // Query required workspace buffer size + std::size_t requiredBufferSize = 0; + HANDLE_CUDM_ERROR(cudensitymatWorkspaceGetMemorySize( + m_handle, workspace, CUDENSITYMAT_MEMSPACE_DEVICE, + CUDENSITYMAT_WORKSPACE_SCRATCH, &requiredBufferSize)); + + void *workspaceBuffer = nullptr; + if (requiredBufferSize > 0) { + workspaceBuffer = dynamics::Context::getCurrentContext()->getScratchSpace( + requiredBufferSize); + + // Attach workspace buffer + HANDLE_CUDM_ERROR(cudensitymatWorkspaceSetMemory( + m_handle, workspace, CUDENSITYMAT_MEMSPACE_DEVICE, + CUDENSITYMAT_WORKSPACE_SCRATCH, workspaceBuffer, requiredBufferSize)); + } + + // Apply the operator action + std::map<std::string, std::complex<double>> sortedParameters( + parameters.begin(), parameters.end()); + std::vector<double> paramValues; + for (const auto &[k, v] : sortedParameters) { + paramValues.emplace_back(v.real()); + paramValues.emplace_back(v.imag()); + } + HANDLE_CUDA_ERROR(cudaDeviceSynchronize()); + HANDLE_CUDM_ERROR(cudensitymatOperatorComputeAction( + m_handle, m_liouvillian, t, paramValues.size(), paramValues.data(), + state.get_impl(), next_state.get_impl(), workspace, 0x0)); + HANDLE_CUDA_ERROR(cudaDeviceSynchronize()); + + // Cleanup + HANDLE_CUDM_ERROR(cudensitymatDestroyWorkspace(workspace)); + + return cudaq::state( + std::make_unique<CuDensityMatState>(std::move(next_state)).release()); +} + +} // namespace cudaq \ No newline at end of file diff --git a/runtime/nvqir/cudensitymat/cudm_time_stepper.h b/runtime/nvqir/cudensitymat/cudm_time_stepper.h new file mode 100644 index 0000000000..7976579d0a --- /dev/null +++
b/runtime/nvqir/cudensitymat/cudm_time_stepper.h @@ -0,0 +1,29 @@ +/****************************************************************-*- C++ -*-**** + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#pragma once + +#include "CuDensityMatState.h" +#include "cudaq/base_time_stepper.h" +#include + +namespace cudaq { +class cudmStepper : public TimeStepper { +public: + explicit cudmStepper(cudensitymatHandle_t handle, + cudensitymatOperator_t liouvillian); + + state compute(const state &inputState, double t, double step_size, + const std::unordered_map> + ¶meters) override; + +private: + cudensitymatHandle_t m_handle; + cudensitymatOperator_t m_liouvillian; +}; +} // namespace cudaq \ No newline at end of file diff --git a/runtime/nvqir/cudensitymat/runge_kutta_integrator.cpp b/runtime/nvqir/cudensitymat/runge_kutta_integrator.cpp new file mode 100644 index 0000000000..946d28b96a --- /dev/null +++ b/runtime/nvqir/cudensitymat/runge_kutta_integrator.cpp @@ -0,0 +1,147 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "CuDensityMatContext.h" +#include "CuDensityMatState.h" +#include "cudaq/dynamics_integrators.h" +#include "cudm_error_handling.h" +#include "cudm_time_stepper.h" + +namespace cudaq { + +void runge_kutta::set_system(const SystemDynamics &system, + const cudaq::Schedule &schedule) { + m_system = system; + m_schedule = schedule; +} + +void runge_kutta::set_state(cudaq::state initial_state, double t0) { + m_state = std::make_shared(initial_state); + m_t = t0; +} + +std::pair runge_kutta::get_state() { + auto *simState = cudaq::state_helper::getSimulationState(m_state.get()); + auto *castSimState = dynamic_cast(simState); + if (!castSimState) + throw std::runtime_error("Invalid state."); + + auto cudmState = + new CuDensityMatState(castSimState->get_handle(), *castSimState, + castSimState->get_hilbert_space_dims()); + + return std::make_pair(m_t, cudaq::state(cudmState)); +} + +void runge_kutta::integrate(double target_time) { + const auto asCudmState = [](cudaq::state &cudaqState) -> CuDensityMatState * { + auto *simState = cudaq::state_helper::getSimulationState(&cudaqState); + auto *castSimState = dynamic_cast(simState); + if (!castSimState) + throw std::runtime_error("Invalid state."); + return castSimState; + }; + auto &castSimState = *asCudmState(*m_state); + std::unordered_map> params; + if (!m_stepper) { + for (const auto ¶m : m_schedule.parameters()) { + params[param] = m_schedule.value_function()(param, 0.0); + } + + auto liouvillian = + cudaq::dynamics::Context::getCurrentContext() + ->getOpConverter() + .constructLiouvillian(*m_system.hamiltonian, m_system.collapseOps, + m_system.modeExtents, params, + castSimState.is_density_matrix()); + m_stepper = + std::make_unique(castSimState.get_handle(), liouvillian); + } + const auto substeps = order.value_or(4); + while (m_t < target_time) { + double step_size = + 
std::min(dt.value_or(target_time - m_t), target_time - m_t); + + // std::cout << "Runge-Kutta step at time " << m_t + // << " with step size: " << step_size << std::endl; + + if (substeps == 1) { + // Euler method (1st order) + for (const auto ¶m : m_schedule.parameters()) { + params[param] = m_schedule.value_function()(param, m_t); + } + auto k1State = m_stepper->compute(*m_state, m_t, step_size, params); + auto &k1 = *asCudmState(k1State); + // k1.dump(std::cout); + k1 *= step_size; + castSimState += k1; + } else if (substeps == 2) { + // Midpoint method (2nd order) + for (const auto ¶m : m_schedule.parameters()) { + params[param] = m_schedule.value_function()(param, m_t); + } + auto k1State = m_stepper->compute(*m_state, m_t, step_size, params); + auto &k1 = *asCudmState(k1State); + k1 *= (step_size / 2.0); + + castSimState += k1; + for (const auto ¶m : m_schedule.parameters()) { + params[param] = + m_schedule.value_function()(param, m_t + step_size / 2.0); + } + auto k2State = m_stepper->compute(*m_state, m_t + step_size / 2.0, + step_size, params); + auto &k2 = *asCudmState(k2State); + k2 *= (step_size / 2.0); + + castSimState += k2; + } else if (substeps == 4) { + // Runge-Kutta method (4th order) + for (const auto ¶m : m_schedule.parameters()) { + params[param] = m_schedule.value_function()(param, m_t); + } + auto k1State = m_stepper->compute(*m_state, m_t, step_size, params); + auto &k1 = *asCudmState(k1State); + CuDensityMatState rho_temp = CuDensityMatState::clone(castSimState); + rho_temp += (k1 * (step_size / 2)); + + for (const auto ¶m : m_schedule.parameters()) { + params[param] = + m_schedule.value_function()(param, m_t + step_size / 2.0); + } + auto k2State = m_stepper->compute( + cudaq::state(new CuDensityMatState(std::move(rho_temp))), + m_t + step_size / 2.0, step_size, params); + auto &k2 = *asCudmState(k2State); + CuDensityMatState rho_temp_2 = CuDensityMatState::clone(castSimState); + rho_temp_2 += (k2 * (step_size / 2)); + + auto k3State = m_stepper->compute( + cudaq::state(new CuDensityMatState(std::move(rho_temp_2))), + m_t + step_size / 2.0, step_size, params); + auto &k3 = *asCudmState(k3State); + CuDensityMatState rho_temp_3 = CuDensityMatState::clone(castSimState); + rho_temp_3 += (k3 * step_size); + + for (const auto ¶m : m_schedule.parameters()) { + params[param] = m_schedule.value_function()(param, m_t + step_size); + } + auto k4State = m_stepper->compute( + cudaq::state(new CuDensityMatState(std::move(rho_temp_3))), + m_t + step_size, step_size, params); + auto &k4 = *asCudmState(k4State); + castSimState += (k1 + k2 * 2.0 + k3 * 2.0 + k4) * (step_size / 6.0); + } else { + throw std::runtime_error("Invalid integrator order"); + } + + // Update time + m_t += step_size; + } +} +} // namespace cudaq diff --git a/scripts/build_cudaq.sh b/scripts/build_cudaq.sh index 1384620bcc..eb559afaf3 100644 --- a/scripts/build_cudaq.sh +++ b/scripts/build_cudaq.sh @@ -72,8 +72,10 @@ this_file_dir=`dirname "$(readlink -f "${BASH_SOURCE[0]}")"` repo_root=$(cd "$this_file_dir" && git rev-parse --show-toplevel) # Prepare the build directory +build_dir="$working_dir/build/"$(echo "$build_configuration" | tr '[:upper:]' '[:lower:]') +echo "Build directory: $build_dir" mkdir -p "$CUDAQ_INSTALL_PREFIX/bin" -mkdir -p "$working_dir/build" && cd "$working_dir/build" && rm -rf * +mkdir -p "$build_dir" && cd "$build_dir" # && rm -rf * mkdir -p logs && rm -rf logs/* if [ -n "$install_toolchain" ]; then diff --git a/scripts/validate_container.sh b/scripts/validate_container.sh index 
58efa1e1fa..ced784dc3d 100644 --- a/scripts/validate_container.sh +++ b/scripts/validate_container.sh @@ -355,6 +355,30 @@ else echo ":white_flag: Notebooks validation skipped." >> "${tmpFile}" fi +# Python snippet validation +if [ -d "snippets/" ]; +then + # Skip NVQC and multi-GPU snippets. + for ex in `find snippets/ -name '*.py' -not -path '*/nvqc/*' -not -path '*/multi_gpu_workflows/*' | sort`; + do + filename=$(basename -- "$ex") + filename="${filename%.*}" + echo "Testing $filename:" + echo "Source: $ex" + let "samples+=1" + python3 $ex 1> /dev/null + status=$? + echo "Exited with code $status" + if [ "$status" -eq "0" ]; then + let "passed+=1" + echo ":white_check_mark: Successfully ran $filename." >> "${tmpFile}" + else + let "failed+=1" + echo ":x: Failed to run $filename." >> "${tmpFile}" + fi + done +fi + if [ -f "$GITHUB_STEP_SUMMARY" ]; then for t in $requested_backends diff --git a/targettests/TargetConfig/RegressionValidation/anyon.config b/targettests/TargetConfig/RegressionValidation/anyon.config index 954bf0293b..506d7f7061 100644 --- a/targettests/TargetConfig/RegressionValidation/anyon.config +++ b/targettests/TargetConfig/RegressionValidation/anyon.config @@ -20,7 +20,7 @@ # Define the lowering pipeline. telegraph-8q has an 8-qubit ring topology, so mapping # uses ring(8). # Berkeley-25q uses a bidirectional connectivity lattice with connectivity 8 per qubit in the bulk. -# CHECK-DAG: PLATFORM_LOWERING_CONFIG="func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),anyon-%Q_GATE%-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce" +# CHECK-DAG: PLATFORM_LOWERING_CONFIG="classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,classical-optimization-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),anyon-%Q_GATE%-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce" # Tell the rest-qpu that we are generating QIR.
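Editorial aside (not part of the patch): the runge_kutta::integrate routine added earlier in this diff dispatches on its order parameter to the standard fixed-step schemes: one substep is forward Euler, two is the midpoint rule, and four is the classical fourth-order Runge-Kutta update. As a self-contained illustration of the RK4 update it performs per step, here is a minimal sketch on a scalar ODE dy/dt = f(t, y); this is plain C++ with no cudaq or GPU types, and the function and variable names are illustrative only:

#include <cmath>
#include <cstdio>

// Classical fixed-step RK4 on dy/dt = f(t, y). Mirrors the k1..k4 staging in
// runge_kutta::integrate: each k_i is the right-hand side evaluated on a
// temporary state, and the final update is y += (k1 + 2*k2 + 2*k3 + k4) * h/6.
template <typename F>
double rk4_step(F f, double t, double y, double h) {
  const double k1 = f(t, y);
  const double k2 = f(t + h / 2.0, y + h * k1 / 2.0);
  const double k3 = f(t + h / 2.0, y + h * k2 / 2.0);
  const double k4 = f(t + h, y + h * k3);
  return y + (k1 + 2.0 * k2 + 2.0 * k3 + k4) * (h / 6.0);
}

int main() {
  // dy/dt = -y with y(0) = 1; the exact solution is exp(-t).
  auto f = [](double /*t*/, double y) { return -y; };
  double t = 0.0, y = 1.0;
  const double h = 0.1;
  while (t < 1.0) {
    y = rk4_step(f, t, y, h);
    t += h;
  }
  std::printf("y(1) ~= %.6f (exact %.6f)\n", y, std::exp(-1.0));
  return 0;
}

In the GPU implementation above, the same staging appears with CuDensityMatState clones standing in for the temporary y values and the cudmStepper supplying the right-hand side.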
diff --git a/targettests/TargetConfig/RegressionValidation/ionq.config b/targettests/TargetConfig/RegressionValidation/ionq.config index 337987c3a7..338686eafb 100644 --- a/targettests/TargetConfig/RegressionValidation/ionq.config +++ b/targettests/TargetConfig/RegressionValidation/ionq.config @@ -18,7 +18,7 @@ # CHECK-DAG: LINKLIBS="${LINKLIBS} -lcudaq-rest-qpu" # Define the lowering pipeline -# CHECK-DAG: PLATFORM_LOWERING_CONFIG="func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),ionq-gate-set-mapping" +# CHECK-DAG: PLATFORM_LOWERING_CONFIG="classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,classical-optimization-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),ionq-gate-set-mapping" # Tell the rest-qpu that we are generating QIR. # CHECK-DAG: CODEGEN_EMISSION=qir-base diff --git a/targettests/TargetConfig/RegressionValidation/iqm.config b/targettests/TargetConfig/RegressionValidation/iqm.config index f1ced5aa75..5bb80da7fb 100644 --- a/targettests/TargetConfig/RegressionValidation/iqm.config +++ b/targettests/TargetConfig/RegressionValidation/iqm.config @@ -20,7 +20,7 @@ # Define the lowering pipeline, here we lower to Base QIR # Note: the runtime will dynamically substitute %QPU_ARCH% based on # qpu-architecture -# CHECK-DAG: PLATFORM_LOWERING_CONFIG="func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(delay-measurements,regtomem),symbol-dce,iqm-gate-set-mapping" +# CHECK-DAG: PLATFORM_LOWERING_CONFIG="classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,classical-optimization-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),iqm-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(delay-measurements,regtomem),symbol-dce,iqm-gate-set-mapping" # Tell the rest-qpu that we are generating IQM JSON. # CHECK-DAG: CODEGEN_EMISSION=iqm diff --git a/targettests/TargetConfig/RegressionValidation/oqc.config b/targettests/TargetConfig/RegressionValidation/oqc.config index 4bc0123ef7..072d58d59f 100644 --- a/targettests/TargetConfig/RegressionValidation/oqc.config +++ b/targettests/TargetConfig/RegressionValidation/oqc.config @@ -20,7 +20,7 @@ # Define the lowering pipeline. Lucy has an 8-qubit ring topology, so mapping # uses ring(8). 
# Toshiko uses a Kagome lattice with 2-3 connectivity per qubit -# CHECK-DAG: PLATFORM_LOWERING_CONFIG="func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce" +# CHECK-DAG: PLATFORM_LOWERING_CONFIG="classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,classical-optimization-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),oqc-gate-set-mapping,func.func(add-dealloc,combine-quantum-alloc,canonicalize,factor-quantum-alloc,memtoreg),add-wireset,func.func(assign-wire-indices),qubit-mapping{device=file(%QPU_ARCH%)},func.func(regtomem),symbol-dce" # Tell the rest-qpu that we are generating QIR. diff --git a/targettests/TargetConfig/RegressionValidation/quantinuum.config b/targettests/TargetConfig/RegressionValidation/quantinuum.config index 83d66f5e4e..86bc7860de 100644 --- a/targettests/TargetConfig/RegressionValidation/quantinuum.config +++ b/targettests/TargetConfig/RegressionValidation/quantinuum.config @@ -18,7 +18,7 @@ # CHECK-DAG: LINKLIBS="${LINKLIBS} -lcudaq-rest-qpu" # Define the lowering pipeline, here we lower to Adaptive QIR -# CHECK-DAG: PLATFORM_LOWERING_CONFIG="func.func(const-prop-complex,canonicalize,cse,lift-array-alloc),globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,unrolling-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),quantinuum-gate-set-mapping" +# CHECK-DAG: PLATFORM_LOWERING_CONFIG="classical-optimization-pipeline,globalize-array-values,func.func(state-prep),unitary-synthesis,canonicalize,apply-op-specialization,aggressive-early-inlining,expand-measurements,classical-optimization-pipeline,decomposition{enable-patterns=U3ToRotations},func.func(lower-to-cfg,canonicalize,multicontrol-decomposition),quantinuum-gate-set-mapping" # Tell the rest-qpu that we are generating QIR. # CHECK-DAG: CODEGEN_EMISSION=qir-adaptive diff --git a/targettests/execution/uccsd.cpp b/targettests/execution/uccsd.cpp new file mode 100644 index 0000000000..e692d1c1b7 --- /dev/null +++ b/targettests/execution/uccsd.cpp @@ -0,0 +1,535 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
* + ******************************************************************************/ + +// clang-format off +// RUN: nvq++ %cpp_std --target anyon --emulate %s -o %t && %t | FileCheck %s +// RUN: if %braket_avail; then nvq++ %cpp_std --target braket --emulate %s -o %t && %t | FileCheck %s ; fi +// RUN: nvq++ %cpp_std --target ionq --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target iqm --iqm-machine Apollo --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target oqc --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --target quantinuum --emulate %s -o %t && %t | FileCheck %s +// RUN: nvq++ %cpp_std --enable-mlir %s -o %t +// clang-format on + +#include +#include + +namespace test_uccsd { + +__qpu__ void single_excitation(cudaq::qview<> qubits, double theta, + std::size_t p_occ, std::size_t q_virt) { + // Y_p X_q + rx(M_PI_2, qubits[p_occ]); + h(qubits[q_virt]); + + for (std::size_t i = p_occ; i < q_virt; i++) + cx(qubits[i], qubits[i + 1]); + + rz(0.5 * theta, qubits[q_virt]); + + for (std::size_t i = q_virt; i > p_occ; i--) + cx(qubits[i - 1], qubits[i]); + + h(qubits[q_virt]); + rx(-M_PI_2, qubits[p_occ]); + + // -X_p Y_q + h(qubits[p_occ]); + rx(M_PI_2, qubits[q_virt]); + + for (std::size_t i = p_occ; i < q_virt; i++) + cx(qubits[i], qubits[i + 1]); + + rz(-0.5 * theta, qubits[q_virt]); + + for (std::size_t i = q_virt; i > p_occ; i--) + cx(qubits[i - 1], qubits[i]); + + rx(-M_PI_2, qubits[q_virt]); + h(qubits[p_occ]); +} + +__qpu__ void double_excitation(cudaq::qview<> qubits, double theta, + std::size_t pOcc, std::size_t qOcc, + std::size_t rVirt, std::size_t sVirt) { + std::size_t iOcc = 0, jOcc = 0, aVirt = 0, bVirt = 0; + if ((pOcc < qOcc) && (rVirt < sVirt)) { + iOcc = pOcc; + jOcc = qOcc; + aVirt = rVirt; + bVirt = sVirt; + } else if ((pOcc > qOcc) && (rVirt > sVirt)) { + iOcc = qOcc; + jOcc = pOcc; + aVirt = sVirt; + bVirt = rVirt; + } else if ((pOcc < qOcc) && (rVirt > sVirt)) { + iOcc = pOcc; + jOcc = qOcc; + aVirt = sVirt; + bVirt = rVirt; + theta *= -1.; + } else if ((pOcc > qOcc) && (rVirt < sVirt)) { + iOcc = qOcc; + jOcc = pOcc; + aVirt = rVirt; + bVirt = sVirt; + theta *= -1.; + } + + h(qubits[iOcc]); + h(qubits[jOcc]); + h(qubits[aVirt]); + rx(M_PI_2, qubits[bVirt]); + + for (std::size_t i = iOcc; i < jOcc; i++) + cx(qubits[i], qubits[i + 1]); + + cx(qubits[jOcc], qubits[aVirt]); + + for (std::size_t i = aVirt; i < bVirt; i++) + cx(qubits[i], qubits[i + 1]); + + rz(0.125 * theta, qubits[bVirt]); + + for (std::size_t i = bVirt; i > aVirt; i--) + cx(qubits[i - 1], qubits[i]); + + cx(qubits[jOcc], qubits[aVirt]); + + rx(-M_PI_2, qubits[bVirt]); + h(qubits[aVirt]); + + rx(M_PI_2, qubits[aVirt]); + h(qubits[bVirt]); + + cx(qubits[jOcc], qubits[aVirt]); + for (std::size_t i = aVirt; i < bVirt; i++) + cx(qubits[i], qubits[i + 1]); + + rz(0.125 * theta, qubits[bVirt]); + + for (std::size_t i = bVirt; i > aVirt; i--) + cx(qubits[i - 1], qubits[i]); + cx(qubits[jOcc], qubits[aVirt]); + + for (std::size_t i = jOcc; i > iOcc; i--) + cx(qubits[i - 1], qubits[i]); + + rx(-M_PI_2, qubits[aVirt]); + h(qubits[jOcc]); + + rx(M_PI_2, qubits[jOcc]); + h(qubits[aVirt]); + + for (std::size_t i = iOcc; i < jOcc; i++) + cx(qubits[i], qubits[i + 1]); + cx(qubits[jOcc], qubits[aVirt]); + + for (std::size_t i = aVirt; i < bVirt; i++) + cx(qubits[i], qubits[i + 1]); + + rz(-0.125 * theta, qubits[bVirt]); + + for (std::size_t i = bVirt; i > aVirt; i--) + cx(qubits[i - 1], qubits[i]); + cx(qubits[jOcc], qubits[aVirt]); + + h(qubits[bVirt]); + 
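+  // Added commentary: each block in this kernel repeats one motif: h / rx(+-pi/2) basis changes map one Pauli string of the double-excitation generator onto Z, a CX ladder accumulates the string's parity onto qubits[bVirt], rz(+-0.125 * theta) imprints the phase, and the ladder and basis changes are then undone. +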
+  h(qubits[aVirt]);
+
+  rx(M_PI_2, qubits[aVirt]);
+  rx(M_PI_2, qubits[bVirt]);
+
+  cx(qubits[jOcc], qubits[aVirt]);
+  for (std::size_t i = aVirt; i < bVirt; i++)
+    cx(qubits[i], qubits[i + 1]);
+
+  rz(0.125 * theta, qubits[bVirt]);
+
+  for (std::size_t i = bVirt; i > aVirt; i--)
+    cx(qubits[i - 1], qubits[i]);
+
+  cx(qubits[jOcc], qubits[aVirt]);
+
+  for (std::size_t i = jOcc; i > iOcc; i--)
+    cx(qubits[i - 1], qubits[i]);
+
+  rx(-M_PI_2, qubits[jOcc]);
+  h(qubits[iOcc]);
+
+  rx(M_PI_2, qubits[iOcc]);
+  h(qubits[jOcc]);
+
+  for (std::size_t i = iOcc; i < jOcc; i++)
+    cx(qubits[i], qubits[i + 1]);
+  cx(qubits[jOcc], qubits[aVirt]);
+
+  for (std::size_t i = aVirt; i < bVirt; i++)
+    cx(qubits[i], qubits[i + 1]);
+
+  rz(0.125 * theta, qubits[bVirt]);
+
+  for (std::size_t i = bVirt; i > aVirt; i--)
+    cx(qubits[i - 1], qubits[i]);
+  cx(qubits[jOcc], qubits[aVirt]);
+
+  rx(-M_PI_2, qubits[bVirt]);
+  rx(-M_PI_2, qubits[aVirt]);
+
+  h(qubits[aVirt]);
+  h(qubits[bVirt]);
+
+  cx(qubits[jOcc], qubits[aVirt]);
+  for (std::size_t i = aVirt; i < bVirt; i++)
+    cx(qubits[i], qubits[i + 1]);
+
+  rz(-0.125 * theta, qubits[bVirt]);
+
+  for (std::size_t i = bVirt; i > aVirt; i--)
+    cx(qubits[i - 1], qubits[i]);
+  cx(qubits[jOcc], qubits[aVirt]);
+
+  for (std::size_t i = jOcc; i > iOcc; i--)
+    cx(qubits[i - 1], qubits[i]);
+
+  h(qubits[bVirt]);
+  h(qubits[jOcc]);
+
+  rx(M_PI_2, qubits[jOcc]);
+  rx(M_PI_2, qubits[bVirt]);
+
+  for (std::size_t i = iOcc; i < jOcc; i++)
+    cx(qubits[i], qubits[i + 1]);
+
+  cx(qubits[jOcc], qubits[aVirt]);
+
+  for (std::size_t i = aVirt; i < bVirt; i++)
+    cx(qubits[i], qubits[i + 1]);
+
+  rz(-0.125 * theta, qubits[bVirt]);
+
+  for (std::size_t i = bVirt; i > aVirt; i--)
+    cx(qubits[i - 1], qubits[i]);
+  cx(qubits[jOcc], qubits[aVirt]);
+
+  rx(-M_PI_2, qubits[bVirt]);
+  h(qubits[aVirt]);
+
+  rx(M_PI_2, qubits[aVirt]);
+  h(qubits[bVirt]);
+
+  cx(qubits[jOcc], qubits[aVirt]);
+  for (std::size_t i = aVirt; i < bVirt; i++)
+    cx(qubits[i], qubits[i + 1]);
+
+  rz(-0.125 * theta, qubits[bVirt]);
+
+  for (std::size_t i = bVirt; i > aVirt; i--)
+    cx(qubits[i - 1], qubits[i]);
+  cx(qubits[jOcc], qubits[aVirt]);
+
+  for (std::size_t i = jOcc; i > iOcc; i--)
+    cx(qubits[i - 1], qubits[i]);
+
+  h(qubits[bVirt]);
+  rx(-M_PI_2, qubits[aVirt]);
+  rx(-M_PI_2, qubits[jOcc]);
+  rx(-M_PI_2, qubits[iOcc]);
+}
+
+// Truncates toward zero; serves as a floor for the non-negative values used
+// below.
+__qpu__ float positive_floor(float x) {
+  int integer_part = (int)x;
+  return (float)integer_part;
+}
+
+__qpu__ std::size_t getNumOccupiedAlpha(std::size_t numElectrons,
+                                        std::size_t spin,
+                                        std::size_t numQubits) {
+  auto numSpatialOrbs = numQubits / 2;
+  if (spin > 0) {
+    auto n_occupied_beta = static_cast<std::size_t>(
+        positive_floor((float)(numElectrons - spin) / 2));
+    auto n_occupied_alpha = numElectrons - n_occupied_beta;
+    return n_occupied_alpha;
+  }
+
+  auto n_occupied_alpha = static_cast<std::size_t>(
+      positive_floor((float)numElectrons / 2));
+  return n_occupied_alpha;
+}
+
+__qpu__ std::size_t getNumOccupiedBeta(std::size_t numElectrons,
+                                       std::size_t spin,
+                                       std::size_t numQubits) {
+
+  auto numSpatialOrbs = numQubits / 2;
+  if (spin > 0) {
+    auto n_occupied_beta = static_cast<std::size_t>(
+        positive_floor((float)(numElectrons - spin) / 2));
+    return n_occupied_beta;
+  }
+
+  auto n_occupied_alpha = static_cast<std::size_t>(
+      positive_floor((float)numElectrons / 2));
+  return n_occupied_alpha;
+}
+
+__qpu__ std::size_t getNumVirtualAlpha(std::size_t numElectrons,
+                                       std::size_t spin,
+                                       std::size_t numQubits) {
+
+  auto numSpatialOrbs = numQubits / 2;
+  if (spin > 0) {
+    auto n_occupied_beta = static_cast<std::size_t>(
+        positive_floor((float)(numElectrons - spin) / 2));
+    auto n_occupied_alpha = numElectrons - n_occupied_beta;
+    auto n_virtual_alpha = numSpatialOrbs - n_occupied_alpha;
+    return n_virtual_alpha;
+  }
+  auto n_occupied_alpha = static_cast<std::size_t>(
+      positive_floor((float)numElectrons / 2));
+  auto n_virtual_alpha = numSpatialOrbs - n_occupied_alpha;
+  return n_virtual_alpha;
+}
+
+__qpu__ std::size_t getNumVirtualBeta(std::size_t numElectrons,
+                                      std::size_t spin, std::size_t numQubits) {
+
+  auto numSpatialOrbs = numQubits / 2;
+  if (spin > 0) {
+    auto n_occupied_beta = static_cast<std::size_t>(
+        positive_floor((float)(numElectrons - spin) / 2));
+    auto n_virtual_beta = numSpatialOrbs - n_occupied_beta;
+    return n_virtual_beta;
+  }
+
+  auto n_occupied_alpha =
+      static_cast<std::size_t>(positive_floor((float)numElectrons / 2));
+  auto n_virtual_beta = numSpatialOrbs - n_occupied_alpha;
+  return n_virtual_beta;
+}
+
+__qpu__ void uccsd2(cudaq::qview<> qubits, const std::vector<double> &thetas,
+                    std::size_t numElectrons, std::size_t spin) {
+
+  int numOccAlpha =
+      getNumOccupiedAlpha(numElectrons, spin, qubits.size());
+  int numOccBeta = getNumOccupiedBeta(numElectrons, spin, qubits.size());
+  int numVirtAlpha =
+      getNumVirtualAlpha(numElectrons, spin, qubits.size());
+  int numVirtBeta = getNumVirtualBeta(numElectrons, spin, qubits.size());
+  std::vector<std::size_t> occupiedAlpha(numOccAlpha),
+      virtualAlpha(numVirtAlpha), occupiedBeta(numOccBeta),
+      virtualBeta(numVirtBeta);
+  if (spin > 0) {
+
+    int counter = 0;
+    for (std::size_t i = 0; i < numOccAlpha; i++) {
+      occupiedAlpha[counter] = i * 2;
+      counter++;
+    }
+    counter = 0;
+
+    for (std::size_t i = 0; i < numVirtAlpha; i++) {
+      virtualAlpha[counter] = i * 2 + numElectrons + 1;
+      counter++;
+    }
+
+    counter = 0;
+    for (std::size_t i = 0; i < numOccBeta; i++) {
+      occupiedBeta[counter] = i * 2 + 1;
+      counter++;
+    }
+    counter = 0;
+
+    for (std::size_t i = 0; i < numVirtBeta; i++) {
+      virtualBeta[counter] = i * 2 + numElectrons - 1;
+      counter++;
+    }
+
+  } else {
+    auto numOccupied = numOccAlpha;
+    auto numVirtual = numVirtAlpha;
+
+    int counter = 0;
+    for (std::size_t i = 0; i < numOccupied; i++) {
+      occupiedAlpha[counter] = i * 2;
+      counter++;
+    }
+    counter = 0;
+    for (std::size_t i = 0; i < numVirtual; i++) {
+      virtualAlpha[counter] = i * 2 + numElectrons;
+      counter++;
+    }
+    counter = 0;
+
+    for (std::size_t i = 0; i < numOccupied; i++) {
+      occupiedBeta[counter] = i * 2 + 1;
+      counter++;
+    }
+    counter = 0;
+    for (std::size_t i = 0; i < numVirtual; i++) {
+      virtualBeta[counter] = i * 2 + numElectrons + 1;
+      counter++;
+    }
+  }
+
+  std::size_t counter = 0;
+  std::vector<std::size_t> singlesAlpha(2 * occupiedAlpha.size() *
+                                        virtualAlpha.size());
+  for (auto p : occupiedAlpha)
+    for (auto q : virtualAlpha) {
+      singlesAlpha[counter] = p;
+      counter++;
+      singlesAlpha[counter] = q;
+      counter++;
+    }
+
+  counter = 0;
+  std::vector<std::size_t> singlesBeta(2 * occupiedBeta.size() *
+                                       virtualBeta.size());
+  for (auto p : occupiedBeta)
+    for (auto q : virtualBeta) {
+      singlesBeta[counter] = p;
+      counter++;
+      singlesBeta[counter] = q;
+      counter++;
+    }
+
+  counter = 0;
+  std::vector<std::size_t> doublesMixed(
+      4 * occupiedAlpha.size() * virtualAlpha.size() * occupiedBeta.size() *
+      virtualBeta.size());
+  for (auto p : occupiedAlpha)
+    for (auto q : occupiedBeta)
+      for (auto r : virtualBeta)
+        for (auto s : virtualAlpha) {
+          doublesMixed[counter] = p;
+          counter++;
+          doublesMixed[counter] = q;
+          counter++;
+          doublesMixed[counter] = r;
+          counter++;
+          doublesMixed[counter] = s;
+          counter++;
+        }
+
+  counter = 0;
+  for (int p = 0; p < numOccAlpha - 1; p++)
+    for (int q = p + 1; q < numOccAlpha; q++)
+      for (int r = 0; r < numVirtAlpha - 1; r++)
+        for (int s = r + 1; s < numVirtAlpha; s++)
+          counter++;
+
+  std::vector<std::size_t> doublesAlpha(4 * counter);
+  counter = 0;
+  for (int p = 0; p < numOccAlpha - 1; p++)
+    for (int q = p + 1; q < numOccAlpha; q++)
+      for (int r = 0; r < numVirtAlpha - 1; r++)
+        for (int s = r + 1; s < numVirtAlpha; s++) {
+          doublesAlpha[counter] = occupiedAlpha[p];
+          counter++;
+          doublesAlpha[counter] = occupiedAlpha[q];
+          counter++;
+          doublesAlpha[counter] = virtualAlpha[r];
+          counter++;
+          doublesAlpha[counter] = virtualAlpha[s];
+          counter++;
+        }
+
+  counter = 0;
+  for (int p = 0; p < numOccBeta - 1; p++)
+    for (int q = p + 1; q < numOccBeta; q++)
+      for (int r = 0; r < numVirtBeta - 1; r++)
+        for (int s = r + 1; s < numVirtBeta; s++)
+          counter++;
+  std::vector<std::size_t> doublesBeta(4 * counter);
+  counter = 0;
+  for (int p = 0; p < numOccBeta - 1; p++)
+    for (int q = p + 1; q < numOccBeta; q++)
+      for (int r = 0; r < numVirtBeta - 1; r++)
+        for (int s = r + 1; s < numVirtBeta; s++) {
+          doublesBeta[counter] = occupiedBeta[p];
+          counter++;
+          doublesBeta[counter] = occupiedBeta[q];
+          counter++;
+          doublesBeta[counter] = virtualBeta[r];
+          counter++;
+          doublesBeta[counter] = virtualBeta[s];
+          counter++;
+        }
+
+  std::size_t thetaCounter = 0;
+  for (std::size_t i = 0; i < singlesAlpha.size(); i += 2)
+    single_excitation(qubits, thetas[thetaCounter++], singlesAlpha[i],
+                      singlesAlpha[i + 1]);
+
+  for (std::size_t i = 0; i < singlesBeta.size(); i += 2)
+    single_excitation(qubits, thetas[thetaCounter++], singlesBeta[i],
+                      singlesBeta[i + 1]);
+
+  for (std::size_t i = 0; i < doublesMixed.size(); i += 4)
+    double_excitation(qubits, thetas[thetaCounter++], doublesMixed[i],
+                      doublesMixed[i + 1], doublesMixed[i + 2],
+                      doublesMixed[i + 3]);
+
+  for (std::size_t i = 0; i < doublesAlpha.size(); i += 4)
+    double_excitation(qubits, thetas[thetaCounter++], doublesAlpha[i],
+                      doublesAlpha[i + 1], doublesAlpha[i + 2],
+                      doublesAlpha[i + 3]);
+
+  for (std::size_t i = 0; i < doublesBeta.size(); i += 4)
+    double_excitation(qubits, thetas[thetaCounter++], doublesBeta[i],
+                      doublesBeta[i + 1], doublesBeta[i + 2],
+                      doublesBeta[i + 3]);
+}
+
+} // namespace test_uccsd
+
+__qpu__ void test_trial_state(cudaq::qview<> qubits, std::size_t num_electrons,
+                              const std::vector<double> &thetas) {
+  for (std::size_t i = 0; i < num_electrons; i++)
+    x(qubits[i]);
+  test_uccsd::uccsd2(qubits, thetas, num_electrons, 0);
+}
+
+__qpu__ void test(std::size_t num_qubits, std::size_t num_electrons,
+                  const std::vector<double> &thetas) {
+  cudaq::qvector qubits(num_qubits);
+  test_trial_state(qubits, num_electrons, thetas);
+}
+
+void printCounts(cudaq::sample_result &result) {
+  std::vector<std::string> values{};
+  for (auto &&[bits, counts] : result) {
+    values.push_back(bits);
+  }
+
+  std::sort(values.begin(), values.end());
+  for (auto &&bits : values) {
+    std::cout << bits << '\n';
+  }
+}
+
+int main() {
+  std::size_t num_electrons = 2;
+  std::size_t num_qubits = 6;
+  std::vector<double> thetas = {
+      -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558,
+      -0.00037043841404585794, 0.0003811110195084151, 0.2286823796532558,
+      -0.00037043841404585794, 0.0003811110195084151};
+
+  auto counts = cudaq::sample(test, num_qubits, num_electrons, thetas);
+  printCounts(counts);
+  return 0;
+}
+
+// CHECK: 000110
+// CHECK: 100100
+// CHECK: 110000
diff --git a/test/AST-Quake/infinite_loop.cpp b/test/AST-Quake/infinite_loop.cpp
new file mode 100644
index 0000000000..10c803a78d
--- /dev/null
+++ b/test/AST-Quake/infinite_loop.cpp
@@ -0,0 +1,63 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.   *
+ ******************************************************************************/
+
+// RUN: cudaq-quake %cpp_std %s | cudaq-opt --memtoreg=quantum=0 --canonicalize --cc-loop-normalize |& FileCheck %s
+
+#include <cudaq.h>
+
+// Counted loop structure when condition is always true
+
+__qpu__ int t1() {
+  cudaq::qubit q;
+  for (std::uint32_t u = 1; u <= 0xffffffff; u++)
+    x(q);
+  return 0;
+}
+
+__qpu__ int t2() {
+  cudaq::qubit q;
+  for (std::int32_t u = 1; u <= 0x7fffffff; u++)
+    x(q);
+  return 0;
+}
+
+__qpu__ int t3() {
+  cudaq::qubit q;
+  for (std::uint64_t u = 5; u <= 0xffffffffffffffff; u++)
+    x(q);
+  return 0;
+}
+
+__qpu__ int t4() {
+  cudaq::qubit q;
+  for (std::int64_t u = 16; u <= 0x7fffffffffffffff; u++)
+    x(q);
+  return 0;
+}
+
+__qpu__ int t5() {
+  cudaq::qubit q;
+  for (std::uint64_t u = -14; u >= 0; u--)
+    x(q);
+  return 0;
+}
+
+__qpu__ int t6() {
+  cudaq::qubit q;
+  std::int64_t cmp = 0x8000000000000000;
+  for (std::int64_t u = 83; u >= cmp; u++)
+    x(q);
+  return 0;
+}
+
+// CHECK: Loop condition is always true. This loop is not supported in a kernel.
+// CHECK: Loop condition is always true. This loop is not supported in a kernel.
+// CHECK: Loop condition is always true. This loop is not supported in a kernel.
+// CHECK: Loop condition is always true. This loop is not supported in a kernel.
+// CHECK: Loop condition is always true. This loop is not supported in a kernel.
+// CHECK: Loop condition is always true. This loop is not supported in a kernel.
diff --git a/test/AST-Quake/loop_normal.cpp b/test/AST-Quake/loop_normal.cpp
index a4bf4bfad5..62ff0aa75b 100644
--- a/test/AST-Quake/loop_normal.cpp
+++ b/test/AST-Quake/loop_normal.cpp
@@ -402,3 +402,278 @@ __qpu__ void linear_expr6() {
 // CHECK:   %[[VAL_15:.*]] = arith.addi %[[VAL_14]], %[[VAL_1]] : i32
 // CHECK:   cc.continue %[[VAL_15]] : i32
 // CHECK: } {normalized}
+
+// In cases where the number of iterations is invalid, we should normalize to
+// a count of 0.
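+// (In `non_iterating_loop2` below, `i = 1; i < -1` can never hold, so the
+// trip count folds to the constant 0: the normalized loop counts from 0,
+// immediately fails the `arith.cmpi ne` against 0, and never runs its body.)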
+ +__qpu__ void non_iterating_loop2() { + cudaq::qvector q(100); + for (std::int64_t i = 1; i < -1; i++) + x(q[i]); +} + +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_non_iterating_loop2._Z19non_iterating_loop2v() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 0 : i64 +// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1 : i64 +// CHECK-DAG: %[[VAL_2:.*]] = quake.alloca !quake.veq<100> +// CHECK: %[[VAL_3:.*]] = cc.loop while ((%[[VAL_4:.*]] = %[[VAL_0]]) -> (i64)) { +// CHECK: %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_4]], %[[VAL_0]] : i64 +// CHECK: cc.condition %[[VAL_5]](%[[VAL_4]] : i64) +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_6:.*]]: i64): +// CHECK: %[[VAL_7:.*]] = arith.addi %[[VAL_6]], %[[VAL_1]] : i64 +// CHECK: %[[VAL_8:.*]] = quake.extract_ref %[[VAL_2]]{{\[}}%[[VAL_7]]] : (!quake.veq<100>, i64) -> !quake.ref +// CHECK: quake.x %[[VAL_8]] : (!quake.ref) -> () +// CHECK: cc.continue %[[VAL_6]] : i64 +// CHECK: } step { +// CHECK: ^bb0(%[[VAL_9:.*]]: i64): +// CHECK: %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_1]] : i64 +// CHECK: cc.continue %[[VAL_10]] : i64 +// CHECK: } {normalized} +// CHECK: return +// CHECK: } + +__qpu__ int f2a() { + cudaq::qubit q; + for (int u = 1; u < 0; u++) + x(q); + return 0; +} + +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_f2a._Z3f2av() -> i32 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 0 : i32 +// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1 : i32 +// CHECK-DAG: %[[VAL_2:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_3:.*]] = cc.loop while ((%[[VAL_4:.*]] = %[[VAL_0]]) -> (i32)) { +// CHECK: %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_4]], %[[VAL_0]] : i32 +// CHECK: cc.condition %[[VAL_5]](%[[VAL_4]] : i32) +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_6:.*]]: i32): +// CHECK: quake.x %[[VAL_2]] : (!quake.ref) -> () +// CHECK: cc.continue %[[VAL_6]] : i32 +// CHECK: } step { +// CHECK: ^bb0(%[[VAL_7:.*]]: i32): +// CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_1]] : i32 +// CHECK: cc.continue %[[VAL_8]] : i32 +// CHECK: } {normalized} +// CHECK: return %[[VAL_0]] : i32 +// CHECK: } + +__qpu__ int f2b() { + cudaq::qubit q; + for (int u = 10; u < 0; u++) + x(q); + return 0; +} + +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_f2b._Z3f2bv() -> i32 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 0 : i32 +// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1 : i32 +// CHECK-DAG: %[[VAL_2:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_3:.*]] = cc.loop while ((%[[VAL_4:.*]] = %[[VAL_0]]) -> (i32)) { +// CHECK: %[[VAL_5:.*]] = arith.cmpi ne, %[[VAL_4]], %[[VAL_0]] : i32 +// CHECK: cc.condition %[[VAL_5]](%[[VAL_4]] : i32) +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_6:.*]]: i32): +// CHECK: quake.x %[[VAL_2]] : (!quake.ref) -> () +// CHECK: cc.continue %[[VAL_6]] : i32 +// CHECK: } step { +// CHECK: ^bb0(%[[VAL_7:.*]]: i32): +// CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_1]] : i32 +// CHECK: cc.continue %[[VAL_8]] : i32 +// CHECK: } {normalized} +// CHECK: return %[[VAL_0]] : i32 +// CHECK: } + +__qpu__ int f4() { + cudaq::qubit q; + for (std::int64_t u = 6; u < 0; u++) + x(q); + return 0; +} + +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_f4._Z2f4v() -> i32 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 0 : i64 +// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1 : i64 +// CHECK-DAG: %[[VAL_2:.*]] = 
arith.constant 0 : i32 +// CHECK-DAG: %[[VAL_3:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_4:.*]] = cc.loop while ((%[[VAL_5:.*]] = %[[VAL_0]]) -> (i64)) { +// CHECK: %[[VAL_6:.*]] = arith.cmpi ne, %[[VAL_5]], %[[VAL_0]] : i64 +// CHECK: cc.condition %[[VAL_6]](%[[VAL_5]] : i64) +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_7:.*]]: i64): +// CHECK: quake.x %[[VAL_3]] : (!quake.ref) -> () +// CHECK: cc.continue %[[VAL_7]] : i64 +// CHECK: } step { +// CHECK: ^bb0(%[[VAL_8:.*]]: i64): +// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_1]] : i64 +// CHECK: cc.continue %[[VAL_9]] : i64 +// CHECK: } {normalized} +// CHECK: return %[[VAL_2]] : i32 +// CHECK: } + +__qpu__ int m1(unsigned z) { + cudaq::qubit q; + for (unsigned u = 1; u < z; u++) + x(q); + return 0; +} + +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_m1._Z2m1j( +// CHECK-SAME: %[[VAL_0:.*]]: i32) -> i32 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 0 : i32 +// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 1 : i32 +// CHECK: %[[VAL_3:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_4:.*]] = arith.subi %[[VAL_0]], %[[VAL_2]] : i32 +// CHECK: %[[VAL_7:.*]] = cc.loop while ((%[[VAL_8:.*]] = %[[VAL_1]]) -> (i32)) { +// CHECK: %[[VAL_9:.*]] = arith.cmpi ne, %[[VAL_8]], %[[VAL_4]] : i32 +// CHECK: cc.condition %[[VAL_9]](%[[VAL_8]] : i32) +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_10:.*]]: i32): +// CHECK: quake.x %[[VAL_3]] : (!quake.ref) -> () +// CHECK: cc.continue %[[VAL_10]] : i32 +// CHECK: } step { +// CHECK: ^bb0(%[[VAL_11:.*]]: i32): +// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_11]], %[[VAL_2]] : i32 +// CHECK: cc.continue %[[VAL_12]] : i32 +// CHECK: } {normalized} +// CHECK: return %[[VAL_1]] : i32 +// CHECK: } + +__qpu__ int m2(int z) { + cudaq::qubit q; + for (int u = 1; u < z; u++) + x(q); + return 0; +} + +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_m2._Z2m2i( +// CHECK-SAME: %[[VAL_0:.*]]: i32) -> i32 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK-DAG: %[[VAL_1:.*]] = arith.constant 0 : i32 +// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 1 : i32 +// CHECK: %[[VAL_3:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_4:.*]] = arith.subi %[[VAL_0]], %[[VAL_2]] : i32 +// CHECK: %[[VAL_5:.*]] = arith.cmpi sgt, %[[VAL_4]], %[[VAL_1]] : i32 +// CHECK: %[[VAL_6:.*]] = arith.select %[[VAL_5]], %[[VAL_4]], %[[VAL_1]] : i32 +// CHECK: %[[VAL_7:.*]] = cc.loop while ((%[[VAL_8:.*]] = %[[VAL_1]]) -> (i32)) { +// CHECK: %[[VAL_9:.*]] = arith.cmpi ne, %[[VAL_8]], %[[VAL_6]] : i32 +// CHECK: cc.condition %[[VAL_9]](%[[VAL_8]] : i32) +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_10:.*]]: i32): +// CHECK: quake.x %[[VAL_3]] : (!quake.ref) -> () +// CHECK: cc.continue %[[VAL_10]] : i32 +// CHECK: } step { +// CHECK: ^bb0(%[[VAL_11:.*]]: i32): +// CHECK: %[[VAL_12:.*]] = arith.addi %[[VAL_11]], %[[VAL_2]] : i32 +// CHECK: cc.continue %[[VAL_12]] : i32 +// CHECK: } {normalized} +// CHECK: return %[[VAL_1]] : i32 +// CHECK: } + +// Dead loops: no unsigned value will ever be less than 0, so these loops will +// never execute. Make sure they are marked "dead" by the normalizer. 
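+// (With an unsigned induction variable, `u < 0` folds to the `arith.constant
+// false` feeding `cc.condition` in the CHECK lines below, so the loop carries
+// the `{dead}` attribute instead of `{normalized}`.)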
+ +__qpu__ void non_iterating_loop1() { + cudaq::qvector q(100); + for (std::uint64_t i = 1; i < 0; i++) + x(q[i]); +} + +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_non_iterating_loop1._Z19non_iterating_loop1v() attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK: %[[VAL_0:.*]] = arith.constant false +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_2:.*]] = quake.alloca !quake.veq<100> +// CHECK: %[[VAL_3:.*]] = cc.loop while ((%[[VAL_4:.*]] = %[[VAL_1]]) -> (i64)) { +// CHECK: cc.condition %[[VAL_0]](%[[VAL_4]] : i64) +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_5:.*]]: i64): +// CHECK: %[[VAL_6:.*]] = quake.extract_ref %[[VAL_2]]{{\[}}%[[VAL_5]]] : (!quake.veq<100>, i64) -> !quake.ref +// CHECK: quake.x %[[VAL_6]] : (!quake.ref) -> () +// CHECK: cc.continue %[[VAL_5]] : i64 +// CHECK: } step { +// CHECK: ^bb0(%[[VAL_7:.*]]: i64): +// CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_1]] : i64 +// CHECK: cc.continue %[[VAL_8]] : i64 +// CHECK: } {dead} +// CHECK: return +// CHECK: } + +__qpu__ int f1a() { + cudaq::qubit q; + for (unsigned u = 1; u < 0; u++) + x(q); + return 0; +} + +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_f1a._Z3f1av() -> i32 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK: %[[VAL_0:.*]] = arith.constant false +// CHECK: %[[VAL_1:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : i32 +// CHECK: %[[VAL_3:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_4:.*]] = cc.loop while ((%[[VAL_5:.*]] = %[[VAL_2]]) -> (i32)) { +// CHECK: cc.condition %[[VAL_0]](%[[VAL_5]] : i32) +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_6:.*]]: i32): +// CHECK: quake.x %[[VAL_3]] : (!quake.ref) -> () +// CHECK: cc.continue %[[VAL_6]] : i32 +// CHECK: } step { +// CHECK: ^bb0(%[[VAL_7:.*]]: i32): +// CHECK: %[[VAL_8:.*]] = arith.addi %[[VAL_7]], %[[VAL_2]] : i32 +// CHECK: cc.continue %[[VAL_8]] : i32 +// CHECK: } {dead} +// CHECK: return %[[VAL_1]] : i32 +// CHECK: } + +__qpu__ int f1b() { + cudaq::qubit q; + for (unsigned u = 10; u < 0; u++) + x(q); + return 0; +} + +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_f1b._Z3f1bv() -> i32 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK: %[[VAL_0:.*]] = arith.constant false +// CHECK: %[[VAL_1:.*]] = arith.constant 1 : i32 +// CHECK: %[[VAL_2:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_3:.*]] = arith.constant 10 : i32 +// CHECK: %[[VAL_4:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_5:.*]] = cc.loop while ((%[[VAL_6:.*]] = %[[VAL_3]]) -> (i32)) { +// CHECK: cc.condition %[[VAL_0]](%[[VAL_6]] : i32) +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_7:.*]]: i32): +// CHECK: quake.x %[[VAL_4]] : (!quake.ref) -> () +// CHECK: cc.continue %[[VAL_7]] : i32 +// CHECK: } step { +// CHECK: ^bb0(%[[VAL_8:.*]]: i32): +// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_1]] : i32 +// CHECK: cc.continue %[[VAL_9]] : i32 +// CHECK: } {dead} +// CHECK: return %[[VAL_2]] : i32 +// CHECK: } + +__qpu__ int f3() { + cudaq::qubit q; + for (std::uint64_t u = 22; u < 0; u++) + x(q); + return 0; +} + +// CHECK-LABEL: func.func @__nvqpp__mlirgen__function_f3._Z2f3v() -> i32 attributes {"cudaq-entrypoint", "cudaq-kernel", no_this} { +// CHECK: %[[VAL_0:.*]] = arith.constant false +// CHECK: %[[VAL_1:.*]] = arith.constant 22 : i64 +// CHECK: %[[VAL_2:.*]] = arith.constant 1 : i64 +// CHECK: %[[VAL_3:.*]] = arith.constant 0 : i32 +// CHECK: %[[VAL_4:.*]] = quake.alloca !quake.ref +// CHECK: %[[VAL_5:.*]] = cc.loop while ((%[[VAL_6:.*]] = %[[VAL_1]]) 
-> (i64)) { +// CHECK: cc.condition %[[VAL_0]](%[[VAL_6]] : i64) +// CHECK: } do { +// CHECK: ^bb0(%[[VAL_7:.*]]: i64): +// CHECK: quake.x %[[VAL_4]] : (!quake.ref) -> () +// CHECK: cc.continue %[[VAL_7]] : i64 +// CHECK: } step { +// CHECK: ^bb0(%[[VAL_8:.*]]: i64): +// CHECK: %[[VAL_9:.*]] = arith.addi %[[VAL_8]], %[[VAL_2]] : i64 +// CHECK: cc.continue %[[VAL_9]] : i64 +// CHECK: } {dead} +// CHECK: return %[[VAL_3]] : i32 +// CHECK: } diff --git a/test/Quake/classical_optimization.qke b/test/Quake/classical_optimization.qke new file mode 100644 index 0000000000..b3bf994b76 --- /dev/null +++ b/test/Quake/classical_optimization.qke @@ -0,0 +1,253 @@ +// ========================================================================== // +// Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. // +// All rights reserved. // +// // +// This source code and the accompanying materials are made available under // +// the terms of the Apache License 2.0 which accompanies this distribution. // +// ========================================================================== // + +// RUN: cudaq-opt -classical-optimization-pipeline %s | FileCheck %s + +func.func @test_array_copy() -> i1 { + %c0_i64 = arith.constant 0 : i64 + %0 = cc.alloca !cc.array + %1 = cc.cast %0 : (!cc.ptr>) -> !cc.ptr + cc.store %c0_i64, %1 : !cc.ptr + %2 = cc.alloca !cc.array + %3 = cc.load %1 : !cc.ptr + %4 = cc.cast %2 : (!cc.ptr>) -> !cc.ptr + cc.store %3, %4 : !cc.ptr + %6 = cc.load %1 : !cc.ptr + %7 = cc.load %4 : !cc.ptr + %8 = arith.cmpi eq, %6, %7 : i64 + return %8 : i1 +} + +// CHECK-LABEL: func.func @test_array_copy() -> i1 { +// CHECK: %[[VAL_0:.*]] = arith.constant true +// CHECK: return %[[VAL_0]] : i1 +// CHECK: } + +func.func @test_nested_loop_unroll() { + %c1_i64 = arith.constant 1 : i64 + %c2_i64 = arith.constant 2 : i64 + %c0_i64 = arith.constant 0 : i64 + %0 = quake.alloca !quake.veq<6> + %1 = quake.extract_ref %0[0] : (!quake.veq<6>) -> !quake.ref + quake.x %1 : (!quake.ref) -> () + %2 = math.absi %c2_i64 : i64 + %3 = cc.alloca i64[%2 : i64] + %4:2 = cc.loop while ((%arg0 = %c0_i64, %arg1 = %c0_i64) -> (i64, i64)) { + %25 = arith.cmpi slt, %arg0, %c2_i64 : i64 + cc.condition %25(%arg0, %arg1 : i64, i64) + } do { + ^bb0(%arg0: i64, %arg1: i64): + %25 = cc.compute_ptr %3[%arg1] : (!cc.ptr>, i64) -> !cc.ptr + cc.store %arg1, %25 : !cc.ptr + %26 = arith.addi %arg1, %c1_i64 : i64 + cc.continue %arg0, %26 : i64, i64 + } step { + ^bb0(%arg0: i64, %arg1: i64): + %25 = arith.addi %arg0, %c1_i64 : i64 + cc.continue %25, %arg1 : i64, i64 + } {invariant} + %5 = cc.alloca i64[%c2_i64 : i64] + %6 = cc.loop while ((%arg0 = %c0_i64) -> (i64)) { + %25 = arith.cmpi slt, %arg0, %c2_i64 : i64 + cc.condition %25(%arg0 : i64) + } do { + ^bb0(%arg0: i64): + %25 = cc.compute_ptr %3[%arg0] : (!cc.ptr>, i64) -> !cc.ptr + %26 = cc.load %25 : !cc.ptr + %27 = arith.muli %26, %c2_i64 : i64 + %28 = cc.compute_ptr %5[%arg0] : (!cc.ptr>, i64) -> !cc.ptr + cc.store %27, %28 : !cc.ptr + cc.continue %arg0 : i64 + } step { + ^bb0(%arg0: i64): + %25 = arith.addi %arg0, %c1_i64 : i64 + cc.continue %25 : i64 + } {invariant} + %7 = cc.stdvec_init %5, %c2_i64 : (!cc.ptr>, i64) -> !cc.stdvec + %8 = cc.stdvec_size %7 : (!cc.stdvec) -> i64 + %9 = arith.subi %8, %c1_i64 : i64 + %10:2 = cc.loop while ((%arg0 = %c0_i64, %arg1 = %c0_i64) -> (i64, i64)) { + %25 = arith.cmpi slt, %arg0, %9 : i64 + cc.condition %25(%arg0, %arg1 : i64, i64) + } do { + ^bb0(%arg0: i64, %arg1: i64): + %25 = arith.addi %arg0, %c1_i64 : i64 + %26:2 = cc.loop while ((%arg2 = %25, 
%arg3 = %arg1) -> (i64, i64)) { + %27 = arith.cmpi slt, %arg2, %8 : i64 + cc.condition %27(%arg2, %arg3 : i64, i64) + } do { + ^bb0(%arg2: i64, %arg3: i64): + %27 = arith.addi %arg3, %c1_i64 : i64 + cc.continue %arg2, %27 : i64, i64 + } step { + ^bb0(%arg2: i64, %arg3: i64): + %27 = arith.addi %arg2, %c1_i64 : i64 + cc.continue %27, %arg3 : i64, i64 + } {invariant} + cc.continue %arg0, %26#1 : i64, i64 + } step { + ^bb0(%arg0: i64, %arg1: i64): + %25 = arith.addi %arg0, %c1_i64 : i64 + cc.continue %25, %arg1 : i64, i64 + } {invariant} + %11 = math.absi %10#1 : i64 + %12 = cc.alloca i64[%11 : i64] + %13:2 = cc.loop while ((%arg0 = %c0_i64, %arg1 = %c0_i64) -> (i64, i64)) { + %25 = arith.cmpi slt, %arg0, %10#1 : i64 + cc.condition %25(%arg0, %arg1 : i64, i64) + } do { + ^bb0(%arg0: i64, %arg1: i64): + %25 = cc.compute_ptr %12[%arg1] : (!cc.ptr>, i64) -> !cc.ptr + cc.store %arg1, %25 : !cc.ptr + %26 = arith.addi %arg1, %c1_i64 : i64 + cc.continue %arg0, %26 : i64, i64 + } step { + ^bb0(%arg0: i64, %arg1: i64): + %25 = arith.addi %arg0, %c1_i64 : i64 + cc.continue %25, %arg1 : i64, i64 + } {invariant} + %14 = cc.alloca i64[%10#1 : i64] + %15 = cc.loop while ((%arg0 = %c0_i64) -> (i64)) { + %25 = arith.cmpi slt, %arg0, %10#1 : i64 + cc.condition %25(%arg0 : i64) + } do { + ^bb0(%arg0: i64): + %25 = cc.compute_ptr %14[%arg0] : (!cc.ptr>, i64) -> !cc.ptr + cc.store %c0_i64, %25 : !cc.ptr + cc.continue %arg0 : i64 + } step { + ^bb0(%arg0: i64): + %25 = arith.addi %arg0, %c1_i64 : i64 + cc.continue %25 : i64 + } {invariant} + %16 = cc.stdvec_init %14, %10#1 : (!cc.ptr>, i64) -> !cc.stdvec + %17 = cc.alloca i64[%11 : i64] + %18:2 = cc.loop while ((%arg0 = %c0_i64, %arg1 = %c0_i64) -> (i64, i64)) { + %25 = arith.cmpi slt, %arg0, %10#1 : i64 + cc.condition %25(%arg0, %arg1 : i64, i64) + } do { + ^bb0(%arg0: i64, %arg1: i64): + %25 = cc.compute_ptr %17[%arg1] : (!cc.ptr>, i64) -> !cc.ptr + cc.store %arg1, %25 : !cc.ptr + %26 = arith.addi %arg1, %c1_i64 : i64 + cc.continue %arg0, %26 : i64, i64 + } step { + ^bb0(%arg0: i64, %arg1: i64): + %25 = arith.addi %arg0, %c1_i64 : i64 + cc.continue %25, %arg1 : i64, i64 + } {invariant} + %19 = cc.alloca i64[%10#1 : i64] + %20 = cc.loop while ((%arg0 = %c0_i64) -> (i64)) { + %25 = arith.cmpi slt, %arg0, %10#1 : i64 + cc.condition %25(%arg0 : i64) + } do { + ^bb0(%arg0: i64): + %25 = cc.compute_ptr %19[%arg0] : (!cc.ptr>, i64) -> !cc.ptr + cc.store %c0_i64, %25 : !cc.ptr + cc.continue %arg0 : i64 + } step { + ^bb0(%arg0: i64): + %25 = arith.addi %arg0, %c1_i64 : i64 + cc.continue %25 : i64 + } {invariant} + %21 = cc.stdvec_init %19, %10#1 : (!cc.ptr>, i64) -> !cc.stdvec + %22:2 = cc.loop while ((%arg0 = %c0_i64, %arg1 = %c0_i64) -> (i64, i64)) { + %25 = arith.cmpi slt, %arg0, %9 : i64 + cc.condition %25(%arg0, %arg1 : i64, i64) + } do { + ^bb0(%arg0: i64, %arg1: i64): + %25 = arith.addi %arg0, %c1_i64 : i64 + %26:2 = cc.loop while ((%arg2 = %25, %arg3 = %arg1) -> (i64, i64)) { + %27 = arith.cmpi slt, %arg2, %8 : i64 + cc.condition %27(%arg2, %arg3 : i64, i64) + } do { + ^bb0(%arg2: i64, %arg3: i64): + %27 = cc.stdvec_data %16 : (!cc.stdvec) -> !cc.ptr> + %28 = cc.compute_ptr %27[%arg3] : (!cc.ptr>, i64) -> !cc.ptr + %29 = cc.stdvec_data %7 : (!cc.stdvec) -> !cc.ptr> + %30 = cc.compute_ptr %29[%arg0] : (!cc.ptr>, i64) -> !cc.ptr + %31 = cc.load %30 : !cc.ptr + cc.store %31, %28 : !cc.ptr + %32 = cc.stdvec_data %21 : (!cc.stdvec) -> !cc.ptr> + %33 = cc.compute_ptr %32[%arg3] : (!cc.ptr>, i64) -> !cc.ptr + %34 = cc.compute_ptr %29[%arg2] : (!cc.ptr>, i64) -> !cc.ptr 
+ %35 = cc.load %34 : !cc.ptr + cc.store %35, %33 : !cc.ptr + %36 = arith.addi %arg3, %c1_i64 : i64 + cc.continue %arg2, %36 : i64, i64 + } step { + ^bb0(%arg2: i64, %arg3: i64): + %27 = arith.addi %arg2, %c1_i64 : i64 + cc.continue %27, %arg3 : i64, i64 + } {invariant} + cc.continue %arg0, %26#1 : i64, i64 + } step { + ^bb0(%arg0: i64, %arg1: i64): + %25 = arith.addi %arg0, %c1_i64 : i64 + cc.continue %25, %arg1 : i64, i64 + } {invariant} + %23 = cc.stdvec_size %16 : (!cc.stdvec) -> i64 + %24 = cc.loop while ((%arg0 = %c0_i64) -> (i64)) { + %25 = arith.cmpi slt, %arg0, %23 : i64 + cc.condition %25(%arg0 : i64) + } do { + ^bb0(%arg0: i64): + %25 = cc.stdvec_data %16 : (!cc.stdvec) -> !cc.ptr> + %26 = cc.compute_ptr %25[%arg0] : (!cc.ptr>, i64) -> !cc.ptr + %27 = cc.load %26 : !cc.ptr + %28 = cc.stdvec_data %21 : (!cc.stdvec) -> !cc.ptr> + %29 = cc.compute_ptr %28[%arg0] : (!cc.ptr>, i64) -> !cc.ptr + %30 = cc.load %29 : !cc.ptr + %31 = arith.cmpi slt, %27, %30 : i64 + %32:2 = cc.if(%31) -> (i64, i64) { + cc.continue %27, %30 : i64, i64 + } else { + %34 = arith.cmpi sgt, %27, %30 : i64 + %35:2 = cc.if(%34) -> (i64, i64) { + cc.continue %30, %27 : i64, i64 + } else { + cc.continue %c0_i64, %c0_i64 : i64, i64 + } + cc.continue %35#0, %35#1 : i64, i64 + } + %33 = cc.loop while ((%arg1 = %32#0) -> (i64)) { + %34 = arith.cmpi slt, %arg1, %32#1 : i64 + cc.condition %34(%arg1 : i64) + } do { + ^bb0(%arg1: i64): + %34 = quake.extract_ref %0[%arg1] : (!quake.veq<6>, i64) -> !quake.ref + %35 = arith.addi %arg1, %c1_i64 : i64 + %36 = quake.extract_ref %0[%35] : (!quake.veq<6>, i64) -> !quake.ref + quake.x [%34] %36 : (!quake.ref, !quake.ref) -> () + cc.continue %arg1 : i64 + } step { + ^bb0(%arg1: i64): + %34 = arith.addi %arg1, %c1_i64 : i64 + cc.continue %34 : i64 + } {invariant} + cc.continue %arg0 : i64 + } step { + ^bb0(%arg0: i64): + %25 = arith.addi %arg0, %c1_i64 : i64 + cc.continue %25 : i64 + } {invariant} + return +} + +// CHECK-LABEL: func.func @test_nested_loop_unroll() { +// CHECK: %[[VAL_0:.*]] = quake.alloca !quake.veq<6> +// CHECK: %[[VAL_1:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<6>) -> !quake.ref +// CHECK: quake.x %[[VAL_1]] : (!quake.ref) -> () +// CHECK: %[[VAL_2:.*]] = quake.extract_ref %[[VAL_0]][0] : (!quake.veq<6>) -> !quake.ref +// CHECK: %[[VAL_3:.*]] = quake.extract_ref %[[VAL_0]][1] : (!quake.veq<6>) -> !quake.ref +// CHECK: quake.x [%[[VAL_2]]] %[[VAL_3]] : (!quake.ref, !quake.ref) -> () +// CHECK: %[[VAL_4:.*]] = quake.extract_ref %[[VAL_0]][1] : (!quake.veq<6>) -> !quake.ref +// CHECK: %[[VAL_5:.*]] = quake.extract_ref %[[VAL_0]][2] : (!quake.veq<6>) -> !quake.ref +// CHECK: quake.x [%[[VAL_4]]] %[[VAL_5]] : (!quake.ref, !quake.ref) -> () +// CHECK: return +// CHECK: } diff --git a/test/Quake/qir_api_branching.qke b/test/Quake/qir_api_branching.qke new file mode 100644 index 0000000000..3a0dfb8c44 --- /dev/null +++ b/test/Quake/qir_api_branching.qke @@ -0,0 +1,96 @@ +// ========================================================================== // +// Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. // +// All rights reserved. // +// // +// This source code and the accompanying materials are made available under // +// the terms of the Apache License 2.0 which accompanies this distribution. 
// +// ========================================================================== // + +// RUN: cudaq-opt -convert-to-qir-api %s | FileCheck %s + +func.func @__nvqpp__mlirgen__kernel() attributes {"cudaq-entrypoint", "cudaq-kernel"} { + %c0_i64 = arith.constant 0 : i64 + %c6_i64 = arith.constant 6 : i64 + %c1_i64 = arith.constant 1 : i64 + %c-1_i64 = arith.constant -1 : i64 + %c4_i64 = arith.constant 4 : i64 + %0 = quake.alloca !quake.veq<12> + %1 = cc.alloca !cc.array + %2 = cc.cast %1 : (!cc.ptr>) -> !cc.ptr + cc.store %c6_i64, %2 : !cc.ptr + %3 = cc.load %2 : !cc.ptr + %4 = quake.extract_ref %0[4] : (!quake.veq<12>) -> !quake.ref + quake.h %4 : (!quake.ref) -> () + %5 = arith.subi %c4_i64, %3 : i64 + %6 = arith.divsi %5, %c-1_i64 : i64 + %7 = arith.cmpi sgt, %6, %c0_i64 : i64 + %8 = arith.select %7, %6, %c0_i64 : i64 + cf.br ^bb1(%c0_i64 : i64) +^bb1(%9: i64): // 2 preds: ^bb0, ^bb2 + %10 = arith.cmpi ne, %9, %8 : i64 + cf.cond_br %10, ^bb2(%9 : i64), ^bb3(%4 : !quake.ref) +^bb2(%11: i64): // pred: ^bb1 + %12 = arith.muli %11, %c-1_i64 : i64 + %13 = arith.addi %3, %12 : i64 + %14 = arith.subi %13, %c1_i64 : i64 + %15 = quake.extract_ref %0[%14] : (!quake.veq<12>, i64) -> !quake.ref + %16 = quake.extract_ref %0[%13] : (!quake.veq<12>, i64) -> !quake.ref + quake.x [%15] %16 : (!quake.ref, !quake.ref) -> () + %17 = arith.addi %11, %c1_i64 : i64 + cf.br ^bb1(%17 : i64) +^bb3(%18: !quake.ref): // pred: ^bb1 + %19 = quake.extract_ref %0[2] : (!quake.veq<12>) -> !quake.ref + quake.x [%19] %18 : (!quake.ref, !quake.ref) -> () + quake.dealloc %0 : !quake.veq<12> + return +} + +// CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel() attributes {"cudaq-entrypoint", "cudaq-kernel", "qir-api"} { +// CHECK-DAG: %[[VAL_0:.*]] = arith.constant 2 : i64 +// CHECK-DAG: %[[VAL_1:.*]] = constant @__quantum__qis__x__ctl : (!cc.ptr>, !cc.ptr>) -> () +// CHECK-DAG: %[[VAL_2:.*]] = arith.constant 0 : i64 +// CHECK-DAG: %[[VAL_3:.*]] = arith.constant 6 : i64 +// CHECK-DAG: %[[VAL_4:.*]] = arith.constant 1 : i64 +// CHECK-DAG: %[[VAL_5:.*]] = arith.constant -1 : i64 +// CHECK-DAG: %[[VAL_6:.*]] = arith.constant 4 : i64 +// CHECK-DAG: %[[VAL_7:.*]] = arith.constant 12 : i64 +// CHECK: %[[VAL_8:.*]] = call @__quantum__rt__qubit_allocate_array(%[[VAL_7]]) : (i64) -> !cc.ptr> +// CHECK: %[[VAL_9:.*]] = cc.alloca !cc.array +// CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_9]] : (!cc.ptr>) -> !cc.ptr +// CHECK: cc.store %[[VAL_3]], %[[VAL_10]] : !cc.ptr +// CHECK: %[[VAL_11:.*]] = cc.load %[[VAL_10]] : !cc.ptr +// CHECK: %[[VAL_12:.*]] = call @__quantum__rt__array_get_element_ptr_1d(%[[VAL_8]], %[[VAL_6]]) : (!cc.ptr>, i64) -> !cc.ptr>> +// CHECK: %[[VAL_13:.*]] = cc.load %[[VAL_12]] : !cc.ptr>> +// CHECK: call @__quantum__qis__h(%[[VAL_13]]) : (!cc.ptr>) -> () +// CHECK: %[[VAL_14:.*]] = arith.subi %[[VAL_6]], %[[VAL_11]] : i64 +// CHECK: %[[VAL_15:.*]] = arith.divsi %[[VAL_14]], %[[VAL_5]] : i64 +// CHECK: %[[VAL_16:.*]] = arith.cmpi sgt, %[[VAL_15]], %[[VAL_2]] : i64 +// CHECK: %[[VAL_17:.*]] = arith.select %[[VAL_16]], %[[VAL_15]], %[[VAL_2]] : i64 +// CHECK: cf.br ^bb1(%[[VAL_2]] : i64) +// CHECK: ^bb1(%[[VAL_18:.*]]: i64): +// CHECK: %[[VAL_19:.*]] = arith.cmpi ne, %[[VAL_18]], %[[VAL_17]] : i64 +// CHECK: cf.cond_br %[[VAL_19]], ^bb2(%[[VAL_18]] : i64), ^bb3(%[[VAL_13]] : !cc.ptr>) +// CHECK: ^bb2(%[[VAL_20:.*]]: i64): +// CHECK: %[[VAL_21:.*]] = arith.muli %[[VAL_20]], %[[VAL_5]] : i64 +// CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_11]], %[[VAL_21]] : i64 +// CHECK: %[[VAL_23:.*]] = arith.subi %[[VAL_22]], %[[VAL_4]] : 
i64 +// CHECK: %[[VAL_24:.*]] = call @__quantum__rt__array_get_element_ptr_1d(%[[VAL_8]], %[[VAL_23]]) : (!cc.ptr>, i64) -> !cc.ptr>> +// CHECK: %[[VAL_25:.*]] = cc.load %[[VAL_24]] : !cc.ptr>> +// CHECK: %[[VAL_26:.*]] = call @__quantum__rt__array_get_element_ptr_1d(%[[VAL_8]], %[[VAL_22]]) : (!cc.ptr>, i64) -> !cc.ptr>> +// CHECK: %[[VAL_27:.*]] = cc.load %[[VAL_26]] : !cc.ptr>> +// CHECK: %[[VAL_28:.*]] = cc.cast %[[VAL_25]] : (!cc.ptr>) -> !llvm.ptr +// CHECK: %[[VAL_29:.*]] = cc.func_ptr %[[VAL_1]] : ((!cc.ptr>, !cc.ptr>) -> ()) -> !llvm.ptr +// CHECK: %[[VAL_30:.*]] = cc.cast %[[VAL_27]] : (!cc.ptr>) -> !llvm.ptr +// CHECK: cc.call_vararg @generalizedInvokeWithRotationsControlsTargets(%[[VAL_2]], %[[VAL_2]], %[[VAL_4]], %[[VAL_4]], %[[VAL_29]], %[[VAL_28]], %[[VAL_30]]) : (i64, i64, i64, i64, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> () +// CHECK: %[[VAL_31:.*]] = arith.addi %[[VAL_20]], %[[VAL_4]] : i64 +// CHECK: cf.br ^bb1(%[[VAL_31]] : i64) +// CHECK: ^bb3(%[[VAL_32:.*]]: !cc.ptr>): +// CHECK: %[[VAL_33:.*]] = call @__quantum__rt__array_get_element_ptr_1d(%[[VAL_8]], %[[VAL_0]]) : (!cc.ptr>, i64) -> !cc.ptr>> +// CHECK: %[[VAL_34:.*]] = cc.load %[[VAL_33]] : !cc.ptr>> +// CHECK: %[[VAL_35:.*]] = cc.cast %[[VAL_34]] : (!cc.ptr>) -> !llvm.ptr +// CHECK: %[[VAL_36:.*]] = cc.func_ptr %[[VAL_1]] : ((!cc.ptr>, !cc.ptr>) -> ()) -> !llvm.ptr +// CHECK: %[[VAL_37:.*]] = cc.cast %[[VAL_32]] : (!cc.ptr>) -> !llvm.ptr +// CHECK: cc.call_vararg @generalizedInvokeWithRotationsControlsTargets(%[[VAL_2]], %[[VAL_2]], %[[VAL_4]], %[[VAL_4]], %[[VAL_36]], %[[VAL_35]], %[[VAL_37]]) : (i64, i64, i64, i64, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> () +// CHECK: call @__quantum__rt__qubit_release_array(%[[VAL_8]]) : (!cc.ptr>) -> () +// CHECK: return +// CHECK: } diff --git a/test/Quake/roundtrip-ops.qke b/test/Quake/roundtrip-ops.qke index 39fb636319..806939d526 100644 --- a/test/Quake/roundtrip-ops.qke +++ b/test/Quake/roundtrip-ops.qke @@ -811,6 +811,23 @@ func.func @indirect_callable1(%arg : !cc.indirect_callable<() -> ()>) { // CHECK: return // CHECK: } +func.func @varargs_test() { + %1 = arith.constant 12 : i32 + %2 = cc.undef !cc.ptr + cc.call_vararg @my_variadic(%1, %2) : (i32, !cc.ptr) -> () + return +} + +llvm.func @my_variadic(i32, ...) + +// CHECK-LABEL: func.func @varargs_test() { +// CHECK: %[[VAL_0:.*]] = arith.constant 12 : i32 +// CHECK: %[[VAL_1:.*]] = cc.undef !cc.ptr +// CHECK: cc.call_vararg @my_variadic(%[[VAL_0]], %[[VAL_1]]) : (i32, !cc.ptr) -> () +// CHECK: return +// CHECK: } +// CHECK: llvm.func @my_variadic(i32, ...) 
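+// The lines above round-trip `cc.call_vararg` against a variadic `llvm.func`
+// declaration; both the call and the declaration must print and parse back
+// unchanged.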
+ func.func @indirect_callable2(%arg : !cc.indirect_callable<(i32) -> i64>) -> i64 { %cst = arith.constant 4 : i32 %0 = cc.call_indirect_callable %arg, %cst : (!cc.indirect_callable<(i32) -> i64>, i32) -> i64 diff --git a/tools/nvqpp/nvq++.in b/tools/nvqpp/nvq++.in index ae0aabdd3a..ae6a6ab265 100644 --- a/tools/nvqpp/nvq++.in +++ b/tools/nvqpp/nvq++.in @@ -303,7 +303,7 @@ LIBRARY_MODE_EXECUTION_MANAGER="default" PLATFORM_LIBRARY="default" LLVM_QUANTUM_TARGET="qir" LINKDIRS="-L${install_dir}/lib -L${install_dir}/lib/plugins @CUDAQ_CXX_NVQPP_LINK_STR@" -LINKLIBS="-lcudaq -lcudaq-common -lcudaq-ensmallen -lcudaq-nlopt -lcudaq-spin" +LINKLIBS="-lcudaq -lcudaq-common -lcudaq-ensmallen -lcudaq-nlopt -lcudaq-spin -lcudaq-operator" # Add any plugin libraries to the link stage CUDAQ_PLUGIN_DIR=${install_dir}/lib/plugins @@ -587,7 +587,7 @@ while [ $# -ne 0 ]; do *.o | *.so | *.bundle) OBJS="${OBJS} $1" ;; - *.cpp | *.cc) + *.cpp | *.cc | *.cxx | *.c++) SRCS="${SRCS} $1" ;; *.a | *.dylib) @@ -746,7 +746,8 @@ if ${SHOW_VERSION} && [ -z "$SRCS" ] && [ -z "$OBJS" ]; then fi for i in ${SRCS}; do - file=$(basename -s .cc -s .cpp $i) + file_with_suffix=$(basename $i) + file=${file_with_suffix%.*} # If LIBRARY_MODE explicitly requested, then # simply compile with the classical compiler. diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt index fa1a9be06d..0b4c943c53 100644 --- a/unittests/CMakeLists.txt +++ b/unittests/CMakeLists.txt @@ -49,6 +49,8 @@ set(CUDAQ_RUNTIME_TEST_SOURCES # Make it so we can get function symbols set (CMAKE_ENABLE_EXPORTS TRUE) +include_directories(/usr/local/cuda/targets/x86_64-linux/include) + ## This Macro allows us to create a test_runtime executable for ## the sources in CUDAQ_RUNTIME_TEST_SOURCE for a specific backend simulator macro (create_tests_with_backend NVQIR_BACKEND EXTRA_BACKEND_TESTER) @@ -67,8 +69,10 @@ macro (create_tests_with_backend NVQIR_BACKEND EXTRA_BACKEND_TESTER) endif() target_link_libraries(${TEST_EXE_NAME} PUBLIC - nvqir-${NVQIR_BACKEND} nvqir - cudaq fmt::fmt-header-only + nvqir-${NVQIR_BACKEND} + nvqir + cudaq + fmt::fmt-header-only cudaq-platform-default cudaq-builder gtest_main) @@ -258,6 +262,61 @@ target_link_libraries(test_spin gtest_main) gtest_discover_tests(test_spin) +# Create an executable for operators UnitTests +set(CUDAQ_OPERATOR_TEST_SOURCES + dynamics/utils.cpp + dynamics/scalar_operator.cpp + dynamics/matrix_operator.cpp + dynamics/spin_operator.cpp + dynamics/boson_operator.cpp + dynamics/operator_conversions.cpp + dynamics/product_operator.cpp + dynamics/operator_sum.cpp +) +add_executable(test_operators main.cpp ${CUDAQ_OPERATOR_TEST_SOURCES}) +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT APPLE) + target_link_options(test_operators PRIVATE -Wl,--no-as-needed) +endif() +target_link_libraries(test_operators + PRIVATE + cudaq-spin + cudaq-operator + cudaq + gtest_main + fmt::fmt-header-only) +gtest_discover_tests(test_operators) + +if (CUDA_FOUND) + find_package(CUDAToolkit REQUIRED) + + # Create an executable for dynamics UnitTests + set(CUDAQ_DYNAMICS_TEST_SOURCES + dynamics/test_runge_kutta_integrator.cpp + dynamics/test_helpers.cpp + dynamics/test_cudm_state.cpp + dynamics/test_cudm_time_stepper.cpp + dynamics/test_cudm_expectation.cpp + dynamics/test_evolve_single.cpp + dynamics/test_evolve_api.cpp + ) + add_executable(test_dynamics main.cpp ${CUDAQ_DYNAMICS_TEST_SOURCES}) + target_compile_definitions(test_dynamics PRIVATE -DCUDAQ_DYNAMICS_TARGET) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT APPLE) + 
target_link_options(test_dynamics PRIVATE -Wl,--no-as-needed) + endif() + target_link_libraries(test_dynamics + PRIVATE + cudaq-spin + cudaq-operator + cudaq + nvqir-dynamics + ${CUDENSITYMAT_ROOT}/lib/libcudensitymat.so.0 + CUDA::cudart_static + gtest_main + fmt::fmt-header-only) + target_include_directories(test_dynamics PRIVATE ${CMAKE_SOURCE_DIR}/runtime/nvqir/cudensitymat) + gtest_discover_tests(test_dynamics) +endif() add_subdirectory(plugin) # build the test qudit execution manager @@ -332,58 +391,6 @@ endif() add_subdirectory(backends) add_subdirectory(Optimizer) -set(CUDAQ_BRAKET_RUNTIME_TEST_SOURCES - # Integration tests - integration/adjoint_tester.cpp - integration/builder_tester.cpp - integration/ccnot_tester.cpp - integration/draw_tester.cpp - integration/ghz_nisq_tester.cpp - integration/gradient_tester.cpp - integration/grover_test.cpp - integration/nlopt_tester.cpp - integration/qpe_ftqc.cpp - integration/qpe_nisq.cpp - integration/qubit_allocation.cpp - integration/vqe_tester.cpp - integration/bug67_vqe_then_sample.cpp - integration/bug77_vqe_with_shots.cpp - integration/bug116_cusv_measure_bug.cpp - integration/async_tester.cpp - integration/negative_controls_tester.cpp - integration/observe_result_tester.cpp - integration/noise_tester.cpp - integration/get_state_tester.cpp - integration/kernels_tester.cpp - common/MeasureCountsTester.cpp - common/NoiseModelTester.cpp - integration/gate_library_tester.cpp -) -set(TEST_EXE_NAME "test_runtime_braket") -set(NVQIR_BACKEND "braket") -set(NVQIR_BACKEND_NAME "braket") -add_executable(${TEST_EXE_NAME} main.cpp ${CUDAQ_BRAKET_RUNTIME_TEST_SOURCES} "") -target_compile_definitions(${TEST_EXE_NAME} PRIVATE -DNVQIR_BACKEND_NAME=braket) -target_compile_definitions(${TEST_EXE_NAME} PRIVATE __MATH_LONG_DOUBLE_CONSTANTS) -target_include_directories(${TEST_EXE_NAME} PRIVATE .) -if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND NOT APPLE) - target_link_options(${TEST_EXE_NAME} PRIVATE -Wl,--no-as-needed) -endif() -target_link_libraries(${TEST_EXE_NAME} - PUBLIC - nvqir-qpp nvqir - cudaq fmt::fmt-header-only - cudaq-platform-default - cudaq-rest-qpu - cudaq-builder - gtest_main) -set(TEST_LABELS "") -if ("${TEST_LABELS}" STREQUAL "") - gtest_discover_tests(${TEST_EXE_NAME}) -else() - gtest_discover_tests(${TEST_EXE_NAME} PROPERTIES LABELS "${TEST_LABELS}") -endif() - if (CUDAQ_ENABLE_PYTHON) if (NOT Python_FOUND) message(FATAL_ERROR "find_package(Python) not run?") @@ -418,4 +425,3 @@ if (CUDAQ_ENABLE_PYTHON) gtest_discover_tests(test_domains TEST_SUFFIX _Sampling PROPERTIES ENVIRONMENT "PYTHONPATH=${CMAKE_BINARY_DIR}/python") endif() - diff --git a/unittests/dynamics/boson_operator.cpp b/unittests/dynamics/boson_operator.cpp new file mode 100644 index 0000000000..61432eeb43 --- /dev/null +++ b/unittests/dynamics/boson_operator.cpp @@ -0,0 +1,758 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. 
 *
+ ******************************************************************************/
+
+#include "cudaq/operators.h"
+#include "utils.h"
+#include <gtest/gtest.h>
+#include <iostream>
+
+TEST(OperatorExpressions, checkBosonOpsUnary) {
+  auto op = cudaq::boson_operator::number(0);
+  utils::checkEqual((-op).to_matrix({{0, 3}}), -1.0 * utils::number_matrix(3));
+}
+
+TEST(OperatorExpressions, checkPreBuiltBosonOps) {
+
+  // number operator
+  {
+    auto nr_op = cudaq::boson_operator::number(0);
+    for (auto d = 2; d < 7; ++d) {
+      auto nr_mat = utils::number_matrix(d);
+      for (auto pow = 1; pow < 4; ++pow) {
+        auto expected = nr_mat;
+        auto got = nr_op;
+        for (auto i = 1; i < pow; ++i) {
+          expected *= nr_mat;
+          got *= nr_op;
+        }
+        utils::checkEqual(expected, got.to_matrix({{0, d}}));
+      }
+    }
+  }
+
+  // creation operator
+  {
+    auto ad_op = cudaq::boson_operator::create(0);
+    for (auto d = 2; d < 7; ++d) {
+      auto ad_mat = utils::create_matrix(d);
+      for (auto pow = 1; pow < 4; ++pow) {
+        auto expected = ad_mat;
+        auto got = ad_op;
+        for (auto i = 1; i < pow; ++i) {
+          expected *= ad_mat;
+          got *= ad_op;
+        }
+        utils::checkEqual(expected, got.to_matrix({{0, d}}));
+      }
+    }
+  }
+
+  // annihilation operator
+  {
+    auto a_op = cudaq::boson_operator::annihilate(0);
+    for (auto d = 2; d < 7; ++d) {
+      auto a_mat = utils::annihilate_matrix(d);
+      for (auto pow = 1; pow < 4; ++pow) {
+        auto expected = a_mat;
+        auto got = a_op;
+        for (auto i = 1; i < pow; ++i) {
+          expected *= a_mat;
+          got *= a_op;
+        }
+        utils::checkEqual(expected, got.to_matrix({{0, d}}));
+      }
+    }
+  }
+
+  // basic in-place multiplication
+  {
+    auto max_nr_consecutive = 6;
+    auto nr_op = cudaq::boson_operator::number(0);
+    auto ad_op = cudaq::boson_operator::create(0);
+    auto a_op = cudaq::boson_operator::annihilate(0);
+    for (auto d = 3; d < 4; ++d) {
+
+      // we use a larger dimension to compute the correct expected matrices
+      // to ensure the expected matrix is not impacted by finite-size errors
+      auto nr_mat = utils::number_matrix(d + max_nr_consecutive);
+      auto ad_mat = utils::create_matrix(d + max_nr_consecutive);
+      auto a_mat = utils::annihilate_matrix(d + max_nr_consecutive);
+
+      for (auto nrs = 0; nrs < max_nr_consecutive; ++nrs) {
+        for (auto ads = 0; ads < max_nr_consecutive; ++ads) {
+          for (auto as = 0; as < max_nr_consecutive; ++as) {
+
+            // Check Ads * Ns * As
+
+            std::cout << "# Ads: " << ads << ", ";
+            std::cout << "# Ns: " << nrs << ", ";
+            std::cout << "# As: " << as << std::endl;
+
+            auto padded = utils::id_matrix(d + max_nr_consecutive);
+            for (auto i = 0; i < ads; ++i)
+              padded *= ad_mat;
+            for (auto i = 0; i < nrs; ++i)
+              padded *= nr_mat;
+            for (auto i = 0; i < as; ++i)
+              padded *= a_mat;
+            auto expected = cudaq::matrix_2(d, d);
+            for (std::size_t i = 0; i < d; i++) {
+              for (std::size_t j = 0; j < d; j++)
+                expected[{i, j}] = padded[{i, j}];
+            }
+
+            auto got = cudaq::boson_operator::identity(0);
+            for (auto i = 0; i < ads; ++i)
+              got *= ad_op;
+            for (auto i = 0; i < nrs; ++i)
+              got *= nr_op;
+            for (auto i = 0; i < as; ++i)
+              got *= a_op;
+
+            utils::checkEqual(expected, got.to_matrix({{0, d}}));
+
+            // Check Ads * As * Ns
+
+            std::cout << "# Ads: " << ads << ", ";
+            std::cout << "# As: " << as << ", ";
+            std::cout << "# Ns: " << nrs << std::endl;
+
+            padded = utils::id_matrix(d + max_nr_consecutive);
+            for (auto i = 0; i < ads; ++i)
+              padded *= ad_mat;
+            for (auto i = 0; i < as; ++i)
+              padded *= a_mat;
+            for (auto i = 0; i < nrs; ++i)
+              padded *= nr_mat;
+            expected = cudaq::matrix_2(d, d);
+            for (std::size_t i = 0; i < d; i++) {
+              for (std::size_t j = 0; j < d; j++)
+                expected[{i, j}] = padded[{i, j}];
+            }
+
+            got = cudaq::boson_operator::identity(0);
+            for (auto i = 0; i < ads; ++i)
+              got *= ad_op;
+            for (auto i = 0; i < as; ++i)
+              got *= a_op;
+            for (auto i = 0; i < nrs; ++i)
+              got *= nr_op;
+
+            utils::checkEqual(expected, got.to_matrix({{0, d}}));
+
+            // Check Ns * Ads * As
+
+            std::cout << "# Ns: " << nrs << ", ";
+            std::cout << "# Ads: " << ads << ", ";
+            std::cout << "# As: " << as << std::endl;
+
+            padded = utils::id_matrix(d + max_nr_consecutive);
+            for (auto i = 0; i < nrs; ++i)
+              padded *= nr_mat;
+            for (auto i = 0; i < ads; ++i)
+              padded *= ad_mat;
+            for (auto i = 0; i < as; ++i)
+              padded *= a_mat;
+            expected = cudaq::matrix_2(d, d);
+            for (std::size_t i = 0; i < d; i++) {
+              for (std::size_t j = 0; j < d; j++)
+                expected[{i, j}] = padded[{i, j}];
+            }
+
+            got = cudaq::boson_operator::identity(0);
+            for (auto i = 0; i < nrs; ++i)
+              got *= nr_op;
+            for (auto i = 0; i < ads; ++i)
+              got *= ad_op;
+            for (auto i = 0; i < as; ++i)
+              got *= a_op;
+
+            utils::checkEqual(expected, got.to_matrix({{0, d}}));
+
+            // check Ns * As * Ads
+
+            std::cout << "# Ns: " << nrs << ", ";
+            std::cout << "# As: " << as << ", ";
+            std::cout << "# Ads: " << ads << std::endl;
+
+            padded = utils::id_matrix(d + max_nr_consecutive);
+            for (auto i = 0; i < nrs; ++i)
+              padded *= nr_mat;
+            for (auto i = 0; i < as; ++i)
+              padded *= a_mat;
+            for (auto i = 0; i < ads; ++i)
+              padded *= ad_mat;
+            expected = cudaq::matrix_2(d, d);
+            for (std::size_t i = 0; i < d; i++) {
+              for (std::size_t j = 0; j < d; j++)
+                expected[{i, j}] = padded[{i, j}];
+            }
+
+            got = cudaq::boson_operator::identity(0);
+            for (auto i = 0; i < nrs; ++i)
+              got *= nr_op;
+            for (auto i = 0; i < as; ++i)
+              got *= a_op;
+            for (auto i = 0; i < ads; ++i)
+              got *= ad_op;
+
+            utils::checkEqual(expected, got.to_matrix({{0, d}}));
+
+            // check As * Ns * Ads
+
+            std::cout << "# As: " << as << ", ";
+            std::cout << "# Ns: " << nrs << ", ";
+            std::cout << "# Ads: " << ads << std::endl;
+
+            padded = utils::id_matrix(d + max_nr_consecutive);
+            for (auto i = 0; i < as; ++i)
+              padded *= a_mat;
+            for (auto i = 0; i < nrs; ++i)
+              padded *= nr_mat;
+            for (auto i = 0; i < ads; ++i)
+              padded *= ad_mat;
+            expected = cudaq::matrix_2(d, d);
+            for (std::size_t i = 0; i < d; i++) {
+              for (std::size_t j = 0; j < d; j++)
+                expected[{i, j}] = padded[{i, j}];
+            }
+
+            got = cudaq::boson_operator::identity(0);
+            for (auto i = 0; i < as; ++i)
+              got *= a_op;
+            for (auto i = 0; i < nrs; ++i)
+              got *= nr_op;
+            for (auto i = 0; i < ads; ++i)
+              got *= ad_op;
+
+            utils::checkEqual(expected, got.to_matrix({{0, d}}));
+
+            // check As * Ads * Ns
+
+            std::cout << "# As: " << as << ", ";
+            std::cout << "# Ads: " << ads << ", ";
+            std::cout << "# Ns: " << nrs << std::endl;
+
+            padded = utils::id_matrix(d + max_nr_consecutive);
+            for (auto i = 0; i < as; ++i)
+              padded *= a_mat;
+            for (auto i = 0; i < ads; ++i)
+              padded *= ad_mat;
+            for (auto i = 0; i < nrs; ++i)
+              padded *= nr_mat;
+            expected = cudaq::matrix_2(d, d);
+            for (std::size_t i = 0; i < d; i++) {
+              for (std::size_t j = 0; j < d; j++)
+                expected[{i, j}] = padded[{i, j}];
+            }
+
+            got = cudaq::boson_operator::identity(0);
+            for (auto i = 0; i < as; ++i)
+              got *= a_op;
+            for (auto i = 0; i < ads; ++i)
+              got *= ad_op;
+            for (auto i = 0; i < nrs; ++i)
+              got *= nr_op;
+
+            utils::checkEqual(expected, got.to_matrix({{0, d}}));
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(OperatorExpressions, checkBosonOpsWithComplex) {
+  std::complex<double> value = 0.125 + 0.125j;
+  auto dimension = 3;
+
+  // `boson_operator` + `complex`
+  {
+    auto elementary = cudaq::boson_operator::create(0);
+
+    auto sum = value + elementary;
+    auto reverse = elementary + value;
+
+    auto got_matrix = sum.to_matrix({{0, dimension}});
+    auto got_matrix_reverse = reverse.to_matrix({{0, dimension}});
+
+    auto scaled_identity = value * utils::id_matrix(dimension);
+    auto want_matrix = scaled_identity + utils::create_matrix(dimension);
+    auto want_matrix_reverse =
+        utils::create_matrix(dimension) + scaled_identity;
+
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_matrix_reverse, got_matrix_reverse);
+  }
+
+  // `boson_operator` - `complex`
+  {
+    auto elementary = cudaq::boson_operator::number(0);
+
+    auto difference = value - elementary;
+    auto reverse = elementary - value;
+
+    auto got_matrix = difference.to_matrix({{0, dimension}});
+    auto got_matrix_reverse = reverse.to_matrix({{0, dimension}});
+
+    auto scaled_identity = value * utils::id_matrix(dimension);
+    auto want_matrix = scaled_identity - utils::number_matrix(dimension);
+    auto want_matrix_reverse =
+        utils::number_matrix(dimension) - scaled_identity;
+
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_matrix_reverse, got_matrix_reverse);
+  }
+
+  // `boson_operator` * `complex`
+  {
+    auto elementary = cudaq::boson_operator::annihilate(0);
+
+    auto product = value * elementary;
+    auto reverse = elementary * value;
+
+    auto got_matrix = product.to_matrix({{0, dimension}});
+    auto got_matrix_reverse = reverse.to_matrix({{0, dimension}});
+
+    auto scaled_identity = value * utils::id_matrix(dimension);
+    auto want_matrix = scaled_identity * utils::annihilate_matrix(dimension);
+    auto want_matrix_reverse =
+        utils::annihilate_matrix(dimension) * scaled_identity;
+
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_matrix_reverse, got_matrix_reverse);
+  }
+}
+
+TEST(OperatorExpressions, checkBosonOpsWithScalars) {
+
+  auto function =
+      [](const std::unordered_map<std::string, std::complex<double>>
+             &parameters) {
+        auto entry = parameters.find("value");
+        if (entry == parameters.end())
+          throw std::runtime_error("value not defined in parameters");
+        return entry->second;
+      };
+
+  /// Keeping these fixed for these simpler tests.
+  int degree_index = 0;
+  int dimension = 3;
+  double const_scale_factor = 2.0;
+
+  // `boson_operator + scalar_operator`
+  {
+    auto self = cudaq::boson_operator::number(0);
+    auto other = cudaq::scalar_operator(const_scale_factor);
+
+    auto sum = self + other;
+    auto reverse = other + self;
+
+    ASSERT_TRUE(sum.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(dimension);
+    auto got_matrix = sum.to_matrix({{0, dimension}});
+    auto got_reverse_matrix = reverse.to_matrix({{0, dimension}});
+    auto want_matrix = utils::number_matrix(dimension) + scaled_identity;
+    auto want_reverse_matrix =
+        scaled_identity + utils::number_matrix(dimension);
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `boson_operator + scalar_operator`
+  {
+    auto self = cudaq::boson_operator::annihilate(0);
+    auto other = cudaq::scalar_operator(function);
+
+    auto sum = self + other;
+    auto reverse = other + self;
+
+    ASSERT_TRUE(sum.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(dimension);
+    auto got_matrix =
+        sum.to_matrix({{0, dimension}}, {{"value", const_scale_factor}});
+    auto got_reverse_matrix =
+        reverse.to_matrix({{0, dimension}}, {{"value", const_scale_factor}});
+    auto want_matrix = utils::annihilate_matrix(dimension) + scaled_identity;
+    auto want_reverse_matrix =
+        scaled_identity + utils::annihilate_matrix(dimension);
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `boson_operator - scalar_operator`
+  {
+    auto self = cudaq::boson_operator::identity(0);
+    auto other = cudaq::scalar_operator(const_scale_factor);
+
+    auto sum = self - other;
+    auto reverse = other - self;
+
+    ASSERT_TRUE(sum.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(dimension);
+    auto got_matrix = sum.to_matrix({{0, dimension}});
+    auto got_reverse_matrix = reverse.to_matrix({{0, dimension}});
+    auto want_matrix = utils::id_matrix(dimension) - scaled_identity;
+    auto want_reverse_matrix = scaled_identity - utils::id_matrix(dimension);
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `boson_operator - scalar_operator`
+  {
+    auto self = cudaq::boson_operator::create(0);
+    auto other = cudaq::scalar_operator(function);
+
+    auto sum = self - other;
+    auto reverse = other - self;
+
+    ASSERT_TRUE(sum.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(dimension);
+    auto got_matrix =
+        sum.to_matrix({{0, dimension}}, {{"value", const_scale_factor}});
+    auto got_reverse_matrix =
+        reverse.to_matrix({{0, dimension}}, {{"value", const_scale_factor}});
+    auto want_matrix = utils::create_matrix(dimension) - scaled_identity;
+    auto want_reverse_matrix =
+        scaled_identity - utils::create_matrix(dimension);
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `boson_operator * scalar_operator`
+  {
+    auto self = cudaq::boson_operator::number(0);
+    auto other = cudaq::scalar_operator(const_scale_factor);
+
+    auto product = self * other;
+    auto reverse = other * self;
+
+    std::vector<int> want_degrees = {0};
+    ASSERT_TRUE(product.degrees() == want_degrees);
+    ASSERT_TRUE(reverse.degrees() == want_degrees);
+  // `boson_operator * scalar_operator`
+  {
+    auto self = cudaq::boson_operator::number(0);
+    auto other = cudaq::scalar_operator(const_scale_factor);
+
+    auto product = self * other;
+    auto reverse = other * self;
+
+    std::vector<int> want_degrees = {0};
+    ASSERT_TRUE(product.degrees() == want_degrees);
+    ASSERT_TRUE(reverse.degrees() == want_degrees);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(dimension);
+    auto got_matrix = product.to_matrix({{0, dimension}});
+    auto got_reverse_matrix = reverse.to_matrix({{0, dimension}});
+    auto want_matrix = utils::number_matrix(dimension) * scaled_identity;
+    auto want_reverse_matrix =
+        scaled_identity * utils::number_matrix(dimension);
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `boson_operator * scalar_operator`
+  {
+    auto self = cudaq::boson_operator::annihilate(0);
+    auto other = cudaq::scalar_operator(function);
+
+    auto product = self * other;
+    auto reverse = other * self;
+
+    std::vector<int> want_degrees = {0};
+    ASSERT_TRUE(product.degrees() == want_degrees);
+    ASSERT_TRUE(reverse.degrees() == want_degrees);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(dimension);
+    auto got_matrix =
+        product.to_matrix({{0, dimension}}, {{"value", const_scale_factor}});
+    auto got_reverse_matrix =
+        reverse.to_matrix({{0, dimension}}, {{"value", const_scale_factor}});
+    auto want_matrix = utils::annihilate_matrix(dimension) * scaled_identity;
+    auto want_reverse_matrix =
+        scaled_identity * utils::annihilate_matrix(dimension);
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+}
+
+TEST(OperatorExpressions, checkBosonOpsSimpleArithmetics) {
+  std::unordered_map<int, int> dimensions = {{0, 3}, {1, 2}, {2, 4}};
+
+  // Addition, same DOF.
+  {
+    auto self = cudaq::boson_operator::number(0);
+    auto other = cudaq::boson_operator::annihilate(0);
+
+    auto sum = self + other;
+    ASSERT_TRUE(sum.num_terms() == 2);
+
+    auto got_matrix = sum.to_matrix(dimensions);
+    auto want_matrix = utils::number_matrix(3) + utils::annihilate_matrix(3);
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // Addition, different DOF's.
+  {
+    auto self = cudaq::boson_operator::create(0);
+    auto other = cudaq::boson_operator::identity(1);
+
+    auto sum = self + other;
+    ASSERT_TRUE(sum.num_terms() == 2);
+
+    auto matrix_self =
+        cudaq::kronecker(utils::id_matrix(2), utils::create_matrix(3));
+    auto matrix_other =
+        cudaq::kronecker(utils::id_matrix(2), utils::id_matrix(3));
+    auto got_matrix = sum.to_matrix(dimensions);
+    auto want_matrix = matrix_self + matrix_other;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // Subtraction, same DOF.
+  {
+    auto self = cudaq::boson_operator::identity(0);
+    auto other = cudaq::boson_operator::number(0);
+
+    auto sum = self - other;
+    ASSERT_TRUE(sum.num_terms() == 2);
+
+    auto got_matrix = sum.to_matrix(dimensions);
+    auto want_matrix = utils::id_matrix(3) - utils::number_matrix(3);
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // Subtraction, different DOF's.
+  {
+    auto self = cudaq::boson_operator::annihilate(0);
+    auto other = cudaq::boson_operator::create(1);
+
+    auto sum = self - other;
+    ASSERT_TRUE(sum.num_terms() == 2);
+
+    auto annihilate_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::annihilate_matrix(3));
+    auto create_full =
+        cudaq::kronecker(utils::create_matrix(2), utils::id_matrix(3));
+    auto got_matrix = sum.to_matrix(dimensions);
+    auto want_matrix = annihilate_full - create_full;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+  // Multiplication, same DOF.
+  {
+    auto self = cudaq::boson_operator::create(0);
+    auto other = cudaq::boson_operator::annihilate(0);
+
+    auto product = self * other;
+    ASSERT_TRUE(product.num_terms() == 1);
+
+    std::vector<int> want_degrees = {0};
+    ASSERT_TRUE(product.degrees() == want_degrees);
+
+    auto got_matrix = product.to_matrix(dimensions);
+    auto want_matrix = utils::number_matrix(3);
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // Multiplication, different DOF's.
+  {
+    auto self = cudaq::boson_operator::position(0);
+    auto other = cudaq::boson_operator::momentum(1);
+
+    auto result =
+        self * other; // note that position and momentum are each 2-term sums
+    ASSERT_TRUE(result.num_terms() == 4);
+
+    std::vector<int> want_degrees = {1, 0};
+    ASSERT_TRUE(result.degrees() == want_degrees);
+
+    auto annihilate_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::position_matrix(3));
+    auto create_full =
+        cudaq::kronecker(utils::momentum_matrix(2), utils::id_matrix(3));
+    auto got_matrix = result.to_matrix(dimensions);
+    auto want_matrix = annihilate_full * create_full;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+}
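The 4-term count asserted in the last block follows from term-by-term expansion: `position` and `momentum` are each stored as 2-term sums over `create` and `annihilate` (scalar prefactors omitted here), so their product distributes into 2 x 2 elementary products. A small sketch of the same bookkeeping:

    // position(0) and momentum(1) each carry two elementary terms, so the
    // product distributes into four product terms before any simplification.
    auto x0 = cudaq::boson_operator::position(0); // ~ create(0) + annihilate(0)
    auto p1 = cudaq::boson_operator::momentum(1); // ~ create(1) - annihilate(1)
    ASSERT_TRUE((x0 * p1).num_terms() == 4);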
+TEST(OperatorExpressions, checkBosonOpsAdvancedArithmetics) {
+
+  // Keeping this fixed throughout.
+  std::complex<double> value = 0.125 + 0.5j;
+  std::unordered_map<int, int> dimensions = {{0, 3}, {1, 2}, {2, 4}, {3, 2}};
+
+  // `boson_operator + operator_sum`
+  {
+    auto self = cudaq::boson_operator::create(2);
+    auto operator_sum =
+        cudaq::boson_operator::annihilate(2) + cudaq::boson_operator::number(1);
+
+    auto got = self + operator_sum;
+    auto reverse = operator_sum + self;
+
+    ASSERT_TRUE(got.num_terms() == 3);
+    ASSERT_TRUE(reverse.num_terms() == 3);
+
+    auto self_full =
+        cudaq::kronecker(utils::create_matrix(4), utils::id_matrix(2));
+    auto term_0_full =
+        cudaq::kronecker(utils::annihilate_matrix(4), utils::id_matrix(2));
+    auto term_1_full =
+        cudaq::kronecker(utils::id_matrix(4), utils::number_matrix(2));
+
+    auto got_matrix = got.to_matrix(dimensions);
+    auto got_reverse_matrix = reverse.to_matrix(dimensions);
+    auto want_matrix = self_full + term_0_full + term_1_full;
+    auto want_reverse_matrix = term_0_full + term_1_full + self_full;
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `boson_operator - operator_sum`
+  {
+    auto self = cudaq::boson_operator::annihilate(0);
+    auto operator_sum =
+        cudaq::boson_operator::create(0) + cudaq::boson_operator::identity(1);
+
+    auto got = self - operator_sum;
+    auto reverse = operator_sum - self;
+
+    ASSERT_TRUE(got.num_terms() == 3);
+    ASSERT_TRUE(reverse.num_terms() == 3);
+
+    auto self_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::annihilate_matrix(3));
+    auto term_0_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::create_matrix(3));
+    auto term_1_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::id_matrix(3));
+
+    auto got_matrix = got.to_matrix(dimensions);
+    auto got_reverse_matrix = reverse.to_matrix(dimensions);
+    auto want_matrix = self_full - term_0_full - term_1_full;
+    auto want_reverse_matrix = term_0_full + term_1_full - self_full;
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `boson_operator * operator_sum`
+  {
+    auto self = cudaq::boson_operator::number(0);
+    auto operator_sum =
+        cudaq::boson_operator::create(0) + cudaq::boson_operator::number(2);
+
+    auto got = self * operator_sum;
+    auto reverse = operator_sum * self;
+
+    ASSERT_TRUE(got.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+    for (auto &term : got.get_terms())
+      ASSERT_TRUE(term.num_terms() == term.degrees().size());
+    for (auto &term : reverse.get_terms())
+      ASSERT_TRUE(term.num_terms() == term.degrees().size());
+
+    auto self_full =
+        cudaq::kronecker(utils::id_matrix(4), utils::number_matrix(3));
+    auto term_0_full =
+        cudaq::kronecker(utils::id_matrix(4), utils::create_matrix(3));
+    auto term_1_full =
+        cudaq::kronecker(utils::number_matrix(4), utils::id_matrix(3));
+    auto sum_full = term_0_full + term_1_full;
+
+    auto got_matrix = got.to_matrix(dimensions);
+    auto got_reverse_matrix = reverse.to_matrix(dimensions);
+    auto want_matrix = self_full * sum_full;
+    auto want_reverse_matrix = sum_full * self_full;
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `operator_sum += boson_operator`
+  {
+    auto operator_sum = cudaq::boson_operator::momentum(0) +
+                        cudaq::boson_operator::annihilate(2);
+    operator_sum += cudaq::boson_operator::position(0);
+
+    ASSERT_TRUE(operator_sum.num_terms() == 3);
+
+    auto term_0_full =
+        cudaq::kronecker(utils::annihilate_matrix(4), utils::id_matrix(3));
+    auto term_1_full =
+        cudaq::kronecker(utils::id_matrix(4), utils::position_matrix(3));
+    auto added_full =
+        cudaq::kronecker(utils::id_matrix(4), utils::momentum_matrix(3));
+
+    auto got_matrix = operator_sum.to_matrix(dimensions);
+    auto want_matrix = term_0_full + term_1_full + added_full;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // `operator_sum -= boson_operator`
+  {
+    auto operator_sum =
+        cudaq::boson_operator::create(0) + cudaq::boson_operator::annihilate(1);
+    operator_sum -= cudaq::boson_operator::momentum(0);
+
+    ASSERT_TRUE(operator_sum.num_terms() == 3);
+
+    auto term_0_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::create_matrix(3));
+    auto term_1_full =
+        cudaq::kronecker(utils::annihilate_matrix(2), utils::id_matrix(3));
+    auto subtr_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::momentum_matrix(3));
+
+    auto got_matrix = operator_sum.to_matrix(dimensions);
+    auto want_matrix = term_0_full + term_1_full - subtr_full;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // `operator_sum *= boson_operator`
+  {
+    auto operator_sum =
+        cudaq::boson_operator::momentum(0) + cudaq::boson_operator::momentum(1);
+    auto self = cudaq::boson_operator::position(0);
+
+    operator_sum *= self;
+
+    ASSERT_TRUE(operator_sum.num_terms() == 8);
+    for (auto &term : operator_sum.get_terms())
+      ASSERT_TRUE(term.num_terms() == term.degrees().size());
+
+    // Note that here we need to again expand the matrices for the product
+    // computation to ensure that the expected matrix is correct.
+    // (naive construction with "the right size" will lead to finite size
+    // errors).
+    auto padded_term0 = utils::momentum_matrix(5) * utils::position_matrix(5);
+    auto term0 = cudaq::matrix_2(3, 3);
+    for (size_t i = 0; i < 3; ++i) {
+      for (size_t j = 0; j < 3; ++j)
+        term0[{i, j}] = padded_term0[{i, j}];
+    }
+
+    auto expected_term0 = cudaq::kronecker(utils::id_matrix(2), term0);
+    auto expected_term1 =
+        cudaq::kronecker(utils::momentum_matrix(2), utils::position_matrix(3));
+
+    auto got_matrix = operator_sum.to_matrix(dimensions);
+    auto want_matrix = expected_term0 + expected_term1;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+}
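The padding in the `operator_sum *= boson_operator` check matters because truncation and matrix multiplication do not commute: a product computed directly at a 3-level cutoff misses contributions that flow through the dropped levels, while the correct reference is the top-left block of a product computed at a larger cutoff. A sketch of the comparison, reusing the same `utils` helpers:

    // Multiply at a larger cutoff, then truncate; the naive 3x3 product
    // differs in the entries that couple to the dropped levels.
    auto naive = utils::momentum_matrix(3) * utils::position_matrix(3);
    auto padded = utils::momentum_matrix(5) * utils::position_matrix(5);
    auto truncated = cudaq::matrix_2(3, 3);
    for (size_t i = 0; i < 3; ++i)
      for (size_t j = 0; j < 3; ++j)
        truncated[{i, j}] = padded[{i, j}]; // correct 3-level reference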
+TEST(OperatorExpressions, checkBosonOpsDegreeVerification) {
+  auto op1 = cudaq::boson_operator::create(2);
+  auto op2 = cudaq::boson_operator::annihilate(0);
+  std::unordered_map<int, int> dimensions = {{0, 2}, {1, 2}, {2, 3}, {3, 3}};
+
+  ASSERT_THROW(op1.to_matrix({}), std::runtime_error);
+  ASSERT_THROW(op1.to_matrix({{1, 2}}), std::runtime_error);
+  ASSERT_THROW((op1 * op2).to_matrix({{2, 3}}), std::runtime_error);
+  ASSERT_THROW((op1 + op2).to_matrix({{0, 3}}), std::runtime_error);
+  ASSERT_NO_THROW((op1 * op2).to_matrix(dimensions));
+  ASSERT_NO_THROW((op1 + op2).to_matrix(dimensions));
+}
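As these assertions spell out, `to_matrix` requires a dimension entry for every degree of freedom the operator acts on, while entries for unused degrees are simply ignored. Presumably a map covering exactly the operator's degrees is also sufficient (an assumption; the test above always passes a superset):

    auto op = cudaq::boson_operator::create(2) * cudaq::boson_operator::annihilate(0);
    ASSERT_THROW(op.to_matrix({{2, 3}}), std::runtime_error); // degree 0 missing
    ASSERT_NO_THROW(op.to_matrix({{0, 2}, {2, 3}}));          // assumed minimal map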
diff --git a/unittests/dynamics/matrix_operator.cpp b/unittests/dynamics/matrix_operator.cpp
new file mode 100644
index 0000000000..e4ebcf0568
--- /dev/null
+++ b/unittests/dynamics/matrix_operator.cpp
@@ -0,0 +1,736 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 *
+ * All rights reserved.                                                       *
+ *                                                                            *
+ * This source code and the accompanying materials are made available under  *
+ * the terms of the Apache License 2.0 which accompanies this distribution.  *
+ ******************************************************************************/
+
+#include "cudaq/operators.h"
+#include "utils.h"
+#include <gtest/gtest.h>
+
+TEST(OperatorExpressions, checkMatrixOpsUnary) {
+  auto create = cudaq::matrix_operator::position(0);
+  utils::checkEqual((-create).to_matrix({{0, 2}}),
+                    -1.0 * utils::position_matrix(2));
+}
+
+TEST(OperatorExpressions, checkPreBuiltMatrixOps) {
+  std::vector<int> levels = {2, 3, 4, 5};
+
+  // Keeping this fixed throughout.
+  int degree_index = 0;
+
+  // Identity operator.
+  {
+    for (auto level_count : levels) {
+      auto id = cudaq::matrix_operator::identity(degree_index);
+      auto got_id = id.to_matrix({{degree_index, level_count}});
+      auto want_id = utils::id_matrix(level_count);
+      utils::checkEqual(want_id, got_id);
+    }
+  }
+
+  // Number operator.
+  {
+    for (auto level_count : levels) {
+      auto number = cudaq::matrix_operator::number(degree_index);
+      auto got_number = number.to_matrix({{degree_index, level_count}});
+      auto want_number = utils::number_matrix(level_count);
+      utils::checkEqual(want_number, got_number);
+    }
+  }
+
+  // Parity operator.
+  {
+    for (auto level_count : levels) {
+      auto parity = cudaq::matrix_operator::parity(degree_index);
+      auto got_parity = parity.to_matrix({{degree_index, level_count}});
+      auto want_parity = utils::parity_matrix(level_count);
+      utils::checkEqual(want_parity, got_parity);
+    }
+  }
+
+  // Position operator.
+  {
+    for (auto level_count : levels) {
+      auto position = cudaq::matrix_operator::position(degree_index);
+      auto got_position = position.to_matrix({{degree_index, level_count}});
+      auto want_position = utils::position_matrix(level_count);
+      utils::checkEqual(want_position, got_position);
+    }
+  }
+
+  // Momentum operator.
+  {
+    for (auto level_count : levels) {
+      auto momentum = cudaq::matrix_operator::momentum(degree_index);
+      auto got_momentum = momentum.to_matrix({{degree_index, level_count}});
+      auto want_momentum = utils::momentum_matrix(level_count);
+      utils::checkEqual(want_momentum, got_momentum);
+    }
+  }
+
+  // Displacement operator.
+  {
+    for (auto level_count : levels) {
+      auto displacement = 2.0 + 1.0j;
+      auto displace = cudaq::matrix_operator::displace(degree_index);
+      auto got_displace = displace.to_matrix({{degree_index, level_count}},
+                                             {{"displacement", displacement}});
+      auto want_displace = utils::displace_matrix(level_count, displacement);
+      utils::checkEqual(want_displace, got_displace);
+    }
+  }
+
+  // Squeeze operator.
+  {
+    for (auto level_count : levels) {
+      auto squeezing = 2.0 + 1.0j;
+      auto squeeze = cudaq::matrix_operator::squeeze(degree_index);
+      auto got_squeeze = squeeze.to_matrix({{degree_index, level_count}},
+                                           {{"squeezing", squeezing}});
+      auto want_squeeze = utils::squeeze_matrix(level_count, squeezing);
+      utils::checkEqual(want_squeeze, got_squeeze);
+    }
+  }
+}
+TEST(OperatorExpressions, checkCustomMatrixOps) {
+  auto level_count = 2;
+  std::unordered_map<int, int> dimensions = {
+      {0, level_count + 1}, {1, level_count + 2}, {3, level_count}};
+
+  {
+    auto func0 =
+        [](const std::vector<int> &dimensions,
+           const std::unordered_map<std::string, std::complex<double>> &_none) {
+          return cudaq::kronecker(utils::momentum_matrix(dimensions[0]),
+                                  utils::position_matrix(dimensions[1]));
+        };
+    auto func1 =
+        [](const std::vector<int> &dimensions,
+           const std::unordered_map<std::string, std::complex<double>> &_none) {
+          return cudaq::kronecker(utils::position_matrix(dimensions[0]),
+                                  utils::number_matrix(dimensions[1]));
+        };
+    cudaq::matrix_operator::define("custom_op0", {-1, -1}, func0);
+    cudaq::matrix_operator::define("custom_op1", {-1, -1}, func1);
+  }
+
+  // op 0:
+  //   momentum (dimension level+1) on degree 0
+  //   position (dimension level+2) on degree 1
+  // op 1:
+  //   position (dimension level+2) on degree 1
+  //   number (dimension level) on degree 3
+  auto op0 = cudaq::matrix_operator::instantiate("custom_op0", {0, 1});
+  auto op1 = cudaq::matrix_operator::instantiate("custom_op1", {1, 3});
+
+  auto matrix0 = cudaq::kronecker(utils::momentum_matrix(level_count + 1),
+                                  utils::position_matrix(level_count + 2));
+  auto matrix1 = cudaq::kronecker(utils::position_matrix(level_count + 2),
+                                  utils::number_matrix(level_count));
+
+  std::vector<cudaq::matrix_2> product_matrices = {
+      utils::number_matrix(level_count),
+      utils::position_matrix(level_count + 2) *
+          utils::position_matrix(level_count + 2),
+      utils::momentum_matrix(level_count + 1)};
+  std::vector<cudaq::matrix_2> product_reverse_matrices = {
+      utils::number_matrix(level_count),
+      utils::position_matrix(level_count + 2) *
+          utils::position_matrix(level_count + 2),
+      utils::momentum_matrix(level_count + 1)};
+  std::vector<cudaq::matrix_2> sum_matrices_term0 = {
+      utils::id_matrix(level_count), utils::position_matrix(level_count + 2),
+      utils::momentum_matrix(level_count + 1)};
+  std::vector<cudaq::matrix_2> sum_matrices_term1 = {
+      utils::number_matrix(level_count),
+      utils::position_matrix(level_count + 2),
+      utils::id_matrix(level_count + 1)};
+
+  auto expected_product =
+      cudaq::kronecker(product_matrices.begin(), product_matrices.end());
+  auto expected_product_reverse = cudaq::kronecker(
+      product_reverse_matrices.begin(), product_reverse_matrices.end());
+  auto expected_sum_term0 =
+      cudaq::kronecker(sum_matrices_term0.begin(), sum_matrices_term0.end());
+  auto expected_sum_term1 =
+      cudaq::kronecker(sum_matrices_term1.begin(), sum_matrices_term1.end());
+
+  utils::checkEqual(op0.to_matrix(dimensions),
+                    matrix0); // *not* in canonical order; order as defined in
+                              // custom op definition
+  utils::checkEqual(op1.to_matrix(dimensions),
+                    matrix1); // *not* in canonical order; order as defined in
+                              // custom op definition
+  utils::checkEqual((op0 * op1).to_matrix(dimensions),
+                    expected_product); // now reordered in canonical order
+  utils::checkEqual(
+      (op1 * op0).to_matrix(dimensions),
+      expected_product_reverse); // now reordered in canonical order
+  utils::checkEqual((op0 + op1).to_matrix(dimensions),
+                    expected_sum_term0 +
+                        expected_sum_term1); // now reordered in canonical order
+  utils::checkEqual((op1 + op0).to_matrix(dimensions),
+                    expected_sum_term0 +
+                        expected_sum_term1); // now reordered in canonical order
+}
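The ordering comments above capture a convention worth making explicit: a custom elementary operator is evaluated with its degrees in the order fixed by its definition, while any composite built through arithmetic reports and evaluates its degrees in canonical order (descending, e.g. `{1, 0}`, throughout these tests). A sketch under that assumption:

    // The kronecker order of a composite follows the canonical degree order,
    // not the order in which the factors were written.
    auto prod = cudaq::matrix_operator::number(0) * cudaq::matrix_operator::parity(1);
    std::vector<int> want_degrees = {1, 0};
    ASSERT_TRUE(prod.degrees() == want_degrees);
    // prod.to_matrix(...) == kronecker(parity on degree 1, number on degree 0)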
+TEST(OperatorExpressions, checkMatrixOpsWithComplex) {
+  std::complex<double> value = 0.125 + 0.125j;
+
+  // `matrix_operator` + `complex<double>` and `complex<double>` +
+  // `matrix_operator`
+  {
+    auto elementary = cudaq::matrix_operator::momentum(0);
+
+    auto sum = value + elementary;
+    auto reverse = elementary + value;
+
+    auto got_matrix = sum.to_matrix({{0, 3}});
+    auto got_matrix_reverse = reverse.to_matrix({{0, 3}});
+
+    auto scaled_identity = value * utils::id_matrix(3);
+    auto want_matrix = scaled_identity + utils::momentum_matrix(3);
+    auto want_matrix_reverse = utils::momentum_matrix(3) + scaled_identity;
+
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_matrix_reverse, got_matrix_reverse);
+  }
+
+  // `matrix_operator` - `complex<double>` and `complex<double>` -
+  // `matrix_operator`
+  {
+    auto elementary = cudaq::matrix_operator::position(0);
+
+    auto difference = value - elementary;
+    auto reverse = elementary - value;
+
+    auto got_matrix = difference.to_matrix({{0, 3}});
+    auto got_matrix_reverse = reverse.to_matrix({{0, 3}});
+
+    auto scaled_identity = value * utils::id_matrix(3);
+    auto want_matrix = scaled_identity - utils::position_matrix(3);
+    auto want_matrix_reverse = utils::position_matrix(3) - scaled_identity;
+
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_matrix_reverse, got_matrix_reverse);
+  }
+
+  // `matrix_operator` * `complex<double>` and `complex<double>` *
+  // `matrix_operator`
+  {
+    auto elementary = cudaq::matrix_operator::number(0);
+
+    auto product = value * elementary;
+    auto reverse = elementary * value;
+
+    auto got_matrix = product.to_matrix({{0, 3}});
+    auto got_matrix_reverse = reverse.to_matrix({{0, 3}});
+
+    auto scaled_identity = value * utils::id_matrix(3);
+    auto want_matrix = scaled_identity * utils::number_matrix(3);
+    auto want_matrix_reverse = utils::number_matrix(3) * scaled_identity;
+
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_matrix_reverse, got_matrix_reverse);
+  }
+}
+
+TEST(OperatorExpressions, checkMatrixOpsWithScalars) {
+
+  auto function = [](const std::unordered_map<std::string,
+                                              std::complex<double>> &parameters) {
+    auto entry = parameters.find("value");
+    if (entry == parameters.end())
+      throw std::runtime_error("value not defined in parameters");
+    return entry->second;
+  };
+
+  /// Keeping these fixed for these simpler tests.
+  int level_count = 3;
+  int degree_index = 0;
+  double const_scale_factor = 2.0;
+
+  // `matrix_operator + scalar_operator`
+  {
+    auto self = cudaq::matrix_operator::momentum(0);
+    auto other = cudaq::scalar_operator(const_scale_factor);
+
+    auto sum = self + other;
+    auto reverse = other + self;
+
+    ASSERT_TRUE(sum.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(level_count);
+    auto got_matrix = sum.to_matrix({{degree_index, level_count}});
+    auto got_reverse_matrix = reverse.to_matrix({{degree_index, level_count}});
+    auto want_matrix = utils::momentum_matrix(level_count) + scaled_identity;
+    auto want_reverse_matrix =
+        scaled_identity + utils::momentum_matrix(level_count);
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `matrix_operator + scalar_operator`
+  {
+    auto self = cudaq::matrix_operator::parity(0);
+    auto other = cudaq::scalar_operator(function);
+
+    auto sum = self + other;
+    auto reverse = other + self;
+
+    ASSERT_TRUE(sum.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(level_count);
+    auto got_matrix = sum.to_matrix({{degree_index, level_count}},
+                                    {{"value", const_scale_factor}});
+    auto got_reverse_matrix = reverse.to_matrix(
+        {{degree_index, level_count}}, {{"value", const_scale_factor}});
+    auto want_matrix = utils::parity_matrix(level_count) + scaled_identity;
+    auto want_reverse_matrix =
+        scaled_identity + utils::parity_matrix(level_count);
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `matrix_operator - scalar_operator`
+  {
+    auto self = cudaq::matrix_operator::number(0);
+    auto other = cudaq::scalar_operator(const_scale_factor);
+
+    auto sum = self - other;
+    auto reverse = other - self;
+
+    ASSERT_TRUE(sum.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(level_count);
+    auto got_matrix = sum.to_matrix({{degree_index, level_count}});
+    auto got_reverse_matrix = reverse.to_matrix({{degree_index, level_count}});
+    auto want_matrix = utils::number_matrix(level_count) - scaled_identity;
+    auto want_reverse_matrix =
+        scaled_identity - utils::number_matrix(level_count);
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `matrix_operator - scalar_operator`
+  {
+    auto self = cudaq::matrix_operator::position(0);
+    auto other = cudaq::scalar_operator(function);
+
+    auto sum = self - other;
+    auto reverse = other - self;
+
+    ASSERT_TRUE(sum.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(level_count);
+    auto got_matrix = sum.to_matrix({{degree_index, level_count}},
+                                    {{"value", const_scale_factor}});
+    auto got_reverse_matrix = reverse.to_matrix(
+        {{degree_index, level_count}}, {{"value", const_scale_factor}});
+    auto want_matrix = utils::position_matrix(level_count) - scaled_identity;
+    auto want_reverse_matrix =
+        scaled_identity - utils::position_matrix(level_count);
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `matrix_operator * scalar_operator`
+  {
+    auto self = cudaq::matrix_operator::momentum(0);
+    auto other = cudaq::scalar_operator(const_scale_factor);
+
+    auto product = self * other;
+    auto reverse = other * self;
+
+    auto momentum = cudaq::matrix_operator::momentum(0).get_terms()[0];
+    utils::assert_product_equal(product, const_scale_factor, {momentum});
+    utils::assert_product_equal(reverse, const_scale_factor, {momentum});
+
+    std::vector<int> want_degrees = {0};
+    ASSERT_TRUE(product.degrees() == want_degrees);
+    ASSERT_TRUE(reverse.degrees() == want_degrees);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(level_count);
+    auto got_matrix = product.to_matrix({{degree_index, level_count}});
+    auto got_reverse_matrix = reverse.to_matrix({{degree_index, level_count}});
+    auto want_matrix = utils::momentum_matrix(level_count) * scaled_identity;
+    auto want_reverse_matrix =
+        scaled_identity * utils::momentum_matrix(level_count);
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+  // `matrix_operator * scalar_operator`
+  {
+    auto self = cudaq::matrix_operator::position(0);
+    auto other = cudaq::scalar_operator(function);
+
+    auto product = self * other;
+    auto reverse = other * self;
+
+    std::vector<int> want_degrees = {0};
+    ASSERT_TRUE(product.degrees() == want_degrees);
+    ASSERT_TRUE(reverse.degrees() == want_degrees);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(level_count);
+    auto got_matrix = product.to_matrix({{degree_index, level_count}},
+                                        {{"value", const_scale_factor}});
+    auto got_reverse_matrix = reverse.to_matrix(
+        {{degree_index, level_count}}, {{"value", const_scale_factor}});
+    auto want_matrix = utils::position_matrix(level_count) * scaled_identity;
+    auto want_reverse_matrix =
+        scaled_identity * utils::position_matrix(level_count);
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+}
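The `assert_product_equal` checks in the constant-scalar case document how scalars are represented: multiplying by a `scalar_operator` folds into the coefficient of the product term rather than appending an operator factor, so the product still contains a single elementary operator. A sketch, reusing the helper from the test above:

    // The scalar becomes the product's coefficient; the operator content is
    // unchanged.
    auto op = cudaq::matrix_operator::momentum(0);
    auto product = op * cudaq::scalar_operator(2.0);
    auto momentum = cudaq::matrix_operator::momentum(0).get_terms()[0];
    utils::assert_product_equal(product, 2.0, {momentum});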
+TEST(OperatorExpressions, checkMatrixOpsSimpleArithmetics) {
+
+  /// Keeping this fixed throughout.
+  int level_count = 3;
+  std::unordered_map<int, int> dimensions = {{0, level_count},
+                                             {1, level_count}};
+
+  // Addition, same DOF.
+  {
+    auto self = cudaq::matrix_operator::momentum(0);
+    auto other = cudaq::matrix_operator::position(0);
+
+    auto sum = self + other;
+    ASSERT_TRUE(sum.num_terms() == 2);
+
+    auto got_matrix = sum.to_matrix(dimensions);
+    auto want_matrix = utils::momentum_matrix(level_count) +
+                       utils::position_matrix(level_count);
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // Addition, different DOF's.
+  {
+    auto self = cudaq::matrix_operator::momentum(0);
+    auto other = cudaq::matrix_operator::position(1);
+
+    auto sum = self + other;
+    ASSERT_TRUE(sum.num_terms() == 2);
+
+    auto annihilate_full = cudaq::kronecker(
+        utils::id_matrix(level_count), utils::momentum_matrix(level_count));
+    auto create_full = cudaq::kronecker(utils::position_matrix(level_count),
+                                        utils::id_matrix(level_count));
+    auto got_matrix = sum.to_matrix({{0, level_count}, {1, level_count}});
+    auto want_matrix = annihilate_full + create_full;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // Subtraction, same DOF.
+  {
+    auto self = cudaq::matrix_operator::momentum(0);
+    auto other = cudaq::matrix_operator::position(0);
+
+    auto sum = self - other;
+    ASSERT_TRUE(sum.num_terms() == 2);
+
+    auto got_matrix = sum.to_matrix(dimensions);
+    auto want_matrix = utils::momentum_matrix(level_count) -
+                       utils::position_matrix(level_count);
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // Subtraction, different DOF's.
+  {
+    auto self = cudaq::matrix_operator::momentum(0);
+    auto other = cudaq::matrix_operator::position(1);
+
+    auto sum = self - other;
+    ASSERT_TRUE(sum.num_terms() == 2);
+
+    auto annihilate_full = cudaq::kronecker(
+        utils::id_matrix(level_count), utils::momentum_matrix(level_count));
+    auto create_full = cudaq::kronecker(utils::position_matrix(level_count),
+                                        utils::id_matrix(level_count));
+    auto got_matrix = sum.to_matrix(dimensions);
+    auto want_matrix = annihilate_full - create_full;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // Multiplication, same DOF.
+  {
+    auto self = cudaq::matrix_operator::momentum(0);
+    auto other = cudaq::matrix_operator::position(0);
+
+    auto product = self * other;
+    ASSERT_TRUE(product.num_terms() == 2);
+
+    std::vector<int> want_degrees = {0};
+    ASSERT_TRUE(product.degrees() == want_degrees);
+
+    auto got_matrix = product.to_matrix(dimensions);
+    auto want_matrix = utils::momentum_matrix(level_count) *
+                       utils::position_matrix(level_count);
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // Multiplication, different DOF's.
+  {
+    auto self = cudaq::matrix_operator::momentum(0);
+    auto other = cudaq::matrix_operator::position(1);
+
+    auto product = self * other;
+    ASSERT_TRUE(product.num_terms() == 2);
+
+    std::vector<int> want_degrees = {1, 0};
+    ASSERT_TRUE(product.degrees() == want_degrees);
+
+    auto annihilate_full = cudaq::kronecker(
+        utils::id_matrix(level_count), utils::momentum_matrix(level_count));
+    auto create_full = cudaq::kronecker(utils::position_matrix(level_count),
+                                        utils::id_matrix(level_count));
+    auto got_matrix = product.to_matrix(dimensions);
+    auto want_matrix = annihilate_full * create_full;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+}
+TEST(OperatorExpressions, checkMatrixOpsAdvancedArithmetics) {
+
+  /// Keeping this fixed throughout.
+  int level_count = 3;
+  std::complex<double> value = 0.125 + 0.5j;
+
+  // `matrix_operator + operator_sum`
+  {
+    auto self = cudaq::matrix_operator::momentum(0);
+    auto operator_sum = cudaq::matrix_operator::position(0) +
+                        cudaq::matrix_operator::identity(1);
+
+    auto got = self + operator_sum;
+    auto reverse = operator_sum + self;
+
+    ASSERT_TRUE(got.num_terms() == 3);
+    ASSERT_TRUE(reverse.num_terms() == 3);
+
+    auto self_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                      utils::momentum_matrix(level_count));
+    auto term_0_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                        utils::position_matrix(level_count));
+    auto term_1_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                        utils::id_matrix(level_count));
+
+    auto got_matrix = got.to_matrix({{0, level_count}, {1, level_count}});
+    auto got_reverse_matrix =
+        reverse.to_matrix({{0, level_count}, {1, level_count}});
+    auto want_matrix = self_full + term_0_full + term_1_full;
+    auto want_reverse_matrix = term_0_full + term_1_full + self_full;
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `matrix_operator - operator_sum`
+  {
+    auto self = cudaq::matrix_operator::momentum(0);
+    auto operator_sum = cudaq::matrix_operator::position(0) +
+                        cudaq::matrix_operator::identity(1);
+
+    auto got = self - operator_sum;
+    auto reverse = operator_sum - self;
+
+    ASSERT_TRUE(got.num_terms() == 3);
+    ASSERT_TRUE(reverse.num_terms() == 3);
+
+    auto self_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                      utils::momentum_matrix(level_count));
+    auto term_0_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                        utils::position_matrix(level_count));
+    auto term_1_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                        utils::id_matrix(level_count));
+
+    auto got_matrix = got.to_matrix({{0, level_count}, {1, level_count}});
+    auto got_reverse_matrix =
+        reverse.to_matrix({{0, level_count}, {1, level_count}});
+    auto want_matrix = self_full - term_0_full - term_1_full;
+    auto want_reverse_matrix = term_0_full + term_1_full - self_full;
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `matrix_operator * operator_sum`
+  {
+    auto self = cudaq::matrix_operator::momentum(0);
+    auto operator_sum = cudaq::matrix_operator::squeeze(0) +
+                        cudaq::matrix_operator::identity(1);
+
+    auto got = self * operator_sum;
+    auto reverse = operator_sum * self;
+
+    ASSERT_TRUE(got.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+    for (auto &term : got.get_terms())
+      ASSERT_TRUE(term.num_terms() == 2);
+    for (auto &term : reverse.get_terms())
+      ASSERT_TRUE(term.num_terms() == 2);
+
+    auto self_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                      utils::momentum_matrix(level_count));
+    auto term_0_full =
+        cudaq::kronecker(utils::id_matrix(level_count),
+                         utils::squeeze_matrix(level_count, value));
+    auto term_1_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                        utils::id_matrix(level_count));
+    auto sum_full = term_0_full + term_1_full;
+
+    auto got_matrix = got.to_matrix({{0, level_count}, {1, level_count}},
+                                    {{"squeezing", value}});
+    auto got_reverse_matrix = reverse.to_matrix(
+        {{0, level_count}, {1, level_count}}, {{"squeezing", value}});
+    auto want_matrix = self_full * sum_full;
+    auto want_reverse_matrix = sum_full * self_full;
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+  // `operator_sum += matrix_operator`
+  {
+    auto operator_sum = cudaq::matrix_operator::position(0) +
+                        cudaq::matrix_operator::identity(1);
+    operator_sum += cudaq::matrix_operator::displace(0);
+
+    ASSERT_TRUE(operator_sum.num_terms() == 3);
+
+    auto self_full =
+        cudaq::kronecker(utils::id_matrix(level_count),
+                         utils::displace_matrix(level_count, value));
+    auto term_0_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                        utils::position_matrix(level_count));
+    auto term_1_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                        utils::id_matrix(level_count));
+
+    auto got_matrix = operator_sum.to_matrix(
+        {{0, level_count}, {1, level_count}}, {{"displacement", value}});
+    auto want_matrix = term_0_full + term_1_full + self_full;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // `operator_sum -= matrix_operator`
+  {
+    auto operator_sum = cudaq::matrix_operator::position(0) +
+                        cudaq::matrix_operator::identity(1);
+    operator_sum -= cudaq::matrix_operator::momentum(0);
+
+    ASSERT_TRUE(operator_sum.num_terms() == 3);
+
+    auto self_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                      utils::momentum_matrix(level_count));
+    auto term_0_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                        utils::position_matrix(level_count));
+    auto term_1_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                        utils::id_matrix(level_count));
+
+    auto got_matrix =
+        operator_sum.to_matrix({{0, level_count}, {1, level_count}});
+    auto want_matrix = term_0_full + term_1_full - self_full;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // `operator_sum *= matrix_operator`
+  {
+    auto self = cudaq::matrix_operator::momentum(0);
+    auto operator_sum = cudaq::matrix_operator::position(0) +
+                        cudaq::matrix_operator::identity(1);
+
+    operator_sum *= self;
+
+    ASSERT_TRUE(operator_sum.num_terms() == 2);
+    for (auto &term : operator_sum.get_terms())
+      ASSERT_TRUE(term.num_terms() == 2);
+
+    auto self_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                      utils::momentum_matrix(level_count));
+    auto term_0_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                        utils::position_matrix(level_count));
+    auto term_1_full = cudaq::kronecker(utils::id_matrix(level_count),
+                                        utils::id_matrix(level_count));
+    auto sum_full = term_0_full + term_1_full;
+
+    auto got_matrix =
+        operator_sum.to_matrix({{0, level_count}, {1, level_count}});
+    auto want_matrix = sum_full * self_full;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+}
+
+TEST(OperatorExpressions, checkMatrixOpsDegreeVerification) {
+  auto op1 = cudaq::matrix_operator::position(2);
+  auto op2 = cudaq::matrix_operator::momentum(0);
+  std::unordered_map<int, int> dimensions = {{0, 2}, {1, 2}, {2, 3}, {3, 3}};
+
+  {
+    auto func0 =
+        [](const std::vector<int> &dimensions,
+           const std::unordered_map<std::string, std::complex<double>> &_none) {
+          return cudaq::kronecker(utils::momentum_matrix(dimensions[0]),
+                                  utils::position_matrix(dimensions[1]));
+        };
+    auto func1 =
+        [](const std::vector<int> &dimensions,
+           const std::unordered_map<std::string, std::complex<double>> &_none) {
+          return cudaq::kronecker(utils::position_matrix(dimensions[0]),
+                                  utils::number_matrix(dimensions[1]));
+        };
+    cudaq::matrix_operator::define("custom_op0", {-1, -1}, func0);
+    cudaq::matrix_operator::define("custom_op1", {-1, -1}, func1);
+  }
+
+  auto custom_op0 = cudaq::matrix_operator::instantiate("custom_op0", {3, 1});
+  auto custom_op1 = cudaq::matrix_operator::instantiate("custom_op1", {1, 0});
+
+  ASSERT_THROW(op1.to_matrix(), std::runtime_error);
+  ASSERT_THROW(op1.to_matrix({{1, 2}}), std::runtime_error);
+  ASSERT_THROW((op1 * op2).to_matrix({{2, 3}}), std::runtime_error);
+  ASSERT_THROW((op1 + op2).to_matrix({{0, 3}}), std::runtime_error);
+  ASSERT_NO_THROW((op1 * op2).to_matrix(dimensions));
+  ASSERT_NO_THROW((op1 + op2).to_matrix(dimensions));
+
+  ASSERT_THROW(custom_op0.to_matrix(), std::runtime_error);
+  ASSERT_THROW(custom_op1.to_matrix({{1, 2}}), std::runtime_error);
+  ASSERT_THROW((custom_op1 * custom_op0).to_matrix({{0, 2}, {1, 2}}),
+               std::runtime_error);
+  ASSERT_THROW((custom_op1 + custom_op0).to_matrix({{0, 2}, {1, 2}, {2, 2}}),
+               std::runtime_error);
+  ASSERT_NO_THROW((custom_op0 * custom_op1).to_matrix(dimensions));
+  ASSERT_NO_THROW((custom_op0 + custom_op1).to_matrix(dimensions));
+}
+TEST(OperatorExpressions, checkMatrixOpsParameterVerification) {
+
+  std::unordered_map<std::string, std::complex<double>> parameters = {
+      {"squeezing", 0.5}, {"displacement", 0.25}};
+  std::unordered_map<int, int> dimensions = {{0, 2}, {1, 2}};
+
+  auto squeeze = cudaq::matrix_operator::squeeze(1);
+  auto displace = cudaq::matrix_operator::displace(0);
+
+  ASSERT_THROW(squeeze.to_matrix(dimensions), std::runtime_error);
+  ASSERT_THROW(squeeze.to_matrix(dimensions, {{"displacement", 0.25}}),
+               std::runtime_error);
+  ASSERT_THROW(
+      (squeeze * displace).to_matrix(dimensions, {{"displacement", 0.25}}),
+      std::runtime_error);
+  ASSERT_THROW((squeeze + displace).to_matrix(dimensions, {{"squeezing", 0.5}}),
+               std::runtime_error);
+  ASSERT_NO_THROW((squeeze * displace).to_matrix(dimensions, parameters));
+  ASSERT_NO_THROW((squeeze + displace).to_matrix(dimensions, parameters));
+}
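These assertions pin down the parameter contract: evaluation throws unless every parameter referenced by any factor is supplied, so a composite operator needs the union of its factors' parameter sets. For instance, the combined operator from the test above evaluates once both entries are present:

    auto op = cudaq::matrix_operator::squeeze(1) * cudaq::matrix_operator::displace(0);
    auto matrix = op.to_matrix({{0, 2}, {1, 2}},
                               {{"squeezing", 0.5}, {"displacement", 0.25}});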
diff --git a/unittests/dynamics/operator_conversions.cpp b/unittests/dynamics/operator_conversions.cpp
new file mode 100644
index 0000000000..694e4ad4ed
--- /dev/null
+++ b/unittests/dynamics/operator_conversions.cpp
@@ -0,0 +1,656 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 *
+ * All rights reserved.                                                       *
+ *                                                                            *
+ * This source code and the accompanying materials are made available under  *
+ * the terms of the Apache License 2.0 which accompanies this distribution.  *
+ ******************************************************************************/
+
+#include "cudaq/operators.h"
+#include "utils.h"
+#include <gtest/gtest.h>
+
+TEST(OperatorExpressions, checkElementaryOpsConversions) {
+
+  std::unordered_map<std::string, std::complex<double>> parameters = {
+      {"squeezing", 0.5}, {"displacement", 0.25}};
+  std::unordered_map<int, int> dimensions = {{0, 2}, {1, 2}};
+
+  auto matrix_elementary = cudaq::matrix_operator::parity(1);
+  auto matrix_elementary_expected = utils::parity_matrix(2);
+  auto spin_elementary = cudaq::spin_operator::y(1);
+  auto spin_elementary_expected = utils::PauliY_matrix();
+  auto boson_elementary = cudaq::boson_operator::annihilate(1);
+  auto boson_elementary_expected = utils::annihilate_matrix(2);
+
+  auto checkSumEquals =
+      [dimensions, parameters](cudaq::operator_sum<cudaq::matrix_operator> sum,
+                               cudaq::matrix_2 expected,
+                               int expected_num_terms = 2) {
+        auto got = sum.to_matrix(dimensions, parameters);
+        ASSERT_TRUE(sum.num_terms() == expected_num_terms);
+        utils::checkEqual(got, expected);
+      };
+
+  auto checkProductEquals =
+      [dimensions,
+       parameters](cudaq::product_operator<cudaq::matrix_operator> prod,
+                   cudaq::matrix_2 expected, int expected_num_terms = 2) {
+        auto got = prod.to_matrix(dimensions, parameters);
+        ASSERT_TRUE(prod.num_terms() == expected_num_terms);
+        utils::checkEqual(got, expected);
+      };
+
+  // `elementary + elementary`
+  {
+    checkSumEquals(matrix_elementary + matrix_elementary,
+                   matrix_elementary_expected + matrix_elementary_expected, 1);
+    checkSumEquals(spin_elementary + spin_elementary,
+                   spin_elementary_expected + spin_elementary_expected, 1);
+    checkSumEquals(boson_elementary + boson_elementary,
+                   boson_elementary_expected + boson_elementary_expected, 1);
+    checkSumEquals(matrix_elementary + spin_elementary,
+                   matrix_elementary_expected + spin_elementary_expected);
+    checkSumEquals(spin_elementary + matrix_elementary,
+                   matrix_elementary_expected + spin_elementary_expected);
+    checkSumEquals(matrix_elementary + boson_elementary,
+                   matrix_elementary_expected + boson_elementary_expected);
+    checkSumEquals(boson_elementary + matrix_elementary,
+                   matrix_elementary_expected + boson_elementary_expected);
+    checkSumEquals(spin_elementary + boson_elementary,
+                   spin_elementary_expected + boson_elementary_expected);
+    checkSumEquals(boson_elementary + spin_elementary,
+                   spin_elementary_expected + boson_elementary_expected);
+  }
+
+  // `elementary - elementary`
+  {
+    checkSumEquals(matrix_elementary - matrix_elementary,
+                   matrix_elementary_expected - matrix_elementary_expected, 1);
+    checkSumEquals(spin_elementary - spin_elementary,
+                   spin_elementary_expected - spin_elementary_expected, 1);
+    checkSumEquals(boson_elementary - boson_elementary,
+                   boson_elementary_expected - boson_elementary_expected, 1);
+    checkSumEquals(matrix_elementary - spin_elementary,
+                   matrix_elementary_expected - spin_elementary_expected);
+    checkSumEquals(spin_elementary - matrix_elementary,
+                   spin_elementary_expected - matrix_elementary_expected);
+    checkSumEquals(matrix_elementary - boson_elementary,
+                   matrix_elementary_expected - boson_elementary_expected);
+    checkSumEquals(boson_elementary - matrix_elementary,
+                   boson_elementary_expected - matrix_elementary_expected);
+    checkSumEquals(spin_elementary - boson_elementary,
+                   spin_elementary_expected - boson_elementary_expected);
+    checkSumEquals(boson_elementary - spin_elementary,
+                   boson_elementary_expected - spin_elementary_expected);
+  }
+
+  // `elementary * elementary`
+  {
+    checkProductEquals(matrix_elementary * matrix_elementary,
+                       matrix_elementary_expected * matrix_elementary_expected);
+    checkProductEquals(spin_elementary * spin_elementary,
+                       spin_elementary_expected * spin_elementary_expected, 1);
+    checkProductEquals(boson_elementary * boson_elementary,
+                       boson_elementary_expected * boson_elementary_expected,
+                       1);
+    checkProductEquals(matrix_elementary * spin_elementary,
+                       matrix_elementary_expected * spin_elementary_expected);
+    checkProductEquals(spin_elementary * matrix_elementary,
+                       spin_elementary_expected * matrix_elementary_expected);
+    checkProductEquals(matrix_elementary * boson_elementary,
+                       matrix_elementary_expected * boson_elementary_expected);
+    checkProductEquals(boson_elementary * matrix_elementary,
+                       boson_elementary_expected * matrix_elementary_expected);
+    checkProductEquals(spin_elementary * boson_elementary,
+                       spin_elementary_expected * boson_elementary_expected);
+    checkProductEquals(boson_elementary * spin_elementary,
+                       boson_elementary_expected * spin_elementary_expected);
+  }
+
+  // `elementary *= elementary`
+  {
+    auto matrix_product =
+        cudaq::product_operator<cudaq::matrix_operator>(matrix_elementary);
+    matrix_product *= matrix_elementary;
+    checkProductEquals(matrix_product,
+                       matrix_elementary_expected * matrix_elementary_expected);
+
+    auto spin_product =
+        cudaq::product_operator<cudaq::spin_operator>(spin_elementary);
+    spin_product *= spin_elementary;
+    checkProductEquals(spin_product,
+                       spin_elementary_expected * spin_elementary_expected, 1);
+
+    auto boson_product =
+        cudaq::product_operator<cudaq::boson_operator>(boson_elementary);
+    boson_product *= boson_elementary;
+    checkProductEquals(boson_product,
+                       boson_elementary_expected * boson_elementary_expected,
+                       1);
+
+    matrix_product =
+        cudaq::product_operator<cudaq::matrix_operator>(matrix_elementary);
+    matrix_product *= spin_elementary;
+    checkProductEquals(matrix_product,
+                       matrix_elementary_expected * spin_elementary_expected);
+
+    matrix_product =
+        cudaq::product_operator<cudaq::matrix_operator>(matrix_elementary);
+    matrix_product *= boson_elementary;
+    checkProductEquals(matrix_product,
+                       matrix_elementary_expected * boson_elementary_expected);
+  }
+}
+TEST(OperatorExpressions, checkProductOperatorConversions) {
+
+  std::unordered_map<std::string, std::complex<double>> parameters = {
+      {"squeezing", 0.5}, {"displacement", 0.25}};
+  std::unordered_map<int, int> dimensions = {{0, 2}, {1, 2}};
+
+  auto matrix_product =
+      cudaq::matrix_operator::squeeze(0) * cudaq::matrix_operator::displace(1);
+  auto matrix_product_expected = cudaq::kronecker(
+      utils::displace_matrix(2, 0.25), utils::squeeze_matrix(2, 0.5));
+  auto spin_product = cudaq::spin_operator::y(1) * cudaq::spin_operator::x(0);
+  auto spin_product_expected =
+      cudaq::kronecker(utils::PauliY_matrix(), utils::PauliX_matrix());
+  auto boson_product =
+      cudaq::boson_operator::annihilate(1) * cudaq::boson_operator::number(0);
+  auto boson_product_expected =
+      cudaq::kronecker(utils::annihilate_matrix(2), utils::number_matrix(2));
+
+  auto checkSumEquals =
+      [dimensions, parameters](cudaq::operator_sum<cudaq::matrix_operator> sum,
+                               cudaq::matrix_2 expected,
+                               int expected_num_terms = 2) {
+        auto got = sum.to_matrix(dimensions, parameters);
+        ASSERT_TRUE(sum.num_terms() == expected_num_terms);
+        utils::checkEqual(got, expected);
+      };
+
+  auto checkProductEquals =
+      [dimensions,
+       parameters](cudaq::product_operator<cudaq::matrix_operator> prod,
+                   cudaq::matrix_2 expected, int expected_num_terms = 4) {
+        auto got = prod.to_matrix(dimensions, parameters);
+        ASSERT_TRUE(prod.num_terms() == expected_num_terms);
+        utils::checkEqual(got, expected);
+      };
+
+  // `product + product`
+  {
+    checkSumEquals(matrix_product + matrix_product,
+                   matrix_product_expected + matrix_product_expected, 1);
+    checkSumEquals(spin_product + spin_product,
+                   spin_product_expected + spin_product_expected, 1);
+    checkSumEquals(boson_product + boson_product,
+                   boson_product_expected + boson_product_expected, 1);
+    checkSumEquals(matrix_product + spin_product,
+                   matrix_product_expected + spin_product_expected);
+    checkSumEquals(spin_product + matrix_product,
+                   matrix_product_expected + spin_product_expected);
+    checkSumEquals(matrix_product + boson_product,
+                   matrix_product_expected + boson_product_expected);
+    checkSumEquals(boson_product + matrix_product,
+                   matrix_product_expected + boson_product_expected);
+    checkSumEquals(spin_product + boson_product,
+                   spin_product_expected + boson_product_expected);
+    checkSumEquals(boson_product + spin_product,
+                   spin_product_expected + boson_product_expected);
+  }
+  // `product - product`
+  {
+    checkSumEquals(matrix_product - matrix_product,
+                   matrix_product_expected - matrix_product_expected, 1);
+    checkSumEquals(spin_product - spin_product,
+                   spin_product_expected - spin_product_expected, 1);
+    checkSumEquals(boson_product - boson_product,
+                   boson_product_expected - boson_product_expected, 1);
+    checkSumEquals(matrix_product - spin_product,
+                   matrix_product_expected - spin_product_expected);
+    checkSumEquals(spin_product - matrix_product,
+                   spin_product_expected - matrix_product_expected);
+    checkSumEquals(matrix_product - boson_product,
+                   matrix_product_expected - boson_product_expected);
+    checkSumEquals(boson_product - matrix_product,
+                   boson_product_expected - matrix_product_expected);
+    checkSumEquals(spin_product - boson_product,
+                   spin_product_expected - boson_product_expected);
+    checkSumEquals(boson_product - spin_product,
+                   boson_product_expected - spin_product_expected);
+  }
+
+  // `product * product`
+  {
+    checkProductEquals(matrix_product * matrix_product,
+                       matrix_product_expected * matrix_product_expected);
+    checkProductEquals(spin_product * spin_product,
+                       spin_product_expected * spin_product_expected, 2);
+    checkProductEquals(boson_product * boson_product,
+                       boson_product_expected * boson_product_expected, 2);
+    checkProductEquals(matrix_product * spin_product,
+                       matrix_product_expected * spin_product_expected);
+    checkProductEquals(spin_product * matrix_product,
+                       spin_product_expected * matrix_product_expected);
+    checkProductEquals(matrix_product * boson_product,
+                       matrix_product_expected * boson_product_expected);
+    checkProductEquals(boson_product * matrix_product,
+                       boson_product_expected * matrix_product_expected);
+    checkProductEquals(spin_product * boson_product,
+                       spin_product_expected * boson_product_expected);
+    checkProductEquals(boson_product * spin_product,
+                       boson_product_expected * spin_product_expected);
+  }
+
+  // `product *= product`
+  {
+    auto matrix_product_0 = matrix_product;
+    matrix_product_0 *= matrix_product;
+    checkProductEquals(matrix_product_0,
+                       matrix_product_expected * matrix_product_expected);
+
+    auto spin_product_0 = spin_product;
+    spin_product_0 *= spin_product;
+    checkProductEquals(spin_product_0,
+                       spin_product_expected * spin_product_expected, 2);
+
+    auto boson_product_0 = boson_product;
+    boson_product_0 *= boson_product;
+    checkProductEquals(boson_product_0,
+                       boson_product_expected * boson_product_expected, 2);
+
+    matrix_product_0 = matrix_product;
+    matrix_product_0 *= spin_product;
+    checkProductEquals(matrix_product_0,
+                       matrix_product_expected * spin_product_expected);
+
+    matrix_product_0 = matrix_product;
+    matrix_product_0 *= boson_product;
+    checkProductEquals(matrix_product_0,
+                       matrix_product_expected * boson_product_expected);
+  }
+}
+TEST(OperatorExpressions, checkOperatorSumConversions) {
+
+  std::unordered_map<std::string, std::complex<double>> parameters = {
+      {"squeezing", 0.5}, {"displacement", 0.25}};
+  std::unordered_map<int, int> dimensions = {{0, 2}, {1, 2}};
+
+  auto matrix_product =
+      cudaq::matrix_operator::squeeze(0) * cudaq::matrix_operator::displace(1);
+  auto matrix_product_expected = cudaq::kronecker(
+      utils::displace_matrix(2, 0.25), utils::squeeze_matrix(2, 0.5));
+  auto spin_product = cudaq::spin_operator::y(1) * cudaq::spin_operator::x(0);
+  auto spin_product_expected =
+      cudaq::kronecker(utils::PauliY_matrix(), utils::PauliX_matrix());
+  auto boson_product =
+      cudaq::boson_operator::annihilate(1) * cudaq::boson_operator::number(0);
+  auto boson_product_expected =
+      cudaq::kronecker(utils::annihilate_matrix(2), utils::number_matrix(2));
+
+  auto matrix_sum =
+      cudaq::matrix_operator::squeeze(0) + cudaq::matrix_operator::displace(1);
+  auto matrix_sum_expected =
+      cudaq::kronecker(utils::displace_matrix(2, 0.25), utils::id_matrix(2)) +
+      cudaq::kronecker(utils::id_matrix(2), utils::squeeze_matrix(2, 0.5));
+  auto spin_sum = cudaq::spin_operator::y(1) + cudaq::spin_operator::x(0);
+  auto spin_sum_expected =
+      cudaq::kronecker(utils::PauliY_matrix(), utils::id_matrix(2)) +
+      cudaq::kronecker(utils::id_matrix(2), utils::PauliX_matrix());
+  auto boson_sum =
+      cudaq::boson_operator::annihilate(1) + cudaq::boson_operator::number(0);
+  auto boson_sum_expected =
+      cudaq::kronecker(utils::annihilate_matrix(2), utils::id_matrix(2)) +
+      cudaq::kronecker(utils::id_matrix(2), utils::number_matrix(2));
+
+  auto checkSumEquals =
+      [dimensions, parameters](cudaq::operator_sum<cudaq::matrix_operator> sum,
+                               cudaq::matrix_2 expected, int num_terms = 4) {
+        auto got = sum.to_matrix(dimensions, parameters);
+        ASSERT_TRUE(sum.num_terms() == num_terms);
+        utils::checkEqual(got, expected);
+      };
+
+  // `sum + product`
+  {
+    checkSumEquals(matrix_sum + matrix_product,
+                   matrix_sum_expected + matrix_product_expected, 3);
+    checkSumEquals(spin_sum + spin_product,
+                   spin_sum_expected + spin_product_expected, 3);
+    checkSumEquals(boson_sum + boson_product,
+                   boson_sum_expected + boson_product_expected, 3);
+    checkSumEquals(matrix_sum + spin_product,
+                   matrix_sum_expected + spin_product_expected, 3);
+    checkSumEquals(spin_sum + matrix_product,
+                   spin_sum_expected + matrix_product_expected, 3);
+    checkSumEquals(matrix_sum + boson_product,
+                   matrix_sum_expected + boson_product_expected, 3);
+    checkSumEquals(boson_sum + matrix_product,
+                   boson_sum_expected + matrix_product_expected, 3);
+    checkSumEquals(spin_sum + boson_product,
+                   spin_sum_expected + boson_product_expected, 3);
+    checkSumEquals(boson_sum + spin_product,
+                   boson_sum_expected + spin_product_expected, 3);
+  }
+
+  // `product + sum`
+  {
+    checkSumEquals(matrix_product + matrix_sum,
+                   matrix_product_expected + matrix_sum_expected, 3);
+    checkSumEquals(spin_product + spin_sum,
+                   spin_product_expected + spin_sum_expected, 3);
+    checkSumEquals(boson_product + boson_sum,
+                   boson_product_expected + boson_sum_expected, 3);
+    checkSumEquals(matrix_product + spin_sum,
+                   matrix_product_expected + spin_sum_expected, 3);
+    checkSumEquals(spin_product + matrix_sum,
+                   spin_product_expected + matrix_sum_expected, 3);
+    checkSumEquals(matrix_product + boson_sum,
+                   matrix_product_expected + boson_sum_expected, 3);
+    checkSumEquals(boson_product + matrix_sum,
+                   boson_product_expected + matrix_sum_expected, 3);
+    checkSumEquals(spin_product + boson_sum,
+                   spin_product_expected + boson_sum_expected, 3);
+    checkSumEquals(boson_product + spin_sum,
+                   boson_product_expected + spin_sum_expected, 3);
+  }
+  // `sum + sum`
+  {
+    checkSumEquals(matrix_sum + matrix_sum,
+                   matrix_sum_expected + matrix_sum_expected, 2);
+    checkSumEquals(spin_sum + spin_sum, spin_sum_expected + spin_sum_expected,
+                   2);
+    checkSumEquals(boson_sum + boson_sum,
+                   boson_sum_expected + boson_sum_expected, 2);
+    checkSumEquals(matrix_sum + spin_sum,
+                   matrix_sum_expected + spin_sum_expected);
+    checkSumEquals(spin_sum + matrix_sum,
+                   matrix_sum_expected + spin_sum_expected);
+    checkSumEquals(matrix_sum + boson_sum,
+                   matrix_sum_expected + boson_sum_expected);
+    checkSumEquals(boson_sum + matrix_sum,
+                   matrix_sum_expected + boson_sum_expected);
+    checkSumEquals(spin_sum + boson_sum,
+                   spin_sum_expected + boson_sum_expected);
+    checkSumEquals(boson_sum + spin_sum,
+                   spin_sum_expected + boson_sum_expected);
+  }
+
+  // `sum - product`
+  {
+    checkSumEquals(matrix_sum - matrix_product,
+                   matrix_sum_expected - matrix_product_expected, 3);
+    checkSumEquals(spin_sum - spin_product,
+                   spin_sum_expected - spin_product_expected, 3);
+    checkSumEquals(boson_sum - boson_product,
+                   boson_sum_expected - boson_product_expected, 3);
+    checkSumEquals(matrix_sum - spin_product,
+                   matrix_sum_expected - spin_product_expected, 3);
+    checkSumEquals(spin_sum - matrix_product,
+                   spin_sum_expected - matrix_product_expected, 3);
+    checkSumEquals(matrix_sum - boson_product,
+                   matrix_sum_expected - boson_product_expected, 3);
+    checkSumEquals(boson_sum - matrix_product,
+                   boson_sum_expected - matrix_product_expected, 3);
+    checkSumEquals(spin_sum - boson_product,
+                   spin_sum_expected - boson_product_expected, 3);
+    checkSumEquals(boson_sum - spin_product,
+                   boson_sum_expected - spin_product_expected, 3);
+  }
+
+  // `product - sum`
+  {
+    checkSumEquals(matrix_product - matrix_sum,
+                   matrix_product_expected - matrix_sum_expected, 3);
+    checkSumEquals(spin_product - spin_sum,
+                   spin_product_expected - spin_sum_expected, 3);
+    checkSumEquals(boson_product - boson_sum,
+                   boson_product_expected - boson_sum_expected, 3);
+    checkSumEquals(matrix_product - spin_sum,
+                   matrix_product_expected - spin_sum_expected, 3);
+    checkSumEquals(spin_product - matrix_sum,
+                   spin_product_expected - matrix_sum_expected, 3);
+    checkSumEquals(matrix_product - boson_sum,
+                   matrix_product_expected - boson_sum_expected, 3);
+    checkSumEquals(boson_product - matrix_sum,
+                   boson_product_expected - matrix_sum_expected, 3);
+    checkSumEquals(spin_product - boson_sum,
+                   spin_product_expected - boson_sum_expected, 3);
+    checkSumEquals(boson_product - spin_sum,
+                   boson_product_expected - spin_sum_expected, 3);
+  }
+
+  // `sum - sum`
+  {
+    checkSumEquals(matrix_sum - matrix_sum,
+                   matrix_sum_expected - matrix_sum_expected, 2);
+    checkSumEquals(spin_sum - spin_sum, spin_sum_expected - spin_sum_expected,
+                   2);
+    checkSumEquals(boson_sum - boson_sum,
+                   boson_sum_expected - boson_sum_expected, 2);
+    checkSumEquals(matrix_sum - spin_sum,
+                   matrix_sum_expected - spin_sum_expected);
+    checkSumEquals(spin_sum - matrix_sum,
+                   spin_sum_expected - matrix_sum_expected);
+    checkSumEquals(matrix_sum - boson_sum,
+                   matrix_sum_expected - boson_sum_expected);
+    checkSumEquals(boson_sum - matrix_sum,
+                   boson_sum_expected - matrix_sum_expected);
+    checkSumEquals(spin_sum - boson_sum,
+                   spin_sum_expected - boson_sum_expected);
+    checkSumEquals(boson_sum - spin_sum,
+                   boson_sum_expected - spin_sum_expected);
+  }
+
+  // `sum * product`
+  {
+    checkSumEquals(matrix_sum * matrix_product,
+                   matrix_sum_expected * matrix_product_expected, 2);
+    checkSumEquals(spin_sum * spin_product,
+                   spin_sum_expected * spin_product_expected, 2);
+    checkSumEquals(boson_sum * boson_product,
+                   boson_sum_expected * boson_product_expected, 2);
+    checkSumEquals(matrix_sum * spin_product,
+                   matrix_sum_expected * spin_product_expected, 2);
+    checkSumEquals(spin_sum * matrix_product,
+                   spin_sum_expected * matrix_product_expected, 2);
+    checkSumEquals(matrix_sum * boson_product,
+                   matrix_sum_expected * boson_product_expected, 2);
+    checkSumEquals(boson_sum * matrix_product,
+                   boson_sum_expected * matrix_product_expected, 2);
+    checkSumEquals(spin_sum * boson_product,
+                   spin_sum_expected * boson_product_expected, 2);
+    checkSumEquals(boson_sum * spin_product,
+                   boson_sum_expected * spin_product_expected, 2);
+  }
+  // `product * sum`
+  {
+    checkSumEquals(matrix_product * matrix_sum,
+                   matrix_product_expected * matrix_sum_expected, 2);
+    checkSumEquals(spin_product * spin_sum,
+                   spin_product_expected * spin_sum_expected, 2);
+    checkSumEquals(boson_product * boson_sum,
+                   boson_product_expected * boson_sum_expected, 2);
+    checkSumEquals(matrix_product * spin_sum,
+                   matrix_product_expected * spin_sum_expected, 2);
+    checkSumEquals(spin_product * matrix_sum,
+                   spin_product_expected * matrix_sum_expected, 2);
+    checkSumEquals(matrix_product * boson_sum,
+                   matrix_product_expected * boson_sum_expected, 2);
+    checkSumEquals(boson_product * matrix_sum,
+                   boson_product_expected * matrix_sum_expected, 2);
+    checkSumEquals(spin_product * boson_sum,
+                   spin_product_expected * boson_sum_expected, 2);
+    checkSumEquals(boson_product * spin_sum,
+                   boson_product_expected * spin_sum_expected, 2);
+  }
+
+  // `sum * sum`
+  {
+    checkSumEquals(matrix_sum * matrix_sum,
+                   matrix_sum_expected * matrix_sum_expected, 3);
+    checkSumEquals(spin_sum * spin_sum, spin_sum_expected * spin_sum_expected,
+                   3);
+    checkSumEquals(boson_sum * boson_sum,
+                   boson_sum_expected * boson_sum_expected, 3);
+    checkSumEquals(matrix_sum * spin_sum,
+                   matrix_sum_expected * spin_sum_expected);
+    checkSumEquals(spin_sum * matrix_sum,
+                   spin_sum_expected * matrix_sum_expected);
+    checkSumEquals(matrix_sum * boson_sum,
+                   matrix_sum_expected * boson_sum_expected);
+    checkSumEquals(boson_sum * matrix_sum,
+                   boson_sum_expected * matrix_sum_expected);
+    checkSumEquals(spin_sum * boson_sum,
+                   spin_sum_expected * boson_sum_expected);
+    checkSumEquals(boson_sum * spin_sum,
+                   boson_sum_expected * spin_sum_expected);
+  }
+
+  // `sum += product`
+  {
+    auto matrix_sum_0 = matrix_sum;
+    matrix_sum_0 += matrix_product;
+    checkSumEquals(matrix_sum_0, matrix_sum_expected + matrix_product_expected,
+                   3);
+
+    auto spin_sum_0 = spin_sum;
+    spin_sum_0 += spin_product;
+    checkSumEquals(spin_sum_0, spin_sum_expected + spin_product_expected, 3);
+
+    auto boson_sum_0 = boson_sum;
+    boson_sum_0 += boson_product;
+    checkSumEquals(boson_sum_0, boson_sum_expected + boson_product_expected,
+                   3);
+
+    matrix_sum_0 = matrix_sum;
+    matrix_sum_0 += spin_product;
+    checkSumEquals(matrix_sum_0, matrix_sum_expected + spin_product_expected,
+                   3);
+
+    matrix_sum_0 = matrix_sum;
+    matrix_sum_0 += boson_product;
+    checkSumEquals(matrix_sum_0, matrix_sum_expected + boson_product_expected,
+                   3);
+  }
+
+  // `sum += sum`
+  {
+    auto matrix_sum_0 = matrix_sum;
+    matrix_sum_0 += matrix_sum;
+    checkSumEquals(matrix_sum_0, matrix_sum_expected + matrix_sum_expected, 2);
+
+    auto spin_sum_0 = spin_sum;
+    spin_sum_0 += spin_sum;
+    checkSumEquals(spin_sum_0, spin_sum_expected + spin_sum_expected, 2);
+
+    auto boson_sum_0 = boson_sum;
+    boson_sum_0 += boson_sum;
+    checkSumEquals(boson_sum_0, boson_sum_expected + boson_sum_expected, 2);
+
+    matrix_sum_0 = matrix_sum;
+    matrix_sum_0 += spin_sum;
+    checkSumEquals(matrix_sum_0, matrix_sum_expected + spin_sum_expected);
+
+    matrix_sum_0 = matrix_sum;
+    matrix_sum_0 += boson_sum;
+    checkSumEquals(matrix_sum_0, matrix_sum_expected + boson_sum_expected);
+  }
boson_sum_0 = boson_sum; + boson_sum_0 += boson_sum; + checkSumEquals(boson_sum_0, boson_sum_expected + boson_sum_expected, 2); + + matrix_sum_0 = matrix_sum; + matrix_sum_0 += spin_sum; + checkSumEquals(matrix_sum_0, matrix_sum_expected + spin_sum_expected); + + matrix_sum_0 = matrix_sum; + matrix_sum_0 += boson_sum; + checkSumEquals(matrix_sum_0, matrix_sum_expected + boson_sum_expected); + } + + // `sum -= product` + { + auto matrix_sum_0 = matrix_sum; + matrix_sum_0 -= matrix_product; + checkSumEquals(matrix_sum_0, matrix_sum_expected - matrix_product_expected, + 3); + + auto spin_sum_0 = spin_sum; + spin_sum_0 -= spin_product; + checkSumEquals(spin_sum_0, spin_sum_expected - spin_product_expected, 3); + + auto boson_sum_0 = boson_sum; + boson_sum_0 -= boson_product; + checkSumEquals(boson_sum_0, boson_sum_expected - boson_product_expected, 3); + + matrix_sum_0 = matrix_sum; + matrix_sum_0 -= spin_product; + checkSumEquals(matrix_sum_0, matrix_sum_expected - spin_product_expected, + 3); + + matrix_sum_0 = matrix_sum; + matrix_sum_0 -= boson_product; + checkSumEquals(matrix_sum_0, matrix_sum_expected - boson_product_expected, + 3); + } + + // `sum -= sum` + { + auto matrix_sum_0 = matrix_sum; + matrix_sum_0 -= matrix_sum; + checkSumEquals(matrix_sum_0, matrix_sum_expected - matrix_sum_expected, 2); + + auto spin_sum_0 = spin_sum; + spin_sum_0 -= spin_sum; + checkSumEquals(spin_sum_0, spin_sum_expected - spin_sum_expected, 2); + + auto boson_sum_0 = boson_sum; + boson_sum_0 -= boson_sum; + checkSumEquals(boson_sum_0, boson_sum_expected - boson_sum_expected, 2); + + matrix_sum_0 = matrix_sum; + matrix_sum_0 -= spin_sum; + checkSumEquals(matrix_sum_0, matrix_sum_expected - spin_sum_expected); + + matrix_sum_0 = matrix_sum; + matrix_sum_0 -= boson_sum; + checkSumEquals(matrix_sum_0, matrix_sum_expected - boson_sum_expected); + } + + // `sum *= product` + { + auto matrix_sum_0 = matrix_sum; + matrix_sum_0 *= matrix_product; + checkSumEquals(matrix_sum_0, matrix_sum_expected * matrix_product_expected, + 2); + + auto spin_sum_0 = spin_sum; + spin_sum_0 *= spin_product; + checkSumEquals(spin_sum_0, spin_sum_expected * spin_product_expected, 2); + + auto boson_sum_0 = boson_sum; + boson_sum_0 *= boson_product; + checkSumEquals(boson_sum_0, boson_sum_expected * boson_product_expected, 2); + + matrix_sum_0 = matrix_sum; + matrix_sum_0 *= spin_product; + checkSumEquals(matrix_sum_0, matrix_sum_expected * spin_product_expected, + 2); + + matrix_sum_0 = matrix_sum; + matrix_sum_0 *= boson_product; + checkSumEquals(matrix_sum_0, matrix_sum_expected * boson_product_expected, + 2); + } + + // `sum *= sum` + { + auto matrix_sum_0 = matrix_sum; + matrix_sum_0 *= matrix_sum; + checkSumEquals(matrix_sum_0, matrix_sum_expected * matrix_sum_expected, 3); + + auto spin_sum_0 = spin_sum; + spin_sum_0 *= spin_sum; + checkSumEquals(spin_sum_0, spin_sum_expected * spin_sum_expected, 3); + + auto boson_sum_0 = boson_sum; + boson_sum_0 *= boson_sum; + checkSumEquals(boson_sum_0, boson_sum_expected * boson_sum_expected, 3); + + matrix_sum_0 = matrix_sum; + matrix_sum_0 *= spin_sum; + checkSumEquals(matrix_sum_0, matrix_sum_expected * spin_sum_expected); + + matrix_sum_0 = matrix_sum; + matrix_sum_0 *= boson_sum; + checkSumEquals(matrix_sum_0, matrix_sum_expected * boson_sum_expected); + } +} diff --git a/unittests/dynamics/operator_sum.cpp b/unittests/dynamics/operator_sum.cpp new file mode 100644 index 0000000000..2685b99a94 --- /dev/null +++ b/unittests/dynamics/operator_sum.cpp @@ -0,0 +1,1373 @@ 
+/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "cudaq/operators.h" +#include "utils.h" +#include <gtest/gtest.h> + +TEST(OperatorExpressions, checkOperatorSumBasics) { + std::vector<int> levels = {2, 3, 4}; + + std::complex<double> value_0 = 0.1 + 0.1; + std::complex<double> value_1 = 0.1 + 1.0; + std::complex<double> value_2 = 2.0 + 0.1; + std::complex<double> value_3 = 2.0 + 1.0; + + {// Same degrees of freedom. + {auto spin0 = cudaq::spin_operator::x(5); + auto spin1 = cudaq::spin_operator::z(5); + auto spin_sum = spin0 + spin1; + + std::vector<int> want_degrees = {5}; + auto spin_matrix = utils::PauliX_matrix() + utils::PauliZ_matrix(); + + ASSERT_TRUE(spin_sum.degrees() == want_degrees); + utils::checkEqual(spin_matrix, spin_sum.to_matrix()); + + for (auto level_count : levels) { + auto op0 = cudaq::matrix_operator::number(5); + auto op1 = cudaq::matrix_operator::parity(5); + + auto sum = op0 + op1; + ASSERT_TRUE(sum.degrees() == want_degrees); + + auto got_matrix = sum.to_matrix({{5, level_count}}); + auto matrix0 = utils::number_matrix(level_count); + auto matrix1 = utils::parity_matrix(level_count); + auto want_matrix = matrix0 + matrix1; + utils::checkEqual(want_matrix, got_matrix); + } +} + +// Different degrees of freedom. +{ + auto spin0 = cudaq::spin_operator::x(0); + auto spin1 = cudaq::spin_operator::z(1); + auto spin_sum = spin0 + spin1; + + std::vector<int> want_degrees = {1, 0}; + auto spin_matrix = + cudaq::kronecker(utils::id_matrix(2), utils::PauliX_matrix()) + + cudaq::kronecker(utils::PauliZ_matrix(), utils::id_matrix(2)); + + ASSERT_TRUE(spin_sum.degrees() == want_degrees); + utils::checkEqual(spin_matrix, spin_sum.to_matrix()); + + for (auto level_count : levels) { + auto op0 = cudaq::matrix_operator::number(0); + auto op1 = cudaq::matrix_operator::parity(1); + + auto got = op0 + op1; + auto got_reverse = op1 + op0; + + ASSERT_TRUE(got.degrees() == want_degrees); + ASSERT_TRUE(got_reverse.degrees() == want_degrees); + + auto got_matrix = got.to_matrix({{0, level_count}, {1, level_count}}); + auto got_matrix_reverse = + got_reverse.to_matrix({{0, level_count}, {1, level_count}}); + + auto identity = utils::id_matrix(level_count); + auto matrix0 = utils::number_matrix(level_count); + auto matrix1 = utils::parity_matrix(level_count); + + auto fullHilbert0 = cudaq::kronecker(identity, matrix0); + auto fullHilbert1 = cudaq::kronecker(matrix1, identity); + auto want_matrix = fullHilbert0 + fullHilbert1; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix, got_matrix_reverse); + } +} + +// Different degrees of freedom, non-consecutive. +// Should produce the same matrices as the above test.
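+// (For intuition, a rough sketch mirroring the checks below - not part of the +// test: for sum = cudaq::spin_operator::x(0) + cudaq::spin_operator::z(2), +// sum.degrees() reports {2, 0}, i.e. descending and with the unused degree 1 +// skipped entirely, and sum.to_matrix() is the 4x4 kronecker(Z, I) + +// kronecker(I, X), the factor for the higher degree sitting leftmost.)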
+{ + auto spin0 = cudaq::spin_operator::x(0); + auto spin1 = cudaq::spin_operator::z(2); + auto spin_sum = spin0 + spin1; + + std::vector<int> want_degrees = {2, 0}; + auto spin_matrix = + cudaq::kronecker(utils::id_matrix(2), utils::PauliX_matrix()) + + cudaq::kronecker(utils::PauliZ_matrix(), utils::id_matrix(2)); + + ASSERT_TRUE(spin_sum.degrees() == want_degrees); + utils::checkEqual(spin_matrix, spin_sum.to_matrix()); + + for (auto level_count : levels) { + auto op0 = cudaq::matrix_operator::number(0); + auto op1 = cudaq::matrix_operator::parity(2); + + auto got = op0 + op1; + auto got_reverse = op1 + op0; + + ASSERT_TRUE(got.degrees() == want_degrees); + ASSERT_TRUE(got_reverse.degrees() == want_degrees); + + auto got_matrix = got.to_matrix({{0, level_count}, {2, level_count}}); + auto got_matrix_reverse = + got_reverse.to_matrix({{0, level_count}, {2, level_count}}); + + auto identity = utils::id_matrix(level_count); + auto matrix0 = utils::number_matrix(level_count); + auto matrix1 = utils::parity_matrix(level_count); + + auto fullHilbert0 = cudaq::kronecker(identity, matrix0); + auto fullHilbert1 = cudaq::kronecker(matrix1, identity); + auto want_matrix = fullHilbert0 + fullHilbert1; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix, got_matrix_reverse); + } +} + +// Different degrees of freedom, non-consecutive but all dimensions +// provided. +{ + auto spin0 = cudaq::spin_operator::x(0); + auto spin1 = cudaq::spin_operator::z(2); + auto spin_sum = spin0 + spin1; + + std::vector<int> want_degrees = {2, 0}; + auto spin_matrix = + cudaq::kronecker(utils::id_matrix(2), utils::PauliX_matrix()) + + cudaq::kronecker(utils::PauliZ_matrix(), utils::id_matrix(2)); + std::unordered_map<int, int> dimensions = {{0, 2}, {1, 2}, {2, 2}}; + + ASSERT_TRUE(spin_sum.degrees() == want_degrees); + utils::checkEqual(spin_matrix, spin_sum.to_matrix(dimensions)); + + for (auto level_count : levels) { + auto op0 = cudaq::matrix_operator::number(0); + auto op1 = cudaq::matrix_operator::parity(2); + + auto got = op0 + op1; + auto got_reverse = op1 + op0; + + std::vector<int> want_degrees = {2, 0}; + ASSERT_TRUE(got.degrees() == want_degrees); + ASSERT_TRUE(got_reverse.degrees() == want_degrees); + + dimensions = {{0, level_count}, {1, level_count}, {2, level_count}}; + auto got_matrix = got.to_matrix(dimensions); + auto got_matrix_reverse = got_reverse.to_matrix(dimensions); + + auto identity = utils::id_matrix(level_count); + auto matrix0 = utils::number_matrix(level_count); + auto matrix1 = utils::parity_matrix(level_count); + std::vector<cudaq::matrix_2> matrices_0 = {identity, matrix0}; + std::vector<cudaq::matrix_2> matrices_1 = {matrix1, identity}; + + auto fullHilbert0 = cudaq::kronecker(matrices_0.begin(), matrices_0.end()); + auto fullHilbert1 = cudaq::kronecker(matrices_1.begin(), matrices_1.end()); + auto want_matrix = fullHilbert0 + fullHilbert1; + auto want_matrix_reverse = fullHilbert1 + fullHilbert0; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } +} +} + +// Scalar Ops against Elementary Ops +{ + auto function = [](const std::unordered_map<std::string, std::complex<double>> + &parameters) { + auto entry = parameters.find("value"); + if (entry == parameters.end()) + throw std::runtime_error("value not defined in parameters"); + return entry->second; + }; + + // matrix operator against constant + { + auto op = cudaq::matrix_operator::parity(0); + auto scalar_op = cudaq::scalar_operator(value_0); + auto sum = scalar_op + op; + auto reverse = op + scalar_op; + + std::vector<int> want_degrees = {0}; + auto op_matrix 
= utils::parity_matrix(2); + auto scalar_matrix = value_0 * utils::id_matrix(2); + + ASSERT_TRUE(sum.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + utils::checkEqual(scalar_matrix + op_matrix, sum.to_matrix({{0, 2}})); + utils::checkEqual(scalar_matrix + op_matrix, reverse.to_matrix({{0, 2}})); + } + + // spin operator against constant + { + auto op = cudaq::spin_operator::x(0); + auto scalar_op = cudaq::scalar_operator(value_0); + auto sum = scalar_op + op; + auto reverse = op + scalar_op; + + std::vector<int> want_degrees = {0}; + auto op_matrix = utils::PauliX_matrix(); + auto scalar_matrix = value_0 * utils::id_matrix(2); + + ASSERT_TRUE(sum.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + utils::checkEqual(scalar_matrix + op_matrix, sum.to_matrix()); + utils::checkEqual(scalar_matrix + op_matrix, reverse.to_matrix()); + } + + // matrix operator against constant from lambda + { + auto op = cudaq::matrix_operator::parity(1); + auto scalar_op = cudaq::scalar_operator(function); + auto sum = scalar_op + op; + auto reverse = op + scalar_op; + + std::vector<int> want_degrees = {1}; + auto op_matrix = utils::parity_matrix(2); + auto scalar_matrix = + scalar_op.evaluate({{"value", 0.3}}) * utils::id_matrix(2); + + ASSERT_TRUE(sum.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + utils::checkEqual(scalar_matrix + op_matrix, + sum.to_matrix({{1, 2}}, {{"value", 0.3}})); + utils::checkEqual(scalar_matrix + op_matrix, + reverse.to_matrix({{1, 2}}, {{"value", 0.3}})); + } + + // spin operator against constant from lambda + { + auto op = cudaq::spin_operator::x(1); + auto scalar_op = cudaq::scalar_operator(function); + auto sum = scalar_op + op; + auto reverse = op + scalar_op; + + std::vector<int> want_degrees = {1}; + auto op_matrix = utils::PauliX_matrix(); + auto scalar_matrix = + scalar_op.evaluate({{"value", 0.3}}) * utils::id_matrix(2); + + ASSERT_TRUE(sum.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + utils::checkEqual(scalar_matrix + op_matrix, + sum.to_matrix({{1, 2}}, {{"value", 0.3}})); + utils::checkEqual(scalar_matrix + op_matrix, + reverse.to_matrix({{1, 2}}, {{"value", 0.3}})); + } +} +} + +TEST(OperatorExpressions, checkOperatorSumAgainstScalars) { + int level_count = 3; + std::complex<double> value = 0.1 + 0.1j; + double double_value = 0.1; + + // `operator_sum + double` + { + auto original = cudaq::matrix_operator::momentum(1) + + cudaq::matrix_operator::position(2); + + auto sum = original + double_value; + auto reverse = double_value + original; + + ASSERT_TRUE(sum.num_terms() == 3); + ASSERT_TRUE(reverse.num_terms() == 3); + + auto got_matrix = sum.to_matrix({{1, level_count}, {2, level_count + 1}}); + auto got_matrix_reverse = + reverse.to_matrix({{1, level_count}, {2, level_count + 1}}); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(level_count + 1), + utils::momentum_matrix(level_count)); + auto matrix1 = cudaq::kronecker(utils::position_matrix(level_count + 1), + utils::id_matrix(level_count)); + auto scaled_identity = + double_value * utils::id_matrix((level_count) * (level_count + 1)); + auto want_matrix = matrix0 + matrix1 + scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix, got_matrix_reverse); + } + + // `operator_sum + std::complex` + { + auto original = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::parity(2); + + auto sum = original + value; + auto reverse = value + original; + + 
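+ // (Note, grounded in the checks below: adding a scalar to an operator_sum + // appends one extra term - the identity scaled by the scalar - so num_terms + // goes from 2 to 3, and in matrix form the scalar shows up as value times + // the identity on the full level_count * (level_count + 1) dimensional + // space.)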
ASSERT_TRUE(sum.num_terms() == 3); + ASSERT_TRUE(reverse.num_terms() == 3); + + auto got_matrix = sum.to_matrix({{1, level_count}, {2, level_count + 1}}); + auto got_matrix_reverse = + reverse.to_matrix({{1, level_count}, {2, level_count + 1}}); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(level_count + 1), + utils::parity_matrix(level_count)); + auto matrix1 = cudaq::kronecker(utils::parity_matrix(level_count + 1), + utils::id_matrix(level_count)); + auto scaled_identity = + value * utils::id_matrix((level_count) * (level_count + 1)); + auto want_matrix = matrix0 + matrix1 + scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix, got_matrix_reverse); + } + + // `spin sum + std::complex` + { + auto original = cudaq::spin_operator::x(1) + cudaq::spin_operator::y(2); + + auto sum = original + value; + auto reverse = value + original; + + ASSERT_TRUE(sum.num_terms() == 3); + ASSERT_TRUE(reverse.num_terms() == 3); + + auto got_matrix = sum.to_matrix(); + auto got_matrix_reverse = reverse.to_matrix(); + + auto matrix0 = + cudaq::kronecker(utils::id_matrix(2), utils::PauliX_matrix()); + auto matrix1 = + cudaq::kronecker(utils::PauliY_matrix(), utils::id_matrix(2)); + auto scaled_identity = value * utils::id_matrix(2 * 2); + auto want_matrix = matrix0 + matrix1 + scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix, got_matrix_reverse); + } + + // `operator_sum + scalar_operator` + { + level_count = 2; + auto original = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::parity(2); + + auto sum = original + cudaq::scalar_operator(value); + auto reverse = cudaq::scalar_operator(value) + original; + + ASSERT_TRUE(sum.num_terms() == 3); + ASSERT_TRUE(reverse.num_terms() == 3); + + auto got_matrix = sum.to_matrix({{1, level_count}, {2, level_count + 1}}); + auto got_matrix_reverse = + reverse.to_matrix({{1, level_count}, {2, level_count + 1}}); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(level_count + 1), + utils::parity_matrix(level_count)); + auto matrix1 = cudaq::kronecker(utils::parity_matrix(level_count + 1), + utils::id_matrix(level_count)); + auto sum_matrix = matrix0 + matrix1; + auto scaled_identity = + value * utils::id_matrix((level_count) * (level_count + 1)); + + auto want_matrix = sum_matrix + scaled_identity; + auto want_matrix_reverse = scaled_identity + sum_matrix; + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + // `operator_sum - double` + { + auto original = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::number(2); + + auto difference = original - double_value; + auto reverse = double_value - original; + + ASSERT_TRUE(difference.num_terms() == 3); + ASSERT_TRUE(reverse.num_terms() == 3); + + auto got_matrix = + difference.to_matrix({{1, level_count}, {2, level_count + 1}}); + auto got_matrix_reverse = + reverse.to_matrix({{1, level_count}, {2, level_count + 1}}); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(level_count + 1), + utils::parity_matrix(level_count)); + auto matrix1 = cudaq::kronecker(utils::number_matrix(level_count + 1), + utils::id_matrix(level_count)); + auto sum_matrix = matrix0 + matrix1; + auto scaled_identity = + double_value * utils::id_matrix((level_count) * (level_count + 1)); + + auto want_matrix = sum_matrix - scaled_identity; + auto want_matrix_reverse = scaled_identity - sum_matrix; + utils::checkEqual(want_matrix, got_matrix); + 
utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + // `spin sum - double` + { + auto original = cudaq::spin_operator::x(1) + cudaq::spin_operator::z(2); + + auto difference = original - double_value; + auto reverse = double_value - original; + + ASSERT_TRUE(difference.num_terms() == 3); + ASSERT_TRUE(reverse.num_terms() == 3); + + auto got_matrix = difference.to_matrix(); + auto got_matrix_reverse = reverse.to_matrix(); + + auto matrix0 = + cudaq::kronecker(utils::id_matrix(2), utils::PauliX_matrix()); + auto matrix1 = + cudaq::kronecker(utils::PauliZ_matrix(), utils::id_matrix(2)); + auto sum_matrix = matrix0 + matrix1; + auto scaled_identity = double_value * utils::id_matrix(2 * 2); + + auto want_matrix = sum_matrix - scaled_identity; + auto want_matrix_reverse = scaled_identity - sum_matrix; + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + // `operator_sum - std::complex` + { + auto original = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::parity(2); + + auto difference = original - value; + auto reverse = value - original; + + ASSERT_TRUE(difference.num_terms() == 3); + ASSERT_TRUE(reverse.num_terms() == 3); + + auto got_matrix = + difference.to_matrix({{1, level_count}, {2, level_count + 1}}); + auto got_matrix_reverse = + reverse.to_matrix({{1, level_count}, {2, level_count + 1}}); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(level_count + 1), + utils::parity_matrix(level_count)); + auto matrix1 = cudaq::kronecker(utils::parity_matrix(level_count + 1), + utils::id_matrix(level_count)); + auto sum_matrix = matrix0 + matrix1; + auto scaled_identity = + value * utils::id_matrix((level_count) * (level_count + 1)); + auto want_matrix = sum_matrix - scaled_identity; + auto want_matrix_reverse = scaled_identity - sum_matrix; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + // `operator_sum - scalar_operator` + { + auto original = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::parity(2); + + auto difference = original - cudaq::scalar_operator(value); + auto reverse = cudaq::scalar_operator(value) - original; + + ASSERT_TRUE(difference.num_terms() == 3); + ASSERT_TRUE(reverse.num_terms() == 3); + + auto got_matrix = + difference.to_matrix({{1, level_count}, {2, level_count + 1}}); + auto got_matrix_reverse = + reverse.to_matrix({{1, level_count}, {2, level_count + 1}}); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(level_count + 1), + utils::parity_matrix(level_count)); + auto matrix1 = cudaq::kronecker(utils::parity_matrix(level_count + 1), + utils::id_matrix(level_count)); + auto sum_matrix = matrix0 + matrix1; + auto scaled_identity = + value * utils::id_matrix((level_count) * (level_count + 1)); + + auto want_matrix = sum_matrix - scaled_identity; + auto want_matrix_reverse = scaled_identity - sum_matrix; + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + // `operator_sum * double` + { + auto sum = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::parity(2); + + auto product = sum * double_value; + auto reverse = double_value * sum; + + ASSERT_TRUE(product.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + + for (auto term : product.get_terms()) { + ASSERT_TRUE(term.num_terms() == 1); + ASSERT_TRUE(term.get_coefficient().evaluate() == + std::complex(double_value)); + } + + for (auto term : 
reverse.get_terms()) { + ASSERT_TRUE(term.num_terms() == 1); + ASSERT_TRUE(term.get_coefficient().evaluate() == + std::complex(double_value)); + } + + auto got_matrix = + product.to_matrix({{1, level_count}, {2, level_count + 1}}); + auto got_matrix_reverse = + reverse.to_matrix({{1, level_count}, {2, level_count + 1}}); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(level_count + 1), + utils::parity_matrix(level_count)); + auto matrix1 = cudaq::kronecker(utils::parity_matrix(level_count + 1), + utils::id_matrix(level_count)); + auto scaled_identity = + double_value * utils::id_matrix((level_count) * (level_count + 1)); + auto want_matrix = (matrix0 + matrix1) * scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix, got_matrix_reverse); + } + + // `operator_sum * std::complex` + { + auto sum = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::parity(2); + + auto product = sum * value; + auto reverse = value * sum; + + ASSERT_TRUE(product.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + + for (auto term : product.get_terms()) { + ASSERT_TRUE(term.num_terms() == 1); + ASSERT_TRUE(term.get_coefficient().evaluate() == value); + } + + for (auto term : reverse.get_terms()) { + ASSERT_TRUE(term.num_terms() == 1); + ASSERT_TRUE(term.get_coefficient().evaluate() == value); + } + + auto got_matrix = + product.to_matrix({{1, level_count}, {2, level_count + 1}}); + auto got_matrix_reverse = + reverse.to_matrix({{1, level_count}, {2, level_count + 1}}); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(level_count + 1), + utils::parity_matrix(level_count)); + auto matrix1 = cudaq::kronecker(utils::parity_matrix(level_count + 1), + utils::id_matrix(level_count)); + auto scaled_identity = + value * utils::id_matrix((level_count) * (level_count + 1)); + auto want_matrix = (matrix0 + matrix1) * scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix, got_matrix_reverse); + } + + // `operator_sum * scalar_operator` + { + auto sum = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::parity(2); + + auto product = sum * cudaq::scalar_operator(value); + auto reverse = cudaq::scalar_operator(value) * sum; + + ASSERT_TRUE(product.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + + for (auto term : product.get_terms()) { + ASSERT_TRUE(term.num_terms() == 1); + ASSERT_TRUE(term.get_coefficient().evaluate() == value); + } + + for (auto term : reverse.get_terms()) { + ASSERT_TRUE(term.num_terms() == 1); + ASSERT_TRUE(term.get_coefficient().evaluate() == value); + } + + auto got_matrix = + product.to_matrix({{1, level_count}, {2, level_count + 1}}); + auto got_matrix_reverse = + reverse.to_matrix({{1, level_count}, {2, level_count + 1}}); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(level_count + 1), + utils::parity_matrix(level_count)); + auto matrix1 = cudaq::kronecker(utils::parity_matrix(level_count + 1), + utils::id_matrix(level_count)); + auto sum_matrix = matrix0 + matrix1; + auto scaled_identity = + value * utils::id_matrix((level_count) * (level_count + 1)); + + auto want_matrix = sum_matrix * scaled_identity; + auto want_matrix_reverse = scaled_identity * sum_matrix; + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + // `spin sum * scalar_operator` + { + auto sum = cudaq::spin_operator::i(1) + cudaq::spin_operator::y(2); + + auto product = sum * cudaq::scalar_operator(value); + auto reverse = 
cudaq::scalar_operator(value) * sum; + + ASSERT_TRUE(product.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + + for (auto term : product.get_terms()) { + ASSERT_TRUE(term.num_terms() == 1); + ASSERT_TRUE(term.get_coefficient().evaluate() == value); + } + + for (auto term : reverse.get_terms()) { + ASSERT_TRUE(term.num_terms() == 1); + ASSERT_TRUE(term.get_coefficient().evaluate() == value); + } + + auto got_matrix = product.to_matrix(); + auto got_matrix_reverse = reverse.to_matrix(); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(2), utils::id_matrix(2)); + auto matrix1 = + cudaq::kronecker(utils::PauliY_matrix(), utils::id_matrix(2)); + auto scaled_identity = value * utils::id_matrix(2 * 2); + auto want_matrix = (matrix0 + matrix1) * scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix, got_matrix_reverse); + } + + // `operator_sum *= double` + { + auto sum = + cudaq::matrix_operator::squeeze(1) + cudaq::matrix_operator::squeeze(2); + + sum *= double_value; + + ASSERT_TRUE(sum.num_terms() == 2); + for (auto term : sum.get_terms()) { + ASSERT_TRUE(term.num_terms() == 1); + ASSERT_TRUE(term.get_coefficient().evaluate() == + std::complex(double_value)); + } + + auto got_matrix = sum.to_matrix({{1, level_count}, {2, level_count + 1}}, + {{"squeezing", value}}); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(level_count + 1), + utils::squeeze_matrix(level_count, value)); + auto matrix1 = + cudaq::kronecker(utils::squeeze_matrix(level_count + 1, value), + utils::id_matrix(level_count)); + auto sum_matrix = matrix0 + matrix1; + auto scaled_identity = + double_value * utils::id_matrix((level_count) * (level_count + 1)); + + auto want_matrix = sum_matrix * scaled_identity; + utils::checkEqual(want_matrix, got_matrix); + } + + // `spin sum *= double` + { + auto sum = cudaq::spin_operator::y(1) + cudaq::spin_operator::i(2); + + sum *= double_value; + + ASSERT_TRUE(sum.num_terms() == 2); + for (auto term : sum.get_terms()) { + ASSERT_TRUE(term.num_terms() == 1); + ASSERT_TRUE(term.get_coefficient().evaluate() == + std::complex(double_value)); + } + + auto got_matrix = sum.to_matrix(); + auto matrix0 = + cudaq::kronecker(utils::id_matrix(2), utils::PauliY_matrix()); + auto matrix1 = cudaq::kronecker(utils::id_matrix(2), utils::id_matrix(2)); + auto scaled_identity = double_value * utils::id_matrix(2 * 2); + auto want_matrix = (matrix0 + matrix1) * scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + } + + // `operator_sum *= std::complex` + { + auto sum = + cudaq::matrix_operator::displace(1) + cudaq::matrix_operator::parity(2); + + sum *= value; + + ASSERT_TRUE(sum.num_terms() == 2); + for (auto term : sum.get_terms()) { + ASSERT_TRUE(term.num_terms() == 1); + ASSERT_TRUE(term.get_coefficient().evaluate() == value); + } + + auto got_matrix = sum.to_matrix({{1, level_count}, {2, level_count + 1}}, + {{"displacement", value}}); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(level_count + 1), + utils::displace_matrix(level_count, value)); + auto matrix1 = cudaq::kronecker(utils::parity_matrix(level_count + 1), + utils::id_matrix(level_count)); + auto scaled_identity = + value * utils::id_matrix((level_count) * (level_count + 1)); + auto want_matrix = (matrix0 + matrix1) * scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + } + + // `operator_sum *= scalar_operator` + { + auto sum = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::momentum(2); + + sum *= cudaq::scalar_operator(value); + + 
ASSERT_TRUE(sum.num_terms() == 2); + for (auto term : sum.get_terms()) { + ASSERT_TRUE(term.num_terms() == 1); + ASSERT_TRUE(term.get_coefficient().evaluate() == value); + } + + auto got_matrix = sum.to_matrix( + {{0, level_count}, {1, level_count}, {2, level_count + 1}}); + + std::vector matrices_1 = { + utils::id_matrix(level_count + 1), utils::parity_matrix(level_count)}; + std::vector matrices_2 = { + utils::momentum_matrix(level_count + 1), utils::id_matrix(level_count)}; + auto matrix0 = cudaq::kronecker(matrices_1.begin(), matrices_1.end()); + auto matrix1 = cudaq::kronecker(matrices_2.begin(), matrices_2.end()); + auto scaled_identity = + value * utils::id_matrix((level_count + 1) * level_count); + + auto want_matrix = (matrix0 + matrix1) * scaled_identity; + utils::checkEqual(want_matrix, got_matrix); + } + + // `operator_sum += double` + { + auto sum = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::parity(2); + + sum += double_value; + + ASSERT_TRUE(sum.num_terms() == 3); + + auto got_matrix = sum.to_matrix({{1, level_count}, {2, level_count + 1}}); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(level_count + 1), + utils::parity_matrix(level_count)); + auto matrix1 = cudaq::kronecker(utils::parity_matrix(level_count + 1), + utils::id_matrix(level_count)); + auto scaled_identity = + double_value * utils::id_matrix((level_count) * (level_count + 1)); + auto want_matrix = matrix0 + matrix1 + scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + } + + // `spin sum += double` + { + auto sum = cudaq::spin_operator::y(1) + cudaq::spin_operator::y(2); + + sum += double_value; + ASSERT_TRUE(sum.num_terms() == 3); + + auto got_matrix = sum.to_matrix({{1, 2}, {2, 2}}); + auto matrix0 = + cudaq::kronecker(utils::id_matrix(2), utils::PauliY_matrix()); + auto matrix1 = + cudaq::kronecker(utils::PauliY_matrix(), utils::id_matrix(2)); + auto scaled_identity = double_value * utils::id_matrix(2 * 2); + auto want_matrix = matrix0 + matrix1 + scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + } + + // `operator_sum += std::complex` + { + auto sum = cudaq::matrix_operator::momentum(1) + + cudaq::matrix_operator::squeeze(2); + + sum += value; + + ASSERT_TRUE(sum.num_terms() == 3); + + auto got_matrix = sum.to_matrix({{1, level_count}, {2, level_count + 1}}, + {{"squeezing", value}}); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(level_count + 1), + utils::momentum_matrix(level_count)); + auto matrix1 = + cudaq::kronecker(utils::squeeze_matrix(level_count + 1, value), + utils::id_matrix(level_count)); + auto scaled_identity = + value * utils::id_matrix((level_count) * (level_count + 1)); + auto want_matrix = matrix0 + matrix1 + scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + } + + // `operator_sum += scalar_operator` + { + auto sum = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::position(2); + + sum += cudaq::scalar_operator(value); + + ASSERT_TRUE(sum.num_terms() == 3); + + auto got_matrix = sum.to_matrix( + {{0, level_count}, {1, level_count}, {2, level_count + 1}}); + + std::vector matrices_1 = { + utils::id_matrix(level_count + 1), utils::parity_matrix(level_count)}; + std::vector matrices_2 = { + utils::position_matrix(level_count + 1), utils::id_matrix(level_count)}; + auto matrix0 = cudaq::kronecker(matrices_1.begin(), matrices_1.end()); + auto matrix1 = cudaq::kronecker(matrices_2.begin(), matrices_2.end()); + auto scaled_identity = + value * utils::id_matrix((level_count + 1) * level_count); + + auto 
want_matrix = matrix0 + matrix1 + scaled_identity; + utils::checkEqual(want_matrix, got_matrix); + } + + // `operator_sum -= double` + { + auto sum = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::parity(2); + + sum -= double_value; + + ASSERT_TRUE(sum.num_terms() == 3); + + auto got_matrix = sum.to_matrix({{1, level_count}, {2, level_count + 1}}); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(level_count + 1), + utils::parity_matrix(level_count)); + auto matrix1 = cudaq::kronecker(utils::parity_matrix(level_count + 1), + utils::id_matrix(level_count)); + + auto sum_matrix = matrix0 + matrix1; + auto scaled_identity = + double_value * utils::id_matrix((level_count) * (level_count + 1)); + + auto want_matrix = sum_matrix - scaled_identity; + utils::checkEqual(want_matrix, got_matrix); + } + + // `operator_sum -= std::complex` + { + auto sum = + cudaq::matrix_operator::position(1) + cudaq::matrix_operator::number(2); + + sum -= value; + + ASSERT_TRUE(sum.num_terms() == 3); + + auto got_matrix = sum.to_matrix({{1, level_count}, {2, level_count + 1}}); + + auto matrix0 = cudaq::kronecker(utils::id_matrix(level_count + 1), + utils::position_matrix(level_count)); + auto matrix1 = cudaq::kronecker(utils::number_matrix(level_count + 1), + utils::id_matrix(level_count)); + auto scaled_identity = + value * utils::id_matrix((level_count) * (level_count + 1)); + auto want_matrix = matrix0 + matrix1 - scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + } + + // `operator_sum -= scalar_operator` + { + auto sum = + cudaq::matrix_operator::number(1) + cudaq::matrix_operator::identity(2); + + sum -= cudaq::scalar_operator(value); + + ASSERT_TRUE(sum.num_terms() == 3); + + auto got_matrix = sum.to_matrix( + {{0, level_count}, {1, level_count}, {2, level_count + 1}}); + + std::vector matrices_1 = { + utils::id_matrix(level_count + 1), utils::number_matrix(level_count)}; + std::vector matrices_2 = { + utils::id_matrix(level_count + 1), utils::id_matrix(level_count)}; + auto matrix0 = cudaq::kronecker(matrices_1.begin(), matrices_1.end()); + auto matrix1 = cudaq::kronecker(matrices_2.begin(), matrices_2.end()); + auto scaled_identity = + value * utils::id_matrix((level_count + 1) * level_count); + + auto want_matrix = (matrix0 + matrix1) - scaled_identity; + utils::checkEqual(want_matrix, got_matrix); + } + + // `spin sum -= scalar_operator` + { + auto sum = cudaq::spin_operator::z(1) + cudaq::spin_operator::y(2); + + sum -= cudaq::scalar_operator(value); + ASSERT_TRUE(sum.num_terms() == 3); + + auto got_matrix = sum.to_matrix(); + + std::vector matrices_1 = {utils::id_matrix(2), + utils::PauliZ_matrix()}; + std::vector matrices_2 = {utils::PauliY_matrix(), + utils::id_matrix(2)}; + auto matrix0 = cudaq::kronecker(matrices_1.begin(), matrices_1.end()); + auto matrix1 = cudaq::kronecker(matrices_2.begin(), matrices_2.end()); + auto scaled_identity = value * utils::id_matrix(2 * 2); + auto want_matrix = matrix0 + matrix1 - scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + } +} + +// FIXME: add tests to check sums against elementary + +// FIXME: add tests to combine general sum with spin product +TEST(OperatorExpressions, checkOperatorSumAgainstProduct) { + // NOTE: Much of the simpler arithmetic between the two is tested in the + // product operator test file. This mainly just tests the assignment operators + // between the two types. 
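+ // A rough sketch of the algebra exercised below, with hypothetical + // single-target operators A, B, C, D: given sum = A(1) + B(2) and + // product = C(0) * D(1), + // sum += product -> A(1) + B(2) + C(0)*D(1) (2 terms become 3), + // sum *= product -> A(1)*C(0)*D(1) + B(2)*C(0)*D(1) (still 2 terms, but + // each term now carries the product's factors).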
+ int level_count = 2; + + // `operator_sum += product_operator` + { + auto product = + cudaq::matrix_operator::number(0) * cudaq::matrix_operator::number(1); + auto sum = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::parity(2); + + sum += product; + + ASSERT_TRUE(sum.num_terms() == 3); + + auto got_matrix = sum.to_matrix( + {{0, level_count}, {1, level_count + 1}, {2, level_count + 2}}); + std::vector matrices_0_0 = { + utils::id_matrix(level_count + 2), utils::id_matrix(level_count + 1), + utils::number_matrix(level_count)}; + std::vector matrices_0_1 = { + utils::id_matrix(level_count + 2), + utils::number_matrix(level_count + 1), utils::id_matrix(level_count)}; + + std::vector matrices_1_0 = { + utils::id_matrix(level_count + 2), + utils::parity_matrix(level_count + 1), utils::id_matrix(level_count)}; + std::vector matrices_1_1 = { + utils::parity_matrix(level_count + 2), + utils::id_matrix(level_count + 1), utils::id_matrix(level_count)}; + + auto product_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto sum_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) + + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = sum_matrix + product_matrix; + utils::checkEqual(want_matrix, got_matrix); + } + + // `operator_sum -= product_operator` + { + auto product = + cudaq::matrix_operator::number(0) * cudaq::matrix_operator::number(1); + auto sum = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::parity(2); + + sum -= product; + + ASSERT_TRUE(sum.num_terms() == 3); + + auto got_matrix = sum.to_matrix( + {{0, level_count}, {1, level_count + 1}, {2, level_count + 2}}); + std::vector matrices_0_0 = { + utils::id_matrix(level_count + 2), utils::id_matrix(level_count + 1), + utils::number_matrix(level_count)}; + std::vector matrices_0_1 = { + utils::id_matrix(level_count + 2), + utils::number_matrix(level_count + 1), utils::id_matrix(level_count)}; + + std::vector matrices_1_0 = { + utils::id_matrix(level_count + 2), + utils::parity_matrix(level_count + 1), utils::id_matrix(level_count)}; + std::vector matrices_1_1 = { + utils::parity_matrix(level_count + 2), + utils::id_matrix(level_count + 1), utils::id_matrix(level_count)}; + + auto product_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto sum_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) + + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = sum_matrix - product_matrix; + utils::checkEqual(want_matrix, got_matrix); + } + + // `operator_sum *= product_operator` + { + auto product = + cudaq::matrix_operator::number(0) * cudaq::matrix_operator::number(1); + auto sum = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::parity(2); + + sum *= product; + + ASSERT_TRUE(sum.num_terms() == 2); + for (auto term : sum.get_terms()) { + ASSERT_TRUE(term.num_terms() == 3); + } + + auto got_matrix = sum.to_matrix( + {{0, level_count}, {1, level_count + 1}, {2, level_count + 2}}); + std::vector matrices_0_0 = { + utils::id_matrix(level_count + 2), utils::id_matrix(level_count + 1), + utils::number_matrix(level_count)}; + std::vector matrices_0_1 = { + utils::id_matrix(level_count + 2), + utils::number_matrix(level_count + 1), utils::id_matrix(level_count)}; + + std::vector matrices_1_0 = { + utils::id_matrix(level_count + 2), + 
utils::parity_matrix(level_count + 1), utils::id_matrix(level_count)}; + std::vector<cudaq::matrix_2> matrices_1_1 = { + utils::parity_matrix(level_count + 2), + utils::id_matrix(level_count + 1), utils::id_matrix(level_count)}; + + auto product_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto sum_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) + + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = sum_matrix * product_matrix; + utils::checkEqual(want_matrix, got_matrix); + } +} + +// FIXME: add tests to combine general sum with spin sum +TEST(OperatorExpressions, checkOperatorSumAgainstOperatorSum) { + int level_count = 2; + + // `operator_sum + operator_sum` + { + auto sum_0 = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::parity(2); + auto sum_1 = cudaq::matrix_operator::parity(0) + + cudaq::matrix_operator::number(1) + + cudaq::matrix_operator::parity(3); + + auto sum = sum_0 + sum_1; + + ASSERT_TRUE(sum.num_terms() == 5); + + auto got_matrix = sum.to_matrix({{0, level_count}, + {1, level_count + 1}, + {2, level_count + 2}, + {3, level_count + 3}}); + + std::vector<cudaq::matrix_2> matrices_0_0; + std::vector<cudaq::matrix_2> matrices_0_1; + std::vector<cudaq::matrix_2> matrices_1_0; + std::vector<cudaq::matrix_2> matrices_1_1; + std::vector<cudaq::matrix_2> matrices_1_2; + + matrices_0_0 = { + utils::id_matrix(level_count + 3), utils::id_matrix(level_count + 2), + utils::parity_matrix(level_count + 1), utils::id_matrix(level_count)}; + matrices_0_1 = {utils::id_matrix(level_count + 3), + utils::parity_matrix(level_count + 2), + utils::id_matrix(level_count + 1), + utils::id_matrix(level_count)}; + matrices_1_0 = { + utils::id_matrix(level_count + 3), utils::id_matrix(level_count + 2), + utils::id_matrix(level_count + 1), utils::parity_matrix(level_count)}; + matrices_1_1 = { + utils::id_matrix(level_count + 3), utils::id_matrix(level_count + 2), + utils::number_matrix(level_count + 1), utils::id_matrix(level_count)}; + matrices_1_2 = {utils::parity_matrix(level_count + 3), + utils::id_matrix(level_count + 2), + utils::id_matrix(level_count + 1), + utils::id_matrix(level_count)}; + + auto sum_0_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) + + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto sum_1_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) + + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()) + + cudaq::kronecker(matrices_1_2.begin(), matrices_1_2.end()); + + auto want_matrix = sum_0_matrix + sum_1_matrix; + utils::checkEqual(want_matrix, got_matrix); + } + + // `operator_sum - operator_sum` + { + auto sum_0 = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::position(2); + auto sum_1 = cudaq::matrix_operator::parity(0) + + cudaq::matrix_operator::number(1) + + cudaq::matrix_operator::momentum(3); + + auto difference = sum_0 - sum_1; + + ASSERT_TRUE(difference.num_terms() == 5); + + auto got_matrix = difference.to_matrix({{0, level_count}, + {1, level_count + 1}, + {2, level_count + 2}, + {3, level_count + 3}}); + + std::vector<cudaq::matrix_2> matrices_0_0; + std::vector<cudaq::matrix_2> matrices_0_1; + std::vector<cudaq::matrix_2> matrices_1_0; + std::vector<cudaq::matrix_2> matrices_1_1; + std::vector<cudaq::matrix_2> matrices_1_2; + + matrices_0_0 = { + utils::id_matrix(level_count + 3), utils::id_matrix(level_count + 2), + utils::parity_matrix(level_count + 1), utils::id_matrix(level_count)}; + matrices_0_1 = {utils::id_matrix(level_count + 3), + utils::position_matrix(level_count + 2), + utils::id_matrix(level_count + 1), + utils::id_matrix(level_count)}; + matrices_1_0 = { + utils::id_matrix(level_count + 3), utils::id_matrix(level_count + 2), + utils::id_matrix(level_count + 1), utils::parity_matrix(level_count)}; + matrices_1_1 = { + utils::id_matrix(level_count + 3), utils::id_matrix(level_count + 2), + utils::number_matrix(level_count + 1), utils::id_matrix(level_count)}; + matrices_1_2 = {utils::momentum_matrix(level_count + 3), + utils::id_matrix(level_count + 2), + utils::id_matrix(level_count + 1), + utils::id_matrix(level_count)}; + + auto sum_0_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) + + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto sum_1_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) + + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()) + + cudaq::kronecker(matrices_1_2.begin(), matrices_1_2.end()); + + auto want_matrix = sum_0_matrix - sum_1_matrix; + utils::checkEqual(want_matrix, got_matrix); + } + + // `operator_sum * operator_sum` + { + auto sum_0 = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::parity(2); + auto sum_1 = cudaq::matrix_operator::parity(0) + + cudaq::matrix_operator::number(1) + + cudaq::matrix_operator::parity(3); + + auto sum_product = sum_0 * sum_1; + auto sum_product_reverse = sum_1 * sum_0; + + ASSERT_TRUE(sum_product.num_terms() == 6); + ASSERT_TRUE(sum_product_reverse.num_terms() == 6); + for (auto term : sum_product.get_terms()) + ASSERT_TRUE(term.num_terms() == 2); + for (auto term : sum_product_reverse.get_terms()) + ASSERT_TRUE(term.num_terms() == 2); + + auto got_matrix = sum_product.to_matrix({{0, level_count}, + {1, level_count + 1}, + {2, level_count + 2}, + {3, level_count + 3}}); + auto got_matrix_reverse = + sum_product_reverse.to_matrix({{0, level_count}, + {1, level_count + 1}, + {2, level_count + 2}, + {3, level_count + 3}}); + + std::vector<cudaq::matrix_2> matrices_0_0; + std::vector<cudaq::matrix_2> matrices_0_1; + std::vector<cudaq::matrix_2> matrices_1_0; + std::vector<cudaq::matrix_2> matrices_1_1; + std::vector<cudaq::matrix_2> matrices_1_2; + + matrices_0_0 = { + utils::id_matrix(level_count + 3), utils::id_matrix(level_count + 2), + utils::parity_matrix(level_count + 1), utils::id_matrix(level_count)}; + matrices_0_1 = {utils::id_matrix(level_count + 3), + utils::parity_matrix(level_count + 2), + utils::id_matrix(level_count + 1), + utils::id_matrix(level_count)}; + matrices_1_0 = { + utils::id_matrix(level_count + 3), utils::id_matrix(level_count + 2), + utils::id_matrix(level_count + 1), utils::parity_matrix(level_count)}; + matrices_1_1 = { + utils::id_matrix(level_count + 3), utils::id_matrix(level_count + 2), + utils::number_matrix(level_count + 1), utils::id_matrix(level_count)}; + matrices_1_2 = {utils::parity_matrix(level_count + 3), + utils::id_matrix(level_count + 2), + utils::id_matrix(level_count + 1), + utils::id_matrix(level_count)}; + + auto sum_0_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) + + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto sum_1_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) + + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()) + + cudaq::kronecker(matrices_1_2.begin(), matrices_1_2.end()); + + auto want_matrix = sum_0_matrix * sum_1_matrix; + auto want_matrix_reverse = sum_1_matrix * sum_0_matrix; + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + // `operator_sum *= operator_sum` + { + auto sum = + cudaq::matrix_operator::parity(1) + cudaq::matrix_operator::parity(2); + auto sum_1 = cudaq::matrix_operator::parity(0) + + cudaq::matrix_operator::number(1) + + cudaq::matrix_operator::parity(3); + + sum *= sum_1; + + ASSERT_TRUE(sum.num_terms() == 6); + for (auto term : sum.get_terms()) + ASSERT_TRUE(term.num_terms() == 2); + + auto got_matrix = sum.to_matrix({{0, level_count}, + {1, level_count + 1}, + {2, level_count + 2}, + {3, level_count + 3}}); + + std::vector<cudaq::matrix_2> matrices_0_0; + std::vector<cudaq::matrix_2> matrices_0_1; + std::vector<cudaq::matrix_2> matrices_1_0; + std::vector<cudaq::matrix_2> matrices_1_1; + std::vector<cudaq::matrix_2> matrices_1_2; + + matrices_0_0 = { + utils::id_matrix(level_count + 3), utils::id_matrix(level_count + 2), + utils::parity_matrix(level_count + 1), utils::id_matrix(level_count)}; + matrices_0_1 = {utils::id_matrix(level_count + 3), + utils::parity_matrix(level_count + 2), + utils::id_matrix(level_count + 1), + utils::id_matrix(level_count)}; + matrices_1_0 = { + utils::id_matrix(level_count + 3), utils::id_matrix(level_count + 2), + utils::id_matrix(level_count + 1), utils::parity_matrix(level_count)}; + matrices_1_1 = { + utils::id_matrix(level_count + 3), utils::id_matrix(level_count + 2), + utils::number_matrix(level_count + 1), utils::id_matrix(level_count)}; + matrices_1_2 = {utils::parity_matrix(level_count + 3), + utils::id_matrix(level_count + 2), + utils::id_matrix(level_count + 1), + utils::id_matrix(level_count)}; + + auto sum_0_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) + + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto sum_1_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) + + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()) + + cudaq::kronecker(matrices_1_2.begin(), matrices_1_2.end()); + + auto want_matrix = sum_0_matrix * sum_1_matrix; + utils::checkEqual(want_matrix, got_matrix); + } +} + +TEST(OperatorExpressions, checkCustomOperatorSum) { + auto level_count = 2; + std::unordered_map<int, int> dimensions = {{0, level_count + 1}, + {1, level_count + 2}, + {2, level_count}, + {3, level_count + 3}}; + + { + auto func0 = + [](const std::vector<int> &dimensions, + const std::unordered_map<std::string, std::complex<double>> &_none) { + return cudaq::kronecker(utils::momentum_matrix(dimensions[0]), + utils::position_matrix(dimensions[1])); + }; + auto func1 = + [](const std::vector<int> &dimensions, + const std::unordered_map<std::string, std::complex<double>> &_none) { + return cudaq::kronecker(utils::parity_matrix(dimensions[0]), + utils::number_matrix(dimensions[1])); + }; + cudaq::matrix_operator::define("custom_op0", {-1, -1}, func0); + cudaq::matrix_operator::define("custom_op1", {-1, -1}, func1); + } + + auto op0 = cudaq::matrix_operator::instantiate("custom_op0", {0, 1}); + auto op1 = cudaq::matrix_operator::instantiate("custom_op1", {1, 2}); + auto sum = op0 + op1; + auto sum_reverse = op1 + op0; + auto difference = op0 - op1; + auto difference_reverse = op1 - op0; + + std::vector<cudaq::matrix_2> matrices_0 = { + utils::id_matrix(level_count), utils::position_matrix(level_count + 2), + utils::momentum_matrix(level_count + 1)}; + std::vector<cudaq::matrix_2> matrices_1 = { + utils::number_matrix(level_count), utils::parity_matrix(level_count + 2), + utils::id_matrix(level_count + 1)}; + auto sum_expected = cudaq::kronecker(matrices_0.begin(), matrices_0.end()) + + cudaq::kronecker(matrices_1.begin(), matrices_1.end()); + auto diff_expected = cudaq::kronecker(matrices_0.begin(), matrices_0.end()) - + cudaq::kronecker(matrices_1.begin(), matrices_1.end()); + auto diff_reverse_expected = + cudaq::kronecker(matrices_1.begin(), matrices_1.end()) - + cudaq::kronecker(matrices_0.begin(), matrices_0.end()); + + utils::checkEqual(sum.to_matrix(dimensions), sum_expected); + utils::checkEqual(sum_reverse.to_matrix(dimensions), sum_expected); + utils::checkEqual(difference.to_matrix(dimensions), diff_expected); + utils::checkEqual(difference_reverse.to_matrix(dimensions), + diff_reverse_expected); + + op0 = cudaq::matrix_operator::instantiate("custom_op0", {2, 3}); + op1 = cudaq::matrix_operator::instantiate("custom_op1", {2, 0}); + sum = op0 + op1; + sum_reverse = op1 + op0; + difference = op0 - op1; + difference_reverse = op1 - op0; + + matrices_0 = {utils::position_matrix(level_count + 3), + utils::momentum_matrix(level_count), + utils::id_matrix(level_count + 1)}; + matrices_1 = {utils::id_matrix(level_count + 3), + utils::parity_matrix(level_count), + utils::number_matrix(level_count + 1)}; + sum_expected = cudaq::kronecker(matrices_0.begin(), matrices_0.end()) + + cudaq::kronecker(matrices_1.begin(), matrices_1.end()); + diff_expected = cudaq::kronecker(matrices_0.begin(), matrices_0.end()) - + cudaq::kronecker(matrices_1.begin(), matrices_1.end()); + diff_reverse_expected = + cudaq::kronecker(matrices_1.begin(), matrices_1.end()) - + cudaq::kronecker(matrices_0.begin(), matrices_0.end()); + + utils::checkEqual(sum.to_matrix(dimensions), sum_expected); + utils::checkEqual(sum_reverse.to_matrix(dimensions), sum_expected); + utils::checkEqual(difference.to_matrix(dimensions), diff_expected); + utils::checkEqual(difference_reverse.to_matrix(dimensions), + diff_reverse_expected); +} \ No newline at end of file diff --git a/unittests/dynamics/product_operator.cpp b/unittests/dynamics/product_operator.cpp new file mode 100644 index 0000000000..985d614d33 --- /dev/null +++ b/unittests/dynamics/product_operator.cpp @@ -0,0 +1,1441 @@ +/******************************************************************************* + * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates. * + * All rights reserved. * + * * + * This source code and the accompanying materials are made available under * + * the terms of the Apache License 2.0 which accompanies this distribution. * + ******************************************************************************/ + +#include "cudaq/operators.h" +#include "utils.h" +#include <gtest/gtest.h> + +#include <complex> + +TEST(OperatorExpressions, checkProductOperatorBasics) { + std::vector<int> levels = {2, 3, 4}; + + std::complex<double> value_0 = 0.1 + 0.1; + std::complex<double> value_1 = 0.1 + 1.0; + std::complex<double> value_2 = 2.0 + 0.1; + std::complex<double> value_3 = 2.0 + 1.0; + + {// Same degrees of freedom. + {auto spin0 = cudaq::spin_operator::x(5); + auto spin1 = cudaq::spin_operator::z(5); + auto spin_prod = spin0 * spin1; + + std::vector<int> want_degrees = {5}; + auto spin_matrix = utils::PauliX_matrix() * utils::PauliZ_matrix(); + + ASSERT_TRUE(spin_prod.degrees() == want_degrees); + utils::checkEqual(spin_matrix, spin_prod.to_matrix()); + + for (auto level_count : levels) { + auto op0 = cudaq::matrix_operator::position(5); + auto op1 = cudaq::matrix_operator::momentum(5); + + auto got = op0 * op1; + utils::assert_product_equal(got, 1., + {op0.get_terms()[0], op1.get_terms()[0]}); + ASSERT_TRUE(got.degrees() == want_degrees); + + auto got_matrix = got.to_matrix({{5, level_count}}); + auto matrix0 = utils::position_matrix(level_count); + auto matrix1 = utils::momentum_matrix(level_count); + auto want_matrix = matrix0 * matrix1; + utils::checkEqual(want_matrix, got_matrix); + } +} + +// Different degrees of freedom. 
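+// (Sketch, mirroring the block below: for x(0) * z(1) the degrees are again +// reported in descending order, {1, 0}, and to_matrix() places the factor +// acting on the higher degree leftmost, i.e. kronecker(Z, X).)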
+{ + auto spin0 = cudaq::spin_operator::x(0); + auto spin1 = cudaq::spin_operator::z(1); + auto spin_prod = spin0 * spin1; + + std::vector want_degrees = {1, 0}; + auto spin_matrix = + cudaq::kronecker(utils::PauliZ_matrix(), utils::PauliX_matrix()); + + ASSERT_TRUE(spin_prod.degrees() == want_degrees); + utils::checkEqual(spin_matrix, spin_prod.to_matrix()); + + for (auto level_count : levels) { + auto op0 = cudaq::matrix_operator::position(0); + auto op1 = cudaq::matrix_operator::momentum(1); + + cudaq::product_operator got = op0 * op1; + cudaq::product_operator got_reverse = op1 * op0; + + ASSERT_TRUE(got.degrees() == want_degrees); + ASSERT_TRUE(got_reverse.degrees() == want_degrees); + + auto got_matrix = got.to_matrix({{0, level_count}, {1, level_count}}); + auto got_matrix_reverse = + got_reverse.to_matrix({{0, level_count}, {1, level_count}}); + + auto identity = utils::id_matrix(level_count); + auto matrix0 = utils::position_matrix(level_count); + auto matrix1 = utils::momentum_matrix(level_count); + + auto fullHilbert0 = cudaq::kronecker(identity, matrix0); + auto fullHilbert1 = cudaq::kronecker(matrix1, identity); + auto want_matrix = fullHilbert0 * fullHilbert1; + auto want_matrix_reverse = fullHilbert1 * fullHilbert0; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } +} + +// Different degrees of freedom, non-consecutive. +// Should produce the same matrices as the above test. +{ + auto spin0 = cudaq::spin_operator::x(0); + auto spin1 = cudaq::spin_operator::z(2); + auto spin_prod = spin0 * spin1; + + std::vector want_degrees = {2, 0}; + auto spin_matrix = + cudaq::kronecker(utils::PauliZ_matrix(), utils::PauliX_matrix()); + + ASSERT_TRUE(spin_prod.degrees() == want_degrees); + utils::checkEqual(spin_matrix, spin_prod.to_matrix()); + + for (auto level_count : levels) { + auto op0 = cudaq::matrix_operator::position(0); + auto op1 = cudaq::matrix_operator::momentum(2); + + cudaq::product_operator got = op0 * op1; + cudaq::product_operator got_reverse = op1 * op0; + + ASSERT_TRUE(got.degrees() == want_degrees); + ASSERT_TRUE(got_reverse.degrees() == want_degrees); + + auto got_matrix = got.to_matrix({{0, level_count}, {2, level_count}}); + auto got_matrix_reverse = + got_reverse.to_matrix({{0, level_count}, {2, level_count}}); + + auto identity = utils::id_matrix(level_count); + auto matrix0 = utils::position_matrix(level_count); + auto matrix1 = utils::momentum_matrix(level_count); + + auto fullHilbert0 = cudaq::kronecker(identity, matrix0); + auto fullHilbert1 = cudaq::kronecker(matrix1, identity); + auto want_matrix = fullHilbert0 * fullHilbert1; + auto want_matrix_reverse = fullHilbert1 * fullHilbert0; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } +} + +// Different degrees of freedom, non-consecutive but all dimensions +// provided. 
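+// (Sketch: dimensions may also be passed for degrees the product never acts +// on - degree 1 below - without affecting the result; only the degrees the +// operator actually targets enter the kronecker product, so the matrix +// stays 4x4.)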
+{ + auto spin0 = cudaq::spin_operator::x(0); + auto spin1 = cudaq::spin_operator::z(2); + auto spin_prod = spin0 * spin1; + + std::vector<int> want_degrees = {2, 0}; + auto spin_matrix = + cudaq::kronecker(utils::PauliZ_matrix(), utils::PauliX_matrix()); + std::unordered_map<int, int> dimensions = {{0, 2}, {1, 2}, {2, 2}}; + + ASSERT_TRUE(spin_prod.degrees() == want_degrees); + utils::checkEqual(spin_matrix, spin_prod.to_matrix(dimensions)); + + for (auto level_count : levels) { + auto op0 = cudaq::matrix_operator::position(0); + auto op1 = cudaq::matrix_operator::momentum(2); + + cudaq::product_operator<cudaq::matrix_operator> got = op0 * op1; + cudaq::product_operator<cudaq::matrix_operator> got_reverse = op1 * op0; + + std::vector<int> want_degrees = {2, 0}; + ASSERT_TRUE(got.degrees() == want_degrees); + ASSERT_TRUE(got_reverse.degrees() == want_degrees); + + dimensions = {{0, level_count}, {1, level_count}, {2, level_count}}; + auto got_matrix = got.to_matrix(dimensions); + auto got_matrix_reverse = got_reverse.to_matrix(dimensions); + + auto identity = utils::id_matrix(level_count); + auto matrix0 = utils::position_matrix(level_count); + auto matrix1 = utils::momentum_matrix(level_count); + + std::vector<cudaq::matrix_2> matrices_0; + std::vector<cudaq::matrix_2> matrices_1; + matrices_0 = {identity, matrix0}; + matrices_1 = {matrix1, identity}; + + auto fullHilbert0 = cudaq::kronecker(matrices_0.begin(), matrices_0.end()); + auto fullHilbert1 = cudaq::kronecker(matrices_1.begin(), matrices_1.end()); + auto want_matrix = fullHilbert0 * fullHilbert1; + auto want_matrix_reverse = fullHilbert1 * fullHilbert0; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } +} +} + +// Scalar Ops against Elementary Ops +{ + auto function = [](const std::unordered_map<std::string, std::complex<double>> + &parameters) { + auto entry = parameters.find("value"); + if (entry == parameters.end()) + throw std::runtime_error("value not defined in parameters"); + return entry->second; + }; + + // matrix operator against constant + { + auto op = cudaq::matrix_operator::position(0); + auto scalar_op = cudaq::scalar_operator(value_0); + auto product = scalar_op * op; + auto reverse = op * scalar_op; + + std::vector<int> want_degrees = {0}; + auto op_matrix = utils::position_matrix(2); + + ASSERT_TRUE(product.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + utils::checkEqual(value_0 * op_matrix, product.to_matrix({{0, 2}})); + utils::checkEqual(value_0 * op_matrix, reverse.to_matrix({{0, 2}})); + } + + // spin operator against constant + { + auto op = cudaq::spin_operator::x(0); + auto scalar_op = cudaq::scalar_operator(value_0); + auto product = scalar_op * op; + auto reverse = op * scalar_op; + + std::vector<int> want_degrees = {0}; + auto op_matrix = utils::PauliX_matrix(); + + ASSERT_TRUE(product.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + utils::checkEqual(value_0 * op_matrix, product.to_matrix()); + utils::checkEqual(value_0 * op_matrix, reverse.to_matrix()); + } + + // matrix operator against constant from lambda + { + auto op = cudaq::matrix_operator::position(1); + auto scalar_op = cudaq::scalar_operator(function); + auto product = scalar_op * op; + auto reverse = op * scalar_op; + + std::vector<int> want_degrees = {1}; + auto op_matrix = utils::position_matrix(2); + + ASSERT_TRUE(product.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + utils::checkEqual(scalar_op.evaluate({{"value", 0.3}}) * op_matrix, + product.to_matrix({{1, 2}}, {{"value", 0.3}})); + utils::checkEqual(scalar_op.evaluate({{"value", 0.3}}) * 
op_matrix, + reverse.to_matrix({{1, 2}}, {{"value", 0.3}})); + } + + // spin operator against constant from lambda + { + auto op = cudaq::spin_operator::x(1); + auto scalar_op = cudaq::scalar_operator(function); + auto product = scalar_op * op; + auto reverse = op * scalar_op; + + std::vector want_degrees = {1}; + auto op_matrix = utils::PauliX_matrix(); + + ASSERT_TRUE(product.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + utils::checkEqual(scalar_op.evaluate({{"value", 0.3}}) * op_matrix, + product.to_matrix({}, {{"value", 0.3}})); + utils::checkEqual(scalar_op.evaluate({{"value", 0.3}}) * op_matrix, + reverse.to_matrix({}, {{"value", 0.3}})); + } +} +} + +TEST(OperatorExpressions, checkProductOperatorAgainstScalars) { + std::complex value_0 = 0.1 + 0.1; + int level_count = 3; + + /// `product_operator + double` + { + auto product_op = cudaq::matrix_operator::position(0) * + cudaq::matrix_operator::position(1); + + auto sum = 2.0 + product_op; + auto reverse = product_op + 2.0; + + ASSERT_TRUE(sum.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(sum.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + + auto got_matrix = sum.to_matrix({{0, level_count}, {1, level_count}}); + auto got_matrix_reverse = + reverse.to_matrix({{0, level_count}, {1, level_count}}); + + auto term_0 = cudaq::kronecker(utils::id_matrix(level_count), + utils::position_matrix(level_count)); + auto term_1 = cudaq::kronecker(utils::position_matrix(level_count), + utils::id_matrix(level_count)); + auto product = term_0 * term_1; + auto scaled_identity = 2.0 * utils::id_matrix(level_count * level_count); + + auto want_matrix = scaled_identity + product; + auto want_matrix_reverse = product + scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + /// `product_operator + complex` + { + auto product_op = cudaq::matrix_operator::position(0) * + cudaq::matrix_operator::position(1); + + auto sum = value_0 + product_op; + auto reverse = product_op + value_0; + + ASSERT_TRUE(sum.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(sum.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + + auto got_matrix = sum.to_matrix({{0, level_count}, {1, level_count}}); + auto got_matrix_reverse = + reverse.to_matrix({{0, level_count}, {1, level_count}}); + + auto term_0 = cudaq::kronecker(utils::id_matrix(level_count), + utils::position_matrix(level_count)); + auto term_1 = cudaq::kronecker(utils::position_matrix(level_count), + utils::id_matrix(level_count)); + auto product = term_0 * term_1; + auto scaled_identity = + value_0 * utils::id_matrix(level_count * level_count); + + auto want_matrix = scaled_identity + product; + auto want_matrix_reverse = product + scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + /// `spin product + complex` + { + auto product_op = cudaq::spin_operator::x(0) * cudaq::spin_operator::y(1); + + auto sum = value_0 + product_op; + auto reverse = product_op + value_0; + + ASSERT_TRUE(sum.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(sum.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + + auto got_matrix = sum.to_matrix(); + auto got_matrix_reverse = 
reverse.to_matrix(); + + auto term_0 = cudaq::kronecker(utils::id_matrix(2), utils::PauliX_matrix()); + auto term_1 = cudaq::kronecker(utils::PauliY_matrix(), utils::id_matrix(2)); + auto product = term_0 * term_1; + auto scaled_identity = value_0 * utils::id_matrix(2 * 2); + + auto want_matrix = scaled_identity + product; + auto want_matrix_reverse = product + scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + /// `product_operator + scalar_operator` + { + auto product_op = cudaq::matrix_operator::position(0) * + cudaq::matrix_operator::position(1); + auto scalar_op = cudaq::scalar_operator(value_0); + + auto sum = scalar_op + product_op; + auto reverse = product_op + scalar_op; + + ASSERT_TRUE(sum.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(sum.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + + auto got_matrix = sum.to_matrix({{0, level_count}, {1, level_count}}); + auto got_matrix_reverse = + reverse.to_matrix({{0, level_count}, {1, level_count}}); + + auto term_0 = cudaq::kronecker(utils::id_matrix(level_count), + utils::position_matrix(level_count)); + auto term_1 = cudaq::kronecker(utils::position_matrix(level_count), + utils::id_matrix(level_count)); + auto product = term_0 * term_1; + auto scaled_identity = + value_0 * utils::id_matrix(level_count * level_count); + + auto want_matrix = scaled_identity + product; + auto want_matrix_reverse = product + scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + /// `product_operator - double` + { + auto product_op = cudaq::matrix_operator::position(0) * + cudaq::matrix_operator::position(1); + + auto difference = 2.0 - product_op; + auto reverse = product_op - 2.0; + + ASSERT_TRUE(difference.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(difference.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + + auto got_matrix = + difference.to_matrix({{0, level_count}, {1, level_count}}); + auto got_matrix_reverse = + reverse.to_matrix({{0, level_count}, {1, level_count}}); + + auto term_0 = cudaq::kronecker(utils::id_matrix(level_count), + utils::position_matrix(level_count)); + auto term_1 = cudaq::kronecker(utils::position_matrix(level_count), + utils::id_matrix(level_count)); + auto product = term_0 * term_1; + auto scaled_identity = 2.0 * utils::id_matrix(level_count * level_count); + + auto want_matrix = scaled_identity - product; + auto want_matrix_reverse = product - scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + /// `spin product - double` + { + auto product_op = cudaq::spin_operator::i(0) * cudaq::spin_operator::z(1); + + auto sum = 2.0 - product_op; + auto reverse = product_op - 2.0; + + ASSERT_TRUE(sum.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(sum.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + + auto got_matrix = sum.to_matrix(); + auto got_matrix_reverse = reverse.to_matrix(); + + auto term_0 = cudaq::kronecker(utils::id_matrix(2), utils::id_matrix(2)); + auto term_1 = cudaq::kronecker(utils::PauliZ_matrix(), utils::id_matrix(2)); + auto product = term_0 * term_1; + auto scaled_identity = 2.0 * 
utils::id_matrix(2 * 2); + + auto want_matrix = scaled_identity - product; + auto want_matrix_reverse = product - scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + /// `product_operator - complex` + { + auto product_op = cudaq::matrix_operator::position(0) * + cudaq::matrix_operator::position(1); + + auto difference = value_0 - product_op; + auto reverse = product_op - value_0; + + ASSERT_TRUE(difference.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(difference.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + + auto got_matrix = + difference.to_matrix({{0, level_count}, {1, level_count}}); + auto got_matrix_reverse = + reverse.to_matrix({{0, level_count}, {1, level_count}}); + + auto term_0 = cudaq::kronecker(utils::id_matrix(level_count), + utils::position_matrix(level_count)); + auto term_1 = cudaq::kronecker(utils::position_matrix(level_count), + utils::id_matrix(level_count)); + auto product = term_0 * term_1; + auto scaled_identity = + value_0 * utils::id_matrix(level_count * level_count); + + auto want_matrix = scaled_identity - product; + auto want_matrix_reverse = product - scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + /// `product_operator - scalar_operator` + { + auto product_op = cudaq::matrix_operator::momentum(0) * + cudaq::matrix_operator::momentum(1); + auto scalar_op = cudaq::scalar_operator(value_0); + + auto difference = scalar_op - product_op; + auto reverse = product_op - scalar_op; + + ASSERT_TRUE(difference.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(difference.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + + auto got_matrix = + difference.to_matrix({{0, level_count}, {1, level_count}}); + auto got_matrix_reverse = + reverse.to_matrix({{0, level_count}, {1, level_count}}); + + auto term_0 = cudaq::kronecker(utils::id_matrix(level_count), + utils::momentum_matrix(level_count)); + auto term_1 = cudaq::kronecker(utils::momentum_matrix(level_count), + utils::id_matrix(level_count)); + auto product = term_0 * term_1; + auto scaled_identity = + value_0 * utils::id_matrix(level_count * level_count); + + auto want_matrix = scaled_identity - product; + auto want_matrix_reverse = product - scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + /// `product_operator * double` + { + auto product_op = + cudaq::matrix_operator::parity(0) * cudaq::matrix_operator::parity(1); + ASSERT_TRUE(product_op.num_terms() == 2); + ASSERT_TRUE(product_op.get_coefficient().evaluate() == + std::complex(1.)); + + auto product = 2.0 * product_op; + auto reverse = product_op * 2.0; + + ASSERT_TRUE(product.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + ASSERT_TRUE(product.get_coefficient().evaluate() == + std::complex(2.)); + ASSERT_TRUE(reverse.get_coefficient().evaluate() == + std::complex(2.)); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(product.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + + auto got_matrix = product.to_matrix({{0, level_count}, {1, level_count}}); + auto got_matrix_reverse = + reverse.to_matrix({{0, level_count}, {1, level_count}}); + + auto term_0 = 
cudaq::kronecker(utils::id_matrix(level_count), + utils::parity_matrix(level_count)); + auto term_1 = cudaq::kronecker(utils::parity_matrix(level_count), + utils::id_matrix(level_count)); + auto product_matrix = term_0 * term_1; + auto scaled_identity = 2.0 * utils::id_matrix(level_count * level_count); + + auto want_matrix = scaled_identity * product_matrix; + auto want_matrix_reverse = product_matrix * scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + /// `product_operator * complex` + { + auto product_op = + cudaq::matrix_operator::number(0) * cudaq::matrix_operator::number(1); + ASSERT_TRUE(product_op.num_terms() == 2); + ASSERT_TRUE(product_op.get_coefficient().evaluate() == + std::complex(1.)); + + auto product = value_0 * product_op; + auto reverse = product_op * value_0; + + ASSERT_TRUE(product.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + ASSERT_TRUE(product.get_coefficient().evaluate() == value_0); + ASSERT_TRUE(reverse.get_coefficient().evaluate() == value_0); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(product.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + + auto got_matrix = product.to_matrix({{0, level_count}, {1, level_count}}); + auto got_matrix_reverse = + reverse.to_matrix({{0, level_count}, {1, level_count}}); + + auto term_0 = cudaq::kronecker(utils::id_matrix(level_count), + utils::number_matrix(level_count)); + auto term_1 = cudaq::kronecker(utils::number_matrix(level_count), + utils::id_matrix(level_count)); + auto product_matrix = term_0 * term_1; + auto scaled_identity = + value_0 * utils::id_matrix(level_count * level_count); + + auto want_matrix = scaled_identity * product_matrix; + auto want_matrix_reverse = product_matrix * scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + /// `product_operator * scalar_operator` + { + auto product_op = cudaq::matrix_operator::position(0) * + cudaq::matrix_operator::position(1); + auto scalar_op = cudaq::scalar_operator(value_0); + + auto product = scalar_op * product_op; + auto reverse = product_op * scalar_op; + + ASSERT_TRUE(product.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + ASSERT_TRUE(product.get_coefficient().evaluate() == scalar_op.evaluate()); + ASSERT_TRUE(reverse.get_coefficient().evaluate() == scalar_op.evaluate()); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(product.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + + auto got_matrix = product.to_matrix({{0, level_count}, {1, level_count}}); + auto got_matrix_reverse = + reverse.to_matrix({{0, level_count}, {1, level_count}}); + + auto term_0 = cudaq::kronecker(utils::id_matrix(level_count), + utils::position_matrix(level_count)); + auto term_1 = cudaq::kronecker(utils::position_matrix(level_count), + utils::id_matrix(level_count)); + auto product_matrix = term_0 * term_1; + auto scaled_identity = + value_0 * utils::id_matrix(level_count * level_count); + + auto want_matrix = scaled_identity * product_matrix; + auto want_matrix_reverse = product_matrix * scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + /// `spin product * scalar_operator` + { + auto product_op = cudaq::spin_operator::z(0) * cudaq::spin_operator::y(1); + auto scalar_op = cudaq::scalar_operator(value_0); + + auto product = scalar_op * 
product_op; + auto reverse = product_op * scalar_op; + + ASSERT_TRUE(product.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + ASSERT_TRUE(product.get_coefficient().evaluate() == scalar_op.evaluate()); + ASSERT_TRUE(reverse.get_coefficient().evaluate() == scalar_op.evaluate()); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(product.degrees() == want_degrees); + ASSERT_TRUE(reverse.degrees() == want_degrees); + + auto got_matrix = product.to_matrix(); + auto got_matrix_reverse = reverse.to_matrix(); + + auto term_0 = cudaq::kronecker(utils::id_matrix(2), utils::PauliZ_matrix()); + auto term_1 = cudaq::kronecker(utils::PauliY_matrix(), utils::id_matrix(2)); + auto product_matrix = term_0 * term_1; + auto scaled_identity = value_0 * utils::id_matrix(2 * 2); + + auto want_matrix = scaled_identity * product_matrix; + auto want_matrix_reverse = product_matrix * scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + /// `product_operator *= double` + { + auto product = cudaq::matrix_operator::position(0) * + cudaq::matrix_operator::momentum(1); + product *= 2.0; + + ASSERT_TRUE(product.num_terms() == 2); + ASSERT_TRUE(product.get_coefficient().evaluate() == + std::complex(2.)); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(product.degrees() == want_degrees); + + auto got_matrix = product.to_matrix({{0, level_count}, {1, level_count}}); + + auto term_0 = cudaq::kronecker(utils::id_matrix(level_count), + utils::position_matrix(level_count)); + auto term_1 = cudaq::kronecker(utils::momentum_matrix(level_count), + utils::id_matrix(level_count)); + auto product_matrix = term_0 * term_1; + auto scaled_identity = 2.0 * utils::id_matrix(level_count * level_count); + + auto want_matrix = product_matrix * scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + } + + /// `spin product *= double` + { + auto product = cudaq::spin_operator::y(0) * cudaq::spin_operator::i(1); + product *= 2.0; + + ASSERT_TRUE(product.num_terms() == 2); + ASSERT_TRUE(product.get_coefficient().evaluate() == + std::complex(2.)); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(product.degrees() == want_degrees); + + auto got_matrix = product.to_matrix(); + + auto term_0 = cudaq::kronecker(utils::id_matrix(2), utils::PauliY_matrix()); + auto term_1 = cudaq::kronecker(utils::id_matrix(2), utils::id_matrix(2)); + auto product_matrix = term_0 * term_1; + auto scaled_identity = 2.0 * utils::id_matrix(2 * 2); + + auto want_matrix = product_matrix * scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + } + + /// `product_operator *= complex` + { + auto product = + cudaq::matrix_operator::number(0) * cudaq::matrix_operator::momentum(1); + product *= value_0; + + ASSERT_TRUE(product.num_terms() == 2); + ASSERT_TRUE(product.get_coefficient().evaluate() == value_0); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(product.degrees() == want_degrees); + + auto got_matrix = product.to_matrix({{0, level_count}, {1, level_count}}); + + auto term_0 = cudaq::kronecker(utils::id_matrix(level_count), + utils::number_matrix(level_count)); + auto term_1 = cudaq::kronecker(utils::momentum_matrix(level_count), + utils::id_matrix(level_count)); + auto product_matrix = term_0 * term_1; + auto scaled_identity = + value_0 * utils::id_matrix(level_count * level_count); + + auto want_matrix = product_matrix * scaled_identity; + + utils::checkEqual(want_matrix, got_matrix); + } + + /// `product_operator *= scalar_operator` + { 
+ auto product = + cudaq::matrix_operator::number(0) * cudaq::matrix_operator::momentum(1); + auto scalar_op = cudaq::scalar_operator(value_0); + + product *= scalar_op; + + ASSERT_TRUE(product.num_terms() == 2); + ASSERT_TRUE(product.get_coefficient().evaluate() == scalar_op.evaluate()); + ASSERT_TRUE(scalar_op.evaluate() == value_0); + + std::vector want_degrees = {1, 0}; + ASSERT_TRUE(product.degrees() == want_degrees); + + auto got_matrix = product.to_matrix({{0, level_count}, {1, level_count}}); + + auto term_0 = cudaq::kronecker(utils::id_matrix(level_count), + utils::number_matrix(level_count)); + auto term_1 = cudaq::kronecker(utils::momentum_matrix(level_count), + utils::id_matrix(level_count)); + auto product_matrix = term_0 * term_1; + auto scaled_identity = + value_0 * utils::id_matrix(level_count * level_count); + + auto want_matrix = product_matrix * scaled_identity; + utils::checkEqual(want_matrix, got_matrix); + } +} + +TEST(OperatorExpressions, checkProductOperatorAgainstProduct) { + + int level_count = 3; + std::unordered_map dimensions = { + {0, level_count}, {1, level_count}, {2, level_count + 1}}; + + // `product_operator + product_operator` + { + auto term_0 = cudaq::matrix_operator::position(0) * + cudaq::matrix_operator::position(1); + auto term_1 = cudaq::matrix_operator::momentum(1) * + cudaq::matrix_operator::position(2); + + auto sum = term_0 + term_1; + + ASSERT_TRUE(sum.num_terms() == 2); + + std::vector want_degrees = {2, 1, 0}; + ASSERT_TRUE(sum.degrees() == want_degrees); + + auto got_matrix = sum.to_matrix(dimensions); + + std::vector matrices_0_0; + std::vector matrices_0_1; + matrices_0_0 = {utils::id_matrix(level_count + 1), + utils::id_matrix(level_count), + utils::position_matrix(level_count)}; + matrices_0_1 = {utils::id_matrix(level_count + 1), + utils::position_matrix(level_count), + utils::id_matrix(level_count)}; + + std::vector matrices_1_0; + std::vector matrices_1_1; + matrices_1_0 = {utils::id_matrix(level_count + 1), + utils::momentum_matrix(level_count), + utils::id_matrix(level_count)}; + matrices_1_1 = {utils::position_matrix(level_count + 1), + utils::id_matrix(level_count), + utils::id_matrix(level_count)}; + + auto term_0_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto term_1_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) * + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = term_0_matrix + term_1_matrix; + utils::checkEqual(want_matrix, got_matrix); + } + + // `spin product + spin product` + { + auto term_0 = cudaq::spin_operator::z(0) * cudaq::spin_operator::y(2); + auto term_1 = cudaq::spin_operator::x(2) * cudaq::spin_operator::z(4); + + auto sum = term_0 + term_1; + + ASSERT_TRUE(sum.num_terms() == 2); + + std::vector want_degrees = {4, 2, 0}; + ASSERT_TRUE(sum.degrees() == want_degrees); + + auto got_matrix = sum.to_matrix(); + + std::vector matrices_0_0; + std::vector matrices_0_1; + matrices_0_0 = {utils::id_matrix(2), utils::id_matrix(2), + utils::PauliZ_matrix()}; + matrices_0_1 = {utils::id_matrix(2), utils::PauliY_matrix(), + utils::id_matrix(2)}; + + std::vector matrices_1_0; + std::vector matrices_1_1; + matrices_1_0 = {utils::id_matrix(2), utils::PauliX_matrix(), + utils::id_matrix(2)}; + matrices_1_1 = {utils::PauliZ_matrix(), utils::id_matrix(2), + utils::id_matrix(2)}; + + auto term_0_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + 
cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto term_1_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) * + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = term_0_matrix + term_1_matrix; + utils::checkEqual(want_matrix, got_matrix); + } + + // `product_operator - product_operator` + { + auto term_0 = + cudaq::matrix_operator::position(0) * cudaq::matrix_operator::number(1); + auto term_1 = cudaq::matrix_operator::momentum(1) * + cudaq::matrix_operator::momentum(2); + + auto difference = term_0 - term_1; + + ASSERT_TRUE(difference.num_terms() == 2); + + auto got_matrix = difference.to_matrix(dimensions); + + std::vector matrices_0_0; + std::vector matrices_0_1; + matrices_0_0 = {utils::id_matrix(level_count + 1), + utils::id_matrix(level_count), + utils::position_matrix(level_count)}; + matrices_0_1 = {utils::id_matrix(level_count + 1), + utils::number_matrix(level_count), + utils::id_matrix(level_count)}; + + std::vector matrices_1_0; + std::vector matrices_1_1; + matrices_1_0 = {utils::id_matrix(level_count + 1), + utils::momentum_matrix(level_count), + utils::id_matrix(level_count)}; + matrices_1_1 = {utils::momentum_matrix(level_count + 1), + utils::id_matrix(level_count), + utils::id_matrix(level_count)}; + + auto term_0_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto term_1_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) * + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = term_0_matrix - term_1_matrix; + utils::checkEqual(want_matrix, got_matrix); + } + + // `spin product - spin product` + { + auto term_0 = cudaq::spin_operator::i(0); + auto term_1 = cudaq::spin_operator::x(1) * cudaq::spin_operator::y(2); + + auto difference = term_0 - term_1; + auto reverse = term_1 - term_0; + + ASSERT_TRUE(difference.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + + auto got_matrix = difference.to_matrix(); + auto reverse_matrix = reverse.to_matrix(); + + std::vector matrices_0_0; + matrices_0_0 = {utils::id_matrix(2), utils::id_matrix(2), + utils::id_matrix(2)}; + + std::vector matrices_1_0; + std::vector matrices_1_1; + matrices_1_0 = {utils::id_matrix(2), utils::PauliX_matrix(), + utils::id_matrix(2)}; + matrices_1_1 = {utils::PauliY_matrix(), utils::id_matrix(2), + utils::id_matrix(2)}; + + auto term_0_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()); + auto term_1_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) * + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = term_0_matrix - term_1_matrix; + auto want_reverse_matrix = term_1_matrix - term_0_matrix; + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_reverse_matrix, reverse_matrix); + } + + // `product_operator * product_operator` + { + auto term_0 = cudaq::matrix_operator::position(0) * + cudaq::matrix_operator::position(1); + auto term_1 = + cudaq::matrix_operator::momentum(1) * cudaq::matrix_operator::parity(2); + + auto product = term_0 * term_1; + + ASSERT_TRUE(product.num_terms() == 4); + + auto got_matrix = product.to_matrix(dimensions); + + std::vector matrices_0_0; + std::vector matrices_0_1; + matrices_0_0 = {utils::id_matrix(level_count + 1), + utils::id_matrix(level_count), + utils::position_matrix(level_count)}; + matrices_0_1 = {utils::id_matrix(level_count + 1), + 
utils::position_matrix(level_count), + utils::id_matrix(level_count)}; + + std::vector matrices_1_0; + std::vector matrices_1_1; + matrices_1_0 = {utils::id_matrix(level_count + 1), + utils::momentum_matrix(level_count), + utils::id_matrix(level_count)}; + matrices_1_1 = {utils::parity_matrix(level_count + 1), + utils::id_matrix(level_count), + utils::id_matrix(level_count)}; + + auto term_0_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto term_1_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) * + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = term_0_matrix * term_1_matrix; + utils::checkEqual(want_matrix, got_matrix); + } + + // `spin product * spin product` + { + auto term_0 = cudaq::spin_operator::y(0) * cudaq::spin_operator::x(1); + auto term_1 = cudaq::spin_operator::z(1) * cudaq::spin_operator::i(3); + + auto product = term_0 * term_1; + auto reverse = term_1 * term_0; + std::vector expected_degrees = {3, 1, 0}; + + ASSERT_TRUE(product.num_terms() == 3); + ASSERT_TRUE(reverse.num_terms() == 3); + ASSERT_TRUE(product.degrees() == expected_degrees); + + auto got_matrix = product.to_matrix(); + auto got_reverse_matrix = reverse.to_matrix(); + + std::vector matrices_0_0; + std::vector matrices_0_1; + matrices_0_0 = {utils::id_matrix(2), utils::id_matrix(2), + utils::PauliY_matrix()}; + matrices_0_1 = {utils::id_matrix(2), utils::PauliX_matrix(), + utils::id_matrix(2)}; + + std::vector matrices_1_0; + std::vector matrices_1_1; + matrices_1_0 = {utils::id_matrix(2), utils::PauliZ_matrix(), + utils::id_matrix(2)}; + matrices_1_1 = {utils::id_matrix(2), utils::id_matrix(2), + utils::id_matrix(2)}; + + auto term_0_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto term_1_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) * + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = term_0_matrix * term_1_matrix; + auto want_reverse_matrix = term_1_matrix * term_0_matrix; + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_reverse_matrix, got_reverse_matrix); + } + + // `product_operator *= product_operator` + { + auto term_0 = + cudaq::matrix_operator::position(0) * cudaq::matrix_operator::number(1); + auto term_1 = cudaq::matrix_operator::momentum(1) * + cudaq::matrix_operator::position(2); + + term_0 *= term_1; + + ASSERT_TRUE(term_0.num_terms() == 4); + + auto got_matrix = term_0.to_matrix(dimensions); + + std::vector matrices_0_0; + std::vector matrices_0_1; + matrices_0_0 = {utils::id_matrix(level_count + 1), + utils::id_matrix(level_count), + utils::position_matrix(level_count)}; + matrices_0_1 = {utils::id_matrix(level_count + 1), + utils::number_matrix(level_count), + utils::id_matrix(level_count)}; + + std::vector matrices_1_0; + std::vector matrices_1_1; + matrices_1_0 = {utils::id_matrix(level_count + 1), + utils::momentum_matrix(level_count), + utils::id_matrix(level_count)}; + matrices_1_1 = {utils::position_matrix(level_count + 1), + utils::id_matrix(level_count), + utils::id_matrix(level_count)}; + + auto term_0_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto term_1_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) * + cudaq::kronecker(matrices_1_1.begin(), 
matrices_1_1.end()); + + auto want_matrix = term_0_matrix * term_1_matrix; + auto term1_only_matrix = + cudaq::kronecker(utils::position_matrix(level_count + 1), + utils::momentum_matrix(level_count)); + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(term1_only_matrix, term_1.to_matrix(dimensions)); + } + + // `spin product *= spin product` + { + auto term_0 = cudaq::spin_operator::y(3) * cudaq::spin_operator::y(1); + auto term_1 = cudaq::spin_operator::z(1) * cudaq::spin_operator::x(0); + + term_0 *= term_1; + + ASSERT_TRUE(term_0.num_terms() == 3); + + auto got_matrix = term_0.to_matrix(); + + std::vector matrices_0_0; + std::vector matrices_0_1; + matrices_0_0 = {utils::PauliY_matrix(), utils::id_matrix(2), + utils::id_matrix(2)}; + matrices_0_1 = {utils::id_matrix(2), utils::PauliY_matrix(), + utils::id_matrix(2)}; + + std::vector matrices_1_0; + std::vector matrices_1_1; + matrices_1_0 = {utils::id_matrix(2), utils::PauliZ_matrix(), + utils::id_matrix(2)}; + matrices_1_1 = {utils::id_matrix(2), utils::id_matrix(2), + utils::PauliX_matrix()}; + + auto term_0_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto term_1_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) * + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = term_0_matrix * term_1_matrix; + auto term1_only_matrix = + cudaq::kronecker(utils::PauliZ_matrix(), utils::PauliX_matrix()); + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(term1_only_matrix, term_1.to_matrix()); + } +} + +TEST(OperatorExpressions, checkProductOperatorAgainstOperatorSum) { + + int level_count = 3; + std::unordered_map dimensions = { + {0, level_count}, {1, level_count}, {2, level_count + 1}}; + + // `product_operator + operator_sum` + { + auto product = cudaq::matrix_operator::position(0) * + cudaq::matrix_operator::position(1); + auto original_sum = cudaq::matrix_operator::momentum(1) + + cudaq::matrix_operator::momentum(2); + + auto sum = product + original_sum; + auto reverse = original_sum + product; + + ASSERT_TRUE(sum.num_terms() == 3); + ASSERT_TRUE(reverse.num_terms() == 3); + + auto got_matrix = sum.to_matrix(dimensions); + auto got_matrix_reverse = reverse.to_matrix(dimensions); + + std::vector matrices_0_0 = { + utils::id_matrix(level_count + 1), utils::id_matrix(level_count), + utils::position_matrix(level_count)}; + std::vector matrices_0_1 = { + utils::id_matrix(level_count + 1), utils::position_matrix(level_count), + utils::id_matrix(level_count)}; + std::vector matrices_1_0 = { + utils::id_matrix(level_count + 1), utils::momentum_matrix(level_count), + utils::id_matrix(level_count)}; + std::vector matrices_1_1 = { + utils::momentum_matrix(level_count + 1), utils::id_matrix(level_count), + utils::id_matrix(level_count)}; + auto product_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto sum_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) + + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = product_matrix + sum_matrix; + auto want_matrix_reverse = sum_matrix + product_matrix; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + // `spin product + spin sum` + { + auto product = cudaq::spin_operator::x(0) * cudaq::spin_operator::y(1); + auto original_sum = 
cudaq::spin_operator::z(1) + cudaq::spin_operator::i(2); + + auto sum = product + original_sum; + auto reverse = original_sum + product; + + ASSERT_TRUE(sum.num_terms() == 3); + ASSERT_TRUE(reverse.num_terms() == 3); + + auto got_matrix = sum.to_matrix(); + auto got_matrix_reverse = reverse.to_matrix(); + + std::vector matrices_0_0 = { + utils::id_matrix(2), utils::id_matrix(2), utils::PauliX_matrix()}; + std::vector matrices_0_1 = { + utils::id_matrix(2), utils::PauliY_matrix(), utils::id_matrix(2)}; + std::vector matrices_1_0 = { + utils::id_matrix(2), utils::PauliZ_matrix(), utils::id_matrix(2)}; + std::vector matrices_1_1 = { + utils::id_matrix(2), utils::id_matrix(2), utils::id_matrix(2)}; + auto product_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto sum_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) + + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = product_matrix + sum_matrix; + auto want_matrix_reverse = sum_matrix + product_matrix; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + // `product_operator - operator_sum` + { + auto product = cudaq::matrix_operator::position(0) * + cudaq::matrix_operator::position(1); + auto original_difference = cudaq::matrix_operator::momentum(1) - + cudaq::matrix_operator::momentum(2); + + auto difference = product - original_difference; + auto reverse = original_difference - product; + + ASSERT_TRUE(difference.num_terms() == 3); + ASSERT_TRUE(reverse.num_terms() == 3); + + auto got_matrix = difference.to_matrix(dimensions); + auto got_matrix_reverse = reverse.to_matrix(dimensions); + + std::vector matrices_0_0 = { + utils::id_matrix(level_count + 1), utils::id_matrix(level_count), + utils::position_matrix(level_count)}; + std::vector matrices_0_1 = { + utils::id_matrix(level_count + 1), utils::position_matrix(level_count), + utils::id_matrix(level_count)}; + std::vector matrices_1_0 = { + utils::id_matrix(level_count + 1), utils::momentum_matrix(level_count), + utils::id_matrix(level_count)}; + std::vector matrices_1_1 = { + utils::momentum_matrix(level_count + 1), utils::id_matrix(level_count), + utils::id_matrix(level_count)}; + auto product_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto difference_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) - + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = product_matrix - difference_matrix; + auto want_matrix_reverse = difference_matrix - product_matrix; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + // `spin product - spin sum` + { + auto product = cudaq::spin_operator::y(0) * cudaq::spin_operator::z(1); + auto original_difference = + cudaq::spin_operator::x(1) - cudaq::spin_operator::i(2); + + auto difference = product - original_difference; + auto reverse = original_difference - product; + + ASSERT_TRUE(difference.num_terms() == 3); + ASSERT_TRUE(reverse.num_terms() == 3); + + auto got_matrix = difference.to_matrix(); + auto got_matrix_reverse = reverse.to_matrix(); + + std::vector matrices_0_0 = { + utils::id_matrix(2), utils::id_matrix(2), utils::PauliY_matrix()}; + std::vector matrices_0_1 = { + utils::id_matrix(2), utils::PauliZ_matrix(), utils::id_matrix(2)}; + 
std::vector matrices_1_0 = { + utils::id_matrix(2), utils::PauliX_matrix(), utils::id_matrix(2)}; + std::vector matrices_1_1 = { + utils::id_matrix(2), utils::id_matrix(2), utils::id_matrix(2)}; + auto product_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto difference_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) - + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = product_matrix - difference_matrix; + auto want_matrix_reverse = difference_matrix - product_matrix; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + // `product_operator * operator_sum` + { + auto original_product = cudaq::matrix_operator::position(0) * + cudaq::matrix_operator::position(1); + auto sum = cudaq::matrix_operator::momentum(1) + + cudaq::matrix_operator::momentum(2); + + auto product = original_product * sum; + auto reverse = sum * original_product; + + ASSERT_TRUE(product.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + + auto got_matrix = product.to_matrix(dimensions); + auto got_matrix_reverse = reverse.to_matrix(dimensions); + + std::vector matrices_0_0 = { + utils::id_matrix(level_count + 1), utils::id_matrix(level_count), + utils::position_matrix(level_count)}; + std::vector matrices_0_1 = { + utils::id_matrix(level_count + 1), utils::position_matrix(level_count), + utils::id_matrix(level_count)}; + std::vector matrices_1_0 = { + utils::id_matrix(level_count + 1), utils::momentum_matrix(level_count), + utils::id_matrix(level_count)}; + std::vector matrices_1_1 = { + utils::momentum_matrix(level_count + 1), utils::id_matrix(level_count), + utils::id_matrix(level_count)}; + auto product_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto sum_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) + + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = product_matrix * sum_matrix; + auto want_matrix_reverse = sum_matrix * product_matrix; + + utils::checkEqual(want_matrix, got_matrix); + utils::checkEqual(want_matrix_reverse, got_matrix_reverse); + } + + // `spin product * spin sum` + { + auto original_product = + cudaq::spin_operator::z(0) * cudaq::spin_operator::y(1); + auto sum = cudaq::spin_operator::i(1) + cudaq::spin_operator::x(2); + + auto product = original_product * sum; + auto reverse = sum * original_product; + + ASSERT_TRUE(product.num_terms() == 2); + ASSERT_TRUE(reverse.num_terms() == 2); + + auto got_matrix = product.to_matrix(); + auto got_matrix_reverse = reverse.to_matrix(); + + std::vector matrices_0_0 = { + utils::id_matrix(2), utils::id_matrix(2), utils::PauliZ_matrix()}; + std::vector matrices_0_1 = { + utils::id_matrix(2), utils::PauliY_matrix(), utils::id_matrix(2)}; + std::vector matrices_1_0 = { + utils::id_matrix(2), utils::id_matrix(2), utils::id_matrix(2)}; + std::vector matrices_1_1 = { + utils::PauliX_matrix(), utils::id_matrix(2), utils::id_matrix(2)}; + auto product_matrix = + cudaq::kronecker(matrices_0_0.begin(), matrices_0_0.end()) * + cudaq::kronecker(matrices_0_1.begin(), matrices_0_1.end()); + auto sum_matrix = + cudaq::kronecker(matrices_1_0.begin(), matrices_1_0.end()) + + cudaq::kronecker(matrices_1_1.begin(), matrices_1_1.end()); + + auto want_matrix = product_matrix * sum_matrix; + auto want_matrix_reverse = 
+      sum_matrix * product_matrix;
+
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_matrix_reverse, got_matrix_reverse);
+  }
+}
+
+TEST(OperatorExpressions, checkCustomProductOps) {
+  auto level_count = 2;
+  std::unordered_map<int, int> dimensions = {{0, level_count + 1},
+                                             {1, level_count + 2},
+                                             {2, level_count},
+                                             {3, level_count + 3}};
+
+  {
+    auto func0 =
+        [](const std::vector<int> &dimensions,
+           const std::unordered_map<std::string, std::complex<double>>
+               &_none) {
+          return cudaq::kronecker(utils::momentum_matrix(dimensions[0]),
+                                  utils::position_matrix(dimensions[1]));
+        };
+    auto func1 =
+        [](const std::vector<int> &dimensions,
+           const std::unordered_map<std::string, std::complex<double>>
+               &_none) {
+          return cudaq::kronecker(utils::momentum_matrix(dimensions[0]),
+                                  utils::number_matrix(dimensions[1]));
+        };
+    cudaq::matrix_operator::define("custom_op0", {-1, -1}, func0);
+    cudaq::matrix_operator::define("custom_op1", {-1, -1}, func1);
+  }
+
+  auto op0 = cudaq::matrix_operator::instantiate("custom_op0", {0, 1});
+  auto op1 = cudaq::matrix_operator::instantiate("custom_op1", {1, 2});
+  auto product = op0 * op1;
+  auto reverse = op1 * op0;
+
+  std::vector<cudaq::matrix_2> matrices = {
+      utils::number_matrix(level_count),
+      utils::position_matrix(level_count + 2) *
+          utils::momentum_matrix(level_count + 2),
+      utils::momentum_matrix(level_count + 1)};
+  auto expected = cudaq::kronecker(matrices.begin(), matrices.end());
+
+  std::vector<cudaq::matrix_2> matrices_reverse = {
+      utils::number_matrix(level_count),
+      utils::momentum_matrix(level_count + 2) *
+          utils::position_matrix(level_count + 2),
+      utils::momentum_matrix(level_count + 1)};
+  auto expected_reverse =
+      cudaq::kronecker(matrices_reverse.begin(), matrices_reverse.end());
+
+  utils::checkEqual(product.to_matrix(dimensions), expected);
+  utils::checkEqual(reverse.to_matrix(dimensions), expected_reverse);
+
+  op0 = cudaq::matrix_operator::instantiate("custom_op0", {2, 3});
+  op1 = cudaq::matrix_operator::instantiate("custom_op1", {2, 0});
+  product = op0 * op1;
+  reverse = op1 * op0;
+
+  matrices = {utils::position_matrix(level_count + 3),
+              utils::momentum_matrix(level_count) *
+                  utils::momentum_matrix(level_count),
+              utils::number_matrix(level_count + 1)};
+  expected = cudaq::kronecker(matrices.begin(), matrices.end());
+
+  matrices_reverse = {utils::position_matrix(level_count + 3),
+                      utils::momentum_matrix(level_count) *
+                          utils::momentum_matrix(level_count),
+                      utils::number_matrix(level_count + 1)};
+  expected_reverse =
+      cudaq::kronecker(matrices_reverse.begin(), matrices_reverse.end());
+
+  utils::checkEqual(product.to_matrix(dimensions), expected);
+  utils::checkEqual(reverse.to_matrix(dimensions), expected_reverse);
+}
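For context on what `matrix_operator::define` / `instantiate` exercise above: conceptually, the library keeps a table of named matrix generators and only evaluates a generator once the concrete dimensions of the targeted degrees of freedom are known. The sketch below is a minimal illustration of that registry pattern, not CUDA-Q's actual internals; `define_op`, `make_matrix`, `generator_t`, and the flat row-major matrix layout are all invented for the example.

    #include <complex>
    #include <functional>
    #include <map>
    #include <stdexcept>
    #include <string>
    #include <vector>

    using complex_t = std::complex<double>;
    // Hypothetical generator signature: maps the dimensions of the targeted
    // degrees of freedom to a flat, row-major matrix.
    using generator_t =
        std::function<std::vector<complex_t>(const std::vector<int> &)>;

    std::map<std::string, generator_t> registry; // name -> matrix generator

    void define_op(const std::string &name, generator_t gen) {
      registry[name] = std::move(gen);
    }

    std::vector<complex_t> make_matrix(const std::string &name,
                                       const std::vector<int> &dims) {
      auto it = registry.find(name);
      if (it == registry.end())
        throw std::runtime_error("operator '" + name + "' was never defined");
      return it->second(dims); // evaluated only when dimensions are known
    }

    int main() {
      // A 'number'-style operator: diag(0, 1, ..., d-1) on one degree.
      define_op("my_number", [](const std::vector<int> &dims) {
        int d = dims[0];
        std::vector<complex_t> m(d * d, 0.0);
        for (int i = 0; i < d; ++i)
          m[i * d + i] = static_cast<double>(i);
        return m;
      });
      auto mat = make_matrix("my_number", {3}); // 3x3 diagonal matrix
      return mat.size() == 9 ? 0 : 1;
    }

The `{-1, -1}` arity in the test plays the role of "any dimension allowed"; deferring evaluation is what lets the same definition be instantiated on degrees with different level counts.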
diff --git a/unittests/dynamics/rydberg_hamiltonian.cpp b/unittests/dynamics/rydberg_hamiltonian.cpp
new file mode 100644
index 0000000000..f3883366ba
--- /dev/null
+++ b/unittests/dynamics/rydberg_hamiltonian.cpp
@@ -0,0 +1,124 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.   *
+ ******************************************************************************/
+
+#include "cudaq/operators.h"
+#include <gtest/gtest.h>
+
+using namespace cudaq;
+
+TEST(RydbergHamiltonianTest, ConstructorValidInputs) {
+  // Valid atom sites
+  std::vector<std::pair<double, double>> atom_sites = {
+      {0.0, 0.0}, {1.0, 0.0}, {0.0, 1.0}, {1.0, 1.0}};
+
+  // Valid operators
+  scalar_operator amplitude(1.0);
+  scalar_operator phase(0.0);
+  scalar_operator delta_global(-0.5);
+
+  // Valid atom filling
+  rydberg_hamiltonian hamiltonian(atom_sites, amplitude, phase, delta_global);
+
+  EXPECT_EQ(hamiltonian.get_atom_sites().size(), atom_sites.size());
+  EXPECT_EQ(hamiltonian.get_atom_filling().size(), atom_sites.size());
+  EXPECT_EQ(hamiltonian.get_amplitude().evaluate({}),
+            std::complex<double>(1.0, 0.0));
+  EXPECT_EQ(hamiltonian.get_phase().evaluate({}),
+            std::complex<double>(0.0, 0.0));
+  EXPECT_EQ(hamiltonian.get_delta_global().evaluate({}),
+            std::complex<double>(-0.5, 0.0));
+}
+
+TEST(RydbergHamiltonianTest, ConstructorWithAtomFilling) {
+  std::vector<std::pair<double, double>> atom_sites = {
+      {0.0, 0.0}, {1.0, 0.0}, {0.0, 1.0}};
+
+  // Valid operators
+  scalar_operator amplitude(1.0);
+  scalar_operator phase(0.0);
+  scalar_operator delta_global(-0.5);
+
+  // Valid atom filling
+  std::vector<int> atom_filling = {1, 0, 1};
+
+  rydberg_hamiltonian hamiltonian(atom_sites, amplitude, phase, delta_global,
+                                  atom_filling);
+
+  EXPECT_EQ(hamiltonian.get_atom_sites().size(), atom_sites.size());
+  EXPECT_EQ(hamiltonian.get_atom_filling(), atom_filling);
+}
+
+TEST(RydbergHamiltonianTest, InvalidAtomFillingSize) {
+  std::vector<std::pair<double, double>> atom_sites = {
+      {0.0, 0.0}, {1.0, 0.0}, {0.0, 1.0}};
+
+  // Valid operators
+  scalar_operator amplitude(1.0);
+  scalar_operator phase(0.0);
+  scalar_operator delta_global(-0.5);
+
+  // Invalid atom filling size
+  std::vector<int> atom_filling = {1, 0};
+
+  EXPECT_THROW(rydberg_hamiltonian(atom_sites, amplitude, phase, delta_global,
+                                   atom_filling),
+               std::invalid_argument);
+}
+
+TEST(RydbergHamiltonianTest, UnsupportedLocalDetuning) {
+  std::vector<std::pair<double, double>> atom_sites = {
+      {0.0, 0.0}, {1.0, 0.0}, {0.0, 1.0}};
+
+  // Valid operators
+  scalar_operator amplitude(1.0);
+  scalar_operator phase(0.0);
+  scalar_operator delta_global(-0.5);
+
+  // Invalid delta_local
+  auto delta_local = std::make_pair(scalar_operator(0.5),
+                                    std::vector<double>{0.1, 0.2, 0.3});
+
+  EXPECT_THROW(rydberg_hamiltonian(atom_sites, amplitude, phase, delta_global,
+                                   {}, delta_local),
+               std::runtime_error);
+}
+
+TEST(RydbergHamiltonianTest, Accessors) {
+  std::vector<std::pair<double, double>> atom_sites = {
+      {0.0, 0.0}, {1.0, 0.0}, {0.0, 1.0}};
+
+  // Valid operators
+  scalar_operator amplitude(1.0);
+  scalar_operator phase(0.0);
+  scalar_operator delta_global(-0.5);
+
+  rydberg_hamiltonian hamiltonian(atom_sites, amplitude, phase, delta_global);
+
+  EXPECT_EQ(hamiltonian.get_atom_sites(), atom_sites);
+  EXPECT_EQ(hamiltonian.get_amplitude().evaluate({}),
+            std::complex<double>(1.0, 0.0));
+  EXPECT_EQ(hamiltonian.get_phase().evaluate({}),
+            std::complex<double>(0.0, 0.0));
+  EXPECT_EQ(hamiltonian.get_delta_global().evaluate({}),
+            std::complex<double>(-0.5, 0.0));
+}
+
+TEST(RydbergHamiltonianTest, DefaultAtomFilling) {
+  std::vector<std::pair<double, double>> atom_sites = {
+      {0.0, 0.0}, {1.0, 0.0}, {0.0, 1.0}, {1.0, 1.0}};
+
+  // Valid operators
+  scalar_operator amplitude(1.0);
+  scalar_operator phase(0.0);
+  scalar_operator delta_global(-0.5);
+
+  rydberg_hamiltonian hamiltonian(atom_sites, amplitude, phase, delta_global);
+
+  std::vector<int> expected_filling(atom_sites.size(), 1);
+  EXPECT_EQ(hamiltonian.get_atom_filling(), expected_filling);
+}
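Before the scalar_operator tests that follow: the pattern they exercise is a coefficient that is either a fixed complex number or a callback evaluated against a map of named parameters, where arithmetic between coefficients yields a new callback deferring to both operands (this is what the later comment about "merging the generators" refers to). Below is a minimal self-contained sketch of that behavior; the names `scalar_fn`, `param_map`, `constant`, and `parameter` are invented for the example and say nothing about the library's internals.

    #include <complex>
    #include <functional>
    #include <stdexcept>
    #include <string>
    #include <unordered_map>

    using complex_t = std::complex<double>;
    using param_map = std::unordered_map<std::string, complex_t>;
    using scalar_fn = std::function<complex_t(const param_map &)>;

    // Wrap a constant so constants and callbacks evaluate uniformly.
    scalar_fn constant(complex_t value) {
      return [value](const param_map &) { return value; };
    }

    // Look up a named parameter at evaluation time, as the test lambdas do.
    scalar_fn parameter(std::string key) {
      return [key](const param_map &params) {
        auto entry = params.find(key);
        if (entry == params.end())
          throw std::runtime_error(key + " not defined in parameters");
        return entry->second;
      };
    }

    // Arithmetic returns a new callback closing over both operands, so each
    // operand keeps resolving its own parameter names from the shared map.
    scalar_fn operator+(scalar_fn lhs, scalar_fn rhs) {
      return [lhs, rhs](const param_map &p) { return lhs(p) + rhs(p); };
    }

    int main() {
      auto combined = parameter("value") + parameter("other") + constant(1.0);
      // Both generators read from the same map, each under its own key.
      complex_t result =
          combined({{"value", {0.1, 0.1}}, {"other", {2.0, 1.0}}});
      return std::abs(result - complex_t(3.1, 1.1)) < 1e-9 ? 0 : 1;
    }

The same composition idea covers `-`, `*`, `/`, and the compound-assignment forms tested below: each produces one merged callback, which is why a sum of two lambdas keyed on "value" and "other" can still be evaluated from a single parameter map.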
diff --git a/unittests/dynamics/scalar_operator.cpp b/unittests/dynamics/scalar_operator.cpp
new file mode 100644
index 0000000000..8332a1dc22
--- /dev/null
+++ b/unittests/dynamics/scalar_operator.cpp
@@ -0,0 +1,625 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.   *
+ ******************************************************************************/
+
+#include "cudaq/operators.h"
+#include <gtest/gtest.h>
+
+cudaq::scalar_operator negate(cudaq::scalar_operator op) { return -1.0 * op; }
+
+TEST(OperatorExpressions, checkScalarOpsUnary) {
+  auto scalar = cudaq::scalar_operator(1.0);
+  EXPECT_EQ((-scalar).evaluate(), std::complex<double>(-1.0));
+  EXPECT_EQ(negate(scalar).evaluate(), std::complex<double>(-1.0));
+}
+
+TEST(OperatorExpressions, checkScalarOpsSimpleComplex) {
+
+  std::complex<double> value_0 = 0.1 + 0.1;
+  std::complex<double> value_1 = 0.1 + 1.0;
+  std::complex<double> value_2 = 2.0 + 0.1;
+  std::complex<double> value_3 = 2.0 + 1.0;
+
+  // From concrete values.
+  {
+    auto operator_0 = cudaq::scalar_operator(value_0);
+    auto operator_1 = cudaq::scalar_operator(value_1);
+    auto operator_2 = cudaq::scalar_operator(value_2);
+    auto operator_3 = cudaq::scalar_operator(value_3);
+
+    auto got_value_0 = operator_0.evaluate();
+    auto got_value_1 = operator_1.evaluate();
+    auto got_value_2 = operator_2.evaluate();
+    auto got_value_3 = operator_3.evaluate();
+
+    EXPECT_NEAR(std::abs(value_0), std::abs(got_value_0), 1e-5);
+    EXPECT_NEAR(std::abs(value_1), std::abs(got_value_1), 1e-5);
+    EXPECT_NEAR(std::abs(value_2), std::abs(got_value_2), 1e-5);
+    EXPECT_NEAR(std::abs(value_3), std::abs(got_value_3), 1e-5);
+  }
+
+  // From a lambda function.
+  {
+    auto function =
+        [](const std::unordered_map<std::string, std::complex<double>>
+               &parameters) {
+          auto entry = parameters.find("value");
+          if (entry == parameters.end())
+            throw std::runtime_error("value not defined in parameters");
+          return entry->second;
+        };
+
+    std::unordered_map<std::string, std::complex<double>> parameter_map;
+
+    auto operator_0 = cudaq::scalar_operator(function);
+    auto operator_1 = cudaq::scalar_operator(function);
+    auto operator_2 = cudaq::scalar_operator(function);
+    auto operator_3 = cudaq::scalar_operator(function);
+
+    parameter_map["value"] = value_0;
+    auto got_value_0 = operator_0.evaluate(parameter_map);
+    parameter_map["value"] = value_1;
+    auto got_value_1 = operator_1.evaluate(parameter_map);
+    parameter_map["value"] = value_2;
+    auto got_value_2 = operator_2.evaluate(parameter_map);
+    parameter_map["value"] = value_3;
+    auto got_value_3 = operator_3.evaluate(parameter_map);
+
+    EXPECT_NEAR(std::abs(value_0), std::abs(got_value_0), 1e-5);
+    EXPECT_NEAR(std::abs(value_1), std::abs(got_value_1), 1e-5);
+    EXPECT_NEAR(std::abs(value_2), std::abs(got_value_2), 1e-5);
+    EXPECT_NEAR(std::abs(value_3), std::abs(got_value_3), 1e-5);
+  }
+}
+
+TEST(OperatorExpressions, checkScalarOpsSimpleDouble) {
+
+  double value_0 = 0.1;
+  double value_1 = 0.2;
+  double value_2 = 2.1;
+  double value_3 = 2.2;
+
+  // From concrete values.
+ { + auto operator_0 = cudaq::scalar_operator(value_0); + auto operator_1 = cudaq::scalar_operator(value_1); + auto operator_2 = cudaq::scalar_operator(value_2); + auto operator_3 = cudaq::scalar_operator(value_3); + + auto got_value_0 = operator_0.evaluate(); + auto got_value_1 = operator_1.evaluate(); + auto got_value_2 = operator_2.evaluate(); + auto got_value_3 = operator_3.evaluate(); + + EXPECT_NEAR(std::abs(value_0), std::abs(got_value_0), 1e-5); + EXPECT_NEAR(std::abs(value_1), std::abs(got_value_1), 1e-5); + EXPECT_NEAR(std::abs(value_2), std::abs(got_value_2), 1e-5); + EXPECT_NEAR(std::abs(value_3), std::abs(got_value_3), 1e-5); + } + + // From a lambda function. + { + auto function = + [](const std::unordered_map> + ¶meters) { + auto entry = parameters.find("value"); + if (entry == parameters.end()) + throw std::runtime_error("value not defined in parameters"); + return entry->second; + }; + + std::unordered_map> parameter_map; + + auto operator_0 = cudaq::scalar_operator(function); + auto operator_1 = cudaq::scalar_operator(function); + auto operator_2 = cudaq::scalar_operator(function); + auto operator_3 = cudaq::scalar_operator(function); + + parameter_map["value"] = value_0; + auto got_value_0 = operator_0.evaluate(parameter_map); + parameter_map["value"] = value_1; + auto got_value_1 = operator_1.evaluate(parameter_map); + parameter_map["value"] = value_2; + auto got_value_2 = operator_2.evaluate(parameter_map); + parameter_map["value"] = value_3; + auto got_value_3 = operator_3.evaluate(parameter_map); + + EXPECT_NEAR(std::abs(value_0), std::abs(got_value_0), 1e-5); + EXPECT_NEAR(std::abs(value_1), std::abs(got_value_1), 1e-5); + EXPECT_NEAR(std::abs(value_2), std::abs(got_value_2), 1e-5); + EXPECT_NEAR(std::abs(value_3), std::abs(got_value_3), 1e-5); + } +} + +TEST(OperatorExpressions, checkScalarOpsArithmeticComplex) { + // Arithmetic overloads against complex doubles. + std::complex value_0 = 0.1 + 0.1; + std::complex value_1 = 0.1 + 1.0; + std::complex value_2 = 2.0 + 0.1; + std::complex value_3 = 2.0 + 1.0; + + auto function = [](const std::unordered_map> + ¶meters) { + auto entry = parameters.find("value"); + if (entry == parameters.end()) + throw std::runtime_error("value not defined in parameters"); + return entry->second; + }; + + // + : Constant scalar operator. + { + auto scalar_op = cudaq::scalar_operator(value_0); + + auto new_scalar_op = value_1 + scalar_op; + auto reverse_order_op = scalar_op + value_1; + EXPECT_NEAR(std::abs(scalar_op.evaluate()), std::abs(value_0), 1e-5); + + auto got_value = new_scalar_op.evaluate(); + auto got_value_1 = reverse_order_op.evaluate(); + auto want_value = value_1 + value_0; + + EXPECT_NEAR(std::abs(got_value), std::abs(want_value), 1e-5); + EXPECT_NEAR(std::abs(got_value_1), std::abs(want_value), 1e-5); + + auto third_op = new_scalar_op + reverse_order_op; + auto got_value_third = third_op.evaluate(); + EXPECT_NEAR(std::abs(got_value_third), std::abs(want_value + want_value), + 1e-5); + } + + // + : Scalar operator from lambda. 
+ { + auto scalar_op = cudaq::scalar_operator(function); + + auto new_scalar_op = value_0 + scalar_op; + auto reverse_order_op = scalar_op + value_0; + + auto got_value = new_scalar_op.evaluate({{"value", value_1}}); + auto got_value_1 = reverse_order_op.evaluate({{"value", value_1}}); + + EXPECT_NEAR(std::abs(got_value), std::abs(value_0 + value_1), 1e-5); + EXPECT_NEAR(std::abs(got_value_1), std::abs(value_1 + value_0), 1e-5); + + auto third_op = new_scalar_op + reverse_order_op; + auto got_value_third = third_op.evaluate({{"value", value_1}}); + auto want_value = value_0 + value_1 + value_1 + value_0; + EXPECT_NEAR(std::abs(got_value_third), std::abs(want_value), 1e-5); + } + + // - : Constant scalar operator. + { + auto scalar_op = cudaq::scalar_operator(value_1); + + auto new_scalar_op = value_3 - scalar_op; + auto reverse_order_op = scalar_op - value_3; + + auto got_value = new_scalar_op.evaluate(); + auto got_value_1 = reverse_order_op.evaluate(); + + EXPECT_NEAR(std::abs(got_value), std::abs(value_3 - value_1), 1e-5); + EXPECT_NEAR(std::abs(got_value_1), std::abs(value_1 - value_3), 1e-5); + + auto third_op = new_scalar_op - reverse_order_op; + auto got_value_third = third_op.evaluate(); + auto want_value = (value_3 - value_1) - (value_1 - value_3); + EXPECT_NEAR(std::abs(got_value_third), std::abs(want_value), 1e-5); + } + + // - : Scalar operator from lambda. + { + auto scalar_op = cudaq::scalar_operator(function); + + auto new_scalar_op = value_2 - scalar_op; + auto reverse_order_op = scalar_op - value_2; + + auto got_value = new_scalar_op.evaluate({{"value", value_1}}); + auto got_value_1 = reverse_order_op.evaluate({{"value", value_1}}); + + EXPECT_NEAR(std::abs(got_value), std::abs(value_2 - value_1), 1e-5); + EXPECT_NEAR(std::abs(got_value_1), std::abs(value_1 - value_2), 1e-5); + + auto third_op = new_scalar_op - reverse_order_op; + auto got_value_third = third_op.evaluate({{"value", value_1}}); + auto want_value = (value_2 - value_1) - (value_1 - value_2); + EXPECT_NEAR(std::abs(got_value_third), std::abs(want_value), 1e-5); + } + + // * : Constant scalar operator. + { + auto scalar_op = cudaq::scalar_operator(value_2); + + auto new_scalar_op = value_3 * scalar_op; + auto reverse_order_op = scalar_op * value_3; + + auto got_value = new_scalar_op.evaluate(); + auto got_value_1 = reverse_order_op.evaluate(); + + EXPECT_NEAR(std::abs(got_value), std::abs(value_3 * value_2), 1e-5); + EXPECT_NEAR(std::abs(got_value_1), std::abs(value_2 * value_3), 1e-5); + + auto third_op = new_scalar_op * reverse_order_op; + auto got_value_third = third_op.evaluate(); + auto want_value = (value_3 * value_2) * (value_2 * value_3); + EXPECT_NEAR(std::abs(got_value_third), std::abs(want_value), 1e-5); + } + + // * : Scalar operator from lambda. + { + auto scalar_op = cudaq::scalar_operator(function); + + auto new_scalar_op = value_3 * scalar_op; + auto reverse_order_op = scalar_op * value_3; + + auto got_value = new_scalar_op.evaluate({{"value", value_2}}); + auto got_value_1 = reverse_order_op.evaluate({{"value", value_2}}); + + EXPECT_NEAR(std::abs(got_value), std::abs(value_3 * value_2), 1e-5); + EXPECT_NEAR(std::abs(got_value_1), std::abs(value_2 * value_3), 1e-5); + + auto third_op = new_scalar_op * reverse_order_op; + auto got_value_third = third_op.evaluate({{"value", value_2}}); + auto want_value = (value_3 * value_2) * (value_2 * value_3); + EXPECT_NEAR(std::abs(got_value_third), std::abs(want_value), 1e-5); + } + + // / : Constant scalar operator. 
+ { + auto scalar_op = cudaq::scalar_operator(value_2); + + auto new_scalar_op = value_3 / scalar_op; + auto reverse_order_op = scalar_op / value_3; + + auto got_value = new_scalar_op.evaluate(); + auto got_value_1 = reverse_order_op.evaluate(); + + EXPECT_NEAR(std::abs(got_value), std::abs(value_3 / value_2), 1e-5); + EXPECT_NEAR(std::abs(got_value_1), std::abs(value_2 / value_3), 1e-5); + + auto third_op = new_scalar_op / reverse_order_op; + auto got_value_third = third_op.evaluate(); + auto want_value = (value_3 / value_2) / (value_2 / value_3); + EXPECT_NEAR(std::abs(got_value_third), std::abs(want_value), 1e-5); + } + + // / : Scalar operator from lambda. + { + auto scalar_op = cudaq::scalar_operator(function); + + auto new_scalar_op = value_3 / scalar_op; + auto reverse_order_op = scalar_op / value_3; + + auto got_value = new_scalar_op.evaluate({{"value", value_1}}); + auto got_value_1 = reverse_order_op.evaluate({{"value", value_1}}); + + EXPECT_NEAR(std::abs(got_value), std::abs(value_3 / value_1), 1e-5); + EXPECT_NEAR(std::abs(got_value_1), std::abs(value_1 / value_3), 1e-5); + + auto third_op = new_scalar_op / reverse_order_op; + auto got_value_third = third_op.evaluate({{"value", value_1}}); + auto want_value = (value_3 / value_1) / (value_1 / value_3); + EXPECT_NEAR(std::abs(got_value_third), std::abs(want_value), 1e-5); + } + + // += : Constant scalar operator. + { + auto scalar_op = cudaq::scalar_operator(value_0); + scalar_op += value_0; + + auto got_value = scalar_op.evaluate(); + EXPECT_NEAR(std::abs(got_value), std::abs(value_0 + value_0), 1e-5); + } + + // += : Scalar operator from lambda. + { + auto scalar_op = cudaq::scalar_operator(function); + scalar_op += value_1; + + auto got_value = scalar_op.evaluate({{"value", value_0}}); + EXPECT_NEAR(std::abs(got_value), std::abs(value_0 + value_1), 1e-5); + } + + // -= : Constant scalar operator. + { + auto scalar_op = cudaq::scalar_operator(value_0); + scalar_op -= value_0; + + auto got_value = scalar_op.evaluate(); + EXPECT_NEAR(std::abs(got_value), std::abs(value_0 - value_0), 1e-5); + } + + // -= : Scalar operator from lambda. + { + auto scalar_op = cudaq::scalar_operator(function); + scalar_op -= value_1; + + auto got_value = scalar_op.evaluate({{"value", value_0}}); + EXPECT_NEAR(std::abs(got_value), std::abs(value_0 - value_1), 1e-5); + } + + // *= : Constant scalar operator. + { + auto scalar_op = cudaq::scalar_operator(value_2); + scalar_op *= value_3; + + auto got_value = scalar_op.evaluate(); + EXPECT_NEAR(std::abs(got_value), std::abs(value_2 * value_3), 1e-5); + } + + // *= : Scalar operator from lambda. + { + auto scalar_op = cudaq::scalar_operator(function); + scalar_op *= value_3; + + auto got_value = scalar_op.evaluate({{"value", value_2}}); + EXPECT_NEAR(std::abs(got_value), std::abs(value_2 * value_3), 1e-5); + } + + // /= : Constant scalar operator. + { + auto scalar_op = cudaq::scalar_operator(value_2); + scalar_op /= value_3; + + auto got_value = scalar_op.evaluate(); + EXPECT_NEAR(std::abs(got_value), std::abs(value_2 / value_3), 1e-5); + } + + // /= : Scalar operator from lambda. + { + auto scalar_op = cudaq::scalar_operator(function); + scalar_op /= value_3; + + auto got_value = scalar_op.evaluate({{"value", value_2}}); + EXPECT_NEAR(std::abs(got_value), std::abs(value_2 / value_3), 1e-5); + } +} + +TEST(OperatorExpressions, checkScalarOpsArithmeticScalarOps) { + // Arithmetic overloads against other scalar ops. 
+TEST(OperatorExpressions, checkScalarOpsArithmeticScalarOps) {
+  // Arithmetic overloads against other scalar ops.
+  std::complex<double> value_0 = 0.1 + 0.1;
+  std::complex<double> value_1 = 0.1 + 1.0;
+  std::complex<double> value_2 = 2.0 + 0.1;
+  std::complex<double> value_3 = 2.0 + 1.0;
+
+  auto function = [](const std::unordered_map<std::string, std::complex<double>>
+                         &parameters) {
+    auto entry = parameters.find("value");
+    if (entry == parameters.end())
+      throw std::runtime_error("value not defined in parameters");
+    return entry->second;
+  };
+
+  // Use a second generator function here to make sure that local variables
+  // that may be unique to each scalar operator's generator are still tracked
+  // correctly when we merge the generators.
+  auto alternative_function =
+      [](const std::unordered_map<std::string, std::complex<double>>
+             &parameters) {
+        auto entry = parameters.find("other");
+        if (entry == parameters.end())
+          throw std::runtime_error("other not defined in parameters");
+        return entry->second;
+      };
+
+  // + : Constant scalar operator.
+  {
+    auto scalar_op = cudaq::scalar_operator(value_0);
+    auto other_scalar_op = cudaq::scalar_operator(value_1);
+
+    auto new_scalar_op = other_scalar_op + scalar_op;
+    auto reverse_order_op = scalar_op + other_scalar_op;
+
+    auto got_value = new_scalar_op.evaluate();
+    auto got_value_1 = reverse_order_op.evaluate();
+    auto want_value = value_1 + value_0;
+
+    EXPECT_NEAR(std::abs(got_value), std::abs(want_value), 1e-5);
+    EXPECT_NEAR(std::abs(got_value_1), std::abs(want_value), 1e-5);
+  }
+
+  // + : Scalar operator from lambda.
+  {
+    auto scalar_op = cudaq::scalar_operator(function);
+    auto other_scalar_op = cudaq::scalar_operator(alternative_function);
+
+    auto new_scalar_op = other_scalar_op + scalar_op;
+    auto reverse_order_op = scalar_op + other_scalar_op;
+
+    std::unordered_map<std::string, std::complex<double>> parameter_map = {
+        {"value", value_1}, {"other", value_0}};
+
+    auto got_value = new_scalar_op.evaluate(parameter_map);
+    auto got_value_1 = reverse_order_op.evaluate(parameter_map);
+
+    EXPECT_NEAR(std::abs(got_value), std::abs(value_0 + value_1), 1e-5);
+    EXPECT_NEAR(std::abs(got_value_1), std::abs(value_1 + value_0), 1e-5);
+  }
+
+  // - : Constant scalar operator.
+  {
+    auto scalar_op = cudaq::scalar_operator(value_2);
+    auto other_scalar_op = cudaq::scalar_operator(value_1);
+
+    auto new_scalar_op = other_scalar_op - scalar_op;
+    auto reverse_order_op = scalar_op - other_scalar_op;
+
+    auto got_value = new_scalar_op.evaluate();
+    auto got_value_1 = reverse_order_op.evaluate();
+    auto want_value = value_1 - value_2;
+
+    EXPECT_NEAR(std::abs(got_value), std::abs(want_value), 1e-5);
+    EXPECT_NEAR(std::abs(got_value_1), std::abs(want_value), 1e-5);
+  }
+
+  // - : Scalar operator from lambda.
+  {
+    auto scalar_op = cudaq::scalar_operator(function);
+    auto other_scalar_op = cudaq::scalar_operator(alternative_function);
+
+    auto new_scalar_op = other_scalar_op - scalar_op;
+    auto reverse_order_op = scalar_op - other_scalar_op;
+
+    std::unordered_map<std::string, std::complex<double>> parameter_map = {
+        {"value", value_1}, {"other", value_3}};
+
+    auto got_value = new_scalar_op.evaluate(parameter_map);
+    auto got_value_1 = reverse_order_op.evaluate(parameter_map);
+
+    EXPECT_NEAR(std::abs(got_value), std::abs(value_3 - value_1), 1e-5);
+    EXPECT_NEAR(std::abs(got_value_1), std::abs(value_1 - value_3), 1e-5);
+  }
+
+  // * : Constant scalar operator.
+  {
+    auto scalar_op = cudaq::scalar_operator(value_2);
+    auto other_scalar_op = cudaq::scalar_operator(value_3);
+
+    auto new_scalar_op = other_scalar_op * scalar_op;
+    auto reverse_order_op = scalar_op * other_scalar_op;
+
+    auto got_value = new_scalar_op.evaluate();
+    auto got_value_1 = reverse_order_op.evaluate();
+    auto want_value = value_3 * value_2;
+    auto reverse_want_value = value_2 * value_3;
+
+    EXPECT_NEAR(std::abs(got_value), std::abs(want_value), 1e-5);
+    EXPECT_NEAR(std::abs(got_value_1), std::abs(reverse_want_value), 1e-5);
+  }
+
+  // * : Scalar operator from lambda.
+  {
+    auto scalar_op = cudaq::scalar_operator(function);
+    auto other_scalar_op = cudaq::scalar_operator(alternative_function);
+
+    auto new_scalar_op = other_scalar_op * scalar_op;
+    auto reverse_order_op = scalar_op * other_scalar_op;
+
+    std::unordered_map<std::string, std::complex<double>> parameter_map = {
+        {"value", value_1}, {"other", value_3}};
+
+    auto got_value = new_scalar_op.evaluate(parameter_map);
+    auto got_value_1 = reverse_order_op.evaluate(parameter_map);
+
+    EXPECT_NEAR(std::abs(got_value), std::abs(value_3 * value_1), 1e-5);
+    EXPECT_NEAR(std::abs(got_value_1), std::abs(value_1 * value_3), 1e-5);
+  }
+
+  // / : Constant scalar operator.
+  {
+    auto scalar_op = cudaq::scalar_operator(value_0);
+    auto other_scalar_op = cudaq::scalar_operator(value_2);
+
+    auto new_scalar_op = other_scalar_op / scalar_op;
+    auto reverse_order_op = scalar_op / other_scalar_op;
+
+    auto got_value = new_scalar_op.evaluate();
+    auto got_value_1 = reverse_order_op.evaluate();
+    auto want_value = value_2 / value_0;
+    auto reverse_want_value = value_0 / value_2;
+
+    EXPECT_NEAR(std::abs(got_value), std::abs(want_value), 1e-5);
+    EXPECT_NEAR(std::abs(got_value_1), std::abs(reverse_want_value), 1e-5);
+  }
+
+  // / : Scalar operator from lambda.
+  {
+    auto scalar_op = cudaq::scalar_operator(function);
+    auto other_scalar_op = cudaq::scalar_operator(alternative_function);
+
+    auto new_scalar_op = other_scalar_op / scalar_op;
+    auto reverse_order_op = scalar_op / other_scalar_op;
+
+    std::unordered_map<std::string, std::complex<double>> parameter_map = {
+        {"value", value_0}, {"other", value_3}};
+
+    auto got_value = new_scalar_op.evaluate(parameter_map);
+    auto got_value_1 = reverse_order_op.evaluate(parameter_map);
+
+    EXPECT_NEAR(std::abs(got_value), std::abs(value_3 / value_0), 1e-5);
+    EXPECT_NEAR(std::abs(got_value_1), std::abs(value_0 / value_3), 1e-5);
+  }
+
+  // += : Constant scalar operator.
+  {
+    auto scalar_op = cudaq::scalar_operator(value_0);
+    auto other = cudaq::scalar_operator(value_0);
+    scalar_op += other;
+
+    auto got_value = scalar_op.evaluate();
+    EXPECT_NEAR(std::abs(got_value), std::abs(value_0 + value_0), 1e-5);
+  }
+
+  // += : Scalar operator from lambda.
+  {
+    auto scalar_op = cudaq::scalar_operator(function);
+    auto other = cudaq::scalar_operator(value_1);
+    scalar_op += other;
+
+    auto scalar_op_1 = cudaq::scalar_operator(function);
+    auto other_function = cudaq::scalar_operator(alternative_function);
+    scalar_op_1 += other_function;
+
+    auto got_value = scalar_op.evaluate({{"value", value_0}});
+    auto got_value_1 =
+        scalar_op_1.evaluate({{"value", value_0}, {"other", value_1}});
+    EXPECT_NEAR(std::abs(got_value), std::abs(value_0 + value_1), 1e-5);
+    EXPECT_NEAR(std::abs(got_value_1), std::abs(value_0 + value_1), 1e-5);
+  }
+
+  // -= : Constant scalar operator.
+  {
+    auto scalar_op = cudaq::scalar_operator(value_0);
+    scalar_op -= value_0;
+
+    auto got_value = scalar_op.evaluate();
+    EXPECT_NEAR(std::abs(got_value), std::abs(value_0 - value_0), 1e-5);
+  }
+
+  // -= : Scalar operator from lambda.
+  {
+    auto scalar_op = cudaq::scalar_operator(function);
+    scalar_op -= value_1;
+
+    auto got_value = scalar_op.evaluate({{"value", value_0}});
+    EXPECT_NEAR(std::abs(got_value), std::abs(value_0 - value_1), 1e-5);
+  }
+
+  // *= : Constant scalar operator.
+  {
+    auto scalar_op = cudaq::scalar_operator(value_2);
+    scalar_op *= value_3;
+
+    auto got_value = scalar_op.evaluate();
+    EXPECT_NEAR(std::abs(got_value), std::abs(value_2 * value_3), 1e-5);
+  }
+
+  // *= : Scalar operator from lambda.
+  {
+    auto scalar_op = cudaq::scalar_operator(function);
+    scalar_op *= value_3;
+
+    auto got_value = scalar_op.evaluate({{"value", value_2}});
+    EXPECT_NEAR(std::abs(got_value), std::abs(value_2 * value_3), 1e-5);
+  }
+
+  // /= : Constant scalar operator.
+  {
+    auto scalar_op = cudaq::scalar_operator(value_2);
+    scalar_op /= value_3;
+
+    auto got_value = scalar_op.evaluate();
+    EXPECT_NEAR(std::abs(got_value), std::abs(value_2 / value_3), 1e-5);
+  }
+
+  // /= : Scalar operator from lambda.
+  {
+    auto scalar_op = cudaq::scalar_operator(function);
+    scalar_op /= value_3;
+
+    auto got_value = scalar_op.evaluate({{"value", value_2}});
+    EXPECT_NEAR(std::abs(got_value), std::abs(value_2 / value_3), 1e-5);
+  }
+}
\ No newline at end of file
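The spin-operator tests that follow repeatedly verify the Pauli algebra, in particular that every Pauli operator squares to the identity. A standalone sketch (separate from the diff) under the same `cudaq/operators.h` API; `main` and the printed dump are illustrative assumptions, and the tests themselves compare against their `utils` helper matrices instead of printing.

    // Sketch: Pauli operators square to the identity.
    #include "cudaq/operators.h"
    #include <iostream>

    int main() {
      auto x = cudaq::spin_operator::x(0);
      // The tests below assert that (op * op).to_matrix() equals the 2x2
      // identity; here we simply print the product matrix to inspect it.
      std::cout << (x * x).to_matrix().dump() << "\n"; // expect diag(1, 1)
      return 0;
    }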
diff --git a/unittests/dynamics/spin_operator.cpp b/unittests/dynamics/spin_operator.cpp
new file mode 100644
index 0000000000..5fe909adb5
--- /dev/null
+++ b/unittests/dynamics/spin_operator.cpp
@@ -0,0 +1,516 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 *
+ * All rights reserved.                                                       *
+ *                                                                            *
+ * This source code and the accompanying materials are made available under  *
+ * the terms of the Apache License 2.0 which accompanies this distribution.  *
+ ******************************************************************************/
+
+#include "cudaq/operators.h"
+#include "utils.h"
+#include <gtest/gtest.h>
+
+TEST(OperatorExpressions, checkSpinOpsUnary) {
+  auto op = cudaq::spin_operator::x(0);
+  utils::checkEqual((-op).to_matrix(), -1.0 * utils::PauliX_matrix());
+}
+
+TEST(OperatorExpressions, checkPreBuiltSpinOps) {
+
+  // Keeping this fixed throughout.
+  int degree_index = 0;
+  auto id = utils::id_matrix(2);
+
+  // Identity operator.
+  {
+    auto op = cudaq::spin_operator::i(degree_index);
+    auto got = op.to_matrix();
+    auto want = utils::id_matrix(2);
+    utils::checkEqual(want, got);
+    utils::checkEqual(id, (op * op).to_matrix());
+  }
+
+  // Z operator.
+  {
+    auto op = cudaq::spin_operator::z(degree_index);
+    auto got = op.to_matrix();
+    auto want = utils::PauliZ_matrix();
+    utils::checkEqual(want, got);
+    utils::checkEqual(id, (op * op).to_matrix());
+  }
+
+  // X operator.
+  {
+    auto op = cudaq::spin_operator::x(degree_index);
+    auto got = op.to_matrix();
+    auto want = utils::PauliX_matrix();
+    utils::checkEqual(want, got);
+    utils::checkEqual(id, (op * op).to_matrix());
+  }
+
+  // Y operator.
+  {
+    auto op = cudaq::spin_operator::y(degree_index);
+    auto got = op.to_matrix();
+    auto want = utils::PauliY_matrix();
+    utils::checkEqual(want, got);
+    utils::checkEqual(id, (op * op).to_matrix());
+  }
+}
+
+TEST(OperatorExpressions, checkSpinOpsWithComplex) {
+  std::complex<double> value = 0.125 + 0.125j;
+
+  // `spin_operator` + `complex<double>`
+  {
+    auto elementary = cudaq::spin_operator::y(0);
+
+    auto sum = value + elementary;
+    auto reverse = elementary + value;
+
+    auto got_matrix = sum.to_matrix();
+    auto got_matrix_reverse = reverse.to_matrix();
+
+    auto scaled_identity = value * utils::id_matrix(2);
+    auto want_matrix = scaled_identity + utils::PauliY_matrix();
+    auto want_matrix_reverse = utils::PauliY_matrix() + scaled_identity;
+
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_matrix_reverse, got_matrix_reverse);
+  }
+
+  // `spin_operator` - `complex<double>`
+  {
+    auto elementary = cudaq::spin_operator::x(0);
+
+    auto difference = value - elementary;
+    auto reverse = elementary - value;
+
+    auto got_matrix = difference.to_matrix();
+    auto got_matrix_reverse = reverse.to_matrix();
+
+    auto scaled_identity = value * utils::id_matrix(2);
+    auto want_matrix = scaled_identity - utils::PauliX_matrix();
+    auto want_matrix_reverse = utils::PauliX_matrix() - scaled_identity;
+
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_matrix_reverse, got_matrix_reverse);
+  }
+
+  // `spin_operator` * `complex<double>`
+  {
+    auto elementary = cudaq::spin_operator::z(0);
+
+    auto product = value * elementary;
+    auto reverse = elementary * value;
+
+    auto got_matrix = product.to_matrix();
+    auto got_matrix_reverse = reverse.to_matrix();
+
+    auto scaled_identity = value * utils::id_matrix(2);
+    auto want_matrix = scaled_identity * utils::PauliZ_matrix();
+    auto want_matrix_reverse = utils::PauliZ_matrix() * scaled_identity;
+
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_matrix_reverse, got_matrix_reverse);
+  }
+}
+
+TEST(OperatorExpressions, checkSpinOpsWithScalars) {
+
+  auto function = [](const std::unordered_map<std::string, std::complex<double>>
+                         &parameters) {
+    auto entry = parameters.find("value");
+    if (entry == parameters.end())
+      throw std::runtime_error("value not defined in parameters");
+    return entry->second;
+  };
+
+  /// Keeping these fixed for these simpler tests.
+  int degree_index = 0;
+  double const_scale_factor = 2.0;
+
+  // `spin_operator + scalar_operator`
+  {
+    auto self = cudaq::spin_operator::x(0);
+    auto other = cudaq::scalar_operator(const_scale_factor);
+
+    auto sum = self + other;
+    auto reverse = other + self;
+
+    ASSERT_TRUE(sum.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(2);
+    auto got_matrix = sum.to_matrix();
+    auto got_reverse_matrix = reverse.to_matrix();
+    auto want_matrix = utils::PauliX_matrix() + scaled_identity;
+    auto want_reverse_matrix = scaled_identity + utils::PauliX_matrix();
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `spin_operator + scalar_operator`
+  {
+    auto self = cudaq::spin_operator::y(0);
+    auto other = cudaq::scalar_operator(function);
+
+    auto sum = self + other;
+    auto reverse = other + self;
+
+    ASSERT_TRUE(sum.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(2);
+    auto got_matrix = sum.to_matrix({}, {{"value", const_scale_factor}});
+    auto got_reverse_matrix =
+        reverse.to_matrix({}, {{"value", const_scale_factor}});
+    auto want_matrix = utils::PauliY_matrix() + scaled_identity;
+    auto want_reverse_matrix = scaled_identity + utils::PauliY_matrix();
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `spin_operator - scalar_operator`
+  {
+    auto self = cudaq::spin_operator::i(0);
+    auto other = cudaq::scalar_operator(const_scale_factor);
+
+    auto sum = self - other;
+    auto reverse = other - self;
+
+    ASSERT_TRUE(sum.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(2);
+    auto got_matrix = sum.to_matrix();
+    auto got_reverse_matrix = reverse.to_matrix();
+    auto want_matrix = utils::id_matrix(2) - scaled_identity;
+    auto want_reverse_matrix = scaled_identity - utils::id_matrix(2);
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `spin_operator - scalar_operator`
+  {
+    auto self = cudaq::spin_operator::z(0);
+    auto other = cudaq::scalar_operator(function);
+
+    auto sum = self - other;
+    auto reverse = other - self;
+
+    ASSERT_TRUE(sum.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(2);
+    auto got_matrix = sum.to_matrix({}, {{"value", const_scale_factor}});
+    auto got_reverse_matrix =
+        reverse.to_matrix({}, {{"value", const_scale_factor}});
+    auto want_matrix = utils::PauliZ_matrix() - scaled_identity;
+    auto want_reverse_matrix = scaled_identity - utils::PauliZ_matrix();
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `spin_operator * scalar_operator`
+  {
+    auto self = cudaq::spin_operator::y(0);
+    auto other = cudaq::scalar_operator(const_scale_factor);
+
+    auto product = self * other;
+    auto reverse = other * self;
+
+    std::vector<int> want_degrees = {0};
+    ASSERT_TRUE(product.degrees() == want_degrees);
+    ASSERT_TRUE(reverse.degrees() == want_degrees);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(2);
+    auto got_matrix = product.to_matrix();
+    auto got_reverse_matrix = reverse.to_matrix();
+    auto want_matrix = utils::PauliY_matrix() * scaled_identity;
+    auto want_reverse_matrix = scaled_identity * utils::PauliY_matrix();
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `spin_operator * scalar_operator`
+  {
+    auto self = cudaq::spin_operator::z(0);
+    auto other = cudaq::scalar_operator(function);
+
+    auto product = self * other;
+    auto reverse = other * self;
+
+    std::vector<int> want_degrees = {0};
+    ASSERT_TRUE(product.degrees() == want_degrees);
+    ASSERT_TRUE(reverse.degrees() == want_degrees);
+
+    auto scaled_identity = const_scale_factor * utils::id_matrix(2);
+    auto got_matrix = product.to_matrix({}, {{"value", const_scale_factor}});
+    auto got_reverse_matrix =
+        reverse.to_matrix({}, {{"value", const_scale_factor}});
+    auto want_matrix = utils::PauliZ_matrix() * scaled_identity;
+    auto want_reverse_matrix = scaled_identity * utils::PauliZ_matrix();
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+}
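As the scalar tests above show, `to_matrix` takes the dimension map first and the parameter values second. A short standalone sketch (separate from the diff) of scaling a spin operator by a parameterized scalar operator; `main` and the printed output are illustrative assumptions.

    // Sketch: parameterized scaling of a spin operator.
    #include "cudaq/operators.h"
    #include <complex>
    #include <iostream>
    #include <unordered_map>

    int main() {
      auto scale = cudaq::scalar_operator(
          [](const std::unordered_map<std::string, std::complex<double>>
                 &params) { return params.at("value"); });
      auto scaled = scale * cudaq::spin_operator::y(0);
      // Empty dimension map (spin degrees default to dimension 2), then the
      // parameter values consumed by the scalar generator.
      auto mat = scaled.to_matrix({}, {{"value", 2.0}});
      std::cout << mat.dump() << "\n"; // expect 2 * PauliY
      return 0;
    }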
+
+TEST(OperatorExpressions, checkSpinOpsSimpleArithmetics) {
+
+  // Addition, same DOF.
+  {
+    auto self = cudaq::spin_operator::x(0);
+    auto other = cudaq::spin_operator::y(0);
+
+    auto sum = self + other;
+    ASSERT_TRUE(sum.num_terms() == 2);
+
+    auto got_matrix = sum.to_matrix();
+    auto want_matrix = utils::PauliX_matrix() + utils::PauliY_matrix();
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // Addition, different DOF's.
+  {
+    auto self = cudaq::spin_operator::z(0);
+    auto other = cudaq::spin_operator::y(1);
+
+    auto sum = self + other;
+    ASSERT_TRUE(sum.num_terms() == 2);
+
+    auto matrix_self =
+        cudaq::kronecker(utils::id_matrix(2), utils::PauliZ_matrix());
+    auto matrix_other =
+        cudaq::kronecker(utils::PauliY_matrix(), utils::id_matrix(2));
+    auto got_matrix = sum.to_matrix();
+    auto want_matrix = matrix_self + matrix_other;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // Subtraction, same DOF.
+  {
+    auto self = cudaq::spin_operator::z(0);
+    auto other = cudaq::spin_operator::x(0);
+
+    auto sum = self - other;
+    ASSERT_TRUE(sum.num_terms() == 2);
+
+    auto got_matrix = sum.to_matrix();
+    auto want_matrix = utils::PauliZ_matrix() - utils::PauliX_matrix();
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // Subtraction, different DOF's.
+  {
+    auto self = cudaq::spin_operator::y(0);
+    auto other = cudaq::spin_operator::x(1);
+
+    auto sum = self - other;
+    ASSERT_TRUE(sum.num_terms() == 2);
+
+    auto annihilate_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::PauliY_matrix());
+    auto create_full =
+        cudaq::kronecker(utils::PauliX_matrix(), utils::id_matrix(2));
+    auto got_matrix = sum.to_matrix();
+    auto want_matrix = annihilate_full - create_full;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // Multiplication, same DOF.
+  {
+    auto self = cudaq::spin_operator::y(0);
+    auto other = cudaq::spin_operator::z(0);
+
+    auto product = self * other;
+    ASSERT_TRUE(product.num_terms() == 1);
+
+    std::vector<int> want_degrees = {0};
+    ASSERT_TRUE(product.degrees() == want_degrees);
+
+    auto got_matrix = product.to_matrix();
+    auto want_matrix = utils::PauliY_matrix() * utils::PauliZ_matrix();
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // Multiplication, different DOF's.
+  {
+    auto self = cudaq::spin_operator::x(0);
+    auto other = cudaq::spin_operator::z(1);
+
+    auto product = self * other;
+    ASSERT_TRUE(product.num_terms() == 2);
+
+    std::vector<int> want_degrees = {1, 0};
+    ASSERT_TRUE(product.degrees() == want_degrees);
+
+    auto annihilate_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::PauliX_matrix());
+    auto create_full =
+        cudaq::kronecker(utils::PauliZ_matrix(), utils::id_matrix(2));
+    auto got_matrix = product.to_matrix();
+    auto want_matrix = annihilate_full * create_full;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+}
+
+TEST(OperatorExpressions, checkSpinOpsAdvancedArithmetics) {
+
+  // Keeping this fixed throughout.
+  std::complex<double> value = 0.125 + 0.5j;
+
+  // `spin_operator + operator_sum`
+  {
+    auto self = cudaq::spin_operator::y(2);
+    auto operator_sum = cudaq::spin_operator::y(2) + cudaq::spin_operator::x(1);
+
+    auto got = self + operator_sum;
+    auto reverse = operator_sum + self;
+
+    ASSERT_TRUE(got.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+
+    auto self_full =
+        cudaq::kronecker(utils::PauliY_matrix(), utils::id_matrix(2));
+    auto term_0_full =
+        cudaq::kronecker(utils::PauliY_matrix(), utils::id_matrix(2));
+    auto term_1_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::PauliX_matrix());
+
+    auto got_matrix = got.to_matrix();
+    auto got_reverse_matrix = reverse.to_matrix();
+    auto want_matrix = self_full + term_0_full + term_1_full;
+    auto want_reverse_matrix = term_0_full + term_1_full + self_full;
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `spin_operator - operator_sum`
+  {
+    auto self = cudaq::spin_operator::i(0);
+    auto operator_sum = cudaq::spin_operator::x(0) + cudaq::spin_operator::z(1);
+
+    auto got = self - operator_sum;
+    auto reverse = operator_sum - self;
+
+    ASSERT_TRUE(got.num_terms() == 3);
+    ASSERT_TRUE(reverse.num_terms() == 3);
+
+    auto self_full = cudaq::kronecker(utils::id_matrix(2), utils::id_matrix(2));
+    auto term_0_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::PauliX_matrix());
+    auto term_1_full =
+        cudaq::kronecker(utils::PauliZ_matrix(), utils::id_matrix(2));
+
+    auto got_matrix = got.to_matrix();
+    auto got_reverse_matrix = reverse.to_matrix();
+    auto want_matrix = self_full - term_0_full - term_1_full;
+    auto want_reverse_matrix = term_0_full + term_1_full - self_full;
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `spin_operator * operator_sum`
+  {
+    auto self = cudaq::spin_operator::y(0);
+    auto operator_sum = cudaq::spin_operator::x(0) + cudaq::spin_operator::y(2);
+
+    auto got = self * operator_sum;
+    auto reverse = operator_sum * self;
+
+    ASSERT_TRUE(got.num_terms() == 2);
+    ASSERT_TRUE(reverse.num_terms() == 2);
+    for (auto &term : got.get_terms())
+      ASSERT_TRUE(term.num_terms() == term.degrees().size());
+    for (auto &term : reverse.get_terms())
+      ASSERT_TRUE(term.num_terms() == term.degrees().size());
+
+    auto self_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::PauliY_matrix());
+    auto term_0_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::PauliX_matrix());
+    auto term_1_full =
+        cudaq::kronecker(utils::PauliY_matrix(), utils::id_matrix(2));
+    auto sum_full = term_0_full + term_1_full;
+
+    auto got_matrix = got.to_matrix();
+    auto got_reverse_matrix = reverse.to_matrix();
+    auto want_matrix = self_full * sum_full;
+    auto want_reverse_matrix = sum_full * self_full;
+    utils::checkEqual(want_matrix, got_matrix);
+    utils::checkEqual(want_reverse_matrix, got_reverse_matrix);
+  }
+
+  // `operator_sum += spin_operator`
+  {
+    auto operator_sum = cudaq::spin_operator::z(0) + cudaq::spin_operator::x(2);
+    operator_sum += cudaq::spin_operator::y(0);
+
+    ASSERT_TRUE(operator_sum.num_terms() == 3);
+
+    auto self_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::PauliZ_matrix());
+    auto term_0_full =
+        cudaq::kronecker(utils::PauliX_matrix(), utils::id_matrix(2));
+    auto term_1_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::PauliY_matrix());
+
+    auto got_matrix = operator_sum.to_matrix();
+    auto want_matrix = term_0_full + term_1_full + self_full;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // `operator_sum -= spin_operator`
+  {
+    auto operator_sum = cudaq::spin_operator::x(0) + cudaq::spin_operator::i(1);
+    operator_sum -= cudaq::spin_operator::x(0);
+
+    ASSERT_TRUE(operator_sum.num_terms() == 2);
+
+    auto self_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::PauliX_matrix());
+    auto term_0_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::id_matrix(2));
+    auto term_1_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::PauliX_matrix());
+
+    auto got_matrix = operator_sum.to_matrix();
+    auto want_matrix = term_0_full + term_1_full - self_full;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+
+  // `operator_sum *= spin_operator`
+  {
+    auto self = cudaq::spin_operator::i(0);
+    auto operator_sum = cudaq::spin_operator::y(0) + cudaq::spin_operator::z(1);
+
+    operator_sum *= self;
+
+    ASSERT_TRUE(operator_sum.num_terms() == 2);
+    for (auto &term : operator_sum.get_terms())
+      ASSERT_TRUE(term.num_terms() == term.degrees().size());
+
+    auto self_full = cudaq::kronecker(utils::id_matrix(2), utils::id_matrix(2));
+    auto term_0_full =
+        cudaq::kronecker(utils::id_matrix(2), utils::PauliY_matrix());
+    auto term_1_full =
+        cudaq::kronecker(utils::PauliZ_matrix(), utils::id_matrix(2));
+    auto sum_full = term_0_full + term_1_full;
+
+    auto got_matrix = operator_sum.to_matrix();
+    auto want_matrix = sum_full * self_full;
+    utils::checkEqual(want_matrix, got_matrix);
+  }
+}
+
+TEST(OperatorExpressions, checkSpinOpsDegreeVerification) {
+  auto op1 = cudaq::spin_operator::z(1);
+  auto op2 = cudaq::spin_operator::x(0);
+  std::map<int, int> dimensions = {{0, 1}, {1, 3}};
+
+  ASSERT_THROW(op1.to_matrix({{1, 3}}), std::runtime_error);
+  ASSERT_THROW((op1 * op2).to_matrix({{0, 3}, {1, 3}}), std::runtime_error);
+  ASSERT_THROW((op1 + op2).to_matrix({{0, 3}}), std::runtime_error);
+  ASSERT_NO_THROW(op1.to_matrix({{0, 3}}));
+}
\ No newline at end of file
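The degree-verification test above relies on dimension checking inside `to_matrix`: spin degrees are fixed at dimension 2, so any other requested dimension is rejected. A standalone sketch (separate from the diff) of catching that failure; the error text is whatever the implementation reports.

    // Sketch: mismatched dimensions are rejected at matrix construction.
    #include "cudaq/operators.h"
    #include <iostream>
    #include <stdexcept>

    int main() {
      auto z1 = cudaq::spin_operator::z(1);
      try {
        z1.to_matrix({{1, 3}}); // dimension 3 for a spin degree is invalid
      } catch (const std::runtime_error &e) {
        std::cout << "rejected: " << e.what() << "\n";
      }
      return 0;
    }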
diff --git a/unittests/dynamics/test_cudm_expectation.cpp b/unittests/dynamics/test_cudm_expectation.cpp
new file mode 100644
index 0000000000..1a9a012d12
--- /dev/null
+++ b/unittests/dynamics/test_cudm_expectation.cpp
@@ -0,0 +1,87 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 *
+ * All rights reserved.                                                       *
+ *                                                                            *
+ * This source code and the accompanying materials are made available under  *
+ * the terms of the Apache License 2.0 which accompanies this distribution.  *
+ ******************************************************************************/
+
+#include "CuDensityMatContext.h"
+#include "CuDensityMatState.h"
+#include "common/EigenDense.h"
+#include "test_mocks.h"
+#include <complex>
+#include <cudensitymat.h>
+#include <gtest/gtest.h>
+#include <iostream>
+#include <memory>
+#include <vector>
+
+using namespace cudaq;
+
+class CuDensityExpectationTest : public ::testing::Test {
+protected:
+  cudensitymatHandle_t handle_;
+
+  void SetUp() override {
+    // Create library handle
+    HANDLE_CUDM_ERROR(cudensitymatCreate(&handle_));
+  }
+
+  void TearDown() override {
+    // Clean up
+    HANDLE_CUDM_ERROR(cudensitymatDestroy(handle_));
+  }
+};
+
+TEST_F(CuDensityExpectationTest, checkCompute) {
+  const std::vector<int64_t> dims = {10};
+  // Check the number operator on a bosonic Fock space.
+  auto op = cudaq::matrix_operator::number(0);
+  auto cudmOp = cudaq::dynamics::Context::getCurrentContext()
+                    ->getOpConverter()
+                    .convertToCudensitymatOperator({}, op, dims);
+
+  cudm_expectation expectation(handle_, cudmOp);
+
+  for (std::size_t stateIdx = 0; stateIdx < dims[0]; ++stateIdx) {
+    std::vector<std::complex<double>> initialState(dims[0], 0.0);
+    initialState[stateIdx] = 1.0;
+    auto inputState =
+        std::make_unique<CuDensityMatState>(handle_, initialState, dims);
+    expectation.prepare(inputState->get_impl());
+    const auto expVal = expectation.compute(inputState->get_impl(), 0.0);
+    EXPECT_NEAR(expVal.real(), 1.0 * stateIdx, 1e-12);
+    EXPECT_NEAR(expVal.imag(), 0.0, 1e-12);
+  }
+}
+
+TEST_F(CuDensityExpectationTest, checkCompositeSystem) {
+  const std::vector<int64_t> dims = {2, 10};
+  // Check the number operator on a bosonic Fock space.
+  auto op = cudaq::matrix_operator::number(1);
+  auto cudmOp = cudaq::dynamics::Context::getCurrentContext()
+                    ->getOpConverter()
+                    .convertToCudensitymatOperator({}, op, dims);
+
+  cudm_expectation expectation(handle_, cudmOp);
+
+  for (std::size_t stateIdx = 0; stateIdx < dims[1]; ++stateIdx) {
+    Eigen::Vector2cd qubit_state;
+    qubit_state << 1.0, 0.0;
+    Eigen::VectorXcd cavity_state = Eigen::VectorXcd::Zero(dims[1]);
+    cavity_state[stateIdx] = 1.0;
+    Eigen::VectorXcd initial_state_vec =
+        Eigen::kroneckerProduct(cavity_state, qubit_state);
+    std::vector<std::complex<double>> initialState(
+        initial_state_vec.data(),
+        initial_state_vec.data() + initial_state_vec.size());
+    auto inputState =
+        std::make_unique<CuDensityMatState>(handle_, initialState, dims);
+    expectation.prepare(inputState->get_impl());
+    const auto expVal = expectation.compute(inputState->get_impl(), 0.0);
+    std::cout << "Result: " << expVal << "\n";
+    EXPECT_NEAR(expVal.real(), 1.0 * stateIdx, 1e-12);
+    EXPECT_NEAR(expVal.imag(), 0.0, 1e-12);
+  }
+}
\ No newline at end of file
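The expectation workflow above is handle creation, operator conversion, `prepare`, then `compute`. A condensed sketch (separate from the diff) that reuses the same internal test headers and calls; it assumes `HANDLE_CUDM_ERROR`, `cudm_expectation`, and the converter API come from those headers and live in namespace `cudaq`, so treat it as an internal-API sketch rather than a public example.

    // Sketch: <n> on the Fock state |1> should be 1.
    #include "CuDensityMatContext.h"
    #include "CuDensityMatState.h"
    #include <complex>
    #include <iostream>
    #include <memory>
    #include <vector>

    int main() {
      cudensitymatHandle_t handle;
      HANDLE_CUDM_ERROR(cudensitymatCreate(&handle));
      const std::vector<int64_t> dims = {10};
      auto op = cudaq::matrix_operator::number(0);
      auto cudmOp = cudaq::dynamics::Context::getCurrentContext()
                        ->getOpConverter()
                        .convertToCudensitymatOperator({}, op, dims);
      cudaq::cudm_expectation expectation(handle, cudmOp);
      std::vector<std::complex<double>> fock(10, 0.0);
      fock[1] = 1.0; // |1>
      auto state =
          std::make_unique<cudaq::CuDensityMatState>(handle, fock, dims);
      expectation.prepare(state->get_impl());
      std::cout << expectation.compute(state->get_impl(), 0.0) << "\n";
      HANDLE_CUDM_ERROR(cudensitymatDestroy(handle));
      return 0;
    }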
diff --git a/unittests/dynamics/test_cudm_op_conversion.cpp b/unittests/dynamics/test_cudm_op_conversion.cpp
new file mode 100644
index 0000000000..20e8228ac3
--- /dev/null
+++ b/unittests/dynamics/test_cudm_op_conversion.cpp
@@ -0,0 +1,202 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 *
+ * All rights reserved.                                                       *
+ *                                                                            *
+ * This source code and the accompanying materials are made available under  *
+ * the terms of the Apache License 2.0 which accompanies this distribution.  *
+ ******************************************************************************/
+
+#include "cudaq/cudm_op_conversion.h"
+// #include "cudaq/operators.h"
+// #include "test_mocks.h"
+// #include <complex>
+// #include <cudensitymat.h>
+// #include <gtest/gtest.h>
+// #include <variant>
+
+// using namespace cudaq;
+
+// class CuDmOpConversion : public ::testing::Test {
+// protected:
+//   cudensitymatHandle_t handle;
+//   std::map<int, int> dimensions;
+//   std::shared_ptr<Schedule> schedule;
+//   std::unique_ptr<cudm_op_conversion> converter;
+//   std::vector<int64_t> space_mode_extents;
+
+//   void SetUp() override {
+//     handle = mock_handle();
+//     dimensions = {{0, 2}, {1, 2}};
+//     for (const auto &dim : dimensions) {
+//       space_mode_extents.push_back(dim.second);
+//     }
+//     schedule = std::shared_ptr<Schedule>();
+//     converter =
+//         std::make_unique<cudm_op_conversion>(handle, dimensions, schedule);
+//   }
+// };
+
+// TEST_F(CuDmOpConversion, ConstructorValid) {
+//   EXPECT_NO_THROW(cudm_op_conversion converter(handle, dimensions,
+//   schedule));
+// }
+
+// TEST_F(CuDmOpConversion, ConstructorEmptyDimensions) {
+//   std::map<int, int> empty_dimensions;
+//   EXPECT_THROW(cudm_op_conversion converter(handle, empty_dimensions,
+//   schedule),
+//                std::invalid_argument);
+// }
+
+// TEST_F(CuDmOpConversion, ConstructorInvalidHandle) {
+//   cudensitymatHandle_t invalid_handle = nullptr;
+//   EXPECT_THROW(
+//       cudm_op_conversion converter(invalid_handle, dimensions, schedule),
+//       std::runtime_error);
+// }
+
+// TEST_F(CuDmOpConversion, EvaluateScalarConstant) {
+//   scalar_operator scalar_op(2.5);
+//   auto result = converter->evaluate(scalar_op);
+
+//   ASSERT_TRUE(std::holds_alternative<std::complex<double>>(result));
+//   EXPECT_EQ(std::get<std::complex<double>>(result),
+//             std::complex<double>(2.5, 0.0));
+// }
+
+// TEST_F(CuDmOpConversion, EvaluateScalarCallback) {
+//   scalar_operator scalar_op([](std::map<std::string, std::complex<double>>) {
+//     return std::complex<double>(1.0, -1.0);
+//   });
+//   auto result = converter->evaluate(scalar_op);
+
+//   ASSERT_TRUE(
+//       std::holds_alternative<cudensitymatWrappedScalarCallback_t>(result));
+// }
+
+// TEST_F(CuDmOpConversion, EvaluateMatrixOperator) {
+//   matrix_operator mat_op = mock_matrix_operator("pauli_x", 0);
+//   auto result = converter->evaluate(mat_op);
+
+//   ASSERT_TRUE(std::holds_alternative<cudensitymatOperatorTerm_t>(result));
+// }
+
+// TEST_F(CuDmOpConversion, EvaluateProductOperator) {
+//   auto op0 = cudaq::matrix_operator::annihilate(0);
+//   auto op1 = cudaq::matrix_operator::create(0);
+//   product_operator<matrix_operator> product_op = op0 * op1;
+//   EXPECT_THROW(converter->evaluate(product_op), std::runtime_error);
+// }
+
+// TEST_F(CuDmOpConversion, AddOperators) {
+//   scalar_operator scalar_op1(2.0);
+//   scalar_operator scalar_op2(3.0);
+
+//   auto result = converter->add(converter->evaluate(scalar_op1),
+//                                converter->evaluate(scalar_op2));
+
+//   ASSERT_TRUE(std::holds_alternative<std::complex<double>>(result));
+//   EXPECT_EQ(std::get<std::complex<double>>(result),
+//             std::complex<double>(5.0, 0.0));
+// }
+
+// TEST_F(CuDmOpConversion, AddComplexScalars) {
+//   std::complex<double> scalar_1(2.0, 1.0);
+//   std::complex<double> scalar_2(3.0, -1.0);
+
+//   auto result = converter->add(scalar_1, scalar_2);
+
+//   ASSERT_TRUE(std::holds_alternative<std::complex<double>>(result));
+//   EXPECT_EQ(std::get<std::complex<double>>(result),
+//             std::complex<double>(5.0, 0.0));
+// }
+
+// TEST_F(CuDmOpConversion, AddScalarAndOperator) {
+//   scalar_operator scalar_op(1.0);
+//   matrix_operator mat_op = mock_matrix_operator("pauli_x", 0);
+
+//   auto scalar_result = converter->evaluate(scalar_op);
+//   auto op_result = converter->evaluate(mat_op);
+
+//   auto final_result = converter->add(scalar_result, op_result);
+
+//   ASSERT_TRUE(
+//       std::holds_alternative<cudensitymatOperatorTerm_t>(final_result));
+// }
+
+// TEST_F(CuDmOpConversion, AddMatrixOperators) {
+//   matrix_operator mat_op1 = mock_matrix_operator("pauli_x", 0);
+//   matrix_operator mat_op2 = mock_matrix_operator("pauli_y", 0);
+
+//   auto op_result1 = converter->evaluate(mat_op1);
+//   auto op_result2 = converter->evaluate(mat_op2);
+
+//   auto final_result = converter->add(op_result1, op_result2);
+
+//   ASSERT_TRUE(
+//       std::holds_alternative<cudensitymatOperatorTerm_t>(final_result));
+// }
+
+// TEST_F(CuDmOpConversion, MultiplyMatrixOperators) {
+//   matrix_operator mat_op1 = mock_matrix_operator("pauli_x", 0);
+//   matrix_operator mat_op2 = mock_matrix_operator("pauli_y", 0);
+
+//   auto op_result1 = converter->evaluate(mat_op1);
+//   auto op_result2 = converter->evaluate(mat_op2);
+
+//   auto final_result = converter->mul(op_result1, op_result2);
+
+//   ASSERT_TRUE(
+//       std::holds_alternative<cudensitymatOperatorTerm_t>(final_result));
+// }
+
+// TEST_F(CuDmOpConversion, TensorOfMatrixOperators) {
+//   matrix_operator mat_op1 = mock_matrix_operator("pauli_x", 0);
+//   matrix_operator mat_op2 = mock_matrix_operator("pauli_y", 0);
+
+//   auto op_result1 = converter->evaluate(mat_op1);
+//   auto op_result2 = converter->evaluate(mat_op2);
+
+//   auto final_result = converter->tensor(op_result1, op_result2);
+
+//   ASSERT_TRUE(
+//       std::holds_alternative<cudensitymatOperatorTerm_t>(final_result));
+// }
+
+// TEST_F(CuDmOpConversion, TensorProductOfScalars) {
+//   auto result = converter->tensor(2.0, 3.0);
+//   EXPECT_TRUE(std::holds_alternative<std::complex<double>>(result));
+//   std::complex<double> final_result = std::get<std::complex<double>>(result);
+//   EXPECT_EQ(final_result.real(), 6);
+//   EXPECT_EQ(final_result.imag(), 0);
+// }
+
+// TEST_F(CuDmOpConversion, TensorProductScalarAndOperator) {
+//   cudensitymatOperatorTerm_t op_term;
+//   HANDLE_CUDM_ERROR(cudensitymatCreateOperatorTerm(
+//       handle, dimensions.size(), space_mode_extents.data(), &op_term));
+
+//   auto result = converter->tensor(2.0, op_term);
+//   EXPECT_TRUE(std::holds_alternative<cudensitymatOperatorTerm_t>(result));
+
+//   HANDLE_CUDM_ERROR(cudensitymatDestroyOperatorTerm(op_term));
+// }
+
+// TEST_F(CuDmOpConversion, MultiplyOperators) {
+//   auto result = converter->mul(6.0, 3.0);
+//   EXPECT_TRUE(std::holds_alternative<std::complex<double>>(result));
+//   std::complex<double> final_result = std::get<std::complex<double>>(result);
+//   EXPECT_EQ(final_result.real(), 18);
+//   EXPECT_EQ(final_result.imag(), 0);
+// }
+
+// TEST_F(CuDmOpConversion, MoveSemantics) {
+//   scalar_operator scalar_op(1.5);
+//   auto scalar_result = converter->evaluate(scalar_op);
+
+//   EXPECT_TRUE(std::holds_alternative<std::complex<double>>(scalar_result));
+
+//   auto moved_result = std::move(scalar_result);
+
+//   EXPECT_TRUE(std::holds_alternative<std::complex<double>>(moved_result));
+//   EXPECT_EQ(std::get<std::complex<double>>(moved_result),
+//             std::complex<double>(1.5, 0.0));
+
+//   EXPECT_TRUE(scalar_result.index() != std::variant_npos);
+// }
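The disabled conversion tests above assert on a `std::variant`-style result from `evaluate`/`add`/`mul`/`tensor`. The variant's exact alternatives are not fully visible in this diff (the operator-term and callback types shown here are inferences), so this standalone sketch only illustrates the `holds_alternative`/`get` pattern those assertions use, with deliberately stand-in alternative types.

    // Illustrative variant handling; the alternative types are stand-ins.
    #include <complex>
    #include <iostream>
    #include <string>
    #include <variant>

    using ConversionResult = std::variant<std::complex<double>, std::string>;

    ConversionResult add(const ConversionResult &a, const ConversionResult &b) {
      // Fold two scalar results; anything else would take an operator path.
      if (std::holds_alternative<std::complex<double>>(a) &&
          std::holds_alternative<std::complex<double>>(b))
        return std::get<std::complex<double>>(a) +
               std::get<std::complex<double>>(b);
      return std::string("operator term");
    }

    int main() {
      auto r = add(std::complex<double>(2.0, 1.0),
                   std::complex<double>(3.0, -1.0));
      std::cout << std::get<std::complex<double>>(r) << "\n"; // (5,0)
      return 0;
    }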
diff --git a/unittests/dynamics/test_cudm_state.cpp b/unittests/dynamics/test_cudm_state.cpp
new file mode 100644
index 0000000000..88a9762a4e
--- /dev/null
+++ b/unittests/dynamics/test_cudm_state.cpp
@@ -0,0 +1,135 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 *
+ * All rights reserved.                                                       *
+ *                                                                            *
+ * This source code and the accompanying materials are made available under  *
+ * the terms of the Apache License 2.0 which accompanies this distribution.  *
+ ******************************************************************************/
+
+#include "CuDensityMatState.h"
+#include <complex>
+#include <cudensitymat.h>
+#include <gtest/gtest.h>
+#include <iostream>
+#include <vector>
+
+using namespace cudaq;
+
+class CuDensityMatStateTest : public ::testing::Test {
+protected:
+  cudensitymatHandle_t handle;
+
+  void SetUp() override {
+    HANDLE_CUDM_ERROR(cudensitymatCreate(&handle));
+
+    // Set up test data for a single 2-qubit system
+    hilbertSpaceDims = {2, 2};
+
+    // State vector (pure state) for |00>
+    stateVectorData = {
+        std::complex<double>(1.0, 0.0), std::complex<double>(0.0, 0.0),
+        std::complex<double>(0.0, 0.0), std::complex<double>(0.0, 0.0)};
+
+    // Density matrix for |00><00|
+    densityMatrixData = {
+        std::complex<double>(1.0, 0.0), std::complex<double>(0.0, 0.0),
+        std::complex<double>(0.0, 0.0), std::complex<double>(0.0, 0.0),
+        std::complex<double>(0.0, 0.0), std::complex<double>(0.0, 0.0),
+        std::complex<double>(0.0, 0.0), std::complex<double>(0.0, 0.0),
+        std::complex<double>(0.0, 0.0), std::complex<double>(0.0, 0.0),
+        std::complex<double>(0.0, 0.0), std::complex<double>(0.0, 0.0),
+        std::complex<double>(0.0, 0.0), std::complex<double>(0.0, 0.0),
+        std::complex<double>(0.0, 0.0), std::complex<double>(0.0, 0.0)};
+  }
+
+  void TearDown() override { cudensitymatDestroy(handle); }
+
+  std::vector<int64_t> hilbertSpaceDims;
+  std::vector<std::complex<double>> stateVectorData;
+  std::vector<std::complex<double>> densityMatrixData;
+};
+
+TEST_F(CuDensityMatStateTest, InitializeWithStateVector) {
+  CuDensityMatState state(handle, stateVectorData, hilbertSpaceDims);
+
+  EXPECT_TRUE(state.is_initialized());
+  EXPECT_FALSE(state.is_density_matrix());
+  EXPECT_NO_THROW(state.dump(std::cout));
+}
+
+TEST_F(CuDensityMatStateTest, InitializeWithDensityMatrix) {
+  CuDensityMatState state(handle, densityMatrixData, hilbertSpaceDims);
+
+  EXPECT_TRUE(state.is_initialized());
+  EXPECT_TRUE(state.is_density_matrix());
+  EXPECT_NO_THROW(state.dump(std::cout));
+}
+
+TEST_F(CuDensityMatStateTest, InvalidInitialization) {
+  // Data size mismatch for hilbertSpaceDims (2x2 system expects size 4 or 16)
+  std::vector<std::complex<double>> invalidData = {{1.0, 0.0}, {0.0, 0.0}};
+
+  EXPECT_THROW(CuDensityMatState state(handle, invalidData, hilbertSpaceDims),
+               std::invalid_argument);
+}
+
+TEST_F(CuDensityMatStateTest, ToDensityMatrixConversion) {
+  CuDensityMatState state(handle, stateVectorData, hilbertSpaceDims);
+  EXPECT_FALSE(state.is_density_matrix());
+
+  CuDensityMatState densityMatrixState = state.to_density_matrix();
+  EXPECT_TRUE(densityMatrixState.is_density_matrix());
+  EXPECT_TRUE(densityMatrixState.is_initialized());
+  EXPECT_NO_THROW(densityMatrixState.dump(std::cout));
+}
+
+TEST_F(CuDensityMatStateTest, AlreadyDensityMatrixConversion) {
+  CuDensityMatState state(handle, densityMatrixData, hilbertSpaceDims);
+
+  EXPECT_TRUE(state.is_density_matrix());
+  EXPECT_THROW(state.to_density_matrix(), std::runtime_error);
+}
+
+TEST_F(CuDensityMatStateTest, DestructorCleansUp) {
+  EXPECT_NO_THROW(
+      { CuDensityMatState state(handle, stateVectorData, hilbertSpaceDims); });
+}
+
+TEST_F(CuDensityMatStateTest, InitializeWithEmptyRawData) {
+  std::vector<std::complex<double>> emptyData;
+
+  EXPECT_THROW(CuDensityMatState state(handle, emptyData, hilbertSpaceDims),
+               std::invalid_argument);
+}
+
+TEST_F(CuDensityMatStateTest, ConversionForSingleQubitSystem) {
+  hilbertSpaceDims = {2};
+  stateVectorData = {{1.0, 0.0}, {0.0, 0.0}};
+  CuDensityMatState state(handle, stateVectorData, hilbertSpaceDims);
+
+  EXPECT_FALSE(state.is_density_matrix());
+
+  CuDensityMatState densityMatrixState = state.to_density_matrix();
+  EXPECT_TRUE(densityMatrixState.is_density_matrix());
+  EXPECT_TRUE(densityMatrixState.is_initialized());
+  EXPECT_NO_THROW(densityMatrixState.dump(std::cout));
+}
+
+TEST_F(CuDensityMatStateTest, InvalidHilbertSpaceDims) {
+  // A 3x3 space is not supported by the provided rawData size.
+  hilbertSpaceDims = {3, 3};
+  EXPECT_THROW(
+      CuDensityMatState state(handle, stateVectorData, hilbertSpaceDims),
+      std::invalid_argument);
+}
+
+TEST_F(CuDensityMatStateTest, ValidDensityMatrixState) {
+  CuDensityMatState state(handle, densityMatrixData, hilbertSpaceDims);
+  EXPECT_TRUE(state.is_density_matrix());
+  EXPECT_TRUE(state.is_initialized());
+}
+
+TEST_F(CuDensityMatStateTest, DumpWorksForInitializedState) {
+  CuDensityMatState state(handle, stateVectorData, hilbertSpaceDims);
+  EXPECT_NO_THROW(state.dump(std::cout));
+}
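The state tests above distinguish a state vector from a density matrix purely by data size, as the `InvalidInitialization` comment notes (a 2x2 system expects 4 or 16 amplitudes). A standalone sketch (separate from the diff) of that size rule in plain C++, with no GPU calls; the helper name is hypothetical.

    // Size-based detection: dim amplitudes -> state vector,
    // dim * dim amplitudes -> density matrix.
    #include <cstddef>
    #include <cstdint>
    #include <iostream>
    #include <vector>

    bool is_density_matrix(std::size_t dataSize,
                           const std::vector<int64_t> &hilbertSpaceDims) {
      std::size_t dim = 1;
      for (auto d : hilbertSpaceDims)
        dim *= d;
      return dataSize == dim * dim;
    }

    int main() {
      std::cout << is_density_matrix(4, {2, 2}) << " "   // 0: state vector
                << is_density_matrix(16, {2, 2}) << "\n"; // 1: density matrix
      return 0;
    }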
diff --git a/unittests/dynamics/test_cudm_time_stepper.cpp b/unittests/dynamics/test_cudm_time_stepper.cpp
new file mode 100644
index 0000000000..efa34f567e
--- /dev/null
+++ b/unittests/dynamics/test_cudm_time_stepper.cpp
@@ -0,0 +1,348 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 *
+ * All rights reserved.                                                       *
+ *                                                                            *
+ * This source code and the accompanying materials are made available under  *
+ * the terms of the Apache License 2.0 which accompanies this distribution.  *
+ ******************************************************************************/
+
+#include "CuDensityMatContext.h"
+#include "CuDensityMatState.h"
+#include "cudm_time_stepper.h"
+#include "test_mocks.h"
+#include <complex>
+#include <cudensitymat.h>
+#include <gtest/gtest.h>
+#include <memory>
+
+using namespace cudaq;
+
+class CuDensityMatTimeStepperTest : public ::testing::Test {
+protected:
+  cudensitymatHandle_t handle_;
+  cudensitymatOperator_t liouvillian_;
+  std::unique_ptr<cudmStepper> time_stepper_;
+  cudaq::state state_ = cudaq::state(nullptr);
+
+  void SetUp() override {
+    // Create library handle
+    HANDLE_CUDM_ERROR(cudensitymatCreate(&handle_));
+
+    // Create a mock Liouvillian
+    liouvillian_ = mock_liouvillian(handle_);
+
+    // Initialize the time stepper
+    time_stepper_ = std::make_unique<cudmStepper>(handle_, liouvillian_);
+
+    state_ = cudaq::state::from_data(mock_initial_state_data());
+    auto *simState = cudaq::state_helper::getSimulationState(&state_);
+    auto *castSimState = dynamic_cast<CuDensityMatState *>(simState);
+    EXPECT_TRUE(castSimState != nullptr);
+    castSimState->initialize_cudm(handle_, mock_hilbert_space_dims());
+    ASSERT_TRUE(castSimState->is_initialized());
+  }
+
+  void TearDown() override {
+    // Clean up
+    HANDLE_CUDM_ERROR(cudensitymatDestroyOperator(liouvillian_));
+    // HANDLE_CUDM_ERROR(cudensitymatDestroy(handle_));
+  }
+};
+
+// Test initialization of cudmStepper
+TEST_F(CuDensityMatTimeStepperTest, Initialization) {
+  ASSERT_NE(time_stepper_, nullptr);
+  auto *simState = cudaq::state_helper::getSimulationState(&state_);
+  auto *castSimState = dynamic_cast<CuDensityMatState *>(simState);
+  EXPECT_TRUE(castSimState != nullptr);
+  ASSERT_TRUE(castSimState->is_initialized());
+  ASSERT_FALSE(castSimState->is_density_matrix());
+}
+
+// Test a single compute step
+TEST_F(CuDensityMatTimeStepperTest, ComputeStep) {
+  EXPECT_NO_THROW(time_stepper_->compute(state_, 0.0, 1.0, {}));
+}
+
+// Compute step when handle is uninitialized
+TEST_F(CuDensityMatTimeStepperTest, ComputeStepUninitializedHandle) {
+  cudmStepper invalidStepper(nullptr, liouvillian_);
+  EXPECT_THROW(invalidStepper.compute(state_, 0.0, 1.0, {}),
+               std::runtime_error);
+}
+
+// Compute step when the Liouvillian is missing
+TEST_F(CuDensityMatTimeStepperTest, ComputeStepNoLiouvillian) {
+  cudmStepper invalidStepper(handle_, nullptr);
+  EXPECT_THROW(invalidStepper.compute(state_, 0.0, 1.0, {}),
+               std::runtime_error);
+}
+
+// Compute step with mismatched dimensions
+TEST_F(CuDensityMatTimeStepperTest, ComputeStepMismatchedDimensions) {
+  EXPECT_THROW(
+      std::unique_ptr<CuDensityMatState> mismatchedState =
+          std::make_unique<CuDensityMatState>(
+              handle_, mock_initial_state_data(), std::vector<int64_t>{3, 3}),
+      std::invalid_argument);
+}
+
+// Compute step with zero step size
+TEST_F(CuDensityMatTimeStepperTest, ComputeStepZeroStepSize) {
+  EXPECT_THROW(time_stepper_->compute(state_, 0.0, 0.0, {}),
+               std::runtime_error);
+}
+
+// Compute step with large time values
+TEST_F(CuDensityMatTimeStepperTest, ComputeStepLargeTimeValues) {
+  EXPECT_NO_THROW(time_stepper_->compute(state_, 1e6, 1e3, {}));
+}
+
+TEST_F(CuDensityMatTimeStepperTest, ComputeStepCheckOutput) {
+  const std::vector<std::complex<double>> initialState = {
+      {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}};
+  const std::vector<int64_t> dims = {4};
+  auto inputState = cudaq::state::from_data(initialState);
+  auto *simState = cudaq::state_helper::getSimulationState(&inputState);
+  auto *castSimState = dynamic_cast<CuDensityMatState *>(simState);
+  EXPECT_TRUE(castSimState != nullptr);
+  castSimState->initialize_cudm(handle_, dims);
+
+  cudaq::product_operator<cudaq::boson_operator> op_1 =
+      cudaq::boson_operator::create(0);
+  cudaq::operator_sum<cudaq::boson_operator> op(op_1);
+  auto cudmOp = cudaq::dynamics::Context::getCurrentContext()
+                    ->getOpConverter()
+                    .convertToCudensitymatOperator({}, op, dims);
+  // Initialize the time stepper
+  auto time_stepper = std::make_unique<cudmStepper>(handle_, cudmOp);
+  auto outputState = time_stepper->compute(inputState, 0.0, 1.0, {});
+
+  std::vector<std::complex<double>> outputStateVec(4);
+  outputState.to_host(outputStateVec.data(), outputStateVec.size());
+  // The creation operator moves the state up one level.
+  const std::vector<std::complex<double>> expectedOutputState = {
+      {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}};
+
+  for (std::size_t i = 0; i < expectedOutputState.size(); ++i) {
+    EXPECT_TRUE(std::abs(expectedOutputState[i] - outputStateVec[i]) < 1e-12);
+  }
+  HANDLE_CUDM_ERROR(cudensitymatDestroyOperator(cudmOp));
+}
+
+TEST_F(CuDensityMatTimeStepperTest, TimeSteppingWithLindblad) {
+  std::vector<std::complex<double>> initial_state;
+  initial_state.resize(100, {0.0, 0.0});
+  initial_state[5 * 10 + 5] = {1.0, 0.0};
+
+  const std::vector<int64_t> dims = {10};
+  auto input_state = cudaq::state::from_data(initial_state);
+  auto *simState = cudaq::state_helper::getSimulationState(&input_state);
+  auto *castSimState = dynamic_cast<CuDensityMatState *>(simState);
+  EXPECT_TRUE(castSimState != nullptr);
+  castSimState->initialize_cudm(handle_, dims);
+  cudaq::product_operator<cudaq::boson_operator> c_op_0 =
+      cudaq::boson_operator::annihilate(0);
+  cudaq::operator_sum<cudaq::boson_operator> c_op(c_op_0);
+  cudaq::operator_sum<cudaq::boson_operator> zero_op = 0.0 * c_op;
+  auto cudm_lindblad_op =
+      cudaq::dynamics::Context::getCurrentContext()
+          ->getOpConverter()
+          .constructLiouvillian(zero_op, {c_op}, dims, {}, true);
+
+  auto time_stepper = std::make_unique<cudmStepper>(handle_, cudm_lindblad_op);
+  auto output_state = time_stepper->compute(input_state, 0.0, 1.0, {});
+
+  std::vector<std::complex<double>> output_state_vec(100);
+  output_state.to_host(output_state_vec.data(), output_state_vec.size());
+  EXPECT_NEAR(
+      std::abs(output_state_vec[4 * 10 + 4] - std::complex<double>(5.0, 0.0)),
+      0.0, 1e-12);
+  EXPECT_NEAR(
+      std::abs(output_state_vec[5 * 10 + 5] - std::complex<double>(-5.0, 0.0)),
+      0.0, 1e-12);
+
+  HANDLE_CUDM_ERROR(cudensitymatDestroyOperator(cudm_lindblad_op));
+}
+
+TEST_F(CuDensityMatTimeStepperTest, CheckScalarCallback) {
+  const std::vector<std::complex<double>> initialState = {
+      {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}};
+  const std::vector<int64_t> dims = {4};
+  auto inputState = cudaq::state::from_data(initialState);
+  auto *simState = cudaq::state_helper::getSimulationState(&inputState);
+  auto *castSimState = dynamic_cast<CuDensityMatState *>(simState);
+  EXPECT_TRUE(castSimState != nullptr);
+  castSimState->initialize_cudm(handle_, dims);
+  const std::string paramName = "alpha";
+  const std::complex<double> paramValue{2.0, 3.0};
+  std::unordered_map<std::string, std::complex<double>> params{
+      {paramName, paramValue}};
+
+  auto function =
+      [paramName](const std::unordered_map<std::string, std::complex<double>>
+                      &parameters) {
+        auto entry = parameters.find(paramName);
+        if (entry == parameters.end())
+          throw std::runtime_error(
+              "Cannot find value of expected parameter named " + paramName);
+        return entry->second;
+      };
+
+  cudaq::product_operator<cudaq::boson_operator> op_t =
+      cudaq::scalar_operator(function) * cudaq::boson_operator::create(0);
+  cudaq::operator_sum<cudaq::boson_operator> op(op_t);
+  auto cudmOp = cudaq::dynamics::Context::getCurrentContext()
+                    ->getOpConverter()
+                    .convertToCudensitymatOperator(params, op, dims);
+  // Initialize the time stepper
+  auto time_stepper = std::make_unique<cudmStepper>(handle_, cudmOp);
+  auto outputState = time_stepper->compute(inputState, 1.0, 1.0, params);
+  outputState.dump(std::cout);
+  std::vector<std::complex<double>> outputStateVec(4);
+  outputState.to_host(outputStateVec.data(), outputStateVec.size());
+  // The creation operator moves the state up one level.
+  const std::vector<std::complex<double>> expectedOutputState = {
+      {0.0, 0.0}, paramValue, {0.0, 0.0}, {0.0, 0.0}};
+
+  for (std::size_t i = 0; i < expectedOutputState.size(); ++i) {
+    EXPECT_TRUE(std::abs(expectedOutputState[i] - outputStateVec[i]) < 1e-12);
+  }
+  HANDLE_CUDM_ERROR(cudensitymatDestroyOperator(cudmOp));
+}
+
+TEST_F(CuDensityMatTimeStepperTest, CheckTensorCallback) {
+  const std::vector<std::complex<double>> initialState = {{1.0, 0.0},
+                                                          {1.0, 0.0}};
+  const std::vector<int64_t> dims = {2};
+  auto inputState = cudaq::state::from_data(initialState);
+  auto *simState = cudaq::state_helper::getSimulationState(&inputState);
+  auto *castSimState = dynamic_cast<CuDensityMatState *>(simState);
+  EXPECT_TRUE(castSimState != nullptr);
+  castSimState->initialize_cudm(handle_, dims);
+
+  const std::string paramName = "beta";
+  const std::complex<double> paramValue{2.0, 3.0};
+  std::unordered_map<std::string, std::complex<double>> params{
+      {paramName, paramValue}};
+
+  auto tensorFunction =
+      [paramName](const std::vector<int64_t> &dimensions,
+                  const std::unordered_map<std::string, std::complex<double>>
+                      &parameters) -> matrix_2 {
+    if (dimensions.empty()) {
+      throw std::runtime_error("Empty dimensions vector received!");
+    }
+
+    auto entry = parameters.find(paramName);
+    if (entry == parameters.end())
+      throw std::runtime_error(
+          "Cannot find value of expected parameter named " + paramName);
+
+    std::complex<double> value = entry->second;
+    matrix_2 mat(2, 2);
+    mat[{0, 0}] = value;
+    mat[{1, 1}] = std::conj(value);
+    mat[{0, 1}] = {0.0, 0.0};
+    mat[{1, 0}] = {0.0, 0.0};
+    return mat;
+  };
+
+  matrix_operator::define("CustomTensorOp", {2}, tensorFunction);
+  auto op = cudaq::matrix_operator::instantiate("CustomTensorOp", {0});
+  auto cudmOp = cudaq::dynamics::Context::getCurrentContext()
+                    ->getOpConverter()
+                    .convertToCudensitymatOperator(params, op, dims);
+  // Initialize the time stepper
+  auto time_stepper = std::make_unique<cudmStepper>(handle_, cudmOp);
+  auto outputState = time_stepper->compute(inputState, 1.0, 1.0, params);
+  outputState.dump(std::cout);
+  std::vector<std::complex<double>> outputStateVec(2);
+  outputState.to_host(outputStateVec.data(), outputStateVec.size());
+  // The diagonal tensor callback scales each amplitude accordingly.
+  const std::vector<std::complex<double>> expectedOutputState = {
+      paramValue, std::conj(paramValue)};
+
+  for (std::size_t i = 0; i < expectedOutputState.size(); ++i) {
+    EXPECT_TRUE(std::abs(expectedOutputState[i] - outputStateVec[i]) < 1e-12);
+  }
+  HANDLE_CUDM_ERROR(cudensitymatDestroyOperator(cudmOp));
+}
+
+TEST_F(CuDensityMatTimeStepperTest, ComputeOperatorOrder) {
+  const std::vector<std::complex<double>> initialState = {
+      {1.0, 0.0}, {1.0, 0.0}, {1.0, 0.0}, {1.0, 0.0}};
+  const std::vector<int64_t> dims = {4};
+  auto inputState = cudaq::state::from_data(initialState);
+  auto *simState = cudaq::state_helper::getSimulationState(&inputState);
+  auto *castSimState = dynamic_cast<CuDensityMatState *>(simState);
+  EXPECT_TRUE(castSimState != nullptr);
+  castSimState->initialize_cudm(handle_, dims);
+
+  cudaq::product_operator<cudaq::boson_operator> op_t =
+      cudaq::boson_operator::create(0) *
+      cudaq::boson_operator::annihilate(0); // a_dagger * a
+  cudaq::operator_sum<cudaq::boson_operator> op(op_t);
+  const auto opMat = op.to_matrix({{0, 4}});
+
+  std::cout << "Op matrix:\n" << opMat.dump() << "\n";
+  auto cudmOp = cudaq::dynamics::Context::getCurrentContext()
+                    ->getOpConverter()
+                    .convertToCudensitymatOperator({}, op, dims);
+  // Initialize the time stepper
+  auto time_stepper = std::make_unique<cudmStepper>(handle_, cudmOp);
+  auto outputState = time_stepper->compute(inputState, 0.0, 1.0, {});
+  std::vector<std::complex<double>> expectedOutputStateVec(4);
+  // Diagonal elements
+  for (std::size_t i = 0; i < expectedOutputStateVec.size(); ++i)
+    expectedOutputStateVec[i] = opMat[{i, i}];
+
+  std::vector<std::complex<double>> outputStateVec(4);
+  outputState.to_host(outputStateVec.data(), outputStateVec.size());
+  HANDLE_CUDM_ERROR(cudensitymatDestroyOperator(cudmOp));
+  for (std::size_t i = 0; i < expectedOutputStateVec.size(); ++i) {
+    std::cout << "Result = " << outputStateVec[i]
+              << "; vs. expected = " << expectedOutputStateVec[i] << "\n";
+    EXPECT_TRUE(std::abs(expectedOutputStateVec[i] - outputStateVec[i]) <
+                1e-12);
+  }
+}
+
+TEST_F(CuDensityMatTimeStepperTest, ComputeOperatorOrderDensityMatrix) {
+  constexpr int N = 4;
+  const std::vector<std::complex<double>> initialState(N * N, 1.0);
+  const std::vector<int64_t> dims = {N};
+  auto inputState = cudaq::state::from_data(initialState);
+  auto *simState = cudaq::state_helper::getSimulationState(&inputState);
+  auto *castSimState = dynamic_cast<CuDensityMatState *>(simState);
+  EXPECT_TRUE(castSimState != nullptr);
+  castSimState->initialize_cudm(handle_, dims);
+
+  cudaq::product_operator<cudaq::boson_operator> op_t =
+      cudaq::boson_operator::create(0) *
+      cudaq::boson_operator::annihilate(0); // a_dagger * a
+  cudaq::operator_sum<cudaq::boson_operator> op(op_t);
+  const auto opMat = op.to_matrix({{0, N}});
+  cudaq::matrix_2 rho = cudaq::matrix_2::identity(N);
+  for (std::size_t col = 0; col < N; ++col)
+    for (std::size_t row = 0; row < N; ++row)
+      rho[{row, col}] = 1.0;
+  const auto expectedResult =
+      std::complex<double>(0.0, -1.0) * (opMat * rho - rho * opMat);
+  std::cout << "Expected result:\n" << expectedResult.dump() << "\n";
+  auto cudmOp = cudaq::dynamics::Context::getCurrentContext()
+                    ->getOpConverter()
+                    .constructLiouvillian(op, {}, dims, {}, true);
+  auto time_stepper = std::make_unique<cudmStepper>(handle_, cudmOp);
+  auto outputState = time_stepper->compute(inputState, 0.0, 1.0, {});
+  std::vector<std::complex<double>> outputStateVec(initialState.size());
+  outputState.to_host(outputStateVec.data(), outputStateVec.size());
+  HANDLE_CUDM_ERROR(cudensitymatDestroyOperator(cudmOp));
+  for (std::size_t i = 0; i < outputStateVec.size(); ++i) {
+    const auto col = i / N;
+    const auto row = i % N;
+    std::cout << "Result = " << outputStateVec[i]
+              << "; vs. expected = " << expectedResult[{row, col}] << "\n";
+    EXPECT_TRUE(std::abs(outputStateVec[i] - expectedResult[{row, col}]) <
+                1e-12);
+  }
+}
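`CheckScalarCallback` above fixes the convention for time-dependent coefficients: a callable that receives the named-parameter map and throws if its parameter is missing. That convention in isolation, as plain standard C++ (separate from the diff); `main` and the printed value are illustrative.

    // The scalar-callback convention used by the stepper tests.
    #include <complex>
    #include <iostream>
    #include <stdexcept>
    #include <string>
    #include <unordered_map>

    using ParamMap = std::unordered_map<std::string, std::complex<double>>;

    int main() {
      const std::string paramName = "alpha";
      auto coeff = [paramName](const ParamMap &parameters) {
        auto entry = parameters.find(paramName);
        if (entry == parameters.end())
          throw std::runtime_error(
              "Cannot find value of expected parameter named " + paramName);
        return entry->second;
      };
      std::cout << coeff({{paramName, {2.0, 3.0}}}) << "\n"; // (2,3)
      return 0;
    }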
diff --git a/unittests/dynamics/test_evolve_api.cpp b/unittests/dynamics/test_evolve_api.cpp
new file mode 100644
index 0000000000..077a7b3076
--- /dev/null
+++ b/unittests/dynamics/test_evolve_api.cpp
@@ -0,0 +1,129 @@
+// /*******************************************************************************
+// * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 *
+// * All rights reserved.                                                       *
+// *                                                                            *
+// * This source code and the accompanying materials are made available under  *
+// * the terms of the Apache License 2.0 which accompanies this distribution.  *
+// ******************************************************************************/
+
+#include "cudaq/algorithms/evolve.h"
+#include "cudaq/dynamics_integrators.h"
+#include <cmath>
+#include <gtest/gtest.h>
+#include <memory>
+
+TEST(EvolveAPITester, checkSimple) {
+  const std::map<int, int> dims = {{0, 2}};
+  cudaq::product_operator<cudaq::spin_operator> ham1 =
+      (2.0 * M_PI * 0.1 * cudaq::spin_operator::x(0));
+  cudaq::operator_sum<cudaq::spin_operator> ham(ham1);
+
+  constexpr int numSteps = 10;
+  cudaq::Schedule schedule(cudaq::linspace(0.0, 1.0, numSteps));
+
+  cudaq::product_operator<cudaq::spin_operator> pauliZ_t =
+      cudaq::spin_operator::z(0);
+  cudaq::operator_sum<cudaq::spin_operator> pauliZ(pauliZ_t);
+  auto initialState =
+      cudaq::state::from_data(std::vector<std::complex<double>>{1.0, 0.0});
+
+  auto integrator = std::make_shared<cudaq::runge_kutta>();
+  integrator->order = 1;
+  integrator->dt = 0.001;
+  auto result = cudaq::evolve(ham, dims, schedule, initialState, integrator, {},
+                              {pauliZ}, true);
+  EXPECT_TRUE(result.get_expectation_values().has_value());
+  EXPECT_EQ(result.get_expectation_values().value().size(), numSteps);
+  std::vector<double> theoryResults;
+  for (const auto &t : schedule) {
+    const double expected = std::cos(2 * 2.0 * M_PI * 0.1 * t);
+    theoryResults.emplace_back(expected);
+  }
+
+  int count = 0;
+  for (auto expVals : result.get_expectation_values().value()) {
+    EXPECT_EQ(expVals.size(), 1);
+    EXPECT_NEAR((double)expVals[0], theoryResults[count++], 1e-3);
+  }
+}
+
+TEST(EvolveAPITester, checkCavityModel) {
+  constexpr int N = 10;
+  constexpr int numSteps = 101;
+  const auto steps = cudaq::linspace(0, 10, numSteps);
+  cudaq::Schedule schedule(steps, {"t"});
+  auto hamiltonian = cudaq::boson_operator::number(0);
+  const std::map<int, int> dimensions{{0, N}};
+  std::vector<std::complex<double>> psi0_(N, 0.0);
+  psi0_.back() = 1.0;
+  auto psi0 = cudaq::state::from_data(psi0_);
+  constexpr double decay_rate = 0.1;
+  auto collapseOperator =
+      std::sqrt(decay_rate) * cudaq::boson_operator::annihilate(0);
+  auto integrator = std::make_shared<cudaq::runge_kutta>();
+  integrator->dt = 0.01;
+  auto result =
+      cudaq::evolve(hamiltonian, dimensions, schedule, psi0, integrator,
+                    {collapseOperator}, {hamiltonian}, true);
+  EXPECT_TRUE(result.get_expectation_values().has_value());
+  EXPECT_EQ(result.get_expectation_values().value().size(), numSteps);
+  std::vector<double> theoryResults;
+  for (const auto &t : schedule) {
+    const double expected = (N - 1) * std::exp(-decay_rate * t);
+    theoryResults.emplace_back(expected);
+  }
+
+  int count = 0;
+  for (auto expVals : result.get_expectation_values().value()) {
+    EXPECT_EQ(expVals.size(), 1);
+    EXPECT_NEAR((double)expVals[0], theoryResults[count++], 1e-3);
+  }
+}
+
+TEST(EvolveAPITester, checkTimeDependent) {
+  constexpr int N = 10;
+  constexpr int numSteps = 101;
+  const auto steps = cudaq::linspace(0, 10, numSteps);
+  cudaq::Schedule schedule(steps, {"t"});
+  auto hamiltonian = cudaq::boson_operator::number(0);
+  const std::map<int, int> dimensions{{0, N}};
+  std::vector<std::complex<double>> psi0_(N, 0.0);
+  psi0_.back() = 1.0;
+  auto psi0 = cudaq::state::from_data(psi0_);
+  constexpr double decay_rate = 0.1;
+
+  auto td_function =
+      [decay_rate](const std::unordered_map<std::string, std::complex<double>>
+                       &parameters) {
+        auto entry = parameters.find("t");
+        if (entry == parameters.end())
+          throw std::runtime_error("Cannot find value of expected parameter");
+        const auto t = entry->second.real();
+        const auto result = std::sqrt(decay_rate * std::exp(-t));
+        return result;
+      };
+
+  auto collapseOperator = cudaq::scalar_operator(td_function) *
+                          cudaq::boson_operator::annihilate(0);
+  auto integrator = std::make_shared<cudaq::runge_kutta>();
+  integrator->dt = 0.01;
+  auto result =
+      cudaq::evolve(hamiltonian, dimensions, schedule, psi0, integrator,
+                    {collapseOperator}, {hamiltonian}, true);
+  EXPECT_TRUE(result.get_expectation_values().has_value());
+  EXPECT_EQ(result.get_expectation_values().value().size(), numSteps);
+  std::vector<double> theoryResults;
+  for (const auto &t : schedule) {
+    const double expected =
+        (N - 1) * std::exp(-decay_rate * (1.0 - std::exp(-t)));
+    theoryResults.emplace_back(expected);
+  }
+
+  int count = 0;
+  for (auto expVals : result.get_expectation_values().value()) {
+    EXPECT_EQ(expVals.size(), 1);
+    std::cout << "Result = " << (double)expVals[0] << "; expected "
+              << theoryResults[count] << "\n";
+    EXPECT_NEAR((double)expVals[0], theoryResults[count++], 1e-3);
+  }
+}
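A condensed version of `checkSimple` above (separate from the diff), showing the shape of the high-level `cudaq::evolve` call. The template arguments and the `runge_kutta` member names mirror the tests in this PR; treat it as a sketch of the API under test, not a stable public contract.

    // Sketch: Rabi oscillation under H = 2*pi*0.1*X, observing <Z>.
    #include "cudaq/algorithms/evolve.h"
    #include "cudaq/dynamics_integrators.h"
    #include <cmath>
    #include <complex>
    #include <iostream>
    #include <map>
    #include <memory>
    #include <vector>

    int main() {
      const std::map<int, int> dims = {{0, 2}};
      cudaq::operator_sum<cudaq::spin_operator> ham(
          2.0 * M_PI * 0.1 * cudaq::spin_operator::x(0));
      cudaq::operator_sum<cudaq::spin_operator> obs(cudaq::spin_operator::z(0));
      cudaq::Schedule schedule(cudaq::linspace(0.0, 1.0, 10));
      auto psi0 = cudaq::state::from_data(
          std::vector<std::complex<double>>{1.0, 0.0});
      auto integrator = std::make_shared<cudaq::runge_kutta>();
      integrator->dt = 0.001;
      auto result = cudaq::evolve(ham, dims, schedule, psi0, integrator, {},
                                  {obs}, true);
      for (auto &expVals : result.get_expectation_values().value())
        std::cout << (double)expVals[0] << "\n"; // ~cos(2 * 2*pi*0.1 * t)
      return 0;
    }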
diff --git a/unittests/dynamics/test_evolve_single.cpp b/unittests/dynamics/test_evolve_single.cpp
new file mode 100644
index 0000000000..73d6b63ed5
--- /dev/null
+++ b/unittests/dynamics/test_evolve_single.cpp
@@ -0,0 +1,286 @@
+// /*******************************************************************************
+// * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 *
+// * All rights reserved.                                                       *
+// *                                                                            *
+// * This source code and the accompanying materials are made available under  *
+// * the terms of the Apache License 2.0 which accompanies this distribution.  *
+// ******************************************************************************/
+
+#include "CuDensityMatState.h"
+#include "common/EigenDense.h"
+#include "cudaq/dynamics_integrators.h"
+#include "cudaq/evolution.h"
+#include <cmath>
+#include <gtest/gtest.h>
+#include <iostream>
+#include <unsupported/Eigen/KroneckerProduct>
+
+TEST(EvolveTester, checkSimple) {
+  const std::map<int, int> dims = {{0, 2}};
+  cudaq::product_operator<cudaq::matrix_operator> ham1 =
+      (2.0 * M_PI * 0.1 * cudaq::spin_operator::x(0));
+  cudaq::operator_sum<cudaq::matrix_operator> ham(ham1);
+
+  constexpr int numSteps = 10;
+  cudaq::Schedule schedule(cudaq::linspace(0.0, 1.0, numSteps));
+
+  cudaq::product_operator<cudaq::matrix_operator> pauliZ_t =
+      cudaq::spin_operator::z(0);
+  cudaq::operator_sum<cudaq::matrix_operator> pauliZ(pauliZ_t);
+  auto initialState =
+      cudaq::state::from_data(std::vector<std::complex<double>>{1.0, 0.0});
+
+  cudaq::runge_kutta integrator;
+  integrator.dt = 0.001;
+  integrator.order = 1;
+  auto result = cudaq::evolve_single(ham, dims, schedule, initialState,
+                                     integrator, {}, {pauliZ}, true);
+  EXPECT_TRUE(result.get_expectation_values().has_value());
+  EXPECT_EQ(result.get_expectation_values().value().size(), numSteps);
+  std::vector<double> theoryResults;
+  for (const auto &t : schedule) {
+    const double expected = std::cos(2 * 2.0 * M_PI * 0.1 * t);
+    theoryResults.emplace_back(expected);
+  }
+
+  int count = 0;
+  for (auto expVals : result.get_expectation_values().value()) {
+    EXPECT_EQ(expVals.size(), 1);
+    EXPECT_NEAR((double)expVals[0], theoryResults[count++], 1e-3);
+  }
+}
+
+TEST(EvolveTester, checkSimpleRK4) {
+  const std::map<int, int> dims = {{0, 2}};
+  cudaq::product_operator<cudaq::matrix_operator> ham1 =
+      (2.0 * M_PI * 0.1 * cudaq::spin_operator::x(0));
+  cudaq::operator_sum<cudaq::matrix_operator> ham(ham1);
+
+  constexpr int numSteps = 10;
+  cudaq::Schedule schedule(cudaq::linspace(0.0, 1.0, numSteps));
+
+  cudaq::product_operator<cudaq::matrix_operator> pauliZ_t =
+      cudaq::spin_operator::z(0);
+  cudaq::operator_sum<cudaq::matrix_operator> pauliZ(pauliZ_t);
+  auto initialState =
+      cudaq::state::from_data(std::vector<std::complex<double>>{1.0, 0.0});
+
+  cudaq::runge_kutta integrator;
+  integrator.dt = 0.001;
+  integrator.order = 4;
+  auto result = cudaq::evolve_single(ham, dims, schedule, initialState,
+                                     integrator, {}, {pauliZ}, true);
+  EXPECT_TRUE(result.get_expectation_values().has_value());
+  EXPECT_EQ(result.get_expectation_values().value().size(), numSteps);
+  std::vector<double> theoryResults;
+  for (const auto &t : schedule) {
+    const double expected = std::cos(2 * 2.0 * M_PI * 0.1 * t);
+    theoryResults.emplace_back(expected);
+  }
+
+  int count = 0;
+  for (auto expVals : result.get_expectation_values().value()) {
+    EXPECT_EQ(expVals.size(), 1);
+    EXPECT_NEAR((double)expVals[0], theoryResults[count++], 1e-3);
+  }
+}
+
+TEST(EvolveTester, checkDensityMatrixSimple) {
+  const std::map<int, int> dims = {{0, 2}};
+  cudaq::product_operator<cudaq::matrix_operator> ham1 =
+      (2.0 * M_PI * 0.1 * cudaq::spin_operator::x(0));
+  cudaq::operator_sum<cudaq::matrix_operator> ham(ham1);
+
+  constexpr int numSteps = 10;
+  cudaq::Schedule schedule(cudaq::linspace(0.0, 1.0, numSteps));
+
+  cudaq::product_operator<cudaq::matrix_operator> pauliZ_t =
+      cudaq::spin_operator::z(0);
+  cudaq::operator_sum<cudaq::matrix_operator> pauliZ(pauliZ_t);
+  auto initialState = cudaq::state::from_data(
+      std::vector<std::complex<double>>{1.0, 0.0, 0.0, 0.0});
+
+  cudaq::runge_kutta integrator;
+  integrator.dt = 0.001;
+  integrator.order = 1;
+  auto result = cudaq::evolve_single(ham, dims, schedule, initialState,
+                                     integrator, {}, {pauliZ}, true);
+  EXPECT_TRUE(result.get_expectation_values().has_value());
+  EXPECT_EQ(result.get_expectation_values().value().size(), numSteps);
+  std::vector<double> theoryResults;
+  for (const auto &t : schedule) {
+    const double expected = std::cos(2 * 2.0 * M_PI * 0.1 * t);
+    theoryResults.emplace_back(expected);
+  }
+
+  int count = 0;
+  for (auto expVals : result.get_expectation_values().value()) {
+    EXPECT_EQ(expVals.size(), 1);
+    EXPECT_NEAR((double)expVals[0], theoryResults[count++], 1e-3);
+  }
+}
+
+TEST(EvolveTester, checkCompositeSystem) {
+  constexpr int cavity_levels = 10;
+  const std::map<int, int> dims = {{0, 2}, {1, cavity_levels}};
+  auto a = cudaq::boson_operator::annihilate(1);
+  auto a_dag = cudaq::boson_operator::create(1);
+
+  auto sm = cudaq::boson_operator::annihilate(0);
+  auto sm_dag = cudaq::boson_operator::create(0);
+
+  cudaq::product_operator<cudaq::matrix_operator> atom_occ_op_t =
+      cudaq::matrix_operator::number(0);
+  cudaq::operator_sum<cudaq::matrix_operator> atom_occ_op(atom_occ_op_t);
+
+  cudaq::product_operator<cudaq::matrix_operator> cavity_occ_op_t =
+      cudaq::matrix_operator::number(1);
+  cudaq::operator_sum<cudaq::matrix_operator> cavity_occ_op(cavity_occ_op_t);
+
+  auto hamiltonian = 2 * M_PI * atom_occ_op + 2 * M_PI * cavity_occ_op +
+                     2 * M_PI * 0.25 * (sm * a_dag + sm_dag * a);
+  // auto matrix = hamiltonian.to_matrix(dims);
+  // std::cout << "Matrix:\n" << matrix.dump() << "\n";
+  Eigen::Vector2cd qubit_state;
+  qubit_state << 1.0, 0.0;
+  Eigen::VectorXcd cavity_state = Eigen::VectorXcd::Zero(cavity_levels);
+  const int num_photons = 5;
+  cavity_state[num_photons] = 1.0;
+  Eigen::VectorXcd initial_state_vec =
+      Eigen::kroneckerProduct(cavity_state, qubit_state);
+  constexpr int num_steps = 21;
+  cudaq::Schedule schedule(cudaq::linspace(0.0, 1, num_steps));
+  auto initialState = cudaq::state::from_data(
+      std::make_pair(initial_state_vec.data(), initial_state_vec.size()));
+  cudaq::runge_kutta integrator;
+  integrator.dt = 0.001;
+  integrator.order = 4;
+
+  auto result =
+      cudaq::evolve_single(hamiltonian, dims, schedule, initialState,
+                           integrator, {}, {cavity_occ_op, atom_occ_op}, true);
+  EXPECT_TRUE(result.get_expectation_values().has_value());
+  EXPECT_EQ(result.get_expectation_values().value().size(), num_steps);
+
+  for (auto expVals : result.get_expectation_values().value()) {
+    EXPECT_EQ(expVals.size(), 2);
+    std::cout << expVals[0] << " | ";
+    std::cout << expVals[1] << "\n";
+    // This should be an exchange interaction
+    EXPECT_NEAR((double)expVals[0] + (double)expVals[1], num_photons, 1e-2);
+  }
+}
+
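+// Why <n_atom> + <n_cavity> stays pinned at num_photons above: the coupling
+// term (sm * a_dag + sm_dag * a) commutes with the total excitation number
+// n_atom + n_cavity, so the Hamiltonian only exchanges quanta between the
+// two modes without creating or destroying them. The next test deliberately
+// breaks this conservation by adding a photon-loss collapse operator.
+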
+TEST(EvolveTester, checkCompositeSystemWithCollapse) {
+  constexpr int cavity_levels = 10;
+  const std::map<int, int> dims = {{0, 2}, {1, cavity_levels}};
+  auto a = cudaq::boson_operator::annihilate(1);
+  auto a_dag = cudaq::boson_operator::create(1);
+
+  auto sm = cudaq::boson_operator::annihilate(0);
+  auto sm_dag = cudaq::boson_operator::create(0);
+
+  cudaq::product_operator<cudaq::matrix_operator> atom_occ_op_t =
+      cudaq::matrix_operator::number(0);
+  cudaq::operator_sum<cudaq::matrix_operator> atom_occ_op(atom_occ_op_t);
+
+  cudaq::product_operator<cudaq::matrix_operator> cavity_occ_op_t =
+      cudaq::matrix_operator::number(1);
+  cudaq::operator_sum<cudaq::matrix_operator> cavity_occ_op(cavity_occ_op_t);
+
+  auto hamiltonian = 2 * M_PI * atom_occ_op + 2 * M_PI * cavity_occ_op +
+                     2 * M_PI * 0.25 * (sm * a_dag + sm_dag * a);
+  // auto matrix = hamiltonian.to_matrix(dims);
+  // std::cout << "Matrix:\n" << matrix.dump() << "\n";
+  Eigen::Vector2cd qubit_state;
+  qubit_state << 1.0, 0.0;
+  Eigen::VectorXcd cavity_state = Eigen::VectorXcd::Zero(cavity_levels);
+  const int num_photons = 5;
+  cavity_state[num_photons] = 1.0;
+  Eigen::VectorXcd initial_state_vec =
+      Eigen::kroneckerProduct(cavity_state, qubit_state);
+  // rho = |psi><psi|; use the adjoint (conjugate transpose) so the
+  // construction stays correct for complex amplitudes as well.
+  Eigen::MatrixXcd rho0 = initial_state_vec * initial_state_vec.adjoint();
+  std::cout << "Initial rho:\n" << rho0 << "\n";
+  constexpr int num_steps = 11;
+  const auto timeSteps = cudaq::linspace(0.0, 0.5, num_steps);
+  cudaq::Schedule schedule(timeSteps);
+  auto initialState =
+      cudaq::state::from_data(std::make_pair(rho0.data(), rho0.size()));
+  cudaq::runge_kutta integrator;
+  integrator.dt = 0.001;
+  integrator.order = 4;
+  constexpr double decayRate = 0.1;
+  cudaq::product_operator<cudaq::matrix_operator> collapsedOp_t =
+      std::sqrt(decayRate) * a;
+  cudaq::operator_sum<cudaq::matrix_operator> collapsedOp(collapsedOp_t);
+  cudaq::evolve_result result = cudaq::evolve_single(
+      hamiltonian, dims, schedule, initialState, integrator, {collapsedOp},
+      {cavity_occ_op, atom_occ_op}, true);
+  EXPECT_TRUE(result.get_expectation_values().has_value());
+  EXPECT_EQ(result.get_expectation_values().value().size(), num_steps);
+
+  int count = 0;
+  for (auto expVals : result.get_expectation_values().value()) {
+    EXPECT_EQ(expVals.size(), 2);
+    const double totalParticleCount = expVals[0] + expVals[1];
+    const auto time = timeSteps[count++];
+    const double expectedResult = num_photons * std::exp(-decayRate * time);
+    std::cout << "t = " << time << "; particle count = " << totalParticleCount
+              << " vs " << expectedResult << "\n";
+    EXPECT_NEAR(totalParticleCount, expectedResult, 0.1);
+  }
+}
+
+TEST(EvolveTester, checkScalarTd) {
+  const std::map<int, int> dims = {{0, 10}};
+
+  constexpr int numSteps = 101;
+  const auto steps = cudaq::linspace(0.0, 10.0, numSteps);
+  cudaq::Schedule schedule(steps, {"t"});
+
+  auto function =
+      [](const std::unordered_map<std::string, std::complex<double>>
+             &parameters) {
+        auto entry = parameters.find("t");
+        if (entry == parameters.end())
+          throw std::runtime_error("Cannot find value of expected parameter");
+        return 1.0;
+      };
+  cudaq::product_operator<cudaq::matrix_operator> ham1 =
+      cudaq::scalar_operator(function) * cudaq::boson_operator::number(0);
+  cudaq::operator_sum<cudaq::matrix_operator> ham(ham1);
+  cudaq::product_operator<cudaq::matrix_operator> obs1 =
+      cudaq::boson_operator::number(0);
+  cudaq::operator_sum<cudaq::matrix_operator> obs(obs1);
+  const double decayRate = 0.1;
+  cudaq::product_operator<cudaq::matrix_operator> collapseOp1 =
+      std::sqrt(decayRate) * cudaq::boson_operator::annihilate(0);
+  cudaq::operator_sum<cudaq::matrix_operator> collapseOp(collapseOp1);
+  Eigen::VectorXcd initial_state_vec = Eigen::VectorXcd::Zero(10);
+  initial_state_vec[9] = 1.0;
+  Eigen::MatrixXcd rho0 = initial_state_vec * initial_state_vec.adjoint();
+  auto initialState =
+      cudaq::state::from_data(std::make_pair(rho0.data(), rho0.size()));
+  cudaq::runge_kutta integrator;
+  integrator.dt = 0.001;
+  integrator.order = 4;
+  auto result = cudaq::evolve_single(ham, dims, schedule, initialState,
+                                     integrator, {collapseOp}, {obs}, true);
+  EXPECT_TRUE(result.get_expectation_values().has_value());
+  EXPECT_EQ(result.get_expectation_values().value().size(), numSteps);
+  std::vector<double> theoryResults;
+  int idx = 0;
+  for (const auto &t : schedule) {
+    const double expected = 9.0 * std::exp(-decayRate * steps[idx++]);
+    theoryResults.emplace_back(expected);
+  }
+
+  int count = 0;
+  for (auto expVals : result.get_expectation_values().value()) {
+    EXPECT_EQ(expVals.size(), 1);
+    std::cout << "Result = " << (double)expVals[0]
+              << "; expected = " << theoryResults[count] << "\n";
+    EXPECT_NEAR((double)expVals[0], theoryResults[count], 1e-3);
+    count++;
+  }
+}
\ No newline at end of file
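A note on the density-matrix fixtures above: for a pure state psi, the density matrix is rho = psi psi^dagger, and the conjugate transpose matters once amplitudes are complex (the real-valued fixtures make transpose and adjoint coincide, which is why the original construction still passed). A small standalone Eigen sketch of the construction and the invariants worth checking:

    // Sketch only (plain Eigen): build rho = |psi><psi| with an explicit
    // adjoint so it stays correct for complex amplitudes.
    #include <Eigen/Dense>
    #include <cmath>
    #include <complex>
    #include <iostream>

    int main() {
      Eigen::VectorXcd psi = Eigen::VectorXcd::Zero(4);
      psi[0] = std::complex<double>(1.0 / std::sqrt(2.0), 0.0);
      psi[3] = std::complex<double>(0.0, 1.0 / std::sqrt(2.0));
      Eigen::MatrixXcd rho = psi * psi.adjoint(); // NOT psi.transpose()
      std::cout << "trace(rho) = " << rho.trace().real() << "\n";      // 1
      std::cout << "hermitian deviation = "
                << (rho - rho.adjoint()).norm() << "\n";               // 0
      return 0;
    }
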
diff --git a/unittests/dynamics/test_helpers.cpp b/unittests/dynamics/test_helpers.cpp
new file mode 100644
index 0000000000..e4aedcc3bf
--- /dev/null
+++ b/unittests/dynamics/test_helpers.cpp
@@ -0,0 +1,101 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.   *
+ ******************************************************************************/
+
+#include "cudaq/dynamics/helpers.h"
+#include <gtest/gtest.h>
+#include <vector>
+
+using namespace cudaq::detail;
+
+TEST(OperatorHelpersTest, GenerateAllStates_TwoQubits) {
+  std::vector<int> degrees = {0, 1};
+  std::unordered_map<int, int> dimensions = {{0, 2}, {1, 2}};
+
+  auto states = generate_all_states(degrees, dimensions);
+  std::vector<std::string> expected_states = {"00", "01", "10", "11"};
+
+  EXPECT_EQ(states, expected_states);
+}
+
+TEST(OperatorHelpersTest, GenerateAllStates_ThreeQubits) {
+  std::vector<int> degrees = {0, 1, 2};
+  std::unordered_map<int, int> dimensions = {{0, 2}, {1, 2}, {2, 2}};
+
+  auto states = generate_all_states(degrees, dimensions);
+  std::vector<std::string> expected_states = {"000", "001", "010", "011",
+                                              "100", "101", "110", "111"};
+
+  EXPECT_EQ(states, expected_states);
+}
+
+TEST(OperatorHelpersTest, GenerateAllStates_EmptyDegrees) {
+  std::vector<int> degrees;
+  std::unordered_map<int, int> dimensions;
+
+  auto states = generate_all_states(degrees, dimensions);
+  EXPECT_TRUE(states.empty());
+}
+
+TEST(OperatorHelpersTest, PermuteMatrix_SingleSwap) {
+  cudaq::matrix_2 matrix(2, 2);
+  matrix[{0, 0}] = 1;
+  matrix[{0, 1}] = 2;
+  matrix[{1, 0}] = 3;
+  matrix[{1, 1}] = 4;
+
+  // Swap rows and columns
+  std::vector<int> permutation = {1, 0};
+
+  permute_matrix(matrix, permutation);
+
+  cudaq::matrix_2 expected(2, 2);
+  expected[{0, 0}] = 4;
+  expected[{0, 1}] = 3;
+  expected[{1, 0}] = 2;
+  expected[{1, 1}] = 1;
+
+  EXPECT_EQ(matrix, expected);
+}
+
+TEST(OperatorHelpersTest, PermuteMatrix_IdentityPermutation) {
+  cudaq::matrix_2 matrix(3, 3);
+  matrix[{0, 0}] = 1;
+  matrix[{0, 1}] = 2;
+  matrix[{0, 2}] = 3;
+  matrix[{1, 0}] = 4;
+  matrix[{1, 1}] = 5;
+  matrix[{1, 2}] = 6;
+  matrix[{2, 0}] = 7;
+  matrix[{2, 1}] = 8;
+  matrix[{2, 2}] = 9;
+
+  // The identity permutation must leave the matrix unchanged; compare
+  // against a copy taken before the call rather than the matrix itself.
+  std::vector<int> permutation = {0, 1, 2};
+
+  auto original = matrix;
+  permute_matrix(matrix, permutation);
+
+  EXPECT_EQ(matrix, original);
+}
+
+TEST(OperatorHelpersTest, CanonicalizeDegrees_SortedDescending) {
+  std::vector<int> degrees = {3, 1, 2};
+  canonicalize_degrees(degrees);
+  EXPECT_EQ(degrees, (std::vector<int>{3, 2, 1}));
+}
+
+TEST(OperatorHelpersTest, CanonicalizeDegrees_AlreadySorted) {
+  std::vector<int> degrees = {5, 4, 3, 2, 1};
+  canonicalize_degrees(degrees);
+  EXPECT_EQ(degrees, (std::vector<int>{5, 4, 3, 2, 1}));
+}
+
+TEST(OperatorHelpersTest, CanonicalizeDegrees_EmptyList) {
+  std::vector<int> degrees;
+  canonicalize_degrees(degrees);
+  EXPECT_TRUE(degrees.empty());
+}
diff --git a/unittests/dynamics/test_mocks.h b/unittests/dynamics/test_mocks.h
new file mode 100644
index 0000000000..1ed71d5551
--- /dev/null
+++ b/unittests/dynamics/test_mocks.h
@@ -0,0 +1,62 @@
+/****************************************************************-*- C++ -*-****
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.   *
+ ******************************************************************************/
+
+#pragma once
+
+#include "cudaq/operators.h"
+#include "cudaq/utils/tensor.h"
+#include <complex>
+#include <cudensitymat.h>
+#include <gtest/gtest.h>
+#include <stdexcept>
+#include <vector>
+
+// Mock cudensitymatHandle_t
+inline cudensitymatHandle_t mock_handle() {
+  cudensitymatHandle_t handle;
+  HANDLE_CUDM_ERROR(cudensitymatCreate(&handle));
+  return handle;
+}
+
+// Mock Liouvillian operator creation
+inline cudensitymatOperator_t mock_liouvillian(cudensitymatHandle_t handle) {
+  cudensitymatOperator_t liouvillian = nullptr;
+  std::vector<int64_t> dimensions = {2, 2};
+  HANDLE_CUDM_ERROR(cudensitymatCreateOperator(
+      handle, static_cast<int32_t>(dimensions.size()), dimensions.data(),
+      &liouvillian));
+
+  if (!liouvillian) {
+    throw std::runtime_error("Failed to create mock Liouvillian!");
+  }
+
+  return liouvillian;
+}
+
+// Mock initial raw state data
+inline std::vector<std::complex<double>> mock_initial_state_data() {
+  std::vector<std::complex<double>> data = {
+      {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}};
+
+  if (data.size() != 4) {
+    throw std::runtime_error("Mock initial state data has incorrect size!");
+  }
+
+  return data;
+}
+
+// Mock Hilbert space dimensions
+inline std::vector<int64_t> mock_hilbert_space_dims() {
+  std::vector<int64_t> dims = {2, 2};
+
+  if (dims.empty()) {
+    throw std::runtime_error("Mock Hilbert space dimensions are empty!");
+  }
+
+  return dims;
+}
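The mocks above are consumed by the integrator test below; the intended lifecycle is create the handle, build the mock objects, exercise them, then destroy in reverse order. A minimal usage sketch, assuming only the declarations in test_mocks.h:

    #include "test_mocks.h"

    void mock_lifecycle_example() {
      cudensitymatHandle_t handle = mock_handle();
      cudensitymatOperator_t liouvillian = mock_liouvillian(handle);
      // ... drive a time stepper / integrator against the mocks here ...
      // Destroy in reverse order of creation.
      HANDLE_CUDM_ERROR(cudensitymatDestroyOperator(liouvillian));
      HANDLE_CUDM_ERROR(cudensitymatDestroy(handle));
    }
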
diff --git a/unittests/dynamics/test_runge_kutta_integrator.cpp b/unittests/dynamics/test_runge_kutta_integrator.cpp
new file mode 100644
index 0000000000..c2d12f7b83
--- /dev/null
+++ b/unittests/dynamics/test_runge_kutta_integrator.cpp
@@ -0,0 +1,106 @@
+// /*******************************************************************************
+// * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                 *
+// * All rights reserved.                                                       *
+// *                                                                            *
+// * This source code and the accompanying materials are made available under  *
+// * the terms of the Apache License 2.0 which accompanies this distribution.  *
+// ******************************************************************************/
+
+#include "CuDensityMatState.h"
+#include "cudaq/dynamics_integrators.h"
+#include "cudm_time_stepper.h"
+#include "test_mocks.h"
+#include <cmath>
+#include <gtest/gtest.h>
+#include <iostream>
+
+using namespace cudaq;
+
+class RungeKuttaIntegratorTest : public ::testing::Test {
+protected:
+  cudensitymatHandle_t handle_;
+  cudensitymatOperator_t liouvillian_;
+  std::unique_ptr<cudaq::runge_kutta> integrator_;
+  std::unique_ptr<CuDensityMatState> state_;
+
+  void SetUp() override {
+    // Create library handle
+    HANDLE_CUDM_ERROR(cudensitymatCreate(&handle_));
+
+    // Create a mock Liouvillian
+    liouvillian_ = mock_liouvillian(handle_);
+
+    // Create initial state
+    state_ = std::make_unique<CuDensityMatState>(
+        handle_, mock_initial_state_data(), mock_hilbert_space_dims());
+    ASSERT_NE(state_, nullptr);
+    ASSERT_TRUE(state_->is_initialized());
+
+    // Initialize the integrator (classical 4th-order Runge-Kutta)
+    ASSERT_NO_THROW(integrator_ = std::make_unique<cudaq::runge_kutta>());
+    ASSERT_NE(integrator_, nullptr);
+    integrator_->order = 4;
+  }
+
+  void TearDown() override {
+    // Clean up resources
+    HANDLE_CUDM_ERROR(cudensitymatDestroyOperator(liouvillian_));
+    HANDLE_CUDM_ERROR(cudensitymatDestroy(handle_));
+  }
+};
+
+// Test Initialization
+TEST_F(RungeKuttaIntegratorTest, Initialization) {
+  ASSERT_NE(integrator_, nullptr);
+}
+
+TEST_F(RungeKuttaIntegratorTest, CheckEvolve) {
+  const std::vector<std::complex<double>> initialStateVec = {{1.0, 0.0},
+                                                             {0.0, 0.0}};
+  const std::vector<int64_t> dims = {2};
+  auto spin_op_x = cudaq::spin_operator::x(0);
+  cudaq::product_operator<cudaq::matrix_operator> ham1 =
+      2.0 * M_PI * 0.1 * spin_op_x;
+  cudaq::operator_sum<cudaq::matrix_operator> ham(ham1);
+  SystemDynamics system;
+  system.hamiltonian = &ham;
+  system.modeExtents = dims;
+
+  for (int integratorOrder : {1, 2, 4}) {
+    std::cout << "Test RK order " << integratorOrder << "\n";
+    cudaq::runge_kutta integrator;
+    integrator.dt = 0.001;
+    integrator.order = integratorOrder;
+    constexpr std::size_t numDataPoints = 10;
+    auto initialState = cudaq::state::from_data(initialStateVec);
+    // initialState.dump();
+    auto *simState = cudaq::state_helper::getSimulationState(&initialState);
+    auto *castSimState = dynamic_cast<CuDensityMatState *>(simState);
+    EXPECT_TRUE(castSimState != nullptr);
+    castSimState->initialize_cudm(handle_, dims);
+    integrator.set_state(initialState, 0.0);
+    cudaq::Schedule schedule(
+        cudaq::linspace(0, 1.0 * numDataPoints, numDataPoints));
+    integrator.set_system(system, schedule);
+    std::vector<std::complex<double>> outputStateVec(2);
+    for (std::size_t i = 1; i < numDataPoints; ++i) {
+      integrator.integrate(i);
+      auto [t, state] = integrator.get_state();
+      // std::cout << "Time = " << t << "\n";
+      // state.dump();
+      state.to_host(outputStateVec.data(), outputStateVec.size());
+      // Check state vector norm
+      EXPECT_NEAR(std::norm(outputStateVec[0]) + std::norm(outputStateVec[1]),
+                  1.0, 1e-2);
+      const double expValZ =
+          std::norm(outputStateVec[0]) - std::norm(outputStateVec[1]);
+      // Analytical results
+      EXPECT_NEAR(outputStateVec[0].real(), std::cos(2.0 * M_PI * 0.1 * t),
+                  1e-2);
+      EXPECT_NEAR(expValZ, std::cos(2 * 2.0 * M_PI * 0.1 * t), 1e-2);
+    }
+  }
+
+  // TODO: add a test exercising the tensor callback path.
+}
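Background (not from the diff): the integrator's order field selects the Runge-Kutta variant — Euler (1), midpoint (2), or the classical fourth-order scheme (4). For reference, one classical RK4 step for dy/dt = f(t, y), which is what order = 4 corresponds to up to implementation details:

    #include <functional>

    // One classical RK4 step: advances y(t) to y(t + dt) with O(dt^5)
    // local truncation error.
    double rk4_step(const std::function<double(double, double)> &f, double t,
                    double y, double dt) {
      const double k1 = f(t, y);
      const double k2 = f(t + 0.5 * dt, y + 0.5 * dt * k1);
      const double k3 = f(t + 0.5 * dt, y + 0.5 * dt * k2);
      const double k4 = f(t + dt, y + dt * k3);
      return y + (dt / 6.0) * (k1 + 2.0 * k2 + 2.0 * k3 + k4);
    }
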
diff --git a/unittests/dynamics/utils.cpp b/unittests/dynamics/utils.cpp
new file mode 100644
index 0000000000..5c0c1926d3
--- /dev/null
+++ b/unittests/dynamics/utils.cpp
@@ -0,0 +1,148 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.   *
+ ******************************************************************************/
+
+#include "cudaq/operators.h"
+#include "cudaq/utils/tensor.h"
+#include <gtest/gtest.h>
+#include <iostream>
+
+using namespace std::complex_literals;
+
+namespace utils {
+
+void print(cudaq::matrix_2 mat, std::string name = "") {
+  if (name != "")
+    std::cout << name << ":" << std::endl;
+  for (std::size_t i = 0; i < mat.get_rows(); i++) {
+    for (std::size_t j = 0; j < mat.get_columns(); j++)
+      std::cout << mat[{i, j}] << " ";
+    std::cout << std::endl;
+  }
+}
+
+void assert_product_equal(
+    const cudaq::product_operator<cudaq::matrix_operator> &got,
+    const std::complex<double> &expected_coefficient,
+    const std::vector<cudaq::matrix_operator> &expected_terms) {
+  cudaq::operator_sum<cudaq::matrix_operator> sum = got;
+  ASSERT_TRUE(sum.get_terms().size() == 1);
+  ASSERT_TRUE(got.get_coefficient().evaluate() == expected_coefficient);
+  ASSERT_TRUE(got.get_terms() == expected_terms);
+}
+
+void checkEqual(cudaq::matrix_2 a, cudaq::matrix_2 b) {
+  print(a, "matrix a");
+  print(b, "matrix b");
+
+  ASSERT_EQ(a.get_rank(), b.get_rank());
+  ASSERT_EQ(a.get_rows(), b.get_rows());
+  ASSERT_EQ(a.get_columns(), b.get_columns());
+  ASSERT_EQ(a.get_size(), b.get_size());
+  for (std::size_t i = 0; i < a.get_rows(); i++) {
+    for (std::size_t j = 0; j < a.get_columns(); j++) {
+      auto a_val = a[{i, j}];
+      auto b_val = b[{i, j}];
+      EXPECT_NEAR(a_val.real(), b_val.real(), 1e-8);
+      EXPECT_NEAR(a_val.imag(), b_val.imag(), 1e-8);
+    }
+  }
+}
+
+cudaq::matrix_2 zero_matrix(std::size_t size) {
+  auto mat = cudaq::matrix_2(size, size);
+  return mat;
+}
+
+cudaq::matrix_2 id_matrix(std::size_t size) {
+  auto mat = cudaq::matrix_2(size, size);
+  for (std::size_t i = 0; i < size; i++)
+    mat[{i, i}] = 1.0 + 0.0j;
+  return mat;
+}
+
+cudaq::matrix_2 annihilate_matrix(std::size_t size) {
+  auto mat = cudaq::matrix_2(size, size);
+  for (std::size_t i = 0; i + 1 < size; i++)
+    mat[{i, i + 1}] = std::sqrt(static_cast<double>(i + 1)) + 0.0j;
+  return mat;
+}
+
+cudaq::matrix_2 create_matrix(std::size_t size) {
+  auto mat = cudaq::matrix_2(size, size);
+  for (std::size_t i = 0; i + 1 < size; i++)
+    mat[{i + 1, i}] = std::sqrt(static_cast<double>(i + 1)) + 0.0j;
+  return mat;
+}
+
+cudaq::matrix_2 position_matrix(std::size_t size) {
+  auto mat = cudaq::matrix_2(size, size);
+  for (std::size_t i = 0; i + 1 < size; i++) {
+    mat[{i + 1, i}] = 0.5 * std::sqrt(static_cast<double>(i + 1)) + 0.0j;
+    mat[{i, i + 1}] = 0.5 * std::sqrt(static_cast<double>(i + 1)) + 0.0j;
+  }
+  return mat;
+}
+
+cudaq::matrix_2 momentum_matrix(std::size_t size) {
+  auto mat = cudaq::matrix_2(size, size);
+  for (std::size_t i = 0; i + 1 < size; i++) {
+    mat[{i + 1, i}] = (0.5j) * std::sqrt(static_cast<double>(i + 1));
+    mat[{i, i + 1}] = (-0.5j) * std::sqrt(static_cast<double>(i + 1));
+  }
+  return mat;
+}
+
+cudaq::matrix_2 number_matrix(std::size_t size) {
+  auto mat = cudaq::matrix_2(size, size);
+  for (std::size_t i = 0; i < size; i++)
+    mat[{i, i}] = static_cast<double>(i) + 0.0j;
+  return mat;
+}
+
+cudaq::matrix_2 parity_matrix(std::size_t size) {
+  auto mat = cudaq::matrix_2(size, size);
+  for (std::size_t i = 0; i < size; i++)
+    mat[{i, i}] = std::pow(-1., static_cast<double>(i)) + 0.0j;
+  return mat;
+}
+
+cudaq::matrix_2 displace_matrix(std::size_t size,
+                                std::complex<double> amplitude) {
+  auto term1 = amplitude * create_matrix(size);
+  auto term2 = std::conj(amplitude) * annihilate_matrix(size);
+  auto difference = term1 - term2;
+  return difference.exponential();
+}
+
+cudaq::matrix_2 squeeze_matrix(std::size_t size,
+                               std::complex<double> amplitude) {
+  auto term1 = std::conj(amplitude) * annihilate_matrix(size).power(2);
+  auto term2 = amplitude * create_matrix(size).power(2);
+  auto difference = 0.5 * (term1 - term2);
+  return difference.exponential();
+}
+
+cudaq::matrix_2 PauliX_matrix() {
+  auto mat = cudaq::matrix_2(2, 2);
+  mat[{0, 1}] = 1.0;
+  mat[{1, 0}] = 1.0;
+  return mat;
+}
+
+cudaq::matrix_2 PauliZ_matrix() {
+  auto mat = cudaq::matrix_2(2, 2);
+  mat[{0, 0}] = 1.0;
+  mat[{1, 1}] = -1.0;
+  return mat;
+}
+
+cudaq::matrix_2 PauliY_matrix() {
+  return 1.0j * utils::PauliX_matrix() * utils::PauliZ_matrix();
+}
+
+} // namespace utils
diff --git a/unittests/dynamics/utils.h b/unittests/dynamics/utils.h
new file mode 100644
index 0000000000..a3accf9c86
--- /dev/null
+++ b/unittests/dynamics/utils.h
@@ -0,0 +1,54 @@
+/*******************************************************************************
+ * Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                  *
+ * All rights reserved.                                                        *
+ *                                                                             *
+ * This source code and the accompanying materials are made available under   *
+ * the terms of the Apache License 2.0 which accompanies this distribution.   *
+ ******************************************************************************/
+
+#pragma once
+
+#include "cudaq/dynamics/matrix_operators.h"
+#include "cudaq/operators.h"
+#include "cudaq/utils/tensor.h"
+
+namespace utils {
+
+void print(cudaq::matrix_2 mat, std::string name = "");
+
+void assert_product_equal(
+    const cudaq::product_operator<cudaq::matrix_operator> &got,
+    const std::complex<double> &expected_coefficient,
+    const std::vector<cudaq::matrix_operator> &expected_terms);
+
+void checkEqual(cudaq::matrix_2 a, cudaq::matrix_2 b);
+
+cudaq::matrix_2 zero_matrix(std::size_t size);
+
+cudaq::matrix_2 id_matrix(std::size_t size);
+
+cudaq::matrix_2 annihilate_matrix(std::size_t size);
+
+cudaq::matrix_2 create_matrix(std::size_t size);
+
+cudaq::matrix_2 position_matrix(std::size_t size);
+
+cudaq::matrix_2 momentum_matrix(std::size_t size);
+
+cudaq::matrix_2 number_matrix(std::size_t size);
+
+cudaq::matrix_2 parity_matrix(std::size_t size);
+
+cudaq::matrix_2 displace_matrix(std::size_t size,
+                                std::complex<double> amplitude);
+
+cudaq::matrix_2 squeeze_matrix(std::size_t size,
+                               std::complex<double> amplitude);
+
+cudaq::matrix_2 PauliX_matrix();
+
+cudaq::matrix_2 PauliZ_matrix();
+
+cudaq::matrix_2 PauliY_matrix();
+
+} // namespace utils
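Since displace_matrix and squeeze_matrix are both defined through matrix_2::exponential(), exp(0) = I gives a cheap sanity check on the same code path that the Tensor test below pins down numerically. A sketch of such checks (hypothetical test names; uses only helpers declared in utils.h):

    #include "utils.h"
    #include <gtest/gtest.h>

    TEST(UtilsSanity, zeroAmplitudeOperatorsAreIdentity) {
      constexpr std::size_t size = 6;
      // D(0) = exp(0) = I and S(0) = exp(0) = I.
      utils::checkEqual(utils::displace_matrix(size, 0.0),
                        utils::id_matrix(size));
      utils::checkEqual(utils::squeeze_matrix(size, 0.0),
                        utils::id_matrix(size));
    }

    TEST(UtilsSanity, pauliYSquaresToIdentity) {
      // Y = i * X * Z, so Y * Y should reproduce the 2x2 identity.
      utils::checkEqual(utils::PauliY_matrix() * utils::PauliY_matrix(),
                        utils::id_matrix(2));
    }
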
"(0.498881,0) (1.05119,0) (0.753502,0) (2.03447,0) }\n }"); + } +} diff --git a/utils/mock_qpu/iqm/__init__.py b/utils/mock_qpu/iqm/__init__.py index 8c34b73386..d81b547ba3 100644 --- a/utils/mock_qpu/iqm/__init__.py +++ b/utils/mock_qpu/iqm/__init__.py @@ -237,7 +237,7 @@ def _simulate_circuit(instructions: list[iqm_client.Instruction], measurement_qubits_positions) probabilities = np.diag(partial_trace) return { - ms: int(prob * shots) for ms, prob in zip( + ms: int(round(prob * shots)) for ms, prob in zip( _generate_measurement_strings(len(measurement_qubits_positions)), probabilities, )