From e62a1355899ef5a7e6f0d37132635d13cb3fa4f9 Mon Sep 17 00:00:00 2001 From: Matthew Jones Date: Mon, 18 Nov 2024 18:22:46 -0700 Subject: [PATCH] sync with internal repo1 (commit 7d07bd599) --- CITATION.cff | 8 +- README.md | 7 +- python/README.md | 19 +- python/builder/pep517.py | 5 +- python/builder/utils.py | 34 +- python/cuquantum/__init__.py | 4 +- python/cuquantum/__main__.py | 9 +- python/cuquantum/_utils.pxd | 7 +- python/cuquantum/_utils.pyx | 47 +- python/cuquantum/_version.py | 2 +- python/cuquantum/bindings/__init__.pxd | 3 + python/cuquantum/bindings/__init__.py | 5 + .../cuquantum/bindings/_internal/__init__.pxd | 3 + .../bindings/_internal/cudensitymat.pxd | 54 + .../bindings/_internal/cudensitymat_linux.pyx | 969 +++++++++++++ python/cuquantum/bindings/_utils.pxd | 184 +++ python/cuquantum/bindings/_utils.pyx | 267 ++++ python/cuquantum/bindings/cudensitymat.pxd | 102 ++ python/cuquantum/bindings/cudensitymat.pyx | 1171 ++++++++++++++++ python/cuquantum/bindings/cycudensitymat.pxd | 211 +++ python/cuquantum/bindings/cycudensitymat.pyx | 181 +++ .../custatevec/_internal/custatevec.pxd | 4 +- .../custatevec/_internal/custatevec_linux.pyx | 4 +- python/cuquantum/custatevec/custatevec.pxd | 4 +- python/cuquantum/custatevec/custatevec.pyx | 252 ++-- python/cuquantum/custatevec/cycustatevec.pxd | 37 +- python/cuquantum/custatevec/cycustatevec.pyx | 4 +- .../_internal/circuit_parser_utils_qiskit.py | 2 +- .../cutensornet/_internal/cutensornet.pxd | 4 +- .../_internal/cutensornet_linux.pyx | 44 +- .../_internal/decomposition_utils.py | 40 +- .../cutensornet/circuit_converter.py | 2 +- python/cuquantum/cutensornet/configuration.py | 39 +- python/cuquantum/cutensornet/cutensornet.pxd | 4 +- python/cuquantum/cutensornet/cutensornet.pyx | 275 ++-- .../cuquantum/cutensornet/cycutensornet.pxd | 4 +- .../cuquantum/cutensornet/cycutensornet.pyx | 10 +- .../_internal/network_state_utils.py | 28 +- .../cutensornet/experimental/configuration.py | 1 - .../cutensornet/experimental/network_state.py | 265 ++-- .../experimental/tensor_network.py | 27 +- python/cuquantum/cutensornet/tensor.py | 28 +- .../cuquantum/cutensornet/tensor_network.py | 10 +- python/cuquantum/densitymat/__init__.py | 8 + .../densitymat/_internal/__init__.py | 3 + .../densitymat/_internal/callbacks.py | 148 ++ .../densitymat/_internal/library_handle.py | 143 ++ .../cuquantum/densitymat/_internal/utils.py | 204 +++ .../densitymat/elementary_operator.py | 762 ++++++++++ python/cuquantum/densitymat/operators.py | 1249 +++++++++++++++++ python/cuquantum/densitymat/state.py | 612 ++++++++ python/cuquantum/densitymat/work_stream.py | 355 +++++ .../cutensornet/approxTN/mps_example.py | 35 +- .../circuits_cirq/example05_mps_exact.py | 3 +- .../circuits_qiskit/example05_mps_exact.py | 6 +- .../generic_states/example01_basic_torch.py | 3 +- .../example02_arbitrary_dimension_numpy.py | 9 +- .../generic_states/example03_mps_mpo_cupy.py | 9 +- .../example04_variational_expectation.py | 15 +- .../example05_noisy_unitary_channels.py | 88 ++ .../tensor/example12-qr_mem_limit_handling.py | 33 + .../example12-svd_mem_limit_handling.py | 35 + .../samples/densitymat/operator_advanced.py | 238 ++++ .../samples/densitymat/operator_defaults.py | 184 +++ python/samples/densitymat/operator_mpi.py | 118 ++ python/setup.py | 28 +- .../custatevec_tests/test_custatevec.py | 4 + .../cutensornet_tests/circuit_data.py | 6 + .../cutensornet_tests/circuit_tester.py | 11 +- .../cutensornet_tests/state_data.py | 17 + .../cutensornet_tests/state_tester.py | 
400 +++++- .../cutensornet_tests/test_contract.py | 17 +- .../cutensornet_tests/test_cutensornet.py | 8 +- .../cutensornet_tests/test_experimental.py | 191 ++- .../cutensornet_tests/test_tensor.py | 13 +- .../cutensornet_tests/test_utils.py | 22 +- .../trajectories_noise/__init__.py | 0 .../trajectories_noise/conftest.py | 36 + .../trajectories_noise/network_state_wrap.py | 144 ++ .../trajectories_noise/quantum_channels.py | 67 + .../trajectories_noise/test_large_circuits.py | 87 ++ .../test_mid_circuit_measurement.py | 119 ++ .../test_onequbit_channel.py | 69 + .../test_quantum_volume_mid_circuit.py | 115 ++ .../test_state_compute_mpi.py | 341 +++++ .../densitymat_mpi_tests/test_state_mpi.py | 57 + .../test_work_stream_mpi.py | 25 + .../densitymat_tests/__init__.py | 13 + .../test_elementary_operator.py | 427 ++++++ .../densitymat_tests/test_operators.py | 130 ++ .../densitymat_tests/test_state.py | 283 ++++ .../densitymat_tests/test_work_stream.py | 43 + python/tests/requirements.txt | 2 + .../densitymat_tests/__init__.py | 3 + .../test_cudensitymat_samples.py | 21 + samples/cudensitymat/helpers.h | 96 ++ .../cudensitymat/operator_action_example.cpp | 244 ++++ .../operator_action_mpi_example.cpp | 288 ++++ .../transverse_ising_full_fused_noisy.h | 256 ++++ .../samples_mpi/distributedIndexBitSwap.cpp | 111 +- samples/cutensornet/README.md | 2 +- 101 files changed, 11688 insertions(+), 648 deletions(-) create mode 100644 python/cuquantum/bindings/__init__.pxd create mode 100644 python/cuquantum/bindings/__init__.py create mode 100644 python/cuquantum/bindings/_internal/__init__.pxd create mode 100644 python/cuquantum/bindings/_internal/cudensitymat.pxd create mode 100644 python/cuquantum/bindings/_internal/cudensitymat_linux.pyx create mode 100644 python/cuquantum/bindings/_utils.pxd create mode 100644 python/cuquantum/bindings/_utils.pyx create mode 100644 python/cuquantum/bindings/cudensitymat.pxd create mode 100644 python/cuquantum/bindings/cudensitymat.pyx create mode 100644 python/cuquantum/bindings/cycudensitymat.pxd create mode 100644 python/cuquantum/bindings/cycudensitymat.pyx create mode 100644 python/cuquantum/densitymat/__init__.py create mode 100644 python/cuquantum/densitymat/_internal/__init__.py create mode 100644 python/cuquantum/densitymat/_internal/callbacks.py create mode 100644 python/cuquantum/densitymat/_internal/library_handle.py create mode 100644 python/cuquantum/densitymat/_internal/utils.py create mode 100644 python/cuquantum/densitymat/elementary_operator.py create mode 100644 python/cuquantum/densitymat/operators.py create mode 100644 python/cuquantum/densitymat/state.py create mode 100644 python/cuquantum/densitymat/work_stream.py create mode 100644 python/samples/cutensornet/experimental/network_state/generic_states/example05_noisy_unitary_channels.py create mode 100644 python/samples/cutensornet/tensor/example12-qr_mem_limit_handling.py create mode 100644 python/samples/cutensornet/tensor/example12-svd_mem_limit_handling.py create mode 100644 python/samples/densitymat/operator_advanced.py create mode 100644 python/samples/densitymat/operator_defaults.py create mode 100644 python/samples/densitymat/operator_mpi.py create mode 100644 python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/__init__.py create mode 100644 python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/conftest.py create mode 100644 python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/network_state_wrap.py create mode 100644 
python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/quantum_channels.py create mode 100644 python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_large_circuits.py create mode 100644 python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_mid_circuit_measurement.py create mode 100644 python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_onequbit_channel.py create mode 100644 python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_quantum_volume_mid_circuit.py create mode 100644 python/tests/cuquantum_tests/densitymat_mpi_tests/test_state_compute_mpi.py create mode 100644 python/tests/cuquantum_tests/densitymat_mpi_tests/test_state_mpi.py create mode 100644 python/tests/cuquantum_tests/densitymat_mpi_tests/test_work_stream_mpi.py create mode 100644 python/tests/cuquantum_tests/densitymat_tests/__init__.py create mode 100644 python/tests/cuquantum_tests/densitymat_tests/test_elementary_operator.py create mode 100644 python/tests/cuquantum_tests/densitymat_tests/test_operators.py create mode 100644 python/tests/cuquantum_tests/densitymat_tests/test_state.py create mode 100644 python/tests/cuquantum_tests/densitymat_tests/test_work_stream.py create mode 100644 python/tests/samples_tests/densitymat_tests/__init__.py create mode 100644 python/tests/samples_tests/densitymat_tests/test_cudensitymat_samples.py create mode 100644 samples/cudensitymat/helpers.h create mode 100644 samples/cudensitymat/operator_action_example.cpp create mode 100644 samples/cudensitymat/operator_action_mpi_example.cpp create mode 100644 samples/cudensitymat/transverse_ising_full_fused_noisy.h diff --git a/CITATION.cff b/CITATION.cff index f282aac..9edf883 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -1,12 +1,12 @@ -cff-version: 1.2.0 +cff-version: 1.3.0 title: 'NVIDIA cuQuantum SDK' message: 'If you use this software, please cite it as below.' 
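The CITATION.cff hunk beginning here swaps the Zenodo deposit DOI for the IEEE QCE paper DOI and bumps the cited version to v24.11.0. Since CFF files are plain YAML, an edit like this is easy to sanity-check programmatically; the snippet below is an illustrative check only and assumes PyYAML is installed (it is not a dependency of this repo).

import yaml  # assumption: PyYAML is available (`pip install pyyaml`)

with open("CITATION.cff") as f:
    cff = yaml.safe_load(f)

# Print the fields this patch touches; a KeyError here would flag a malformed file.
for key in ("cff-version", "doi", "version"):
    print(key, "->", cff[key])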
authors: - - name: "The cuQuantum development team" + - name: "The cuQuantum Development Team" license: BSD-3-Clause license-url: "https://github.com/NVIDIA/cuQuantum/blob/main/LICENSE" repository-code: "https://github.com/nvidia/cuquantum" type: software url: "https://github.com/nvidia/cuquantum" -doi: 10.5281/zenodo.6385574 -version: "v23.03.0" +doi: 10.1109/QCE57702.2023.00119 +version: "v24.11.0" diff --git a/README.md b/README.md index 0a28e26..ff07da6 100644 --- a/README.md +++ b/README.md @@ -12,15 +12,18 @@ This public repository contains a few sets of files related to the [NVIDIA cuQua - Available for download on - conda-forge: - `cuquantum` [![Conda Version](https://img.shields.io/conda/vn/conda-forge/cuquantum.svg)](https://anaconda.org/conda-forge/cuquantum) + - `cudensitymat` [![Conda Version](https://img.shields.io/conda/vn/conda-forge/cudensitymat.svg)](https://anaconda.org/conda-forge/cudensitymat) - `custatevec` [![Conda Version](https://img.shields.io/conda/vn/conda-forge/custatevec.svg)](https://anaconda.org/conda-forge/custatevec) - `cutensornet` [![Conda Version](https://img.shields.io/conda/vn/conda-forge/cutensornet.svg)](https://anaconda.org/conda-forge/cutensornet) - `cuquantum-python` [![Conda Version](https://img.shields.io/conda/vn/conda-forge/cuquantum-python.svg)](https://anaconda.org/conda-forge/cuquantum-python) - PyPI: - `cuquantum` [![pypi](https://img.shields.io/pypi/v/cuquantum.svg)](https://pypi.python.org/pypi/cuquantum) - `cuquantum-cu11` [![pypi](https://img.shields.io/pypi/v/cuquantum-cu11.svg)](https://pypi.python.org/pypi/cuquantum-cu11) + - `cudensitymat-cu11` [![pypi](https://img.shields.io/pypi/v/cudensitymat-cu11.svg)](https://pypi.python.org/pypi/cudensitymat-cu11) - `custatevec-cu11` [![pypi](https://img.shields.io/pypi/v/custatevec-cu11.svg)](https://pypi.python.org/pypi/custatevec-cu11) - `cutensornet-cu11` [![pypi](https://img.shields.io/pypi/v/cutensornet-cu11.svg)](https://pypi.python.org/pypi/cutensornet-cu11) - `cuquantum-cu12` [![pypi](https://img.shields.io/pypi/v/cuquantum-cu12.svg)](https://pypi.python.org/pypi/cuquantum-cu12) + - `cudensitymat-cu12` [![pypi](https://img.shields.io/pypi/v/cudensitymat-cu12.svg)](https://pypi.python.org/pypi/cudensitymat-cu12) - `custatevec-cu12` [![pypi](https://img.shields.io/pypi/v/custatevec-cu12.svg)](https://pypi.python.org/pypi/custatevec-cu12) - `cutensornet-cu12` [![pypi](https://img.shields.io/pypi/v/cutensornet-cu12.svg)](https://pypi.python.org/pypi/cutensornet-cu12) - `cuquantum-python` [![pypi](https://img.shields.io/pypi/v/cuquantum-python.svg)](https://pypi.python.org/pypi/cuquantum-python) @@ -39,6 +42,4 @@ All files hosted in this repository are subject to the [BSD-3-Clause](./LICENSE) ## Citing cuQuantum -This repository is uploaded to Zenodo automatically. Click the badge below to see citation formats. - -[![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.6385574.svg)](https://doi.org/10.5281/zenodo.6385574) +H. Bayraktar et al., "cuQuantum SDK: A High-Performance Library for Accelerating Quantum Science", 2023 IEEE International Conference on Quantum Computing and Engineering (QCE), Bellevue, WA, USA, 2023, pp. 
1050-1061, doi: [10.1109/QCE57702.2023.00119](https://doi.org/10.1109/QCE57702.2023.00119) diff --git a/python/README.md b/python/README.md index 631eb10..815b3fc 100644 --- a/python/README.md +++ b/python/README.md @@ -14,8 +14,9 @@ For instructions on installing *cuQuantum Python*, refer to our The build-time dependencies of the cuQuantum Python package include: * CUDA Toolkit 11.x or 12.x -* cuStateVec 1.4.0+ -* cuTensorNet 2.5.0+ +* cuStateVec 1.7.0+ +* cuTensorNet 2.6.0+ +* cuDensityMat >=0.0.5, <0.1.0 * Python 3.10+ * Cython >=0.29.22,<3 * pip 21.3.1+ @@ -43,10 +44,9 @@ Notes: * `-v`: enable more verbose output * `--no-deps`: avoid installing the *run-time* dependencies * `--no-build-isolation`: reuse the current Python environment instead of creating a new one for building the package (this avoids installing any *build-time* dependencies) -- As an alternative to setting `CUQUANTUM_ROOT`, `CUSTATEVEC_ROOT` and `CUTENSORNET_ROOT` can be set to point to the cuStateVec and the cuTensorNet libraries, respectively. The latter two environment variables take precedence if defined. +- As an alternative to setting `CUQUANTUM_ROOT`, `CUSTATEVEC_ROOT`, `CUTENSORNET_ROOT` and `CUDENSITYMAT_ROOT` can be set to point to the cuStateVec, cuTensorNet and cuDensityMat libraries, respectively. The latter three environment variables take precedence if defined. - Please ensure that you use consistent binaries and packages for either CUDA 11 or 12. Mixing-and-matching will result in undefined behavior. - ## Running ### Requirements @@ -56,8 +56,9 @@ Runtime dependencies of the cuQuantum Python package include: * An NVIDIA GPU with compute capability 7.0+ * Driver: Linux (450.80.02+ for CUDA 11, 525.60.13+ for CUDA 12) * CUDA Toolkit 11.x or 12.x -* cuStateVec 1.4.0+ -* cuTensorNet 2.5.0+ +* cuStateVec 1.7.0+ +* cuTensorNet 2.6.0+ +* cuDensityMat >=0.0.5, <0.1.0 * Python 3.10+ * NumPy v1.21+ * CuPy v13.0.0+ (see [installation guide](https://docs.cupy.dev/en/stable/install.html)) @@ -83,17 +84,15 @@ Known issues: Samples for demonstrating the usage of both low-level and high-level Python APIs are available in the `samples` directory. The low-level API samples are 1:1 translations of the corresponding -samples written in C. The high-level API samples demonstrate pythonic usages of the cuTensorNet +samples written in C. The high-level API samples demonstrate pythonic usages of the cuTensorNet and cuDensityMat library in Python. - ## Testing If pytest is installed, typing `pytest tests` at the command prompt in the Python source root directory will run all tests. Some tests would be skipped if `cffi` is not installed or if the environment variable `CUDA_PATH` is not set. - ## Citing cuQuantum -Please click this Zenodo badge to see the citation format: [![DOI](https://zenodo.org/badge/DOI/10.5281/zenodo.6385574.svg)](https://doi.org/10.5281/zenodo.6385574) +H. Bayraktar et al., "cuQuantum SDK: A High-Performance Library for Accelerating Quantum Science", 2023 IEEE International Conference on Quantum Computing and Engineering (QCE), Bellevue, WA, USA, 2023, pp. 
1050-1061, doi: [10.1109/QCE57702.2023.00119](https://doi.org/10.1109/QCE57702.2023.00119) diff --git a/python/builder/pep517.py b/python/builder/pep517.py index 0916ab7..ab04683 100644 --- a/python/builder/pep517.py +++ b/python/builder/pep517.py @@ -30,8 +30,9 @@ def get_requires_for_build_wheel(config_settings=None): # set up version constraints: note that CalVer like 22.03 is normalized to # 22.3 by setuptools, so we must follow the same practice in the constraints; # also, we don't need the patch number here - cuqnt_require = [f'custatevec-cu{utils.cuda_major_ver}~=1.6', # ">=1.6.0,<2" - f'cutensornet-cu{utils.cuda_major_ver}~=2.5', # ">=2.5.0,<3" + cuqnt_require = [f'custatevec-cu{utils.cuda_major_ver}~=1.7', # ">=1.7.0,<2" + f'cutensornet-cu{utils.cuda_major_ver}~=2.6', # ">=2.6.0,<3" + f'cudensitymat-cu{utils.cuda_major_ver}~=0.0.5' # ">=0.0.5, <0.1" ] return _build_meta.get_requires_for_build_wheel(config_settings) + cuqnt_require diff --git a/python/builder/utils.py b/python/builder/utils.py index 1c23f88..dce659f 100644 --- a/python/builder/utils.py +++ b/python/builder/utils.py @@ -79,7 +79,7 @@ def run(self): class build_ext(_build_ext): def _set_library_roots(self): - custatevec_root = cutensornet_root = cuquantum_root = None + custatevec_root = cutensornet_root = cudensitymat_root = cuquantum_root = None # Note that we need sys.path because of build isolation (since PEP 517) py_paths = sys.path + [site.getusersitepackages()] + site.getsitepackages() @@ -89,10 +89,10 @@ def _set_library_roots(self): for path in py_paths: path = os.path.join(path, 'cuquantum') if os.path.isdir(os.path.join(path, 'include')): - custatevec_root = cutensornet_root = path + custatevec_root = cutensornet_root = cudensitymat_root = path break else: - # We allow setting CUSTATEVEC_ROOT and CUTENSORNET_ROOT separately for the ease + # We allow setting CUSTATEVEC_ROOT, CUTENSORNET_ROOT & CUDENSITYMAT_ROOT separately for the ease # of development, but users are encouraged to either install cuquantum from PyPI # or conda, or set CUQUANTUM_ROOT to the existing installation. cuquantum_root = os.environ.get('CUQUANTUM_ROOT') @@ -108,19 +108,26 @@ def _set_library_roots(self): if cuquantum_root is None: raise RuntimeError('cuTensorNet is not found, please set $CUQUANTUM_ROOT ' 'or $CUTENSORNET_ROOT') from e + + try: + cudensitymat_root = os.environ['CUDENSITYMAT_ROOT'] + except KeyError as e: + if cuquantum_root is None: + raise RuntimeError('cuDensityMat is not found, please set $CUQUANTUM_ROOT ' + 'or $CUDENSITYMAT_ROOT') from e - return custatevec_root, cutensornet_root, cuquantum_root + return custatevec_root, cutensornet_root, cudensitymat_root, cuquantum_root def _prep_includes_libs_rpaths(self): """ - Set global vars cusv_incl_dir, cutn_incl_dir, and extra_linker_flags. + Set global vars cusv_incl_dir, cutn_incl_dir, cudm_incl_dir, and extra_linker_flags. With the new bindings, we no longer need to link to cuQuantum DSOs. 
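The `~=` pins in the pep517.py hunk above are PEP 440 compatible-release constraints: `~=1.7` accepts any 1.x release at or above 1.7, while the extra version component in `~=0.0.5` narrows acceptance to the 0.0.x series, exactly as the inline comments state. A quick way to confirm the expansion is the `packaging` library (an assumption here, though it ships with modern pip/setuptools environments):

from packaging.specifiers import SpecifierSet

for spec, candidates in [("~=1.7", ["1.7.0", "1.9.5", "2.0.0"]),
                         ("~=0.0.5", ["0.0.5", "0.0.9", "0.1.0"])]:
    s = SpecifierSet(spec)
    # contains() applies the PEP 440 compatible-release rule
    print(spec, {v: s.contains(v) for v in candidates})

# ~=1.7   -> 1.7.0: True, 1.9.5: True, 2.0.0: False  (i.e. >=1.7, <2)
# ~=0.0.5 -> 0.0.5: True, 0.0.9: True, 0.1.0: False  (i.e. >=0.0.5, <0.1)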
""" - custatevec_root, cutensornet_root, cuquantum_root = self._set_library_roots() + custatevec_root, cutensornet_root, cudensitymat_root, cuquantum_root = self._set_library_roots() - global cusv_incl_dir, cutn_incl_dir, cuqnt_incl_dir - cusv_incl_dir = cutn_incl_dir = cuqnt_incl_dir = None + global cusv_incl_dir, cutn_incl_dir, cudm_incl_dir, cuqnt_incl_dir + cusv_incl_dir = cutn_incl_dir = cudm_incl_dir = cuqnt_incl_dir = None base_incl_dir = (os.path.join(cuda_path, 'include'),) if cuquantum_root is not None: cuqnt_incl_dir = base_incl_dir + (os.path.join(cuquantum_root, 'include'),) @@ -128,6 +135,8 @@ def _prep_includes_libs_rpaths(self): cusv_incl_dir = base_incl_dir + (os.path.join(custatevec_root, 'include'),) if cutensornet_root is not None: cutn_incl_dir = base_incl_dir + (os.path.join(cutensornet_root, 'include'),) + if cudensitymat_root is not None: + cudm_incl_dir = base_incl_dir + (os.path.join(cudensitymat_root, 'include'),) global extra_linker_flags if not building_wheel: @@ -138,8 +147,8 @@ def _prep_includes_libs_rpaths(self): # Note: soname = library major version # We don't need to link to cuBLAS/cuSOLVER/cuTensor at build time # The rpaths must be adjusted given the following full-wheel installation: - # - cuquantum-python: site-packages/cuquantum/{custatevec, cutensornet}/_internal/ [=$ORIGIN] - # - cusv & cutn: site-packages/cuquantum/lib/ + # - cuquantum-python: site-packages/cuquantum/{custatevec, cutensornet, cudensitymat}/_internal/ [=$ORIGIN] + # - cusv, cutn & cudm: site-packages/cuquantum/lib/ # - cutensor: site-packages/cutensor/lib/ # - cublas: site-packages/nvidia/cublas/lib/ # - cusolver: site-packages/nvidia/cusolver/lib/ @@ -157,17 +166,20 @@ def _prep_includes_libs_rpaths(self): print("CUDA path:", cuda_path) print("cuStateVec path:", custatevec_root if custatevec_root else cuquantum_root) print("cuTensorNet path:", cutensornet_root if cutensornet_root else cuquantum_root) + print("cuDensityMat path:", cudensitymat_root if cudensitymat_root else cuquantum_root) print("*"*80+"\n") def build_extension(self, ext): ext.include_dirs = () - for include_dir in (cusv_incl_dir, cutn_incl_dir, cuqnt_incl_dir): + for include_dir in (cusv_incl_dir, cutn_incl_dir, cudm_incl_dir, cuqnt_incl_dir): if include_dir is not None: ext.include_dirs += include_dir if ext.name.endswith("custatevec"): ext.extra_link_args = extra_linker_flags elif ext.name.endswith("cutensornet"): ext.extra_link_args = extra_linker_flags + elif ext.name.endswith("cudensitymat"): + ext.extra_link_args = extra_linker_flags super().build_extension(ext) diff --git a/python/cuquantum/__init__.py b/python/cuquantum/__init__.py index f60488c..c75db61 100644 --- a/python/cuquantum/__init__.py +++ b/python/cuquantum/__init__.py @@ -2,12 +2,14 @@ # # SPDX-License-Identifier: BSD-3-Clause +from cuquantum import bindings from cuquantum import custatevec from cuquantum import cutensornet +from cuquantum import densitymat from cuquantum.cutensornet import ( contract, contract_path, einsum, einsum_path, tensor, tensor_qualifiers_dtype, BaseCUDAMemoryManager, CircuitToEinsum, MemoryPointer, Network, NetworkOptions, OptimizerInfo, OptimizerOptions, PathFinderOptions, - ReconfigOptions, SlicerOptions) + ReconfigOptions, SlicerOptions, MemoryLimitExceeded) from cuquantum._utils import ComputeType, cudaDataType, libraryPropertyType from cuquantum._version import __version__ diff --git a/python/cuquantum/__main__.py b/python/cuquantum/__main__.py index 542e613..cd0ff6d 100644 --- a/python/cuquantum/__main__.py +++ 
b/python/cuquantum/__main__.py @@ -21,6 +21,9 @@ def get_lib_path(name): elif "cutensor" in name: # cutensor or cutensornet from cuquantum import cutensornet as cutn cutn._internal.cutensornet._inspect_function_pointers() + elif "cudensitymat" in name: + from cuquantum import bindings + bindings._internal.cudensitymat._inspect_function_pointers() try: with open('/proc/self/maps') as f: @@ -53,7 +56,7 @@ def get_lib_path(name): def _get_cuquantum_libs(): paths = set() - for lib in ('custatevec', 'cutensornet', 'cutensor'): + for lib in ('custatevec', 'cutensornet', 'cutensor', 'cudensitymat'): path = os.path.normpath(get_lib_path(f"lib{lib}.so")) paths.add(path) return tuple(paths) @@ -92,7 +95,7 @@ def _get_cuquantum_target(target): parser.add_argument('--libs', action='store_true', help='get cuQuantum linker flags') parser.add_argument('--target', action='append', default=[], - choices=('custatevec', 'cutensornet'), + choices=('custatevec', 'cutensornet', 'cudensitymat'), help='get the linker flag for the target cuQuantum component') args = parser.parse_args() @@ -109,6 +112,6 @@ def _get_cuquantum_target(target): flag = '' for target in args.target: flag += _get_cuquantum_target(target) - if target == 'cutensornet': + if target in ('cutensornet', 'cudensitymat') : flag += _get_cuquantum_target('cutensor') print(flag) diff --git a/python/cuquantum/_utils.pxd b/python/cuquantum/_utils.pxd index 4fefe77..55902a4 100644 --- a/python/cuquantum/_utils.pxd +++ b/python/cuquantum/_utils.pxd @@ -166,9 +166,10 @@ cdef cppclass nested_resource[T]: nullable_unique_ptr[ vector[vector[T]] ] nested_resource_ptr -cdef nullable_unique_ptr[ vector[ResT] ] get_resource_ptr(object obj, ResT* __unused) -cdef nullable_unique_ptr[ vector[PtrT*] ] get_resource_ptrs(object obj, PtrT* __unused) -cdef nested_resource[ResT] get_nested_resource_ptr(object obj, ResT* __unused) +# accepts the output pointer as input to use the return value for exception propagation +cdef int get_resource_ptr(nullable_unique_ptr[vector[ResT]] &in_out_ptr, object obj, ResT* __unused) except 1 +cdef int get_resource_ptrs(nullable_unique_ptr[ vector[PtrT*] ] &in_out_ptr, object obj, PtrT* __unused) except 1 +cdef int get_nested_resource_ptr(nested_resource[ResT] &in_out_ptr, object obj, ResT* __unused) except 1 # Cython limitation: need standalone typedef if we wanna use it for casting diff --git a/python/cuquantum/_utils.pyx b/python/cuquantum/_utils.pyx index 409e6a5..59285c9 100644 --- a/python/cuquantum/_utils.pyx +++ b/python/cuquantum/_utils.pyx @@ -196,42 +196,41 @@ cdef int[29] _WHITESPACE_UNICODE_INTS = [ WHITESPACE_UNICODE = ''.join(chr(s) for s in _WHITESPACE_UNICODE_INTS) -# Cython can't infer the overload by return type alone, so we need a dummy -# input argument to help it -cdef nullable_unique_ptr[ vector[ResT] ] get_resource_ptr(object obj, ResT* __unused): - cdef nullable_unique_ptr[ vector[ResT] ] ptr - cdef vector[ResT]* vec +# Cython can't infer the ResT overload when it is wrapped in nullable_unique_ptr, +# so we need a dummy (__unused) input argument to help it +cdef int get_resource_ptr(nullable_unique_ptr[vector[ResT]] &in_out_ptr, object obj, ResT* __unused) except 1: if isinstance(obj, _np_ndarray): # TODO: can we do "assert obj.dtype == some_dtype" here? it seems we have no # way to check the dtype... # TODO: how about buffer protocol? 
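The `_utils.pxd` hunk above explains the signature change: a `cdef` function that returns a C++ object by value leaves Cython no sentinel through which to report a raised Python exception, so the refactor moves the result into a reference parameter and returns an `int` declared `except 1`. A minimal standalone sketch of that convention (illustrative only, not the library's code) looks like this:

# distutils: language = c++
# Minimal sketch of the `except <value>` error-code convention; compile with Cython.
from libcpp.vector cimport vector

cdef int fill_vector(vector[int] &out, object obj) except 1:
    # If int(x) raises, Cython returns 1 here and the caller re-raises the
    # pending Python exception instead of silently continuing.
    for x in obj:
        out.push_back(int(x))
    return 0

def convert(obj):
    cdef vector[int] vec
    fill_vector(vec, obj)  # Cython inserts the `== 1` error check automatically
    return vec             # auto-converts to a Python list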
assert <size_t>(obj.dtype.itemsize) == sizeof(ResT) - ptr.reset(<vector[ResT]*><intptr_t>(obj.ctypes.data), False) + in_out_ptr.reset(<vector[ResT]*><intptr_t>(obj.ctypes.data), False) elif cpython.PySequence_Check(obj): vec = new vector[ResT](len(obj)) + # set the ownership immediately to avoid leaking the `vec` memory in + # case of exception in the following loop + in_out_ptr.reset(vec, True) for i in range(len(obj)): deref(vec)[i] = obj[i] - ptr.reset(vec, True) else: - ptr.reset(<vector[ResT]*><intptr_t>obj, False) - return move(ptr) + in_out_ptr.reset(<vector[ResT]*><intptr_t>obj, False) + return 0 -cdef nullable_unique_ptr[ vector[PtrT*] ] get_resource_ptrs(object obj, PtrT* __unused): - cdef nullable_unique_ptr[ vector[PtrT*] ] ptr - cdef vector[PtrT*]* vec +cdef int get_resource_ptrs(nullable_unique_ptr[ vector[PtrT*] ] &in_out_ptr, object obj, PtrT* __unused) except 1: if cpython.PySequence_Check(obj): vec = new vector[PtrT*](len(obj)) + # set the ownership immediately to avoid leaking the `vec` memory in + # case of exception in the following loop + in_out_ptr.reset(vec, True) for i in range(len(obj)): deref(vec)[i] = <PtrT*><intptr_t>(obj[i]) - ptr.reset(vec, True) else: - ptr.reset(<vector[PtrT*]*><intptr_t>obj, False) - return move(ptr) + in_out_ptr.reset(<vector[PtrT*]*><intptr_t>obj, False) + return 0 -cdef nested_resource[ResT] get_nested_resource_ptr(object obj, ResT* __unused): - cdef nested_resource[ResT] res +cdef int get_nested_resource_ptr(nested_resource[ResT] &in_out_ptr, object obj, ResT* __unused) except 1: cdef nullable_unique_ptr[ vector[intptr_t] ] nested_ptr cdef nullable_unique_ptr[ vector[vector[ResT]] ] nested_res_ptr cdef vector[intptr_t]* nested_vec = NULL @@ -243,26 +242,28 @@ cdef nested_resource[ResT] get_nested_resource_ptr(object obj, ResT* __unused): length = len(obj) nested_res_vec = new vector[vector[ResT]](length) nested_vec = new vector[intptr_t](length) + # set the ownership immediately to avoid leaking memory in case of + # exception in the following loop + nested_res_ptr.reset(nested_res_vec, True) + nested_ptr.reset(nested_vec, True) for i, obj_i in enumerate(obj): deref(nested_res_vec)[i] = obj_i deref(nested_vec)[i] = <intptr_t>(deref(nested_res_vec)[i].data()) - nested_res_ptr.reset(nested_res_vec, True) - nested_ptr.reset(nested_vec, True) elif cpython.PySequence_Check(obj): length = len(obj) nested_vec = new vector[intptr_t](length) + nested_ptr.reset(nested_vec, True) for i, addr in enumerate(obj): deref(nested_vec)[i] = addr nested_res_ptr.reset(NULL, False) - nested_ptr.reset(nested_vec, True) else: # obj is an int (ResT**) nested_res_ptr.reset(NULL, False) nested_ptr.reset(<vector[intptr_t]*><intptr_t>obj, False) - res.ptrs = move(nested_ptr) - res.nested_resource_ptr = move(nested_res_ptr) - return move(res) + in_out_ptr.ptrs = move(nested_ptr) + in_out_ptr.nested_resource_ptr = move(nested_res_ptr) + return 0 class FunctionNotFoundError(RuntimeError): pass diff --git a/python/cuquantum/_version.py b/python/cuquantum/_version.py index 4b7fb8b..81b1987 100644 --- a/python/cuquantum/_version.py +++ b/python/cuquantum/_version.py @@ -5,4 +5,4 @@ # Note: cuQuantum Python follows the cuQuantum SDK version, which is now # switched to YY.MM and is different from individual libraries' (semantic) # versioning scheme. 
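The recurring comment in this hunk ("set the ownership immediately...") is the substantive fix: each element assignment in the fill loop can raise, and before this change the freshly `new`-ed vector was only handed to the owning smart pointer after the loop, so an exception leaked it. Reduced to its essence, the pattern looks like the sketch below, which uses `std::unique_ptr` rather than the library's `nullable_unique_ptr` and is illustrative only:

# distutils: language = c++
# Sketch of the take-ownership-before-filling pattern from the hunk above.
from libcpp.memory cimport unique_ptr
from libcpp.vector cimport vector
from cython.operator cimport dereference as deref

cdef int fill(unique_ptr[vector[int]] &out, object obj) except 1:
    cdef vector[int]* vec = new vector[int](len(obj))
    out.reset(vec)              # take ownership first, ...
    for i in range(len(obj)):
        deref(vec)[i] = obj[i]  # ... so a raise here cannot leak `vec`
    return 0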
-__version__ = '24.08.0' +__version__ = '24.11.0' diff --git a/python/cuquantum/bindings/__init__.pxd b/python/cuquantum/bindings/__init__.pxd new file mode 100644 index 0000000..808298f --- /dev/null +++ b/python/cuquantum/bindings/__init__.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause diff --git a/python/cuquantum/bindings/__init__.py b/python/cuquantum/bindings/__init__.py new file mode 100644 index 0000000..6f2f79e --- /dev/null +++ b/python/cuquantum/bindings/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +from cuquantum.bindings import cudensitymat \ No newline at end of file diff --git a/python/cuquantum/bindings/_internal/__init__.pxd b/python/cuquantum/bindings/_internal/__init__.pxd new file mode 100644 index 0000000..808298f --- /dev/null +++ b/python/cuquantum/bindings/_internal/__init__.pxd @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause diff --git a/python/cuquantum/bindings/_internal/cudensitymat.pxd b/python/cuquantum/bindings/_internal/cudensitymat.pxd new file mode 100644 index 0000000..7c81336 --- /dev/null +++ b/python/cuquantum/bindings/_internal/cudensitymat.pxd @@ -0,0 +1,54 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +from ..cycudensitymat cimport * + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef cudensitymatStatus_t _cudensitymatCreate(cudensitymatHandle_t* handle) except* nogil +cdef cudensitymatStatus_t _cudensitymatDestroy(cudensitymatHandle_t handle) except* nogil +cdef cudensitymatStatus_t _cudensitymatResetDistributedConfiguration(cudensitymatHandle_t handle, cudensitymatDistributedProvider_t provider, const void* commPtr, size_t commSize) except* nogil +cdef cudensitymatStatus_t _cudensitymatGetNumRanks(const cudensitymatHandle_t handle, int32_t* numRanks) except* nogil +cdef cudensitymatStatus_t _cudensitymatGetProcRank(const cudensitymatHandle_t handle, int32_t* procRank) except* nogil +cdef cudensitymatStatus_t _cudensitymatResetRandomSeed(cudensitymatHandle_t handle, int32_t randomSeed) except* nogil +cdef cudensitymatStatus_t _cudensitymatCreateState(const cudensitymatHandle_t handle, cudensitymatStatePurity_t purity, int32_t numSpaceModes, const int64_t spaceModeExtents[], int64_t batchSize, cudaDataType_t dataType, cudensitymatState_t* state) except* nogil +cdef cudensitymatStatus_t _cudensitymatDestroyState(cudensitymatState_t state) except* nogil +cdef cudensitymatStatus_t _cudensitymatStateGetNumComponents(const cudensitymatHandle_t handle, const cudensitymatState_t state, int32_t* numStateComponents) except* nogil +cdef cudensitymatStatus_t _cudensitymatStateGetComponentStorageSize(const cudensitymatHandle_t handle, const cudensitymatState_t state, int32_t numStateComponents, size_t componentBufferSize[]) except* nogil +cdef cudensitymatStatus_t _cudensitymatStateAttachComponentStorage(const cudensitymatHandle_t handle, cudensitymatState_t state, int32_t numStateComponents, void* componentBuffer[], const size_t componentBufferSize[]) except* nogil +cdef cudensitymatStatus_t _cudensitymatStateGetComponentNumModes(const cudensitymatHandle_t handle, cudensitymatState_t state, int32_t stateComponentLocalId, int32_t* 
stateComponentGlobalId, int32_t* stateComponentNumModes, int32_t* batchModeLocation) except* nogil +cdef cudensitymatStatus_t _cudensitymatStateGetComponentInfo(const cudensitymatHandle_t handle, cudensitymatState_t state, int32_t stateComponentLocalId, int32_t* stateComponentGlobalId, int32_t* stateComponentNumModes, int64_t stateComponentModeExtents[], int64_t stateComponentModeOffsets[]) except* nogil +cdef cudensitymatStatus_t _cudensitymatStateInitializeZero(const cudensitymatHandle_t handle, cudensitymatState_t state, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t _cudensitymatStateComputeScaling(const cudensitymatHandle_t handle, cudensitymatState_t state, const void* scalingFactors, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t _cudensitymatStateComputeNorm(const cudensitymatHandle_t handle, const cudensitymatState_t state, void* norm, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t _cudensitymatStateComputeTrace(const cudensitymatHandle_t handle, const cudensitymatState_t state, void* trace, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t _cudensitymatStateComputeAccumulation(const cudensitymatHandle_t handle, const cudensitymatState_t stateIn, cudensitymatState_t stateOut, const void* scalingFactors, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t _cudensitymatStateComputeInnerProduct(const cudensitymatHandle_t handle, const cudensitymatState_t stateLeft, const cudensitymatState_t stateRight, void* innerProduct, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t _cudensitymatCreateElementaryOperator(const cudensitymatHandle_t handle, int32_t numSpaceModes, const int64_t spaceModeExtents[], cudensitymatElementaryOperatorSparsity_t sparsity, int32_t numDiagonals, const int32_t diagonalOffsets[], cudaDataType_t dataType, void* tensorData, cudensitymatWrappedTensorCallback_t tensorCallback, cudensitymatElementaryOperator_t* elemOperator) except* nogil +cdef cudensitymatStatus_t _cudensitymatDestroyElementaryOperator(cudensitymatElementaryOperator_t elemOperator) except* nogil +cdef cudensitymatStatus_t _cudensitymatCreateOperatorTerm(const cudensitymatHandle_t handle, int32_t numSpaceModes, const int64_t spaceModeExtents[], cudensitymatOperatorTerm_t* operatorTerm) except* nogil +cdef cudensitymatStatus_t _cudensitymatDestroyOperatorTerm(cudensitymatOperatorTerm_t operatorTerm) except* nogil +cdef cudensitymatStatus_t _cudensitymatOperatorTermAppendElementaryProduct(const cudensitymatHandle_t handle, cudensitymatOperatorTerm_t operatorTerm, int32_t numElemOperators, const cudensitymatElementaryOperator_t elemOperators[], const int32_t stateModesActedOn[], const int32_t modeActionDuality[], cuDoubleComplex coefficient, cudensitymatWrappedScalarCallback_t coefficientCallback) except* nogil +cdef cudensitymatStatus_t _cudensitymatOperatorTermAppendGeneralProduct(const cudensitymatHandle_t handle, cudensitymatOperatorTerm_t operatorTerm, int32_t numElemOperators, const int32_t numOperatorModes[], const int64_t* operatorModeExtents[], const int64_t* operatorModeStrides[], const int32_t stateModesActedOn[], const int32_t modeActionDuality[], cudaDataType_t dataType, void* tensorData[], cudensitymatWrappedTensorCallback_t tensorCallbacks[], cuDoubleComplex coefficient, cudensitymatWrappedScalarCallback_t coefficientCallback) except* nogil +cdef cudensitymatStatus_t _cudensitymatCreateOperator(const cudensitymatHandle_t handle, int32_t numSpaceModes, const int64_t spaceModeExtents[], cudensitymatOperator_t* 
superoperator) except* nogil +cdef cudensitymatStatus_t _cudensitymatDestroyOperator(cudensitymatOperator_t superoperator) except* nogil +cdef cudensitymatStatus_t _cudensitymatOperatorAppendTerm(const cudensitymatHandle_t handle, cudensitymatOperator_t superoperator, cudensitymatOperatorTerm_t operatorTerm, int32_t duality, cuDoubleComplex coefficient, cudensitymatWrappedScalarCallback_t coefficientCallback) except* nogil +cdef cudensitymatStatus_t _cudensitymatOperatorPrepareAction(const cudensitymatHandle_t handle, const cudensitymatOperator_t superoperator, const cudensitymatState_t stateIn, const cudensitymatState_t stateOut, cudensitymatComputeType_t computeType, size_t workspaceSizeLimit, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t _cudensitymatOperatorComputeAction(const cudensitymatHandle_t handle, const cudensitymatOperator_t superoperator, double time, int32_t numParams, const double params[], const cudensitymatState_t stateIn, cudensitymatState_t stateOut, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t _cudensitymatCreateOperatorAction(const cudensitymatHandle_t handle, int32_t numOperators, cudensitymatOperator_t operators[], cudensitymatOperatorAction_t* operatorAction) except* nogil +cdef cudensitymatStatus_t _cudensitymatDestroyOperatorAction(cudensitymatOperatorAction_t operatorAction) except* nogil +cdef cudensitymatStatus_t _cudensitymatOperatorActionPrepare(const cudensitymatHandle_t handle, cudensitymatOperatorAction_t operatorAction, const cudensitymatState_t stateIn[], const cudensitymatState_t stateOut, cudensitymatComputeType_t computeType, size_t workspaceSizeLimit, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t _cudensitymatOperatorActionCompute(const cudensitymatHandle_t handle, cudensitymatOperatorAction_t operatorAction, double time, int32_t numParams, const double params[], const cudensitymatState_t stateIn[], cudensitymatState_t stateOut, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t _cudensitymatCreateExpectation(const cudensitymatHandle_t handle, cudensitymatOperator_t superoperator, cudensitymatExpectation_t* expectation) except* nogil +cdef cudensitymatStatus_t _cudensitymatDestroyExpectation(cudensitymatExpectation_t expectation) except* nogil +cdef cudensitymatStatus_t _cudensitymatExpectationPrepare(const cudensitymatHandle_t handle, cudensitymatExpectation_t expectation, const cudensitymatState_t state, cudensitymatComputeType_t computeType, size_t workspaceSizeLimit, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t _cudensitymatExpectationCompute(const cudensitymatHandle_t handle, cudensitymatExpectation_t expectation, double time, int32_t numParams, const double params[], const cudensitymatState_t state, void* expectationValue, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t _cudensitymatCreateWorkspace(const cudensitymatHandle_t handle, cudensitymatWorkspaceDescriptor_t* workspaceDescr) except* nogil +cdef cudensitymatStatus_t _cudensitymatDestroyWorkspace(cudensitymatWorkspaceDescriptor_t workspaceDescr) except* nogil +cdef cudensitymatStatus_t _cudensitymatWorkspaceGetMemorySize(const cudensitymatHandle_t handle, const cudensitymatWorkspaceDescriptor_t workspaceDescr, cudensitymatMemspace_t 
memSpace, cudensitymatWorkspaceKind_t workspaceKind, size_t* memoryBufferSize) except* nogil +cdef cudensitymatStatus_t _cudensitymatWorkspaceSetMemory(const cudensitymatHandle_t handle, cudensitymatWorkspaceDescriptor_t workspaceDescr, cudensitymatMemspace_t memSpace, cudensitymatWorkspaceKind_t workspaceKind, void* memoryBuffer, size_t memoryBufferSize) except* nogil +cdef cudensitymatStatus_t _cudensitymatWorkspaceGetMemory(const cudensitymatHandle_t handle, const cudensitymatWorkspaceDescriptor_t workspaceDescr, cudensitymatMemspace_t memSpace, cudensitymatWorkspaceKind_t workspaceKind, void** memoryBuffer, size_t* memoryBufferSize) except* nogil diff --git a/python/cuquantum/bindings/_internal/cudensitymat_linux.pyx b/python/cuquantum/bindings/_internal/cudensitymat_linux.pyx new file mode 100644 index 0000000..b22154a --- /dev/null +++ b/python/cuquantum/bindings/_internal/cudensitymat_linux.pyx @@ -0,0 +1,969 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +from libc.stdint cimport intptr_t + +from .._utils import FunctionNotFoundError, NotSupportedError + + +############################################################################### +# Extern +############################################################################### + +cdef extern from "<dlfcn.h>" nogil: + void* dlopen(const char*, int) + char* dlerror() + void* dlsym(void*, const char*) + int dlclose(void*) + + enum: + RTLD_LAZY + RTLD_NOW + RTLD_GLOBAL + RTLD_LOCAL + + const void* RTLD_DEFAULT 'RTLD_DEFAULT' + + +############################################################################### +# Wrapper init +############################################################################### + +cdef bint __py_cudensitymat_init = False + +cdef void* __cudensitymatCreate = NULL +cdef void* __cudensitymatDestroy = NULL +cdef void* __cudensitymatResetDistributedConfiguration = NULL +cdef void* __cudensitymatGetNumRanks = NULL +cdef void* __cudensitymatGetProcRank = NULL +cdef void* __cudensitymatResetRandomSeed = NULL +cdef void* __cudensitymatCreateState = NULL +cdef void* __cudensitymatDestroyState = NULL +cdef void* __cudensitymatStateGetNumComponents = NULL +cdef void* __cudensitymatStateGetComponentStorageSize = NULL +cdef void* __cudensitymatStateAttachComponentStorage = NULL +cdef void* __cudensitymatStateGetComponentNumModes = NULL +cdef void* __cudensitymatStateGetComponentInfo = NULL +cdef void* __cudensitymatStateInitializeZero = NULL +cdef void* __cudensitymatStateComputeScaling = NULL +cdef void* __cudensitymatStateComputeNorm = NULL +cdef void* __cudensitymatStateComputeTrace = NULL +cdef void* __cudensitymatStateComputeAccumulation = NULL +cdef void* __cudensitymatStateComputeInnerProduct = NULL +cdef void* __cudensitymatCreateElementaryOperator = NULL +cdef void* __cudensitymatDestroyElementaryOperator = NULL +cdef void* __cudensitymatCreateOperatorTerm = NULL +cdef void* __cudensitymatDestroyOperatorTerm = NULL +cdef void* __cudensitymatOperatorTermAppendElementaryProduct = NULL +cdef void* __cudensitymatOperatorTermAppendGeneralProduct = NULL +cdef void* __cudensitymatCreateOperator = NULL +cdef void* __cudensitymatDestroyOperator = NULL +cdef void* __cudensitymatOperatorAppendTerm = NULL +cdef void* __cudensitymatOperatorPrepareAction = NULL +cdef void* __cudensitymatOperatorComputeAction = NULL +cdef void* __cudensitymatCreateOperatorAction = NULL +cdef void* __cudensitymatDestroyOperatorAction = NULL +cdef void* __cudensitymatOperatorActionPrepare = NULL +cdef 
void* __cudensitymatOperatorActionCompute = NULL +cdef void* __cudensitymatCreateExpectation = NULL +cdef void* __cudensitymatDestroyExpectation = NULL +cdef void* __cudensitymatExpectationPrepare = NULL +cdef void* __cudensitymatExpectationCompute = NULL +cdef void* __cudensitymatCreateWorkspace = NULL +cdef void* __cudensitymatDestroyWorkspace = NULL +cdef void* __cudensitymatWorkspaceGetMemorySize = NULL +cdef void* __cudensitymatWorkspaceSetMemory = NULL +cdef void* __cudensitymatWorkspaceGetMemory = NULL + + +cdef void* load_library() except* nogil: + cdef void* handle + handle = dlopen("libcudensitymat.so.0", RTLD_NOW | RTLD_GLOBAL) + if handle == NULL: + with gil: + err_msg = dlerror() + raise RuntimeError(f'Failed to dlopen libcudensitymat ({err_msg.decode()})') + return handle + + +cdef int _check_or_init_cudensitymat() except -1 nogil: + global __py_cudensitymat_init + if __py_cudensitymat_init: + return 0 + + # Load function + cdef void* handle = NULL + global __cudensitymatCreate + __cudensitymatCreate = dlsym(RTLD_DEFAULT, 'cudensitymatCreate') + if __cudensitymatCreate == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatCreate = dlsym(handle, 'cudensitymatCreate') + + global __cudensitymatDestroy + __cudensitymatDestroy = dlsym(RTLD_DEFAULT, 'cudensitymatDestroy') + if __cudensitymatDestroy == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatDestroy = dlsym(handle, 'cudensitymatDestroy') + + global __cudensitymatResetDistributedConfiguration + __cudensitymatResetDistributedConfiguration = dlsym(RTLD_DEFAULT, 'cudensitymatResetDistributedConfiguration') + if __cudensitymatResetDistributedConfiguration == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatResetDistributedConfiguration = dlsym(handle, 'cudensitymatResetDistributedConfiguration') + + global __cudensitymatGetNumRanks + __cudensitymatGetNumRanks = dlsym(RTLD_DEFAULT, 'cudensitymatGetNumRanks') + if __cudensitymatGetNumRanks == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatGetNumRanks = dlsym(handle, 'cudensitymatGetNumRanks') + + global __cudensitymatGetProcRank + __cudensitymatGetProcRank = dlsym(RTLD_DEFAULT, 'cudensitymatGetProcRank') + if __cudensitymatGetProcRank == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatGetProcRank = dlsym(handle, 'cudensitymatGetProcRank') + + global __cudensitymatResetRandomSeed + __cudensitymatResetRandomSeed = dlsym(RTLD_DEFAULT, 'cudensitymatResetRandomSeed') + if __cudensitymatResetRandomSeed == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatResetRandomSeed = dlsym(handle, 'cudensitymatResetRandomSeed') + + global __cudensitymatCreateState + __cudensitymatCreateState = dlsym(RTLD_DEFAULT, 'cudensitymatCreateState') + if __cudensitymatCreateState == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatCreateState = dlsym(handle, 'cudensitymatCreateState') + + global __cudensitymatDestroyState + __cudensitymatDestroyState = dlsym(RTLD_DEFAULT, 'cudensitymatDestroyState') + if __cudensitymatDestroyState == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatDestroyState = dlsym(handle, 'cudensitymatDestroyState') + + global __cudensitymatStateGetNumComponents + __cudensitymatStateGetNumComponents = dlsym(RTLD_DEFAULT, 'cudensitymatStateGetNumComponents') + if __cudensitymatStateGetNumComponents == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatStateGetNumComponents = dlsym(handle, 
'cudensitymatStateGetNumComponents') + + global __cudensitymatStateGetComponentStorageSize + __cudensitymatStateGetComponentStorageSize = dlsym(RTLD_DEFAULT, 'cudensitymatStateGetComponentStorageSize') + if __cudensitymatStateGetComponentStorageSize == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatStateGetComponentStorageSize = dlsym(handle, 'cudensitymatStateGetComponentStorageSize') + + global __cudensitymatStateAttachComponentStorage + __cudensitymatStateAttachComponentStorage = dlsym(RTLD_DEFAULT, 'cudensitymatStateAttachComponentStorage') + if __cudensitymatStateAttachComponentStorage == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatStateAttachComponentStorage = dlsym(handle, 'cudensitymatStateAttachComponentStorage') + + global __cudensitymatStateGetComponentNumModes + __cudensitymatStateGetComponentNumModes = dlsym(RTLD_DEFAULT, 'cudensitymatStateGetComponentNumModes') + if __cudensitymatStateGetComponentNumModes == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatStateGetComponentNumModes = dlsym(handle, 'cudensitymatStateGetComponentNumModes') + + global __cudensitymatStateGetComponentInfo + __cudensitymatStateGetComponentInfo = dlsym(RTLD_DEFAULT, 'cudensitymatStateGetComponentInfo') + if __cudensitymatStateGetComponentInfo == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatStateGetComponentInfo = dlsym(handle, 'cudensitymatStateGetComponentInfo') + + global __cudensitymatStateInitializeZero + __cudensitymatStateInitializeZero = dlsym(RTLD_DEFAULT, 'cudensitymatStateInitializeZero') + if __cudensitymatStateInitializeZero == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatStateInitializeZero = dlsym(handle, 'cudensitymatStateInitializeZero') + + global __cudensitymatStateComputeScaling + __cudensitymatStateComputeScaling = dlsym(RTLD_DEFAULT, 'cudensitymatStateComputeScaling') + if __cudensitymatStateComputeScaling == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatStateComputeScaling = dlsym(handle, 'cudensitymatStateComputeScaling') + + global __cudensitymatStateComputeNorm + __cudensitymatStateComputeNorm = dlsym(RTLD_DEFAULT, 'cudensitymatStateComputeNorm') + if __cudensitymatStateComputeNorm == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatStateComputeNorm = dlsym(handle, 'cudensitymatStateComputeNorm') + + global __cudensitymatStateComputeTrace + __cudensitymatStateComputeTrace = dlsym(RTLD_DEFAULT, 'cudensitymatStateComputeTrace') + if __cudensitymatStateComputeTrace == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatStateComputeTrace = dlsym(handle, 'cudensitymatStateComputeTrace') + + global __cudensitymatStateComputeAccumulation + __cudensitymatStateComputeAccumulation = dlsym(RTLD_DEFAULT, 'cudensitymatStateComputeAccumulation') + if __cudensitymatStateComputeAccumulation == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatStateComputeAccumulation = dlsym(handle, 'cudensitymatStateComputeAccumulation') + + global __cudensitymatStateComputeInnerProduct + __cudensitymatStateComputeInnerProduct = dlsym(RTLD_DEFAULT, 'cudensitymatStateComputeInnerProduct') + if __cudensitymatStateComputeInnerProduct == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatStateComputeInnerProduct = dlsym(handle, 'cudensitymatStateComputeInnerProduct') + + global __cudensitymatCreateElementaryOperator + __cudensitymatCreateElementaryOperator = dlsym(RTLD_DEFAULT, 
'cudensitymatCreateElementaryOperator') + if __cudensitymatCreateElementaryOperator == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatCreateElementaryOperator = dlsym(handle, 'cudensitymatCreateElementaryOperator') + + global __cudensitymatDestroyElementaryOperator + __cudensitymatDestroyElementaryOperator = dlsym(RTLD_DEFAULT, 'cudensitymatDestroyElementaryOperator') + if __cudensitymatDestroyElementaryOperator == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatDestroyElementaryOperator = dlsym(handle, 'cudensitymatDestroyElementaryOperator') + + global __cudensitymatCreateOperatorTerm + __cudensitymatCreateOperatorTerm = dlsym(RTLD_DEFAULT, 'cudensitymatCreateOperatorTerm') + if __cudensitymatCreateOperatorTerm == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatCreateOperatorTerm = dlsym(handle, 'cudensitymatCreateOperatorTerm') + + global __cudensitymatDestroyOperatorTerm + __cudensitymatDestroyOperatorTerm = dlsym(RTLD_DEFAULT, 'cudensitymatDestroyOperatorTerm') + if __cudensitymatDestroyOperatorTerm == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatDestroyOperatorTerm = dlsym(handle, 'cudensitymatDestroyOperatorTerm') + + global __cudensitymatOperatorTermAppendElementaryProduct + __cudensitymatOperatorTermAppendElementaryProduct = dlsym(RTLD_DEFAULT, 'cudensitymatOperatorTermAppendElementaryProduct') + if __cudensitymatOperatorTermAppendElementaryProduct == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatOperatorTermAppendElementaryProduct = dlsym(handle, 'cudensitymatOperatorTermAppendElementaryProduct') + + global __cudensitymatOperatorTermAppendGeneralProduct + __cudensitymatOperatorTermAppendGeneralProduct = dlsym(RTLD_DEFAULT, 'cudensitymatOperatorTermAppendGeneralProduct') + if __cudensitymatOperatorTermAppendGeneralProduct == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatOperatorTermAppendGeneralProduct = dlsym(handle, 'cudensitymatOperatorTermAppendGeneralProduct') + + global __cudensitymatCreateOperator + __cudensitymatCreateOperator = dlsym(RTLD_DEFAULT, 'cudensitymatCreateOperator') + if __cudensitymatCreateOperator == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatCreateOperator = dlsym(handle, 'cudensitymatCreateOperator') + + global __cudensitymatDestroyOperator + __cudensitymatDestroyOperator = dlsym(RTLD_DEFAULT, 'cudensitymatDestroyOperator') + if __cudensitymatDestroyOperator == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatDestroyOperator = dlsym(handle, 'cudensitymatDestroyOperator') + + global __cudensitymatOperatorAppendTerm + __cudensitymatOperatorAppendTerm = dlsym(RTLD_DEFAULT, 'cudensitymatOperatorAppendTerm') + if __cudensitymatOperatorAppendTerm == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatOperatorAppendTerm = dlsym(handle, 'cudensitymatOperatorAppendTerm') + + global __cudensitymatOperatorPrepareAction + __cudensitymatOperatorPrepareAction = dlsym(RTLD_DEFAULT, 'cudensitymatOperatorPrepareAction') + if __cudensitymatOperatorPrepareAction == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatOperatorPrepareAction = dlsym(handle, 'cudensitymatOperatorPrepareAction') + + global __cudensitymatOperatorComputeAction + __cudensitymatOperatorComputeAction = dlsym(RTLD_DEFAULT, 'cudensitymatOperatorComputeAction') + if __cudensitymatOperatorComputeAction == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatOperatorComputeAction = 
dlsym(handle, 'cudensitymatOperatorComputeAction') + + global __cudensitymatCreateOperatorAction + __cudensitymatCreateOperatorAction = dlsym(RTLD_DEFAULT, 'cudensitymatCreateOperatorAction') + if __cudensitymatCreateOperatorAction == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatCreateOperatorAction = dlsym(handle, 'cudensitymatCreateOperatorAction') + + global __cudensitymatDestroyOperatorAction + __cudensitymatDestroyOperatorAction = dlsym(RTLD_DEFAULT, 'cudensitymatDestroyOperatorAction') + if __cudensitymatDestroyOperatorAction == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatDestroyOperatorAction = dlsym(handle, 'cudensitymatDestroyOperatorAction') + + global __cudensitymatOperatorActionPrepare + __cudensitymatOperatorActionPrepare = dlsym(RTLD_DEFAULT, 'cudensitymatOperatorActionPrepare') + if __cudensitymatOperatorActionPrepare == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatOperatorActionPrepare = dlsym(handle, 'cudensitymatOperatorActionPrepare') + + global __cudensitymatOperatorActionCompute + __cudensitymatOperatorActionCompute = dlsym(RTLD_DEFAULT, 'cudensitymatOperatorActionCompute') + if __cudensitymatOperatorActionCompute == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatOperatorActionCompute = dlsym(handle, 'cudensitymatOperatorActionCompute') + + global __cudensitymatCreateExpectation + __cudensitymatCreateExpectation = dlsym(RTLD_DEFAULT, 'cudensitymatCreateExpectation') + if __cudensitymatCreateExpectation == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatCreateExpectation = dlsym(handle, 'cudensitymatCreateExpectation') + + global __cudensitymatDestroyExpectation + __cudensitymatDestroyExpectation = dlsym(RTLD_DEFAULT, 'cudensitymatDestroyExpectation') + if __cudensitymatDestroyExpectation == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatDestroyExpectation = dlsym(handle, 'cudensitymatDestroyExpectation') + + global __cudensitymatExpectationPrepare + __cudensitymatExpectationPrepare = dlsym(RTLD_DEFAULT, 'cudensitymatExpectationPrepare') + if __cudensitymatExpectationPrepare == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatExpectationPrepare = dlsym(handle, 'cudensitymatExpectationPrepare') + + global __cudensitymatExpectationCompute + __cudensitymatExpectationCompute = dlsym(RTLD_DEFAULT, 'cudensitymatExpectationCompute') + if __cudensitymatExpectationCompute == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatExpectationCompute = dlsym(handle, 'cudensitymatExpectationCompute') + + global __cudensitymatCreateWorkspace + __cudensitymatCreateWorkspace = dlsym(RTLD_DEFAULT, 'cudensitymatCreateWorkspace') + if __cudensitymatCreateWorkspace == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatCreateWorkspace = dlsym(handle, 'cudensitymatCreateWorkspace') + + global __cudensitymatDestroyWorkspace + __cudensitymatDestroyWorkspace = dlsym(RTLD_DEFAULT, 'cudensitymatDestroyWorkspace') + if __cudensitymatDestroyWorkspace == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatDestroyWorkspace = dlsym(handle, 'cudensitymatDestroyWorkspace') + + global __cudensitymatWorkspaceGetMemorySize + __cudensitymatWorkspaceGetMemorySize = dlsym(RTLD_DEFAULT, 'cudensitymatWorkspaceGetMemorySize') + if __cudensitymatWorkspaceGetMemorySize == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatWorkspaceGetMemorySize = dlsym(handle, 
'cudensitymatWorkspaceGetMemorySize') + + global __cudensitymatWorkspaceSetMemory + __cudensitymatWorkspaceSetMemory = dlsym(RTLD_DEFAULT, 'cudensitymatWorkspaceSetMemory') + if __cudensitymatWorkspaceSetMemory == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatWorkspaceSetMemory = dlsym(handle, 'cudensitymatWorkspaceSetMemory') + + global __cudensitymatWorkspaceGetMemory + __cudensitymatWorkspaceGetMemory = dlsym(RTLD_DEFAULT, 'cudensitymatWorkspaceGetMemory') + if __cudensitymatWorkspaceGetMemory == NULL: + if handle == NULL: + handle = load_library() + __cudensitymatWorkspaceGetMemory = dlsym(handle, 'cudensitymatWorkspaceGetMemory') + + __py_cudensitymat_init = True + return 0 + + +cpdef dict _inspect_function_pointers(): + _check_or_init_cudensitymat() + cdef dict data = {} + + global __cudensitymatCreate + data["__cudensitymatCreate"] = __cudensitymatCreate + + global __cudensitymatDestroy + data["__cudensitymatDestroy"] = __cudensitymatDestroy + + global __cudensitymatResetDistributedConfiguration + data["__cudensitymatResetDistributedConfiguration"] = __cudensitymatResetDistributedConfiguration + + global __cudensitymatGetNumRanks + data["__cudensitymatGetNumRanks"] = __cudensitymatGetNumRanks + + global __cudensitymatGetProcRank + data["__cudensitymatGetProcRank"] = __cudensitymatGetProcRank + + global __cudensitymatResetRandomSeed + data["__cudensitymatResetRandomSeed"] = __cudensitymatResetRandomSeed + + global __cudensitymatCreateState + data["__cudensitymatCreateState"] = __cudensitymatCreateState + + global __cudensitymatDestroyState + data["__cudensitymatDestroyState"] = __cudensitymatDestroyState + + global __cudensitymatStateGetNumComponents + data["__cudensitymatStateGetNumComponents"] = __cudensitymatStateGetNumComponents + + global __cudensitymatStateGetComponentStorageSize + data["__cudensitymatStateGetComponentStorageSize"] = __cudensitymatStateGetComponentStorageSize + + global __cudensitymatStateAttachComponentStorage + data["__cudensitymatStateAttachComponentStorage"] = __cudensitymatStateAttachComponentStorage + + global __cudensitymatStateGetComponentNumModes + data["__cudensitymatStateGetComponentNumModes"] = __cudensitymatStateGetComponentNumModes + + global __cudensitymatStateGetComponentInfo + data["__cudensitymatStateGetComponentInfo"] = __cudensitymatStateGetComponentInfo + + global __cudensitymatStateInitializeZero + data["__cudensitymatStateInitializeZero"] = __cudensitymatStateInitializeZero + + global __cudensitymatStateComputeScaling + data["__cudensitymatStateComputeScaling"] = __cudensitymatStateComputeScaling + + global __cudensitymatStateComputeNorm + data["__cudensitymatStateComputeNorm"] = __cudensitymatStateComputeNorm + + global __cudensitymatStateComputeTrace + data["__cudensitymatStateComputeTrace"] = __cudensitymatStateComputeTrace + + global __cudensitymatStateComputeAccumulation + data["__cudensitymatStateComputeAccumulation"] = __cudensitymatStateComputeAccumulation + + global __cudensitymatStateComputeInnerProduct + data["__cudensitymatStateComputeInnerProduct"] = __cudensitymatStateComputeInnerProduct + + global __cudensitymatCreateElementaryOperator + data["__cudensitymatCreateElementaryOperator"] = __cudensitymatCreateElementaryOperator + + global __cudensitymatDestroyElementaryOperator + data["__cudensitymatDestroyElementaryOperator"] = __cudensitymatDestroyElementaryOperator + + global __cudensitymatCreateOperatorTerm + data["__cudensitymatCreateOperatorTerm"] = __cudensitymatCreateOperatorTerm + + global 
__cudensitymatDestroyOperatorTerm + data["__cudensitymatDestroyOperatorTerm"] = __cudensitymatDestroyOperatorTerm + + global __cudensitymatOperatorTermAppendElementaryProduct + data["__cudensitymatOperatorTermAppendElementaryProduct"] = __cudensitymatOperatorTermAppendElementaryProduct + + global __cudensitymatOperatorTermAppendGeneralProduct + data["__cudensitymatOperatorTermAppendGeneralProduct"] = __cudensitymatOperatorTermAppendGeneralProduct + + global __cudensitymatCreateOperator + data["__cudensitymatCreateOperator"] = __cudensitymatCreateOperator + + global __cudensitymatDestroyOperator + data["__cudensitymatDestroyOperator"] = __cudensitymatDestroyOperator + + global __cudensitymatOperatorAppendTerm + data["__cudensitymatOperatorAppendTerm"] = __cudensitymatOperatorAppendTerm + + global __cudensitymatOperatorPrepareAction + data["__cudensitymatOperatorPrepareAction"] = __cudensitymatOperatorPrepareAction + + global __cudensitymatOperatorComputeAction + data["__cudensitymatOperatorComputeAction"] = __cudensitymatOperatorComputeAction + + global __cudensitymatCreateOperatorAction + data["__cudensitymatCreateOperatorAction"] = __cudensitymatCreateOperatorAction + + global __cudensitymatDestroyOperatorAction + data["__cudensitymatDestroyOperatorAction"] = __cudensitymatDestroyOperatorAction + + global __cudensitymatOperatorActionPrepare + data["__cudensitymatOperatorActionPrepare"] = __cudensitymatOperatorActionPrepare + + global __cudensitymatOperatorActionCompute + data["__cudensitymatOperatorActionCompute"] = __cudensitymatOperatorActionCompute + + global __cudensitymatCreateExpectation + data["__cudensitymatCreateExpectation"] = __cudensitymatCreateExpectation + + global __cudensitymatDestroyExpectation + data["__cudensitymatDestroyExpectation"] = __cudensitymatDestroyExpectation + + global __cudensitymatExpectationPrepare + data["__cudensitymatExpectationPrepare"] = __cudensitymatExpectationPrepare + + global __cudensitymatExpectationCompute + data["__cudensitymatExpectationCompute"] = __cudensitymatExpectationCompute + + global __cudensitymatCreateWorkspace + data["__cudensitymatCreateWorkspace"] = __cudensitymatCreateWorkspace + + global __cudensitymatDestroyWorkspace + data["__cudensitymatDestroyWorkspace"] = __cudensitymatDestroyWorkspace + + global __cudensitymatWorkspaceGetMemorySize + data["__cudensitymatWorkspaceGetMemorySize"] = __cudensitymatWorkspaceGetMemorySize + + global __cudensitymatWorkspaceSetMemory + data["__cudensitymatWorkspaceSetMemory"] = __cudensitymatWorkspaceSetMemory + + global __cudensitymatWorkspaceGetMemory + data["__cudensitymatWorkspaceGetMemory"] = __cudensitymatWorkspaceGetMemory + + return data + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef cudensitymatStatus_t _cudensitymatCreate(cudensitymatHandle_t* handle) except* nogil: + global __cudensitymatCreate + _check_or_init_cudensitymat() + if __cudensitymatCreate == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatCreate is not found") + return (__cudensitymatCreate)( + handle) + + +cdef cudensitymatStatus_t _cudensitymatDestroy(cudensitymatHandle_t handle) except* nogil: + global __cudensitymatDestroy + _check_or_init_cudensitymat() + if __cudensitymatDestroy == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatDestroy is not found") + return (__cudensitymatDestroy)( + handle) + + +cdef cudensitymatStatus_t 
_cudensitymatResetDistributedConfiguration(cudensitymatHandle_t handle, cudensitymatDistributedProvider_t provider, const void* commPtr, size_t commSize) except* nogil: + global __cudensitymatResetDistributedConfiguration + _check_or_init_cudensitymat() + if __cudensitymatResetDistributedConfiguration == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatResetDistributedConfiguration is not found") + return (__cudensitymatResetDistributedConfiguration)( + handle, provider, commPtr, commSize) + + +cdef cudensitymatStatus_t _cudensitymatGetNumRanks(const cudensitymatHandle_t handle, int32_t* numRanks) except* nogil: + global __cudensitymatGetNumRanks + _check_or_init_cudensitymat() + if __cudensitymatGetNumRanks == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatGetNumRanks is not found") + return (__cudensitymatGetNumRanks)( + handle, numRanks) + + +cdef cudensitymatStatus_t _cudensitymatGetProcRank(const cudensitymatHandle_t handle, int32_t* procRank) except* nogil: + global __cudensitymatGetProcRank + _check_or_init_cudensitymat() + if __cudensitymatGetProcRank == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatGetProcRank is not found") + return (__cudensitymatGetProcRank)( + handle, procRank) + + +cdef cudensitymatStatus_t _cudensitymatResetRandomSeed(cudensitymatHandle_t handle, int32_t randomSeed) except* nogil: + global __cudensitymatResetRandomSeed + _check_or_init_cudensitymat() + if __cudensitymatResetRandomSeed == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatResetRandomSeed is not found") + return (__cudensitymatResetRandomSeed)( + handle, randomSeed) + + +cdef cudensitymatStatus_t _cudensitymatCreateState(const cudensitymatHandle_t handle, cudensitymatStatePurity_t purity, int32_t numSpaceModes, const int64_t spaceModeExtents[], int64_t batchSize, cudaDataType_t dataType, cudensitymatState_t* state) except* nogil: + global __cudensitymatCreateState + _check_or_init_cudensitymat() + if __cudensitymatCreateState == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatCreateState is not found") + return (__cudensitymatCreateState)( + handle, purity, numSpaceModes, spaceModeExtents, batchSize, dataType, state) + + +cdef cudensitymatStatus_t _cudensitymatDestroyState(cudensitymatState_t state) except* nogil: + global __cudensitymatDestroyState + _check_or_init_cudensitymat() + if __cudensitymatDestroyState == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatDestroyState is not found") + return (__cudensitymatDestroyState)( + state) + + +cdef cudensitymatStatus_t _cudensitymatStateGetNumComponents(const cudensitymatHandle_t handle, const cudensitymatState_t state, int32_t* numStateComponents) except* nogil: + global __cudensitymatStateGetNumComponents + _check_or_init_cudensitymat() + if __cudensitymatStateGetNumComponents == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatStateGetNumComponents is not found") + return (__cudensitymatStateGetNumComponents)( + handle, state, numStateComponents) + + +cdef cudensitymatStatus_t _cudensitymatStateGetComponentStorageSize(const cudensitymatHandle_t handle, const cudensitymatState_t state, int32_t numStateComponents, size_t componentBufferSize[]) except* nogil: + global __cudensitymatStateGetComponentStorageSize + _check_or_init_cudensitymat() + if __cudensitymatStateGetComponentStorageSize == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatStateGetComponentStorageSize is not 
found") + return (__cudensitymatStateGetComponentStorageSize)( + handle, state, numStateComponents, componentBufferSize) + + +cdef cudensitymatStatus_t _cudensitymatStateAttachComponentStorage(const cudensitymatHandle_t handle, cudensitymatState_t state, int32_t numStateComponents, void* componentBuffer[], const size_t componentBufferSize[]) except* nogil: + global __cudensitymatStateAttachComponentStorage + _check_or_init_cudensitymat() + if __cudensitymatStateAttachComponentStorage == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatStateAttachComponentStorage is not found") + return (__cudensitymatStateAttachComponentStorage)( + handle, state, numStateComponents, componentBuffer, componentBufferSize) + + +cdef cudensitymatStatus_t _cudensitymatStateGetComponentNumModes(const cudensitymatHandle_t handle, cudensitymatState_t state, int32_t stateComponentLocalId, int32_t* stateComponentGlobalId, int32_t* stateComponentNumModes, int32_t* batchModeLocation) except* nogil: + global __cudensitymatStateGetComponentNumModes + _check_or_init_cudensitymat() + if __cudensitymatStateGetComponentNumModes == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatStateGetComponentNumModes is not found") + return (__cudensitymatStateGetComponentNumModes)( + handle, state, stateComponentLocalId, stateComponentGlobalId, stateComponentNumModes, batchModeLocation) + + +cdef cudensitymatStatus_t _cudensitymatStateGetComponentInfo(const cudensitymatHandle_t handle, cudensitymatState_t state, int32_t stateComponentLocalId, int32_t* stateComponentGlobalId, int32_t* stateComponentNumModes, int64_t stateComponentModeExtents[], int64_t stateComponentModeOffsets[]) except* nogil: + global __cudensitymatStateGetComponentInfo + _check_or_init_cudensitymat() + if __cudensitymatStateGetComponentInfo == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatStateGetComponentInfo is not found") + return (__cudensitymatStateGetComponentInfo)( + handle, state, stateComponentLocalId, stateComponentGlobalId, stateComponentNumModes, stateComponentModeExtents, stateComponentModeOffsets) + + +cdef cudensitymatStatus_t _cudensitymatStateInitializeZero(const cudensitymatHandle_t handle, cudensitymatState_t state, cudaStream_t stream) except* nogil: + global __cudensitymatStateInitializeZero + _check_or_init_cudensitymat() + if __cudensitymatStateInitializeZero == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatStateInitializeZero is not found") + return (__cudensitymatStateInitializeZero)( + handle, state, stream) + + +cdef cudensitymatStatus_t _cudensitymatStateComputeScaling(const cudensitymatHandle_t handle, cudensitymatState_t state, const void* scalingFactors, cudaStream_t stream) except* nogil: + global __cudensitymatStateComputeScaling + _check_or_init_cudensitymat() + if __cudensitymatStateComputeScaling == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatStateComputeScaling is not found") + return (__cudensitymatStateComputeScaling)( + handle, state, scalingFactors, stream) + + +cdef cudensitymatStatus_t _cudensitymatStateComputeNorm(const cudensitymatHandle_t handle, const cudensitymatState_t state, void* norm, cudaStream_t stream) except* nogil: + global __cudensitymatStateComputeNorm + _check_or_init_cudensitymat() + if __cudensitymatStateComputeNorm == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatStateComputeNorm is not found") + return (__cudensitymatStateComputeNorm)( + handle, state, norm, stream) 
+ + +cdef cudensitymatStatus_t _cudensitymatStateComputeTrace(const cudensitymatHandle_t handle, const cudensitymatState_t state, void* trace, cudaStream_t stream) except* nogil: + global __cudensitymatStateComputeTrace + _check_or_init_cudensitymat() + if __cudensitymatStateComputeTrace == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatStateComputeTrace is not found") + return (__cudensitymatStateComputeTrace)( + handle, state, trace, stream) + + +cdef cudensitymatStatus_t _cudensitymatStateComputeAccumulation(const cudensitymatHandle_t handle, const cudensitymatState_t stateIn, cudensitymatState_t stateOut, const void* scalingFactors, cudaStream_t stream) except* nogil: + global __cudensitymatStateComputeAccumulation + _check_or_init_cudensitymat() + if __cudensitymatStateComputeAccumulation == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatStateComputeAccumulation is not found") + return (__cudensitymatStateComputeAccumulation)( + handle, stateIn, stateOut, scalingFactors, stream) + + +cdef cudensitymatStatus_t _cudensitymatStateComputeInnerProduct(const cudensitymatHandle_t handle, const cudensitymatState_t stateLeft, const cudensitymatState_t stateRight, void* innerProduct, cudaStream_t stream) except* nogil: + global __cudensitymatStateComputeInnerProduct + _check_or_init_cudensitymat() + if __cudensitymatStateComputeInnerProduct == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatStateComputeInnerProduct is not found") + return (__cudensitymatStateComputeInnerProduct)( + handle, stateLeft, stateRight, innerProduct, stream) + + +cdef cudensitymatStatus_t _cudensitymatCreateElementaryOperator(const cudensitymatHandle_t handle, int32_t numSpaceModes, const int64_t spaceModeExtents[], cudensitymatElementaryOperatorSparsity_t sparsity, int32_t numDiagonals, const int32_t diagonalOffsets[], cudaDataType_t dataType, void* tensorData, cudensitymatWrappedTensorCallback_t tensorCallback, cudensitymatElementaryOperator_t* elemOperator) except* nogil: + global __cudensitymatCreateElementaryOperator + _check_or_init_cudensitymat() + if __cudensitymatCreateElementaryOperator == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatCreateElementaryOperator is not found") + return (__cudensitymatCreateElementaryOperator)( + handle, numSpaceModes, spaceModeExtents, sparsity, numDiagonals, diagonalOffsets, dataType, tensorData, tensorCallback, elemOperator) + + +cdef cudensitymatStatus_t _cudensitymatDestroyElementaryOperator(cudensitymatElementaryOperator_t elemOperator) except* nogil: + global __cudensitymatDestroyElementaryOperator + _check_or_init_cudensitymat() + if __cudensitymatDestroyElementaryOperator == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatDestroyElementaryOperator is not found") + return (__cudensitymatDestroyElementaryOperator)( + elemOperator) + + +cdef cudensitymatStatus_t _cudensitymatCreateOperatorTerm(const cudensitymatHandle_t handle, int32_t numSpaceModes, const int64_t spaceModeExtents[], cudensitymatOperatorTerm_t* operatorTerm) except* nogil: + global __cudensitymatCreateOperatorTerm + _check_or_init_cudensitymat() + if __cudensitymatCreateOperatorTerm == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatCreateOperatorTerm is not found") + return (__cudensitymatCreateOperatorTerm)( + handle, numSpaceModes, spaceModeExtents, operatorTerm) + + +cdef cudensitymatStatus_t _cudensitymatDestroyOperatorTerm(cudensitymatOperatorTerm_t operatorTerm) 
except* nogil: + global __cudensitymatDestroyOperatorTerm + _check_or_init_cudensitymat() + if __cudensitymatDestroyOperatorTerm == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatDestroyOperatorTerm is not found") + return (__cudensitymatDestroyOperatorTerm)( + operatorTerm) + + +cdef cudensitymatStatus_t _cudensitymatOperatorTermAppendElementaryProduct(const cudensitymatHandle_t handle, cudensitymatOperatorTerm_t operatorTerm, int32_t numElemOperators, const cudensitymatElementaryOperator_t elemOperators[], const int32_t stateModesActedOn[], const int32_t modeActionDuality[], cuDoubleComplex coefficient, cudensitymatWrappedScalarCallback_t coefficientCallback) except* nogil: + global __cudensitymatOperatorTermAppendElementaryProduct + _check_or_init_cudensitymat() + if __cudensitymatOperatorTermAppendElementaryProduct == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatOperatorTermAppendElementaryProduct is not found") + return (__cudensitymatOperatorTermAppendElementaryProduct)( + handle, operatorTerm, numElemOperators, elemOperators, stateModesActedOn, modeActionDuality, coefficient, coefficientCallback) + + +cdef cudensitymatStatus_t _cudensitymatOperatorTermAppendGeneralProduct(const cudensitymatHandle_t handle, cudensitymatOperatorTerm_t operatorTerm, int32_t numElemOperators, const int32_t numOperatorModes[], const int64_t* operatorModeExtents[], const int64_t* operatorModeStrides[], const int32_t stateModesActedOn[], const int32_t modeActionDuality[], cudaDataType_t dataType, void* tensorData[], cudensitymatWrappedTensorCallback_t tensorCallbacks[], cuDoubleComplex coefficient, cudensitymatWrappedScalarCallback_t coefficientCallback) except* nogil: + global __cudensitymatOperatorTermAppendGeneralProduct + _check_or_init_cudensitymat() + if __cudensitymatOperatorTermAppendGeneralProduct == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatOperatorTermAppendGeneralProduct is not found") + return (__cudensitymatOperatorTermAppendGeneralProduct)( + handle, operatorTerm, numElemOperators, numOperatorModes, operatorModeExtents, operatorModeStrides, stateModesActedOn, modeActionDuality, dataType, tensorData, tensorCallbacks, coefficient, coefficientCallback) + + +cdef cudensitymatStatus_t _cudensitymatCreateOperator(const cudensitymatHandle_t handle, int32_t numSpaceModes, const int64_t spaceModeExtents[], cudensitymatOperator_t* superoperator) except* nogil: + global __cudensitymatCreateOperator + _check_or_init_cudensitymat() + if __cudensitymatCreateOperator == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatCreateOperator is not found") + return (__cudensitymatCreateOperator)( + handle, numSpaceModes, spaceModeExtents, superoperator) + + +cdef cudensitymatStatus_t _cudensitymatDestroyOperator(cudensitymatOperator_t superoperator) except* nogil: + global __cudensitymatDestroyOperator + _check_or_init_cudensitymat() + if __cudensitymatDestroyOperator == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatDestroyOperator is not found") + return (__cudensitymatDestroyOperator)( + superoperator) + + +cdef cudensitymatStatus_t _cudensitymatOperatorAppendTerm(const cudensitymatHandle_t handle, cudensitymatOperator_t superoperator, cudensitymatOperatorTerm_t operatorTerm, int32_t duality, cuDoubleComplex coefficient, cudensitymatWrappedScalarCallback_t coefficientCallback) except* nogil: + global __cudensitymatOperatorAppendTerm + _check_or_init_cudensitymat() + if 
__cudensitymatOperatorAppendTerm == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatOperatorAppendTerm is not found") + return (__cudensitymatOperatorAppendTerm)( + handle, superoperator, operatorTerm, duality, coefficient, coefficientCallback) + + +cdef cudensitymatStatus_t _cudensitymatOperatorPrepareAction(const cudensitymatHandle_t handle, const cudensitymatOperator_t superoperator, const cudensitymatState_t stateIn, const cudensitymatState_t stateOut, cudensitymatComputeType_t computeType, size_t workspaceSizeLimit, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil: + global __cudensitymatOperatorPrepareAction + _check_or_init_cudensitymat() + if __cudensitymatOperatorPrepareAction == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatOperatorPrepareAction is not found") + return (__cudensitymatOperatorPrepareAction)( + handle, superoperator, stateIn, stateOut, computeType, workspaceSizeLimit, workspace, stream) + + +cdef cudensitymatStatus_t _cudensitymatOperatorComputeAction(const cudensitymatHandle_t handle, const cudensitymatOperator_t superoperator, double time, int32_t numParams, const double params[], const cudensitymatState_t stateIn, cudensitymatState_t stateOut, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil: + global __cudensitymatOperatorComputeAction + _check_or_init_cudensitymat() + if __cudensitymatOperatorComputeAction == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatOperatorComputeAction is not found") + return (__cudensitymatOperatorComputeAction)( + handle, superoperator, time, numParams, params, stateIn, stateOut, workspace, stream) + + +cdef cudensitymatStatus_t _cudensitymatCreateOperatorAction(const cudensitymatHandle_t handle, int32_t numOperators, cudensitymatOperator_t operators[], cudensitymatOperatorAction_t* operatorAction) except* nogil: + global __cudensitymatCreateOperatorAction + _check_or_init_cudensitymat() + if __cudensitymatCreateOperatorAction == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatCreateOperatorAction is not found") + return (__cudensitymatCreateOperatorAction)( + handle, numOperators, operators, operatorAction) + + +cdef cudensitymatStatus_t _cudensitymatDestroyOperatorAction(cudensitymatOperatorAction_t operatorAction) except* nogil: + global __cudensitymatDestroyOperatorAction + _check_or_init_cudensitymat() + if __cudensitymatDestroyOperatorAction == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatDestroyOperatorAction is not found") + return (__cudensitymatDestroyOperatorAction)( + operatorAction) + + +cdef cudensitymatStatus_t _cudensitymatOperatorActionPrepare(const cudensitymatHandle_t handle, cudensitymatOperatorAction_t operatorAction, const cudensitymatState_t stateIn[], const cudensitymatState_t stateOut, cudensitymatComputeType_t computeType, size_t workspaceSizeLimit, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil: + global __cudensitymatOperatorActionPrepare + _check_or_init_cudensitymat() + if __cudensitymatOperatorActionPrepare == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatOperatorActionPrepare is not found") + return (__cudensitymatOperatorActionPrepare)( + handle, operatorAction, stateIn, stateOut, computeType, workspaceSizeLimit, workspace, stream) + + +cdef cudensitymatStatus_t _cudensitymatOperatorActionCompute(const cudensitymatHandle_t handle, cudensitymatOperatorAction_t 
operatorAction, double time, int32_t numParams, const double params[], const cudensitymatState_t stateIn[], cudensitymatState_t stateOut, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil: + global __cudensitymatOperatorActionCompute + _check_or_init_cudensitymat() + if __cudensitymatOperatorActionCompute == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatOperatorActionCompute is not found") + return (__cudensitymatOperatorActionCompute)( + handle, operatorAction, time, numParams, params, stateIn, stateOut, workspace, stream) + + +cdef cudensitymatStatus_t _cudensitymatCreateExpectation(const cudensitymatHandle_t handle, cudensitymatOperator_t superoperator, cudensitymatExpectation_t* expectation) except* nogil: + global __cudensitymatCreateExpectation + _check_or_init_cudensitymat() + if __cudensitymatCreateExpectation == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatCreateExpectation is not found") + return (__cudensitymatCreateExpectation)( + handle, superoperator, expectation) + + +cdef cudensitymatStatus_t _cudensitymatDestroyExpectation(cudensitymatExpectation_t expectation) except* nogil: + global __cudensitymatDestroyExpectation + _check_or_init_cudensitymat() + if __cudensitymatDestroyExpectation == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatDestroyExpectation is not found") + return (__cudensitymatDestroyExpectation)( + expectation) + + +cdef cudensitymatStatus_t _cudensitymatExpectationPrepare(const cudensitymatHandle_t handle, cudensitymatExpectation_t expectation, const cudensitymatState_t state, cudensitymatComputeType_t computeType, size_t workspaceSizeLimit, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil: + global __cudensitymatExpectationPrepare + _check_or_init_cudensitymat() + if __cudensitymatExpectationPrepare == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatExpectationPrepare is not found") + return (__cudensitymatExpectationPrepare)( + handle, expectation, state, computeType, workspaceSizeLimit, workspace, stream) + + +cdef cudensitymatStatus_t _cudensitymatExpectationCompute(const cudensitymatHandle_t handle, cudensitymatExpectation_t expectation, double time, int32_t numParams, const double params[], const cudensitymatState_t state, void* expectationValue, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil: + global __cudensitymatExpectationCompute + _check_or_init_cudensitymat() + if __cudensitymatExpectationCompute == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatExpectationCompute is not found") + return (__cudensitymatExpectationCompute)( + handle, expectation, time, numParams, params, state, expectationValue, workspace, stream) + + +cdef cudensitymatStatus_t _cudensitymatCreateWorkspace(const cudensitymatHandle_t handle, cudensitymatWorkspaceDescriptor_t* workspaceDescr) except* nogil: + global __cudensitymatCreateWorkspace + _check_or_init_cudensitymat() + if __cudensitymatCreateWorkspace == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatCreateWorkspace is not found") + return (__cudensitymatCreateWorkspace)( + handle, workspaceDescr) + + +cdef cudensitymatStatus_t _cudensitymatDestroyWorkspace(cudensitymatWorkspaceDescriptor_t workspaceDescr) except* nogil: + global __cudensitymatDestroyWorkspace + _check_or_init_cudensitymat() + if __cudensitymatDestroyWorkspace == NULL: + with gil: + raise FunctionNotFoundError("function 
cudensitymatDestroyWorkspace is not found") + return (__cudensitymatDestroyWorkspace)( + workspaceDescr) + + +cdef cudensitymatStatus_t _cudensitymatWorkspaceGetMemorySize(const cudensitymatHandle_t handle, const cudensitymatWorkspaceDescriptor_t workspaceDescr, cudensitymatMemspace_t memSpace, cudensitymatWorkspaceKind_t workspaceKind, size_t* memoryBufferSize) except* nogil: + global __cudensitymatWorkspaceGetMemorySize + _check_or_init_cudensitymat() + if __cudensitymatWorkspaceGetMemorySize == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatWorkspaceGetMemorySize is not found") + return (__cudensitymatWorkspaceGetMemorySize)( + handle, workspaceDescr, memSpace, workspaceKind, memoryBufferSize) + + +cdef cudensitymatStatus_t _cudensitymatWorkspaceSetMemory(const cudensitymatHandle_t handle, cudensitymatWorkspaceDescriptor_t workspaceDescr, cudensitymatMemspace_t memSpace, cudensitymatWorkspaceKind_t workspaceKind, void* memoryBuffer, size_t memoryBufferSize) except* nogil: + global __cudensitymatWorkspaceSetMemory + _check_or_init_cudensitymat() + if __cudensitymatWorkspaceSetMemory == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatWorkspaceSetMemory is not found") + return (__cudensitymatWorkspaceSetMemory)( + handle, workspaceDescr, memSpace, workspaceKind, memoryBuffer, memoryBufferSize) + + +cdef cudensitymatStatus_t _cudensitymatWorkspaceGetMemory(const cudensitymatHandle_t handle, const cudensitymatWorkspaceDescriptor_t workspaceDescr, cudensitymatMemspace_t memSpace, cudensitymatWorkspaceKind_t workspaceKind, void** memoryBuffer, size_t* memoryBufferSize) except* nogil: + global __cudensitymatWorkspaceGetMemory + _check_or_init_cudensitymat() + if __cudensitymatWorkspaceGetMemory == NULL: + with gil: + raise FunctionNotFoundError("function cudensitymatWorkspaceGetMemory is not found") + return (__cudensitymatWorkspaceGetMemory)( + handle, workspaceDescr, memSpace, workspaceKind, memoryBuffer, memoryBufferSize) diff --git a/python/cuquantum/bindings/_utils.pxd b/python/cuquantum/bindings/_utils.pxd new file mode 100644 index 0000000..1a73e7d --- /dev/null +++ b/python/cuquantum/bindings/_utils.pxd @@ -0,0 +1,184 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +cimport cpython +from libc.stdint cimport int32_t, int64_t, intptr_t, uint32_t +from libcpp.vector cimport vector +from libcpp cimport bool as cppbool +from libcpp cimport nullptr_t, nullptr +from libcpp.memory cimport unique_ptr + +cdef extern from "driver_types.h" nogil: + ctypedef void* Stream 'cudaStream_t' + + +cdef extern from * nogil: + """ + template <typename T> + class nullable_unique_ptr { + public: + nullable_unique_ptr() noexcept = default; + + nullable_unique_ptr(std::nullptr_t) noexcept = delete; + + explicit nullable_unique_ptr(T* data, bool own_data): + own_data_(own_data) + { + if (own_data) + manager_.reset(data); + else + raw_data_ = data; + } + + nullable_unique_ptr(const nullable_unique_ptr&) = delete; + + nullable_unique_ptr& operator=(const nullable_unique_ptr&) = delete; + + nullable_unique_ptr(nullable_unique_ptr&& other) noexcept + { + own_data_ = other.own_data_; + other.own_data_ = false; // ownership is transferred + if (own_data_) + { + manager_ = std::move(other.manager_); + raw_data_ = nullptr; // just in case + } + else + { + manager_.reset(nullptr); // just in case + raw_data_ = other.raw_data_; + } + } + + nullable_unique_ptr& operator=(nullable_unique_ptr&& other) noexcept + { + own_data_ =
other.own_data_; + other.own_data_ = false; // ownership is transferred + if (own_data_) + { + manager_ = std::move(other.manager_); + raw_data_ = nullptr; // just in case + } + else + { + manager_.reset(nullptr); // just in case + raw_data_ = other.raw_data_; + } + return *this; + } + + ~nullable_unique_ptr() = default; + + void reset(T* data, bool own_data) + { + own_data_ = own_data; + if (own_data_) + { + manager_.reset(data); + raw_data_ = nullptr; + } + else + { + manager_.reset(nullptr); + raw_data_ = data; + } + } + + void swap(nullable_unique_ptr& other) noexcept + { + std::swap(manager_, other.manager_); + std::swap(raw_data_, other.raw_data_); + std::swap(own_data_, other.own_data_); + } + + /* + * Get the pointer to the underlying object (this is different from data()!). + */ + T* get() const noexcept + { + if (own_data_) + return manager_.get(); + else + return raw_data_; + } + + /* + * Get the pointer to the underlying buffer (this is different from get()!). + */ + void* data() noexcept + { + if (own_data_) + return manager_.get()->data(); + else + return raw_data_; + } + + T& operator*() + { + if (own_data_) + return *manager_; + else + return *raw_data_; + } + + private: + std::unique_ptr<T> manager_{}; + T* raw_data_{nullptr}; + bool own_data_{false}; + }; + """ + # xref: cython/Cython/Includes/libcpp/memory.pxd + cdef cppclass nullable_unique_ptr[T]: + nullable_unique_ptr() + nullable_unique_ptr(T*, cppbool) + nullable_unique_ptr(nullable_unique_ptr[T]&) + + # Modifiers + void reset(T*, cppbool) + void swap(nullable_unique_ptr&) + + # Observers + T* get() + T& operator*() + void* data() + + +ctypedef fused ResT: + int + double + intptr_t + int32_t + int64_t + uint32_t + size_t + +ctypedef fused PtrT: + void + int32_t + int64_t + (void*) + +cdef cppclass nested_resource[T]: + nullable_unique_ptr[ vector[intptr_t] ] ptrs + nullable_unique_ptr[ vector[vector[T]] ] nested_resource_ptr + + +# accepts the output pointer as input to use the return value for exception propagation +cdef int get_resource_ptr(nullable_unique_ptr[vector[ResT]] &in_out_ptr, object obj, ResT* __unused) except 1 +cdef int get_resource_ptrs(nullable_unique_ptr[ vector[PtrT*] ] &in_out_ptr, object obj, PtrT* __unused) except 1 +cdef int get_nested_resource_ptr(nested_resource[ResT] &in_out_ptr, object obj, ResT* __unused) except 1 + + +# Cython limitation: need standalone typedef if we wanna use it for casting +ctypedef int (*DeviceAllocType)(void*, void**, size_t, Stream) +ctypedef int (*DeviceFreeType)(void*, void*, size_t, Stream) + + +cdef bint is_nested_sequence(data) +cdef int cuqnt_alloc_wrapper(void* ctx, void** ptr, size_t size, Stream stream) with gil +cdef int cuqnt_free_wrapper(void* ctx, void* ptr, size_t size, Stream stream) with gil +cdef void logger_callback_with_data( + int32_t log_level, const char* func_name, const char* message, + void* func_arg) with gil +cdef void* get_buffer_pointer(buf, Py_ssize_t size, readonly=*) except* diff --git a/python/cuquantum/bindings/_utils.pyx b/python/cuquantum/bindings/_utils.pyx new file mode 100644 index 0000000..0cb8e0a --- /dev/null +++ b/python/cuquantum/bindings/_utils.pyx @@ -0,0 +1,267 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +from libcpp.utility cimport move +from cython.operator cimport dereference as deref + +from enum import IntEnum + +from numpy import ndarray as _np_ndarray + + +cdef bint is_nested_sequence(data): + if not cpython.PySequence_Check(data): + return False + else: +
for i in data: + if not cpython.PySequence_Check(i): + return False + else: + return True + + +cdef int cuqnt_alloc_wrapper(void* ctx, void** ptr, size_t size, Stream stream) with gil: + """Assuming the user provides an alloc routine: ptr = alloc(size, stream). + + Note: this function holds the Python GIL. + """ + cdef tuple pairs + + try: + pairs = (ctx) + user_alloc = pairs[0] + ptr[0] = (user_alloc(size, stream)) + except: + # TODO: logging? + return 1 + else: + return 0 + + +cdef int cuqnt_free_wrapper(void* ctx, void* ptr, size_t size, Stream stream) with gil: + """Assuming the user provides a free routine: free(ptr, size, stream). + + Note: this function holds the Python GIL. + """ + cdef tuple pairs + + try: + pairs = (ctx) + user_free = pairs[1] + user_free(ptr, size, stream) + except: + # TODO: logging? + return 1 + else: + return 0 + + +cdef void logger_callback_with_data( + int32_t log_level, const char* func_name, const char* message, + void* func_arg) with gil: + func, args, kwargs = func_arg + cdef bytes function_name = func_name + cdef bytes function_message = message + func(log_level, function_name.decode(), function_message.decode(), + *args, **kwargs) + + +cdef void* get_buffer_pointer(buf, Py_ssize_t size, readonly=True) except*: + """The caller must ensure ``buf`` is alive when the returned pointer is in use.""" + cdef void* bufPtr + cdef int flags = cpython.PyBUF_ANY_CONTIGUOUS + if not readonly: + flags |= cpython.PyBUF_WRITABLE + cdef int status = -1 + cdef cpython.Py_buffer view + + if isinstance(buf, int): + bufPtr = buf + else: # try buffer protocol + try: + status = cpython.PyObject_GetBuffer(buf, &view, flags) + assert view.len == size + assert view.ndim == 1 + except Exception as e: + adj = "writable " if not readonly else "" + raise ValueError( + "buf must be either a Python int representing the pointer " + f"address to a valid buffer, or a 1D contiguous {adj}" + "buffer, of size bytes") from e + else: + bufPtr = view.buf + finally: + if status == 0: + cpython.PyBuffer_Release(&view) + + return bufPtr + + +# The (subset of) compute types below are shared by cuStateVec and cuTensorNet +class ComputeType(IntEnum): + """An enumeration of CUDA compute types.""" + COMPUTE_DEFAULT = 0 + COMPUTE_16F = 1 << 0 + COMPUTE_32F = 1 << 2 + COMPUTE_64F = 1 << 4 + COMPUTE_8U = 1 << 6 + COMPUTE_8I = 1 << 8 + COMPUTE_32U = 1 << 7 + COMPUTE_32I = 1 << 9 + COMPUTE_16BF = 1 << 10 + COMPUTE_TF32 = 1 << 12 + + +# TODO: use those exposed by CUDA Python instead, but before removing these +# duplicates, check if they are fixed to inherit IntEnum instead of Enum. 
+class cudaDataType(IntEnum): + """An enumeration of `cudaDataType_t`.""" + CUDA_R_16F = 2 + CUDA_C_16F = 6 + CUDA_R_16BF = 14 + CUDA_C_16BF = 15 + CUDA_R_32F = 0 + CUDA_C_32F = 4 + CUDA_R_64F = 1 + CUDA_C_64F = 5 + CUDA_R_4I = 16 + CUDA_C_4I = 17 + CUDA_R_4U = 18 + CUDA_C_4U = 19 + CUDA_R_8I = 3 + CUDA_C_8I = 7 + CUDA_R_8U = 8 + CUDA_C_8U = 9 + CUDA_R_16I = 20 + CUDA_C_16I = 21 + CUDA_R_16U = 22 + CUDA_C_16U = 23 + CUDA_R_32I = 10 + CUDA_C_32I = 11 + CUDA_R_32U = 12 + CUDA_C_32U = 13 + CUDA_R_64I = 24 + CUDA_C_64I = 25 + CUDA_R_64U = 26 + CUDA_C_64U = 27 + + +class libraryPropertyType(IntEnum): + """An enumeration of library version information.""" + MAJOR_VERSION = 0 + MINOR_VERSION = 1 + PATCH_LEVEL = 2 + + +del IntEnum + + +# Defined in CPython: +# https://github.com/python/cpython/blob/26bc2cc06128890ac89492eca20e83abe0789c1c/Objects/unicodetype_db.h#L6311-L6349 +cdef int[29] _WHITESPACE_UNICODE_INTS = [ + 0x0009, + 0x000A, + 0x000B, + 0x000C, + 0x000D, + 0x001C, + 0x001D, + 0x001E, + 0x001F, + 0x0020, + 0x0085, + 0x00A0, + 0x1680, + 0x2000, + 0x2001, + 0x2002, + 0x2003, + 0x2004, + 0x2005, + 0x2006, + 0x2007, + 0x2008, + 0x2009, + 0x200A, + 0x2028, + 0x2029, + 0x202F, + 0x205F, + 0x3000, +] + + +WHITESPACE_UNICODE = ''.join(chr(s) for s in _WHITESPACE_UNICODE_INTS) + + +# Cython can't infer the overload by return type alone, so we need a dummy +# input argument to help it +cdef int get_resource_ptr(nullable_unique_ptr[vector[ResT]] &in_out_ptr, object obj, ResT* __unused) except 1: + cdef vector[ResT]* vec + if cpython.PySequence_Check(obj): + vec = new vector[ResT](len(obj)) + # set the ownership immediately to avoid leaking the `vec` memory in + # case of exception in the following loop + in_out_ptr.reset(vec, True) + for i in range(len(obj)): + deref(vec)[i] = obj[i] + else: + in_out_ptr.reset(obj, False) + return 0 + + +cdef int get_resource_ptrs(nullable_unique_ptr[ vector[PtrT*] ] &in_out_ptr, object obj, PtrT* __unused) except 1: + cdef vector[PtrT*]* vec + if cpython.PySequence_Check(obj): + vec = new vector[PtrT*](len(obj)) + # set the ownership immediately to avoid leaking the `vec` memory in + # case of exception in the following loop + in_out_ptr.reset(vec, True) + for i in range(len(obj)): + deref(vec)[i] = (obj[i]) + else: + in_out_ptr.reset(obj, False) + return 0 + + +cdef int get_nested_resource_ptr(nested_resource[ResT] &in_out_ptr, object obj, ResT* __unused) except 1: + cdef nullable_unique_ptr[ vector[intptr_t] ] nested_ptr + cdef nullable_unique_ptr[ vector[vector[ResT]] ] nested_res_ptr + cdef vector[intptr_t]* nested_vec = NULL + cdef vector[vector[ResT]]* nested_res_vec = NULL + cdef size_t i = 0, length = 0 + cdef intptr_t addr + + if is_nested_sequence(obj): + length = len(obj) + nested_res_vec = new vector[vector[ResT]](length) + nested_vec = new vector[intptr_t](length) + # set the ownership immediately to avoid leaking memory in case of + # exception in the following loop + nested_res_ptr.reset(nested_res_vec, True) + nested_ptr.reset(nested_vec, True) + for i, obj_i in enumerate(obj): + deref(nested_res_vec)[i] = obj_i + deref(nested_vec)[i] = (deref(nested_res_vec)[i].data()) + elif cpython.PySequence_Check(obj): + length = len(obj) + nested_vec = new vector[intptr_t](length) + nested_ptr.reset(nested_vec, True) + for i, addr in enumerate(obj): + deref(nested_vec)[i] = addr + nested_res_ptr.reset(NULL, False) + else: + # obj is an int (ResT**) + nested_res_ptr.reset(NULL, False) + nested_ptr.reset(obj, False) + + in_out_ptr.ptrs = move(nested_ptr) + 
in_out_ptr.nested_resource_ptr = move(nested_res_ptr) + return 0 + + + +class FunctionNotFoundError(RuntimeError): pass + +class NotSupportedError(RuntimeError): pass diff --git a/python/cuquantum/bindings/cudensitymat.pxd b/python/cuquantum/bindings/cudensitymat.pxd new file mode 100644 index 0000000..d177e31 --- /dev/null +++ b/python/cuquantum/bindings/cudensitymat.pxd @@ -0,0 +1,102 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +from libc.stdint cimport intptr_t + +from .cycudensitymat cimport * + + +############################################################################### +# Types +############################################################################### + +ctypedef cudensitymatHandle_t Handle +ctypedef cudensitymatState_t State +ctypedef cudensitymatElementaryOperator_t ElementaryOperator +ctypedef cudensitymatOperatorTerm_t OperatorTerm +ctypedef cudensitymatOperator_t Operator +ctypedef cudensitymatOperatorAction_t OperatorAction +ctypedef cudensitymatExpectation_t Expectation +ctypedef cudensitymatWorkspaceDescriptor_t WorkspaceDescriptor +ctypedef cudensitymatDistributedRequest_t DistributedRequest +ctypedef cudensitymatScalarCallback_t ScalarCallback +ctypedef cudensitymatTensorCallback_t TensorCallback +ctypedef cudensitymatLoggerCallback_t LoggerCallback +ctypedef cudensitymatLoggerCallbackData_t LoggerCallbackData +ctypedef cudensitymatTimeRange_t TimeRange +ctypedef cudensitymatDistributedCommunicator_t DistributedCommunicator +ctypedef cudensitymatWrappedScalarCallback_t WrappedScalarCallback +ctypedef cudensitymatWrappedTensorCallback_t WrappedTensorCallback +ctypedef cudensitymatDistributedInterface_t DistributedInterface + +ctypedef cudaStream_t Stream +ctypedef cudaDataType DataType +ctypedef libraryPropertyType_t LibraryPropertyType + + +############################################################################### +# Enum +############################################################################### + +ctypedef cudensitymatStatus_t _Status +ctypedef cudensitymatComputeType_t _ComputeType +ctypedef cudensitymatDistributedProvider_t _DistributedProvider +ctypedef cudensitymatStatePurity_t _StatePurity +ctypedef cudensitymatElementaryOperatorSparsity_t _ElementaryOperatorSparsity +ctypedef cudensitymatMemspace_t _Memspace +ctypedef cudensitymatWorkspaceKind_t _WorkspaceKind + + +############################################################################### +# Functions +############################################################################### + +cpdef intptr_t create() except? 0 +cpdef destroy(intptr_t handle) +cpdef reset_distributed_configuration(intptr_t handle, int provider, intptr_t comm_ptr, size_t comm_size) +cpdef int32_t get_num_ranks(intptr_t handle) except? -1 +cpdef int32_t get_proc_rank(intptr_t handle) except? -1 +cpdef reset_random_seed(intptr_t handle, int32_t random_seed) +cpdef intptr_t create_state(intptr_t handle, int purity, int32_t num_space_modes, space_mode_extents, int64_t batch_size, int data_type) except? 0 +cpdef destroy_state(intptr_t state) +cpdef int32_t state_get_num_components(intptr_t handle, intptr_t state) except? 
-1 +cpdef state_attach_component_storage(intptr_t handle, intptr_t state, int32_t num_state_components, component_buffer, component_buffer_size) +cpdef state_get_component_num_modes(intptr_t handle, intptr_t state, int32_t state_component_local_id, intptr_t state_component_global_id, intptr_t state_component_num_modes, intptr_t batch_mode_location) +cpdef state_get_component_info(intptr_t handle, intptr_t state, int32_t state_component_local_id, intptr_t state_component_global_id, intptr_t state_component_num_modes, intptr_t state_component_mode_extents, intptr_t state_component_mode_offsets) +cpdef state_initialize_zero(intptr_t handle, intptr_t state, intptr_t stream) +cpdef state_compute_scaling(intptr_t handle, intptr_t state, intptr_t scaling_factors, intptr_t stream) +cpdef state_compute_norm(intptr_t handle, intptr_t state, intptr_t norm, intptr_t stream) +cpdef state_compute_trace(intptr_t handle, intptr_t state, intptr_t trace, intptr_t stream) +cpdef state_compute_accumulation(intptr_t handle, intptr_t state_in, intptr_t state_out, intptr_t scaling_factors, intptr_t stream) +cpdef state_compute_inner_product(intptr_t handle, intptr_t state_left, intptr_t state_right, intptr_t inner_product, intptr_t stream) +cpdef destroy_elementary_operator(intptr_t elem_operator) +cpdef intptr_t create_operator_term(intptr_t handle, int32_t num_space_modes, space_mode_extents) except? 0 +cpdef destroy_operator_term(intptr_t operator_term) +cpdef intptr_t create_operator(intptr_t handle, int32_t num_space_modes, space_mode_extents) except? 0 +cpdef destroy_operator(intptr_t superoperator) +cpdef operator_prepare_action(intptr_t handle, intptr_t superoperator, intptr_t state_in, intptr_t state_out, int compute_type, size_t workspace_size_limit, intptr_t workspace, intptr_t stream) +cpdef operator_compute_action(intptr_t handle, intptr_t superoperator, double time, int32_t num_params, params, intptr_t state_in, intptr_t state_out, intptr_t workspace, intptr_t stream) +cpdef intptr_t create_operator_action(intptr_t handle, int32_t num_operators, operators) except? 0 +cpdef destroy_operator_action(intptr_t operator_action) +cpdef operator_action_prepare(intptr_t handle, intptr_t operator_action, state_in, intptr_t state_out, int compute_type, size_t workspace_size_limit, intptr_t workspace, intptr_t stream) +cpdef operator_action_compute(intptr_t handle, intptr_t operator_action, double time, int32_t num_params, params, state_in, intptr_t state_out, intptr_t workspace, intptr_t stream) +cpdef intptr_t create_expectation(intptr_t handle, intptr_t superoperator) except? 0 +cpdef destroy_expectation(intptr_t expectation) +cpdef expectation_prepare(intptr_t handle, intptr_t expectation, intptr_t state, int compute_type, size_t workspace_size_limit, intptr_t workspace, intptr_t stream) +cpdef expectation_compute(intptr_t handle, intptr_t expectation, double time, int32_t num_params, params, intptr_t state, intptr_t expectation_value, intptr_t workspace, intptr_t stream) +cpdef intptr_t create_workspace(intptr_t handle) except? 0 +cpdef destroy_workspace(intptr_t workspace_descr) +cpdef size_t workspace_get_memory_size(intptr_t handle, intptr_t workspace_descr, int mem_space, int workspace_kind) except? 
-1 +cpdef workspace_set_memory(intptr_t handle, intptr_t workspace_descr, int mem_space, int workspace_kind, intptr_t memory_buffer, size_t memory_buffer_size) +cpdef tuple workspace_get_memory(intptr_t handle, intptr_t workspace_descr, int mem_space, int workspace_kind) + +cpdef tuple state_get_component_storage_size(intptr_t handle, intptr_t state, int32_t num_state_components) + +cpdef intptr_t create_elementary_operator(intptr_t handle, int32_t num_space_modes, space_mode_extents, int sparsity, int32_t num_diagonals, diagonal_offsets, int data_type, intptr_t tensor_data, tensor_callback) except? 0 + +cpdef operator_term_append_elementary_product(intptr_t handle, intptr_t operator_term, int32_t num_elem_operators, elem_operators, state_modes_acted_on, mode_action_duality, coefficient, coefficient_callback) + +cpdef operator_term_append_general_product(intptr_t handle, intptr_t operator_term, int32_t num_elem_operators, num_operator_modes, operator_mode_extents, operator_mode_strides, state_modes_acted_on, mode_action_duality, int data_type, tensor_data, tensor_callbacks, coefficient, coefficient_callback) + +cpdef operator_append_term(intptr_t handle, intptr_t superoperator, intptr_t operator_term, int32_t duality, coefficient, coefficient_callback) diff --git a/python/cuquantum/bindings/cudensitymat.pyx b/python/cuquantum/bindings/cudensitymat.pyx new file mode 100644 index 0000000..c40ed10 --- /dev/null +++ b/python/cuquantum/bindings/cudensitymat.pyx @@ -0,0 +1,1171 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +cimport cython +from cpython.memoryview cimport PyMemoryView_FromMemory +from cpython.buffer cimport PyBUF_WRITE + +from libcpp.vector cimport vector + +from ._utils cimport (get_resource_ptr, get_nested_resource_ptr, nested_resource, nullable_unique_ptr, + get_buffer_pointer, get_resource_ptrs, DeviceAllocType, DeviceFreeType, + cuqnt_alloc_wrapper, cuqnt_free_wrapper, logger_callback_with_data) + +from enum import IntEnum as _IntEnum +import warnings as _warnings + +import numpy as _numpy + + +############################################################################### +# Callback wrappers +############################################################################### + +cdef cuda_to_numpy_data_type(cudaDataType_t data_type): + """Convert cudaDataType_t to NumPy data type.""" + if data_type == CUDA_R_32F: + return _numpy.dtype("float32") + elif data_type == CUDA_R_64F: + return _numpy.dtype("float64") + elif data_type == CUDA_C_32F: + return _numpy.dtype("complex64") + elif data_type == CUDA_C_64F: + return _numpy.dtype("complex128") + + +cdef int32_t scalar_callback_wrapper(cudensitymatScalarCallback_t _callback_, + double time, int32_t num_params, const double * _params_, + cudaDataType_t _data_type_, void * _storage_) with gil: + """Scalar callback wrapper.""" + callback = (_callback_) + + # Reconstruct tuple from pointer + cdef tuple params = tuple(_params_[i] for i in range(num_params)) + + # Construct NumPy array for data storage + data_type = cuda_to_numpy_data_type(_data_type_) + memory_view = PyMemoryView_FromMemory(_storage_, data_type.itemsize, PyBUF_WRITE) + storage = _numpy.ndarray((1,), dtype=data_type, buffer=memory_view) + + # Python function call + try: + callback(time, params, storage) + except: + return -1 + + return 0 + + +cdef int32_t tensor_callback_wrapper(cudensitymatTensorCallback_t _callback_, + cudensitymatElementaryOperatorSparsity_t sparsity, + int32_t num_modes, const int64_t * 
_mode_extents_, + const int32_t * _diagonal_offsets_, + double time, int32_t num_params, const double * _params_, + cudaDataType_t _data_type_, void * _storage_) with gil: + """Tensor callback wrapper.""" + callback = (_callback_) + + # Reconstruct tuples from pointers + mode_extents = tuple(_mode_extents_[i] for i in range(num_modes)) + params = tuple(_params_[i] for i in range(num_params)) + + # Construct NumPy array for data storage + data_type = cuda_to_numpy_data_type(_data_type_) + _size = _numpy.prod(mode_extents) + cdef size_t size = data_type.itemsize * _size + memory_view = PyMemoryView_FromMemory(_storage_, size, PyBUF_WRITE) + storage = _numpy.ndarray(mode_extents, dtype=data_type, buffer=memory_view, order='F') + + # Python function call + try: + callback(time, params, storage) + except Exception as e: + print("Caught exception in python callback:") + print(e) + return -1 + return 0 + + +############################################################################### +# Enum +############################################################################### + +class Status(_IntEnum): + """See `cudensitymatStatus_t`.""" + SUCCESS = CUDENSITYMAT_STATUS_SUCCESS + NOT_INITIALIZED = CUDENSITYMAT_STATUS_NOT_INITIALIZED + ALLOC_FAILED = CUDENSITYMAT_STATUS_ALLOC_FAILED + INVALID_VALUE = CUDENSITYMAT_STATUS_INVALID_VALUE + ARCH_MISMATCH = CUDENSITYMAT_STATUS_ARCH_MISMATCH + EXECUTION_FAILED = CUDENSITYMAT_STATUS_EXECUTION_FAILED + INTERNAL_ERROR = CUDENSITYMAT_STATUS_INTERNAL_ERROR + NOT_SUPPORTED = CUDENSITYMAT_STATUS_NOT_SUPPORTED + CALLBACK_ERROR = CUDENSITYMAT_STATUS_CALLBACK_ERROR + CUBLAS_ERROR = CUDENSITYMAT_STATUS_CUBLAS_ERROR + CUDA_ERROR = CUDENSITYMAT_STATUS_CUDA_ERROR + INSUFFICIENT_WORKSPACE = CUDENSITYMAT_STATUS_INSUFFICIENT_WORKSPACE + INSUFFICIENT_DRIVER = CUDENSITYMAT_STATUS_INSUFFICIENT_DRIVER + IO_ERROR = CUDENSITYMAT_STATUS_IO_ERROR + CUTENSOR_VERSION_MISMATCH = CUDENSITYMAT_STATUS_CUTENSOR_VERSION_MISMATCH + NO_DEVICE_ALLOCATOR = CUDENSITYMAT_STATUS_NO_DEVICE_ALLOCATOR + CUTENSOR_ERROR = CUDENSITYMAT_STATUS_CUTENSOR_ERROR + CUDMLVER_ERROR = CUDENSITYMAT_STATUS_CUDMLVER_ERROR + DEVICE_ALLOCATOR_ERROR = CUDENSITYMAT_STATUS_DEVICE_ALLOCATOR_ERROR + DISTRIBUTED_FAILURE = CUDENSITYMAT_STATUS_DISTRIBUTED_FAILURE + INTERRUPTED = CUDENSITYMAT_STATUS_INTERRUPTED + CUTENSORNET_ERROR = CUDENSITYMAT_STATUS_CUTENSORNET_ERROR + +class ComputeType(_IntEnum): + """See `cudensitymatComputeType_t`.""" + COMPUTE_64F = CUDENSITYMAT_COMPUTE_64F + COMPUTE_32F = CUDENSITYMAT_COMPUTE_32F + +class DistributedProvider(_IntEnum): + """See `cudensitymatDistributedProvider_t`.""" + NONE = CUDENSITYMAT_DISTRIBUTED_PROVIDER_NONE + MPI = CUDENSITYMAT_DISTRIBUTED_PROVIDER_MPI + NCCL = CUDENSITYMAT_DISTRIBUTED_PROVIDER_NCCL + NVSHMEM = CUDENSITYMAT_DISTRIBUTED_PROVIDER_NVSHMEM + +class StatePurity(_IntEnum): + """See `cudensitymatStatePurity_t`.""" + PURE = CUDENSITYMAT_STATE_PURITY_PURE + MIXED = CUDENSITYMAT_STATE_PURITY_MIXED + +class ElementaryOperatorSparsity(_IntEnum): + """See `cudensitymatElementaryOperatorSparsity_t`.""" + OPERATOR_SPARSITY_NONE = CUDENSITYMAT_OPERATOR_SPARSITY_NONE + OPERATOR_SPARSITY_MULTIDIAGONAL = CUDENSITYMAT_OPERATOR_SPARSITY_MULTIDIAGONAL + +class Memspace(_IntEnum): + """See `cudensitymatMemspace_t`.""" + DEVICE = CUDENSITYMAT_MEMSPACE_DEVICE + HOST = CUDENSITYMAT_MEMSPACE_HOST + +class WorkspaceKind(_IntEnum): + """See `cudensitymatWorkspaceKind_t`.""" + WORKSPACE_SCRATCH = CUDENSITYMAT_WORKSPACE_SCRATCH + + 
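The enum classes above are plain IntEnum mirrors of the corresponding C enums, so their members can be passed directly wherever the cpdef wrappers below take an ``int``. A hedged usage sketch (illustrative values, not part of this patch), assuming the module is importable as ``cuquantum.bindings.cudensitymat`` as the file paths in this patch suggest; ``5`` is the integer value of ``CUDA_C_64F`` in the ``cudaDataType`` enum from ``bindings/_utils.pyx``:

from cuquantum.bindings import cudensitymat as cudm

handle = cudm.create()
try:
    # A mixed (density-matrix) state over two 2-level systems, batch size 1,
    # with complex128 elements.
    state = cudm.create_state(
        handle,
        cudm.StatePurity.MIXED,  # cudensitymatStatePurity_t
        2,                       # num_space_modes
        [2, 2],                  # space_mode_extents (Python sequence form)
        1,                       # batch_size (0 is treated as 1)
        5,                       # cudaDataType: CUDA_C_64F
    )
    cudm.destroy_state(state)
finally:
    cudm.destroy(handle)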
+############################################################################### +# Error handling +############################################################################### + +cpdef str get_error_string(int error): + """Returns the description string for an error code. + + Args: + error (Status): Error code to convert to string. + + .. seealso:: `cudensitymatGetErrorString` + """ + return "" + + +class cuDensityMatError(Exception): + + def __init__(self, status): + self.status = status + s = Status(status) + cdef str err = f"{s.name} ({s.value}): {get_error_string(status)}" + super(cuDensityMatError, self).__init__(err) + + def __reduce__(self): + return (type(self), (self.status,)) + + +@cython.profile(False) +cpdef inline check_status(int status): + if status != 0: + raise cuDensityMatError(status) + + +############################################################################### +# Wrapper functions +############################################################################### + +cpdef intptr_t create() except? 0: + """Creates and initializes the library context. + + Returns: + intptr_t: Library handle. + + .. seealso:: `cudensitymatCreate` + """ + cdef Handle handle + with nogil: + status = cudensitymatCreate(&handle) + check_status(status) + return handle + + +cpdef destroy(intptr_t handle): + """Destroys the library context. + + Args: + handle (intptr_t): Library handle. + + .. seealso:: `cudensitymatDestroy` + """ + with nogil: + status = cudensitymatDestroy(handle) + check_status(status) + + +cpdef reset_distributed_configuration(intptr_t handle, int provider, intptr_t comm_ptr, size_t comm_size): + """Resets the current distributed execution configuration associated with the given library context. + + Args: + handle (intptr_t): Library handle. + provider (DistributedProvider): Communication service provider. + comm_ptr (intptr_t): Pointer to the communicator in a type-erased form. + comm_size (size_t): Size of the communicator in bytes. + + .. seealso:: `cudensitymatResetDistributedConfiguration` + """ + with nogil: + status = cudensitymatResetDistributedConfiguration(handle, <_DistributedProvider>provider, comm_ptr, comm_size) + check_status(status) + + +cpdef int32_t get_num_ranks(intptr_t handle) except? -1: + """Returns the total number of distributed processes associated with the given library context. + + Args: + handle (intptr_t): Library handle. + + Returns: + int32_t: Number of distributed processes. + + .. seealso:: `cudensitymatGetNumRanks` + """ + cdef int32_t num_ranks + with nogil: + status = cudensitymatGetNumRanks(handle, &num_ranks) + check_status(status) + return num_ranks + + +cpdef int32_t get_proc_rank(intptr_t handle) except? -1: + """Returns the rank of the current process in the distributed configuration associated with the given library context. + + Args: + handle (intptr_t): Library handle. + + Returns: + int32_t: Rank of the current distributed process. + + .. seealso:: `cudensitymatGetProcRank` + """ + cdef int32_t proc_rank + with nogil: + status = cudensitymatGetProcRank(handle, &proc_rank) + check_status(status) + return proc_rank + + +cpdef reset_random_seed(intptr_t handle, int32_t random_seed): + """Resets the random seed used by the random number generator inside the library context. + + Args: + handle (intptr_t): Library handle. + random_seed (int32_t): Random seed value. + + .. 
seealso:: `cudensitymatResetRandomSeed` + """ + with nogil: + status = cudensitymatResetRandomSeed(<Handle>handle, random_seed) + check_status(status) + + +cpdef intptr_t create_state(intptr_t handle, int purity, int32_t num_space_modes, space_mode_extents, int64_t batch_size, int data_type) except? 0: + """Defines an empty quantum state of a given purity and shape, or a batch of such quantum states. + + Args: + handle (intptr_t): Library handle. + purity (StatePurity): Desired quantum state purity. + num_space_modes (int32_t): Number of space modes (number of degrees of freedom). + space_mode_extents (object): Extents of the space modes (dimensions of the degrees of freedom). It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int64_t``. + + batch_size (int64_t): Batch size (number of equally-shaped quantum states). Setting the batch size to zero is the same as setting it to 1. + data_type (int): Representation data type (type of tensor elements). + + Returns: + intptr_t: Empty quantum state (or a batch of quantum states). + + .. seealso:: `cudensitymatCreateState` + """ + cdef nullable_unique_ptr[ vector[int64_t] ] _space_mode_extents_ + get_resource_ptr[int64_t](_space_mode_extents_, space_mode_extents, NULL) + cdef State state + with nogil: + status = cudensitymatCreateState(<Handle>handle, <_StatePurity>purity, num_space_modes, <const int64_t*>(_space_mode_extents_.data()), batch_size, <DataType>data_type, &state) + check_status(status) + return state + + +cpdef destroy_state(intptr_t state): + """Destroys the quantum state. + + Args: + state (intptr_t): Quantum state (or a batch of quantum states). + + .. seealso:: `cudensitymatDestroyState` + """ + with nogil: + status = cudensitymatDestroyState(<State>state) + check_status(status) + + +cpdef int32_t state_get_num_components(intptr_t handle, intptr_t state) except? -1: + """Queries the number of components (tensors) constituting the chosen quantum state representation (on the current process in multi-process runs). + + Args: + handle (intptr_t): Library handle. + state (intptr_t): Quantum state (or a batch of quantum states). + + Returns: + int32_t: Number of components (tensors) in the quantum state representation (on the current process). + + .. seealso:: `cudensitymatStateGetNumComponents` + """ + cdef int32_t num_state_components + with nogil: + status = cudensitymatStateGetNumComponents(<Handle>handle, <State>state, &num_state_components) + check_status(status) + return num_state_components + + +cpdef state_attach_component_storage(intptr_t handle, intptr_t state, int32_t num_state_components, component_buffer, component_buffer_size): + """Attaches a user-owned GPU-accessible storage buffer for each component (tensor) constituting the quantum state representation (on the current process in multi-process runs). + + Args: + handle (intptr_t): Library handle. + state (intptr_t): Quantum state (or a batch of quantum states). + num_state_components (int32_t): Number of components (tensors) in the quantum state representation (on the current process). + component_buffer (object): Pointers to user-owned GPU-accessible storage buffers for all components (tensors) constituting the quantum state representation (on the current process). It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``intptr_t``. + + component_buffer_size (object): Sizes of the provided storage buffers for all components (tensors) constituting the quantum state representation (on the current process).
It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``size_t``. + + + .. seealso:: `cudensitymatStateAttachComponentStorage` + """ + cdef nullable_unique_ptr[ vector[intptr_t] ] _component_buffer_ + get_resource_ptr[intptr_t](_component_buffer_, component_buffer, NULL) + cdef nullable_unique_ptr[ vector[size_t] ] _component_buffer_size_ + get_resource_ptr[size_t](_component_buffer_size_, component_buffer_size, NULL) + with nogil: + status = cudensitymatStateAttachComponentStorage(handle, state, num_state_components, (_component_buffer_.data()), (_component_buffer_size_.data())) + check_status(status) + + +cpdef state_get_component_num_modes(intptr_t handle, intptr_t state, int32_t state_component_local_id, intptr_t state_component_global_id, intptr_t state_component_num_modes, intptr_t batch_mode_location): + """Queries the number of modes in a local component tensor (on the current process in multi-process runs). + + Args: + handle (intptr_t): Library handle. + state (intptr_t): Quantum state (or a batch of quantum states). + state_component_local_id (int32_t): Component local id (on the current parallel process). + state_component_global_id (intptr_t): Component global id (across all parallel processes). + state_component_num_modes (intptr_t): Component tensor order (number of modes). + batch_mode_location (intptr_t): Location of the batch mode (or -1 if the batch mode is absent). + + .. seealso:: `cudensitymatStateGetComponentNumModes` + """ + with nogil: + status = cudensitymatStateGetComponentNumModes(handle, state, state_component_local_id, state_component_global_id, state_component_num_modes, batch_mode_location) + check_status(status) + + +cpdef state_get_component_info(intptr_t handle, intptr_t state, int32_t state_component_local_id, intptr_t state_component_global_id, intptr_t state_component_num_modes, intptr_t state_component_mode_extents, intptr_t state_component_mode_offsets): + """Queries information for a locally stored component tensor which represents either the full component or its slice (on the current process in multi-process runs). + + Args: + handle (intptr_t): Library handle. + state (intptr_t): Quantum state (or a batch of quantum states). + state_component_local_id (int32_t): Component local id (on the current parallel process). + state_component_global_id (intptr_t): Component global id (across all parallel processes). + state_component_num_modes (intptr_t): Component tensor order (number of modes). + state_component_mode_extents (intptr_t): Component tensor mode extents (the size of the array must be sufficient, see ``cudensitymatStateGetComponentNumModes``). + state_component_mode_offsets (intptr_t): Component tensor mode offsets (the size of the array must be sufficient, see ``cudensitymatStateGetComponentNumModes``). + + .. seealso:: `cudensitymatStateGetComponentInfo` + """ + with nogil: + status = cudensitymatStateGetComponentInfo(handle, state, state_component_local_id, state_component_global_id, state_component_num_modes, state_component_mode_extents, state_component_mode_offsets) + check_status(status) + + +cpdef state_initialize_zero(intptr_t handle, intptr_t state, intptr_t stream): + """Initializes the quantum state to zero (null state). + + Args: + handle (intptr_t): Library handle. + state (intptr_t): Quantum state (or a batch of quantum states). + stream (intptr_t): CUDA stream. + + .. 
seealso:: `cudensitymatStateInitializeZero`
+    """
+    with nogil:
+        status = cudensitymatStateInitializeZero(handle, state, stream)
+    check_status(status)
+
+
+cpdef state_compute_scaling(intptr_t handle, intptr_t state, intptr_t scaling_factors, intptr_t stream):
+    """Scales the quantum state(s) by scalar factor(s).
+
+    Args:
+        handle (intptr_t): Library handle.
+        state (intptr_t): Quantum state (or a batch of quantum states).
+        scaling_factors (intptr_t): Array of scaling factor(s) of dimension equal to the batch size in the GPU-accessible RAM (same data type as used by the state).
+        stream (intptr_t): CUDA stream.
+
+    .. seealso:: `cudensitymatStateComputeScaling`
+    """
+    with nogil:
+        status = cudensitymatStateComputeScaling(handle, state, scaling_factors, stream)
+    check_status(status)
+
+
+cpdef state_compute_norm(intptr_t handle, intptr_t state, intptr_t norm, intptr_t stream):
+    """Computes the squared Frobenius norm(s) of the quantum state(s).
+
+    Args:
+        handle (intptr_t): Library handle.
+        state (intptr_t): Quantum state (or a batch of quantum states).
+        norm (intptr_t): Pointer to the squared Frobenius norm(s) vector storage in the GPU-accessible RAM (float or double real data type).
+        stream (intptr_t): CUDA stream.
+
+    .. seealso:: `cudensitymatStateComputeNorm`
+    """
+    with nogil:
+        status = cudensitymatStateComputeNorm(handle, state, norm, stream)
+    check_status(status)
+
+
+cpdef state_compute_trace(intptr_t handle, intptr_t state, intptr_t trace, intptr_t stream):
+    """Computes the trace(s) of the quantum state(s).
+
+    Args:
+        handle (intptr_t): Library handle.
+        state (intptr_t): Quantum state (or a batch of quantum states).
+        trace (intptr_t): Pointer to the trace(s) vector storage in the GPU-accessible RAM (same data type as used by the state).
+        stream (intptr_t): CUDA stream.
+
+    .. seealso:: `cudensitymatStateComputeTrace`
+    """
+    with nogil:
+        status = cudensitymatStateComputeTrace(handle, state, trace, stream)
+    check_status(status)
+
+
+cpdef state_compute_accumulation(intptr_t handle, intptr_t state_in, intptr_t state_out, intptr_t scaling_factors, intptr_t stream):
+    """Accumulates a quantum state (or a batch of quantum states) into another quantum state (or batch) of compatible shape.
+
+    Args:
+        handle (intptr_t): Library handle.
+        state_in (intptr_t): Accumulated quantum state (or a batch of quantum states).
+        state_out (intptr_t): Accumulating quantum state (or a batch of quantum states).
+        scaling_factors (intptr_t): Array of scaling factor(s) of dimension equal to the batch size in the GPU-accessible RAM (same data type as used by the state).
+        stream (intptr_t): CUDA stream.
+
+    .. seealso:: `cudensitymatStateComputeAccumulation`
+    """
+    with nogil:
+        status = cudensitymatStateComputeAccumulation(handle, state_in, state_out, scaling_factors, stream)
+    check_status(status)
+
+
+cpdef state_compute_inner_product(intptr_t handle, intptr_t state_left, intptr_t state_right, intptr_t inner_product, intptr_t stream):
+    """Computes the inner product(s) between the left quantum state(s) and the right quantum state(s): < state(s)Left | state(s)Right >.
+
+    Args:
+        handle (intptr_t): Library handle.
+        state_left (intptr_t): Left quantum state (or a batch of quantum states).
+        state_right (intptr_t): Right quantum state (or a batch of quantum states).
+        inner_product (intptr_t): Pointer to the inner product(s) vector storage in the GPU-accessible RAM (same data type as the one used by the quantum states).
+        stream (intptr_t): CUDA stream.
+
+    .. seealso:: `cudensitymatStateComputeInnerProduct`
+    """
+    with nogil:
+        status = cudensitymatStateComputeInnerProduct(handle, state_left, state_right, inner_product, stream)
+    check_status(status)
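+
+
+# The state API above composes into a short lifecycle: create a state, query
+# and attach component storage, initialize it, then run reductions on it. The
+# sketch below is illustrative only and is not part of the generated bindings;
+# it assumes `cupy` for GPU buffers, a 2-qubit mixed state, the raw CUDA data
+# type value 5 (CUDA_C_64F), and that the `StatePurity` enum exposes a MIXED
+# member mirroring CUDENSITYMAT_STATE_PURITY_MIXED (an assumption here).
+# `state_get_component_storage_size` is documented in the handwritten section
+# further below.
+#
+#     import cupy as cp
+#     from cuquantum.bindings import cudensitymat as cudm
+#
+#     handle = cudm.create()
+#     stream = cp.cuda.get_current_stream().ptr
+#     state = cudm.create_state(handle, cudm.StatePurity.MIXED, 2, [2, 2], 1, 5)
+#     n = cudm.state_get_num_components(handle, state)
+#     sizes = cudm.state_get_component_storage_size(handle, state, n)
+#     bufs = [cp.cuda.alloc(s) for s in sizes]
+#     cudm.state_attach_component_storage(
+#         handle, state, n, [b.ptr for b in bufs], list(sizes))
+#     cudm.state_initialize_zero(handle, state, stream)
+#     norm = cp.zeros(1, dtype=cp.float64)  # squared Frobenius norm (real)
+#     cudm.state_compute_norm(handle, state, norm.data.ptr, stream)
+#     cudm.destroy_state(state)
+#     cudm.destroy(handle)
+
+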
+cpdef destroy_elementary_operator(intptr_t elem_operator):
+    """Destroys an elementary tensor operator.
+
+    Args:
+        elem_operator (intptr_t): Elementary tensor operator.
+
+    .. seealso:: `cudensitymatDestroyElementaryOperator`
+    """
+    with nogil:
+        status = cudensitymatDestroyElementaryOperator(elem_operator)
+    check_status(status)
+
+
+cpdef intptr_t create_operator_term(intptr_t handle, int32_t num_space_modes, space_mode_extents) except? 0:
+    """Creates an empty operator term which is going to be a sum of tensor products of individual tensor operators, where each individual tensor operator within a product acts on disjoint quantum state modes (quantum degrees of freedom).
+
+    Args:
+        handle (intptr_t): Library handle.
+        num_space_modes (int32_t): Number of modes (degrees of freedom) defining the primary/dual tensor product space in which the operator term will act.
+        space_mode_extents (object): Extents of the modes (degrees of freedom) defining the primary/dual tensor product space in which the operator term will act. It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``int64_t``.
+
+
+    Returns:
+        intptr_t: Operator term.
+
+    .. seealso:: `cudensitymatCreateOperatorTerm`
+    """
+    cdef nullable_unique_ptr[ vector[int64_t] ] _space_mode_extents_
+    get_resource_ptr[int64_t](_space_mode_extents_, space_mode_extents, NULL)
+    cdef OperatorTerm operator_term
+    with nogil:
+        status = cudensitymatCreateOperatorTerm(handle, num_space_modes, (_space_mode_extents_.data()), &operator_term)
+    check_status(status)
+    return operator_term
+
+
+cpdef destroy_operator_term(intptr_t operator_term):
+    """Destroys an operator term.
+
+    Args:
+        operator_term (intptr_t): Operator term.
+
+    .. seealso:: `cudensitymatDestroyOperatorTerm`
+    """
+    with nogil:
+        status = cudensitymatDestroyOperatorTerm(operator_term)
+    check_status(status)
+
+
+cpdef intptr_t create_operator(intptr_t handle, int32_t num_space_modes, space_mode_extents) except? 0:
+    """Creates an empty operator which is going to be a collection of operator terms.
+
+    Args:
+        handle (intptr_t): Library handle.
+        num_space_modes (int32_t): Number of modes (degrees of freedom) defining the primary/dual tensor product space in which the operator will act.
+        space_mode_extents (object): Extents of the modes (degrees of freedom) defining the primary/dual tensor product space in which the operator will act. It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``int64_t``.
+
+
+    Returns:
+        intptr_t: Operator.
+
+    .. seealso:: `cudensitymatCreateOperator`
+    """
+    cdef nullable_unique_ptr[ vector[int64_t] ] _space_mode_extents_
+    get_resource_ptr[int64_t](_space_mode_extents_, space_mode_extents, NULL)
+    cdef Operator superoperator
+    with nogil:
+        status = cudensitymatCreateOperator(handle, num_space_modes, (_space_mode_extents_.data()), &superoperator)
+    check_status(status)
+    return superoperator
+
+
+cpdef destroy_operator(intptr_t superoperator):
+    """Destroys an operator.
+
+    Args:
+        superoperator (intptr_t): Operator.
+
+    .. seealso:: `cudensitymatDestroyOperator`
+    """
+    with nogil:
+        status = cudensitymatDestroyOperator(superoperator)
+    check_status(status)
+
+
+cpdef operator_prepare_action(intptr_t handle, intptr_t superoperator, intptr_t state_in, intptr_t state_out, int compute_type, size_t workspace_size_limit, intptr_t workspace, intptr_t stream):
+    """Prepares the operator for an action on a quantum state.
+
+    Args:
+        handle (intptr_t): Library handle.
+        superoperator (intptr_t): Operator.
+        state_in (intptr_t): Representative input quantum state on which the operator is supposed to act. The actual state acted on during computation may be different, but it has to be of the same shape, kind, and factorization structure (topology, bond dimensions, etc.).
+        state_out (intptr_t): Representative output quantum state produced by the action of the operator on the input quantum state. The actual state produced during computation may be different, but it has to be of the same shape, kind, and factorization structure (topology, bond dimensions, etc.).
+        compute_type (ComputeType): Desired compute type.
+        workspace_size_limit (size_t): Workspace buffer size limit (bytes).
+        workspace (intptr_t): Empty workspace descriptor on entrance. The workspace size required for the computation will be set on exit.
+        stream (intptr_t): CUDA stream.
+
+    .. seealso:: `cudensitymatOperatorPrepareAction`
+    """
+    with nogil:
+        status = cudensitymatOperatorPrepareAction(handle, superoperator, state_in, state_out, <_ComputeType>compute_type, workspace_size_limit, workspace, stream)
+    check_status(status)
+
+
+cpdef operator_compute_action(intptr_t handle, intptr_t superoperator, double time, int32_t num_params, params, intptr_t state_in, intptr_t state_out, intptr_t workspace, intptr_t stream):
+    """Computes the action of the operator on a given input quantum state, accumulating the result in the output quantum state (accumulative action).
+
+    Args:
+        handle (intptr_t): Library handle.
+        superoperator (intptr_t): Operator.
+        time (double): Time value.
+        num_params (int32_t): Number of variable parameters defined by the user.
+        params (object): Variable parameters defined by the user. It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``float``.
+
+        state_in (intptr_t): Input quantum state (or a batch of input quantum states).
+        state_out (intptr_t): Updated resulting quantum state which accumulates operator action on the input quantum state.
+        workspace (intptr_t): Allocated workspace descriptor.
+        stream (intptr_t): CUDA stream.
+
+    .. seealso:: `cudensitymatOperatorComputeAction`
+    """
+    cdef nullable_unique_ptr[ vector[double] ] _params_
+    get_resource_ptr[double](_params_, params, NULL)
+    with nogil:
+        status = cudensitymatOperatorComputeAction(handle, superoperator, time, num_params, (_params_.data()), state_in, state_out, workspace, stream)
+    check_status(status)
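+
+
+# A sketch of the prepare/compute workflow documented above (illustrative only;
+# `op`, `rho_in`, `rho_out`, `handle`, `cudm`, `cp` and `stream` are assumed to
+# exist as in the earlier sketches, and the Python enum member names below
+# mirror the C constants, e.g. CUDENSITYMAT_COMPUTE_64F, as an assumption):
+#
+#     ws = cudm.create_workspace(handle)
+#     cudm.operator_prepare_action(handle, op, rho_in, rho_out,
+#                                  cudm.ComputeType.COMPUTE_64F,
+#                                  1024**3, ws, stream)  # 1 GiB size limit
+#     size = cudm.workspace_get_memory_size(
+#         handle, ws, cudm.Memspace.DEVICE, cudm.WorkspaceKind.WORKSPACE_SCRATCH)
+#     buf = cp.cuda.alloc(size)
+#     cudm.workspace_set_memory(handle, ws, cudm.Memspace.DEVICE,
+#                               cudm.WorkspaceKind.WORKSPACE_SCRATCH,
+#                               buf.ptr, size)
+#     # rho_out += op(t=0.0) acting on rho_in (no user parameters):
+#     cudm.operator_compute_action(handle, op, 0.0, 0, [], rho_in, rho_out,
+#                                  ws, stream)
+#     cudm.destroy_workspace(ws)
+
+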
+cpdef intptr_t create_operator_action(intptr_t handle, int32_t num_operators, operators) except? 0:
+    """Creates an action descriptor for one or more operators, thus defining an aggregate action of the operator(s) on a set of input quantum states compliant with the operator domains, where all input quantum states can also be batched.
+
+    Args:
+        handle (intptr_t): Library handle.
+        num_operators (int32_t): Number of operators involved (number of operator-state products).
+        operators (object): Constituting operator(s) with the same domain of action. Some of the operators may be set to NULL to represent zero action on a specific input quantum state. It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of :class:`int`\s (as pointer addresses).
+
+
+    Returns:
+        intptr_t: Operator action.
+
+    .. seealso:: `cudensitymatCreateOperatorAction`
+    """
+    cdef nullable_unique_ptr[ vector[Operator*] ] _operators_
+    get_resource_ptrs[Operator](_operators_, operators, NULL)
+    cdef OperatorAction operator_action
+    with nogil:
+        status = cudensitymatCreateOperatorAction(handle, num_operators, (_operators_.data()), &operator_action)
+    check_status(status)
+    return operator_action
+
+
+cpdef destroy_operator_action(intptr_t operator_action):
+    """Destroys the operator action descriptor.
+
+    Args:
+        operator_action (intptr_t): Operator action.
+
+    .. seealso:: `cudensitymatDestroyOperatorAction`
+    """
+    with nogil:
+        status = cudensitymatDestroyOperatorAction(operator_action)
+    check_status(status)
+
+
+cpdef operator_action_prepare(intptr_t handle, intptr_t operator_action, state_in, intptr_t state_out, int compute_type, size_t workspace_size_limit, intptr_t workspace, intptr_t stream):
+    """Prepares the (aggregate) operator(s) action for computation.
+
+    Args:
+        handle (intptr_t): Library handle.
+        operator_action (intptr_t): Operator(s) action specification.
+        state_in (object): Input quantum state(s) for all operator(s) defining the current Operator Action. Each input quantum state can be a batch of quantum states itself (with the same batch dimension). It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of :class:`int`\s (as pointer addresses).
+
+        state_out (intptr_t): Updated output quantum state (or a batch) which accumulates the (aggregate) operator(s) action on all input quantum state(s).
+        compute_type (ComputeType): Desired compute type.
+        workspace_size_limit (size_t): Workspace buffer size limit (bytes).
+        workspace (intptr_t): Empty workspace descriptor on entrance. The workspace size required for the computation will be set on exit.
+        stream (intptr_t): CUDA stream.
+
+    .. seealso:: `cudensitymatOperatorActionPrepare`
+    """
+    cdef nullable_unique_ptr[ vector[State*] ] _state_in_
+    get_resource_ptrs[State](_state_in_, state_in, NULL)
+    with nogil:
+        status = cudensitymatOperatorActionPrepare(handle, operator_action, (_state_in_.data()), state_out, <_ComputeType>compute_type, workspace_size_limit, workspace, stream)
+    check_status(status)
+
+
+cpdef operator_action_compute(intptr_t handle, intptr_t operator_action, double time, int32_t num_params, params, state_in, intptr_t state_out, intptr_t workspace, intptr_t stream):
+    """Executes the action of one or more operators constituting the aggregate operator(s) action on the same number of input quantum states, accumulating the results into a single output quantum state.
+
+    Args:
+        handle (intptr_t): Library handle.
+        operator_action (intptr_t): Operator(s) action.
+        time (double): Time value.
+        num_params (int32_t): Number of variable parameters defined by the user.
+        params (object): Variable parameters defined by the user. It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``float``.
+
+        state_in (object): Input quantum state(s). Each input quantum state can be a batch of quantum states, in general. It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of :class:`int`\s (as pointer addresses).
+ + state_out (intptr_t): Updated output quantum state which accumulates operator action(s) on all input quantum state(s). + workspace (intptr_t): Allocated workspace descriptor. + stream (intptr_t): CUDA stream. + + .. seealso:: `cudensitymatOperatorActionCompute` + """ + cdef nullable_unique_ptr[ vector[double] ] _params_ + get_resource_ptr[double](_params_, params, NULL) + cdef nullable_unique_ptr[ vector[State*] ] _state_in_ + get_resource_ptrs[State](_state_in_, state_in, NULL) + with nogil: + status = cudensitymatOperatorActionCompute(handle, operator_action, time, num_params, (_params_.data()), (_state_in_.data()), state_out, workspace, stream) + check_status(status) + + +cpdef intptr_t create_expectation(intptr_t handle, intptr_t superoperator) except? 0: + """Creates the operator expectation value computation object. + + Args: + handle (intptr_t): Library handle. + superoperator (intptr_t): Operator. + + Returns: + intptr_t: Expectation value object. + + .. seealso:: `cudensitymatCreateExpectation` + """ + cdef Expectation expectation + with nogil: + status = cudensitymatCreateExpectation(handle, superoperator, &expectation) + check_status(status) + return expectation + + +cpdef destroy_expectation(intptr_t expectation): + """Destroys an expectation value object. + + Args: + expectation (intptr_t): Expectation value object. + + .. seealso:: `cudensitymatDestroyExpectation` + """ + with nogil: + status = cudensitymatDestroyExpectation(expectation) + check_status(status) + + +cpdef expectation_prepare(intptr_t handle, intptr_t expectation, intptr_t state, int compute_type, size_t workspace_size_limit, intptr_t workspace, intptr_t stream): + """Prepares the expectation value object for computation. + + Args: + handle (intptr_t): Library handle. + expectation (intptr_t): Expectation value object. + state (intptr_t): Quantum state (or a batch of quantum states). + compute_type (ComputeType): Desired compute type. + workspace_size_limit (size_t): Workspace buffer size limit (bytes). + workspace (intptr_t): Empty workspace descriptor on entrance. The workspace size required for the computation will be set on exit. + stream (intptr_t): CUDA stream. + + .. seealso:: `cudensitymatExpectationPrepare` + """ + with nogil: + status = cudensitymatExpectationPrepare(handle, expectation, state, <_ComputeType>compute_type, workspace_size_limit, workspace, stream) + check_status(status) + + +cpdef expectation_compute(intptr_t handle, intptr_t expectation, double time, int32_t num_params, params, intptr_t state, intptr_t expectation_value, intptr_t workspace, intptr_t stream): + """Computes the operator expectation value(s) with respect to the given quantum state(s). + + Args: + handle (intptr_t): Library handle. + expectation (intptr_t): Expectation value object. + time (double): Specified time. + num_params (int32_t): Number of variable parameters defined by the user. + params (object): Variable parameters defined by the user. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``float``. + + state (intptr_t): Quantum state (or a batch of quantum states). + expectation_value (intptr_t): Pointer to the expectation value(s) vector storage in GPU-accessible RAM of the same data type as used by the state and operator. + workspace (intptr_t): Allocated workspace descriptor. + stream (intptr_t): CUDA stream. + + .. 
seealso:: `cudensitymatExpectationCompute` + """ + cdef nullable_unique_ptr[ vector[double] ] _params_ + get_resource_ptr[double](_params_, params, NULL) + with nogil: + status = cudensitymatExpectationCompute(handle, expectation, time, num_params, (_params_.data()), state, expectation_value, workspace, stream) + check_status(status) + + +cpdef intptr_t create_workspace(intptr_t handle) except? 0: + """Creates a workspace descriptor. + + Args: + handle (intptr_t): Library handle. + + Returns: + intptr_t: Workspace descriptor. + + .. seealso:: `cudensitymatCreateWorkspace` + """ + cdef WorkspaceDescriptor workspace_descr + with nogil: + status = cudensitymatCreateWorkspace(handle, &workspace_descr) + check_status(status) + return workspace_descr + + +cpdef destroy_workspace(intptr_t workspace_descr): + """Destroys a workspace descriptor. + + Args: + workspace_descr (intptr_t): Workspace descriptor. + + .. seealso:: `cudensitymatDestroyWorkspace` + """ + with nogil: + status = cudensitymatDestroyWorkspace(workspace_descr) + check_status(status) + + +cpdef size_t workspace_get_memory_size(intptr_t handle, intptr_t workspace_descr, int mem_space, int workspace_kind) except? -1: + """Queries the required workspace buffer size. + + Args: + handle (intptr_t): Library handle. + workspace_descr (intptr_t): Workspace descriptor. + mem_space (Memspace): Memory space. + workspace_kind (WorkspaceKind): Workspace kind. + + Returns: + size_t: Required workspace buffer size in bytes. + + .. seealso:: `cudensitymatWorkspaceGetMemorySize` + """ + cdef size_t memory_buffer_size + with nogil: + status = cudensitymatWorkspaceGetMemorySize(handle, workspace_descr, <_Memspace>mem_space, <_WorkspaceKind>workspace_kind, &memory_buffer_size) + check_status(status) + return memory_buffer_size + + +cpdef workspace_set_memory(intptr_t handle, intptr_t workspace_descr, int mem_space, int workspace_kind, intptr_t memory_buffer, size_t memory_buffer_size): + """Attaches memory to a workspace buffer. + + Args: + handle (intptr_t): Library handle. + workspace_descr (intptr_t): Workspace descriptor. + mem_space (Memspace): Memory space. + workspace_kind (WorkspaceKind): Workspace kind. + memory_buffer (intptr_t): Pointer to a user-owned memory buffer to be used by the specified workspace. + memory_buffer_size (size_t): Size of the provided memory buffer in bytes. + + .. seealso:: `cudensitymatWorkspaceSetMemory` + """ + with nogil: + status = cudensitymatWorkspaceSetMemory(handle, workspace_descr, <_Memspace>mem_space, <_WorkspaceKind>workspace_kind, memory_buffer, memory_buffer_size) + check_status(status) + + +cpdef tuple workspace_get_memory(intptr_t handle, intptr_t workspace_descr, int mem_space, int workspace_kind): + """Retrieves a workspace buffer. + + Args: + handle (intptr_t): Library handle. + workspace_descr (intptr_t): Workspace descriptor. + mem_space (Memspace): Memory space. + workspace_kind (WorkspaceKind): Workspace kind. + + Returns: + A 2-tuple containing: + + - intptr_t: Pointer to a user-owned memory buffer used by the specified workspace. + - size_t: Size of the memory buffer in bytes. + + .. 
seealso:: `cudensitymatWorkspaceGetMemory`
+    """
+    cdef void* memory_buffer
+    cdef size_t memory_buffer_size
+    with nogil:
+        status = cudensitymatWorkspaceGetMemory(handle, workspace_descr, <_Memspace>mem_space, <_WorkspaceKind>workspace_kind, &memory_buffer, &memory_buffer_size)
+    check_status(status)
+    return (<intptr_t>memory_buffer, memory_buffer_size)
+
+###############################################################################
+# Handwritten functions
+###############################################################################
+
+cpdef tuple state_get_component_storage_size(intptr_t handle, intptr_t state, int32_t num_state_components):
+    """Queries the storage size (in bytes) for each component (tensor) constituting the quantum state representation (on the current process in multi-process runs).
+
+    Args:
+        handle (intptr_t): Library handle.
+        state (intptr_t): Quantum state (or a batch of quantum states).
+        num_state_components (int32_t): Number of components (tensors) in the quantum state representation (on the current process).
+
+    Returns:
+        object: Storage size (bytes) for each component (tensor) constituting the quantum state representation (on the current process).
+
+    .. seealso:: `cudensitymatStateGetComponentStorageSize`
+    """
+    cdef vector[size_t] _component_buffer_size_
+    _component_buffer_size_.resize(num_state_components)
+    with nogil:
+        status = cudensitymatStateGetComponentStorageSize(handle, state, num_state_components, _component_buffer_size_.data())
+    check_status(status)
+
+    # NOTE: A generator expression passed to tuple() did not compile here, so
+    # build a Python list first and then convert it to a tuple.
+    component_buffer_size = []
+    for i in range(num_state_components):
+        component_buffer_size.append(_component_buffer_size_[i])
+    return tuple(component_buffer_size)
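+
+
+# A sketch of the expectation-value workflow using the workspace API above
+# (illustrative only; `op`, `rho`, `handle`, `cudm`, `cp` and `stream` are
+# assumed to exist as in the earlier sketches, the state/operator use
+# complex128 data, and the enum member names mirror the C constants as an
+# assumption):
+#
+#     expec = cudm.create_expectation(handle, op)
+#     ws = cudm.create_workspace(handle)
+#     cudm.expectation_prepare(handle, expec, rho, cudm.ComputeType.COMPUTE_64F,
+#                              1024**3, ws, stream)
+#     size = cudm.workspace_get_memory_size(
+#         handle, ws, cudm.Memspace.DEVICE, cudm.WorkspaceKind.WORKSPACE_SCRATCH)
+#     buf = cp.cuda.alloc(size)
+#     cudm.workspace_set_memory(handle, ws, cudm.Memspace.DEVICE,
+#                               cudm.WorkspaceKind.WORKSPACE_SCRATCH,
+#                               buf.ptr, size)
+#     out = cp.zeros(1, dtype=cp.complex128)
+#     cudm.expectation_compute(handle, expec, 0.0, 0, [], rho, out.data.ptr,
+#                              ws, stream)
+#     cudm.destroy_expectation(expec)
+
+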
+cpdef intptr_t create_elementary_operator(intptr_t handle, int32_t num_space_modes, space_mode_extents, int sparsity, int32_t num_diagonals, diagonal_offsets, int data_type, intptr_t tensor_data, tensor_callback) except? 0:
+    """Creates an elementary tensor operator acting on a given number of quantum state modes (aka space modes).
+
+    Args:
+        handle (intptr_t): Library handle.
+        num_space_modes (int32_t): Number of the (state) space modes acted on.
+        space_mode_extents (object): Extents of the (state) space modes acted on. It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``int64_t``.
+
+        sparsity (ElementaryOperatorSparsity): Tensor operator sparsity defining the storage scheme.
+        num_diagonals (int32_t): For multi-diagonal tensor operator matrices, specifies the total number of non-zero diagonals.
+        diagonal_offsets (object): Offsets of the non-zero diagonals (for example, the main diagonal has offset 0, the diagonal right above the main diagonal has offset +1, the diagonal right below the main diagonal has offset -1, and so on). It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``int32_t``.
+
+        data_type (int): Tensor operator data type.
+        tensor_data (intptr_t): GPU-accessible pointer to the tensor operator elements storage.
+        tensor_callback (object): Optional user-defined tensor callback function which can be called later to fill in the tensor elements in the provided storage, or NULL.
+
+    Returns:
+        intptr_t: Elementary tensor operator.
+
+    .. seealso:: `cudensitymatCreateElementaryOperator`
+    """
+    cdef nullable_unique_ptr[ vector[int64_t] ] _space_mode_extents_
+    get_resource_ptr[int64_t](_space_mode_extents_, space_mode_extents, NULL)
+    cdef nullable_unique_ptr[ vector[int32_t] ] _diagonal_offsets_
+    get_resource_ptr[int32_t](_diagonal_offsets_, diagonal_offsets, NULL)
+    cdef ElementaryOperator elem_operator
+
+    cdef cudensitymatWrappedTensorCallback_t wrapped_tensor_callback
+    if tensor_callback is not None:
+        wrapped_tensor_callback.callback = <cudensitymatTensorCallback_t><void*>(tensor_callback)
+    else:
+        wrapped_tensor_callback.callback = NULL
+    wrapped_tensor_callback.wrapper = <void*>tensor_callback_wrapper
+
+    with nogil:
+        status = cudensitymatCreateElementaryOperator(handle, num_space_modes, (_space_mode_extents_.data()), <_ElementaryOperatorSparsity>sparsity, num_diagonals, (_diagonal_offsets_.data()), data_type, tensor_data, wrapped_tensor_callback, &elem_operator)
+    check_status(status)
+    return elem_operator
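+
+
+# A sketch of creating a dense one-mode elementary operator from a GPU array
+# (illustrative only; the Pauli-X matrix is a stand-in, 5 == CUDA_C_64F, no
+# tensor callback is used, and the sparsity enum member name mirrors
+# CUDENSITYMAT_OPERATOR_SPARSITY_NONE as an assumption):
+#
+#     sigma_x = cp.array([[0, 1], [1, 0]], dtype=cp.complex128, order='F')
+#     elem_op = cudm.create_elementary_operator(
+#         handle, 1, [2],
+#         cudm.ElementaryOperatorSparsity.OPERATOR_SPARSITY_NONE,
+#         0, [], 5, sigma_x.data.ptr, None)
+
+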
+cpdef operator_term_append_elementary_product(intptr_t handle, intptr_t operator_term, int32_t num_elem_operators, elem_operators, state_modes_acted_on, mode_action_duality, coefficient, coefficient_callback):
+    """Appends a product of elementary tensor operators acting on quantum state modes to the operator term.
+
+    Args:
+        handle (intptr_t): Library handle.
+        operator_term (intptr_t): Operator term.
+        num_elem_operators (int32_t): Number of elementary tensor operators in the tensor operator product.
+        elem_operators (object): Elementary tensor operators constituting the tensor operator product. It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``ElementaryOperator``.
+
+        state_modes_acted_on (object): State modes acted on by the tensor operator product. This is a concatenated list of the state modes acted on by all constituting elementary tensor operators in the same order as they appear in the elem_operators argument. It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``int32_t``.
+
+        mode_action_duality (object): Duality status of each mode action, that is, whether the action applies to a ket mode of the quantum state (value 0) or a bra mode of the quantum state (value 1 or other non-zero). It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``int32_t``.
+
+        coefficient (complex): Constant complex scalar coefficient associated with the tensor operator product.
+        coefficient_callback (object): User-defined complex scalar callback function which can be called later to update the scalar coefficient associated with the tensor operator product, or NULL. The total coefficient associated with the tensor operator product is a product of the constant coefficient and the result of the scalar callback function, if defined.
+
+    .. seealso:: `cudensitymatOperatorTermAppendElementaryProduct`
+    """
+    cdef nullable_unique_ptr[ vector[ElementaryOperator*] ] _elem_operators_
+    get_resource_ptrs[ElementaryOperator](_elem_operators_, elem_operators, NULL)
+    cdef nullable_unique_ptr[ vector[int32_t] ] _state_modes_acted_on_
+    get_resource_ptr[int32_t](_state_modes_acted_on_, state_modes_acted_on, NULL)
+    cdef nullable_unique_ptr[ vector[int32_t] ] _mode_action_duality_
+    get_resource_ptr[int32_t](_mode_action_duality_, mode_action_duality, NULL)
+
+    cdef cuDoubleComplex _coefficient_
+    _coefficient_.x = coefficient.real
+    _coefficient_.y = coefficient.imag
+
+    cdef cudensitymatScalarCallback_t _coefficient_callback
+    if coefficient_callback:
+        _coefficient_callback = <cudensitymatScalarCallback_t><void*>(coefficient_callback)
+    else:
+        _coefficient_callback = NULL
+
+    cdef cudensitymatWrappedScalarCallback_t wrapped_coefficient_callback
+    wrapped_coefficient_callback.wrapper = <void*>scalar_callback_wrapper
+    wrapped_coefficient_callback.callback = _coefficient_callback
+
+    with nogil:
+        status = cudensitymatOperatorTermAppendElementaryProduct(handle, operator_term, num_elem_operators, (_elem_operators_.data()), (_state_modes_acted_on_.data()), (_mode_action_duality_.data()), _coefficient_, wrapped_coefficient_callback)
+    check_status(status)
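+
+
+# A sketch of assembling an operator term and a composite operator from the
+# elementary operator sketched above (illustrative only; a two-qubit space
+# [2, 2], ket-mode action only, and unit coefficients are assumed):
+#
+#     term = cudm.create_operator_term(handle, 2, [2, 2])
+#     cudm.operator_term_append_elementary_product(
+#         handle, term, 2, [elem_op, elem_op],  # X(0) * X(1)
+#         [0, 1],       # concatenated state modes acted on
+#         [0, 0],       # 0 == action on ket modes
+#         1.0 + 0.0j, None)
+#     op = cudm.create_operator(handle, 2, [2, 2])
+#     cudm.operator_append_term(handle, op, term, 0, 1.0 + 0.0j, None)
+
+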
+cpdef operator_term_append_general_product(intptr_t handle, intptr_t operator_term, int32_t num_elem_operators, num_operator_modes, operator_mode_extents, operator_mode_strides, state_modes_acted_on, mode_action_duality, int data_type, tensor_data, tensor_callbacks, coefficient, coefficient_callback):
+    """Appends a product of generic dense tensor operators acting on different quantum state modes to the operator term.
+
+    Args:
+        handle (intptr_t): Library handle.
+        operator_term (intptr_t): Operator term.
+        num_elem_operators (int32_t): Number of dense tensor operators in the given tensor operator product.
+        num_operator_modes (object): Number of modes in each tensor operator (twice the number of state modes it acts on). It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``int32_t``.
+
+        operator_mode_extents (object): Mode extents for each dense tensor operator. It can be:
+
+            - an :class:`int` as the pointer address to the nested sequence, or
+            - a Python sequence of :class:`int`\s, each of which is a pointer address
+              to a valid sequence, or
+            - a nested Python sequence of ``int64_t``.
+
+        operator_mode_strides (object): Mode strides for each dense tensor operator. If a specific element is set to NULL, the corresponding dense tensor operator will assume the default generalized column-wise storage strides. It can be:
+
+            - an :class:`int` as the pointer address to the nested sequence, or
+            - a Python sequence of :class:`int`\s, each of which is a pointer address
+              to a valid sequence, or
+            - a nested Python sequence of ``int64_t``.
+
+        state_modes_acted_on (object): State modes acted on by the tensor operator product. This is a concatenated list of the state modes acted on by all constituting dense tensor operators in the same order as they appear in the above arguments. It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``int32_t``.
+
+        mode_action_duality (object): Duality status of each mode action, whether the action applies to a ket mode of the quantum state (value 0) or a bra mode of the quantum state (value 1 or other non-zero). It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``int32_t``.
+
+        data_type (int): Data type (for all dense tensor operators).
+        tensor_data (object): GPU-accessible pointers to the elements of each dense tensor operator constituting the tensor operator product. It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``intptr_t``.
+
+        tensor_callbacks (object): User-defined tensor callback functions which can be called later to update the elements of each dense tensor operator (any of the callbacks can be NULL). It can be:
+
+            - an :class:`int` as the pointer address to the array, or
+            - a Python sequence of ``cudensitymatWrappedTensorCallback_t``.
+
+        coefficient (complex): Constant complex scalar coefficient associated with the tensor operator product.
+        coefficient_callback (object): User-defined complex scalar callback function which can be called later to update the scalar coefficient associated with the tensor operator product, or NULL. The total coefficient associated with the tensor operator product is a product of the constant coefficient and the result of the scalar callback function, if defined.
+
+    .. seealso:: `cudensitymatOperatorTermAppendGeneralProduct`
+    """
+    cdef nullable_unique_ptr[ vector[int32_t] ] _num_operator_modes_
+    get_resource_ptr[int32_t](_num_operator_modes_, num_operator_modes, NULL)
+    cdef nested_resource[ int64_t ] _operator_mode_extents_
+    get_nested_resource_ptr[int64_t](_operator_mode_extents_, operator_mode_extents, NULL)
+    cdef nested_resource[ int64_t ] _operator_mode_strides_
+    get_nested_resource_ptr[int64_t](_operator_mode_strides_, operator_mode_strides, NULL)
+    cdef nullable_unique_ptr[ vector[int32_t] ] _state_modes_acted_on_
+    get_resource_ptr[int32_t](_state_modes_acted_on_, state_modes_acted_on, NULL)
+    cdef nullable_unique_ptr[ vector[int32_t] ] _mode_action_duality_
+    get_resource_ptr[int32_t](_mode_action_duality_, mode_action_duality, NULL)
+    cdef nullable_unique_ptr[ vector[intptr_t] ] _tensor_data_
+    get_resource_ptr[intptr_t](_tensor_data_, tensor_data, NULL)
+
+    cdef vector[cudensitymatWrappedTensorCallback_t] _tensor_callbacks_
+    cdef cudensitymatWrappedTensorCallback_t wrapped_tensor_callback
+
+    for i in range(num_elem_operators):
+        if tensor_callbacks[i] is not None:
+            wrapped_tensor_callback.callback = <cudensitymatTensorCallback_t><void*>(tensor_callbacks[i])
+        else:
+            wrapped_tensor_callback.callback = NULL
+        wrapped_tensor_callback.wrapper = <void*>tensor_callback_wrapper
+        _tensor_callbacks_.push_back(wrapped_tensor_callback)
+
+    cdef cuDoubleComplex _coefficient_
+    _coefficient_.x = coefficient.real
+    _coefficient_.y = coefficient.imag
+
+    cdef cudensitymatScalarCallback_t _coefficient_callback
+    if coefficient_callback:
+        _coefficient_callback = <cudensitymatScalarCallback_t><void*>(coefficient_callback)
+    else:
+        _coefficient_callback = NULL
+
+    cdef cudensitymatWrappedScalarCallback_t wrapped_coefficient_callback
+    wrapped_coefficient_callback.wrapper = <void*>scalar_callback_wrapper
+    wrapped_coefficient_callback.callback = _coefficient_callback
+
+    with nogil:
+        status = cudensitymatOperatorTermAppendGeneralProduct(handle, operator_term, num_elem_operators, (_num_operator_modes_.data()), (_operator_mode_extents_.ptrs.data()), (_operator_mode_strides_.ptrs.data()), (_state_modes_acted_on_.data()), (_mode_action_duality_.data()), data_type, (_tensor_data_.data()), (_tensor_callbacks_.data()), _coefficient_, wrapped_coefficient_callback)
+    check_status(status)
+
+
+cpdef operator_append_term(intptr_t handle, intptr_t superoperator, intptr_t operator_term, int32_t duality, coefficient, coefficient_callback):
+    """Appends an operator term to the operator.
+
+    Args:
+        handle (intptr_t): Library handle.
+        superoperator (intptr_t): Operator.
+        operator_term (intptr_t): Operator term.
+        duality (int32_t): Duality status of the operator term action as a whole. If not zero, the duality status of each mode action inside the operator term will be flipped, that is, action from the left will be replaced by action from the right, and vice versa.
+        coefficient (complex): Constant complex scalar coefficient associated with the operator term.
+        coefficient_callback (object): User-defined complex scalar callback function which can be called later to update the scalar coefficient associated with the operator term, or NULL. The total coefficient associated with the operator term is a product of the constant coefficient and the result of the scalar callback function, if defined.
+
+    .. seealso:: `cudensitymatOperatorAppendTerm`
+    """
+    cdef cuDoubleComplex _coefficient_
+    _coefficient_.x = coefficient.real
+    _coefficient_.y = coefficient.imag
+
+    cdef cudensitymatScalarCallback_t _coefficient_callback
+    if coefficient_callback:
+        _coefficient_callback = <cudensitymatScalarCallback_t><void*>(coefficient_callback)
+    else:
+        _coefficient_callback = NULL
+
+    cdef cudensitymatWrappedScalarCallback_t wrapped_coefficient_callback
+    wrapped_coefficient_callback.wrapper = <void*>scalar_callback_wrapper
+    wrapped_coefficient_callback.callback = _coefficient_callback
+
+    with nogil:
+        status = cudensitymatOperatorAppendTerm(handle, superoperator, operator_term, duality, _coefficient_, wrapped_coefficient_callback)
+    check_status(status)
diff --git a/python/cuquantum/bindings/cycudensitymat.pxd b/python/cuquantum/bindings/cycudensitymat.pxd
new file mode 100644
index 0000000..04b19c2
--- /dev/null
+++ b/python/cuquantum/bindings/cycudensitymat.pxd
@@ -0,0 +1,211 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t
+from libc.stdio cimport FILE
+
+
+###############################################################################
+# Types (structs, enums, ...)
+###############################################################################
+
+cdef extern from *:
+    ctypedef void* cudaStream_t 'cudaStream_t'
+
+    ctypedef enum cudaDataType_t:
+        CUDA_R_32F = 0
+        CUDA_C_32F = 4
+        CUDA_R_64F = 1
+        CUDA_C_64F = 5
+    ctypedef cudaDataType_t cudaDataType 'cudaDataType'
+
+    ctypedef int libraryPropertyType_t 'libraryPropertyType_t'
+    ctypedef int libraryPropertyType 'libraryPropertyType'
+
+    ctypedef struct cuDoubleComplex:
+        double x
+        double y
+
+
+cdef extern from '<cudensitymat.h>' nogil:
+    # enums
+    ctypedef enum cudensitymatStatus_t:
+        CUDENSITYMAT_STATUS_SUCCESS
+        CUDENSITYMAT_STATUS_NOT_INITIALIZED
+        CUDENSITYMAT_STATUS_ALLOC_FAILED
+        CUDENSITYMAT_STATUS_INVALID_VALUE
+        CUDENSITYMAT_STATUS_ARCH_MISMATCH
+        CUDENSITYMAT_STATUS_EXECUTION_FAILED
+        CUDENSITYMAT_STATUS_INTERNAL_ERROR
+        CUDENSITYMAT_STATUS_NOT_SUPPORTED
+        CUDENSITYMAT_STATUS_CALLBACK_ERROR
+        CUDENSITYMAT_STATUS_CUBLAS_ERROR
+        CUDENSITYMAT_STATUS_CUDA_ERROR
+        CUDENSITYMAT_STATUS_INSUFFICIENT_WORKSPACE
+        CUDENSITYMAT_STATUS_INSUFFICIENT_DRIVER
+        CUDENSITYMAT_STATUS_IO_ERROR
+        CUDENSITYMAT_STATUS_CUTENSOR_VERSION_MISMATCH
+        CUDENSITYMAT_STATUS_NO_DEVICE_ALLOCATOR
+        CUDENSITYMAT_STATUS_CUTENSOR_ERROR
+        CUDENSITYMAT_STATUS_CUDMLVER_ERROR
+        CUDENSITYMAT_STATUS_DEVICE_ALLOCATOR_ERROR
+        CUDENSITYMAT_STATUS_DISTRIBUTED_FAILURE
+        CUDENSITYMAT_STATUS_INTERRUPTED
+        CUDENSITYMAT_STATUS_CUTENSORNET_ERROR
+
+    ctypedef enum cudensitymatComputeType_t:
+        CUDENSITYMAT_COMPUTE_64F
+        CUDENSITYMAT_COMPUTE_32F
+
+    ctypedef enum cudensitymatDistributedProvider_t:
+        CUDENSITYMAT_DISTRIBUTED_PROVIDER_NONE
+        CUDENSITYMAT_DISTRIBUTED_PROVIDER_MPI
+        CUDENSITYMAT_DISTRIBUTED_PROVIDER_NCCL
+        CUDENSITYMAT_DISTRIBUTED_PROVIDER_NVSHMEM
+
+    ctypedef enum cudensitymatStatePurity_t:
+        CUDENSITYMAT_STATE_PURITY_PURE
+        CUDENSITYMAT_STATE_PURITY_MIXED
+
+    ctypedef enum cudensitymatElementaryOperatorSparsity_t:
+        CUDENSITYMAT_OPERATOR_SPARSITY_NONE
+        CUDENSITYMAT_OPERATOR_SPARSITY_MULTIDIAGONAL
+
+    ctypedef enum cudensitymatMemspace_t:
+        CUDENSITYMAT_MEMSPACE_DEVICE
+        CUDENSITYMAT_MEMSPACE_HOST
+
+    ctypedef enum cudensitymatWorkspaceKind_t:
+        CUDENSITYMAT_WORKSPACE_SCRATCH
+
+    # types
+    ctypedef void* cudensitymatHandle_t 'cudensitymatHandle_t'
+    ctypedef void* cudensitymatState_t 'cudensitymatState_t'
+    ctypedef void* cudensitymatElementaryOperator_t 'cudensitymatElementaryOperator_t'
+    ctypedef void* cudensitymatOperatorTerm_t 'cudensitymatOperatorTerm_t'
+    ctypedef void* cudensitymatOperator_t 'cudensitymatOperator_t'
+    ctypedef void* cudensitymatOperatorAction_t 'cudensitymatOperatorAction_t'
+    ctypedef void* cudensitymatExpectation_t 'cudensitymatExpectation_t'
+    ctypedef void* cudensitymatWorkspaceDescriptor_t 'cudensitymatWorkspaceDescriptor_t'
+    ctypedef void* cudensitymatDistributedRequest_t 'cudensitymatDistributedRequest_t'
+    ctypedef int32_t (*cudensitymatScalarCallback_t 'cudensitymatScalarCallback_t')(
+        double time,
+        int32_t numParams,
+        const double params[],
+        cudaDataType_t dataType,
+        void* scalarStorage
+    )
+    ctypedef int32_t (*cudensitymatTensorCallback_t 'cudensitymatTensorCallback_t')(
+        cudensitymatElementaryOperatorSparsity_t sparsity,
+        int32_t numModes,
+        const int64_t modeExtents[],
+        const int32_t diagonalOffsets[],
+        double time,
+        int32_t numParams,
+        const double params[],
+        cudaDataType_t dataType,
+        void* tensorStorage
+    )
+    ctypedef void (*cudensitymatLoggerCallback_t 'cudensitymatLoggerCallback_t')(
+        int32_t logLevel,
+        const char* functionName,
+        const char* message
+    )
+    ctypedef void 
(*cudensitymatLoggerCallbackData_t 'cudensitymatLoggerCallbackData_t')( + int32_t logLevel, + const char* functionName, + const char* message, + void* userData + ) + ctypedef struct cudensitymatTimeRange_t 'cudensitymatTimeRange_t': + double timeStart + double timeFinish + double timeStep + int64_t numPoints + double* points + ctypedef struct cudensitymatDistributedCommunicator_t 'cudensitymatDistributedCommunicator_t': + void* commPtr + size_t commSize + ctypedef struct cudensitymatWrappedScalarCallback_t 'cudensitymatWrappedScalarCallback_t': + cudensitymatScalarCallback_t callback + void* wrapper + ctypedef struct cudensitymatWrappedTensorCallback_t 'cudensitymatWrappedTensorCallback_t': + cudensitymatTensorCallback_t callback + void* wrapper + ctypedef struct cudensitymatDistributedInterface_t 'cudensitymatDistributedInterface_t': + int version + int (*getNumRanks)(const cudensitymatDistributedCommunicator_t*, int32_t*) + int (*getNumRanksShared)(const cudensitymatDistributedCommunicator_t*, int32_t*) + int (*getProcRank)(const cudensitymatDistributedCommunicator_t*, int32_t*) + int (*barrier)(const cudensitymatDistributedCommunicator_t*) + int (*createRequest)(cudensitymatDistributedRequest_t*) + int (*destroyRequest)(cudensitymatDistributedRequest_t) + int (*waitRequest)(cudensitymatDistributedRequest_t) + int (*testRequest)(cudensitymatDistributedRequest_t, int32_t*) + int (*send)(const cudensitymatDistributedCommunicator_t*, const void*, int32_t, cudaDataType_t, int32_t, int32_t) + int (*sendAsync)(const cudensitymatDistributedCommunicator_t*, const void*, int32_t, cudaDataType_t, int32_t, int32_t, cudensitymatDistributedRequest_t) + int (*receive)(const cudensitymatDistributedCommunicator_t*, void*, int32_t, cudaDataType_t, int32_t, int32_t) + int (*receiveAsync)(const cudensitymatDistributedCommunicator_t*, void*, int32_t, cudaDataType_t, int32_t, int32_t, cudensitymatDistributedRequest_t) + int (*bcast)(const cudensitymatDistributedCommunicator_t*, void*, int32_t, cudaDataType_t, int32_t) + int (*allreduce)(const cudensitymatDistributedCommunicator_t*, const void*, void*, int32_t, cudaDataType_t) + int (*allreduceInPlace)(const cudensitymatDistributedCommunicator_t*, void*, int32_t, cudaDataType_t) + int (*allreduceInPlaceMin)(const cudensitymatDistributedCommunicator_t*, void*, int32_t, cudaDataType_t) + int (*allreduceDoubleIntMinloc)(const cudensitymatDistributedCommunicator_t*, const void*, void*) + int (*allgather)(const cudensitymatDistributedCommunicator_t*, const void*, void*, int32_t, cudaDataType_t) + + # constants + const int CUDENSITYMAT_ALLOCATOR_NAME_LEN + const int CUDENSITYMAT_MAJOR + const int CUDENSITYMAT_MINOR + const int CUDENSITYMAT_PATCH + const int CUDENSITYMAT_VERSION + + +############################################################################### +# Functions +############################################################################### + +cdef cudensitymatStatus_t cudensitymatCreate(cudensitymatHandle_t* handle) except* nogil +cdef cudensitymatStatus_t cudensitymatDestroy(cudensitymatHandle_t handle) except* nogil +cdef cudensitymatStatus_t cudensitymatResetDistributedConfiguration(cudensitymatHandle_t handle, cudensitymatDistributedProvider_t provider, const void* commPtr, size_t commSize) except* nogil +cdef cudensitymatStatus_t cudensitymatGetNumRanks(const cudensitymatHandle_t handle, int32_t* numRanks) except* nogil +cdef cudensitymatStatus_t cudensitymatGetProcRank(const cudensitymatHandle_t handle, int32_t* procRank) except* nogil +cdef 
cudensitymatStatus_t cudensitymatResetRandomSeed(cudensitymatHandle_t handle, int32_t randomSeed) except* nogil +cdef cudensitymatStatus_t cudensitymatCreateState(const cudensitymatHandle_t handle, cudensitymatStatePurity_t purity, int32_t numSpaceModes, const int64_t spaceModeExtents[], int64_t batchSize, cudaDataType_t dataType, cudensitymatState_t* state) except* nogil +cdef cudensitymatStatus_t cudensitymatDestroyState(cudensitymatState_t state) except* nogil +cdef cudensitymatStatus_t cudensitymatStateGetNumComponents(const cudensitymatHandle_t handle, const cudensitymatState_t state, int32_t* numStateComponents) except* nogil +cdef cudensitymatStatus_t cudensitymatStateGetComponentStorageSize(const cudensitymatHandle_t handle, const cudensitymatState_t state, int32_t numStateComponents, size_t componentBufferSize[]) except* nogil +cdef cudensitymatStatus_t cudensitymatStateAttachComponentStorage(const cudensitymatHandle_t handle, cudensitymatState_t state, int32_t numStateComponents, void* componentBuffer[], const size_t componentBufferSize[]) except* nogil +cdef cudensitymatStatus_t cudensitymatStateGetComponentNumModes(const cudensitymatHandle_t handle, cudensitymatState_t state, int32_t stateComponentLocalId, int32_t* stateComponentGlobalId, int32_t* stateComponentNumModes, int32_t* batchModeLocation) except* nogil +cdef cudensitymatStatus_t cudensitymatStateGetComponentInfo(const cudensitymatHandle_t handle, cudensitymatState_t state, int32_t stateComponentLocalId, int32_t* stateComponentGlobalId, int32_t* stateComponentNumModes, int64_t stateComponentModeExtents[], int64_t stateComponentModeOffsets[]) except* nogil +cdef cudensitymatStatus_t cudensitymatStateInitializeZero(const cudensitymatHandle_t handle, cudensitymatState_t state, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t cudensitymatStateComputeScaling(const cudensitymatHandle_t handle, cudensitymatState_t state, const void* scalingFactors, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t cudensitymatStateComputeNorm(const cudensitymatHandle_t handle, const cudensitymatState_t state, void* norm, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t cudensitymatStateComputeTrace(const cudensitymatHandle_t handle, const cudensitymatState_t state, void* trace, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t cudensitymatStateComputeAccumulation(const cudensitymatHandle_t handle, const cudensitymatState_t stateIn, cudensitymatState_t stateOut, const void* scalingFactors, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t cudensitymatStateComputeInnerProduct(const cudensitymatHandle_t handle, const cudensitymatState_t stateLeft, const cudensitymatState_t stateRight, void* innerProduct, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t cudensitymatCreateElementaryOperator(const cudensitymatHandle_t handle, int32_t numSpaceModes, const int64_t spaceModeExtents[], cudensitymatElementaryOperatorSparsity_t sparsity, int32_t numDiagonals, const int32_t diagonalOffsets[], cudaDataType_t dataType, void* tensorData, cudensitymatWrappedTensorCallback_t tensorCallback, cudensitymatElementaryOperator_t* elemOperator) except* nogil +cdef cudensitymatStatus_t cudensitymatDestroyElementaryOperator(cudensitymatElementaryOperator_t elemOperator) except* nogil +cdef cudensitymatStatus_t cudensitymatCreateOperatorTerm(const cudensitymatHandle_t handle, int32_t numSpaceModes, const int64_t spaceModeExtents[], cudensitymatOperatorTerm_t* operatorTerm) except* nogil +cdef 
cudensitymatStatus_t cudensitymatDestroyOperatorTerm(cudensitymatOperatorTerm_t operatorTerm) except* nogil +cdef cudensitymatStatus_t cudensitymatOperatorTermAppendElementaryProduct(const cudensitymatHandle_t handle, cudensitymatOperatorTerm_t operatorTerm, int32_t numElemOperators, const cudensitymatElementaryOperator_t elemOperators[], const int32_t stateModesActedOn[], const int32_t modeActionDuality[], cuDoubleComplex coefficient, cudensitymatWrappedScalarCallback_t coefficientCallback) except* nogil +cdef cudensitymatStatus_t cudensitymatOperatorTermAppendGeneralProduct(const cudensitymatHandle_t handle, cudensitymatOperatorTerm_t operatorTerm, int32_t numElemOperators, const int32_t numOperatorModes[], const int64_t* operatorModeExtents[], const int64_t* operatorModeStrides[], const int32_t stateModesActedOn[], const int32_t modeActionDuality[], cudaDataType_t dataType, void* tensorData[], cudensitymatWrappedTensorCallback_t tensorCallbacks[], cuDoubleComplex coefficient, cudensitymatWrappedScalarCallback_t coefficientCallback) except* nogil +cdef cudensitymatStatus_t cudensitymatCreateOperator(const cudensitymatHandle_t handle, int32_t numSpaceModes, const int64_t spaceModeExtents[], cudensitymatOperator_t* superoperator) except* nogil +cdef cudensitymatStatus_t cudensitymatDestroyOperator(cudensitymatOperator_t superoperator) except* nogil +cdef cudensitymatStatus_t cudensitymatOperatorAppendTerm(const cudensitymatHandle_t handle, cudensitymatOperator_t superoperator, cudensitymatOperatorTerm_t operatorTerm, int32_t duality, cuDoubleComplex coefficient, cudensitymatWrappedScalarCallback_t coefficientCallback) except* nogil +cdef cudensitymatStatus_t cudensitymatOperatorPrepareAction(const cudensitymatHandle_t handle, const cudensitymatOperator_t superoperator, const cudensitymatState_t stateIn, const cudensitymatState_t stateOut, cudensitymatComputeType_t computeType, size_t workspaceSizeLimit, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t cudensitymatOperatorComputeAction(const cudensitymatHandle_t handle, const cudensitymatOperator_t superoperator, double time, int32_t numParams, const double params[], const cudensitymatState_t stateIn, cudensitymatState_t stateOut, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t cudensitymatCreateOperatorAction(const cudensitymatHandle_t handle, int32_t numOperators, cudensitymatOperator_t operators[], cudensitymatOperatorAction_t* operatorAction) except* nogil +cdef cudensitymatStatus_t cudensitymatDestroyOperatorAction(cudensitymatOperatorAction_t operatorAction) except* nogil +cdef cudensitymatStatus_t cudensitymatOperatorActionPrepare(const cudensitymatHandle_t handle, cudensitymatOperatorAction_t operatorAction, const cudensitymatState_t stateIn[], const cudensitymatState_t stateOut, cudensitymatComputeType_t computeType, size_t workspaceSizeLimit, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t cudensitymatOperatorActionCompute(const cudensitymatHandle_t handle, cudensitymatOperatorAction_t operatorAction, double time, int32_t numParams, const double params[], const cudensitymatState_t stateIn[], cudensitymatState_t stateOut, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t cudensitymatCreateExpectation(const cudensitymatHandle_t handle, cudensitymatOperator_t superoperator, cudensitymatExpectation_t* 
expectation) except* nogil +cdef cudensitymatStatus_t cudensitymatDestroyExpectation(cudensitymatExpectation_t expectation) except* nogil +cdef cudensitymatStatus_t cudensitymatExpectationPrepare(const cudensitymatHandle_t handle, cudensitymatExpectation_t expectation, const cudensitymatState_t state, cudensitymatComputeType_t computeType, size_t workspaceSizeLimit, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t cudensitymatExpectationCompute(const cudensitymatHandle_t handle, cudensitymatExpectation_t expectation, double time, int32_t numParams, const double params[], const cudensitymatState_t state, void* expectationValue, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil +cdef cudensitymatStatus_t cudensitymatCreateWorkspace(const cudensitymatHandle_t handle, cudensitymatWorkspaceDescriptor_t* workspaceDescr) except* nogil +cdef cudensitymatStatus_t cudensitymatDestroyWorkspace(cudensitymatWorkspaceDescriptor_t workspaceDescr) except* nogil +cdef cudensitymatStatus_t cudensitymatWorkspaceGetMemorySize(const cudensitymatHandle_t handle, const cudensitymatWorkspaceDescriptor_t workspaceDescr, cudensitymatMemspace_t memSpace, cudensitymatWorkspaceKind_t workspaceKind, size_t* memoryBufferSize) except* nogil +cdef cudensitymatStatus_t cudensitymatWorkspaceSetMemory(const cudensitymatHandle_t handle, cudensitymatWorkspaceDescriptor_t workspaceDescr, cudensitymatMemspace_t memSpace, cudensitymatWorkspaceKind_t workspaceKind, void* memoryBuffer, size_t memoryBufferSize) except* nogil +cdef cudensitymatStatus_t cudensitymatWorkspaceGetMemory(const cudensitymatHandle_t handle, const cudensitymatWorkspaceDescriptor_t workspaceDescr, cudensitymatMemspace_t memSpace, cudensitymatWorkspaceKind_t workspaceKind, void** memoryBuffer, size_t* memoryBufferSize) except* nogil diff --git a/python/cuquantum/bindings/cycudensitymat.pyx b/python/cuquantum/bindings/cycudensitymat.pyx new file mode 100644 index 0000000..da0c75a --- /dev/null +++ b/python/cuquantum/bindings/cycudensitymat.pyx @@ -0,0 +1,181 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +from ._internal cimport cudensitymat as _cudensitymat + + +############################################################################### +# Wrapper functions +############################################################################### + +cdef cudensitymatStatus_t cudensitymatCreate(cudensitymatHandle_t* handle) except* nogil: + return _cudensitymat._cudensitymatCreate(handle) + + +cdef cudensitymatStatus_t cudensitymatDestroy(cudensitymatHandle_t handle) except* nogil: + return _cudensitymat._cudensitymatDestroy(handle) + + +cdef cudensitymatStatus_t cudensitymatResetDistributedConfiguration(cudensitymatHandle_t handle, cudensitymatDistributedProvider_t provider, const void* commPtr, size_t commSize) except* nogil: + return _cudensitymat._cudensitymatResetDistributedConfiguration(handle, provider, commPtr, commSize) + + +cdef cudensitymatStatus_t cudensitymatGetNumRanks(const cudensitymatHandle_t handle, int32_t* numRanks) except* nogil: + return _cudensitymat._cudensitymatGetNumRanks(handle, numRanks) + + +cdef cudensitymatStatus_t cudensitymatGetProcRank(const cudensitymatHandle_t handle, int32_t* procRank) except* nogil: + return _cudensitymat._cudensitymatGetProcRank(handle, procRank) + + +cdef cudensitymatStatus_t cudensitymatResetRandomSeed(cudensitymatHandle_t handle, int32_t randomSeed) except* nogil: + 
return _cudensitymat._cudensitymatResetRandomSeed(handle, randomSeed) + + +cdef cudensitymatStatus_t cudensitymatCreateState(const cudensitymatHandle_t handle, cudensitymatStatePurity_t purity, int32_t numSpaceModes, const int64_t spaceModeExtents[], int64_t batchSize, cudaDataType_t dataType, cudensitymatState_t* state) except* nogil: + return _cudensitymat._cudensitymatCreateState(handle, purity, numSpaceModes, spaceModeExtents, batchSize, dataType, state) + + +cdef cudensitymatStatus_t cudensitymatDestroyState(cudensitymatState_t state) except* nogil: + return _cudensitymat._cudensitymatDestroyState(state) + + +cdef cudensitymatStatus_t cudensitymatStateGetNumComponents(const cudensitymatHandle_t handle, const cudensitymatState_t state, int32_t* numStateComponents) except* nogil: + return _cudensitymat._cudensitymatStateGetNumComponents(handle, state, numStateComponents) + + +cdef cudensitymatStatus_t cudensitymatStateGetComponentStorageSize(const cudensitymatHandle_t handle, const cudensitymatState_t state, int32_t numStateComponents, size_t componentBufferSize[]) except* nogil: + return _cudensitymat._cudensitymatStateGetComponentStorageSize(handle, state, numStateComponents, componentBufferSize) + + +cdef cudensitymatStatus_t cudensitymatStateAttachComponentStorage(const cudensitymatHandle_t handle, cudensitymatState_t state, int32_t numStateComponents, void* componentBuffer[], const size_t componentBufferSize[]) except* nogil: + return _cudensitymat._cudensitymatStateAttachComponentStorage(handle, state, numStateComponents, componentBuffer, componentBufferSize) + + +cdef cudensitymatStatus_t cudensitymatStateGetComponentNumModes(const cudensitymatHandle_t handle, cudensitymatState_t state, int32_t stateComponentLocalId, int32_t* stateComponentGlobalId, int32_t* stateComponentNumModes, int32_t* batchModeLocation) except* nogil: + return _cudensitymat._cudensitymatStateGetComponentNumModes(handle, state, stateComponentLocalId, stateComponentGlobalId, stateComponentNumModes, batchModeLocation) + + +cdef cudensitymatStatus_t cudensitymatStateGetComponentInfo(const cudensitymatHandle_t handle, cudensitymatState_t state, int32_t stateComponentLocalId, int32_t* stateComponentGlobalId, int32_t* stateComponentNumModes, int64_t stateComponentModeExtents[], int64_t stateComponentModeOffsets[]) except* nogil: + return _cudensitymat._cudensitymatStateGetComponentInfo(handle, state, stateComponentLocalId, stateComponentGlobalId, stateComponentNumModes, stateComponentModeExtents, stateComponentModeOffsets) + + +cdef cudensitymatStatus_t cudensitymatStateInitializeZero(const cudensitymatHandle_t handle, cudensitymatState_t state, cudaStream_t stream) except* nogil: + return _cudensitymat._cudensitymatStateInitializeZero(handle, state, stream) + + +cdef cudensitymatStatus_t cudensitymatStateComputeScaling(const cudensitymatHandle_t handle, cudensitymatState_t state, const void* scalingFactors, cudaStream_t stream) except* nogil: + return _cudensitymat._cudensitymatStateComputeScaling(handle, state, scalingFactors, stream) + + +cdef cudensitymatStatus_t cudensitymatStateComputeNorm(const cudensitymatHandle_t handle, const cudensitymatState_t state, void* norm, cudaStream_t stream) except* nogil: + return _cudensitymat._cudensitymatStateComputeNorm(handle, state, norm, stream) + + +cdef cudensitymatStatus_t cudensitymatStateComputeTrace(const cudensitymatHandle_t handle, const cudensitymatState_t state, void* trace, cudaStream_t stream) except* nogil: + return 
_cudensitymat._cudensitymatStateComputeTrace(handle, state, trace, stream) + + +cdef cudensitymatStatus_t cudensitymatStateComputeAccumulation(const cudensitymatHandle_t handle, const cudensitymatState_t stateIn, cudensitymatState_t stateOut, const void* scalingFactors, cudaStream_t stream) except* nogil: + return _cudensitymat._cudensitymatStateComputeAccumulation(handle, stateIn, stateOut, scalingFactors, stream) + + +cdef cudensitymatStatus_t cudensitymatStateComputeInnerProduct(const cudensitymatHandle_t handle, const cudensitymatState_t stateLeft, const cudensitymatState_t stateRight, void* innerProduct, cudaStream_t stream) except* nogil: + return _cudensitymat._cudensitymatStateComputeInnerProduct(handle, stateLeft, stateRight, innerProduct, stream) + + +cdef cudensitymatStatus_t cudensitymatCreateElementaryOperator(const cudensitymatHandle_t handle, int32_t numSpaceModes, const int64_t spaceModeExtents[], cudensitymatElementaryOperatorSparsity_t sparsity, int32_t numDiagonals, const int32_t diagonalOffsets[], cudaDataType_t dataType, void* tensorData, cudensitymatWrappedTensorCallback_t tensorCallback, cudensitymatElementaryOperator_t* elemOperator) except* nogil: + return _cudensitymat._cudensitymatCreateElementaryOperator(handle, numSpaceModes, spaceModeExtents, sparsity, numDiagonals, diagonalOffsets, dataType, tensorData, tensorCallback, elemOperator) + + +cdef cudensitymatStatus_t cudensitymatDestroyElementaryOperator(cudensitymatElementaryOperator_t elemOperator) except* nogil: + return _cudensitymat._cudensitymatDestroyElementaryOperator(elemOperator) + + +cdef cudensitymatStatus_t cudensitymatCreateOperatorTerm(const cudensitymatHandle_t handle, int32_t numSpaceModes, const int64_t spaceModeExtents[], cudensitymatOperatorTerm_t* operatorTerm) except* nogil: + return _cudensitymat._cudensitymatCreateOperatorTerm(handle, numSpaceModes, spaceModeExtents, operatorTerm) + + +cdef cudensitymatStatus_t cudensitymatDestroyOperatorTerm(cudensitymatOperatorTerm_t operatorTerm) except* nogil: + return _cudensitymat._cudensitymatDestroyOperatorTerm(operatorTerm) + + +cdef cudensitymatStatus_t cudensitymatOperatorTermAppendElementaryProduct(const cudensitymatHandle_t handle, cudensitymatOperatorTerm_t operatorTerm, int32_t numElemOperators, const cudensitymatElementaryOperator_t elemOperators[], const int32_t stateModesActedOn[], const int32_t modeActionDuality[], cuDoubleComplex coefficient, cudensitymatWrappedScalarCallback_t coefficientCallback) except* nogil: + return _cudensitymat._cudensitymatOperatorTermAppendElementaryProduct(handle, operatorTerm, numElemOperators, elemOperators, stateModesActedOn, modeActionDuality, coefficient, coefficientCallback) + + +cdef cudensitymatStatus_t cudensitymatOperatorTermAppendGeneralProduct(const cudensitymatHandle_t handle, cudensitymatOperatorTerm_t operatorTerm, int32_t numElemOperators, const int32_t numOperatorModes[], const int64_t* operatorModeExtents[], const int64_t* operatorModeStrides[], const int32_t stateModesActedOn[], const int32_t modeActionDuality[], cudaDataType_t dataType, void* tensorData[], cudensitymatWrappedTensorCallback_t tensorCallbacks[], cuDoubleComplex coefficient, cudensitymatWrappedScalarCallback_t coefficientCallback) except* nogil: + return _cudensitymat._cudensitymatOperatorTermAppendGeneralProduct(handle, operatorTerm, numElemOperators, numOperatorModes, operatorModeExtents, operatorModeStrides, stateModesActedOn, modeActionDuality, dataType, tensorData, tensorCallbacks, coefficient, coefficientCallback) + + 
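For orientation, the wrappers above map one-to-one onto the cuDensityMat C API, so the intended lifecycle is: create a library handle, create a state, query and attach component storage, operate on the state, then destroy everything in reverse order. The following is a minimal usage sketch, assuming the Python-facing cuquantum.bindings.cudensitymat module exposes these entry points under snake_case names that return the created handles (the convention the custatevec and cutensornet bindings follow); the exact names, enum spellings, and return conventions are assumptions to be checked against the generated module.

import cupy as cp
from cuquantum import cudaDataType
from cuquantum.bindings import cudensitymat as cudm  # assumed module path

# Create the library handle (cudensitymatCreate).
handle = cudm.create()
# A mixed two-qubit state: 2 space modes of extent 2, batch size 1, complex128.
# StatePurity.MIXED is the assumed Python spelling of CUDENSITYMAT_STATE_PURITY_MIXED.
state = cudm.create_state(handle, cudm.StatePurity.MIXED, 2, [2, 2], 1,
                          cudaDataType.CUDA_C_64F)
try:
    # Query the number of local storage components and their sizes in bytes,
    # then attach caller-allocated GPU buffers of exactly those sizes.
    num = cudm.state_get_num_components(handle, state)
    sizes = cudm.state_get_component_storage_size(handle, state, num)
    buffers = [cp.zeros(size // 16, dtype=cp.complex128) for size in sizes]
    cudm.state_attach_component_storage(handle, state, num,
                                        [b.data.ptr for b in buffers], sizes)
    # Zero-initialize the attached storage on the current stream.
    cudm.state_initialize_zero(handle, state, cp.cuda.get_current_stream().ptr)
finally:
    cudm.destroy_state(state)   # cudensitymatDestroyState
    cudm.destroy(handle)        # cudensitymatDestroy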
+cdef cudensitymatStatus_t cudensitymatCreateOperator(const cudensitymatHandle_t handle, int32_t numSpaceModes, const int64_t spaceModeExtents[], cudensitymatOperator_t* superoperator) except* nogil: + return _cudensitymat._cudensitymatCreateOperator(handle, numSpaceModes, spaceModeExtents, superoperator) + + +cdef cudensitymatStatus_t cudensitymatDestroyOperator(cudensitymatOperator_t superoperator) except* nogil: + return _cudensitymat._cudensitymatDestroyOperator(superoperator) + + +cdef cudensitymatStatus_t cudensitymatOperatorAppendTerm(const cudensitymatHandle_t handle, cudensitymatOperator_t superoperator, cudensitymatOperatorTerm_t operatorTerm, int32_t duality, cuDoubleComplex coefficient, cudensitymatWrappedScalarCallback_t coefficientCallback) except* nogil: + return _cudensitymat._cudensitymatOperatorAppendTerm(handle, superoperator, operatorTerm, duality, coefficient, coefficientCallback) + + +cdef cudensitymatStatus_t cudensitymatOperatorPrepareAction(const cudensitymatHandle_t handle, const cudensitymatOperator_t superoperator, const cudensitymatState_t stateIn, const cudensitymatState_t stateOut, cudensitymatComputeType_t computeType, size_t workspaceSizeLimit, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil: + return _cudensitymat._cudensitymatOperatorPrepareAction(handle, superoperator, stateIn, stateOut, computeType, workspaceSizeLimit, workspace, stream) + + +cdef cudensitymatStatus_t cudensitymatOperatorComputeAction(const cudensitymatHandle_t handle, const cudensitymatOperator_t superoperator, double time, int32_t numParams, const double params[], const cudensitymatState_t stateIn, cudensitymatState_t stateOut, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil: + return _cudensitymat._cudensitymatOperatorComputeAction(handle, superoperator, time, numParams, params, stateIn, stateOut, workspace, stream) + + +cdef cudensitymatStatus_t cudensitymatCreateOperatorAction(const cudensitymatHandle_t handle, int32_t numOperators, cudensitymatOperator_t operators[], cudensitymatOperatorAction_t* operatorAction) except* nogil: + return _cudensitymat._cudensitymatCreateOperatorAction(handle, numOperators, operators, operatorAction) + + +cdef cudensitymatStatus_t cudensitymatDestroyOperatorAction(cudensitymatOperatorAction_t operatorAction) except* nogil: + return _cudensitymat._cudensitymatDestroyOperatorAction(operatorAction) + + +cdef cudensitymatStatus_t cudensitymatOperatorActionPrepare(const cudensitymatHandle_t handle, cudensitymatOperatorAction_t operatorAction, const cudensitymatState_t stateIn[], const cudensitymatState_t stateOut, cudensitymatComputeType_t computeType, size_t workspaceSizeLimit, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil: + return _cudensitymat._cudensitymatOperatorActionPrepare(handle, operatorAction, stateIn, stateOut, computeType, workspaceSizeLimit, workspace, stream) + + +cdef cudensitymatStatus_t cudensitymatOperatorActionCompute(const cudensitymatHandle_t handle, cudensitymatOperatorAction_t operatorAction, double time, int32_t numParams, const double params[], const cudensitymatState_t stateIn[], cudensitymatState_t stateOut, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil: + return _cudensitymat._cudensitymatOperatorActionCompute(handle, operatorAction, time, numParams, params, stateIn, stateOut, workspace, stream) + + +cdef cudensitymatStatus_t cudensitymatCreateExpectation(const cudensitymatHandle_t handle, 
cudensitymatOperator_t superoperator, cudensitymatExpectation_t* expectation) except* nogil: + return _cudensitymat._cudensitymatCreateExpectation(handle, superoperator, expectation) + + +cdef cudensitymatStatus_t cudensitymatDestroyExpectation(cudensitymatExpectation_t expectation) except* nogil: + return _cudensitymat._cudensitymatDestroyExpectation(expectation) + + +cdef cudensitymatStatus_t cudensitymatExpectationPrepare(const cudensitymatHandle_t handle, cudensitymatExpectation_t expectation, const cudensitymatState_t state, cudensitymatComputeType_t computeType, size_t workspaceSizeLimit, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil: + return _cudensitymat._cudensitymatExpectationPrepare(handle, expectation, state, computeType, workspaceSizeLimit, workspace, stream) + + +cdef cudensitymatStatus_t cudensitymatExpectationCompute(const cudensitymatHandle_t handle, cudensitymatExpectation_t expectation, double time, int32_t numParams, const double params[], const cudensitymatState_t state, void* expectationValue, cudensitymatWorkspaceDescriptor_t workspace, cudaStream_t stream) except* nogil: + return _cudensitymat._cudensitymatExpectationCompute(handle, expectation, time, numParams, params, state, expectationValue, workspace, stream) + + +cdef cudensitymatStatus_t cudensitymatCreateWorkspace(const cudensitymatHandle_t handle, cudensitymatWorkspaceDescriptor_t* workspaceDescr) except* nogil: + return _cudensitymat._cudensitymatCreateWorkspace(handle, workspaceDescr) + + +cdef cudensitymatStatus_t cudensitymatDestroyWorkspace(cudensitymatWorkspaceDescriptor_t workspaceDescr) except* nogil: + return _cudensitymat._cudensitymatDestroyWorkspace(workspaceDescr) + + +cdef cudensitymatStatus_t cudensitymatWorkspaceGetMemorySize(const cudensitymatHandle_t handle, const cudensitymatWorkspaceDescriptor_t workspaceDescr, cudensitymatMemspace_t memSpace, cudensitymatWorkspaceKind_t workspaceKind, size_t* memoryBufferSize) except* nogil: + return _cudensitymat._cudensitymatWorkspaceGetMemorySize(handle, workspaceDescr, memSpace, workspaceKind, memoryBufferSize) + + +cdef cudensitymatStatus_t cudensitymatWorkspaceSetMemory(const cudensitymatHandle_t handle, cudensitymatWorkspaceDescriptor_t workspaceDescr, cudensitymatMemspace_t memSpace, cudensitymatWorkspaceKind_t workspaceKind, void* memoryBuffer, size_t memoryBufferSize) except* nogil: + return _cudensitymat._cudensitymatWorkspaceSetMemory(handle, workspaceDescr, memSpace, workspaceKind, memoryBuffer, memoryBufferSize) + + +cdef cudensitymatStatus_t cudensitymatWorkspaceGetMemory(const cudensitymatHandle_t handle, const cudensitymatWorkspaceDescriptor_t workspaceDescr, cudensitymatMemspace_t memSpace, cudensitymatWorkspaceKind_t workspaceKind, void** memoryBuffer, size_t* memoryBufferSize) except* nogil: + return _cudensitymat._cudensitymatWorkspaceGetMemory(handle, workspaceDescr, memSpace, workspaceKind, memoryBuffer, memoryBufferSize) diff --git a/python/cuquantum/custatevec/_internal/custatevec.pxd b/python/cuquantum/custatevec/_internal/custatevec.pxd index c0971ac..775994d 100644 --- a/python/cuquantum/custatevec/_internal/custatevec.pxd +++ b/python/cuquantum/custatevec/_internal/custatevec.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES # # SPDX-License-Identifier: BSD-3-Clause - -# This code was automatically generated. Do not modify it directly. +# +# This code was automatically generated across versions from 23.03.0 to 24.03.0. Do not modify it directly. 
from ..cycustatevec cimport * diff --git a/python/cuquantum/custatevec/_internal/custatevec_linux.pyx b/python/cuquantum/custatevec/_internal/custatevec_linux.pyx index e3334c4..2dea820 100644 --- a/python/cuquantum/custatevec/_internal/custatevec_linux.pyx +++ b/python/cuquantum/custatevec/_internal/custatevec_linux.pyx @@ -1,8 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES # # SPDX-License-Identifier: BSD-3-Clause - -# This code was automatically generated. Do not modify it directly. +# +# This code was automatically generated across versions from 23.03.0 to 24.03.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/python/cuquantum/custatevec/custatevec.pxd b/python/cuquantum/custatevec/custatevec.pxd index 1d752c4..cbacea9 100644 --- a/python/cuquantum/custatevec/custatevec.pxd +++ b/python/cuquantum/custatevec/custatevec.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES # # SPDX-License-Identifier: BSD-3-Clause - -# This code was automatically generated. Do not modify it directly. +# +# This code was automatically generated across versions from 23.03.0 to 24.03.0. Do not modify it directly. from libc.stdint cimport intptr_t diff --git a/python/cuquantum/custatevec/custatevec.pyx b/python/cuquantum/custatevec/custatevec.pyx index 1309a90..b22bba4 100644 --- a/python/cuquantum/custatevec/custatevec.pyx +++ b/python/cuquantum/custatevec/custatevec.pyx @@ -1,8 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES # # SPDX-License-Identifier: BSD-3-Clause - -# This code was automatically generated. Do not modify it directly. +# +# This code was automatically generated across versions from 23.03.0 to 24.03.0. Do not modify it directly. cimport cython # NOQA cimport cpython @@ -514,12 +514,12 @@ cpdef abs2sum_array(intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_i .. seealso:: `custatevecAbs2SumArray` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ = \ - get_resource_ptr[int32_t](bit_ordering, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _mask_bit_string_ = \ - get_resource_ptr[int32_t](mask_bit_string, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _mask_ordering_ = \ - get_resource_ptr[int32_t](mask_ordering, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ + get_resource_ptr[int32_t](_bit_ordering_, bit_ordering, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mask_bit_string_ + get_resource_ptr[int32_t](_mask_bit_string_, mask_bit_string, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mask_ordering_ + get_resource_ptr[int32_t](_mask_ordering_, mask_ordering, NULL) with nogil: status = custatevecAbs2SumArray(handle, sv, sv_data_type, n_index_bits, abs2sum, (_bit_ordering_.data()), bit_ordering_len, (_mask_bit_string_.data()), (_mask_ordering_.data()), mask_len) check_status(status) @@ -544,8 +544,8 @@ cpdef collapse_on_z_basis(intptr_t handle, intptr_t sv, int sv_data_type, uint32 .. seealso:: `custatevecCollapseOnZBasis` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _basis_bits_ = \ - get_resource_ptr[int32_t](basis_bits, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _basis_bits_ + get_resource_ptr[int32_t](_basis_bits_, basis_bits, NULL) with nogil: status = custatevecCollapseOnZBasis(handle, sv, sv_data_type, n_index_bits, parity, (_basis_bits_.data()), n_basis_bits, norm) check_status(status) @@ -574,10 +574,10 @@ cpdef collapse_by_bit_string(intptr_t handle, intptr_t sv, int sv_data_type, uin .. 
seealso:: `custatevecCollapseByBitString` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _bit_string_ = \ - get_resource_ptr[int32_t](bit_string, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ = \ - get_resource_ptr[int32_t](bit_ordering, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _bit_string_ + get_resource_ptr[int32_t](_bit_string_, bit_string, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ + get_resource_ptr[int32_t](_bit_ordering_, bit_ordering, NULL) with nogil: status = custatevecCollapseByBitString(handle, sv, sv_data_type, n_index_bits, (_bit_string_.data()), (_bit_ordering_.data()), bit_string_len, norm) check_status(status) @@ -605,8 +605,8 @@ cpdef int32_t measure_on_z_basis(intptr_t handle, intptr_t sv, int sv_data_type, .. seealso:: `custatevecMeasureOnZBasis` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _basis_bits_ = \ - get_resource_ptr[int32_t](basis_bits, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _basis_bits_ + get_resource_ptr[int32_t](_basis_bits_, basis_bits, NULL) cdef int32_t parity with nogil: status = custatevecMeasureOnZBasis(handle, sv, sv_data_type, n_index_bits, &parity, (_basis_bits_.data()), n_basis_bits, randnum, <_CollapseOp>collapse) @@ -634,8 +634,8 @@ cpdef batch_measure(intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_i .. seealso:: `custatevecBatchMeasure` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ = \ - get_resource_ptr[int32_t](bit_ordering, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ + get_resource_ptr[int32_t](_bit_ordering_, bit_ordering, NULL) with nogil: status = custatevecBatchMeasure(handle, sv, sv_data_type, n_index_bits, bit_string, (_bit_ordering_.data()), bit_string_len, randnum, <_CollapseOp>collapse) check_status(status) @@ -663,8 +663,8 @@ cpdef batch_measure_with_offset(intptr_t handle, intptr_t sv, int sv_data_type, .. seealso:: `custatevecBatchMeasureWithOffset` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ = \ - get_resource_ptr[int32_t](bit_ordering, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ + get_resource_ptr[int32_t](_bit_ordering_, bit_ordering, NULL) with nogil: status = custatevecBatchMeasureWithOffset(handle, sv, sv_data_type, n_index_bits, bit_string, (_bit_ordering_.data()), bit_string_len, randnum, <_CollapseOp>collapse, offset, abs2sum) check_status(status) @@ -704,14 +704,14 @@ cpdef apply_pauli_rotation(intptr_t handle, intptr_t sv, int sv_data_type, uint3 .. 
seealso:: `custatevecApplyPauliRotation` """ - cdef nullable_unique_ptr[ vector[int] ] _paulis_ = \ - get_resource_ptr[int](paulis, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _targets_ = \ - get_resource_ptr[int32_t](targets, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _controls_ = \ - get_resource_ptr[int32_t](controls, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _control_bit_values_ = \ - get_resource_ptr[int32_t](control_bit_values, NULL) + cdef nullable_unique_ptr[ vector[int] ] _paulis_ + get_resource_ptr[int](_paulis_, paulis, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _targets_ + get_resource_ptr[int32_t](_targets_, targets, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _controls_ + get_resource_ptr[int32_t](_controls_, controls, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _control_bit_values_ + get_resource_ptr[int32_t](_control_bit_values_, control_bit_values, NULL) with nogil: status = custatevecApplyPauliRotation(handle, sv, sv_data_type, n_index_bits, theta, (_paulis_.data()), (_targets_.data()), n_targets, (_controls_.data()), (_control_bit_values_.data()), n_controls) check_status(status) @@ -779,12 +779,12 @@ cpdef apply_matrix(intptr_t handle, intptr_t sv, int sv_data_type, uint32_t n_in .. seealso:: `custatevecApplyMatrix` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _targets_ = \ - get_resource_ptr[int32_t](targets, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _controls_ = \ - get_resource_ptr[int32_t](controls, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _control_bit_values_ = \ - get_resource_ptr[int32_t](control_bit_values, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _targets_ + get_resource_ptr[int32_t](_targets_, targets, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _controls_ + get_resource_ptr[int32_t](_controls_, controls, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _control_bit_values_ + get_resource_ptr[int32_t](_control_bit_values_, control_bit_values, NULL) with nogil: status = custatevecApplyMatrix(handle, sv, sv_data_type, n_index_bits, matrix, matrix_data_type, <_MatrixLayout>layout, adjoint, (_targets_.data()), n_targets, (_controls_.data()), (_control_bit_values_.data()), n_controls, <_ComputeType>compute_type, extra_workspace, extra_workspace_size_in_bytes) check_status(status) @@ -843,8 +843,8 @@ cpdef double compute_expectation(intptr_t handle, intptr_t sv, int sv_data_type, .. seealso:: `custatevecComputeExpectation` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _basis_bits_ = \ - get_resource_ptr[int32_t](basis_bits, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _basis_bits_ + get_resource_ptr[int32_t](_basis_bits_, basis_bits, NULL) cdef double residual_norm with nogil: status = custatevecComputeExpectation(handle, sv, sv_data_type, n_index_bits, expectation_value, expectation_data_type, &residual_norm, matrix, matrix_data_type, <_MatrixLayout>layout, (_basis_bits_.data()), n_basis_bits, <_ComputeType>compute_type, extra_workspace, extra_workspace_size_in_bytes) @@ -967,10 +967,10 @@ cpdef sampler_sample(intptr_t handle, intptr_t sampler, intptr_t bit_strings, bi .. 
seealso:: `custatevecSamplerSample` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ = \ - get_resource_ptr[int32_t](bit_ordering, NULL) - cdef nullable_unique_ptr[ vector[double] ] _randnums_ = \ - get_resource_ptr[double](randnums, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ + get_resource_ptr[int32_t](_bit_ordering_, bit_ordering, NULL) + cdef nullable_unique_ptr[ vector[double] ] _randnums_ + get_resource_ptr[double](_randnums_, randnums, NULL) with nogil: status = custatevecSamplerSample(handle, sampler, bit_strings, (_bit_ordering_.data()), bit_string_len, (_randnums_.data()), n_shots, <_SamplerOutput>output) check_status(status) @@ -1003,10 +1003,10 @@ cpdef size_t apply_generalized_permutation_matrix_get_workspace_size(intptr_t ha .. seealso:: `custatevecApplyGeneralizedPermutationMatrixGetWorkspaceSize` """ - cdef nullable_unique_ptr[ vector[int64_t] ] _permutation_ = \ - get_resource_ptr[int64_t](permutation, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _targets_ = \ - get_resource_ptr[int32_t](targets, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _permutation_ + get_resource_ptr[int64_t](_permutation_, permutation, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _targets_ + get_resource_ptr[int32_t](_targets_, targets, NULL) cdef size_t extra_workspace_size_in_bytes with nogil: status = custatevecApplyGeneralizedPermutationMatrixGetWorkspaceSize(handle, sv_data_type, n_index_bits, (_permutation_.data()), diagonals, diagonals_data_type, (_targets_.data()), n_targets, n_controls, &extra_workspace_size_in_bytes) @@ -1052,14 +1052,14 @@ cpdef apply_generalized_permutation_matrix(intptr_t handle, intptr_t sv, int sv_ .. seealso:: `custatevecApplyGeneralizedPermutationMatrix` """ - cdef nullable_unique_ptr[ vector[int64_t] ] _permutation_ = \ - get_resource_ptr[int64_t](permutation, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _targets_ = \ - get_resource_ptr[int32_t](targets, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _controls_ = \ - get_resource_ptr[int32_t](controls, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _control_bit_values_ = \ - get_resource_ptr[int32_t](control_bit_values, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _permutation_ + get_resource_ptr[int64_t](_permutation_, permutation, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _targets_ + get_resource_ptr[int32_t](_targets_, targets, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _controls_ + get_resource_ptr[int32_t](_controls_, controls, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _control_bit_values_ + get_resource_ptr[int32_t](_control_bit_values_, control_bit_values, NULL) with nogil: status = custatevecApplyGeneralizedPermutationMatrix(handle, sv, sv_data_type, n_index_bits, (_permutation_.data()), diagonals, diagonals_data_type, adjoint, (_targets_.data()), n_targets, (_controls_.data()), (_control_bit_values_.data()), n_controls, extra_workspace, extra_workspace_size_in_bytes) check_status(status) @@ -1097,12 +1097,12 @@ cpdef compute_expectations_on_pauli_basis(intptr_t handle, intptr_t sv, int sv_d .. 
seealso:: `custatevecComputeExpectationsOnPauliBasis` """ - cdef nested_resource[ int ] _pauli_operators_array_ = \ - get_nested_resource_ptr[int](pauli_operators_array, NULL) - cdef nested_resource[ int32_t ] _basis_bits_array_ = \ - get_nested_resource_ptr[int32_t](basis_bits_array, NULL) - cdef nullable_unique_ptr[ vector[uint32_t] ] _n_basis_bits_array_ = \ - get_resource_ptr[uint32_t](n_basis_bits_array, NULL) + cdef nested_resource[ int ] _pauli_operators_array_ + get_nested_resource_ptr[int](_pauli_operators_array_, pauli_operators_array, NULL) + cdef nested_resource[ int32_t ] _basis_bits_array_ + get_nested_resource_ptr[int32_t](_basis_bits_array_, basis_bits_array, NULL) + cdef nullable_unique_ptr[ vector[uint32_t] ] _n_basis_bits_array_ + get_resource_ptr[uint32_t](_n_basis_bits_array_, n_basis_bits_array, NULL) with nogil: status = custatevecComputeExpectationsOnPauliBasis(handle, sv, sv_data_type, n_index_bits, expectation_values, (_pauli_operators_array_.ptrs.data()), n_pauli_operator_arrays, (_basis_bits_array_.ptrs.data()), (_n_basis_bits_array_.data())) check_status(status) @@ -1144,12 +1144,12 @@ cpdef tuple accessor_create(intptr_t handle, intptr_t sv, int sv_data_type, uint """ cdef AccessorDescriptor accessor cdef size_t extra_workspace_size_in_bytes - cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ = \ - get_resource_ptr[int32_t](bit_ordering, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _mask_bit_string_ = \ - get_resource_ptr[int32_t](mask_bit_string, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _mask_ordering_ = \ - get_resource_ptr[int32_t](mask_ordering, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ + get_resource_ptr[int32_t](_bit_ordering_, bit_ordering, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mask_bit_string_ + get_resource_ptr[int32_t](_mask_bit_string_, mask_bit_string, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mask_ordering_ + get_resource_ptr[int32_t](_mask_ordering_, mask_ordering, NULL) with nogil: status = custatevecAccessorCreate(handle, sv, sv_data_type, n_index_bits, &accessor, (_bit_ordering_.data()), bit_ordering_len, (_mask_bit_string_.data()), (_mask_ordering_.data()), mask_len, &extra_workspace_size_in_bytes) check_status(status) @@ -1192,12 +1192,12 @@ cpdef tuple accessor_create_view(intptr_t handle, intptr_t sv, int sv_data_type, """ cdef AccessorDescriptor accessor cdef size_t extra_workspace_size_in_bytes - cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ = \ - get_resource_ptr[int32_t](bit_ordering, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _mask_bit_string_ = \ - get_resource_ptr[int32_t](mask_bit_string, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _mask_ordering_ = \ - get_resource_ptr[int32_t](mask_ordering, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ + get_resource_ptr[int32_t](_bit_ordering_, bit_ordering, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mask_bit_string_ + get_resource_ptr[int32_t](_mask_bit_string_, mask_bit_string, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mask_ordering_ + get_resource_ptr[int32_t](_mask_ordering_, mask_ordering, NULL) with nogil: status = custatevecAccessorCreateView(handle, sv, sv_data_type, n_index_bits, &accessor, (_bit_ordering_.data()), bit_ordering_len, (_mask_bit_string_.data()), (_mask_ordering_.data()), mask_len, &extra_workspace_size_in_bytes) check_status(status) @@ -1492,12 +1492,12 @@ cpdef sv_swap_worker_set_sub_svs_p2p(intptr_t handle, intptr_t 
sv_swap_worker, d .. seealso:: `custatevecSVSwapWorkerSetSubSVsP2P` """ - cdef nullable_unique_ptr[ vector[intptr_t] ] _dst_sub_svs_p2p_ = \ - get_resource_ptr[intptr_t](dst_sub_svs_p2p, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _dst_sub_sv_indices_p2p_ = \ - get_resource_ptr[int32_t](dst_sub_sv_indices_p2p, NULL) - cdef nullable_unique_ptr[ vector[intptr_t] ] _dst_events_ = \ - get_resource_ptr[intptr_t](dst_events, NULL) + cdef nullable_unique_ptr[ vector[intptr_t] ] _dst_sub_svs_p2p_ + get_resource_ptr[intptr_t](_dst_sub_svs_p2p_, dst_sub_svs_p2p, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _dst_sub_sv_indices_p2p_ + get_resource_ptr[int32_t](_dst_sub_sv_indices_p2p_, dst_sub_sv_indices_p2p, NULL) + cdef nullable_unique_ptr[ vector[intptr_t] ] _dst_events_ + get_resource_ptr[intptr_t](_dst_events_, dst_events, NULL) with nogil: status = custatevecSVSwapWorkerSetSubSVsP2P(handle, sv_swap_worker, (_dst_sub_svs_p2p_.data()), (_dst_sub_sv_indices_p2p_.data()), (_dst_events_.data()), n_dst_sub_svs_p2p) check_status(status) @@ -1565,8 +1565,8 @@ cpdef size_t apply_matrix_batched_get_workspace_size(intptr_t handle, int sv_dat .. seealso:: `custatevecApplyMatrixBatchedGetWorkspaceSize` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _matrix_indices_ = \ - get_resource_ptr[int32_t](matrix_indices, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _matrix_indices_ + get_resource_ptr[int32_t](_matrix_indices_, matrix_indices, NULL) cdef size_t extra_workspace_size_in_bytes with nogil: status = custatevecApplyMatrixBatchedGetWorkspaceSize(handle, sv_data_type, n_index_bits, n_svs, sv_stride, <_MatrixMapType>map_type, (_matrix_indices_.data()), matrices, matrix_data_type, <_MatrixLayout>layout, adjoint, n_matrices, n_targets, n_controls, <_ComputeType>compute_type, &extra_workspace_size_in_bytes) @@ -1618,14 +1618,14 @@ cpdef apply_matrix_batched(intptr_t handle, intptr_t batched_sv, int sv_data_typ .. seealso:: `custatevecApplyMatrixBatched` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _matrix_indices_ = \ - get_resource_ptr[int32_t](matrix_indices, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _targets_ = \ - get_resource_ptr[int32_t](targets, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _controls_ = \ - get_resource_ptr[int32_t](controls, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _control_bit_values_ = \ - get_resource_ptr[int32_t](control_bit_values, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _matrix_indices_ + get_resource_ptr[int32_t](_matrix_indices_, matrix_indices, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _targets_ + get_resource_ptr[int32_t](_targets_, targets, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _controls_ + get_resource_ptr[int32_t](_controls_, controls, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _control_bit_values_ + get_resource_ptr[int32_t](_control_bit_values_, control_bit_values, NULL) with nogil: status = custatevecApplyMatrixBatched(handle, batched_sv, sv_data_type, n_index_bits, n_svs, sv_stride, <_MatrixMapType>map_type, (_matrix_indices_.data()), matrices, matrix_data_type, <_MatrixLayout>layout, adjoint, n_matrices, (_targets_.data()), n_targets, (_controls_.data()), (_control_bit_values_.data()), n_controls, <_ComputeType>compute_type, extra_workspace, extra_workspace_size_in_bytes) check_status(status) @@ -1663,12 +1663,12 @@ cpdef abs2sum_array_batched(intptr_t handle, intptr_t batched_sv, int sv_data_ty .. 
seealso:: `custatevecAbs2SumArrayBatched` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ = \ - get_resource_ptr[int32_t](bit_ordering, NULL) - cdef nullable_unique_ptr[ vector[int64_t] ] _mask_bit_strings_ = \ - get_resource_ptr[int64_t](mask_bit_strings, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _mask_ordering_ = \ - get_resource_ptr[int32_t](mask_ordering, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ + get_resource_ptr[int32_t](_bit_ordering_, bit_ordering, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _mask_bit_strings_ + get_resource_ptr[int64_t](_mask_bit_strings_, mask_bit_strings, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mask_ordering_ + get_resource_ptr[int32_t](_mask_ordering_, mask_ordering, NULL) with nogil: status = custatevecAbs2SumArrayBatched(handle, batched_sv, sv_data_type, n_index_bits, n_svs, sv_stride, abs2sum_arrays, abs2sum_array_stride, (_bit_ordering_.data()), bit_ordering_len, (_mask_bit_strings_.data()), (_mask_ordering_.data()), mask_len) check_status(status) @@ -1696,10 +1696,10 @@ cpdef size_t collapse_by_bit_string_batched_get_workspace_size(intptr_t handle, .. seealso:: `custatevecCollapseByBitStringBatchedGetWorkspaceSize` """ - cdef nullable_unique_ptr[ vector[int64_t] ] _bit_strings_ = \ - get_resource_ptr[int64_t](bit_strings, NULL) - cdef nullable_unique_ptr[ vector[double] ] _norms_ = \ - get_resource_ptr[double](norms, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _bit_strings_ + get_resource_ptr[int64_t](_bit_strings_, bit_strings, NULL) + cdef nullable_unique_ptr[ vector[double] ] _norms_ + get_resource_ptr[double](_norms_, norms, NULL) cdef size_t extra_workspace_size_in_bytes with nogil: status = custatevecCollapseByBitStringBatchedGetWorkspaceSize(handle, n_svs, (_bit_strings_.data()), (_norms_.data()), &extra_workspace_size_in_bytes) @@ -1738,12 +1738,12 @@ cpdef collapse_by_bit_string_batched(intptr_t handle, intptr_t batched_sv, int s .. seealso:: `custatevecCollapseByBitStringBatched` """ - cdef nullable_unique_ptr[ vector[int64_t] ] _bit_strings_ = \ - get_resource_ptr[int64_t](bit_strings, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ = \ - get_resource_ptr[int32_t](bit_ordering, NULL) - cdef nullable_unique_ptr[ vector[double] ] _norms_ = \ - get_resource_ptr[double](norms, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _bit_strings_ + get_resource_ptr[int64_t](_bit_strings_, bit_strings, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ + get_resource_ptr[int32_t](_bit_ordering_, bit_ordering, NULL) + cdef nullable_unique_ptr[ vector[double] ] _norms_ + get_resource_ptr[double](_norms_, norms, NULL) with nogil: status = custatevecCollapseByBitStringBatched(handle, batched_sv, sv_data_type, n_index_bits, n_svs, sv_stride, (_bit_strings_.data()), (_bit_ordering_.data()), bit_string_len, (_norms_.data()), extra_workspace, extra_workspace_size_in_bytes) check_status(status) @@ -1775,10 +1775,10 @@ cpdef measure_batched(intptr_t handle, intptr_t batched_sv, int sv_data_type, ui .. 
seealso:: `custatevecMeasureBatched` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ = \ - get_resource_ptr[int32_t](bit_ordering, NULL) - cdef nullable_unique_ptr[ vector[double] ] _randnums_ = \ - get_resource_ptr[double](randnums, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _bit_ordering_ + get_resource_ptr[int32_t](_bit_ordering_, bit_ordering, NULL) + cdef nullable_unique_ptr[ vector[double] ] _randnums_ + get_resource_ptr[double](_randnums_, randnums, NULL) with nogil: status = custatevecMeasureBatched(handle, batched_sv, sv_data_type, n_index_bits, n_svs, sv_stride, bit_strings, (_bit_ordering_.data()), bit_string_len, (_randnums_.data()), <_CollapseOp>collapse) check_status(status) @@ -1894,8 +1894,8 @@ cpdef compute_expectation_batched(intptr_t handle, intptr_t batched_sv, int sv_d .. seealso:: `custatevecComputeExpectationBatched` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _basis_bits_ = \ - get_resource_ptr[int32_t](basis_bits, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _basis_bits_ + get_resource_ptr[int32_t](_basis_bits_, basis_bits, NULL) with nogil: status = custatevecComputeExpectationBatched(handle, batched_sv, sv_data_type, n_index_bits, n_svs, sv_stride, expectation_values, matrices, matrix_data_type, <_MatrixLayout>layout, n_matrices, (_basis_bits_.data()), n_basis_bits, <_ComputeType>compute_type, extra_workspace, extra_workspace_size_in_bytes) check_status(status) @@ -1953,8 +1953,8 @@ cpdef tuple abs2sum_on_z_basis( abs2sum1_ptr = &abs2sum1 if get_parity1 else NULL # basis_bits can be a pointer address, or a Python sequence - cdef nullable_unique_ptr[ vector[int32_t] ] _basis_bits_ = \ - get_resource_ptr[int32_t](basis_bits, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _basis_bits_ + get_resource_ptr[int32_t](_basis_bits_, basis_bits, NULL) with nogil: status = custatevecAbs2SumOnZBasis( @@ -2030,12 +2030,12 @@ cpdef swap_index_bits( "un-recognized format") # mask_bit_string can be a pointer address, or a Python sequence - cdef nullable_unique_ptr[ vector[int32_t] ] _mask_bit_string_ = \ - get_resource_ptr[int32_t](mask_bit_string, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mask_bit_string_ + get_resource_ptr[int32_t](_mask_bit_string_, mask_bit_string, NULL) # mask_ordering can be a pointer address, or a Python sequence - cdef nullable_unique_ptr[ vector[int32_t] ] _mask_ordering_ = \ - get_resource_ptr[int32_t](mask_ordering, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mask_ordering_ + get_resource_ptr[int32_t](_mask_ordering_, mask_ordering, NULL) with nogil: status = custatevecSwapIndexBits( @@ -2092,12 +2092,12 @@ cpdef multi_device_swap_index_bits( .. 
seealso:: `custatevecMultiDeviceSwapIndexBits` """ # handles can be a pointer address, or a Python sequence - cdef nullable_unique_ptr[ vector[intptr_t] ] _handles_ = \ - get_resource_ptr[intptr_t](handles, NULL) + cdef nullable_unique_ptr[ vector[intptr_t] ] _handles_ + get_resource_ptr[intptr_t](_handles_, handles, NULL) # sub_svs can be a pointer address, or a Python sequence - cdef nullable_unique_ptr[ vector[intptr_t] ] _sub_svs_ = \ - get_resource_ptr[intptr_t](sub_svs, NULL) + cdef nullable_unique_ptr[ vector[intptr_t] ] _sub_svs_ + get_resource_ptr[intptr_t](_sub_svs_, sub_svs, NULL) # swapped_bits can be: # - a plain pointer address @@ -2127,12 +2127,12 @@ cpdef multi_device_swap_index_bits( "un-recognized format") # mask_bit_string can be a pointer address, or a Python sequence - cdef nullable_unique_ptr[ vector[int32_t] ] _mask_bit_string_ = \ - get_resource_ptr[int32_t](mask_bit_string, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mask_bit_string_ + get_resource_ptr[int32_t](_mask_bit_string_, mask_bit_string, NULL) # mask_ordering can be a pointer address, or a Python sequence - cdef nullable_unique_ptr[ vector[int32_t] ] _mask_ordering_ = \ - get_resource_ptr[int32_t](mask_ordering, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mask_ordering_ + get_resource_ptr[int32_t](_mask_ordering_, mask_ordering, NULL) with nogil: status = custatevecMultiDeviceSwapIndexBits( @@ -2295,12 +2295,12 @@ cpdef uint32_t dist_index_bit_swap_scheduler_set_index_bit_swaps( "un-recognized format") # mask_bit_string can be a pointer address, or a Python sequence - cdef nullable_unique_ptr[ vector[int32_t] ] _mask_bit_string_ = \ - get_resource_ptr[int32_t](mask_bit_string, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mask_bit_string_ + get_resource_ptr[int32_t](_mask_bit_string_, mask_bit_string, NULL) # mask_ordering can be a pointer address, or a Python sequence - cdef nullable_unique_ptr[ vector[int32_t] ] _mask_ordering_ = \ - get_resource_ptr[int32_t](mask_ordering, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _mask_ordering_ + get_resource_ptr[int32_t](_mask_ordering_, mask_ordering, NULL) cdef uint32_t n_swap_batches with nogil: diff --git a/python/cuquantum/custatevec/cycustatevec.pxd b/python/cuquantum/custatevec/cycustatevec.pxd index 3aa2398..1acb2ab 100644 --- a/python/cuquantum/custatevec/cycustatevec.pxd +++ b/python/cuquantum/custatevec/cycustatevec.pxd @@ -1,8 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES # # SPDX-License-Identifier: BSD-3-Clause - -# This code was automatically generated. Do not modify it directly. +# +# This code was automatically generated across versions from 23.03.0 to 24.03.0. Do not modify it directly. # This layer exposes the C header to Cython as-is. 
from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t @@ -20,6 +20,10 @@ cdef extern from *: ctypedef int cudaDataType 'cudaDataType' ctypedef int libraryPropertyType_t 'libraryPropertyType_t' ctypedef int libraryPropertyType 'libraryPropertyType' + ctypedef struct int2 'int2': + pass + ctypedef struct double2 'double2': + pass cdef extern from '' nogil: @@ -40,55 +44,55 @@ cdef extern from '' nogil: CUSTATEVEC_STATUS_COMMUNICATOR_ERROR CUSTATEVEC_STATUS_LOADING_LIBRARY_FAILED CUSTATEVEC_STATUS_MAX_VALUE - + ctypedef enum custatevecPauli_t: CUSTATEVEC_PAULI_I CUSTATEVEC_PAULI_X CUSTATEVEC_PAULI_Y CUSTATEVEC_PAULI_Z - + ctypedef enum custatevecMatrixLayout_t: CUSTATEVEC_MATRIX_LAYOUT_COL CUSTATEVEC_MATRIX_LAYOUT_ROW - + ctypedef enum custatevecMatrixType_t: CUSTATEVEC_MATRIX_TYPE_GENERAL CUSTATEVEC_MATRIX_TYPE_UNITARY CUSTATEVEC_MATRIX_TYPE_HERMITIAN - + ctypedef enum custatevecCollapseOp_t: CUSTATEVEC_COLLAPSE_NONE CUSTATEVEC_COLLAPSE_NORMALIZE_AND_ZERO - + ctypedef enum custatevecComputeType_t: CUSTATEVEC_COMPUTE_DEFAULT CUSTATEVEC_COMPUTE_32F CUSTATEVEC_COMPUTE_64F CUSTATEVEC_COMPUTE_TF32 - + ctypedef enum custatevecSamplerOutput_t: CUSTATEVEC_SAMPLER_OUTPUT_RANDNUM_ORDER CUSTATEVEC_SAMPLER_OUTPUT_ASCENDING_ORDER - + ctypedef enum custatevecDeviceNetworkType_t: CUSTATEVEC_DEVICE_NETWORK_TYPE_SWITCH CUSTATEVEC_DEVICE_NETWORK_TYPE_FULLMESH - + ctypedef enum custatevecCommunicatorType_t: CUSTATEVEC_COMMUNICATOR_TYPE_EXTERNAL CUSTATEVEC_COMMUNICATOR_TYPE_OPENMPI CUSTATEVEC_COMMUNICATOR_TYPE_MPICH - + ctypedef enum custatevecDataTransferType_t: CUSTATEVEC_DATA_TRANSFER_TYPE_NONE CUSTATEVEC_DATA_TRANSFER_TYPE_SEND CUSTATEVEC_DATA_TRANSFER_TYPE_RECV CUSTATEVEC_DATA_TRANSFER_TYPE_SEND_RECV - + ctypedef enum custatevecMatrixMapType_t: CUSTATEVEC_MATRIX_MAP_TYPE_BROADCAST CUSTATEVEC_MATRIX_MAP_TYPE_MATRIX_INDEXED - + ctypedef enum custatevecStateVectorType_t: CUSTATEVEC_STATE_VECTOR_TYPE_ZERO CUSTATEVEC_STATE_VECTOR_TYPE_UNIFORM @@ -141,13 +145,6 @@ cdef extern from '' nogil: const int CUSTATEVEC_VERSION -cdef extern from "vector_types.h" nogil: - ctypedef struct int2 'int2': - pass - ctypedef struct double2 'double2': - pass - - ############################################################################### # Functions ############################################################################### diff --git a/python/cuquantum/custatevec/cycustatevec.pyx b/python/cuquantum/custatevec/cycustatevec.pyx index e6c5256..1c6929e 100644 --- a/python/cuquantum/custatevec/cycustatevec.pyx +++ b/python/cuquantum/custatevec/cycustatevec.pyx @@ -1,8 +1,8 @@ # Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES # # SPDX-License-Identifier: BSD-3-Clause - -# This code was automatically generated. Do not modify it directly. +# +# This code was automatically generated across versions from 23.03.0 to 24.03.0. Do not modify it directly. 
from ._internal cimport custatevec as _custatevec diff --git a/python/cuquantum/cutensornet/_internal/circuit_parser_utils_qiskit.py b/python/cuquantum/cutensornet/_internal/circuit_parser_utils_qiskit.py index d6ed782..ce206cf 100644 --- a/python/cuquantum/cutensornet/_internal/circuit_parser_utils_qiskit.py +++ b/python/cuquantum/cutensornet/_internal/circuit_parser_utils_qiskit.py @@ -9,7 +9,7 @@ try: # qiskit 1.0 from qiskit.circuit.library import UnitaryGate -except ModuleNotFoundError: +except ImportError: # qiskit < 1.0 from qiskit.extensions import UnitaryGate diff --git a/python/cuquantum/cutensornet/_internal/cutensornet.pxd b/python/cuquantum/cutensornet/_internal/cutensornet.pxd index 2db79fc..be08752 100644 --- a/python/cuquantum/cutensornet/_internal/cutensornet.pxd +++ b/python/cuquantum/cutensornet/_internal/cutensornet.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: BSD-3-Clause # -# This code was automatically generated across versions from 23.03.0 to 24.08.0. Do not modify it directly. +# This code was automatically generated across versions from 23.03.0 to 24.11.0. Do not modify it directly. from ..cycutensornet cimport * @@ -125,3 +125,5 @@ cdef cutensornetStatus_t _cutensornetAccessorGetInfo(const cutensornetHandle_t h cdef cutensornetStatus_t _cutensornetExpectationGetInfo(const cutensornetHandle_t handle, const cutensornetStateExpectation_t tensorNetworkExpectation, cutensornetExpectationAttributes_t attribute, void* attributeValue, size_t attributeSize) except* nogil cdef cutensornetStatus_t _cutensornetMarginalGetInfo(const cutensornetHandle_t handle, const cutensornetStateMarginal_t tensorNetworkMarginal, cutensornetMarginalAttributes_t attribute, void* attributeValue, size_t attributeSize) except* nogil cdef cutensornetStatus_t _cutensornetSamplerGetInfo(const cutensornetHandle_t handle, const cutensornetStateSampler_t tensorNetworkSampler, cutensornetSamplerAttributes_t attribute, void* attributeValue, size_t attributeSize) except* nogil +cdef cutensornetStatus_t _cutensornetStateApplyUnitaryChannel(const cutensornetHandle_t handle, cutensornetState_t tensorNetworkState, int32_t numStateModes, const int32_t* stateModes, int32_t numTensors, void* tensorData[], const int64_t* tensorModeStrides, const double probabilities[], int64_t* channelId) except* nogil +cdef cutensornetStatus_t _cutensornetStateCaptureMPS(const cutensornetHandle_t handle, cutensornetState_t tensorNetworkState) except* nogil diff --git a/python/cuquantum/cutensornet/_internal/cutensornet_linux.pyx b/python/cuquantum/cutensornet/_internal/cutensornet_linux.pyx index cd54946..510ab93 100644 --- a/python/cuquantum/cutensornet/_internal/cutensornet_linux.pyx +++ b/python/cuquantum/cutensornet/_internal/cutensornet_linux.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: BSD-3-Clause # -# This code was automatically generated across versions from 23.03.0 to 24.08.0. Do not modify it directly. +# This code was automatically generated across versions from 23.03.0 to 24.11.0. Do not modify it directly. 
from libc.stdint cimport intptr_t @@ -148,6 +148,8 @@ cdef void* __cutensornetAccessorGetInfo = NULL cdef void* __cutensornetExpectationGetInfo = NULL cdef void* __cutensornetMarginalGetInfo = NULL cdef void* __cutensornetSamplerGetInfo = NULL +cdef void* __cutensornetStateApplyUnitaryChannel = NULL +cdef void* __cutensornetStateCaptureMPS = NULL cdef void* load_library() except* nogil: @@ -964,6 +966,20 @@ cdef int _check_or_init_cutensornet() except -1 nogil: if handle == NULL: handle = load_library() __cutensornetSamplerGetInfo = dlsym(handle, 'cutensornetSamplerGetInfo') + + global __cutensornetStateApplyUnitaryChannel + __cutensornetStateApplyUnitaryChannel = dlsym(RTLD_DEFAULT, 'cutensornetStateApplyUnitaryChannel') + if __cutensornetStateApplyUnitaryChannel == NULL: + if handle == NULL: + handle = load_library() + __cutensornetStateApplyUnitaryChannel = dlsym(handle, 'cutensornetStateApplyUnitaryChannel') + + global __cutensornetStateCaptureMPS + __cutensornetStateCaptureMPS = dlsym(RTLD_DEFAULT, 'cutensornetStateCaptureMPS') + if __cutensornetStateCaptureMPS == NULL: + if handle == NULL: + handle = load_library() + __cutensornetStateCaptureMPS = dlsym(handle, 'cutensornetStateCaptureMPS') __py_cutensornet_init = True return 0 @@ -1314,6 +1330,12 @@ cpdef dict _inspect_function_pointers(): global __cutensornetSamplerGetInfo data["__cutensornetSamplerGetInfo"] = __cutensornetSamplerGetInfo + + global __cutensornetStateApplyUnitaryChannel + data["__cutensornetStateApplyUnitaryChannel"] = __cutensornetStateApplyUnitaryChannel + + global __cutensornetStateCaptureMPS + data["__cutensornetStateCaptureMPS"] = __cutensornetStateCaptureMPS return data @@ -2460,3 +2482,23 @@ cdef cutensornetStatus_t _cutensornetSamplerGetInfo(const cutensornetHandle_t ha raise FunctionNotFoundError("function cutensornetSamplerGetInfo is not found") return (__cutensornetSamplerGetInfo)( handle, tensorNetworkSampler, attribute, attributeValue, attributeSize) + + +cdef cutensornetStatus_t _cutensornetStateApplyUnitaryChannel(const cutensornetHandle_t handle, cutensornetState_t tensorNetworkState, int32_t numStateModes, const int32_t* stateModes, int32_t numTensors, void* tensorData[], const int64_t* tensorModeStrides, const double probabilities[], int64_t* channelId) except* nogil: + global __cutensornetStateApplyUnitaryChannel + _check_or_init_cutensornet() + if __cutensornetStateApplyUnitaryChannel == NULL: + with gil: + raise FunctionNotFoundError("function cutensornetStateApplyUnitaryChannel is not found") + return (__cutensornetStateApplyUnitaryChannel)( + handle, tensorNetworkState, numStateModes, stateModes, numTensors, tensorData, tensorModeStrides, probabilities, channelId) + + +cdef cutensornetStatus_t _cutensornetStateCaptureMPS(const cutensornetHandle_t handle, cutensornetState_t tensorNetworkState) except* nogil: + global __cutensornetStateCaptureMPS + _check_or_init_cutensornet() + if __cutensornetStateCaptureMPS == NULL: + with gil: + raise FunctionNotFoundError("function cutensornetStateCaptureMPS is not found") + return (__cutensornetStateCaptureMPS)( + handle, tensorNetworkState) diff --git a/python/cuquantum/cutensornet/_internal/decomposition_utils.py b/python/cuquantum/cutensornet/_internal/decomposition_utils.py index 814fdae..cc7a868 100644 --- a/python/cuquantum/cutensornet/_internal/decomposition_utils.py +++ b/python/cuquantum/cutensornet/_internal/decomposition_utils.py @@ -9,6 +9,7 @@ import logging import numpy +import cupy as cp from . import einsum_parser from . 
import formatters @@ -17,6 +18,7 @@ from . import utils from .. import cutensornet as cutn from .. import memory +from ..configuration import NetworkOptions, MemoryLimitExceeded DECOMPOSITION_DTYPE_NAMES = ('float32', 'float64', 'complex64', 'complex128') @@ -131,7 +133,6 @@ def parse_decomposition(subscripts, *operands): Returns wrapped operands, mapped inputs and output, size dictionary based on internal mode numbers, the forward as well as the reverse mode maps, and the largest mid extent expected for the decomposition. """ - inputs, outputs = parse_decomposition_subscripts(subscripts) num_operand, num_input = len(operands), len(inputs) if num_operand != num_input: @@ -278,6 +279,7 @@ def parse_svd_config(handle, svd_config, svd_method, logger=None): if logger is not None: logger.info(f"The SVDConfig attribute '{cutn.TensorSVDConfigAttribute.ALGO_PARAMS}' has been set to {algo_params}.") + def get_svd_info_dict(handle, svd_info): """ Parse the information in SVDInfo in a dictionary object. @@ -322,7 +324,12 @@ def parse_decompose_operands_options(options, wrapped_operands, stream, allowed_ dtype_name = utils.get_operands_dtype(wrapped_operands) if allowed_dtype_names is not None and dtype_name not in allowed_dtype_names: raise ValueError(f"dtype {dtype_name} not supported") - compute_type = options.compute_type if options.compute_type is not None else typemaps.NAME_TO_COMPUTE_TYPE[dtype_name] + + # compute_type is only meaningful for NetworkOptions; for decomposition options it must be None + if options.__class__.__name__ == 'NetworkOptions': + compute_type = options.compute_type if options.compute_type is not None else typemaps.NAME_TO_COMPUTE_TYPE[dtype_name] + else: + compute_type = None stream_holder = utils.get_or_create_stream(options.device_id, stream, package) @@ -333,7 +340,7 @@ def parse_decompose_operands_options(options, wrapped_operands, stream, allowed_ logger.info("Input data transfer finished") allocator = options.allocator if options.allocator is not None else memory._MEMORY_MANAGER[package](device_id, logger) - + internal_options = options.__class__(device_id=device_id, logger=logger, handle=handle, @@ -345,17 +352,31 @@ def parse_decompose_operands_options(options, wrapped_operands, stream, allowed_ return wrapped_operands, internal_options, own_handle, operands_location, stream_holder -def allocate_and_set_workspace(handle, allocator, workspace_desc, pref, mem_space, workspace_kind, device_id, stream_holder, logger, task_name=''): +def allocate_and_set_workspace(options: NetworkOptions, workspace_desc, pref, mem_space, workspace_kind, stream_holder, task_name=''): """ Allocate and set the workspace in the workspace descriptor. + + The ``options`` argument should be properly initialized using :func:`create_operands_and_descriptors`.
+ + Options used: + - options.handle + - options.allocator + - options.device_id + - options.logger + - options.memory_limit """ - workspace_size = cutn.workspace_get_memory_size(handle, workspace_desc, pref, mem_space, workspace_kind) + logger = options.logger + workspace_size = cutn.workspace_get_memory_size(options.handle, workspace_desc, pref, mem_space, workspace_kind) + _device = cp.cuda.Device(options.device_id) + _memory_limit = utils.get_memory_limit(options.memory_limit, _device) + if _memory_limit < workspace_size: + raise MemoryLimitExceeded(_memory_limit, workspace_size, options.device_id) # Allocate and set workspace if mem_space == cutn.Memspace.DEVICE: - with utils.device_ctx(device_id), stream_holder.ctx: + with utils.device_ctx(options.device_id), stream_holder.ctx: try: logger.debug(f"Allocating device memory for {task_name}") - workspace_ptr = allocator.memalloc(workspace_size) + workspace_ptr = options.allocator.memalloc(workspace_size) except TypeError as e: message = "The method 'memalloc' in the allocator object must conform to the interface in the "\ "'BaseCUDAMemoryManager' protocol." @@ -363,7 +384,7 @@ def allocate_and_set_workspace(handle, allocator, workspace_desc, pref, mem_spac logger.debug(f"Finished allocating device memory of size {formatters.MemoryStr(workspace_size)} for decomposition in the context of stream {stream_holder.obj}.") device_ptr = utils.get_ptr_from_memory_pointer(workspace_ptr) - cutn.workspace_set_memory(handle, workspace_desc, mem_space, workspace_kind, device_ptr, workspace_size) + cutn.workspace_set_memory(options.handle, workspace_desc, mem_space, workspace_kind, device_ptr, workspace_size) logger.debug(f"The workspace memory (device pointer = {device_ptr}) has been set in the workspace descriptor.") return workspace_ptr elif workspace_size != 0: @@ -371,7 +392,7 @@ def allocate_and_set_workspace(handle, allocator, workspace_desc, pref, mem_spac logger.debug(f"Allocating host memory for {task_name}") workspace_host = numpy.empty(workspace_size, dtype=numpy.int8) logger.debug(f"Finished allocating host memory of size {formatters.MemoryStr(workspace_size)} for decomposition.") - cutn.workspace_set_memory(handle, workspace_desc, mem_space, workspace_kind, workspace_host.ctypes.data, workspace_size) + cutn.workspace_set_memory(options.handle, workspace_desc, mem_space, workspace_kind, workspace_host.ctypes.data, workspace_size) logger.debug(f"The workspace memory (host pointer = {workspace_host.ctypes.data}) has been set in the workspace descriptor.") return workspace_host else: @@ -432,6 +453,7 @@ def create_operands_and_descriptors( return input_tensor_descriptors, output_operands, output_tensor_descriptors, s, s_ptr + def get_return_operand_data(tensor, target_location, stream_holder): """ Given wrapped tensors, fetch the return operands based on target location. diff --git a/python/cuquantum/cutensornet/circuit_converter.py b/python/cuquantum/cutensornet/circuit_converter.py index 9cf8070..dfdb52e 100644 --- a/python/cuquantum/cutensornet/circuit_converter.py +++ b/python/cuquantum/cutensornet/circuit_converter.py @@ -31,7 +31,7 @@ class CircuitToEinsum: dtype : The datatype for the output tensor operands. If not specified, double complex is used. backend: The backend for the output tensor operands. If not specified, ``cupy`` is used. - Notes: + .. note:: - For :class:`qiskit.QuantumCircuit`, composite gates will be decomposed into either Qiskit standard gates or customized unitary gates. 
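The memory-limit check added above converts an over-limit workspace request into a catchable MemoryLimitExceeded error (defined in configuration.py just below) rather than an opaque allocation failure. A minimal sketch of handling it from user code, in the spirit of the example12-*mem_limit_handling samples added by this patch, follows; it assumes tensor.decompose accepts a dict of NetworkOptions-style parameters including memory_limit, as the high-level contraction APIs do, and that the exception is importable from cuquantum.cutensornet.configuration.

import cupy as cp
from cuquantum.cutensornet import tensor
from cuquantum.cutensornet.configuration import MemoryLimitExceeded

a = cp.random.random((64, 64)) + 1j * cp.random.random((64, 64))

try:
    # Deliberately tiny limit (in bytes) to force the new error path.
    q, r = tensor.decompose("ij->ik,kj", a, options={"memory_limit": 1024})
except MemoryLimitExceeded as e:
    # e.limit is the computed byte limit; e.requirement is the workspace needed.
    print(f"device {e.device_id}: need {e.requirement} B, limit was {e.limit} B")
    q, r = tensor.decompose("ij->ik,kj", a,
                            options={"memory_limit": e.requirement})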
diff --git a/python/cuquantum/cutensornet/configuration.py b/python/cuquantum/cutensornet/configuration.py
index 3bf9562..d0aa1d5 100644
--- a/python/cuquantum/cutensornet/configuration.py
+++ b/python/cuquantum/cutensornet/configuration.py
@@ -7,7 +7,7 @@
 """
 
 __all__ = ['NetworkOptions', 'OptimizerInfo', 'OptimizerOptions', 'PathFinderOptions',
-           'ReconfigOptions', 'SlicerOptions']
+           'ReconfigOptions', 'SlicerOptions', 'MemoryLimitExceeded']
 
 import collections
 from dataclasses import dataclass, fields
@@ -213,4 +213,39 @@ def __str__(self):
             s += f"""
     Intermediate tensor mode labels = {formatters.array2string(intermediate_modes)}"""
-        return s
\ No newline at end of file
+        return s
+
+
+class MemoryLimitExceeded(MemoryError):
+    """
+    This exception is raised when an operation requires more device memory than the limit specified in the operation options.
+
+    Attributes:
+        - limit: int
+            The memory limit in bytes. If the memory limit in the options was given as a str, this is the computed value.
+        - requirement: int
+            The memory required to perform the operation.
+        - device_id: int
+            The device selected to run the operation.
+    """
+    limit: int
+    device_id: int
+    requirement: int
+
+    def __init__(self,
+                 limit: int,
+                 requirement: int,
+                 device_id: int,
+                 specified: Optional[Union[str, int]] = None):
+        message = f"""GPU memory limit exceeded. Device id: {device_id}.
+The memory limit is {limit}, while the minimum workspace size needed is {requirement}.
+"""
+        if specified is not None:
+            message += f"Memory limit specified by options: {specified}."
+
+        super().__init__(message)
+        self.limit = limit
+        self.requirement = requirement
+        self.device_id = device_id
diff --git a/python/cuquantum/cutensornet/cutensornet.pxd b/python/cuquantum/cutensornet/cutensornet.pxd
index fe18a9d..35d0267 100644
--- a/python/cuquantum/cutensornet/cutensornet.pxd
+++ b/python/cuquantum/cutensornet/cutensornet.pxd
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 #
-# This code was automatically generated across versions from 23.03.0 to 24.08.0. Do not modify it directly.
+# This code was automatically generated across versions from 23.03.0 to 24.11.0. Do not modify it directly.
 
 from libc.stdint cimport intptr_t
 
@@ -205,3 +205,5 @@ cpdef accessor_get_info(intptr_t handle, intptr_t tensor_network_accessor, int attribute, intptr_t attribute_value, size_t attribute_size)
 cpdef expectation_get_info(intptr_t handle, intptr_t tensor_network_expectation, int attribute, intptr_t attribute_value, size_t attribute_size)
 cpdef marginal_get_info(intptr_t handle, intptr_t tensor_network_marginal, int attribute, intptr_t attribute_value, size_t attribute_size)
 cpdef sampler_get_info(intptr_t handle, intptr_t tensor_network_sampler, int attribute, intptr_t attribute_value, size_t attribute_size)
+cpdef int64_t state_apply_unitary_channel(intptr_t handle, intptr_t tensor_network_state, int32_t num_state_modes, state_modes, int32_t num_tensors, tensor_data, tensor_mode_strides, probabilities) except? -1
+cpdef state_capture_mps(intptr_t handle, intptr_t tensor_network_state)
diff --git a/python/cuquantum/cutensornet/cutensornet.pyx b/python/cuquantum/cutensornet/cutensornet.pyx
index dd3b329..1b703c7 100644
--- a/python/cuquantum/cutensornet/cutensornet.pyx
+++ b/python/cuquantum/cutensornet/cutensornet.pyx
@@ -2,7 +2,7 @@
 #
 # SPDX-License-Identifier: BSD-3-Clause
 #
-# This code was automatically generated across versions from 23.03.0 to 24.08.0. Do not modify it directly.
+# This code was automatically generated across versions from 23.03.0 to 24.11.0. Do not modify it directly.
cimport cython cimport cpython @@ -393,21 +393,21 @@ cpdef intptr_t create_network_descriptor(intptr_t handle, int32_t num_inputs, nu - an :class:`int` as the pointer address to the nested sequence, or - a Python sequence of :class:`int`\s, each of which is a pointer address - to a valid sequence, or + to a valid sequence of 'int64_t', or - a nested Python sequence of ``int64_t``. strides_in (object): Array of size ``num_inputs``; ``strides_in[i]`` has ``num_modes_in[i]`` many entries with ``strides_in[i][j]`` (``j`` < ``num_modes_in[i]``) corresponding to the linearized offset -- in physical memory -- between two logically-neighboring elements w.r.t the j-th mode of tensor ``i``. It can be: - an :class:`int` as the pointer address to the nested sequence, or - a Python sequence of :class:`int`\s, each of which is a pointer address - to a valid sequence, or + to a valid sequence of 'int64_t', or - a nested Python sequence of ``int64_t``. modes_in (object): Array of size ``num_inputs``; ``modes_in[i]`` has ``num_modes_in[i]`` many entries -- each entry corresponds to a mode. Each mode that does not appear in the input tensor is implicitly contracted. It can be: - an :class:`int` as the pointer address to the nested sequence, or - a Python sequence of :class:`int`\s, each of which is a pointer address - to a valid sequence, or + to a valid sequence of 'int32_t', or - a nested Python sequence of ``int32_t``. qualifiers_in (object): Array of size ``num_inputs``; ``qualifiers_in[i]`` denotes the qualifiers of i-th input tensor. Refer to ``cutensornetTensorQualifiers_t``. It can be: @@ -439,22 +439,22 @@ cpdef intptr_t create_network_descriptor(intptr_t handle, int32_t num_inputs, nu .. seealso:: `cutensornetCreateNetworkDescriptor` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _num_modes_in_ = \ - get_resource_ptr[int32_t](num_modes_in, NULL) - cdef nested_resource[ int64_t ] _extents_in_ = \ - get_nested_resource_ptr[int64_t](extents_in, NULL) - cdef nested_resource[ int64_t ] _strides_in_ = \ - get_nested_resource_ptr[int64_t](strides_in, NULL) - cdef nested_resource[ int32_t ] _modes_in_ = \ - get_nested_resource_ptr[int32_t](modes_in, NULL) - cdef nullable_unique_ptr[ vector[cutensornetTensorQualifiers_t] ] _qualifiers_in_ = \ - get_resource_ptr[cutensornetTensorQualifiers_t](qualifiers_in, NULL) - cdef nullable_unique_ptr[ vector[int64_t] ] _extents_out_ = \ - get_resource_ptr[int64_t](extents_out, NULL) - cdef nullable_unique_ptr[ vector[int64_t] ] _strides_out_ = \ - get_resource_ptr[int64_t](strides_out, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _modes_out_ = \ - get_resource_ptr[int32_t](modes_out, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _num_modes_in_ + get_resource_ptr[int32_t](_num_modes_in_, num_modes_in, NULL) + cdef nested_resource[ int64_t ] _extents_in_ + get_nested_resource_ptr[int64_t](_extents_in_, extents_in, NULL) + cdef nested_resource[ int64_t ] _strides_in_ + get_nested_resource_ptr[int64_t](_strides_in_, strides_in, NULL) + cdef nested_resource[ int32_t ] _modes_in_ + get_nested_resource_ptr[int32_t](_modes_in_, modes_in, NULL) + cdef nullable_unique_ptr[ vector[cutensornetTensorQualifiers_t] ] _qualifiers_in_ + get_resource_ptr[cutensornetTensorQualifiers_t](_qualifiers_in_, qualifiers_in, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _extents_out_ + get_resource_ptr[int64_t](_extents_out_, extents_out, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _strides_out_ + get_resource_ptr[int64_t](_strides_out_, strides_out, NULL) + cdef 
nullable_unique_ptr[ vector[int32_t] ] _modes_out_ + get_resource_ptr[int32_t](_modes_out_, modes_out, NULL) cdef NetworkDescriptor desc_net with nogil: status = cutensornetCreateNetworkDescriptor(handle, num_inputs, (_num_modes_in_.data()), (_extents_in_.ptrs.data()), (_strides_in_.ptrs.data()), (_modes_in_.ptrs.data()), (_qualifiers_in_.data()), num_modes_out, (_extents_out_.data()), (_strides_out_.data()), (_modes_out_.data()), data_type, <_ComputeType>compute_type, &desc_net) @@ -971,8 +971,8 @@ cpdef contraction_autotune(intptr_t handle, intptr_t plan, raw_data_in, intptr_t .. seealso:: `cutensornetContractionAutotune` """ - cdef nullable_unique_ptr[ vector[void*] ] _raw_data_in_ = \ - get_resource_ptrs[void](raw_data_in, NULL) + cdef nullable_unique_ptr[ vector[void*] ] _raw_data_in_ + get_resource_ptrs[void](_raw_data_in_, raw_data_in, NULL) with nogil: status = cutensornetContractionAutotune(handle, plan, (_raw_data_in_.data()), raw_data_out, work_desc, pref, stream) check_status(status) @@ -1126,8 +1126,8 @@ cpdef contract_slices(intptr_t handle, intptr_t plan, raw_data_in, intptr_t raw_ .. seealso:: `cutensornetContractSlices` """ - cdef nullable_unique_ptr[ vector[void*] ] _raw_data_in_ = \ - get_resource_ptrs[void](raw_data_in, NULL) + cdef nullable_unique_ptr[ vector[void*] ] _raw_data_in_ + get_resource_ptrs[void](_raw_data_in_, raw_data_in, NULL) with nogil: status = cutensornetContractSlices(handle, plan, (_raw_data_in_.data()), raw_data_out, accumulate_output, work_desc, slice_group, stream) check_status(status) @@ -1161,12 +1161,12 @@ cpdef intptr_t create_tensor_descriptor(intptr_t handle, int32_t num_modes, exte .. seealso:: `cutensornetCreateTensorDescriptor` """ - cdef nullable_unique_ptr[ vector[int64_t] ] _extents_ = \ - get_resource_ptr[int64_t](extents, NULL) - cdef nullable_unique_ptr[ vector[int64_t] ] _strides_ = \ - get_resource_ptr[int64_t](strides, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _modes_ = \ - get_resource_ptr[int32_t](modes, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _extents_ + get_resource_ptr[int64_t](_extents_, extents, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _strides_ + get_resource_ptr[int64_t](_strides_, strides, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _modes_ + get_resource_ptr[int32_t](_modes_, modes, NULL) cdef TensorDescriptor desc_tensor with nogil: status = cutensornetCreateTensorDescriptor(handle, num_modes, (_extents_.data()), (_strides_.data()), (_modes_.data()), data_type, &desc_tensor) @@ -1764,10 +1764,10 @@ cpdef compute_gradients_backward(intptr_t handle, intptr_t plan, raw_data_in, in .. seealso:: `cutensornetComputeGradientsBackward` """ - cdef nullable_unique_ptr[ vector[void*] ] _raw_data_in_ = \ - get_resource_ptrs[void](raw_data_in, NULL) - cdef nullable_unique_ptr[ vector[void*] ] _gradients_ = \ - get_resource_ptrs[void](gradients, NULL) + cdef nullable_unique_ptr[ vector[void*] ] _raw_data_in_ + get_resource_ptrs[void](_raw_data_in_, raw_data_in, NULL) + cdef nullable_unique_ptr[ vector[void*] ] _gradients_ + get_resource_ptrs[void](_gradients_, gradients, NULL) with nogil: status = cutensornetComputeGradientsBackward(handle, plan, (_raw_data_in_.data()), output_gradient, (_gradients_.data()), accumulate_output, work_desc, stream) check_status(status) @@ -1792,8 +1792,8 @@ cpdef intptr_t create_state(intptr_t handle, int purity, int32_t num_state_modes .. 
seealso:: `cutensornetCreateState` """ - cdef nullable_unique_ptr[ vector[int64_t] ] _state_mode_extents_ = \ - get_resource_ptr[int64_t](state_mode_extents, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _state_mode_extents_ + get_resource_ptr[int64_t](_state_mode_extents_, state_mode_extents, NULL) cdef State tensor_network_state with nogil: status = cutensornetCreateState(handle, <_StatePurity>purity, num_state_modes, (_state_mode_extents_.data()), data_type, &tensor_network_state) @@ -1828,10 +1828,10 @@ cpdef int64_t state_apply_tensor(intptr_t handle, intptr_t tensor_network_state, .. seealso:: `cutensornetStateApplyTensor` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _state_modes_ = \ - get_resource_ptr[int32_t](state_modes, NULL) - cdef nullable_unique_ptr[ vector[int64_t] ] _tensor_mode_strides_ = \ - get_resource_ptr[int64_t](tensor_mode_strides, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _state_modes_ + get_resource_ptr[int32_t](_state_modes_, state_modes, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _tensor_mode_strides_ + get_resource_ptr[int64_t](_tensor_mode_strides_, tensor_mode_strides, NULL) cdef int64_t tensor_id with nogil: status = cutensornetStateApplyTensor(handle, tensor_network_state, num_state_modes, (_state_modes_.data()), tensor_data, (_tensor_mode_strides_.data()), immutable, adjoint, unitary, &tensor_id) @@ -1898,12 +1898,12 @@ cpdef intptr_t create_marginal(intptr_t handle, intptr_t tensor_network_state, i .. seealso:: `cutensornetCreateMarginal` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _marginal_modes_ = \ - get_resource_ptr[int32_t](marginal_modes, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _projected_modes_ = \ - get_resource_ptr[int32_t](projected_modes, NULL) - cdef nullable_unique_ptr[ vector[int64_t] ] _marginal_tensor_strides_ = \ - get_resource_ptr[int64_t](marginal_tensor_strides, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _marginal_modes_ + get_resource_ptr[int32_t](_marginal_modes_, marginal_modes, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _projected_modes_ + get_resource_ptr[int32_t](_projected_modes_, projected_modes, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _marginal_tensor_strides_ + get_resource_ptr[int64_t](_marginal_tensor_strides_, marginal_tensor_strides, NULL) cdef StateMarginal tensor_network_marginal with nogil: status = cutensornetCreateMarginal(handle, tensor_network_state, num_marginal_modes, (_marginal_modes_.data()), num_projected_modes, (_projected_modes_.data()), (_marginal_tensor_strides_.data()), &tensor_network_marginal) @@ -1990,8 +1990,8 @@ cpdef marginal_compute(intptr_t handle, intptr_t tensor_network_marginal, projec .. seealso:: `cutensornetMarginalCompute` """ - cdef nullable_unique_ptr[ vector[int64_t] ] _projected_mode_values_ = \ - get_resource_ptr[int64_t](projected_mode_values, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _projected_mode_values_ + get_resource_ptr[int64_t](_projected_mode_values_, projected_mode_values, NULL) with nogil: status = cutensornetMarginalCompute(handle, tensor_network_marginal, (_projected_mode_values_.data()), work_desc, marginal_tensor, cuda_stream) check_status(status) @@ -2028,8 +2028,8 @@ cpdef intptr_t create_sampler(intptr_t handle, intptr_t tensor_network_state, in .. 
seealso:: `cutensornetCreateSampler` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _modes_to_sample_ = \ - get_resource_ptr[int32_t](modes_to_sample, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _modes_to_sample_ + get_resource_ptr[int32_t](_modes_to_sample_, modes_to_sample, NULL) cdef StateSampler tensor_network_sampler with nogil: status = cutensornetCreateSampler(handle, tensor_network_state, num_modes_to_sample, (_modes_to_sample_.data()), &tensor_network_sampler) @@ -2142,23 +2142,23 @@ cpdef state_finalize_mps(intptr_t handle, intptr_t tensor_network_state, int bou - an :class:`int` as the pointer address to the nested sequence, or - a Python sequence of :class:`int`\s, each of which is a pointer address - to a valid sequence, or + to a valid sequence of 'int64_t', or - a nested Python sequence of ``int64_t``. strides_out (object): Array of size ``nStateModes`` specifying the strides of all tensors defining the target MPS representation. Similar to ``extents_out``, ``strides_out`` is also expected to be consistent with the mode order of each MPS tensor. If NULL, the default generalized column-major strides will be assumed. It can be: - an :class:`int` as the pointer address to the nested sequence, or - a Python sequence of :class:`int`\s, each of which is a pointer address - to a valid sequence, or + to a valid sequence of 'int64_t', or - a nested Python sequence of ``int64_t``. .. seealso:: `cutensornetStateFinalizeMPS` """ - cdef nested_resource[ int64_t ] _extents_out_ = \ - get_nested_resource_ptr[int64_t](extents_out, NULL) - cdef nested_resource[ int64_t ] _strides_out_ = \ - get_nested_resource_ptr[int64_t](strides_out, NULL) + cdef nested_resource[ int64_t ] _extents_out_ + get_nested_resource_ptr[int64_t](_extents_out_, extents_out, NULL) + cdef nested_resource[ int64_t ] _strides_out_ + get_nested_resource_ptr[int64_t](_strides_out_, strides_out, NULL) with nogil: status = cutensornetStateFinalizeMPS(handle, tensor_network_state, <_BoundaryCondition>boundary_condition, (_extents_out_.ptrs.data()), (_strides_out_.ptrs.data())) check_status(status) @@ -2261,8 +2261,8 @@ cpdef intptr_t create_network_operator(intptr_t handle, int32_t num_state_modes, .. seealso:: `cutensornetCreateNetworkOperator` """ - cdef nullable_unique_ptr[ vector[int64_t] ] _state_mode_extents_ = \ - get_resource_ptr[int64_t](state_mode_extents, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _state_mode_extents_ + get_resource_ptr[int64_t](_state_mode_extents_, state_mode_extents, NULL) cdef NetworkOperator tensor_network_operator with nogil: status = cutensornetCreateNetworkOperator(handle, num_state_modes, (_state_mode_extents_.data()), data_type, &tensor_network_operator) @@ -2287,14 +2287,14 @@ cpdef int64_t network_operator_append_product(intptr_t handle, intptr_t tensor_n - an :class:`int` as the pointer address to the nested sequence, or - a Python sequence of :class:`int`\s, each of which is a pointer address - to a valid sequence, or + to a valid sequence of 'int32_t', or - a nested Python sequence of ``int32_t``. tensor_mode_strides (object): Tensor mode strides for each tensor factor (length = ``num_state_modes`` * 2). If NULL, the default generalized column-major strides will be used. It can be: - an :class:`int` as the pointer address to the nested sequence, or - a Python sequence of :class:`int`\s, each of which is a pointer address - to a valid sequence, or + to a valid sequence of 'int64_t', or - a nested Python sequence of ``int64_t``. 
tensor_data (object): Tensor data stored in GPU memory for each tensor factor. It can be: @@ -2309,14 +2309,14 @@ cpdef int64_t network_operator_append_product(intptr_t handle, intptr_t tensor_n .. seealso:: `cutensornetNetworkOperatorAppendProduct` """ cdef cuDoubleComplex _coefficient_ = cuDoubleComplex(coefficient.real, coefficient.imag) - cdef nullable_unique_ptr[ vector[int32_t] ] _num_state_modes_ = \ - get_resource_ptr[int32_t](num_state_modes, NULL) - cdef nested_resource[ int32_t ] _state_modes_ = \ - get_nested_resource_ptr[int32_t](state_modes, NULL) - cdef nested_resource[ int64_t ] _tensor_mode_strides_ = \ - get_nested_resource_ptr[int64_t](tensor_mode_strides, NULL) - cdef nullable_unique_ptr[ vector[void*] ] _tensor_data_ = \ - get_resource_ptrs[void](tensor_data, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _num_state_modes_ + get_resource_ptr[int32_t](_num_state_modes_, num_state_modes, NULL) + cdef nested_resource[ int32_t ] _state_modes_ + get_nested_resource_ptr[int32_t](_state_modes_, state_modes, NULL) + cdef nested_resource[ int64_t ] _tensor_mode_strides_ + get_nested_resource_ptr[int64_t](_tensor_mode_strides_, tensor_mode_strides, NULL) + cdef nullable_unique_ptr[ vector[void*] ] _tensor_data_ + get_resource_ptrs[void](_tensor_data_, tensor_data, NULL) cdef int64_t component_id with nogil: status = cutensornetNetworkOperatorAppendProduct(handle, tensor_network_operator, _coefficient_, num_tensors, (_num_state_modes_.data()), (_state_modes_.ptrs.data()), (_tensor_mode_strides_.ptrs.data()), (_tensor_data_.data()), &component_id) @@ -2360,10 +2360,10 @@ cpdef intptr_t create_accessor(intptr_t handle, intptr_t tensor_network_state, i .. seealso:: `cutensornetCreateAccessor` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _projected_modes_ = \ - get_resource_ptr[int32_t](projected_modes, NULL) - cdef nullable_unique_ptr[ vector[int64_t] ] _amplitudes_tensor_strides_ = \ - get_resource_ptr[int64_t](amplitudes_tensor_strides, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _projected_modes_ + get_resource_ptr[int32_t](_projected_modes_, projected_modes, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _amplitudes_tensor_strides_ + get_resource_ptr[int64_t](_amplitudes_tensor_strides_, amplitudes_tensor_strides, NULL) cdef StateAccessor tensor_network_accessor with nogil: status = cutensornetCreateAccessor(handle, tensor_network_state, num_projected_modes, (_projected_modes_.data()), (_amplitudes_tensor_strides_.data()), &tensor_network_accessor) @@ -2451,8 +2451,8 @@ cpdef accessor_compute(intptr_t handle, intptr_t tensor_network_accessor, projec .. seealso:: `cutensornetAccessorCompute` """ - cdef nullable_unique_ptr[ vector[int64_t] ] _projected_mode_values_ = \ - get_resource_ptr[int64_t](projected_mode_values, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _projected_mode_values_ + get_resource_ptr[int64_t](_projected_mode_values_, projected_mode_values, NULL) with nogil: status = cutensornetAccessorCompute(handle, tensor_network_accessor, (_projected_mode_values_.data()), work_desc, amplitudes_tensor, state_norm, cuda_stream) check_status(status) @@ -2597,7 +2597,7 @@ cpdef int64_t state_apply_tensor_operator(intptr_t handle, intptr_t tensor_netwo - a Python sequence of ``int32_t``. tensor_data (intptr_t): Elements of the tensor operator (must be of the same data type as the elements of the state tensor). 
- tensor_mode_strides (object): Strides of the tensor operator data layout (note that the tensor operator has twice more modes than the number of state modes it acts on). Passing NULL will assume the default generalized columnwise layout. It can be: + tensor_mode_strides (object): Strides of the tensor operator data layout (note that the tensor operator has twice more modes than the number of state modes it acts on). Passing NULL will assume the default generalized columnwise storage layout. It can be: - an :class:`int` as the pointer address to the array, or - a Python sequence of ``int64_t``. @@ -2611,10 +2611,10 @@ cpdef int64_t state_apply_tensor_operator(intptr_t handle, intptr_t tensor_netwo .. seealso:: `cutensornetStateApplyTensorOperator` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _state_modes_ = \ - get_resource_ptr[int32_t](state_modes, NULL) - cdef nullable_unique_ptr[ vector[int64_t] ] _tensor_mode_strides_ = \ - get_resource_ptr[int64_t](tensor_mode_strides, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _state_modes_ + get_resource_ptr[int32_t](_state_modes_, state_modes, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _tensor_mode_strides_ + get_resource_ptr[int64_t](_tensor_mode_strides_, tensor_mode_strides, NULL) cdef int64_t tensor_id with nogil: status = cutensornetStateApplyTensorOperator(handle, tensor_network_state, num_state_modes, (_state_modes_.data()), tensor_data, (_tensor_mode_strides_.data()), immutable, adjoint, unitary, &tensor_id) @@ -2646,7 +2646,7 @@ cpdef int64_t state_apply_controlled_tensor_operator(intptr_t handle, intptr_t t - a Python sequence of ``int32_t``. tensor_data (intptr_t): Elements of the target tensor of the controlled tensor operator (must be of the same data type as the elements of the state tensor). - tensor_mode_strides (object): Strides of the tensor operator data layout (note that the tensor operator has twice more modes than the number of the target state modes it acts on). Passing NULL will assume the default generalized columnwise layout. It can be: + tensor_mode_strides (object): Strides of the tensor operator data layout (note that the tensor operator has twice more modes than the number of the target state modes it acts on). Passing NULL will assume the default generalized columnwise storage layout. It can be: - an :class:`int` as the pointer address to the array, or - a Python sequence of ``int64_t``. @@ -2660,14 +2660,14 @@ cpdef int64_t state_apply_controlled_tensor_operator(intptr_t handle, intptr_t t .. 
seealso:: `cutensornetStateApplyControlledTensorOperator` """ - cdef nullable_unique_ptr[ vector[int32_t] ] _state_control_modes_ = \ - get_resource_ptr[int32_t](state_control_modes, NULL) - cdef nullable_unique_ptr[ vector[int64_t] ] _state_control_values_ = \ - get_resource_ptr[int64_t](state_control_values, NULL) - cdef nullable_unique_ptr[ vector[int32_t] ] _state_target_modes_ = \ - get_resource_ptr[int32_t](state_target_modes, NULL) - cdef nullable_unique_ptr[ vector[int64_t] ] _tensor_mode_strides_ = \ - get_resource_ptr[int64_t](tensor_mode_strides, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _state_control_modes_ + get_resource_ptr[int32_t](_state_control_modes_, state_control_modes, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _state_control_values_ + get_resource_ptr[int64_t](_state_control_values_, state_control_values, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _state_target_modes_ + get_resource_ptr[int32_t](_state_target_modes_, state_target_modes, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _tensor_mode_strides_ + get_resource_ptr[int64_t](_tensor_mode_strides_, tensor_mode_strides, NULL) cdef int64_t tensor_id with nogil: status = cutensornetStateApplyControlledTensorOperator(handle, tensor_network_state, num_control_modes, (_state_control_modes_.data()), (_state_control_values_.data()), num_target_modes, (_state_target_modes_.data()), tensor_data, (_tensor_mode_strides_.data()), immutable, adjoint, unitary, &tensor_id) @@ -2726,14 +2726,14 @@ cpdef state_initialize_mps(intptr_t handle, intptr_t tensor_network_state, int b - an :class:`int` as the pointer address to the nested sequence, or - a Python sequence of :class:`int`\s, each of which is a pointer address - to a valid sequence, or + to a valid sequence of 'int64_t', or - a nested Python sequence of ``int64_t``. strides_in (object): Array of size ``nStateModes`` specifying the strides of all tensors in the chosen MPS representation. Similar to ``extents_in``, ``strides_in`` is also expected to be consistent with the mode order of each MPS tensor. If NULL, the default generalized column-major strides will be assumed. It can be: - an :class:`int` as the pointer address to the nested sequence, or - a Python sequence of :class:`int`\s, each of which is a pointer address - to a valid sequence, or + to a valid sequence of 'int64_t', or - a nested Python sequence of ``int64_t``. state_tensors_in (object): Array of size ``nStateModes`` specifying the data for all tensors defining the chosen MPS representation. If NULL, the initial MPS-factorized state will represent the vacuum state. It can be: @@ -2744,12 +2744,12 @@ cpdef state_initialize_mps(intptr_t handle, intptr_t tensor_network_state, int b .. 
seealso:: `cutensornetStateInitializeMPS` """ - cdef nested_resource[ int64_t ] _extents_in_ = \ - get_nested_resource_ptr[int64_t](extents_in, NULL) - cdef nested_resource[ int64_t ] _strides_in_ = \ - get_nested_resource_ptr[int64_t](strides_in, NULL) - cdef nullable_unique_ptr[ vector[void*] ] _state_tensors_in_ = \ - get_resource_ptrs[void](state_tensors_in, NULL) + cdef nested_resource[ int64_t ] _extents_in_ + get_nested_resource_ptr[int64_t](_extents_in_, extents_in, NULL) + cdef nested_resource[ int64_t ] _strides_in_ + get_nested_resource_ptr[int64_t](_strides_in_, strides_in, NULL) + cdef nullable_unique_ptr[ vector[void*] ] _state_tensors_in_ + get_resource_ptrs[void](_state_tensors_in_, state_tensors_in, NULL) with nogil: status = cutensornetStateInitializeMPS(handle, tensor_network_state, <_BoundaryCondition>boundary_condition, (_extents_in_.ptrs.data()), (_strides_in_.ptrs.data()), (_state_tensors_in_.data())) check_status(status) @@ -2792,14 +2792,14 @@ cpdef int64_t network_operator_append_mpo(intptr_t handle, intptr_t tensor_netwo - an :class:`int` as the pointer address to the nested sequence, or - a Python sequence of :class:`int`\s, each of which is a pointer address - to a valid sequence, or + to a valid sequence of 'int64_t', or - a nested Python sequence of ``int64_t``. tensor_mode_strides (object): Storage strides for each MPO tensor or NULL (default generalized column-wise strides). It can be: - an :class:`int` as the pointer address to the nested sequence, or - a Python sequence of :class:`int`\s, each of which is a pointer address - to a valid sequence, or + to a valid sequence of 'int64_t', or - a nested Python sequence of ``int64_t``. tensor_data (object): Tensor data stored in GPU memory for each MPO tensor factor. It can be: @@ -2815,14 +2815,14 @@ cpdef int64_t network_operator_append_mpo(intptr_t handle, intptr_t tensor_netwo .. 
seealso:: `cutensornetNetworkOperatorAppendMPO` """ cdef cuDoubleComplex _coefficient_ = cuDoubleComplex(coefficient.real, coefficient.imag) - cdef nullable_unique_ptr[ vector[int32_t] ] _state_modes_ = \ - get_resource_ptr[int32_t](state_modes, NULL) - cdef nested_resource[ int64_t ] _tensor_mode_extents_ = \ - get_nested_resource_ptr[int64_t](tensor_mode_extents, NULL) - cdef nested_resource[ int64_t ] _tensor_mode_strides_ = \ - get_nested_resource_ptr[int64_t](tensor_mode_strides, NULL) - cdef nullable_unique_ptr[ vector[void*] ] _tensor_data_ = \ - get_resource_ptrs[void](tensor_data, NULL) + cdef nullable_unique_ptr[ vector[int32_t] ] _state_modes_ + get_resource_ptr[int32_t](_state_modes_, state_modes, NULL) + cdef nested_resource[ int64_t ] _tensor_mode_extents_ + get_nested_resource_ptr[int64_t](_tensor_mode_extents_, tensor_mode_extents, NULL) + cdef nested_resource[ int64_t ] _tensor_mode_strides_ + get_nested_resource_ptr[int64_t](_tensor_mode_strides_, tensor_mode_strides, NULL) + cdef nullable_unique_ptr[ vector[void*] ] _tensor_data_ + get_resource_ptrs[void](_tensor_data_, tensor_data, NULL) cdef int64_t component_id with nogil: status = cutensornetNetworkOperatorAppendMPO(handle, tensor_network_operator, _coefficient_, num_state_modes, (_state_modes_.data()), (_tensor_mode_extents_.ptrs.data()), (_tensor_mode_strides_.ptrs.data()), (_tensor_data_.data()), <_BoundaryCondition>boundary_condition, &component_id) @@ -2910,6 +2910,69 @@ cpdef sampler_get_info(intptr_t handle, intptr_t tensor_network_sampler, int att check_status(status) +cpdef int64_t state_apply_unitary_channel(intptr_t handle, intptr_t tensor_network_state, int32_t num_state_modes, state_modes, int32_t num_tensors, tensor_data, tensor_mode_strides, probabilities) except? -1: + """Applies a tensor channel consisting of one or more unitary tensor operators to the tensor network state. + + Args: + handle (intptr_t): cuTensorNet library handle. + tensor_network_state (intptr_t): Tensor network state. + num_state_modes (int32_t): Number of state modes the tensor channel acts on. + state_modes (object): Pointer to the state modes the tensor channel acts on. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int32_t``. + + num_tensors (int32_t): Number of constituting tensor operators defining the tensor channel. + tensor_data (object): Elements of the tensor operators constituting the tensor channel (must be of the same data type as the elements of the state tensor). It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of :class:`int`\s (as pointer addresses). + + tensor_mode_strides (object): Strides of the tensor data storage layout (note that the supplied tensors have twice more modes than the number of state modes they act on). Passing NULL will assume the default generalized columnwise storage layout. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``int64_t``. + + probabilities (object): Probabilities associated with the individual tensor operators. It can be: + + - an :class:`int` as the pointer address to the array, or + - a Python sequence of ``float``. + + + Returns: + int64_t: Unique integer id (for later identification of the tensor channel). + + .. 
seealso:: `cutensornetStateApplyUnitaryChannel` + """ + cdef nullable_unique_ptr[ vector[int32_t] ] _state_modes_ + get_resource_ptr[int32_t](_state_modes_, state_modes, NULL) + cdef nullable_unique_ptr[ vector[void*] ] _tensor_data_ + get_resource_ptrs[void](_tensor_data_, tensor_data, NULL) + cdef nullable_unique_ptr[ vector[int64_t] ] _tensor_mode_strides_ + get_resource_ptr[int64_t](_tensor_mode_strides_, tensor_mode_strides, NULL) + cdef nullable_unique_ptr[ vector[double] ] _probabilities_ + get_resource_ptr[double](_probabilities_, probabilities, NULL) + cdef int64_t channel_id + with nogil: + status = cutensornetStateApplyUnitaryChannel(handle, tensor_network_state, num_state_modes, (_state_modes_.data()), num_tensors, (_tensor_data_.data()), (_tensor_mode_strides_.data()), (_probabilities_.data()), &channel_id) + check_status(status) + return channel_id + + +cpdef state_capture_mps(intptr_t handle, intptr_t tensor_network_state): + """Resets the tensor network state to the MPS state previously computed via ``cutensornetStateCompute``. + + Args: + handle (intptr_t): cuTensorNet library handle. + tensor_network_state (intptr_t): Tensor network state. + + .. seealso:: `cutensornetStateCaptureMPS` + """ + with nogil: + status = cutensornetStateCaptureMPS(handle, tensor_network_state) + check_status(status) + + # for backward compat contraction_optimizer_config_get_attribute_dtype = get_contraction_optimizer_config_attribute_dtype contraction_optimizer_info_get_attribute_dtype = get_contraction_optimizer_info_attribute_dtype diff --git a/python/cuquantum/cutensornet/cycutensornet.pxd b/python/cuquantum/cutensornet/cycutensornet.pxd index 79a95d0..f35f022 100644 --- a/python/cuquantum/cutensornet/cycutensornet.pxd +++ b/python/cuquantum/cutensornet/cycutensornet.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: BSD-3-Clause # -# This code was automatically generated across versions from 23.03.0 to 24.08.0. Do not modify it directly. +# This code was automatically generated across versions from 23.03.0 to 24.11.0. Do not modify it directly. # This layer exposes the C header to Cython as-is. 
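A minimal sketch of driving the new state_apply_unitary_channel binding above, assuming CuPy arrays resident on the active device; the two-qubit state, the bit-flip probabilities, and passing 0 for the strides (NULL, i.e. the default generalized columnwise layout, harmless here since both operators are symmetric) are illustrative only:

import cupy as cp
from cuquantum import cudaDataType
from cuquantum import cutensornet as cutn

handle = cutn.create()
# a two-qubit pure state in double-complex precision
state = cutn.create_state(handle, cutn.StatePurity.PURE, 2, (2, 2), cudaDataType.CUDA_C_64F)

# single-qubit bit-flip channel: identity with p=0.95, Pauli-X with p=0.05
identity = cp.eye(2, dtype=cp.complex128)
pauli_x = cp.asarray([[0, 1], [1, 0]], dtype=cp.complex128)
channel_id = cutn.state_apply_unitary_channel(
    handle, state, 1, (0,),                     # the channel acts on state mode 0
    2, (identity.data.ptr, pauli_x.data.ptr),   # two constituent unitary operators
    0, (0.95, 0.05))                            # NULL strides, operator probabilities

cutn.destroy_state(state)
cutn.destroy(handle)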
from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t @@ -438,3 +438,5 @@ cdef cutensornetStatus_t cutensornetAccessorGetInfo(const cutensornetHandle_t ha cdef cutensornetStatus_t cutensornetExpectationGetInfo(const cutensornetHandle_t handle, const cutensornetStateExpectation_t tensorNetworkExpectation, cutensornetExpectationAttributes_t attribute, void* attributeValue, size_t attributeSize) except* nogil cdef cutensornetStatus_t cutensornetMarginalGetInfo(const cutensornetHandle_t handle, const cutensornetStateMarginal_t tensorNetworkMarginal, cutensornetMarginalAttributes_t attribute, void* attributeValue, size_t attributeSize) except* nogil cdef cutensornetStatus_t cutensornetSamplerGetInfo(const cutensornetHandle_t handle, const cutensornetStateSampler_t tensorNetworkSampler, cutensornetSamplerAttributes_t attribute, void* attributeValue, size_t attributeSize) except* nogil +cdef cutensornetStatus_t cutensornetStateApplyUnitaryChannel(const cutensornetHandle_t handle, cutensornetState_t tensorNetworkState, int32_t numStateModes, const int32_t* stateModes, int32_t numTensors, void* tensorData[], const int64_t* tensorModeStrides, const double probabilities[], int64_t* channelId) except* nogil +cdef cutensornetStatus_t cutensornetStateCaptureMPS(const cutensornetHandle_t handle, cutensornetState_t tensorNetworkState) except* nogil diff --git a/python/cuquantum/cutensornet/cycutensornet.pyx b/python/cuquantum/cutensornet/cycutensornet.pyx index b4b5ba6..85308aa 100644 --- a/python/cuquantum/cutensornet/cycutensornet.pyx +++ b/python/cuquantum/cutensornet/cycutensornet.pyx @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: BSD-3-Clause # -# This code was automatically generated across versions from 23.03.0 to 24.08.0. Do not modify it directly. +# This code was automatically generated across versions from 23.03.0 to 24.11.0. Do not modify it directly. 
from ._internal cimport cutensornet as _cutensornet @@ -465,3 +465,11 @@ cdef cutensornetStatus_t cutensornetMarginalGetInfo(const cutensornetHandle_t ha cdef cutensornetStatus_t cutensornetSamplerGetInfo(const cutensornetHandle_t handle, const cutensornetStateSampler_t tensorNetworkSampler, cutensornetSamplerAttributes_t attribute, void* attributeValue, size_t attributeSize) except* nogil: return _cutensornet._cutensornetSamplerGetInfo(handle, tensorNetworkSampler, attribute, attributeValue, attributeSize) + + +cdef cutensornetStatus_t cutensornetStateApplyUnitaryChannel(const cutensornetHandle_t handle, cutensornetState_t tensorNetworkState, int32_t numStateModes, const int32_t* stateModes, int32_t numTensors, void* tensorData[], const int64_t* tensorModeStrides, const double probabilities[], int64_t* channelId) except* nogil: + return _cutensornet._cutensornetStateApplyUnitaryChannel(handle, tensorNetworkState, numStateModes, stateModes, numTensors, tensorData, tensorModeStrides, probabilities, channelId) + + +cdef cutensornetStatus_t cutensornetStateCaptureMPS(const cutensornetHandle_t handle, cutensornetState_t tensorNetworkState) except* nogil: + return _cutensornet._cutensornetStateCaptureMPS(handle, tensorNetworkState) diff --git a/python/cuquantum/cutensornet/experimental/_internal/network_state_utils.py b/python/cuquantum/cutensornet/experimental/_internal/network_state_utils.py index 45704e0..c99a0b3 100644 --- a/python/cuquantum/cutensornet/experimental/_internal/network_state_utils.py +++ b/python/cuquantum/cutensornet/experimental/_internal/network_state_utils.py @@ -105,9 +105,15 @@ def decorator(func): @functools.wraps(func) def wrapper(*args, **kwargs): result = func(*args, **kwargs) + norm = None if result is not None: + if isinstance(result, tuple): + result, norm = result if is_scalar: - return result.tensor.item() + if norm is None: + return result.tensor.item() + else: + return result.tensor.item(), norm obj = args[0] if obj.output_location == 'cpu': stream = kwargs.get('stream') @@ -115,7 +121,10 @@ def wrapper(*args, **kwargs): result = result.to('cpu', stream_holder=stream_holder) else: result = result.tensor - return result + if norm is None: + return result + else: + return result, norm return wrapper return decorator @@ -151,7 +160,7 @@ def asarray(*args, **kwargs): return asarray -def create_pauli_operands(pauli_strings, dtype, backend='cupy', device_id=None, stream=None): +def get_pauli_map(dtype, backend='cupy', device_id=None, stream=None): asarray = _get_asarray_function(backend, device_id, stream) if backend == 'torch': module = importlib.import_module(backend) @@ -165,7 +174,10 @@ def create_pauli_operands(pauli_strings, dtype, backend='cupy', device_id=None, 'X': pauli_x, 'Y': pauli_y, 'Z': pauli_z} + return pauli_map +def create_pauli_operands(pauli_strings, dtype, backend='cupy', device_id=None, stream=None): + pauli_map = get_pauli_map(dtype, backend=backend, device_id=device_id, stream=stream) operands_data = [] n_qubits = None for pauli_string, coefficient in pauli_strings.items(): @@ -184,4 +196,12 @@ def create_pauli_operands(pauli_strings, dtype, backend='cupy', device_id=None, tensors = [pauli_map['I'],] * n_qubits modes = [(q, ) for q in range(n_qubits)] operands_data.append([tensors, modes, coefficient]) - return operands_data \ No newline at end of file + return operands_data + +def get_operand_key(o): + """Return a key that marks the underlying operand""" + return o.shape, o.strides, o.data_ptr + +def get_mps_key(mps_operands): + """Return a 
key that marks the underlying MPS state"""
+    return [get_operand_key(o) for o in mps_operands]
\ No newline at end of file
diff --git a/python/cuquantum/cutensornet/experimental/configuration.py b/python/cuquantum/cutensornet/experimental/configuration.py
index 853a264..2c4efb4 100644
--- a/python/cuquantum/cutensornet/experimental/configuration.py
+++ b/python/cuquantum/cutensornet/experimental/configuration.py
@@ -46,7 +46,6 @@ class ContractDecomposeAlgorithm:
     Attributes:
         qr_method: The QR method used for the decomposition. See :class:`~cuquantum.cutensornet.tensor.QRMethod`.
         svd_method: The SVD method used for the decomposition. See :class:`~cuquantum.cutensornet.tensor.SVDMethod`.
-        svd_info: The SVD information during runtime. See :class:`~cuquantum.cutensornet.tensor.SVDInfo`.
     """
 
     qr_method: Optional[Union[QRMethod, Literal[False, None],Dict]] = dataclasses.field(default_factory=QRMethod)
diff --git a/python/cuquantum/cutensornet/experimental/network_state.py b/python/cuquantum/cutensornet/experimental/network_state.py
index f03da4e..ccf7e8c 100644
--- a/python/cuquantum/cutensornet/experimental/network_state.py
+++ b/python/cuquantum/cutensornet/experimental/network_state.py
@@ -13,7 +13,15 @@
 
 from .configuration import MPSConfig, TNConfig
 from .network_operator import NetworkOperator
-from ._internal.network_state_utils import EXACT_MPS_EXTENT_LIMIT, STATE_DEFAULT_DTYPE, check_dtype_supported, state_operands_wrapper, state_result_wrapper, state_labels_wrapper
+from ._internal.network_state_utils import (
+    EXACT_MPS_EXTENT_LIMIT,
+    STATE_DEFAULT_DTYPE,
+    check_dtype_supported,
+    get_mps_key,
+    state_operands_wrapper,
+    state_result_wrapper,
+    state_labels_wrapper,
+)
 from .. import memory
 from ..tensor_network import Network
 from ..circuit_converter import CircuitToEinsum
@@ -115,6 +123,9 @@ def __init__(self, state_mode_extents, *, dtype=STATE_DEFAULT_DTYPE, config=None
         options = utils.check_or_create_options(NetworkOptions, options, "network options")
         self.options = options
         self.device_id = self.options.device_id
+        if state_labels is not None:
+            if any(isinstance(element, int) for element in state_labels):
+                raise ValueError("ints are currently not supported in state_labels, to avoid potentially conflicting usage")
         self.state_labels = list(state_labels) if state_labels is not None else state_labels
 
         # Get cuTensorNet version (as seen at run-time).
@@ -159,6 +170,9 @@ def __init__(self, state_mode_extents, *, dtype=STATE_DEFAULT_DTYPE, config=None
         else:
             raise ValueError("method must be either a TNConfig/MPSConfig object or a dict that can be used to construct TNConfig/MPSConfig")
 
+        if self.n == 1 and isinstance(self.config, MPSConfig):
+            raise ValueError("For a system with one physical dimension, please switch to the tensor network simulation method via TNConfig")
+
         self.operands = {}
         self.owned_network_operators = {}
         self.non_owned_network_operators = {}
@@ -183,7 +197,7 @@ def __init__(self, state_mode_extents, *, dtype=STATE_DEFAULT_DTYPE, config=None
         self.workspace_h_scratch_ptr, self.workspace_h_scratch_size = None, None
         self.workspace_h_cache_ptr, self.workspace_h_cache_size = None, None
         self.workspace_scratch_allocated_here, self.workspace_cache_allocated_here = False, False
-
+        self.workspace_sizes_requirements = {} # a dictionary to cache workspace requirements
 
         # Attributes to establish stream ordering.
         self.blocking = None # This will be set when operators are applied
         self.workspace_stream = None
@@ -198,12 +212,13 @@ def __init__(self, state_mode_extents, *, dtype=STATE_DEFAULT_DTYPE, config=None
         self.target_state_set = False
         self.state_prepared = False
         self.state_computed = False
-        self.norm = None
         self.initial_state = []
         self.valid_state = True
         self.cached_task_obj = {}
+        self.contains_stochastic_channels = False
 
         self.logger.info("The network state has been created.")
+        self.prev_state_key = None
 
 
     def _check_backend_setup(self, *args, **kwargs):
@@ -341,6 +356,22 @@ def _calculate_workspace_size(self):
             self.handle, self.workspace_desc, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.HOST, cutn.WorkspaceKind.SCRATCH)
         self.workspace_h_cache_size = cutn.workspace_get_memory_size(
             self.handle, self.workspace_desc, cutn.WorksizePref.RECOMMENDED, cutn.Memspace.HOST, cutn.WorkspaceKind.CACHE)
+
+        return {
+            'scratch_size': self.workspace_scratch_size,
+            'cache_size': self.workspace_cache_size,
+            'h_scratch_size': self.workspace_h_scratch_size,
+            'h_cache_size': self.workspace_h_cache_size
+        }
+
+    def _reset_workspace_size_requirement(self, workspace_dict):
+        """
+        When preparation can be skipped, this API must be called to restore the correct workspace size requirements.
+        """
+        for name in ('scratch', 'cache', 'h_scratch', 'h_cache'):
+            if getattr(self, f'workspace_{name}_size') != workspace_dict[f'{name}_size']:
+                setattr(self, f'workspace_{name}_size', workspace_dict[f'{name}_size'])
+                setattr(self, f'workspace_{name}_ptr', None)
 
     @utils.precondition(_check_valid_network)
     @utils.atomic(_free_workspace_memory, method=True)
@@ -420,7 +451,6 @@ def _mark_updated(self, structural=True):
         The only exception is when update_tensor_operator is called for a tensor network contraction simulation or an MPS simulation without value based truncation,
         in which case the cached task objects are still valid and do not need to be freed.
         """
-        self.norm = None
         self.state_computed = False
         if structural:
             self.state_prepared = False
@@ -527,6 +557,47 @@ def apply_tensor_operator(self, modes, operand, *, control_modes=None, control_v
         self._mark_updated()
         return tensor_id
 
+    @state_labels_wrapper(marker_index=1, marker_type='seq')
+    @state_operands_wrapper(operands_arg_index=2, is_single_operand=False)
+    @utils.precondition(_check_valid_network)
+    def apply_unitary_tensor_channel(self, modes, operands, probabilities, *, stream=None):
+        """
+        Apply a unitary tensor channel to the network state.
+
+        Args:
+            modes : A sequence of integers denoting the modes that the tensor operators act on.
+                If ``state_labels`` has been provided during initialization, ``modes`` can also be provided as a sequence of labels.
+            operands : A sequence of ndarray-like objects for the unitary tensor operators defining the unitary channel.
+                The modes of each operand are expected to be ordered as ``ABC...abc...``,
+                where ``ABC...`` denotes output bra modes and ``abc...`` denotes input ket modes corresponding to ``modes``.
+            probabilities : A sequence of positive floats representing the probabilities of each operand.
+            stream : Provide the CUDA stream to use for applying the tensor operator (this is used to copy the operands to the GPU if they are provided on the CPU).
+                Acceptable inputs include ``cudaStream_t`` (as Python :class:`int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`.
+                If a stream is not provided, the current stream will be used.
+
+        Returns:
+            An integer `channel_id` specifying the location of the unitary channel.
+
+        Notes:
+            - For MPS simulation, the size of ``modes`` must not exceed 2 (two-body operator).
+        """
+        if len(operands) != len(probabilities):
+            raise ValueError(f"The number of operands ({len(operands)}) does not match the size of probabilities ({len(probabilities)})")
+        # operand indices (b, a, B, A) required for modes a, b
+        operands = [o.T for o in operands]
+        tensor_data = [o.data_ptr for o in operands]
+        tensor_mode_strides = [o.strides for o in operands]
+        if not all(strides == tensor_mode_strides[0] for strides in tensor_mode_strides):
+            raise ValueError("The strides of all input operands must be identical.")
+        channel_id = cutn.state_apply_unitary_channel(self.handle,
+            self.state, len(modes), modes, len(operands), tensor_data, tensor_mode_strides[0], probabilities)
+        # keep the operands alive, otherwise CuPy may reuse the memory space
+        self.operands[channel_id] = operands, True
+        # invalidate the previously computed state / state vector
+        self._mark_updated()
+        self.contains_stochastic_channels = True
+        return channel_id
+
     @state_operands_wrapper(operands_arg_index=2, is_single_operand=True)
     @utils.precondition(_check_valid_network)
     def update_tensor_operator(self, tensor_id, operand, *, unitary=False, stream=None):
@@ -623,6 +694,19 @@ def apply_mpo(self, modes, mpo_tensors, *, immutable=False, adjoint=False, unita
     ###### APIs for property computation ######
     ###########################################
 
+    def _maybe_setup_recompute(self, *args, **kwargs):
+        """
+        When stochastic channels exist in the state, property computation methods must call this API before _compute_target so that the MPS is re-computed for every compute_xxx call.
+        """
+        if self.contains_stochastic_channels and isinstance(self.config, MPSConfig):
+            self.state_computed = False
+
+    def _get_current_key(self):
+        if isinstance(self.config, TNConfig) or not hasattr(self, 'mps_tensors'):
+            return None
+        else:
+            return get_mps_key(self.mps_tensors)
+
     def _maybe_configure_state(self):
         if not self.state_configured:
             # configure the state
@@ -632,6 +716,7 @@ def _maybe_set_target_state(self, stream):
         self._maybe_configure_state()
         if isinstance(self.config, MPSConfig) and (not self.target_state_set):
+            self.prev_state_key = None
             # specify the largest output MPS tensors' sizes
             max_extent = self.config.max_extent
             self.mps_tensors = []
@@ -668,6 +753,8 @@ def _maybe_compute_state(self, stream, release_workspace):
         if not self.state_computed and isinstance(self.config, MPSConfig):
             create_args = ()
             execute_args = (self.workspace_desc, [o.data_ptr for o in self.mps_tensors])
+            # record the key from the last MPS computation
+            self.prev_state_key = self._get_current_key()
             # compute the final MPS tensors
             output = self._compute_target('state', create_args, execute_args, stream, release_workspace)
             if output is None:
@@ -716,7 +803,11 @@ def _compute_target(self, task, create_args, execute_args, stream, release_works
         else:
             if task_key is None:
                 task_key = (task, create_args)
-            if task_key in self.cached_task_obj:
+            if task_key in self.cached_task_obj and (self.property_prepare_reusable or self._get_current_key() == self.prev_state_key):
+                # In the following cases, re-creation/preparation of property computation objects can be skipped:
+                # 1. Contraction based simulation -> self.property_prepare_reusable
+                # 2. MPS without value based truncation -> self.property_prepare_reusable
+                # 3. MPS with value based truncation that at runtime yields the same extents as the previous compute call -> check the key against the last computation
                task_obj = self.cached_task_obj[task_key]
                 self.logger.info(f"Found the same {task} object from the cache")
                 prepare_needed = False
@@ -743,13 +834,18 @@ def _compute_target(self, task, create_args, execute_args, stream, release_works
                     self.logger.info(f"The preparation of {caller_name} computation took {elapsed.data:.3f} ms to complete.")
                 else:
                     self.logger.info(f"Preparation for {caller_name} has been completed")
-            self._calculate_workspace_size()
+            # cache the workspace size requirements
+            self.workspace_sizes_requirements[caller_name] = self._calculate_workspace_size()
             if task == 'state':
                 self.state_prepared = True
         else:
             self.logger.info(f"Preparation for {caller_name} has been skipped due to cache usage")
-
+            # While the compute object has been prepared in a previous call,
+            # the current workspace sizes need to be reset to align with the actual requirements
+            self._reset_workspace_size_requirement(self.workspace_sizes_requirements[caller_name])
+
+        self._allocate_workspace_memory_perhaps(stream_holder, "scratch")
+        self._allocate_workspace_memory_perhaps(stream_holder, "cache")
 
         if self.logger.isEnabledFor(logging.INFO):
             info_flops_enum = getattr(cutn, f'{task.capitalize()}Attribute').INFO_FLOPS
@@ -768,15 +864,14 @@ def _compute_target(self, task, create_args, execute_args, stream, release_works
 
         # Establish ordering wrt the computation and free scratch and cache workspace based on user request.
         self._release_workspace_memory_perhaps("scratch", release_workspace=release_workspace)
+        self._release_workspace_memory_perhaps("cache", release_workspace=release_workspace)
         self._reset_workspace_allocation_tracking()
 
-        if isinstance(output, tuple):
-            return output
-        else:
-            return True
+        # for task == 'state', output is a tuple, otherwise None
+        return output
 
-    def _run_state_accessor(self, caller_name, *, fixed_modes=None, stream=None, release_workspace=False):
+    def _run_state_accessor(self, caller_name, return_norm, *, fixed_modes=None, stream=None, release_workspace=False):
         if fixed_modes:
             # compute batched amplitudes
             shape = [self.state_mode_extents[q] for q in range(self.n) if q not in fixed_modes]
@@ -795,27 +890,28 @@ def _run_state_accessor(self, caller_name, *, fixed_modes=None, stream=None, rel
         norm = np.empty(1, dtype=self.dtype)
 
         create_args = (num_fixed_modes, fixed_modes, tuple(amplitudes.strides))
-        compute_norm = self.norm is None
-        if compute_norm:
+        if return_norm:
             execute_args = (fixed_values, self.workspace_desc, amplitudes.data_ptr, norm.ctypes.data)
         else:
             execute_args = (fixed_values, self.workspace_desc, amplitudes.data_ptr, 0)
-        if self._compute_target('accessor', create_args, execute_args, stream, release_workspace, caller_name=caller_name):
-            if compute_norm:
-                self.norm = norm.real.item()
-            return amplitudes
+        self._compute_target('accessor', create_args, execute_args, stream, release_workspace, caller_name=caller_name)
+        if return_norm:
+            return amplitudes, norm.real.item()
         else:
-            return None
+            return amplitudes
 
+    @state_result_wrapper(is_scalar=True)
+    @utils.precondition(_maybe_setup_recompute)
     @utils.precondition(_check_valid_network)
     @utils.precondition(_check_backend_setup, "Amplitude computation")
-    def compute_amplitude(self, bitstring, *, stream=None, release_workspace=False):
+    def compute_amplitude(self, bitstring, *, return_norm=False, stream=None, release_workspace=False):
         """
         Compute the probability amplitude of a bitstring.
 
         Args:
             bitstring : A sequence of integers specifying the desired measured state dimension.
+            return_norm : If `True`, the squared norm of the state will also be returned.
             stream : Provide the CUDA stream to use for the computation.
                 Acceptable inputs include ``cudaStream_t`` (as Python :class:`int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`.
                 If a stream is not provided, the current stream will be used.
@@ -828,26 +924,29 @@ def compute_amplitude(self, bitstring, *, stream=None, release_workspace=False):
                 The default is `False`.
 
         Returns:
-            A scalar for the bitstring amplitude.
+            If ``return_norm`` is `False`, a scalar for the bitstring amplitude; otherwise, a 2-tuple consisting of the bitstring amplitude
+            and a scalar for the squared norm of the state, i.e., the inner product of the bra and ket states.
         """
         if len(bitstring) != self.n:
             raise ValueError(f"Length of bitstring is expected to match the dimension of the underlying state ({self.n}), found ({len(bitstring)})")
         fixed_modes = {}
         for i, bit in enumerate(bitstring):
             fixed_modes[i] = int(bit)
-        return self._run_state_accessor('amplitude', fixed_modes=fixed_modes, stream=stream, release_workspace=release_workspace)
+        return self._run_state_accessor('amplitude', return_norm, fixed_modes=fixed_modes, stream=stream, release_workspace=release_workspace)
 
     @state_labels_wrapper(marker_index=1, marker_type='dict')
     @state_result_wrapper(is_scalar=False)
+    @utils.precondition(_maybe_setup_recompute)
     @utils.precondition(_check_valid_network)
     @utils.precondition(_check_backend_setup, "Batched amplitude computation")
-    def compute_batched_amplitudes(self, fixed, *, stream=None, release_workspace=False):
+    def compute_batched_amplitudes(self, fixed, *, return_norm=False, stream=None, release_workspace=False):
         """
         Compute the batched amplitudes for a given slice.
 
         Args:
             fixed : A dictionary mapping a subset of state dimensions to correponding fixed states.
                 If ``state_labels`` has been provided during initialization, ``fixed`` can also be provided as a dictionary mapping a subset of labels to corresponding fixed states.
+            return_norm : If `True`, the squared norm of the state will also be returned.
             stream : Provide the CUDA stream to use for the computation.
                 Acceptable inputs include ``cudaStream_t`` (as Python :class:`int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`.
                 If a stream is not provided, the current stream will be used.
@@ -860,20 +959,23 @@ def compute_batched_amplitudes(self, fixed, *, stream=None, release_workspace=Fa
                 The default is `False`.
 
         Returns:
-            An ndarray-like object as batched amplitudes. The package and storage location of the ndarray will be the same as
-            the operands provided in :meth:`apply_tensor_operator`, :meth:`apply_mpo` and :meth:`set_initial_mps`.
+            If ``return_norm`` is `False`, an ndarray-like object as batched amplitudes. The package and storage location of the ndarray will be the same as
+            the operands provided in :meth:`apply_tensor_operator`, :meth:`apply_mpo` and :meth:`set_initial_mps`; otherwise, a 2-tuple consisting of the batched amplitudes
+            and a scalar for the squared norm of the state, i.e., the inner product of the bra and ket states.
""" - return self._run_state_accessor('batched_amplitudes', fixed_modes=fixed, stream=stream, release_workspace=release_workspace) + return self._run_state_accessor('batched_amplitudes', return_norm, fixed_modes=fixed, stream=stream, release_workspace=release_workspace) @state_result_wrapper(is_scalar=False) + @utils.precondition(_maybe_setup_recompute) @utils.precondition(_check_valid_network) @utils.precondition(_check_backend_setup, "State vector computation") - def compute_state_vector(self, *, stream=None, release_workspace=False): + def compute_state_vector(self, *, return_norm=False, stream=None, release_workspace=False): """ Compute the state vector. Args: + return_norm : If true, the squared norm of the state will also be returned. stream : Provide the CUDA stream to use for the computation. Acceptable inputs include ``cudaStream_t`` (as Python :class:`int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`. If a stream is not provided, the current stream will be used. @@ -886,51 +988,16 @@ def compute_state_vector(self, *, stream=None, release_workspace=False): The default is `False`. Returns: - An ndarray-like object as the state vector. The package and storage location of the ndarray will be the same as - the operands provided in :meth:`apply_tensor_operator`, :meth:`apply_mpo` and :meth:`set_initial_mps`. + If ``return_norm`` is `False`, An ndarray-like object as the state vector. The package and storage location of the ndarray will be the same as + the operands provided in :meth:`apply_tensor_operator`, :meth:`apply_mpo` and :meth:`set_initial_mps`; otherwise, a 2-tuple consisting of the state vector + and a scalar for the squared norm of the state, i.e, inner product of bra and ket state. """ - return self._run_state_accessor('state vector', fixed_modes={}, stream=stream, release_workspace=release_workspace) - - @utils.precondition(_check_valid_network) - @utils.precondition(_check_backend_setup, "Norm computation") - def compute_norm(self, *, stream=None, release_workspace=False): - """ - Compute the norm of the state. - - Args: - stream : Provide the CUDA stream to use for the computation. - Acceptable inputs include ``cudaStream_t`` (as Python :class:`int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`. - If a stream is not provided, the current stream will be used. - release_workspace : A value of `True` specifies that the state object should release workspace memory back to - the package memory pool on function return, while a value of `False` specifies that the state object - should retain the memory. This option may be set to `True` if the application performs other operations that consume - a lot of memory between successive calls to the (same or different) execution API such as :meth:`compute_sampling`, - :meth:`compute_reduced_density_matrix`, :meth:`compute_amplitude`, :meth:`compute_batched_amplitudes`, or :meth:`compute_expectation`, - but incurs a small overhead due to obtaining and releasing workspace memory from and to the package memory pool on every call. - The default is `False`. - - Returns: - A scalar for the norm of the state. 
- - Note: - - The norm of the state is also computed and cached if any of the following API is called: - - - :meth:`compute_state_vector` - - :meth:`compute_amplitude` - - :meth:`compute_batched_amplitudes` - - :meth:`compute_expectation` - """ - if self.norm is None: - fixed_modes = {} - for i in range(self.n): - fixed_modes[i] = 0 - # use the least costive method to compute the norm - self._run_state_accessor('norm', fixed_modes=fixed_modes, stream=stream, release_workspace=release_workspace) - return self.norm + return self._run_state_accessor('state vector', return_norm, fixed_modes={}, stream=stream, release_workspace=release_workspace) @state_labels_wrapper(marker_index=1, marker_type='seq') @state_labels_wrapper(key='fixed', marker_type='dict') @state_result_wrapper(is_scalar=False) + @utils.precondition(_maybe_setup_recompute) @utils.precondition(_check_valid_network) @utils.precondition(_check_backend_setup, "Reduced density matrix computation") def compute_reduced_density_matrix(self, where, *, fixed=EMPTY_DICT, stream=None, release_workspace=False): @@ -971,14 +1038,13 @@ def compute_reduced_density_matrix(self, where, *, fixed=EMPTY_DICT, stream=None rdm = utils.create_empty_tensor(self.intermediate_class, rdm_shape, self.dtype, self.device_id, stream_holder) create_args = (n_marginal_modes, tuple(where), n_projected_modes, projected_modes, tuple(rdm.strides)) execute_args = (projected_mode_values, self.workspace_desc, rdm.data_ptr) - if self._compute_target('marginal', create_args, execute_args, stream, release_workspace): - return rdm - else: - return None + self._compute_target('marginal', create_args, execute_args, stream, release_workspace) + return rdm + @utils.precondition(_maybe_setup_recompute) @utils.precondition(_check_valid_network) @utils.precondition(_check_backend_setup, "Output state") - def compute_output_state(self, stream=None, release_workspace=False): + def compute_output_state(self, *, stream=None, release_workspace=False, release_operators=False): """ Compute the final output state for the underlying network state object. This method currently is only valid for MPS based simulation. @@ -993,6 +1059,10 @@ :meth:`compute_reduced_density_matrix`, :meth:`compute_amplitude`, :meth:`compute_batched_amplitudes`, or :meth:`compute_expectation`, but incurs a small overhead due to obtaining and releasing workspace memory from and to the package memory pool on every call. The default is `False`. + release_operators : A value of `True` will release the references to all underlying tensor operators and :class:`NetworkOperator` objects. + The ``tensor_id`` values previously returned by :meth:`apply_tensor_operator`, :meth:`apply_network_operator` and :meth:`apply_mpo` will become invalid. + If the output state has already been computed (it is an intermediate step in the other ``compute_xxx`` methods), the cached output state will be returned directly. + Passing ``release_operators=True`` can thus be used to reset the underlying :class:`NetworkState` object.
Returns: When MPS simulation is specified using the ``options`` argument during object initialization, a sequence of operands representing the underlying @@ -1008,11 +1078,26 @@ def compute_output_state(self, stream=None, release_workspace=False): result = [o.to('cpu', stream_holder=stream_holder) for o in self.mps_tensors] else: result = [o.tensor for o in self.mps_tensors] + if release_operators: + # event synchronization + if self.last_compute_event is not None: + if stream is None: + stream = utils.get_or_create_stream(self.device_id, stream, self.internal_package).obj + stream.wait_event(self.last_compute_event) + # release reference to underlying operators and NetworkOperator + cutn.state_capture_mps(self.handle, self.state) + self.operands = {} + self.owned_network_operators = {} + self.non_owned_network_operators = {} + self.initial_state = list(self.mps_tensors) + # mark state as no longer computed & stochastic channels as resolved + self.state_computed = self.state_prepared = self.contains_stochastic_channels = False return result else: raise NotImplementedError() @state_labels_wrapper(key='modes', marker_type='seq') + @utils.precondition(_maybe_setup_recompute) @utils.precondition(_check_valid_network) @utils.precondition(_check_backend_setup, "Sampling") def compute_sampling(self, nshots, *, modes=None, seed=None, stream=None, release_workspace=False): @@ -1056,19 +1141,17 @@ def compute_sampling(self, nshots, *, modes=None, seed=None, stream=None, releas attr = cutn.SamplerAttribute.CONFIG_DETERMINISTIC val = np.asarray(seed, dtype=cutn.get_sampler_attribute_dtype(attr)) config_args = (attr, val.ctypes.data, val.dtype.itemsize) - if self._compute_target('sampler', create_args, execute_args, stream, release_workspace, config_args=config_args): - sampling = {} - for bitstring, n_sampling in zip(*np.unique(samples, axis=0, return_counts=True)): - bitstring = np.array2string(bitstring, separator='')[1:-1] - sampling[bitstring] = n_sampling - return sampling - else: - return None + self._compute_target('sampler', create_args, execute_args, stream, release_workspace, config_args=config_args) + sampling = {} + for bitstring, n_sampling in zip(*np.unique(samples, axis=0, return_counts=True)): + bitstring = np.array2string(bitstring, separator='')[1:-1] + sampling[bitstring] = n_sampling + return sampling - + @utils.precondition(_maybe_setup_recompute) @utils.precondition(_check_valid_network) @utils.precondition(_check_backend_setup, "Expectation computation") - def compute_expectation(self, operators, *, stream=None, release_workspace=False): + def compute_expectation(self, operators, *, return_norm=False, stream=None, release_workspace=False): """ Compute the expectation value (not normalized) for the given tensor network operator. @@ -1079,6 +1162,7 @@ def compute_expectation(self, operators, *, stream=None, release_workspace=False - A single pauli string specifying the pauli operator for each qubit. - A dictionary mapping each single pauli string to corresponding coefficient. + return_norm : If `True`, the squared norm of the state will also be returned. stream : Provide the CUDA stream to use for the computation. Acceptable inputs include ``cudaStream_t`` (as Python :class:`int`), :class:`cupy.cuda.Stream`, and :class:`torch.cuda.Stream`. If a stream is not provided, the current stream will be used. @@ -1091,7 +1175,8 @@ def compute_expectation(self, operators, *, stream=None, release_workspace=False The default is `False`. Returns: - A scalar for the total expectation value.
+ If ``return_norm`` is `False`, a scalar for the total expectation value; otherwise, a 2-tuple consisting of the total expectation value + and a scalar for the squared norm of the state, i.e., the inner product of the bra and ket states. Note: - If user wishes to perform expectation value computation on the same operator multiple times, it is recommended to explicitly provide a :class:`NetworkOperator` object @@ -1114,17 +1199,15 @@ def compute_expectation(self, operators, *, stream=None, release_workspace=False expectation_value = np.empty(1, dtype=self.dtype) norm = np.empty(1, dtype=self.dtype) create_args = (operators.network_operator, ) - compute_norm = self.norm is None # only compute and cache norm when it's has not been computed - if compute_norm: + if return_norm: execute_args = (self.workspace_desc, expectation_value.ctypes.data, norm.ctypes.data) else: execute_args = (self.workspace_desc, expectation_value.ctypes.data, 0) task_key = ('expectation', operators._get_key()) - if self._compute_target('expectation', create_args, execute_args, stream, release_workspace, task_key=task_key): - output = expectation_value.item() - if compute_norm: - self.norm = norm.real.item() + self._compute_target('expectation', create_args, execute_args, stream, release_workspace, task_key=task_key) + output = expectation_value.item() + if return_norm: + return output, norm.real.item() else: - output = None - return output \ No newline at end of file + return output \ No newline at end of file diff --git a/python/cuquantum/cutensornet/experimental/tensor_network.py b/python/cuquantum/cutensornet/experimental/tensor_network.py index 00fae61..e7c6b83 100644 --- a/python/cuquantum/cutensornet/experimental/tensor_network.py +++ b/python/cuquantum/cutensornet/experimental/tensor_network.py @@ -10,11 +10,12 @@ import dataclasses import logging +import cupy as cp from .configuration import ContractDecomposeAlgorithm, ContractDecomposeInfo from ._internal.utils import is_gate_split, maybe_truncate_qr_output_operands from ..
import cutensornet as cutn -from ..configuration import NetworkOptions +from ..configuration import NetworkOptions, MemoryLimitExceeded from ..tensor import decompose, SVDInfo from ..tensor_network import contract from .._internal import decomposition_utils @@ -78,13 +79,14 @@ def _gate_split(wrapped_operands, inputs, outputs, size_dict, max_mid_extent, al cutn.workspace_compute_gate_split_sizes(handle, *input_tensor_descriptors, *output_tensor_descriptors, - gate_algorithm, svd_config, options.compute_type, workspace_desc) - + gate_algorithm, svd_config, options.compute_type, workspace_desc) + # Allocate and set workspace for mem_space in (cutn.Memspace.DEVICE, cutn.Memspace.HOST): - workspaces[mem_space] = decomposition_utils.allocate_and_set_workspace(handle, options.allocator, workspace_desc, - cutn.WorksizePref.MIN, mem_space, cutn.WorkspaceKind.SCRATCH, options.device_id, - stream_holder, options.logger, task_name='contract decomposition') + pref = cutn.WorksizePref.MIN + workspace_kind = cutn.WorkspaceKind.SCRATCH + workspaces[mem_space] = decomposition_utils.allocate_and_set_workspace(options, workspace_desc, + pref, mem_space, workspace_kind, stream_holder, task_name='contract decomposition') options.logger.info("Starting contract-decompose (gate split)...") timing = bool(options.logger and options.logger.handlers) @@ -105,7 +107,7 @@ def _gate_split(wrapped_operands, inputs, outputs, size_dict, max_mid_extent, al output_tensor_descriptors[1], output_operands[1].data_ptr, gate_algorithm, svd_config, - options.compute_type, + options.compute_type, svd_info, workspace_desc, stream_ptr) @@ -197,6 +199,9 @@ def contract_decompose(subscripts, *operands, algorithm=None, options=None, opti Note, depending on the choice of :attr:`~ContractDecomposeAlgorithm.svd_method.partition`, the returned S operand may be `None`. Also see :attr:`~SVDMethod.partition`. + Raises: + :class:`MemoryLimitExceeded`: the memory needed to perform the operation is larger than the ``options.memory_limit`` + The contract and decompose expression adopts a combination of Einstein summation notation for contraction and the decomposition notation introduced in :func:`~cuquantum.cutensornet.tensor.decompose`. The ``subscripts`` string is a list of subscript labels where each label refers to a mode of the corresponding operand. @@ -226,7 +231,7 @@ def contract_decompose(subscripts, *operands, algorithm=None, options=None, opti >>> # equivalent: >>> # t = contract('ijc,cad,dbe->ijabe', a, b, c) >>> # u, s, v = tensor.decompose('ijabe->ixeb,jax', t, method=SVDMethod()) - >>> u, s, v = contract_decompose('ijc,cad,dbe->ixeb,jax', a, b, c, algorithm={'qr_method': False, 'svd_method': True}) + >>> u, s, v = contract_decompose('ijc,cad,dbe->ixeb,jax', a, b, c, algorithm={'qr_method': False, 'svd_method': {}}) If the contract and decompose problem amounts to a **ternary-operand gate split problem** commonly seen in quantum circuit simulation (see :ref:`Gate Split Algorithm` for details), @@ -352,10 +357,12 @@ def contract_decompose(subscripts, *operands, algorithm=None, options=None, opti # For contract SVD decomposition, we inject max_extent as part of the internal SVDMethod. 
logger.info("Beginning decomposition of the intermediate tensor...") + decompose_options = dataclasses.asdict(options) + decompose_options['compute_type'] = None if algorithm.qr_method and algorithm.svd_method is False: # contract and QR decompose results = decompose( - decompose_subscripts, intm_output, method=algorithm.qr_method, options=dataclasses.asdict(options), + decompose_subscripts, intm_output, method=algorithm.qr_method, options=decompose_options, stream=stream, return_info=False) results = maybe_truncate_qr_output_operands(results, outputs, max_mid_extent) if operands_location == 'cpu': @@ -367,7 +374,7 @@ def contract_decompose(subscripts, *operands, algorithm=None, options=None, opti if use_max_mid_extent: algorithm.svd_method.max_extent = max_mid_extent results = decompose( - decompose_subscripts, intm_output, method=algorithm.svd_method, options=dataclasses.asdict(options), + decompose_subscripts, intm_output, method=algorithm.svd_method, options=decompose_options, stream=stream, return_info=return_info) if use_max_mid_extent: # revert back diff --git a/python/cuquantum/cutensornet/tensor.py b/python/cuquantum/cutensornet/tensor.py index b6f5fc0..d3abcb9 100644 --- a/python/cuquantum/cutensornet/tensor.py +++ b/python/cuquantum/cutensornet/tensor.py @@ -8,20 +8,26 @@ __all__ = ['decompose', 'DecompositionOptions', 'QRMethod', 'SVDInfo', 'SVDMethod'] -import dataclasses +from dataclasses import dataclass import logging import re from typing import Optional import numpy +import cupy as cp from . import cutensornet as cutn -from .configuration import NetworkOptions +from .configuration import NetworkOptions, MemoryLimitExceeded from ._internal import decomposition_utils from ._internal import utils -DecompositionOptions = dataclasses.make_dataclass("DecompositionOptions", fields=[(field.name, field.type, field) for field in dataclasses.fields(NetworkOptions)], bases=(NetworkOptions,)) +@dataclass +class DecompositionOptions(NetworkOptions): + def __post_init__(self): + super().__post_init__() + if self.compute_type is not None: + raise ValueError("The compute_type for decomposition should be None.") DecompositionOptions.__doc__ = re.sub(":class:`cuquantum.Network` object", ":func:`cuquantum.cutensornet.tensor.decompose` and :func:`cuquantum.cutensornet.experimental.contract_decompose` functions", NetworkOptions.__doc__) @@ -74,6 +80,9 @@ def decompose( Note, depending on the choice of :attr:`cuquantum.cutensornet.tensor.SVDMethod.partition`, the returned S operand may be `None`. Also see :attr:`~SVDMethod.partition`. + Raises: + :class:`MemoryLimitExceeded`: the memory needed to perform the operation is larger than the ``options.memory_limit`` + The decomposition expression adopts a similar notation as einsum expression. The ``subscripts`` string is a list of subscript labels where each label refers to a mode of the corresponding operand. The subscript labels are separated by either comma or identifier ``->``. 
@@ -271,9 +280,10 @@ def decompose( # Allocate and set workspace for mem_space in (cutn.Memspace.DEVICE, cutn.Memspace.HOST): - workspaces[mem_space] = decomposition_utils.allocate_and_set_workspace(handle, options.allocator, workspace_desc, - cutn.WorksizePref.MIN, mem_space, cutn.WorkspaceKind.SCRATCH, options.device_id, - stream_holder, options.logger, task_name='tensor decomposition') + pref = cutn.WorksizePref.MIN + workspace_kind = cutn.WorkspaceKind.SCRATCH + workspaces[mem_space] = decomposition_utils.allocate_and_set_workspace(options, workspace_desc, + pref, mem_space, workspace_kind, stream_holder, task_name='tensor decomposition') svd_info_obj = None @@ -349,13 +359,13 @@ def decompose( raise NotImplementedError -@dataclasses.dataclass +@dataclass class QRMethod: """A data class for providing QR options to the :func:`cuquantum.cutensornet.tensor.decompose` function.""" pass -@dataclasses.dataclass +@dataclass class SVDInfo: """A data class for holding information regarding SVD truncation at runtime. @@ -400,7 +410,7 @@ def __str__(self): return s -@dataclasses.dataclass +@dataclass class SVDMethod: """A data class for providing SVD options to the :func:`cuquantum.cutensornet.tensor.decompose` function. diff --git a/python/cuquantum/cutensornet/tensor_network.py b/python/cuquantum/cutensornet/tensor_network.py index 7a5ef42..0d8271b 100644 --- a/python/cuquantum/cutensornet/tensor_network.py +++ b/python/cuquantum/cutensornet/tensor_network.py @@ -26,6 +26,7 @@ from ._internal import tensor_wrapper from ._internal import typemaps from ._internal import utils +from .configuration import MemoryLimitExceeded class InvalidNetworkState(Exception): @@ -576,12 +577,9 @@ def _calculate_workspace_size(self): max_cache_size = cutn.workspace_get_memory_size( self.handle, self.workspace_desc, cutn.WorksizePref.MAX, cutn.Memspace.DEVICE, cutn.WorkspaceKind.CACHE) - if (self.memory_limit < min_scratch_size + self.require_grad * min_cache_size): - message = f"""Insufficient memory. -The memory limit specified is {self.memory_limit}, while the minimum workspace size needed is {min_scratch_size + self.require_grad * min_cache_size}. 
-""" - # such failure is due to problem configuration, not due to implementation or runtime factors - raise MemoryError(message) + min_workspace_size = min_scratch_size + self.require_grad * min_cache_size + if self.memory_limit < min_workspace_size: + raise MemoryLimitExceeded(self.memory_limit, min_workspace_size, self.device_id) if min_cache_size > 0: if self.require_grad: diff --git a/python/cuquantum/densitymat/__init__.py b/python/cuquantum/densitymat/__init__.py new file mode 100644 index 0000000..f04fc32 --- /dev/null +++ b/python/cuquantum/densitymat/__init__.py @@ -0,0 +1,8 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +from .work_stream import * +from .elementary_operator import * +from .operators import * +from .state import * diff --git a/python/cuquantum/densitymat/_internal/__init__.py b/python/cuquantum/densitymat/_internal/__init__.py new file mode 100644 index 0000000..808298f --- /dev/null +++ b/python/cuquantum/densitymat/_internal/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause diff --git a/python/cuquantum/densitymat/_internal/callbacks.py b/python/cuquantum/densitymat/_internal/callbacks.py new file mode 100644 index 0000000..86d9c11 --- /dev/null +++ b/python/cuquantum/densitymat/_internal/callbacks.py @@ -0,0 +1,148 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +# TODO[FUTURE]: Implement gradients for the callback. +from dataclasses import dataclass +from numbers import Number +from typing import Optional, Callable, Union +from cuquantum.cutensornet._internal.tensor_ifc import Tensor +import numpy as np +import cupy as cp + + +def _wrap_callback(func): + """ + Returns callback that writes into scalar ndarray `storage` (t,args,storage) -> None, given `func` with signature (t,args)->Union[Number,ndarray]. + + Parameters: + ----------- + func: Callable + Function with signature (t: float, args: Tuple[float]) returning a scalar. + """ + + def inplace_func(t: np.float64, args: tuple, buf: np.ndarray): + buf[:] = func(t, args) + + return inplace_func + + +@dataclass +class CallbackCoefficient: + """ + Wrapper class for treating static and dynamics coefficients on the same footing. + + Attributes + callback: Optional[Callable] + Callable with signature (t,args) -> Number, returning the dynamic coefficient. + scalar: Number + The static coefficient. If both callback and scalar are specified, the effective coefficient is the product of static and dynamic coefficient. + """ + + callback: Optional[Callable] = None + scalar: Optional[Number] = 1.0 + 0.0j + + def __post_init__(self): + if not isinstance(self.scalar, Number): + raise TypeError( + f"CallbackCoefficient received a scalar argument of type {type(self.scalar)}. CallbackCoefficient only accepts scalar arguments that are instances of Number." + ) + self._wrapped_callback = 0 + if self.callback is not None: + if not isinstance(self.callback, Callable): + raise TypeError( + f"CallbackCoefficient received a callback argument of type {type(self.callback)}. CallbackCoefficient only accepts callback arguments that are instances of Callable." 
+ ) + self._wrapped_callback = _wrap_callback(self.callback) + + @property + def is_callable(self) -> bool: + return True if self.callback else False + + def __neg__(self) -> "CallbackCoefficient": + return self.__mul__(self, -1) + + def __mul__(self, factor: Union[Number, Callable, "CallbackCoefficient"]): + """ + Multiplication + """ + # ToDo: Add test that cover all branches + if isinstance(factor, CallbackCoefficient): + if self.is_callable and factor.is_callable: + callback = lambda t, args: self.callback(t, args) * factor.callback(t, args) + elif self.is_callable: + callback = self.callback + elif factor.is_callable: + callback = factor.callback + else: + callback = None + return CallbackCoefficient(callback, self.scalar * factor.scalar) + # Probably Not required unless this is exposed to user + elif isinstance(factor, Number): + return CallbackCoefficient(self.callback, self.scalar * factor) + elif isinstance(factor, Callable): + return CallbackCoefficient( + lambda t, args: (self.callback(t, args) * factor(t, args)), self.scalar + ) + + def __rmul__( + self, factor: Union[Number, Callable, "CallbackCoefficient"] + ) -> "CallbackCoefficient": + # right/left logic is handled in __mul__ and is symmetric + return self.__mul__(factor) + + def __lmul__( + self, factor: Union[Number, Callable, "CallbackCoefficient"] + ) -> "CallbackCoefficient": + # right/left logic is handled in __mul__ and is symmetric + return self.__mul__(factor) + + def __add__(self, summand: Union["CallbackCoefficient", Number, Callable]): + """ + Addition + """ + if isinstance(summand, CallbackCoefficient): + if self.callback == summand.callback: + return CallbackCoefficient(self.callback, self.scalar + summand.scalar) + elif np.isclose(self.scalar, summand.scalar): + if self.is_callable and summand.is_callable: + callback = lambda t, args: self.callback(t, args) + summand.callback(t, args) + elif self.is_callable: + callback = lambda t, args: self.callback(t, args) + 1 + elif summand.is_callable: + callback = lambda t, args: summand.callback(t, args) + 1 + return CallbackCoefficient(callback, self.scalar) + else: + if self.is_callable and summand.is_callable: + callback = ( + lambda t, args: self.callback(t, args) * self.scalar + + summand.callback(t, args) * summand.scalar + ) + elif self.is_callable: + callback = lambda t, args: self.callback(t, args) * self.scalar + summand.scalar + elif summand.is_callable: + callback = ( + lambda t, args: self.scalar + summand.callback(t, args) * summand.scalar + ) + return CallbackCoefficient(callback) + elif isinstance(summand, Number): + return self + CallbackCoefficient(None, summand) + elif isinstance(summand, Callable): + return self + CallbackCoefficient(summand) + else: + raise TypeError( + f"{type(summand)} cannot be added to CallbackCoefficient. CallbackCoefficient only supports addition of CallbackCoefficient, Number or Callable." 
+ ) + + def __sub__(self, subtrahend: Union["CallbackCoefficient", Number, Callable]): + """ + Substraction + """ + return self + (-1) * subtrahend + + def conjugate(self): + conj_callback = ( + (lambda t, args: self.callback(t, args).conjugate()) if self.is_callable else None + ) + conj_scalar = self.scalar.conjugate() + return CallbackCoefficient(conj_callback, conj_scalar) diff --git a/python/cuquantum/densitymat/_internal/library_handle.py b/python/cuquantum/densitymat/_internal/library_handle.py new file mode 100644 index 0000000..888fa30 --- /dev/null +++ b/python/cuquantum/densitymat/_internal/library_handle.py @@ -0,0 +1,143 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause +from logging import getLogger, Logger +import weakref +import collections + +from cuquantum.cutensornet._internal import utils as cutn_utils + +from cuquantum.bindings import cudensitymat as cudm +from . import utils + +# TODO[OPTIONAL] move elsewhere and refactor + +_comm_provider_map = {} +_comm_provider_map["None"] = cudm.DistributedProvider.NONE +_comm_provider_map["MPI"] = cudm.DistributedProvider.MPI +_comm_provider_map["NCCL"] = cudm.DistributedProvider.NCCL +_comm_provider_map["NVSHMEM"] = cudm.DistributedProvider.NVSHMEM + + +class LibraryHandle: + """ + A wrapper around the library handle for cudensitymat. + """ + + def __init__(self, device_id: int, logger: Logger): + """ + Create a library handle on the specified device. + + Parameters: + ----------- + device_id: int + If not provided, device 0 will be used by default. + logger: Logger + If a logger is passed, creation and destruction of library handle are logged there. + """ + self._finalizer = weakref.finalize(self, lambda: None) + self._finalizer.detach() + + self._device_id = device_id + with cutn_utils.device_ctx(self._device_id): + self._ptr = cudm.create() + self.logger = logger + self._comm = None + self._comm_set = False + self.logger.info(f"cuDensityMat library handle created on device {self.device_id}.") + self.logger.debug( + f"{self} instance holds cuDensityMat library handle with pointer {self._ptr} on device {self.device_id}." + ) + self._upstream_finalizers = collections.OrderedDict() + self._finalizer = weakref.finalize( + self, + utils.generic_finalizer, + self.logger, + self._upstream_finalizers, + (cudm.destroy, self._ptr), + msg=f"Destroying Handle instance {self}", + ) # may also use trivial finalizer here + self.logger.debug(f"{self} instance's finalizer registered.") + + def _check_valid_state(self, *args, **kwargs): + if not self._valid_state: + raise utils.InvalidObjectState("The handle cannot be used after resources are freed.") + + @property + def _valid_state(self): + return self._finalizer.alive + + @property + @cutn_utils.precondition(_check_valid_state) + def _validated_ptr(self): + return self._ptr + + @property + def device_id(self): + return self._device_id + + def set_communicator(self, comm, provider: str = "None") -> None: + """ + Sets the communicator attached to the current context's library handle. + + Parameters: + ----------- + comm: + The communicator instance with which to set the library context's communicator. + provider: str + The package/backend providing the communicator. + """ + if self._comm_set: + raise RuntimeError( + "Communicator has already been set on library handle.\ + Resetting the communicator is not supported." 
+ ) + assert provider in ["None", "MPI"] + self._comm = comm + _comm_ptr, _size = cutn_utils.get_mpi_comm_pointer(comm) + cudm.reset_distributed_configuration( + self._validated_ptr, _comm_provider_map[provider], _comm_ptr, _size + ) + self._comm_set = True + + def get_communicator(self): + """ + Returns the communicator associated with the given library context. + """ + return self._comm + + @cutn_utils.precondition(_check_valid_state) + def get_num_ranks(self) -> int: + """ + Returns the total number of distributed processes associated with the given library context. + """ + return cudm.get_num_ranks(self._validated_ptr) + + @cutn_utils.precondition(_check_valid_state) + def get_proc_rank(self) -> int: + """ + Returns the rank of the current process in the distributed configuration associated with the given library context. + """ + return cudm.get_proc_rank(self._validated_ptr) + + @cutn_utils.precondition(_check_valid_state) + def set_random_seed(self, seed: int) -> None: + """ + Sets the random seed used by the random number generator inside the library context. + """ + cudm.reset_random_seed(self._validated_ptr, seed) + + @cutn_utils.precondition(_check_valid_state) + def _register_user(self, user): + assert self != user + self._upstream_finalizers[user._finalizer] = weakref.ref( + user + ) # We may not want to store weakref as value here, but let's see + self.logger.debug(f"{self} registered user {user} for finalizer execution.") + + def _unregister_user(self, user): + assert self != user + if self._upstream_finalizers is not None: + del self._upstream_finalizers[user._finalizer] + self.logger.debug(f"{self} unregistered user {user} for finalizer execution.") + diff --git a/python/cuquantum/densitymat/_internal/utils.py b/python/cuquantum/densitymat/_internal/utils.py new file mode 100644 index 0000000..cb12c9d --- /dev/null +++ b/python/cuquantum/densitymat/_internal/utils.py @@ -0,0 +1,204 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import collections +import weakref +from typing import Optional, Union + +import numpy as np +import cupy as cp + +from cuquantum.cutensornet._internal import tensor_wrapper +from cuquantum.cutensornet._internal.tensor_ifc import Tensor +from cuquantum.cutensornet._internal.tensor_ifc_numpy import NumpyTensor +from cuquantum.cutensornet._internal.utils import device_ctx, StreamHolder +from cuquantum.cutensornet._internal import utils as cutn_utils + + +NDArrayType = Union[np.ndarray, cp.ndarray] + + +class InvalidObjectState(Exception): + pass + + +def cuda_call_ctx(ctx, blocking: Optional[bool] = None): + blocking = ctx.blocking if blocking is None else blocking + return cutn_utils.cuda_call_ctx(ctx._stream_holder, blocking, ctx._do_timing) + + +def generic_finalizer( + logger, + upstream_finalizers: collections.OrderedDict, + *destructor_ptr_pairs, + msg="In generic finalizer call.", +): + logger.debug(msg + " - Outer Loop") + for upstream_finalizer in reversed(upstream_finalizers): + upstream_finalizer() + logger.debug(msg + " - Inner Loop") + for destructor, ptr in destructor_ptr_pairs: + if ptr is not None: + destructor(ptr) + logger.debug(f"Released resource: {ptr}.") + else: + logger.debug("Resource already released.") + logger.debug(msg + " - End of Inner Loop") + logger.debug(msg + " - End of Outer Loop") + + +def register_with(user, downstream_dependency, logger): + try: + assert downstream_dependency._valid_state + except AssertionError as e: + raise RuntimeError( + f"Failing to 
register {user} as dependent on {downstream_dependency} because the latter's finalizer has already been called." + ) from e + if downstream_dependency == user: + raise RuntimeError(f"Cannot register {user} as dependent on itself.") + downstream_dependency._upstream_finalizers[user._finalizer] = weakref.ref( + user + ) # We may not want to store weakref as value here, but let's see + logger.debug(f"{downstream_dependency} registered user {user} for finalizer execution.") + + +def unregister_with(user, downstream_dependency, logger): + if downstream_dependency is not None: + if downstream_dependency == user: + raise RuntimeError(f"Cannot unregister {user} from itself.") + if downstream_dependency._upstream_finalizers is not None: + del downstream_dependency._upstream_finalizers[user._finalizer] + logger.debug( + f"{downstream_dependency} unregistered user {user} for finalizer execution." + ) + + +def wrap_callback(func): + """ + Returns a callback with signature (t, args, buf) -> None that writes its result into the scalar ndarray ``buf``, given `func` with signature (t, args) -> Union[Number, ndarray]. Returns ``None`` if ``func`` is ``None``. + + Parameters: + ----------- + func: Callable + Function with signature (t: float, args: Tuple[float]) returning a scalar. + """ + + if func is not None: + + def inplace_func(t: np.float64, args: tuple, buf: np.ndarray): + buf[:] = func(t, args) + + return inplace_func + else: + return None + + +def single_tensor_copy(maybe_wrapped_operand, ctx): + """ + Return a blocking on-device copy of the (possibly wrapped) operand. + """ + wrapped_operand = ( + single_tensor_wrap(maybe_wrapped_operand) + if not isinstance(maybe_wrapped_operand, Tensor) + else maybe_wrapped_operand + ) + if isinstance(wrapped_operand, NumpyTensor): + return wrapped_operand.tensor.copy() + else: + if ctx is None: + stream_holder = StreamHolder(obj=cp.cuda.Stream()) + with cutn_utils.device_ctx(wrapped_operand.device), cutn_utils.cuda_call_ctx(stream_holder, timing=False): + tensor_copy = wrapped_operand.tensor.copy() + else: + assert wrapped_operand.device == ctx.device_id + with cutn_utils.device_ctx(wrapped_operand.device), cuda_call_ctx(ctx, blocking=True): + tensor_copy = wrapped_operand.tensor.copy() + return tensor_copy + + +def single_tensor_to(operand: Tensor, device, stream_holder: StreamHolder): + """ + Moves a single tensor to device and wraps the copied tensor in a tensor wrapper. + The equivalent of cutensornet._internal.tensor.to for a single tensor, extended to treat multidiagonal tensors. + + Parameters: + ----------- + operand: Tensor + Wrapped input tensor (subclass of Tensor) + device: int + Destination GPU. + stream_holder: StreamHolder + Stream onto which the data transfer is submitted, wrapped in StreamHolder class. + """ + device_operand = operand.to(device, stream_holder) + return single_tensor_wrap(device_operand) + + +def single_tensor_wrap(operand) -> Tensor: + """ + Wraps a single tensor in the corresponding Tensor wrapper. + The equivalent of cutensornet._internal.tensor.wrap_operands for a single tensor, extended to treat multidiagonal tensors. + + Parameters: + ----------- + operand: + Input tensor. Either a subclass of NDArray for dense tensors or a cudensitymat.MultiDiagonalTensor. + + Returns: + -------- + Tensor + Input tensor wrapped in Tensor subclass.
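+ + Example (an illustrative sketch; a NumPy input is wrapped as a ``NumpyTensor``, which exposes the original array through its ``tensor`` attribute): + + >>> wrapped = single_tensor_wrap(np.zeros((2, 2))) + >>> wrapped.tensor.shape + (2, 2)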
+ """ + # TODO: Should use wrap_operand instead of wrap_operands + return tensor_wrapper.wrap_operands((operand,))[0] + + +def transpose_bipartite_tensor(tensor): + # if isinstance(t, MultiDiagonalTensor): + # return t.T + dims = tensor.shape + return matricize_bipartite_tensor(tensor).transpose().reshape(dims) + + +def matricize_bipartite_tensor(tensor): + dims = tensor.shape + ndims = len(dims) + assert ndims % 2 == 0 + assert dims[: ndims // 2] == dims[ndims // 2 :] + matricized_dim = np.prod(dims[: ndims // 2]) + return tensor.reshape(matricized_dim, matricized_dim) + + +def multidiagonal_to_dense(sparse_data, offsets, package): + shape = (sparse_data.shape[0], sparse_data.shape[0]) + dense_matrix = package.zeros(shape, sparse_data.dtype, order="F") + row, col = package.indices(shape) + for i, offset in enumerate(offsets): + dense_matrix[row == col - offset] = ( + sparse_data[: -abs(offset), i] if offset != 0 else sparse_data[:, i] + ) + return dense_matrix + + +# TODO: Possibly remove this function in a future release +def optimize_strides(tensor: NDArrayType) -> NDArrayType: + """ + Return `tensor` as a contiguous array in F-order. + + Args: + Input tensor. + + Returns: + Input tensor in F-order. If input was not F-ordered, a copy on the same device/host is returned. + """ + if tensor.flags["F_CONTIGUOUS"]: + return tensor + else: + wrapped_tensor = tensor_wrapper.wrap_operand(tensor) + device_id = wrapped_tensor.device_id + if device_id is None: + return tensor.copy(order="F") + else: + with device_ctx(device_id): + return tensor.copy(order="F") diff --git a/python/cuquantum/densitymat/elementary_operator.py b/python/cuquantum/densitymat/elementary_operator.py new file mode 100644 index 0000000..b5ce1f7 --- /dev/null +++ b/python/cuquantum/densitymat/elementary_operator.py @@ -0,0 +1,762 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +"""Elementary operator.""" + +from typing import Callable, Sequence, Union +from abc import ABC, abstractmethod +import weakref +import collections +from numbers import Number +import importlib +from operator import add, sub + +import numpy as np +import cupy as cp +import cupyx + +try: + import scipy as sp +except ImportError: + sp = None + +from cuquantum.bindings import cudensitymat as cudm +from ._internal.utils import ( + generic_finalizer, + register_with, + wrap_callback, + single_tensor_to, + single_tensor_copy, + device_ctx, + transpose_bipartite_tensor, + matricize_bipartite_tensor, + NDArrayType, + InvalidObjectState, +) +from .work_stream import WorkStream +from cuquantum.cutensornet._internal import tensor_wrapper, typemaps +from cuquantum.cutensornet._internal.utils import precondition, StreamHolder, cuda_call_ctx + + +__all__ = ["DenseOperator", "MultidiagonalOperator"] + + +DiaMatrixType = Union["sp.sparse.dia_matrix", "cupyx.scipy.sparse.dia_matrix"] +CallbackType = Callable[[float, Sequence], np.ndarray] +ElementaryOperatorType = Union["DenseOperator", "MultidiagonalOperator"] + + +class ElementaryOperator(ABC): + """ + Elementary operator abstract base class. + """ + + def __init__(self, data: NDArrayType, callback: CallbackType | None = None) -> None: + """ + Initialize an elementary operator from data buffer and callback. 
+ """ + # Input attributes + self.callback = callback + self.dtype: str = data.dtype.name + + # Internal attributes + self._data = tensor_wrapper.wrap_operand(data) + if self._data.device_id is None: + self._data.tensor = self._data.tensor.copy(order="F") + else: + with device_ctx(self._data.device_id): + self._data.tensor = self._data.tensor.copy(order="F") + + self._callback = wrap_callback(callback) + self._dtype = typemaps.NAME_TO_DATA_TYPE[self.dtype] + + self._ctx: WorkStream = None + self._ptr = None + + self._last_compute_event = None + self._upstream_finalizers = collections.OrderedDict() + self._finalizer = weakref.finalize(self, lambda: None) + self._finalizer.detach() + + @property + @abstractmethod + def _sparsity_and_diagonal_offsets(self): + pass + + @property + def data(self) -> NDArrayType: + """ + Data buffer of the elementary operator. + """ + return self._data.tensor + + @abstractmethod + def to_array(self, t: float | None, args: Sequence | None): + pass + + @abstractmethod + def copy(self): + pass + + @abstractmethod + def __mul__(self, scalar: Number): + pass + + @abstractmethod + def __rmul__(self, scalar: Number): + pass + + @abstractmethod + def __add__(self, other): + pass + + @abstractmethod + def __matmul__(self, other): + pass + + @abstractmethod + def dag(self): + pass + + def _check_scalar_operation_compability(self, scalar, what: str = ""): + if not isinstance(scalar, Number): + raise TypeError(f"Cannot multiply {type(self).__name__} with {type(scalar).__name__}.") + + def _check_binary_operation_compability(self, other: "ElementaryOperator", what: str = ""): + if not isinstance(other, ElementaryOperator): + raise TypeError( + f"Cannot perform {what} between {type(self).__name__} and {type(other).__name__}." + ) + + if self.shape != other.shape: + raise ValueError( + f"Cannot perform {what} between {type(self).__name__}s with mismatching shapes: {self.shape} and {other.shape}." + ) + + if self._data.module != other._data.module: + raise ValueError( + f"Cannot perform {what} between {type(self).__name__}s based on arrays from " + f"different packages: {self._data.module} and {other._data.module}." + ) + + if self._data.device_id != other._data.device_id: + raise ValueError( + f"Cannot perform {what} between {type(self).__name__}s based on arrays from " + f"different devices: device {self._data.device_id} and device {other._data.device_id}" + ) + + def _check_valid_state(self, *args, **kwargs): + if not self._valid_state: + raise InvalidObjectState("The tensor operator cannot be used after resources are freed") + + def _sync(self) -> None: + if self._last_compute_event: + self._last_compute_event.synchronize() + self._last_compute_event = None + + @property + def _valid_state(self): + return self._finalizer.alive + + @property + @precondition(_check_valid_state) + def _validated_ptr(self): + return self._ptr + + def _maybe_instantiate(self, ctx: WorkStream) -> None: + """ + Instantiate this instance if it hasn't been instantiated yet. + """ + if self._ctx is not None: + if self._ctx != ctx: + raise ValueError( + "Using an ElementaryOperator with a different WorkStream from its original WorkStream is not supported." 
+ ) + if not self._ctx: + self._ctx = ctx + if self._data.device == "cpu": + # NOTE: Check if this preserve stridedness + self._data = single_tensor_to( + self._data, self._ctx.device_id, self._ctx._stream_holder + ) + else: + try: + assert self._data.device_id == self._ctx.device_id + except AssertionError as e: + raise RuntimeError( + "Device id of input array does not match device id of library context." + ) from e + self._instantiate() + register_with(self, self._ctx, self._ctx.logger) + + def _instantiate(self) -> None: + """ + Instantiate an ElementaryOperator. + """ + sparsity, num_diagonals, diagonal_offsets = self._sparsity_and_diagonal_offsets + self._ptr = cudm.create_elementary_operator( + self._ctx._handle._validated_ptr, + self.num_modes, + self.mode_dims, + sparsity, + num_diagonals, + diagonal_offsets, + self._dtype, + self._data.data_ptr, + self._callback, + ) + + self._finalizer = weakref.finalize( + self, + generic_finalizer, + self._ctx.logger, + self._upstream_finalizers, + (cudm.destroy_elementary_operator, self._ptr), + msg=f"Destroying ElementaryOperator instance {self}, ptr: {self._ptr}.", + ) + + +class DenseOperator(ElementaryOperator): + """ + DenseOperator(data, callback=None) + + Dense elementary operator from data buffer and optional callback. + + Args: + data: Data buffer for operator elements. + callback: A CPU callback function with signature ``(t, args) -> np.ndarray``. + + .. note:: + - A copy will be created on the data buffer and can be accessed through the :attr:`data` attribute. + - The returned array needs to be consistent with the provided data buffer in terms of shape and data type. + The data buffer will be updated when this instance is involved in a ``compute`` method of an :class:`Operator` or :class:`OperatorAction`. + + Examples: + + >>> import numpy as np + >>> from cuquantum.densitymat import DenseOperator + + Suppose we want to construct a creation operator on a Hilbert space of dimension 3 as a ``DenseOperator``. It can be constructed from the data buffer directly as + + >>> data = np.array([ + >>> [0, 0, 0], + >>> [1.0, 0, 0], + >>> [0, np.sqrt(2), 0], + >>> ]) + >>> dense_op = DenseOperator(data) + """ + + def __init__(self, data: NDArrayType, callback: CallbackType | None = None) -> None: + """ + Initialize a dense elementary operator from data buffer and optional callback. + """ + super().__init__(data, callback) + self.shape = data.shape + + self.num_modes = len(self.shape) // 2 + self.mode_dims = self.data.shape[: self.num_modes] + + @property + def _sparsity_and_diagonal_offsets(self): + return cudm.ElementaryOperatorSparsity.OPERATOR_SPARSITY_NONE, 0, 0 + + def to_array( + self, t: float | None = None, args: Sequence | None = None, device: str = "cpu" + ) -> NDArrayType: + r""" + Return the array form of the dense elementary operator. + + Args: + t: Time variable in callback, only required if callback is not ``None``. + args: Additional arguments in callback, only required if callback is not ``None``. + device: Device on which to return the array. Defaults to ``"cpu"``. + + Returns: + Array form of the dense elementary operator on the specified device. + """ + if self.callback is None: + return self._data.to(device, StreamHolder(obj=cp.cuda.Stream())) + else: + if t is None or args is None: + raise ValueError( + "For a DenseOperator with callback, callback arguments must be passed in " + "when converted to an array." 
+ ) + return self.callback(t, args) + + def copy(self) -> "DenseOperator": + """ + Return a copy of the dense elementary operator. + """ + return DenseOperator(single_tensor_copy(self._data, self._ctx), self.callback) + + @staticmethod + def _unary_operation(operation): + def _operation(dense_op): + if dense_op.callback is None: + data = operation(dense_op.data) + callback = None + else: + data = dense_op._data.module.empty_like(dense_op.data, dtype=dense_op.dtype) + callback = lambda t, args: operation(dense_op.callback(t, args)) + return DenseOperator(data, callback) + + return _operation + + @precondition(ElementaryOperator._check_scalar_operation_compability) + def __mul__(self, scalar: Number) -> "DenseOperator": + """ + Multiply this instance with a scalar on the left. + """ + if self._data.device_id is None: + return DenseOperator._unary_operation(lambda x: x * scalar)(self) + else: + with cp.cuda.Device(self._data.device_id): + return DenseOperator._unary_operation(lambda x: x * scalar)(self) + + @precondition(ElementaryOperator._check_scalar_operation_compability) + def __rmul__(self, scalar: Number) -> "DenseOperator": + """ + Multiply this instance with a scalar on the right. + """ + return self * scalar + + @_unary_operation + def _dag(data): + return transpose_bipartite_tensor(data).conj() + + def dag(self) -> "DenseOperator": + """ + Return the conjugate complex transpose of this instance. + """ + return self._dag() + + @staticmethod + def _binary_operation(operation): + def _operation(dense_op1, dense_op2): + if dense_op1.callback is None and dense_op2.callback is None: + data = operation(dense_op1.data, dense_op2.data) + callback = None + else: # result is dynamic DenseOperator + data = dense_op1._data.module.empty_like( + dense_op1.data, + dtype=dense_op1._data.module.promote_types(dense_op1.dtype, dense_op2.dtype), + ) + if dense_op1.callback is None and dense_op2.callback is not None: + stream_holder = StreamHolder(obj=cp.cuda.Stream()) + with cuda_call_ctx(stream_holder, timing=False): + data1 = dense_op1._data.to(stream_holder=stream_holder) + callback = lambda t, args: operation(data1, dense_op2.callback(t, args)) + elif dense_op1.callback is not None and dense_op2.callback is None: + stream_holder = StreamHolder(obj=cp.cuda.Stream()) + with cuda_call_ctx(stream_holder, timing=False): + data2 = dense_op2._data.to(stream_holder=stream_holder) + callback = lambda t, args: operation(dense_op1.callback(t, args), data2) + else: # both inputs are dynamic + callback = lambda t, args: operation( + dense_op1.callback(t, args), dense_op2.callback(t, args) + ) + return DenseOperator(data, callback) + + return _operation + + @precondition(ElementaryOperator._check_binary_operation_compability, what="addition") + def __add__(self, other: ElementaryOperatorType) -> "DenseOperator": + """ + Add an elementary operator to this instance and return a new :class:`DenseOperator`. 
+ """ + if isinstance(other, DenseOperator): + if self._data.device_id is None: + return DenseOperator._binary_operation(add)(self, other) + else: + with cp.cuda.Device(self._data.device_id): + return DenseOperator._binary_operation(add)(self, other) + else: + package = self._data.module if self.callback is None and other.callback is None else np + return self + other._to_dense(package=package) + + @precondition(ElementaryOperator._check_binary_operation_compability, what="subtraction") + def __sub__(self, other: ElementaryOperatorType) -> "DenseOperator": + """ + Subtract an elementary operator from this instance and return a new :class:`DenseOperator`. + """ + if isinstance(other, DenseOperator): + if self._data.device_id is None: + return DenseOperator._binary_operation(sub)(self, other) + else: + with cp.cuda.Device(self._data.device_id): + return DenseOperator._binary_operation(sub)(self, other) + else: + package = self._data.module if self.callback is None and other.callback is None else np + return self - other._to_dense(package=package) + + @_binary_operation + def _matmul(a, b): + return ( + (matricize_bipartite_tensor(a) @ matricize_bipartite_tensor(b)) + .reshape(a.shape) + .copy(order="F") + ) + + @precondition( + ElementaryOperator._check_binary_operation_compability, what="matrix multiplication" + ) + def __matmul__(self, other: ElementaryOperatorType) -> "DenseOperator": + """ + Perform matrix multiplication between this instance and an elementary operator and return a new :class:`DenseOperator`. + """ + if isinstance(other, DenseOperator): + if self._data.device_id is None: + return self._matmul(other) + else: + with cp.cuda.Device(self._data.device_id): + return self._matmul(other) + else: + package = self._data.module if self.callback is None and other.callback is None else np + return self @ other._to_dense(package=package) + + +class MultidiagonalOperator(ElementaryOperator): + """ + MultidiagonalOperator(data, offsets, callback=None) + + Multidiagonal single-mode operator from data buffer, offsets and optional callback. + + Args: + data: Data buffer for diagonal elements, of shape ``(mode_dimension, num_diagonals)``. + offsets: The diagonal offsets of length ``num_diagonals``. + callback: A CPU callback function with signature ``(t, args) -> np.ndarray``. + + .. note:: + - The data layout is different from :class:`scipy.sparse.dia_matrix` and :class:`cupyx.scipy.sparse.dia_matrix`. + In this class, the elements of the ``offsets[i]``-th diagonal corresponds to the ``i``-th column of the input data buffer read from the top of the column. + - A copy will be created on the data buffer and can be accessed through the :attr:`data` attribute. + - The returned array needs to be consistent with the provided data buffer in terms of shape and data type. + The data buffer will be updated when this instance is involved in a ``compute`` method of an :class:`Operator` or :class:`OperatorAction`. + + Examples: + + >>> import numpy as np + >>> from cuquantum.densitymat import MultidiagonalOperator + + Suppose we want to construct a creation operator on a Hilbert space of dimension 3 as a ``MultidiagonalOperator``. It can be constructed from the data buffer and diagonal offsets as + + >>> data = np.array([[1], [np.sqrt(2)], [0]]) # the last element doesn't matter + >>> offsets = [-1] + >>> dia_op = MultidiagonalOperator(data, offsets) + + If we already have the elementary operator in :class:`scipy.sparse.dia_matrix` format, e.g, + + >>> dia_matrix = scipy.sparse.dia_matrix(...) 
# put your data here + + We can create a ``MultidiagonalOperator`` with the following: + + >>> offsets = list(dia_matrix.offsets) + >>> data = np.zeros((dia_matrix.shape[0], len(offsets)), dtype=dia_matrix.dtype) + >>> for i, offset in enumerate(offsets): + >>> end = None if offset == 0 else -abs(offset) + >>> data[:end, i] = dia_matrix.diagonal(offset) + >>> dia_op = MultidiagonalOperator(data, offsets) + """ + + def __init__( + self, data: NDArrayType, offsets: Sequence[int], callback: CallbackType | None = None + ) -> None: + """ + Initialize a multidiagonal single-mode operator from data buffer, offsets and optional callback. + """ + if len(offsets) != len(set(offsets)): + raise ValueError("Offsets cannot contain duplicate elements.") + if data.shape[1] != len(offsets): + raise ValueError("Number of columns in data does not match length of offsets.") + + super().__init__(data, callback) + self.offsets = list(offsets) + + mode_dim, self.num_diagonals = data.shape + self.shape = (mode_dim, mode_dim) + self.num_modes = 1 + self.mode_dims = (mode_dim,) + + @property + def _sparsity_and_diagonal_offsets(self): + return ( + cudm.ElementaryOperatorSparsity.OPERATOR_SPARSITY_MULTIDIAGONAL, + self.num_diagonals, + self.offsets, + ) + + @staticmethod + def _unary_operation(return_type: str = "multidiagonal"): + assert return_type in ["multidiagonal", "dense"] + + def _decorator(operation): + def _operation(dia_op, offsets=None, package=None): + if offsets is None: + offsets = dia_op.offsets + if package is None: + package = dia_op._data.module + + if dia_op.callback is None: + data = operation(dia_op.data, offsets, dia_op._data.module) + callback = None + else: + if return_type == "multidiagonal": + data = dia_op._data.module.empty_like(dia_op.data, dtype=dia_op.dtype) + else: + data = dia_op._data.module.empty(dia_op.shape, dtype=dia_op.dtype) + + callback = lambda t, args: operation(dia_op.callback(t, args), offsets, package) + + if return_type == "multidiagonal": + return MultidiagonalOperator(data, offsets, callback) + else: + return DenseOperator(data, callback) + + return _operation + + return _decorator + + @staticmethod + def _multidiagonal_to_dense(sparse_data, offsets, package): + shape = (sparse_data.shape[0], sparse_data.shape[0]) + dense_matrix = package.zeros(shape, sparse_data.dtype, order="F") + row, col = package.indices(shape) + for i, offset in enumerate(offsets): + dense_matrix[row == col - offset] = ( + sparse_data[: -abs(offset), i] if offset != 0 else sparse_data[:, i] + ) + return dense_matrix + + @_unary_operation("dense") + def _to_dense(sparse_data, offsets, package): + return MultidiagonalOperator._multidiagonal_to_dense(sparse_data, offsets, package) + + def to_dense(self) -> DenseOperator: + """ + Return the :class:`DenseOperator` form of the multidiagonal elementary operator. + """ + return self._to_dense() + + def to_array( + self, t: float | None = None, args: Sequence | None = None, device: str = "cpu" + ) -> NDArrayType: + """ + Return the array form of the multidiagonal elementary operator. + + Args: + t: Time variable in callback, only required if callback is not ``None``. + args: Additional arguments in callback, only required if callback is not ``None``. + device: Device on which to return the array. Defaults to ``"cpu"``. + + Returns: + Array form of the multidiagonal elementary operator on the specified device. + + .. note:: + This function returns the dense array form of the multidiagonal elementary operator.
If the original data buffer containing the diagonal elements is needed, use the :attr:`data` attribute. + """ + package = np if device == "cpu" else cp + return self._to_dense(package=package).to_array(t, args, device) + + def copy(self) -> "MultidiagonalOperator": + """ + Return a copy of the multidiagonal elementary operator. + """ + return MultidiagonalOperator( + single_tensor_copy(self._data, self._ctx), self.offsets, self.callback + ) + + @precondition(ElementaryOperator._check_scalar_operation_compability) + def __mul__(self, scalar: Number) -> "MultidiagonalOperator": + """ + Multiply this instance with a scalar on the left. + """ + + @MultidiagonalOperator._unary_operation() + def _mul(data, offsets, package): + return data * scalar + + if self._data.device_id is None: + return _mul(self) + else: + with cp.cuda.Device(self._data.device_id): + return _mul(self) + + @precondition(ElementaryOperator._check_scalar_operation_compability) + def __rmul__(self, scalar: Number) -> "MultidiagonalOperator": + """ + Multiply this instance with a scalar on the right. + """ + return self * scalar + + @_unary_operation() + def _dag(data, offsets, package): + return data.conj() + + def dag(self) -> "MultidiagonalOperator": + """ + Return the conjugate complex transpose of this instance. + """ + offsets = [-offset for offset in self.offsets] + return self._dag(offsets) + + @staticmethod + def _binary_operation(operation): + def _operation(dia_op1, dia_op2, offsets): + if dia_op1.callback is None and dia_op2.callback is None: + data = operation( + dia_op1.data, + dia_op1.offsets, + dia_op2.data, + dia_op2.offsets, + offsets, + package=dia_op1._data.module, + ) + callback = None + else: # result is dynamic MultidiagonalOperator + data = dia_op1._data.module.empty( + (dia_op1.shape[0], len(offsets)), + dtype=np.promote_types(dia_op1.dtype, dia_op2.dtype), + ) + if dia_op1.callback is None and dia_op2.callback is not None: + stream_holder = StreamHolder(obj=cp.cuda.Stream()) + with cuda_call_ctx(stream_holder, timing=False): + data1 = dia_op1._data.to(stream_holder=stream_holder) + callback = lambda t, args: operation( + data1, + dia_op1.offsets, + dia_op2.callback(t, args), + dia_op2.offsets, + offsets, + package=np, + ) + elif dia_op1.callback is not None and dia_op2.callback is None: + stream_holder = StreamHolder(obj=cp.cuda.Stream()) + with cuda_call_ctx(stream_holder, timing=False): + data2 = dia_op2._data.to(stream_holder=stream_holder) + callback = lambda t, args: operation( + dia_op1.callback(t, args), + dia_op1.offsets, + data2, + dia_op2.offsets, + offsets, + package=np, + ) + else: # both inputs are dynamic + callback = lambda t, args: operation( + dia_op1.callback(t, args), + dia_op1.offsets, + dia_op2.callback(t, args), + dia_op2.offsets, + offsets, + package=np, + ) + return MultidiagonalOperator(data, offsets, callback) + + return _operation + + @_binary_operation + def _add(a, offsets_a, b, offsets_b, offsets, package): + data = package.zeros( + (a.shape[0], len(offsets)), + dtype=package.promote_types(a.dtype, b.dtype), + order="F", + ) + for i, offset in enumerate(offsets): + if offset in offsets_a: + index = offsets_a.index(offset) + data[:, i] += a[:, index] + if offset in offsets_b: + index = offsets_b.index(offset) + data[:, i] += b[:, index] + return data + + @precondition(ElementaryOperator._check_binary_operation_compability, what="addition") + def __add__(self, other: ElementaryOperatorType) -> ElementaryOperatorType: + """ + Add an elementary operator to this instance and 
return a new elementary operator of the same type as ``other``. + """ + if isinstance(other, MultidiagonalOperator): + offsets = sorted(list(set(self.offsets) | set(other.offsets))) + if self._data.device_id is None: + return self._add(other, offsets) + else: + with cp.cuda.Device(self._data.device_id): + return self._add(other, offsets) + else: + package = self._data.module if self.callback is None and other.callback is None else np + return self._to_dense(package=package) + other + + @_binary_operation + def _sub(a, offsets_a, b, offsets_b, offsets, package): + data = package.zeros( + (a.shape[0], len(offsets)), + dtype=package.promote_types(a.dtype, b.dtype), + order="F", + ) + for i, offset in enumerate(offsets): + if offset in offsets_a: + index = offsets_a.index(offset) + data[:, i] += a[:, index] + if offset in offsets_b: + index = offsets_b.index(offset) + data[:, i] -= b[:, index] + return data + + @precondition(ElementaryOperator._check_binary_operation_compability, what="subtraction") + def __sub__(self, other: ElementaryOperatorType) -> ElementaryOperatorType: + """ + Subtract an elementary operator from this instance and return a new elementary operator of the same type as ``other``. + """ + if isinstance(other, MultidiagonalOperator): + offsets = sorted(list(set(self.offsets) | set(other.offsets))) + if self._data.device_id is None: + return self._sub(other, offsets) + else: + with cp.cuda.Device(self._data.device_id): + return self._sub(other, offsets) + else: + package = self._data.module if self.callback is None and other.callback is None else np + return self._to_dense(package=package) - other + + @_binary_operation + def _matmul(a, offsets_a, b, offsets_b, offsets, package): + matrix_dim = a.shape[0] + result_array = MultidiagonalOperator._multidiagonal_to_dense( + a, offsets_a, package + ) @ MultidiagonalOperator._multidiagonal_to_dense(b, offsets_b, package) + + data = package.zeros( + (matrix_dim, len(offsets)), + dtype=package.promote_types(a.dtype, b.dtype), + order="F", + ) + for i, offset in enumerate(offsets): + end = matrix_dim - abs(offset) + data[:, i][:end] = package.diag(result_array, offset) + return data + + @precondition( + ElementaryOperator._check_binary_operation_compability, what="matrix multiplication" + ) + def __matmul__(self, other: ElementaryOperatorType) -> ElementaryOperatorType: + """ + Perform matrix multiplication between this instance and another elementary operator and return a new elementary operator of the same type as ``other``. 
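+
+        A minimal sketch of the expected semantics (``a`` and ``b`` below are assumed, caller-constructed example operators, not library fixtures):
+
+        >>> a = MultidiagonalOperator(np.ones((4, 1)), offsets=[0])  # main diagonal only
+        >>> b = MultidiagonalOperator(np.ones((4, 1)), offsets=[1])  # first superdiagonal
+        >>> c = a @ b             # MultidiagonalOperator with offsets [1]
+        >>> d = a @ b.to_dense()  # DenseOperator, since ``other`` is dense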
+ """ + if isinstance(other, MultidiagonalOperator): + offsets = np.kron(self.offsets, np.ones(len(other.offsets), dtype=int)) + np.kron( + np.ones(len(self.offsets), dtype=int), other.offsets + ) + offsets = np.unique(offsets) + matrix_dim = self.shape[0] + offsets = list(offsets[np.logical_and(-matrix_dim < offsets, offsets < matrix_dim)]) + + if self._data.device_id is None: + return self._matmul(other, offsets) + else: + with cp.cuda.Device(self._data.device_id): + return self._matmul(other, offsets) + else: + package = self._data.module if self.callback is None and other.callback is None else np + return self._to_dense(package=package) @ other diff --git a/python/cuquantum/densitymat/operators.py b/python/cuquantum/densitymat/operators.py new file mode 100644 index 0000000..b9d5b0c --- /dev/null +++ b/python/cuquantum/densitymat/operators.py @@ -0,0 +1,1249 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + + +from typing import Iterable, Optional, Tuple, List, Set, Union, Sequence, Callable +from numbers import Number +import weakref +import collections + +import cupy as cp +from cuquantum.cutensornet._internal import typemaps as cutn_typemaps +from cuquantum.cutensornet._internal import utils as cutn_utils +import numpy as np + +from cuquantum.bindings import cudensitymat as cudm +from .elementary_operator import ElementaryOperator, DenseOperator, MultidiagonalOperator + +from .state import State +from .work_stream import WorkStream +from ._internal.callbacks import CallbackCoefficient +from ._internal import utils +from ._internal.utils import NDArrayType, InvalidObjectState + + +__all__ = [ + "tensor_product", + "OperatorTerm", + "Operator", + "OperatorAction", +] + +ScalarCallbackType = Callable[[Number, Sequence], Number] + + +class OperatorTerm: + """ + Operator term consisting of tensor products of elementary operators. + + An :class:`OperatorTerm` containing a tensor product of elementary operators can be obtained from the free function :func:`tensor_product`. Sums of more than a single product are obtained by in-place (``+=``) or out-of-place addition (``+``) of :class:`OperatorTerm` objects. + + Args: + dtype: Numeric data type of the underlying elementary operators' data. Defaults to ``None`` and will be inferred from the appended tensor products of elementary operators. + + .. note:: + - Scalar operators, for which no product is appended, require specification of ``dtype`` at construction. + """ + + def __init__(self, dtype: Optional[str] = None): + """ + Initialize an operator term consisting of tensor products of elementary operators. + """ + self.terms = [] + self.modes = [] + self.duals = [] + + self._finalizer = weakref.finalize(self, lambda: None) + self._finalizer.detach() + + self._coefficients = [] + self._dtype: Optional[str] = dtype # TODO: check for validity + self._hilbert_space_dims = None + self._ptr = None + self._ctx: "WorkStream" = None + self._last_compute_event = None + self._using_ops: set[ElementaryOperator] = set() + self._upstream_finalizers = collections.OrderedDict() + + def _check_valid_state(self, *args, **kwargs): + if not self._valid_state: + raise InvalidObjectState("The operator term cannot be used after resources are freed") + + @property + def _valid_state(self) -> bool: + return self._finalizer.alive + + @property + def hilbert_space_dims(self) -> Tuple[int]: + """ + Hilbert space dimensions of this `OperatorTerm`. 
+        """
+        return self._hilbert_space_dims
+
+    @property
+    def dtype(self) -> str:
+        """
+        Data type of this :class:`OperatorTerm`.
+        """
+        return self._dtype
+
+    @property
+    @cutn_utils.precondition(_check_valid_state)
+    def _validated_ptr(self):
+        return self._ptr
+
+    def _append(
+        self,
+        elem_ops: Iterable[ElementaryOperator],
+        modes: Iterable[int],
+        duals: Iterable[bool],
+        coeff: Union[Callable, Number, CallbackCoefficient],
+    ) -> None:
+        """
+        Appends a product of tensor operators to this instance's ``terms`` and the associated coefficient to ``coefficients``.
+
+        Args:
+            elem_ops: The elementary operators making up the product.
+            modes: For each elementary operator, the mode indices in Hilbert space over which it is supported.
+            duals: For each elementary operator, whether it acts on the ket (default, dual=False) or the bra (dual=True) space for the respective mode.
+            coeff: The coefficient for the appended product.
+                A static coefficient is passed as a number.
+                A dynamic coefficient is passed as a Callable.
+                Passing a coefficient (static, dynamic, or both) wrapped in the CallbackCoefficient class is also supported.
+        """
+        if self._valid_state:
+            raise RuntimeError(
+                "Cannot append to OperatorTerm after its C-API counterpart has been instantiated."
+            )
+        self._check_dtype(elem_ops)
+        assert len(elem_ops) == len(modes)
+        assert len(elem_ops) == len(duals)
+
+        for operand, operand_modes in zip(elem_ops, modes):
+            _shape = operand.shape
+            assert len(_shape) % 2 == 0 and len(_shape) // 2 == len(operand_modes)
+
+        self.terms.append(elem_ops)
+        self.modes.append(modes)
+        self.duals.append(duals)
+        if not isinstance(coeff, CallbackCoefficient):
+            if isinstance(coeff, Callable):
+                coeff = CallbackCoefficient(coeff)
+            elif isinstance(coeff, Number):
+                coeff = CallbackCoefficient(None, coeff)
+            else:
+                raise TypeError(
+                    "`coeff` arguments that are not a Number, Callable or CallbackCoefficient are not supported."
+                )
+        self._coefficients.append(coeff)
+
+    def _check_dtype(self, operands):
+        """
+        Checks that the operands to be appended to ``self.terms`` share a single dtype and that it matches ``self._dtype``.
+        If ``self._dtype`` has not been set yet, this method will set it (unless empty operands are passed).
+        """
+        # handle case of empty operator
+        if len(operands) == 0:
+            if self._dtype is None:
+                raise TypeError(
+                    "OperatorTerms consisting of scalar terms need to specify a data type."
+                )
+            return
+        # check consistency of operands dtypes
+        dtypes = {op.dtype for op in operands}
+        dtype = dtypes.pop()
+
+        if len(dtypes) != 0:
+            raise TypeError(
+                "The provided operands have more than one data type, which is not supported. Please cast to the same data type."
+            )
+
+        # check consistency of operands dtypes with this instance's dtype
+        if self._dtype is None:
+            self._dtype = dtype
+        elif dtype is not None:
+            try:
+                assert self._dtype == dtype
+            except AssertionError as e:
+                raise TypeError(
+                    "The provided operands are required to have the same data type as this OperatorTerm instance."
+                ) from e
+
+    def _append_product(
+        self,
+        elem_ops: Sequence[ElementaryOperator],
+        modes: Sequence[int],
+        duals: Sequence[bool],
+        coeff: CallbackCoefficient,
+    ):
+        """
+        Appends a product of ElementaryOperators to the C-API counterpart of this OperatorTerm.
+        Before appending, the creation of the C-API counterpart of any ElementaryOperator in the product is triggered if necessary.
+        """
+        ptrs = []
+        flattened_modes = []
+        flattened_duals = []
+        for elem_op, _modes, _duals in zip(elem_ops, modes, duals):
+            elem_op._maybe_instantiate(self._ctx)
+            assert elem_op.data.flags["F_CONTIGUOUS"]
+            self._using_ops.add(elem_op)
+            utils.register_with(self, elem_op, self._ctx.logger)
+            ptrs.append(elem_op._validated_ptr)
+            flattened_modes.extend(_modes)
+            flattened_duals.extend(map(int, _duals))
+        cudm.operator_term_append_elementary_product(
+            self._ctx._handle._validated_ptr,
+            self._ptr,
+            len(elem_ops),
+            ptrs,
+            flattened_modes,
+            flattened_duals,
+            np.complex128(coeff.scalar),
+            coeff._wrapped_callback,
+        )
+
+    def _maybe_instantiate(self, ctx: "WorkStream", hilbert_space_dims: Tuple[int]) -> None:
+        """
+        Create the C-API equivalent of this instance (and potentially of its downstream dependencies) and store the pointer as an attribute.
+
+        Args:
+            ctx: WorkStream
+                Library context, workspace, stream and other configuration information.
+            hilbert_space_dims: Tuple[int]
+                The local Hilbert space dimensions as an iterable.
+        """
+        if not self._valid_state:
+            self._ctx = ctx
+            self._hilbert_space_dims = tuple(hilbert_space_dims)
+            num_space_modes = len(hilbert_space_dims)
+            if self._dtype is None:
+                raise RuntimeError("Cannot use an OperatorTerm with unspecified data type.")
+            self._ptr = cudm.create_operator_term(
+                self._ctx._handle._validated_ptr, num_space_modes, self.hilbert_space_dims
+            )
+            self._finalizer = weakref.finalize(
+                self,
+                utils.generic_finalizer,
+                self._ctx.logger,
+                self._upstream_finalizers,
+                (cudm.destroy_operator_term, self._ptr),
+                msg=f"Destroying OperatorTerm instance {self}, ptr: {self._ptr}",
+            )
+            utils.register_with(self, self._ctx, self._ctx.logger)
+            for term, modes, duals, coeff in zip(
+                self.terms, self.modes, self.duals, self._coefficients
+            ):
+                self._append_product(term, modes, duals, coeff)
+
+        else:
+            try:
+                assert self._ctx == ctx
+            except AssertionError as e:
+                raise ValueError(
+                    "Using an object with a different WorkStream than it was originally used with is not supported."
+                ) from e
+            try:
+                assert self._hilbert_space_dims == tuple(hilbert_space_dims)
+            except AssertionError as e:
+                raise ValueError(
+                    "Using an OperatorTerm with different Hilbert space dimensions than it was originally used with is not supported."
+                ) from e
+
+    def __add__(self, other: "OperatorTerm") -> "OperatorTerm":
+        """
+        Return a new :class:`OperatorTerm` equal to the sum of this :class:`OperatorTerm` and another :class:`OperatorTerm`.
+        """
+        if not isinstance(other, OperatorTerm):
+            raise TypeError(
+                f"Cannot add {type(other)} to OperatorTerm. OperatorTerm only supports addition of OperatorTerm."
+            )
+        if self._dtype is None or self._dtype != other.dtype:
+            raise TypeError(
+                f"Cannot add OperatorTerm of datatype {self._dtype} and datatype {other._dtype}."
+            )
+        new_terms = [*self.terms, *other.terms]
+        new_modes = [*self.modes, *other.modes]
+        new_duals = [*self.duals, *other.duals]
+        new_coefficients = [*self._coefficients, *other._coefficients]
+        new_opterm = OperatorTerm(dtype=self._dtype)
+        # append method will raise an error if dtypes are not compatible
+        for term, modes, duals, coeff in zip(new_terms, new_modes, new_duals, new_coefficients):
+            new_opterm._append(term, modes, duals, coeff)
+        return new_opterm
+
+    def __iadd__(self, other: "OperatorTerm") -> "OperatorTerm":
+        """
+        Inplace add another :class:`OperatorTerm` into this :class:`OperatorTerm`.
+        """
+        if self._valid_state:
+            raise RuntimeError(
+                "Cannot in-place add to this OperatorTerm after either\n\
+                a) a prepare or compute method has been executed on an Operator depending on this instance, or\n\
+                b) an OperatorAction has been created that depends on an Operator that depends on this instance."
+            )
+        if not isinstance(other, OperatorTerm):
+            raise TypeError(
+                f"Cannot in-place add {type(other)} to OperatorTerm. OperatorTerm only supports in-place addition of OperatorTerm."
+            )
+        # TODO: allow self to have indefinite dtype if other has definite dtype
+        if not (self._dtype and self._dtype == other.dtype):
+            raise TypeError(
+                "In-place addition requires this OperatorTerm to have a definite data type matching that of `other`."
+            )
+        for term, modes, duals, coeff in zip(
+            other.terms, other.modes, other.duals, other._coefficients
+        ):
+            self._append(term, modes, duals, coeff)
+        if self._dtype is None and other._dtype is not None:
+            self._dtype = other._dtype
+        return self
+
+    def __mul__(self, other: Union[Number, Callable, "OperatorTerm"]) -> "OperatorTerm":
+        """
+        Multiply this :class:`OperatorTerm` by a number, callable or another :class:`OperatorTerm` on the right (``self * other``).
+        """
+        if isinstance(other, (Number, Callable)):
+            new_opterm = OperatorTerm(dtype=self._dtype)
+            for term, modes, duals, coeff in zip(
+                self.terms, self.modes, self.duals, self._coefficients
+            ):
+                new_opterm._append(term, modes, duals, other * coeff)
+        elif isinstance(other, OperatorTerm):
+            if other.dtype is not None:
+                if self._dtype is None:
+                    dtype = other.dtype
+                elif self._dtype != other.dtype:
+                    raise ValueError(
+                        f"Data types of OperatorTerms to be multiplied, {self.dtype} and {other.dtype}, do not match."
+                    )
+                else:
+                    dtype = self._dtype
+            else:
+                dtype = self._dtype
+            new_opterm = OperatorTerm(dtype=dtype)
+            for term_l, modes_l, duals_l, coeff_l in zip(
+                self.terms, self.modes, self.duals, self._coefficients
+            ):
+                for term_r, modes_r, duals_r, coeff_r in zip(
+                    other.terms, other.modes, other.duals, other._coefficients
+                ):
+                    new_terms = [*term_l, *term_r]
+                    new_modes = [*modes_l, *modes_r]
+                    new_duals = [*duals_l, *duals_r]
+                    new_opterm._append(new_terms, new_modes, new_duals, coeff_l * coeff_r)
+        else:
+            raise TypeError(
+                f"Cannot multiply OperatorTerm by {type(other)}. OperatorTerm only supports multiplication by Number, Callable or OperatorTerm."
+            )
+        return new_opterm
+
+    def __rmul__(self, other: Union[Number, Callable, "OperatorTerm"]) -> "OperatorTerm":
+        """
+        Multiply this :class:`OperatorTerm` by a number, callable or another :class:`OperatorTerm` on the left (``other * self``).
+        """
+        return self * other
+
+    def dag(self) -> "OperatorTerm":
+        """
+        Return a new :class:`OperatorTerm` equal to the complex conjugate of this :class:`OperatorTerm`.
+
+        .. warning::
+            An error will be raised if the :class:`OperatorTerm` contains tensor products of elementary operators acting on both bra and ket modes at the same time.
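+
+        A minimal sketch of the intended usage (the 2x2 array is an assumed example input):
+
+        >>> a = np.array([[0.0, 1.0], [0.0, 0.0]], dtype="complex128")
+        >>> term = tensor_product((DenseOperator(a), (0,)), coeff=1j)
+        >>> term_dag = term.dag()  # elementary operators are adjointed, coefficient conjugated to -1j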
+ """ + if not all([all(len(set(_dual)) == 1 for _dual in dual) for dual in self.duals[::-1]]): + raise NotImplementedError( + "OperatorTerm's `dag` method is only supported if none of its products contains ElementaryOperators acting on both bra and ket modes at the same time." + ) + # TODO: perform dag on self._using_ops in order to not create more daggered ElementaryOperator's than necessary within this scope + new_opterm = OperatorTerm(self._dtype) + for term, modes, duals, coeff in zip( + self.terms, self.modes, self.duals, self._coefficients + ): + new_opterm._append( + [op.dag() for op in term[::-1]], modes[::-1], duals[::-1], coeff.conjugate() + ) + return new_opterm + + def dual(self) -> "OperatorTerm": + """ + Return a new :class:`OperatorTerm` with duality reversed. + """ + new_opterm = OperatorTerm(self._dtype) + for term, modes, duals, coeff in zip( + self.terms, self.modes, self.duals, self._coefficients + ): + recursive_logical_not = lambda x: ( + not x if isinstance(x, bool) else list(map(recursive_logical_not, x)) + ) + new_opterm._append(term[::-1], modes[::-1], recursive_logical_not(duals[::-1]), coeff) + return new_opterm + + def _sync(self): + if self._last_compute_event is not None: + self._last_compute_event.synchronize() + self._last_compute_event = None + + +class Operator: + """ + Operator(hilbert_space_dims, *terms) + + Operator representing a collection of :class:`OperatorTerm` objects. + + The action of an :class:`Operator` maps a ``State`` to another ``State``. + An :class:`Operator` acts on an instance of ``State`` through its ``compute`` method after its ``prepare`` method is called on the same instance of ``State``. + + Args: + hilbert_space_dims: Hilbert space dimensions of the physical system. + terms: A sequence of tuples specifying each term. + Each tuple can consist of a single element (:class:`OperatorTerm`), two elements (:class:`OperatorTerm` and coefficient), or three elements (:class:`OperatorTerm`, coefficient and duality). + If the second or third element is not given, they will be set to the default values (``coefficient=1``, ``duality=False``). + """ + + def __init__( + self, + hilbert_space_dims: Sequence[int], + *terms: Tuple[OperatorTerm, Optional[Union[Number, Tuple]], Optional[bool]] + ) -> None: + """ + Initialize an operator representing a collection of :class:`OperatorTerm` objects. + """ + self._finalizer = weakref.finalize(self, lambda: None) + self._finalizer.detach() + self._using_terms: Set[OperatorTerm] = set() + self._using_ops: Set[ElementaryOperator] = set() + self._ctx = None + + self._hilbert_space_dims: Tuple[int] = tuple(hilbert_space_dims) + + self._dtype = None # str + self.terms: List[OperatorTerm] = [] + self._coefficients: List[CallbackCoefficient] = [] + self.dualities: List[bool] = [] + + self._ptr = None + self._expectation_ptr = None + self._work_size = None + self._expectation_work_size = None + self._last_compute_event = None + + self._current_expectation_compute_type = None + self._current_action_compute_type = None + self._upstream_finalizers = collections.OrderedDict() + + for term in terms: + self._append(*term) + + def _check_valid_state(self, *args, **kwargs): + """ """ + if not self._valid_state: + raise InvalidObjectState("The operator cannot be used after resources have been freed!") + + @property + def _valid_state(self): + return self._finalizer.alive + + @property + def dtype(self): + """ + Data type of this :class:`Operator`. 
+        """
+        return self._dtype
+
+    @property
+    def hilbert_space_dims(self):
+        """
+        Hilbert space dimensions of this :class:`Operator`.
+        """
+        return self._hilbert_space_dims
+
+    @property
+    @cutn_utils.precondition(_check_valid_state)
+    def _validated_ptr(self):
+        """
+        The pointer to this instance's C-API counterpart.
+        """
+        return self._ptr
+
+    @property
+    @cutn_utils.precondition(_check_valid_state)
+    def _validated_expectation_ptr(self):
+        """
+        The pointer to this instance's expectation C-API counterpart.
+        """
+        return self._expectation_ptr
+
+    def _sync(self):
+        if self._last_compute_event is not None:
+            self._last_compute_event.synchronize()
+            self._last_compute_event = None
+
+    def append(
+        self,
+        term: OperatorTerm,
+        coeff: Union[Number, ScalarCallbackType] = 1.0,
+        duality: bool = False,
+    ) -> None:
+        """
+        Append an :class:`OperatorTerm` to this :class:`Operator`.
+
+        Args:
+            term: The :class:`OperatorTerm` to be appended.
+            coeff: The coefficient associated with this :class:`OperatorTerm`.
+            duality: Whether the elementary operators in ``term`` are applied on ket modes (``False``) or bra modes (``True``).
+        """
+        self._append(term, coeff, duality)
+
+    def _append(
+        self,
+        term: OperatorTerm,
+        coeff: Optional[Union[Number, Callable, CallbackCoefficient]] = 1.0,
+        duality: Optional[bool] = False,
+    ) -> None:
+        """
+        Appends an OperatorTerm to this Operator.
+
+        Args:
+            term: OperatorTerm
+                The OperatorTerm to be appended.
+            coeff: Union[Number, Callable, CallbackCoefficient]
+                The coefficient associated with this term.
+                A static coefficient is provided as a number.
+                A dynamic coefficient is provided as a callable with signature (t, args: Tuple) -> Number.
+            duality: Optional[bool]
+                Specifies whether the tensor operators in ``term`` are applied on ket or bra modes
+                as specified for its constituents (False) or the opposite (True).
+        """
+        if self._valid_state:
+            # TODO[FUTURE]/TODO[OPTIONAL]: Maybe relax this in the future, requires sync
+            raise RuntimeError(
+                "Cannot in-place add to this Operator after either\n\
+                a) its prepare or compute method has been called or\n\
+                b) an OperatorAction has been created that depends on this Operator."
+            )
+        else:
+            if term.dtype is not None:
+                if self._dtype is None:
+                    self._dtype = term.dtype
+                elif self._dtype != term.dtype:
+                    raise ValueError(
+                        "Data type of OperatorTerm to be appended to Operator does not match data type of Operator."
+                    )
+            elif self._dtype is None:
+                raise ValueError(
+                    "Cannot append an OperatorTerm without definite data type to an Operator without definite data type."
+                )
+            self.terms.append(term)
+            if not isinstance(coeff, CallbackCoefficient):
+                if isinstance(coeff, Callable):
+                    coeff = CallbackCoefficient(coeff)
+                elif isinstance(coeff, Number):
+                    coeff = CallbackCoefficient(None, coeff)
+                else:
+                    raise TypeError(
+                        f"Coefficient of term to be appended to Operator is of unexpected type: {type(coeff)}. Expecting either a Callable, a Number or a CallbackCoefficient."
+                    )
+            self._coefficients.append(coeff)
+            self.dualities.append(duality)
+
+    def _append_internal(self, term: OperatorTerm, coeff: CallbackCoefficient, dual: bool):
+        """
+        Appends `term` to the C-API counterpart of this instance.
+        If OperatorTerm instances in self.terms have not been instantiated,
+        this method will instantiate them.
+ """ + if not isinstance(term, OperatorTerm): + raise TypeError("Can only append instances of OperatorTerm to Operator.") + # side effect on entries of self.terms + term._maybe_instantiate(self._ctx, self.hilbert_space_dims) + utils.register_with(self, term, self._ctx.logger) + self._using_terms.add(term) + self._using_ops = self._using_ops.union(term._using_ops) + cudm.operator_append_term( + self._ctx._handle._validated_ptr, + self._ptr, + term._validated_ptr, + int(dual), + cp.complex128(coeff.scalar), + coeff._wrapped_callback, + ) + + def _maybe_instantiate(self, ctx: "WorkStream") -> None: + """ + Creates the C-API counterpart of this instance, stores its pointer as attribute and appends the terms in + self.terms to this instance's C-API counterpart, triggering the terms' instantiations if they haven't been + instantiated yet. + + Args: + ctx: WorkStream + Library context, workspace, stream and other configuration information. + """ + if self._valid_state: + if self._ctx != ctx: + raise ValueError( + "Operator objects can only be used with a single WorkStream, and this instance was originally used with another WorkStream (either directly or via an OperatorAction). Switching WorkStream is not supported." + ) + else: + self._ctx = ctx + + try: + assert self.dtype is not None + except AssertionError as e: + raise RuntimeError( + "Operator must have a definite data type before indirect usage through OperatorAction or calls to its prepare or compute methods." + ) from e + + self._ptr = cudm.create_operator( + self._ctx._handle._validated_ptr, + len(self.hilbert_space_dims), + self.hilbert_space_dims, + ) + + self._expectation_ptr = cudm.create_expectation( + self._ctx._handle._validated_ptr, self._ptr + ) + + self._finalizer = weakref.finalize( + self, + utils.generic_finalizer, + self._ctx.logger, + self._upstream_finalizers, + (cudm.destroy_expectation, self._expectation_ptr), + (cudm.destroy_operator, self._ptr), + msg=f"Destroying Operator instance {self}, ptr: {self._ptr}", + ) + utils.register_with(self, self._ctx, self._ctx.logger) + + for term, coeff, dual in zip(self.terms, self._coefficients, self.dualities): + self._append_internal(term, coeff, dual) + + def dual(self) -> "Operator": + """ + Return a shallow, partial copy of this :class:`Operator` with flipped duality for each term. + """ + return Operator( + self._hilbert_space_dims, + *( + tuple( + zip( + self.terms, + self._coefficients, + (not (duality) for duality in self.dualities), + ) + ) + ), + ) + + def prepare_action( + self, + ctx: "WorkStream", + state: "State", + state_out: Optional["State"] = None, + compute_type: Optional[str] = None, + ) -> None: + """ + Prepare the action of this :class:`Operator` on an input state and accumulate into the output state. + + Args: + ctx: Library context, which contains workspace, stream and other configuration information. + state: The input quantum state to which the :class:`Operator` is to be applied. + state_out: The output quantum state to which the action is to be accumulated. Defaults to ``state``. + compute_type: The CUDA compute type to be used by the computation. + + .. attention:: + The ``compute_type`` argument is currently not used and will default to the data type. + """ + if not self._valid_state: + self._maybe_instantiate(ctx) + else: + if self._ctx != ctx: + raise ValueError( + "Operator objects can only be used with a single WorkStream, and this instance was originally used with another WorkStream. Switching WorkStream is not supported." 
+            )
+
+        if self.hilbert_space_dims != state.hilbert_space_dims:
+            raise ValueError(
+                f"Hilbert space dimensions of Operator, {self.hilbert_space_dims}, and input State, {state.hilbert_space_dims}, instances do not match."
+            )
+        if state_out is not None and self.hilbert_space_dims != state_out.hilbert_space_dims:
+            raise ValueError(
+                f"Hilbert space dimensions of Operator, {self.hilbert_space_dims}, and output State, {state_out.hilbert_space_dims}, instances do not match."
+            )
+
+        default_compute_type = (
+            self._ctx.compute_type if self._ctx.compute_type is not None else self.dtype
+        )
+        self._current_action_compute_type = compute_type if compute_type else default_compute_type
+
+        cudm.operator_prepare_action(
+            self._ctx._handle._validated_ptr,
+            self._ptr,
+            state._validated_ptr,
+            state_out._validated_ptr if state_out else state._validated_ptr,
+            cutn_typemaps.NAME_TO_COMPUTE_TYPE[self._current_action_compute_type],
+            self._ctx._memory_limit,
+            self._ctx._validated_ptr,
+            0,  # TODO[FUTURE] / TODO[OPTIONAL]: accept stream as optional argument and pass here
+        )
+        self._expectation_work_size = None
+        self._work_size, _ = self._ctx._update_required_size_upper_bound()
+
+    def prepare_expectation(
+        self,
+        ctx: "WorkStream",
+        state: "State",
+        compute_type: Optional[str] = None,
+    ) -> None:
+        """
+        Prepare the computation of an expectation value of this :class:`Operator` on a state.
+
+        Args:
+            ctx: Library context, which contains workspace, stream and other configuration information.
+            state: The quantum state on which the expectation value is evaluated.
+            compute_type: The CUDA compute type to be used by the computation.
+
+        .. attention::
+            The ``compute_type`` argument is currently not used and will default to the data type.
+        """
+        if not self._valid_state:
+            self._maybe_instantiate(ctx)
+        else:
+            if self._ctx != ctx:
+                raise ValueError(
+                    "Operator objects can only be used with a single WorkStream, and this instance was originally used with another WorkStream. Switching WorkStream is not supported."
+                )
+
+        if self.hilbert_space_dims != state.hilbert_space_dims:
+            raise ValueError(
+                f"Hilbert space dimensions of Operator, {self.hilbert_space_dims}, and State, {state.hilbert_space_dims}, instances do not match."
+            )
+
+        default_compute_type = (
+            self._ctx.compute_type if self._ctx.compute_type is not None else self.dtype
+        )
+        self._current_expectation_compute_type = (
+            compute_type if compute_type else default_compute_type
+        )
+
+        cudm.expectation_prepare(
+            self._ctx._handle._validated_ptr,
+            self._expectation_ptr,
+            state._validated_ptr,
+            cutn_typemaps.NAME_TO_COMPUTE_TYPE[self._current_expectation_compute_type],
+            self._ctx._memory_limit,
+            self._ctx._validated_ptr,
+            0,  # TODO[FUTURE] / TODO[OPTIONAL]: accept stream as optional argument and pass here
+        )
+        self._work_size = None
+        self._expectation_work_size, _ = self._ctx._update_required_size_upper_bound()
+        return
+
+    # @cutn_utils.precondition(_check_valid_state)
+    def compute_action(
+        self,
+        t: float,
+        params: list[float],
+        state_in: "State",
+        state_out: "State",
+    ) -> None:
+        """
+        Compute the action of this :class:`Operator` on an input state and accumulate into the output state.
+
+        Args:
+            t: Time argument to be passed to all callback functions.
+            params: Additional arguments to be passed to all callback functions.
+            state_in: The input quantum state to which the :class:`Operator` is applied.
+            state_out: The output quantum state into which the action is accumulated.
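+
+        A minimal usage sketch (``ctx`` is assumed to be a :class:`WorkStream` and ``op`` an :class:`Operator` over Hilbert space ``(2, 2)``; the names are illustrative, not library fixtures):
+
+        >>> rho_in = DensePureState(ctx, (2, 2), 1, "complex128")
+        >>> rho_in.allocate_storage()
+        >>> rho_out = rho_in.clone(cp.zeros(rho_in.storage_size, dtype=rho_in.dtype))
+        >>> op.prepare_action(ctx, rho_in, rho_out)
+        >>> op.compute_action(0.0, [], rho_in, rho_out)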
+        """
+        params = tuple(map(float, params))
+        # TODO[OPTIONAL] / TODO[FUTURE]: Maybe change semantics once check for state compatibility is exposed in C-API
+
+        # handle released workspace descriptor or lack of preceding prepare_action call
+        if self._ctx is None:
+            raise RuntimeError(
+                "This instance has not been used with a WorkStream, please call its ``prepare_action`` method once before calls to this method."
+            )
+        _ = self._validated_ptr  # just check the instance hasn't been finalized yet
+        if self._ctx != state_in._ctx:
+            raise ValueError(
+                "This Operator's WorkStream and the WorkStream of the State on which to compute the action do not match."
+            )
+        if self._ctx != state_out._ctx:
+            raise ValueError(
+                "This Operator's WorkStream and the WorkStream of the State in which to accumulate the action do not match."
+            )
+        self.prepare_action(self._ctx, state_in, state_out, self._current_action_compute_type)
+        self._ctx._maybe_allocate()
+
+        with cutn_utils.device_ctx(self._ctx.device_id), utils.cuda_call_ctx(self._ctx) as (
+            self._last_compute_event,
+            elapsed,
+        ):
+            # update last event in participating elementary/general operators to ensure proper stream synchronization and shutdown order
+            self._ctx._last_compute_event = self._last_compute_event
+            state_in._last_compute_event = self._last_compute_event
+            state_out._last_compute_event = self._last_compute_event
+            for _op in self._using_ops:
+                _op._last_compute_event = self._last_compute_event
+            # update last event for contained OperatorTerms as well
+            for term in set(self.terms):
+                term._last_compute_event = self._last_compute_event
+
+            cudm.operator_compute_action(
+                self._ctx._handle._validated_ptr,
+                self._validated_ptr,
+                t,
+                len(params),
+                params,
+                state_in._validated_ptr,
+                state_out._validated_ptr,
+                self._ctx._validated_ptr,
+                self._ctx._stream_holder.ptr,
+            )
+
+    # TODO[OPTIONAL]: Wrap return in class that waits for compute event when retrieving the value for async execution.
+    @cutn_utils.precondition(_check_valid_state)
+    def compute_expectation(
+        self,
+        t: float,
+        params: Sequence[float],
+        state: "State",
+    ) -> cp.ndarray:
+        """
+        Compute the expectation value of this :class:`Operator` on a state.
+
+        Args:
+            t: Time argument to be passed to all callback functions.
+            params: Additional arguments to be passed to all callback functions.
+            state: The quantum state on which the expectation value is evaluated.
+
+        Returns:
+            The computed expectation value wrapped in a :class:`cupy.ndarray`.
+
+        .. note::
+            Currently, this method executes in a blocking manner, returning the expectation value only after the computation is finished.
+        """
+        params = tuple(map(float, params))
+        # TODO[FUTURE] / TODO[OPTIONAL]: Maybe change semantics once check for state compatibility is exposed in C-API
+        if self._ctx is None:
+            raise RuntimeError(
+                "This instance has not been used with a WorkStream, please call its ``prepare_expectation`` method once before calls to this method."
+            )
+        elif self._ctx != state._ctx:
+            raise RuntimeError(
+                "This Operator's WorkStream and the WorkStream of ``state`` for which to compute the expectation value do not match."
+            )
+        _ = self._validated_expectation_ptr  # just check the instance hasn't been finalized yet
+        self.prepare_expectation(self._ctx, state, self._current_expectation_compute_type)
+        self._ctx._maybe_allocate()
+
+        with cutn_utils.device_ctx(self._ctx.device_id), utils.cuda_call_ctx(
+            self._ctx, blocking=True
+        ) as (self._last_compute_event, elapsed):
+            # update last event in participating elementary/general operators to ensure proper stream synchronization and shutdown order
+            self._ctx._last_compute_event = self._last_compute_event
+            state._last_compute_event = self._last_compute_event
+            for _op in self._using_ops:
+                _op._last_compute_event = self._last_compute_event
+            # update last event for contained OperatorTerms as well
+            for term in set(self.terms):
+                term._last_compute_event = self._last_compute_event
+
+            out = cp.ndarray((state.batch_size,), dtype=state.dtype)
+
+            cudm.expectation_compute(
+                self._ctx._handle._validated_ptr,
+                self._validated_expectation_ptr,
+                t,
+                len(params),
+                params,
+                state._validated_ptr,
+                out.data.ptr,
+                self._ctx._validated_ptr,
+                self._ctx._stream_holder.ptr,
+            )
+        return out
+
+    def __add__(self, other: "Operator") -> "Operator":
+        """
+        Return a new :class:`Operator` equal to the sum of this :class:`Operator` with another :class:`Operator`.
+        """
+        if not isinstance(other, Operator):
+            raise TypeError("Only Operator instances can be out-of-place added to Operator.")
+        if self.hilbert_space_dims != other.hilbert_space_dims:
+            raise ValueError(
+                "Addition of two Operators with mismatching Hilbert space dimensions is not supported."
+            )
+        return Operator(self.hilbert_space_dims, *_unpack_operator(self), *_unpack_operator(other))
+
+    def __iadd__(self, other: Union["Operator", "OperatorTerm"]) -> None:
+        """
+        Inplace add another :class:`Operator` or :class:`OperatorTerm` into this :class:`Operator`.
+        """
+        if isinstance(other, OperatorTerm):
+            self._append(other)
+        elif isinstance(other, Operator):
+            for term, coeff, duality in _unpack_operator(other):
+                self._append(term, coeff, duality)
+        else:
+            raise TypeError(
+                "Only Operator and OperatorTerm instances can be in-place added to Operator."
+            )
+        return self
+
+    def __neg__(self) -> "Operator":
+        """
+        Return a new :class:`Operator` equal to this :class:`Operator` with all terms negated.
+        """
+        return self * -1
+
+    def __sub__(self, other: "Operator") -> "Operator":
+        """
+        Return the difference of this :class:`Operator` with another :class:`Operator`.
+        """
+        return Operator(self.hilbert_space_dims, *_unpack_operator(self), *_unpack_operator(-other))
+
+    def __mul__(self, scalar) -> "Operator":
+        """
+        Return a new :class:`Operator` equal to this :class:`Operator` multiplied by a scalar on the right (``self * scalar``).
+        """
+        return Operator(
+            self.hilbert_space_dims,
+            *(
+                tuple(
+                    zip(
+                        self.terms,
+                        (scalar * coeff for coeff in self._coefficients),
+                        self.dualities,
+                    )
+                )
+            ),
+        )
+
+    def __rmul__(self, scalar) -> "Operator":
+        """
+        Return a new :class:`Operator` equal to this :class:`Operator` multiplied by a scalar on the left (``scalar * self``).
+        """
+        return self * scalar
+
+
+class OperatorAction:
+    """
+    OperatorAction(ctx, operators)
+
+    Operator action representing the action of a set of :class:`Operator` objects on a set of input states, accumulated into a single output state.
+
+    Args:
+        ctx: Library context, which contains workspace, stream and other configuration information.
+ operators: A sequence of :class:`Operator` objects, the length of which is identical to the length of sequence of input states accepted when computing this instance's action. + """ + + def __init__( + self, + ctx: WorkStream, + operators: Tuple[Operator], + ): + """ + Initialize an operator action representing the action of a set of :class:`Operator` objects on a set of input states, accumulated into a single output state. + """ + self._finalizer = weakref.finalize(self, lambda: None) + self._finalizer.detach() + self.operators = [] + + self._dtype = None + self._set_or_check_dtype(operators) + self.operators = operators + if self.dtype is None: + raise ValueError( + "Datatype of OperatorAction cannot be inferred from its constituent Operators." + ) + _hilbert_space_dims = set(op.hilbert_space_dims for op in operators) + if len(_hilbert_space_dims) != 1: + raise RuntimeError( + "Operator's constituting this OperatorAction have mismatching Hilbert space dimensions." + ) + self._hilbert_space_dims = tuple(_hilbert_space_dims.pop()) + + self._ctx = ctx + self._default_compute_type = ( + self._ctx.compute_type if self._ctx.compute_type is not None else self._dtype + ) + self._current_compute_type = None + self._last_compute_event = None + self._work_size = None + self._upstream_finalizers = collections.OrderedDict() # future proofing + self._ptr = None + operators = [] + for op in self.operators: + op._maybe_instantiate(self._ctx) + operators.append(op._validated_ptr) + self._ptr = cudm.create_operator_action( + self._ctx._handle._validated_ptr, len(self.operators), operators + ) + self._finalizer = weakref.finalize( + self, + utils.generic_finalizer, + self._ctx.logger, + self._upstream_finalizers, + (cudm.destroy_operator_action, self._ptr), + msg=f"Destroying OperatorAction instance {self}, ptr: {self._ptr}", + ) + utils.register_with(self, self._ctx, self._ctx.logger) + + for op in self.operators: + utils.register_with(self, op, self._ctx.logger) + # op._upstream_finalizers[self._finalizer] = weakref.ref(self) + + self._using_tensor_ops = set() + self._using_terms = set() + for op in self.operators: + self._using_terms = self._using_terms.union(set(op.terms)) + self._using_tensor_ops = self._using_tensor_ops.union(op._using_ops) + + def _check_valid_state(self, *args, **kwargs) -> None: + """ """ + if not self._valid_state: + raise InvalidObjectState( + "The operator action cannot be used after resources are free'd" + ) + + @property + def _valid_state(self): + return self._finalizer.alive + + @property + @cutn_utils.precondition(_check_valid_state) + def _validated_ptr(self) -> int: + """ + The pointer to this instances C-API counterpart. + """ + return self._ptr + + @property + def hilbert_space_dims(self): + """ + Hilbert space dimension of this :class:`OperatorAction`. + """ + return self._hilbert_space_dims + + @property + def dtype(self): + """ + Data type of this :class:`OperatorAction`. + """ + return self._dtype + + # Expose as free function with Protocol + def _sync(self): + if self._last_compute_event: + self._last_compute_event.synchronize() + self._last_compute_event = None + + # TODO[OPTIONAL]: move to free function or superclass method to remove code duplication + def _set_or_check_dtype(self, operands) -> None: + """ + Checks that the operands to be appended to self.term are of the same dtype, and that the latter is the same dtype as self.dtype . 
If self.dtype has not been set yet, this method will set it (unless empty operands are passed).
+        """
+        # check consistency of operands dtypes
+        dtypes = {op.dtype for op in operands}
+        try:
+            dtype = dtypes.pop()
+        except KeyError:
+            dtype = None
+        if len(dtypes) != 0:
+            raise ValueError(
+                "The provided operands have more than one dtype, which is not supported. Please cast to the same dtype."
+            )
+        # check consistency of operands dtypes with this instance's dtype
+        if self.dtype is None:
+            self._dtype = dtype
+        elif dtype is not None:
+            try:
+                assert self.dtype == dtype
+            except AssertionError as e:
+                raise TypeError(
+                    "The provided operands are required to have the same dtype as this OperatorAction instance."
+                ) from e
+
+    # TODO[OPTIONAL] / TODO[FUTURE]: Maybe keep track of state signature to verify whether prepare is valid for input
+    @cutn_utils.precondition(_check_valid_state)
+    def prepare(
+        self,
+        ctx: "WorkStream",
+        states_in: Sequence["State"],
+        state_out: Optional["State"] = None,
+        compute_type: Optional[str] = None,
+    ) -> None:
+        """
+        Prepare the action of this instance on input states.
+
+        Args:
+            ctx: Library context, which contains workspace, stream and other configuration information.
+            states_in: The input quantum states to which the action is to be applied.
+            state_out: The output quantum state to which the action is to be accumulated. Defaults to the first element of ``states_in``.
+            compute_type: The CUDA compute type to be used by the computation.
+
+        .. attention::
+            The ``compute_type`` argument is currently not used and will default to the data type.
+        """
+        if self._ctx != ctx:
+            raise ValueError(
+                "OperatorAction objects can only be used with a single WorkStream, and this instance was originally used with another WorkStream. Switching WorkStream is not supported."
+            )
+        self._current_compute_type = compute_type if compute_type else self._default_compute_type
+
+        _state_hilbert_spaces = set(state.hilbert_space_dims for state in states_in)
+        if len(_state_hilbert_spaces) != 1:
+            raise ValueError("Input states have mismatching Hilbert space dimensions.")
+        elif state_out is not None:
+            _state_hilbert_spaces.add(state_out.hilbert_space_dims)
+            if len(_state_hilbert_spaces) != 1:
+                raise ValueError(
+                    "Output state's Hilbert space dimensions do not match the input states'."
+                )
+        if set((self.hilbert_space_dims,)) != _state_hilbert_spaces:
+            raise ValueError(
+                f"Hilbert space dimensions of OperatorAction, {self.hilbert_space_dims}, and of input states, {_state_hilbert_spaces.pop()}, do not match."
+            )
+        cudm.operator_action_prepare(
+            self._ctx._handle._validated_ptr,
+            self._ptr,
+            [state._validated_ptr for state in states_in],
+            state_out._validated_ptr if state_out else states_in[0]._validated_ptr,
+            cutn_typemaps.NAME_TO_COMPUTE_TYPE[self._current_compute_type],
+            self._ctx._memory_limit,
+            self._ctx._validated_ptr,
+            0,  # TODO[OPTIONAL] / TODO[FUTURE]: pass stream if C-API enables non-blocking prepare?
+        )
+        self._work_size, _ = self._ctx._update_required_size_upper_bound()
+
+        return
+
+    @cutn_utils.precondition(_check_valid_state)
+    def compute(
+        self,
+        t: float,
+        params: Sequence[float],
+        states_in: Sequence["State"],
+        state_out: "State",
+    ) -> None:
+        """
+        Compute the action of this instance on a sequence of input states and accumulate the results into an output state.
+
+        Args:
+            t: Time argument to be passed to all callback functions.
+            params: Additional arguments to be passed to all callback functions.
+ states_in: The quantum states to which the :class:`OperatorAction` is applied. + state_out: The quantum state into which the result is accumulated. + """ + params = tuple(map(float, params)) + for state_in in states_in: + if self._ctx != state_in._ctx: + raise ValueError( + "This OperatorAction's WorkStream and the WorkStream of an input state do not match." + ) + if self._ctx != state_out._ctx: + raise ValueError( + "This OperatorAction's WorkStream and the WorkStream of output state do not match." + ) + _ = self._ctx._validated_ptr + self.prepare(self._ctx, states_in, state_out, self._current_compute_type) + self._ctx._maybe_allocate() + + with cutn_utils.device_ctx(self._ctx.device_id), utils.cuda_call_ctx(self._ctx) as ( + self._last_compute_event, + elapsed, + ): + # update last event in participating elementary/general operators to ensure proper stream synchronization and shutdown order + self._ctx._last_compute_event = self._last_compute_event + for state_in in states_in: + state_in._last_compute_event = self._last_compute_event + state_out._last_compute_event = self._last_compute_event + for _op in self._using_tensor_ops: + _op._last_compute_event = self._last_compute_event + # update last event for contained OperatorTerms as well + for _term in self._using_terms: + _term._last_compute_event = self._last_compute_event + for op in self.operators: + op._last_compute_event = self._last_compute_event + + cudm.operator_action_compute( + self._ctx._handle._validated_ptr, + self._ptr, + t, + len(params), + params, + [state._validated_ptr for state in states_in], + state_out._validated_ptr, + self._ctx._validated_ptr, + self._ctx._stream_holder.ptr, + ) + + +def _unpack_operator(op): + return tuple(zip(op.terms, op._coefficients, op.dualities)) + + +def tensor_product( + *operands: Sequence[ + Tuple[ + Union[ElementaryOperator, Tuple[NDArrayType, Optional[Callable]]], + Sequence[int], + Optional[Sequence[bool]], + ] + ], + coeff: Union[Number, Callable] = 1.0, + dtype: Optional[str] = None, +) -> OperatorTerm: + """ + Return an :class:`OperatorTerm` from a tensor product of elementary operators. + + Args: + operands: Operands in the tensor product. Each operand is a tuple of length 2 or 3 of the form ``(tensor, modes, dual)``, where ``dual`` is optional. ``tensor`` contains the numerical data of the elementary operator and an optional callback function providing the tensor data. Accepted inputs for ``tensor`` are + + - Subclass of ``ElementaryOperator``, i.e. :class:`DenseOperator` and :class:`MultidiagonalOperator` + - ``NDArrayType``, which will be converted to a :class:`DenseOperator` + - ``Tuple[NDArrayType, Callable]``, which will be passed to the initializer of :class:`DenseOperator` + + coeff: Coefficient associated with this :class:`OperatorTerm`. + dtype: Data type of this :class:`OperatorTerm`. Default value is inferred from input operands unless this function returns a scalar :class:`OperatorTerm`, in which case ``dtype`` is required. + + Returns: + An :class:`OperatorTerm` constructed from the tensor product of elementary operators. 
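+
+    A minimal sketch (the 2x2 array is an assumed example input):
+
+    >>> a = np.array([[0.0, 1.0], [1.0, 0.0]])
+    >>> term = tensor_product((a, (0,)), (a, (1,)), coeff=0.5)               # 0.5 * X_0 X_1
+    >>> sandwich = tensor_product((a, (0,), (False,)), (a, (0,), (True,)))   # acts on ket and bra side of mode 0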
+    """
+
+    tensors = []
+    modes = []
+    duals = []
+    for op in operands:
+        if len(op) == 2:
+            tensor, _modes = op
+            _duals = (False,) * len(_modes)
+        elif len(op) == 3:
+            tensor, _modes, _duals = op
+            assert len(_modes) == len(_duals)
+        else:
+            raise TypeError(
+                "Each operand must be a 2-tuple or 3-tuple of the form (tensor, modes) or (tensor, modes, duals)."
+            )
+
+        if not isinstance(tensor, ElementaryOperator):
+            # MultidiagonalOperators need to be wrapped before passing
+            # safe to specialize to DenseOperator here
+            if isinstance(tensor, tuple):
+                tensor = DenseOperator(*tensor)
+            else:
+                tensor = DenseOperator(tensor)
+        tensors.append(tensor)
+        modes.append(_modes)
+        duals.append(_duals)
+
+    if len(operands) == 0 and dtype is None:
+        raise ValueError(
+            "A data type needs to be specified when creating an OperatorTerm proportional to the identity."
+        )
+    term = OperatorTerm(dtype=dtype)
+    term._append(tensors, modes, duals, coeff=coeff)
+    return term
diff --git a/python/cuquantum/densitymat/state.py b/python/cuquantum/densitymat/state.py
new file mode 100644
index 0000000..a9a1715
--- /dev/null
+++ b/python/cuquantum/densitymat/state.py
@@ -0,0 +1,612 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+from abc import ABC, abstractmethod
+from numbers import Number
+from typing import Sequence, Any, Tuple, Union, List
+import weakref
+import collections
+
+import cupy as cp
+import numpy as np
+from cuquantum.cutensornet._internal import utils as cutn_utils, tensor_wrapper, typemaps
+from cuquantum.cutensornet._internal.tensor_ifc import Tensor
+
+from cuquantum.bindings import cudensitymat as cudm
+from .work_stream import WorkStream
+from ._internal import utils
+from ._internal.utils import InvalidObjectState
+
+
+__all__ = ["DensePureState", "DenseMixedState"]
+
+
+class State(ABC):
+    """
+    A base class on which all concrete state representations are based.
+    This class mirrors the C-API more closely than its subclasses.
+
+    Args:
+        ctx: WorkStream
+            Library context and other configuration information.
+        hilbert_space_dims: Tuple[int]
+            A tuple of the local Hilbert space dimensions.
+        purity: str
+            The state's purity, either "PURE" or "MIXED".
+        batch_size: int
+            The batch dimension of the state.
+        dtype: str
+            The numeric data type for the state's coefficients.
+ """ + + def __init__( + self, ctx: WorkStream, hilbert_space_dims: Tuple[int], batch_size: int, dtype: str + ) -> None: + + self.batch_size = batch_size + self.hilbert_space_dims = tuple(hilbert_space_dims) + self.dtype = dtype + + self._bufs = None + self._last_compute_event = None + self._ctx = ctx + + # register dummy finalizer, for safe cleanup if error occurs before proper finalizer is set + self._finalizer = weakref.finalize(self, lambda: None) + self._finalizer.detach() + self._upstream_finalizers = collections.OrderedDict() # not really needed here + + def _instantiate(self, ctx: WorkStream): + assert ctx is not None + if self._valid_state: + assert self._ctx == ctx + else: + self._ctx = ctx + # create state handle + self._ptr = cudm.create_state( + self._ctx._handle._validated_ptr, + self._purity, + len(self.hilbert_space_dims), + self.hilbert_space_dims, + self.batch_size, + typemaps.NAME_TO_DATA_TYPE[self.dtype], + ) + self._finalizer = weakref.finalize( + self, + utils.generic_finalizer, + self._ctx.logger, + self._upstream_finalizers, + (cudm.destroy_state, self._ptr), + msg=f"Destroying State instance {self}, ptr: {self._ptr}.", + ) + utils.register_with(self, self._ctx, self._ctx.logger) + + def _check_valid_state(self, *args, **kwargs): + if not self._valid_state: + raise InvalidObjectState("The state cannot be used after resources are freed") + + @property + def _valid_state(self): + return self._finalizer.alive + + @property + @cutn_utils.precondition(_check_valid_state) + def _validated_ptr(self): + """ + Pointer to C-API counterpart. + """ + return self._ptr + + @property + @abstractmethod + def _purity(self): + pass + + @property + @abstractmethod + def storage(self) -> Any: + pass + + @abstractmethod + def attach_storage(self, storage: Any): + pass + + @property + @abstractmethod + def local_info(self) -> Any: + pass + + def _sync(self) -> None: + """ """ + if self._last_compute_event: + self._last_compute_event.synchronize() + self._last_compute_event = None + + # override in concrete subclasses if other criteria for compatibility exist + def _check_state_compatibility(self, other): + try: + assert type(self) == type(other) + assert self.hilbert_space_dims == other.hilbert_space_dims + assert self.batch_size == other.batch_size + assert self.dtype == other.dtype + assert self._ctx == other._ctx + assert self._purity == other._purity + except AssertionError as e: + raise ValueError( + "`other` argument in State.inner(other) is incompatible with instance." + ) from e + + def _check_and_return_factors(self, factors): + # Check input shape + if isinstance(factors, Number): + factors = np.full((self.batch_size,), factors) + elif isinstance(factors, Sequence): + if not len(factors) == self.batch_size: + raise ValueError("factors must be of same length as State's batch_size.") + factors = np.array(factors) + elif isinstance(factors, (np.ndarray, cp.ndarray)): + if not factors.shape == (self.batch_size,): + raise ValueError( + "factors passed as NDArray must be one-dimensional and of length batch_size." + ) + else: + raise TypeError("factors must be of type Number, Sequence, np.ndarray or cp.ndarray.") + + # Put factors onto GPU + with cutn_utils.device_ctx(self._ctx.device_id), utils.cuda_call_ctx( + self._ctx, blocking=True + ): + if not cp.can_cast(factors, self.dtype, casting="same_kind"): + raise TypeError( + f"The provided scaling factors with data type {type(factors.dtype)} " + f"cannot be safely cast to State's data type {self.dtype}." 
+ ) + factors_arr = cp.asarray(factors, dtype=self.dtype) + + return factors_arr + + @cutn_utils.precondition(_check_valid_state) + def inplace_scale(self, factors: Union[Number, Sequence, np.ndarray, cp.ndarray]) -> None: + """ + Scale the state by scalar factor(s). + + Args: + factors: Scalar factor(s) used in scaling the state. If a single number is provided, + scale all batched states by the same factor. + """ + factors_arr = self._check_and_return_factors(factors) + + with cutn_utils.device_ctx(self._ctx.device_id), utils.cuda_call_ctx(self._ctx) as ( + self._last_compute_event, + elapsed, + ): + cudm.state_compute_scaling( + self._ctx._handle._validated_ptr, + self._ptr, + factors_arr.data.ptr, + self._ctx._stream_holder.ptr, + ) + + @cutn_utils.precondition(_check_valid_state) + def norm(self) -> cp.ndarray: + """ + Compute the squared Frobenius norm(s) of the state. + + Returns: + An array of squared Frobenius norm(s) of length ``batch_size``. + """ + # Translate complex datatypes to real datatypes + if self.dtype.startswith("complex"): + dtype = self.storage.real.dtype.name + else: + dtype = self.dtype + + with cutn_utils.device_ctx(self._ctx.device_id), utils.cuda_call_ctx(self._ctx) as ( + self._last_compute_event, + elapsed, + ): + res = cp.empty(self.batch_size, dtype=dtype, order="F") + cudm.state_compute_norm( + self._ctx._handle._validated_ptr, + self._ptr, + res.data.ptr, + self._ctx._stream_holder.ptr, + ) + return res + + @cutn_utils.precondition(_check_valid_state) + def trace(self) -> cp.ndarray: + """ + Compute the trace(s) of the state. + + Returns: + An array of trace(s) of length ``batch_size``. + """ + with cutn_utils.device_ctx(self._ctx.device_id), utils.cuda_call_ctx(self._ctx) as ( + self._last_compute_event, + elapsed, + ): + res = cp.empty(self.batch_size, dtype=self.dtype, order="F") + cudm.state_compute_trace( + self._ctx._handle._validated_ptr, + self._ptr, + res.data.ptr, + self._ctx._stream_holder.ptr, + ) + return res + + @cutn_utils.precondition(_check_valid_state) + def inplace_accumulate( + self, other, factors: Union[Number, Sequence, np.ndarray, cp.ndarray] = 1 + ) -> None: + """ + Inplace accumulate another state scaled by factor(s) into this state. + + Args: + other: The other state to be scaled and accumulated into this state. + factors: Scalar factor(s) used in scaling `other`. If a single number is provided, + scale all batched states in `other` by the same factor. Defaults to 1. + """ + self._check_state_compatibility(other) + factors_arr = self._check_and_return_factors(factors) + + with cutn_utils.device_ctx(self._ctx.device_id), utils.cuda_call_ctx(self._ctx) as ( + self._last_compute_event, + elapsed, + ): + # update last event in other + other._last_compute_event = self._last_compute_event + cudm.state_compute_accumulation( + self._ctx._handle._validated_ptr, + other._validated_ptr, + self._ptr, + factors_arr.data.ptr, + self._ctx._stream_holder.ptr, + ) + + @cutn_utils.precondition(_check_valid_state) + def inner_product(self, other) -> cp.ndarray: + """ + Compute the inner product(s) between two states. + + Args: + other: The other state to compute inner product with. + + Returns: + An array of inner product(s) of length ``batch_size``. 
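+
+        A minimal sketch (``psi`` and ``phi`` are assumed to be compatible states with attached storage):
+
+        >>> overlap = psi.inner_product(phi)  # cp.ndarray of shape (batch_size,)
+        >>> value = overlap.get()             # copy to host once the computation has synchronized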
+ """ + self._check_state_compatibility(other) + with cutn_utils.device_ctx(self._ctx.device_id), utils.cuda_call_ctx(self._ctx) as ( + self._last_compute_event, + elapsed, + ): + # update last event in other + other._last_compute_event = self._last_compute_event + # update last event in participating elementary/general operators to ensure proper stream synchronization and shutdown order + res = cp.empty(self.batch_size, dtype=self.dtype, order="F") + cudm.state_compute_inner_product( + self._ctx._handle._validated_ptr, + self._ptr, + other._validated_ptr, + res.data.ptr, + self._ctx._stream_holder.ptr, + ) + return res + + @cutn_utils.precondition(_check_valid_state) + def _attach_component_storage(self, data: Sequence) -> None: + """ + Attaches GPU buffers to this instance. This instance doesn't own the buffers. + All elements of data need to be on this instances device_id and Fortran ordered. No copy is created and the buffer is a reference to data argument. + + Args: + data: Sequence + Sequence of NDarray like objects containing the statevector coefficients. + The length of the sequence should be identical to self._num_components. + The elements of data need to be on device(options.device_id) and need to be F-ordered, otherwise exceptions are raised in the initialization. + """ + bufs = [tensor_wrapper.wrap_operands((d,))[0] for d in data] + try: + num_components = len(data) + if num_components != self._num_components: + raise ValueError( + "Trying to attach component storage of incorrect length to this instance of state.", + ) + for buf in bufs: + if buf.dtype != self.dtype: + raise ValueError("Supplied buffer's dtype doesn't match instances dtype.") + if buf.device != "cuda" or buf.device_id != self._ctx.device_id: + raise ValueError( + "State component storages needs to be provided as GPU residing ndarray-like objects located on same GPU as State instance.", + ) + if not buf.tensor.flags["F_CONTIGUOUS"]: + raise ValueError( + "State component storages need to be contiguous and F-ordered." + ) + expected_sizes = self._component_storage_size + received_sizes = tuple(buf.tensor.dtype.itemsize * buf.tensor.size for buf in bufs) + if not received_sizes == expected_sizes: + raise ValueError( + f"The supplied storage sizes, {received_sizes}, do not match the expected storage sizes, {expected_sizes}.\ + Both storage sizes are reported in bytes." + ) + except ValueError as e: + raise e + + cudm.state_attach_component_storage( + self._ctx._handle._validated_ptr, + self._ptr, + num_components, + tuple(buf.data_ptr for buf in bufs), + received_sizes, + ) + self._bufs = bufs + + @property + @cutn_utils.precondition(_check_valid_state) + def _num_components(self): + """ + Number of components in the state storage. + """ + return cudm.state_get_num_components(self._ctx._handle._validated_ptr, self._ptr) + + @property + @cutn_utils.precondition(_check_valid_state) + def _component_storage_size(self): + """ + Size of each of the components in the state storage in bytes. + """ + sizes = cudm.state_get_component_storage_size( + self._ctx._handle._validated_ptr, self._ptr, self._num_components + ) + return sizes + + @property + @cutn_utils.precondition(_check_valid_state) + def _component_storage(self) -> Sequence[Tensor]: + """ + Non-blocking return of reference to buffers. 
+ """ + return self._bufs + + @property + @cutn_utils.precondition(_check_valid_state) + def _local_info(self) -> List[Tuple[Tuple[int], Tuple[int]]]: + infos = [] + for local_component_index in range(self._num_components): + _, component_num_modes, _ = self._get_component_num_modes(local_component_index) + component_mode_extents = np.zeros((component_num_modes,), dtype="int64") + component_mode_offsets = np.zeros((component_num_modes,), dtype="int64") + _global_component_index = np.zeros((1,), dtype="int32") + _component_num_modes = np.zeros((1,), dtype="int32") + + cudm.state_get_component_info( + self._ctx._handle._validated_ptr, + self._validated_ptr, + local_component_index, + _global_component_index.ctypes.data, + _component_num_modes.ctypes.data, + component_mode_extents.ctypes.data, + component_mode_offsets.ctypes.data, + ) + component_mode_extents = tuple(component_mode_extents) + component_mode_offsets = tuple(component_mode_offsets) + if self.batch_size == 1: + component_mode_extents = component_mode_extents + (1,) + component_mode_offsets = component_mode_offsets + (0,) + infos.append((component_mode_extents, component_mode_offsets)) + return infos + + def _get_component_num_modes(self, local_component_index: int): + batch_mode_location = np.zeros((1,), dtype=np.int32) + component_num_modes = np.zeros((1,), dtype=np.int32) + global_component_index = np.zeros((1,), dtype=np.int32) + cudm.state_get_component_num_modes( + self._ctx._handle._validated_ptr, + self._validated_ptr, + local_component_index, + global_component_index.ctypes.data, + component_num_modes.ctypes.data, + batch_mode_location.ctypes.data, + ) + return global_component_index[0], component_num_modes[0], batch_mode_location[0] + + @abstractmethod + def clone(self, bufs) -> "State": + pass + + +class DenseState(State): + """ + A state in dense representation. + """ + + @property + def storage(self) -> cp.ndarray: + """ + The state's local storage buffer. + + Returns: + cp.ndarray: + The state's local storage buffer. + """ + data = self._component_storage + if data is not None: + return data[0].tensor + + @property + def storage_size(self) -> int: + """ + Storage buffer size in number of elements of data type `dtype`. + + Returns: + int: Storage buffer size in number of elements of data type `dtype`. + """ + return self._component_storage_size[0] // np.dtype(self.dtype).itemsize + + def view(self) -> cp.ndarray: + """ + Return a multidimensional view on the local slice of the storage buffer. + + .. note:: + When ``batch_size`` is 1, the last mode of the view will be the batch mode of dimension 1. + """ + shape, _ = self.local_info + if self.storage.size == np.prod(shape) and len(self.storage.shape) > 1: + view = self.storage.reshape(shape, order="F") + + else: + view = self.storage[: np.prod(shape)].reshape(shape, order="F") + assert view.base is self.storage + return view + + @property + def local_info(self) -> Tuple[Tuple[int], Tuple[int]]: + """ + Local storage buffer dimensions as well as local mode offsets. + + Returns: + Tuple[int] + Local storage buffer dimensions, with the last dimension being the batch dimension. + Tuple[int] + Local mode offsets. + """ + dims, offsets = self._local_info[0] + return dims, offsets + + def attach_storage(self, data: cp.ndarray) -> None: + """ + Attach a data buffer to the state. + + Args: + data: The data buffer to be attached to the state. + + .. 
note::
+            The data buffer needs to match the Hilbert space dimensions, batch size and data type
+            passed to the ``__init__`` function. In addition, the data buffer needs to be Fortran
+            contiguous and located on the same device as the :class:`WorkStream` passed to the ``__init__`` function.
+        """
+        self._attach_component_storage((data,))
+
+    def allocate_storage(self) -> None:
+        """
+        Allocate an appropriately sized data buffer and attach it to the state.
+        """
+        with cp.cuda.Device(self._ctx.device_id):
+            state_storage_buf = cp.zeros((self.storage_size,), dtype=self.dtype)
+            self.attach_storage(state_storage_buf)
+
+    def clone(self, buf: cp.ndarray) -> "DenseState":
+        """Clone the state with a new data buffer.
+
+        Args:
+            buf: The data buffer to be attached to the new state.
+
+        Returns:
+            A state with the same metadata as the original state and a new data buffer.
+        """
+        if buf.dtype != self.dtype:
+            raise ValueError(
+                f"The supplied data buffer's data type {buf.dtype} does not match the original "
+                f"instance's data type {self.dtype}."
+            )
+        new_instance = type(self)(self._ctx, self.hilbert_space_dims, self.batch_size, self.dtype)
+        size = new_instance.storage_size
+        if not buf.flags.f_contiguous:
+            raise ValueError("The supplied data buffer is not Fortran ordered and contiguous.")
+        if np.prod(buf.shape) != size:
+            raise ValueError(
+                f"The supplied data buffer size, {buf.size}, does not match the expected size, {size}."
+            )
+        if len(buf.shape) > 1:
+            # only applicable to multi-GPU usage, may break for MGMN with correctly sized buffers
+            new_instance_shape, _ = new_instance.local_info
+            squeezed_shape = new_instance_shape[:-1] if self.batch_size == 1 else new_instance_shape
+            if not (buf.shape == new_instance_shape or buf.shape == squeezed_shape):
+                raise ValueError(
+                    "The supplied data buffer shape is not compatible with the required local state slice size."
+                    " Note that non-1D data buffers are only supported in single-GPU usage."
+                )
+        new_instance.attach_storage(buf)
+        return new_instance
+
+
+class DensePureState(DenseState):
+    """
+    DensePureState(ctx, hilbert_space_dims, batch_size, dtype)
+
+    Pure state in dense (state-vector) representation.
+
+    A storage buffer needs to be attached via the :meth:`attach_storage` method or allocated via the :meth:`allocate_storage` method. The appropriate size for the storage buffer as well as information on the storage layout is available in the :attr:`local_info` attribute.
+
+    Args:
+        ctx: The execution context, which contains information on device ID, logging and blocking/non-blocking execution.
+        hilbert_space_dims: A tuple of the local Hilbert space dimensions.
+        batch_size: Batch dimension of the state.
+        dtype: Numeric data type of the state's coefficients.
+
+    Examples:
+        >>> import cupy as cp
+        >>> from cuquantum.densitymat import WorkStream, DensePureState
+
+        To create a ``DensePureState`` of batch size 1 and double-precision complex data type, we first initialize it and then attach the storage buffer through the :meth:`attach_storage` method as follows
+
+        >>> ctx = WorkStream(stream=cp.cuda.Stream())
+        >>> hilbert_space_dims = (2, 2, 2)
+        >>> psi = DensePureState(ctx, hilbert_space_dims, 1, "complex128")
+        >>> psi.attach_storage(cp.zeros(psi.storage_size, dtype=psi.dtype))
+    """
+
+    def __init__(
+        self, ctx: WorkStream, hilbert_space_dims: Sequence[int], batch_size: int, dtype: str
+    ) -> None:
+        """
+        Initialize a pure state in dense (state-vector) representation.
+ """ + super().__init__(ctx, hilbert_space_dims, batch_size, dtype) + self._instantiate(ctx) + + @property + def _purity(self): + return cudm.StatePurity.PURE + + +class DenseMixedState(DenseState): + """ + DenseMixedState(ctx, hilbert_space_dims, batch_size, dtype) + + Mixed state in dense (density-matrix) representation. + + A storage buffer needs to be attached via the :meth:`attach_storage` method or allocated via the :meth:`allocate_storage` method. The appropriate size for the storage buffer as well as information on the storage layout is available in the :attr:`local_info` attribute. + + Args: + ctx: The execution context, which contains information on device ID, logging and blocking/non-blocking execution. + hilbert_space_dims: A tuple of the local Hilbert space dimensions. + batch_size: Batch dimension of the state. + dtype: Numeric data type of the state's coefficients. + + Examples: + >>> import cupy as cp + >>> from cuquantum.densitymat import WorkStream, DenseMixedState + + To create a ``DenseMixedState`` of batch size 1 and double-precision complex data type, we need to first initialize it and then attach the storage buffer through the :meth:`attach_storage` method as follows + + >>> ctx = WorkStream(stream=cp.cuda.Stream()) + >>> hilbert_space_dims = (2, 2, 2) + >>> rho = DenseMixedState(ctx, hilbert_space_dims, 1, "complex128") + >>> rho.attach_storage(cp.zeros(rho.storage_size, dtype=rho.dtype)) + """ + + def __init__( + self, ctx: WorkStream, hilbert_space_dims: Sequence[int], batch_size: int, dtype: str + ) -> None: + """ + Initialize a mixed state in dense (density-matrix) representation. + """ + super().__init__(ctx, hilbert_space_dims, batch_size, dtype) + self._instantiate(ctx) + + @property + def _purity(self): + return cudm.StatePurity.MIXED diff --git a/python/cuquantum/densitymat/work_stream.py b/python/cuquantum/densitymat/work_stream.py new file mode 100644 index 0000000..19831d9 --- /dev/null +++ b/python/cuquantum/densitymat/work_stream.py @@ -0,0 +1,355 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +__all__ = ["WorkStream"] + +from dataclasses import dataclass +from logging import Logger, getLogger +from typing import Optional, Union, Tuple +import weakref +import collections + +import cupy as cp +import cuquantum.cutensornet as cutn +from cuquantum.cutensornet.memory import BaseCUDAMemoryManager +from cuquantum.cutensornet._internal import utils as cutn_utils +from cuquantum.cutensornet._internal.mem_limit import check_memory_str + +from cuquantum.bindings import cudensitymat as cudm +from ._internal.library_handle import LibraryHandle +from ._internal import utils +from ._internal.utils import InvalidObjectState + + +# TODO[OPTIONAL]: move this map elsewhere +WORK_SPACE_KIND_MAP = {} +WORK_SPACE_KIND_MAP["SCRATCH"] = cudm.WorkspaceKind.WORKSPACE_SCRATCH +# WORK_SPACE_KIND_MAP["CACHE"] = cudm.WorkspaceKind.WORKSPACE_CACHE #Not yet implemented + +# TODO[OPTIONAL]: move this map elsewhere +MEM_SPACE_MAP = {} +MEM_SPACE_MAP["DEVICE"] = cudm.Memspace.DEVICE +MEM_SPACE_MAP["HOST"] = cudm.Memspace.HOST + + +@dataclass +class WorkStream: + """ + A data class containing the library handle, stream, workspace and configuration parameters. + + This object handles allocation and synchronization automatically. Additionally, a method to release the workspace is provided. 
The size of the workspace buffer is determined by either the :attr:`memory_limit` attribute or the maximum required workspace size among all objects using this ``WorkStream``.
+
+    Attributes:
+        device_id: CUDA device ordinal (used if the tensor network resides on the CPU). Device 0 will be used if not specified.
+        stream: CUDA stream. The current stream will be used if not specified.
+        memory_limit: Maximum memory available. It can be specified as a value (with optional suffix like
+            K[iB], M[iB], G[iB]) or as a percentage. The default is 80% of the device memory.
+        allocator: An object that supports the :class:`BaseCUDAMemoryManager` protocol, used to draw device memory. If an allocator is not provided, a memory allocator from the library package will be used (:func:`torch.cuda.caching_allocator_alloc` for PyTorch operands, :func:`cupy.cuda.alloc` otherwise).
+        compute_type (cuquantum.ComputeType): CUDA compute type. A suitable compute type will be selected if not specified.
+        logger (logging.Logger): Python Logger object. The root logger will be used if a logger object is not provided.
+        workspace_info: A property attribute that stores a 2-tuple of ints representing the currently allocated and the anticipated workspace size in bytes.
+
+    Methods:
+
+        set_communicator(comm, provider="None") -> None
+            Register a communicator with the library.
+            Currently only ``mpi4py.Comm`` objects are supported and the only supported provider is ``"MPI"``.
+
+        get_proc_rank() -> int
+            Return the process rank if a communicator was set previously via :meth:`WorkStream.set_communicator`.
+
+        get_num_ranks() -> int
+            Return the number of processes in the communicator that was set previously via :meth:`WorkStream.set_communicator`.
+
+        get_communicator()
+            Return the communicator object if set previously via :meth:`WorkStream.set_communicator`.
+
+        release_workspace(kind="SCRATCH") -> None
+            Release the workspace.
+
+    .. note::
+        - Releasing the workspace releases its workspace buffer and resets the maximum required size among the objects that use this ``WorkStream`` instance.
+        - Objects which have previously been exposed to this ``WorkStream`` instance do not require explicit calls to their ``prepare`` methods after the workspace has been released.
+        - Releasing the workspace buffer may be useful when intermediate computations do not involve the cuDensityMat API, or when the following computations require less workspace than the preceding ones.
+        - Objects can only interact with each other if they use the same ``WorkStream``, and they cannot change the ``WorkStream`` they use.
+        - Some objects require a ``WorkStream`` at creation (``State``, :class:`OperatorAction`), while other objects require it only when their ``prepare`` method is called (:class:`Operator`).
+        - Some objects may acquire the ``WorkStream`` indirectly (:class:`Operator`), while other objects always acquire it indirectly (:class:`OperatorTerm`, :class:`DenseOperator`, :class:`MultidiagonalOperator`).
+
+    .. attention::
+        The ``compute_type`` argument is currently not used and will default to the data type.
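+
+    .. note::
+        ``memory_limit`` accepts both absolute values and percentages. For example
+        (illustrative values), ``WorkStream(memory_limit="4 GiB")`` or
+        ``WorkStream(memory_limit="50%")`` cap the scratch workspace accordingly.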
+
+    Examples:
+
+        >>> import cupy as cp
+        >>> from cuquantum.densitymat import WorkStream
+
+        To create a ``WorkStream`` on a new CUDA stream, we can do
+
+        >>> ctx = WorkStream(stream=cp.cuda.Stream())
+    """
+
+    device_id: Optional[int] = None
+    stream: Optional[cp.cuda.Stream] = None
+    memory_limit: Optional[Union[int, str]] = r"80%"
+    allocator: Optional[BaseCUDAMemoryManager] = cutn.memory._MEMORY_MANAGER["cupy"]
+    compute_type: Optional[str] = None
+    logger: Optional[Logger] = None
+
+    def __post_init__(self):
+        """
+        Cast to cuquantum types, infer values that depend on multiple attributes, create the handle if not passed, and perform checks.
+        """
+        # register a dummy finalizer, for safe cleanup if an error occurs before the proper finalizer is set
+        self._finalizer = weakref.finalize(self, lambda: None)
+        self._finalizer.detach()
+
+        self.blocking = True  # TODO: Support non-blocking WorkStream
+        self.logger = getLogger() if self.logger is None else self.logger
+        self.device_id = self.device_id if self.device_id is not None else 0
+        self._handle = LibraryHandle(self.device_id, self.logger)
+        self._do_timing = bool(self.logger and self.logger.handlers)
+        # TODO: remove restriction to cupy.cuda.Stream
+        self._stream_holder = cutn_utils.get_or_create_stream(self.device_id, self.stream, "cupy")
+        self.stream = self._stream_holder.obj
+        check_memory_str(self.memory_limit, "memory limit")
+        self._memory_limit = cutn_utils.get_memory_limit(
+            self.memory_limit, cp.cuda.Device(self.device_id)
+        )
+        if issubclass(self.allocator, BaseCUDAMemoryManager):
+            self.allocator = self.allocator(self.device_id, self.logger)
+        if not isinstance(self.allocator, BaseCUDAMemoryManager):
+            raise TypeError(
+                "The allocator must be an object whose type fulfils the BaseCUDAMemoryManager protocol."
+            )
+
+        # internal resource creation and release
+        self._ptr = cudm.create_workspace(
+            self._handle._validated_ptr
+        )  # lifetime tied to instance lifetime
+        self.logger.debug(
+            f"WorkStream instance {self} created workspace descriptor {self._ptr} on device {self.device_id} with stream {self.stream}."
+        )
+        self._upstream_finalizers = collections.OrderedDict()
+        self._finalizer = weakref.finalize(
+            self,
+            utils.generic_finalizer,
+            self.logger,
+            self._upstream_finalizers,
+            (cudm.destroy_workspace, self._ptr),
+            msg=f"Destroying Workspace instance {self}",
+        )
+        utils.register_with(self, self._handle, self.logger)
+
+        # initialize other private attributes
+        self._buf_scratch = None
+        self._size_scratch = 0
+        self._last_compute_event = None
+        self._required_size_upper_bound = 0
+        self.logger.info(
+            f"Created WorkStream on device {self.device_id} with stream {self.stream}."
+        )
+
+    def _check_valid_state(self, *args, **kwargs):
+        """ """
+        if not self._valid_state:
+            raise InvalidObjectState("The workspace cannot be used after resources are freed.")
+
+    @property
+    def _valid_state(self):
+        return self._finalizer.alive
+
+    @property
+    @cutn_utils.precondition(_check_valid_state)
+    def _validated_ptr(self) -> int:
+        """
+        The workspace descriptor.
+        """
+        return self._ptr
+
+    @property
+    def workspace_info(self) -> Tuple[int, int]:
+        """
+        Information on the current and anticipated workspace size in bytes.
+
+        Returns:
+            int
+                the size of the currently allocated workspace buffer.
+            int
+                the size of the workspace buffer to be allocated in the future based on previous ``prepare`` calls of other API objects.
+ """ + return self._size_scratch, self._required_size_upper_bound + + def set_communicator(self, comm, provider: str = "None") -> None: + """ + Register a communicator with the library. + Currently only ``mpi4py.Comm`` objects are supported and the only supported provider is "MPI". + """ + self._handle.set_communicator(comm, provider) + + def get_proc_rank(self) -> int: + """ + Returns the process rank if a communicator was set previously via ``WorkStream.set_communicator``. + """ + return self._handle.get_proc_rank() + + def get_num_ranks(self) -> int: + """ + Returns the number of processes in the communicator that was set previously via ``WorkStream.set_communicator``. + """ + return self._handle.get_num_ranks() + + def get_communicator(self): + """ + Returns the communicator object if set previously via ``WorkStream.set_communicator``. + """ + return self._handle._comm + + @cutn_utils.precondition(_check_valid_state) + def release_workspace(self, kind="SCRATCH") -> None: + """ + Releases the workspace. + + This method has no direct user-facing side effects on other API objects. + Releasing the workspace releases both its workspace buffer and resets the maximum required size among its users. Objects which have previously been exposed to this instance of WorkStream do not require explicit calls to their prepare methods after the workspace has been released. + Releasing the workspace buffer may be useful when performing intermediate computation not involving the cudensitymat API. + Furthermore, releasing the workspace buffer may be useful if the following computations require less workspace than the preceding ones. + """ + self._sync() # may be redundand currently due to the way the memory buffer works + if kind.lower() != "scratch": + raise NotImplementedError( + 'WorkStream object does not support workspaces other than "scratch" at the moment.' + ) + new_ptr = cudm.create_workspace(self._handle._validated_ptr) + old_ptr = self._ptr + # this is required for checks for whether prepare has been called on this workspace for a given instance, if this fails we need to implement a wrapper around the pointer + assert new_ptr != old_ptr + self._ptr = cudm.destroy_workspace(self._validated_ptr) + setattr(self, f"_buf_{kind.lower()}", None) + setattr(self, f"_size_{kind.lower()}", 0) + self._ptr = new_ptr + self._finalizer.detach() + self._finalizer = weakref.finalize( + self, + utils.generic_finalizer, + self.logger, + self._upstream_finalizers, + (cudm.destroy_workspace, self._ptr), + msg=f"Destroying Workspace instance {self}", + ) + self._required_size_upper_bound = 0 + + def _workspace_set_memory( + self, + memory_ptr: int, + size: int, + memspace: str = "DEVICE", + kind: str = "SCRATCH", + ): + """ + Attach memory buffer to a workspace descriptor. + + Args: + memory_ptr: int + Pointer to memory. + size: int + Size of allocated buffer in bytes. + memspace: str + "DEVICE" (default) or "HOST". Currently only "DEVICE" is supported. + kind: str + "SCRATCH" (default) or "CACHE". Currently only "SCRATCH" is supported. + """ + + self.logger.info( + f"Attaching memory buffer of size {size} on device {self.device_id} with stream {self.stream}." + ) + if memspace != "DEVICE" or kind != "SCRATCH": + raise NotImplementedError( + 'Currently only memspace = "DEVICE" and kind = "SCRATCH" is supported in cudensitymat.workspace_set_memory.' 
+ ) + + cudm.workspace_set_memory( + self._handle._validated_ptr, + self._validated_ptr, + MEM_SPACE_MAP[memspace], + WORK_SPACE_KIND_MAP[kind], + memory_ptr, + size, + ) + + def _update_required_size_upper_bound(self, memspace="DEVICE", kind="SCRATCH") -> tuple[int]: + """ + Updates the upper bound to workspace sizes required among all previous prepare calls. + + Returns: + int: + Workspace size required by most recent prepare call. + int: + Upper bound to workspace sizes required. + """ + _, size = cudm.workspace_get_memory( + self._handle._validated_ptr, + self._validated_ptr, + MEM_SPACE_MAP[memspace], + WORK_SPACE_KIND_MAP[kind], + ) + self._required_size_upper_bound = max(self._required_size_upper_bound, size) + return size, self._required_size_upper_bound + + @cutn_utils.precondition(_check_valid_state) + def _sync(self) -> None: + if self._last_compute_event: + self._last_compute_event.synchronize() + self._last_compute_event = None + + @cutn_utils.precondition(_check_valid_state) + def _maybe_allocate(self, memspace="DEVICE", kind="SCRATCH") -> None: + """ + Allocates workspace buffer and attaches it to workspace descriptor, if necessary. + + Args: + memspace: str + "DEVICE" (default) or "HOST". Currently only "DEVICE" is supported. + kind: str + "SCRATCH" (default) or "CACHE". Currently only "SCRATCH" is supported. + """ + _ptr, _size = cudm.workspace_get_memory( + self._handle._validated_ptr, + self._validated_ptr, + MEM_SPACE_MAP[memspace], + WORK_SPACE_KIND_MAP[kind], + ) + if memspace != "DEVICE": + raise NotImplementedError("Only device memory buffers currently supported.") + if _ptr == 0: + # normal state after prepare call + _buf_size = getattr(self, f"_size_{kind.lower()}") + if _buf_size is None or _buf_size < self._required_size_upper_bound: + with cutn_utils.device_ctx(self.device_id), self._stream_holder.ctx: + try: + self.logger.info( + f"Allocating memory buffer of size {self._required_size_upper_bound} on device {self.device_id} with stream {self.stream}." + ) + _buf = self.allocator.memalloc(self._required_size_upper_bound) + except TypeError as e: + message = ( + "The method 'memalloc' in the allocator object must conform to the interface in the " + "'BaseCUDAMemoryManager' protocol." 
+                        )
+                        raise TypeError(message) from e
+                setattr(self, f"_size_{kind.lower()}", self._required_size_upper_bound)
+                setattr(self, f"_buf_{kind.lower()}", _buf)
+                ptr = _buf.ptr if _buf is not None else 0
+            else:
+                _buf = getattr(self, f"_buf_{kind.lower()}")
+                ptr = _buf.ptr if _buf is not None else 0
+
+            self._workspace_set_memory(ptr, getattr(self, f"_size_{kind.lower()}"), memspace, kind)
+
+        else:
+            # buffer currently attached
+            # nothing to do here
+            assert (
+                getattr(self, f"_buf_{kind.lower()}").ptr,
+                getattr(self, f"_size_{kind.lower()}"),
+            ) == (_ptr, _size)
diff --git a/python/samples/cutensornet/approxTN/mps_example.py b/python/samples/cutensornet/approxTN/mps_example.py
index f45eb9e..63538a1 100644
--- a/python/samples/cutensornet/approxTN/mps_example.py
+++ b/python/samples/cutensornet/approxTN/mps_example.py
@@ -64,7 +64,7 @@ def __init__(self, num_sites, phys_extent, max_virtual_extent, initial_state, da
         # create tensor descriptors
         for i in range(self.num_sites):
-            self.state_tensors.append(initial_state[i].astype(tensor.dtype, order="F"))
+            self.state_tensors.append(cp.asarray(initial_state[i], order="F"))
             extent = self.get_tensor_extent(i)
             modes = self.get_tensor_modes(i)
             desc_tensor = cutn.create_tensor_descriptor(self.handle, 3, extent, 0, modes, self.data_type)
@@ -259,17 +259,32 @@ def apply_gate(self, site_A, site_B, gate, verbose, stream):
         cutn.destroy_tensor_descriptor(desc_tensor_in_G)
 
     def __del__(self):
-        """Free all resources owned by the object."""
+        """
+        Calls `MPSHelper.free()`.
+
+        An explicit call to `MPSHelper.free()` by the user of this class makes it
+        possible to free resources at a predictable moment in time. In some cases,
+        relying on garbage collection can cause resource over-utilization
+        or other problems.
+
+        It is advised to always call `MPSHelper.free()` when you no longer need
+        the object.
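+
+        A minimal usage sketch (illustrative)::
+
+            mps_helper = MPSHelper(...)   # construct with your problem parameters
+            try:
+                ...                       # apply gates, query tensors
+            finally:
+                mps_helper.free()         # idempotent; safe to call more than once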
+ """ + self.free() + + def free(self): + """Free all resources owned by the object, if not already freed.""" + if self.handle is None: + return + self.handle = cutn.destroy(self.handle) # free() should be idempotent for desc_tensor in self.desc_tensors: cutn.destroy_tensor_descriptor(desc_tensor) - cutn.destroy(self.handle) cutn.destroy_workspace_descriptor(self.work_desc) cutn.destroy_tensor_svd_config(self.svd_config) cutn.destroy_tensor_svd_info(self.svd_info) -if __name__ == '__main__': - +def main(): print("cuTensorNet-vers:", cutn.get_version()) dev = cp.cuda.Device() # get current device props = cp.cuda.runtime.getDeviceProperties(dev.id) @@ -295,6 +310,7 @@ def __del__(self): for i in range(num_sites): # we create dummpy indices for MPS tensors on the boundary for easier bookkeeping # we'll use Fortran layout throughout this example + # all tensors have to have the same dtype tensor = cp.zeros((1,2,1), dtype=np.complex128, order="F") tensor[0,0,0] = 1.0 initial_state.append(tensor) @@ -352,4 +368,11 @@ def __del__(self): for i in range(num_sites): tensor = mps_helper.get_tensor(i) modes = mps_helper.get_tensor_modes(i) - print(f"Site {i}, extent: {tensor.shape}, modes: {modes}") \ No newline at end of file + print(f"Site {i}, extent: {tensor.shape}, modes: {modes}") + + mps_helper.free() + +if __name__ == '__main__': + main() + + diff --git a/python/samples/cutensornet/experimental/network_state/circuits_cirq/example05_mps_exact.py b/python/samples/cutensornet/experimental/network_state/circuits_cirq/example05_mps_exact.py index f552e3a..2b7ebcc 100644 --- a/python/samples/cutensornet/experimental/network_state/circuits_cirq/example05_mps_exact.py +++ b/python/samples/cutensornet/experimental/network_state/circuits_cirq/example05_mps_exact.py @@ -62,5 +62,6 @@ # compute the expectation value for a series of Pauli operators pauli_string = {'IXIXIXIX': 0.5, 'IYIYIYIY': 0.2, 'IZIZIZIZ': 0.3} - expec = state.compute_expectation(pauli_string).real / state.compute_norm() + expec, norm = state.compute_expectation(pauli_string, return_norm=True) + expec = expec.real / norm print(f"{expec=}") diff --git a/python/samples/cutensornet/experimental/network_state/circuits_qiskit/example05_mps_exact.py b/python/samples/cutensornet/experimental/network_state/circuits_qiskit/example05_mps_exact.py index 3917035..bfe567f 100644 --- a/python/samples/cutensornet/experimental/network_state/circuits_qiskit/example05_mps_exact.py +++ b/python/samples/cutensornet/experimental/network_state/circuits_qiskit/example05_mps_exact.py @@ -57,5 +57,7 @@ # compute the expectation value for a series of Pauli operators pauli_string = {'IXIXIXIX': 0.5, 'IYIYIYIY': 0.2, 'IZIZIZIZ': 0.3} - expec = state.compute_expectation(pauli_string).real / state.compute_norm() - print(f"{expec=}") + + expec, norm = state.compute_expectation(pauli_string, return_norm=True) + expec = expec.real / norm + print(f"{expec=}") \ No newline at end of file diff --git a/python/samples/cutensornet/experimental/network_state/generic_states/example01_basic_torch.py b/python/samples/cutensornet/experimental/network_state/generic_states/example01_basic_torch.py index c6d9c97..076f012 100644 --- a/python/samples/cutensornet/experimental/network_state/generic_states/example01_basic_torch.py +++ b/python/samples/cutensornet/experimental/network_state/generic_states/example01_basic_torch.py @@ -74,7 +74,8 @@ # compute the normalized expectation value for a series of Pauli operators pauli_string = {'IXIXIX': 0.5, 'IYIYIY': 0.2, 'IZIZIZ': 0.3} -expec 
= state.compute_expectation(pauli_string).real / state.compute_norm() +expec, norm = state.compute_expectation(pauli_string, return_norm=True) +expec = expec.real / norm print(f"{expec=}") # release resources diff --git a/python/samples/cutensornet/experimental/network_state/generic_states/example02_arbitrary_dimension_numpy.py b/python/samples/cutensornet/experimental/network_state/generic_states/example02_arbitrary_dimension_numpy.py index 62324d0..41dd680 100644 --- a/python/samples/cutensornet/experimental/network_state/generic_states/example02_arbitrary_dimension_numpy.py +++ b/python/samples/cutensornet/experimental/network_state/generic_states/example02_arbitrary_dimension_numpy.py @@ -57,8 +57,9 @@ # compute the un-normalized bitstring amplitude bitstring = '0' * n_state_modes -amplitude = state.compute_amplitude(bitstring) -print(f"Bitstring amplitude for {bitstring}: {amplitude}") +amplitude, norm = state.compute_amplitude(bitstring, return_norm=True) +prob = abs(amplitude) ** 2 / norm +print(f"Bitstring amplitude for {bitstring}: {amplitude}, prob={prob}") # compute batched bitstring amplitude with first mode fixed at state 0 and second mode at state 1 fixed = {0: 0, 1: 1} @@ -85,8 +86,8 @@ expec_operator = NetworkOperator(state_mode_extents, dtype=dtype) expec_operator.append_product(1, expec_prod_modes, expec_prod_operators) -expec = state.compute_expectation(expec_operator) -print(f"Expectation value: {expec}, norm: {state.compute_norm()}") +expec, norm = state.compute_expectation(expec_operator, return_norm=True) +print(f"normalized expectation value = {expec/norm}") # release resources state.free() diff --git a/python/samples/cutensornet/experimental/network_state/generic_states/example03_mps_mpo_cupy.py b/python/samples/cutensornet/experimental/network_state/generic_states/example03_mps_mpo_cupy.py index 6e98e52..00213a1 100644 --- a/python/samples/cutensornet/experimental/network_state/generic_states/example03_mps_mpo_cupy.py +++ b/python/samples/cutensornet/experimental/network_state/generic_states/example03_mps_mpo_cupy.py @@ -78,8 +78,9 @@ # compute the bitstring amplitude bitstring = '0' * n_state_modes -amplitude = state.compute_amplitude(bitstring) -print(f"Bitstring amplitude for {bitstring}: {amplitude}") +amplitude, norm = state.compute_amplitude(bitstring, return_norm=True) +prob = abs(amplitude) ** 2 / norm +print(f"Bitstring amplitude for {bitstring}: {amplitude}, prob={prob}") # compute batched bitstring amplitude with first mode fixed at state 0 and second mode at state 1 fixed = {0: 0, 1: 1} @@ -98,8 +99,8 @@ print(samples) # compute the normalized expectation value for the MPO -expec = state.compute_expectation(mpo) / state.compute_norm() -print(f"{expec=}") +expec, norm = state.compute_expectation(mpo, return_norm=True) +print(f"normalized expectation value = {expec/norm}") # release resources state.free() diff --git a/python/samples/cutensornet/experimental/network_state/generic_states/example04_variational_expectation.py b/python/samples/cutensornet/experimental/network_state/generic_states/example04_variational_expectation.py index d726adf..bbeebfd 100644 --- a/python/samples/cutensornet/experimental/network_state/generic_states/example04_variational_expectation.py +++ b/python/samples/cutensornet/experimental/network_state/generic_states/example04_variational_expectation.py @@ -82,22 +82,25 @@ e0 = cp.cuda.Event() e1 = cp.cuda.Event() e0.record() - expec_a = state_a.compute_expectation(operator).real / state_a.compute_norm() + expec_a, norm_a = 
state_a.compute_expectation(operator, return_norm=True) + expec_a = expec_a.real / norm_a e1.record() e1.synchronize() - print(f"Expectation for state_a from direct computation : {expec_a}, runtime={cp.cuda.get_elapsed_time(e0, e1)} ms") + print(f"Normalized expectation for state_a from direct computation : {expec_a}, runtime={cp.cuda.get_elapsed_time(e0, e1)} ms") - expec_b = state_b.compute_expectation(operator).real / state_b.compute_norm() + expec_b, norm_b = state_b.compute_expectation(operator, return_norm=True) + expec_b = expec_b.real / norm_b e0.record() e0.synchronize() - print(f"Expectation for state_b from direct computation : {expec_b}, runtime={cp.cuda.get_elapsed_time(e1, e0)} ms") + print(f"Normalized expectation for state_b from direct computation : {expec_b}, runtime={cp.cuda.get_elapsed_time(e1, e0)} ms") for tensor_id in two_body_op_ids: state_a.update_tensor_operator(tensor_id, op_two_body_y, unitary=False) print(f"Update two body operator ({tensor_id}) from X to Y in state_a") - expec_b_updated = state_a.compute_expectation(operator).real / state_a.compute_norm() + expec_b_updated, norm_b_updated = state_a.compute_expectation(operator, return_norm=True) + expec_b_updated = expec_b_updated.real / norm_b_updated e1.record() e1.synchronize() - print(f"Expectation for state_b from updating state_a : {expec_b_updated}, runtime={cp.cuda.get_elapsed_time(e0, e1)} ms") + print(f"Normalized expectation for state_b from updating state_a : {expec_b_updated}, runtime={cp.cuda.get_elapsed_time(e0, e1)} ms") assert cp.allclose(expec_b, expec_b_updated) diff --git a/python/samples/cutensornet/experimental/network_state/generic_states/example05_noisy_unitary_channels.py b/python/samples/cutensornet/experimental/network_state/generic_states/example05_noisy_unitary_channels.py new file mode 100644 index 0000000..266478f --- /dev/null +++ b/python/samples/cutensornet/experimental/network_state/generic_states/example05_noisy_unitary_channels.py @@ -0,0 +1,88 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +""" +Contraction-based tensor network simulation of a noisy quantum state with unitary tensor channels. +The custom state is constructed by iteratively applying tensor operators and unitary bitflip tensor channels with the following topology: + +Vacuum: A B C D E F + | | | | | | +one body op O O O O O O + | | | | | | +bit flip unitary channel U U U U U U + | | | | | | +two body op GGGGG GGGGG GGGGG + | | | | | | +two body op | GGGGG GGGGG | + | | | | | | + +The expectation value is statistically computed with a trajectory based simulation. 
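+
+Each trajectory samples one unitary from every channel according to the channel
+probabilities, so averaging the per-trajectory expectation values over many
+trajectories approximates the expectation value of the corresponding noisy (mixed) state.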
+""" +import cupy as cp + +from cuquantum.cutensornet.experimental import NetworkState, NetworkOperator + +# +# specify the dimensions of the tensor network state +n_state_modes = 6 +state_mode_extents = (2, ) * n_state_modes +dtype = 'complex128' +n_trajectories = 1000 + +def random_unitary(n): + """ + Create a random unitary tensor + """ + mat = cp.random.random((2**n, 2**n)) + 1.j * cp.random.random((2**n, 2**n)) + q, r = cp.linalg.qr(mat) + unitary = q.reshape((2,2)*n) + return unitary + +# create random operators and random unitary channels +cp.random.seed(1) +op_one_body = random_unitary(1) +op_two_body = random_unitary(2) + +bitflip_channel = [ + cp.eye(2, dtype=dtype), # I + cp.asarray([[0, 1], [1, 0]], dtype=dtype) # X +] +bitflip_probabilities = [0.95, 0.05] # 5% for bitflip + +# create an emtpy NetworkState object, by default it will tensor network contraction as simulation method +state = NetworkState(state_mode_extents, dtype=dtype) + +# apply one body tensor operators & unitary channels to the tensor network state +for i in range(n_state_modes): + modes_one_body = (i, ) + tensor_id = state.apply_tensor_operator(modes_one_body, op_one_body, unitary=True, immutable=True) + channel_id = state.apply_unitary_tensor_channel(modes_one_body, bitflip_channel, bitflip_probabilities) + +# apply two body tensor operators & unitary channels to the tensor network state +for i in range(2): + for site in range(i, n_state_modes, 2): + if site + 1 < n_state_modes: + modes_two_body = (site, site+1) + tensor_id = state.apply_tensor_operator(modes_two_body, op_two_body, unitary=True, immutable=True) + +# compute the normalized expectation value for a series of Pauli operators +pauli_string = {'IXIXIX': 0.5, 'IYIYIY': 0.2, 'IZIZIZ': 0.3} + +# explicitly construct NetworkOperator to activate caching mechanism +network_operator = NetworkOperator.from_pauli_strings(pauli_string, dtype=dtype) +expec_counter = dict() +for i in range(n_trajectories): + expec, norm = state.compute_expectation(network_operator, return_norm=True) + expec = expec.real / norm + if expec not in expec_counter: + expec_counter[expec] = 0 + expec_counter[expec] += 1 + +expec_average = 0 +for expec, n_count in sorted(expec_counter.items(), key=lambda item: item[1], reverse=True): + print(f"{expec=:.6f}, frequency={n_count / n_trajectories}") + expec_average += expec * n_count / n_trajectories +print(f"Expec average: {expec_average:.6f}") +# release resources +state.free() diff --git a/python/samples/cutensornet/tensor/example12-qr_mem_limit_handling.py b/python/samples/cutensornet/tensor/example12-qr_mem_limit_handling.py new file mode 100644 index 0000000..7dd9391 --- /dev/null +++ b/python/samples/cutensornet/tensor/example12-qr_mem_limit_handling.py @@ -0,0 +1,33 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +""" +QR Example using CuPy ndarray. Show memory limit handling. + +The decomposition results are also CuPy ndarrays. 
+""" +import cupy as cp + +from cuquantum import tensor, MemoryLimitExceeded + +# create a random rank-4 tensor +cp.random.seed(2024) +a = cp.random.random((3,2,4,5)) + +try: + # use a minimal memory limit to demonstrate the handling of exceeding memory limit + q, r = tensor.decompose('ijab->ijx,xab', a, options={'memory_limit': 1}) +except MemoryLimitExceeded as e: + print("handling memory limit...") + free_memory = cp.cuda.runtime.memGetInfo()[0] + # setting device memory usage cap to 80% of free memory + memory_cap = int(free_memory * 0.8) + print(f"memory cap set to {e.limit} bytes while the required memory is {e.requirement} bytes on device {e.device_id}. (available memory: {memory_cap} bytes)") + + if e.requirement <= memory_cap: + print(f"memory limit is set to required memory...") + q, r = tensor.decompose('ijab->ijx,xab', a, options={'memory_limit': e.requirement}) + print("QR completed") + else: + print("exceeded maximal memory..., skipping QR") diff --git a/python/samples/cutensornet/tensor/example12-svd_mem_limit_handling.py b/python/samples/cutensornet/tensor/example12-svd_mem_limit_handling.py new file mode 100644 index 0000000..657fa48 --- /dev/null +++ b/python/samples/cutensornet/tensor/example12-svd_mem_limit_handling.py @@ -0,0 +1,35 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +""" +SVD Example using CuPy ndarray. Show memory limit handling. + +The decomposition results are also CuPy ndarrays. +""" +import cupy as cp + +from cuquantum import tensor, MemoryLimitExceeded + +# create a random rank-4 tensor +cp.random.seed(2024) +a = cp.random.random((3,2,4,5)) + +try: + # use a minimal memory limit to demonstrate the handling of exceeding memory limit + u, s, v = tensor.decompose('ijab->ijx,xab', a, method=tensor.SVDMethod(), options={'memory_limit': 1}) +except MemoryLimitExceeded as e: + print("handling memory limit...") + free_memory = cp.cuda.runtime.memGetInfo()[0] + # setting device memory usage cap to 80% of free memory + memory_cap = int(free_memory * 0.8) + print(f"memory cap set to {e.limit} bytes while the required memory is {e.requirement} bytes on device {e.device_id}. 
(available memory: {memory_cap} bytes)") + + if e.requirement <= memory_cap: + print(f"memory limit is set to required memory...") + u, s, v = tensor.decompose('ijab->ijx,xab', a, method=tensor.SVDMethod(), options={'memory_limit': e.requirement}) + print("SVD completed") + else: + print("exceeded maximal memory..., skipping SVD") + + diff --git a/python/samples/densitymat/operator_advanced.py b/python/samples/densitymat/operator_advanced.py new file mode 100644 index 0000000..fe598f0 --- /dev/null +++ b/python/samples/densitymat/operator_advanced.py @@ -0,0 +1,238 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import cupy as cp +import numpy as np +from cuquantum.densitymat import ( + tensor_product, + DenseMixedState, + DenseOperator, + WorkStream, + Operator, + OperatorAction, +) + +dev = cp.cuda.Device() # get current device +props = cp.cuda.runtime.getDeviceProperties(dev.id) +print("===== device info ======") +print("GPU-local-id:", dev.id) +print("GPU-name:", props["name"].decode()) +print("GPU-clock:", props["clockRate"]) +print("GPU-memoryClock:", props["memoryClockRate"]) +print("GPU-nSM:", props["multiProcessorCount"]) +print("GPU-major:", props["major"]) +print("GPU-minor:", props["minor"]) +print("========================") + + +# define the shape of the composite tensor product space +hilbert_space_dims = (4, 5, 2, 6, 3, 7) # six quantum degrees of freedom + +# define some elementary tensor operators +A = np.random.random((hilbert_space_dims[2],) * 2) # one-body elementary tensor operator + + +b = np.random.random( # two-body elementary tensor operator, arbitrary strides also supported + ( + hilbert_space_dims[3], + hilbert_space_dims[5], + ) + * 2 +) + + +# We can wrap the NDArray in a TensorOperator, which is useful if we want to use the same tensor multiple times. 
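+# (The concrete class here is DenseOperator; wrapping lets the same underlying
+# tensor be reused in several tensor products below without extra copies.)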
+B = DenseOperator(b)
+
+
+# one-body elementary tensor callback operator
+def c_callback(t, args):
+    return np.random.random((hilbert_space_dims[1],) * 2)
+
+
+c_representative_tensor = c_callback(0.0, ())
+# making an instance of DenseOperator is optional; we can also pass the tuple of
+# (c_representative_tensor, c_callback) directly below in place of `C`
+C = DenseOperator(c_representative_tensor, c_callback)
+
+print("Defined elementary operators A, B, C.")
+
+
+# define a scalar callback function (time-dependent coefficient)
+def my_callback(t, args):  # args is an arbitrary list of real user-defined parameters
+    _omega = args[0]
+    return np.sin(np.pi * _omega * t)  # return the scalar parameterized coefficient at time t
+
+
+# construct tensor products of elementary tensor operators
+ab = tensor_product(
+    (
+        A,  # elementary tensor operator
+        (2,),  # quantum degrees of freedom it acts on
+    ),
+    (
+        B,  # elementary tensor operator
+        (3, 5),  # quantum degrees of freedom it acts on
+    ),
+    coeff=1.0,  # constant (static) coefficient
+)
+
+bc = tensor_product(
+    (
+        B,  # elementary tensor operator
+        (3, 5),  # quantum degrees of freedom it acts on
+    ),
+    (
+        C,  # elementary tensor operator
+        (1,),  # quantum degrees of freedom it acts on
+    ),
+    coeff=my_callback,  # time-dependent parameterized coefficient represented by a user-defined callback function
+)
+
+# construct different operator terms
+term1 = ab + bc  # an OperatorTerm composed of a sum of two tensor operator products
+term1 += bc  # `OperatorTerm` also supports in-place addition
+
+term2 = tensor_product(  # an operator term composed of a single elementary tensor operator
+    (
+        C,  # elementary tensor operator
+        (1,),  # quantum degrees of freedom it acts on
+        (False,),  # operator action duality (side: left/right) for each quantum degree of freedom
+    ),
+    coeff=1.0,  # constant (static) coefficient
+)
+
+print("Created OperatorTerms term1 and term2.")
+
+# construct the Hamiltonian operator from two operator terms
+hamiltonian = Operator(
+    hilbert_space_dims,  # shape of the composite tensor space
+    (term1,),  # first operator term with a default coefficient 1.0
+    (
+        term2,
+        my_callback,
+    ),  # second operator term modulated by a parameterized time-dependent coefficient (callback function)
+)
+
+print("Created Hamiltonian Operator from term1 and term2.")
+
+# construct the Liouvillian for the von Neumann equation
+liouvillian = (
+    hamiltonian - hamiltonian.dual()
+)  # Hamiltonian action on the left minus Hamiltonian action on the right: [H, *]
+
+print("Created Liouvillian Operator from Hamiltonian.")
+
+# open a work stream over a CUDA stream
+my_stream = cp.cuda.Stream()
+ctx = WorkStream(stream=my_stream)
+
+# construct the Liouvillian action for a single quantum state
+liouvillian_action = OperatorAction(ctx, (liouvillian,))
+
+print("Created Liouvillian OperatorAction from Liouvillian.")
+
+# create a mixed quantum state (density matrix) with a zero-initialized data buffer
+batch_size = 1
+rho0 = DenseMixedState(ctx, hilbert_space_dims, batch_size, "complex128")
+slice_shape, slice_offsets = rho0.local_info
+rho0.attach_storage(cp.zeros(rho0.storage_size, dtype=rho0.dtype))
+# set storage to a Haar random unnormalized state
+# for MGMN execution, the data buffer may be larger than the locally stored slice of the state
+# the view method returns a tensor shaped view on the local slice (the full state for single-GPU execution)
+rho0.view()[:] = cp.random.normal(size=slice_shape) + (
+    1j * cp.random.normal(size=slice_shape)
+)
+# for non-random initialization and MGMN execution, we would use slice_offsets to determine how to set the elements
+norm = rho0.norm().get()[()]
+rho0.inplace_scale(np.sqrt(1 / norm))
+assert np.isclose(rho0.norm().get()[()], 1)
+
+print(
+    "Created a Haar random normalized mixed quantum state (not physical due to lack of hermiticity)."
+)
+
+# two ways of creating another mixed quantum state of the same shape and initializing it to zero
+rho1 = rho0.clone(cp.zeros_like(rho0.storage))
+rho2 = DenseMixedState(ctx, hilbert_space_dims, batch_size, "complex128")
+rho2.allocate_storage()
+
+print("Created a zero-initialized output mixed quantum state.")
+
+
+# prepare operator action on a mixed quantum state
+liouvillian_action.prepare(ctx, (rho0,))
+
+print("Prepared Liouvillian action through OperatorAction.prepare.")
+
+# set a parameter for the callback function to some value
+omega = 2.4
+
+# compute the operator action on a given quantum state
+liouvillian_action.compute(
+    0.0,  # time value
+    (omega,),  # user-defined parameters
+    (rho0,),  # input quantum state
+    rho1,  # output quantum state
+)
+
+print("Computed Liouvillian action through OperatorAction.compute.")
+
+# alternatively, prepare the operator action directly via the operator
+liouvillian.prepare_action(ctx, rho0)
+
+print("Prepared Liouvillian action through Operator.prepare_action.")
+
+# compute the operator action directly via the operator
+liouvillian.compute_action(
+    0.0,  # time value
+    (omega,),  # user-defined parameters
+    rho0,  # input quantum state
+    rho2,  # output quantum state
+)
+# OperatorAction.compute and Operator.compute_action are accumulative: repeated compute
+# calls into the same output state add to its contents.
+
+print("Computed Liouvillian action through Operator.compute_action.")
+
+# prepare the operator expectation value computation
+liouvillian.prepare_expectation(ctx, rho0)
+
+print("Prepared expectation through Operator.prepare_expectation.")
+
+# compute the operator expectation value
+expval = liouvillian.compute_expectation(
+    0.0,  # time value
+    (omega,),  # user-defined parameters
+    rho1,  # input quantum state
+)
+
+print("Computed expectation through Operator.compute_expectation.")
+
+# we can compute the operator action again without another prepare call
+liouvillian.compute_action(
+    0.0,  # time value
+    (omega,),  # user-defined parameters
+    rho0,  # input quantum state
+    rho1,  # output quantum state
+)
+
+print("Computed Liouvillian action through Operator.compute_action.")
+
+# assuming we want to do some other task in between, it may be useful to release the workspace
+ctx.release_workspace()
+
+# releasing the workspace has no user-facing side effects
+# for example, we do not need to reissue prepare calls
+liouvillian.compute_action(
+    0.0,  # time value
+    (omega,),  # user-defined parameters
+    rho0,  # input quantum state
+    rho2,  # output quantum state
+)
+
+print("Computed Liouvillian action through Operator.compute_action after releasing workspace.")
+
+# synchronize the work stream
+my_stream.synchronize()
+
+print("Finished computation and exit.")
diff --git a/python/samples/densitymat/operator_defaults.py b/python/samples/densitymat/operator_defaults.py
new file mode 100644
index 0000000..9c0d2b2
--- /dev/null
+++ b/python/samples/densitymat/operator_defaults.py
@@ -0,0 +1,184 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import cupy as cp
+import numpy as np
+from cuquantum.densitymat import (
+    tensor_product,
+    DenseMixedState,
WorkStream, + Operator, + OperatorAction, +) + +dev = cp.cuda.Device() # get current device +props = cp.cuda.runtime.getDeviceProperties(dev.id) +print("===== device info ======") +print("GPU-local-id:", dev.id) +print("GPU-name:", props["name"].decode()) +print("GPU-clock:", props["clockRate"]) +print("GPU-memoryClock:", props["memoryClockRate"]) +print("GPU-nSM:", props["multiProcessorCount"]) +print("GPU-major:", props["major"]) +print("GPU-minor:", props["minor"]) +print("========================") + +# define the shape of the composite tensor product space +hilbert_space_dims = (4, 5, 2, 6, 3, 7) # six quantum degrees of freedom + +# define some elementary tensor operators +A = np.random.random( + (hilbert_space_dims[2],) * 2 +) # one-body elementary tensor operator + +B = np.random.random( # two-body elementary tensor operator + ( + hilbert_space_dims[3], + hilbert_space_dims[5], + ) + * 2 +) + +C = np.random.random( + (hilbert_space_dims[1],) * 2 +) # one-body elementary tensor operator + +print("Defined elementary operators A, B, C.") + + +# define a scalar callback function (time-dependent coefficient) +def my_callback(t, args): # args is an arbitrary list of real user-defined parameters + _omega = args[0] + return np.sin( + np.pi * _omega * t + ) # return the scalar parameterized coefficient at time t + + +# construct tensor products of elementary tensor operators +ab = tensor_product( + ( + A, # elementary tensor operator + (2,), # quantum degrees of freedom it acts on + ), + ( + B, # elementary tensor operator + (3, 5), # quantum degrees of freedom it acts on + ), + coeff=1.0, # constant (static) coefficient +) + +bc = tensor_product( + ( + B, # elementary tensor operator + (3, 5), # quantum degrees of freedom it acts on + ), + ( + C, # elementary tensor operator + (1,), # quantum degrees of freedom it acts on + ), + coeff=my_callback, # time-dependent parameterized coefficient represented by a user-defined callback function +) + +# construct different operator terms +term1 = ab + bc # an operator term composed of a sum of two tensor operator products + +term2 = ( + tensor_product( # an operator term composed of a single elementary tensor operator + ( + C, # elementary tensor operator + (1,), # quantum degrees of freedom it acts on + ), + ) +) + +print("Created OperatorTerms term1 and term2.") + +# construct the Hamiltonian operator from two operator terms +hamiltonian = Operator( + hilbert_space_dims, # shape of the composite tensor space + (term1,), # first operator term with a default coefficient 1.0 + ( + term2, + my_callback, + ), # second operator term modulated by a parameterized time-dependent coefficient (callback function) +) + +print("Created Hamiltonian Operator from term1 and term2.") + +# construct the Liouvillian for the von Neumann equation +liouvillian = ( + hamiltonian - hamiltonian.dual() +) # Hamiltonian action on the left minus Hamiltonian action on the right: [H, *] + +print("Created Liouvillian Operator from Hamiltonian.") + +# open a work stream +ctx = WorkStream() + +# construct the Liouvillian action for a single quantum state +liouvillian_action = OperatorAction(ctx, (liouvillian,)) + +print("Created Liouvillian OperatorAction from Liouvillian.") + +# create a mixed quantum state (density matrix) with zero initialized data buffer +batch_size = 1 +rho0 = DenseMixedState(ctx, hilbert_space_dims, batch_size, "complex128") +slice_shape, slice_offsets = rho0.local_info +rho0.attach_storage(cp.zeros(rho0.storage_size, dtype=rho0.dtype)) +# set storage to a 
Haar random unnormalized state
+# for MGMN execution, the data buffer may be larger than the locally stored slice of the state
+# the view method returns a tensor shaped view on the local slice (the full state for single-GPU execution)
+rho0.view()[:] = cp.random.normal(size=slice_shape) + (
+    1j * cp.random.normal(size=slice_shape)
+)
+# for non-random initialization and MGMN execution, we would use slice_offsets to determine how to set the elements
+norm = rho0.norm().get()[()]
+rho0.inplace_scale(np.sqrt(1 / norm))
+assert np.isclose(rho0.norm().get()[()], 1)
+
+print(
+    "Created a Haar random normalized mixed quantum state (not physical due to lack of hermiticity)."
+)
+
+# two ways of creating another mixed quantum state of the same shape and initializing it to zero
+rho1 = rho0.clone(cp.zeros_like(rho0.storage))
+rho2 = DenseMixedState(ctx, hilbert_space_dims, batch_size, "complex128")
+rho2.allocate_storage()
+
+print("Created a zero-initialized output mixed quantum state.")
+
+# prepare operator action on a mixed quantum state
+liouvillian_action.prepare(ctx, (rho0,))
+
+print("Prepared Liouvillian action through OperatorAction.prepare.")
+
+# set a parameter for the callback function to some value
+omega = 2.4
+
+# compute the operator action on a given quantum state
+liouvillian_action.compute(
+    0.0,  # time value
+    (omega,),  # user-defined parameters
+    (rho0,),  # input quantum state
+    rho1,  # output quantum state
+)
+
+print("Computed Liouvillian action through OperatorAction.compute.")
+
+# alternatively, prepare the operator action directly via the operator
+liouvillian.prepare_action(ctx, rho0)
+
+print("Prepared Liouvillian action through Operator.prepare_action.")
+
+# compute the operator action directly via the operator
+liouvillian.compute_action(
+    0.0,  # time value
+    (omega,),  # user-defined parameters
+    rho0,  # input quantum state
+    rho2,  # output quantum state
+)
+
+print("Computed Liouvillian action through Operator.compute_action.")
+
+print("Finished computation and exit.")
diff --git a/python/samples/densitymat/operator_mpi.py b/python/samples/densitymat/operator_mpi.py
new file mode 100644
index 0000000..fa98a1a
--- /dev/null
+++ b/python/samples/densitymat/operator_mpi.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import cupy as cp
+import numpy as np
+from mpi4py import MPI
+
+from cuquantum.densitymat import (
+    tensor_product,
+    DensePureState,
+    DenseOperator,
+    WorkStream,
+    OperatorTerm,
+    Operator,
+    OperatorAction,
+)
+
+NUM_DEVICES = cp.cuda.runtime.getDeviceCount()
+rank = MPI.COMM_WORLD.Get_rank()
+dev = cp.cuda.Device(rank % NUM_DEVICES)
+dev.use()
+props = cp.cuda.runtime.getDeviceProperties(dev.id)
+print("===== device info ======")
+print("GPU-local-id:", dev.id)
+print("GPU-name:", props["name"].decode())
+print("GPU-clock:", props["clockRate"])
+print("GPU-memoryClock:", props["memoryClockRate"])
+print("GPU-nSM:", props["multiProcessorCount"])
+print("GPU-major:", props["major"])
+print("GPU-minor:", props["minor"])
+print("========================")
+
+
+# create a WorkStream on the current device
+ctx = WorkStream(device_id=dev.id)
+
+# set up the MPI communicator
+ctx.set_communicator(comm=MPI.COMM_WORLD.Dup(), provider="MPI")
+
+# define the shape of the composite tensor product space
+hilbert_space_dims = (4, 4, 4, 4, 4)  # five quantum degrees of freedom
+batch_size = 2
+
+# define some elementary tensor operators
+identity = DenseOperator(np.eye(hilbert_space_dims[0], dtype="complex128"))
+op_term = OperatorTerm(dtype="complex128")
+for i in range(len(hilbert_space_dims)):
+    op_term += tensor_product(
+        (
+            identity,
+            (1,),
+        )
+    )
+# This operator will just be proportional to the identity
+op = Operator(hilbert_space_dims, (op_term,))
+op_action = OperatorAction(ctx, (op,))
+
+
+def set_ditstring(state, batch_index, ditstring: list):
+    """
+    Set the state's coefficient for the `batch_index`'th quantum state to the product state in the computational basis encoded by `ditstring`.
+    """
+    slice_shape, slice_offsets = state.local_info
+    ditstring = np.asarray(
+        ditstring
+        + [
+            batch_index,
+        ],
+        dtype="int",
+    )
+    ditstring_is_local = True
+    state_inds = []
+    for slice_dim, slice_offset, state_dit in zip(
+        slice_shape, slice_offsets, ditstring
+    ):
+        ditstring_is_local = state_dit in range(slice_offset, slice_offset + slice_dim)
+        if not ditstring_is_local:
+            break
+        else:
+            state_inds.append(
+                range(slice_offset, slice_offset + slice_dim).index(state_dit)
+            )
+    if ditstring_is_local:
+        strides = (1,) + tuple(np.cumprod(np.array(slice_shape)[:-1]))
+        ind = np.sum(strides * np.array(state_inds))
+        state.storage[ind] = 1.0
+
+
+# product states to be set for each batch state
+global_ditstrings = [[0, 1, 3, 2, 0], [1, 0, 3, 2, 1]]
+
+# make the initial state
+state = DensePureState(ctx, hilbert_space_dims, batch_size, "complex128")
+required_buffer_size = state.storage_size
+state.attach_storage(cp.zeros((required_buffer_size,), dtype="complex128"))
+# set product states for each batch input state
+for batch_ind in range(batch_size):
+    set_ditstring(state, batch_ind, global_ditstrings[batch_ind])
+# more ways to make a State instance
+state_out = state.clone(cp.zeros(required_buffer_size, dtype="complex128"))
+another_state = DensePureState(ctx, hilbert_space_dims, batch_size, "complex128")
+another_state.allocate_storage()
+# prepare and compute the Operator action
+op.prepare_action(ctx, state)
+op.compute_action(0.0, [], state, state_out)
+state_out_slice = state_out.view()
+# compute the OperatorAction
+op_action.compute(
+    0.0,
+    [],
+    [
+        state,
+    ],
+    another_state,
+)
+# OperatorAction and Operator for this specific example have the same effect
+assert cp.allclose(another_state.view(), state_out_slice)
diff --git a/python/setup.py b/python/setup.py
index 43d1080..587af4f 100644
--- a/python/setup.py
+++ b/python/setup.py
@@ -35,8 +35,9 @@
 install_requires = [
     'numpy>=1.21, <3.0',  # ">=1.21,<3"
     # 'torch',  # <-- PyTorch is optional; also, the PyPI version does not support GPU...
-    f'custatevec-cu{utils.cuda_major_ver}~=1.6',  # ">=1.6.0,<2"
-    f'cutensornet-cu{utils.cuda_major_ver}>=2.5.0,<3',
+    f'custatevec-cu{utils.cuda_major_ver}~=1.7',  # ">=1.7.0,<2"
+    f'cutensornet-cu{utils.cuda_major_ver}~=2.6',  # ">=2.6.0,<3"
+    f'cudensitymat-cu{utils.cuda_major_ver}~=0.0.5',  # ">=0.0.5,<0.1"
 ]
 if utils.cuda_major_ver == '11':
     install_requires.append('cupy-cuda11x>=13.0')  # no ambiguity
@@ -106,6 +107,27 @@ def cleanup_dst_files():
         sources=["cuquantum/cutensornet/_internal/cutensornet.pyx"],
         language="c++",
     ),
+    Extension(
+        "cuquantum.bindings.cudensitymat",
+        sources=["cuquantum/bindings/cudensitymat.pyx"],
+        language="c++",
+    ),
+    Extension(
+        "cuquantum.bindings.cycudensitymat",
+        sources=["cuquantum/bindings/cycudensitymat.pyx"],
+        language="c++",
+    ),
+    Extension(
+        "cuquantum.bindings._internal.cudensitymat",
+        sources=["cuquantum/bindings/_internal/cudensitymat.pyx"],
+        language="c++",
+    ),
+    Extension(
+        "cuquantum.bindings._utils",
+        sources=["cuquantum/bindings/_utils.pyx"],
+        include_dirs=[os.path.join(utils.cuda_path, 'include')],
+        language="c++",
+    ),
     Extension(
         "cuquantum._utils",
         sources=["cuquantum/_utils.pyx"],
@@ -168,6 +190,6 @@ def cleanup_dst_files():
     zip_safe=False,
     python_requires='>=3.10',
     install_requires=install_requires,
-    tests_require=install_requires+tests_require,
+    extras_require={"test": tests_require},
     cmdclass=cmdclass,
 )
diff --git a/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py b/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py
index 4e482c1..8524bf0 100644
--- a/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py
+++ b/python/tests/cuquantum_tests/custatevec_tests/test_custatevec.py
@@ -1012,6 +1012,8 @@ def test_compute_expectation(self, handle, xp, expect_dtype, input_form, mempool
             basis_bits, basis_bits_len,
             compute_type, workspace_ptr, workspace_size)
+        # wait for expect to be updated
+        cp.cuda.Device().synchronize()
         assert xp.allclose(expect, 2**self.n_qubits)
 
         # TODO: test other input forms?
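The synchronization added in this hunk and the next guards the same hazard: the library enqueues the write of `expect` on a stream, so the write lands in stream order, while a plain host-side read does not synchronize. A minimal sketch of the race-free pattern using only the public `cupy.cuda.runtime` API (the buffer names are ours, and the raw-pointer copy stands in for what the bindings issue internally):

```python
import cupy as cp
import numpy as np

expect = np.zeros(1, dtype=np.float64)      # host buffer handed to the library by pointer
src = cp.full(1, 4.0, dtype=np.float64)
stream = cp.cuda.Stream(non_blocking=True)
# a stream-ordered device-to-host copy into a raw pointer
cp.cuda.runtime.memcpyAsync(
    expect.ctypes.data, src.data.ptr, expect.nbytes,
    cp.cuda.runtime.memcpyDeviceToHost, stream.ptr,
)
cp.cuda.Device().synchronize()              # wait for the stream-ordered write to land
assert np.allclose(expect, 4.0)             # only now is the host read well defined
```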
@@ -1097,6 +1099,8 @@ def test_compute_expectation_batched(
             expect.ctypes.data, matrices_ptr, data_type,
             cusv.MatrixLayout.ROW, n_matrices,
             basis_bits, n_basis_bits, compute_type,
             workspace_ptr, workspace_size)
+        # wait for expect to be updated
+        cp.cuda.Device().synchronize()
         assert (np.allclose(expect, 1))
 
 class TestSampler(TestSV):
diff --git a/python/tests/cuquantum_tests/cutensornet_tests/circuit_data.py b/python/tests/cuquantum_tests/cutensornet_tests/circuit_data.py
index c4a76c1..967cbd9 100644
--- a/python/tests/cuquantum_tests/cutensornet_tests/circuit_data.py
+++ b/python/tests/cuquantum_tests/cutensornet_tests/circuit_data.py
@@ -14,6 +14,10 @@ import qiskit
 except ImportError:
     qiskit = None
+try:
+    import torch
+except ImportError:
+    torch = None
 
 from .test_utils import DEFAULT_RNG
 
@@ -185,6 +189,8 @@ def get_qiskit_multi_control_circuit():
 
 @pytest.fixture(scope="session")
 def backend_cycle():
+    if torch is None:
+        return itertools.cycle(('numpy', 'cupy'))
     return itertools.cycle(('numpy', 'cupy', 'torch'))
 
 @pytest.fixture(scope="function")
diff --git a/python/tests/cuquantum_tests/cutensornet_tests/circuit_tester.py b/python/tests/cuquantum_tests/cutensornet_tests/circuit_tester.py
index 1f111bd..d03ee30 100644
--- a/python/tests/cuquantum_tests/cutensornet_tests/circuit_tester.py
+++ b/python/tests/cuquantum_tests/cutensornet_tests/circuit_tester.py
@@ -138,10 +138,17 @@ def test_misc(self):
         raise NotImplementedError
 
     def test_norm(self):
-        norm1 = self.reference_engine.compute_norm()
+        try:
+            norm1 = self.reference_engine.compute_norm()
+        except AttributeError:
+            # NetworkState has no compute_norm; recover the norm via compute_amplitude
+            _, norm1 = self.reference_engine.compute_amplitude('0'*self.n_qubits, return_norm=True)
         for engine in self.target_engines:
             for _ in get_engine_iters(engine):
-                norm2 = engine.compute_norm()
+                try:
+                    norm2 = engine.compute_norm()
+                except AttributeError:
+                    _, norm2 = engine.compute_amplitude('0'*self.n_qubits, return_norm=True)
                 message = f"{engine.__class__.__name__} maxDiff={abs(norm1-norm2)}"
                 assert np.allclose(norm1, norm2, **engine.tolerance), message
diff --git a/python/tests/cuquantum_tests/cutensornet_tests/state_data.py b/python/tests/cuquantum_tests/cutensornet_tests/state_data.py
index a33526c..9030b8c 100644
--- a/python/tests/cuquantum_tests/cutensornet_tests/state_data.py
+++ b/python/tests/cuquantum_tests/cutensornet_tests/state_data.py
@@ -15,6 +15,10 @@ import qiskit
 except ImportError:
     qiskit = None
+try:
+    import torch
+except ImportError:
+    torch = None
 
 from cuquantum.cutensornet.experimental import NetworkState, NetworkOperator
 from .circuit_data import cirq_circuits, get_qiskit_unitary_gate, qiskit_circuits
@@ -100,8 +104,21 @@ def qiskit_insert_random_layers(circuit, num_random_layers=DEFAULT_NUM_RANDOM_LA
     {'max_extent': 3, 'canonical_center': 1, 'rel_cutoff': 0.1, 'normalization': 'L2'}
 )
 
+# states with unitary channels to test
+# (qudits, initial_mps_dim, config, dtype) for each test case
+unitary_state_tests = (
+    (4, None, {}, 'complex128'),
+    (7, 2, {}, 'complex64'),
+    (4, 2, {'mpo_application': 'exact'}, 'complex128'),  # exact MPS simulation
+    (5, None, {'mpo_application': 'exact'}, 'complex128'),  # exact MPS simulation
+    (6, 2, {'max_extent': 4, 'rel_cutoff': 0.1}, 'complex128'),
+    (8, None, {'max_extent': 6, 'rel_cutoff': 0.1}, 'complex64'),
+)
+
 @pytest.fixture(scope="session")
 def factory_backend_cycle():
+    if torch is None:
+        return itertools.cycle(('numpy', 'cupy'))
     return itertools.cycle(('numpy', 'cupy', 'torch', 'torch-cpu'))
 
 @pytest.fixture(scope="function")
diff --git
a/python/tests/cuquantum_tests/cutensornet_tests/state_tester.py b/python/tests/cuquantum_tests/cutensornet_tests/state_tester.py index 5788360..68a6621 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/state_tester.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/state_tester.py @@ -2,24 +2,36 @@ # # SPDX-License-Identifier: BSD-3-Clause +import itertools + import numpy as np import opt_einsum as oe from cuquantum import CircuitToEinsum from cuquantum.cutensornet.experimental import NetworkState, TNConfig, MPSConfig, NetworkOperator +from cuquantum.cutensornet.experimental._internal.network_state_utils import get_pauli_map from cuquantum.cutensornet._internal.decomposition_utils import compute_mid_extent from cuquantum.cutensornet._internal.utils import infer_object_package from cuquantum.cutensornet._internal import tensor_wrapper from .approxTN_utils import gate_decompose, tensor_decompose, SVD_TOLERANCE, verify_unitary -from .circuit_tester import BaseTester, get_random_pauli_strings, get_engine_iters -from .circuit_utils import _BaseComputeEngine, ConverterComputeEngine, get_mps_tolerance +from .circuit_tester import BaseTester, get_random_pauli_strings, get_engine_iters, compute_sample_overlap +from .circuit_utils import ( + _BaseComputeEngine, + ConverterComputeEngine, + get_mps_tolerance, + reduced_density_matrix_from_sv, + amplitude_from_sv, + batched_amplitude_from_sv, + expectation_from_sv, +) from .test_utils import DEFAULT_RNG, EMPTY_DICT, TensorBackend, get_or_create_tensor_backend, atol_mapper, rtol_mapper, get_dtype_name, get_state_internal_backend_device # valid simulation setting for reference MPS class MPS_VALID_CONFIGS = {'max_extent', 'abs_cutoff', 'rel_cutoff', 'discarded_weight_cutoff', 'normalization', 'canonical_center', 'mpo'} +STATE_PROPERTIES_NAMES = ('amplitude', 'batched_amplitudes', 'state_vector', 'reduced_density_matrix', 'expectation', 'sampling') def is_converter_mps_compatible(converter): for _, qubits in converter.gates: @@ -59,7 +71,7 @@ def get_device_id(options): if isinstance(options, dict): device_id = options.get('device_id', 0) else: - device_id = getattr(options, 'device_id', None) + device_id = getattr(options, 'device_id', 0) return device_id def get_random_network_operator(state_dims, *, backend='cupy', rng=DEFAULT_RNG, num_repeats=2, dtype='complex128', options=None): @@ -109,6 +121,69 @@ def get_random_modes(): operator_obj.append_mpo(coefficient, mpo_modes, mpo_tensors) return operator_obj +def compute_state_basic_property(state, property_name): + func = getattr(state, f'compute_{property_name}') + if property_name == 'amplitude': + return func('0' * state.n, return_norm=True) + elif property_name == 'batched_amplitudes': + return func({0: 0, 1:1}) + elif property_name == 'state_vector': + return func() + elif property_name == 'reduced_density_matrix': + return func((0, )) + elif property_name == 'sampling': + return func(1000, seed=1) + elif property_name == 'expectation': + if set(state.state_mode_extents) == set([2]): + pauli_strings = {'X' * state.n: 0.1, + 'Y' * state.n: 0.2, + 'Z' * state.n: 0.4} + expec, norm = func(pauli_strings, return_norm=True) + return expec + else: + raise ValueError(f"{property_name} not supported") + +def compute_state_basic_quantities(state): + output = {} + for property_name in STATE_PROPERTIES_NAMES: + if property_name == 'expectation' and set(state.state_mode_extents) != set([2]): + continue + if property_name == 'amplitude': + output[property_name], output['norm'] = 
compute_state_basic_property(state, property_name) + else: + output[property_name] = compute_state_basic_property(state, property_name) + return output + +def apply_factory_sequence(network_state, sequence, parse_channels=True): + tensor_ids = [] + channel_info = dict() + for op, modes, gate_info in sequence: + if gate_info is None: + if isinstance(op, (list, tuple)): + # MPO + tensor_id = network_state.apply_mpo(modes, op) + else: + # GATE + tensor_id = network_state.apply_tensor_operator(modes, op) + else: + if 'probabilities' in gate_info: + if parse_channels: + # Unitary Channel + tensor_id = network_state.apply_unitary_tensor_channel(modes, op, gate_info['probabilities']) + else: + tensor_id = network_state.apply_tensor_operator(modes, op[0], unitary=True) + channel_info[tensor_id] = (op, gate_info['probabilities']) + else: + assert 'control_modes' in gate_info + assert 'control_values' in gate_info + # Controlled-Tensor + # NetworkState currently only support immutable controlled tensors + tensor_id = network_state.apply_tensor_operator(modes, op, control_modes=gate_info['control_modes'], control_values=gate_info['control_values'], immutable=True) + tensor_ids.append(tensor_id) + if parse_channels: + return tensor_ids + else: + return tensor_ids, channel_info class StateFactory: def __init__( @@ -137,7 +212,12 @@ def __init__( self.backend = TensorBackend(backend=backend, device_id=self.device_id) self.dtype = get_dtype_name(dtype) - assert set(layers).issubset(set('SDCM')) + dims = set(self.state_dims) + if len(dims) == 1 and dims.pop() == 2: + # unitary channel only supported for qubits + assert set(layers).issubset(set('SDCMU')) + else: + assert set(layers).issubset(set('SDCM')) self.layers = layers if rng is None: @@ -200,12 +280,38 @@ def _generate_raw_sequence(self): self._append_controlled_tensor_mpo_layer() elif layer == 'M': self._append_mpo_layer() + elif layer == 'U': + self._append_unitary_channel_layer() else: raise ValueError(f"layer type {layer} not supported") def append_sequence(self, sequence): self._sequence.append(sequence) + def _append_unitary_channel_layer(self): + if not hasattr(self, 'pauli_map'): + self.pauli_map = get_pauli_map(self.dtype, backend=self.backend.name, device_id=self.device_id) + pauli_map = self.pauli_map + qudits = list(range(self.num_qudits)) + self.rng.shuffle(qudits) + for i, q in enumerate(qudits[:2]): + if i == 0: + operands = [pauli_map['I'], pauli_map['X']] # bitflip channel + gate_info = {'probabilities': [0.95, 0.05]} + else: + operands = [pauli_map[p] for p in 'IXYZ'] # phase shift channel + gate_info = {'probabilities': [0.7, 0.15, 0.1, 0.05]} + self._sequence.append((operands, (q, ), gate_info)) + + if self.num_qudits >= 4: + XY = self.backend.einsum('Aa,Bb->ABab', pauli_map['X'], pauli_map['Y']) + YZ = self.backend.einsum('Aa,Bb->ABab', pauli_map['Y'], pauli_map['Z']) + ZX = self.backend.einsum('Aa,Bb->ABab', pauli_map['Z'], pauli_map['X']) + operands = [XY, YZ, ZX] + gate_info = {'probabilities': [0.7, 0.2, 0.1]} + self._sequence.append((operands, qudits[2:4], gate_info)) + return + def _append_single_qudit_layer(self): for i in range(self.num_qudits): shape = (self.state_dims[i], ) * 2 @@ -305,7 +411,8 @@ def _append_controlled_tensor_mpo_layer(self): except TypeError: t = t + t.conj().permute(*transpose_order) t /= self.backend.norm(t) - self._sequence.append((t, target_modes, (control_modes, control_values))) + gate_info = {'control_modes': control_modes, 'control_values': control_values} + self._sequence.append((t, 
target_modes, gate_info)) def compute_control_tensor(self, control_dim, control_val, rank, direction): c1_rank3 = self.backend.asarray([1, 0, 0, 0, 0, 0, 0, 1]).reshape(2, 2, 2) @@ -511,14 +618,17 @@ def get_sv_contraction_expression(self): mode_frontier += 1 operands += [t, modes] - for op, qudits, control_info in self.sequence: - if control_info is not None: - # convert control tensor into MPO - ctrl_modes, ctrl_vals = control_info - op = self.compute_ct_mpo_tensors(ctrl_modes, ctrl_vals, qudits, op) - qudits = qudits + ctrl_modes - qudits = sorted(qudits) - control_info = None + for op, qudits, gate_info in self.sequence: + if gate_info is not None: + if 'control_values' in gate_info and 'control_modes' in gate_info: + # convert control tensor into MPO + ctrl_modes, ctrl_vals = gate_info['control_modes'], gate_info['control_values'] + op = self.compute_ct_mpo_tensors(ctrl_modes, ctrl_vals, qudits, op) + qudits = qudits + ctrl_modes + qudits = sorted(qudits) + gate_info = None + else: + raise RuntimeError("Not the expected code path") n_qudits = len(qudits) if isinstance(op, (list, tuple)): # for MPO @@ -550,23 +660,15 @@ def get_sv_contraction_expression(self): operands.append(qudit_modes) return operands - def to_network_state(self, config=None, options=None): + def to_network_state(self, *, parse_channels=True, config=None, options=None): network_state = NetworkState(self.state_dims, dtype=self.dtype, config=config, options=options) if self.initial_mps_dim is not None: network_state.set_initial_mps(self.get_initial_state()) - for op, modes, control_info in self.sequence: - if control_info is None: - if isinstance(op, (list, tuple)): - # MPO - network_state.apply_mpo(modes, op) - else: - # GATE - network_state.apply_tensor_operator(modes, op) - else: - # Controlled-Tensor - # NetworkState currently only support immutable controlled tensors - network_state.apply_tensor_operator(modes, op, control_modes=control_info[0], control_values=control_info[1], immutable=True) - return network_state + outputs = apply_factory_sequence(network_state, self.sequence, parse_channels) + if parse_channels: + return network_state + else: + return network_state, outputs[1] class MPS(_BaseComputeEngine): @@ -607,20 +709,15 @@ def __init__( if self.mps_tensors[-1].ndim == 2: new_shape = self.mps_tensors[-1].shape + (1, ) self.mps_tensors[-1] = self.mps_tensors[-1].reshape(*new_shape) - self._minimal_compression(0, self.n-1, True) - if canonical_center is not None: - assert canonical_center >= 0 and canonical_center < self.n - self.canonical_center = canonical_center - self.sample_rng = sample_rng for key in svd_options.keys(): if key not in MPS_VALID_CONFIGS: raise ValueError(f"{key} not supported") self.svd_options = {'partition': 'UV'} self.svd_options.update(svd_options) - max_extent = self.svd_options.pop('max_extent', None) self.is_exact_svd = self.svd_options.get('normalization', None) is None for key in ('abs_cutoff', 'rel_cutoff', 'discarded_weight_cutoff'): self.is_exact_svd = self.is_exact_svd and self.svd_options.get(key, None) in (0, None) + max_extent = self.svd_options.pop('max_extent', None) self.max_extents = [] for i in range(self.n-1): max_shared_extent = min(np.prod(self.state_dims[:i+1]), np.prod(self.state_dims[i+1:])) @@ -628,6 +725,11 @@ def __init__( self.max_extents.append(max_shared_extent) else: self.max_extents.append(min(max_extent, max_shared_extent)) + self._minimal_compression(0, self.n-1, True) + if canonical_center is not None: + assert canonical_center >= 0 and 
canonical_center < self.n + self.canonical_center = canonical_center + self.sample_rng = sample_rng assert mpo_application in {'exact', 'approximate'} self.mpo_application = mpo_application self._tolerance = None @@ -786,23 +888,24 @@ def _canonicalize_site(self, i, direction, max_extent=None, **svd_options): tmp = self.backend.einsum('ipj,jql->ipql', ti, tj) self[left], _, self[right] = tensor_decompose('ipql->ipj,jql', tmp, method='svd', max_extent=max_extent, **svd_options) - def _minimal_compression(self, start, end, check_minimal=False): + def _minimal_compression(self, start, end, check_manageable=False): + if check_manageable: + manageable = True + for i in range(start, end+1): + if i == self.n - 1: + break + manageable = manageable and self[i].shape[-1] <= self.max_extents[i] + if not manageable: + break + if manageable: + return + for i in range(start, end+1): if i == self.n - 1: break - if check_minimal: - left_extent, shared_extent = np.prod(self[i].shape[:2]), self[i].shape[-1] - right_extent = np.prod(self[i+1].shape[1:]) - if shared_extent == min(left_extent, right_extent, shared_extent): - continue self._canonicalize_site(i, 'right') for i in range(end, start-1, -1): if i==0: break - if check_minimal: - left_extent, shared_extent = np.prod(self[i-1].shape[:2]), self[i-1].shape[-1] - right_extent = np.prod(self[i].shape[1:]) - if shared_extent == min(left_extent, right_extent, shared_extent): - continue self._canonicalize_site(i, 'left') def _apply_gate_1q(self, i, operand): @@ -968,8 +1071,8 @@ def from_factory(cls, factory, **kwargs): mps_tensors = None qudit_dims = factory.state_dims mps = cls(qudits, factory.backend, qudit_dims=qudit_dims, mps_tensors=mps_tensors, dtype=factory.dtype, **kwargs) - for op, modes, control_info in factory.sequence: - if control_info is None: + for op, modes, gate_info in factory.sequence: + if gate_info is None: if isinstance(op, (list, tuple)): # MPO mps.apply_mpo(modes, op) @@ -977,11 +1080,14 @@ def from_factory(cls, factory, **kwargs): # Gate mps.apply_gate(modes, op) else: - ctrl_modes, ctrl_vals = control_info - ct_tensors = factory.compute_ct_mpo_tensors(ctrl_modes, ctrl_vals, modes, op) - new_modes = modes + ctrl_modes - new_modes = sorted(new_modes) - mps.apply_mpo(new_modes, ct_tensors) + if 'control_values' in gate_info and 'control_modes' in gate_info: + ctrl_modes, ctrl_vals = gate_info['control_modes'], gate_info['control_values'] + ct_tensors = factory.compute_ct_mpo_tensors(ctrl_modes, ctrl_vals, modes, op) + new_modes = modes + ctrl_modes + new_modes = sorted(new_modes) + mps.apply_mpo(new_modes, ct_tensors) + else: + raise RuntimeError("Not expected code path") mps.canonicalize() return mps @@ -1125,16 +1231,16 @@ def run_tests(self): for o in mps_tensors: self._check_tensor(o) - sv = self.state.compute_state_vector() + sv, norm_0 = self.state.compute_state_vector(return_norm=True) self._check_tensor(sv, shape=self.state.state_mode_extents) # amplitude - amplitude = self.state.compute_amplitude('0' * self.n) + amplitude, norm_1 = self.state.compute_amplitude('0' * self.n, return_norm=True) np.allclose(sv.ravel()[0].item(), amplitude, **self.tolerance) # batched_amplitude fixed = {0:1, 1:0} if self.n > 2 else {0:1} - batched_amplitude = self.state.compute_batched_amplitudes(fixed) + batched_amplitude, norm_2 = self.state.compute_batched_amplitudes(fixed, return_norm=True) self._check_tensor(batched_amplitude, shape=self.state.state_mode_extents[len(fixed):]) # RDM @@ -1153,9 +1259,187 @@ def run_tests(self): # expectation if 
(set(self.state.state_mode_extents) == set([2, ])) and self.state.dtype.startswith('complex'): - expectation = self.state.compute_expectation('I' * self.n) + expectation, norm_3 = self.state.compute_expectation('I' * self.n, return_norm=True) assert np.allclose(expectation, norm_ref, **self.tolerance) - + else: + norm_3 = None + # norm - norm = self.state.compute_norm() - assert np.allclose(norm, norm_ref, **self.tolerance) \ No newline at end of file + for norm in (norm_0, norm_1, norm_2, norm_3): + if norm is not None: + assert np.allclose(norm, norm_ref, **self.tolerance) + + +class NetworkStateChannelTester: + """ + Two simulation workflows are compared in this tester: + + - state_with_channel used to perform trajectory based simulation + - state_reference used to compute exact results for each configuration with corresponding probabilities + """ + def __init__(self, factory, config, num_trajectories = 100): + self.factory = factory + self.config = config + self.dtype = factory.dtype + self.num_trajectories = num_trajectories + self.backend = factory.backend + self.n = factory.num_qudits + self.tolerance = {'atol': atol_mapper[self.dtype], + 'rtol': rtol_mapper[self.dtype]} + + # parse factory to state + self.state_with_channel = factory.to_network_state(config=config) + self.state_reference, self.channel_info = factory.to_network_state(config=config, parse_channels=False, options=self.state_with_channel.options) + channel_ids, channel_ops = zip(*self.channel_info.items()) + self.ops_to_update = [] + for op_ids in itertools.product(*[range(len(ops)) for ops, _ in channel_ops]): + p_tot = 1.0 + entry = [] + for channel_id, (ops, probabilities), op_id in zip(channel_ids, channel_ops, op_ids): + p_tot *= probabilities[op_id] + entry.append([channel_id, ops[op_id]]) + entry = [p_tot, ] + entry + self.ops_to_update.append(entry) + + self.pauli_strings = { + 'X' * self.n: 0.1, + 'Y' * self.n: 0.2, + 'Z' * self.n: 0.4, + } + # use an explicit NetworkOperator to activate caching for expectation computation speedup + self.pauli_operator = NetworkOperator.from_pauli_strings(self.pauli_strings, dtype=self.dtype, options=self.state_with_channel.options) + self.num_sampling_shots = 1000 + + def _compute_property_with_channel(self, property_name): + data = [] + for _ in range(self.num_trajectories): + if property_name == 'expectation': + # use self.pauli_operators to activate caching for speedup + expec, norm = self.state_with_channel.compute_expectation(self.pauli_operator, return_norm=True) + output = expec / norm + elif property_name == 'amplitude': + # skip norm + output = self.state_with_channel.compute_amplitude(self.n * '0') + else: + output = compute_state_basic_property(self.state_with_channel, property_name) + data.append(output) + return data + + def _compute_property_reference(self, property_name): + data = [] + for p, *entry in self.ops_to_update: + for (channel_id, operand) in entry: + self.state_reference.update_tensor_operator(channel_id, operand, unitary=True) + if property_name == 'expectation': + # use self.pauli_operators instead of a dictionary to activate caching for speedup + expec, norm = self.state_reference.compute_expectation(self.pauli_operator, return_norm=True) + output = expec / norm + elif property_name == 'amplitude': + # skip norm + output = self.state_reference.compute_amplitude(self.n * '0') + else: + output = compute_state_basic_property(self.state_reference, property_name) + data.append([p, output]) + return data + + def _verify_output(self, property_name, 
traj_output, reference_output): + if property_name == 'sampling': + # For sampling, since we fix the seed, we verify that each trajectory output has a high overlap with at least one of the reference output + for snap_shot in traj_output: + ovlp_with_reference = [] + for _, reference in reference_output: + ovlp = 0 + for key in set(snap_shot) & set(reference): + ovlp += min(snap_shot[key], reference[key]) + ovlp_with_reference.append(ovlp / self.num_sampling_shots) + #NOTE: this is often 1 since we fix the seed, but leaving 0.99 here in case of machine precision error + assert max(ovlp_with_reference) >= 0.99 + else: + # making sure that all traj_output can be obtained in the reference_output + traj_value_arrays = self.backend.asarray(traj_output) + reference_value_array = self.backend.asarray([val for _, val in reference_output]) + diff = abs(traj_value_arrays[:, None]- reference_value_array[None,:]) + if diff.ndim > 2: + diff = self.backend.sum(diff, axis=tuple(range(2, diff.ndim))) + # diff shape becomes (n_trajectory, n_possible_configs) + col_indices = self.backend.argmin(diff, axis=1) + min_values = diff[np.arange(diff.shape[0]), col_indices] + # make sure that each trajectory has a corresponding entry from the reference configurations + assert self.backend.allclose(min_values, self.backend.zeros_like(min_values), **self.tolerance) + + if property_name == 'expectation': + # For expectation, also test the average result are close for trajectory simulation and exact reference + traj_expectation = self.backend.average(traj_value_arrays) + reference_expectation = sum([p * val for p, val in reference_output]) + # NOTE: rtol=0.05 is manually selected. + # Due to the stochastic nature of noisy simulation, this test may fail when num_trajectories is low. 
+ # If expectation value check fails, increase num_trajectories and verify again (up to 3 max tries) in _run_property_test + assert self.backend.allclose(traj_expectation, reference_expectation, rtol=0.05) + + def _run_property_test(self, property_name): + if property_name != 'expectation': + traj_output = self._compute_property_with_channel(property_name) + reference_output = self._compute_property_reference(property_name) + self._verify_output(property_name, traj_output, reference_output) + else: + num_trajectories = self.num_trajectories + self.num_trajectories = 500 # starting from 500 as default 100 is not a good starting point for convergence + reference_output = self._compute_property_reference(property_name) + test_passed = False + for i in range(3): # max try + try: + traj_output = self._compute_property_with_channel(property_name) + self._verify_output(property_name, traj_output, reference_output) + test_passed = True + except AssertionError: + print(f"WARNING: expectation convergence test failing with {self.num_trajectories} trajectories") + self.num_trajectories *= 5 + if test_passed: + if self.num_trajectories != num_trajectories: + print(f"INFO: expectation convergence test passed with {self.num_trajectories} trajectories") + break + # revert to the original num_trajectories + self.num_trajectories = num_trajectories + assert test_passed + + def run_tests(self): + for property_name in STATE_PROPERTIES_NAMES: + self._run_property_test(property_name) + + if isinstance(self.state_with_channel.config, MPSConfig): + # check that if release_operators is set to True, all properties map to the same state/trajectory + # NOTE: this test must be performed after _run_property_test as release_operators=True will capture MPS in state_with_channel and invalidate the stochastic property + self.state_with_channel.compute_output_state(release_operators=True) + sv = self.state_with_channel.compute_state_vector() + + where = (0, 1) + rdm = self.state_with_channel.compute_reduced_density_matrix(where) + assert self.backend.allclose(rdm, reduced_density_matrix_from_sv(sv, where), **self.tolerance) + + bitstring = '1' * self.n + amp, norm = self.state_with_channel.compute_amplitude(bitstring, return_norm=True) + assert self.backend.allclose(amp, amplitude_from_sv(sv, bitstring), **self.tolerance) + assert self.backend.allclose(norm, (abs(sv)**2).sum(), **self.tolerance) + + fixed = {0: 1, 1: 0} + batched_amp = self.state_with_channel.compute_batched_amplitudes(fixed) + assert self.backend.allclose(batched_amp, batched_amplitude_from_sv(sv, fixed), **self.tolerance) + + expectation = self.state_with_channel.compute_expectation(self.pauli_operator) + assert self.backend.allclose(expectation, expectation_from_sv(sv, self.pauli_strings), **self.tolerance) + + nshots = 5000 + for _ in range(3): + samples = self.state_with_channel.compute_sampling(nshots, seed=1) + ovlp = compute_sample_overlap(samples, sv, None) + test_passed = ovlp >= 0.95 + if test_passed: + print(f"INFO: sampling test passed with {nshots=}") + break + else: + print(f"WARNING: sampling test failed with {nshots=}") + nshots *= 10 + assert test_passed + + self.state_with_channel.free() + self.state_reference.free() \ No newline at end of file diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_contract.py b/python/tests/cuquantum_tests/cutensornet_tests/test_contract.py index 6cc15ae..c8a2248 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_contract.py +++ 
b/python/tests/cuquantum_tests/cutensornet_tests/test_contract.py @@ -13,6 +13,7 @@ import cuquantum from cuquantum import cutensornet as cutn from cuquantum.cutensornet._internal.utils import infer_object_package +from cuquantum.cutensornet.configuration import MemoryLimitExceeded from .data import backend_names, dtype_names, einsum_expressions from .test_utils import atol_mapper, EinsumFactory, rtol_mapper @@ -97,12 +98,8 @@ def _test_runner( pytest.skip("this TN is currently not supported") else: raise - except MemoryError as e: - if "Insufficient memory" in str(e): - # not enough memory available to process, just skip - pytest.skip("Insufficient workspace memory available.") - else: - raise + except MemoryLimitExceeded as e: + pytest.skip("Insufficient workspace memory available.") if return_info: out, (path, info) = out @@ -146,12 +143,8 @@ def _test_runner( pytest.skip("cuquantum.einsum() fail -- TN too large?") else: raise - except MemoryError as e: - if "Insufficient memory" in str(e): - # not enough memory available to process, just skip - pytest.skip("Insufficient workspace memory available.") - else: - raise + except MemoryLimitExceeded as e: + pytest.skip("Insufficient workspace memory available.") backend_out = sys.modules[infer_object_package(out)] assert backend_out is backend diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py b/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py index 4b4f662..08fabf6 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_cutensornet.py @@ -953,8 +953,8 @@ def test_tensor_qr(self): {'abs_cutoff': 0.1, 'discarded_weight_cutoff': 0.05, 'normalization': 'L2'}, # discarded weight truncation {'abs_cutoff': 0.1, 'rel_cutoff': 0.1, 'algorithm': 'gesvdj', 'gesvdj_tol':1e-14, 'gesvdj_max_sweeps': 80}, # value based truncation {'abs_cutoff': 0.1, 'normalization':'L2', 'partition':'V', 'algorithm': 'gesvdj'}, # absolute value based truncation - {'rel_cutoff': 0.1, 'normalization':'LInf', 'partition':'UV', 'algorithm': 'gesvdp'}, # relative value based truncation - {'max_extent': 4, 'abs_cutoff': 0.1, 'rel_cutoff': 0.1, 'normalization':'L1', 'partition':'UV', 'algorithm': 'gesvdp'}, # compound truncation + {'normalization':'LInf', 'partition':'UV', 'algorithm': 'gesvdp'}, # exact gesvdp + {'max_extent': 4, 'abs_cutoff': 0.1, 'rel_cutoff': 0.1, 'normalization':'L1', 'partition':'UV'}, # compound truncation ), })) class TestTensorSVD: @@ -1078,8 +1078,8 @@ def test_tensor_svd(self): {'abs_cutoff': 0.1, 'discarded_weight_cutoff': 0.05, 'normalization': 'L2'}, # discarded weight truncation {'abs_cutoff': 0.1, 'rel_cutoff': 0.1, 'algorithm': 'gesvdj', 'gesvdj_tol':1e-14, 'gesvdj_max_sweeps': 80}, # value based truncation {'abs_cutoff': 0.1, 'normalization':'L2', 'partition':'V', 'algorithm': 'gesvdj'}, # absolute value based truncation - {'rel_cutoff': 0.1, 'normalization':'LInf', 'partition':'UV', 'algorithm': 'gesvdp'}, # relative value based truncation - {'max_extent': 4, 'abs_cutoff': 0.1, 'rel_cutoff': 0.1, 'normalization':'L1', 'partition':'UV', 'algorithm': 'gesvdp'}, # compound truncation + {'normalization':'LInf', 'partition':'UV', 'algorithm': 'gesvdp'}, # exact gesvdp + {'max_extent': 4, 'abs_cutoff': 0.1, 'rel_cutoff': 0.1, 'normalization':'L1', 'partition':'UV'}, # compound truncation ), })) class TestTensorGate: diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_experimental.py 
b/python/tests/cuquantum_tests/cutensornet_tests/test_experimental.py index 9f14644..b31ffba 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_experimental.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_experimental.py @@ -17,19 +17,33 @@ from cuquantum.cutensornet.experimental._internal.network_state_utils import STATE_SUPPORTED_DTYPE_NAMES from cuquantum.cutensornet._internal.decomposition_utils import DECOMPOSITION_DTYPE_NAMES, parse_decomposition from cuquantum.cutensornet._internal.utils import infer_object_package +from cuquantum.cutensornet.configuration import MemoryLimitExceeded from .approxTN_utils import split_contract_decompose, tensor_decompose, verify_split_QR, verify_split_SVD, SingularValueDegeneracyError from .circuit_data import backend, backend_cycle from .circuit_tester import get_random_pauli_strings from .circuit_utils import get_contraction_tolerance, get_mps_tolerance from .data import backend_names, contract_decompose_expr -from .state_data import testing_circuits_mps, qudits_to_test, state_settings, approx_mps_options, factory_backend, factory_backend_cycle, svd_algorithm, svd_algorithm_cycle, create_vqc_states, STATE_UPDATE_CONFIGS -from .state_tester import StateFactory, NetworkStateFunctionalityTester, ExactStateAPITester, ApproximateMPSTester, MPS, get_random_network_operator, is_converter_mps_compatible +from .state_data import ( + testing_circuits_mps, + qudits_to_test, + state_settings, + approx_mps_options, + factory_backend, + factory_backend_cycle, + svd_algorithm, + svd_algorithm_cycle, + create_vqc_states, + STATE_UPDATE_CONFIGS, + unitary_state_tests +) +from .state_tester import StateFactory, NetworkStateFunctionalityTester, ExactStateAPITester, ApproximateMPSTester, MPS, NetworkStateChannelTester +from .state_tester import get_random_network_operator, is_converter_mps_compatible, apply_factory_sequence, compute_state_basic_quantities from .test_circuit_converter import CIRCUIT_TEST_SETTING from .test_options import _OptionsBase from .test_utils import DecomposeFactory, deselect_contract_decompose_algorithm_tests, deselect_decompose_tests, get_svd_methods_for_test, DEFAULT_RNG, gen_rand_svd_method -from .test_utils import get_stream_for_backend, deselect_network_operator_from_pauli_string_tests, deselect_invalid_device_id_tests, get_state_internal_backend_device - +from .test_utils import get_stream_for_backend, get_state_internal_backend_device +from .test_utils import deselect_invalid_network_operator_tests, deselect_network_operator_from_pauli_string_tests, deselect_invalid_device_id_tests @pytest.mark.uncollect_if(func=deselect_decompose_tests) @pytest.mark.parametrize( @@ -135,25 +149,29 @@ def _run_contract_decompose(self, decompose_expr, xp, dtype, order, stream, algo info_ref=info_ref, **svd_kwargs) - def test_contract_qr_decompose(self, decompose_expr, xp, dtype, order, stream): algorithm = ContractDecomposeAlgorithm(qr_method={}, svd_method=False) self._run_contract_decompose(decompose_expr, xp, dtype, order, stream, algorithm) - def test_contract_svd_decompose(self, decompose_expr, xp, dtype, order, stream): methods = get_svd_methods_for_test(3, dtype) for svd_method in methods: algorithm = ContractDecomposeAlgorithm(qr_method=False, svd_method=svd_method) self._run_contract_decompose(decompose_expr, xp, dtype, order, stream, algorithm) - def test_contract_qr_assisted_svd_decompose(self, decompose_expr, xp, dtype, order, stream): methods = get_svd_methods_for_test(3, dtype) for svd_method in methods: algorithm 
= ContractDecomposeAlgorithm(qr_method={}, svd_method=svd_method) self._run_contract_decompose(decompose_expr, xp, dtype, order, stream, algorithm) +def test_memory_limit(): + decompose_expr = 'il->ix,lx' + factory = DecomposeFactory(decompose_expr) + operands = factory.generate_operands(factory.input_shapes, "numpy", "float64", "C") + with pytest.raises(MemoryLimitExceeded): + outputs = contract_decompose(decompose_expr, *operands, options={'memory_limit': 1}) + class TestContractDecomposeAlgorithm(_OptionsBase): @@ -203,7 +221,7 @@ def test_contract_decompose_info(self, qr_method, svd_method, svd_info, optimize # Correctness tests will be performed in TestNetworkState class TestNetworkOperator: - @pytest.mark.uncollect_if(func=deselect_invalid_device_id_tests) + @pytest.mark.uncollect_if(func=deselect_invalid_network_operator_tests) @pytest.mark.parametrize("backend", backend_names) @pytest.mark.parametrize("state_dim_extents",(3, 4, 7, (3, 2, 4, 5), (4, 5, 2, 3, 2))) @pytest.mark.parametrize("dtype", STATE_SUPPORTED_DTYPE_NAMES) @@ -236,6 +254,30 @@ def test_from_pauli_strings(self, backend, n_qubits, num_pauli_strings, dtype, d assert (o.name, o.device_id) == (expected_backend, expected_device) +def create_state_factory(qudits, state_setting, backend, dtype): + print(f"{backend=}") + adjacent_double_layer, mpo_bond_dim, mpo_num_sites, mpo_geometry, ct_target_place, initial_mps_dim = state_setting + if isinstance(qudits, (tuple, list)): + # for qudits with different dimensions, exact simulation only supports adjacent double layers + if len(set(qudits)) != 1: + adjacent_double_layer = True + factory = StateFactory(qudits, + dtype, + backend=backend, + layers='SDCMDS', + adjacent_double_layer=adjacent_double_layer, + mpo_bond_dim=mpo_bond_dim, + mpo_num_sites=mpo_num_sites, + mpo_geometry=mpo_geometry, + ct_target_place=ct_target_place, + initial_mps_dim=initial_mps_dim) + return factory + +@pytest.fixture(scope='function') +def state_factory(qudits, state_setting, factory_backend, dtype): + return create_state_factory(qudits, state_setting, factory_backend, dtype) + + class TestNetworkStateFunctionality: @pytest.mark.uncollect_if(func=deselect_invalid_device_id_tests) @@ -255,7 +297,7 @@ def test_circuit_state(self, circuit, dtype, config, backend, device_id): def test_custom_state(self, qudits, dtype, config, factory_backend): print(f"{factory_backend=}") adjacent_double_layer = False - if isinstance(qudits_to_test, (tuple, list)) and len(set(qudits_to_test)) != 1: + if isinstance(qudits, (tuple, list)) and len(set(qudits)) != 1: # for qudits with different dimensions, exact simulation only supports adjacent double layers adjacent_double_layer = True @@ -281,41 +323,25 @@ def test_exact_circuit_state(self, circuit, dtype, backend, svd_algorithm): state_tester.run_tests() @pytest.mark.parametrize("qudits", qudits_to_test) - @pytest.mark.parametrize("dtype", ('float32', 'float64', 'complex64', 'complex128')) @pytest.mark.parametrize("state_setting", state_settings) - def test_exact_custom_state(self, qudits, dtype, state_setting, factory_backend): - print(f"{factory_backend=}") - adjacent_double_layer, mpo_bond_dim, mpo_num_sites, mpo_geometry, ct_target_place, initial_mps_dim = state_setting - if isinstance(qudits_to_test, (tuple, list)): - # for qudits with different dimensions, exact simulation only supports adjacent double layers - if len(set(qudits_to_test)) != 1: - adjacent_double_layer = True - factory = StateFactory(qudits, - dtype, - backend=factory_backend, - layers='SDCMDS', - 
adjacent_double_layer=adjacent_double_layer, - mpo_bond_dim=mpo_bond_dim, - mpo_num_sites=mpo_num_sites, - mpo_geometry=mpo_geometry, - ct_target_place=ct_target_place, - initial_mps_dim=initial_mps_dim) - expr = factory.get_sv_contraction_expression() + @pytest.mark.parametrize("dtype", ('float32', 'float64', 'complex64', 'complex128')) + def test_exact_custom_state(self, state_factory): + expr = state_factory.get_sv_contraction_expression() sv0 = contract(*expr) - mps = MPS.from_factory(factory, mpo_application='exact') + mps = MPS.from_factory(state_factory, mpo_application='exact') sv = mps.compute_state_vector() - assert factory.backend.allclose(sv0, sv, **mps.tolerance) + assert state_factory.backend.allclose(sv0, sv, **mps.tolerance) sv = None rdm0 = mps.compute_reduced_density_matrix((0,)) for config in (TNConfig(), MPSConfig(mpo_application='exact')): - with factory.to_network_state(config=config) as state: + with state_factory.to_network_state(config=config) as state: sv = state.compute_state_vector() rdm = state.compute_reduced_density_matrix((0,)) - assert factory.backend.allclose(sv, sv0, **mps.tolerance) - assert factory.backend.allclose(rdm, rdm0, **mps.tolerance) + assert state_factory.backend.allclose(sv, sv0, **mps.tolerance) + assert state_factory.backend.allclose(rdm, rdm0, **mps.tolerance) @pytest.mark.parametrize("circuit", testing_circuits_mps) @@ -346,22 +372,9 @@ def test_approximate_circuit_state(self, circuit, mps_config_iter, backend): @pytest.mark.parametrize("state_setting", state_settings) @pytest.mark.parametrize("dtype", ('float64', 'complex128')) @pytest.mark.parametrize("mps_option", approx_mps_options) - def test_approximate_custom_state(self, qudits, state_setting, dtype, mps_option, factory_backend): - print(f"{factory_backend=}") - adjacent_double_layer, mpo_bond_dim, mpo_num_sites, mpo_geometry, ct_target_place, initial_mps_dim = state_setting - factory = StateFactory(qudits, - dtype, - layers='SDCMDS', - backend=factory_backend, - adjacent_double_layer=adjacent_double_layer, - mpo_bond_dim=mpo_bond_dim, - mpo_num_sites=mpo_num_sites, - mpo_geometry=mpo_geometry, - ct_target_place=ct_target_place, - initial_mps_dim=initial_mps_dim - ) + def test_approximate_custom_state(self, state_factory, mps_option): try: - tester = ApproximateMPSTester.from_factory(factory, mps_option, rng=np.random.default_rng(2024)) + tester = ApproximateMPSTester.from_factory(state_factory, mps_option, rng=np.random.default_rng(2024)) tester.run_tests() except SingularValueDegeneracyError: pytest.skip("Test skipped due to singular value degeneracy issue") @@ -382,8 +395,8 @@ def test_update_reuse_correctness(self, config): original_expec = [] for state in [state_a, state_b]: - e = state.compute_expectation(operator) / state.compute_norm() - original_expec.append(e) + e, norm = state.compute_expectation(operator, return_norm=True) + original_expec.append(e/norm) for tensor_id in two_body_op_ids: state_a.update_tensor_operator(tensor_id, op_two_body_y, unitary=False) @@ -391,8 +404,8 @@ def test_update_reuse_correctness(self, config): updated_expec = [] for state in [state_b, state_a]: - e = state.compute_expectation(operator) / state.compute_norm() - updated_expec.append(e) + e, norm = state.compute_expectation(operator, return_norm=True) + updated_expec.append(e/norm) for e1, e2 in zip(original_expec, updated_expec): np.allclose(e1, e2, **tolerance) @@ -411,4 +424,76 @@ def test_update_reuse_correctness(self, config): updated_sv.append(state.compute_state_vector()) for sv1, 
sv2 in zip(original_sv, updated_sv): - cp.allclose(sv1, sv2, **tolerance) \ No newline at end of file + cp.allclose(sv1, sv2, **tolerance) + + + @pytest.mark.parametrize("qudits", qudits_to_test) + @pytest.mark.parametrize("state_setting", state_settings) + @pytest.mark.parametrize("dtype", ('complex128', )) + @pytest.mark.parametrize("config", (MPSConfig(max_extent=4, rel_cutoff=1e-1),)) + def test_mps_release_operators(self, state_factory, config): + num_operands = len(state_factory.sequence) + + ############################################################# + # Case I. NetworkState with release_operators in the middle # + ############################################################# + + state = NetworkState(state_factory.state_dims, dtype=state_factory.dtype, config=config) + if state_factory.initial_mps_dim is not None: + state.set_initial_mps(state_factory.get_initial_state()) + # apply the first half operators + tensor_ids_first_half = set(apply_factory_sequence(state, state_factory.sequence[:num_operands//2])) + tensors_0 = state.compute_output_state(release_operators=True) + # create a copy as initial guess for another NetworkState object + try: + tensors_0 = [o.copy() for o in tensors_0] + except AttributeError: + tensors_0 = [o.clone() for o in tensors_0] # torch + # compute the intermediate state output + intermediate_output = compute_state_basic_quantities(state) + # Apply the second half + tensor_ids_second_half = set(apply_factory_sequence(state, state_factory.sequence[num_operands//2:])) + # make sure that there is no overlap in the output tensor ids + assert not tensor_ids_first_half.intersection(tensor_ids_second_half) + with state: + output = compute_state_basic_quantities(state) + + ####################################################### + # Reference I. NetworkState without release_operators # + ####################################################### + with state_factory.to_network_state(config=config) as reference_state: + reference_1 = compute_state_basic_quantities(reference_state) + + #################################################### + # Reference II. 
NetworkState with initial state # + #################################################### + new_state = NetworkState(state_factory.state_dims, dtype=state_factory.dtype, config=config) + new_state.set_initial_mps(tensors_0) + # Apply the second half operators + apply_factory_sequence(new_state, state_factory.sequence[num_operands//2:]) + with new_state: + reference_2 = compute_state_basic_quantities(new_state) + + allclose = state_factory.backend.allclose + for key, result in output.items(): + intm = intermediate_output[key] + ref1 = reference_1[key] + ref2 = reference_2[key] + if key == 'sampling': + assert result==ref1 and result==ref2 and result != intm + else: + # NOTE: result != intm holds here because operands are generated as random tensors + assert allclose(result, ref1) and allclose(result, ref2) and (not allclose(result, intm)) + + @pytest.mark.parametrize('unitary_state_setting', unitary_state_tests) + def test_unitary_channel(self, unitary_state_setting): + qudits, initial_mps_dim, config, dtype = unitary_state_setting + factory = StateFactory(qudits, + dtype, + layers='SDMUDS', + backend='cupy', + rng=np.random.default_rng(qudits), + initial_mps_dim=initial_mps_dim, + adjacent_double_layer=True) + unitary_channel_tester = NetworkStateChannelTester(factory, config, num_trajectories=100) + unitary_channel_tester.run_tests() \ No newline at end of file diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_tensor.py b/python/tests/cuquantum_tests/cutensornet_tests/test_tensor.py index 67e1a00..58e166b 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_tensor.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_tensor.py @@ -13,6 +13,7 @@ from cuquantum import tensor from cuquantum.cutensornet._internal.decomposition_utils import DECOMPOSITION_DTYPE_NAMES from cuquantum.cutensornet._internal.utils import infer_object_package +from cuquantum.cutensornet.configuration import MemoryLimitExceeded from .approxTN_utils import tensor_decompose, verify_split_QR, verify_split_SVD, SingularValueDegeneracyError from .data import backend_names, tensor_decomp_expressions @@ -116,10 +117,20 @@ def test_svd( decompose_expr, xp, dtype, order, stream, method, blocking=blocking, return_info=return_info) - +def test_memory_limit(): + decompose_expr, shapes = ('ab->ax,xb', [(8, 8)]) + factory = DecomposeFactory(decompose_expr, shapes=shapes) + operand = factory.generate_operands(factory.input_shapes, "numpy", "float64", "C")[0] + with pytest.raises(MemoryLimitExceeded): + outputs = tensor.decompose(decompose_expr, operand, options={'memory_limit': 1}) + + class TestDecompositionOptions(TestNetworkOptions): options_type = tensor.DecompositionOptions + + def test_compute_type(self): + pass # Skip this test for DecompositionOptions class TestSVDMethod(_OptionsBase): diff --git a/python/tests/cuquantum_tests/cutensornet_tests/test_utils.py b/python/tests/cuquantum_tests/cutensornet_tests/test_utils.py index 805aff9..ba80ec5 100644 --- a/python/tests/cuquantum_tests/cutensornet_tests/test_utils.py +++ b/python/tests/cuquantum_tests/cutensornet_tests/test_utils.py @@ -463,15 +463,21 @@ def is_device_id_valid(device_id): return device_id < num_devices return True -def deselect_network_operator_from_pauli_string_tests(backend, n_qubits, num_pauli_strings, dtype, device_id, *args, **kwargs): +def deselect_network_operator_from_pauli_string_tests(*args, **kwargs): + backend = kwargs.get('backend') + dtype = kwargs.get('dtype') if backend == 'torch-cpu' or 
dtype.startswith('float'): # NetworkOperator.from_pauli_strings not support torch-cpu return True - return not is_device_id_valid(device_id) + return deselect_invalid_network_operator_tests(*args, **kwargs) def deselect_invalid_device_id_tests(*args, **kwargs): device_id = kwargs.get('device_id', None) return not is_device_id_valid(device_id) +def deselect_invalid_network_operator_tests(*args, **kwargs): + backend = kwargs.get('backend') + return deselect_invalid_device_id_tests(*args, **kwargs) or (backend.startswith('torch') and torch is None) + def get_state_internal_backend_device(backend, device_id): expected_backend = { 'numpy': 'cupy', @@ -531,11 +537,13 @@ def zeros(self, *args, **kwargs): def norm(self, *args, **kwargs): return self.module.linalg.norm(*args, **kwargs) - def einsum(self, *args, **kwargs): - return self.module.einsum(*args, **kwargs) - def allclose(self, *args, **kwargs): + if np.isscalar(args[0]) and np.isscalar(args[1]): + return np.allclose(*args, **kwargs) return self.module.allclose(*args, **kwargs) - def vstack(self, *args, **kwargs): - return self.module.vstack(*args, **kwargs) + def __getattr__(self, name): + try: + return getattr(self.module, name) + except AttributeError as e: + raise e diff --git a/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/__init__.py b/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/conftest.py b/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/conftest.py new file mode 100644 index 0000000..1dcaa75 --- /dev/null +++ b/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/conftest.py @@ -0,0 +1,36 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +""" +Fixture configuration for tests in trajectories_noise directory + +network_state_wrap.py defines `TrajectorySim` class which provides a basic quantum simulator API. +It is also used in cusvsim ubackend tests, so the same API can be used for future tests that overlap between ubackend and cuTN + +- trajectory_sim fixture allows to use different TrajectorySim classes. +- dtype fixture parametrizes over different data types + +If you want to specialize parameters for some particular test, redefine the fixtures in the corresponding test file. 
+ +The test_* files are identical to the corresponding test files in cusvsim ubackend +""" + +import pytest +from .network_state_wrap import ( + TrajectoryNaive, + TrajectoryApplyChannel, +) + +# pytestmark doesn't work in conftest.py +# pytestmark = pytest.mark.parametrize("state_algo", ["mps", "tn"]) + + +@pytest.fixture(params=["mps", "tn"]) +def state_algo(request): + return request.param + + +@pytest.fixture(params=[TrajectoryNaive, TrajectoryApplyChannel]) +def trajectory_sim(request, n_qubits, state_algo): + return request.param(n_qubits, algo=state_algo) diff --git a/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/network_state_wrap.py b/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/network_state_wrap.py new file mode 100644 index 0000000..405760f --- /dev/null +++ b/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/network_state_wrap.py @@ -0,0 +1,144 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +""" +Configuration presets for NetworkState + +To add a new NetworkState config preset, edit STATE_CONFIG_MAP, +then use network_state_config to create the NetworkState object in tests. +""" + +from abc import ABC +from typing import Iterator +import numpy as np +from cuquantum.cutensornet.experimental import NetworkState, TNConfig, MPSConfig +from .quantum_channels import QuantumChannel + +# in pytest cases, python objects are not displayed nicely, so let's use string tags +STATE_CONFIG_MAP = {"tn": TNConfig(), "mps": MPSConfig(max_extent=4, rel_cutoff=1e-5)} + + +def network_state_config(n_qubits, algo: str, dtype="complex128") -> NetworkState: + """ + Helper function to configure NetworkState to use MPS or TN + + Args: + - n_qubits: int + - algo: str + algorithm config id from network_state_config.STATE_CONFIG_MAP + """ + # workaround for MPS, one qubit isn't working + if n_qubits == 1: + n_qubits = 2 + state_mode_extents = (2,) * n_qubits + if algo not in STATE_CONFIG_MAP: + raise ValueError(f"Unknown state config id: {algo}") + config = STATE_CONFIG_MAP[algo] + nstate = NetworkState(state_mode_extents, dtype=dtype, config=config) + return nstate + + +class TrajectorySim(ABC): + def apply_channel(self, qubits, channel: QuantumChannel): ... + def apply_gate(self, qubits, gate): ... + def rdm(self, qubits) -> np.ndarray: ... + def probs(self, qubits) -> np.ndarray: ... + def expectation(self, pauli_dict) -> float: ... + def iterate_trajectories(self, n_trajectories) -> Iterator["TrajectorySim"]: + """Use the object only within the iteration.""" + ... 
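The concrete simulators follow below; the contract the ABC above defines is that a caller drives one simulator object through `iterate_trajectories` and queries it once per trajectory. A usage sketch under stated assumptions (the `trajectories_noise` directory is on `sys.path`, and a GPU with cuQuantum is available); with these inputs the trajectory average of the Z expectation on qubit 0 should approach -0.9:

```python
import numpy as np
from network_state_wrap import TrajectoryNaive
from quantum_channels import bitflip_channel, QuantumGates

sim = TrajectoryNaive(n_qubits=2, algo="tn")
channel = bitflip_channel(0.05)
exps = []
for t in sim.iterate_trajectories(20):
    t.apply_gate((0,), QuantumGates.X)   # deterministic gate: |00> -> |10>
    t.apply_channel((0,), channel)       # stochastic: one branch sampled per trajectory
    exps.append(t.expectation({"ZI": 1.0}))
print(np.mean(exps))                     # trajectory-averaged <Z_0>, about -0.9
```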
+
+
+class TrajectoryNaive(TrajectorySim):
+    n_qubits: int
+    algo: str
+    dtype: str
+
+    def __init__(self, n_qubits: int, algo: str, dtype="complex128"):
+        self.ns = network_state_config(n_qubits, algo, dtype)
+        self.n_qubits = n_qubits
+        self.algo = algo
+        self.dtype = dtype
+
+    def apply_channel(self, qubits, channel: QuantumChannel):
+        gate = channel.choose_op()
+        self.apply_gate(qubits, gate)
+
+    def apply_gate(self, qubits, gate, control_modes=None, control_values=None):
+        self.ns.apply_tensor_operator(
+            qubits,
+            gate,
+            unitary=False,
+            control_modes=control_modes,
+            control_values=control_values,
+            immutable=True,
+        )
+
+    def rdm(self, qubits=None):
+        if qubits is None:
+            qubits = list(range(self.n_qubits))
+        nstates_ = 2 ** len(qubits)
+        dm = self.ns.compute_reduced_density_matrix(qubits)
+        return dm.reshape(nstates_, nstates_)
+
+    def probs(self, qubits):
+        dm = self.rdm(qubits)
+        probs = np.diagonal(dm).real
+        return probs
+
+    def expectation(self, pauli_dict) -> float:
+        return self.ns.compute_expectation(pauli_dict).real
+
+    def iterate_trajectories(self, n_trajectories):
+        for _ in range(n_trajectories):
+            with network_state_config(self.n_qubits, algo=self.algo) as ns:
+                self.ns = ns
+                yield self
+
+
+class TrajectoryApplyChannel(TrajectoryNaive):
+    # Prevents applying a gate to the same object in the trajectory loop
+    _constructed: bool
+    # Prevents re-using a measured MPS between trajectories
+    _evolved: bool
+
+    def apply_gate(self, qubits, gate, control_modes=None, control_values=None):
+        if self._constructed:
+            return
+        self.ns.apply_tensor_operator(
+            qubits,
+            gate,
+            unitary=True,
+            control_modes=control_modes,
+            control_values=control_values,
+            immutable=True,
+        )
+
+    def apply_channel(self, qubits, channel: QuantumChannel):
+        if self._constructed:
+            return
+        self.ns.apply_unitary_tensor_channel(qubits, channel.ops, channel.probs)
+
+    def evolve(self):
+        if self.algo == "mps":
+            self.ns.compute_output_state(release_operators=True)
+        sv = self.ns.compute_state_vector()  # force evaluation of the current trajectory
+        self._evolved = True
+
+    def rdm(self, qubits=None):
+        self.evolve()
+        return super().rdm(qubits)
+
+    def iterate_trajectories(self, n_trajectories):
+        self._evolved = False
+        self._constructed = False
+        self.ns = network_state_config(self.n_qubits, algo=self.algo)
+        for _ in range(n_trajectories):
+            if self._evolved:
+                self.ns.free()
+                self.ns = network_state_config(self.n_qubits, algo=self.algo)
+                self._constructed = False
+            yield self
+            self._constructed = True
+        self.ns.free()
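The two subclasses above differ in where the channel randomness is drawn: `TrajectoryNaive` samples one branch on the host per trajectory and applies it as a plain tensor operator, while `TrajectoryApplyChannel` registers the whole channel once via `apply_unitary_tensor_channel` and lets the library resample a branch on each trajectory. A minimal sketch of the latter call on a bare `NetworkState` (two qubits, a 5% bitflip on qubit 0; assumes a GPU with cuQuantum):

```python
import numpy as np
from cuquantum.cutensornet.experimental import NetworkState, TNConfig

X = np.array([[0, 1], [1, 0]], dtype="complex128")
I = np.eye(2, dtype="complex128")

state = NetworkState((2, 2), dtype="complex128", config=TNConfig())
# register the channel once; each compute_* call then samples one branch internally
state.apply_unitary_tensor_channel((0,), [I, X], [0.95, 0.05])
amp = state.compute_amplitude("00")   # amplitude for one sampled trajectory
state.free()
```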
diff --git a/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/quantum_channels.py b/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/quantum_channels.py
new file mode 100644
index 0000000..16a47a9
--- /dev/null
+++ b/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/quantum_channels.py
@@ -0,0 +1,67 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import numpy as np
+from dataclasses import dataclass
+from typing import Union
+
+
+@dataclass
+class QuantumChannel:
+    """
+    Dataclass storing the operators and sampling probabilities of a quantum channel.
+
+    probs=None corresponds to the uniform distribution.
+    """
+
+    probs: Union[np.ndarray, None, list]
+    ops: list[np.ndarray]
+
+    def choose_op(self, dtype: str = "complex128"):
+        # index over ops (not probs) so that probs=None falls back to uniform sampling
+        i_ = np.random.choice(np.arange(len(self.ops)), p=self.probs)
+        return self.ops[i_].astype(dtype)
+
+    def mul_left(self, op: np.ndarray):
+        """
+        In-place tensor multiply by `op` from the left.
+
+        Args:
+            op: operator to tensordot with.
+                The modes of the operand are expected to be ordered as
+                ``ABC...abc...``, where ``ABC...`` denotes output bra modes and
+                ``abc...`` denotes input ket modes corresponding to ``modes``
+        """
+        # the matrix form of a rank-2n qubit operator has dimension 2**n = 2**(ndim // 2)
+        twodim = lambda x: x.reshape(2 ** (x.ndim // 2), -1)
+        self.ops = [
+            np.tensordot(twodim(op), twodim(o_), axes=0)
+            .transpose([0, 2, 1, 3])
+            .reshape((2,) * (op.ndim + o_.ndim))
+            for o_ in self.ops
+        ]
+
+
+@dataclass
+class QuantumGates:
+    I = np.array([[1, 0], [0, 1]]).astype("complex128")
+    X = np.array([[0, 1], [1, 0]]).astype("complex128")
+    Y = np.array([[0, -1j], [1j, 0]]).astype("complex128")
+    Z = np.array([[1, 0], [0, -1]]).astype("complex128")
+    eZZ = np.diag(np.exp(1j * np.array([1, -1, -1, 1]))).astype("complex128")
+
+
+def depolarizing_channel(l: float) -> QuantumChannel:
+    """
+    Depolarizing channel with strength `l`:
+    https://en.wikipedia.org/wiki/Quantum_depolarizing_channel
+    """
+    return QuantumChannel(
+        probs=[1 - 3 * l / 4, l / 4, l / 4, l / 4],
+        ops=[QuantumGates.I, QuantumGates.X, QuantumGates.Y, QuantumGates.Z],
+    )
+
+
+def bitflip_channel(p: float) -> QuantumChannel:
+    """
+    Probability of X rotation is `p`.
+    Probability of I rotation is `1-p`.
+    """
+    return QuantumChannel(probs=[p, 1 - p], ops=[QuantumGates.X, QuantumGates.I])
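The fully depolarizing case exercised later in test_depolarizing_channel (l = 1.0) has a convenient closed form: the Pauli twirl maps any single-qubit rho to I/2. A small self-contained numpy check of that identity (the input state is arbitrary):

```python
import numpy as np

I = np.eye(2, dtype=complex)
X = np.array([[0, 1], [1, 0]], dtype=complex)
Y = np.array([[0, -1j], [1j, 0]], dtype=complex)
Z = np.array([[1, 0], [0, -1]], dtype=complex)

l = 1.0
probs = [1 - 3 * l / 4, l / 4, l / 4, l / 4]

rho = np.array([[0.7, 0.2 + 0.1j], [0.2 - 0.1j, 0.3]])  # arbitrary valid state
out = sum(p * (P @ rho @ P.conj().T) for p, P in zip(probs, [I, X, Y, Z]))

# For l = 1 the output is the maximally mixed state I/2, which is why the
# test checks for uniform measurement probabilities on the affected qubit.
assert np.allclose(out, I / 2)
```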
diff --git a/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_large_circuits.py b/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_large_circuits.py
new file mode 100644
index 0000000..5fcff62
--- /dev/null
+++ b/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_large_circuits.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""
+Trajectory-based simulation of noisy quantum channels.
+
+This test uses the TrajectorySim API to simulate how the expectation of a
+MaxCut cost observable changes under noise.
+"""
+
+import numpy as np
+import networkx as nx
+import pytest
+from .quantum_channels import (
+    bitflip_channel,
+    QuantumGates,
+)
+
+
+SEED = 10
+np.random.seed(SEED)
+
+
+@pytest.mark.parametrize("bitflip_p", [0.01, 0.08])
+@pytest.mark.parametrize("n_qubits", [10])
+def test_bitflip_maxcut_cost(trajectory_sim, bitflip_p, n_qubits):
+    """
+    Take a unitary operator U = exp(iC), diagonal in the computational basis,
+    and one of its eigenstates.
+
+    Apply the operator to the state with bitflip noise after each gate.
+
+    Evaluate the fidelity and the expectation value.
+    """
+    n_trajectories = 30
+    channel = bitflip_channel(bitflip_p)
+    G = nx.random_regular_graph(3, n_qubits)
+    init_cut_value, (init_flips, _) = nx.approximation.one_exchange(G, seed=SEED)
+    cost_dict = {}
+    for u, v in G.edges:
+        pstring = ["I"] * n_qubits
+        pstring[u] = "Z"
+        pstring[v] = "Z"
+        cost_dict["".join(pstring)] = 0.5
+
+    ensemble_dms = []
+    ensemble_exps = []
+    for sim in trajectory_sim.iterate_trajectories(n_trajectories):
+        # -- Prepare the initial state
+        for q in init_flips:
+            sim.apply_gate((q,), QuantumGates.X)
+        # -- Apply the operator
+        for u, v in G.edges:
+            gate = QuantumGates.eZZ.reshape((2, 2, 2, 2))
+            sim.apply_gate((u, v), gate)
+            # -- Apply noise after the gate
+            for q in (u, v):
+                sim.apply_channel((q,), channel)
+        # -- Calculate the expectation
+        exp = sim.expectation(cost_dict)
+        ensemble_exps.append(exp)
+        # -- Calculate the DM
+        dm = sim.rdm()
+        ensemble_dms.append(dm)
+
+    print(f"{ensemble_exps}")
+    avg_cost = G.number_of_edges() / 2 - np.mean(ensemble_exps)
+    print(f"{avg_cost=}")
+    print(f"{init_cut_value=}")
+    ensemble_dm = np.stack(ensemble_dms).mean(axis=0).reshape(2**n_qubits, 2**n_qubits)
+
+    # -- Reference values
+    for sim in trajectory_sim.iterate_trajectories(1):
+        for q in init_flips:
+            sim.apply_gate((q,), QuantumGates.X)
+        rdm_true = sim.rdm()
+    # --
+    # F = <\psi|\rho|\psi>
+    fidelity = np.trace(rdm_true.dot(ensemble_dm))
+    print(f"{fidelity=}")
+    n_noise_gates = 2 * G.number_of_edges()
+    expected_fidelity = (1 - bitflip_p) ** (n_noise_gates)
+    print(f"{expected_fidelity=}")
+    sigma_ = 1 / np.sqrt(n_trajectories)
+    print(f"{sigma_=}")
+    # - TODO: verify the scaling of fidelity under trajectories
+    assert np.abs(fidelity - expected_fidelity) < 2.5 * sigma_
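The cost_dict above encodes the standard MaxCut identity: with C = sum over edges (u,v) of (1 - Z_u Z_v)/2, the cut value equals E/2 minus half the sum of ZZ expectations, which is exactly how avg_cost is computed from the 0.5-weighted Pauli strings. A tiny classical sanity check of that identity on bitstrings (independent of the simulator):

```python
import numpy as np
import networkx as nx

def cut_value(G, bits):
    # number of edges whose endpoints land in different partitions
    return sum(1 for u, v in G.edges if bits[u] != bits[v])

def cost_from_zz(G, bits):
    # Z eigenvalue is +1 for bit 0 and -1 for bit 1
    z = 1 - 2 * np.asarray(bits)
    return G.number_of_edges() / 2 - 0.5 * sum(z[u] * z[v] for u, v in G.edges)

G = nx.random_regular_graph(3, 10, seed=1)
bits = np.random.default_rng(1).integers(0, 2, size=10)
assert cut_value(G, bits) == cost_from_zz(G, bits)
```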
diff --git a/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_mid_circuit_measurement.py b/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_mid_circuit_measurement.py
new file mode 100644
index 0000000..56668d5
--- /dev/null
+++ b/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_mid_circuit_measurement.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""
+Trajectory-based simulation of noisy quantum channels.
+
+This test uses the TrajectorySim API to simulate a 3-qubit repetition code
+with mid-circuit measurement and error correction.
+"""
+
+import numpy as np
+import pytest
+from .quantum_channels import QuantumGates, bitflip_channel
+
+
+np.random.seed(10)
+
+# This test file uses the `trajectory_sim` fixture defined in conftest.py
+
+
+@pytest.mark.parametrize("bitflip_p", [0.2, 0.6])
+# n_qubits is needed for the trajectory_sim fixture to create the state
+@pytest.mark.parametrize("n_qubits", [5])
+# mid-circuit measurement is only supported for MPS for now
+@pytest.mark.parametrize("state_algo", ["mps"])
+@pytest.mark.parametrize("init_state", [QuantumGates.I[0], QuantumGates.I[1]])
+def test_3qubit_parity(trajectory_sim, bitflip_p, init_state):
+    """
+    Test 3-qubit error correction with mid-circuit measurement.
+
+    Each data qubit passes through a bitflip channel (probability `bitflip_p`
+    of an X rotation) before every correction round.
+    """
+    n_trajectories = 300
+    # Data qubits of the repetition code (qubits 3 and 4 are ancillas)
+    n_qubits = 3
+
+    correction_rounds = 3
+    # Probability of getting more than 1 bitflip in one round
+    error_prob_one_round = 3 * bitflip_p**2 * (1 - bitflip_p) + bitflip_p**3
+    # An odd number of round-level errors gives a logical bit flip.
+    # For G(x) = \sum_{n=0}^{K} \binom{K}{n} p^n (1-p)^{K-n} x^n, G(-1) flips
+    # the sign of the odd terms, so (G(1) - G(-1))/2 sums the odd coefficients;
+    # with G(1) = 1 and G(-1) = (1 - 2p)^K, the flip probability follows:
+    error_prob = (1 - (1 - 2 * error_prob_one_round) ** correction_rounds) / 2
+    print("Error probability for one error correction round:", error_prob_one_round)
+    print("Error probability:", error_prob)
+
+    # normalize the initial state (the parametrized basis states already have unit norm)
+    init_state = init_state / np.linalg.norm(init_state)
+    orth_state = np.array([init_state[1], -init_state[0]])
+    init_gate = np.stack((init_state, orth_state)).astype("complex128").T
+    channel = bitflip_channel(bitflip_p)
+
+    ensemble_probs = []
+    for sim in trajectory_sim.iterate_trajectories(n_trajectories):
+        cx = lambda i, j: sim.apply_gate(
+            (j,),
+            QuantumGates.X,
+            control_modes=(i,),
+            control_values=(1,),
+        )
+        # Encode the initial state
+        sim.apply_gate((0,), init_gate)
+        # init_dm = state.compute_reduced_density_matrix((0, ))
+        cx(0, 1)
+        cx(0, 2)
+        for j in range(correction_rounds):
+            # -- Apply the noise channel to each data qubit
+            for i in range(n_qubits):
+                sim.apply_channel((i,), channel)
+
+            # -- Apply error correction
+
+            cx(0, 3)
+            cx(1, 3)
+            cx(1, 4)
+            cx(2, 4)
+
+            # -- Measure error syndromes
+
+            probs = sim.probs((3, 4))
+            assert np.allclose(probs.max(), 1)
+            syndrome = np.argmax(probs)
+            if syndrome == 0:
+                pass
+            elif syndrome == 2:
+                sim.apply_gate((0,), QuantumGates.X)
+            elif syndrome == 1:
+                sim.apply_gate((2,), QuantumGates.X)
+            elif syndrome == 3:
+                sim.apply_gate((1,), QuantumGates.X)
+
+            # -- Reset the ancillas
+            if syndrome == 0:
+                pass
+            elif syndrome == 2:
+                sim.apply_gate((3,), QuantumGates.X)
+            elif syndrome == 1:
+                sim.apply_gate((4,), QuantumGates.X)
+            elif syndrome == 3:
+                sim.apply_gate((3,), QuantumGates.X)
+                sim.apply_gate((4,), QuantumGates.X)
+
+            # -- Continue to the next round of error correction
+
+        probs = sim.probs((0, 1, 2))
+        ensemble_probs.append(probs)
+
+    ensemble_probs = np.stack(ensemble_probs).mean(axis=0)
+    print("Final state probs:")
+    for i, v in enumerate(ensemble_probs):
+        print(i, v)
+    state_probs = [ensemble_probs[0], ensemble_probs[-1]]
+    sigma_ = 1 / np.sqrt(n_trajectories)
+    _T = np.array([[1 - error_prob, error_prob], [error_prob, 1 - error_prob]])
+    expected_probs = _T.dot(init_state)
+    print("Expected probabilities:", expected_probs)
+    assert np.abs(state_probs[1] - expected_probs[1]) < 2.5 * sigma_
+    assert np.abs(state_probs[0] - expected_probs[0]) < 2.5 * sigma_
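The closed form used for error_prob above can be cross-checked by brute force: the probability of an odd number of independent round-level flips over K rounds equals (1 - (1 - 2q)^K)/2. A short numpy verification (q and K chosen arbitrarily):

```python
import numpy as np
from math import comb

q, K = 0.104, 3  # arbitrary per-round flip probability and round count

# brute force: sum the binomial weights over odd flip counts
brute = sum(comb(K, n) * q**n * (1 - q) ** (K - n) for n in range(1, K + 1, 2))

# generating-function shortcut used in the test
closed = (1 - (1 - 2 * q) ** K) / 2

assert np.isclose(brute, closed)
```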
diff --git a/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_onequbit_channel.py b/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_onequbit_channel.py
new file mode 100644
index 0000000..f325388
--- /dev/null
+++ b/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_onequbit_channel.py
@@ -0,0 +1,69 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""
+Trajectory-based simulation of noisy quantum channels.
+
+This test uses the TrajectorySim API to simulate one-qubit noise channels.
+"""
+
+import numpy as np
+import pytest
+from .quantum_channels import bitflip_channel, depolarizing_channel
+
+np.random.seed(10)
+
+# -- Tests
+
+# This test file uses the `trajectory_sim` fixture defined in conftest.py
+
+
+@pytest.mark.parametrize("bitflip_p", [0.1, 0.8])
+@pytest.mark.parametrize("n_qubits", [1])
+def test_bitflip_channel(trajectory_sim, bitflip_p):
+    n_trajectories = 300
+    channel = bitflip_channel(bitflip_p)
+
+    ensemble_probs = []
+    for sim in trajectory_sim.iterate_trajectories(n_trajectories):
+        sim.apply_channel((0,), channel)
+        prob = sim.probs((0,))
+        ensemble_probs.append(prob)
+
+    ensemble_probs = np.stack(ensemble_probs).mean(axis=0)
+    print("Bitflip ensemble probs ", ensemble_probs)
+    true_probs = np.array([1 - bitflip_p, bitflip_p])
+    # Use 2.5 sigma, which is about a 99% CI
+    sigma_ = 1 / np.sqrt(n_trajectories)
+    assert (np.abs(ensemble_probs - true_probs) < 2.5 * sigma_).all()
+
+
+@pytest.mark.parametrize("n_qubits", [1, 2])
+def test_depolarizing_channel(trajectory_sim, n_qubits):
+    """
+    For n_qubits > 1, pads the channel with identity on the left.
+    Checks that we get an I/2 state on the corresponding qubit.
+    """
+    n_trajectories = 300
+    error = 1.0
+    channel = depolarizing_channel(error)
+    if n_qubits > 1:
+        for _ in range(n_qubits - 1):
+            channel.mul_left(np.eye(2))
+
+    ensemble_probs = []
+    qubit_id = n_qubits - 1
+    for sim in trajectory_sim.iterate_trajectories(n_trajectories):
+        sim.apply_channel(tuple(range(n_qubits)), channel)
+        probs = sim.probs((qubit_id,))
+        ensemble_probs.append(probs)
+
+    # TODO: check that we don't have an I/2 DM on qubit 0
+    ensemble_probs = np.stack(ensemble_probs).mean(axis=0)
+    print("Depolarizing ensemble probs ", ensemble_probs)
+    true_probs = np.ones(2) / 2
+    # Use 2.5 sigma, which is about a 99% CI
+    sigma_ = 1 / np.sqrt(n_trajectories)
+    assert (np.abs(ensemble_probs - true_probs) < 2.5 * sigma_).all()
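The 2.5-sigma margin used throughout these tests is deliberately loose: each averaged probability is a mean of N Bernoulli-like samples, so its standard error is at most sqrt(0.25/N) = 1/(2 sqrt(N)), half of the sigma = 1/sqrt(N) the tests use. A quick empirical check of the resulting coverage (purely illustrative, not part of the suite; parameters are arbitrary):

```python
import numpy as np

rng = np.random.default_rng(0)
p, n_trajectories, repeats = 0.3, 300, 2000

# empirical means of n_trajectories Bernoulli(p) samples
means = rng.binomial(n_trajectories, p, size=repeats) / n_trajectories

sigma = 1 / np.sqrt(n_trajectories)  # bound used by the tests
coverage = np.mean(np.abs(means - p) < 2.5 * sigma)
print(f"{coverage=:.4f}")            # comfortably above 0.99 in practice
```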
diff --git a/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_quantum_volume_mid_circuit.py b/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_quantum_volume_mid_circuit.py
new file mode 100644
index 0000000..f6015c6
--- /dev/null
+++ b/python/tests/cuquantum_tests/cutensornet_tests/trajectories_noise/test_quantum_volume_mid_circuit.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""
+Trajectory-based simulation of noisy quantum channels.
+
+This test uses the TrajectorySim API to simulate quantum volume circuits.
+"""
+
+from qiskit.circuit.library import QuantumVolume
+from qiskit import transpile
+
+import numpy as np
+import pytest
+from cuquantum.cutensornet.experimental._internal.network_state_utils import (
+    STATE_DEFAULT_DTYPE,
+)
+from cuquantum.cutensornet._internal import utils
+from cuquantum.cutensornet.experimental.configuration import MPSConfig
+from cuquantum.cutensornet.configuration import NetworkOptions
+from cuquantum.cutensornet.circuit_converter import CircuitToEinsum
+
+from .network_state_wrap import TrajectorySim
+from .quantum_channels import (
+    depolarizing_channel,
+    QuantumChannel,
+)
+
+
+SEED = 10
+np.random.seed(SEED)
+n_variations = 10
+depth = 30
+
+# This test file uses the `trajectory_sim` fixture defined in conftest.py
+
+
+def get_qvolume_circuit(n_qubits, depth, seed=SEED):
+    circuit = QuantumVolume(n_qubits, depth, seed=seed)
+    circuit.measure_all()
+    circuit = transpile(circuit, basis_gates=["u3", "cx"], optimization_level=0)
+    return circuit
+
+
+def apply_circuit_with_noise(
+    ns: TrajectorySim,
+    circuit,
+    channel: QuantumChannel,
+    dtype=STATE_DEFAULT_DTYPE,
+    backend="numpy",
+    options=None,
+):
+    options = utils.check_or_create_options(NetworkOptions, options, "network options")
+    with utils.device_ctx(options.device_id):
+        converter = CircuitToEinsum(circuit, dtype=dtype, backend=backend)
+
+        for gate_operand, gate_qubits in converter.gates:
+            # all gate operands are assumed to be unitary
+            qubits_indices = [converter.qubits.index(q) for q in gate_qubits]
+            ns.apply_gate(qubits_indices, gate_operand)
+            for q in qubits_indices:
+                ns.apply_channel((q,), channel)
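test_quantum_volume below collapses the state after each measurement round by applying a diagonal projector |b><b| to every measured qubit. A minimal numpy sketch of that sample-then-collapse step on a plain statevector (independent of NetworkState; the function name and shapes are illustrative):

```python
import numpy as np

def measure_and_collapse(psi, qubit, n_qubits, rng):
    """Sample `qubit`, then project the statevector onto the observed outcome."""
    psi = psi.reshape((2,) * n_qubits)
    # marginal outcome probabilities for this qubit
    marg = (np.abs(psi) ** 2).sum(axis=tuple(ax for ax in range(n_qubits) if ax != qubit))
    bit = rng.choice(2, p=marg / marg.sum())
    # zero out the discarded branch (the projector |bit><bit| is non-unitary),
    # then renormalize the surviving amplitudes
    psi = np.moveaxis(psi, qubit, 0).copy()
    psi[1 - bit] = 0.0
    psi = np.moveaxis(psi, 0, qubit)
    return bit, (psi / np.linalg.norm(psi)).reshape(-1)

rng = np.random.default_rng(0)
psi = np.full(8, 1 / np.sqrt(8), dtype=complex)  # uniform 3-qubit state
bit, psi = measure_and_collapse(psi, qubit=1, n_qubits=3, rng=rng)
assert np.isclose(np.linalg.norm(psi), 1.0)
```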
+
+
+@pytest.mark.parametrize("n_qubits", [5])
+@pytest.mark.parametrize("bitflip_p", [0.01, 0.08])
+def test_quantum_volume(trajectory_sim, bitflip_p, n_qubits):
+    """
+    Apply a quantum volume circuit for M measurement rounds.
+
+    Measure the first two qubits at the end of each round and apply a
+    projector gate to collapse the state after the measurement.
+
+    Expect the mid-circuit measurements to be uniform.
+    """
+    n_trajectories = 30
+    circ_depth_per_round = 5
+    channel = depolarizing_channel(bitflip_p)
+    measurement_rounds = 2
+    measured_qubits = (0, 1)
+
+    ensemble_mid_probs = []
+    for sim in trajectory_sim.iterate_trajectories(n_trajectories):
+        for m in range(measurement_rounds):
+            circuit = get_qvolume_circuit(n_qubits, circ_depth_per_round, seed=SEED)
+            apply_circuit_with_noise(sim, circuit, channel)
+            probs = sim.probs(measured_qubits)
+            # -- Sample the measurement result
+            if measurement_rounds > 1:
+                # projector operators are non-unitary, so renormalize
+                probs /= probs.sum()
+            if isinstance(sim.ns.config, MPSConfig):
+                # MPS is an approximate algorithm, so renormalize
+                probs /= probs.sum()
+
+            ensemble_mid_probs.append(probs)
+            msmt = np.random.choice(np.arange(len(probs)), p=probs)
+            # -- Apply projectors based on the measurement result
+            for i, q in enumerate(measured_qubits):
+                projector = np.zeros((2, 2), dtype="complex128")
+                # unravel the flat outcome index over the measured qubits only
+                bit = np.unravel_index(msmt, (2,) * len(measured_qubits))[i]
+                projector[bit, bit] = 1
+                sim.apply_gate((q,), projector)
+            # --
+
+    mean_mid_probs = np.stack(ensemble_mid_probs).mean(axis=0)
+    print(f"{mean_mid_probs=}")
+    expect_mid_probs = np.ones_like(mean_mid_probs) / mean_mid_probs.size
+    sigma_ = 1 / np.sqrt(n_trajectories)
+    margin = 2.5 * sigma_
+    assert np.allclose(expect_mid_probs, mean_mid_probs, atol=margin)
diff --git a/python/tests/cuquantum_tests/densitymat_mpi_tests/test_state_compute_mpi.py b/python/tests/cuquantum_tests/densitymat_mpi_tests/test_state_compute_mpi.py
new file mode 100644
index 0000000..b0b6894
--- /dev/null
+++ b/python/tests/cuquantum_tests/densitymat_mpi_tests/test_state_compute_mpi.py
@@ -0,0 +1,341 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import cupy as cp
+import numpy as np
+import pytest
+from typing import Sequence
+from numbers import Number
+import gc
+
+from cuquantum.densitymat import DenseMixedState, DensePureState, WorkStream
+from cuquantum.cutensornet._internal import utils
+
+from mpi4py import MPI
+
+# TODO: Add tests for non-blocking execution, non-0 stream arg
+
+NUM_DEVICES = cp.cuda.runtime.getDeviceCount()
+
+
+def get_state(ctx, hilbert_space_dims, batch_size, package, dtype, mixed, init="random"):
+    assert package == cp
+    assert init == "random"
+    global_state_shape = (hilbert_space_dims * (2 if mixed else 1)) + (batch_size,)
+
+    comm = ctx.get_communicator()
+    global_state = np.zeros(global_state_shape, dtype=dtype)
+    if comm.Get_rank() == 0:
+        global_state += np.random.normal(0, 1, size=global_state_shape)
+        # dtype is a string here, so match complex dtypes by name
+        if "complex" in dtype:
+            global_state += 1j * np.random.normal(0, 1, size=global_state_shape)
+        global_state /= np.linalg.norm(global_state.reshape(-1, batch_size), axis=0)
+    comm.Bcast(global_state, root=0)
+    with cp.cuda.Device(ctx.device_id):
+        State = DensePureState if not mixed else DenseMixedState
+        state = State(ctx, hilbert_space_dims, batch_size, dtype)
+        size = state.storage_size
+        state.attach_storage(cp.empty(size, dtype=dtype))
+        state.view()[:] = cp.nan
+
+        local_shape, offsets = state.local_info
+        local_state = global_state.copy()
+        for ind in range(len(local_shape)):
+            local_state = np.take(
+                local_state, np.arange(offsets[ind], offsets[ind] + local_shape[ind]), axis=ind
+            )
+        gpu_local_state = cp.asarray(local_state, order="F")
+        state.view()[:] = gpu_local_state
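+        # At this point each rank holds only its slice of the broadcast global
+        # state: the np.take loop above selects, for every mode, the index
+        # window [offset, offset + extent) reported by local_info. As a sanity
+        # check (an assumption consistent with test_state_mpi.py, which asserts
+        # the same relation), the attached view matches the local slice shape:
+        assert state.view().shape == tuple(local_shape)
+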
+    return state, global_state
+
+
+class TestStateAPI:
+
+    def setup_method(self):
+        self.device_id = MPI.COMM_WORLD.Get_rank() % NUM_DEVICES
+        self.ctx = WorkStream(device_id=self.device_id)
+        self.ctx.set_communicator(comm=MPI.COMM_WORLD.Dup(), provider="MPI")
+        cp.cuda.Device(self.device_id).use()
+
+    def teardown_method(self):
+        gc.collect()
+        self.ctx = None
+
+    @pytest.mark.parametrize("hilbert_space", [(10,), (4, 6), (2, 3), (4,), (7,), (3, 3, 3)])
+    @pytest.mark.parametrize("package", [cp])
+    @pytest.mark.parametrize(
+        "dtype,batch_size,factors",
+        [
+            ("float32", 1, 2.0),
+            ("float64", 1, 2.0),
+            ("complex64", 1, 2.0),
+            ("complex128", 1, 2.0),
+            ("complex128", 1, 2.0 + 0.5j),
+            pytest.param("float64", 2, 2.0),
+            pytest.param("float64", 2, (2.0, 3.0)),
+            pytest.param("float64", 2, np.array([2.0, 3.0])),
+            pytest.param("float64", 2, cp.array([2.0, 3.0])),
+            pytest.param("complex128", 2, cp.array([2.0, 3.0], dtype="complex128")),
+            pytest.param("float32", 2, cp.array([np.sqrt(2.0), np.sqrt(3.0)])),
+            pytest.param("float64", 2, (2.0, 3.0, 4.0), marks=pytest.mark.xfail(raises=ValueError)),
+            pytest.param(
+                "float64",
+                2,
+                np.array([[1.0, 2.0], [3.0, 4.0]]),
+                marks=pytest.mark.xfail(raises=ValueError),
+            ),
+            pytest.param("float64", 2, {1.0, 2.0}, marks=pytest.mark.xfail(raises=TypeError)),
+            pytest.param("float64", 2, (1, 2 + 3j), marks=pytest.mark.xfail(raises=TypeError)),
+            pytest.param(
+                "float64",
+                2,
+                cp.array([1 + 2j, 3 + 4j]),
+                marks=pytest.mark.xfail(raises=TypeError),
+            ),
+        ],
+    )
+    @pytest.mark.parametrize("purity", ["PURE", "MIXED"])
+    def test_state_inplace_scale(self, hilbert_space, package, dtype, batch_size, factors, purity):
+        psi, global_state = get_state(
+            self.ctx,
+            hilbert_space,
+            batch_size,
+            package,
+            dtype,
+            init="random",
+            mixed=(purity == "MIXED"),
+        )
+        gc.collect()
+        shape, offsets = psi.local_info
+        psi_arr = psi.view().get()
+        assert not np.any(np.isnan(psi_arr))
+        psi.inplace_scale(factors)
+        scaled_psi_arr = psi.view().get()
+
+        assert not np.any(np.isnan(scaled_psi_arr))
+
+        if isinstance(factors, Sequence):
+            factors_np = np.array(factors, dtype=psi.dtype)
+        elif isinstance(factors, cp.ndarray):
+            factors_np = factors.get()
+        elif isinstance(factors, Number):
+            # single number: broadcast to the batch
+            factors_np = np.ones(batch_size) * factors
+        else:
+            # numpy array: use as-is
+            factors_np = factors
+        ref = np.einsum(
+            "...i,i->...i",
+            psi_arr,
+            factors_np[np.arange(offsets[-1], offsets[-1] + shape[-1])],
+        )
+        np.testing.assert_allclose(scaled_psi_arr, ref, rtol=1e-5, atol=1e-8)
+
+    @pytest.mark.parametrize("hilbert_space", [(10,), (4, 6), (3, 7)])
+    @pytest.mark.parametrize("package", [cp])
+    @pytest.mark.parametrize(
+        "dtype,batch_size,factors",
+        [
+            # ("float32", 1, 2.0),
+            ("float64", 1, 2.0),
+            ("complex64", 1, 2.0 + 3.0j),
+            ("complex128", 1, 2.0 + 3.0j),
+            ("float64", 2, 2.0),
+            ("float64", 2, (2.0, 3.0)),
+            ("float64", 2, np.array([2.0, 3.0])),
+            ("float64", 2, cp.array([2.0, 3.0])),
+            ("float32", 2, cp.array([np.sqrt(2.0), np.sqrt(3.0)])),
+            pytest.param("float64", 2, (2.0, 3.0, 4.0), marks=pytest.mark.xfail(raises=ValueError)),
+            pytest.param(
+                "float64",
+                2,
+                np.array([[1.0, 2.0], [3.0, 4.0]]),
+                marks=pytest.mark.xfail(raises=ValueError),
+            ),
+            pytest.param("float64", 2, {1.0, 2.0}, marks=pytest.mark.xfail(raises=TypeError)),
+            pytest.param("float64", 2, (1, 2 + 3j), marks=pytest.mark.xfail(raises=TypeError)),
+            # ("float32", 2, np.array([1.0, 2.0])),
+            pytest.param(
+                "float64",
+                2,
+                cp.array([1 + 2j, 3 + 4j]),
+                marks=pytest.mark.xfail(raises=TypeError),
+            ),
+        ],
+    )
+    @pytest.mark.parametrize("purity", ["PURE", "MIXED"])
+    def test_state_inplace_accumulate(
+        self, hilbert_space, package, dtype, batch_size, factors, purity
+    ):
+        psi1, _ = get_state(
+            self.ctx,
+            hilbert_space,
+            batch_size,
+            package,
+            dtype,
+            init="random",
+            mixed=(purity == "MIXED"),
+        )
+        shape, offsets = psi1.local_info
+        psi2, _ = get_state(
+            self.ctx,
+            hilbert_space,
+            batch_size,
+            package,
+            dtype,
+            init="random",
+            mixed=(purity == "MIXED"),
+        )
+        psi1_arr = psi1.view().get()
+        psi2_arr = psi2.view().get()
+        assert not np.any(np.isnan(psi1_arr))
+        assert not np.any(np.isnan(psi2_arr))
+        psi1.inplace_accumulate(psi2, factors)
+        accumulated_psi_arr = psi1.view().get()
+
+        if isinstance(factors, Sequence):
+            factors_np = np.array(factors, dtype=psi1.dtype)
+        elif isinstance(factors, cp.ndarray):
+            factors_np = factors.get()
+        elif isinstance(factors, Number):
+            # single number: broadcast to the batch
+            factors_np = np.ones(batch_size) * factors
+        else:
+            # numpy array: use as-is
+            factors_np = factors
+        ref = (
+            np.einsum(
+                "...i,i->...i",
+                psi2_arr,
+                factors_np[np.arange(offsets[-1], offsets[-1] + shape[-1])],
+            )
+            + psi1_arr
+        )
+        np.testing.assert_allclose(accumulated_psi_arr, ref, rtol=1e-5, atol=1e-8)
+
+    @pytest.mark.parametrize("hilbert_space", ((10,), (10, 2, 4), (3, 3, 3)))
+    @pytest.mark.parametrize("package", (cp,))
+    @pytest.mark.parametrize("dtype", ("float32", "float64", "complex64", "complex128"))
+    @pytest.mark.parametrize("batch_size", [1, 2])
+    @pytest.mark.parametrize("purity", ["PURE", "MIXED"])
+    def test_state_inner_product(self, hilbert_space, package, dtype, batch_size, purity):
+        psi1, _ = get_state(
+            self.ctx,
+            hilbert_space,
+            batch_size,
+            package,
+            dtype,
+            init="random",
+            mixed=(purity == "MIXED"),
+        )
+        psi2, _ = get_state(
+            self.ctx,
+            hilbert_space,
+            batch_size,
+            package,
+            dtype,
+            init="random",
+            mixed=(purity == "MIXED"),
+        )
+        psi1_arr = psi1.view().get()
+        psi2_arr = psi2.view().get()
+        slice_shape, offsets = psi1.local_info
+
+        inner_prod = psi1.inner_product(psi2)
+        inner_prod_arr = inner_prod.get()
+
+        psi1_arr = psi1_arr.reshape((-1, psi1_arr.shape[-1]), order="F")
+        psi2_arr = psi2_arr.reshape((-1, psi2_arr.shape[-1]), order="F")
+        ref = np.zeros((batch_size,), dtype=inner_prod.dtype)
+        reduced_ref = np.zeros((batch_size,), dtype=inner_prod.dtype)
+        local_batch_size = psi1.view().shape[-1]
+        for i in range(local_batch_size):
+            ref[offsets[-1] + i] = np.vdot(psi1_arr[:, i], psi2_arr[:, i])
+        comm = self.ctx.get_communicator()
+        comm.Allreduce(ref, reduced_ref)
+        np.testing.assert_allclose(inner_prod_arr, reduced_ref, rtol=1e-5, atol=1e-5)
+
+    @pytest.mark.parametrize("hilbert_space", ((10,), (10, 2, 4), (3, 3, 3)))
+    @pytest.mark.parametrize("package", (cp,))
+    @pytest.mark.parametrize("dtype", ("float32", "float64", "complex64", "complex128"))
+    @pytest.mark.parametrize("batch_size", [1, 2])
+    @pytest.mark.parametrize("purity", ("PURE", "MIXED"))
+    def test_state_norm(self, hilbert_space, package, dtype, batch_size, purity):
+        psi, global_state = get_state(
+            self.ctx,
+            hilbert_space,
+            batch_size,
+            package,
+            dtype,
+            mixed=(purity == "MIXED"),
+            init="random",
+        )
+        psi_arr = psi.view().get()
+        norm = psi.norm().get()
+        shape, offsets = psi.local_info
+        ref = np.zeros((batch_size,), dtype=psi.storage.real.dtype)
+        reduced_ref = np.zeros((batch_size,), dtype=norm.dtype)
+        local_batch_size = psi.view().shape[-1]
+        for i in range(local_batch_size):
+            ref[offsets[-1] + i] = np.vdot(psi_arr[..., i], psi_arr[..., i]).real
+        global_ref = np.zeros((batch_size,), dtype=norm.dtype)
+        for i in range(batch_size):
+            global_ref[i] = np.vdot(global_state[..., i], global_state[..., i]).real
+        global_state = None
+        comm = self.ctx.get_communicator()
+        # the all-reduced local sums should agree with the norms of the global state
+        comm.Allreduce(ref, reduced_ref)
+
+        np.testing.assert_allclose(norm, global_ref, rtol=1e-5, atol=1e-8)
diff --git a/python/tests/cuquantum_tests/densitymat_mpi_tests/test_state_mpi.py b/python/tests/cuquantum_tests/densitymat_mpi_tests/test_state_mpi.py
new file mode 100644
index 0000000..6c4a2f0
--- /dev/null
+++ b/python/tests/cuquantum_tests/densitymat_mpi_tests/test_state_mpi.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+from cuquantum.densitymat import (
+    WorkStream,
+    DensePureState,
+    DenseMixedState,
+)
+
+import cupy as cp
+import numpy as np
+import weakref
+import pytest
+from mpi4py import MPI
+
+NUM_DEVICES = cp.cuda.runtime.getDeviceCount()
+# TODO: mostly redundant with tests in test_context.py, consolidate in the future
+
+
+@pytest.mark.parametrize("hilbert_space_dims", [(2, 2, 2)])
+@pytest.mark.parametrize("batch_size", [1, 2, 3])
+@pytest.mark.parametrize("mixed", [True, False])
+@pytest.mark.parametrize("dtype", ["complex128", "complex64", "float64", "float32"])
+def test_creation(hilbert_space_dims, batch_size, mixed, dtype):
+    comm = MPI.COMM_WORLD
+    my_comm = comm.Dup()
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+    cp.cuda.Device(rank % NUM_DEVICES).use()
+    ctx = WorkStream(device_id=rank % NUM_DEVICES)
+    ctx.set_communicator(my_comm, provider="MPI")
+    State = DensePureState if not mixed else DenseMixedState
+    state = State(ctx, hilbert_space_dims, batch_size, dtype)
+    slice_shape, offsets = state.local_info
+    assert len(slice_shape) == len(offsets)
+    assert len(slice_shape) == 1 + len(hilbert_space_dims) * (2 if mixed else 1)
+    storage_size = state.storage_size
+    state_storage_buf = cp.zeros((storage_size,), dtype=state.dtype)
+    state.attach_storage(state_storage_buf)
+    state_view = state.view()
+    state_view[:] = cp.random.rand(*state_view.shape)
+    assert state_view.shape == slice_shape
+    state = None
+    state = State(ctx, hilbert_space_dims, batch_size, dtype)
+    state.allocate_storage()
+    assert state.storage.size == storage_size
+    assert state.storage.size == state.storage_size
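Both MPI state tests lean on the same contract: local_info returns the local slice shape plus per-mode offsets, so the rank-local view is the window global[o0:o0+s0, ..., ob:ob+sb] of the notional global tensor. A numpy-only sketch of extracting such a slice (illustrative shapes; no GPU or MPI needed), mirroring the np.take loop in get_state above:

```python
import numpy as np

def extract_local_slice(global_arr, local_shape, offsets):
    # select, per mode, the index window [offset, offset + extent)
    local = global_arr
    for axis, (off, ext) in enumerate(zip(offsets, local_shape)):
        local = np.take(local, np.arange(off, off + ext), axis=axis)
    return local

global_arr = np.arange(2 * 2 * 3).reshape(2, 2, 3)  # toy "global state"
local = extract_local_slice(global_arr, (1, 2, 3), (1, 0, 0))
assert local.shape == (1, 2, 3)
assert np.array_equal(local, global_arr[1:2, 0:2, 0:3])
```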
diff --git a/python/tests/cuquantum_tests/densitymat_mpi_tests/test_work_stream_mpi.py b/python/tests/cuquantum_tests/densitymat_mpi_tests/test_work_stream_mpi.py
new file mode 100644
index 0000000..3eb2f4b
--- /dev/null
+++ b/python/tests/cuquantum_tests/densitymat_mpi_tests/test_work_stream_mpi.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+from cuquantum.densitymat import WorkStream
+
+import cupy as cp
+import weakref
+import pytest
+from mpi4py import MPI
+
+NUM_DEVICES = cp.cuda.runtime.getDeviceCount()
+
+
+def test_work_stream_mpi():
+    comm = MPI.COMM_WORLD
+    my_comm = comm.Dup()
+    rank = comm.Get_rank()
+    size = comm.Get_size()
+
+    cp.cuda.Device(rank % NUM_DEVICES).use()
+    ctx = WorkStream(device_id=rank % NUM_DEVICES)
+    # use the duplicated communicator so library traffic cannot interfere
+    # with other users of MPI.COMM_WORLD
+    ctx.set_communicator(my_comm, provider="MPI")
+    assert size == ctx.get_num_ranks()
+    assert rank == ctx.get_proc_rank()
diff --git a/python/tests/cuquantum_tests/densitymat_tests/__init__.py b/python/tests/cuquantum_tests/densitymat_tests/__init__.py
new file mode 100644
index 0000000..977202e
--- /dev/null
+++ b/python/tests/cuquantum_tests/densitymat_tests/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+import cupy as cp
+
+# This is future-proofing: when CuPy enables cuQuantum Python as an optional
+# backend, we don't want to create a circular dependency that ultimately tests
+# against ourselves. Here we enable CUB as the only optional backend and
+# exclude cuTENSOR/cuQuantum Python/etc, using CuPy's private API
+# (for development/testing).
+cp._core.set_reduction_accelerators(["cub"])
+cp._core.set_routine_accelerators(["cub"])
diff --git a/python/tests/cuquantum_tests/densitymat_tests/test_elementary_operator.py b/python/tests/cuquantum_tests/densitymat_tests/test_elementary_operator.py
new file mode 100644
index 0000000..264aa92
--- /dev/null
+++ b/python/tests/cuquantum_tests/densitymat_tests/test_elementary_operator.py
@@ -0,0 +1,427 @@
+# Copyright (c) 2021-2024, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+"""Tests for ElementaryOperator."""
+
+from itertools import product
+
+import pytest
+import numpy as np
+import scipy as sp
+import cupy as cp
+from scipy.sparse import dia_matrix as sp_dia_matrix
+from cupyx.scipy.sparse import dia_matrix as cp_dia_matrix
+
+from cuquantum.densitymat import DenseOperator, MultidiagonalOperator
+
+
+np.random.seed(42)
+cp.random.seed(42)
+
+
+@pytest.fixture(scope="class")
+def callback_args():
+    t = 1.0
+    args = [1.0, 2.0, 3.0]
+    return t, args
+
+
+@pytest.fixture
+def dense_operator(request):
+    hilbert_space_dims, order, package, has_callback = request.param
+    shape = (*hilbert_space_dims, *hilbert_space_dims)
+    if has_callback:
+        data = package.empty(shape, order=order)
+
+        def callback(t, args):
+            _data = np.empty(shape, order=order)
+            for i in range(shape[0]):
+                for j in range(shape[1]):
+                    _data[i, j] = np.sin((i + 2 * j) * t * np.sum(args))
+            return _data
+
+    else:
+        data = package.asarray(np.random.rand(*shape), order=order)
+        callback = None
+    return DenseOperator(data, callback)
+
+
+@pytest.fixture
+def dense_operator_(request):
+    hilbert_space_dims, order, package, has_callback = request.param
+    shape = (*hilbert_space_dims, *hilbert_space_dims)
+    if has_callback:
+        data = package.empty(shape, order=order)
+
+        def callback(t, args):
+            _data = np.empty(shape, order=order)
+            for i in range(shape[0]):
+                for j in range(shape[1]):
+                    _data[i, j] = np.cos((i + 2 * j) * t * np.sum(args))
+            return _data
+
+    else:
+        data = package.asarray(np.random.rand(*shape), order=order)
+        callback = None
+    return DenseOperator(data, callback)
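The fixtures above exercise both static operators and time-dependent ones defined through a callback with signature callback(t, args), whose result is read back via to_array(t, args). A minimal sketch of that pattern (the operator definition is illustrative; the dimension and argument values are arbitrary):

```python
import numpy as np
from cuquantum.densitymat import DenseOperator

dim = 3

def callback(t, args):
    # time-dependent single-mode operator: a diagonal profile in t and args
    return np.diag(np.sin(np.arange(dim) * t * np.sum(args))).astype("float64")

op = DenseOperator(np.empty((dim, dim)), callback)

# the callback is evaluated when the array form is requested
arr = op.to_array(1.0, [1.0, 2.0, 3.0])
assert arr.shape == (dim, dim)
```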
+
+
+@pytest.fixture
+def multidiagonal_operator(request):
+    dim, num_diags, order, package, has_callback = request.param
+    if has_callback:
+        data = package.empty((dim, num_diags), order=order)
+        shape = data.shape
+
+        def callback(t, args):
+            _data = np.empty(shape, order=order)
+            for i in range(shape[0]):
+                for j in range(shape[1]):
+                    _data[i, j] = np.sin((i + 2 * j) * t * np.sum(args))
+            return _data
+
+    else:
+        data = package.asarray(np.random.random((dim, num_diags)), order=order)
+        callback = None
+    offsets = list(np.random.choice(range(-dim + 1, dim + 1), size=num_diags, replace=False))
+    return MultidiagonalOperator(data, offsets, callback=callback)
+
+
+@pytest.fixture
+def multidiagonal_operator_(request):
+    dim, num_diags, order, package, has_callback = request.param
+    if has_callback:
+        data = package.empty((dim, num_diags), order=order)
+        shape = data.shape
+
+        def callback(t, args):
+            _data = np.empty(shape, order=order)
+            for i in range(shape[0]):
+                for j in range(shape[1]):
+                    _data[i, j] = np.cos((i + 2 * j) * t * np.sum(args))
+            return _data
+
+    else:
+        # match the layout handling of the multidiagonal_operator fixture above
+        data = package.asarray(np.random.random((dim, num_diags)), order=order)
+        callback = None
+    offsets = list(np.random.choice(range(-dim + 1, dim + 1), size=num_diags, replace=False))
+    return MultidiagonalOperator(data, offsets, callback=callback)
+
+
+@pytest.fixture
+def dia_matrix(request):
+    dim, num_diags, dia_matrix_func, package = request.param
+    data = package.random.random((num_diags, dim))
+    offsets = list(np.random.choice(range(-dim + 1, dim + 1), size=num_diags, replace=False))
+    return dia_matrix_func((data, offsets), shape=(dim, dim))
+
+
+@pytest.mark.usefixtures("callback_args")
+@pytest.mark.parametrize(
+    "dense_operator",
+    list(
+        product(
+            [(3,), (3, 4)],
+            ["C", "F"],
+            [np, cp],
+            [False, True],
+        )
+    ),
+    indirect=True,
+)
+class TestDenseOperatorUnaryOperations:
+
+    @pytest.mark.parametrize("scalar", [2.3])
+    def test_left_scalar_multiplication(self, dense_operator, callback_args, scalar):
+        dense_op = dense_operator
+        t, args = callback_args
+
+        scaled_dense_op = scalar * dense_op
+
+        dense_op_arr = dense_op.to_array(t, args)
+        ref = scalar * dense_op_arr
+        np.testing.assert_allclose(scaled_dense_op.to_array(t, args), ref)
+
+    @pytest.mark.parametrize("scalar", [2.3])
+    def test_right_scalar_multiplication(self, dense_operator, callback_args, scalar):
+        dense_op = dense_operator
+        t, args = callback_args
+
+        scaled_dense_op = dense_op * scalar
+
+        dense_op_arr = dense_op.to_array(t, args)
+        ref = dense_op_arr * scalar
+        np.testing.assert_allclose(scaled_dense_op.to_array(t, args), ref)
+
+    def test_conjugate_transpose(self, dense_operator, callback_args):
+        dense_op = dense_operator
+        t, args = callback_args
+        dense_op_arr = dense_operator.to_array(t, args)
+
+        dense_op_dag = dense_op.dag()
+
+        n = dense_op_arr.ndim
+        indices = list(range(n // 2, n)) + list(range(n // 2))
+        ref = dense_op_arr.transpose(*indices).conj()
+        np.testing.assert_allclose(dense_op_dag.to_array(t, args), ref)
+
+
+@pytest.mark.parametrize(
+    "dense_operator,dense_operator_",
+    list(
+        product(
+            list(product([(3,)], ["C", "F"], [np], [False, True])),
+            list(product([(3,)], ["C", "F"], [np], [False, True])),
+        )
+    )
+    + list(
+        product(
+            list(product([(3,)], ["C", "F"], [cp], [False, True])),
+            list(product([(3,)], ["C", "F"], [cp], [False, True])),
+        )
+    ),
+    indirect=True,
+)
+class TestDenseOperatorBinaryOperations:
+
+    def test_addition(self, dense_operator, dense_operator_, callback_args):
+        dense_op1 = dense_operator
+        dense_op2 = dense_operator_
+        t, args = callback_args
+
+        dense_op_sum = dense_op1 + dense_op2
+
+        dense_op1_arr = dense_op1.to_array(t, args)
+        dense_op2_arr = dense_op2.to_array(t, args)
+        ref = dense_op1_arr + dense_op2_arr
+        np.testing.assert_allclose(dense_op_sum.to_array(t, args), ref)
+
+    def test_subtraction(self, dense_operator, dense_operator_, callback_args):
+        dense_op1 = dense_operator
+        dense_op2 = dense_operator_
+        t, args = callback_args
+
+        dense_op_diff = dense_op1 - dense_op2
+
+        dense_op1_arr =
dense_op1.to_array(t, args) + dense_op2_arr = dense_op2.to_array(t, args) + ref = dense_op1_arr - dense_op2_arr + np.testing.assert_allclose(dense_op_diff.to_array(t, args), ref) + + def test_matrix_multiplication(self, dense_operator, dense_operator_, callback_args): + dense_op1 = dense_operator + dense_op2 = dense_operator_ + t, args = callback_args + + dense_op_prod = dense_op1 @ dense_op2 + + if len(dense_op1.shape) == 2: + subscripts = "ab,bc->ac" + elif len(dense_op1.shape) == 4: + subscripts = "abcd,cdef->abef" + + dense_op1_arr = dense_op1.to_array(t, args) + dense_op2_arr = dense_op2.to_array(t, args) + ref = np.einsum(subscripts, dense_op1_arr, dense_op2_arr) + np.testing.assert_allclose(dense_op_prod.to_array(t, args), ref) + + +@pytest.mark.parametrize( + "multidiagonal_operator", + list(product([4], [3], ["C", "F"], [np, cp], [False, True])), + indirect=True, +) +class TestMultidiagonalOperatorUnaryOperations: + + @pytest.mark.parametrize("scalar", [2.3]) + def test_left_scalar_multiplication(self, multidiagonal_operator, callback_args, scalar): + dia_op = multidiagonal_operator + t, args = callback_args + dia_op_arr = dia_op.to_array(t, args) + + scaled_dia_op = scalar * dia_op + + ref = scalar * dia_op_arr + np.testing.assert_allclose(scaled_dia_op.to_array(t, args), ref) + + @pytest.mark.parametrize("scalar", [2.3]) + def test_right_scalar_multiplication(self, multidiagonal_operator, callback_args, scalar): + dia_op = multidiagonal_operator + t, args = callback_args + dia_op_arr = multidiagonal_operator.to_array(t, args) + + scaled_dia_op = dia_op * scalar + + ref = scalar * dia_op_arr + np.testing.assert_allclose(scaled_dia_op.to_array(t, args), ref) + + def test_conjugate_transpose(self, multidiagonal_operator, callback_args): + dia_op = multidiagonal_operator + t, args = callback_args + dia_op_arr = dia_op.to_array(t, args) + + dia_op_dag = dia_op.dag() + + ref = dia_op_arr.conj().T + np.testing.assert_allclose(dia_op_dag.to_array(t, args), ref) + + +@pytest.mark.parametrize( + "multidiagonal_operator,multidiagonal_operator_", + list( + product( + list(product([4], [3], ["C", "F"], [np], [False, True])), + list(product([4], [2], ["C", "F"], [np], [False, True])), + ) + ) + + list( + product( + list(product([4], [3], ["C", "F"], [cp], [False, True])), + list(product([4], [2], ["C", "F"], [cp], [False, True])), + ) + ), + indirect=True, +) +class TestMultidiagonalOperatorBinaryOperations: + + def test_addition(self, multidiagonal_operator, multidiagonal_operator_, callback_args): + dia_op1 = multidiagonal_operator + dia_op2 = multidiagonal_operator_ + t, args = callback_args + dia_op1_arr = dia_op1.to_array(t, args) + dia_op2_arr = dia_op2.to_array(t, args) + + dia_op_sum = dia_op1 + dia_op2 + + ref = dia_op1_arr + dia_op2_arr + np.testing.assert_allclose(dia_op_sum.to_array(t, args), ref) + + def test_subtraction(self, multidiagonal_operator, multidiagonal_operator_, callback_args): + dia_op1 = multidiagonal_operator + dia_op2 = multidiagonal_operator_ + t, args = callback_args + dia_op1_arr = dia_op1.to_array(t, args) + dia_op2_arr = dia_op2.to_array(t, args) + + dia_op_diff = dia_op1 - dia_op2 + + ref = dia_op1_arr - dia_op2_arr + np.testing.assert_allclose(dia_op_diff.to_array(t, args), ref) + + def test_matrix_multiplication( + self, multidiagonal_operator, multidiagonal_operator_, callback_args + ): + dia_op1 = multidiagonal_operator + dia_op2 = multidiagonal_operator_ + t, args = callback_args + dia_op1_arr = dia_op1.to_array(t, args) + dia_op2_arr = 
dia_op2.to_array(t, args) + + dia_op_prod = dia_op1 @ dia_op2 + + ref = dia_op1_arr @ dia_op2_arr + np.testing.assert_allclose(dia_op_prod.to_array(t, args), ref) + + +@pytest.mark.parametrize( + "dense_operator,multidiagonal_operator", + list( + product( + list(product([(4,)], ["C", "F"], [np], [False, True])), + list(product([4], [2], ["C", "F"], [np], [False, True])), + ) + ) + + list( + product( + list(product([(4,)], ["C", "F"], [cp], [False, True])), + list(product([4], [2], ["C", "F"], [cp], [False, True])), + ) + ), + indirect=True, +) +class TestMixedOperations: + + def test_dense_multidiagonal_addition( + self, dense_operator, multidiagonal_operator, callback_args + ): + dense_op = dense_operator + dia_op = multidiagonal_operator + t, args = callback_args + dense_op_arr = dense_op.to_array(t, args) + dia_op_arr = dia_op.to_array(t, args) + + dense_dia_op_sum = dense_op + dia_op + + ref = dense_op_arr + dia_op_arr + np.testing.assert_allclose(dense_dia_op_sum.to_array(t, args), ref) + + def test_multidiagonal_dense_addition( + self, dense_operator, multidiagonal_operator, callback_args + ): + dense_op = dense_operator + dia_op = multidiagonal_operator + t, args = callback_args + dense_op_arr = dense_op.to_array(t, args) + dia_op_arr = dia_op.to_array(t, args) + + dense_dia_op_sum = dia_op + dense_op + + ref = dense_op_arr + dia_op_arr + np.testing.assert_allclose(dense_dia_op_sum.to_array(t, args), ref) + + def test_dense_multidiagonal_subtraction( + self, dense_operator, multidiagonal_operator, callback_args + ): + dense_op = dense_operator + dia_op = multidiagonal_operator + t, args = callback_args + dense_op_arr = dense_op.to_array(t, args) + dia_op_arr = dia_op.to_array(t, args) + + dense_dia_op_diff = dense_op - dia_op + + ref = dense_op_arr - dia_op_arr + np.testing.assert_allclose(dense_dia_op_diff.to_array(t, args), ref) + + def test_multidiagonal_dense_subtraction( + self, dense_operator, multidiagonal_operator, callback_args + ): + dense_op = dense_operator + dia_op = multidiagonal_operator + t, args = callback_args + dense_op_arr = dense_op.to_array(t, args) + dia_op_arr = dia_op.to_array(t, args) + + dense_dia_op_diff = dia_op - dense_op + + ref = dia_op_arr - dense_op_arr + np.testing.assert_allclose(dense_dia_op_diff.to_array(t, args), ref) + + def test_dense_multidiagonal_matrix_multiplication( + self, dense_operator, multidiagonal_operator, callback_args + ): + dense_op = dense_operator + dia_op = multidiagonal_operator + t, args = callback_args + dense_op_arr = dense_op.to_array(t, args) + dia_op_arr = dia_op.to_array(t, args) + + dense_dia_op_prod = dense_op @ dia_op + + ref = dense_op_arr @ dia_op_arr + np.testing.assert_allclose(dense_dia_op_prod.to_array(t, args), ref) + + def test_multidiagonal_dense_matrix_multiplication( + self, dense_operator, multidiagonal_operator, callback_args + ): + dense_op = dense_operator + dia_op = multidiagonal_operator + t, args = callback_args + dense_op_arr = dense_op.to_array(t, args) + dia_op_arr = dia_op.to_array(t, args) + + dense_dia_op_prod = dia_op @ dense_op + + ref = dia_op_arr @ dense_op_arr + np.testing.assert_allclose(dense_dia_op_prod.to_array(t, args), ref) diff --git a/python/tests/cuquantum_tests/densitymat_tests/test_operators.py b/python/tests/cuquantum_tests/densitymat_tests/test_operators.py new file mode 100644 index 0000000..b88af57 --- /dev/null +++ b/python/tests/cuquantum_tests/densitymat_tests/test_operators.py @@ -0,0 +1,130 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# 
SPDX-License-Identifier: BSD-3-Clause + +import numpy as np +import cupy as cp +import pytest + +from cuquantum.densitymat import ( + tensor_product, + DenseOperator, + MultidiagonalOperator, + Operator, + OperatorAction, + WorkStream, +) + + +@pytest.fixture +def work_stream(): + np.random.seed(42) + cp.random.seed(42) + return WorkStream() + + +def get_dia_example(hilbert_space_dims): + A = DenseOperator(np.random.rand(*((hilbert_space_dims[2],) * 2))) + B = MultidiagonalOperator(np.random.rand(hilbert_space_dims[3], 3), [-1, 0, 1]) + + ab = tensor_product((A, (2,)), (B, (3,)), coeff=lambda t, args: np.sin(args[0] * t)) + ab2 = tensor_product((A, (2,)), (B, (3,)), coeff=2) + return ab, ab2 + + +def get_dense_example(hilbert_space_dims): + A = DenseOperator(np.random.rand(*((hilbert_space_dims[2],) * 2))) + B = DenseOperator(np.random.rand(*((hilbert_space_dims[3], hilbert_space_dims[5]) * 2))) + + ab = tensor_product((A, (2,)), (B, (3, 5)), coeff=lambda t, args: np.sin(args[0] * t)) + ab2 = tensor_product((A, (2,)), (B, (3, 5)), coeff=2) + return ab, ab2 + + +class TestOperators: + + @pytest.mark.parametrize("hilbert_space_dims", [(4, 5, 2, 6, 3, 7)]) + @pytest.mark.parametrize("example_getter", [get_dense_example, get_dia_example]) + def test_operator_term(self, hilbert_space_dims, example_getter): + a, b = example_getter(hilbert_space_dims) + c, d = example_getter(hilbert_space_dims) + a_times_b = self._test_multiplication_operatorterms(a, b) + c_times_d = self._test_multiplication_operatorterms(c, d) + a_plus_b = self._test_addition_operatorterms(a, b) + c_plus_d = self._test_addition_operatorterms(c, d) + a_plus_b_times_c_plus_d = self._test_multiplication_operatorterms(a_plus_b, c_plus_d) + a_times_b_plus_c_times_d = self._test_addition_operatorterms(a_times_b, c_times_d) + assert len(a_plus_b_times_c_plus_d.terms) == 4 + assert len(a_times_b_plus_c_times_d.terms) == 2 + + # test dag method + ops = [] + for op in a.terms[0][::-1]: + ops.append(op) + adag = a.dag() + ops_dag = [] + for op in adag.terms[0]: + ops_dag.append(op) + for op, op_dag in zip(ops, ops_dag): + n = len(op.shape) + indices = list(range(n // 2, n)) + list(range(n // 2)) + np.testing.assert_allclose(op.to_array().transpose(*indices).conjugate(), op_dag.to_array()) + + # test dual method + ops = [] + for op in a.terms[0][::-1]: + ops.append(op) + adual = a.dual() + ops_dual = [] + for op in adual.terms[0]: + ops_dual.append(op) + assert ops == ops_dual + + @pytest.mark.parametrize("hilbert_space_dims", [(4, 5, 2, 6, 3, 7)]) + @pytest.mark.parametrize("example_getter", [get_dense_example, get_dia_example]) + def test_tensor_product(self, work_stream, hilbert_space_dims, example_getter): + ctx = work_stream + + a, b = example_getter(hilbert_space_dims) + # test addition + self._test_addition_operatorterms(a, b) + + # test out-of-place multiplication + a_scaled = a * 2 + a_scaled = 2 * a_scaled + assert a._coefficients[0].scalar * 4 == a_scaled._coefficients[0].scalar + + # test scalar operators + iden = tensor_product(dtype="float64") + scaled_iden = tensor_product(coeff=2.0, dtype="float64") + two_ids = iden + scaled_iden + general = iden + example_getter(hilbert_space_dims)[0] + iden._maybe_instantiate(ctx, hilbert_space_dims) + two_ids._maybe_instantiate(ctx, hilbert_space_dims) + general._maybe_instantiate(ctx, hilbert_space_dims) + + def _test_addition_operatorterms(self, a, b): + ab = a + b + assert len(ab.terms) == len(a.terms) + len(b.terms) + return ab + + def _test_multiplication_operatorterms(self, a, b): 
+        ab = a * b
+        assert len(ab.terms) == len(a.terms) * len(b.terms)
+        counter = 0
+        for term_a in a.terms:
+            for term_b in b.terms:
+                # each product term concatenates the factors of both operands
+                assert ab.terms[counter][: len(term_a)] == term_a
+                assert ab.terms[counter][len(term_a) :] == term_b
+                counter += 1
+        return ab
+
+    @pytest.mark.parametrize("hilbert_space_dims", [(4, 5, 2, 6, 3, 7)])
+    @pytest.mark.parametrize("n_ops", [1, 2])
+    def test_operator_action(self, work_stream, hilbert_space_dims, n_ops):
+        ctx = work_stream
+        ops = []
+        for _ in range(n_ops):
+            a, b = get_dense_example(hilbert_space_dims)
+            ops.append(Operator(hilbert_space_dims, (a, np.random.rand()), (b, np.random.rand())))
+        OperatorAction(ctx, ops)
diff --git a/python/tests/cuquantum_tests/densitymat_tests/test_state.py b/python/tests/cuquantum_tests/densitymat_tests/test_state.py
new file mode 100644
index 0000000..aef1051
--- /dev/null
+++ b/python/tests/cuquantum_tests/densitymat_tests/test_state.py
@@ -0,0 +1,283 @@
+# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES
+#
+# SPDX-License-Identifier: BSD-3-Clause
+
+from typing import Sequence
+from itertools import product
+
+import cupy as cp
+import numpy as np
+import pytest
+from cuquantum.densitymat import WorkStream, DensePureState, DenseMixedState
+from cuquantum.cutensornet._internal import utils
+
+
+def get_state(ctx, hilbert_space_dims, batch_size, package, dtype, init="random", mixed=False):
+    ctor_args = (ctx, hilbert_space_dims, batch_size, dtype)
+    state = DenseMixedState(*ctor_args) if mixed else DensePureState(*ctor_args)
+    shape, offsets = state.local_info
+
+    _state = package.empty(shape, dtype=dtype, order="F")
+    if init == "random":
+        _state[:] = package.random.rand(*_state.shape) - 0.5
+        if "complex" in dtype:
+            _state[:] += 1j * (package.random.rand(*_state.shape) - 0.5)
+    elif init == "zeros":
+        _state[:] = 0.0
+    state.attach_storage(_state)
+    return state
+
+
+@pytest.fixture
+def work_stream():
+    # NOTE: If random seeds are set at module or class level, some single-precision tests fail
+    np.random.seed(42)
+    cp.random.seed(42)
+    return WorkStream()
+
+
+@pytest.fixture(params=list(product([(2,), (2, 3)], [cp], ["random"], [True, False])))
+def state(request, work_stream):
+    hilbert_space_dims, package, init, mixed = request.param
+
+    def _state(batch_size, dtype):
+        return get_state(work_stream, hilbert_space_dims, batch_size, package, dtype, init, mixed)
+
+    return _state
+
+
+class TestState:
+
+    @pytest.mark.parametrize("batch_size,factors", [(1, 2.0)])
+    @pytest.mark.parametrize("dtype", ["float32", "float64", "complex64", "complex128"])
+    def test_inplace_scale_different_dtypes(self, state, batch_size, dtype, factors):
+        psi = state(batch_size, dtype)
+        psi_arr = psi.storage.get()
+
+        psi.inplace_scale(factors)
+        scaled_psi_arr = psi.storage.get()
+
+        factors_np = self._return_numpy_factors(factors, dtype)
+        ref = psi_arr * factors_np
+        np.testing.assert_allclose(scaled_psi_arr, ref)
+
+    @pytest.mark.parametrize("batch_size,dtype", [(2, "float64")])
+    @pytest.mark.parametrize(
+        "factors", [2.0, (2.0, 3.0), np.array([2.0, 3.0]), cp.array([2.0, 3.0])]
+    )
+    def test_inplace_scale_different_factors(self, state, batch_size, dtype, factors):
+        psi = state(batch_size, dtype)
+        psi_arr = psi.storage.get()
+
+        psi.inplace_scale(factors)
+        scaled_psi_arr = psi.storage.get()
+
+        factors_np = self._return_numpy_factors(factors, dtype)
+        ref = psi_arr * factors_np
+        np.testing.assert_allclose(scaled_psi_arr, ref)
+
+    @pytest.mark.parametrize("dtype", ["float64"])
+    @pytest.mark.parametrize(
+        "batch_size,factors",
+ [ + (2, (1.0, 2.0, 3.0)), + (2, np.array([[1.0, 2.0], [3.0, 4.0]])), + ], + ) + def test_inplace_scale_improper_factors_shape(self, state, batch_size, dtype, factors): + psi = state(batch_size, dtype) + with pytest.raises(ValueError): + psi.inplace_scale(factors) + + @pytest.mark.parametrize("dtype", ["float64"]) + @pytest.mark.parametrize( + "batch_size,factors", + [ + (2, {1.0, 2.0}), + (2, (1, 2 + 3j)), + ], + ) + def test_inplace_scale_improper_factors_type(self, state, batch_size, dtype, factors): + psi = state(batch_size, dtype) + with pytest.raises(TypeError): + psi.inplace_scale(factors) + + @pytest.mark.parametrize("batch_size", [2]) + @pytest.mark.parametrize( + "dtype,factors", + [ + ("complex128", np.array([1.0, 2.0 + 3.0j])), + ("float32", cp.array([2.0, 3.0], dtype="float64")), + pytest.param( + "float64", cp.array([1 + 2j, 3 + 4j]), marks=pytest.mark.xfail(raises=TypeError) + ), + ], + ) + def test_inplace_scale_dtype_factors_compatibility(self, state, batch_size, dtype, factors): + psi = state(batch_size, dtype) + psi.inplace_scale(factors) + + @pytest.mark.parametrize("batch_size,factors", [(1, 2.0)]) + @pytest.mark.parametrize("dtype", ["float32", "float64", "complex64", "complex128"]) + def test_inplace_accumulate_different_dtypes(self, state, batch_size, dtype, factors): + psi1 = state(batch_size, dtype) + psi2 = psi1.clone(cp.zeros_like(psi1.storage)) + psi2.storage[:] = cp.random.rand(*psi2.storage.shape) + psi1_arr = psi1.storage.get() + psi2_arr = psi2.storage.get() + + psi1.inplace_accumulate(psi2, factors) + accumulated_psi_arr = psi1.storage.get() + + factors_np = self._return_numpy_factors(factors, dtype) + ref = factors_np * psi2_arr + psi1_arr + np.testing.assert_allclose(accumulated_psi_arr, ref) + + @pytest.mark.parametrize("batch_size,dtype", [(2, "float64")]) + @pytest.mark.parametrize( + "factors", [2.0, (2.0, 3.0), np.array([2.0, 3.0]), cp.array([2.0, 3.0])] + ) + def test_inplace_accumulate_different_factors(self, state, batch_size, dtype, factors): + psi1 = state(batch_size, dtype) + psi2 = psi1.clone(cp.zeros_like(psi1.storage)) + psi2.storage[:] = cp.random.rand(*psi2.storage.shape) + psi1_arr = psi1.storage.get() + psi2_arr = psi2.storage.get() + + psi1.inplace_accumulate(psi2, factors) + accumulated_psi_arr = psi1.storage.get() + + factors_np = self._return_numpy_factors(factors, dtype) + ref = factors_np * psi2_arr + psi1_arr + np.testing.assert_allclose(accumulated_psi_arr, ref) + + @pytest.mark.parametrize("dtype", ["float64"]) + @pytest.mark.parametrize( + "batch_size,factors", + [ + (2, (1.0, 2.0, 3.0)), + (2, np.array([[1.0, 2.0], [3.0, 4.0]])), + ], + ) + def test_inplace_accumulate_improper_factors_shape(self, state, batch_size, dtype, factors): + psi1 = state(batch_size, dtype) + psi2 = psi1.clone(cp.zeros_like(psi1.storage)) + with pytest.raises(ValueError): + psi1.inplace_accumulate(psi2, factors) + + @pytest.mark.parametrize("dtype", ["float64"]) + @pytest.mark.parametrize( + "batch_size,factors", + [ + (1, 1 + 2j), + (2, {1.0, 2.0}), + (2, (1, 2 + 3j)), + ], + ) + def test_inplace_accumulate_improper_factors_type(self, state, batch_size, dtype, factors): + psi1 = state(batch_size, dtype) + psi2 = psi1.clone(cp.zeros_like(psi1.storage)) + with pytest.raises(TypeError): + psi1.inplace_accumulate(psi2, factors) + + @pytest.mark.parametrize("batch_size", [2]) + @pytest.mark.parametrize( + "dtype,factors", + [ + ("complex128", np.array([1.0, 2.0 + 3.0j])), + ("float32", cp.array([2.0, 3.0], dtype="float64")), + pytest.param( + "float64", 
cp.array([1 + 2j, 3 + 4j]), marks=pytest.mark.xfail(raises=TypeError) + ), + ], + ) + def test_inplace_accumulate_dtype_factors_compatibility( + self, state, batch_size, dtype, factors + ): + psi1 = state(batch_size, dtype) + psi2 = psi1.clone(cp.zeros_like(psi1.storage)) + psi1.inplace_accumulate(psi2, factors) + + @pytest.mark.parametrize("batch_size", [1, 2]) + @pytest.mark.parametrize("dtype", ["float32", "float64", "complex64", "complex128"]) + def test_state_inner_product(self, state, batch_size, dtype): + psi1 = state(batch_size, dtype) + psi2 = psi1.clone(cp.zeros_like(psi1.storage)) + psi2.storage[:] = cp.random.rand(*psi2.storage.shape) + psi1_arr = psi1.storage.get() + psi2_arr = psi2.storage.get() + + inner_prod = psi1.inner_product(psi2) + inner_prod_arr = inner_prod.get() + + psi1_arr = psi1_arr.reshape((-1, psi1_arr.shape[-1]), order="F") + psi2_arr = psi2_arr.reshape((-1, psi2_arr.shape[-1]), order="F") + ref = np.zeros((batch_size,), dtype=inner_prod.dtype) + for i in range(batch_size): + ref[i] = np.vdot(psi1_arr[:, i], psi2_arr[:, i]) + + np.testing.assert_allclose(inner_prod_arr, ref, rtol=1e-5, atol=1e-8) + + @pytest.mark.parametrize("batch_size", [1, 2]) + @pytest.mark.parametrize("dtype", ["float32", "float64", "complex64", "complex128"]) + def test_norm(self, state, batch_size, dtype): + psi = state(batch_size, dtype) + psi_arr = psi.storage.get() + norm = psi.norm().get() + + psi_arr = psi_arr.reshape((-1, psi_arr.shape[-1]), order="F") + ref = np.empty((batch_size,), dtype=psi.storage.real.dtype) + for i in range(batch_size): + ref[i] = np.vdot(psi_arr[:, i], psi_arr[:, i]).real + + np.testing.assert_allclose(norm, ref, rtol=1e-5, atol=1e-8) + + @pytest.mark.parametrize("batch_size", [1, 2]) + @pytest.mark.parametrize("dtype", ["float32", "float64", "complex64", "complex128"]) + def test_trace(self, state, batch_size, dtype): + psi = state(batch_size, dtype) + psi_arr = psi.storage.get() + + trace = psi.trace() + + ref = cp.empty((batch_size,), dtype=dtype) + if isinstance(psi, DensePureState): + psi_arr = psi_arr.reshape((-1, psi_arr.shape[-1]), order="F") + for i in range(batch_size): + ref[i] = np.vdot(psi_arr[:, i], psi_arr[:, i]) + else: + psi_arr = psi_arr.reshape( + (np.prod(psi.hilbert_space_dims), np.prod(psi.hilbert_space_dims), batch_size), + order="F", + ) + for i in range(batch_size): + ref[i] = cp.trace(psi_arr[:, :, i]) + + cp.testing.assert_allclose(trace, ref, rtol=1e-5, atol=1e-8) + + @pytest.mark.parametrize("batch_size", [1, 2]) + @pytest.mark.parametrize("dtype", ["float32", "float64", "complex64", "complex128"]) + def test_attach_storage(self, state, batch_size, dtype): + psi = state(batch_size, dtype) + shape, _ = psi.local_info + with utils.device_ctx(psi._ctx.device_id): + psi_arr = cp.zeros(shape, dtype=dtype, order="F") + psi_arr_wrong_shape = cp.zeros([x + 1 for x in shape], dtype=dtype, order="F") + psi_arr_c_order = cp.zeros(shape, dtype=dtype, order="C") + + psi.attach_storage(psi_arr) + with pytest.raises(ValueError): + psi.clone(psi_arr_wrong_shape) + if len(psi.hilbert_space_dims) > 1: + with pytest.raises(ValueError): + psi.clone(psi_arr_c_order) + + @staticmethod + def _return_numpy_factors(factors, dtype): + if isinstance(factors, Sequence): + factors_np = np.array(factors, dtype=dtype) + elif isinstance(factors, cp.ndarray): + factors_np = factors.get() + else: # single number or numpy array + factors_np = factors + return factors_np diff --git a/python/tests/cuquantum_tests/densitymat_tests/test_work_stream.py 
b/python/tests/cuquantum_tests/densitymat_tests/test_work_stream.py new file mode 100644 index 0000000..8d524cd --- /dev/null +++ b/python/tests/cuquantum_tests/densitymat_tests/test_work_stream.py @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + + +import weakref + +import cupy as cp +import pytest + +from cuquantum.densitymat import WorkStream + + +class TestWorkStream: + + def test_default(self): + ctx = WorkStream() + assert ctx.device_id == 0 + assert ctx.blocking == True + assert ctx._valid_state + assert ctx._handle._valid_state + + def test_set_memory_limit(self): + ctx = WorkStream(memory_limit="75%") + assert ctx.memory_limit == "75%" + + def test_set_stream(self): + ctx = WorkStream(stream=cp.cuda.Stream(4)) + assert ctx.stream == cp.cuda.Stream(4) + + @pytest.mark.skipif(cp.cuda.runtime.getDeviceCount() < 2, reason="not enough GPUs") + def test_multiple_devices(self): + with cp.cuda.Device(1): + ctx_default = WorkStream() + ctx_explicit = WorkStream(device_id=cp.cuda.Device().id) + assert ctx_default.device_id == 0 + assert ctx_explicit.device_id == 1 + + def test_handle_reference(self): + ctx = WorkStream() + ref = weakref.ref(ctx._handle) + ctx = None + assert ref() is None diff --git a/python/tests/requirements.txt b/python/tests/requirements.txt index 261c918..21e568d 100644 --- a/python/tests/requirements.txt +++ b/python/tests/requirements.txt @@ -9,3 +9,5 @@ qiskit qiskit-aer pylatexenc jsonschema == 4.17.3 +scipy +networkx \ No newline at end of file diff --git a/python/tests/samples_tests/densitymat_tests/__init__.py b/python/tests/samples_tests/densitymat_tests/__init__.py new file mode 100644 index 0000000..808298f --- /dev/null +++ b/python/tests/samples_tests/densitymat_tests/__init__.py @@ -0,0 +1,3 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause diff --git a/python/tests/samples_tests/densitymat_tests/test_cudensitymat_samples.py b/python/tests/samples_tests/densitymat_tests/test_cudensitymat_samples.py new file mode 100644 index 0000000..945fa91 --- /dev/null +++ b/python/tests/samples_tests/densitymat_tests/test_cudensitymat_samples.py @@ -0,0 +1,21 @@ +# Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES +# +# SPDX-License-Identifier: BSD-3-Clause + +import glob +import os + +import pytest + +from ..test_utils import run_sample + +samples_path = os.path.join( + os.path.dirname(__file__), '..', '..', '..', 'samples', 'densitymat') +sample_files = glob.glob(samples_path+'/**/*.py', recursive=True) + + +@pytest.mark.parametrize("sample", sample_files) +class TestcuDensityMatSamples: + + def test_sample(self, sample): + run_sample(samples_path, sample) diff --git a/samples/cudensitymat/helpers.h b/samples/cudensitymat/helpers.h new file mode 100644 index 0000000..d8e663d --- /dev/null +++ b/samples/cudensitymat/helpers.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. 
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#pragma once
+
+#include <cudensitymat.h>
+
+#include <complex>
+#include <vector>
+#include <iostream>
+
+
+/** Error handling macro definitions */
+
+#define HANDLE_CUDA_ERROR(x) \
+{ \
+  const auto err = x; \
+  if (err != cudaSuccess) \
+  { \
+    const char * error = cudaGetErrorString(err); \
+    printf("CUDA Error: %s in line %d\n", error, __LINE__); \
+    fflush(stdout); \
+    std::abort(); \
+  } \
+};
+
+#define HANDLE_CUDM_ERROR(x) \
+{ \
+  const auto err = x; \
+  if (err != CUDENSITYMAT_STATUS_SUCCESS) \
+  { \
+    printf("cuDensityMat error in line %d\n", __LINE__); \
+    fflush(stdout); \
+    std::abort(); \
+  } \
+};
+
+#ifdef MPI_ENABLED
+#define HANDLE_MPI_ERROR(x) \
+{ \
+  const auto err = x; \
+  if (err != MPI_SUCCESS) \
+  { \
+    char error[MPI_MAX_ERROR_STRING]; \
+    int len; \
+    MPI_Error_string(err, error, &len); \
+    printf("MPI Error: %s in line %d\n", error, __LINE__); \
+    fflush(stdout); \
+    MPI_Abort(MPI_COMM_WORLD, err); \
+  } \
+};
+#endif
+
+
+/** Helper function for creating an array copy in GPU memory:
+ *  RealComplexType = {float, double, std::complex<float>, std::complex<double>} */
+template <typename RealComplexType>
+void * createArrayGPU(const std::vector<RealComplexType> & cpuArray)
+{
+  void * gpuArray {nullptr};
+  const std::size_t arraySize = cpuArray.size() * sizeof(RealComplexType);
+  if (arraySize > 0) {
+    HANDLE_CUDA_ERROR(cudaMalloc(&gpuArray, arraySize));
+    HANDLE_CUDA_ERROR(cudaMemcpy(gpuArray, static_cast<const void *>(cpuArray.data()),
+                                 arraySize, cudaMemcpyHostToDevice));
+  }
+  return gpuArray;
+}
+
+
+/** Helper function for destroying a previously created array copy in GPU memory */
+inline void destroyArrayGPU(void * gpuArray)
+{
+  HANDLE_CUDA_ERROR(cudaFree(gpuArray));
+  return;
+}
+
+
+/** Helper function for printing a GPU array */
+template <typename RealComplexType>
+void printArrayGPU(void * gpuArray,
+                   std::size_t arrayLen)
+{
+  std::vector<RealComplexType> cpuArray(arrayLen);
+  const std::size_t arraySize = arrayLen * sizeof(RealComplexType);
+  HANDLE_CUDA_ERROR(cudaDeviceSynchronize());
+  HANDLE_CUDA_ERROR(cudaMemcpy(cpuArray.data(), gpuArray, arraySize, cudaMemcpyDeviceToHost));
+  std::cout << "\nPrinting array " << gpuArray << "[" << arrayLen << "]:\n";
+  for (std::size_t i = 0; i < arrayLen; ++i) {
+    std::cout << " " << i << " " << cpuArray[i] << std::endl;
+  }
+  std::cout << std::flush;
+  return;
+}
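For orientation, a minimal standalone sketch of how the three helpers above compose (illustrative only; the vector contents are arbitrary):

#include "helpers.h"
#include <complex>
#include <vector>

int main()
{
  const std::vector<std::complex<double>> values {{1.0, 0.0}, {0.0, -1.0}};
  void * gpuCopy = createArrayGPU(values);                      // cudaMalloc + host-to-device cudaMemcpy
  printArrayGPU<std::complex<double>>(gpuCopy, values.size());  // device-to-host copy and element-wise print
  destroyArrayGPU(gpuCopy);                                     // cudaFree
  return 0;
}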
diff --git a/samples/cudensitymat/operator_action_example.cpp b/samples/cudensitymat/operator_action_example.cpp
new file mode 100644
index 0000000..e260684
--- /dev/null
+++ b/samples/cudensitymat/operator_action_example.cpp
@@ -0,0 +1,244 @@
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <cudensitymat.h>  // cuDensityMat library header
+#include "helpers.h"       // helper functions
+
+
+// Transverse Ising Hamiltonian with double summation ordering
+// and spin-operator fusion, plus fused dissipation terms
+#include "transverse_ising_full_fused_noisy.h"  // user-defined Liouvillian operator example
+
+
+#include <cmath>
+#include <complex>
+#include <vector>
+#include <chrono>
+#include <iostream>
+#include <cstdlib>
+
+
+// Number of times to perform operator action on a quantum state
+constexpr int NUM_REPEATS = 2;
+
+// Logging verbosity
+bool verbose = true;
+
+
+// Example workflow
+void exampleWorkflow(cudensitymatHandle_t handle)
+{
+  // Define the composite Hilbert space shape and
+  // quantum state batch size (number of individual quantum states)
+  const std::vector<int64_t> spaceShape({2,2,2,2,2,2,2,2});  // dimensions of quantum degrees of freedom
+  const int64_t batchSize = 1;  // number of quantum states per batch (default is 1)
+
+  if (verbose) {
+    std::cout << "Hilbert space rank = " << spaceShape.size() << "; Shape = (";
+    for (const auto & dimsn: spaceShape)
+      std::cout << dimsn << ",";
+    std::cout << ")" << std::endl;
+    std::cout << "Quantum state batch size = " << batchSize << std::endl;
+  }
+
+  // Construct a user-defined Liouvillian operator using a convenience C++ class
+  UserDefinedLiouvillian liouvillian(handle, spaceShape);
+  if (verbose)
+    std::cout << "Constructed the Liouvillian operator\n";
+
+  // Declare the input quantum state
+  cudensitymatState_t inputState;
+  HANDLE_CUDM_ERROR(cudensitymatCreateState(handle,
+                    CUDENSITYMAT_STATE_PURITY_MIXED,  // pure (state vector) or mixed (density matrix) state
+                    spaceShape.size(),
+                    spaceShape.data(),
+                    batchSize,
+                    CUDA_C_64F,  // data type must match that of the operators created above
+                    &inputState));
+
+  // Query the size of the quantum state storage
+  std::size_t storageSize {0};  // only one storage component (tensor) is needed
+  HANDLE_CUDM_ERROR(cudensitymatStateGetComponentStorageSize(handle,
+                    inputState,
+                    1,               // only one storage component
+                    &storageSize));  // storage size in bytes
+  const std::size_t stateVolume = storageSize / sizeof(std::complex<double>);  // quantum state tensor volume (number of elements)
+  if (verbose)
+    std::cout << "Quantum state storage size (bytes) = " << storageSize << std::endl;
+
+  // Prepare some initial value for the input quantum state
+  std::vector<std::complex<double>> inputStateValue(stateVolume);
+  for (std::size_t i = 0; i < stateVolume; ++i) {
+    inputStateValue[i] = std::complex<double>{double(i+1), double(-(i+2))};  // just some value
+  }
+
+  // Allocate initialized GPU storage for the input quantum state with prepared values
+  auto * inputStateElems = createArrayGPU(inputStateValue);
+
+  // Attach initialized GPU storage to the input quantum state
+  HANDLE_CUDM_ERROR(cudensitymatStateAttachComponentStorage(handle,
+                    inputState,
+                    1,  // only one storage component (tensor)
+                    std::vector<void*>({inputStateElems}).data(),      // pointer to the GPU storage for the quantum state
+                    std::vector<std::size_t>({storageSize}).data()));  // size of the GPU storage for the quantum state
+  if (verbose)
+    std::cout << "Constructed input quantum state\n";
+
+  // Declare the output quantum state of the same shape
+  cudensitymatState_t outputState;
+  HANDLE_CUDM_ERROR(cudensitymatCreateState(handle,
+                    CUDENSITYMAT_STATE_PURITY_MIXED,  // pure (state vector) or mixed (density matrix) state
+                    spaceShape.size(),
+                    spaceShape.data(),
+                    batchSize,
+                    CUDA_C_64F,  // data type must match that of the operators created above
+                    &outputState));
+
+  // Allocate initialized GPU storage for the output quantum state
+  auto * outputStateElems = createArrayGPU(std::vector<std::complex<double>>(stateVolume, {0.0, 0.0}));
+
+  // Attach initialized GPU storage to the output quantum state
+  HANDLE_CUDM_ERROR(cudensitymatStateAttachComponentStorage(handle,
+                    outputState,
+                    1,  // only one storage component (no tensor factorization)
+                    std::vector<void*>({outputStateElems}).data(),     // pointer to the GPU storage for the quantum state
+                    std::vector<std::size_t>({storageSize}).data()));  // size of the GPU storage for the quantum state
+  if (verbose)
+    std::cout << "Constructed output quantum state\n";
+
+  // Declare a workspace descriptor
+  cudensitymatWorkspaceDescriptor_t workspaceDescr;
+  HANDLE_CUDM_ERROR(cudensitymatCreateWorkspace(handle, &workspaceDescr));
+
+  // Query free GPU memory
+  std::size_t freeMem = 0, totalMem = 0;
+  HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeMem, &totalMem));
+  freeMem = static_cast<std::size_t>(static_cast<double>(freeMem) * 0.95);  // take 95% of the free memory for the workspace buffer
+  if (verbose)
+    std::cout << "Max workspace buffer size (bytes) = " << freeMem << std::endl;
+
+  // Prepare the Liouvillian operator action on a quantum state (needs to be done only once)
+  const auto startTime = std::chrono::high_resolution_clock::now();
+  HANDLE_CUDM_ERROR(cudensitymatOperatorPrepareAction(handle,
+                    liouvillian.get(),
+                    inputState,
+                    outputState,
+                    CUDENSITYMAT_COMPUTE_64F,  // GPU compute type
+                    freeMem,         // max available GPU free memory for the workspace
+                    workspaceDescr,  // workspace descriptor
+                    0x0));           // default CUDA stream
+  const auto finishTime = std::chrono::high_resolution_clock::now();
+  const std::chrono::duration<double> timeSec = finishTime - startTime;
+  if (verbose)
+    std::cout << "Operator action preparation time (sec) = " << timeSec.count() << std::endl;
+
+  // Query the required workspace buffer size (bytes)
+  std::size_t requiredBufferSize {0};
+  HANDLE_CUDM_ERROR(cudensitymatWorkspaceGetMemorySize(handle,
+                    workspaceDescr,
+                    CUDENSITYMAT_MEMSPACE_DEVICE,
+                    CUDENSITYMAT_WORKSPACE_SCRATCH,
+                    &requiredBufferSize));
+  if (verbose)
+    std::cout << "Required workspace buffer size (bytes) = " << requiredBufferSize << std::endl;
+
+  // Allocate GPU storage for the workspace buffer
+  const std::size_t bufferVolume = requiredBufferSize / sizeof(std::complex<double>);
+  auto * workspaceBuffer = createArrayGPU(std::vector<std::complex<double>>(bufferVolume, {0.0, 0.0}));
+  if (verbose)
+    std::cout << "Allocated workspace buffer of size (bytes) = " << requiredBufferSize << std::endl;
+
+  // Attach the workspace buffer to the workspace descriptor
+  HANDLE_CUDM_ERROR(cudensitymatWorkspaceSetMemory(handle,
+                    workspaceDescr,
+                    CUDENSITYMAT_MEMSPACE_DEVICE,
+                    CUDENSITYMAT_WORKSPACE_SCRATCH,
+                    workspaceBuffer,
+                    requiredBufferSize));
+  if (verbose)
+    std::cout << "Attached workspace buffer of size (bytes) = " << requiredBufferSize << std::endl;
+
+  // Zero out the output quantum state
+  HANDLE_CUDM_ERROR(cudensitymatStateInitializeZero(handle,
+                    outputState,
+                    0x0));
+  if (verbose)
+    std::cout << "Initialized the output state to zero\n";
+
+  // Apply the Liouvillian operator to the input quantum state
+  // and accumulate its action into the output quantum state (note += semantics)
+  for (int32_t repeat = 0; repeat < NUM_REPEATS; ++repeat) {  // repeat multiple times for accurate timing
+    HANDLE_CUDA_ERROR(cudaDeviceSynchronize());
+    const auto startTime = std::chrono::high_resolution_clock::now();
+    HANDLE_CUDM_ERROR(cudensitymatOperatorComputeAction(handle,
+                      liouvillian.get(),
+                      0.01,  // time point
+                      1,     // number of external user-defined Hamiltonian parameters
+                      std::vector<double>({13.42}).data(),  // Hamiltonian parameter(s)
+                      inputState,      // input quantum state
+                      outputState,     // output quantum state
+                      workspaceDescr,  // workspace descriptor
+                      0x0));           // default CUDA stream
+    HANDLE_CUDA_ERROR(cudaDeviceSynchronize());
+    const auto finishTime = std::chrono::high_resolution_clock::now();
+    const std::chrono::duration<double> timeSec = finishTime - startTime;
+    if (verbose)
+      std::cout << "Operator action computation time (sec) = " << timeSec.count() << std::endl;
+  }
+
+  // Compute the squared norm of the output quantum state
+  void * norm2 = createArrayGPU(std::vector<double>(batchSize, 0.0));
+  HANDLE_CUDM_ERROR(cudensitymatStateComputeNorm(handle,
+                    outputState,
+                    norm2,
+                    0x0));
+  if (verbose)
+    std::cout << "Computed the output state norm\n";
+  HANDLE_CUDA_ERROR(cudaDeviceSynchronize());
+  destroyArrayGPU(norm2);
+
+  // Destroy workspace descriptor
+  HANDLE_CUDM_ERROR(cudensitymatDestroyWorkspace(workspaceDescr));
+
+  // Destroy workspace buffer storage
+  destroyArrayGPU(workspaceBuffer);
+
+  // Destroy quantum states
+  HANDLE_CUDM_ERROR(cudensitymatDestroyState(outputState));
+  HANDLE_CUDM_ERROR(cudensitymatDestroyState(inputState));
+
+  // Destroy quantum state storage
+  destroyArrayGPU(outputStateElems);
+  destroyArrayGPU(inputStateElems);
+
+  if (verbose)
+    std::cout << "Destroyed resources\n" << std::flush;
+}
+
+
+int main(int argc, char ** argv)
+{
+  // Assign a GPU to the process
+  HANDLE_CUDA_ERROR(cudaSetDevice(0));
+  if (verbose)
+    std::cout << "Set active device\n";
+
+  // Create a library handle
+  cudensitymatHandle_t handle;
+  HANDLE_CUDM_ERROR(cudensitymatCreate(&handle));
+  if (verbose)
+    std::cout << "Created a library handle\n";
+
+  // Run the example
+  exampleWorkflow(handle);
+
+  // Destroy the library handle
+  HANDLE_CUDM_ERROR(cudensitymatDestroy(handle));
+  if (verbose)
+    std::cout << "Destroyed the library handle\n";
+
+  // Done
+  return 0;
+}
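As a sanity check on the storage-size query above: with eight spin-1/2 modes and mixed purity, the single storage component holds the full density matrix, so the expected volume is (2^8)^2 = 65536 complex-double elements, i.e. 1 MiB at batch size 1. A minimal standalone sketch of that arithmetic (illustrative only):

#include <complex>
#include <cstddef>
#include <cstdio>

int main()
{
  constexpr std::size_t hilbertDim = std::size_t(1) << 8;               // eight qubits: 2^8 = 256
  constexpr std::size_t volume = hilbertDim * hilbertDim;               // density matrix volume: 65536 elements
  constexpr std::size_t bytes = volume * sizeof(std::complex<double>);  // 1048576 bytes = 1 MiB
  std::printf("state volume = %zu elements, storage = %zu bytes\n", volume, bytes);
  return 0;
}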
diff --git a/samples/cudensitymat/operator_action_mpi_example.cpp b/samples/cudensitymat/operator_action_mpi_example.cpp
new file mode 100644
index 0000000..0d85cfd
--- /dev/null
+++ b/samples/cudensitymat/operator_action_mpi_example.cpp
@@ -0,0 +1,288 @@
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#include <cudensitymat.h>  // cuDensityMat library header
+#include "helpers.h"       // helper functions
+
+
+// Transverse Ising Hamiltonian with double summation ordering
+// and spin-operator fusion, plus fused dissipation terms
+#include "transverse_ising_full_fused_noisy.h"  // user-defined Liouvillian operator example
+
+
+// MPI library (optional)
+#ifdef MPI_ENABLED
+#include <mpi.h>
+#endif
+
+#include <cmath>
+#include <complex>
+#include <vector>
+#include <chrono>
+#include <iostream>
+#include <cstdlib>
+
+
+// Number of times to perform operator action on a quantum state
+constexpr int NUM_REPEATS = 2;
+
+// Logging verbosity
+bool verbose = true;
+
+
+// Example workflow
+void exampleWorkflow(cudensitymatHandle_t handle)
+{
+  // Define the composite Hilbert space shape and
+  // quantum state batch size (number of individual quantum states)
+  const std::vector<int64_t> spaceShape({2,2,2,2,2,2,2,2});  // dimensions of quantum degrees of freedom
+  const int64_t batchSize = 1;  // number of quantum states per batch (default is 1)
+
+  if (verbose) {
+    std::cout << "Hilbert space rank = " << spaceShape.size() << "; Shape = (";
+    for (const auto & dimsn: spaceShape)
+      std::cout << dimsn << ",";
+    std::cout << ")" << std::endl;
+    std::cout << "Quantum state batch size = " << batchSize << std::endl;
+  }
+
+  // Construct a user-defined Liouvillian operator using a convenience C++ class
+  UserDefinedLiouvillian liouvillian(handle, spaceShape);
+  if (verbose)
+    std::cout << "Constructed the Liouvillian operator\n";
+
+  // Declare the input quantum state
+  cudensitymatState_t inputState;
+  HANDLE_CUDM_ERROR(cudensitymatCreateState(handle,
+                    CUDENSITYMAT_STATE_PURITY_MIXED,  // pure (state vector) or mixed (density matrix) state
+                    spaceShape.size(),
+                    spaceShape.data(),
+                    batchSize,
+                    CUDA_C_64F,  // data type must match that of the operators created above
+                    &inputState));
+
+  // Query the size of the quantum state storage
+  std::size_t storageSize {0};  // only one storage component (tensor) is needed
+  HANDLE_CUDM_ERROR(cudensitymatStateGetComponentStorageSize(handle,
+                    inputState,
+                    1,               // only one storage component
+                    &storageSize));  // storage size in bytes
+  const std::size_t stateVolume = storageSize / sizeof(std::complex<double>);  // quantum state tensor volume (number of elements)
+  if (verbose)
+    std::cout << "Quantum state storage size (bytes) = " << storageSize << std::endl;
+
+  // Prepare some initial value for the input quantum state
+  std::vector<std::complex<double>> inputStateValue(stateVolume);
+  for (std::size_t i = 0; i < stateVolume; ++i) {
+    inputStateValue[i] = std::complex<double>{double(i+1), double(-(i+2))};  // just some value
+  }
+
+  // Allocate initialized GPU storage for the input quantum state with prepared values
+  auto * inputStateElems = createArrayGPU(inputStateValue);
+
+  // Attach initialized GPU storage to the input quantum state
+  HANDLE_CUDM_ERROR(cudensitymatStateAttachComponentStorage(handle,
+                    inputState,
+                    1,  // only one storage component (tensor)
+                    std::vector<void*>({inputStateElems}).data(),      // pointer to the GPU storage for the quantum state
+                    std::vector<std::size_t>({storageSize}).data()));  // size of the GPU storage for the quantum state
+  if (verbose)
+    std::cout << "Constructed input quantum state\n";
+
+  // Declare the output quantum state of the same shape
+  cudensitymatState_t outputState;
+  HANDLE_CUDM_ERROR(cudensitymatCreateState(handle,
+                    CUDENSITYMAT_STATE_PURITY_MIXED,  // pure (state vector) or mixed (density matrix) state
+                    spaceShape.size(),
+                    spaceShape.data(),
+                    batchSize,
+                    CUDA_C_64F,  // data type must match that of the operators created above
+                    &outputState));
+
+  // Allocate initialized GPU storage for the output quantum state
+  auto * outputStateElems = createArrayGPU(std::vector<std::complex<double>>(stateVolume, {0.0, 0.0}));
+
+  // Attach initialized GPU storage to the output quantum state
+  HANDLE_CUDM_ERROR(cudensitymatStateAttachComponentStorage(handle,
+                    outputState,
+                    1,  // only one storage component (no tensor factorization)
+                    std::vector<void*>({outputStateElems}).data(),     // pointer to the GPU storage for the quantum state
+                    std::vector<std::size_t>({storageSize}).data()));  // size of the GPU storage for the quantum state
+  if (verbose)
+    std::cout << "Constructed output quantum state\n";
+
+  // Declare a workspace descriptor
+  cudensitymatWorkspaceDescriptor_t workspaceDescr;
+  HANDLE_CUDM_ERROR(cudensitymatCreateWorkspace(handle, &workspaceDescr));
+
+  // Query free GPU memory
+  std::size_t freeMem = 0, totalMem = 0;
+  HANDLE_CUDA_ERROR(cudaMemGetInfo(&freeMem, &totalMem));
+  freeMem = static_cast<std::size_t>(static_cast<double>(freeMem) * 0.95);  // take 95% of the free memory for the workspace buffer
+  if (verbose)
+    std::cout << "Max workspace buffer size (bytes) = " << freeMem << std::endl;
+
+  // Prepare the Liouvillian operator action on a quantum state (needs to be done only once)
+  const auto startTime = std::chrono::high_resolution_clock::now();
+  HANDLE_CUDM_ERROR(cudensitymatOperatorPrepareAction(handle,
+                    liouvillian.get(),
+                    inputState,
+                    outputState,
+                    CUDENSITYMAT_COMPUTE_64F,  // GPU compute type
+                    freeMem,         // max available GPU free memory for the workspace
+                    workspaceDescr,  // workspace descriptor
+                    0x0));           // default CUDA stream
+  const auto finishTime = std::chrono::high_resolution_clock::now();
+  const std::chrono::duration<double> timeSec = finishTime - startTime;
+  if (verbose)
+    std::cout << "Operator action preparation time (sec) = " << timeSec.count() << std::endl;
+
+  // Query the required workspace buffer size (bytes)
+  std::size_t requiredBufferSize {0};
+  HANDLE_CUDM_ERROR(cudensitymatWorkspaceGetMemorySize(handle,
+                    workspaceDescr,
+                    CUDENSITYMAT_MEMSPACE_DEVICE,
+                    CUDENSITYMAT_WORKSPACE_SCRATCH,
+                    &requiredBufferSize));
+  if (verbose)
+    std::cout << "Required workspace buffer size (bytes) = " << requiredBufferSize << std::endl;
+
+  // Allocate GPU storage for the workspace buffer
+  const std::size_t bufferVolume = requiredBufferSize / sizeof(std::complex<double>);
+  auto * workspaceBuffer = createArrayGPU(std::vector<std::complex<double>>(bufferVolume, {0.0, 0.0}));
+  if (verbose)
+    std::cout << "Allocated workspace buffer of size (bytes) = " << requiredBufferSize << std::endl;
+
+  // Attach the workspace buffer to the workspace descriptor
+  HANDLE_CUDM_ERROR(cudensitymatWorkspaceSetMemory(handle,
+                    workspaceDescr,
+                    CUDENSITYMAT_MEMSPACE_DEVICE,
+                    CUDENSITYMAT_WORKSPACE_SCRATCH,
+                    workspaceBuffer,
+                    requiredBufferSize));
+  if (verbose)
+    std::cout << "Attached workspace buffer of size (bytes) = " << requiredBufferSize << std::endl;
+
+  // Zero out the output quantum state
+  HANDLE_CUDM_ERROR(cudensitymatStateInitializeZero(handle,
+                    outputState,
+                    0x0));
+  if (verbose)
+    std::cout << "Initialized the output state to zero\n";
+
+  // Apply the Liouvillian operator to the input quantum state
+  // and accumulate its action into the output quantum state (note += semantics)
+  for (int32_t repeat = 0; repeat < NUM_REPEATS; ++repeat) {  // repeat multiple times for accurate timing
+    HANDLE_CUDA_ERROR(cudaDeviceSynchronize());
+    const auto startTime = std::chrono::high_resolution_clock::now();
+    HANDLE_CUDM_ERROR(cudensitymatOperatorComputeAction(handle,
+                      liouvillian.get(),
+                      0.01,  // time point
+                      1,     // number of external user-defined Hamiltonian parameters
+                      std::vector<double>({13.42}).data(),  // Hamiltonian parameter(s)
+                      inputState,      // input quantum state
+                      outputState,     // output quantum state
+                      workspaceDescr,  // workspace descriptor
+                      0x0));           // default CUDA stream
+    HANDLE_CUDA_ERROR(cudaDeviceSynchronize());
+    const auto finishTime = std::chrono::high_resolution_clock::now();
+    const std::chrono::duration<double> timeSec = finishTime - startTime;
+    if (verbose)
+      std::cout << "Operator action computation time (sec) = " << timeSec.count() << std::endl;
+  }
+
+  // Compute the squared norm of the output quantum state
+  void * norm2 = createArrayGPU(std::vector<double>(batchSize, 0.0));
+  HANDLE_CUDM_ERROR(cudensitymatStateComputeNorm(handle,
+                    outputState,
+                    norm2,
+                    0x0));
+  if (verbose)
+    std::cout << "Computed the output state norm\n";
+  HANDLE_CUDA_ERROR(cudaDeviceSynchronize());
+  destroyArrayGPU(norm2);
+
+  // Destroy workspace descriptor
+  HANDLE_CUDM_ERROR(cudensitymatDestroyWorkspace(workspaceDescr));
+
+  // Destroy workspace buffer storage
+  destroyArrayGPU(workspaceBuffer);
+
+  // Destroy quantum states
+  HANDLE_CUDM_ERROR(cudensitymatDestroyState(outputState));
+  HANDLE_CUDM_ERROR(cudensitymatDestroyState(inputState));
+
+  // Destroy quantum state storage
+  destroyArrayGPU(outputStateElems);
+  destroyArrayGPU(inputStateElems);
+
+  if (verbose)
+    std::cout << "Destroyed resources\n" << std::flush;
+}
+
+
+int main(int argc, char ** argv)
+{
+  // Initialize MPI library (if needed)
+#ifdef MPI_ENABLED
+  HANDLE_MPI_ERROR(MPI_Init(&argc, &argv));
+  int procRank {-1};
+  HANDLE_MPI_ERROR(MPI_Comm_rank(MPI_COMM_WORLD, &procRank));
+  int numProcs {0};
+  HANDLE_MPI_ERROR(MPI_Comm_size(MPI_COMM_WORLD, &numProcs));
+  if (procRank != 0) verbose = false;
+  if (verbose)
+    std::cout << "Initialized MPI library\n";
+#else
+  const int procRank {0};
+  const int numProcs {1};
+#endif
+
+  // Assign a GPU to the process
+  int numDevices {0};
+  HANDLE_CUDA_ERROR(cudaGetDeviceCount(&numDevices));
+  const int deviceId = procRank % numDevices;
+  HANDLE_CUDA_ERROR(cudaSetDevice(deviceId));
+  if (verbose)
+    std::cout << "Set active device\n";
+
+  // Create a library handle
+  cudensitymatHandle_t handle;
+  HANDLE_CUDM_ERROR(cudensitymatCreate(&handle));
+  if (verbose)
+    std::cout << "Created a library handle\n";
+
+  // Reset distributed configuration (once)
+#ifdef MPI_ENABLED
+  MPI_Comm comm;
+  HANDLE_MPI_ERROR(MPI_Comm_dup(MPI_COMM_WORLD, &comm));
+  HANDLE_CUDM_ERROR(cudensitymatResetDistributedConfiguration(handle,
+                    CUDENSITYMAT_DISTRIBUTED_PROVIDER_MPI,
+                    &comm, sizeof(comm)));
+#endif
+
+  // Run the example
+  exampleWorkflow(handle);
+
+  // Synchronize MPI processes
+#ifdef MPI_ENABLED
+  HANDLE_MPI_ERROR(MPI_Barrier(MPI_COMM_WORLD));
+#endif
+
+  // Destroy the library handle
+  HANDLE_CUDM_ERROR(cudensitymatDestroy(handle));
+  if (verbose)
+    std::cout << "Destroyed the library handle\n";
+
+  // Finalize the MPI library
+#ifdef MPI_ENABLED
+  HANDLE_MPI_ERROR(MPI_Finalize());
+  if (verbose)
+    std::cout << "Finalized MPI library\n";
+#endif
+
+  // Done
+  return 0;
+}
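The only structural additions relative to the single-GPU example are the MPI bootstrap and the communicator handoff in main(); the latter is worth isolating. A minimal sketch, assuming MPI_ENABLED and ignoring error codes for brevity (bindCommunicator is a hypothetical wrapper name; the duplicate communicator keeps the library's internal collectives from colliding with user-issued MPI calls on MPI_COMM_WORLD):

#ifdef MPI_ENABLED
#include <mpi.h>
#include <cudensitymat.h>

void bindCommunicator(cudensitymatHandle_t handle)
{
  MPI_Comm comm;  // library-private duplicate of the world communicator
  MPI_Comm_dup(MPI_COMM_WORLD, &comm);
  cudensitymatResetDistributedConfiguration(handle,
      CUDENSITYMAT_DISTRIBUTED_PROVIDER_MPI, &comm, sizeof(comm));
}
#endif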
diff --git a/samples/cudensitymat/transverse_ising_full_fused_noisy.h b/samples/cudensitymat/transverse_ising_full_fused_noisy.h
new file mode 100644
index 0000000..5603516
--- /dev/null
+++ b/samples/cudensitymat/transverse_ising_full_fused_noisy.h
@@ -0,0 +1,256 @@
+/* Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
+ *
+ * SPDX-License-Identifier: BSD-3-Clause
+ */
+
+#pragma once
+
+#include <cudensitymat.h>  // cuDensityMat library header
+#include "helpers.h"       // helper functions
+
+#include <cmath>
+#include <complex>
+#include <vector>
+#include <iostream>
+#include <cstdlib>
+
+
+/* Time-dependent transverse-field Ising Hamiltonian operator
+   with ordered and fused ZZ terms, plus fused unitary dissipation terms:
+    H = sum_{i} {h_i * X_i}               // transverse field sum of X_i
+      + f(t) * sum_{i < j} {g_ij * ZZ_ij} // modulated sum of fused {Z_i * Z_j} terms
+      + d * sum_{i} {Y_i * {..} * Y_i}    // dissipation terms {Y_i * {..} * Y_i} will be fused into the YY_ii super-operator
+   where {..} is the placeholder for the density matrix to show that the operators act from a different side
+*/
+
+
+// User-defined C++ callback function defining a time-dependent coefficient inside the Hamiltonian:
+// f(t) = cos(omega * t) + i * sin(omega * t)
+extern "C"
+int32_t tdCoefComplex64(double time,              // time point
+                        int32_t numParams,        // number of external user-defined Liouvillian parameters (= 1 here)
+                        const double params[],    // params[0] is omega (user-defined Liouvillian parameter)
+                        cudaDataType_t dataType,  // data type (CUDA_C_64F here)
+                        void * scalarStorage)     // CPU storage for the returned function value
+{
+  const auto omega = params[0];
+  auto * tdCoef = static_cast<std::complex<double>*>(scalarStorage);  // casting to std::complex<double> because the callback returns the CUDA_C_64F data type
+  *tdCoef = {std::cos(omega * time), std::sin(omega * time)};
+  return 0;  // error code (0: Success)
+}
+
+
+/** Convenience class to encapsulate the Liouvillian operator:
+ *  - Constructor constructs the desired Liouvillian operator (`cudensitymatOperator_t`)
+ *  - get() method returns a reference to the constructed Liouvillian operator
+ *  - Destructor releases all resources used by the Liouvillian operator
+ */
+class UserDefinedLiouvillian final
+{
+private:
+  // Data members
+  cudensitymatHandle_t handle;             // library context handle
+  const std::vector<int64_t> spaceShape;   // Hilbert space shape
+  void * spinXelems {nullptr};             // elements of the X spin operator in GPU RAM
+  void * spinYYelems {nullptr};            // elements of the YY two-spin operator in GPU RAM
+  void * spinZZelems {nullptr};            // elements of the ZZ two-spin operator in GPU RAM
+  cudensitymatElementaryOperator_t spinX;  // X spin operator
+  cudensitymatElementaryOperator_t spinYY; // YY two-spin operator
+  cudensitymatElementaryOperator_t spinZZ; // ZZ two-spin operator
+  cudensitymatOperatorTerm_t oneBodyTerm;  // operator term: H1 = sum_{i} {h_i * X_i}
+  cudensitymatOperatorTerm_t twoBodyTerm;  // operator term: H2 = f(t) * sum_{i < j} {g_ij * ZZ_ij}
+  cudensitymatOperatorTerm_t noiseTerm;    // operator term: D1 = d * sum_{i} {YY_ii} // Y_i operators act from different sides on the density matrix
+  cudensitymatOperator_t liouvillian;      // (-i * (H1 + f(t) * H2) * rho) + (i * rho * (H1 + f(t) * H2)) + D1
+
+public:
+
+  // Constructor constructs a user-defined Liouvillian operator
+  UserDefinedLiouvillian(cudensitymatHandle_t contextHandle,                // library context handle
+                         const std::vector<int64_t> & hilbertSpaceShape):  // Hilbert space shape
+    handle(contextHandle), spaceShape(hilbertSpaceShape)
+  {
+    // Define the necessary elementary tensors in GPU memory (F-order storage!)
+    spinXelems = createArrayGPU<std::complex<double>>(
+      {{0.0, 0.0}, {1.0, 0.0},    // 1st column of matrix X
+       {1.0, 0.0}, {0.0, 0.0}});  // 2nd column of matrix X
+
+    spinYYelems = createArrayGPU<std::complex<double>>(   // YY[i0, i1; j0, j1] := Y[i0; j0] * Y[i1; j1]
+      {{0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {-1.0, 0.0},   // 1st column of matrix YY
+       {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0},    // 2nd column of matrix YY
+       {0.0, 0.0}, {1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},    // 3rd column of matrix YY
+       {-1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}}); // 4th column of matrix YY
+
+    spinZZelems = createArrayGPU<std::complex<double>>(   // ZZ[i0, i1; j0, j1] := Z[i0; j0] * Z[i1; j1]
+      {{1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},    // 1st column of matrix ZZ
+       {0.0, 0.0}, {-1.0, 0.0}, {0.0, 0.0}, {0.0, 0.0},   // 2nd column of matrix ZZ
+       {0.0, 0.0}, {0.0, 0.0}, {-1.0, 0.0}, {0.0, 0.0},   // 3rd column of matrix ZZ
+       {0.0, 0.0}, {0.0, 0.0}, {0.0, 0.0}, {1.0, 0.0}});  // 4th column of matrix ZZ
+
+    // Construct the necessary Elementary Tensor Operators
+    //  X_i operator
+    HANDLE_CUDM_ERROR(cudensitymatCreateElementaryOperator(handle,
+                      1,  // one-body operator
+                      std::vector<int64_t>({2}).data(),      // acts in tensor space of shape {2}
+                      CUDENSITYMAT_OPERATOR_SPARSITY_NONE,   // dense tensor storage
+                      0,           // 0 for dense tensors
+                      nullptr,     // nullptr for dense tensors
+                      CUDA_C_64F,  // data type
+                      spinXelems,  // tensor elements in GPU memory
+                      {nullptr, nullptr},  // no tensor callback function (tensor is not time-dependent)
+                      &spinX));    // the created elementary tensor operator
+    //  ZZ_ij = Z_i * Z_j fused operator
+    HANDLE_CUDM_ERROR(cudensitymatCreateElementaryOperator(handle,
+                      2,  // two-body operator
+                      std::vector<int64_t>({2,2}).data(),    // acts in tensor space of shape {2,2}
+                      CUDENSITYMAT_OPERATOR_SPARSITY_NONE,   // dense tensor storage
+                      0,            // 0 for dense tensors
+                      nullptr,      // nullptr for dense tensors
+                      CUDA_C_64F,   // data type
+                      spinZZelems,  // tensor elements in GPU memory
+                      {nullptr, nullptr},  // no tensor callback function (tensor is not time-dependent)
+                      &spinZZ));    // the created elementary tensor operator
+    //  YY_ii = Y_i * {..} * Y_i fused operator (note action from different sides)
+    HANDLE_CUDM_ERROR(cudensitymatCreateElementaryOperator(handle,
+                      2,  // two-body operator
+                      std::vector<int64_t>({2,2}).data(),    // acts in tensor space of shape {2,2}
+                      CUDENSITYMAT_OPERATOR_SPARSITY_NONE,   // dense tensor storage
+                      0,            // 0 for dense tensors
+                      nullptr,      // nullptr for dense tensors
+                      CUDA_C_64F,   // data type
+                      spinYYelems,  // tensor elements in GPU memory
+                      {nullptr, nullptr},  // no tensor callback function (tensor is not time-dependent)
+                      &spinYY));    // the created elementary tensor operator
+
+    // Construct the necessary Operator Terms from direct products of Elementary Tensor Operators
+    //  Create an empty operator term
+    HANDLE_CUDM_ERROR(cudensitymatCreateOperatorTerm(handle,
+                      spaceShape.size(),  // Hilbert space rank (number of dimensions)
+                      spaceShape.data(),  // Hilbert space shape
+                      &oneBodyTerm));     // the created empty operator term
+    //  Define the operator term
+    for (int32_t i = 0; i < spaceShape.size(); ++i) {
+      const double h_i = 1.0 / static_cast<double>(i+1);  // just some value (time-independent h_i coefficient)
+      HANDLE_CUDM_ERROR(cudensitymatOperatorTermAppendElementaryProduct(handle,
+                        oneBodyTerm,
+                        1,  // number of elementary tensor operators in the product
+                        std::vector<cudensitymatElementaryOperator_t>({spinX}).data(),  // elementary tensor operators forming the product
+                        std::vector<int32_t>({i}).data(),  // space modes acted on by the operator product
+                        std::vector<int32_t>({0}).data(),  // space mode action duality (0: from the left; 1: from the right)
+                        make_cuDoubleComplex(h_i, 0.0),    // h_i constant coefficient: Always 64-bit-precision complex number
+                        {nullptr, nullptr}));  // no time-dependent coefficient associated with the operator product
+    }
+    //  Create an empty operator term
+    HANDLE_CUDM_ERROR(cudensitymatCreateOperatorTerm(handle,
+                      spaceShape.size(),  // Hilbert space rank (number of dimensions)
+                      spaceShape.data(),  // Hilbert space shape
+                      &twoBodyTerm));     // the created empty operator term
+    //  Define the operator term
+    for (int32_t i = 0; i < spaceShape.size() - 1; ++i) {
+      for (int32_t j = (i + 1); j < spaceShape.size(); ++j) {
+        const double g_ij = -1.0 / static_cast<double>(i + j + 1);  // just some value (time-independent g_ij coefficient)
+        HANDLE_CUDM_ERROR(cudensitymatOperatorTermAppendElementaryProduct(handle,
+                          twoBodyTerm,
+                          1,  // number of elementary tensor operators in the product
+                          std::vector<cudensitymatElementaryOperator_t>({spinZZ}).data(),  // elementary tensor operators forming the product
+                          std::vector<int32_t>({i, j}).data(),  // space modes acted on by the operator product
+                          std::vector<int32_t>({0, 0}).data(),  // space mode action duality (0: from the left; 1: from the right)
+                          make_cuDoubleComplex(g_ij, 0.0),      // g_ij constant coefficient: Always 64-bit-precision complex number
+                          {nullptr, nullptr}));  // no time-dependent coefficient associated with the operator product
+      }
+    }
+    //  Create an empty operator term
+    HANDLE_CUDM_ERROR(cudensitymatCreateOperatorTerm(handle,
+                      spaceShape.size(),  // Hilbert space rank (number of dimensions)
+                      spaceShape.data(),  // Hilbert space shape
+                      &noiseTerm));       // the created empty operator term
+    //  Define the operator term
+    for (int32_t i = 0; i < spaceShape.size(); ++i) {
+      HANDLE_CUDM_ERROR(cudensitymatOperatorTermAppendElementaryProduct(handle,
+                        noiseTerm,
+                        1,  // number of elementary tensor operators in the product
+                        std::vector<cudensitymatElementaryOperator_t>({spinYY}).data(),  // elementary tensor operators forming the product
+                        std::vector<int32_t>({i, i}).data(),  // space modes acted on by the operator product (from different sides)
+                        std::vector<int32_t>({0, 1}).data(),  // space mode action duality (0: from the left; 1: from the right)
+                        make_cuDoubleComplex(1.0, 0.0),       // default coefficient: Always 64-bit-precision complex number
+                        {nullptr, nullptr}));  // no time-dependent coefficient associated with the operator product
+    }
+
+    // Construct the full Liouvillian operator as a sum of the operator terms
+    //  Create an empty operator (super-operator)
+    HANDLE_CUDM_ERROR(cudensitymatCreateOperator(handle,
+                      spaceShape.size(),  // Hilbert space rank (number of dimensions)
+                      spaceShape.data(),  // Hilbert space shape
+                      &liouvillian));     // the created empty operator (super-operator)
+    //  Append an operator term to the operator (super-operator)
+    HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm(handle,
+                      liouvillian,
+                      oneBodyTerm,  // appended operator term
+                      0,            // operator term action duality as a whole (0: acting from the left; 1: acting from the right)
+                      make_cuDoubleComplex(0.0, -1.0),  // -i constant
+                      {nullptr, nullptr}));  // no time-dependent coefficient associated with the operator term as a whole
+    //  Append an operator term to the operator (super-operator)
+    HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm(handle,
+                      liouvillian,
+                      twoBodyTerm,  // appended operator term
+                      0,            // operator term action duality as a whole (0: acting from the left; 1: acting from the right)
+                      make_cuDoubleComplex(0.0, -1.0),  // -i constant
+                      {tdCoefComplex64, nullptr}));  // function callback defining the time-dependent coefficient associated with this operator term as a whole
+    //  Append an operator term to the operator (super-operator)
+    HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm(handle,
+                      liouvillian,
+                      oneBodyTerm,  // appended operator term
+                      1,            // operator term action duality as a whole (0: acting from the left; 1: acting from the right)
+                      make_cuDoubleComplex(0.0, 1.0),  // i constant
+                      {nullptr, nullptr}));  // no time-dependent coefficient associated with the operator term as a whole
+    //  Append an operator term to the operator (super-operator)
+    HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm(handle,
+                      liouvillian,
+                      twoBodyTerm,  // appended operator term
+                      1,            // operator term action duality as a whole (0: acting from the left; 1: acting from the right)
+                      make_cuDoubleComplex(0.0, 1.0),  // i constant
+                      {tdCoefComplex64, nullptr}));  // function callback defining the time-dependent coefficient associated with this operator term as a whole
+    //  Append an operator term to the operator (super-operator)
+    const double d = 0.42;  // just some value (time-independent coefficient)
+    HANDLE_CUDM_ERROR(cudensitymatOperatorAppendTerm(handle,
+                      liouvillian,
+                      noiseTerm,  // appended operator term
+                      0,          // operator term action duality as a whole (no duality reversing in this case)
+                      make_cuDoubleComplex(d, 0.0),  // constant coefficient associated with the operator term as a whole
+                      {nullptr, nullptr}));  // no time-dependent coefficient associated with the operator term as a whole
+  }
+
+  // Destructor destructs the user-defined Liouvillian operator
+  ~UserDefinedLiouvillian()
+  {
+    // Destroy the Liouvillian operator
+    HANDLE_CUDM_ERROR(cudensitymatDestroyOperator(liouvillian));
+
+    // Destroy operator terms
+    HANDLE_CUDM_ERROR(cudensitymatDestroyOperatorTerm(noiseTerm));
+    HANDLE_CUDM_ERROR(cudensitymatDestroyOperatorTerm(twoBodyTerm));
+    HANDLE_CUDM_ERROR(cudensitymatDestroyOperatorTerm(oneBodyTerm));
+
+    // Destroy elementary tensor operators
+    HANDLE_CUDM_ERROR(cudensitymatDestroyElementaryOperator(spinYY));
+    HANDLE_CUDM_ERROR(cudensitymatDestroyElementaryOperator(spinZZ));
+    HANDLE_CUDM_ERROR(cudensitymatDestroyElementaryOperator(spinX));
+
+    // Destroy elementary tensors
+    destroyArrayGPU(spinYYelems);
+    destroyArrayGPU(spinZZelems);
+    destroyArrayGPU(spinXelems);
+  }
+
+  // Disable copy/move construction and assignment (the class owns raw GPU buffers
+  // and library handles, so an implicit shallow copy or move would double-destroy them)
+  UserDefinedLiouvillian(const UserDefinedLiouvillian &) = delete;
+  UserDefinedLiouvillian & operator=(const UserDefinedLiouvillian &) = delete;
+  UserDefinedLiouvillian(UserDefinedLiouvillian &&) noexcept = delete;
+  UserDefinedLiouvillian & operator=(UserDefinedLiouvillian &&) noexcept = delete;
+
+  // Get access to the constructed Liouvillian
+  cudensitymatOperator_t & get()
+  {
+    return liouvillian;
+  }
+
+};
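The hard-coded YY and ZZ element tables above are Kronecker products of single-spin Pauli matrices laid out in F-order (first mode fastest). A minimal standalone check for ZZ (illustrative only), printing one column per line so the output can be compared against the four commented columns above:

#include <cstdio>

int main()
{
  const double z[2][2] = {{1.0, 0.0}, {0.0, -1.0}};  // Pauli Z, z[row][col]
  // ZZ[i0, i1; j0, j1] = Z[i0; j0] * Z[i1; j1]
  for (int j1 = 0; j1 < 2; ++j1)
    for (int j0 = 0; j0 < 2; ++j0) {      // column index = j0 + 2*j1 (F-order)
      for (int i1 = 0; i1 < 2; ++i1)
        for (int i0 = 0; i0 < 2; ++i0)    // row index = i0 + 2*i1 (F-order)
          std::printf("%5.1f", z[i0][j0] * z[i1][j1]);
      std::printf("   // column %d of ZZ\n", j0 + 2 * j1);
    }
  return 0;
}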
diff --git a/samples/custatevec/samples_mpi/distributedIndexBitSwap.cpp b/samples/custatevec/samples_mpi/distributedIndexBitSwap.cpp
index c1d6ca9..39e9e6a 100644
--- a/samples/custatevec/samples_mpi/distributedIndexBitSwap.cpp
+++ b/samples/custatevec/samples_mpi/distributedIndexBitSwap.cpp
@@ -124,6 +124,10 @@ void runDistributedIndexBitSwaps(
     int nP2PDeviceBits = 0;
     int nSubSVsP2P = 1 << nP2PDeviceBits;
 
+    // use CUDA IPC event or semaphore
+    // set useCudaIpcEvent to use CUDA IPC event, otherwise use semaphore
+    bool useCudaIpcEvent = true;
+
     // use rank and size to map sub state vectors
     // this sample assigns one device to one rank and allocates one sub state vector on the assigned device
     // use the rank as the index of the sub state vector locally allocated in this process
@@ -143,6 +147,7 @@ void runDistributedIndexBitSwaps(
     void* d_orgSubSV = nullptr;
     cudaStream_t localStream = nullptr;
     cudaEvent_t localEvent = nullptr;
+    void* d_localSemaphore = nullptr;
 
     // bind the device to the process
     // this is based on the assumption of the global rank placement that the
@@ -166,8 +171,6 @@ void runDistributedIndexBitSwaps(
     ERRCHK(cudaMalloc(&d_orgSubSV, subSVSize));
     ERRCHK(cudaMemset(d_orgSubSV, 0, subSVSize));
     ERRCHK(cudaStreamCreate(&localStream));
-    // event should be created with the cudaEventInterprocess flag
-    ERRCHK(cudaEventCreateWithFlags(&localEvent, cudaEventInterprocess | cudaEventDisableTiming));
 
     // create cuStateVec handle
     custatevecHandle_t handle;
@@ -230,20 +233,37 @@ void runDistributedIndexBitSwaps(
     ERRCHK(custatevecCommunicatorCreate(handle, &communicator, communicatorType, soname));
 
     //
-    // create sv segment swap worker
+    // create SVSwapWorker
    //
     custatevecSVSwapWorkerDescriptor_t svSegSwapWorker = nullptr;
     size_t extraWorkspaceSize = 0;
     size_t minTransferWorkspaceSize = 0;
-    ERRCHK(custatevecSVSwapWorkerCreate(
-               handle, &svSegSwapWorker, communicator,
-               d_orgSubSV, orgSubSVIndex, localEvent, svDataType,
-               localStream, &extraWorkspaceSize, &minTransferWorkspaceSize));
+
+    if (useCudaIpcEvent)
+    {
+        // event should be created with the cudaEventInterprocess flag
+        ERRCHK(cudaEventCreateWithFlags(&localEvent, cudaEventInterprocess | cudaEventDisableTiming));
+        // create SVSwapWorker
+        ERRCHK(custatevecSVSwapWorkerCreate(
+                   handle, &svSegSwapWorker, communicator,
+                   d_orgSubSV, orgSubSVIndex, localEvent, svDataType,
+                   localStream, &extraWorkspaceSize, &minTransferWorkspaceSize));
+    }
+    else
+    {
+        ERRCHK(cudaMalloc(&d_localSemaphore, sizeof(int)));
+        // create SVSwapWorker
+        ERRCHK(custatevecSVSwapWorkerCreateWithSemaphore(
+                   handle, &svSegSwapWorker, communicator,
+                   d_orgSubSV, orgSubSVIndex, d_localSemaphore, svDataType,
+                   localStream, &extraWorkspaceSize, &minTransferWorkspaceSize));
+    }
 
     // set extra workspace
     void* d_extraWorkspace = nullptr;
     ERRCHK(cudaMalloc(&d_extraWorkspace, extraWorkspaceSize));
     ERRCHK(custatevecSVSwapWorkerSetExtraWorkspace(
                handle, svSegSwapWorker, d_extraWorkspace, extraWorkspaceSize));
+
     // set transfer workspace
     // The size should be equal to or larger than minTransferWorkspaceSize
     // Depending on the systems, larger transfer workspace can improve the performance
@@ -260,6 +280,7 @@ void runDistributedIndexBitSwaps(
     std::vector<void*> d_subSVsP2P;
     std::vector<int> subSVIndicesP2P;
     std::vector<cudaEvent_t> remoteEvents;
+    std::vector<void*> remoteSemaphores;
     if (nP2PDeviceBits > 0)
     {
         // distribute device memory handles
@@ -269,13 +290,6 @@
         ERRCHK(MPI_Allgather(&ipcMemHandle, sizeof(ipcMemHandle), MPI_UINT8_T,
                              ipcMemHandles.data(), sizeof(ipcMemHandle), MPI_UINT8_T, MPI_COMM_WORLD));
 
-        // distribute event handles
-        cudaIpcEventHandle_t eventHandle;
-        ERRCHK(cudaIpcGetEventHandle(&eventHandle, localEvent));
-        std::vector<cudaIpcEventHandle_t> ipcEventHandles(nSubSVs);
-        ERRCHK(MPI_Allgather(&eventHandle, sizeof(eventHandle), MPI_UINT8_T,
-                             ipcEventHandles.data(), sizeof(eventHandle), MPI_UINT8_T, MPI_COMM_WORLD));
-
         // get remote device pointers and events
         // this calculation assumes that the global rank placement is done in a round-robin fashion
         // across nodes, so for example if nP2PDeviceBits=2 there are 2^2=4 processes/node (and
@@ -296,17 +310,58 @@
             const auto& dstMemHandle = ipcMemHandles[p2pSubSVIndex];
             ERRCHK(cudaIpcOpenMemHandle(&d_subSVP2P, dstMemHandle, cudaIpcMemLazyEnablePeerAccess));
             d_subSVsP2P.push_back(d_subSVP2P);
-            cudaEvent_t eventP2P = nullptr;
-            ERRCHK(cudaIpcOpenEventHandle(&eventP2P, ipcEventHandles[p2pSubSVIndex]));
-            remoteEvents.push_back(eventP2P);
             subSVIndicesP2P.push_back(p2pSubSVIndex);
         }
-        // set p2p sub state vectors
-        ERRCHK(custatevecSVSwapWorkerSetSubSVsP2P(
-                   handle, svSegSwapWorker,
-                   d_subSVsP2P.data(), subSVIndicesP2P.data(), remoteEvents.data(),
-                   static_cast<int>(d_subSVsP2P.size())));
+        if (useCudaIpcEvent)
+        {
+            // distribute event handles
+            cudaIpcEventHandle_t eventHandle;
+            ERRCHK(cudaIpcGetEventHandle(&eventHandle, localEvent));
+            std::vector<cudaIpcEventHandle_t> ipcEventHandles(nSubSVs);
+            ERRCHK(MPI_Allgather(&eventHandle, sizeof(eventHandle), MPI_UINT8_T,
+                                 ipcEventHandles.data(), sizeof(eventHandle), MPI_UINT8_T, MPI_COMM_WORLD));
+
+            for (int p2pSubSVIndex = p2pSubSVIndexBegin; p2pSubSVIndex < p2pSubSVIndexEnd; ++p2pSubSVIndex)
+            {
+                if (orgSubSVIndex == p2pSubSVIndex)
+                    continue;  // don't need local sub state vector pointer
+                cudaEvent_t eventP2P = nullptr;
+                ERRCHK(cudaIpcOpenEventHandle(&eventP2P, ipcEventHandles[p2pSubSVIndex]));
+                remoteEvents.push_back(eventP2P);
+            }
+            // set p2p sub state vectors
+            ERRCHK(custatevecSVSwapWorkerSetSubSVsP2P(
+                       handle, svSegSwapWorker,
+                       d_subSVsP2P.data(), subSVIndicesP2P.data(), remoteEvents.data(),
+                       static_cast<int>(d_subSVsP2P.size())));
+        }
+        else
+        {
+            // distribute semaphore memory handles
+            cudaIpcMemHandle_t ipcSemaphoreMemHandle;
+            ERRCHK(cudaIpcGetMemHandle(&ipcSemaphoreMemHandle, d_localSemaphore));
+            std::vector<cudaIpcMemHandle_t> ipcSemaphoreMemHandles(nSubSVs);
+            ERRCHK(MPI_Allgather(
+                       &ipcSemaphoreMemHandle, sizeof(ipcSemaphoreMemHandle), MPI_UINT8_T,
+                       ipcSemaphoreMemHandles.data(), sizeof(ipcSemaphoreMemHandle), MPI_UINT8_T,
+                       MPI_COMM_WORLD));
+
+            for (int p2pSubSVIndex = p2pSubSVIndexBegin; p2pSubSVIndex < p2pSubSVIndexEnd; ++p2pSubSVIndex)
+            {
+                if (orgSubSVIndex == p2pSubSVIndex)
+                    continue;  // don't need local sub state vector pointer
+                void* d_semaphoreP2P = nullptr;
+                const auto& dstSemaphoreMemHandle = ipcSemaphoreMemHandles[p2pSubSVIndex];
+                ERRCHK(cudaIpcOpenMemHandle(&d_semaphoreP2P, dstSemaphoreMemHandle, cudaIpcMemLazyEnablePeerAccess));
+                remoteSemaphores.push_back(d_semaphoreP2P);
+            }
+            // set p2p sub state vectors
+            ERRCHK(custatevecSVSwapWorkerSetSubSVsP2PWithSemaphores(
+                       handle, svSegSwapWorker,
+                       d_subSVsP2P.data(), subSVIndicesP2P.data(), remoteSemaphores.data(),
+                       static_cast<int>(remoteSemaphores.size())));
+        }
     }
 
     //
@@ -377,8 +432,18 @@ void runDistributedIndexBitSwaps(
         ERRCHK(cudaIpcCloseMemHandle(d_subSV));
     for (auto event : remoteEvents)
         ERRCHK(cudaEventDestroy(event));
+    for (auto* semaphore : remoteSemaphores)
+        ERRCHK(cudaIpcCloseMemHandle(semaphore));
+
+    // ensure all remote resources are released
+    ERRCHK(cudaDeviceSynchronize());
+    ERRCHK(MPI_Barrier(MPI_COMM_WORLD));
+
     ERRCHK(cudaFree(d_orgSubSV));
-    ERRCHK(cudaEventDestroy(localEvent));
+    if (localEvent != nullptr)
+        ERRCHK(cudaEventDestroy(localEvent));
+    if (d_localSemaphore != nullptr)
+        ERRCHK(cudaFree(d_localSemaphore));
     ERRCHK(cudaStreamDestroy(localStream));
 }
diff --git a/samples/cutensornet/README.md b/samples/cutensornet/README.md
index d6b1945..1994148 100644
--- a/samples/cutensornet/README.md
+++ b/samples/cutensornet/README.md
@@ -66,7 +66,7 @@ The cuTENSOR library path would depend on the CUDA major version.
Please refer t ## Prerequisites * [CUDA Toolkit 11.x](https://developer.nvidia.com/cuda-downloads) and compatible driver r450+ (see [CUDA Driver Release Notes](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions)). -* cuTENSOR 1.5.0+. +* cuTENSOR 2.0.2+. * GNU OpenMP (GOMP) runtime. * CMake 3.17+ if using `cmake`.