diff --git a/.github/workflows/config/spelling_allowlist.txt b/.github/workflows/config/spelling_allowlist.txt index f8fa9f09d92..e5d7558dcc5 100644 --- a/.github/workflows/config/spelling_allowlist.txt +++ b/.github/workflows/config/spelling_allowlist.txt @@ -191,6 +191,8 @@ couplers cuQuantum cuTensor cudaq +dataclass +dataclasses dataflow ddots deallocate diff --git a/include/cudaq/Optimizer/Dialect/CC/CCTypes.h b/include/cudaq/Optimizer/Dialect/CC/CCTypes.h index 3e7a17f36b7..33d476e3305 100644 --- a/include/cudaq/Optimizer/Dialect/CC/CCTypes.h +++ b/include/cudaq/Optimizer/Dialect/CC/CCTypes.h @@ -38,10 +38,16 @@ inline bool SpanLikeType::classof(mlir::Type type) { return mlir::isa(type); } -/// Return true if and only if \p ty has dynamic extent. This is a recursive +/// Returns true if and only if \p ty has dynamic extent. This is a recursive /// test on composable types. bool isDynamicType(mlir::Type ty); +/// Returns true if and only if the memory needed to store a value of type +/// \p ty is not known at compile time. This is a recursive test on composable +/// types. In contrast to `isDynamicType`, the size of the type is statically +/// known even if it contains pointers that may point to memory of dynamic size. +bool isDynamicallySizedType(mlir::Type ty); + /// Determine the number of hidden arguments, which is 0, 1, or 2. inline unsigned numberOfHiddenArgs(bool thisPtr, bool sret) { return (thisPtr ? 1 : 0) + (sret ? 1 : 0); diff --git a/lib/Frontend/nvqpp/ConvertStmt.cpp b/lib/Frontend/nvqpp/ConvertStmt.cpp index 964a2169c16..7be52808877 100644 --- a/lib/Frontend/nvqpp/ConvertStmt.cpp +++ b/lib/Frontend/nvqpp/ConvertStmt.cpp @@ -359,6 +359,9 @@ bool QuakeBridgeVisitor::VisitReturnStmt(clang::ReturnStmt *x) { if (!cudaq::cc::isDynamicType(eleTy)) tySize = irb.getByteSizeOfType(loc, eleTy); if (!tySize) { + // TODO: we need to recursively create copies of all + // dynamic memory used within the type. See the + // implementation of `visit_Return` in the Python bridge. TODO_x(toLocation(x), x, mangler, "unhandled vector element type"); return false; } diff --git a/lib/Optimizer/CodeGen/CCToLLVM.cpp b/lib/Optimizer/CodeGen/CCToLLVM.cpp index edda47668cf..4304ce48992 100644 --- a/lib/Optimizer/CodeGen/CCToLLVM.cpp +++ b/lib/Optimizer/CodeGen/CCToLLVM.cpp @@ -524,8 +524,8 @@ class SizeOfOpPattern : public ConvertOpToLLVMPattern { ConversionPatternRewriter &rewriter) const override { auto inputTy = sizeOfOp.getInputType(); auto resultTy = sizeOfOp.getType(); - if (quake::isQuakeType(inputTy) || cudaq::cc::isDynamicType(inputTy)) { - // Types that cannot be reified produce the poison op. + if (quake::isQuakeType(inputTy) || + cudaq::cc::isDynamicallySizedType(inputTy)) { rewriter.replaceOpWithNewOp(sizeOfOp, resultTy); return success(); } diff --git a/lib/Optimizer/Dialect/CC/CCTypes.cpp b/lib/Optimizer/Dialect/CC/CCTypes.cpp index 51271cdb582..6d40a9ceea4 100644 --- a/lib/Optimizer/Dialect/CC/CCTypes.cpp +++ b/lib/Optimizer/Dialect/CC/CCTypes.cpp @@ -213,6 +213,22 @@ bool isDynamicType(Type ty) { return false; } +bool isDynamicallySizedType(Type ty) { + if (isa(ty)) + return false; + if (auto strTy = dyn_cast(ty)) { + for (auto memTy : strTy.getMembers()) + if (isDynamicallySizedType(memTy)) + return true; + return false; + } + if (auto arrTy = dyn_cast(ty)) + return arrTy.isUnknownSize() || + isDynamicallySizedType(arrTy.getElementType()); + // Note: this isn't considering quake, builtin, etc. types. 
+ return false; +} + CallableType CallableType::getNoSignature(MLIRContext *ctx) { return CallableType::get(ctx, FunctionType::get(ctx, {}, {})); } diff --git a/python/cudaq/kernel/analysis.py b/python/cudaq/kernel/analysis.py index 8528a37ef0d..151dff05076 100644 --- a/python/cudaq/kernel/analysis.py +++ b/python/cudaq/kernel/analysis.py @@ -53,68 +53,20 @@ def visit_FunctionDef(self, node): cc.CallableType.getFunctionType(callableTy)): self.depKernels[k] = globalAstRegistry[k] - self.generic_visit(node) - - def visit_Call(self, node): - """ - Here we look for function calls within this kernel. We will - add these to dependent kernels dictionary. We will also look for - kernels that are passed to control and adjoint. - """ - if hasattr(node, 'func'): - if isinstance(node.func, - ast.Name) and node.func.id in globalAstRegistry: - self.depKernels[node.func.id] = globalAstRegistry[node.func.id] - elif isinstance(node.func, ast.Attribute): - if hasattr( - node.func.value, 'id' - ) and node.func.value.id == 'cudaq' and node.func.attr == 'kernel': - return - # May need to somehow import a library kernel, find - # all module names in a mod1.mod2.mod3.function type call - moduleNames = [] - value = node.func.value - while isinstance(value, ast.Attribute): - moduleNames.append(value.attr) - value = value.value - if isinstance(value, ast.Name): - moduleNames.append(value.id) - break - - if all(x in moduleNames for x in ['cudaq', 'dbg', 'ast']): - return - - if len(moduleNames): - moduleNames.reverse() - if cudaq_runtime.isRegisteredDeviceModule( - '.'.join(moduleNames)): - return - - # This will throw if the function / module is invalid - try: - m = importlib.import_module('.'.join(moduleNames)) - except: - return - - getattr(m, node.func.attr) - name = node.func.attr - - if name not in globalAstRegistry: - raise RuntimeError( - f"{name} is not a valid kernel to call ({'.'.join(moduleNames)}). Registry: {globalAstRegistry}" - ) - - self.depKernels[name] = globalAstRegistry[name] - - elif hasattr(node.func, - 'attr') and node.func.attr in globalAstRegistry: - self.depKernels[node.func.attr] = globalAstRegistry[ - node.func.attr] - elif node.func.value.id == 'cudaq' and node.func.attr in [ - 'control', 'adjoint' - ] and node.args[0].id in globalAstRegistry: - self.depKernels[node.args[0].id] = globalAstRegistry[ - node.args[0].id] + [self.visit(stm) for stm in node.body] + + def visit_Attribute(self, node): + if not self.kernelName: + return + if node.attr in globalAstRegistry: + self.depKernels[node.attr] = globalAstRegistry[node.attr] + self.visit(node.value) + + def visit_Name(self, node): + if not self.kernelName: + return + if node.id in globalAstRegistry: + self.depKernels[node.id] = globalAstRegistry[node.id] class HasReturnNodeVisitor(ast.NodeVisitor): diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py index 0af3d978a5c..f92f8204cd3 100644 --- a/python/cudaq/kernel/ast_bridge.py +++ b/python/cudaq/kernel/ast_bridge.py @@ -7,7 +7,6 @@ # ============================================================================ # import ast -import inspect import importlib import graphlib import textwrap @@ -106,6 +105,95 @@ def __init__(self, *args, **kwargs): RuntimeError.__init__(self, *args, **kwargs) +class PyStack(object): + ''' + Takes care of managing values produced while vising Python + AST nodes. Each visit to a node is expected to match one + stack frame. Values produced (meaning pushed) by child frames + are accessible (meaning can be popped) by the parent. 
A frame + cannot access the value it produced (it is owned by the parent). + ''' + + class Frame(object): + + def __init__(self, parent=None): + self.entries = None + self.parent = parent + + def __init__(self, error_handler=None): + + def default_error_handler(msg): + raise RuntimeError(msg) + + self._frame = None + self.emitError = error_handler or default_error_handler + + def pushFrame(self): + ''' + A new frame should be pushed to process a new node in the AST. + ''' + if self._frame and not self._frame.entries: + self._frame.entries = deque() + self._frame = PyStack.Frame(parent=self._frame) + + def popFrame(self): + ''' + A frame should be popped once a node in the AST has been processed. + ''' + if not self._frame: + self.emitError("stack has no frames to pop") + elif self._frame.entries: + self.emitError( + "all values must be processed before popping a frame") + else: + self._frame = self._frame.parent + + def pushValue(self, value): + ''' + Pushes a value to the make it available to the parent frame. + ''' + if not self._frame: + self.emitError("cannot push value to empty stack") + elif not self._frame.parent: + self.emitError("no parent frame is defined to push values to") + else: + self._frame.parent.entries.append(value) + + def popValue(self): + ''' + Pops the most recently produced (pushed) value by a child frame. + ''' + if not self._frame: + self.emitError("value stack is empty") + elif not self._frame.entries: + # This is the only error that may be directly user-facing even when + # the bridge is doing its processing correctly. + # We hence give a somewhat general error. + # For internal purposes, the error might be better stated as something like: + # either this frame has not had a child or the child did not produce any values + self.emitError("no valid value was created") + else: + return self._frame.entries.pop() + + @property + def isEmpty(self): + ''' + Returns true if and only if there are no remaining stack frames. + ''' + return not self._frame + + @property + def currentNumValues(self): + ''' + Returns the number of values that are accessible for processing by the current frame. + ''' + if not self._frame: + self.emitError("no frame defined for empty stack") + elif self._frame.entries: + return len(self._frame.entries) + return 0 + + class PyASTBridge(ast.NodeVisitor): """ The `PyASTBridge` class implements the `ast.NodeVisitor` type to convert a @@ -125,7 +213,8 @@ def __init__(self, capturedDataStorage: CapturedDataStorage, **kwargs): track of a symbol table, which maps variable names to constructed `mlir.Values`. 
""" - self.valueStack = deque() + self.valueStack = PyStack(lambda msg: self.emitFatalError( + f'processing error - {msg}', self.currentNode)) self.knownResultType = kwargs[ 'knownResultType'] if 'knownResultType' in kwargs else None if 'existingModule' in kwargs: @@ -182,8 +271,8 @@ def __init__(self, capturedDataStorage: CapturedDataStorage, **kwargs): self.currentAssignVariableName = None self.walkingReturnNode = False self.controlNegations = [] - self.subscriptPushPointerValue = False - self.attributePushPointerValue = False + self.pushPointerValue = False + self.isSubscriptRoot = False self.verbose = 'verbose' in kwargs and kwargs['verbose'] self.currentNode = None @@ -191,9 +280,13 @@ def debug_msg(self, msg, node=None): if self.verbose: print(f'{self.indent * self.indent_level}{msg()}') if node is not None: - print( - textwrap.indent(ast.unparse(node), - (self.indent * (self.indent_level + 1)))) + try: + print( + textwrap.indent(ast.unparse(node), + (self.indent * + (self.indent_level + 1)))) + except: + pass def emitWarning(self, msg, astNode=None): """ @@ -222,15 +315,19 @@ def emitFatalError(self, msg, astNode=None): codeFile = os.path.basename(self.locationOffset[0]) if astNode == None: astNode = self.currentNode - lineNumber = '' if astNode == None else astNode.lineno + self.locationOffset[ - 1] - 1 + lineNumber = '' if astNode == None or not hasattr( + astNode, 'lineno') else astNode.lineno + self.locationOffset[1] - 1 + + try: + offending_source = "\n\t (offending source -> " + ast.unparse( + astNode) + ")" + except: + offending_source = '' print(Color.BOLD, end='') msg = codeFile + ":" + str( lineNumber - ) + ": " + Color.RED + "error: " + Color.END + Color.BOLD + msg + ( - "\n\t (offending source -> " + ast.unparse(astNode) + ")" if - hasattr(ast, 'unparse') and astNode is not None else '') + Color.END + ) + ": " + Color.RED + "error: " + Color.END + Color.BOLD + msg + offending_source + Color.END raise CompilerError(msg) def getVeqType(self, size=None): @@ -268,6 +365,23 @@ def isMeasureResultType(self, ty, value): return False return IntegerType.isinstance(ty) and ty == IntegerType.get_signless(1) + def isFunctionArgument(self, value): + return (BlockArgument.isinstance(value) and + isinstance(value.owner.owner, func.FuncOp)) + + def containsList(self, ty, innerListsOnly=False): + """ + Returns true if the give type is a vector or contains + items that are vectors. + """ + if cc.StdvecType.isinstance(ty): + return (not innerListsOnly or + self.containsList(cc.StdvecType.getElementType(ty))) + if not cc.StructType.isinstance(ty): + return False + eleTys = cc.StructType.getTypes(ty) + return any((self.containsList(t) for t in eleTys)) + def getIntegerType(self, width=64): """ Return an MLIR `IntegerType` of the given bit width (defaults to 64 @@ -366,6 +480,29 @@ def getConstantInt(self, value, width=64): ty = self.getIntegerType(width) return arith.ConstantOp(ty, self.getIntegerAttr(ty, value)).result + def __arithmetic_to_bool(self, value): + """ + Converts an integer or floating point value to a bool by + comparing it to zero. 
+ """ + if self.getIntegerType(1) == value.type: + return value + if IntegerType.isinstance(value.type): + zero = self.getConstantInt(0, width=IntegerType(value.type).width) + condPred = IntegerAttr.get(self.getIntegerType(), 1) + return arith.CmpIOp(condPred, value, zero).result + elif F32Type.isinstance(value.type): + zero = self.getConstantFloat(0, width=32) + condPred = IntegerAttr.get(self.getIntegerType(), 13) + return arith.CmpFOp(condPred, value, zero).result + elif F64Type.isinstance(value.type): + zero = self.getConstantFloat(0, width=64) + condPred = IntegerAttr.get(self.getIntegerType(), 13) + return arith.CmpFOp(condPred, value, zero).result + else: + self.emitFatalError("value cannot be converted to bool", + self.currentNode) + def changeOperandToType(self, ty, operand, allowDemotion=False): """ Change the type of an operand to a specified type. This function primarily @@ -375,6 +512,13 @@ def changeOperandToType(self, ty, operand, allowDemotion=False): """ if ty == operand.type: return operand + if cc.CallableType.isinstance(ty): + fctTy = cc.CallableType.getFunctionType(ty) + if fctTy == operand.type: + return operand + self.emitFatalError( + f'cannot convert value of type {operand.type} to the requested type {fctTy}', + self.currentNode) if ComplexType.isinstance(ty): complexType = ComplexType(ty) @@ -385,21 +529,46 @@ def changeOperandToType(self, ty, operand, allowDemotion=False): if (floatType != otherFloatType): real = self.changeOperandToType( floatType, - complex.ReOp(operand).result) + complex.ReOp(operand).result, + allowDemotion=allowDemotion) imag = self.changeOperandToType( floatType, - complex.ImOp(operand).result) + complex.ImOp(operand).result, + allowDemotion=allowDemotion) return complex.CreateOp(complexType, real, imag).result else: - real = self.changeOperandToType(floatType, operand) + real = self.changeOperandToType(floatType, + operand, + allowDemotion=allowDemotion) imag = self.getConstantFloatWithType(0.0, floatType) return complex.CreateOp(complexType, real, imag).result if (cc.StdvecType.isinstance(ty)): - eleTy = cc.StdvecType.getElementType(ty) if cc.StdvecType.isinstance(operand.type): - return self.__copyVectorAndCastElements( - operand, eleTy, allowDemotion=allowDemotion) + eleTy = cc.StdvecType.getElementType(ty) + return self.__copyVectorAndConvertElements( + operand, + eleTy, + allowDemotion=allowDemotion, + alwaysCopy=False) + + if (cc.StructType.isinstance(ty)): + if cc.StructType.isinstance(operand.type): + expectedEleTys = cc.StructType.getTypes(ty) + currentEleTys = cc.StructType.getTypes(operand.type) + if len(expectedEleTys) == len(currentEleTys): + + def conversion(idx, value): + return self.changeOperandToType( + expectedEleTys[idx], + value, + allowDemotion=allowDemotion) + + return self.__copyStructAndConvertElements( + operand, + expectedTy=ty, + allowDemotion=allowDemotion, + conversion=conversion) if F64Type.isinstance(ty): if F32Type.isinstance(operand.type): @@ -427,6 +596,8 @@ def changeOperandToType(self, ty, operand, allowDemotion=False): if requested_width == operand_width: return operand elif requested_width < operand_width: + if requested_width == 1: + return self.__arithmetic_to_bool(operand) return cc.CastOp(ty, operand).result return cc.CastOp(ty, operand, @@ -460,16 +631,26 @@ def pushValue(self, value): visit method. """ self.debug_msg(lambda: f'push {value}') - self.valueStack.append(value) + self.valueStack.pushValue(value) def popValue(self): """ Pop an MLIR Value from the stack. 
""" - val = self.valueStack.pop() + val = self.valueStack.popValue() self.debug_msg(lambda: f'pop {val}') return val + def popAllValues(self, expectedNumVals): + values = [ + self.popValue() for _ in range(self.valueStack.currentNumValues) + ] + if len(values) != expectedNumVals: + self.emitFatalError( + "processing error - expression did not produce a valid value in this context", + self.currentNode) + return values + def pushForBodyStack(self, bodyBlockArgs): """ Indicate that we are entering a for loop body block. @@ -559,38 +740,16 @@ def __isUnitaryGate(self, id): id in ['swap', 'u3', 'exp_pauli'] or id in globalRegisteredOperations) - def ifPointerThenLoad(self, value): - """ - If the given value is of pointer type, load the pointer and return that - new value. - """ - if cc.PointerType.isinstance(value.type): - return cc.LoadOp(value).result - return value - - def ifNotPointerThenStore(self, value): - """ - If the given value is not of a pointer type, allocate a slot on the - stack, store the the value in the slot, and return the slot address. - """ - if not cc.PointerType.isinstance(value.type): - slot = cc.AllocaOp(cc.PointerType.get(value.type), - TypeAttr.get(value.type)).result - cc.StoreOp(value, slot) - return slot - return value + def __createStdvecWithKnownValues(self, listElementValues): - def __createStdvecWithKnownValues(self, size, listElementValues): - # Turn this List into a StdVec - arrSize = self.getConstantInt(size) + assert (len(set((v.type for v in listElementValues))) == 1) + arrSize = self.getConstantInt(len(listElementValues)) elemTy = listElementValues[0].type # If this is an `i1`, turns it into an `i8` array. isBool = elemTy == self.getIntegerType(1) if isBool: elemTy = self.getIntegerType(8) - - arrTy = cc.ArrayType.get(elemTy) - alloca = cc.AllocaOp(cc.PointerType.get(arrTy), + alloca = cc.AllocaOp(cc.PointerType.get(cc.ArrayType.get(elemTy)), TypeAttr.get(elemTy), seqSize=arrSize).result @@ -604,15 +763,10 @@ def __createStdvecWithKnownValues(self, size, listElementValues): v = self.changeOperandToType(self.getIntegerType(8), v) cc.StoreOp(v, eleAddr) - # Create the `StdVec` from the `alloca` - # We still use `i1` as the vector element type if the - # original list was of `bool` type. - vecTy = elemTy if not isBool else self.getIntegerType(1) - if cc.PointerType.isinstance(vecTy): - vecTy = cc.PointerType.getElementType(vecTy) - - return cc.StdvecInitOp(cc.StdvecType.get(vecTy), alloca, - length=arrSize).result + # We still use `i1` as the vector element type for `cc.StdvecInitOp`. + vecTy = cc.StdvecType.get(elemTy) if not isBool else cc.StdvecType.get( + self.getIntegerType(1)) + return cc.StdvecInitOp(vecTy, alloca, length=arrSize).result def getStructMemberIdx(self, memberName, structTy): """ @@ -624,6 +778,8 @@ def getStructMemberIdx(self, memberName, structTy): else: structName = quake.StruqType.getName(structTy) structIdx = None + if structName == 'tuple': + self.emitFatalError('`tuple` does not support attribute access') if not globalRegisteredTypes.isRegisteredClass(structName): self.emitFatalError(f'Dataclass is not registered: {structName})') @@ -638,79 +794,185 @@ def getStructMemberIdx(self, memberName, structTy): ) return structIdx, mlirTypeFromPyType(userType[memberName], self.ctx) - # Create a new vector with source elements converted to the target element type if needed. 
- def __copyVectorAndCastElements(self, - source, - targetEleType, - allowDemotion=False): - if not cc.PointerType.isinstance(source.type): - if cc.StdvecType.isinstance(source.type): - # Exit early if no copy is needed to avoid an unneeded store. - sourceEleType = cc.StdvecType.getElementType(source.type) - if (sourceEleType == targetEleType): - return source - - sourcePtr = source - if not cc.PointerType.isinstance(sourcePtr.type): - sourcePtr = self.ifNotPointerThenStore(sourcePtr) - - sourceType = cc.PointerType.getElementType(sourcePtr.type) - if not cc.StdvecType.isinstance(sourceType): - raise RuntimeError( - f"expected vector type to copy and cast elements but received {sourceType}" - ) - - sourceEleType = cc.StdvecType.getElementType(sourceType) - if (sourceEleType == targetEleType): - return sourcePtr + def __copyStructAndConvertElements(self, + struct, + expectedTy=None, + allowDemotion=False, + conversion=None): + """ + Creates a new struct on the stack. If a conversion is provided, applies the conversion on each + element before changing its type to match the corresponding element type in `expectedTy`. + """ + assert cc.StructType.isinstance(struct.type) + if not expectedTy: + expectedTy = struct.type + assert cc.StructType.isinstance(expectedTy) + eleTys = cc.StructType.getTypes(struct.type) + expectedEleTys = cc.StructType.getTypes(expectedTy) + assert len(eleTys) == len(expectedEleTys) + + returnVal = cc.UndefOp(expectedTy) + for idx, eleTy in enumerate(eleTys): + element = cc.ExtractValueOp( + eleTy, struct, [], + DenseI32ArrayAttr.get([idx], context=self.ctx)).result + element = conversion(idx, element) if conversion else element + element = self.changeOperandToType(expectedEleTys[idx], + element, + allowDemotion=allowDemotion) + returnVal = cc.InsertValueOp( + expectedTy, returnVal, element, + DenseI64ArrayAttr.get([idx], context=self.ctx)).result + return returnVal + # Create a new vector with source elements converted to the target element type if needed. + def __copyVectorAndConvertElements(self, + source, + targetEleType=None, + allowDemotion=False, + alwaysCopy=False, + conversion=None): + ''' + Creates a new vector with the requested element type. + Returns the original vector if the requested element type already matches + the current element type unless `alwaysCopy` is set to True. + If a conversion is provided, applies the conversion to each element before + changing its type to match the `targetEleType`. + If `alwaysCopy` is set to True, return a shallow copy of the vector by + default (conversion can be used to create a deep copy). 
+ ''' + + assert cc.StdvecType.isinstance(source.type) + sourceEleType = cc.StdvecType.getElementType(source.type) + if not targetEleType: + targetEleType = sourceEleType + if not alwaysCopy and sourceEleType == targetEleType: + return source isSourceBool = sourceEleType == self.getIntegerType(1) if isSourceBool: sourceEleType = self.getIntegerType(8) - - sourceArrType = cc.ArrayType.get(sourceEleType) - sourceElePtrTy = cc.PointerType.get(sourceEleType) - sourceArrElePtrTy = cc.PointerType.get(sourceArrType) - sourceValue = self.ifPointerThenLoad(sourcePtr) - sourceDataPtr = cc.StdvecDataOp(sourceArrElePtrTy, sourceValue).result - sourceSize = cc.StdvecSizeOp(self.getIntegerType(), sourceValue).result - isTargetBool = targetEleType == self.getIntegerType(1) - # Vector type reflects the true type, including `i1` - targetVecTy = cc.StdvecType.get(targetEleType) - if isTargetBool: targetEleType = self.getIntegerType(8) - targetElePtrType = cc.PointerType.get(targetEleType) - targetTy = cc.ArrayType.get(targetEleType) - targetArrElePtrTy = cc.PointerType.get(targetTy) - targetPtr = cc.AllocaOp(targetArrElePtrTy, + sourceArrPtrTy = cc.PointerType.get(cc.ArrayType.get(sourceEleType)) + sourceDataPtr = cc.StdvecDataOp(sourceArrPtrTy, source).result + sourceSize = cc.StdvecSizeOp(self.getIntegerType(), source).result + targetPtr = cc.AllocaOp(cc.PointerType.get( + cc.ArrayType.get(targetEleType)), TypeAttr.get(targetEleType), seqSize=sourceSize).result rawIndex = DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx) def bodyBuilder(iterVar): - eleAddr = cc.ComputePtrOp(sourceElePtrTy, sourceDataPtr, [iterVar], - rawIndex).result + eleAddr = cc.ComputePtrOp(cc.PointerType.get(sourceEleType), + sourceDataPtr, [iterVar], rawIndex).result loadedEle = cc.LoadOp(eleAddr).result - castedEle = self.changeOperandToType(targetEleType, - loadedEle, - allowDemotion=allowDemotion) - targetEleAddr = cc.ComputePtrOp(targetElePtrType, targetPtr, - [iterVar], rawIndex).result - cc.StoreOp(castedEle, targetEleAddr) + convertedEle = conversion(iterVar, + loadedEle) if conversion else loadedEle + convertedEle = self.changeOperandToType(targetEleType, + convertedEle, + allowDemotion=allowDemotion) + targetEleAddr = cc.ComputePtrOp(cc.PointerType.get(targetEleType), + targetPtr, [iterVar], + rawIndex).result + cc.StoreOp(convertedEle, targetEleAddr) + + self.createInvariantForLoop(bodyBuilder, sourceSize) + + # We still use `i1` as the vector element type for `cc.StdvecInitOp`. + vecTy = cc.StdvecType.get( + targetEleType) if not isTargetBool else cc.StdvecType.get( + self.getIntegerType(1)) + return cc.StdvecInitOp(vecTy, targetPtr, length=sourceSize).result + + def __copyAndValidateContainer(self, value, pyVal, deepCopy, dataType=None): + """ + Helper function to implement deep and shallow copies for structs and vectors. + Arguments: + `value`: The MLIR value to copy + `pyVal`: The Python AST node to use for validation of the container entries. + `deepCopy`: Whether to perform a deep or shallow copy. + `dataType`: Must be None unless the value to copy is a vector. + If the value is a vector, then the element type of the new vector. + """ + # NOTE: Creating a copy means we are creating a new container. + # As such, all elements in the container need to pass the validation + # in `__validate_container_entry`. 
+ if deepCopy: + + def conversion(idx, structItem): + if cc.StdvecType.isinstance(structItem.type): + structItem = self.__copyVectorAndConvertElements( + structItem, alwaysCopy=True, conversion=conversion) + elif (cc.StructType.isinstance(structItem.type) and + self.containsList(structItem.type)): + structItem = self.__copyStructAndConvertElements( + structItem, conversion=conversion) + self.__validate_container_entry(structItem, pyVal) + return structItem + else: - self.createInvariantForLoop(sourceSize, bodyBuilder) - return cc.StdvecInitOp(targetVecTy, targetPtr, length=sourceSize).result + def conversion(idx, structItem): + self.__validate_container_entry(structItem, pyVal) + return structItem + + if cc.StdvecType.isinstance(value.type): + listVal = self.__copyVectorAndConvertElements(value, + dataType, + alwaysCopy=True, + conversion=conversion) + return listVal + + if cc.StructType.isinstance(value.type): + if dataType: + self.emitFatalError("unsupported data type argument", + self.currentNode) + struct = self.__copyStructAndConvertElements(value, + conversion=conversion) + return struct + + self.emitFatalError( + f'copy is not supported on value of type {value.type}', + self.currentNode) + + def __migrateLists(self, value, migrate): + """ + Replaces all lists in the given value by the list returned + by the `migrate` function, including inner lists. Does an + in-place replacement for list elements. + """ + if cc.StdvecType.isinstance(value.type): + eleTy = cc.StdvecType.getElementType(value.type) + if self.containsList(eleTy): + size = cc.StdvecSizeOp(self.getIntegerType(), value).result + ptrTy = cc.PointerType.get(cc.ArrayType.get(eleTy)) + iterable = cc.StdvecDataOp(ptrTy, value).result + + def bodyBuilder(iterVar): + eleAddr = cc.ComputePtrOp( + cc.PointerType.get(eleTy), iterable, [iterVar], + DenseI32ArrayAttr.get([kDynamicPtrIndex], + context=self.ctx)) + loadedEle = cc.LoadOp(eleAddr).result + element = self.__migrateLists(loadedEle, migrate) + cc.StoreOp(element, eleAddr) + + self.createInvariantForLoop(bodyBuilder, size) + return migrate(value) + if (cc.StructType.isinstance(value.type) and + self.containsList(value.type)): + return self.__copyStructAndConvertElements( + value, conversion=lambda _, v: self.__migrateLists(v, migrate)) + assert not self.containsList(value.type) + return value def __insertDbgStmt(self, value, dbgStmt): """ Insert a debug print out statement if the programmer requested. Handles statements like `cudaq.dbg.ast.print_i64(i)`. """ - value = self.ifPointerThenLoad(value) printFunc = None printStr = '[cudaq-ast-dbg] ' argsTy = [cc.PointerType.get(self.getIntegerType(8))] @@ -765,25 +1027,6 @@ def __insertDbgStmt(self, value, dbgStmt): func.CallOp(printFunc, [strLit, value]) return - def __get_vector_size(self, vector): - """ - Get the size of a vector or array type. - - Args: - vector: MLIR Value of vector/array type - - Returns: - MLIR Value containing the size as an integer - """ - if cc.StdvecType.isinstance(vector.type): - return cc.StdvecSizeOp(self.getIntegerType(), vector).result - elif cc.ArrayType.isinstance(vector.type): - return self.getConstantInt( - cc.ArrayType.getSize(cc.PointerType.getElementType( - vector.type))) - self.emitFatalError("cannot get the size for a value of type {}".format( - vector.type)) - def __load_vector_element(self, vector, index): """ Load an element from a vector or array at the given index. 
@@ -886,83 +1129,84 @@ def mlirTypeFromAnnotation(self, annotation): if msg is not None: self.emitFatalError(msg, annotation) - def createInvariantForLoop(self, - endVal, - bodyBuilder, - startVal=None, - stepVal=None, - isDecrementing=False, - elseStmts=None): - """ - Create an invariant loop using the CC dialect. - """ - startVal = self.getConstantInt(0) if startVal == None else startVal - stepVal = self.getConstantInt(1) if stepVal == None else stepVal + def createForLoop(self, + argTypes, + bodyBuilder, + inputs, + evalCond, + evalStep, + orElseBuilder=None): - iTy = self.getIntegerType() - inputs = [startVal] - resultTys = [iTy] - - loop = cc.LoopOp(resultTys, inputs, BoolAttr.get(False)) + # post-conditional would be a do-while loop + isPostConditional = BoolAttr.get(False) + loop = cc.LoopOp(argTypes, inputs, isPostConditional) - whileBlock = Block.create_at_start(loop.whileRegion, [iTy]) + whileBlock = Block.create_at_start(loop.whileRegion, argTypes) with InsertionPoint(whileBlock): - condPred = IntegerAttr.get( - iTy, 2) if not isDecrementing else IntegerAttr.get(iTy, 4) - cc.ConditionOp( - arith.CmpIOp(condPred, whileBlock.arguments[0], endVal).result, - whileBlock.arguments) + condVal = evalCond(whileBlock.arguments) + cc.ConditionOp(condVal, whileBlock.arguments) - bodyBlock = Block.create_at_start(loop.bodyRegion, [iTy]) + bodyBlock = Block.create_at_start(loop.bodyRegion, argTypes) with InsertionPoint(bodyBlock): self.symbolTable.pushScope() self.pushForBodyStack(bodyBlock.arguments) - bodyBuilder(bodyBlock.arguments[0]) + bodyBuilder(bodyBlock.arguments) if not self.hasTerminator(bodyBlock): cc.ContinueOp(bodyBlock.arguments) self.popForBodyStack() self.symbolTable.popScope() - stepBlock = Block.create_at_start(loop.stepRegion, [iTy]) + stepBlock = Block.create_at_start(loop.stepRegion, argTypes) with InsertionPoint(stepBlock): - incr = arith.AddIOp(stepBlock.arguments[0], stepVal).result - cc.ContinueOp([incr]) + stepVals = evalStep(stepBlock.arguments) + cc.ContinueOp(stepVals) - if elseStmts: - elseBlock = Block.create_at_start(loop.elseRegion, [iTy]) + if orElseBuilder: + elseBlock = Block.create_at_start(loop.elseRegion, argTypes) with InsertionPoint(elseBlock): self.symbolTable.pushScope() - for stmt in elseStmts: - self.visit(stmt) + orElseBuilder(elseBlock.arguments) if not self.hasTerminator(elseBlock): cc.ContinueOp(elseBlock.arguments) self.symbolTable.popScope() - loop.attributes.__setitem__('invariant', UnitAttr.get()) - return + return loop - def __applyQuantumOperation(self, opName, parameters, targets): - opCtor = getattr(quake, '{}Op'.format(opName.title())) - for quantumValue in targets: - if quake.VeqType.isinstance(quantumValue.type): + def createMonotonicForLoop(self, + bodyBuilder, + startVal, + stepVal, + endVal, + isDecrementing=False, + orElseBuilder=None): - def bodyBuilder(iterVal): - q = quake.ExtractRefOp(self.getRefType(), - quantumValue, - -1, - index=iterVal).result - opCtor([], parameters, [], [q]) + iTy = self.getIntegerType() + assert startVal.type == iTy + assert stepVal.type == iTy + assert endVal.type == iTy - veqSize = quake.VeqSizeOp(self.getIntegerType(), - quantumValue).result - self.createInvariantForLoop(veqSize, bodyBuilder) - elif quake.RefType.isinstance(quantumValue.type): - opCtor([], parameters, [], [quantumValue]) - else: - self.emitFatalError( - f'quantum operation {opName} on incorrect quantum type {quantumValue.type}.' 
- ) - return + condPred = IntegerAttr.get( + iTy, 4) if isDecrementing else IntegerAttr.get(iTy, 2) + return self.createForLoop( + [iTy], lambda args: bodyBuilder(args[0]), [startVal], + lambda args: arith.CmpIOp(condPred, args[0], endVal).result, + lambda args: [arith.AddIOp(args[0], stepVal).result], + None if orElseBuilder is None else + (lambda args: orElseBuilder(args[0]))) + + def createInvariantForLoop(self, bodyBuilder, endVal): + """ + Create an invariant loop using the CC dialect. + """ + + startVal = self.getConstantInt(0) + stepVal = self.getConstantInt(1) + + loop = self.createMonotonicForLoop(bodyBuilder, + startVal=startVal, + stepVal=stepVal, + endVal=endVal) + loop.attributes.__setitem__('invariant', UnitAttr.get()) def __deconstructAssignment(self, target, value, process=None): if process is not None: @@ -974,44 +1218,45 @@ def __deconstructAssignment(self, target, value, process=None): if (isinstance(value, ast.Tuple) or isinstance(value, ast.List)): nrArgs = len(value.elts) getItem = lambda idx: value.elts[idx] + elif (isinstance(value, tuple) or isinstance(value, list)): + nrArgs = len(value) + getItem = lambda idx: value[idx] + elif cc.StructType.isinstance(value.type): + argTypes = cc.StructType.getTypes(value.type) + nrArgs = len(argTypes) + getItem = lambda idx: cc.ExtractValueOp( + argTypes[idx], value, [], + DenseI32ArrayAttr.get([idx], context=self.ctx)).result + elif quake.StruqType.isinstance(value.type): + argTypes = quake.StruqType.getTypes(value.type) + nrArgs = len(argTypes) + getItem = lambda idx: quake.GetMemberOp( + argTypes[idx], value, + IntegerAttr.get(self.getIntegerType(32), idx)).result + elif cc.StdvecType.isinstance(value.type): + # We will get a runtime error for out of bounds access + eleTy = cc.StdvecType.getElementType(value.type) + elePtrTy = cc.PointerType.get(eleTy) + arrTy = cc.ArrayType.get(eleTy) + ptrArrTy = cc.PointerType.get(arrTy) + vecPtr = cc.StdvecDataOp(ptrArrTy, value).result + attr = DenseI32ArrayAttr.get([kDynamicPtrIndex], + context=self.ctx) + nrArgs = len(target.elts) + getItem = lambda idx: cc.LoadOp( + cc.ComputePtrOp(elePtrTy, vecPtr, [ + self.getConstantInt(idx) + ], attr).result).result + elif quake.VeqType.isinstance(value.type): + # We will get a runtime error for out of bounds access + nrArgs = len(target.elts) + getItem = lambda idx: quake.ExtractRefOp( + quake.RefType.get(), + value, + -1, + index=self.getConstantInt(idx)).result else: - value = self.ifPointerThenLoad(value) - if cc.StructType.isinstance(value.type): - argTypes = cc.StructType.getTypes(value.type) - nrArgs = len(argTypes) - getItem = lambda idx: cc.ExtractValueOp( - argTypes[idx], value, [], - DenseI32ArrayAttr.get([idx], context=self.ctx)).result - elif quake.StruqType.isinstance(value.type): - argTypes = quake.StruqType.getTypes(value.type) - nrArgs = len(argTypes) - getItem = lambda idx: quake.GetMemberOp( - argTypes[idx], value, - IntegerAttr.get(self.getIntegerType(32), idx)).result - elif cc.StdvecType.isinstance(value.type): - # We will get a runtime error for out of bounds access - eleTy = cc.StdvecType.getElementType(value.type) - elePtrTy = cc.PointerType.get(eleTy) - arrTy = cc.ArrayType.get(eleTy) - ptrArrTy = cc.PointerType.get(arrTy) - vecPtr = cc.StdvecDataOp(ptrArrTy, value).result - attr = DenseI32ArrayAttr.get([kDynamicPtrIndex], - context=self.ctx) - nrArgs = len(target.elts) - getItem = lambda idx: cc.LoadOp( - cc.ComputePtrOp(elePtrTy, vecPtr, [ - self.getConstantInt(idx) - ], attr).result).result - elif 
quake.VeqType.isinstance(value.type): - # We will get a runtime error for out of bounds access - nrArgs = len(target.elts) - getItem = lambda idx: quake.ExtractRefOp( - quake.RefType.get(), - value, - -1, - index=self.getConstantInt(idx)).result - else: - nrArgs = 0 + nrArgs = 0 if nrArgs != len(target.elts): self.emitFatalError("shape mismatch in tuple deconstruction", self.currentNode) @@ -1023,7 +1268,7 @@ def __deconstructAssignment(self, target, value, process=None): self.emitFatalError("unsupported target in tuple deconstruction", self.currentNode) - def __processRangeLoopIterationBounds(self, argumentNodes): + def __processRangeLoopIterationBounds(self, pyVals): """ Analyze `range(...)` bounds and return the start, end, and step values, as well as whether or not this a decrementing range. @@ -1031,135 +1276,225 @@ def __processRangeLoopIterationBounds(self, argumentNodes): iTy = self.getIntegerType(64) zero = arith.ConstantOp(iTy, IntegerAttr.get(iTy, 0)) one = arith.ConstantOp(iTy, IntegerAttr.get(iTy, 1)) + values = self.__groupValues(pyVals, [(1, 3)]) + isDecrementing = False - if len(argumentNodes) == 3: + if len(pyVals) == 3: # Find the step val and we need to know if its decrementing can be # incrementing or decrementing - stepVal = self.popValue() - if isinstance(argumentNodes[2], ast.UnaryOp): - self.debug_msg(lambda: f'[(Inline) Visit UnaryOp]', - argumentNodes[2]) - if isinstance(argumentNodes[2].op, ast.USub): - if isinstance(argumentNodes[2].operand, ast.Constant): - self.debug_msg(lambda: f'[(Inline) Visit Constant]', - argumentNodes[2].operand) - if argumentNodes[2].operand.value > 0: - isDecrementing = True - else: - self.emitFatalError( - 'CUDA-Q requires step value on range() to be a constant.' - ) + stepVal = values[2] + if isinstance(pyVals[2], ast.Constant): + pyStepVal = pyVals[2].value + elif (isinstance(pyVals[2], ast.UnaryOp) and + isinstance(pyVals[2].op, ast.USub) and + isinstance(pyVals[2].operand, ast.Constant)): + pyStepVal = -pyVals[2].operand.value + else: + self.emitFatalError('range step value must be a constant', + self.currentNode) + if pyStepVal == 0: + self.emitFatalError("range step value must be non-zero", + self.currentNode) + isDecrementing = pyStepVal < 0 # exclusive end - endVal = self.popValue() + endVal = values[1] # inclusive start - startVal = self.popValue() + startVal = values[0] - elif len(argumentNodes) == 2: + elif len(pyVals) == 2: stepVal = one - endVal = self.popValue() - startVal = self.popValue() + endVal = values[1] + startVal = values[0] else: stepVal = one - endVal = self.popValue() + endVal = values[0] startVal = zero - startVal = self.ifPointerThenLoad(startVal) - endVal = self.ifPointerThenLoad(endVal) - stepVal = self.ifPointerThenLoad(stepVal) - for idx, v in enumerate([startVal, endVal, stepVal]): if not IntegerType.isinstance(v.type): # matching Python behavior to error on non-integer values - self.emitFatalError( - "non-integer value in range expression", - argumentNodes[idx if len(argumentNodes) > 1 else 0]) + self.emitFatalError("non-integer value in range expression", + pyVals[idx if len(pyVals) > 1 else 0]) return startVal, endVal, stepVal, isDecrementing - def __visitStructAttribute(self, node, structValue): - """ - Handle struct member extraction from either a pointer to struct or - direct struct value. Uses the most efficient approach for each case. 
- """ - if cc.PointerType.isinstance(structValue.type): - # Handle pointer to struct - use ComputePtrOp - eleType = cc.PointerType.getElementType(structValue.type) - if cc.StructType.isinstance(eleType): - structIdx, memberTy = self.getStructMemberIdx( - node.attr, eleType) - eleAddr = cc.ComputePtrOp(cc.PointerType.get(memberTy), - structValue, [], - DenseI32ArrayAttr.get([structIdx - ])).result - - if self.attributePushPointerValue: - self.pushValue(eleAddr) - return - - # Load the value - eleAddr = cc.LoadOp(eleAddr).result - self.pushValue(eleAddr) - return - elif cc.StructType.isinstance(structValue.type): - # Handle direct struct value - use ExtractValueOp (more efficient) - structIdx, memberTy = self.getStructMemberIdx( - node.attr, structValue.type) - extractedValue = cc.ExtractValueOp( - memberTy, structValue, [], - DenseI32ArrayAttr.get([structIdx])).result - - if self.attributePushPointerValue: - # If we need a pointer, we have to create a temporary slot - tempSlot = cc.AllocaOp(cc.PointerType.get(memberTy), - TypeAttr.get(memberTy)).result - cc.StoreOp(extractedValue, tempSlot) - self.pushValue(tempSlot) - return - - self.pushValue(extractedValue) - return + def __groupValues(self, pyvals, groups: list[int | tuple[int, int]]): + ''' + Helper function that visits the given AST nodes (`pyvals`), + and groups them according to the specified list. + The list contains integers or tuples of two integers. + Integer values have to be positive or -1, where -1 + indicates that any number of values is acceptable. + Tuples of two integers (min, max) indicate that any number + of values in [min, max] is acceptable. + The list may only contain at most one negative integer or + tuple (enforced via assert only). + + Emits a fatal error if any of the given `pyvals` did not + generate a value. Emits a fatal error if there are too + many or too few values to satisfy the requested grouping. + + Returns a tuple of value groups. Each value group is + either a single value (if the corresponding entry in `groups` + equals 1), or a list of values. 
+ ''' + + def group_values(numExpected, values, reverse): + groupedVals = [] + current_idx = 0 + for nArgs in numExpected: + if (isinstance(nArgs, int) and nArgs == 1 and + current_idx < len(values)): + groupedVals.append(values[current_idx]) + current_idx += 1 + continue + if isinstance(nArgs, tuple): + minNumArgs, maxNumArgs = nArgs + if minNumArgs == maxNumArgs: + nArgs = minNumArgs + if not isinstance(nArgs, int) or nArgs < 0: + break + if current_idx + nArgs > len(values): + self.emitFatalError("missing value", self.currentNode) + groupedVals.append(values[current_idx:current_idx + nArgs]) + if reverse: + groupedVals[-1].reverse() + current_idx += nArgs + remaining = values[current_idx:] + numExpected = numExpected[len(groupedVals):] + if reverse: + remaining.reverse() + groupedVals.reverse() + numExpected.reverse() + return groupedVals, numExpected, remaining + + [self.visit(arg) for arg in pyvals] + values = self.popAllValues(len(pyvals)) + groups.reverse() + backVals, groups, values = group_values(groups, values, reverse=True) + frontVals, groups, values = group_values(groups, values, reverse=False) + if not groups: + if values: + self.emitFatalError("too many values", self.currentNode) + groupedVals = *frontVals, *backVals else: + assert len(groups) == 1 # ambiguous otherwise + if isinstance(groups[0], tuple): + minNumArgs, maxNumArgs = groups[0] + assert 0 <= minNumArgs and (minNumArgs <= maxNumArgs or + maxNumArgs < 0) + if len(values) < minNumArgs: + self.emitFatalError("missing value", self.currentNode) + if len(values) > maxNumArgs and maxNumArgs > 0: + self.emitFatalError("too many values", self.currentNode) + groupedVals = *frontVals, values, *backVals + return groupedVals[0] if len(groupedVals) == 1 else groupedVals + + def __get_root_value(self, pyVal): + ''' + Strips any attribute and subscript expressions from the node + to get the root node that the expression accesses. + Returns the symbol table entry for the root node, if such an + entry exists, and return None otherwise. + ''' + pyValRoot = pyVal + while (isinstance(pyValRoot, ast.Subscript) or + isinstance(pyValRoot, ast.Attribute)): + pyValRoot = pyValRoot.value + if (isinstance(pyValRoot, ast.Name) and + pyValRoot.id in self.symbolTable): + return self.symbolTable[pyValRoot.id] + return None + + def __validate_container_entry(self, mlirVal, pyVal): + ''' + Helper function that should be invoked for any elements that are stored in + tuple, dataclass, or list. Note that the `pyVal` argument is only used to + determine the root of `mlirVal` and as such could be either the Python + AST node matching the container item (`mlirVal`) or the AST node for the + container itself. + ''' + + rootVal = self.__get_root_value(pyVal) + assert rootVal or not self.isFunctionArgument(mlirVal) + + if cc.PointerType.isinstance(mlirVal.type): + # We do not allow to create container that contain pointers. + valTy = cc.PointerType.getElementType(mlirVal.type) + assert cc.StateType.isinstance(valTy) + if cc.StateType.isinstance(valTy): + self.emitFatalError( + "cannot use `cudaq.State` as element in lists, tuples, or dataclasses", + self.currentNode) self.emitFatalError( - f"Cannot access attribute '{node.attr}' on type {structValue.type}" - ) + "lists, tuples, and dataclasses must not contain modifiable values", + self.currentNode) - def needsStackSlot(self, type): - """ - Return true if this is a type that has been "passed by value" and - needs a stack slot created (i.e. a `cc.alloca`) for use throughout the - function. 
- """ - # FIXME add more as we need them - return ComplexType.isinstance(type) or F64Type.isinstance( - type) or F32Type.isinstance(type) or IntegerType.isinstance( - type) or cc.StructType.isinstance(type) + if cc.StructType.isinstance(mlirVal.type): + structName = cc.StructType.getName(mlirVal.type) + # We need to give a proper error if we try to assign + # a mutable dataclass to an item in another container. + # Allowing this would lead to incorrect behavior (i.e. + # inconsistent with Python) unless we change the + # representation of structs to be like `StdvecType` + # where we have a container that is passed by value + # wrapping the actual pointer, thus ensuring that the + # reference behavior actually works across function + # boundaries. + if structName != 'tuple' and rootVal: + msg = "only dataclass literals may be used as items in other container values" + self.emitFatalError( + f"{msg} - use `.copy(deep)` to create a new {structName}", + self.currentNode) + + if (self.knownResultType and self.containsList(self.knownResultType) and + self.containsList(mlirVal.type)): + # For lists that were created inside a kernel, we have to + # copy the stack allocated array to the heap when we return such a list. + # In the case where the list was created by the caller, this copy leads + # to incorrect behavior (i.e. not matching Python behavior). We hence + # want to make sure that we can know when a host allocated list is returned. + # If we allow to assign lists passed as function arguments to inner items + # of other lists and dataclasses, we loose the information that this list + # was allocated by the parent. We hence forbid this. All of this applies + # regardless of how the list was passed (e.g. the list might be an inner + # item in a tuple or dataclass that was passed) or how it is assigned + # (e.g. the assigned value might be a tuple or dataclass that contains a list). + if rootVal and self.isFunctionArgument(rootVal): + msg = "lists passed as or contained in function arguments cannot be inner items in other container values when a list is returned" + self.emitFatalError( + f"{msg} - use `.copy(deep)` to create a new list", + self.currentNode) def visit(self, node): self.debug_msg(lambda: f'[Visit {type(node).__name__}]', node) self.indent_level += 1 parentNode = self.currentNode self.currentNode = node + numVals = 0 if isinstance( + node, ast.Module) else self.valueStack.currentNumValues + self.valueStack.pushFrame() super().visit(node) + self.valueStack.popFrame() + if isinstance(node, ast.Module): + if not self.valueStack.isEmpty: + self.emitFatalError( + "processing error - unprocessed frame(s) in value stack", + node) + elif self.valueStack.currentNumValues - numVals > 1: + # Do **NOT** change this to be more permissive and allow + # multiple values to be pushed without pushing proper + # frames for sub-nodes. If visiting a single node + # potentially produces more than one value, the bridge + # quickly will be a mess because we will easily end up + # with values in the wrong places. + self.emitFatalError( + "must not generate more one value at a time in each frame", + node) self.currentNode = parentNode self.indent_level -= 1 - # FIXME: using generic_visit the way we do seems incredibly dangerous; - # we use this and make assumptions about what values are on the value stack - # without any validation that we got the right values. 
- # The whole value stack needs to be revised; we need to properly push and pop - # not just individual values but groups of values to ensure that the right - # pieces get the right arguments (and give a proper error otherwise). - def generic_visit(self, node): - self.debug_msg(lambda: f'[Generic Visit]', node) - for field, value in reversed(list(ast.iter_fields(node))): - if isinstance(value, list): - for item in value: - if isinstance(item, ast.AST): - self.visit(item) - elif isinstance(value, ast.AST): - self.visit(value) - def visit_FunctionDef(self, node): """ Create an MLIR `func.FuncOp` for the given FunctionDef AST node. For the @@ -1173,6 +1508,7 @@ def visit_FunctionDef(self, node): We keep track of the top-level function name as well as its internal MLIR name, prefixed with the __nvqpp__mlirgen__ prefix. """ + if self.buildingEntryPoint: # This is an inner function def, we will # treat it as a cc.callable (cc.create_lambda) @@ -1213,9 +1549,6 @@ def visit_FunctionDef(self, node): (node.returns.value is None)): self.knownResultType = self.mlirTypeFromAnnotation(node.returns) - # Get the argument names - argNames = [arg.arg for arg in node.args.args] - self.name = node.name self.capturedDataStorage.name = self.name @@ -1242,45 +1575,44 @@ def visit_FunctionDef(self, node): # Set the insertion point to the start of the entry block with InsertionPoint(self.entry): - self.buildingEntryPoint = True self.symbolTable.pushScope() - # Add the block arguments to the symbol table, create a stack - # slot for value arguments - blockArgs = self.entry.arguments - for i, b in enumerate(blockArgs): - if self.needsStackSlot(b.type): - stackSlot = cc.AllocaOp(cc.PointerType.get(b.type), - TypeAttr.get(b.type)).result - cc.StoreOp(b, stackSlot) - self.symbolTable[argNames[i]] = stackSlot + # Process function arguments like any other assignments. + if node.args.args: + assignNode = ast.Assign() + if len(node.args.args) == 1: + assignNode.targets = [ast.Name(node.args.args[0].arg)] + assignNode.value = self.entry.arguments[0] else: - self.symbolTable[argNames[i]] = b - - # Visit the function - startIdx = 0 - # Search for the potential documentation string, and - # if found, start the body visitation after it. - if len(node.body) and isinstance(node.body[0], ast.Expr): - self.debug_msg(lambda: f'[(Inline) Visit Expr]', - node.body[0]) - expr = node.body[0] - if hasattr(expr, 'value') and isinstance( - expr.value, ast.Constant): - self.debug_msg(lambda: f'[(Inline) Visit Constant]', - expr.value) - constant = expr.value - if isinstance(constant.value, str): - startIdx = 1 - [self.visit(n) for n in node.body[startIdx:]] + assignNode.targets = [ + ast.Tuple( + [ast.Name(arg.arg) for arg in node.args.args]) + ] + assignNode.value = [ + self.entry.arguments[idx] + for idx in range(len(self.entry.arguments.types)) + ] + assignNode.lineno = node.lineno + self.visit_Assign(assignNode) + + # Intentionally set after we process the argument assignment, + # since we currently treat value vs reference semantics slightly + # differently when we have arguments vs when we have local values. + # To not make this distinction, we would need to add support + # for having proper reference arguments, which we don't want to. + # Barring that, we at least try to be nice and give errors on + # assignments that may lead to unexpected behavior (i.e. behavior + # not following expected Python behavior). 
+ self.buildingEntryPoint = True + [self.visit(n) for n in node.body] # Add the return operation if not self.hasTerminator(self.entry): # If the function has a known (non-None) return type, emit # an `undef` of that type and return it; else return void if self.knownResultType is not None: undef = cc.UndefOp(self.knownResultType).result - ret = func.ReturnOp([undef]) + func.ReturnOp([undef]) else: - ret = func.ReturnOp([]) + func.ReturnOp([]) self.buildingEntryPoint = False self.symbolTable.popScope() @@ -1298,8 +1630,6 @@ def visit_FunctionDef(self, node): globalKernelRegistry[node.name] = f self.symbolTable.clear() - self.valueStack.clear() - self.knownResultType = parentResultType def visit_Expr(self, node): @@ -1317,6 +1647,13 @@ def visit_Expr(self, node): return self.visit(node.value) + if self.valueStack.currentNumValues > 0: + # An `ast.Expr` object is created when an expression + # is used as a statement. This expression may produce + # a value, which is ignored (not assigned) in the + # Python code. We hence need to pop that value to + # match that behavior and ignore it. + self.popValue() def visit_Lambda(self, node): """ @@ -1360,25 +1697,47 @@ def functor(qubits): def visit_Assign(self, node): """ Map an assign operation in the AST to an equivalent variable value - assignment in the MLIR. This method will first see if this is a tuple - assignment, enabling one to assign multiple values in a single - statement. + assignment in the MLIR. This method handles assignments, item updates, + as well as deconstruction. For all assignments, the variable name will be used as a key for the - symbol table, mapping to the corresponding MLIR Value. For values of - `ref` / `veq`, `i1`, or `cc.callable`, the values will be stored - directly in the table. For all other values, the variable will be - allocated with a `cc.alloca` op, and the loaded value will be stored in + symbol table, mapping to the corresponding MLIR Value. Quantum values, + measurements results, `cc.callable`, and `cc.stdvec` will be stored as + values in the symbol table. For all other values, the variable will be + allocated with a `cc.alloca` op, and the pointer will be stored in the symbol table. """ - def check_not_captured(name): - if name in self.capturedVars: - self.emitFatalError( - "CUDA-Q does not allow assignment to variable {} captured from parent scope." - .format(name), node) + # FIXME: Measurement results are stored as values + # to preserve their origin from discriminate. + # This should be revised when we introduce the proper + # type distinction. + def storedAsValue(val): + varTy = val.type + if cc.PointerType.isinstance(varTy): + varTy = cc.PointerType.getElementType(varTy) + # If `buildingEntryPoint` is not set we are processing function + # arguments. Function arguments are always passed by value, + # except states. We can treat non-container function arguments + # like any local variable and create a stack slot for them. + # For container types, on the the other hand, we need to preserve + # them as values in the symbol table to make sure we can detect + # any access to reference types that are function arguments, or + # function argument items. 
+ containerFuncArg = (not self.buildingEntryPoint and + (cc.StructType.isinstance(varTy) or + cc.StdvecType.isinstance(varTy))) + storeAsVal = (containerFuncArg or self.isQuantumType(varTy) or + cc.CallableType.isinstance(varTy) or + cc.StdvecType.isinstance(varTy) or + self.isMeasureResultType(varTy, val)) + # Nothing should ever produce a pointer + # to a type we store as value in the symbol table. + assert (not storeAsVal or not cc.PointerType.isinstance(val.type)) + return storeAsVal def process_assignment(target, value): + if isinstance(target, ast.Tuple): if (isinstance(value, ast.Tuple) or @@ -1386,120 +1745,248 @@ def process_assignment(target, value): return target, value if isinstance(value, ast.AST): + # Measurements need to push their values to the stack, + # so we set a so we set a non-None variable name here. + self.currentAssignVariableName = '' + # NOTE: The way the assignment logic is processed, + # including that we load this value for the purpose + # of deconstruction, does not preserve any inner + # references. There are a bunch of issues that + # prevent us from properly dealing with any + # reference types stored as items in lists and + # dataclasses. We hence currently prevent the + # creation of such lists and dataclasses, and would + # need to change the representation for dataclasses + # to allow that. self.visit(value) - if len(self.valueStack) == 0: - self.emitFatalError("invalid assignment detected.", - node) - return target, self.popValue() + value = self.popValue() + self.currentAssignVariableName = None + return target, value return target, value - # Handle simple `var = expr` - elif isinstance(target, ast.Name): - check_not_captured(target.id) + # Make sure we process arbitrary combinations + # of subscript and attributes + target_root = target + while (isinstance(target_root, ast.Subscript) or + isinstance(target_root, ast.Attribute)): + target_root = target_root.value + if not isinstance(target_root, ast.Name): + self.emitFatalError("invalid target for assignment", node) + target_root_defined_in_parent_scope = ( + target_root.id in self.symbolTable and + target_root.id not in self.symbolTable.symbolTable[-1]) + value_root = self.__get_root_value(value) + + def update_in_parent_scope(destination, value): + assert not cc.PointerType.isinstance(value.type) + if cc.StructType.isinstance( + value.type) and cc.StructType.getName( + value.type) != 'tuple': + # We can't properly deal with this case if the value we are assigning + # is not an `rvalue`. Consider the case were we have `v1` defined in + # the parent scope, `v2` in a child scope, and we are assigning v2 to + # v1 in the child scope. To do this assignment properly, we would need to + # make sure that the pointers for both v1 and v2 points to the same memory + # location such that any changes to v1 after the assignment are reflected + # in v2 and vice versa (v2 could be changed in the child while v1 is still + # alive). Since we merely store the raw pointer in the symbol table for + # dataclasses, we have no way of updating that pointer conditionally on + # the child scope being executed. + # To determine whether the value we assign is an `rvalue`, it is + # sufficient to check whether its root is a value in the symbol table + # (values returned from calls are never `lvalues`). + if value_root: + # Note that this check also makes sure that function arguments are + # not assigned to local variables, since function arguments are in + # the symbol table. 
+                        self.emitFatalError(
+                            "only literals can be assigned to variables defined in parent scope - use `.copy(deep)` to create a new value that can be assigned",
+                            node)
+                if cc.StdvecType.isinstance(destination.type):
+                    # In this case, we are assigning a list to a variable in a parent scope.
+                    assert isinstance(target, ast.Name)
+                    # If the value we are assigning is an `rvalue` then we can do an in-place
+                    # update of the data in the parent; the restrictions for container items
+                    # in `__validate_container_entry` ensure that the value we are assigning
+                    # does not contain any references to dataclass values, and any lists
+                    # contained in the value behave like proper references since they contain
+                    # a data pointer (i.e. in-place update only does a shallow copy).
+                    # TODO: The only reason we cannot currently support this is that we
+                    # have no way of updating the size of an existing vector...
+                    self.emitFatalError(
+                        "variable defined in parent scope cannot be modified",
+                        node)
+                # Allowing assignment of vectors to container items in the parent scope
+                # should be fine regardless of whether the assigned value is an `rvalue`
+                # or not; replacing the item in the container with the value leads to the
+                # correct behavior much like it does for the case where the target is defined
+                # in the same scope.
+                # NOTE: The assignment is subject to the usual restrictions for container
+                # items - these should be validated before calling update_in_parent_scope.
+                if not cc.StdvecType.isinstance(
+                        value.type) and storedAsValue(destination):
+                    # We can't properly deal with this, since there is no way to ensure that
+                    # the target in the symbol table is updated conditionally on the child
+                    # scope executing.
+                    self.emitFatalError(
+                        "variable defined in parent scope cannot be modified",
+                        node)
+                assert cc.PointerType.isinstance(destination.type)
+                expectedTy = cc.PointerType.getElementType(destination.type)
+                value = self.changeOperandToType(expectedTy,
+                                                 value,
+                                                 allowDemotion=False)
+                cc.StoreOp(value, destination)

+            # Handle assignment `var = expr`
+            if isinstance(target, ast.Name):
+                if target.id in self.capturedVars:
+                    # Local variable shadows the captured one
+                    del self.capturedVars[target.id]
+
+                # This is so that we properly preserve the references
+                # to local variables. These variables can be of a reference
+                # type and other values in the symbol table may be assigned
+                # to the same reference. It is hence important to keep the
+                # reference as is, since otherwise changes to it would not
+                # be reflected in other values.
+                # NOTE: we don't need to worry about any references in
+                # values that are not `ast.Name` objects, since we don't
+                # allow containers to contain references.
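+                # Hypothetical example: after `a = [1, 2]` and `b = a`, both
+                # names must map to the same `cc.stdvec` value so that
+                # `b[0] = 5` is also visible through `a`.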
+                value_is_name = False
+                if (isinstance(value, ast.Name) and
+                        value.id in self.symbolTable):
+                    value_is_name = True
+                    value = self.symbolTable[value.id]

                if isinstance(value, ast.AST):
                    # Retain the variable name for potential children (like `mz(q, registerName=...)`)
                    self.currentAssignVariableName = target.id
                    self.visit(value)
-                    self.currentAssignVariableName = None
-                    if len(self.valueStack) == 0:
-                        self.emitFatalError("invalid assignment detected.",
-                                            node)
                    value = self.popValue()
+                    self.currentAssignVariableName = None

+                storeAsVal = storedAsValue(value)
+
+                if value_root and self.isFunctionArgument(value_root):
+                    # If we assign a function argument or argument item to
+                    # a local variable, we need to be careful to not lose
+                    # the information about contained lists that have been
+                    # allocated by the caller, if the return value contains
+                    # any lists. This is problematic for reasons commented
+                    # in `__validate_container_entry`.
+                    if (cc.StdvecType.isinstance(value.type) and
+                            self.knownResultType and
+                            self.containsList(self.knownResultType)):
+                        # We lose this information if we assign an item of
+                        # a function argument.
+                        if not value_is_name:
+                            self.emitFatalError(
+                                "lists passed as or contained in function arguments cannot be assigned to a local variable when a list is returned - use `.copy(deep)` to create a new value that can be assigned",
+                                node)
+                        # We also lose this information if we assign to
+                        # a value in the parent scope.
+                        elif target_root_defined_in_parent_scope:
+                            self.emitFatalError(
+                                "lists passed as or contained in function arguments cannot be assigned to variables in the parent scope when a list is returned - use `.copy(deep)` to create a new value that can be assigned",
+                                node)
+                    if cc.StructType.isinstance(value.type):
+                        structName = cc.StructType.getName(value.type)
+                        # For dataclasses, we have to do an additional check
+                        # to ensure that their behavior (for cases that don't
+                        # give an error) is consistent with Python;
+                        # since we pass them by value across functions, we
+                        # either have to force that an explicit copy is made
+                        # when using them as call arguments, or we have to
+                        # force that an explicit copy is made when a dataclass
+                        # argument is assigned to a local variable (as long as
+                        # it is not assigned, it will not be possible to make
+                        # any modification to it since the argument itself is
+                        # represented as an immutable value). The latter seems
+                        # more comprehensive and also ensures that there is no
+                        # unexpected behavior with regard to kernels not being
+                        # able to modify dataclass values in host code.
+                        # NOTE: It is sufficient to check that the value itself
+                        # (not its root) is a function argument, since inner
+                        # items are never references to dataclasses (enforced
+                        # in `__validate_container_entry`).
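+                        # Hypothetical example: for a kernel parameter
+                        # `t: MyData`, `local = t` is rejected below, while
+                        # `local = t.copy(deep)` yields a fresh value and
+                        # is accepted.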
+                        if value_is_name and structName != 'tuple':
+                            self.emitFatalError(
+                                f"cannot assign dataclass passed as function argument to a local variable - use `.copy(deep)` to create a new value that can be assigned",
+                                node)
+                        elif (self.knownResultType and
+                              self.containsList(self.knownResultType) and
+                              self.containsList(value.type)):
+                            self.emitFatalError(
+                                f"cannot assign tuple or dataclass passed as function argument to a local variable if it contains a list when a list is returned - use `.copy(deep)` to create a new value that can be assigned",
+                                node)

-                if self.isQuantumType(value.type) or cc.CallableType.isinstance(
-                        value.type):
-                    return target, value
-                elif self.isMeasureResultType(value.type, value):
-                    value = self.ifPointerThenLoad(value)
-                    if target.id in self.symbolTable:
-                        addr = self.ifNotPointerThenStore(
-                            self.symbolTable[target.id])
-                        cc.StoreOp(value, addr)
-                    return target, value
-                elif target.id in self.symbolTable:
-                    value = self.ifPointerThenLoad(value)
-                    cc.StoreOp(value, self.symbolTable[target.id])
+                if target_root_defined_in_parent_scope:
+                    if cc.PointerType.isinstance(value.type):
+                        # This is fine as long as update_in_parent_scope
+                        # validates that `lvalues` of reference types cannot be
+                        # assigned. Note that tuples and states are value types.
+                        value = cc.LoadOp(value).result
+                    destination = self.symbolTable[target.id]
+                    update_in_parent_scope(destination, value)
                    return target, None
-                elif cc.PointerType.isinstance(value.type):
-                    return target, value
-                elif cc.StructType.isinstance(value.type) and isinstance(
-                        value.owner.opview, cc.InsertValueOp):
-                    # If we have a new struct from `cc.undef` and `cc.insert_value`, we don't
-                    # want to allocate new memory.
+
+                # The target variable has either not been defined
+                # or is defined within the current scope;
+                # we can simply modify the symbol table entry.
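+                # Hypothetical example: `theta = 0.5` falls through to the
+                # `cc.alloca` below; the float is stored to a fresh stack
+                # slot that is recorded under the name `theta`.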
+                if storeAsVal or cc.PointerType.isinstance(value.type):
                    return target, value
-                else:
-                    # We should allocate and store
-                    alloca = cc.AllocaOp(cc.PointerType.get(value.type),
-                                         TypeAttr.get(value.type)).result
-                    cc.StoreOp(value, alloca)
-                    return target, alloca
-
-            # Handle assignments like `listVar[IDX] = expr`
-            elif (isinstance(target, ast.Subscript) and
-                  isinstance(target.value, ast.Name) and
-                  target.value.id in self.symbolTable):
-                check_not_captured(target.value.id)
-
-                # Visit_Subscript will try to load any pointer and return it
-                # but here we want the pointer, so flip that flag
-                self.subscriptPushPointerValue = True
-                # Visit the subscript node, get the pointer value
-                self.visit(target)
-                # Reset the push pointer value flag
-                self.subscriptPushPointerValue = False
-                ptrVal = self.popValue()
-                if not cc.PointerType.isinstance(ptrVal.type):
-                    self.emitFatalError(
-                        "Invalid CUDA-Q subscript assignment, variable must be a pointer.",
-                        node)
-                # See if this is a pointer to an array, if so cast it
-                # to a pointer on the array type
-                ptrEleType = cc.PointerType.getElementType(ptrVal.type)
-                if cc.ArrayType.isinstance(ptrEleType):
-                    ptrVal = cc.CastOp(
-                        cc.PointerType.get(
-                            cc.ArrayType.getElementType(ptrEleType)),
-                        ptrVal).result
-
-                # Visit the value being assigned
-                self.visit(node.value)
-                valueToStore = self.popValue()
-                # Cast if necessary
-                valueToStore = self.changeOperandToType(ptrEleType,
-                                                        valueToStore)
-                # Store the value
-                cc.StoreOp(valueToStore, ptrVal)
-                return target.value, None
-
-            # Handle assignments like `classVar.attr = expr`
-            elif (isinstance(target, ast.Attribute) and
-                  isinstance(target.value, ast.Name) and
-                  target.value.id in self.symbolTable):
-                check_not_captured(target.value.id)
-
-                self.attributePushPointerValue = True
-                # Visit the attribute node, get the pointer value
-                self.visit(target)
-                # Reset the push pointer value flag
-                self.attributePushPointerValue = False
-                ptrVal = self.popValue()
-                if not cc.PointerType.isinstance(ptrVal.type):
-                    self.emitFatalError("invalid CUDA-Q attribute assignment",
-                                        node)
-                # Visit the value being assigned
-                self.visit(node.value)
-                valueToStore = self.popValue()
-                # Cast if necessary
-                valueToStore = self.changeOperandToType(
-                    cc.PointerType.getElementType(ptrVal.type), valueToStore)
-                # Store the value
-                cc.StoreOp(valueToStore, ptrVal)
-                return target.value, None
-            else:
-                self.emitFatalError("Invalid target for assignment", node)
+                address = cc.AllocaOp(cc.PointerType.get(value.type),
+                                      TypeAttr.get(value.type)).result
+                cc.StoreOp(value, address)
+                return target, address
+
+            # Handle updates of existing variables
+            # (target is a combination of attribute and subscript)
+            self.pushPointerValue = True
+            self.visit(target)
+            destination = self.popValue()
+            self.pushPointerValue = False
+
+            # We should have a pointer since we requested a pointer.
+            assert cc.PointerType.isinstance(destination.type)
+            expectedTy = cc.PointerType.getElementType(destination.type)
+            # We prevent the creation of lists and structs that
+            # contain pointers, and prevent obtaining pointers to
+            # quantum types.
+            assert not cc.PointerType.isinstance(expectedTy)
+            assert not self.isQuantumType(expectedTy)
+
+            if not isinstance(value, ast.AST):
+                # Can arise if we have something like `l[0], l[1] = getTuple()`
+                self.emitFatalError(
+                    "updating lists or dataclasses as part of deconstruction is not supported",
+                    node)
+
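+            # Hypothetical example: `data[2] = 1.0` or `obj.field = 1.0`
+            # ends up here; `destination` is the `cc.ptr` computed for the
+            # item, and the converted value is stored through it below.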
+            # Measurements need to push their values to the stack,
+            # so we set a non-None variable name here.
+            self.currentAssignVariableName = ''
+            self.visit(value)
+            mlirVal = self.popValue()
+            self.currentAssignVariableName = None
+            assert not cc.PointerType.isinstance(mlirVal.type)
+
+            # Must validate the container entry regardless of what scope the
+            # target is defined in.
+            self.__validate_container_entry(mlirVal, value)
+
+            if target_root_defined_in_parent_scope:
+                update_in_parent_scope(destination, mlirVal)
+                return target_root, None
+
+            mlirVal = self.changeOperandToType(expectedTy,
+                                               mlirVal,
+                                               allowDemotion=False)
+            cc.StoreOp(mlirVal, destination)
+            # The returned target root has no effect here since no value
+            # is returned to push to the symbol table. We merely need to make
+            # sure that it is an `ast.Name` object to break the recursion.
+            return target_root, None

        if len(node.targets) > 1:
            # I am not entirely sure what kinds of Python language constructs would
@@ -1555,62 +2042,99 @@ def visit_Attribute(self, node):
                    'ZError', 'XError', 'YError', 'Pauli1', 'Pauli2',
                    'Depolarization1', 'Depolarization2'
            ]:
-                cudaq_module = importlib.import_module('cudaq')
-                channel_class = getattr(cudaq_module, node.attr)
-                self.pushValue(
-                    self.getConstantInt(channel_class.num_parameters))
-                self.pushValue(self.getConstantInt(hash(channel_class)))
-                return
+                self.emitFatalError(
+                    "noise channels may only be used as part of call expressions",
+                    node)

-            # Any other cudaq attributes should be handled by the parent
+            # must be handled by the parent
            return

        if node.attr == 'ctrl' or node.attr == 'adj':
            # to be processed by the caller
            return

-        def process_potential_ptr_types(value):
-            """
-            Helper function to process anything that the parent may assign to,
-            depending on whether value is a pointer or not.
-            """
-            valType = value.type
-            if cc.PointerType.isinstance(valType):
-                valType = cc.PointerType.getElementType(valType)
-
-            if quake.StruqType.isinstance(valType):
-                # Need to extract value instead of load from compute pointer.
-                structIdx, memberTy = self.getStructMemberIdx(
-                    node.attr, value.type)
-                attr = IntegerAttr.get(self.getIntegerType(32), structIdx)
-                self.pushValue(quake.GetMemberOp(memberTy, value, attr).result)
-                return True
+        if node.attr == 'copy':
+            if self.pushPointerValue:
+                self.emitFatalError(
+                    "function call does not produce a modifiable value", node)
+            # needs to be handled by the caller
+            return

-            if cc.StructType.isinstance(valType):
-                # Handle the case where we have a struct member extraction, memory semantics
-                self.__visitStructAttribute(node, value)
-                return True
+        # Only variable names, subscripts and attributes can
+        # produce modifiable values. Anything else produces an
+        # immutable value. We make sure the visit gets processed
+        # such that the rest of the code can give a proper error.
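+        # Hypothetical example: for `f(x).field = 1`, the call result is
+        # not rooted in a variable name, so it is visited as a plain value
+        # and the checks below report that it cannot be modified.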
+        value_root = node.value
+        while (isinstance(value_root, ast.Subscript) or
+               isinstance(value_root, ast.Attribute)):
+            value_root = value_root.value
+        if self.pushPointerValue and not isinstance(value_root, ast.Name):
+            self.pushPointerValue = False
+            self.visit(node.value)
+            value = self.popValue()
+            self.pushPointerValue = True
+        else:
+            self.visit(node.value)
+            value = self.popValue()

-            elif (quake.VeqType.isinstance(valType) or
-                  cc.StdvecType.isinstance(valType) or
-                  cc.ArrayType.isinstance(valType)):
-                return self.__isSupportedVectorFunction(node.attr)
+        valType = value.type
+        if cc.PointerType.isinstance(valType):
+            valType = cc.PointerType.getElementType(valType)

-            return False
+        if quake.StruqType.isinstance(valType):
+            if self.pushPointerValue:
+                self.emitFatalError(
+                    "accessing attribute of quantum tuple or dataclass does not produce a modifiable value",
+                    node)
+            # Need to extract value instead of load from compute pointer.
+            structIdx, memberTy = self.getStructMemberIdx(node.attr, value.type)
+            attr = IntegerAttr.get(self.getIntegerType(32), structIdx)
+            self.pushValue(quake.GetMemberOp(memberTy, value, attr).result)
+            return

-        # Make sure we preserve pointers for structs
-        if isinstance(node.value,
-                      ast.Name) and node.value.id in self.symbolTable:
-            value = self.symbolTable[node.value.id]
-            processed = process_potential_ptr_types(value)
-            if processed:
+        if (cc.PointerType.isinstance(value.type) and
+                cc.StructType.isinstance(valType)):
+            assert self.pushPointerValue
+            structIdx, memberTy = self.getStructMemberIdx(node.attr, valType)
+            eleAddr = cc.ComputePtrOp(cc.PointerType.get(memberTy), value, [],
+                                      DenseI32ArrayAttr.get([structIdx])).result
+
+            if self.pushPointerValue:
+                self.pushValue(eleAddr)
                return

-        self.visit(node.value)
-        if len(self.valueStack) == 0:
-            self.emitFatalError("failed to create value to access attribute",
-                                node)
-        value = self.ifPointerThenLoad(self.popValue())
+            eleAddr = cc.LoadOp(eleAddr).result
+            self.pushValue(eleAddr)
+            return
+
+        if cc.StructType.isinstance(value.type):
+            if self.pushPointerValue:
+                self.emitFatalError(
+                    "value cannot be modified - use `.copy(deep)` to create a new value that can be modified",
+                    node)
+
+            # Handle direct struct value - use ExtractValueOp (more efficient)
+            structIdx, memberTy = self.getStructMemberIdx(node.attr, value.type)
+            extractedValue = cc.ExtractValueOp(
+                memberTy, value, [], DenseI32ArrayAttr.get([structIdx])).result
+
+            self.pushValue(extractedValue)
+            return
+
+        if (quake.VeqType.isinstance(valType) or
+                cc.StdvecType.isinstance(valType)):
+            if self.__isSupportedVectorFunction(node.attr):
+                if self.pushPointerValue:
+                    self.emitFatalError(
+                        "function call does not produce a modifiable value",
+                        node)
+                # needs to be handled by the caller
+                return
+
+        # everything else does not produce a modifiable value
+        if self.pushPointerValue:
+            self.emitFatalError(
+                "attribute expression does not produce a modifiable value",
+                node)

        if ComplexType.isinstance(value.type):
            if (node.attr == 'real'):
@@ -1626,52 +2150,23 @@ def process_potential_ptr_types(value):
            if quake.VeqType.isinstance(value.type):
                self.pushValue(
                    quake.VeqSizeOp(self.getIntegerType(), value).result)
-                return True
-            if cc.StdvecType.isinstance(value.type) or cc.ArrayType.isinstance(
-                    value.type):
-                self.pushValue(self.__get_vector_size(value))
-                return True
+                return
+            if cc.StdvecType.isinstance(value.type):
+                self.pushValue(
+                    cc.StdvecSizeOp(self.getIntegerType(), value).result)
+                return

-        processed = process_potential_ptr_types(value)
-        if not processed:
-            self.emitFatalError("unrecognized attribute {}".format(node.attr),
-                                node)
+        self.emitFatalError("unrecognized attribute {}".format(node.attr), node)

    def visit_Call(self, node):
        """
-        Map a Python Call operation to equivalent MLIR. This method will first
-        check for call operations that are `ast.Name` nodes in the tree (the
-        name of a function to call). It will handle the Python `range(start,
-        stop, step)` function by creating an array of integers to loop through
-        via an invariant CC loop operation. Subsequent users of the `range()`
-        result can iterate through the elements of the returned `cc.array`. It
-        will handle the Python `enumerate(iterable)` function by constructing
-        another invariant loop that builds up and array of `cc.struct`,
-        representing the counter and the element.
-
-        It will next handle any quantum operation (optionally with a rotation
-        parameter). Single target operations can be represented that take a
-        single qubit reference, multiple single qubits, or a vector of qubits,
-        where the latter two will apply the operation to every qubit in the
-        vector:
-
-        Valid single qubit operations are `h`, `x`, `y`, `z`, `s`, `t`, `rx`,
-        `ry`, `rz`, `r1`.
-
-        Measurements `mx`, `my`, `mz` are mapped to corresponding quake
-        operations and the return i1 value is added to the value
-        stack. Measurements of single qubit reference and registers of qubits
-        are supported.
-
-        General calls to previously seen CUDA-Q kernels are supported. By this
-        we mean that an kernel can not be invoked from a kernel unless it was
-        defined before the current kernel. Kernels can also be reversed or
-        controlled with `cudaq.adjoint(kernel, ...)` and `cudaq.control(kernel,
-        ...)`.
-
-        Finally, general operation modifiers are supported, specifically
-        `OPERATION.adj` and `OPERATION.ctrl` for adjoint and control synthesis
-        of the operation.
+        Map a Python Call operation to equivalent MLIR. This method handles
+        functions that are `ast.Name` and `ast.Attribute` objects.
+
+        This function handles all built-in unitary and measurement gates
+        as well as all the ways to adjoint and control them.
+        General calls to previously seen CUDA-Q kernels or registered
+        operations are supported.

        ```python
        q, r = cudaq.qubit(), cudaq.qubit()
@@ -1686,21 +2181,11 @@ def visit_Call(self, node):
        global globalRegisteredOperations

        def convertArguments(expectedArgTypes, values):
-            fName = 'function'
-            if hasattr(node.func, 'id'):
-                fName = node.func.id
-            elif hasattr(node.func, 'attr'):
-                fName = node.func.attr
-            if len(expectedArgTypes) != len(values):
-                self.emitFatalError(
-                    f"invalid number of arguments passed in call to {fName} ({len(values)} vs required {len(expectedArgTypes)})",
-                    node)
+            assert len(expectedArgTypes) == len(values)
            args = []
-            for idx, value in enumerate(values):
-                arg = self.ifPointerThenLoad(value)
-                expectedTy = expectedArgTypes[idx]
+            for idx, expectedTy in enumerate(expectedArgTypes):
                arg = self.changeOperandToType(expectedTy,
-                                               arg,
+                                               values[idx],
                                               allowDemotion=True)
                args.append(arg)
            return args
@@ -1715,122 +2200,45 @@ def getNegatedControlQubits(controls):
                self.controlNegations.clear()
            return negatedControlQubits

-        def processControlAndAdjoint(pyFuncVal, attrName):
-            # NOTE: We currently generally don't have the means in the
-            # compiler to handle composition of control and adjoint, since
-            # control and adjoint are not proper functors (i.e. there is
-            # no way to obtain a new callable object that is the adjoint
-            # or controlled version of another callable.
-            # Since we don't really treat callables as first-class values,
-            # the first argument to control and adjoint indeed has to be
-            # a Name object.
-            if not isinstance(pyFuncVal, ast.Name):
-                self.emitFatalError(
-                    f'unsupported argument in call to {attrName} - first argument must be a symbol name',
-                    node)
-            otherFuncName = pyFuncVal.id
-
-            values = [self.popValue() for _ in range(len(self.valueStack))]
-            values.reverse()
-            indirectCallee = []
-            kwargs = {"is_adj": attrName == 'adjoint'}
-
-            if otherFuncName in self.symbolTable and \
-                cc.CallableType.isinstance(values[0].type):
-                functionTy = FunctionType(
-                    cc.CallableType.getFunctionType(values[0].type))
-                inputTys, outputTys = functionTy.inputs, functionTy.results
-                indirectCallee.append(values[0])
-                values = values[1:]
-            elif otherFuncName in globalKernelRegistry:
-                otherFunc = globalKernelRegistry[otherFuncName]
-                inputTys, outputTys = otherFunc.arguments.types, otherFunc.results.types
-                kwargs["callee"] = FlatSymbolRefAttr.get(nvqppPrefix +
-                                                         otherFuncName)
-            elif otherFuncName in globalRegisteredOperations:
-                self.emitFatalError(
-                    f"calling cudaq.control or cudaq.adjoint on a globally registered operation is not supported",
-                    node)
-            elif self.__isUnitaryGate(
-                    otherFuncName) or self.__isMeasurementGate(otherFuncName):
-                self.emitFatalError(
-                    f"calling cudaq.control or cudaq.adjoint on a built-in gate is not supported",
-                    node)
-            else:
-                self.emitFatalError(
-                    f"{otherFuncName} is not a known quantum kernel - maybe a cudaq.kernel attribute is missing?.",
-                    node)
-
-            numControlArgs = attrName == 'control'
-            if len(values) < numControlArgs:
-                self.emitFatalError(
-                    "missing control qubit(s) argument in cudaq.control", node)
-            controls = values[:numControlArgs]
-            invert_controls = lambda: None
-            if len(controls) != 0:
-                assert (len(controls) == 1)
-                if (not quake.RefType.isinstance(controls[0].type) and
-                        not quake.VeqType.isinstance(controls[0].type)):
-                    self.emitFatalError(
-                        'invalid argument type for control operand', node)
-                # TODO: it would be cleaner to add support for negated control
-                # qubits to `quake.ApplyOp`
-                if controls[0] in self.controlNegations:
-                    invert_controls = lambda: self.__applyQuantumOperation(
-                        'x', [], controls)
-                    self.controlNegations.clear()
-            args = convertArguments(inputTys, values[numControlArgs:])
-            if len(outputTys) != 0:
-                self.emitFatalError(
-                    f'cannot take {attrName} of kernel {otherFuncName} that '
-                    f'returns a value', node)
-            invert_controls()
-            quake.ApplyOp([], indirectCallee, controls, args, **kwargs)
-            invert_controls()
-
-        def processFunctionCall(fType, nrValsToPop):
-            if len(fType.inputs) != nrValsToPop:
-                fName = 'function'
-                if hasattr(node.func, 'id'):
-                    fName = node.func.id
-                elif hasattr(node.func, 'attr'):
-                    fName = node.func.attr
-                self.emitFatalError(
-                    f"invalid number of arguments passed in call to {fName} "
-                    f"({nrValsToPop} vs required {len(fType.inputs)})", node)
-            values = [self.popValue() for _ in node.args]
-            values.reverse()
-            values = convertArguments([t for t in fType.inputs], values)
-            if len(fType.results) == 0:
-                func.CallOp(otherKernel, values)
-            else:
-                result = func.CallOp(otherKernel, values).result
-                # Copy to stack if necessary
-                if cc.StdvecType.isinstance(result.type):
-                    elemTy = cc.StdvecType.getElementType(result.type)
-                    if elemTy == self.getIntegerType(1):
-                        elemTy = self.getIntegerType(8)
-                    data = cc.StdvecDataOp(cc.PointerType.get(elemTy),
-                                           result).result
-                    i64Ty = self.getIntegerType(64)
-                    length = cc.StdvecSizeOp(i64Ty, result).result
-                    elemSize = cc.SizeOfOp(i64Ty, TypeAttr.get(elemTy)).result
-                    buffer = cc.AllocaOp(cc.PointerType.get(
-                        cc.ArrayType.get(elemTy)),
-                                         TypeAttr.get(elemTy),
-                                         seqSize=length).result
-                    i8PtrTy = cc.PointerType.get(self.getIntegerType(8))
-                    cbuffer = cc.CastOp(i8PtrTy, buffer).result
-                    cdata = cc.CastOp(i8PtrTy, data).result
-                    symName = '__nvqpp_vectorCopyToStack'
-                    load_intrinsic(self.module, symName)
-                    sizeInBytes = arith.MulIOp(length, elemSize).result
-                    func.CallOp([], symName, [cbuffer, cdata, sizeInBytes])
-                    # Replace result with the stack buffer-backed vector
-                    result = cc.StdvecInitOp(result.type, buffer,
-                                             length=length).result
-
-                self.pushValue(result)
+        def processFunctionCall(kernel):
+            nrArgs = len(kernel.type.inputs)
+            values = self.__groupValues(node.args, [(nrArgs, nrArgs)])
+            values = convertArguments([t for t in kernel.type.inputs], values)
+            if len(kernel.type.results) == 0:
+                func.CallOp(kernel, values)
+                return
+            # The logic for calls that return values must
+            # match the logic in `visit_Return`; anything
+            # copied to the heap during return must be copied
+            # back to the stack. Compiler optimizations should
+            # take care of eliminating unnecessary copies.
+            result = func.CallOp(kernel, values).result
+
+            def copy_list_to_stack(value):
+                symName = '__nvqpp_vectorCopyToStack'
+                load_intrinsic(self.module, symName)
+                elemTy = cc.StdvecType.getElementType(value.type)
+                if elemTy == self.getIntegerType(1):
+                    elemTy = self.getIntegerType(8)
+                ptrTy = cc.PointerType.get(self.getIntegerType(8))
+                resBuf = cc.StdvecDataOp(cc.PointerType.get(elemTy),
+                                         value).result
+                eleSize = cc.SizeOfOp(self.getIntegerType(),
+                                      TypeAttr.get(elemTy)).result
+                dynSize = cc.StdvecSizeOp(self.getIntegerType(), value).result
+                stackCopy = cc.AllocaOp(cc.PointerType.get(
+                    cc.ArrayType.get(elemTy)),
+                                        TypeAttr.get(elemTy),
+                                        seqSize=dynSize).result
+                func.CallOp([], symName, [
+                    cc.CastOp(ptrTy, stackCopy).result,
+                    cc.CastOp(ptrTy, resBuf).result,
+                    arith.MulIOp(dynSize, eleSize).result
+                ])
+                return cc.StdvecInitOp(value.type, stackCopy,
+                                       length=dynSize).result
+
+            return self.__migrateLists(result, copy_list_to_stack)

        def checkControlAndTargetTypes(controls, targets):
            """
@@ -1855,6 +2263,67 @@ def is_qvec_or_qubits(vals):
                self.emitFatalError(f'invalid argument type for target operand',
                                    node)

+        def processQuantumOperation(opName,
+                                    controls,
+                                    targets,
+                                    *args,
+                                    broadcast=lambda q: [q],
+                                    **kwargs):
+            opCtor = getattr(quake, f'{opName}Op')
+            checkControlAndTargetTypes(controls, targets)
+            if not broadcast:
+                return opCtor(*args, controls, targets, **kwargs)
+            elif quake.VeqType.isinstance(targets[0].type):
+                assert len(targets) == 1
+
+                def bodyBuilder(iterVal):
+                    q = quake.ExtractRefOp(self.getRefType(),
+                                           targets[0],
+                                           -1,
+                                           index=iterVal).result
+                    opCtor(*args, controls, broadcast(q), **kwargs)
+
+                veqSize = quake.VeqSizeOp(self.getIntegerType(),
+                                          targets[0]).result
+                self.createInvariantForLoop(bodyBuilder, veqSize)
+            else:
+                for target in targets:
+                    opCtor(*args, controls, broadcast(target), **kwargs)
+
+        def processQuakeCtor(opName,
+                             pyArgs,
+                             isCtrl,
+                             isAdj,
+                             numParams=0,
+                             numTargets=1):
+            kwargs = {}
+            if isCtrl:
+                argGroups = [(numParams, numParams), (1, -1),
+                             (numTargets, numTargets)]
+                # FIXME: we could allow this as long as we have 1 target
+                kwargs['broadcast'] = False
+            elif numTargets == 1:
+                # when we have a single target and no controls, we generally
+                # support any version of `x(qubit)`, `x(qvector)`, `x(q, r)`
+                argGroups = [(numParams, numParams), 0, (1, -1)]
+            else:
+                argGroups = [(numParams, numParams), 0,
+                             (numTargets, numTargets)]
+                kwargs['broadcast'] = False
+
+            params, controls, targets = self.__groupValues(pyArgs, argGroups)
+            if isCtrl:
+                negatedControlQubits = getNegatedControlQubits(controls)
+                kwargs['negated_qubit_controls'] = negatedControlQubits
+            if isAdj:
+                kwargs['is_adj'] = True
+            params = [
+                self.changeOperandToType(self.getFloatType(), param)
+                for param in params
+            ]
+            processQuantumOperation(opName, controls, targets, [], params,
+                                    **kwargs)
+
        # do not walk the FunctionDef decorator_list arguments
        if isinstance(node.func, ast.Attribute):
            self.debug_msg(lambda: f'[(Inline) Visit Attribute]', node.func)
@@ -1911,9 +2380,9 @@ def get_full_module_path(partial_path):
            if maybeKernelName != None:
                otherKernel = SymbolTable(
                    self.module.operation)[maybeKernelName]
-
-                [self.visit(arg) for arg in node.args]
-                processFunctionCall(otherKernel.type, len(node.args))
+                res = processFunctionCall(otherKernel)
+                if res is not None:
+                    self.pushValue(res)
                return

            # Start by seeing if we have mod1.mod2.mod3...
@@ -1930,13 +2399,8 @@ def get_full_module_path(partial_path):

            if all(x in moduleNames for x in ['cudaq', 'dbg', 'ast']):
                # Handle a debug print statement
-                [self.visit(arg) for arg in node.args]
-                if len(self.valueStack) != 1:
-                    self.emitFatalError(
-                        f"cudaq.dbg.ast.{node.func.attr} call invalid - "
-                        f"too many arguments passed.", node)
-
-                self.__insertDbgStmt(self.popValue(), node.func.attr)
+                arg = self.__groupValues(node.args, [1])
+                self.__insertDbgStmt(arg, node.func.attr)
                return

            # If we did have module names, then this is what we are looking for
@@ -1947,30 +2411,43 @@ def get_full_module_path(partial_path):
                # If it is in `globalKernelRegistry`, it has to be in this Module
                otherKernel = SymbolTable(
                    self.module.operation)[nvqppPrefix + name]
-                [self.visit(arg) for arg in node.args]
-                processFunctionCall(otherKernel.type, len(node.args))
+
+                res = processFunctionCall(otherKernel)
+                if res is not None:
+                    self.pushValue(res)
                return

-        # FIXME: This whole thing is widely inconsistent;
-        # For example; we pop all values on the value stack for a simple gate
-        # and allow x(q1, q2, q3, ...) here, but for a simple adjoint gate we
-        # only ever pop a single value. Then there are the control qubits,
-        # where we also allow to pass individual qubits instead of a vector.
-        # I'll tackle this as part of revising the value stack.
-        # FIXME: Expand the tests in test_control_negations as needed after
-        # revising this.
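+        # With `__groupValues`, argument grouping is now uniform; e.g.
+        # (hypothetical kernel code) `x(q, r)` broadcasts over both targets,
+        # `cx(c, t)` takes exactly one control and one target, and
+        # `rx(theta, q)` takes one parameter and one target.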
        if isinstance(node.func, ast.Name):
-            # Just visit the arguments, we know the name
-            [self.visit(arg) for arg in node.args]
-            namedArgs = {}
-            for keyword in node.keywords:
-                self.visit(keyword.value)
-                namedArgs[keyword.arg] = self.popValue()
+            if node.func.id == 'complex':
+
+                keywords = [kw.arg for kw in node.keywords]
+                kwreal = 'real' in keywords
+                kwimag = 'imag' in keywords
+                real, imag = self.__groupValues(node.args,
+                                                [not kwreal, not kwimag])
+                for keyword in node.keywords:
+                    self.visit(keyword.value)
+                    kwval = self.popValue()
+                    if keyword.arg == 'real':
+                        real = kwval
+                    elif keyword.arg == 'imag':
+                        imag = kwval
+                    else:
+                        self.emitFatalError(f"unknown keyword `{keyword.arg}`",
+                                            node)
+                if not real or not imag:
+                    self.emitFatalError("missing value", node)
+
+                imag = self.changeOperandToType(self.getFloatType(), imag)
+                real = self.changeOperandToType(self.getFloatType(), real)
+                self.pushValue(
+                    complex.CreateOp(self.getComplexType(), real, imag).result)
+                return

-            self.debug_msg(lambda: f'[(Inline) Visit Name]', node.func)
            if node.func.id == 'len':
-                listVal = self.ifPointerThenLoad(self.popValue())
+                listVal = self.__groupValues(node.args, [1])
+
                if cc.StdvecType.isinstance(listVal.type):
                    self.pushValue(
                        cc.StdvecSizeOp(self.getIntegerType(), listVal).result)
@@ -1991,17 +2468,15 @@ def get_full_module_path(partial_path):
                zero = arith.ConstantOp(iTy, IntegerAttr.get(iTy, 0))
                one = arith.ConstantOp(iTy, IntegerAttr.get(iTy, 1))

-                # The total number of elements in the iterable
-                # we are generating should be `N == endVal - startVal`
-                actualSize = arith.SubIOp(endVal, startVal).result
-                totalSize = math.AbsIOp(actualSize).result
-
-                # If the step is not == 1, then we also have
-                # to update the total size for the range iterable
-                actualSize = arith.DivSIOp(actualSize,
-                                           math.AbsIOp(stepVal).result).result
-                totalSize = arith.DivSIOp(totalSize,
-                                          math.AbsIOp(stepVal).result).result
+                totalSize = arith.SubIOp(endVal, startVal).result
+                if isDecrementing:
+                    roundingOffset = arith.AddIOp(stepVal, one)
+                else:
+                    roundingOffset = arith.SubIOp(stepVal, one)
+                totalSize = arith.AddIOp(totalSize, roundingOffset)
+                totalSize = arith.MaxSIOp(
+                    zero,
+                    arith.DivSIOp(totalSize, stepVal).result).result

                # Create an array of i64 of the total size
                arrTy = cc.ArrayType.get(iTy)
@@ -2020,91 +2495,67 @@ def get_full_module_path(partial_path):

                def bodyBuilder(iterVar):
                    loadedCounter = cc.LoadOp(counter).result
-                    tmp = arith.MulIOp(loadedCounter, stepVal).result
-                    arrElementVal = arith.AddIOp(startVal, tmp).result
                    eleAddr = cc.ComputePtrOp(
                        cc.PointerType.get(iTy), iterable, [loadedCounter],
                        DenseI32ArrayAttr.get([kDynamicPtrIndex],
                                              context=self.ctx))
-                    cc.StoreOp(arrElementVal, eleAddr)
+                    cc.StoreOp(iterVar, eleAddr)
                    incrementedCounter = arith.AddIOp(loadedCounter, one).result
                    cc.StoreOp(incrementedCounter, counter)

-                self.createInvariantForLoop(endVal,
-                                            bodyBuilder,
+                self.createMonotonicForLoop(bodyBuilder,
                                            startVal=startVal,
                                            stepVal=stepVal,
+                                            endVal=endVal,
                                            isDecrementing=isDecrementing)

-                self.pushValue(iterable)
-                self.pushValue(actualSize)
+                vect = cc.StdvecInitOp(cc.StdvecType.get(iTy),
+                                       iterable,
+                                       length=totalSize).result
+                self.pushValue(vect)
                return

            if node.func.id == 'enumerate':
                # We have to have something "iterable" on the stack,
                # could be coming from `range()` or an iterable like `qvector`
-                totalSize = None
-                iterable = None
-                iterEleTy = None
-                extractFunctor = None
-                if len(self.valueStack) == 1:
-                    # `qreg`-like or `stdvec`-like thing thing
-                    iterable = self.ifPointerThenLoad(self.popValue())
-                    # Create a new iterable, `alloca cc.struct`
-                    totalSize = None
-                    if quake.VeqType.isinstance(iterable.type):
-                        iterEleTy = self.getRefType()
-                        totalSize = quake.VeqSizeOp(self.getIntegerType(),
-                                                    iterable).result
-
-                        def extractFunctor(idxVal):
-                            return quake.ExtractRefOp(iterEleTy,
-                                                      iterable,
-                                                      -1,
-                                                      index=idxVal).result
-                    elif cc.StdvecType.isinstance(iterable.type):
-                        iterEleTy = cc.StdvecType.getElementType(iterable.type)
-                        totalSize = cc.StdvecSizeOp(self.getIntegerType(),
-                                                    iterable).result
-
-                        def extractFunctor(idxVal):
-                            arrEleTy = cc.ArrayType.get(iterEleTy)
-                            elePtrTy = cc.PointerType.get(iterEleTy)
-                            arrPtrTy = cc.PointerType.get(arrEleTy)
-                            vecPtr = cc.StdvecDataOp(arrPtrTy, iterable).result
-                            eleAddr = cc.ComputePtrOp(
-                                elePtrTy, vecPtr, [idxVal],
-                                DenseI32ArrayAttr.get([kDynamicPtrIndex],
-                                                      context=self.ctx)).result
-                            return cc.LoadOp(eleAddr).result
-                    else:
-                        self.emitFatalError(
-                            "could not infer enumerate tuple type ({})".format(
-                                iterable.type), node)
-                else:
-                    if len(self.valueStack) != 2:
-                        msg = 'Error in AST processing, should have 2 values on the stack for enumerate'
-                        self.emitFatalError(msg, node)
+                iterable = self.__groupValues(node.args, [1])

-                    totalSize = self.popValue()
-                    iterable = self.popValue()
-                    arrTy = cc.PointerType.getElementType(iterable.type)
-                    iterEleTy = cc.ArrayType.getElementType(arrTy)
+                # Create a new iterable, `alloca cc.struct`
+                if quake.VeqType.isinstance(iterable.type):
+                    iterEleTy = self.getRefType()
+                    totalSize = quake.VeqSizeOp(self.getIntegerType(),
+                                                iterable).result
+
+                    def extractFunctor(idxVal):
+                        return quake.ExtractRefOp(iterEleTy,
+                                                  iterable,
+                                                  -1,
+                                                  index=idxVal).result
+                elif cc.StdvecType.isinstance(iterable.type):
+                    iterEleTy = cc.StdvecType.getElementType(iterable.type)
+                    totalSize = cc.StdvecSizeOp(self.getIntegerType(),
+                                                iterable).result

-                    def localFunc(idxVal):
+                    def extractFunctor(idxVal):
+                        arrEleTy = cc.ArrayType.get(iterEleTy)
+                        elePtrTy = cc.PointerType.get(iterEleTy)
+                        arrPtrTy = cc.PointerType.get(arrEleTy)
+                        vecPtr = cc.StdvecDataOp(arrPtrTy, iterable).result
                        eleAddr = cc.ComputePtrOp(
-                            cc.PointerType.get(iterEleTy), iterable, [idxVal],
+                            elePtrTy, vecPtr, [idxVal],
                            DenseI32ArrayAttr.get([kDynamicPtrIndex],
                                                  context=self.ctx)).result
                        return cc.LoadOp(eleAddr).result
-
-                    extractFunctor = localFunc
+                else:
+                    self.emitFatalError(
+                        "could not infer enumerate tuple type ({})".format(
+                            iterable.type), node)

                # Enumerate returns a iterable of tuple(i64, T) for type T
                # Allocate an array of struct == tuple (for us)
                structTy = cc.StructType.get([self.getIntegerType(), iterEleTy])
-                arrTy = cc.ArrayType.get(structTy)
-                enumIterable = cc.AllocaOp(cc.PointerType.get(arrTy),
+                enumIterable = cc.AllocaOp(cc.PointerType.get(
+                    cc.ArrayType.get(structTy)),
                                           TypeAttr.get(structTy),
                                           seqSize=totalSize).result
@@ -2129,122 +2580,48 @@ def bodyBuilder(iterVar):
                        DenseI64ArrayAttr.get([1], context=self.ctx)).result
                    cc.StoreOp(element, eleAddr)

-                self.createInvariantForLoop(totalSize, bodyBuilder)
-                self.pushValue(enumIterable)
-                self.pushValue(totalSize)
+                self.createInvariantForLoop(bodyBuilder, totalSize)
+                vect = cc.StdvecInitOp(cc.StdvecType.get(structTy),
+                                       enumIterable,
+                                       length=totalSize).result
+                self.pushValue(vect)
                return

-            if node.func.id == 'complex':
-                if len(namedArgs) == 0:
-                    imag = self.popValue()
-                    real = self.popValue()
-                else:
-                    imag = namedArgs['imag']
-                    real = namedArgs['real']
-                imag = self.changeOperandToType(self.getFloatType(), imag)
-                real = self.changeOperandToType(self.getFloatType(), real)
-                self.pushValue(
-                    complex.CreateOp(self.getComplexType(), real, imag).result)
+            if self.__isSimpleGate(node.func.id):
+                processQuakeCtor(node.func.id.title(),
+                                 node.args,
+                                 isCtrl=False,
+                                 isAdj=False)
                return

-            if self.__isSimpleGate(node.func.id):
-                # Here we enable application of the op on all the
-                # provided arguments, e.g. `x(qubit)`, `x(qvector)`, `x(q, r)`, etc.
-                numValues = len(self.valueStack)
-                qubitTargets = [self.popValue() for _ in range(numValues)]
-                qubitTargets.reverse()
-                checkControlAndTargetTypes([], qubitTargets)
-                self.__applyQuantumOperation(node.func.id, [], qubitTargets)
+            if self.__isAdjointSimpleGate(node.func.id):
+                processQuakeCtor(node.func.id[0].title(),
+                                 node.args,
+                                 isCtrl=False,
+                                 isAdj=True)
                return

            if self.__isControlledSimpleGate(node.func.id):
-                # These are single target controlled quantum operations
-                MAX_ARGS = 2
-                numValues = len(self.valueStack)
-                if numValues != MAX_ARGS:
-                    raise RuntimeError(
-                        "invalid number of arguments passed to callable {} ({} vs required {})"
-                        .format(node.func.id, len(node.args), MAX_ARGS))
-                target = self.popValue()
-                control = self.popValue()
-                negatedControlQubits = getNegatedControlQubits([control])
-                checkControlAndTargetTypes([control], [target])
-                # Map `cx` to `XOp`...
-                opCtor = getattr(
-                    quake, '{}Op'.format(node.func.id.title()[1:].upper()))
-                opCtor([], [], [control], [target],
-                       negated_qubit_controls=negatedControlQubits)
+                processQuakeCtor(node.func.id[1:].title(),
+                                 node.args,
+                                 isCtrl=True,
+                                 isAdj=False)
                return

            if self.__isRotationGate(node.func.id):
-                numValues = len(self.valueStack)
-                if numValues < 2:
-                    self.emitFatalError(
-                        f'invalid number of arguments ({numValues}) passed to {node.func.id} (requires at least 2 arguments)',
-                        node)
-                qubitTargets = [self.popValue() for _ in range(numValues - 1)]
-                qubitTargets.reverse()
-                param = self.popValue()
-                if IntegerType.isinstance(param.type):
-                    param = arith.SIToFPOp(self.getFloatType(), param).result
-                elif not F64Type.isinstance(param.type):
-                    self.emitFatalError(
-                        'rotational parameter must be a float, or int.', node)
-                checkControlAndTargetTypes([], qubitTargets)
-                self.__applyQuantumOperation(node.func.id, [param],
-                                             qubitTargets)
+                processQuakeCtor(node.func.id.title(),
+                                 node.args,
+                                 isCtrl=False,
+                                 isAdj=False,
+                                 numParams=1)
                return

            if self.__isControlledRotationGate(node.func.id):
-                ## These are single target, one parameter, controlled quantum operations
-                MAX_ARGS = 3
-                numValues = len(self.valueStack)
-                if numValues != MAX_ARGS:
-                    raise RuntimeError(
-                        "invalid number of arguments passed to callable {} ({} vs required {})"
-                        .format(node.func.id, len(node.args), MAX_ARGS))
-                target = self.popValue()
-                control = self.popValue()
-                negatedControlQubits = getNegatedControlQubits([control])
-                checkControlAndTargetTypes([control], [target])
-                param = self.popValue()
-                if IntegerType.isinstance(param.type):
-                    param = arith.SIToFPOp(self.getFloatType(), param).result
-                elif not F64Type.isinstance(param.type):
-                    self.emitFatalError(
-                        'rotational parameter must be a float, or int.', node)
-                # Map `crx` to `RxOp`...
-                opCtor = getattr(
-                    quake, '{}Op'.format(node.func.id.title()[1:].capitalize()))
-                opCtor([], [param], [control], [target],
-                       negated_qubit_controls=negatedControlQubits)
-                return
-
-            if self.__isAdjointSimpleGate(node.func.id):
-                target = self.popValue()
-                checkControlAndTargetTypes([], [target])
-                # Map `sdg` to `SOp`...
-                opCtor = getattr(quake, '{}Op'.format(node.func.id.title()[0]))
-                if quake.VeqType.isinstance(target.type):
-
-                    def bodyBuilder(iterVal):
-                        q = quake.ExtractRefOp(self.getRefType(),
-                                               target,
-                                               -1,
-                                               index=iterVal).result
-                        opCtor([], [], [], [q], is_adj=True)
-
-                    veqSize = quake.VeqSizeOp(self.getIntegerType(),
-                                              target).result
-                    self.createInvariantForLoop(veqSize, bodyBuilder)
-                    return
-                elif quake.RefType.isinstance(target.type):
-                    opCtor([], [], [], [target], is_adj=True)
-                    return
-                else:
-                    self.emitFatalError(
-                        'adj quantum operation on incorrect type {}.'.format(
-                            target.type), node)
+                processQuakeCtor(node.func.id[1:].title(),
+                                 node.args,
+                                 isCtrl=True,
+                                 isAdj=False,
+                                 numParams=1)
                return

            if self.__isMeasurementGate(node.func.id):
@@ -2269,105 +2646,68 @@ def bodyBuilder(iterVal):
                    self.debug_msg(lambda: f'[(Inline) Visit Constant]',
                                   userProvidedRegName.value)
                    registerName = userProvidedRegName.value.value
-                qubits = [self.popValue() for _ in range(len(self.valueStack))]
-                checkControlAndTargetTypes([], qubits)
-                opCtor = getattr(quake, '{}Op'.format(node.func.id.title()))
-                i1Ty = self.getIntegerType(1)
-                resTy = i1Ty if len(qubits) == 1 and quake.RefType.isinstance(
-                    qubits[0].type) else cc.StdvecType.get(i1Ty)
-                measTy = quake.MeasureType.get(
-                ) if len(qubits) == 1 and quake.RefType.isinstance(
-                    qubits[0].type) else cc.StdvecType.get(
-                        quake.MeasureType.get())
-                label = registerName
-                if not label:
-                    label = None
-                measureResult = opCtor(measTy, [], qubits,
-                                       registerName=label).result
+
+                qubits = self.__groupValues(node.args, [(1, -1)])
+                label = registerName or None
+                if len(qubits) == 1 and quake.RefType.isinstance(
+                        qubits[0].type):
+                    measTy = quake.MeasureType.get()
+                    resTy = self.getIntegerType(1)
+                else:
+                    measTy = cc.StdvecType.get(quake.MeasureType.get())
+                    resTy = cc.StdvecType.get(self.getIntegerType(1))
+                measureResult = processQuantumOperation(
+                    node.func.id.title(), [],
+                    qubits,
+                    measTy,
+                    broadcast=False,
+                    registerName=label).result
+
+                # FIXME: needs to be revised when we properly distinguish measurement types
                if pushResultToStack:
                    self.pushValue(
                        quake.DiscriminateOp(resTy, measureResult).result)
                return

            if node.func.id == 'swap':
-                qubitB = self.popValue()
-                qubitA = self.popValue()
-                checkControlAndTargetTypes([], [qubitA, qubitB])
-                opCtor = getattr(quake, '{}Op'.format(node.func.id.title()))
-                opCtor([], [], [], [qubitA, qubitB])
+                processQuakeCtor(node.func.id.title(),
+                                 node.args,
+                                 isCtrl=False,
+                                 isAdj=False,
+                                 numTargets=2)
                return

            if node.func.id == 'reset':
-                target = self.popValue()
-                checkControlAndTargetTypes([], [target])
-                if quake.RefType.isinstance(target.type):
-                    quake.ResetOp([], target)
-                    return
-                if quake.VeqType.isinstance(target.type):
-
-                    def bodyBuilder(iterVal):
-                        q = quake.ExtractRefOp(
-                            self.getRefType(),
-                            target,
-                            -1,  # `kDynamicIndex`
-                            index=iterVal).result
-                        quake.ResetOp([], q)
-
-                    veqSize = quake.VeqSizeOp(self.getIntegerType(),
-                                              target).result
-                    self.createInvariantForLoop(veqSize, bodyBuilder)
-                    return
-                self.emitFatalError(
-                    'reset quantum operation on incorrect type {}.'.format(
-                        target.type), node)
+                targets = self.__groupValues(node.args, [(1, -1)])
+                processQuantumOperation(node.func.id.title(), [],
+                                        targets,
+                                        broadcast=lambda q: q)
+                return

            if node.func.id == 'u3':
-                # Single target, three parameters `u3(θ,φ,λ)`
-                all_args = [
-                    self.popValue() for _ in range(len(self.valueStack))
-                ]
-                if len(all_args) < 4:
-                    self.emitFatalError(
-                        f'invalid number of arguments ({len(all_args)}) passed to {node.func.id} (requires at least 4 arguments)',
-                        node)
-                qubitTargets = all_args[:-3]
-                qubitTargets.reverse()
-                checkControlAndTargetTypes([], qubitTargets)
-                params = all_args[-3:]
-                params.reverse()
-                for idx, val in enumerate(params):
-                    if IntegerType.isinstance(val.type):
-                        params[idx] = arith.SIToFPOp(self.getFloatType(),
-                                                     val).result
-                    elif not F64Type.isinstance(val.type):
-                        self.emitFatalError(
-                            'rotational parameter must be a float, or int.',
-                            node)
-                self.__applyQuantumOperation(node.func.id, params, qubitTargets)
+                processQuakeCtor(node.func.id.title(),
+                                 node.args,
+                                 isCtrl=False,
+                                 isAdj=False,
+                                 numParams=3)
                return

            if node.func.id == 'exp_pauli':
-                pauliWord = self.popValue()
-                qubits = self.popValue()
-                checkControlAndTargetTypes([], [qubits])
-                theta = self.popValue()
-                if IntegerType.isinstance(theta.type):
-                    theta = arith.SIToFPOp(self.getFloatType(), theta).result
-                quake.ExpPauliOp([], [theta], [], [qubits], pauli=pauliWord)
+                # Note: C++ also has a constructor that takes an `f64`, `string`,
+                # and any number of qubits. We don't support this here.
+                theta, target, pauliWord = self.__groupValues(
+                    node.args, [1, 1, 1])
+                theta = self.changeOperandToType(self.getFloatType(), theta)
+                processQuantumOperation("ExpPauli", [], [target], [], [theta],
+                                        broadcast=False,
+                                        pauli=pauliWord)
                return

            if node.func.id in globalRegisteredOperations:
                unitary = globalRegisteredOperations[node.func.id]
                numTargets = int(np.log2(np.sqrt(unitary.size)))
-
-                numValues = len(self.valueStack)
-                if numValues != numTargets:
-                    self.emitFatalError(
-                        f'invalid number of arguments ({numValues}) passed to {node.func.id} (requires {numTargets} arguments)',
-                        node)
-
-                targets = [self.popValue() for _ in range(numTargets)]
-                targets.reverse()
+                targets = self.__groupValues(node.args,
+                                             [(numTargets, numTargets)])

                for i, t in enumerate(targets):
                    if not quake.RefType.isinstance(t.type):
@@ -2416,29 +2756,30 @@ def bodyBuilder(iterVal):
                otherKernel = SymbolTable(self.module.operation)[nvqppPrefix +
                                                                 node.func.id]
-                processFunctionCall(otherKernel.type, len(node.args))
+                res = processFunctionCall(otherKernel)
+                if res is not None:
+                    self.pushValue(res)
                return

            elif node.func.id in self.symbolTable:
                val = self.symbolTable[node.func.id]
                if cc.CallableType.isinstance(val.type):
                    callableTy = cc.CallableType.getFunctionType(val.type)
-                    numVals = len(self.valueStack)
-                    values = [self.popValue() for _ in range(numVals)]
-                    values.reverse()
-                    values = convertArguments(
-                        FunctionType(callableTy).inputs, values)
+                    funcTy = FunctionType(callableTy)
+                    numArgs = len(funcTy.inputs)
+                    values = self.__groupValues(node.args, [(numArgs, numArgs)])
+                    values = convertArguments(funcTy.inputs, values)
                    callable = cc.CallableFuncOp(callableTy, val).result
                    func.CallIndirectOp([], callable, values)
                    return
-                else:
-                    self.emitFatalError(
-                        f"`{node.func.id}` object is not callable, found symbol of type {val.type}",
-                        node)
+
+                self.emitFatalError(
+                    f"`{node.func.id}` object is not callable, found symbol of type {val.type}",
+                    node)

            elif node.func.id == 'int':
                # cast operation
-                value = self.popValue()
+                value = self.__groupValues(node.args, [1])
                casted = self.changeOperandToType(IntegerType.get_signless(64),
                                                  value,
                                                  allowDemotion=True)
@@ -2446,33 +2787,17 @@ def bodyBuilder(iterVal):
                return

            elif node.func.id == 'list':
-                if len(self.valueStack) == 2:
-                    maybeIterableSize = self.popValue()
-                    maybeIterable = self.popValue()
-
-                    # Make sure that we have a list + size
-                    if IntegerType.isinstance(maybeIterableSize.type):
-                        if cc.PointerType.isinstance(maybeIterable.type):
-                            ptrEleTy = cc.PointerType.getElementType(
-                                maybeIterable.type)
-                            if cc.ArrayType.isinstance(ptrEleTy):
-                                # We're good, just pass this back through.
-                                self.pushValue(maybeIterable)
-                                self.pushValue(maybeIterableSize)
-                                return
-                if len(self.valueStack) == 1:
-                    arrayTy = self.valueStack[0].type
-                    if cc.PointerType.isinstance(arrayTy):
-                        arrayTy = cc.PointerType.getElementType(arrayTy)
-                    if cc.StdvecType.isinstance(arrayTy):
-                        return
-                    if cc.ArrayType.isinstance(arrayTy):
-                        return
-
-                self.emitFatalError('Invalid list() cast requested.', node)
+                # The expected Python behavior is that a constructor call
+                # to list creates a new list (a shallow copy).
+                value = self.__groupValues(node.args, [1])
+                copy = self.__copyAndValidateContainer(value, node.args[0],
+                                                       False)
+                self.pushValue(copy)
+                return

            elif node.func.id in ['print_i64', 'print_f64']:
-                self.__insertDbgStmt(self.popValue(), node.func.id)
+                value = self.__groupValues(node.args, [1])
+                self.__insertDbgStmt(value, node.func.id)
                return

            elif node.func.id in globalRegisteredTypes.classes:
@@ -2485,12 +2810,22 @@ def bodyBuilder(iterVal):
                        f"Adding new fields in data classes is not yet supported. The dataclass must be declared with @dataclass(slots=True) or @dataclasses.dataclass(slots=True).",
                        node)

-                # Alloca the struct
+                if node.keywords:
+                    self.emitFatalError(
+                        "keyword arguments for data classes are not yet supported",
+                        node)
+
                structTys = [
                    mlirTypeFromPyType(v, self.ctx)
                    for _, v in annotations.items()
                ]
+                numArgs = len(structTys)
+                ctorArgs = self.__groupValues(node.args, [(numArgs, numArgs)])
+                ctorArgs = convertArguments(structTys, ctorArgs)
+                for idx, arg in enumerate(ctorArgs):
+                    self.__validate_container_entry(arg, node.args[idx])
+
                structTy = mlirTryCreateStructType(structTys,
                                                   name=node.func.id,
                                                   context=self.ctx)
@@ -2510,12 +2845,6 @@ def bodyBuilder(iterVal):
                        'struct types with user specified methods are not allowed.',
                        node)

-                ctorArgs = [
-                    self.popValue() for _ in range(len(self.valueStack))
-                ]
-                ctorArgs.reverse()
-                ctorArgs = convertArguments(structTys, ctorArgs)
-
                if quake.StruqType.isinstance(structTy):
                    # If we have a quantum struct. We cannot allocate classical
                    # memory and load / store quantum type values to that memory
@@ -2523,17 +2852,14 @@ def bodyBuilder(iterVal):
                    self.pushValue(quake.MakeStruqOp(structTy, ctorArgs).result)
                    return

-                stackSlot = cc.AllocaOp(cc.PointerType.get(structTy),
-                                        TypeAttr.get(structTy)).result
-
-                # loop over each type and `compute_ptr` / store
-                for i, ty in enumerate(structTys):
-                    eleAddr = cc.ComputePtrOp(
-                        cc.PointerType.get(ty), stackSlot, [],
-                        DenseI32ArrayAttr.get([i], context=self.ctx)).result
-                    cc.StoreOp(ctorArgs[i], eleAddr)
-                self.pushValue(stackSlot)
+                struct = cc.UndefOp(structTy)
+                for idx, element in enumerate(ctorArgs):
+                    struct = cc.InsertValueOp(
+                        structTy, struct, element,
+                        DenseI64ArrayAttr.get([idx], context=self.ctx)).result
+                self.pushValue(struct)
                return
+
            else:
                self.emitFatalError(
                    "unhandled function call - {}, known kernels are {}".format(
@@ -2550,6 +2876,40 @@ def bodyBuilder(iterVal):
                # Handled in the Attribute visit,
                # since `numpy` arrays have a size attribute
                self.visit(node.func)
+                self.pushValue(self.popValue())
+                return
+
+            if node.func.attr == 'copy':
+                self.visit(node.func.value)
+                funcVal = self.popValue()
+                deepCopy, dTy = None, None
+
+                for keyword in node.keywords:
+                    if keyword.arg == 'deep':
+                        deepCopy = keyword.value
+                    elif keyword.arg == 'dtype':
+                        self.visit(keyword.value)
+                        dTy = self.popValue()
+                    else:
+                        self.emitFatalError(f"unknown keyword `{keyword.arg}`",
+                                            node)
+
+                if len(node.args) == 1 and deepCopy is None:
+                    deepCopy = node.args[0]
+                else:
+                    self.__groupValues(node.args, [0])
+                if deepCopy:
+                    if not isinstance(deepCopy, ast.Constant):
+                        self.emitFatalError(
+                            "argument to `copy` must be a constant", node)
+                    deepCopy = deepCopy.value
+
+                # If we created a deep copy, we can set the parent node
+                # of the value to copy to be this node for validation purposes.
+                pyVal = node if deepCopy else node.func.value
+                copy = self.__copyAndValidateContainer(funcVal, pyVal, deepCopy,
+                                                       dTy)
+                self.pushValue(copy)
                return

            if self.__isSupportedVectorFunction(node.func.attr):
@@ -2559,7 +2919,7 @@ def bodyBuilder(iterVal):
                # we make the functions we support on values explicit
                # somewhere, there is no way around that.
                self.visit(node.func.value)
-                funcVal = self.ifPointerThenLoad(self.popValue())
+                funcVal = self.popValue()

                # Just to be nice and give a dedicated error.
            if (node.func.attr == 'append' and
@@ -2578,12 +2938,9 @@ def bodyBuilder(iterVal):
                        node)

                funcArg = None
-                if len(node.args) > 1:
-                    self.emitFatalError(
-                        f'call to {node.func.attr} supports at most one value')
-                elif len(node.args) == 1:
-                    self.visit(node.args[0])
-                    funcArg = self.ifPointerThenLoad(self.popValue())
+                args = self.__groupValues(node.args, [(0, 1)])
+                if args:
+                    funcArg = args[0]

                    if not IntegerType.isinstance(funcArg.type):
                        self.emitFatalError(
                            f'expecting an integer argument for call to {node.func.attr}',
@@ -2647,39 +3004,19 @@ def bodyBuilder(iterVal):

            if isinstance(node.func.value, ast.Name):
                if node.func.value.id in ['numpy', 'np']:
-                    [self.visit(arg) for arg in node.args]
-                    namedArgs = {}
-                    for keyword in node.keywords:
-                        self.visit(keyword.value)
-                        namedArgs[keyword.arg] = self.popValue()
-
-                    value = self.popValue()
+                    value = self.__groupValues(node.args, [1])

                    if node.func.attr == 'array':
-                        # `np.array(vec, )`
-                        arrayType = value.type
-                        if cc.PointerType.isinstance(value.type):
-                            arrayType = cc.PointerType.getElementType(
-                                value.type)
-
-                        if cc.StdvecType.isinstance(arrayType):
-                            eleTy = cc.StdvecType.getElementType(arrayType)
-                            dTy = eleTy
-                            if len(namedArgs) > 0:
-                                dTy = namedArgs['dtype']
-
-                            # Convert the vector to the provided data type if needed.
-                            self.pushValue(
-                                self.__copyVectorAndCastElements(
-                                    value, dTy, allowDemotion=True))
-                            return
-
-                        raise self.emitFatalError(
-                            f"unexpected numpy array initializer type: {value.type}",
-                            node)
-
-                    value = self.ifPointerThenLoad(value)
+                        # The expected Python behavior is that a constructor call
+                        # to array creates a new array (a shallow copy). Additionally, since
+                        # a new value is created, we need to make sure container entries
+                        # are properly validated. To not duplicate the logic, we simply
+                        # call `copy` here.
+                        self.visit_Call(
+                            ast.Call(ast.Attribute(node.args[0], 'copy'), [],
+                                     node.keywords))
+                        return

                    if node.func.attr in ['complex128', 'complex64']:
                        if node.func.attr == 'complex128':
@@ -2778,20 +3115,19 @@ def bodyBuilder(iterVal):
                        self.emitFatalError(
                            f"unsupported NumPy call ({node.func.attr})", node)

-                [self.visit(arg) for arg in node.args]
-
                if node.func.value.id == 'cudaq':
                    if node.func.attr == 'complex':
+                        self.__groupValues(node.args, [0])
                        self.pushValue(self.simulationDType())
                        return

                    if node.func.attr == 'amplitudes':
-                        value = self.popValue()
-                        arrayType = value.type
+                        value = self.__groupValues(node.args, [1])
+
+                        valueTy = value.type
                        if cc.PointerType.isinstance(value.type):
-                            arrayType = cc.PointerType.getElementType(
-                                value.type)
-                        if cc.StdvecType.isinstance(arrayType):
+                            valueTy = cc.PointerType.getElementType(value.type)
+                        if cc.StdvecType.isinstance(valueTy):
                            self.pushValue(value)
                            return
@@ -2800,32 +3136,26 @@ def bodyBuilder(iterVal):
                            node)

                    if node.func.attr == 'qvector':
-                        if len(self.valueStack) == 0:
+                        if len(node.args) == 0:
                            self.emitFatalError(
                                'qvector does not have default constructor. Init from size or existing state.',
                                node)
-                        valueOrPtr = self.popValue()
-                        initializerTy = valueOrPtr.type
+                        value = self.__groupValues(node.args, [1])

-                        if cc.PointerType.isinstance(initializerTy):
-                            initializerTy = cc.PointerType.getElementType(
-                                initializerTy)
-
-                        if (IntegerType.isinstance(initializerTy)):
+                        if (IntegerType.isinstance(value.type)):
                            # handle `cudaq.qvector(n)`
-                            value = self.ifPointerThenLoad(valueOrPtr)
                            ty = self.getVeqType()
                            qubits = quake.AllocaOp(ty, size=value).result
                            self.pushValue(qubits)
                            return
-                        if cc.StdvecType.isinstance(initializerTy):
+
+                        if cc.StdvecType.isinstance(value.type):
                            # handle `cudaq.qvector(initState)`
                            # Validate the length in case of a constant initializer:
                            # `cudaq.qvector([1., 0., ...])`
                            # `cudaq.qvector(np.array([1., 0., ...]))`
-                            value = self.ifPointerThenLoad(valueOrPtr)
                            listScalar = None
                            arrNode = node.args[0]
                            if isinstance(arrNode, ast.List):
@@ -2856,7 +3186,6 @@ def bodyBuilder(iterVal):
                            # TODO: Dynamically check if number of qubits is power of 2
                            # and if the state is normalized

-                            ptrTy = cc.PointerType.get(eleTy)
                            arrTy = cc.ArrayType.get(eleTy)
                            ptrArrTy = cc.PointerType.get(arrTy)
                            veqTy = quake.VeqType.get()
@@ -2869,19 +3198,20 @@ def bodyBuilder(iterVal):
                            self.pushValue(init)
                            return

-                        if cc.StateType.isinstance(initializerTy):
+                        if (cc.PointerType.isinstance(value.type) and
+                                cc.StateType.isinstance(
+                                    cc.PointerType.getElementType(value.type))):
                            # handle `cudaq.qvector(state)`
-                            statePtr = self.ifNotPointerThenStore(valueOrPtr)
                            i64Ty = self.getIntegerType()
-                            numQubits = quake.GetNumberOfQubitsOp(
-                                i64Ty, statePtr).result
+                            numQubits = quake.GetNumberOfQubitsOp(i64Ty,
+                                                                  value).result

                            veqTy = quake.VeqType.get()
                            qubits = quake.AllocaOp(veqTy,
                                                    size=numQubits).result
                            init = quake.InitializeStateOp(
-                                veqTy, qubits, statePtr).result
+                                veqTy, qubits, value).result
                            self.pushValue(init)
                            return
@@ -2891,8 +3221,7 @@ def bodyBuilder(iterVal):
                                node)

                    if node.func.attr == "qubit":
-                        if len(self.valueStack) == 1 and IntegerType.isinstance(
-                                self.valueStack[0].type):
+                        if len(node.args) != 0:
                            self.emitFatalError(
                                'cudaq.qubit() constructor does not take any arguments. To construct a vector of qubits, use `cudaq.qvector(N)`.'
                            )
@@ -2900,34 +3229,132 @@ def bodyBuilder(iterVal):
                        return

                    if node.func.attr == 'adjoint' or node.func.attr == 'control':
-                        processControlAndAdjoint(node.args[0], node.func.attr)
+
+                        # NOTE: We currently generally don't have the means in the
+                        # compiler to handle composition of control and adjoint, since
+                        # control and adjoint are not proper functors (i.e. there is
+                        # no way to obtain a new callable object that is the adjoint
+                        # or controlled version of another callable).
+                        # Since we don't really treat callables as first-class values,
+                        # the first argument to control and adjoint indeed has to be
+                        # a Name object.
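+                        # Hypothetical example: `cudaq.control(kern, ctrls, *args)`
+                        # and `cudaq.adjoint(kern, *args)` are handled below;
+                        # composing the two, e.g. taking the adjoint of a
+                        # controlled kernel, is not expressible.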
+                    if not node.args or not isinstance(
+                            node.args[0], ast.Name):
+                        self.emitFatalError(
+                            f'unsupported argument in call to {node.func.attr} - first argument must be a symbol name',
+                            node)
+                    otherFuncName = node.args[0].id
+                    kwargs = {"is_adj": node.func.attr == 'adjoint'}
+
+                    if otherFuncName in self.symbolTable:
+                        self.visit(node.args[0])
+                        fctArg = self.popValue()
+                        if not cc.CallableType.isinstance(fctArg.type):
+                            self.emitFatalError(
+                                f"{otherFuncName} is not a quantum kernel",
+                                node)
+                        functionTy = FunctionType(
+                            cc.CallableType.getFunctionType(fctArg.type))
+                        inputTys, outputTys = functionTy.inputs, functionTy.results
+                        indirectCallee = [fctArg]
+                    elif otherFuncName in globalKernelRegistry:
+                        otherFunc = globalKernelRegistry[otherFuncName]
+                        inputTys, outputTys = otherFunc.arguments.types, otherFunc.results.types
+                        indirectCallee = []
+                        kwargs["callee"] = FlatSymbolRefAttr.get(
+                            nvqppPrefix + otherFuncName)
+                    elif otherFuncName in globalRegisteredOperations:
+                        self.emitFatalError(
+                            f"calling cudaq.control or cudaq.adjoint on a globally registered operation is not supported",
+                            node)
+                    elif self.__isUnitaryGate(
+                            otherFuncName) or self.__isMeasurementGate(
+                                otherFuncName):
+                        self.emitFatalError(
+                            f"calling cudaq.control or cudaq.adjoint on a built-in gate is not supported",
+                            node)
+                    else:
+                        self.emitFatalError(
+                            f"{otherFuncName} is not a known quantum kernel - maybe a cudaq.kernel attribute is missing?",
+                            node)
+
+                    numArgs = len(inputTys)
+                    invert_controls = lambda: None
+                    if node.func.attr == 'control':
+                        controls, args = self.__groupValues(
+                            node.args[1:], [(1, -1), (numArgs, numArgs)])
+                        qvec_or_qubits = (
+                            all((quake.RefType.isinstance(v.type)
+                                 for v in controls)) or
+                            (len(controls) == 1 and
+                             quake.VeqType.isinstance(controls[0].type)))
+                        if not qvec_or_qubits:
+                            self.emitFatalError(
+                                f'invalid argument type for control operand',
+                                node)
+                        # TODO: it would be cleaner to add support for negated control
+                        # qubits to `quake.ApplyOp`
+                        negatedControlQubits = self.controlNegations.copy()
+                        self.controlNegations.clear()
+                        if negatedControlQubits:
+                            invert_controls = lambda: processQuantumOperation(
+                                'X', [], negatedControlQubits, [], [])
+                    else:
+                        controls, args = self.__groupValues(
+                            node.args[1:], [(0, 0), (numArgs, numArgs)])
+
+                    args = convertArguments(inputTys, args)
+                    if len(outputTys) != 0:
+                        self.emitFatalError(
+                            f'cannot take {node.func.attr} of kernel {otherFuncName} that returns a value',
+                            node)
+                    invert_controls()
+                    quake.ApplyOp([], indirectCallee, controls, args,
+                                  **kwargs)
+                    invert_controls()
                     return

                 if node.func.attr == 'apply_noise':
-                    # Pop off all the arguments we need
-                    values = [
-                        self.popValue() for _ in range(len(self.valueStack))
-                    ]
-                    # They are in reverse order
-                    values.reverse()
-                    # First one should be the number of Kraus channel parameters
-                    numParamsVal = values[0]
-                    # Shrink the arguments down
-                    values = values[1:]
-
-                    # Need to get the number of parameters as an integer
-                    concreteIntAttr = IntegerAttr(
-                        numParamsVal.owner.attributes['value'])
-                    numParams = concreteIntAttr.value
-
-                    # Next Value is our generated key for the channel
-                    # Get it and shrink the list
-                    key = values[0]
-                    values = values[1:]
-
-                    # Now we know the next `numParams` arguments are
-                    # our Kraus channel parameters
-                    params = 
values[:numParams] + + # The first argument must be the Kraus channel + numParams, key = 0, None + if (isinstance(node.args[0], ast.Attribute) and + node.args[0].value.id == 'cudaq' and + node.args[0].attr in supportedChannels): + + cudaq_module = importlib.import_module('cudaq') + channel_class = getattr(cudaq_module, + node.args[0].attr) + numParams = channel_class.num_parameters + key = self.getConstantInt(hash(channel_class)) + elif (isinstance(node.args[0], ast.Name) and + node.args[0].id in self.capturedVars): + arg = self.capturedVars[node.args[0].id] + try: + # We should have a custom Kraus channel. + if issubclass(arg, cudaq_runtime.KrausChannel): + numParams = arg.num_parameters + key = self.getConstantInt(hash(arg)) + except: + pass + if key is None: + self.emitFatalError( + "unsupported argument for Kraus channel in apply_noise", + node) + + # This currently requires at least one qubit argument + params, values = self.__groupValues( + node.args[1:], [(numParams, numParams), (1, -1)]) + checkControlAndTargetTypes([], values) + for i, p in enumerate(params): # If we have a F64 value, we want to # store it to a pointer @@ -2938,63 +3365,24 @@ def bodyBuilder(iterVal): cc.StoreOp(p, alloca) params[i] = alloca - # The remaining arguments are the qubits - asVeq = quake.ConcatOp(self.getVeqType(), - values[numParams:]).result + asVeq = quake.ConcatOp(self.getVeqType(), values).result quake.ApplyNoiseOp(params, [asVeq], key=key) return if node.func.attr == 'compute_action': - # There can only be 2 arguments here. - action = None - compute = None - actionArg = node.args[1] - if isinstance(actionArg, ast.Name): - actionName = actionArg.id - if actionName in self.symbolTable: - action = self.symbolTable[actionName] - else: - self.emitFatalError( - "could not find action lambda / function in the symbol table.", - node) - else: - action = self.popValue() - - computeArg = node.args[0] - if isinstance(computeArg, ast.Name): - computeName = computeArg.id - if computeName in self.symbolTable: - compute = self.symbolTable[computeName] - else: - self.emitFatalError( - "could not find compute lambda / function in the symbol table.", - node) - else: - compute = self.popValue() - + compute, action = self.__groupValues(node.args, [2]) quake.ComputeActionOp(compute, action) return if node.func.attr == 'to_integer': - boolVec = self.popValue() - boolVec = self.ifPointerThenLoad(boolVec) - if not cc.StdvecType.isinstance(boolVec.type): - self.emitFatalError( - "to_integer expects a vector of booleans. Got type {}" - .format(boolVec.type), node) - elemTy = cc.StdvecType.getElementType(boolVec.type) - if elemTy != self.getIntegerType(1): - self.emitFatalError( - "to_integer expects a vector of booleans. 
Got type {}" - .format(boolVec.type), node) + boolVec = self.__groupValues(node.args, [1]) + args = convertArguments( + [cc.StdvecType.get(self.getIntegerType(1))], + [boolVec]) cudaqConvertToInteger = "__nvqpp_cudaqConvertToInteger" - # Load the intrinsic load_intrinsic(self.module, cudaqConvertToInteger) - # Signature: - # `func.func private @__nvqpp_cudaqConvertToInteger(%arg : !cc.stdvec) -> i64` - resultTy = self.getIntegerType(64) - result = func.CallOp([resultTy], cudaqConvertToInteger, - [boolVec]).result + result = func.CallOp([self.getIntegerType(64)], + cudaqConvertToInteger, args).result self.pushValue(result) return @@ -3023,207 +3411,63 @@ def maybeProposeOpAttrFix(opName, attrName): # We have a `func_name.ctrl` if self.__isSimpleGate(node.func.value.id): if node.func.attr == 'ctrl': - target = self.popValue() - # Should be number of arguments minus one for the controls - controls = [ - self.popValue() for i in range(len(node.args) - 1) - ] - if not controls: - self.emitFatalError( - 'controlled operation requested without any control argument(s).', - node) - negatedControlQubits = getNegatedControlQubits(controls) - - opCtor = getattr( - quake, '{}Op'.format(node.func.value.id.title())) - checkControlAndTargetTypes(controls, [target]) - opCtor([], [], - controls, [target], - negated_qubit_controls=negatedControlQubits) + processQuakeCtor(node.func.value.id.title(), + node.args, + isCtrl=True, + isAdj=False) return if node.func.attr == 'adj': - target = self.popValue() - checkControlAndTargetTypes([], [target]) - opCtor = getattr( - quake, '{}Op'.format(node.func.value.id.title())) - if quake.VeqType.isinstance(target.type): - - def bodyBuilder(iterVal): - q = quake.ExtractRefOp(self.getRefType(), - target, - -1, - index=iterVal).result - opCtor([], [], [], [q], is_adj=True) - - veqSize = quake.VeqSizeOp(self.getIntegerType(), - target).result - self.createInvariantForLoop(veqSize, bodyBuilder) - return - elif quake.RefType.isinstance(target.type): - opCtor([], [], [], [target], is_adj=True) - return - else: - self.emitFatalError( - 'adj quantum operation on incorrect type {}.'. - format(target.type), node) - + processQuakeCtor(node.func.value.id.title(), + node.args, + isCtrl=False, + isAdj=True) + return self.emitFatalError( f'Unknown attribute on quantum operation {node.func.value.id} ({node.func.attr}). 
{maybeProposeOpAttrFix(node.func.value.id, node.func.attr)}' ) - # We have a `func_name.ctrl` - if node.func.value.id == 'swap' and node.func.attr == 'ctrl': - targetB = self.popValue() - targetA = self.popValue() - controls = [ - self.popValue() for i in range(len(self.valueStack)) - ] - if not controls: - self.emitFatalError( - 'controlled operation requested without any control argument(s).', - node) - negatedControlQubits = getNegatedControlQubits(controls) - opCtor = getattr(quake, - '{}Op'.format(node.func.value.id.title())) - checkControlAndTargetTypes(controls, [targetA, targetB]) - opCtor([], [], - controls, [targetA, targetB], - negated_qubit_controls=negatedControlQubits) - return - if self.__isRotationGate(node.func.value.id): if node.func.attr == 'ctrl': - target = self.popValue() - controls = [ - self.popValue() for i in range(len(self.valueStack)) - ] - param = controls[-1] - controls = controls[:-1] - if not controls: - self.emitFatalError( - 'controlled operation requested without any control argument(s).', - node) - negatedControlQubits = getNegatedControlQubits(controls) - if IntegerType.isinstance(param.type): - param = arith.SIToFPOp(self.getFloatType(), - param).result - elif not F64Type.isinstance(param.type): - self.emitFatalError( - 'rotational parameter must be a float, or int.', - node) - opCtor = getattr( - quake, '{}Op'.format(node.func.value.id.title())) - checkControlAndTargetTypes(controls, [target]) - opCtor([], [param], - controls, [target], - negated_qubit_controls=negatedControlQubits) + processQuakeCtor(node.func.value.id.title(), + node.args, + isCtrl=True, + isAdj=False, + numParams=1) return - if node.func.attr == 'adj': - target = self.popValue() - param = self.popValue() - if IntegerType.isinstance(param.type): - param = arith.SIToFPOp(self.getFloatType(), - param).result - elif not F64Type.isinstance(param.type): - self.emitFatalError( - 'rotational parameter must be a float, or int.', - node) - opCtor = getattr( - quake, '{}Op'.format(node.func.value.id.title())) - checkControlAndTargetTypes([], [target]) - if quake.VeqType.isinstance(target.type): - - def bodyBuilder(iterVal): - q = quake.ExtractRefOp(self.getRefType(), - target, - -1, - index=iterVal).result - opCtor([], [param], [], [q], is_adj=True) - - veqSize = quake.VeqSizeOp(self.getIntegerType(), - target).result - self.createInvariantForLoop(veqSize, bodyBuilder) - return - elif quake.RefType.isinstance(target.type): - opCtor([], [param], [], [target], is_adj=True) - return - else: - self.emitFatalError( - 'adj quantum operation on incorrect type {}.'. - format(target.type), node) - + processQuakeCtor(node.func.value.id.title(), + node.args, + isCtrl=False, + isAdj=True, + numParams=1) + return self.emitFatalError( f'Unknown attribute on quantum operation {node.func.value.id} ({node.func.attr}). 
{maybeProposeOpAttrFix(node.func.value.id, node.func.attr)}' ) - if node.func.value.id == 'u3': - numValues = len(self.valueStack) - target = self.popValue() - other_args = [self.popValue() for _ in range(numValues - 1)] - - opCtor = getattr(quake, - '{}Op'.format(node.func.value.id.title())) + if node.func.value.id == 'swap' and node.func.attr == 'ctrl': + processQuakeCtor(node.func.value.id.title(), + node.args, + isCtrl=True, + isAdj=False, + numTargets=2) + return + if node.func.value.id == 'u3': if node.func.attr == 'ctrl': - controls = other_args[:-3] - if not controls: - self.emitFatalError( - 'controlled operation requested without any control argument(s).', - node) - negatedControlQubits = getNegatedControlQubits(controls) - params = other_args[-3:] - params.reverse() - for idx, val in enumerate(params): - if IntegerType.isinstance(val.type): - params[idx] = arith.SIToFPOp( - self.getFloatType(), val).result - elif not F64Type.isinstance(val.type): - self.emitFatalError( - 'rotational parameter must be a float, or int.', - node) - - checkControlAndTargetTypes(controls, [target]) - opCtor([], - params, - controls, [target], - negated_qubit_controls=negatedControlQubits) + processQuakeCtor(node.func.value.id.title(), + node.args, + isCtrl=True, + isAdj=False, + numParams=3) return - if node.func.attr == 'adj': - params = other_args - params.reverse() - for idx, val in enumerate(params): - if IntegerType.isinstance(val.type): - params[idx] = arith.SIToFPOp( - self.getFloatType(), val).result - elif not F64Type.isinstance(val.type): - self.emitFatalError( - 'rotational parameter must be a float, or int.', - node) - - checkControlAndTargetTypes([], [target]) - if quake.VeqType.isinstance(target.type): - - def bodyBuilder(iterVal): - q = quake.ExtractRefOp(self.getRefType(), - target, - -1, - index=iterVal).result - opCtor([], params, [], [q], is_adj=True) - - veqSize = quake.VeqSizeOp(self.getIntegerType(), - target).result - self.createInvariantForLoop(veqSize, bodyBuilder) - return - elif quake.RefType.isinstance(target.type): - opCtor([], params, [], [target], is_adj=True) - return - else: - self.emitFatalError( - 'adj quantum operation on incorrect type {}.'. - format(target.type), node) - + processQuakeCtor(node.func.value.id.title(), + node.args, + isCtrl=False, + isAdj=True, + numParams=3) + return self.emitFatalError( f'unknown attribute {node.func.attr} on u3', node) @@ -3236,18 +3480,7 @@ def bodyBuilder(iterVal): unitary = globalRegisteredOperations[node.func.value.id] numTargets = int(np.log2(np.sqrt(unitary.size))) - numValues = len(self.valueStack) - targets = [self.popValue() for _ in range(numTargets)] - targets.reverse() - - for i, t in enumerate(targets): - if not quake.RefType.isinstance(t.type): - self.emitFatalError( - f'invalid target operand {i}, broadcasting is not supported on custom operations.' 
- ) - globalName = f'{nvqppPrefix}{node.func.value.id}_generator_{numTargets}.rodata' - currentST = SymbolTable(self.module.operation) if not globalName in currentST: with InsertionPoint(self.module.body): @@ -3255,24 +3488,26 @@ def bodyBuilder(iterVal): self.loc, self.module, globalName, unitary.tolist()) - negatedControlQubits = None - controls = [] - is_adj = False - if node.func.attr == 'ctrl': - controls = [ - self.popValue() - for _ in range(numValues - numTargets) - ] - if not controls: - self.emitFatalError( - 'controlled operation requested without any control argument(s).', - node) + controls, targets = self.__groupValues( + node.args, [(1, -1), (numTargets, numTargets)]) negatedControlQubits = getNegatedControlQubits(controls) + is_adj = False if node.func.attr == 'adj': + controls, targets = self.__groupValues( + node.args, [0, (numTargets, numTargets)]) + negatedControlQubits = None is_adj = True checkControlAndTargetTypes(controls, targets) + # The check above makes sure targets are either a list + # of individual qubits, or a single `qvector`. Since + # a `qvector` is not allowed, we check this here: + if not quake.RefType.isinstance(targets[0].type): + self.emitFatalError( + f'invalid target operand - target must not be a qvector' + ) + quake.CustomUnitarySymbolOp( [], generator=FlatSymbolRefAttr.get(globalName), @@ -3297,42 +3532,18 @@ def visit_ListComp(self, node): "CUDA-Q only supports single generators for list comprehension.", node) - # Let's handle the following `listVar` types - # ` %9 = cc.alloca !cc.array x 2> -> ptr x N>` - # or - # ` %3 = cc.alloca T[%2 : i64] -> ptr>` self.visit(node.generators[0].iter) - - if len(self.valueStack) == 1: - iterable = self.ifPointerThenLoad(self.popValue()) - iterableSize = None - if cc.StdvecType.isinstance(iterable.type): - iterableSize = cc.StdvecSizeOp(self.getIntegerType(), - iterable).result - iterTy = cc.StdvecType.getElementType(iterable.type) - iterArrPtrTy = cc.PointerType.get(cc.ArrayType.get(iterTy)) - iterable = cc.StdvecDataOp(iterArrPtrTy, iterable).result - elif quake.VeqType.isinstance(iterable.type): - iterableSize = quake.VeqSizeOp(self.getIntegerType(), - iterable).result - iterTy = quake.RefType.get() - if iterableSize is None: - self.emitFatalError( - "CUDA-Q only supports list comprehension on ranges and arrays", - node) - elif len(self.valueStack) == 2: - iterableSize = self.popValue() - iterable = self.popValue() - if not cc.PointerType.isinstance(iterable.type): - self.emitFatalError( - "CUDA-Q only supports list comprehension on ranges and arrays", - node) - iterArrTy = cc.PointerType.getElementType(iterable.type) - if not cc.ArrayType.isinstance(iterArrTy): - self.emitFatalError( - "CUDA-Q only supports list comprehension on ranges and arrays", - node) - iterTy = cc.ArrayType.getElementType(iterArrTy) + iterable = self.popValue() + if cc.StdvecType.isinstance(iterable.type): + iterableSize = cc.StdvecSizeOp(self.getIntegerType(), + iterable).result + iterTy = cc.StdvecType.getElementType(iterable.type) + iterArrPtrTy = cc.PointerType.get(cc.ArrayType.get(iterTy)) + iterable = cc.StdvecDataOp(iterArrPtrTy, iterable).result + elif quake.VeqType.isinstance(iterable.type): + iterableSize = quake.VeqSizeOp(self.getIntegerType(), + iterable).result + iterTy = quake.RefType.get() else: self.emitFatalError( "CUDA-Q only supports list comprehension on ranges and arrays", @@ -3352,11 +3563,13 @@ def process_void_list(): forNode.target = node.generators[0].target forNode.body = [node.elt] forNode.orelse = [] + 
forNode.lineno = node.lineno
+        # this loop could be marked as invariant if we didn't use `visit_For`
         self.visit_For(forNode)

         target_types = {}

-        def get_item_type(target, targetType):
+        def get_target_type(target, targetType):
             if isinstance(target, ast.Name):
                 if target.id in target_types:
                     self.emitFatalError(
@@ -3373,12 +3586,12 @@ def get_item_type(target, targetType):
                     self.emitFatalError(
                         "shape mismatch in tuple deconstruction", node)
                 for i, ty in enumerate(types):
-                    get_item_type(target.elts[i], ty)
+                    get_target_type(target.elts[i], ty)
             else:
                 self.emitFatalError(
                     "unsupported target in tuple deconstruction", node)

-        get_item_type(node.generators[0].target, iterTy)
+        get_target_type(node.generators[0].target, iterTy)

         # We need to know the element type of the list we are creating.
         # Unfortunately, dynamic typing makes this a bit painful.
@@ -3398,7 +3611,12 @@ def get_item_type(pyval):
                 elts = [get_item_type(v) for v in pyval.elts]
                 if None in elts:
                     return None
-                return cc.PointerType.get(cc.StructType.getNamed("tuple", elts))
+                structTy = mlirTryCreateStructType(elts, context=self.ctx)
+                if not structTy:
+                    # we still return a struct type here to make sure that a
+                    # comprehensive error is generated when `elt` is walked below.
+                    return cc.StructType.getNamed("tuple", elts)
+                return structTy
             elif (isinstance(pyval, ast.Subscript) and
                   IntegerType.isinstance(get_item_type(pyval.slice))):
                 parentType = get_item_type(pyval.value)
@@ -3430,13 +3648,8 @@ def get_item_type(pyval):
                 return cc.StdvecType.get(base_elTy)
             elif isinstance(pyval, ast.Call):
                 if isinstance(pyval.func, ast.Name):
-                    # supported for calls but not here:
-                    # 'range', 'enumerate', 'list'
-                    if pyval.func.id == 'len' or pyval.func.id == 'int':
-                        return IntegerType.get_signless(64)
-                    elif pyval.func.id == 'complex':
-                        return self.getComplexType()
-                    elif self.__isUnitaryGate(
+                    # supported for calls but not here: 'range', 'enumerate'
+                    if self.__isUnitaryGate(
                             pyval.func.id) or pyval.func.id == 'reset':
                         process_void_list()
                         return None
@@ -3489,25 +3702,31 @@ def get_item_type(pyval):
                     elif pyval.func.id in globalRegisteredTypes.classes:
                         _, annotations = globalRegisteredTypes.getClassAttributes(
                             pyval.func.id)
-                        structTys = [
+                        elts = [
                             mlirTypeFromPyType(v, self.ctx)
                             for _, v in annotations.items()
                         ]
-                        # no need to do much verification on the validity of the type here -
-                        # this will be handled when we build the body
-                        isStruq = any(
-                            (self.isQuantumType(t) for t in structTys))
-                        if isStruq:
-                            return quake.StruqType.getNamed(
-                                pyval.func.id, structTys)
-                        else:
-                            return cc.PointerType.get(
-                                cc.StructType.getNamed(pyval.func.id,
-                                                       structTys))
-                    elif (isinstance(pyval.func, ast.Attribute) and
-                          (pyval.func.attr == 'ctrl' or pyval.func.attr == 'adj')):
-                        process_void_list()
-                        return None
+                        structTy = mlirTryCreateStructType(elts,
+                                                           pyval.func.id,
+                                                           context=self.ctx)
+                        if not structTy:
+                            # we still return a struct type here to make sure that a
+                            # comprehensive error is generated when `elt` is walked below.
+                            return cc.StructType.getNamed(pyval.func.id, elts)
+                        return structTy
+                    elif pyval.func.id == 'len' or pyval.func.id == 'int':
+                        return IntegerType.get_signless(64)
+                    elif pyval.func.id == 'complex':
+                        return self.getComplexType()
+                    elif pyval.func.id == 'list' and len(pyval.args) == 1:
+                        return get_item_type(pyval.args[0])
+                elif isinstance(pyval.func, ast.Attribute):
+                    if (pyval.func.attr == 'copy' and
+                            'dtype' not in pyval.keywords):
+                        return get_item_type(pyval.func.value)
+                    if pyval.func.attr == 'ctrl' or pyval.func.attr == 'adj':
+                        process_void_list()
+                        return None
                 self.emitFatalError("unsupported call in list comprehension",
                                     node)
             elif isinstance(pyval, ast.Compare):
@@ -3541,8 +3760,7 @@ def get_item_type(pyval):
             return

         resultVecTy = cc.StdvecType.get(listElemTy)
-        isBool = listElemTy == self.getIntegerType(1)
-        if isBool:
+        if listElemTy == self.getIntegerType(1):
             listElemTy = self.getIntegerType(8)
         listTy = cc.ArrayType.get(listElemTy)
         listValue = cc.AllocaOp(cc.PointerType.get(listTy),
@@ -3558,31 +3776,38 @@ def get_item_type(pyval):

         def bodyBuilder(iterVar):
             self.symbolTable.pushScope()
             if quake.VeqType.isinstance(iterable.type):
-                loadedEle = quake.ExtractRefOp(iterTy,
-                                               iterable,
-                                               -1,
-                                               index=iterVar).result
+                iterVal = quake.ExtractRefOp(iterTy,
+                                             iterable,
+                                             -1,
+                                             index=iterVar).result
             else:
                 eleAddr = cc.ComputePtrOp(
                     cc.PointerType.get(iterTy), iterable, [iterVar],
                     DenseI32ArrayAttr.get([kDynamicPtrIndex],
                                           context=self.ctx))
-                loadedEle = cc.LoadOp(eleAddr).result
-            self.__deconstructAssignment(node.generators[0].target, loadedEle)
+                iterVal = cc.LoadOp(eleAddr).result
+
+            # We don't support anything within list comprehensions that would
+            # require being careful about assigning references, so simply
+            # adding them to the symbol table is enough for list comprehension.
+            self.__deconstructAssignment(node.generators[0].target, iterVal)
             self.visit(node.elt)
-            result = self.popValue()
+            element = self.popValue()
+            # We do need to be careful, however, about validating the list elements.
+ self.__validate_container_entry(element, node.elt) + listValueAddr = cc.ComputePtrOp( cc.PointerType.get(listElemTy), listValue, [iterVar], DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx)) - - if isBool: - result = self.changeOperandToType(self.getIntegerType(8), - result) - cc.StoreOp(result, listValueAddr) + element = self.changeOperandToType(listElemTy, + element, + allowDemotion=False) + cc.StoreOp(element, listValueAddr) self.symbolTable.popScope() - self.createInvariantForLoop(iterableSize, bodyBuilder) - self.pushValue( - cc.StdvecInitOp(resultVecTy, listValue, length=iterableSize).result) + self.createInvariantForLoop(bodyBuilder, iterableSize) + res = cc.StdvecInitOp(resultVecTy, listValue, + length=iterableSize).result + self.pushValue(res) return def visit_List(self, node): @@ -3611,14 +3836,14 @@ def visit_List(self, node): listElementValues.append(evalElem) else: self.visit(element) - # We do not store lists of pointers - evalElem = self.ifPointerThenLoad(self.popValue()) + evalElem = self.popValue() if self.isQuantumType( evalElem.type) and not quake.RefType.isinstance( evalElem.type): self.emitFatalError( "list must not contain a qvector or quantum struct - use `*` operator to unpack qvectors", node) + self.__validate_container_entry(evalElem, element) listElementValues.append(evalElem) numQuantumTs = sum( @@ -3659,9 +3884,7 @@ def visit_List(self, node): ] # Turn this List into a StdVec - self.pushValue( - self.__createStdvecWithKnownValues(len(node.elts), - listElementValues)) + self.pushValue(self.__createStdvecWithKnownValues(listElementValues)) def visit_Constant(self, node): """ @@ -3733,11 +3956,16 @@ def fix_negative_idx(idx, get_size): # handle complex slice, VAR[lower:upper] if isinstance(node.slice, ast.Slice): self.debug_msg(lambda: f'[(Inline) Visit Slice]', node.slice) + if self.pushPointerValue: + self.emitFatalError( + "slicing a list or qvector does not produce a modifiable value", + node) + self.visit(node.value) - var = self.ifPointerThenLoad(self.popValue()) + var = self.popValue() vectorSize = get_size(var) - lowerVal, upperVal, stepVal = (None, None, None) + lowerVal, upperVal = None, None if node.slice.lower is not None: self.visit(node.slice.lower) lowerVal = fix_negative_idx(self.popValue(), lambda: vectorSize) @@ -3795,16 +4023,41 @@ def fix_negative_idx(idx, get_size): return - self.generic_visit(node) - - assert len(self.valueStack) > 1 + # Only variable names, subscripts and attributes can + # produce modifiable values. Anything else produces an + # immutable value. We make sure the visit gets processed + # such that the rest of the code can give a proper error. 
+        value_root = node.value
+        while (isinstance(value_root, ast.Subscript) or
+               isinstance(value_root, ast.Attribute)):
+            value_root = value_root.value
+        if self.pushPointerValue and not isinstance(value_root, ast.Name):
+            self.pushPointerValue = False
+            self.visit(node.value)
+            var = self.popValue()
+            self.pushPointerValue = True
+        else:
+            # `isSubscriptRoot` is only used/needed to enable
+            # modification of items in lists and dataclasses
+            # contained in a tuple
+            subscriptRoot = self.isSubscriptRoot
+            self.isSubscriptRoot = True
+            self.visit(node.value)
+            var = self.popValue()
+            self.isSubscriptRoot = subscriptRoot

-        # get the last name, should be name of var being subscripted
-        var = self.ifPointerThenLoad(self.popValue())
+        pushPtr = self.pushPointerValue
+        self.pushPointerValue = False
+        self.visit(node.slice)
         idx = self.popValue()
+        self.pushPointerValue = pushPtr

-        # Support `VAR[-1]` as the last element of `VAR`
         if quake.VeqType.isinstance(var.type):
+            if self.pushPointerValue:
+                self.emitFatalError(
+                    "indexing into a qvector does not produce a modifiable value",
+                    node)
+
             if not IntegerType.isinstance(idx.type):
                 self.emitFatalError(
                     f'invalid index variable type used for qvector extraction ({idx.type})',
@@ -3815,6 +4068,44 @@ def fix_negative_idx(idx, get_size):
                 index=idx).result)
             return

+        if cc.PointerType.isinstance(var.type):
+            # We should only ever get a pointer if we
+            # explicitly asked for it.
+            assert self.pushPointerValue
+            varType = cc.PointerType.getElementType(var.type)
+            if cc.StdvecType.isinstance(varType):
+                # We can get a pointer to a vector (only) if we
+                # are updating a struct item that is a pointer.
+                if self.pushPointerValue:
+                    # In this case, it should be safe to load
+                    # the vector, since the underlying data is
+                    # not loaded.
+ var = cc.LoadOp(var).result + + if cc.StructType.isinstance(varType): + structName = cc.StructType.getName(varType) + if not self.isSubscriptRoot and structName == 'tuple': + self.emitFatalError("tuple value cannot be modified", node) + if not isinstance(node.slice, ast.Constant): + if self.pushPointerValue: + if structName == 'tuple': + self.emitFatalError( + "tuple value cannot be modified via non-constant subscript", + node) + self.emitFatalError( + f"{structName} value cannot be modified via non-constant subscript - use attribute access instead", + node) + + idxVal = node.slice.value + structTys = cc.StructType.getTypes(varType) + eleAddr = cc.ComputePtrOp(cc.PointerType.get(structTys[idxVal]), + var, [], + DenseI32ArrayAttr.get([idxVal + ])).result + if self.pushPointerValue: + self.pushValue(eleAddr) + return + if cc.StdvecType.isinstance(var.type): idx = fix_negative_idx(idx, lambda: get_size(var)) eleTy = cc.StdvecType.getElementType(var.type) @@ -3829,7 +4120,7 @@ def fix_negative_idx(idx, get_size): elePtrTy, vecPtr, [idx], DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx)).result - if self.subscriptPushPointerValue: + if self.pushPointerValue: self.pushValue(eleAddr) return val = cc.LoadOp(eleAddr).result @@ -3838,24 +4129,6 @@ def fix_negative_idx(idx, get_size): self.pushValue(val) return - if cc.PointerType.isinstance(var.type): - ptrEleTy = cc.PointerType.getElementType(var.type) - # Return the pointer if someone asked for it - if self.subscriptPushPointerValue: - self.pushValue(var) - return - if cc.ArrayType.isinstance(ptrEleTy): - # Here we want subscript on `ptr>` - arrayEleTy = cc.ArrayType.getElementType(ptrEleTy) - ptrEleTy = cc.PointerType.get(arrayEleTy) - casted = cc.CastOp(ptrEleTy, var).result - eleAddr = cc.ComputePtrOp( - ptrEleTy, casted, [idx], - DenseI32ArrayAttr.get([kDynamicPtrIndex], - context=self.ctx)).result - self.pushValue(cc.LoadOp(eleAddr).result) - return - def get_idx_value(upper_bound): idxValue = None if hasattr(idx.owner, 'opview') and isinstance( @@ -3877,33 +4150,29 @@ def get_idx_value(upper_bound): # We allow subscripts into `Structs`, but only if we don't need a pointer # (i.e. no updating of Tuples). if cc.StructType.isinstance(var.type): - if self.subscriptPushPointerValue: + if self.pushPointerValue: + structName = cc.StructType.getName(var.type) + if structName == 'tuple': + self.emitFatalError("tuple value cannot be modified", node) self.emitFatalError( - "indexing into tuple or dataclass must not modify value", + f"{structName} value cannot be modified - use `.copy(deep)` to create a new value that can be modified", node) - # Handle the case where we have a tuple member extraction, memory semantics memberTys = cc.StructType.getTypes(var.type) idxValue = get_idx_value(len(memberTys)) - structPtr = self.ifNotPointerThenStore(var) - eleAddr = cc.ComputePtrOp( - cc.PointerType.get(memberTys[idxValue]), structPtr, [], - DenseI32ArrayAttr.get([idxValue], context=self.ctx)).result + member = cc.ExtractValueOp(memberTys[idxValue], var, [], + DenseI32ArrayAttr.get([idxValue])).result - # Return the pointer if someone asked for it - if self.subscriptPushPointerValue: - self.pushValue(eleAddr) - return - self.pushValue(cc.LoadOp(eleAddr).result) + self.pushValue(member) return - # Let's allow subscripts into `Struqs`, but only if we don't need a pointer + # We allow subscripts into `Struqs`, but only if we don't need a pointer # (i.e. no updating of `Struqs`). 
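+        # Illustrative: reading an element of a quantum struct (hypothetical
+        # `mystruq[0]`) extracts the member below, whereas assigning to it is
+        # rejected with the error that follows.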
if quake.StruqType.isinstance(var.type): - if self.subscriptPushPointerValue: + if self.pushPointerValue: self.emitFatalError( - "indexing into quantum tuple or dataclass must not modify value", + "indexing into quantum tuple or dataclass does not produce a modifiable value", node) memberTys = quake.StruqType.getTypes(var.type) @@ -3924,6 +4193,8 @@ def visit_For(self, node): `veq` type, the `stdvec` type, and the result of range() and enumerate(). """ + + getValues = None if isinstance(node.iter, ast.Call): self.debug_msg(lambda: f'[(Inline) Visit Call]', node.iter) @@ -3931,288 +4202,175 @@ def visit_For(self, node): # by just building a for loop with N as the upper value, # no need to generate an array from the `range` call. if node.iter.func.id == 'range': - # This is a range(N) for loop, we just need - # the upper bound N for this loop - [self.visit(arg) for arg in node.iter.args] + iterable = None startVal, endVal, stepVal, isDecrementing = self.__processRangeLoopIterationBounds( node.iter.args) - - if not isinstance(node.target, ast.Name): - self.emitFatalError( - "iteration variable must be a single name", node) - - def bodyBuilder(iterVar): - self.symbolTable.pushScope() - self.symbolTable.add(node.target.id, iterVar) - [self.visit(b) for b in node.body] - self.symbolTable.popScope() - - self.createInvariantForLoop(endVal, - bodyBuilder, - startVal=startVal, - stepVal=stepVal, - isDecrementing=isDecrementing, - elseStmts=node.orelse) - - return + getValues = lambda iterVar: iterVar # We can simplify `for i,j in enumerate(L)` MLIR code immensely # by just building a for loop over the iterable object L and using # the index into that iterable and the element. - if node.iter.func.id == 'enumerate': - [self.visit(arg) for arg in node.iter.args] - if len(self.valueStack) == 2: - iterable = self.popValue() - self.popValue() - else: - assert len(self.valueStack) == 1 - iterable = self.popValue() - iterable = self.ifPointerThenLoad(iterable) - totalSize = None - extractFunctor = None - - beEfficient = False - if quake.VeqType.isinstance(iterable.type): - totalSize = quake.VeqSizeOp(self.getIntegerType(), - iterable).result - - def functor(seq, idx): - q = quake.ExtractRefOp(self.getRefType(), - seq, - -1, - index=idx).result - return [idx, q] - - extractFunctor = functor - beEfficient = True - elif cc.StdvecType.isinstance(iterable.type): - totalSize = cc.StdvecSizeOp(self.getIntegerType(), - iterable).result - - def functor(seq, idx): - vecTy = cc.StdvecType.getElementType(seq.type) - dataTy = cc.PointerType.get(vecTy) - arrTy = vecTy - if not cc.ArrayType.isinstance(arrTy): - arrTy = cc.ArrayType.get(vecTy) - dataArrTy = cc.PointerType.get(arrTy) - data = cc.StdvecDataOp(dataArrTy, seq).result - v = cc.ComputePtrOp( - dataTy, data, [idx], - DenseI32ArrayAttr.get([kDynamicPtrIndex], - context=self.ctx)).result - return [idx, v] - - extractFunctor = functor - beEfficient = True - - if beEfficient: + elif node.iter.func.id == 'enumerate': + if len(node.iter.args) != 1: + self.emitFatalError( + "invalid number of arguments to enumerate - expecting 1 argument", + node) - if (not isinstance(node.target, ast.Tuple) or - len(node.target.elts) != 2): - self.emitFatalError( - "iteration variable must be a tuple of two items", - node) + self.visit(node.iter.args[0]) + iterable = self.popValue() + getValues = lambda iterVar, v: (iterVar, v) - def bodyBuilder(iterVar): - self.symbolTable.pushScope() - values = extractFunctor(iterable, iterVar) - assert (len(values) == 2) - for i, v in 
enumerate(values): - self.__deconstructAssignment(node.target.elts[i], v) - [self.visit(b) for b in node.body] - self.symbolTable.popScope() - - self.createInvariantForLoop(totalSize, - bodyBuilder, - elseStmts=node.orelse) - return + if not getValues: + self.visit(node.iter) + iterable = self.popValue() - self.visit(node.iter) - assert len(self.valueStack) > 0 and len(self.valueStack) < 3 + if iterable: - totalSize = None - iterable = None - extractFunctor = None + isDecrementing = False + startVal = self.getConstantInt(0) + stepVal = self.getConstantInt(1) + relevantVals = getValues or (lambda iterVar, v: v) - # It could be that its the only value we have, - # in which case we know we have for var in iterable, - # but we could also have another value on the stack, - # the total size of the iterable, produced by range() / enumerate() - if len(self.valueStack) == 1: - # Get the iterable from the stack - iterable = self.ifPointerThenLoad(self.popValue()) # we currently handle `veq` and `stdvec` types if quake.VeqType.isinstance(iterable.type): size = quake.VeqType.getSize(iterable.type) if quake.VeqType.hasSpecifiedSize(iterable.type): - totalSize = self.getConstantInt(size) + endVal = self.getConstantInt(size) else: - totalSize = quake.VeqSizeOp(self.getIntegerType(64), - iterable).result + endVal = quake.VeqSizeOp(self.getIntegerType(), + iterable).result + + def loadElement(iterVar): + val = quake.ExtractRefOp(self.getRefType(), + iterable, + -1, + index=iterVar).result + return relevantVals(iterVar, val) - def functor(iter, idx): - return quake.ExtractRefOp(self.getRefType(), - iter, - -1, - index=idx).result + getValues = loadElement - extractFunctor = functor elif cc.StdvecType.isinstance(iterable.type): iterEleTy = cc.StdvecType.getElementType(iterable.type) - totalSize = cc.StdvecSizeOp(self.getIntegerType(), - iterable).result isBool = iterEleTy == self.getIntegerType(1) if isBool: iterEleTy = self.getIntegerType(8) + endVal = cc.StdvecSizeOp(self.getIntegerType(), iterable).result - def functor(iter, idxVal): + def loadElement(iterVar): elePtrTy = cc.PointerType.get(iterEleTy) arrTy = cc.ArrayType.get(iterEleTy) ptrArrTy = cc.PointerType.get(arrTy) - vecPtr = cc.StdvecDataOp(ptrArrTy, iter).result + vecPtr = cc.StdvecDataOp(ptrArrTy, iterable).result eleAddr = cc.ComputePtrOp( - elePtrTy, vecPtr, [idxVal], + elePtrTy, vecPtr, [iterVar], DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx)).result - result = cc.LoadOp(eleAddr).result + val = cc.LoadOp(eleAddr).result if isBool: - result = self.changeOperandToType( - self.getIntegerType(1), result) - return result + val = self.changeOperandToType(self.getIntegerType(1), + val) + return relevantVals(iterVar, val) - extractFunctor = functor + getValues = loadElement else: self.emitFatalError('{} iterable type not supported.', node) - else: - # In this case, we are coming from range() or enumerate(), - # and the iterable is a cc.array and the total size of the - # array is on the stack, pop it here - totalSize = self.popValue() - # Get the iterable from the stack - iterable = self.popValue() - - # Double check our types are right - assert cc.PointerType.isinstance(iterable.type) - arrayType = cc.PointerType.getElementType(iterable.type) - assert cc.ArrayType.isinstance(arrayType) - elementType = cc.ArrayType.getElementType(arrayType) - - def functor(iter, idx): - eleAddr = cc.ComputePtrOp( - cc.PointerType.get(elementType), iter, [idx], - DenseI32ArrayAttr.get([kDynamicPtrIndex], - context=self.ctx)).result - return 
cc.LoadOp(eleAddr).result - - extractFunctor = functor - - def bodyBuilder(iterVar): + def blockBuilder(iterVar, stmts): self.symbolTable.pushScope() - # we set the extract functor above, use it here - value = extractFunctor(iterable, iterVar) - self.__deconstructAssignment(node.target, value) - [self.visit(b) for b in node.body] + values = getValues(iterVar) + # We need to create proper assignments to the loop + # iteration variable(s) to have consistent behavior. + assignNode = ast.Assign() + assignNode.targets = [node.target] + assignNode.value = values + assignNode.lineno = node.lineno + self.visit(assignNode) + [self.visit(b) for b in stmts] self.symbolTable.popScope() - self.createInvariantForLoop(totalSize, - bodyBuilder, - elseStmts=node.orelse) + self.createMonotonicForLoop( + lambda iterVar: blockBuilder(iterVar, node.body), + startVal=startVal, + stepVal=stepVal, + endVal=endVal, + isDecrementing=isDecrementing, + orElseBuilder=None if not node.orelse else + lambda iterVar: blockBuilder(iterVar, node.orelse)) def visit_While(self, node): """ Convert Python while statements into the equivalent CC `LoopOp`. """ - loop = cc.LoopOp([], [], BoolAttr.get(False)) - whileBlock = Block.create_at_start(loop.whileRegion, []) - with InsertionPoint(whileBlock): + + def evalCond(args): # BUG you cannot print MLIR values while building the cc `LoopOp` while region. # verify will get called, no terminator yet, CCOps.cpp:520 v = self.verbose self.verbose = False self.visit(node.test) - condition = self.popValue() - if self.getIntegerType(1) != condition.type: - # not equal to 0, then compare with 1 - condPred = IntegerAttr.get(self.getIntegerType(), 1) - condition = arith.CmpIOp(condPred, condition, - self.getConstantInt(0)).result - cc.ConditionOp(condition, []) + condition = self.__arithmetic_to_bool(self.popValue()) self.verbose = v + return condition - bodyBlock = Block.create_at_start(loop.bodyRegion, []) - with InsertionPoint(bodyBlock): - self.symbolTable.pushScope() - self.pushForBodyStack([]) - [self.visit(b) for b in node.body] - if not self.hasTerminator(bodyBlock): - cc.ContinueOp([]) - self.popForBodyStack() - self.symbolTable.popScope() - - stepBlock = Block.create_at_start(loop.stepRegion, []) - with InsertionPoint(stepBlock): - cc.ContinueOp([]) - - if node.orelse: - elseBlock = Block.create_at_start(loop.elseRegion, []) - with InsertionPoint(elseBlock): - self.symbolTable.pushScope() - for stmt in node.orelse: - self.visit(stmt) - if not self.hasTerminator(elseBlock): - cc.ContinueOp(elseBlock.arguments) - self.symbolTable.popScope() + self.createForLoop([], lambda _: [self.visit(b) for b in node.body], [], + evalCond, lambda _: [], None if not node.orelse else + lambda _: [self.visit(stmt) for stmt in node.orelse]) def visit_BoolOp(self, node): """ Convert boolean operations into equivalent MLIR operations using the Arith Dialect. """ - shortCircuitWhenTrue = isinstance(node.op, ast.Or) if isinstance(node.op, ast.And) or isinstance(node.op, ast.Or): + # Visit the LHS and pop the value # Note we want any `mz(q)` calls to push their # result value to the stack, so we set a non-None # variable name here. 
self.currentAssignVariableName = '' self.visit(node.values[0]) - lhs = self.popValue() - zero = self.getConstantInt(0, IntegerType(lhs.type).width) + cond = self.__arithmetic_to_bool(self.popValue()) - cond = arith.CmpIOp( - self.getIntegerAttr(self.getIntegerType(), - 1 if shortCircuitWhenTrue else 0), lhs, - zero).result + def process_boolean_op(prior, values): + + if len(values) == 0: + return prior - ifOp = cc.IfOp([cond.type], cond, []) - thenBlock = Block.create_at_start(ifOp.thenRegion, []) - with InsertionPoint(thenBlock): if isinstance(node.op, ast.And): - constantFalse = arith.ConstantOp(cond.type, - BoolAttr.get(False)) - cc.ContinueOp([constantFalse]) - else: - cc.ContinueOp([cond]) + prior = arith.XOrIOp(prior, self.getConstantInt(1, + 1)).result + + ifOp = cc.IfOp([prior.type], prior, []) + thenBlock = Block.create_at_start(ifOp.thenRegion, []) + with InsertionPoint(thenBlock): + if isinstance(node.op, ast.And): + constantFalse = arith.ConstantOp( + prior.type, BoolAttr.get(False)) + cc.ContinueOp([constantFalse]) + else: + cc.ContinueOp([prior]) - elseBlock = Block.create_at_start(ifOp.elseRegion, []) - with InsertionPoint(elseBlock): - self.symbolTable.pushScope() - self.pushIfStmtBlockStack() - self.visit(node.values[1]) - rhs = self.popValue() - cc.ContinueOp([rhs]) - self.popIfStmtBlockStack() - self.symbolTable.popScope() + elseBlock = Block.create_at_start(ifOp.elseRegion, []) + with InsertionPoint(elseBlock): + self.symbolTable.pushScope() + self.pushIfStmtBlockStack() + self.visit(values[0]) + rhs = process_boolean_op( + self.__arithmetic_to_bool(self.popValue()), values[1:]) + cc.ContinueOp([rhs]) + self.popIfStmtBlockStack() + self.symbolTable.popScope() + + return ifOp.result + self.pushValue(process_boolean_op(cond, node.values[1:])) # Reset the assign variable name self.currentAssignVariableName = None - - self.pushValue(ifOp.result) return + self.emitFatalError(f'unsupported boolean expression {node.op}', node) + def visit_Compare(self, node): """ Visit while loop compare operations and translate to equivalent MLIR. @@ -4249,6 +4407,9 @@ def convert_arithmetic_types(item1, item2): allowDemotion=False) return item1, item2 + # To understand the integer attributes used here (the predicates) + # see `arith::CmpIPredicate` and `arith::CmpFPredicate`. + def compare_equality(item1, item2): # TODO: the In/NotIn case should be recursive such # that we can search for a list in a list of lists. 
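For reference, a minimal sketch of the user-facing behavior implemented by the `visit_BoolOp` lowering above (hypothetical kernel, not taken from this change); each additional operand of an `and`/`or` chain is emitted into the else-branch of a `cc.IfOp`, so it is only evaluated when the result is not already determined:

import cudaq

@cudaq.kernel
def short_circuit_demo() -> bool:
    q = cudaq.qubit()
    h(q)
    b = mz(q)
    # With `and`, the second measurement below only executes when `b`
    # is true; otherwise the chain short-circuits to False.
    return b and mz(q)
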
@@ -4342,31 +4503,41 @@ def compare_equality(item1, item2):
         if isinstance(op, (ast.In, ast.NotIn)):
             # Type validation and vector initialization
-            if not (cc.StdvecType.isinstance(right.type) or
-                    cc.ArrayType.isinstance(right.type)):
+            if not cc.StdvecType.isinstance(right.type):
                 self.emitFatalError(
                     "Right operand must be a list/vector for 'in' comparison")
+            vectSize = cc.StdvecSizeOp(self.getIntegerType(), right).result

             # Loop setup
             i1_type = self.getIntegerType(1)
+            trueVal = self.getConstantInt(1, 1)
             accumulator = cc.AllocaOp(cc.PointerType.get(i1_type),
                                       TypeAttr.get(i1_type)).result
-            cc.StoreOp(self.getConstantInt(0, 1), accumulator)
+            cc.StoreOp(trueVal, accumulator)

             # Element comparison loop
-            def check_element(idx):
-                element = self.__load_vector_element(right, idx)
+            def check_element(args):
+                element = self.__load_vector_element(right, args[0])
                 compRes = compare_equality(left, element)
+                neqRes = arith.XOrIOp(compRes, trueVal).result
                 current = cc.LoadOp(accumulator).result
-                cc.StoreOp(arith.OrIOp(current, compRes), accumulator)
+                cc.StoreOp(arith.AndIOp(current, neqRes), accumulator)
+
+            def check_condition(args):
+                notListEnd = arith.CmpIOp(IntegerAttr.get(iTy, 2), args[0],
+                                          vectSize).result
+                notFound = cc.LoadOp(accumulator).result
+                return arith.AndIOp(notListEnd, notFound).result

-            self.createInvariantForLoop(self.__get_vector_size(right),
-                                        check_element)
+            # Break early if we found the item
+            self.createForLoop(
+                [self.getIntegerType()], check_element,
+                [self.getConstantInt(0)], check_condition, lambda args:
+                [arith.AddIOp(args[0], self.getConstantInt(1)).result])

             final_result = cc.LoadOp(accumulator).result
-            if isinstance(op, ast.NotIn):
-                final_result = arith.XOrIOp(final_result,
-                                            self.getConstantInt(1, 1)).result
+            if isinstance(op, ast.In):
+                final_result = arith.XOrIOp(final_result, trueVal).result
             self.pushValue(final_result)
             return
@@ -4382,26 +4553,7 @@ def visit_If(self, node):
         self.currentAssignVariableName = ''
         self.visit(node.test)
         self.currentAssignVariableName = None
-
-        condition = self.popValue()
-        condition = self.ifPointerThenLoad(condition)
-
-        # To understand the integer attributes used here (the predicates)
-        # see `arith::CmpIPredicate` and `arith::CmpFPredicate`.
-
-        if self.getIntegerType(1) != condition.type:
-            if IntegerType.isinstance(condition.type):
-                condPred = IntegerAttr.get(self.getIntegerType(), 1)
-                condition = arith.CmpIOp(condPred, condition,
-                                         self.getConstantInt(0)).result
-
-            elif F64Type.isinstance(condition.type):
-                condPred = IntegerAttr.get(self.getIntegerType(), 13)
-                condition = arith.CmpFOp(condPred, condition,
-                                         self.getConstantFloat(0)).result
-            else:
-                self.emitFatalError("condition cannot be converted to bool",
-                                    node)
+        condition = self.__arithmetic_to_bool(self.popValue())

         ifOp = cc.IfOp([], condition, [])
         thenBlock = Block.create_at_start(ifOp.thenRegion, [])
@@ -4434,63 +4586,93 @@ def visit_Return(self, node):
         self.visit(node.value)
         self.walkingReturnNode = False

-        if len(self.valueStack) == 0:
+        if self.valueStack.currentNumValues == 0:
             return

-        result = self.ifPointerThenLoad(self.popValue())
-        result = self.ifPointerThenLoad(result)
         result = self.changeOperandToType(self.knownResultType,
-                                          result,
+                                          self.popValue(),
                                           allowDemotion=True)

-        if cc.StdvecType.isinstance(result.type):
+        # Generally, anything that was allocated locally on the stack
+        # needs to be copied to the heap to ensure it lives past the end
+        # of the function. This holds recursively; if we have a struct
+        # that contains a list, then the list data may need to be
+        # copied if it was allocated inside the function.
+        def copy_list_to_heap(value):
             symName = '__nvqpp_vectorCopyCtor'
             load_intrinsic(self.module, symName)
-            eleTy = cc.StdvecType.getElementType(result.type)
+            elemTy = cc.StdvecType.getElementType(value.type)
+            if elemTy == self.getIntegerType(1):
+                elemTy = self.getIntegerType(8)
             ptrTy = cc.PointerType.get(self.getIntegerType(8))
             arrTy = cc.ArrayType.get(self.getIntegerType(8))
             ptrArrTy = cc.PointerType.get(arrTy)
-            resBuf = cc.StdvecDataOp(ptrArrTy, result).result
-            # TODO Revisit this calculation
-            byteWidth = 16 if ComplexType.isinstance(eleTy) else 8
-            eleSize = self.getConstantInt(byteWidth)
-            dynSize = cc.StdvecSizeOp(self.getIntegerType(), result).result
+            resBuf = cc.StdvecDataOp(ptrArrTy, value).result
+            eleSize = cc.SizeOfOp(self.getIntegerType(),
+                                  TypeAttr.get(elemTy)).result
+            dynSize = cc.StdvecSizeOp(self.getIntegerType(), value).result
             resBuf = cc.CastOp(ptrTy, resBuf)
             heapCopy = func.CallOp([ptrTy], symName,
                                    [resBuf, dynSize, eleSize]).result
-            res = cc.StdvecInitOp(result.type, heapCopy, length=dynSize).result
-            func.ReturnOp([res])
-            return
+            return cc.StdvecInitOp(value.type, heapCopy, length=dynSize).result
+
+        rootVal = self.__get_root_value(node.value)
+        if rootVal and self.isFunctionArgument(rootVal):
+            # If we allow assigning a value that contains a list to an item
+            # of a function argument (which we do with the exceptions
+            # commented below), then we necessarily need to make a copy when
+            # we return function arguments, or function argument elements,
+            # that contain lists, since we have to assume that their data may
+            # be allocated on the stack. However, this leads to incorrect
+            # behavior if a returned list was indeed caller-side allocated
+            # (and should correspondingly have been returned by reference).
+            # Rather than preventing updates to lists in function arguments,
+            # we instead ensure that lists contained in function arguments
+            # stay recognizable as such, and prevent returning function
+            # arguments that contain lists.
+            # NOTE: While it seems straightforward in principle to fail only
+            # when we return *inner* lists of function arguments, this
+            # is still not a good option for two reasons:
+            # 1) Even if we return the reference to the outer list correctly,
+            # any caller-side assignment of the return value would no longer
+            # be recognizable as being the same reference given as argument,
+            # which is a problem if the list was an argument to the caller.
+            # I.e. while this works for one function indirection, it does
+            # not work for two (see assignment tests).
+            # 2) To ensure that we don't have any memory leaks, we copy any
+            # lists returned from function calls to the stack. This copy (as
+            # of the time of writing this) results in a segfault when the
+            # list is not on the heap. As it is, we hence have to copy
+            # every returned list to the heap, followed by a copy to the stack
+            # in the caller. Subsequent optimization passes should largely
+            # eliminate unnecessary copies.
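+            # Illustrative (hypothetical) example of the rejected pattern:
+            #
+            #   @cudaq.kernel
+            #   def passthrough(xs: list[int]) -> list[int]:
+            #       return xs   # error: may alias caller-allocated data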
+ if (self.containsList(result.type)): + self.emitFatalError( + "return value must not contain a list that is a function argument or an item in a function argument" + + + " - for device kernels, lists passed as arguments will be modified in place; " + + + "remove the return value or use .copy(deep) to create a copy", + node) + else: + result = self.__migrateLists(result, copy_list_to_heap) if self.symbolTable.numLevels() > 1: # We are in an inner scope, release all scopes before returning cc.UnwindReturnOp([result]) return - func.ReturnOp([result]) def visit_Tuple(self, node): """ Map tuples in the Python AST to equivalents in MLIR. """ - # FIXME: The handling of tuples in Python likely needs to be examined carefully; - # The corresponding issue to clarify the expected behavior is - # https://github.com/NVIDIA/cuda-quantum/issues/3031 - # I re-enabled the tuple support in kernel signatures, given that we were already - # allowing the use of data classes everywhere, and supporting tuple use within a - # kernel. It hence seems that any issues with tuples also apply to named structs. - - self.generic_visit(node) - elementValues = [self.popValue() for _ in range(len(node.elts))] + [self.visit(el) for el in node.elts] + elementValues = self.popAllValues(len(node.elts)) elementValues.reverse() - - # We do not store structs of pointers - elementValues = [ - cc.LoadOp(ele).result - if cc.PointerType.isinstance(ele.type) else ele - for ele in elementValues - ] + for idx, value in enumerate(elementValues): + self.__validate_container_entry(value, node.elts[idx]) structTys = [v.type for v in elementValues] structTy = mlirTryCreateStructType(structTys, context=self.ctx) @@ -4501,27 +4683,23 @@ def visit_Tuple(self, node): if quake.StruqType.isinstance(structTy): self.pushValue(quake.MakeStruqOp(structTy, elementValues).result) - else: - stackSlot = cc.AllocaOp(cc.PointerType.get(structTy), - TypeAttr.get(structTy)).result - - # loop over each type and `compute_ptr` / store - for i, ty in enumerate(structTys): - eleAddr = cc.ComputePtrOp( - cc.PointerType.get(ty), stackSlot, [], - DenseI32ArrayAttr.get([i], context=self.ctx)).result - cc.StoreOp(elementValues[i], eleAddr) - - self.pushValue(stackSlot) return + struct = cc.UndefOp(structTy) + for idx, element in enumerate(elementValues): + struct = cc.InsertValueOp( + structTy, struct, element, + DenseI64ArrayAttr.get([idx], context=self.ctx)).result + self.pushValue(struct) + def visit_UnaryOp(self, node): """ Map unary operations in the Python AST to equivalents in MLIR. """ - self.generic_visit(node) + self.visit(node.operand) operand = self.popValue() + # Handle qubit negations if isinstance(node.op, ast.Invert): if quake.RefType.isinstance(operand.type): @@ -4556,13 +4734,9 @@ def visit_UnaryOp(self, node): return if isinstance(node.op, ast.Not): - if not IntegerType.isinstance(operand.type): - self.emitFatalError("UnaryOp Not() on non-integer value.", node) - - zero = self.getConstantInt(0, IntegerType(operand.type).width) self.pushValue( - arith.CmpIOp(IntegerAttr.get(self.getIntegerType(), 0), operand, - zero).result) + arith.XOrIOp(self.__arithmetic_to_bool(operand), + self.getConstantInt(1, 1)).result) return self.emitFatalError("unhandled UnaryOp.", node) @@ -4601,9 +4775,6 @@ def __process_binary_op(self, left, right, nodeType): MLIR. This method handles arithmetic operations between values. 
""" - left = self.ifPointerThenLoad(left) - right = self.ifPointerThenLoad(right) - # type promotion for anything except pow to match Python behavior if not issubclass(nodeType, ast.Pow): superiorTy = self.__get_superior_type(left.type, right.type) @@ -4777,36 +4948,45 @@ def visit_BinOp(self, node): MLIR. This method handles arithmetic operations between values. """ - # Get the left and right parts of this expression self.visit(node.left) left = self.popValue() self.visit(node.right) right = self.popValue() + # pushes to the value stack self.__process_binary_op(left, right, type(node.op)) def visit_AugAssign(self, node): """ Visit augment-assign operations (e.g. +=). """ - target = None - if isinstance(node.target, - ast.Name) and node.target.id in self.symbolTable: - self.debug_msg(lambda: f'[(Inline) Visit Name]', node.target) - target = self.symbolTable[node.target.id] - else: + self.pushPointerValue = True + self.visit(node.target) + self.pushPointerValue = False + target = self.popValue() + + if not cc.PointerType.isinstance(target.type): self.emitFatalError( "augment-assign target variable is not defined or " "cannot be assigned to.", node) self.visit(node.value) value = self.popValue() - loaded = cc.LoadOp(target).result - self.__process_binary_op(loaded, value, type(node.op)) + # NOTE: `aug-assign` is usually defined as producing a value, + # which we are not doing here. However, if this produces + # a value, then we need to start worrying that arbitrary + # expressions might contain assignments, which would require + # updates to the bridge in a bunch of places and add some + # complexity. We hence effectively disallow using + # any kind of assignment as expression. + self.valueStack.pushFrame() + self.__process_binary_op(loaded, value, type(node.op)) + self.valueStack.popFrame() res = self.popValue() + if res.type != loaded.type: self.emitFatalError( "augment-assign must not change the variable type", node) @@ -4820,44 +5000,38 @@ def visit_Name(self, node): if node.id in self.symbolTable: value = self.symbolTable[node.id] - if cc.PointerType.isinstance(value.type): - eleTy = cc.PointerType.getElementType(value.type) - if cc.ArrayType.isinstance(eleTy): - self.pushValue(value) - return - # Retain `ptr` - if IntegerType.isinstance(eleTy) and IntegerType( - eleTy).width == 8: - self.pushValue(value) - return - if cc.StdvecType.isinstance(eleTy): - self.pushValue(value) - return - if cc.StateType.isinstance(eleTy): - self.pushValue(value) - return - loaded = cc.LoadOp(value).result - self.pushValue(loaded) - elif cc.CallableType.isinstance( - value.type) and not BlockArgument.isinstance(value): + if (self.pushPointerValue or + not cc.PointerType.isinstance(value.type)): + self.pushValue(value) return - else: - self.pushValue(self.symbolTable[node.id]) + + eleTy = cc.PointerType.getElementType(value.type) + + # Retain state types as pointers + # (function arguments of `StateType` are passed as pointers) + if cc.StateType.isinstance(eleTy): + self.pushValue(value) + return + + loaded = cc.LoadOp(value).result + self.pushValue(loaded) return if node.id in self.capturedVars: # Only support a small subset of types here complexType = type(1j) value = self.capturedVars[node.id] + processed = False if isinstance(value, State): self.pushValue(self.capturedDataStorage.storeCudaqState(value)) - return + processed = True - if isinstance(value, (list, np.ndarray)) and isinstance( + elif isinstance(value, (list, np.ndarray)) and isinstance( value[0], (int, bool, float, np.int32, np.int64, 
                               np.float32, np.float64, complexType,
                               np.complex64, np.complex128)):
+                elementValues = None
                 if isinstance(value[0], bool):
                     elementValues = [self.getConstantInt(el, 1) for el in value]
@@ -4887,68 +5061,49 @@ def visit_Name(self, node):
                 # Save the copy of the captured list so we can compare
                 # it to the scope to detect changes on recompilation.
                 self.dependentCaptureVars[node.id] = value.copy()
-                mlirVal = self.__createStdvecWithKnownValues(
-                    len(value), elementValues)
+                mlirVal = self.__createStdvecWithKnownValues(elementValues)
+                # Adding the value to the symbol table acts as a cache to
+                # ensure we only ever create one copy of a captured value.
                 self.symbolTable.add(node.id, mlirVal, 0)
                 self.pushValue(mlirVal)
-                return
+                processed = True
 
-            mlirValCreator = None
-            self.dependentCaptureVars[node.id] = value
-            if isinstance(value, bool):
-                mlirValCreator = lambda: self.getConstantInt(value, 1)
-            elif isinstance(value, np.int32):
-                mlirValCreator = lambda: self.getConstantInt(value, width=32)
-            elif isinstance(value, (int, np.int64)):
-                mlirValCreator = lambda: self.getConstantInt(value)
-            elif isinstance(value, np.float32):
-                mlirValCreator = lambda: self.getConstantFloat(value, width=32)
-            elif isinstance(value, (float, np.float64)):
-                mlirValCreator = lambda: self.getConstantFloat(value)
-            elif isinstance(value, np.complex64):
-                mlirValCreator = lambda: self.getConstantComplex(value,
-                                                                 width=32)
-            elif isinstance(value, complexType) or isinstance(
-                    value, np.complex128):
-                mlirValCreator = lambda: self.getConstantComplex(value,
-                                                                 width=64)
-
-            if mlirValCreator != None:
-                with InsertionPoint.at_block_begin(self.entry):
-                    mlirVal = mlirValCreator()
-                    stackSlot = cc.AllocaOp(cc.PointerType.get(mlirVal.type),
-                                            TypeAttr.get(mlirVal.type)).result
-                    cc.StoreOp(mlirVal, stackSlot)
-                    # Store at the top-level
-                    self.symbolTable.add(node.id, stackSlot, 0)
-                    # to match the behavior as when we load them from the symbol table
-                    loaded = cc.LoadOp(stackSlot).result
-                    self.pushValue(loaded)
-                    return
+            else:
 
-            errorType = type(value).__name__
-            if (isinstance(value, list)):
-                errorType = f"{errorType}[{type(value[0]).__name__}]"
-
-            try:
-                if issubclass(value, cudaq_runtime.KrausChannel):
-                    # Here we have a KrausChannel as part of the AST. We want
-                    # to create a hash value from it, and we then want to push
-                    # the number of parameters and that hash value. This can
-                    # only be used with apply_noise.
-                    if not hasattr(value, 'num_parameters'):
-                        self.emitFatalError(
-                            'apply_noise kraus channels must have `num_parameters` constant class attribute specified.'
- ) + mlirVal = None + self.dependentCaptureVars[node.id] = value + if isinstance(value, bool): + mlirVal = self.getConstantInt(value, 1) + elif isinstance(value, np.int32): + mlirVal = self.getConstantInt(value, width=32) + elif isinstance(value, (int, np.int64)): + mlirVal = self.getConstantInt(value) + elif isinstance(value, np.float32): + mlirVal = self.getConstantFloat(value, width=32) + elif isinstance(value, (float, np.float64)): + mlirVal = self.getConstantFloat(value) + elif isinstance(value, np.complex64): + mlirVal = self.getConstantComplex(value, width=32) + elif isinstance(value, complexType) or isinstance( + value, np.complex128): + mlirVal = self.getConstantComplex(value, width=64) + + if mlirVal != None: + self.pushValue(mlirVal) + processed = True - self.pushValue(self.getConstantInt(value.num_parameters)) - self.pushValue(self.getConstantInt(hash(value))) - return - except TypeError: - pass + if processed: + if self.pushPointerValue: + self.emitFatalError( + "CUDA-Q does not allow assignments to variables captured from parent scope", + node) + return if (node.id not in globalKernelRegistry and node.id not in globalRegisteredOperations): + errorType = type(value).__name__ + if (isinstance(value, list)): + errorType = f"{errorType}[{type(value[0]).__name__}]" self.emitFatalError( f"Invalid type for variable ({node.id}) captured from parent scope (only int, bool, float, complex, cudaq.State, and list/np.ndarray[int|bool|float|complex] accepted, type was {errorType}).", node) @@ -4971,9 +5126,8 @@ def visit_Name(self, node): # Throw an exception for the case that the name is not # in the symbol table self.emitFatalError( - f"Invalid variable name requested - '{node.id}' is " - f"not defined within the quantum kernel it is used " - f"in.", node) + f"Invalid variable name requested - '{node.id}' is not defined within the scope it is used in.", + node) def compile_to_mlir(astModule, capturedDataStorage: CapturedDataStorage, @@ -5021,21 +5175,23 @@ def compile_to_mlir(astModule, capturedDataStorage: CapturedDataStorage, # Visit dependent kernels recursively to # ensure we have all necessary kernels added to the # module - transitiveDeps = depKernels + transitiveDeps = {**depKernels} while len(transitiveDeps): # For each found dependency, see if that kernel # has further dependencies - for depKernelName, depKernelAst in transitiveDeps.items(): + newDeps = {} + for _, depKernelAst in transitiveDeps.items(): localVis = FindDepKernelsVisitor(bridge.ctx) localVis.visit(depKernelAst[0]) # Append the found dependencies to our running tally - depKernels = {**depKernels, **localVis.depKernels} - # Reset for the next go around - transitiveDeps = localVis.depKernels + for k in localVis.depKernels: + if not k in depKernels: + v = localVis.depKernels[k] + depKernels[k] = v + newDeps[k] = v # Update the call graph - callGraph[localVis.kernelName] = { - k for k, v in localVis.depKernels.items() - } + callGraph[localVis.kernelName] = {k for k in localVis.depKernels} + transitiveDeps = newDeps # Sort the call graph topologically callGraphSorter = graphlib.TopologicalSorter(callGraph) diff --git a/python/cudaq/kernel/utils.py b/python/cudaq/kernel/utils.py index 879dd60aae6..91eac0f0431 100644 --- a/python/cudaq/kernel/utils.py +++ b/python/cudaq/kernel/utils.py @@ -11,7 +11,7 @@ import sys import traceback import numpy as np -from typing import get_origin, Callable, List +from typing import get_origin, get_args, Callable, List import types import weakref @@ -120,6 +120,8 @@ def isQuantumType(ty): 
numQuantumMembers = sum((isQuantumType(t) for t in mlirEleTypes)) if numQuantumMembers == 0: + if any((cc.PointerType.isinstance(t) for t in mlirEleTypes)): + return None return cc.StructType.getNamed(name, mlirEleTypes, context=context) if numQuantumMembers != len(mlirEleTypes) or \ any((quake.StruqType.isinstance(t) for t in mlirEleTypes)): @@ -185,18 +187,24 @@ def emitFatalErrorOverride(msg): f"Callable type must have signature specified ({ast.unparse(annotation) if hasattr(ast, 'unparse') else annotation})." ) - if hasattr(annotation.slice, 'elts'): - firstElement = annotation.slice.elts[0] + if hasattr(annotation.slice, 'elts') and len( + annotation.slice.elts) == 2: + args = annotation.slice.elts[0] + ret = annotation.slice.elts[1] elif hasattr(annotation.slice, 'value') and hasattr( - annotation.slice.value, 'elts'): - firstElement = annotation.slice.value.elts[0] + annotation.slice.value, 'elts') and len( + annotation.slice.value.elts) == 2: + args = annotation.slice.value.elts[0] + ret = annotation.slice.value.elts[1] else: localEmitFatalError( f"Unable to get list elements when inferring type from annotation ({ast.unparse(annotation) if hasattr(ast, 'unparse') else annotation})." ) - argTypes = [ - mlirTypeFromAnnotation(a, ctx) for a in firstElement.elts - ] + argTypes = [mlirTypeFromAnnotation(a, ctx) for a in args.elts] + if not isinstance(ret, ast.Constant) or ret.value: + localEmitFatalError( + "passing kernels as arguments that return a value is not currently supported" + ) return cc.CallableType.get(argTypes) if isinstance(annotation, @@ -368,12 +376,11 @@ def mlirTypeFromPyType(argType, ctx, **kwargs): return cc.PointerType.get(cc.StateType.get(ctx), ctx) if get_origin(argType) == list: - result = re.search(r'ist\[(.*)\]', str(argType)) - eleTyName = result.group(1) + pyEleTy = get_args(argType) + if len(pyEleTy) == 1: + eleTy = mlirTypeFromPyType(pyEleTy[0], ctx) + return cc.StdvecType.get(eleTy, ctx) argType = list - inst = pyInstanceFromName(eleTyName) - if (inst != None): - kwargs['argInstance'] = [inst] if argType in [list, np.ndarray, List]: if 'argInstance' not in kwargs: @@ -406,18 +413,9 @@ def mlirTypeFromPyType(argType, ctx, **kwargs): ctx) if get_origin(argType) == tuple: - result = re.search(r'uple\[(?P.*)\]', str(argType)) - eleTyNames = result.group('names') eleTypes = [] - while eleTyNames != None: - result = re.search(r'(?P.*),\s*(?P.*)', eleTyNames) - eleTyName = result.group('name') if result != None else eleTyNames - eleTyNames = result.group('names') if result != None else None - pyInstance = pyInstanceFromName(eleTyName) - if pyInstance == None: - emitFatalError(f'Invalid tuple element type ({eleTyName})') - eleTypes.append(mlirTypeFromPyType(type(pyInstance), ctx)) - eleTypes.reverse() + for pyEleTy in get_args(argType): + eleTypes.append(mlirTypeFromPyType(pyEleTy, ctx)) tupleTy = mlirTryCreateStructType(eleTypes, context=ctx) if tupleTy is None: emitFatalError( diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index 6374a53acdb..8c6456ad3fe 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -202,14 +202,29 @@ ExecutionEngine *jitKernel(const std::string &name, MlirModule module, pm.enableIRPrinting(); } + std::string error_msg; + mlir::DiagnosticEngine &engine = context->getDiagEngine(); + auto handlerId = engine.registerHandler( + [&error_msg](mlir::Diagnostic &diag) -> mlir::LogicalResult { 
+        if (diag.getSeverity() == mlir::DiagnosticSeverity::Error) {
+          error_msg += diag.str();
+          return mlir::success();
+        }
+        return mlir::failure();
+      });
+
   DefaultTimingManager tm;
   tm.setEnabled(cudaq::isTimingTagEnabled(cudaq::TIMING_JIT_PASSES));
   auto timingScope = tm.getRootScope(); // starts the timer
   pm.enableTiming(timingScope);         // do this right before pm.run
-  if (failed(pm.run(cloned)))
+
+  if (failed(pm.run(cloned))) {
+    engine.eraseHandler(handlerId);
     throw std::runtime_error(
-        "cudaq::builder failed to JIT compile the Quake representation.");
+        "failed to JIT compile the Quake representation\n" + error_msg);
+  }
   timingScope.stop();
+  engine.eraseHandler(handlerId);
 
   // The "fast" instruction selection compilation algorithm is actually very
   // slow for large quantum circuits. Disable that here. Revisit this
@@ -317,6 +332,12 @@ jitAndCreateArgs(const std::string &name, MlirModule module,
           .Case([&](cudaq::cc::StructType ty) {
             auto funcOp = getKernelFuncOp(module, name);
             auto [size, offsets] = getTargetLayout(funcOp, ty);
+            auto memberTys = ty.getMembers();
+            for (auto mTy : memberTys) {
+              if (auto vecTy = dyn_cast<cudaq::cc::StdvecType>(mTy))
+                throw std::runtime_error("return values with dynamically sized "
+                                         "element types are not yet supported");
+            }
             auto ourAllocatedArg = std::malloc(size);
             runtimeArgs.emplace_back(ourAllocatedArg,
                                      [](void *ptr) { std::free(ptr); });
@@ -897,18 +918,34 @@ MlirModule synthesizeKernel(const std::string &name, MlirModule module,
   pm.addNestedPass(cudaq::opt::createLoopUnroll());
   pm.addNestedPass(createCanonicalizerPass());
   pm.addPass(createSymbolDCEPass());
-  DefaultTimingManager tm;
-  tm.setEnabled(cudaq::isTimingTagEnabled(cudaq::TIMING_JIT_PASSES));
-  auto timingScope = tm.getRootScope(); // starts the timer
-  pm.enableTiming(timingScope);         // do this right before pm.run
   if (disableMLIRthreading || enablePrintMLIREachPass)
     context->disableMultithreading();
   if (enablePrintMLIREachPass)
     pm.enableIRPrinting();
-  if (failed(pm.run(cloned)))
+
+  std::string error_msg;
+  mlir::DiagnosticEngine &engine = context->getDiagEngine();
+  auto handlerId = engine.registerHandler(
+      [&error_msg](mlir::Diagnostic &diag) -> mlir::LogicalResult {
+        if (diag.getSeverity() == mlir::DiagnosticSeverity::Error) {
+          error_msg += diag.str();
+          return mlir::success();
+        }
+        return mlir::failure();
+      });
+
+  DefaultTimingManager tm;
+  tm.setEnabled(cudaq::isTimingTagEnabled(cudaq::TIMING_JIT_PASSES));
+  auto timingScope = tm.getRootScope(); // starts the timer
+  pm.enableTiming(timingScope);         // do this right before pm.run
+
+  if (failed(pm.run(cloned))) {
+    engine.eraseHandler(handlerId);
     throw std::runtime_error(
-        "cudaq::builder failed to JIT compile the Quake representation.");
+        "failed to JIT compile the Quake representation\n" + error_msg);
+  }
   timingScope.stop();
+  engine.eraseHandler(handlerId);
   std::free(rawArgs);
   return wrap(cloned);
 }
diff --git a/python/tests/custom/test_custom_operations.py b/python/tests/custom/test_custom_operations.py
index 454196f1fd6..bd23bbe553c 100644
--- a/python/tests/custom/test_custom_operations.py
+++ b/python/tests/custom/test_custom_operations.py
@@ -226,8 +226,7 @@ def bell():
 
     with pytest.raises(RuntimeError) as error:
         bell.compile()
-    assert 'controlled operation requested without any control argument(s)' in repr(
-        error)
+    assert 'missing value' in repr(error)
 
 
 def test_bug_2452():
@@ -265,8 +264,7 @@ def kernel3():
 
     with pytest.raises(RuntimeError) as error:
         cudaq.sample(kernel3)
-    assert 'invalid number of arguments (1) passed to custom_cz
(requires 2 arguments)' in repr( - error) + assert 'missing value' in repr(error) # leave for gdb debugging diff --git a/python/tests/kernel/test_assignments.py b/python/tests/kernel/test_assignments.py new file mode 100644 index 00000000000..adb9ba7e86d --- /dev/null +++ b/python/tests/kernel/test_assignments.py @@ -0,0 +1,1700 @@ +# ============================================================================ # +# Copyright (c) 2025 NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # + +import os, pytest +import cudaq +from dataclasses import dataclass +from typing import Callable + + +@pytest.fixture(autouse=True) +def do_something(): + yield + cudaq.__clearKernelRegistries() + + +def test_list_update(): + + @cudaq.kernel + def sum(l: list[int]) -> int: + total = 0 + for item in l: + total += item + return total + + @cudaq.kernel + def to_integer(ms: list[bool]) -> int: + res = 0 + for idx, v in enumerate(ms): + res = res | (v << idx) + return res + + @cudaq.kernel + def test1(arg: list[int]) -> tuple[int, int]: + qs = cudaq.qvector(len(arg) + 1) + for i in arg: + i += 1 + x(qs[i]) + return sum(arg), to_integer(mz(qs)) + + results = cudaq.run(test1, [0, 1, 2], shots_count=1) + # to_integer(0111) = 2 + 4 + 8 = 14 + assert len(results) == 1 and results[0] == (3, 14) + + @cudaq.kernel + def double_entries(arg: list[int]): + for i, v in enumerate(arg): + arg[i] = 2 * v + + @cudaq.kernel + def test2(arg: list[int]) -> int: + double_entries(arg) + return sum(arg) + + arg = [4, 5, 6] + results = cudaq.run(test2, arg, shots_count=1) + assert len(results) == 1 and results[0] == 30 # 2 * (4 + 5 + 6) = 30 + # TODO: we generally create a copy when passing values + # from host to kernel (with the exception of State). + # Changes hence won't currently be reflected in the + # host code. 
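+    # For example, even though `double_entries` doubled every entry
+    # of the device-side copy above, the host-side `arg` is unchanged: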
+    assert arg == [4, 5, 6]
+
+    @cudaq.kernel
+    def test3(arg: list[int]) -> tuple[int, int]:
+        alias = arg
+        double_entries(alias)
+        return sum(alias), sum(arg)
+
+    results = cudaq.run(test3, [0, 1, 2], shots_count=1)
+    assert len(results) == 1 and results[0] == (6, 6)
+
+    @cudaq.kernel
+    def test4(arg: list[int]) -> tuple[int, int]:
+        alias = arg
+        double_entries(arg)
+        return sum(alias), sum(arg)
+
+    results = cudaq.run(test4, [0, 1, 2], shots_count=1)
+    assert len(results) == 1 and results[0] == (6, 6)
+
+    @cudaq.kernel
+    def modify_and_return(arg: list[int]) -> list[int]:
+        for i, v in enumerate(arg):
+            arg[i] = v * v
+        return arg.copy()
+
+    @cudaq.kernel
+    def test5(arg: list[int]) -> tuple[int, int]:
+        alias = modify_and_return(arg)
+        alias[0] = 5
+        return sum(alias), sum(arg)
+
+    results = cudaq.run(test5, [0, 1, 2], shots_count=1)
+    assert len(results) == 1 and results[0] == (10, 5)
+
+    @cudaq.kernel
+    def get_list() -> list[int]:
+        return [0, 1, 2]
+
+    assert get_list() == [0, 1, 2]
+
+    @cudaq.kernel
+    def test6() -> tuple[int, int]:
+        local = get_list()
+        alias = modify_and_return(local)
+        alias[0] = 5
+        return sum(alias), sum(local)
+
+    results = cudaq.run(test6, shots_count=1)
+    assert len(results) == 1 and results[0] == (10, 5)
+
+    @dataclass(slots=True)
+    class MyTuple:
+        l1: list[int]
+        l2: list[int]
+
+    @cudaq.kernel
+    def get_MyTuple(arg: list[int]) -> MyTuple:
+        return MyTuple(arg.copy(), [1, 1])
+
+    @cudaq.kernel
+    def test7() -> tuple[int, int, int]:
+        arg = [2, 2]
+        t = get_MyTuple(arg)
+        arg[0] = 3
+        return sum(arg), sum(t.l1), sum(t.l2)
+
+    results = cudaq.run(test7, shots_count=1)
+    assert len(results) == 1 and results[0] == (5, 4, 2)
+
+    @cudaq.kernel
+    def test8() -> tuple[int, int, int]:
+        arg = [2, 2]
+        t = get_MyTuple(arg)
+        t.l1[0] = 4
+        t.l2[1] = 2
+        return sum(arg), sum(t.l1), sum(t.l2)
+
+    results = cudaq.run(test8, shots_count=1)
+    assert len(results) == 1 and results[0] == (4, 6, 3)
+
+    @cudaq.kernel
+    def create_list_list_int(val: int, size: tuple[int,
+                                                   int]) -> list[list[int]]:
+        inner_list = [val for _ in range(size[1])]
+        return [inner_list.copy() for _ in range(size[0])]
+
+    @cudaq.kernel
+    def test9() -> int:
+        ls = create_list_list_int(1, (3, 4))
+        tot = 0
+        ls[1] = [5]
+        ls[2][3] = 2
+        inner = ls[2]
+        inner[1] = 2
+        for l in ls:
+            tot += sum(l)
+        return tot
+
+    assert test9() == 15
+
+
+def test_list_update_failures():
+
+    @dataclass(slots=True)
+    class MyTuple:
+        l1: list[int]
+        l2: list[int]
+
+    @cudaq.kernel
+    def kernel1(l1: list[int]) -> MyTuple:
+        return MyTuple(l1, [1, 1])
+
+    with pytest.raises(RuntimeError) as e:
+        cudaq.run(kernel1, [1, 2])
+    assert 'lists passed as or contained in function arguments cannot be inner items in other container values' in str(
+        e.value)
+    assert '(offending source -> MyTuple(l1, [1, 1]))' in str(e.value)
+
+    @cudaq.kernel
+    def get_MyTuple(l1: list[int]) -> MyTuple:
+        return MyTuple(l1.copy(), [1, 1])
+
+    with pytest.raises(RuntimeError) as e:
+        get_MyTuple([0, 0])
+    assert 'return values with dynamically sized element types are not yet supported' in str(
+        e.value)
+
+    with pytest.raises(RuntimeError) as e:
+        cudaq.run(get_MyTuple, [0, 0])
+    assert 'return values with dynamically sized element types are not yet supported' in str(
+        e.value)
+
+    @cudaq.kernel
+    def 
sum(l: list[int]) -> int:
+        total = 0
+        for item in l:
+            total += item
+        return total
+
+    @cudaq.kernel
+    def modify_and_return(arg: list[int]) -> list[int]:
+        for i, v in enumerate(arg):
+            arg[i] = v * v
+        # If we allowed this, then the correct output of
+        # kernel2 below would be 10, 10
+        return arg
+
+    @cudaq.kernel
+    def call_modifier(mod: Callable[[list[int]], list[int]],
+                      arg: list[int]) -> list[int]:
+        return mod(arg)
+
+    with pytest.raises(RuntimeError) as e:
+        print(call_modifier)
+    assert 'passing kernels as arguments that return a value is not currently supported' in str(
+        e.value)
+
+    @cudaq.kernel
+    def call_multiply(arg: list[int]) -> list[int]:
+        return modify_and_return(arg)
+
+    @cudaq.kernel
+    def kernel2(arg: list[int]) -> tuple[int, int]:
+        alias = call_multiply(arg)
+        alias[0] = 5
+        return sum(alias), sum(arg)
+
+    with pytest.raises(RuntimeError) as e:
+        kernel2([0, 1, 2])
+    assert 'return value must not contain a list that is a function argument or an item in a function argument' in str(
+        e.value)
+    assert '(offending source -> return arg)' in str(e.value)
+
+
+def test_dataclass_update():
+
+    @dataclass(slots=True)
+    class MyTuple:
+        angle: float
+        idx: int
+
+    @cudaq.kernel
+    def update_tuple1(arg: MyTuple) -> MyTuple:
+        t = arg.copy()
+        t.angle = 5.
+        return arg
+
+    @cudaq.kernel
+    def update1() -> MyTuple:
+        t = MyTuple(0., 0)
+        return update_tuple1(t)
+
+    out = cudaq.run(update1, shots_count=1)
+    assert len(out) == 1 and out[0] == MyTuple(0., 0)
+    print("result update1: " + str(out[0]))
+
+    @cudaq.kernel
+    def update_tuple2(arg: MyTuple) -> MyTuple:
+        t = arg.copy()
+        t.angle = 5.
+        return t
+
+    @cudaq.kernel
+    def update2() -> MyTuple:
+        return update_tuple2(MyTuple(0., 0))
+
+    out = cudaq.run(update2, shots_count=1)
+    assert len(out) == 1 and out[0] == MyTuple(5., 0)
+    print("result update2: " + str(out[0]))
+
+    @cudaq.kernel
+    def update3(arg: MyTuple) -> MyTuple:
+        t = arg.copy()
+        t.angle += 5.
+        return t
+
+    arg = MyTuple(1, 1)
+    out = cudaq.run(update3, arg, shots_count=1)
+    assert len(out) == 1 and out[0] == MyTuple(6., 1)
+    assert arg == MyTuple(1, 1)
+    print("result update3: " + str(out[0]))
+
+    @cudaq.kernel
+    def serialize(t1: MyTuple, t2: MyTuple, t3: MyTuple) -> list[float]:
+        return [t1.angle, t1.idx, t2.angle, t2.idx, t3.angle, t3.idx]
+
+    @cudaq.kernel
+    def update4() -> list[float]:
+        t1 = MyTuple(1, 1)
+        t2 = t1
+        t3 = MyTuple(2, 2)
+        t1 = t3
+        t3.angle = 5
+        return serialize(t1, t2, t3)
+
+    assert update4() == [5.0, 2.0, 1.0, 1.0, 5.0, 2.0]
+
+    @cudaq.kernel
+    def update5(cond: bool) -> list[float]:
+        t1 = MyTuple(1, 1)
+        t2 = t1
+        if cond:
+            t1.angle = 5
+        return [t1.angle, t1.idx, t2.angle, t2.idx]
+
+    assert update5(True) == [5.0, 1.0, 5.0, 1.0]
+    assert update5(False) == [1.0, 1.0, 1.0, 1.0]
+
+
+def test_dataclass_update_failures():
+
+    @dataclass(slots=True)
+    class MyQTuple:
+        controls: cudaq.qview
+        target: cudaq.qubit
+
+    # We do not currently allow any kind of updates to
+    # quantum structs.
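+    # test1 below checks that a direct attribute assignment is
+    # rejected; test2 checks that `.copy()` itself is not supported
+    # for quantum structs.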
+ @cudaq.kernel + def test1(t: MyQTuple, controls: cudaq.qview): + t.controls = controls + + with pytest.raises(RuntimeError) as e: + print(test1) + assert 'accessing attribute of quantum tuple or dataclass does not produce a modifiable value' in str( + e.value) + assert '(offending source -> t.controls)' in str(e.value) + + @cudaq.kernel + def test2(arg: MyQTuple, controls: cudaq.qview): + t = arg.copy() + t.controls = controls + + with pytest.raises(RuntimeError) as e: + print(test2) + assert 'copy is not supported' in str(e.value) + assert '(offending source -> arg.copy())' in str(e.value) + + @dataclass(slots=True) + class MyTuple: + angle: float + idx: int + + @cudaq.kernel + def update_tuple1(t: MyTuple): + t.angle = 5. + + @cudaq.kernel + def test3() -> MyTuple: + t = MyTuple(0., 0) + update_tuple1(t) + return t + + with pytest.raises(RuntimeError) as e: + print(test3) + assert 'value cannot be modified - use `.copy(deep)` to create a new value that can be modified' in str( + e.value) + assert '(offending source -> t.angle)' in str(e.value) + + @cudaq.kernel + def update_tuple2(t: MyTuple): + t.angle += 5. + + @cudaq.kernel + def test4() -> MyTuple: + t = MyTuple(0., 0) + update_tuple2(t) + return t + + with pytest.raises(RuntimeError) as e: + print(test4) + assert 'value cannot be modified - use `.copy(deep)` to create a new value that can be modified' in str( + e.value) + assert '(offending source -> t.angle)' in str(e.value) + + @cudaq.kernel + def update_tuple3(arg: MyTuple): + t = arg + t.angle = 5. + + @cudaq.kernel + def test5() -> MyTuple: + t = MyTuple(0., 0) + update_tuple3(t) + return t + + with pytest.raises(RuntimeError) as e: + print(test5()) + assert 'cannot assign dataclass passed as function argument to a local variable' in str( + e.value) + assert 'use `.copy(deep)` to create a new value that can be assigned' in str( + e.value) + assert '(offending source -> t = arg)' in str(e.value) + + @dataclass(slots=True) + class NumberedMyTuple: + val: MyTuple + num: int + + @cudaq.kernel + def test6() -> NumberedMyTuple: + t = MyTuple(0.5, 1) + return NumberedMyTuple(t, 0) + + with pytest.raises(RuntimeError) as e: + test6() + assert 'only dataclass literals may be used as items in other container values' in str( + e.value) + assert 'use `.copy(deep)` to create a new MyTuple' in str(e.value) + + @cudaq.kernel + def test7(cond: bool) -> tuple[MyTuple, MyTuple]: + t1 = MyTuple(1, 1) + t2 = t1 + if cond: + t3 = MyTuple(2, 2) + t1 = t3 + t3.angle = 5 + return (t1, t2) + + with pytest.raises(RuntimeError) as e: + test7(True) + assert 'only literals can be assigned to variables defined in parent scope' in str( + e.value) + assert '(offending source -> t1 = t3)' in str(e.value) + + @cudaq.kernel + def test8(cond: bool) -> MyTuple: + t1 = [MyTuple(1, 1)] + if cond: + t3 = MyTuple(2, 2) + t1[0] = t3 + t3.angle = 5 + return t1 + + with pytest.raises(RuntimeError) as e: + test8(True) + assert 'only dataclass literals may be used as items in other container values' in str( + e.value) + assert 'use `.copy(deep)` to create a new MyTuple' in str(e.value) + assert '(offending source -> t1[0] = t3)' in str(e.value) + + +def test_list_of_tuple_updates(): + + @cudaq.kernel + def fill_back(l: list[tuple[int, int]], t: tuple[int, int], n: int): + for idx in range(len(l) - n, len(l)): + l[idx] = t + + @cudaq.kernel + def test10() -> list[int]: + l = [(1, 1) for _ in range(3)] + fill_back(l, (2, 2), 2) + res = [0 for _ in range(6)] + for i in range(3): + res[2 * i] = l[i][0] + res[2 * i + 1] = 
l[i][1] + return res + + assert test10() == [1, 1, 2, 2, 2, 2] + + @cudaq.kernel + def get_list_of_int_tuple(t: tuple[int, int], + size: int) -> list[tuple[int, int]]: + l = [t for _ in range(size + 1)] + l[0] = (3, 3) + return l + + @cudaq.kernel + def test11() -> list[int]: + t = (1, 2) + l = get_list_of_int_tuple(t, 2) + l[1] = (4, 4) + res = [0 for _ in range(6)] + for idx in range(3): + res[2 * idx] = l[idx][0] + res[2 * idx + 1] = l[idx][1] + return res + + assert test11() == [3, 3, 4, 4, 1, 2] + + @cudaq.kernel + def get_list_of_int_tuple2(arg: tuple[int, int], + size: int) -> list[tuple[int, int]]: + t = arg.copy() + l = [t for _ in range(size + 1)] + l[0] = (3, 3) + return l + + @cudaq.kernel + def test12() -> list[int]: + t = (1, 2) + l = get_list_of_int_tuple2(t, 2) + l[1] = (4, 4) + res = [0 for _ in range(6)] + for idx in range(3): + res[2 * idx] = l[idx][0] + res[2 * idx + 1] = l[idx][1] + return res + + assert test12() == [3, 3, 4, 4, 1, 2] + + @cudaq.kernel + def modify_first_item(ls: list[tuple[list[int], list[int]]], idx: int, + val: int): + ls[0][0][idx] = val + + @cudaq.kernel + def test13() -> list[int]: + l1 = [0, 0] + tlist = [(l1, l1)] + modify_first_item(tlist, 0, 2) + l1[1] = 3 + t = tlist[0] + return [t[0][0], t[0][1], t[1][0], t[1][1], l1[0], l1[1]] + + assert test13() == [2, 3, 2, 3, 2, 3] + + @dataclass(slots=True) + class NumberedTuple: + idx: int + vals: tuple[int, list[int]] + + @cudaq.kernel + def test7() -> list[int]: + l = [1] + t = NumberedTuple(0, (0, [0])) + t.vals = (1, l) + t.vals[1][0] = 2 + return [t.idx, t.vals[0], t.vals[1][0], l[0]] + + assert test7() == [0, 1, 2, 2] + + +def test_list_of_tuple_update_failures(): + + @cudaq.kernel + def get_list_of_int_tuple(t: tuple[int, int], + size: int) -> list[tuple[int, int]]: + l = [t for _ in range(size + 1)] + l[0] = (3, 3) + return l + + with pytest.raises(RuntimeError) as e: + get_list_of_int_tuple((1, 2), 2) + assert 'Expected a complex, floating, or integral type' in str(e.value) + + @cudaq.kernel + def test2() -> list[int]: + t = (1, 2) + l = get_list_of_int_tuple(t, 2) + l[1][0] = 4 + res = [0 for _ in range(6)] + for idx in range(3): + res[2 * idx] = l[idx][0] + res[2 * idx + 1] = l[idx][1] + return res + + with pytest.raises(RuntimeError) as e: + print(test2) + assert 'tuple value cannot be modified' in str(e.value) + + @cudaq.kernel + def assign_and_return_list_tuple( + value: tuple[list[int], list[int]]) -> tuple[list[int], list[int]]: + local = ([1], [1]) + local = value + return local + + @cudaq.kernel + def test3() -> list[int]: + l1 = [1] + t1 = (l1, l1) + t2 = assign_and_return_list_tuple(t1) + l1[0] = 2 + return [l1[0], t1[0][0], t1[1][0], t2[0][0], t2[1][0]] + + with pytest.raises(RuntimeError) as e: + test3() # should output [2,2,2,2,2] + assert 'cannot assign tuple or dataclass passed as function argument to a local variable if it contains a list' in str( + e.value) + + @cudaq.kernel + def get_item(ls: list[tuple[list[int], list[int]]], + idx: int) -> tuple[list[int], list[int]]: + return ls[idx] + + @cudaq.kernel + def test4() -> list[int]: + l1 = [0, 0] + tlist = [(l1, l1)] + t = get_item(tlist, 0) + l1[1] = 3 + # If we allowed the return in modify_and_return_item, + # the correct output would be [0, 3, 0, 3, 0, 3] + return [t[0][0], t[0][1], t[1][0], t[1][1], l1[0], l1[1]] + + with pytest.raises(RuntimeError) as e: + test4() + assert 'return value must not contain a list that is a function argument or an item in a function argument' in str( + e.value) + assert '(offending source 
-> return ls[idx])' in str(e.value) + + @cudaq.kernel + def test5(): + l = [(0, 1) for _ in range(3)] + l[0][1] = 2 + + with pytest.raises(RuntimeError) as e: + test5() + assert 'tuple value cannot be modified' in str(e.value) + assert '(offending source -> l[0][1])' in str(e.value) + + @cudaq.kernel + def test6(): + l = [(0, [(1, 1)]) for _ in range(3)] + l[-1][1][0] = (2, 2) + l[2][1][0][0] = 3 + + with pytest.raises(RuntimeError) as e: + test6() + assert 'tuple value cannot be modified' in str(e.value) + assert '(offending source -> l[2][1][0][0])' in str(e.value) + + @dataclass(slots=True) + class NumberedTuple: + idx: int + vals: tuple[int, list[int]] + + @cudaq.kernel + def test7(): + t = NumberedTuple(0, (0, [0])) + t.vals = (1, [1]) + t.vals[1] = [2] + + with pytest.raises(RuntimeError) as e: + test7() + assert 'tuple value cannot be modified' in str(e.value) + assert '(offending source -> t.vals[1])' in str(e.value) + + +def test_list_of_dataclass_updates(): + + @dataclass(slots=True) + class MyTuple: + l1: list[int] + l2: list[int] + + @cudaq.kernel + def serialize(tlist: list[MyTuple]) -> list[int]: + tot_size = 2 * len(tlist) + for t in tlist: + tot_size += len(t.l1) + len(t.l2) + res = [0 for _ in range(tot_size)] + idx = 0 + for t in tlist: + res[idx] = len(t.l1) + idx += 1 + for i, v in enumerate(t.l1): + res[idx + i] = v + idx += len(t.l1) + res[idx] = len(t.l2) + idx += 1 + for i, v in enumerate(t.l2): + res[idx + i] = v + idx += len(t.l2) + return res + + @cudaq.kernel + def populate_MyTuple_list(t: MyTuple, size: int) -> list[MyTuple]: + return [t.copy(deep=True) for _ in range(size)] + + @cudaq.kernel + def test1() -> list[int]: + l = populate_MyTuple_list(MyTuple([1], [1]), 2) + return serialize(l) + + assert test1() == [1, 1, 1, 1, 1, 1, 1, 1] + + @cudaq.kernel + def test2() -> list[int]: + l = populate_MyTuple_list(MyTuple([1, 1], [1, 1]), 2) + l[0].l1 = [2] + return serialize(l) + + assert test2() == [1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 1] + + @cudaq.kernel + def test3() -> list[int]: + l = populate_MyTuple_list(MyTuple([1, 1], [1, 1]), 2) + l[1].l2[0] = 3 + return serialize(l) + + assert test3() == [2, 1, 1, 2, 1, 1, 2, 1, 1, 2, 3, 1] + + @cudaq.kernel + def flatten(ls: list[list[int]]) -> list[int]: + size = 0 + for l in ls: + size += len(l) + res = [0 for _ in range(size)] + idx = 0 + for l in ls: + for i in l: + res[idx] = i + idx += 1 + return res + + @cudaq.kernel + def test4() -> list[int]: + l1 = [1, 1] + t = MyTuple(l1, l1) + l3 = [2, 2] + t.l1 = l3 + l3[0] = 5 + return flatten([t.l1, t.l2, l1, l3]) + + assert test4() == [5, 2, 1, 1, 1, 1, 5, 2] + + @cudaq.kernel + def test5(cond: bool) -> list[int]: + l1 = [1, 1] + t = MyTuple(l1, l1) + if cond: + t.l1 = [2, 2] + t.l1[0] = 5 + return flatten([t.l1, t.l2, l1]) + + assert test5(True) == [5, 2, 1, 1, 1, 1] + assert test5(False) == [5, 1, 5, 1, 5, 1] + + @cudaq.kernel + def update_list(old: list[int], new: list[int]): + old = new + + @cudaq.kernel + def test6(cond: bool) -> list[int]: + l1 = [1, 1] + t = MyTuple(l1, l1) + if cond: + update_list(t.l1, [2, 2]) + t.l1[0] = 5 + return flatten([t.l1, t.l2, l1]) + + assert test6(True) == [5, 1, 5, 1, 5, 1] + assert test6(False) == [5, 1, 5, 1, 5, 1] + + @cudaq.kernel + def update_list2(old: list[int], new: list[int]): + for idx, v in enumerate(new): + old[idx] = v + + @cudaq.kernel + def test7(cond: bool) -> list[int]: + l1 = [1, 1] + t = MyTuple(l1, l1) + if cond: + update_list2(t.l1, [2, 2]) + t.l1[0] = 5 + return flatten([t.l1, t.l2, l1]) + + assert test7(True) == [5, 
2, 5, 2, 5, 2] + assert test7(False) == [5, 1, 5, 1, 5, 1] + + @cudaq.kernel + def modify_MyTuple(ls: list[MyTuple], idx: int, val: list[int]): + ls[idx].l1 = val.copy() + ls[idx].l2 = val + + @cudaq.kernel + def test8() -> list[int]: + default = [0] + vals = [1, 1] + tlist = [MyTuple(default, default)] + modify_MyTuple(tlist, 0, vals) + tlist[0].l1[0] = 2 + return flatten([default, vals, tlist[0].l1, tlist[0].l2]) + + assert test8() == [0, 1, 1, 2, 1, 1, 1] + + @cudaq.kernel + def test9() -> list[int]: + default = [0] + vals = [1, 1] + tlist = [MyTuple(default, default)] + modify_MyTuple(tlist, 0, vals) + vals[0] = 2 + return flatten([default, vals, tlist[0].l1, tlist[0].l2]) + + assert test9() == [0, 2, 1, 1, 1, 2, 1] + + @cudaq.kernel + def test10() -> list[int]: + default = [0] + vals = [1, 1] + tlist = [MyTuple(default, default)] + modify_MyTuple(tlist, 0, vals) + tlist[0].l2[0] = 3 + return flatten([default, vals, tlist[0].l1, tlist[0].l2]) + + assert test10() == [0, 3, 1, 1, 1, 3, 1] + + +def test_list_of_dataclass_update_failures(): + + @dataclass(slots=True) + class MyTuple: + l1: list[int] + l2: list[int] + + @cudaq.kernel + def get_MyTuple_list(t: MyTuple) -> list[MyTuple]: + return [t] + + with pytest.raises(RuntimeError) as e: + print(get_MyTuple_list) + assert 'only dataclass literals may be used as items in other container values' in str( + e.value) + assert 'use `.copy(deep)` to create a new MyTuple' in str(e.value) + + @cudaq.kernel + def populate_MyTuple_list(t: MyTuple, size: int) -> list[MyTuple]: + # If we allowed this, then the following scenario would lead to + # incorrect behavior due to the copy of inner lists during return: + # Caller allocates l1, creates MyTuple using l1 as its first item, + # calls `populate_MyTuple_list`, modifies an item in l1. + # In this case, the correct behavior would be that the change to l1 + # is reflected in the list returned by `populate_MyTuple_list`. + return [MyTuple(t.l1, t.l2) for _ in range(size)] + + with pytest.raises(RuntimeError) as e: + print(populate_MyTuple_list) + assert 'lists passed as or contained in function arguments cannot be inner items in other container values' in str( + e.value) + assert 'use `.copy(deep)` to create a new list' in str(e.value) + + @cudaq.kernel + def get_MyTuple_list(size: int) -> list[MyTuple]: + return [MyTuple([1], [1]) for _ in range(size)] + + with pytest.raises(RuntimeError) as e: + print(get_MyTuple_list(2)) + assert 'Expected a complex, floating, or integral type' in str(e.value) + + @cudaq.kernel + def test1(t: MyTuple, size: int) -> list[int]: + l = [t.copy(deep=True) for _ in range(size)] + res = [0 for _ in range(4 * len(l))] + for idx, item in enumerate(l): + res[4 * idx] = len(item.l1) + res[4 * idx + 1] = item.l1[0] + res[4 * idx + 2] = len(item.l2) + res[4 * idx + 3] = item.l2[0] + return res + + # TODO: support. + # The argument conversion from host to device is not correct currently. + with pytest.raises(RuntimeError) as e: + test1(MyTuple([1], [1]), 2) + assert 'dynamically sized element types for function arguments are not yet supported' in str( + e.value) + + @cudaq.kernel + def populate_MyTuple_list2(t: MyTuple, size: int) -> list[MyTuple]: + return [t.copy(deep=True) for _ in range(size)] + + @cudaq.kernel + def test2() -> MyTuple: + l = populate_MyTuple_list2(MyTuple([1, 1], [1, 1]), 2) + l[0].l1 = [2] + return l[0] + + # TODO: support. 
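+    # Returning `l[0]` requires reifying a struct whose members are
+    # runtime-sized lists; this is rejected by the guard added to
+    # `jitAndCreateArgs` in py_alt_launch_kernel.cpp.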
+    with pytest.raises(RuntimeError) as e:
+        test2()
+    assert 'return values with dynamically sized element types are not yet supported' in str(
+        e.value)
+
+    @cudaq.kernel
+    def test3() -> list[MyTuple]:
+        t1 = MyTuple([1, 1], [1, 1])
+        t2 = MyTuple([2, 2], [2, 2])
+        l = [t1, t2]
+        return l
+
+    with pytest.raises(RuntimeError) as e:
+        test3()
+    assert 'only dataclass literals may be used as items in other container values' in str(
+        e.value)
+    assert 'use `.copy(deep)` to create a new MyTuple' in str(e.value)
+
+    @cudaq.kernel
+    def test4() -> list[MyTuple]:
+        t = MyTuple([2, 2], [2, 2])
+        l = [MyTuple([1, 1], [1, 1]) for _ in range(3)]
+        l[0] = t
+        return l
+
+    with pytest.raises(RuntimeError) as e:
+        test4()
+    assert 'only dataclass literals may be used as items in other container values' in str(
+        e.value)
+    assert 'use `.copy(deep)` to create a new MyTuple' in str(e.value)
+
+    @cudaq.kernel
+    def test5() -> tuple[MyTuple, MyTuple]:
+        t1 = MyTuple([1, 1], [1, 1])
+        t2 = MyTuple([2, 2], [2, 2])
+        return (t1, t2)
+
+    with pytest.raises(RuntimeError) as e:
+        test5()
+    assert 'only dataclass literals may be used as items in other container values' in str(
+        e.value)
+    assert 'use `.copy(deep)` to create a new MyTuple' in str(e.value)
+
+    @cudaq.kernel
+    def test6() -> tuple[MyTuple, MyTuple]:
+        l = [MyTuple([1], [1])]
+        t = MyTuple([2], [2])
+        l[0] = t
+        t.l1 = [3]
+        l[0].l2 = [4]
+        # If we allowed this, then
+        # t should be MyTuple(l1=[3], l2=[4]) and
+        # l should be [MyTuple(l1=[3], l2=[4])]
+        return (l[0], t)
+
+    with pytest.raises(RuntimeError) as e:
+        test6()
+    assert 'only dataclass literals may be used as items in other container values' in str(
+        e.value)
+    assert 'use `.copy(deep)` to create a new MyTuple' in str(e.value)
+
+    @cudaq.kernel
+    def update_list(old: MyTuple, new: list[int]):
+        for idx, v in enumerate(new):
+            old.l1[idx] = v
+
+    @cudaq.kernel
+    def test7(cond: bool) -> list[int]:
+        l1 = [1, 1]
+        t = MyTuple(l1, l1)
+        if cond:
+            update_list(t, [2, 2])
+        t.l1[0] = 5
+        return [t.l1[0], t.l1[1], t.l2[0], t.l2[1], l1[0], l1[1]]
+
+    with pytest.raises(RuntimeError) as e:
+        test7(True)
+    assert 'value cannot be modified - use `.copy(deep)` to create a new value that can be modified' in str(
+        e.value)
+    assert '(offending source -> old.l1)' in str(e.value)
+
+    @cudaq.kernel
+    def modify_and_return_item(ls: list[MyTuple], idx: int) -> MyTuple:
+        ls[idx].l1[0] = 2
+        return ls[idx]
+
+    @cudaq.kernel
+    def test8() -> list[int]:
+        l1 = [0, 0]
+        tlist = [MyTuple(l1, l1)]
+        t = modify_and_return_item(tlist, 0)
+        t.l1[1] = 3
+        # If we allowed the return in modify_and_return_item,
+        # the correct output would be [2, 3, 2, 3, 2, 3]
+        return [t.l1[0], t.l1[1], t.l2[0], t.l2[1], l1[0], l1[1]]
+
+    with pytest.raises(RuntimeError) as e:
+        test8()
+    assert 'return value must not contain a list that is a function argument or an item in a function argument' in str(
+        e.value)
+    assert '(offending source -> return ls[idx])' in str(e.value)
+
+
+def test_list_of_list_updates():
+
+    @cudaq.kernel
+    def flatten(ls: list[list[int]]) -> list[int]:
+        size = 0
+        for l in ls:
+            size += len(l)
+        res = [0 for _ in range(size)]
+        idx = 0
+        for l in ls:
+            for i in l:
+                res[idx] = i
+                idx += 1
+        return res
+
+    @cudaq.kernel
+    def test1() -> list[int]:
+        l1 = [1, 1]
+        l2 = l1
+        l3 = [2, 2]
+        l1 = l3
+        l3[0] = 5
+        return flatten([l1, l2, l3])
+
+    assert test1() == [5, 2, 1, 1, 5, 2]
+
+    @cudaq.kernel
+    def test2(cond: bool) -> list[int]:
+        element = [1, 1]
+        ls = [element, element]
+        if cond:
+            update 
= [2, 2] + ls[0] = update + update[0] = 5 + return flatten([ls[0], ls[1], element]) + + assert test2(True) == [5, 2, 1, 1, 1, 1] + assert test2(False) == [1, 1, 1, 1, 1, 1] + + @cudaq.kernel + def test3(cond: bool) -> list[int]: + element = [1, 1] + ls = [element, element] + if cond: + update = [2, 2] + ls[0] = update + ls[0][0] = 5 + return flatten([ls[0], ls[1], update]) + return flatten([ls[0], ls[1], element]) + + assert test3(True) == [5, 2, 1, 1, 5, 2] + assert test3(False) == [1, 1, 1, 1, 1, 1] + + @cudaq.kernel + def test4(cond: bool) -> list[int]: + element = [1, 1] + ls = [element, element] + if cond: + ls[0][0] = 5 + return flatten([ls[0], ls[1], element]) + + assert test4(True) == [5, 1, 5, 1, 5, 1] + assert test4(False) == [1, 1, 1, 1, 1, 1] + + @cudaq.kernel + def test5(cond: bool) -> list[int]: + element = [1, 1] + ls = [element] + copy = ls[0] + if cond: + ls[0][0] = 5 + return flatten([ls[0], copy, element]) + + assert test5(True) == [5, 1, 5, 1, 5, 1] + assert test5(False) == [1, 1, 1, 1, 1, 1] + + +def test_list_of_list_update_failures(): + + @cudaq.kernel + def flatten(ls: list[list[int]]) -> list[int]: + size = 0 + for l in ls: + size += len(l) + res = [0 for _ in range(size)] + idx = 0 + for l in ls: + for i in l: + res[idx] = i + idx += 1 + return res + + @cudaq.kernel + def test1(cond: bool) -> list[int]: + l1 = [1, 1] + l2 = l1 + if cond: + l3 = [2, 2] + l1 = l3 + l3[0] = 5 + return flatten([l1, l2, l3]) + return flatten([l1, l2]) + + with pytest.raises(RuntimeError) as e: + test1(True) + assert 'variable defined in parent scope cannot be modified' in str(e.value) + assert '(offending source -> l1 = l3)' in str(e.value) + + +def test_disallow_update_capture(): + + n = 3 + ls = [1, 2, 3] + + @cudaq.kernel + def kernel1() -> int: + # Shadow n, no error + n = 4 + return n + + res = kernel1() + assert res == 4 + + @cudaq.kernel + def kernel2() -> int: + if True: + # Shadow n, no error + n = 4 + # n is not defined in this scope, error + return n + + with pytest.raises(RuntimeError) as e: + kernel2() + assert "'n' is not defined" in repr(e) + + @cudaq.kernel + def kernel3() -> int: + if True: + # causes the variable to be added to the symbol table + cudaq.dbg.ast.print_i64(n) + # Change n, emits an error + n += 4 + return n + + with pytest.raises(RuntimeError) as e: + kernel3() + assert "CUDA-Q does not allow assignments to variables captured from parent scope" in str( + e.value) + assert "(offending source -> n)" in str(e.value) + + @cudaq.kernel + def kernel4() -> list[int]: + vals = ls + vals[0] = 5 + return vals + + assert kernel4() == [5, 2, 3] and ls == [1, 2, 3] + + @cudaq.kernel + def kernel5(): + ls[0] = 5 + + with pytest.raises(RuntimeError) as e: + kernel5() + assert "CUDA-Q does not allow assignments to variables captured from parent scope" in str( + e.value) + assert "(offending source -> ls)" in str(e.value) + + tp = (1, 5) + + @cudaq.kernel + def kernel6() -> tuple[int, int]: + # Capturing tuples is not currently supported. + # If support is enabled, add test to check that it + # cannot be modified inside the kernel. 
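+        # (e.g. an assignment such as `tp = (2, 2)` in here should
+        # then be rejected, analogous to the list capture case in
+        # kernel5 above)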
+ return tp + + with pytest.raises(RuntimeError) as e: + kernel6() + assert "Invalid type for variable (tp) captured from parent scope" in str( + e.value) + assert "(offending source -> tp)" in str(e.value) + + +def test_disallow_value_updates(): + + @cudaq.kernel + def test1() -> list[bool]: + qs = cudaq.qvector(4) + c = qs[0] + if True: + c = qs[1] + x(c) + return mz(qs) + + with pytest.raises(RuntimeError) as e: + test1() + assert 'variable defined in parent scope cannot be modified' in str(e.value) + assert '(offending source -> c = qs[1])' in str(e.value) + + @cudaq.kernel + def test2() -> bool: + qs = cudaq.qvector(2) + res = mz(qs[0]) + if True: + x(qs[1]) + res = mz(qs[1]) + return res + + # TODO: The reason we cannot currently support this is + # because we store measurement results as values in the + # symbol table. This should be changed and supported when + # we do the change to properly distinguish measurement + # types from booleans. + with pytest.raises(RuntimeError) as e: + test2() + assert 'variable defined in parent scope cannot be modified' in str(e.value) + assert '(offending source -> res = mz(qs[1]))' in str(e.value) + + +def test_function_arguments(): + + @dataclass(slots=True) + class BasicTuple: + first: int + second: float + + @dataclass(slots=True) + class ListTuple: + first: list[int] + second: list[float] + + # Case 1: value is function arg + # Case 2: value is item in function arg + # Case a: value is a list + # Case b: value is a tuple that does not contain a list + # Case c: value is a tuple that contains a list + # Case d: value is a dataclass that does not contain a list + # Case e: value is a dataclass that contains a list + + # Assignment to the same scope + + @cudaq.kernel + def test1a(value: list[int]) -> list[int]: + local = [1., 1.] + local = value + return local + + with pytest.raises(RuntimeError) as e: + test1a.compile() + assert 'return value must not contain a list that is a function argument or an item in a function argument' in str( + e.value) + + @cudaq.kernel + def test1b(value: tuple[int, int]) -> list[tuple[int, int]]: + local = (1., 1.) + local = value + return [local] + + test1b.compile() + + @cudaq.kernel + def test1c( + value: tuple[list[int], list[int]]) -> tuple[list[int], list[int]]: + local = ([1], [1]) + local = value + return local + + with pytest.raises(RuntimeError) as e: + test1c.compile() + assert 'cannot assign tuple or dataclass passed as function argument to a local variable if it contains a list' in str( + e.value) + + @cudaq.kernel + def test1d(value: BasicTuple) -> BasicTuple: + local = BasicTuple(1, 5) + local = value + return local + + with pytest.raises(RuntimeError) as e: + test1d.compile() + assert 'cannot assign dataclass passed as function argument to a local variable' in str( + e.value) + + @cudaq.kernel + def test1e(value: ListTuple) -> ListTuple: + local = ListTuple([1], [1]) + local = value + return local + + with pytest.raises(RuntimeError) as e: + test1e.compile() + assert 'cannot assign dataclass passed as function argument to a local variable' in str( + e.value) + + @cudaq.kernel + def test2a(value: list[list[int]]) -> list[int]: + local = [1., 1.] + local = value[0] + return local + + with pytest.raises(RuntimeError) as e: + test2a.compile() + assert 'lists passed as or contained in function arguments cannot be assigned to to a local variable' in str( + e.value) + + @cudaq.kernel + def test2b(value: list[tuple[int, int]]) -> list[tuple[int, int]]: + local = (1., 1.) 
+ local = value[0] + return [local] + + test2b.compile() + + @cudaq.kernel + def test2c( + value: list[tuple[list[int], + list[int]]]) -> tuple[list[int], list[int]]: + local = ([1.], [1.]) + local = value[0] + return local + + with pytest.raises(RuntimeError) as e: + test2c.compile() + assert 'cannot assign tuple or dataclass passed as function argument to a local variable if it contains a list' in str( + e.value) + + @cudaq.kernel + def test2d(value: tuple[BasicTuple, BasicTuple]) -> BasicTuple: + local = BasicTuple(1, 1) + local = value[0] + return local + + test2d.compile() + + @cudaq.kernel + def test2e(value: tuple[ListTuple, ListTuple]) -> ListTuple: + local = ListTuple([1], [1]) + local = value[0] + return local + + with pytest.raises(RuntimeError) as e: + test2e.compile() + assert 'cannot assign tuple or dataclass passed as function argument to a local variable if it contains a list' in str( + e.value) + + # Assignment to a parent scope + + @cudaq.kernel + def test1a(cond: bool, value: list[int]) -> list[int]: + local = [1., 1.] + if cond: + local = value + return local + + with pytest.raises(RuntimeError) as e: + test1a.compile() + assert 'lists passed as or contained in function arguments cannot be assigned to variables in the parent scope' in str( + e.value) + + @cudaq.kernel + def test1b(cond: bool, value: tuple[int, int]) -> list[tuple[int, int]]: + local = (1., 1.) + if cond: + local = value + return [local] + + test1b.compile() + + @cudaq.kernel + def test1c( + cond: bool, value: tuple[list[int], + list[int]]) -> tuple[list[int], list[int]]: + local = ([1], [1]) + if cond: + local = value + return local + + with pytest.raises(RuntimeError) as e: + test1c.compile() + assert 'cannot assign tuple or dataclass passed as function argument to a local variable if it contains a list' in str( + e.value) + + @cudaq.kernel + def test1d(cond: bool, value: BasicTuple) -> BasicTuple: + local = BasicTuple(1, 5) + if cond: + local = value + return local + + with pytest.raises(RuntimeError) as e: + test1d.compile() + assert 'cannot assign dataclass passed as function argument to a local variable' in str( + e.value) + + @cudaq.kernel + def test1e(cond: bool, value: ListTuple) -> ListTuple: + local = ListTuple([1], [1]) + if cond: + local = value + return local + + with pytest.raises(RuntimeError) as e: + test1e.compile() + assert 'cannot assign dataclass passed as function argument to a local variable' in str( + e.value) + + @cudaq.kernel + def test2a(cond: bool, value: tuple[list[int], list[int]]) -> list[int]: + local = [1., 1.] + if cond: + local = value[0] + return local + + with pytest.raises(RuntimeError) as e: + test2a.compile() + assert 'lists passed as or contained in function arguments cannot be assigned to to a local variable' in str( + e.value) + + @cudaq.kernel + def test2b( + cond: bool, value: tuple[tuple[int, int], + tuple[int, int]]) -> list[tuple[int, int]]: + local = (1., 1.) 
+ if cond: + local = value[0] + return [local] + + test2b.compile() + + @cudaq.kernel + def test2c( + cond: bool, + value: list[tuple[list[int], + list[int]]]) -> tuple[list[int], list[int]]: + local = ([1.], [1.]) + if cond: + local = value[0] + return local + + with pytest.raises(RuntimeError) as e: + test2c.compile() + assert 'cannot assign tuple or dataclass passed as function argument to a local variable if it contains a list' in str( + e.value) + + @cudaq.kernel + def test2d(cond: bool, value: list[BasicTuple]) -> BasicTuple: + local = BasicTuple(1, 1) + if cond: + local = value[0] + return local + + with pytest.raises(RuntimeError) as e: + test2d.compile() + assert 'only literals can be assigned to variables defined in parent scope' in str( + e.value) + + @cudaq.kernel + def test2e(cond: bool, value: list[ListTuple]) -> ListTuple: + local = ListTuple([1], [1]) + if cond: + local = value[0] + return local + + with pytest.raises(RuntimeError) as e: + test2e.compile() + assert 'cannot assign tuple or dataclass passed as function argument to a local variable if it contains a list' in str( + e.value) + + # Item assignment to a container in the same scope + + @cudaq.kernel + def test1a(value: list[int]) -> list[list[int]]: + local = [[1., 1.]] + local[0] = value + return local + + with pytest.raises(RuntimeError) as e: + test1a.compile() + assert 'lists passed as or contained in function arguments cannot be inner items in other container values' in str( + e.value) + + @cudaq.kernel + def test1b(value: tuple[int, int]) -> list[tuple[int, int]]: + local = [(1., 1.)] + local[0] = value + return local + + test1b.compile() + + @cudaq.kernel + def test1c( + value: tuple[list[int], + list[int]]) -> list[tuple[list[int], list[int]]]: + local = [([1], [1])] + local[0] = value + return local + + with pytest.raises(RuntimeError) as e: + test1c.compile() + assert 'lists passed as or contained in function arguments cannot be inner items in other container values' in str( + e.value) + + @cudaq.kernel + def test1d(value: BasicTuple) -> list[BasicTuple]: + local = [BasicTuple(1, 5)] + local[0] = value + return local + + with pytest.raises(RuntimeError) as e: + test1d.compile() + assert 'only dataclass literals may be used as items in other container values' in str( + e.value) + + @cudaq.kernel + def test1e(value: ListTuple) -> list[ListTuple]: + local = [ListTuple([1], [1])] + local[0] = value + return local + + with pytest.raises(RuntimeError) as e: + test1e.compile() + assert 'only dataclass literals may be used as items in other container values' in str( + e.value) + + @cudaq.kernel + def test2a(value: list[list[int]]) -> list[list[int]]: + local = [[1., 1.]] + local[0] = value[0] + return local + + with pytest.raises(RuntimeError) as e: + test2a.compile() + assert 'lists passed as or contained in function arguments cannot be inner items in other container values' in str( + e.value) + + @cudaq.kernel + def test2b(value: list[tuple[int, int]]) -> list[tuple[int, int]]: + local = [(1., 1.)] + local[0] = value[0] + return local + + test2b.compile() + + @cudaq.kernel + def test2c( + value: list[tuple[list[int], list[int]]] + ) -> list[tuple[list[int], list[int]]]: + local = [([1.], [1.])] + local[0] = value[0] + return local + + with pytest.raises(RuntimeError) as e: + test2c.compile() + assert 'lists passed as or contained in function arguments cannot be inner items in other container values' in str( + e.value) + + @cudaq.kernel + def test2d(value: tuple[BasicTuple, BasicTuple]) -> list[BasicTuple]: + 
local = [BasicTuple(1, 1)] + local[0] = value[0] + return local + + with pytest.raises(RuntimeError) as e: + test2d.compile() + assert 'only dataclass literals may be used as items in other container values' in str( + e.value) + + @cudaq.kernel + def test2e(value: tuple[ListTuple, ListTuple]) -> list[ListTuple]: + local = [ListTuple([1], [1])] + local[0] = value[0] + return local + + with pytest.raises(RuntimeError) as e: + test2e.compile() + assert 'only dataclass literals may be used as items in other container values' in str( + e.value) + + # Item assignment to a container in a parent scope + + @cudaq.kernel + def test1a(cond: bool, value: list[int]) -> list[list[int]]: + local = [[1., 1.]] + if cond: + local[0] = value + return local + + with pytest.raises(RuntimeError) as e: + test1a.compile() + assert 'lists passed as or contained in function arguments cannot be inner items in other container values' in str( + e.value) + + @cudaq.kernel + def test1b(cond: bool, value: tuple[int, int]) -> list[tuple[int, int]]: + local = [(1., 1.)] + if cond: + local[0] = value + return local + + test1b.compile() + + @cudaq.kernel + def test1c( + cond: bool, + value: tuple[list[int], + list[int]]) -> list[tuple[list[int], list[int]]]: + local = [([1], [1])] + if cond: + local[0] = value + return local + + with pytest.raises(RuntimeError) as e: + test1c.compile() + assert 'lists passed as or contained in function arguments cannot be inner items in other container values' in str( + e.value) + + @cudaq.kernel + def test1d(cond: bool, value: BasicTuple) -> list[BasicTuple]: + local = [BasicTuple(1, 5)] + if cond: + local[0] = value + return local + + with pytest.raises(RuntimeError) as e: + test1d.compile() + assert 'only dataclass literals may be used as items in other container values' in str( + e.value) + + @cudaq.kernel + def test1e(cond: bool, value: ListTuple) -> list[ListTuple]: + local = [ListTuple([1], [1])] + if cond: + local[0] = value + return local + + with pytest.raises(RuntimeError) as e: + test1e.compile() + assert 'only dataclass literals may be used as items in other container values' in str( + e.value) + + @cudaq.kernel + def test2a(cond: bool, value: list[list[int]]) -> list[list[int]]: + local = [[1., 1.]] + if cond: + local[0] = value[0] + return local + + with pytest.raises(RuntimeError) as e: + test2a.compile() + assert 'lists passed as or contained in function arguments cannot be inner items in other container values' in str( + e.value) + + @cudaq.kernel + def test2b(cond: bool, value: list[tuple[int, + int]]) -> list[tuple[int, int]]: + local = [(1., 1.)] + if cond: + local[0] = value[0] + return local + + test2b.compile() + + @cudaq.kernel + def test2c( + cond: bool, value: list[tuple[list[int], list[int]]] + ) -> list[tuple[list[int], list[int]]]: + local = [([1.], [1.])] + if cond: + local[0] = value[0] + return local + + with pytest.raises(RuntimeError) as e: + test2c.compile() + assert 'lists passed as or contained in function arguments cannot be inner items in other container values' in str( + e.value) + + @cudaq.kernel + def test2d(cond: bool, value: tuple[BasicTuple, + BasicTuple]) -> list[BasicTuple]: + local = [BasicTuple(1, 1)] + if cond: + local[0] = value[0] + return local + + with pytest.raises(RuntimeError) as e: + test2d.compile() + assert 'only dataclass literals may be used as items in other container values' in str( + e.value) + + @cudaq.kernel + def test2e(cond: bool, value: tuple[ListTuple, + ListTuple]) -> list[ListTuple]: + local = [ListTuple([1], [1])] + 
if cond: + local[0] = value[0] + return local + + with pytest.raises(RuntimeError) as e: + test2e.compile() + assert 'only dataclass literals may be used as items in other container values' in str( + e.value) + + +# leave for gdb debugging +if __name__ == "__main__": + loc = os.path.abspath(__file__) + pytest.main([loc, "-rP"]) diff --git a/python/tests/kernel/test_control_negations.py b/python/tests/kernel/test_control_negations.py index b22d3d0e00b..49d990bb717 100644 --- a/python/tests/kernel/test_control_negations.py +++ b/python/tests/kernel/test_control_negations.py @@ -28,6 +28,15 @@ def control_simple_gate(): counts = cudaq.sample(control_simple_gate) assert counts["01"] == 1000 + @cudaq.kernel + def multi_control_simple_gate(): + c, q = cudaq.qvector(4), cudaq.qubit() + x(c[0], c[3]) + cx(c[0], ~c[1], ~c[2], c[3], q) + + counts = cudaq.sample(multi_control_simple_gate) + assert counts["10011"] == 1000 + @cudaq.kernel def control_rotation_gate(): c, q = cudaq.qubit(), cudaq.qubit() @@ -37,6 +46,15 @@ def control_rotation_gate(): counts = cudaq.sample(control_rotation_gate) assert counts["01"] == 1000 + @cudaq.kernel + def multi_control_rotation_gate(): + c, q = cudaq.qvector(4), cudaq.qubit() + x(c[0], c[3]) + cry(np.pi, c[0], ~c[1], ~c[2], c[3], q) + + counts = cudaq.sample(multi_control_rotation_gate) + assert counts["10011"] == 1000 + # Note: u3, swap, and exp_pauli do not have a built-in # c version at the time of writing this. @@ -61,16 +79,6 @@ def multi_control_simple_gate(): counts = cudaq.sample(multi_control_simple_gate) assert counts["10011"] == 1000 - @cudaq.kernel - def multi_control_simple_gate2(): - c, q = cudaq.qvector(4), cudaq.qubit() - x(c[0], c[3]) - c1, c2, c3, c4 = c - x.ctrl(c1, ~c2, ~c3, c4, q) - - counts = cudaq.sample(multi_control_simple_gate2) - assert counts["10011"] == 1000 - @cudaq.kernel def control_rotation_gate(): c, q = cudaq.qubit(), cudaq.qubit() @@ -89,16 +97,6 @@ def multi_control_rotation_gate(): counts = cudaq.sample(multi_control_rotation_gate) assert counts["10011"] == 1000 - @cudaq.kernel - def multi_control_rotation_gate2(): - c, q = cudaq.qvector(4), cudaq.qubit() - x(c[0], c[3]) - c1, c2, c3, c4 = c - ry.ctrl(np.pi, c1, ~c2, ~c3, c4, q) - - counts = cudaq.sample(multi_control_rotation_gate2) - assert counts["10011"] == 1000 - @cudaq.kernel def control_swap_gate(): c, q1, q2 = cudaq.qubit(), cudaq.qubit(), cudaq.qubit() @@ -119,17 +117,6 @@ def multi_control_swap_gate(): counts = cudaq.sample(multi_control_swap_gate) assert counts["100101"] == 1000 - @cudaq.kernel - def multi_control_swap_gate2(): - c, q1, q2 = cudaq.qvector(4), cudaq.qubit(), cudaq.qubit() - x(q1) - x(c[0], c[3]) - c1, c2, c3, c4 = c - swap.ctrl(c1, ~c2, ~c3, c4, q1, q2) - - counts = cudaq.sample(multi_control_swap_gate2) - assert counts["100101"] == 1000 - @cudaq.kernel def control_u3_gate(): c, q = cudaq.qubit(), cudaq.qubit() @@ -150,17 +137,6 @@ def multi_control_u3_gate(): counts = cudaq.sample(multi_control_u3_gate) assert counts["10101"] == 1000 - @cudaq.kernel - def multi_control_u3_gate2(): - c, q = cudaq.qvector(4), cudaq.qubit() - x(c[0], c[2]) - c1, c2, c3, c4 = c - t, p, l = np.pi, 0., 0. 
- u3.ctrl(t, p, l, c1, ~c2, c3, ~c4, q) - - counts = cudaq.sample(multi_control_u3_gate2) - assert counts["10101"] == 1000 - cudaq.register_operation("custom_x", np.array([0, 1, 1, 0])) @cudaq.kernel @@ -181,16 +157,6 @@ def multi_control_registered_operation(): counts = cudaq.sample(multi_control_registered_operation) assert counts["10101"] == 1000 - @cudaq.kernel - def multi_control_registered_operation2(): - c, q = cudaq.qvector(4), cudaq.qubit() - x(c[0], c[2]) - c1, c2, c3, c4 = c - custom_x.ctrl(c1, ~c2, c3, ~c4, q) - - counts = cudaq.sample(multi_control_registered_operation2) - assert counts["10101"] == 1000 - def test_cudaq_control(): @@ -207,6 +173,15 @@ def control_kernel(): counts = cudaq.sample(control_kernel) assert counts["01"] == 1000 + @cudaq.kernel + def multi_control_kernel(): + c, q = cudaq.qvector(4), cudaq.qubit() + x(c[0], c[3]) + cudaq.control(custom_x, c[0], ~c[1], ~c[2], c[3], q) + + counts = cudaq.sample(multi_control_kernel) + assert counts["10011"] == 1000 + # Note: calling cudaq.control on a registered operation # or on a built-in gate is not supported at the time of writing this diff --git a/python/tests/kernel/test_direct_call_return_kernel.py b/python/tests/kernel/test_direct_call_return_kernel.py index 7827b85bf54..20fc1705495 100644 --- a/python/tests/kernel/test_direct_call_return_kernel.py +++ b/python/tests/kernel/test_direct_call_return_kernel.py @@ -203,7 +203,7 @@ def simple_list_bool_no_args() -> list[bool]: @cudaq.kernel def simple_list_bool(n: int, t: list[bool]) -> list[bool]: qubits = cudaq.qvector(n) - return t + return t.copy() result = simple_list_bool(2, [True, False, True]) assert result == [True, False, True] @@ -221,7 +221,7 @@ def simple_list_int_no_args() -> list[int]: @cudaq.kernel def simple_list_int(n: int, t: list[int]) -> list[int]: qubits = cudaq.qvector(n) - return t + return t.copy() result = simple_list_int(2, [-13, 5, 42]) assert result == [-13, 5, 42] @@ -239,7 +239,7 @@ def simple_list_int32_no_args() -> list[np.int32]: @cudaq.kernel def simple_list_int32(n: int, t: list[np.int32]) -> list[np.int32]: qubits = cudaq.qvector(n) - return t + return t.copy() result = simple_list_int32(2, [-13, 5, 42]) assert result == [-13, 5, 42] @@ -257,7 +257,7 @@ def simple_list_int16_no_args() -> list[np.int16]: @cudaq.kernel def simple_list_int16(n: int, t: list[np.int16]) -> list[np.int16]: qubits = cudaq.qvector(n) - return t + return t.copy() result = simple_list_int16(2, [-13, 5, 42]) assert result == [-13, 5, 42] @@ -275,7 +275,7 @@ def simple_list_int8_no_args() -> list[np.int8]: @cudaq.kernel def simple_list_int8(n: int, t: list[np.int8]) -> list[np.int8]: qubits = cudaq.qvector(n) - return t + return t.copy() result = simple_list_int8(2, [-13, 5, 42]) assert result == [-13, 5, 42] @@ -293,7 +293,7 @@ def simple_list_int64_no_args() -> list[np.int64]: @cudaq.kernel def simple_list_int64(n: int, t: list[np.int64]) -> list[np.int64]: qubits = cudaq.qvector(n) - return t + return t.copy() result = simple_list_int64(2, [-13, 5, 42]) assert result == [-13, 5, 42] @@ -311,7 +311,7 @@ def simple_list_float_no_args() -> list[float]: @cudaq.kernel def simple_list_float(n: int, t: list[float]) -> list[float]: qubits = cudaq.qvector(n) - return t + return t.copy() result = simple_list_float(2, [-13.2, 5.0, 42.99]) assert result == [-13.2, 5.0, 42.99] @@ -330,7 +330,7 @@ def simple_list_float32_no_args() -> list[np.float32]: @cudaq.kernel def simple_list_float32(n: int, t: list[np.float32]) -> list[np.float32]: qubits = cudaq.qvector(n) - 
return t
+        return t.copy()
 
     result = simple_list_float32(2, [-13.2, 5.0, 42.99])
     assert is_close_array(result, [-13.2, 5.0, 42.99])
@@ -348,7 +348,7 @@ def simple_list_float64_no_args() -> list[np.float64]:
     @cudaq.kernel
     def simple_list_float64(n: int, t: list[np.float64]) -> list[np.float64]:
         qubits = cudaq.qvector(n)
-        return t
+        return t.copy()
 
     result = simple_list_float64(2, [-13.2, 5.0, 42.99])
     assert result == [-13.2, 5.0, 42.99]
@@ -382,8 +382,7 @@ def simple_tuple_int_float_assign(
 
     with pytest.raises(RuntimeError) as e:
         simple_tuple_int_float_assign(2, (-13, 42.3))
-    assert 'indexing into tuple or dataclass must not modify value' in str(
-        e.value)
+    assert 'tuple value cannot be modified' in str(e.value)
 
 
 def test_return_tuple_float_int():
@@ -447,12 +446,9 @@ def test_return_tuple_int32_bool():
     def simple_tuple_int32_bool_no_args() -> tuple[np.int32, bool]:
         return (-13, True)
 
-    with pytest.raises(RuntimeError) as e:
-        simple_tuple_int32_bool_no_args()
-    # Note: it may make sense to support that if/since we support
-    # the cast for the individual item types.
-    assert 'cannot convert value of type !cc.struct<"tuple" {i64, i1}> to the requested type !cc.struct<"tuple" {i32, i1}>' in str(
-        e.value)
+    result = simple_tuple_int32_bool_no_args()
+    # See https://github.com/NVIDIA/cuda-quantum/issues/3524
+    assert result == (-13, True)
 
     @cudaq.kernel
     def simple_tuple_int32_bool_no_args1() -> tuple[np.int32, bool]:
@@ -473,10 +469,6 @@ def simple_tuple_int32_bool(
         return t
 
     result = simple_tuple_int32_bool(2, (np.int32(-13), True))
-    # Note: printing the kernel correctly shows the MLIR
-    # values return type as "tuple" {i32, i1}, but we don't
-    # actually create numpy values even when these are requested
-    # in the signature.
     # See https://github.com/NVIDIA/cuda-quantum/issues/3524
     assert result == (-13, True)
diff --git a/python/tests/kernel/test_explicit_measurements.py b/python/tests/kernel/test_explicit_measurements.py
index d14b764570e..5cba790e668 100644
--- a/python/tests/kernel/test_explicit_measurements.py
+++ b/python/tests/kernel/test_explicit_measurements.py
@@ -110,7 +110,7 @@ def kernel(theta: float, phi: float):
     assert len(seq[0]) == 20  # num qubits * num_rounds
 
 
-def test_named_measurment():
+def test_named_measurement():
     """
     Test that, while using "explicit measurements" mode, the sample result
     will not be saved to a mid-circuit measurement register.
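     In explicit measurements mode, every measurement in the program is
     appended, in order, to one global sample result (cf. the
     `num qubits * num_rounds` length check above) instead of being
     recorded under a per-measurement register name.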
""" diff --git a/python/tests/kernel/test_kernel_features.py b/python/tests/kernel/test_kernel_features.py index 100a7dc648a..15502764a59 100644 --- a/python/tests/kernel/test_kernel_features.py +++ b/python/tests/kernel/test_kernel_features.py @@ -839,6 +839,124 @@ def kernel6(): assert len(counts) == 1 assert '0101' in counts + @cudaq.kernel + def kernel7(): + qubits = cudaq.qvector(5) + r = [i for i in range(2, 5)] + for i in r: + x(qubits[i]) + + counts = cudaq.sample(kernel7) + assert len(counts) == 1 + assert '00111' in counts + + @cudaq.kernel + def kernel8(): + qubits = cudaq.qvector(5) + r = [i for i in range(2, 6, 2)] + for i in r: + x(qubits[i]) + + counts = cudaq.sample(kernel8) + assert len(counts) == 1 + assert '00101' in counts + + @cudaq.kernel + def kernel9(): + qubits = cudaq.qvector(5) + r = [i for i in range(6, 2, 2)] + for i in r: + x(qubits[i]) + + counts = cudaq.sample(kernel9) + assert len(counts) == 1 + assert '00000' in counts + + @cudaq.kernel + def kernel10(): + qubits = cudaq.qvector(5) + r = [i for i in range(3, 0, -2)] + for i in r: + x(qubits[i]) + + counts = cudaq.sample(kernel10) + assert len(counts) == 1 + assert '01010' in counts + + @cudaq.kernel + def kernel11(): + qubits = cudaq.qvector(5) + r = [i for i in range(-5, -2, -2)] + for i in r: + x(qubits[i]) + + counts = cudaq.sample(kernel11) + assert len(counts) == 1 + assert '00000' in counts + + @cudaq.kernel + def kernel12(): + qubits = cudaq.qvector(5) + r = [i for i in range(-1, -5, -2)] + for i in r: + x(qubits[-i]) + + counts = cudaq.sample(kernel12) + assert len(counts) == 1 + assert '01010' in counts + + @cudaq.kernel + def kernel13(): + qubits = cudaq.qvector(5) + r = [i for i in range(1, -4, -1)] + for i in r: + if i < 0: + x(qubits[-i]) + else: + x(qubits[i]) + + counts = cudaq.sample(kernel13) + assert len(counts) == 1 + assert '10110' in counts + + @cudaq.kernel + def kernel14(): + qubits = cudaq.qvector(5) + r = [i for i in range(-2, 6, 2)] + for i in r: + if i < 0: + x(qubits[-i]) + else: + x(qubits[i]) + + counts = cudaq.sample(kernel14) + assert len(counts) == 1 + assert '10001' in counts + + @cudaq.kernel + def kernel15(): + qubits = cudaq.qvector(5) + r = [i for i in range(1, 4, 0)] + for i in r: + x(qubits[i]) + + with pytest.raises(RuntimeError) as e: + cudaq.sample(kernel15) + assert "range step value must be non-zero" in str(e.value) + assert "offending source -> range(1, 4, 0)" in str(e.value) + + @cudaq.kernel + def kernel16(v: int): + qubits = cudaq.qvector(5) + r = [i for i in range(1, 4, v)] + for i in r: + x(qubits[i]) + + with pytest.raises(RuntimeError) as e: + cudaq.sample(kernel16) + assert "range step value must be a constant" in str(e.value) + assert "offending source -> range(1, 4, v)" in str(e.value) + def test_array_value_assignment(): @@ -999,22 +1117,6 @@ def canCaptureList(): atol=1e-3) -def test_capture_disallow_change_variable(): - - n = 3 - - @cudaq.kernel - def kernel() -> int: - if True: - cudaq.dbg.ast.print_i64(n) - # Change n, emits an error - n = 4 - return n - - with pytest.raises(RuntimeError) as e: - kernel() - - def test_inner_function_capture(): n = 3 @@ -1558,7 +1660,8 @@ def test_kernel(): with pytest.raises(RuntimeError) as e: test_kernel.compile() - assert 'invalid assignment detected.' 
in repr(e)
+    assert 'no valid value was created' in repr(e)
+    assert '(offending source -> state_reg = cudaq.qubit)' in repr(e)
 
 
 def test_cast_error_1451():
@@ -1955,23 +2058,82 @@ def test(input: CustomIntAndFloatType):
     counts.dump()
     assert len(counts) == 2 and '00' in counts and '11' in counts
 
+    # FIXME:
+    # While this exact test worked, the handling in OpaqueArguments.h
+    # does not match the expected layout in the args creator.
+    # Correspondingly, both subsequent tests below failed with a crash
+    # as it was. I hence chose to give a proper error until this is
+    # fixed after general Python compiler revisions.
     @dataclass(slots=True)
     class CustomIntAndListFloat:
         integer: int
-        array: List[float]
+        array: list[float]
 
     @cudaq.kernel
     def test(input: CustomIntAndListFloat):
         qubits = cudaq.qvector(input.integer)
         ry(input.array[0], qubits[0])
-        rx(input.array[1], qubits[0])
+        ry(input.array[1], qubits[2])
         x.ctrl(qubits[0], qubits[1])
 
-    print(test)
-    instance = CustomIntAndListFloat(2, [np.pi / 2., np.pi])
-    counts = cudaq.sample(test, instance)
-    counts.dump()
-    assert len(counts) == 2 and '00' in counts and '11' in counts
+    instance = CustomIntAndListFloat(3, [np.pi, np.pi])
+    with pytest.raises(RuntimeError) as e:
+        counts = cudaq.sample(test, instance)
+    assert 'dynamically sized element types for function arguments are not yet supported' in str(
+        e.value)
+    # Should be: assert len(counts) == 1 and '111' in counts
+
+    @cudaq.kernel
+    def test(input: CustomIntAndListFloat):
+        qubits = cudaq.qvector(input.integer)
+        ry(input.array[1], qubits[0])
+        ry(input.array[3], qubits[2])
+        x.ctrl(qubits[0], qubits[1])
+
+    instance = CustomIntAndListFloat(3, [0, np.pi, 0, np.pi])
+    with pytest.raises(RuntimeError) as e:
+        counts = cudaq.sample(test, instance)
+    assert 'dynamically sized element types for function arguments are not yet supported' in str(
+        e.value)
+    # Should be: assert len(counts) == 1 and '111' in counts
+
+    @dataclass(slots=True)
+    class CustomIntAndListFloat:
+        array: list[float]
+        integer: int
+
+    @cudaq.kernel
+    def test(input: CustomIntAndListFloat):
+        qubits = cudaq.qvector(input.integer)
+        ry(input.array[1], qubits[0])
+        ry(input.array[3], qubits[2])
+        x.ctrl(qubits[0], qubits[1])
+
+    instance = CustomIntAndListFloat([0, np.pi, 0, np.pi], 3)
+    with pytest.raises(RuntimeError) as e:
+        counts = cudaq.sample(test, instance)
+    assert 'dynamically sized element types for function arguments are not yet supported' in str(
+        e.value)
+    # Should be: assert len(counts) == 1 and '111' in counts
+
+    @dataclass(slots=True)
+    class CustomIntAndListFloat:
+        integer: list[int]
+        array: list[int]
+
+    @cudaq.kernel
+    def test(input: CustomIntAndListFloat):
+        qubits = cudaq.qvector(input.integer[-1])
+        ry(input.array[1], qubits[0])
+        ry(input.array[3], qubits[2])
+        x.ctrl(qubits[0], qubits[1])
+
+    instance = CustomIntAndListFloat([3], [0, np.pi, 0, np.pi])
+    with pytest.raises(RuntimeError) as e:
+        counts = cudaq.sample(test, instance)
+    assert 'dynamically sized element types for function arguments are not yet supported' in str(
+        e.value)
+    # Should be: assert len(counts) == 1 and '111' in counts
 
     # Test that the class can be in a library
     # and the paths all work out
@@ -2151,8 +2313,8 @@ def less_arguments():
 
     with pytest.raises(RuntimeError) as error:
         print(less_arguments)
-    assert 'invalid number of arguments (1) passed to rx (requires at least 2 arguments)' in repr(
-        error)
+    assert 'missing value' in repr(error)
+    assert '(offending source -> rx(3.14))' in repr(error)
 
     @cudaq.kernel
     def 
wrong_arguments(): @@ -2161,7 +2323,8 @@ def wrong_arguments(): with pytest.raises(RuntimeError) as error: print(wrong_arguments) - assert 'rotational parameter must be a float, or int' in repr(error) + assert 'cannot convert value' in repr(error) + assert "(offending source -> rx('random_argument', q))" in repr(error) @cudaq.kernel def wrong_type(): @@ -2179,8 +2342,8 @@ def invalid_ctrl(): with pytest.raises(RuntimeError) as error: print(invalid_ctrl) - assert 'controlled operation requested without any control argument(s)' in repr( - error) + assert 'missing value' in repr(error) + assert '(offending source -> rx.ctrl(np.pi, q))' in repr(error) def test_control_then_adjoint(): @@ -2427,5 +2590,6 @@ def kernel(op: cudaq.pauli_word): # leave for gdb debugging if __name__ == "__main__": + test_custom_classical_kernel_type() loc = os.path.abspath(__file__) pytest.main([loc, "-rP"]) diff --git a/python/tests/kernel/test_run_async_kernel.py b/python/tests/kernel/test_run_async_kernel.py index e6699289a67..08eb3bbbfe9 100644 --- a/python/tests/kernel/test_run_async_kernel.py +++ b/python/tests/kernel/test_run_async_kernel.py @@ -313,7 +313,7 @@ def incrementer(i: int) -> int: @cudaq.kernel def kernel_with_list_arg(arg: list[int]) -> list[int]: - result = arg + result = [v for v in arg] for i in result: incrementer(i) return result @@ -355,7 +355,7 @@ def simple_list_bool(n: int) -> list[bool]: @cudaq.kernel def simple_list_bool_args(n: int, t: list[bool]) -> list[bool]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run_async(simple_list_bool_args, 2, [True, False, True], @@ -367,7 +367,7 @@ def simple_list_bool_args(n: int, t: list[bool]) -> list[bool]: @cudaq.kernel def simple_list_bool_args_no_broadcast(t: list[bool]) -> list[bool]: qubits = cudaq.qvector(2) - return t + return t.copy() results = cudaq.run_async(simple_list_bool_args_no_broadcast, [True, False, True], @@ -391,7 +391,7 @@ def simple_list_int_no_args() -> list[int]: @cudaq.kernel def simple_list_int(n: int, t: list[int]) -> list[int]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run_async(simple_list_int, 2, [-13, 5, 42], shots_count=2).get() @@ -414,7 +414,7 @@ def simple_list_int8_no_args() -> list[np.int8]: @cudaq.kernel def simple_list_int8(n: int, t: list[np.int8]) -> list[np.int8]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run_async(simple_list_int8, 2, [-13, 5, 42], shots_count=2).get() @@ -437,7 +437,7 @@ def simple_list_int16_no_args() -> list[np.int16]: @cudaq.kernel def simple_list_int16(n: int, t: list[np.int16]) -> list[np.int16]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run_async(simple_list_int16, 2, [-13, 5, 42], shots_count=2).get() @@ -460,7 +460,7 @@ def simple_list_int32_no_args() -> list[np.int32]: @cudaq.kernel def simple_list_int32(n: int, t: list[np.int32]) -> list[np.int32]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run_async(simple_list_int32, 2, [-13, 5, 42], shots_count=2).get() @@ -483,7 +483,7 @@ def simple_list_int64_no_args() -> list[np.int64]: @cudaq.kernel def simple_list_int64(n: int, t: list[np.int64]) -> list[np.int64]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run_async(simple_list_int64, 2, [-13, 5, 42], shots_count=2).get() @@ -506,7 +506,7 @@ def simple_list_float_no_args() -> list[float]: @cudaq.kernel def simple_list_float(n: int, t: list[float]) -> list[float]: qubits = cudaq.qvector(n) - return t + return t.copy() 
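Reviewer note: the recurring `return t` -> `return t.copy()` changes in this
file (and in test_run_kernel.py and test_direct_call_return_kernel.py above)
pin down the convention that a kernel must not return a list argument by
reference; it returns storage it owns. In isolation (hypothetical minimal
kernel):

    import cudaq

    @cudaq.kernel
    def echo(t: list[int]) -> list[int]:
        return t.copy()  # a bare `return t` would alias the caller's buffer

    assert cudaq.run(echo, [1, 2, 3], shots_count=1)[0] == [1, 2, 3]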
results = cudaq.run_async(simple_list_float, 2, [-13.2, 5.0, 42.99], @@ -530,7 +530,7 @@ def simple_list_float32_no_args() -> list[np.float32]: @cudaq.kernel def simple_list_float32(n: int, t: list[np.float32]) -> list[np.float32]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run_async(simple_list_float32, 2, [-13.2, 5.0, 42.99], @@ -554,7 +554,7 @@ def simple_list_float64_no_args() -> list[np.float64]: @cudaq.kernel def simple_list_float64(n: int, t: list[np.float64]) -> list[np.float64]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run_async(simple_list_float64, 2, [-13.2, 5.0, 42.99], @@ -564,11 +564,6 @@ def simple_list_float64(n: int, t: list[np.float64]) -> list[np.float64]: assert np.allclose(results[1], [-13.2, 5., 42.99]) -# Test tuples -# TODO: Define spec for using tuples in kernels -# https://github.com/NVIDIA/cuda-quantum/issues/3031 - - def test_return_tuple_int_float(): @cudaq.kernel @@ -600,19 +595,18 @@ def simple_tuple_int_float_assign( with pytest.raises(RuntimeError) as e: cudaq.run_async(simple_tuple_int_float_assign, 2, (-13, 11.5)) - assert 'indexing into tuple or dataclass must not modify value' in str( - e.value) + assert 'tuple value cannot be modified' in str(e.value) @cudaq.kernel - def simple_tuple_int_float_error( + def simple_tuple_int_float_conversion( n: int, t: tuple[int, float]) -> tuple[bool, float]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_tuple_int_float_error, 2, (-13, 11.5)) - assert 'cannot convert value of type !cc.struct<"tuple" {i64, f64}> to the requested type !cc.struct<"tuple" {i1, f64}>' in str( - e.value) + result = cudaq.run_async(simple_tuple_int_float_conversion, + 2, (-13, 42.3), + shots_count=1).get() + assert len(result) == 1 and result[0] == (True, 42.3) def test_return_tuple_float_int(): @@ -683,10 +677,9 @@ def test_return_tuple_int32_bool(): def simple_tuple_int32_bool_no_args() -> tuple[np.int32, bool]: return (-13, True) - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_tuple_int32_bool_no_args) - assert 'cannot convert value of type !cc.struct<"tuple" {i64, i1}> to the requested type !cc.struct<"tuple" {i32, i1}>' in str( - e.value) + result = cudaq.run_async(simple_tuple_int32_bool_no_args, + shots_count=1).get() + assert len(result) == 1 and result[0] == (-13, True) @cudaq.kernel def simple_tuple_int32_bool_no_args1() -> tuple[np.int32, bool]: @@ -770,17 +763,16 @@ def simple_dataclass_int_bool_error() -> MyClass: with pytest.raises(RuntimeError) as e: cudaq.run_async(simple_dataclass_int_bool_error, shots_count=2).get() - assert 'invalid number of arguments passed in call to MyClass (0 vs required 2)' in repr( - e) + assert 'keyword arguments for data classes are not yet supported' in repr(e) @cudaq.kernel - def simple_dataclass_int_bool_error() -> MyClass: - return MyClass(x=0.13, y=True) + def simple_dataclass_int_bool() -> MyClass: + return MyClass(2.13, True) - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_dataclass_int_bool_error, shots_count=2).get() - assert 'invalid number of arguments passed in call to MyClass (0 vs required 2)' in repr( - e) + results = cudaq.run_async(simple_dataclass_int_bool, shots_count=2).get() + assert len(results) == 2 + assert results[0] == MyClass(2, True) + assert results[1] == MyClass(2, True) def test_return_dataclass_bool_int(): @@ -956,12 +948,13 @@ class MyClass: y: bool @cudaq.kernel - def simple_strucA(t: MyClass) -> MyClass: + def 
simple_structA(arg: MyClass) -> MyClass: q = cudaq.qubit() + t = arg.copy() t.x = 42 return t - results = cudaq.run_async(simple_strucA, MyClass(-13, True), + results = cudaq.run_async(simple_structA, MyClass(-13, True), shots_count=2).get() print(results) assert len(results) == 2 @@ -975,14 +968,16 @@ class Foo: z: int @cudaq.kernel - def kerneB(t: Foo) -> Foo: + def kernelB(arg: Foo) -> Foo: q = cudaq.qubit() + t = arg.copy() t.z = 100 t.y = 3.14 t.x = True return t - results = cudaq.run_async(kerneB, Foo(False, 6.28, 17), shots_count=2).get() + results = cudaq.run_async(kernelB, Foo(False, 6.28, 17), + shots_count=2).get() print(results) assert len(results) == 2 assert results[0] == Foo(True, 3.14, 100) diff --git a/python/tests/kernel/test_run_kernel.py b/python/tests/kernel/test_run_kernel.py index 3a0d809e6ac..afcd36dd0ce 100644 --- a/python/tests/kernel/test_run_kernel.py +++ b/python/tests/kernel/test_run_kernel.py @@ -308,7 +308,7 @@ def incrementer(i: int) -> int: @cudaq.kernel def kernel_with_list_arg(arg: list[int]) -> list[int]: - result = arg + result = arg.copy() for i in result: incrementer(i) return result @@ -323,7 +323,7 @@ def caller_kernel(arg: list[int]) -> int: results = cudaq.run(caller_kernel, [4, 5, 6], shots_count=1) assert len(results) == 1 - assert results[0] == 15 # 4+1 + 5+1 + 6+1 = 15 + assert results[0] == 15 # 4 + 5 + 6 = 15 def test_return_list_bool(): @@ -350,7 +350,7 @@ def simple_list_bool(n: int) -> list[bool]: @cudaq.kernel def simple_list_bool_args(n: int, t: list[bool]) -> list[bool]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run(simple_list_bool_args, 2, [True, False, True], @@ -362,7 +362,7 @@ def simple_list_bool_args(n: int, t: list[bool]) -> list[bool]: @cudaq.kernel def simple_list_bool_args_no_broadcast(t: list[bool]) -> list[bool]: qubits = cudaq.qvector(2) - return t + return t.copy() results = cudaq.run(simple_list_bool_args_no_broadcast, [True, False, True], shots_count=2) @@ -385,7 +385,7 @@ def simple_list_int_no_args() -> list[int]: @cudaq.kernel def simple_list_int(n: int, t: list[int]) -> list[int]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run(simple_list_int, 2, [-13, 5, 42], shots_count=2) assert len(results) == 2 @@ -407,7 +407,7 @@ def simple_list_int8_no_args() -> list[np.int8]: @cudaq.kernel def simple_list_int8(n: int, t: list[np.int8]) -> list[np.int8]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run(simple_list_int8, 2, [-13, 5, 42], shots_count=2) assert len(results) == 2 @@ -429,7 +429,7 @@ def simple_list_int16_no_args() -> list[np.int16]: @cudaq.kernel def simple_list_int16(n: int, t: list[np.int16]) -> list[np.int16]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run(simple_list_int16, 2, [-13, 5, 42], shots_count=2) assert len(results) == 2 @@ -451,7 +451,7 @@ def simple_list_int32_no_args() -> list[np.int32]: @cudaq.kernel def simple_list_int32(n: int, t: list[np.int32]) -> list[np.int32]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run(simple_list_int32, 2, [-13, 5, 42], shots_count=2) assert len(results) == 2 @@ -473,7 +473,7 @@ def simple_list_int64_no_args() -> list[np.int64]: @cudaq.kernel def simple_list_int64(n: int, t: list[np.int64]) -> list[np.int64]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run(simple_list_int64, 2, [-13, 5, 42], shots_count=2) assert len(results) == 2 @@ -495,7 +495,7 @@ def simple_list_float_no_args() -> list[float]: 
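Reviewer note: simple_tuple_int_float_conversion above (and its cudaq.run twin
later in this diff) documents the element-wise conversion of returned tuples
to the annotated return type
(https://github.com/NVIDIA/cuda-quantum/issues/3524). Per element these are
the ordinary Python casts:

    assert bool(-13) == True and float(42.3) == 42.3
    # so a tuple[int, float] value (-13, 42.3) returned through a
    # tuple[bool, float] annotation comes back as (True, 42.3)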
@cudaq.kernel def simple_list_float(n: int, t: list[float]) -> list[float]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run(simple_list_float, 2, [-13.2, 5.0, 42.99], @@ -519,7 +519,7 @@ def simple_list_float32_no_args() -> list[np.float32]: @cudaq.kernel def simple_list_float32(n: int, t: list[np.float32]) -> list[np.float32]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run(simple_list_float32, 2, [-13.2, 5.0, 42.99], @@ -543,7 +543,7 @@ def simple_list_float64_no_args() -> list[np.float64]: @cudaq.kernel def simple_list_float64(n: int, t: list[np.float64]) -> list[np.float64]: qubits = cudaq.qvector(n) - return t + return t.copy() results = cudaq.run(simple_list_float64, 2, [-13.2, 5.0, 42.99], @@ -559,19 +559,19 @@ def test_return_list_large_size(): @cudaq.kernel def kernel_with_dynamic_int_array_input(n: int, t: list[int]) -> list[int]: qubits = cudaq.qvector(n) - return t + return t.copy() @cudaq.kernel def kernel_with_dynamic_float_array_input(n: int, t: list[float]) -> list[float]: qubits = cudaq.qvector(n) - return t + return t.copy() @cudaq.kernel def kernel_with_dynamic_bool_array_input(n: int, t: list[bool]) -> list[bool]: qubits = cudaq.qvector(n) - return t + return t.copy() # Test with various sizes (validate dynamic output logging) for array_size in [10, 15, 100, 167, 1000]: @@ -631,11 +631,6 @@ def measure_all_qubits(numQubits: int) -> list[bool]: assert res[i] == False -# Test tuples -# TODO: Define spec for using tuples in kernels -# https://github.com/NVIDIA/cuda-quantum/issues/3031 - - def test_return_tuple_int_float(): @cudaq.kernel @@ -664,19 +659,18 @@ def simple_tuple_int_float_assign( with pytest.raises(RuntimeError) as e: cudaq.run(simple_tuple_int_float_assign, 2, (-13, 11.5)) - assert 'indexing into tuple or dataclass must not modify value' in str( - e.value) + assert 'tuple value cannot be modified' in str(e.value) @cudaq.kernel - def simple_tuple_int_float_error( + def simple_tuple_int_float_conversion( n: int, t: tuple[int, float]) -> tuple[bool, float]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_tuple_int_float_error, 2, (-13, 11.5)) - assert 'cannot convert value of type !cc.struct<"tuple" {i64, f64}> to the requested type !cc.struct<"tuple" {i1, f64}>' in str( - e.value) + result = cudaq.run(simple_tuple_int_float_conversion, + 2, (-13, 42.3), + shots_count=1) + assert len(result) == 1 and result[0] == (True, 42.3) def test_return_tuple_float_int(): @@ -740,10 +734,8 @@ def test_return_tuple_int32_bool(): def simple_tuple_int32_bool_no_args() -> tuple[np.int32, bool]: return (-13, True) - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_tuple_int32_bool_no_args) - assert 'cannot convert value of type !cc.struct<"tuple" {i64, i1}> to the requested type !cc.struct<"tuple" {i32, i1}>' in str( - e.value) + result = cudaq.run(simple_tuple_int32_bool_no_args, shots_count=1) + assert len(result) == 1 and result[0] == (-13, True) @cudaq.kernel def simple_tuple_int32_bool_no_args1() -> tuple[np.int32, bool]: @@ -987,12 +979,30 @@ class MyClass: y: bool @cudaq.kernel - def simple_strucA(t: MyClass) -> MyClass: + def simple_struc_err(t: MyClass) -> MyClass: + q = cudaq.qubit() + # If we allowed this, the expected behavior for Python + # would be that t is modified also in the caller without + # having to return it. We hence give an error to make it + # clear that changes to structs don't propagate past + # function boundaries. 
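+        # (Reviewer note: in plain Python,
+        #     obj = MyClass(-13, True); simple_struc_err(obj)
+        # would leave obj.x == 42 after the call, since arguments are
+        # passed by reference. Kernel arguments are values, hence the
+        # error checked below.)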
+ t.x = 42 + return t + + with pytest.raises(RuntimeError) as e: + cudaq.run(simple_struc_err, MyClass(-13, True), shots_count=2) + assert 'value cannot be modified - use `.copy(deep)` to create a new value that can be modified' in repr( + e) + assert '(offending source -> t.x)' in repr(e) + + @cudaq.kernel + def simple_structA(arg: MyClass) -> MyClass: q = cudaq.qubit() + t = arg.copy() t.x = 42 return t - results = cudaq.run(simple_strucA, MyClass(-13, True), shots_count=2) + results = cudaq.run(simple_structA, MyClass(-13, True), shots_count=2) print(results) assert len(results) == 2 assert results[0] == MyClass(42, True) @@ -1005,14 +1015,15 @@ class Foo: z: int @cudaq.kernel - def kerneB(t: Foo) -> Foo: + def kernelB(arg: Foo) -> Foo: q = cudaq.qubit() + t = arg.copy() t.z = 100 t.y = 3.14 t.x = True return t - results = cudaq.run(kerneB, Foo(False, 6.28, 17), shots_count=2) + results = cudaq.run(kernelB, Foo(False, 6.28, 17), shots_count=2) print(results) assert len(results) == 2 assert results[0] == Foo(True, 3.14, 100) diff --git a/python/tests/mlir/ast_attributes.py b/python/tests/mlir/ast_attributes.py index 75193fd61d1..6c459051bbf 100644 --- a/python/tests/mlir/ast_attributes.py +++ b/python/tests/mlir/ast_attributes.py @@ -106,8 +106,8 @@ def kernel3(): # CHECK-LABEL: Failure kernel1: -# CHECK: invalid CUDA-Q attribute assignment -# CHECK-NEXT: (offending source -> l.size = 4) +# CHECK: attribute expression does not produce a modifiable value +# CHECK-NEXT: (offending source -> l.size) # CHECK-LABEL: Failure kernel2: # CHECK: CUDA-Q does not allow dynamic resizing or lists, arrays, or qvectors. diff --git a/python/tests/mlir/ast_break.py b/python/tests/mlir/ast_break.py index b7af2f06d94..68c92f793e3 100644 --- a/python/tests/mlir/ast_break.py +++ b/python/tests/mlir/ast_break.py @@ -9,6 +9,7 @@ # RUN: PYTHONPATH=../../ pytest -rP %s | FileCheck %s import cudaq +import numpy def test_break(): @@ -22,8 +23,8 @@ def kernel(x: float): break ry(x, q[i % 4]) - print(kernel) kernel(1.2) + print(kernel) # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel( @@ -42,6 +43,8 @@ def kernel(x: float): # CHECK: cc.condition %[[VAL_11]](%[[VAL_10]] : i64) # CHECK: } do { # CHECK: ^bb0(%[[VAL_12:.*]]: i64): +# CHECK: %[[VAL_23:.*]] = cc.alloca i64 +# CHECK: cc.store %[[VAL_12]], %[[VAL_23]] : !cc.ptr # CHECK: %[[VAL_13:.*]] = cc.load %[[VAL_7]] : !cc.ptr # CHECK: %[[VAL_14:.*]] = math.fpowi %[[VAL_13]], %[[VAL_2]] : f64, i64 # CHECK: %[[VAL_15:.*]] = arith.addf %[[VAL_13]], %[[VAL_14]] : f64 @@ -53,7 +56,8 @@ def kernel(x: float): # CHECK: cc.break %[[VAL_12]] : i64 # CHECK: ^bb2: # CHECK: %[[VAL_18:.*]] = cc.load %[[VAL_7]] : !cc.ptr -# CHECK: %[[VAL_19:.*]] = arith.remui %[[VAL_12]], %[[VAL_6]] : i64 +# CHECK: %[[VAL_24:.*]] = cc.load %[[VAL_23]] : !cc.ptr +# CHECK: %[[VAL_19:.*]] = arith.remui %[[VAL_24]], %[[VAL_6]] : i64 # CHECK: %[[VAL_20:.*]] = quake.extract_ref %[[VAL_8]]{{\[}}%[[VAL_19]]] : (!quake.veq<4>, i64) -> !quake.ref # CHECK: quake.ry (%[[VAL_18]]) %[[VAL_20]] : (f64, !quake.ref) -> () # CHECK: cc.continue %[[VAL_12]] : i64 @@ -61,6 +65,24 @@ def kernel(x: float): # CHECK: ^bb0(%[[VAL_21:.*]]: i64): # CHECK: %[[VAL_22:.*]] = arith.addi %[[VAL_21]], %[[VAL_3]] : i64 # CHECK: cc.continue %[[VAL_22]] : i64 -# CHECK: } {invariant} +# CHECK: } # CHECK: return # CHECK: } + + +def test_break2(): + + @cudaq.kernel(verbose=False) + def kernel(x: float): + qs = cudaq.qvector(6) + for idx, q in enumerate(qs): + if idx > 2: + break + ry(x, q) + + res = cudaq.sample(kernel, numpy.pi) + 
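+    # Reviewer note: the expectation follows from plain Python iteration
+    # order; the loop body runs exactly for the indices
+    #     [i for i in range(6) if i <= 2] == [0, 1, 2]
+    # (break fires at idx == 3), so qubits 0-2 read '1': '111000'.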
assert len(res) == 1 and '111000' in res + print(f"output test1: {res}") + + +# CHECK-LABEL: output test1: { 111000:1000 } diff --git a/python/tests/mlir/ast_compute_action.py b/python/tests/mlir/ast_compute_action.py index a59b98e548d..2525593b844 100644 --- a/python/tests/mlir/ast_compute_action.py +++ b/python/tests/mlir/ast_compute_action.py @@ -50,7 +50,7 @@ def compute(): # CHECK: ^bb0(%[[VAL_15:.*]]: i64): # CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_15]], %[[VAL_3]] : i64 # CHECK: cc.continue %[[VAL_16]] : i64 -# CHECK: } {invariant} +# CHECK: } # CHECK: %[[VAL_17:.*]] = cc.loop while ((%[[VAL_18:.*]] = %[[VAL_2]]) -> (i64)) { # CHECK: %[[VAL_19:.*]] = arith.cmpi slt, %[[VAL_18]], %[[VAL_4]] : i64 # CHECK: cc.condition %[[VAL_19]](%[[VAL_18]] : i64) @@ -63,7 +63,7 @@ def compute(): # CHECK: ^bb0(%[[VAL_22:.*]]: i64): # CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_22]], %[[VAL_3]] : i64 # CHECK: cc.continue %[[VAL_23]] : i64 -# CHECK: } {invariant} +# CHECK: } # CHECK: } : !cc.callable<() -> ()> # CHECK: %[[VAL_24:.*]] = cc.create_lambda { # CHECK: quake.z {{\[}}%[[VAL_6]]] %[[VAL_8]] : (!quake.veq, !quake.ref) -> () diff --git a/python/tests/mlir/ast_conditionals.py b/python/tests/mlir/ast_conditionals.py index 91198cf4cc6..f6a383420f2 100644 --- a/python/tests/mlir/ast_conditionals.py +++ b/python/tests/mlir/ast_conditionals.py @@ -247,6 +247,22 @@ def test17(): # CHECK-LABEL: func.func @__nvqpp__mlirgen__test17() attributes {"cudaq-entrypoint", "cudaq-kernel", qubitMeasurementFeedback = true} { + @cudaq.kernel + def test18() -> int: + q = cudaq.qvector(6) + x(q[0:3]) + res = [mz(r) for r in q] + ires = 0 + for idx, v in enumerate(res): + ires = ires | (int(v) << idx) + if ires: + x(q) + return ires + + print(test18) + + # CHECK-LABEL: func.func @__nvqpp__mlirgen__test18() -> i64 attributes {"cudaq-entrypoint", "cudaq-kernel", qubitMeasurementFeedback = true} + # leave for gdb debugging if __name__ == "__main__": diff --git a/python/tests/mlir/ast_continue.py b/python/tests/mlir/ast_continue.py index 92e4210490c..940d053affd 100644 --- a/python/tests/mlir/ast_continue.py +++ b/python/tests/mlir/ast_continue.py @@ -9,6 +9,7 @@ # RUN: PYTHONPATH=../../ pytest -rP %s | FileCheck %s import cudaq +import numpy def test_continue(): @@ -43,6 +44,8 @@ def kernel(x: float): # CHECK: cc.condition %[[VAL_11]](%[[VAL_10]] : i64) # CHECK: } do { # CHECK: ^bb0(%[[VAL_12:.*]]: i64): +# CHECK: %[[VAL_25:.*]] = cc.alloca i64 +# CHECK: cc.store %[[VAL_12]], %[[VAL_25]] : !cc.ptr # CHECK: %[[VAL_13:.*]] = cc.load %[[VAL_7]] : !cc.ptr # CHECK: %[[VAL_14:.*]] = math.fpowi %[[VAL_13]], %[[VAL_2]] : f64, i64 # CHECK: %[[VAL_15:.*]] = arith.addf %[[VAL_13]], %[[VAL_14]] : f64 @@ -51,13 +54,15 @@ def kernel(x: float): # CHECK: %[[VAL_17:.*]] = arith.cmpf ogt, %[[VAL_16]], %[[VAL_1]] : f64 # CHECK: cf.cond_br %[[VAL_17]], ^bb1, ^bb2 # CHECK: ^bb1: -# CHECK: %[[VAL_18:.*]] = arith.remui %[[VAL_12]], %[[VAL_6]] : i64 +# CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr +# CHECK: %[[VAL_18:.*]] = arith.remui %[[VAL_26]], %[[VAL_6]] : i64 # CHECK: %[[VAL_19:.*]] = quake.extract_ref %[[VAL_8]]{{\[}}%[[VAL_18]]] : (!quake.veq<4>, i64) -> !quake.ref # CHECK: quake.x %[[VAL_19]] : (!quake.ref) -> () # CHECK: cc.continue %[[VAL_12]] : i64 # CHECK: ^bb2: # CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_7]] : !cc.ptr -# CHECK: %[[VAL_21:.*]] = arith.remui %[[VAL_12]], %[[VAL_6]] : i64 +# CHECK: %[[VAL_27:.*]] = cc.load %[[VAL_25]] : !cc.ptr +# CHECK: %[[VAL_21:.*]] = arith.remui %[[VAL_27]], %[[VAL_6]] : i64 # CHECK: 
%[[VAL_22:.*]] = quake.extract_ref %[[VAL_8]]{{\[}}%[[VAL_21]]] : (!quake.veq<4>, i64) -> !quake.ref # CHECK: quake.ry (%[[VAL_20]]) %[[VAL_22]] : (f64, !quake.ref) -> () # CHECK: cc.continue %[[VAL_12]] : i64 @@ -65,6 +70,24 @@ def kernel(x: float): # CHECK: ^bb0(%[[VAL_23:.*]]: i64): # CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_23]], %[[VAL_3]] : i64 # CHECK: cc.continue %[[VAL_24]] : i64 -# CHECK: } {invariant} +# CHECK: } # CHECK: return # CHECK: } + + +def test_continue2(): + + @cudaq.kernel(verbose=False) + def kernel(x: float): + qs = cudaq.qvector(6) + for idx, q in enumerate(qs): + if idx < 3: + continue + ry(x, q) + + res = cudaq.sample(kernel, numpy.pi) + assert len(res) == 1 and '000111' in res + print(f"output test1: {res}") + + +# CHECK-LABEL: output test1: { 000111:1000 } diff --git a/python/tests/mlir/ast_decrementing_range.py b/python/tests/mlir/ast_decrementing_range.py index fa65f383b44..9fe0545c3d7 100644 --- a/python/tests/mlir/ast_decrementing_range.py +++ b/python/tests/mlir/ast_decrementing_range.py @@ -19,33 +19,36 @@ def test(q: int, p: int): for k in range(q, p, -1): x(qubits[k]) - print(test) test(2, 0) + print(test) # CHECK-LABEL: func.func @__nvqpp__mlirgen__test( # CHECK-SAME: %[[VAL_0:.*]]: i64, # CHECK-SAME: %[[VAL_1:.*]]: i64) attributes {"cudaq-entrypoint", "cudaq-kernel"} { # CHECK: %[[VAL_2:.*]] = arith.constant -1 : i64 -# CHECK: %[[VAL_3:.*]] = cc.alloca i64 -# CHECK: cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr -# CHECK: %[[VAL_4:.*]] = cc.alloca i64 -# CHECK: cc.store %[[VAL_1]], %[[VAL_4]] : !cc.ptr +# CHECK: %[[VAL_15:.*]] = cc.alloca i64 +# CHECK: cc.store %[[VAL_0]], %[[VAL_15]] : !cc.ptr +# CHECK: %[[VAL_16:.*]] = cc.alloca i64 +# CHECK: cc.store %[[VAL_1]], %[[VAL_16]] : !cc.ptr # CHECK: %[[VAL_5:.*]] = quake.alloca !quake.veq<5> -# CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_3]] : !cc.ptr -# CHECK: %[[VAL_7:.*]] = cc.load %[[VAL_4]] : !cc.ptr -# CHECK: %[[VAL_8:.*]] = cc.loop while ((%[[VAL_9:.*]] = %[[VAL_6]]) -> (i64)) { -# CHECK: %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_9]], %[[VAL_7]] : i64 +# CHECK: %[[VAL_17:.*]] = cc.load %[[VAL_15]] : !cc.ptr +# CHECK: %[[VAL_18:.*]] = cc.load %[[VAL_16]] : !cc.ptr +# CHECK: %[[VAL_8:.*]] = cc.loop while ((%[[VAL_9:.*]] = %[[VAL_17]]) -> (i64)) { +# CHECK: %[[VAL_10:.*]] = arith.cmpi sgt, %[[VAL_9]], %[[VAL_18]] : i64 # CHECK: cc.condition %[[VAL_10]](%[[VAL_9]] : i64) # CHECK: } do { # CHECK: ^bb0(%[[VAL_11:.*]]: i64): -# CHECK: %[[VAL_12:.*]] = quake.extract_ref %[[VAL_5]]{{\[}}%[[VAL_11]]] : (!quake.veq<5>, i64) -> !quake.ref +# CHECK: %[[VAL_3:.*]] = cc.alloca i64 +# CHECK: cc.store %[[VAL_11]], %[[VAL_3]] : !cc.ptr +# CHECK: %[[VAL_6:.*]] = cc.load %[[VAL_3]] : !cc.ptr +# CHECK: %[[VAL_12:.*]] = quake.extract_ref %[[VAL_5]]{{\[}}%[[VAL_6]]] : (!quake.veq<5>, i64) -> !quake.ref # CHECK: quake.x %[[VAL_12]] : (!quake.ref) -> () # CHECK: cc.continue %[[VAL_11]] : i64 # CHECK: } step { # CHECK: ^bb0(%[[VAL_13:.*]]: i64): # CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_13]], %[[VAL_2]] : i64 # CHECK: cc.continue %[[VAL_14]] : i64 -# CHECK: } {invariant} +# CHECK: } # CHECK: return # CHECK: } diff --git a/python/tests/mlir/ast_elif.py b/python/tests/mlir/ast_elif.py index 1fe3dfb190d..4725dd13786 100644 --- a/python/tests/mlir/ast_elif.py +++ b/python/tests/mlir/ast_elif.py @@ -33,9 +33,9 @@ def cost(thetas: np.ndarray): # can pass 1D ndarray or list # CHECK-SAME: %[[VAL_0:.*]]: !cc.stdvec) attributes {"cudaq-entrypoint", "cudaq-kernel"} { # CHECK: %[[VAL_1:.*]] = arith.constant 4 : i64 # CHECK: %[[VAL_2:.*]] = 
quake.alloca !quake.veq{{\[}}%[[VAL_1]] : i64] -# CHECK: %[[VAL_3:.*]] = cc.stdvec_size %[[VAL_0]] : (!cc.stdvec) -> i64 # CHECK: %[[VAL_4:.*]] = arith.constant 0 : i64 # CHECK: %[[VAL_5:.*]] = arith.constant 1 : i64 +# CHECK: %[[VAL_3:.*]] = cc.stdvec_size %[[VAL_0]] : (!cc.stdvec) -> i64 # CHECK: %[[VAL_6:.*]] = cc.loop while ((%[[VAL_7:.*]] = %[[VAL_4]]) -> (i64)) { # CHECK: %[[VAL_8:.*]] = arith.cmpi slt, %[[VAL_7]], %[[VAL_3]] : i64 # CHECK: cc.condition %[[VAL_8]](%[[VAL_7]] : i64) @@ -43,30 +43,38 @@ def cost(thetas: np.ndarray): # can pass 1D ndarray or list # CHECK: ^bb0(%[[VAL_9:.*]]: i64): # CHECK: %[[VAL_10:.*]] = cc.stdvec_data %[[VAL_0]] : (!cc.stdvec) -> !cc.ptr> # CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_10]]{{\[}}%[[VAL_9]]] : (!cc.ptr>, i64) -> !cc.ptr +# CHECK: %[[VAL_17:.*]] = cc.load %[[VAL_11]] : !cc.ptr +# CHECK: %[[VAL_30:.*]] = cc.alloca i64 +# CHECK: cc.store %[[VAL_9]], %[[VAL_30]] : !cc.ptr +# CHECK: %[[VAL_31:.*]] = cc.alloca f64 +# CHECK: cc.store %[[VAL_17]], %[[VAL_31]] : !cc.ptr +# CHECK: %[[VAL_32:.*]] = cc.load %[[VAL_30]] : !cc.ptr # CHECK: %[[VAL_12:.*]] = arith.constant 2.000000e+00 : f64 -# CHECK: %[[VAL_13:.*]] = cc.cast signed %[[VAL_9]] : (i64) -> f64 +# CHECK: %[[VAL_13:.*]] = cc.cast signed %[[VAL_32]] : (i64) -> f64 # CHECK: %[[VAL_14:.*]] = arith.remf %[[VAL_13]], %[[VAL_12]] : f64 # CHECK: %[[VAL_15:.*]] = arith.constant 0.000000e+00 : f64 # CHECK: %[[VAL_16:.*]] = arith.cmpf une, %[[VAL_14]], %[[VAL_15]] : f64 # CHECK: cc.if(%[[VAL_16]]) { -# CHECK: %[[VAL_17:.*]] = cc.load %[[VAL_11]] : !cc.ptr +# CHECK: %[[VAL_33:.*]] = cc.load %[[VAL_31]] : !cc.ptr +# CHECK: %[[VAL_34:.*]] = cc.load %[[VAL_30]] : !cc.ptr # CHECK: %[[VAL_18:.*]] = arith.constant 4 : i64 -# CHECK: %[[VAL_19:.*]] = arith.remui %[[VAL_9]], %[[VAL_18]] : i64 +# CHECK: %[[VAL_19:.*]] = arith.remui %[[VAL_34]], %[[VAL_18]] : i64 # CHECK: %[[VAL_20:.*]] = quake.extract_ref %[[VAL_2]]{{\[}}%[[VAL_19]]] : (!quake.veq, i64) -> !quake.ref -# CHECK: quake.ry (%[[VAL_17]]) %[[VAL_20]] : (f64, !quake.ref) -> () +# CHECK: quake.ry (%[[VAL_33]]) %[[VAL_20]] : (f64, !quake.ref) -> () # CHECK: } else { -# CHECK: %[[VAL_21:.*]] = cc.load %[[VAL_11]] : !cc.ptr +# CHECK: %[[VAL_33:.*]] = cc.load %[[VAL_31]] : !cc.ptr +# CHECK: %[[VAL_34:.*]] = cc.load %[[VAL_30]] : !cc.ptr # CHECK: %[[VAL_22:.*]] = arith.constant 4 : i64 -# CHECK: %[[VAL_23:.*]] = arith.remui %[[VAL_9]], %[[VAL_22]] : i64 +# CHECK: %[[VAL_23:.*]] = arith.remui %[[VAL_34]], %[[VAL_22]] : i64 # CHECK: %[[VAL_24:.*]] = quake.extract_ref %[[VAL_2]]{{\[}}%[[VAL_23]]] : (!quake.veq, i64) -> !quake.ref -# CHECK: quake.rx (%[[VAL_21]]) %[[VAL_24]] : (f64, !quake.ref) -> () +# CHECK: quake.rx (%[[VAL_33]]) %[[VAL_24]] : (f64, !quake.ref) -> () # CHECK: } # CHECK: cc.continue %[[VAL_9]] : i64 # CHECK: } step { # CHECK: ^bb0(%[[VAL_25:.*]]: i64): # CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_25]], %[[VAL_5]] : i64 # CHECK: cc.continue %[[VAL_26]] : i64 -# CHECK: } {invariant} +# CHECK: } # CHECK: return # CHECK: } @@ -86,25 +94,33 @@ def cost(thetas: np.ndarray): # can pass 1D ndarray or list # CHECK: ^bb0(%[[VAL_10:.*]]: i64): # CHECK: %[[VAL_11:.*]] = cc.stdvec_data %[[VAL_0]] : (!cc.stdvec) -> !cc.ptr> # CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_11]]{{\[}}%[[VAL_10]]] : (!cc.ptr>, i64) -> !cc.ptr -# CHECK: %[[VAL_13:.*]] = cc.cast signed %[[VAL_10]] : (i64) -> f64 +# CHECK: %[[VAL_16:.*]] = cc.load %[[VAL_12]] : !cc.ptr +# CHECK: %[[VAL_30:.*]] = cc.alloca i64 +# CHECK: cc.store %[[VAL_10]], %[[VAL_30]] : !cc.ptr +# CHECK: 
%[[VAL_31:.*]] = cc.alloca f64 +# CHECK: cc.store %[[VAL_16]], %[[VAL_31]] : !cc.ptr +# CHECK: %[[VAL_32:.*]] = cc.load %[[VAL_30]] : !cc.ptr +# CHECK: %[[VAL_13:.*]] = cc.cast signed %[[VAL_32]] : (i64) -> f64 # CHECK: %[[VAL_14:.*]] = arith.remf %[[VAL_13]], %[[VAL_1]] : f64 # CHECK: %[[VAL_15:.*]] = arith.cmpf une, %[[VAL_14]], %[[VAL]] : f64 # CHECK: cc.if(%[[VAL_15]]) { -# CHECK: %[[VAL_16:.*]] = cc.load %[[VAL_12]] : !cc.ptr -# CHECK: %[[VAL_17:.*]] = arith.remui %[[VAL_10]], %[[VAL_4]] : i64 +# CHECK: %[[VAL_33:.*]] = cc.load %[[VAL_31]] : !cc.ptr +# CHECK: %[[VAL_34:.*]] = cc.load %[[VAL_30]] : !cc.ptr +# CHECK: %[[VAL_17:.*]] = arith.remui %[[VAL_34]], %[[VAL_4]] : i64 # CHECK: %[[VAL_18:.*]] = quake.extract_ref %[[VAL_5]]{{\[}}%[[VAL_17]]] : (!quake.veq<4>, i64) -> !quake.ref -# CHECK: quake.ry (%[[VAL_16]]) %[[VAL_18]] : (f64, !quake.ref) -> () +# CHECK: quake.ry (%[[VAL_33]]) %[[VAL_18]] : (f64, !quake.ref) -> () # CHECK: } else { -# CHECK: %[[VAL_19:.*]] = cc.load %[[VAL_12]] : !cc.ptr -# CHECK: %[[VAL_20:.*]] = arith.remui %[[VAL_10]], %[[VAL_4]] : i64 +# CHECK: %[[VAL_33:.*]] = cc.load %[[VAL_31]] : !cc.ptr +# CHECK: %[[VAL_34:.*]] = cc.load %[[VAL_30]] : !cc.ptr +# CHECK: %[[VAL_20:.*]] = arith.remui %[[VAL_34]], %[[VAL_4]] : i64 # CHECK: %[[VAL_21:.*]] = quake.extract_ref %[[VAL_5]]{{\[}}%[[VAL_20]]] : (!quake.veq<4>, i64) -> !quake.ref -# CHECK: quake.rx (%[[VAL_19]]) %[[VAL_21]] : (f64, !quake.ref) -> () +# CHECK: quake.rx (%[[VAL_33]]) %[[VAL_21]] : (f64, !quake.ref) -> () # CHECK: } # CHECK: cc.continue %[[VAL_10]] : i64 # CHECK: } step { # CHECK: ^bb0(%[[VAL_22:.*]]: i64): # CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_22]], %[[VAL_2]] : i64 # CHECK: cc.continue %[[VAL_23]] : i64 -# CHECK: } {invariant} +# CHECK: } # CHECK: return # CHECK: } \ No newline at end of file diff --git a/python/tests/mlir/ast_for_stdvec.py b/python/tests/mlir/ast_for_stdvec.py index f75a7a9887d..9503ed77748 100644 --- a/python/tests/mlir/ast_for_stdvec.py +++ b/python/tests/mlir/ast_for_stdvec.py @@ -43,9 +43,12 @@ def cost(thetas: np.ndarray): # can pass 1D ndarray or list # CHECK: %[[VAL_10:.*]] = cc.stdvec_data %[[VAL_0]] : (!cc.stdvec) -> !cc.ptr> # CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_10]][%[[VAL_9]]] : (!cc.ptr>, i64) -> !cc.ptr # CHECK: %[[VAL_12:.*]] = cc.load %[[VAL_11]] : !cc.ptr +# CHECK: %[[VAL_20:.*]] = cc.alloca f64 +# CHECK: cc.store %[[VAL_12]], %[[VAL_20]] : !cc.ptr +# CHECK: %[[VAL_21:.*]] = cc.load %[[VAL_20]] : !cc.ptr # CHECK: %[[VAL_13:.*]] = cc.load %[[VAL_4]] : !cc.ptr # CHECK: %[[VAL_14:.*]] = quake.extract_ref %[[VAL_3]][%[[VAL_13]]] : (!quake.veq<4>, i64) -> !quake.ref -# CHECK: quake.ry (%[[VAL_12]]) %[[VAL_14]] : (f64, !quake.ref) -> () +# CHECK: quake.ry (%[[VAL_21]]) %[[VAL_14]] : (f64, !quake.ref) -> () # CHECK: %[[VAL_15:.*]] = cc.load %[[VAL_4]] : !cc.ptr # CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_15]], %[[VAL_1]] : i64 # CHECK: cc.store %[[VAL_16]], %[[VAL_4]] : !cc.ptr @@ -54,6 +57,6 @@ def cost(thetas: np.ndarray): # can pass 1D ndarray or list # CHECK: ^bb0(%[[VAL_17:.*]]: i64): # CHECK: %[[VAL_18:.*]] = arith.addi %[[VAL_17]], %[[VAL_1]] : i64 # CHECK: cc.continue %[[VAL_18]] : i64 -# CHECK: } {invariant} +# CHECK: } # CHECK: return # CHECK: } diff --git a/python/tests/mlir/ast_iterate_loop_init.py b/python/tests/mlir/ast_iterate_loop_init.py index 565f06bfa40..465265f2d06 100644 --- a/python/tests/mlir/ast_iterate_loop_init.py +++ b/python/tests/mlir/ast_iterate_loop_init.py @@ -50,12 +50,16 @@ def kernel(x: float): # CHECK: 
^bb0(%[[VAL_16:.*]]: i64): # CHECK: %[[VAL_18:.*]] = cc.compute_ptr %[[VAL_8]][%[[VAL_16]]] : (!cc.ptr>, i64) -> !cc.ptr # CHECK: %[[VAL_19:.*]] = cc.load %[[VAL_18]] : !cc.ptr +# CHECK: %[[VAL_30:.*]] = cc.alloca i64 +# CHECK: cc.store %[[VAL_19]], %[[VAL_30]] : !cc.ptr # CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_6]] : !cc.ptr -# CHECK: %[[VAL_21:.*]] = cc.cast signed %[[VAL_19]] : (i64) -> f64 +# CHECK: %[[VAL_31:.*]] = cc.load %[[VAL_30]] : !cc.ptr +# CHECK: %[[VAL_21:.*]] = cc.cast signed %[[VAL_31]] : (i64) -> f64 # CHECK: %[[VAL_22:.*]] = arith.addf %[[VAL_20]], %[[VAL_21]] : f64 # CHECK: cc.store %[[VAL_22]], %[[VAL_6]] : !cc.ptr # CHECK: %[[VAL_23:.*]] = cc.load %[[VAL_6]] : !cc.ptr -# CHECK: %[[VAL_24:.*]] = arith.remui %[[VAL_19]], %[[VAL_5]] : i64 +# CHECK: %[[VAL_32:.*]] = cc.load %[[VAL_30]] : !cc.ptr +# CHECK: %[[VAL_24:.*]] = arith.remui %[[VAL_32]], %[[VAL_5]] : i64 # CHECK: %[[VAL_25:.*]] = quake.extract_ref %[[VAL_7]]{{\[}}%[[VAL_24]]] : (!quake.veq<4>, i64) -> !quake.ref # CHECK: quake.ry (%[[VAL_23]]) %[[VAL_25]] : (f64, !quake.ref) -> () # CHECK: cc.continue %[[VAL_16]] : i64 @@ -63,6 +67,6 @@ def kernel(x: float): # CHECK: ^bb0(%[[VAL_26:.*]]: i64): # CHECK: %[[VAL_27:.*]] = arith.addi %[[VAL_26]], %[[VAL_3]] : i64 # CHECK: cc.continue %[[VAL_27]] : i64 -# CHECK: } {invariant} +# CHECK: } # CHECK: return # CHECK: } diff --git a/python/tests/mlir/ast_lambda_tuple_stmts.py b/python/tests/mlir/ast_lambda_tuple_stmts.py index d55c167be95..6ce1db87fa4 100644 --- a/python/tests/mlir/ast_lambda_tuple_stmts.py +++ b/python/tests/mlir/ast_lambda_tuple_stmts.py @@ -46,7 +46,7 @@ def reflect(qubits: cudaq.qview): # CHECK: ^bb0(%[[VAL_15:.*]]: i64): # CHECK: %[[VAL_16:.*]] = arith.addi %[[VAL_15]], %[[VAL_3]] : i64 # CHECK: cc.continue %[[VAL_16]] : i64 -# CHECK: } {invariant} +# CHECK: } # CHECK: %[[VAL_17:.*]] = cc.loop while ((%[[VAL_18:.*]] = %[[VAL_2]]) -> (i64)) { # CHECK: %[[VAL_19:.*]] = arith.cmpi slt, %[[VAL_18]], %[[VAL_4]] : i64 # CHECK: cc.condition %[[VAL_19]](%[[VAL_18]] : i64) @@ -59,7 +59,7 @@ def reflect(qubits: cudaq.qview): # CHECK: ^bb0(%[[VAL_22:.*]]: i64): # CHECK: %[[VAL_23:.*]] = arith.addi %[[VAL_22]], %[[VAL_3]] : i64 # CHECK: cc.continue %[[VAL_23]] : i64 -# CHECK: } {invariant} +# CHECK: } # CHECK: } : !cc.callable<() -> ()> # CHECK: %[[VAL_24:.*]] = cc.create_lambda { # CHECK: quake.z {{\[}}%[[VAL_6]]] %[[VAL_8]] : (!quake.veq, !quake.ref) -> () diff --git a/python/tests/mlir/ast_list_comprehension.py b/python/tests/mlir/ast_list_comprehension.py index e0051bf28e7..536b5fc94c5 100644 --- a/python/tests/mlir/ast_list_comprehension.py +++ b/python/tests/mlir/ast_list_comprehension.py @@ -57,10 +57,8 @@ def kernel3() -> float: # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel1() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"} # CHECK: %[[VAL_0:.*]] = arith.constant 1 : i8 # CHECK: %[[VAL_1:.*]] = cc.alloca !cc.array -# CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr>) -> !cc.ptr> # CHECK: %[[VAL_3:.*]] = cc.compute_ptr %[[VAL_1]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr # CHECK: cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr -# CHECK: %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_2]], %c5_i64 : (!cc.ptr>, i64) -> !cc.stdvec # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel2() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"} # CHECK: %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64 @@ -125,7 +123,6 @@ def kernel3() -> float: # CHECK: %[[VAL_1:.*]] = cc.alloca i1 # CHECK: cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr # CHECK: %[[VAL_2:.*]] = cc.alloca 
!cc.array -# CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>) -> !cc.ptr> # CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_1]] : !cc.ptr # CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr # CHECK: %[[VAL_6:.*]] = cc.cast unsigned %[[VAL_4]] : (i1) -> i8 @@ -199,33 +196,22 @@ def kernel3() -> float: # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel1() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"} -# CHECK: %[[VAL_0:.*]] = arith.constant true -# CHECK: %[[VAL_1:.*]] = cc.alloca i1 -# CHECK: cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr +# CHECK: %[[VAL_0:.*]] = arith.constant 1 : i8 # CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array -# CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>) -> !cc.ptr> -# CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_1]] : !cc.ptr -# CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr -# CHECK: %[[VAL_6:.*]] = cc.cast unsigned %[[VAL_4]] : (i1) -> i8 -# CHECK: cc.store %[[VAL_6]], %[[VAL_5]] : !cc.ptr +# CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr +# CHECK: cc.store %[[VAL_0]], %[[VAL_4]] : !cc.ptr # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel2() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"} # CHECK: %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64 -# CHECK: %[[VAL_1:.*]] = cc.alloca f64 -# CHECK: cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr # CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array -# CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_1]] : !cc.ptr # CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr -# CHECK: cc.store %[[VAL_3]], %[[VAL_4]] : !cc.ptr +# CHECK: cc.store %[[VAL_0]], %[[VAL_4]] : !cc.ptr # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel3() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"} # CHECK: %[[VAL_0:.*]] = complex.constant [0.000000e+00, 1.000000e+00] : complex -# CHECK: %[[VAL_1:.*]] = cc.alloca complex -# CHECK: cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr> # CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array x 5> -# CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_1]] : !cc.ptr> # CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr x 5>>, i64) -> !cc.ptr> -# CHECK: cc.store %[[VAL_3]], %[[VAL_4]] : !cc.ptr> +# CHECK: cc.store %[[VAL_0]], %[[VAL_4]] : !cc.ptr> def test_list_comprehension_list_of_constant(): @@ -289,8 +275,8 @@ def kernel3() -> float: # CHECK: cc.store %[[VAL_6]], %[[VAL_7]] : !cc.ptr> # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel2() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"} -# CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 -# CHECK: %[[VAL_1:.*]] = arith.constant 1.000000e+00 : f64 +# CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64 +# CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1.000000e+00 : f64 # CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array x 5> # CHECK: %[[VAL_3:.*]] = cc.alloca !cc.array # CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr>) -> !cc.ptr> @@ -301,8 +287,8 @@ def kernel3() -> float: # CHECK: cc.store %[[VAL_6]], %[[VAL_7]] : !cc.ptr> # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel3() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"} -# CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 -# CHECK: %[[VAL_1:.*]] = complex.constant [0.000000e+00, 1.000000e+00] : complex +# CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64 +# CHECK-DAG: %[[VAL_1:.*]] = complex.constant [0.000000e+00, 1.000000e+00] : complex # CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array> x 5> # CHECK: %[[VAL_3:.*]] = cc.alloca !cc.array x 1> # CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_3]] 
: (!cc.ptr x 1>>) -> !cc.ptr x ?>> @@ -722,7 +708,6 @@ def kernel5() -> bool: res = [mz(r) for r in q] return res[3] - #print(kernel) out = cudaq.run(kernel5, shots_count=1) assert (len(out) == 1 and out[0] == True) print(kernel5 @@ -749,7 +734,7 @@ def kernel6() -> int: # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel3() -> !cc.struct<"MyTuple" {f64, f64}> attributes {"cudaq-entrypoint", "cudaq-kernel"} # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel4() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"} # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel5() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"} -# CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel6() -> i64 attributes {"cudaq-entrypoint", "cudaq-kernel", qubitMeasurementFeedback = true} +# CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel6() -> i64 attributes {"cudaq-entrypoint", "cudaq-kernel"} def test_list_comprehension_void(): @@ -798,7 +783,7 @@ def kernel2(): # CHECK: ^bb0(%[[VAL_9:.*]]: i64): # CHECK: %[[VAL_10:.*]] = arith.addi %[[VAL_9]], %[[VAL_1]] : i64 # CHECK: cc.continue %[[VAL_10]] : i64 -# CHECK: } {invariant} +# CHECK: } # CHECK: %[[VAL_11:.*]] = quake.extract_ref %[[VAL_3]][0] : (!quake.veq<6>) -> !quake.ref # CHECK: quake.x %[[VAL_11]] : (!quake.ref) -> () # CHECK: %[[VAL_12:.*]] = quake.extract_ref %[[VAL_3]][1] : (!quake.veq<6>) -> !quake.ref @@ -902,28 +887,24 @@ def kernel6() -> float: assert (len(out) == 1 and out[0] == 2.) print(kernel6 ) # keep after assert, such that we have no output if assert fails - ''' - # FIXME: This is exactly an example where the non-hierarchical nature of the value - # stack leads to an incorrect error... - # This test will be enabled as part of the PR to revise the ast bridge value stack. @cudaq.kernel - def foo3(func: Callable[[cudaq.qvector, float, list[bool]], None], adj: list[bool], decompose_adj: bool): - targets = cudaq.qvector(len(adj)) - if decompose_adj: - func(targets, 1, [not b for b in adj]) - else: - cudaq.adjoint(func, targets, 1, adj) - mz(targets) + def apply_rotations(qs: cudaq.qvector, angle: float, adj: list[bool]): + for idx, is_adj in enumerate(adj): + if is_adj: + ry(-angle, qs[idx]) + else: + ry(angle, qs[idx]) @cudaq.kernel - def baz(qs: cudaq.qvector, angle: float, adj: list[bool]): - for idx, is_adj in enumerate(adj): - if is_adj: ry(-angle, qs[idx]) - else: ry(angle, qs[idx]) + def kernel7(adj: list[bool]): + targets = cudaq.qvector(len(adj)) + apply_rotations(targets, 1, [not b for b in adj]) - out = cudaq.sample(foo5, baz, [True, False, True], False) - ''' + out = cudaq.sample(kernel7, [True, False, True]) + assert len(out) == 8 + print(kernel7 + ) # keep after assert, such that we have no output if assert fails # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel1() attributes {"cudaq-entrypoint", "cudaq-kernel"} @@ -932,6 +913,7 @@ def baz(qs: cudaq.qvector, angle: float, adj: list[bool]): # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel4() -> i64 attributes {"cudaq-entrypoint", "cudaq-kernel"} # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel5() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"} # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel6() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"} +# CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel7(%arg0: !cc.stdvec) attributes {"cudaq-entrypoint", "cudaq-kernel"} def test_list_comprehension_failures(): diff --git a/python/tests/mlir/ast_list_init.py b/python/tests/mlir/ast_list_init.py index 91840062a3d..8af1a219be4 100644 --- a/python/tests/mlir/ast_list_init.py +++ 
b/python/tests/mlir/ast_list_init.py
@@ -35,7 +35,6 @@ def kernel():
 # CHECK-DAG: %[[VAL_6:.*]] = arith.constant 1.000000e+00 : f64
 # CHECK-DAG: %[[VAL_7:.*]] = quake.alloca !quake.veq<6>
 # CHECK-DAG: %[[VAL_8:.*]] = cc.alloca !cc.array<f64 x 4>
-# CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<!cc.array<f64 x ?>>
 # CHECK: %[[VAL_10:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
 # CHECK: cc.store %[[VAL_6]], %[[VAL_10]] : !cc.ptr<f64>
 # CHECK: %[[VAL_11:.*]] = cc.compute_ptr %[[VAL_8]][1] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
@@ -44,27 +43,27 @@ def kernel():
 # CHECK: cc.store %[[VAL_4]], %[[VAL_12]] : !cc.ptr<f64>
 # CHECK: %[[VAL_13:.*]] = cc.compute_ptr %[[VAL_8]][3] : (!cc.ptr<!cc.array<f64 x 4>>) -> !cc.ptr<f64>
 # CHECK: cc.store %[[VAL_3]], %[[VAL_13]] : !cc.ptr<f64>
-# CHECK: %[[VAL_14:.*]] = cc.stdvec_init %[[VAL_9]], %[[VAL_2]] : (!cc.ptr<!cc.array<f64 x ?>>, i64) -> !cc.stdvec<f64>
-# CHECK: %[[VAL_15:.*]] = cc.alloca !cc.stdvec<f64>
-# CHECK: cc.store %[[VAL_14]], %[[VAL_15]] : !cc.ptr<!cc.stdvec<f64>>
-# CHECK: %[[VAL_16:.*]] = cc.load %[[VAL_15]] : !cc.ptr<!cc.stdvec<f64>>
-# CHECK: %[[VAL_17:.*]] = cc.stdvec_size %[[VAL_16]] : (!cc.stdvec<f64>) -> i64
 # CHECK: %[[VAL_18:.*]] = cc.loop while ((%[[VAL_19:.*]] = %[[VAL_1]]) -> (i64)) {
-# CHECK: %[[VAL_20:.*]] = arith.cmpi slt, %[[VAL_19]], %[[VAL_17]] : i64
+# CHECK: %[[VAL_20:.*]] = arith.cmpi slt, %[[VAL_19]], %[[VAL_2]] : i64
 # CHECK: cc.condition %[[VAL_20]](%[[VAL_19]] : i64)
 # CHECK: } do {
 # CHECK: ^bb0(%[[VAL_21:.*]]: i64):
-# CHECK: %[[VAL_22:.*]] = cc.stdvec_data %[[VAL_16]] : (!cc.stdvec<f64>) -> !cc.ptr<!cc.array<f64 x ?>>
-# CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_22]]{{\[}}%[[VAL_21]]] : (!cc.ptr<!cc.array<f64 x ?>>, i64) -> !cc.ptr<f64>
+# CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_21]]] : (!cc.ptr<!cc.array<f64 x 4>>, i64) -> !cc.ptr<f64>
 # CHECK: %[[VAL_24:.*]] = cc.load %[[VAL_23]] : !cc.ptr<f64>
-# CHECK: %[[VAL_25:.*]] = quake.extract_ref %[[VAL_7]]{{\[}}%[[VAL_21]]] : (!quake.veq<6>, i64) -> !quake.ref
-# CHECK: quake.ry (%[[VAL_24]]) %[[VAL_25]] : (f64, !quake.ref) -> ()
+# CHECK: %[[VAL_14:.*]] = cc.alloca i64
+# CHECK: cc.store %[[VAL_21]], %[[VAL_14]] : !cc.ptr<i64>
+# CHECK: %[[VAL_15:.*]] = cc.alloca f64
+# CHECK: cc.store %[[VAL_24]], %[[VAL_15]] : !cc.ptr<f64>
+# CHECK: %[[VAL_16:.*]] = cc.load %[[VAL_15]] : !cc.ptr<f64>
+# CHECK: %[[VAL_17:.*]] = cc.load %[[VAL_14]] : !cc.ptr<i64>
+# CHECK: %[[VAL_25:.*]] = quake.extract_ref %[[VAL_7]]{{\[}}%[[VAL_17]]] : (!quake.veq<6>, i64) -> !quake.ref
+# CHECK: quake.ry (%[[VAL_16]]) %[[VAL_25]] : (f64, !quake.ref) -> ()
 # CHECK: cc.continue %[[VAL_21]] : i64
 # CHECK: } step {
 # CHECK: ^bb0(%[[VAL_26:.*]]: i64):
 # CHECK: %[[VAL_27:.*]] = arith.addi %[[VAL_26]], %[[VAL_0]] : i64
 # CHECK: cc.continue %[[VAL_27]] : i64
-# CHECK: } {invariant}
+# CHECK: }
 # CHECK: return
 # CHECK: }
diff --git a/python/tests/mlir/ast_list_int.py b/python/tests/mlir/ast_list_int.py
index 1f286dcd8c3..e765fe1fa4e 100644
--- a/python/tests/mlir/ast_list_int.py
+++ b/python/tests/mlir/ast_list_int.py
@@ -42,9 +42,15 @@ def oracle(register: cudaq.qview, auxillary_qubit: cudaq.qubit,
 # CHECK: %[[VAL_11:.*]] = cc.stdvec_data %[[VAL_2]] : (!cc.stdvec<i64>) -> !cc.ptr<!cc.array<i64 x ?>>
 # CHECK: %[[VAL_12:.*]] = cc.compute_ptr %[[VAL_11]]{{\[}}%[[VAL_10]]] : (!cc.ptr<!cc.array<i64 x ?>>, i64) -> !cc.ptr<i64>
 # CHECK: %[[VAL_13:.*]] = cc.load %[[VAL_12]] : !cc.ptr<i64>
-# CHECK: %[[VAL_14:.*]] = arith.cmpi eq, %[[VAL_13]], %[[VAL_3]] : i64
+# CHECK: %[[VAL_20:.*]] = cc.alloca i64
+# CHECK: cc.store %[[VAL_10]], %[[VAL_20]] : !cc.ptr<i64>
+# CHECK: %[[VAL_21:.*]] = cc.alloca i64
+# CHECK: cc.store %[[VAL_13]], %[[VAL_21]] : !cc.ptr<i64>
+# CHECK: %[[VAL_22:.*]] = cc.load %[[VAL_21]] : !cc.ptr<i64>
+# CHECK: %[[VAL_14:.*]] = arith.cmpi eq, %[[VAL_22]], %[[VAL_3]] : i64
 # CHECK: cc.if(%[[VAL_14]]) {
-# CHECK: %[[VAL_15:.*]] = quake.extract_ref %[[VAL_0]]{{\[}}%[[VAL_10]]] : (!quake.veq<?>, i64) -> !quake.ref
+# CHECK: %[[VAL_22:.*]] = cc.load %[[VAL_20]] : !cc.ptr<i64>
+# CHECK: %[[VAL_15:.*]] = quake.extract_ref %[[VAL_0]]{{\[}}%[[VAL_22]]] : (!quake.veq<?>, i64) -> !quake.ref
 # CHECK: quake.x {{\[}}%[[VAL_15]]] %[[VAL_1]] : (!quake.ref, !quake.ref) -> ()
 # CHECK: }
 # CHECK: cc.continue %[[VAL_10]] : i64
@@ -52,6 +58,6 @@ def oracle(register: cudaq.qview, auxillary_qubit: cudaq.qubit,
 # CHECK: ^bb0(%[[VAL_16:.*]]: i64):
 # CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_16]], %[[VAL_3]] : i64
 # CHECK: cc.continue %[[VAL_17]] : i64
-# CHECK: } {invariant}
+# CHECK: }
 # CHECK: return
 # CHECK: }
diff --git a/python/tests/mlir/ast_qreg_slice.py b/python/tests/mlir/ast_qreg_slice.py
index 0f245349db8..d04be7c3422 100644
--- a/python/tests/mlir/ast_qreg_slice.py
+++ b/python/tests/mlir/ast_qreg_slice.py
@@ -75,7 +75,6 @@ def slice_qvec():
 # CHECK: %[[VAL_16:.*]] = quake.extract_ref %[[VAL_14]][1] : (!quake.veq<2>) -> !quake.ref
 # CHECK: quake.z %[[VAL_16]] : (!quake.ref) -> ()
 # CHECK: %[[VAL_17:.*]] = cc.alloca !cc.array<i64 x 5>
-# CHECK: %[[VAL_18:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr<!cc.array<i64 x 5>>) -> !cc.ptr<!cc.array<i64 x ?>>
 # CHECK: %[[VAL_19:.*]] = cc.cast %[[VAL_17]] : (!cc.ptr<!cc.array<i64 x 5>>) -> !cc.ptr<i64>
 # CHECK: cc.store %[[VAL_3]], %[[VAL_19]] : !cc.ptr<i64>
 # CHECK: %[[VAL_20:.*]] = cc.compute_ptr %[[VAL_17]][1] : (!cc.ptr<!cc.array<i64 x 5>>) -> !cc.ptr<i64>
@@ -86,35 +85,30 @@ def slice_qvec():
 # CHECK: cc.store %[[VAL_1]], %[[VAL_22]] : !cc.ptr<i64>
 # CHECK: %[[VAL_23:.*]] = cc.compute_ptr %[[VAL_17]][4] : (!cc.ptr<!cc.array<i64 x 5>>) -> !cc.ptr<i64>
 # CHECK: cc.store %[[VAL_5]], %[[VAL_23]] : !cc.ptr<i64>
-# CHECK: %[[VAL_24:.*]] = cc.stdvec_init %[[VAL_18]], %[[VAL_5]] : (!cc.ptr<!cc.array<i64 x ?>>, i64) -> !cc.stdvec<i64>
-# CHECK: %[[VAL_25:.*]] = cc.alloca !cc.stdvec<i64>
-# CHECK: cc.store %[[VAL_24]], %[[VAL_25]] : !cc.ptr<!cc.stdvec<i64>>
-# CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr<!cc.stdvec<i64>>
-# CHECK: %[[VAL_27:.*]] = cc.stdvec_data %[[VAL_26]] : (!cc.stdvec<i64>) -> !cc.ptr<!cc.array<i64 x ?>>
-# CHECK: %[[VAL_28:.*]] = cc.compute_ptr %[[VAL_27]][2] : (!cc.ptr<!cc.array<i64 x ?>>) -> !cc.ptr<i64>
-# CHECK: %[[VAL_29:.*]] = cc.stdvec_init %[[VAL_28]], %[[VAL_2]] : (!cc.ptr<i64>, i64) -> !cc.stdvec<i64>
-# CHECK: %[[VAL_30:.*]] = cc.alloca !cc.stdvec<i64>
-# CHECK: cc.store %[[VAL_29]], %[[VAL_30]] : !cc.ptr<!cc.stdvec<i64>>
-# CHECK: %[[VAL_31:.*]] = cc.load %[[VAL_30]] : !cc.ptr<!cc.stdvec<i64>>
-# CHECK: %[[VAL_32:.*]] = cc.stdvec_size %[[VAL_31]] : (!cc.stdvec<i64>) -> i64
 # CHECK: %[[VAL_33:.*]] = cc.loop while ((%[[VAL_34:.*]] = %[[VAL_6]]) -> (i64)) {
-# CHECK: %[[VAL_35:.*]] = arith.cmpi slt, %[[VAL_34]], %[[VAL_32]] : i64
+# CHECK: %[[VAL_35:.*]] = arith.cmpi slt, %[[VAL_34]], %[[VAL_2]] : i64
 # CHECK: cc.condition %[[VAL_35]](%[[VAL_34]] : i64)
 # CHECK: } do {
 # CHECK: ^bb0(%[[VAL_36:.*]]: i64):
-# CHECK: %[[VAL_37:.*]] = cc.stdvec_data %[[VAL_31]] : (!cc.stdvec<i64>) -> !cc.ptr<!cc.array<i64 x ?>>
+# CHECK: %[[VAL_37:.*]] = cc.cast %[[VAL_21]] : (!cc.ptr<i64>) -> !cc.ptr<!cc.array<i64 x ?>>
 # CHECK: %[[VAL_38:.*]] = cc.compute_ptr %[[VAL_37]]{{\[}}%[[VAL_36]]] : (!cc.ptr<!cc.array<i64 x ?>>, i64) -> !cc.ptr<i64>
 # CHECK: %[[VAL_39:.*]] = cc.load %[[VAL_38]] : !cc.ptr<i64>
-# CHECK: %[[VAL_40:.*]] = arith.remui %[[VAL_36]], %[[VAL_1]] : i64
+# CHECK: %[[VAL_50:.*]] = cc.alloca i64
+# CHECK: cc.store %[[VAL_36]], %[[VAL_50]] : !cc.ptr<i64>
+# CHECK: %[[VAL_51:.*]] = cc.alloca i64
+# CHECK: cc.store %[[VAL_39]], %[[VAL_51]] : !cc.ptr<i64>
+# CHECK: %[[VAL_52:.*]] = cc.load %[[VAL_51]] : !cc.ptr<i64>
+# CHECK: %[[VAL_53:.*]] = cc.load %[[VAL_50]] : !cc.ptr<i64>
+# CHECK: %[[VAL_40:.*]] = arith.remui %[[VAL_53]], %[[VAL_1]] : i64
 # CHECK: %[[VAL_41:.*]] = quake.extract_ref %[[VAL_7]]{{\[}}%[[VAL_40]]] : (!quake.veq<4>, i64) -> !quake.ref
-# CHECK: %[[VAL_42:.*]] = arith.sitofp %[[VAL_39]] : i64 to f64
+# CHECK: %[[VAL_42:.*]] = cc.cast signed %[[VAL_52]] : (i64) -> f64
 # CHECK: quake.ry (%[[VAL_42]]) %[[VAL_41]] : (f64, !quake.ref) -> ()
 # CHECK: cc.continue %[[VAL_36]] : i64
 # CHECK: } step {
 # CHECK: ^bb0(%[[VAL_43:.*]]: i64):
 # CHECK: %[[VAL_44:.*]] = arith.addi %[[VAL_43]], %[[VAL_3]] : i64
 # CHECK: cc.continue %[[VAL_44]] : i64
-# CHECK: } {invariant}
+# CHECK: }
 # CHECK: %[[VAL_45:.*]] = quake.extract_ref %[[VAL_7]][3] : (!quake.veq<4>) -> !quake.ref
 # CHECK: quake.rz (%[[VAL_4]]) %[[VAL_45]] : (f64, !quake.ref) -> ()
 # CHECK: return
diff --git a/python/tests/mlir/ast_structs_and_tuples.py b/python/tests/mlir/ast_structs_and_tuples.py
index 70b706cd2e6..9eb95fac051 100644
--- a/python/tests/mlir/ast_structs_and_tuples.py
+++ b/python/tests/mlir/ast_structs_and_tuples.py
@@ -325,11 +325,11 @@ def test13() -> int:
 # CHECK-LABEL: func.func @__nvqpp__mlirgen__test2() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"}
 # CHECK-DAG: %[[VAL_0:.*]] = arith.constant 2 : i64
 # CHECK-DAG: %[[VAL_1:.*]] = arith.constant 2.000000e+00 : f64
-# CHECK: %[[VAL_2:.*]] = cc.alloca !cc.struct<"tuple" {i64, f64}>
-# CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr<!cc.struct<"tuple" {i64, f64}>>) -> !cc.ptr<i64>
-# CHECK: cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr<i64>
-# CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][1] : (!cc.ptr<!cc.struct<"tuple" {i64, f64}>>) -> !cc.ptr<f64>
-# CHECK: cc.store %[[VAL_1]], %[[VAL_4]] : !cc.ptr<f64>
+# CHECK: %[[VAL_2:.*]] = cc.undef !cc.struct<"tuple" {i64, f64}>
+# CHECK: %[[VAL_3:.*]] = cc.insert_value %[[VAL_2]][0], %[[VAL_0]] : (!cc.struct<"tuple" {i64, f64}>, i64) -> !cc.struct<"tuple" {i64, f64}>
+# CHECK: %[[VAL_4:.*]] = cc.insert_value %[[VAL_3]][1], %[[VAL_1]] : (!cc.struct<"tuple" {i64, f64}>, f64) -> !cc.struct<"tuple" {i64, f64}>
+# CHECK: %[[VAL_5:.*]] = cc.alloca !cc.struct<"tuple" {i64, f64}>
+# CHECK: cc.store %[[VAL_4]], %[[VAL_5]] : !cc.ptr<!cc.struct<"tuple" {i64, f64}>>
 # CHECK: result test2: 4.0

 # CHECK-LABEL: result test3: 5.0
diff --git a/python/tests/mlir/ast_while_loop.py b/python/tests/mlir/ast_while_loop.py
index bb553d5a6f6..2ac8b41b62a 100644
--- a/python/tests/mlir/ast_while_loop.py
+++ b/python/tests/mlir/ast_while_loop.py
@@ -23,8 +23,10 @@ def cost():
         ry(np.pi, q[i])
         i -= 1

+    counts = cudaq.sample(cost)
+    assert len(counts) == 1
+    assert '011111' in counts
     print(cost)
-    # cost()


 # CHECK-LABEL: func.func @__nvqpp__mlirgen__cost() attributes {"cudaq-entrypoint", "cudaq-kernel"} {
@@ -62,12 +64,16 @@ def cost():
         ry(np.pi, q[i])
         i -= 1

+    counts = cudaq.sample(cost)
+    assert len(counts) == 1
+    assert '000111' in counts
     print(cost)


 # CHECK-LABEL: func.func @__nvqpp__mlirgen__cost() attributes {"cudaq-entrypoint", "cudaq-kernel"} {
 # CHECK-DAG: %[[VAL_0:.*]] = arith.constant 1 : i64
 # CHECK-DAG: %[[VAL_1:.*]] = arith.constant 3.1415926535897931 : f64
+# CHECK-DAG: %[[VAL_10:.*]] = arith.constant 2 : i64
 # CHECK-DAG: %[[VAL_2:.*]] = arith.constant 14 : i64
 # CHECK-DAG: %[[VAL_3:.*]] = arith.constant false
 # CHECK-DAG: %[[VAL_4:.*]] = arith.constant 0 : i64
@@ -77,16 +83,22 @@ def cost():
 # CHECK: cc.store %[[VAL_5]], %[[VAL_7]] : !cc.ptr<i64>
 # CHECK: cc.loop while {
 # CHECK: %[[VAL_8:.*]] = cc.load %[[VAL_7]] : !cc.ptr<i64>
-# CHECK: %[[VAL_9:.*]] = arith.cmpi sgt, %[[VAL_8]], %[[VAL_4]] : i64
-# CHECK: %[[VAL_10:.*]] = arith.cmpi eq, %[[VAL_9]], %[[VAL_3]] : i1
-# CHECK: %[[VAL_11:.*]] = cc.if(%[[VAL_10]]) -> i1 {
+# CHECK: %[[VAL_9:.*]] = arith.cmpi sle, %[[VAL_8]], %[[VAL_4]] : i64
+# CHECK: %[[VAL_11:.*]] = cc.if(%[[VAL_9]]) -> i1 {
 # CHECK: cc.continue %[[VAL_3]] : i1
 # CHECK: } else {
 # CHECK: %[[VAL_12:.*]] = cc.load %[[VAL_7]] : !cc.ptr<i64>
-# CHECK: %[[VAL_13:.*]] = arith.cmpi slt, %[[VAL_12]], %[[VAL_2]] : i64
-# CHECK: cc.continue %[[VAL_13]] : i1
+# CHECK: %[[VAL_13:.*]] = arith.cmpi sge, %[[VAL_12]], %[[VAL_2]] : i64
+# CHECK: %[[VAL_20:.*]] = cc.if(%[[VAL_13]]) -> i1 {
+# CHECK: cc.continue %[[VAL_3]] : i1
+# CHECK: } else {
+# CHECK: %[[VAL_21:.*]] = cc.load %[[VAL_7]] : !cc.ptr<i64>
+# CHECK: %[[VAL_22:.*]] = arith.cmpi ne, %[[VAL_21]], %[[VAL_10]] : i64
+# CHECK: cc.continue %[[VAL_22]] : i1
+# CHECK: }
+# CHECK: cc.continue %[[VAL_20:.*]]
 # CHECK: }
-# CHECK: cc.condition %[[VAL_14:.*]]
+# CHECK: cc.condition %[[VAL_11:.*]]
 # CHECK: } do {
 # CHECK: %[[VAL_15:.*]] = cc.load %[[VAL_7]] : !cc.ptr<i64>
 # CHECK: %[[VAL_16:.*]] = quake.extract_ref %[[VAL_6]]{{\[}}%[[VAL_15]]] : (!quake.veq<6>, i64) -> !quake.ref
diff --git a/python/tests/mlir/bug_1775.py b/python/tests/mlir/bug_1775.py
index ed65e3f06ea..5c42b7293be 100644
--- a/python/tests/mlir/bug_1775.py
+++ b/python/tests/mlir/bug_1775.py
@@ -48,13 +48,9 @@ def test():
 # CHECK: %[[VAL_0:.*]] = arith.constant true
 # CHECK: %[[VAL_1:.*]] = quake.alloca !quake.ref
 # CHECK: %[[VAL_2:.*]] = quake.mz %[[VAL_1]] name "res" : (!quake.ref) -> !quake.measure
-# CHECK: %[[VAL_3:.*]] = quake.discriminate %[[VAL_2]] : (!quake.measure) -> i1
 # CHECK: quake.h %[[VAL_1]] : (!quake.ref) -> ()
 # CHECK: %[[VAL_4:.*]] = quake.mz %[[VAL_1]] name "res" : (!quake.ref) -> !quake.measure
 # CHECK: %[[VAL_5:.*]] = quake.discriminate %[[VAL_4]] : (!quake.measure) -> i1
-# CHECK: %[[VAL_6:.*]] = cc.alloca i1
-# CHECK: cc.store %[[VAL_3]], %[[VAL_6]] : !cc.ptr<i1>
-# CHECK: cc.store %[[VAL_5]], %[[VAL_6]] : !cc.ptr<i1>
 # CHECK: %[[VAL_7:.*]] = arith.cmpi eq, %[[VAL_5]], %[[VAL_0]] : i1
 # CHECK: cc.if(%[[VAL_7]]) {
 # CHECK: %[[VAL_8:.*]] = quake.mz %[[VAL_1]] name "true_res" : (!quake.ref) -> !quake.measure
diff --git a/python/tests/mlir/bug_1777.py b/python/tests/mlir/bug_1777.py
index fd5b2f75385..276d0bff4d1 100644
--- a/python/tests/mlir/bug_1777.py
+++ b/python/tests/mlir/bug_1777.py
@@ -45,30 +45,28 @@ def test():
 # CHECK: cc.condition %[[VAL_9]](%[[VAL_8]] : i64)
 # CHECK: } do {
 # CHECK: ^bb0(%[[VAL_10:.*]]: i64):
-# CHECK: %[[VAL_11:.*]] = quake.extract_ref %[[VAL_5]]{{\[}}%[[VAL_10]]] : (!quake.veq<2>, i64) -> !quake.ref
+# CHECK: %[[VAL_25:.*]] = cc.alloca i64
+# CHECK: cc.store %[[VAL_10]], %[[VAL_25]] : !cc.ptr<i64>
+# CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_25]] : !cc.ptr<i64>
+# CHECK: %[[VAL_11:.*]] = quake.extract_ref %[[VAL_5]]{{\[}}%[[VAL_26]]] : (!quake.veq<2>, i64) -> !quake.ref
 # CHECK: %[[VAL_12:.*]] = quake.mz %[[VAL_11]] name "res" : (!quake.ref) -> !quake.measure
 # CHECK: %[[VAL_13:.*]] = quake.discriminate %[[VAL_12]] : (!quake.measure) -> i1
 # CHECK: cc.store %[[VAL_13]], %[[VAL_6]] : !cc.ptr<i1>
-# CHECK: %[[VAL_14:.*]] = arith.cmpi eq, %[[VAL_13]], %[[VAL_0]] : i1
+# CHECK: %[[VAL_26:.*]] = cc.load %[[VAL_6]] : !cc.ptr<i1>
+# CHECK: %[[VAL_14:.*]] = arith.cmpi eq, %[[VAL_26]], %[[VAL_0]] : i1
 # CHECK: cc.if(%[[VAL_14]]) {
 # CHECK: %[[VAL_15:.*]] = quake.mz %[[VAL_5]] name "inner_mz" : (!quake.veq<2>) -> !cc.stdvec<!quake.measure>
-# CHECK: %[[VAL_16:.*]] = quake.discriminate %[[VAL_15]] : (!cc.stdvec<!quake.measure>) -> !cc.stdvec<i1>
-# CHECK: %[[VAL_17:.*]] = cc.alloca !cc.stdvec<i1>
-# CHECK: cc.store %[[VAL_16]], %[[VAL_17]] : !cc.ptr<!cc.stdvec<i1>>
 # CHECK: }
 # CHECK: cc.continue %[[VAL_10]] : i64
 # CHECK: } step {
 # CHECK: ^bb0(%[[VAL_18:.*]]: i64):
 # CHECK: %[[VAL_19:.*]] = arith.addi %[[VAL_18]], %[[VAL_1]] : i64
 # CHECK: cc.continue %[[VAL_19]] : i64
-# CHECK: } {invariant}
+# CHECK: }
 # CHECK: %[[VAL_20:.*]] = cc.load %[[VAL_6]] : !cc.ptr<i1>
 # CHECK: %[[VAL_21:.*]] = arith.cmpi eq, %[[VAL_20]], %[[VAL_3]] : i1
 # CHECK: cc.if(%[[VAL_21]]) {
 # CHECK: %[[VAL_22:.*]] = quake.mz %[[VAL_5]] name "outer_mz" : (!quake.veq<2>) -> !cc.stdvec<!quake.measure>
-# CHECK: %[[VAL_23:.*]] = quake.discriminate %[[VAL_22]] : (!cc.stdvec<!quake.measure>) -> !cc.stdvec<i1>
-# CHECK: %[[VAL_24:.*]] = cc.alloca !cc.stdvec<i1>
-# CHECK: cc.store %[[VAL_23]], %[[VAL_24]] : !cc.ptr<!cc.stdvec<i1>>
 # CHECK: }
 # CHECK: return
 # CHECK: }
diff --git a/python/tests/mlir/bug_1875.py b/python/tests/mlir/bug_1875.py
index 3e3ff78122b..761de66e941 100644
--- a/python/tests/mlir/bug_1875.py
+++ b/python/tests/mlir/bug_1875.py
@@ -49,9 +49,8 @@ def kernel_break():
 # CHECK: %[[VAL_7:.*]] = quake.mz %[[VAL_3]] name "aux_2" : (!quake.ref) -> !quake.measure
 # CHECK: %[[VAL_8:.*]] = quake.discriminate %[[VAL_7]] : (!quake.measure) -> i1
 # CHECK: %[[VAL_9:.*]] = cc.cast unsigned %[[VAL_6]] : (i1) -> i64
-# CHECK: %[[VAL_10:.*]] = arith.cmpi eq, %[[VAL_9]], %[[VAL_1]] : i64
-# CHECK: %[[VAL_11:.*]] = arith.cmpi eq, %[[VAL_10]], %[[VAL_0]] : i1
-# CHECK: %[[VAL_12:.*]] = cc.if(%[[VAL_11]]) -> i1 {
+# CHECK: %[[VAL_10:.*]] = arith.cmpi ne, %[[VAL_9]], %[[VAL_1]] : i64
+# CHECK: %[[VAL_12:.*]] = cc.if(%[[VAL_10]]) -> i1 {
 # CHECK: cc.continue %[[VAL_0]] : i1
 # CHECK: } else {
 # CHECK: %[[VAL_13:.*]] = cc.cast unsigned %[[VAL_8]] : (i1) -> i64
diff --git a/python/tests/mlir/call_qpu.py b/python/tests/mlir/call_qpu.py
index 0655a44d9b3..e2b8a2c0799 100644
--- a/python/tests/mlir/call_qpu.py
+++ b/python/tests/mlir/call_qpu.py
@@ -52,7 +52,7 @@ def main_kernel() -> int:
 # CHECK-LABEL: func.func @__nvqpp__mlirgen__func_achat(
 # CHECK-SAME: %[[VAL_0:.*]]: !quake.veq<?>) -> !cc.stdvec<i1> attributes {"cudaq-kernel", qubitMeasurementFeedback = true} {
-# CHECK: %[[VAL_1:.*]] = arith.constant 8 : i64
+# CHECK: %[[VAL_1:.*]] = arith.constant 1 : i64
 # CHECK: %[[VAL_2:.*]] = quake.mz %[[VAL_0]] : (!quake.veq<?>) -> !cc.stdvec<!quake.measure>
 # CHECK: %[[VAL_3:.*]] = quake.discriminate %[[VAL_2]] : (!cc.stdvec<!quake.measure>) -> !cc.stdvec<i1>
 # CHECK: %[[VAL_4:.*]] = cc.stdvec_data %[[VAL_3]] : (!cc.stdvec<i1>) -> !cc.ptr<!cc.array<i1 x ?>>
@@ -65,8 +65,9 @@ def main_kernel() -> int:
 # CHECK-LABEL: func.func @__nvqpp__mlirgen__func_shiim(
 # CHECK-SAME: %[[VAL_0:.*]]: !quake.veq<?>) -> i64 attributes {"cudaq-kernel", qubitMeasurementFeedback = true} {
-# CHECK: %[[VAL_1:.*]] = arith.constant 1 : i64
-# CHECK: %[[VAL_2:.*]] = arith.constant 0 : i64
+# CHECK-DAG: %[[VAL_1:.*]] = arith.constant 1 : i64
+# CHECK-DAG: %[[VAL_10:.*]] = arith.constant 0 : i8
+# CHECK-DAG: %[[VAL_2:.*]] = arith.constant 0 : i64
 # CHECK: %[[VAL_3:.*]] = quake.subveq %[[VAL_0]], 1, 2 : (!quake.veq<?>) -> !quake.veq<2>
 # CHECK: %[[VAL_4:.*]] = quake.relax_size %[[VAL_3]] : (!quake.veq<2>) -> !quake.veq<?>
 # CHECK: %[[VAL_5:.*]] = call @__nvqpp__mlirgen__func_achat(%[[VAL_4]]) : (!quake.veq<?>) -> !cc.stdvec<i1>
@@ -75,23 +76,20 @@ def main_kernel() -> int:
 # CHECK: %[[VAL_8:.*]] = cc.alloca i8{{\[}}%[[VAL_7]] : i64]
 # CHECK: %[[VAL_9:.*]] = cc.cast %[[VAL_8]] : (!cc.ptr<!cc.array<i8 x ?>>) -> !cc.ptr<i8>
 # CHECK: call @__nvqpp_vectorCopyToStack(%[[VAL_9]], %[[VAL_6]], %[[VAL_7]]) : (!cc.ptr<i8>, !cc.ptr<i8>, i64) -> ()
-# CHECK: %[[VAL_10:.*]] = cc.stdvec_init %[[VAL_8]], %[[VAL_7]] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.stdvec<i1>
-# CHECK: %[[VAL_11:.*]] = cc.alloca !cc.stdvec<i1>
-# CHECK: cc.store %[[VAL_10]], %[[VAL_11]] : !cc.ptr<!cc.stdvec<i1>>
 # CHECK: %[[VAL_12:.*]] = cc.alloca i64
 # CHECK: cc.store %[[VAL_2]], %[[VAL_12]] : !cc.ptr<i64>
-# CHECK: %[[VAL_13:.*]] = cc.load %[[VAL_11]] : !cc.ptr<!cc.stdvec<i1>>
-# CHECK: %[[VAL_14:.*]] = cc.stdvec_size %[[VAL_13]] : (!cc.stdvec<i1>) -> i64
 # CHECK: %[[VAL_15:.*]] = cc.loop while ((%[[VAL_16:.*]] = %[[VAL_2]]) -> (i64)) {
-# CHECK: %[[VAL_17:.*]] = arith.cmpi slt, %[[VAL_16]], %[[VAL_14]] : i64
+# CHECK: %[[VAL_17:.*]] = arith.cmpi slt, %[[VAL_16]], %[[VAL_7]] : i64
 # CHECK: cc.condition %[[VAL_17]](%[[VAL_16]] : i64)
 # CHECK: } do {
 # CHECK: ^bb0(%[[VAL_18:.*]]: i64):
-# CHECK: %[[VAL_19:.*]] = cc.stdvec_data %[[VAL_13]] : (!cc.stdvec<i1>) -> !cc.ptr<!cc.array<i8 x ?>>
-# CHECK: %[[VAL_20:.*]] = cc.compute_ptr %[[VAL_19]]{{\[}}%[[VAL_18]]] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8>
+# CHECK: %[[VAL_20:.*]] = cc.compute_ptr %[[VAL_8]]{{\[}}%[[VAL_18]]] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8>
 # CHECK: %[[VAL_21:.*]] = cc.load %[[VAL_20]] : !cc.ptr<i8>
-# CHECK: %[[VAL_22:.*]] = cc.cast %[[VAL_21]] : (i8) -> i1
-# CHECK: cc.if(%[[VAL_22]]) {
+# CHECK: %[[VAL_11:.*]] = arith.cmpi ne, %[[VAL_21]], %[[VAL_10]] : i8
+# CHECK: %[[VAL_13:.*]] = cc.alloca i1
+# CHECK: cc.store %[[VAL_11]], %[[VAL_13]] : !cc.ptr<i1>
+# CHECK: %[[VAL_14:.*]] = cc.load %[[VAL_13]] : !cc.ptr<i1>
+# CHECK: cc.if(%[[VAL_14]]) {
 # CHECK: %[[VAL_23:.*]] = cc.load %[[VAL_12]] : !cc.ptr<i64>
 # CHECK: %[[VAL_24:.*]] = arith.addi %[[VAL_23]], %[[VAL_1]] : i64
 # CHECK: cc.store %[[VAL_24]], %[[VAL_12]] : !cc.ptr<i64>
@@ -101,7 +99,7 @@ def main_kernel() -> int:
 # CHECK: ^bb0(%[[VAL_25:.*]]: i64):
 # CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_25]], %[[VAL_1]] : i64
 # CHECK: cc.continue %[[VAL_26]] : i64
-# CHECK: } {invariant}
+# CHECK: }
 # CHECK: %[[VAL_27:.*]] = cc.load %[[VAL_12]] : !cc.ptr<i64>
 # CHECK: return %[[VAL_27]] : i64
 # CHECK: }
diff --git a/python/tests/mlir/ghz.py b/python/tests/mlir/ghz.py
index ac6dd80116c..af73939dcc8 100644
--- a/python/tests/mlir/ghz.py
+++ b/python/tests/mlir/ghz.py
@@ -39,8 +39,11 @@ def ghz(N: int):
     # CHECK: cc.condition %[[VAL_11]](%[[VAL_10]] : i64)
     # CHECK: } do {
     # CHECK: ^bb0(%[[VAL_12:.*]]: i64):
-    # CHECK: %[[VAL_13:.*]] = quake.extract_ref %[[VAL_5]]{{\[}}%[[VAL_12]]] : (!quake.veq<?>, i64) -> !quake.ref
-    # CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_12]], %[[VAL_1]] : i64
+    # CHECK: %[[VAL_20:.*]] = cc.alloca i64
+    # CHECK: cc.store %[[VAL_12]], %[[VAL_20]] : !cc.ptr<i64>
+    # CHECK: %[[VAL_21:.*]] = cc.load %[[VAL_20]] : !cc.ptr<i64>
+    # CHECK: %[[VAL_13:.*]] = quake.extract_ref %[[VAL_5]]{{\[}}%[[VAL_21]]] : (!quake.veq<?>, i64) -> !quake.ref
+    # CHECK: %[[VAL_14:.*]] = arith.addi %[[VAL_21]], %[[VAL_1]] : i64
     # CHECK: %[[VAL_15:.*]] = quake.extract_ref %[[VAL_5]]{{\[}}%[[VAL_14]]] : (!quake.veq<?>, i64) -> !quake.ref
     # CHECK: quake.x {{\[}}%[[VAL_13]]] %[[VAL_15]] : (!quake.ref, !quake.ref) -> ()
    # CHECK: cc.continue %[[VAL_12]] : i64
@@ -48,7 +51,7 @@ def ghz(N: int):
     # CHECK: ^bb0(%[[VAL_16:.*]]: i64):
     # CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_16]], %[[VAL_1]] : i64
     # CHECK: cc.continue %[[VAL_17]] : i64
-    # CHECK: } {invariant}
+    # CHECK: }
     # CHECK: return
     # CHECK: }
@@ -83,7 +86,10 @@ def simple(numQubits: int):
 # CHECK: } do {
 # CHECK: ^bb0(%[[VAL_15:.*]]: i64):
 # CHECK: %[[VAL_16:.*]] = quake.extract_ref %[[VAL_10]]{{\[}}%[[VAL_15]]] : (!quake.veq<?>, i64) -> !quake.ref
-# CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_15]], %[[VAL_2]] : i64
+# CHECK: %[[VAL_21:.*]] = cc.alloca i64
+# CHECK: cc.store %[[VAL_15]], %[[VAL_21]] : !cc.ptr<i64>
+# CHECK: %[[VAL_22:.*]] = cc.load %[[VAL_21]] : !cc.ptr<i64>
+# CHECK: %[[VAL_17:.*]] = arith.addi %[[VAL_22]], %[[VAL_2]] : i64
 # CHECK: %[[VAL_18:.*]] = quake.extract_ref %[[VAL_6]]{{\[}}%[[VAL_17]]] : (!quake.veq<?>, i64) -> !quake.ref
 # CHECK: quake.x {{\[}}%[[VAL_16]]] %[[VAL_18]] : (!quake.ref, !quake.ref) -> ()
 # CHECK: cc.continue %[[VAL_15]] : i64
@@ -91,6 +97,6 @@ def simple(numQubits: int):
 # CHECK: ^bb0(%[[VAL_19:.*]]: i64):
 # CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_2]] : i64
 # CHECK: cc.continue %[[VAL_20]] : i64
-# CHECK: } {invariant}
+# CHECK: }
 # CHECK: return
 # CHECK: }
diff --git a/python/tests/mlir/invalid_subrange.py b/python/tests/mlir/invalid_subrange.py
index c18eb394422..ba756097138 100644
--- a/python/tests/mlir/invalid_subrange.py
+++ b/python/tests/mlir/invalid_subrange.py
@@ -46,15 +46,19 @@ def bar():
 # CHECK: cc.condition %[[VAL_9]](%[[VAL_8]] : i64)
 # CHECK: } do {
 # CHECK: ^bb0(%[[VAL_10:.*]]: i64):
-# CHECK: %[[VAL_11:.*]] = arith.cmpi eq, %[[VAL_10]], %[[VAL_2]] : i64
+# CHECK: %[[VAL_20:.*]] = cc.alloca i64
+# CHECK: cc.store %[[VAL_10]], %[[VAL_20]] : !cc.ptr<i64>
+# CHECK: %[[VAL_21:.*]] = cc.load %[[VAL_20]] : !cc.ptr<i64>
+# CHECK: %[[VAL_11:.*]] = arith.cmpi eq, %[[VAL_21]], %[[VAL_2]] : i64
 # CHECK: cc.if(%[[VAL_11]]) {
 # CHECK: %[[VAL_12:.*]] = quake.extract_ref %[[VAL_4]][0] : (!quake.veq<4>) -> !quake.ref
 # CHECK: quake.x {{\[}}%[[VAL_3]]] %[[VAL_12]] : (!quake.ref, !quake.ref) -> ()
 # CHECK: } else {
-# CHECK: %[[VAL_13:.*]] = arith.subi %[[VAL_10]], %[[VAL_1]] : i64
+# CHECK: %[[VAL_22:.*]] = cc.load %[[VAL_20]] : !cc.ptr<i64>
+# CHECK: %[[VAL_13:.*]] = arith.subi %[[VAL_22]], %[[VAL_1]] : i64
 # CHECK: %[[VAL_14:.*]] = quake.subveq %[[VAL_4]], 0, %[[VAL_13]] : (!quake.veq<4>, i64) -> !quake.veq<?>
 # CHECK: %[[VAL_15:.*]] = quake.concat %[[VAL_3]], %[[VAL_14]] : (!quake.ref, !quake.veq<?>) -> !quake.veq<?>
-# CHECK: %[[VAL_16:.*]] = quake.extract_ref %[[VAL_4]]{{\[}}%[[VAL_10]]] : (!quake.veq<4>, i64) -> !quake.ref
+# CHECK: %[[VAL_16:.*]] = quake.extract_ref %[[VAL_4]]{{\[}}%[[VAL_22]]] : (!quake.veq<4>, i64) -> !quake.ref
 # CHECK: quake.x {{\[}}%[[VAL_15]]] %[[VAL_16]] : (!quake.veq<?>, !quake.ref) -> ()
 # CHECK: }
 # CHECK: cc.continue %[[VAL_10]] : i64
@@ -62,7 +66,7 @@ def bar():
 # CHECK: ^bb0(%[[VAL_17:.*]]: i64):
 # CHECK: %[[VAL_18:.*]] = arith.addi %[[VAL_17]], %[[VAL_1]] : i64
 # CHECK: cc.continue %[[VAL_18]] : i64
-# CHECK: } {invariant}
+# CHECK: }
 # CHECK: return
 # CHECK: }
diff --git a/python/tests/mlir/qft.py b/python/tests/mlir/qft.py
index 4565611d9a6..b48975b3595 100644
--- a/python/tests/mlir/qft.py
+++ b/python/tests/mlir/qft.py
@@ -49,9 +49,12 @@ def iqft(qubits: cudaq.qview):
 # CHECK: cc.condition %[[VAL_12]](%[[VAL_11]] : i64)
 # CHECK: } do {
 # CHECK: ^bb0(%[[VAL_13:.*]]: i64):
-# CHECK: %[[VAL_14:.*]] = quake.extract_ref %[[VAL_0]]{{\[}}%[[VAL_13]]] : (!quake.veq<?>, i64) -> !quake.ref
+# CHECK: %[[VAL_50:.*]] = cc.alloca i64
+# CHECK: cc.store %[[VAL_13]], %[[VAL_50]] : !cc.ptr<i64>
+# CHECK: %[[VAL_51:.*]] = cc.load %[[VAL_50]] : !cc.ptr<i64>
+# CHECK: %[[VAL_14:.*]] = quake.extract_ref %[[VAL_0]]{{\[}}%[[VAL_51]]] : (!quake.veq<?>, i64) -> !quake.ref
 # CHECK: %[[VAL_15:.*]] = cc.load %[[VAL_7]] : !cc.ptr<i64>
-# CHECK: %[[VAL_16:.*]] = arith.subi %[[VAL_15]], %[[VAL_13]] : i64
+# CHECK: %[[VAL_16:.*]] = arith.subi %[[VAL_15]], %[[VAL_51]] : i64
 # CHECK: %[[VAL_17:.*]] = arith.subi %[[VAL_16]], %[[VAL_3]] : i64
 # CHECK: %[[VAL_18:.*]] = quake.extract_ref %[[VAL_0]]{{\[}}%[[VAL_17]]] : (!quake.veq<?>, i64) -> !quake.ref
 # CHECK: quake.swap %[[VAL_14]], %[[VAL_18]] : (!quake.ref, !quake.ref) -> ()
@@ -60,7 +63,7 @@ def iqft(qubits: cudaq.qview):
 # CHECK: ^bb0(%[[VAL_19:.*]]: i64):
 # CHECK: %[[VAL_20:.*]] = arith.addi %[[VAL_19]], %[[VAL_3]] : i64
 # CHECK: cc.continue %[[VAL_20]] : i64
-# CHECK: } {invariant}
+# CHECK: }
 # CHECK: %[[VAL_21:.*]] = cc.load %[[VAL_7]] : !cc.ptr<i64>
 # CHECK: %[[VAL_22:.*]] = arith.subi %[[VAL_21]], %[[VAL_3]] : i64
 # CHECK: %[[VAL_23:.*]] = cc.loop while ((%[[VAL_24:.*]] = %[[VAL_4]]) -> (i64)) {
@@ -68,36 +71,44 @@ def iqft(qubits: cudaq.qview):
 # CHECK: cc.condition %[[VAL_25]](%[[VAL_24]] : i64)
 # CHECK: } do {
 # CHECK: ^bb0(%[[VAL_26:.*]]: i64):
-# CHECK: %[[VAL_27:.*]] = quake.extract_ref %[[VAL_0]]{{\[}}%[[VAL_26]]] : (!quake.veq<?>, i64) -> !quake.ref
+# CHECK: %[[VAL_52:.*]] = cc.alloca i64
+# CHECK: cc.store %[[VAL_26]], %[[VAL_52]] : !cc.ptr<i64>
+# CHECK: %[[VAL_53:.*]] = cc.load %[[VAL_52]] : !cc.ptr<i64>
+# CHECK: %[[VAL_27:.*]] = quake.extract_ref %[[VAL_0]]{{\[}}%[[VAL_53]]] : (!quake.veq<?>, i64) -> !quake.ref
 # CHECK: quake.h %[[VAL_27]] : (!quake.ref) -> ()
-# CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_26]], %[[VAL_3]] : i64
+# CHECK: %[[VAL_54:.*]] = cc.load %[[VAL_52]] : !cc.ptr<i64>
+# CHECK: %[[VAL_28:.*]] = arith.addi %[[VAL_54]], %[[VAL_3]] : i64
 # CHECK: %[[VAL_29:.*]] = cc.alloca i64
 # CHECK: cc.store %[[VAL_28]], %[[VAL_29]] : !cc.ptr<i64>
-# CHECK: %[[VAL_30:.*]] = cc.loop while ((%[[VAL_31:.*]] = %[[VAL_26]]) -> (i64)) {
+# CHECK: %[[VAL_55:.*]] = cc.load %[[VAL_52]] : !cc.ptr<i64>
+# CHECK: %[[VAL_30:.*]] = cc.loop while ((%[[VAL_31:.*]] = %[[VAL_55]]) -> (i64)) {
 # CHECK: %[[VAL_32:.*]] = arith.cmpi sgt, %[[VAL_31]], %[[VAL_2]] : i64
 # CHECK: cc.condition %[[VAL_32]](%[[VAL_31]] : i64)
 # CHECK: } do {
 # CHECK: ^bb0(%[[VAL_33:.*]]: i64):
+# CHECK: %[[VAL_56:.*]] = cc.alloca i64
+# CHECK: cc.store %[[VAL_33]], %[[VAL_56]] : !cc.ptr<i64>
 # CHECK: %[[VAL_34:.*]] = cc.load %[[VAL_29]] : !cc.ptr<i64>
-# CHECK: %[[VAL_35:.*]] = arith.subi %[[VAL_34]], %[[VAL_33]] : i64
+# CHECK: %[[VAL_57:.*]] = cc.load %[[VAL_56]] : !cc.ptr<i64>
+# CHECK: %[[VAL_35:.*]] = arith.subi %[[VAL_34]], %[[VAL_57]] : i64
 # CHECK: %[[VAL_36:.*]] = math.ipowi %[[VAL_5]], %[[VAL_35]] : i64
 # CHECK: %[[VAL_37:.*]] = cc.cast signed %[[VAL_36]] : (i64) -> f64
 # CHECK: %[[VAL_38:.*]] = arith.divf %[[VAL_1]], %[[VAL_37]] : f64
 # CHECK: %[[VAL_39:.*]] = quake.extract_ref %[[VAL_0]]{{\[}}%[[VAL_34]]] : (!quake.veq<?>, i64) -> !quake.ref
-# CHECK: %[[VAL_40:.*]] = quake.extract_ref %[[VAL_0]]{{\[}}%[[VAL_33]]] : (!quake.veq<?>, i64) -> !quake.ref
+# CHECK: %[[VAL_40:.*]] = quake.extract_ref %[[VAL_0]]{{\[}}%[[VAL_57]]] : (!quake.veq<?>, i64) -> !quake.ref
 # CHECK: quake.r1 (%[[VAL_38]]) {{\[}}%[[VAL_39]]] %[[VAL_40]] : (f64, !quake.ref, !quake.ref) -> ()
 # CHECK: cc.continue %[[VAL_33]] : i64
 # CHECK: } step {
 # CHECK: ^bb0(%[[VAL_41:.*]]: i64):
 # CHECK: %[[VAL_42:.*]] = arith.addi %[[VAL_41]], %[[VAL_2]] : i64
 # CHECK: cc.continue %[[VAL_42]] : i64
-# CHECK: } {invariant}
+# CHECK: }
 # CHECK: cc.continue %[[VAL_26]] : i64
 # CHECK: } step {
 # CHECK: ^bb0(%[[VAL_43:.*]]: i64):
 # CHECK: %[[VAL_44:.*]] = arith.addi %[[VAL_43]], %[[VAL_3]] : i64
 # CHECK: cc.continue %[[VAL_44]] : i64
-# CHECK: } {invariant}
+# CHECK: }
 # CHECK: %[[VAL_45:.*]] = cc.load %[[VAL_7]] : !cc.ptr<i64>
 # CHECK: %[[VAL_46:.*]] = arith.subi %[[VAL_45]], %[[VAL_3]] : i64
 # CHECK: %[[VAL_47:.*]] = quake.extract_ref %[[VAL_0]]{{\[}}%[[VAL_46]]] : (!quake.veq<?>, i64) -> !quake.ref
diff --git a/python/tests/mlir/qreg_iterable.py b/python/tests/mlir/qreg_iterable.py
index 8ece9fafe23..390a9ed428a 100644
--- a/python/tests/mlir/qreg_iterable.py
+++ b/python/tests/mlir/qreg_iterable.py
@@ -43,6 +43,6 @@ def foo(N: int):
 # CHECK: ^bb0(%[[VAL_12:.*]]: i64):
 # CHECK: %[[VAL_13:.*]] = arith.addi %[[VAL_12]], %[[VAL_1]] : i64
 # CHECK: cc.continue %[[VAL_13]] : i64
-# CHECK: } {invariant}
+# CHECK: }
 # CHECK: return
 # CHECK: }
diff --git a/python/tests/mlir/test_output_qir.py b/python/tests/mlir/test_output_qir.py
deleted file mode 100644
index ba0af319aa6..00000000000
--- a/python/tests/mlir/test_output_qir.py
+++ /dev/null
@@ -1,91 +0,0 @@
-# ============================================================================ #
-# Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                   #
-# All rights reserved.                                                         #
-#                                                                              #
-# This source code and the accompanying materials are made available under    #
-# the terms of the Apache License 2.0 which accompanies this distribution.    #
-# ============================================================================ #
-
-# RUN: PYTHONPATH=../../ pytest -rP %s | FileCheck %s
-
-import cudaq
-
-
-def test_synth_and_qir():
-
-    @cudaq.kernel
-    def ghz(numQubits: int):
-        qubits = cudaq.qvector(numQubits)
-        h(qubits.front())
-        for i, qubitIdx in enumerate(range(numQubits - 1)):
-            x.ctrl(qubits[i], qubits[qubitIdx + 1])
-
-    print(cudaq.translate(ghz, format="qir"))
-    ghz_synth = cudaq.synthesize(ghz, 5)
-    print(cudaq.translate(ghz_synth, format='qir-base'))
-
-
-# CHECK-LABEL: define void @__nvqpp__mlirgen__ghz(i64
-# CHECK-SAME: %[[VAL_0:.*]]) local_unnamed_addr {
-# CHECK: %[[VAL_1:.*]] = tail call %[[VAL_2:.*]]* @__quantum__rt__qubit_allocate_array(i64 %[[VAL_0]])
-# CHECK: %[[VAL_3:.*]] = tail call %[[VAL_4:.*]]** @__quantum__rt__array_get_element_ptr_1d(%[[VAL_2]]* %[[VAL_1]], i64 0)
-# CHECK: %[[VAL_5:.*]] = load %[[VAL_4]]*, %[[VAL_4]]** %[[VAL_3]]
-# CHECK: tail call void @__quantum__qis__h(%[[VAL_4]]* %[[VAL_5]])
-# CHECK: %[[VAL_6:.*]] = add i64 %[[VAL_0]], -1
-# CHECK: %[[VAL_7:.*]] = tail call i64 @llvm.abs.i64(i64 %[[VAL_6]], i1 false)
-# CHECK: %[[VAL_8:.*]] = alloca i64, i64 %[[VAL_7]]
-# CHECK: %[[VAL_9:.*]] = icmp sgt i64 %[[VAL_6]], 0
-# CHECK: br i1 %[[VAL_9]], label %[[VAL_10:.*]], label %[[VAL_11:.*]]
-# CHECK: : ; preds = %[[VAL_12:.*]], %[[VAL_10]]
-# CHECK: %[[VAL_13:.*]] = phi i64 [ %[[VAL_14:.*]], %[[VAL_10]] ], [ 0, %[[VAL_12]] ]
-# CHECK: %[[VAL_15:.*]] = getelementptr i64, i64* %[[VAL_8]], i64 %[[VAL_13]]
-# CHECK: store i64 %[[VAL_13]], i64* %[[VAL_15]]
-# CHECK: %[[VAL_14]] = add nuw nsw i64 %[[VAL_13]], 1
-# CHECK: %[[VAL_16:.*]] = icmp slt i64 %[[VAL_14]], %[[VAL_6]]
-# CHECK: br i1 %[[VAL_16]], label %[[VAL_10]], label %[[VAL_17:.*]]
-# CHECK: : ; preds = %[[VAL_10]]
-# CHECK: %[[VAL_18:.*]] = alloca { i64, i64 }, i64 %[[VAL_6]]
-# CHECK: br i1 %[[VAL_9]], label %[[VAL_19:.*]], label %[[VAL_11]]
-# CHECK: : ; preds = %[[VAL_19]]
-# CHECK: br i1 %[[VAL_9]], label %[[VAL_20:.*]], label %[[VAL_11]]
-# CHECK: : ; preds = %[[VAL_17]], %[[VAL_19]]
-# CHECK: %[[VAL_21:.*]] = phi i64 [ %[[VAL_22:.*]], %[[VAL_19]] ], [ 0, %[[VAL_17]] ]
-# CHECK: %[[VAL_23:.*]] = getelementptr i64, i64* %[[VAL_8]], i64 %[[VAL_21]]
-# CHECK: %[[VAL_24:.*]] = load i64, i64* %[[VAL_23]]
-# CHECK: %[[VAL_25:.*]] = getelementptr { i64, i64 }, { i64, i64 }* %[[VAL_18]], i64 %[[VAL_21]], i32 0
-# CHECK: store i64 %[[VAL_21]], i64* %[[VAL_25]]
-# CHECK: %[[VAL_26:.*]] = getelementptr { i64, i64 }, { i64, i64 }* %[[VAL_18]], i64 %[[VAL_21]], i32 1
-# CHECK: store i64 %[[VAL_24]], i64* %[[VAL_26]]
-# CHECK: %[[VAL_22]] = add nuw nsw i64 %[[VAL_21]], 1
-# CHECK: %[[VAL_27:.*]] = icmp slt i64 %[[VAL_22]], %[[VAL_6]]
-# CHECK: br i1 %[[VAL_27]], label %[[VAL_19]], label %[[VAL_28:.*]]
-# CHECK: : ; preds = %[[VAL_28]], %[[VAL_20]]
-# CHECK: %[[VAL_29:.*]] = phi i64 [ %[[VAL_30:.*]], %[[VAL_20]] ], [ 0, %[[VAL_28]] ]
-# CHECK: %[[VAL_31:.*]] = getelementptr { i64, i64 }, { i64, i64 }* %[[VAL_18]], i64 %[[VAL_29]], i32 0
-# CHECK: %[[VAL_32:.*]] = load i64, i64* %[[VAL_31]]
-# CHECK: %[[VAL_33:.*]] = getelementptr { i64, i64 }, { i64, i64 }* %[[VAL_18]], i64 %[[VAL_29]], i32 1
-# CHECK: %[[VAL_34:.*]] = load i64, i64* %[[VAL_33]]
-# CHECK: %[[VAL_35:.*]] = tail call %[[VAL_4]]** @__quantum__rt__array_get_element_ptr_1d(%[[VAL_2]]* %[[VAL_1]], i64 %[[VAL_32]])
-# CHECK: %[[VAL_36:.*]] = bitcast %[[VAL_4]]** %[[VAL_35]] to i8**
-# CHECK: %[[VAL_37:.*]] = load i8*, i8** %[[VAL_36]]
-# CHECK: %[[VAL_38:.*]] = add i64 %[[VAL_34]], 1
-# CHECK: %[[VAL_39:.*]] = tail call %[[VAL_4]]** @__quantum__rt__array_get_element_ptr_1d(%[[VAL_2]]* %[[VAL_1]], i64 %[[VAL_38]])
-# CHECK: %[[VAL_40:.*]] = bitcast %[[VAL_4]]** %[[VAL_39]] to i8**
-# CHECK: %[[VAL_41:.*]] = load i8*, i8** %[[VAL_40]]
-# CHECK: tail call void (i64, i64, i64, i64, i8*, ...) @generalizedInvokeWithRotationsControlsTargets(i64 0, i64 0, i64 1, i64 1, i8* nonnull bitcast (void (%[[VAL_2]]*, %[[VAL_4]]*)* @__quantum__qis__x__ctl to i8*), i8* %[[VAL_37]], i8* %[[VAL_41]])
-# CHECK: %[[VAL_30]] = add nuw nsw i64 %[[VAL_29]], 1
-# CHECK: %[[VAL_42:.*]] = icmp slt i64 %[[VAL_30]], %[[VAL_6]]
-# CHECK: br i1 %[[VAL_42]], label %[[VAL_20]], label %[[VAL_11]]
-# CHECK: : ; preds = %[[VAL_20]], %[[VAL_12]], %[[VAL_17]], %[[VAL_28]]
-# CHECK: tail call void @__quantum__rt__qubit_release_array(%[[VAL_2]]* %[[VAL_1]])
-# CHECK: ret void
-# CHECK: }
-
-# CHECK-LABEL: define void @__nvqpp__mlirgen__ghz() local_unnamed_addr #0 {
-# CHECK: tail call void @__quantum__qis__h__body(%[[VAL_0:.*]]* null)
-# CHECK: tail call void @__quantum__qis__cnot__body(%[[VAL_0]]* null, %[[VAL_0]]* nonnull inttoptr (i64 1 to %[[VAL_0]]*))
-# CHECK: tail call void @__quantum__qis__cnot__body(%[[VAL_0]]* nonnull inttoptr (i64 1 to %[[VAL_0]]*), %[[VAL_0]]* nonnull inttoptr (i64 2 to %[[VAL_0]]*))
-# CHECK: tail call void @__quantum__qis__cnot__body(%[[VAL_0]]* nonnull inttoptr (i64 2 to %[[VAL_0]]*), %[[VAL_0]]* nonnull inttoptr (i64 3 to %[[VAL_0]]*))
-# CHECK: tail call void @__quantum__qis__cnot__body(%[[VAL_0]]* nonnull inttoptr (i64 3 to %[[VAL_0]]*), %[[VAL_0]]* nonnull inttoptr (i64 4 to %[[VAL_0]]*))
-# CHECK: ret void
-# CHECK: }
diff --git a/python/tests/mlir/test_output_translate_openqasm.py b/python/tests/mlir/test_output_translate_openqasm.py
index bdee4e5b202..68844933e0d 100644
--- a/python/tests/mlir/test_output_translate_openqasm.py
+++ b/python/tests/mlir/test_output_translate_openqasm.py
@@ -113,47 +113,47 @@ def gates():
 # CHECK: rz(0.000000e+00) var0[0];
 # CHECK: rx(-1.570796e+00) var0[0];
 # CHECK: rz(0.000000e+00) var0[0];
-# CHECK: cx var0[1], var0[0];
+# CHECK: cx var0[2], var0[0];
 # CHECK: rz(-7.853982e-01) var0[0];
 # CHECK: rx(1.570796e+00) var0[0];
 # CHECK: rz(0.000000e+00) var0[0];
 # CHECK: rx(-1.570796e+00) var0[0];
 # CHECK: rz(0.000000e+00) var0[0];
-# CHECK: cx var0[2], var0[0];
+# CHECK: cx var0[1], var0[0];
 # CHECK: rz(7.853982e-01) var0[0];
 # CHECK: rx(1.570796e+00) var0[0];
 # CHECK: rz(0.000000e+00) var0[0];
 # CHECK: rx(-1.570796e+00) var0[0];
 # CHECK: rz(0.000000e+00) var0[0];
-# CHECK: cx var0[1], var0[0];
+# CHECK: cx var0[2], var0[0];
 # CHECK: rz(-7.853982e-01) var0[0];
 # CHECK: rx(1.570796e+00) var0[0];
 # CHECK: rz(0.000000e+00) var0[0];
 # CHECK: rx(-1.570796e+00) var0[0];
 # CHECK: rz(0.000000e+00) var0[0];
-# CHECK: cx var0[2], var0[0];
+# CHECK: cx var0[1], var0[0];
 # CHECK: rz(7.853982e-01) var0[0];
 # CHECK: rx(1.570796e+00) var0[0];
 # CHECK: rz(0.000000e+00) var0[0];
 # CHECK: rx(-1.570796e+00) var0[0];
 # CHECK: rz(0.000000e+00) var0[0];
-# CHECK: cx var0[2], var0[1];
-# CHECK: rz(-7.853982e-01) var0[1];
-# CHECK: rx(1.570796e+00) var0[1];
-# CHECK: rz(0.000000e+00) var0[1];
-# CHECK: rx(-1.570796e+00) var0[1];
-# CHECK: rz(0.000000e+00) var0[1];
-# CHECK: cx var0[2], var0[1];
-# CHECK: rz(7.853982e-01) var0[1];
-# CHECK: rx(1.570796e+00) var0[1];
-# CHECK: rz(0.000000e+00) var0[1];
-# CHECK: rx(-1.570796e+00) var0[1];
-# CHECK: rz(0.000000e+00) var0[1];
+# CHECK: cx var0[1], var0[2];
+# CHECK: rz(-7.853982e-01) var0[2];
+# CHECK: rx(1.570796e+00) var0[2];
+# CHECK: rz(0.000000e+00) var0[2];
+# CHECK: rx(-1.570796e+00) var0[2];
+# CHECK: rz(0.000000e+00) var0[2];
+# CHECK: cx var0[1], var0[2];
 # CHECK: rz(7.853982e-01) var0[2];
 # CHECK: rx(1.570796e+00) var0[2];
 # CHECK: rz(0.000000e+00) var0[2];
 # CHECK: rx(-1.570796e+00) var0[2];
 # CHECK: rz(0.000000e+00) var0[2];
+# CHECK: rz(7.853982e-01) var0[1];
+# CHECK: rx(1.570796e+00) var0[1];
+# CHECK: rz(0.000000e+00) var0[1];
+# CHECK: rx(-1.570796e+00) var0[1];
+# CHECK: rz(0.000000e+00) var0[1];
 # CHECK: rz(1.570796e+00) var0[0];
 # CHECK: rx(1.570796e+00) var0[0];
 # CHECK: rz(0.000000e+00) var0[0];
diff --git a/python/tests/mlir/test_output_translate_qir.py b/python/tests/mlir/test_output_translate_qir.py
index 27a599dabd4..87bdf13b469 100644
--- a/python/tests/mlir/test_output_translate_qir.py
+++ b/python/tests/mlir/test_output_translate_qir.py
@@ -32,7 +32,7 @@ def ghz(numQubits: int):
 # CHECK: %[[VAL_5:.*]] = load %Qubit*, %Qubit** %[[VAL_3]]
 # CHECK: tail call void @__quantum__qis__h(%Qubit* %[[VAL_5]])
 # CHECK: %[[VAL_6:.*]] = add i64 %[[VAL_0]], -1
-# CHECK: %[[VAL_7:.*]] = tail call i64 @llvm.abs.i64(i64 %[[VAL_6]], i1 false)
+# CHECK: %[[VAL_7:.*]] = tail call i64 @llvm.smax.i64(i64 %[[VAL_6]], i64 0)
 # CHECK: %[[VAL_8:.*]] = alloca i64, i64 %[[VAL_7]]
 # CHECK: %[[VAL_9:.*]] = icmp sgt i64 %[[VAL_6]], 0
 # CHECK: br i1 %[[VAL_9]], label %[[VAL_10:.*]], label %[[VAL_11:.*]]
@@ -43,29 +43,11 @@ def ghz(numQubits: int):
 # CHECK: %[[VAL_14]] = add nuw nsw i64 %[[VAL_13]], 1
 # CHECK: %[[VAL_16:.*]] = icmp slt i64 %[[VAL_14]], %[[VAL_6]]
 # CHECK: br i1 %[[VAL_16]], label %[[VAL_10]], label %[[VAL_17:.*]]
-# CHECK: : ; preds = %[[VAL_10]]
-# CHECK: %[[VAL_18:.*]] = alloca { i64, i64 }, i64 %[[VAL_6]]
-# CHECK: br i1 %[[VAL_9]], label %[[VAL_19:.*]], label %[[VAL_11]]
-# CHECK: : ; preds = %[[VAL_19]]
-# CHECK: br i1 %[[VAL_9]], label %[[VAL_20:.*]], label %[[VAL_11]]
-# CHECK: : ; preds = %[[VAL_17]], %[[VAL_19]]
-# CHECK: %[[VAL_21:.*]] = phi i64 [ %[[VAL_22:.*]], %[[VAL_19]] ], [ 0, %[[VAL_17]] ]
-# CHECK: %[[VAL_23:.*]] = getelementptr i64, i64* %[[VAL_8]], i64 %[[VAL_21]]
-# CHECK: %[[VAL_24:.*]] = load i64, i64* %[[VAL_23]]
-# CHECK: %[[VAL_25:.*]] = getelementptr { i64, i64 }, { i64, i64 }* %[[VAL_18]], i64 %[[VAL_21]], i32 0
-# CHECK: store i64 %[[VAL_21]], i64* %[[VAL_25]]
-# CHECK: %[[VAL_26:.*]] = getelementptr { i64, i64 }, { i64, i64 }* %[[VAL_18]], i64 %[[VAL_21]], i32 1
-# CHECK: store i64 %[[VAL_24]], i64* %[[VAL_26]]
-# CHECK: %[[VAL_22]] = add nuw nsw i64 %[[VAL_21]], 1
-# CHECK: %[[VAL_27:.*]] = icmp slt i64 %[[VAL_22]], %[[VAL_6]]
-# CHECK: br i1 %[[VAL_27]], label %[[VAL_19]], label %[[VAL_28:.*]]
-# CHECK: : ; preds = %[[VAL_28]], %[[VAL_20]]
-# CHECK: %[[VAL_29:.*]] = phi i64 [ %[[VAL_30:.*]], %[[VAL_20]] ], [ 0, %[[VAL_28]] ]
-# CHECK: %[[VAL_31:.*]] = getelementptr { i64, i64 }, { i64, i64 }* %[[VAL_18]], i64 %[[VAL_29]], i32 0
-# CHECK: %[[VAL_32:.*]] = load i64, i64* %[[VAL_31]]
-# CHECK: %[[VAL_33:.*]] = getelementptr { i64, i64 }, { i64, i64 }* %[[VAL_18]], i64 %[[VAL_29]], i32 1
-# CHECK: %[[VAL_34:.*]] = load i64, i64* %[[VAL_33]]
-# CHECK: %[[VAL_35:.*]] = tail call %Qubit** @__quantum__rt__array_get_element_ptr_1d(%Array* %[[VAL_1]], i64 %[[VAL_32]])
+# CHECK: : ; preds = %[[VAL_17]], %[[VAL_20:.*]]
+# CHECK: %[[VAL_29:.*]] = phi i64 [ %[[VAL_30:.*]], %[[VAL_20]] ], [ 0, %[[VAL_17]] ]
+# CHECK: %[[VAL_31:.*]] = getelementptr i64, i64* %[[VAL_8]], i64 %[[VAL_29]]
+# CHECK: %[[VAL_34:.*]] = load i64, i64* %[[VAL_31]]
+# CHECK: %[[VAL_35:.*]] = tail call %Qubit** @__quantum__rt__array_get_element_ptr_1d(%Array* %[[VAL_1]], i64 %[[VAL_29]])
 # CHECK: %[[VAL_36:.*]] = bitcast %Qubit** %[[VAL_35]] to i8**
 # CHECK: %[[VAL_37:.*]] = load i8*, i8** %[[VAL_36]]
 # CHECK: %[[VAL_38:.*]] = add i64 %[[VAL_34]], 1
@@ -74,9 +56,9 @@ def ghz(numQubits: int):
 # CHECK: %[[VAL_41:.*]] = load i8*, i8** %[[VAL_40]]
 # CHECK: tail call void (i64, i64, i64, i64, i8*, ...) @generalizedInvokeWithRotationsControlsTargets(i64 0, i64 0, i64 1, i64 1, i8* nonnull bitcast (void (%Array*, %Qubit*)* @__quantum__qis__x__ctl to i8*), i8* %[[VAL_37]], i8* %[[VAL_41]])
 # CHECK: %[[VAL_30]] = add nuw nsw i64 %[[VAL_29]], 1
-# CHECK: %[[VAL_42:.*]] = icmp slt i64 %[[VAL_30]], %[[VAL_6]]
-# CHECK: br i1 %[[VAL_42]], label %[[VAL_20]], label %[[VAL_11]]
-# CHECK: ._crit_edge13: ; preds = %[[VAL_20]], %[[VAL_12]], %[[VAL_17]], %[[VAL_28]]
+# CHECK: %[[VAL_42:.*]] = icmp ult i64 %[[VAL_30]], %[[VAL_7]]
+# CHECK: br i1 %[[VAL_42]], label %[[VAL_20]], label %._crit_edge
+# CHECK: ._crit_edge: ; preds = %[[VAL_20]], %[[VAL_12]], %[[VAL_17]]
 # CHECK: tail call void @__quantum__rt__qubit_release_array(%Array* %[[VAL_1]])
 # CHECK: ret void
 # CHECK: }
diff --git a/python/tests/mlir/var_scope.py b/python/tests/mlir/var_scope.py
index 8ed3386df43..b52bf99cf86 100644
--- a/python/tests/mlir/var_scope.py
+++ b/python/tests/mlir/var_scope.py
@@ -32,23 +32,3 @@ def kernel():
 # CHECK: cc.store %[[VAL_0]], %[[VAL_4]] : !cc.ptr
 # CHECK: return
 # CHECK: }
-
-
-def test_variable_name():
-
-    @cudaq.kernel
-    def slice():
-        q = cudaq.qvector(4)
-        slice = q[2:]
-        x(slice)
-
-    try:
-        print(slice)
-    except Exception as e:
-        print("Failure for slice:")
-        print(e)
-
-
-# CHECK-LABEL: Failure for slice:
-# CHECK: CUDA-Q does not allow assignment to variable slice captured from parent scope.
-# CHECK-NEXT: (offending source -> slice = q[2:])
diff --git a/python/tests/visualization/test_draw.py b/python/tests/visualization/test_draw.py
index 33db2632330..db21104ff78 100644
--- a/python/tests/visualization/test_draw.py
+++ b/python/tests/visualization/test_draw.py
@@ -138,11 +138,11 @@ def hw_kernel():
     cudaq.set_target('ionq', emulate=True)
     # fmt: on
     expected_str = R"""
-     ╭───╮                                       ╭───╮╭─────╮╭───╮╭───╮
-q0 : ┤ h ├──●─────────────────────●──────────────┤ x ├┤ tdg ├┤ x ├┤ t ├
-     ╰───╯  │                     │              ╰─┬─╯╰─────╯╰─┬─╯├───┤
-q1 : ───────┼───────────●─────────┼───────────●────●───────────●──┤ t ├
-     ╭───╮╭─┴─╮╭─────╮╭─┴─╮╭───╮╭─┴─╮╭─────╮╭─┴─╮╭───╮ ╭───╮      ╰───╯
+     ╭───╮                                                        ╭───╮
+q0 : ┤ h ├──────────────●─────────────────────●────●───────────●──┤ t ├
+     ╰───╯              │                     │  ╭─┴─╮╭─────╮╭─┴─╮├───┤
+q1 : ───────●───────────┼─────────●───────────┼──┤ x ├┤ tdg ├┤ x ├┤ t ├
+     ╭───╮╭─┴─╮╭─────╮╭─┴─╮╭───╮╭─┴─╮╭─────╮╭─┴─╮├───┤╰┬───┬╯╰───╯╰───╯
 q2 : ┤ h ├┤ x ├┤ tdg ├┤ x ├┤ t ├┤ x ├┤ tdg ├┤ x ├┤ t ├─┤ h ├───────────
      ╰───╯╰───╯╰─────╯╰───╯╰───╯╰───╯╰─────╯╰───╯╰───╯ ╰───╯
 """
diff --git a/python/utils/OpaqueArguments.h b/python/utils/OpaqueArguments.h
index b140dd93a8d..b8b0d87e966 100644
--- a/python/utils/OpaqueArguments.h
+++ b/python/utils/OpaqueArguments.h
@@ -236,35 +236,8 @@ inline void handleStructMemberVariable(void *data, std::size_t offset,
           appendValue(data, (double)value.cast<double>(), offset);
         })
         .Case([&](cudaq::cc::StdvecType ty) {
-          auto appendVectorValue = []<typename T>(py::object value, void *data,
-                                                  std::size_t offset, T) {
-            auto asList = value.cast<py::list>();
-            std::vector<T> *values = new std::vector<T>(asList.size());
-            for (std::size_t i = 0; auto &v : asList)
-              (*values)[i++] = v.cast<T>();
-
-            std::memcpy(((char *)data) + offset, values, 16);
-          };
-
-          mlir::TypeSwitch<mlir::Type>(ty.getElementType())
-              .Case([&](mlir::IntegerType type) {
-                if (type.isInteger(1)) {
-                  appendVectorValue(value, data, offset, bool());
-                  return;
-                }
-
-                appendVectorValue(value, data, offset, std::size_t());
-                return;
-              })
-              .Case([&](mlir::FloatType type) {
-                if (type.isF32()) {
-                  appendVectorValue(value, data, offset, float());
-                  return;
-                }
-
-                appendVectorValue(value, data, offset, double());
-                return;
-              });
+          throw std::runtime_error("dynamically sized element types for function "
+                                   "arguments are not yet supported");
         })
         .Default([&](mlir::Type ty) {
           ty.dump();
diff --git a/test/AST-Quake/control_flow.cpp b/test/AST-Quake/control_flow.cpp
index 95f1311bcd6..3984766e6b3 100644
--- a/test/AST-Quake/control_flow.cpp
+++ b/test/AST-Quake/control_flow.cpp
@@ -20,27 +20,27 @@ void g3();
 void g4();

 struct C {
-    void operator()() __qpu__ {
-        cudaq::qvector r(2);
-        g1();
-        for (int i = 0; i < 10; ++i) {
-            if (f1(i)) {
-                cudaq::qubit q;
-                x(q,r[0]);
-                break;
-            }
-            x(r[0],r[1]);
-            g2();
-            if (f2(i)) {
-                y(r[1]);
-                continue;
-            }
-            g3();
-            z(r);
-        }
-        g4();
-        mz(r);
-    }
+  void operator()() __qpu__ {
+    cudaq::qvector r(2);
+    g1();
+    for (int i = 0; i < 10; ++i) {
+      if (f1(i)) {
+        cudaq::qubit q;
+        x(q,r[0]);
+        break;
+      }
+      x(r[0],r[1]);
+      g2();
+      if (f2(i)) {
+        y(r[1]);
+        continue;
+      }
+      g3();
+      z(r);
+    }
+    g4();
+    mz(r);
+  }
 };

 // CHECK-LABEL: func.func @__nvqpp__mlirgen__C()
@@ -109,26 +109,26 @@ struct C {
 // CHECK: }

 struct D {
-    void operator()() __qpu__ {
-        cudaq::qvector r(2);
-        g1();
-        for (int i = 0; i < 10; ++i) {
-            if (f1(i)) {
-                cudaq::qubit q;
-                x(q,r[0]);
-                continue;
-            }
-            x(r[0],r[1]);
-            g2();
-            if (f2(i)) {
-                y(r[1]);
-                break;
-            }
-            g3();
-            z(r);
-        }
-        g4();
-        mz(r);
+  void operator()() __qpu__ {
+    cudaq::qvector r(2);
+    g1();
+    for (int i = 0; i < 10; ++i) {
+      if (f1(i)) {
+        cudaq::qubit q;
+        x(q,r[0]);
+        continue;
+      }
+      x(r[0],r[1]);
+      g2();
+      if (f2(i)) {
+        y(r[1]);
+        break;
+      }
+      g3();
+      z(r);
+    }
+    g4();
+    mz(r);
   }
 };
@@ -199,25 +199,25 @@
 struct E {
   void operator()() __qpu__ {
-        cudaq::qvector r(2);
-        g1();
-        for (int i = 0; i < 10; ++i) {
-            if (f1(i)) {
-                cudaq::qubit q;
-                x(q,r[0]);
-                return;
-            }
-            x(r[0],r[1]);
-            g2();
-            if (f2(i)) {
-                y(r[1]);
-                break;
-            }
-            g3();
-            z(r);
-        }
-        g4();
-        mz(r);
+    cudaq::qvector r(2);
+    g1();
+    for (int i = 0; i < 10; ++i) {
+      if (f1(i)) {
+        cudaq::qubit q;
+        x(q,r[0]);
+        return;
+      }
+      x(r[0],r[1]);
+      g2();
+      if (f2(i)) {
+        y(r[1]);
+        break;
+      }
+      g3();
+      z(r);
+    }
+    g4();
+    mz(r);
   }
 };
@@ -286,26 +286,26 @@
 // CHECK: return

 struct F {
-    void operator()() __qpu__ {
-        cudaq::qvector r(2);
-        g1();
-        for (int i = 0; i < 10; ++i) {
-            if (f1(i)) {
-                cudaq::qubit q;
-                x(q,r[0]);
-                continue;
-            }
-            x(r[0],r[1]);
-            g2();
-            if (f2(i)) {
-                y(r[1]);
-                return;
-            }
-            g3();
-            z(r);
-        }
-        g4();
-        mz(r);
+  void operator()() __qpu__ {
+    cudaq::qvector r(2);
+    g1();
+    for (int i = 0; i < 10; ++i) {
+      if (f1(i)) {
+        cudaq::qubit q;
+        x(q,r[0]);
+        continue;
+      }
+      x(r[0],r[1]);
+      g2();
+      if (f2(i)) {
+        y(r[1]);
+        return;
+      }
+      g3();
+      z(r);
+    }
+    g4();
+    mz(r);
   }
 };