diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp
index fd7622981fb..2931adefae7 100644
--- a/lib/Optimizer/Builder/Intrinsics.cpp
+++ b/lib/Optimizer/Builder/Intrinsics.cpp
@@ -371,6 +371,136 @@ static constexpr IntrinsicCode intrinsicTable[] = {
     {cudaq::stdvecBoolCtorFromInitList, {}, R"#(
   func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr<none>, !cc.ptr<none>, i64) -> ())#"},
 
+    {"__nvqpp_internal_number_of_digits", {}, R"#(
+  func.func private @__nvqpp_internal_number_of_digits(%arg0: i64) -> i64 {
+    %c10_i64 = arith.constant 10 : i64 
+    %c0_i64 = arith.constant 0 : i64 
+    %c1_i64 = arith.constant 1 : i64 
+    %0 = cc.alloca i64 
+    cc.store %arg0, %0 : !cc.ptr<i64> 
+    %1 = cc.load %0 : !cc.ptr<i64> 
+    %2 = cc.alloca i64 
+    cc.store %c0_i64, %2 : !cc.ptr<i64> 
+    %3 = arith.cmpi eq, %1, %c0_i64 : i64 
+    cc.if(%3) {
+      cc.store %c1_i64, %2 : !cc.ptr<i64> 
+    } 
+    cc.loop while {
+      %5 = cc.load %0 : !cc.ptr<i64> 
+      %6 = arith.cmpi sgt, %5, %c0_i64 : i64 
+      cc.condition %6 
+    } do {
+      %5 = cc.load %0 : !cc.ptr<i64> 
+      %6 = arith.divsi %5, %c10_i64 : i64 
+      cc.store %6, %0 : !cc.ptr<i64> 
+      %7 = cc.load %2 : !cc.ptr<i64> 
+      %8 = arith.addi %7, %c1_i64 : i64 
+      cc.store %8, %2 : !cc.ptr<i64> 
+      cc.continue 
+    } 
+    %4 = cc.load %2 : !cc.ptr<i64> 
+    return %4 : i64 
+  } 
+  )#"},
+
+    // __nvqpp_internal_tostring: write `val` as zero-padded decimal into `buf`.
+    {"__nvqpp_internal_tostring", {}, R"#(
+  func.func private @__nvqpp_internal_tostring(%buf: !cc.stdvec<i8>, %val: i64) {
+    %c48_i64 = arith.constant 48 : i64 
+    %c48_i32 = arith.constant 48 : i32 
+    %c0_i64 = arith.constant 0 : i64 
+    %c10_i64 = arith.constant 10 : i64 
+    %c1_i64 = arith.constant 1 : i64 
+    %c48_i8 = arith.constant 48 : i8 
+    %false = arith.constant false 
+    %c0_i8 = arith.constant 0 : i8 
+    %0 = cc.alloca i64 
+    cc.store %val, %0 : !cc.ptr<i64> 
+    %1 = cc.alloca i64 
+    cc.store %c10_i64, %1 : !cc.ptr<i64> 
+    %2 = cc.stdvec_size %buf : (!cc.stdvec<i8>) -> i64 
+    %3 = cc.alloca i64 
+    cc.store %2, %3 : !cc.ptr<i64> 
+    %4 = cc.load %3 : !cc.ptr<i64> 
+    %5 = arith.subi %4, %c1_i64 : i64 
+    %6 = cc.alloca i64 
+    cc.store %5, %6 : !cc.ptr<i64> 
+    %7 = cc.load %6 : !cc.ptr<i64> 
+    %8 = cc.stdvec_data %buf : (!cc.stdvec<i8>) -> !cc.ptr<!cc.array<i8 x ?>> 
+    %9 = cc.compute_ptr %8[%7] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8> 
+    cc.store %c0_i8, %9 : !cc.ptr<i8> 
+    %10 = cc.load %6 : !cc.ptr<i64> 
+    %11 = arith.subi %10, %c1_i64 : i64 
+    cc.store %11, %6 : !cc.ptr<i64> 
+    cc.loop while {
+      %18 = cc.load %0 : !cc.ptr<i64> 
+      %19 = cc.load %1 : !cc.ptr<i64> 
+      %20 = arith.cmpi sge, %18, %19 : i64 
+      %21 = arith.cmpi eq, %20, %false : i1 
+      %22 = cc.if(%21) -> i1 {
+        cc.continue %false : i1 
+      } else {
+        %23 = cc.load %6 : !cc.ptr<i64> 
+        %24 = arith.cmpi sge, %23, %c0_i64 : i64 
+        cc.continue %24 : i1 
+      } 
+      cc.condition %22 
+    } do {
+      cc.scope {
+        %18 = cc.load %0 : !cc.ptr<i64> 
+        %19 = cc.load %1 : !cc.ptr<i64> 
+        %20 = arith.remsi %18, %19 : i64 
+        %21 = cc.cast %20 : (i64) -> i32 
+        %22 = cc.alloca i32 
+        cc.store %21, %22 : !cc.ptr<i32> 
+        %23 = cc.load %1 : !cc.ptr<i64> 
+        %24 = cc.load %0 : !cc.ptr<i64> 
+        %25 = arith.divsi %24, %23 : i64 
+        cc.store %25, %0 : !cc.ptr<i64> 
+        %26 = cc.load %6 : !cc.ptr<i64> 
+        %27 = cc.stdvec_data %buf : (!cc.stdvec<i8>) -> !cc.ptr<!cc.array<i8 x ?>> 
+        %28 = cc.compute_ptr %27[%26] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8> 
+        %29 = cc.load %22 : !cc.ptr<i32> 
+        %30 = arith.addi %29, %c48_i32 : i32 
+        %31 = cc.cast %30 : (i32) -> i8 
+        cc.store %31, %28 : !cc.ptr<i8> 
+        %32 = cc.load %6 : !cc.ptr<i64> 
+        %33 = arith.subi %32, %c1_i64 : i64 
+        cc.store %33, %6 : !cc.ptr<i64> 
+      }
+      cc.continue
+    }
+    %12 = cc.load %6 : !cc.ptr<i64>
+    %13 = cc.stdvec_data %buf : (!cc.stdvec<i8>) -> !cc.ptr<!cc.array<i8 x ?>>
+    %14 = cc.compute_ptr %13[%12] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8>
+    %15 = cc.load %0 : !cc.ptr<i64>
+    %16 = arith.addi %15, %c48_i64 : i64
+    %17 = cc.cast %16 : (i64) -> i8
+    cc.store %17, %14 : !cc.ptr<i8>
+    cc.scope {
+      %18 = cc.alloca i64
+      cc.store %c0_i64, %18 : !cc.ptr<i64>
+      cc.loop while {
+        %19 = cc.load %18 : !cc.ptr<i64>
+        %20 = cc.load %6 : !cc.ptr<i64>
+        %21 = arith.cmpi slt, %19, %20 : i64
+        cc.condition %21
+      } do {
+        %19 = cc.load %18 : !cc.ptr<i64>
+        %20 = cc.stdvec_data %buf : (!cc.stdvec<i8>) -> !cc.ptr<!cc.array<i8 x ?>>
+        %21 = cc.compute_ptr %20[%19] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8>
+        cc.store %c48_i8, %21 : !cc.ptr<i8>
+        cc.continue
+      } step {
+        %19 = cc.load %18 : !cc.ptr<i64>
+        %20 = arith.addi %19, %c1_i64 : i64
+        cc.store %20, %18 : !cc.ptr<i64>
+      }
+    }
+    return
+  }
+  )#"},
+
     // This helper function copies a buffer off the stack to the heap. This is
     // required when the data on the stack is about to go out of scope but is
     // still live.
diff --git a/lib/Optimizer/CodeGen/Pipelines.cpp b/lib/Optimizer/CodeGen/Pipelines.cpp
index c4d0141afd4..a8a3f918968 100644
--- a/lib/Optimizer/CodeGen/Pipelines.cpp
+++ b/lib/Optimizer/CodeGen/Pipelines.cpp
@@ -98,6 +98,7 @@ void createTargetCodegenPipeline(PassManager &pm,
   pm.addNestedPass<func::FuncOp>(createCSEPass());
   ::addQIRConversionPipeline(pm, options.target);
   pm.addPass(cudaq::opt::createReturnToOutputLog());
+  cudaq::opt::addLowerToCFG(pm);
   pm.addPass(createConvertMathToFuncs());
   pm.addPass(createSymbolDCEPass());
   pm.addPass(cudaq::opt::createCCToLLVM());
diff --git a/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp b/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp
index b4b175a31dd..01665222154 100644
--- a/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp
+++ b/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp
@@ -46,7 +46,8 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
   }
 
   static void genOutputLog(Location loc, PatternRewriter &rewriter, Value val,
-                           std::optional<StringRef> prefix) {
+                           std::optional<StringRef> prefix,
+                           std::optional<Value> customLabel = std::nullopt) {
     Type valTy = val.getType();
     TypeSwitch<Type>(valTy)
         .Case([&](IntegerType intTy) {
@@ -54,7 +55,8 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
           std::string labelStr = std::string("i") + std::to_string(width);
           if (prefix)
             labelStr = prefix->str();
-          Value label = makeLabel(loc, rewriter, labelStr);
+          Value label =
+              customLabel ? *customLabel : makeLabel(loc, rewriter, labelStr);
           if (intTy.getWidth() == 1) {
             rewriter.create<func::CallOp>(loc, TypeRange{},
                                           cudaq::opt::QIRBoolRecordOutput,
@@ -80,7 +82,8 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
           std::string labelStr = std::string("f") + std::to_string(width);
           if (prefix)
             labelStr = prefix->str();
-          Value label = makeLabel(loc, rewriter, labelStr);
+          Value label =
+              customLabel ? *customLabel : makeLabel(loc, rewriter, labelStr);
           // Floating point: convert it to double, whatever it actually is.
           Value castVal = val;
           if (floatTy != rewriter.getF64Type())
@@ -94,7 +97,8 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
           auto labelStr = translateType(structTy);
           if (prefix)
             labelStr = prefix->str();
-          Value label = makeLabel(loc, rewriter, labelStr);
+          Value label =
+              customLabel ? *customLabel : makeLabel(loc, rewriter, labelStr);
           std::int32_t sz = structTy.getNumMembers();
           Value size = rewriter.create<arith::ConstantIntOp>(loc, sz, 64);
           rewriter.create<func::CallOp>(loc, TypeRange{},
@@ -111,7 +115,8 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
         })
         .Case([&](cudaq::cc::ArrayType arrTy) {
           auto labelStr = translateType(arrTy);
-          Value label = makeLabel(loc, rewriter, labelStr);
+          Value label =
+              customLabel ? *customLabel : makeLabel(loc, rewriter, labelStr);
           std::int32_t sz = arrTy.getSize();
           Value size = rewriter.create<arith::ConstantIntOp>(loc, sz, 64);
           rewriter.create<func::CallOp>(loc, TypeRange{},
@@ -128,13 +133,12 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
           }
         })
         .Case([&](cudaq::cc::StdvecType vecTy) {
-          // For this type, we expect a cc.stdvec_init operation as the input.
-          // The data will be in a variable.
-          // If we reach here and we cannot determine the constant size of the
-          // buffer, then we will not generate any output logging.
           if (auto vecInit = val.getDefiningOp<cudaq::cc::StdvecInitOp>())
             if (auto maybeLen = cudaq::opt::factory::maybeValueOfIntConstant(
                     vecInit.getLength())) {
+              // For this type, we expect a cc.stdvec_init operation as the
+              // input.
+              // The data will be in a variable.
               std::int32_t sz = *maybeLen;
               auto labelStr = translateType(vecTy, sz);
               Value label = makeLabel(loc, rewriter, labelStr);
@@ -158,7 +162,55 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
                 Value w = rewriter.create<cudaq::cc::LoadOp>(loc, v);
                 genOutputLog(loc, rewriter, w, offset);
               }
+              return;
             }
+
+          // If we reach here, we could not determine a constant size for the
+          // buffer, so we generate dynamic output logging with a loop over the
+          // runtime vector size.
+          Value vecSz = rewriter.template create<cudaq::cc::StdvecSizeOp>(
+              loc, rewriter.getI64Type(), val);
+          const std::string arrayLabelPrefix =
+              "array<" + translateType(vecTy.getElementType()) + " x ";
+          Value labelBuffer =
+              makeLabel(loc, rewriter, arrayLabelPrefix, vecSz, ">");
+          rewriter.create<func::CallOp>(loc, TypeRange{},
+                                        cudaq::opt::QIRArrayRecordOutput,
+                                        ArrayRef<Value>{vecSz, labelBuffer});
+          auto eleTy = vecTy.getElementType();
+          const bool isBool = (eleTy == rewriter.getI1Type());
+          if (isBool)
+            eleTy = rewriter.getI8Type();
+          auto elePtrTy = cudaq::cc::PointerType::get(eleTy);
+          auto eleArrTy =
+              cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy));
+          auto vecPtr =
+              rewriter.create<cudaq::cc::StdvecDataOp>(loc, eleArrTy, val);
+          const std::string preStr = prefix ? prefix->str() : std::string{};
+          cudaq::opt::factory::createInvariantLoop(
+              rewriter, loc, vecSz,
+              [&](OpBuilder &builder, Location loc, Region &, Block &block) {
+                Value indexVar = block.getArgument(0);
+                auto eleAddr = rewriter.create<cudaq::cc::ComputePtrOp>(
+                    loc, elePtrTy, vecPtr, ValueRange{indexVar});
+
+                Value w = [&]() {
+                  if (isBool) {
+                    auto i1PtrTy =
+                        cudaq::cc::PointerType::get(rewriter.getI1Type());
+                    auto i1Cast = rewriter.create<cudaq::cc::CastOp>(
+                        loc, i1PtrTy, eleAddr);
+                    return rewriter.create<cudaq::cc::LoadOp>(loc, i1Cast);
+                  }
+
+                  return rewriter.create<cudaq::cc::LoadOp>(loc, eleAddr);
+                }();
+                const std::string prefix = preStr + "[";
+                const std::string postfix = "]";
+                Value dynamicLabel =
+                    makeLabel(loc, rewriter, prefix, indexVar, postfix);
+                genOutputLog(loc, rewriter, w, std::nullopt, dynamicLabel);
+              });
         })
         .Default([&](Type) {
           // If we reach here, we don't know how to handle this type.
@@ -207,6 +259,79 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
     auto i8PtrTy = cudaq::cc::PointerType::get(rewriter.getI8Type());
     return rewriter.create<cudaq::cc::CastOp>(loc, i8PtrTy, lit);
   }
+
+  /// Emit IR that assembles the label `prefix + decimal(val) + postFix` at
+  /// runtime. Calls the `__nvqpp_internal_number_of_digits` and
+  /// `__nvqpp_internal_tostring` intrinsics by name and returns a `!cc.ptr<i8>`
+  /// to a null-terminated `cc.alloca` buffer holding the label.
+  static Value makeLabel(Location loc, PatternRewriter &rewriter,
+                         const std::string &prefix, Value val,
+                         const std::string &postFix) {
+    auto i8Ty = rewriter.getI8Type();
+    auto i64Ty = rewriter.getI64Type();
+    auto i8PtrTy = cudaq::cc::PointerType::get(i8Ty);
+    const int prefixLen = prefix.size();
+    const int postfixLen = postFix.size();
+
+    // Small emitters shared by the steps below.
+    auto i64Const = [&](int v) -> Value {
+      return rewriter.create<arith::ConstantIntOp>(loc, v, 64);
+    };
+    auto copyBytes = [&](Value dst, Value src, Value numBytes) {
+      // Every copy passes an i1 zero as the intrinsic's last operand.
+      Value i1Zero = rewriter.create<arith::ConstantIntOp>(loc, 0, 1);
+      rewriter.create<func::CallOp>(
+          loc, std::nullopt, cudaq::llvmMemCopyIntrinsic,
+          ValueRange{dst, src, numBytes, i1Zero});
+    };
+
+    // Both intrinsics take an i64 argument.
+    if (val.getType() != i64Ty)
+      val = rewriter.create<cudaq::cc::CastOp>(loc, i64Ty, val);
+
+    // Runtime count of the decimal digits in `val`.
+    auto digitsCall = rewriter.create<func::CallOp>(
+        loc, i64Ty, "__nvqpp_internal_number_of_digits", ArrayRef<Value>{val});
+    Value numDigits = digitsCall.getResult(0);
+
+    // Render `val` into a scratch buffer of numDigits + 1 bytes; the extra
+    // byte is for the null terminator.
+    auto scratchLen =
+        rewriter.create<arith::AddIOp>(loc, numDigits, i64Const(1));
+    auto scratch = rewriter.create<cudaq::cc::AllocaOp>(loc, i8Ty, scratchLen);
+    auto scratchVec = rewriter.create<cudaq::cc::StdvecInitOp>(
+        loc, cudaq::cc::StdvecType::get(i8Ty), scratch, scratchLen);
+    rewriter.create<func::CallOp>(loc, TypeRange{},
+                                  "__nvqpp_internal_tostring",
+                                  ArrayRef<Value>{scratchVec, val});
+    Value valStrBuf = rewriter.create<cudaq::cc::CastOp>(loc, i8PtrTy, scratch);
+
+    // String literals for the fixed parts of the label.
+    Value prefixLit = makeLabel(loc, rewriter, prefix);
+    Value postfixLit = makeLabel(loc, rewriter, postFix);
+
+    // Destination buffer: prefix + digits + postfix + null terminator.
+    Value totalLen = rewriter.create<arith::AddIOp>(
+        loc, numDigits, i64Const(prefixLen + postfixLen + 1));
+    auto labelAlloc = rewriter.create<cudaq::cc::AllocaOp>(loc, i8Ty, totalLen);
+    Value labelBuffer =
+        rewriter.create<cudaq::cc::CastOp>(loc, i8PtrTy, labelAlloc);
+
+    // 1) The prefix goes to the start of the buffer.
+    copyBytes(labelBuffer, prefixLit, i64Const(prefixLen));
+    // 2) The digits (without their terminator) follow the prefix.
+    Value digitsDst = rewriter.create<cudaq::cc::ComputePtrOp>(
+        loc, i8PtrTy, labelAlloc, ValueRange{i64Const(prefixLen)});
+    copyBytes(digitsDst, valStrBuf, numDigits);
+    // 3) The postfix and its null terminator follow the digits.
+    Value postfixOffset =
+        rewriter.create<arith::AddIOp>(loc, numDigits, i64Const(prefixLen));
+    Value postfixDst = rewriter.create<cudaq::cc::ComputePtrOp>(
+        loc, i8PtrTy, labelAlloc, ValueRange{postfixOffset});
+    copyBytes(postfixDst, postfixLit, i64Const(postfixLen + 1));
+
+    return labelBuffer;
+  }
 };
 
 struct ReturnToOutputLogPass
@@ -230,6 +355,19 @@ struct ReturnToOutputLogPass
       return;
     }
 
+    if (failed(irBuilder.loadIntrinsic(module, "__nvqpp_internal_tostring"))) {
+      module.emitError("could not load string conversion function.");
+      signalPassFailure();
+      return;
+    }
+
+    if (failed(irBuilder.loadIntrinsic(module,
+                                       "__nvqpp_internal_number_of_digits"))) {
+      module.emitError("could not load number of digits function.");
+      signalPassFailure();
+      return;
+    }
+
     RewritePatternSet patterns(ctx);
     patterns.insert<ReturnRewrite>(ctx);
     LLVM_DEBUG(llvm::dbgs() << "Before return to output logging:\n" << module);
diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py
index aceab7ec75a..9069b4bd168 100644
--- a/python/cudaq/kernel/ast_bridge.py
+++ b/python/cudaq/kernel/ast_bridge.py
@@ -31,7 +31,7 @@
 from .utils import (Color, globalAstRegistry, globalKernelRegistry,
                     globalRegisteredOperations, globalRegisteredTypes,
                     nvqppPrefix, mlirTypeFromAnnotation, mlirTypeFromPyType,
-                    mlirTypeToPyType, mlirTryCreateStructType)
+                    mlirTypeToPyType, mlirTryCreateStructType, getInteropKernelNameIfFound)
 
 State = cudaq_runtime.State
 
@@ -434,6 +434,7 @@ def changeOperandToType(self, ty, operand, allowDemotion=False):
                                  operand,
                                  sint=operand_width != 1,
                                  zint=operand_width == 1).result
+        
         self.emitFatalError(
             f'cannot convert value of type {operand.type} to the requested type {ty}',
             self.currentNode)
@@ -578,6 +579,7 @@ def ifNotPointerThenStore(self, value):
         if not cc.PointerType.isinstance(value.type):
             slot = cc.AllocaOp(cc.PointerType.get(value.type),
                                TypeAttr.get(value.type)).result
+            assert cc.PointerType.get(value.type) == slot.type
             cc.StoreOp(value, slot)
             return slot
         return value
@@ -585,20 +587,32 @@ def ifNotPointerThenStore(self, value):
     def __createStdvecWithKnownValues(self, size, listElementValues):
         # Turn this List into a StdVec<T>
         arrSize = self.getConstantInt(size)
-        arrTy = cc.ArrayType.get(listElementValues[0].type)
+        elemTy = listElementValues[0].type
+        # If this is an `i1`, turns it into an `i8` array.
+        isBool = elemTy == self.getIntegerType(1)
+        if isBool:
+            elemTy = self.getIntegerType(8)
+
+        arrTy = cc.ArrayType.get(elemTy)
         alloca = cc.AllocaOp(cc.PointerType.get(arrTy),
-                             TypeAttr.get(listElementValues[0].type),
+                             TypeAttr.get(elemTy),
                              seqSize=arrSize).result
 
         for i, v in enumerate(listElementValues):
             eleAddr = cc.ComputePtrOp(
-                cc.PointerType.get(listElementValues[0].type), alloca,
+                cc.PointerType.get(elemTy), alloca,
                 [self.getConstantInt(i)],
                 DenseI32ArrayAttr.get([kDynamicPtrIndex],
                                       context=self.ctx)).result
+            if isBool:
+                # Cast the list value before assigning
+                v = self.changeOperandToType(self.getIntegerType(8), v)
             cc.StoreOp(v, eleAddr)
 
-        vecTy = listElementValues[0].type
+        # Create the `StdVec<T>` from the alloca
+        # We still use `i1` as the vector element type if the
+        # original list was of booleans.
+        vecTy = elemTy if not isBool else self.getIntegerType(1)
         if cc.PointerType.isinstance(vecTy):
             vecTy = cc.PointerType.getElementType(vecTy)
 
@@ -655,6 +669,10 @@ def __copyVectorAndCastElements(self,
         if (sourceEleType == targetEleType):
             return sourcePtr
 
+        isSourceBool = sourceEleType == self.getIntegerType(1)
+        if isSourceBool:
+            sourceEleType = self.getIntegerType(8)
+
         sourceArrType = cc.ArrayType.get(sourceEleType)
         sourceElePtrTy = cc.PointerType.get(sourceEleType)
         sourceArrElePtrTy = cc.PointerType.get(sourceArrType)
@@ -662,10 +680,16 @@ def __copyVectorAndCastElements(self,
         sourceDataPtr = cc.StdvecDataOp(sourceArrElePtrTy, sourceValue).result
         sourceSize = cc.StdvecSizeOp(self.getIntegerType(), sourceValue).result
 
+        isTargetBool = targetEleType == self.getIntegerType(1)
+        # Vector type reflects the true type, including `i1`
+        targetVecTy = cc.StdvecType.get(targetEleType)
+
+        if isTargetBool:
+            targetEleType = self.getIntegerType(8)
+
         targetElePtrType = cc.PointerType.get(targetEleType)
         targetTy = cc.ArrayType.get(targetEleType)
         targetArrElePtrTy = cc.PointerType.get(targetTy)
-        targetVecTy = cc.StdvecType.get(targetEleType)
         targetPtr = cc.AllocaOp(targetArrElePtrTy,
                                 TypeAttr.get(targetEleType),
                                 seqSize=sourceSize).result
@@ -681,6 +705,7 @@ def bodyBuilder(iterVar):
                                                  allowDemotion=allowDemotion)
             targetEleAddr = cc.ComputePtrOp(targetElePtrType, targetPtr,
                                             [iterVar], rawIndex).result
+            assert cc.PointerType.get(targetEleType) == targetEleAddr.type
             cc.StoreOp(castedEle, targetEleAddr)
 
         self.createInvariantForLoop(sourceSize, bodyBuilder)
@@ -777,15 +802,26 @@ def __load_vector_element(self, vector, index):
             MLIR Value containing the loaded element
         """
         if cc.StdvecType.isinstance(vector.type):
+            elem_ty = cc.StdvecType.getElementType(vector.type)
+            is_bool = elem_ty == self.getIntegerType(1)
+            # Vectors of `i1` are a special case (compare C++ std::vector<bool>):
+            # the element type is a single bit, but the underlying data buffer
+            # is actually an array of `i8` values.
+            if is_bool:
+                # `i1` elements are stored as `i8` in the underlying array.
+                elem_ty = self.getIntegerType(8)
             data_ptr = cc.StdvecDataOp(
                 cc.PointerType.get(
-                    cc.ArrayType.get(cc.StdvecType.getElementType(
-                        vector.type))), vector).result
-            return cc.LoadOp(
+                    cc.ArrayType.get(elem_ty)), vector).result
+            load_val = cc.LoadOp(
                 cc.ComputePtrOp(
-                    cc.PointerType.get(cc.StdvecType.getElementType(
-                        vector.type)), data_ptr, [index],
+                    cc.PointerType.get(elem_ty), data_ptr, [index],
                     DenseI32ArrayAttr.get([kDynamicPtrIndex]))).result
+            if is_bool:
+                # Cast back to `i1` if the original vector element type was `i1`.
+                load_val = self.changeOperandToType(self.getIntegerType(1),
+                                                   load_val)
+            return load_val
         return cc.LoadOp(
             cc.ComputePtrOp(
                 cc.PointerType.get(
@@ -1405,6 +1441,7 @@ def process_assignment(target, value):
                     # We should allocate and store
                     alloca = cc.AllocaOp(cc.PointerType.get(value.type),
                                          TypeAttr.get(value.type)).result
+                    assert cc.PointerType.get(value.type) == alloca.type
                     cc.StoreOp(value, alloca)
                     return target, alloca
 
@@ -1438,6 +1475,8 @@ def process_assignment(target, value):
                 # Visit the value being assigned
                 self.visit(node.value)
                 valueToStore = self.popValue()
+                # Cast if necessary
+                valueToStore = self.changeOperandToType(ptrEleType, valueToStore)
                 # Store the value
                 cc.StoreOp(valueToStore, ptrVal)
                 return target.value, None
@@ -1460,6 +1499,8 @@ def process_assignment(target, value):
                 # Visit the value being assigned
                 self.visit(node.value)
                 valueToStore = self.popValue()
+                # Cast if necessary
+                valueToStore = self.changeOperandToType(cc.PointerType.getElementType(ptrVal.type), valueToStore)
                 # Store the value
                 cc.StoreOp(valueToStore, ptrVal)
                 return target.value, None
@@ -1771,6 +1812,26 @@ def processFunctionCall(fType, nrValsToPop):
                 func.CallOp(otherKernel, values)
             else:
                 result = func.CallOp(otherKernel, values).result
+                # Copy to stack if necessary
+                if cc.StdvecType.isinstance(result.type):
+                    elemTy = cc.StdvecType.getElementType(result.type)
+                    if elemTy == self.getIntegerType(1):
+                        elemTy = self.getIntegerType(8)
+                    data = cc.StdvecDataOp(cc.PointerType.get(elemTy), result).result
+                    i64Ty = self.getIntegerType(64)
+                    length = cc.StdvecSizeOp(i64Ty, result).result
+                    elemSize = cc.SizeOfOp(i64Ty, TypeAttr.get(elemTy)).result
+                    buffer = cc.AllocaOp(cc.PointerType.get(cc.ArrayType.get(elemTy)), TypeAttr.get(elemTy), seqSize=length).result
+                    i8PtrTy = cc.PointerType.get(self.getIntegerType(8))
+                    cbuffer = cc.CastOp(i8PtrTy, buffer).result
+                    cdata = cc.CastOp(i8PtrTy, data).result
+                    symName = '__nvqpp_vectorCopyToStack'
+                    load_intrinsic(self.module, symName)
+                    sizeInBytes = arith.MulIOp(length, elemSize).result
+                    func.CallOp([], symName, [cbuffer, cdata, sizeInBytes])
+                    # Replace result with the stack buffer-backed vector
+                    result = cc.StdvecInitOp(result.type, buffer, length=length).result                
+                
                 self.pushValue(result)
 
         def checkControlAndTargetTypes(controls, targets):
@@ -2350,6 +2411,15 @@ def bodyBuilder(iterVal):
                     # kernel registry correctly for the next conditional check
                     if var.name in globalKernelRegistry:
                         node.func.id = var.name
+                # Check generic callable objects that may be C++ `qkernel` (with its MLIR code registered)
+                elif hasattr(var, '__call__'):
+                    # Check if this is a registered C++ kernel 
+                    maybeKernelName = getInteropKernelNameIfFound(var, self.module)
+                    if maybeKernelName != None:
+                        otherKernel = SymbolTable(
+                            self.module.operation)[maybeKernelName]
+                        processFunctionCall(otherKernel.type, len(node.args))
+                        return
 
             if node.func.id in globalKernelRegistry:
                 # If in `globalKernelRegistry`, it has to be in this Module
@@ -2431,8 +2501,10 @@ def bodyBuilder(iterVal):
                     for _, v in annotations.items()
                 ]
 
+                unnamed_struct = "__repr__" not in cls.__dict__
+                struct_name = node.func.id if not unnamed_struct else ""
                 structTy = mlirTryCreateStructType(structTys,
-                                                   name=node.func.id,
+                                                   name=struct_name,
                                                    context=self.ctx)
                 if structTy is None:
                     self.emitFatalError(
@@ -2474,7 +2546,6 @@ def bodyBuilder(iterVal):
                     cc.StoreOp(ctorArgs[i], eleAddr)
                 self.pushValue(stackSlot)
                 return
-
             else:
                 self.emitFatalError(
                     "unhandled function call - {}, known kernels are {}".format(
@@ -2915,6 +2986,30 @@ def bodyBuilder(iterVal):
                         quake.ComputeActionOp(compute, action)
                         return
 
+                    if node.func.attr == 'to_integer':
+                        boolVec = self.popValue()
+                        boolVec = self.ifPointerThenLoad(boolVec)
+                        if not cc.StdvecType.isinstance(boolVec.type):
+                            self.emitFatalError(
+                                "to_integer expects a vector of booleans. Got type {}".format(
+                                    boolVec.type),
+                                node)
+                        elemTy = cc.StdvecType.getElementType(boolVec.type)
+                        if elemTy != self.getIntegerType(1):
+                            self.emitFatalError(
+                                "to_integer expects a vector of booleans. Got type {}".format(
+                                    boolVec.type),
+                                node)
+                        cudaqConvertToInteger = "__nvqpp_cudaqConvertToInteger"
+                        # Load the intrinsic
+                        load_intrinsic(self.module, cudaqConvertToInteger)
+                        # Signature:
+                        # `func.func private @__nvqpp_cudaqConvertToInteger(%arg : !cc.stdvec<i1>) -> i64`
+                        resultTy = self.getIntegerType(64)
+                        result = func.CallOp([resultTy], cudaqConvertToInteger, [boolVec]).result
+                        self.pushValue(result)
+                        return
+
                     self.emitFatalError(
                         f'Invalid function or class type requested from the cudaq module ({node.func.attr})',
                         node)
@@ -3453,6 +3548,11 @@ def get_item_type(pyval):
         listElemTy = get_item_type(node.elt)
         if listElemTy is None:
             return
+        
+        resultVecTy = cc.StdvecType.get(listElemTy)
+        isBool = listElemTy == self.getIntegerType(1)
+        if isBool:
+            listElemTy = self.getIntegerType(8)
         listTy = cc.ArrayType.get(listElemTy)
         listValue = cc.AllocaOp(cc.PointerType.get(listTy),
                                 TypeAttr.get(listElemTy),
@@ -3482,12 +3582,15 @@ def bodyBuilder(iterVar):
             listValueAddr = cc.ComputePtrOp(
                 cc.PointerType.get(listElemTy), listValue, [iterVar],
                 DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx))
+            
+            if isBool:
+                result = self.changeOperandToType(self.getIntegerType(8), result)
             cc.StoreOp(result, listValueAddr)
             self.symbolTable.popScope()
 
         self.createInvariantForLoop(iterableSize, bodyBuilder)
         self.pushValue(
-            cc.StdvecInitOp(cc.StdvecType.get(listElemTy),
+            cc.StdvecInitOp(resultVecTy,
                             listValue,
                             length=iterableSize).result)
         return
@@ -3679,6 +3782,9 @@ def fix_negative_idx(idx, get_size):
                                    upper=upperVal).result)
             elif cc.StdvecType.isinstance(var.type):
                 eleTy = cc.StdvecType.getElementType(var.type)
+                isBool = eleTy == self.getIntegerType(1)
+                if isBool:
+                    eleTy = self.getIntegerType(8)
                 ptrTy = cc.PointerType.get(eleTy)
                 arrTy = cc.ArrayType.get(eleTy)
                 ptrArrTy = cc.PointerType.get(arrTy)
@@ -3722,6 +3828,9 @@ def fix_negative_idx(idx, get_size):
         if cc.StdvecType.isinstance(var.type):
             idx = fix_negative_idx(idx, lambda: get_size(var))
             eleTy = cc.StdvecType.getElementType(var.type)
+            isBool = eleTy == self.getIntegerType(1)
+            if isBool:
+                eleTy = self.getIntegerType(8)
             elePtrTy = cc.PointerType.get(eleTy)
             arrTy = cc.ArrayType.get(eleTy)
             ptrArrTy = cc.PointerType.get(arrTy)
@@ -3733,7 +3842,10 @@ def fix_negative_idx(idx, get_size):
             if self.subscriptPushPointerValue:
                 self.pushValue(eleAddr)
                 return
-            self.pushValue(cc.LoadOp(eleAddr).result)
+            val = cc.LoadOp(eleAddr).result
+            if isBool:
+                val = self.changeOperandToType(self.getIntegerType(1), val)
+            self.pushValue(val)
             return
 
         if cc.PointerType.isinstance(var.type):
@@ -3960,7 +4072,9 @@ def functor(iter, idx):
                 iterEleTy = cc.StdvecType.getElementType(iterable.type)
                 totalSize = cc.StdvecSizeOp(self.getIntegerType(),
                                             iterable).result
-
+                isBool = iterEleTy == self.getIntegerType(1)
+                if isBool:
+                    iterEleTy = self.getIntegerType(8)
                 def functor(iter, idxVal):
                     elePtrTy = cc.PointerType.get(iterEleTy)
                     arrTy = cc.ArrayType.get(iterEleTy)
@@ -3970,7 +4084,10 @@ def functor(iter, idxVal):
                         elePtrTy, vecPtr, [idxVal],
                         DenseI32ArrayAttr.get([kDynamicPtrIndex],
                                               context=self.ctx)).result
-                    return cc.LoadOp(eleAddr).result
+                    result = cc.LoadOp(eleAddr).result
+                    if isBool:
+                        result = self.changeOperandToType(self.getIntegerType(1), result)
+                    return result
 
                 extractFunctor = functor
 
diff --git a/python/cudaq/kernel/kernel_decorator.py b/python/cudaq/kernel/kernel_decorator.py
index 799117a07bc..ad571d89393 100644
--- a/python/cudaq/kernel/kernel_decorator.py
+++ b/python/cudaq/kernel/kernel_decorator.py
@@ -21,7 +21,7 @@
 from .captured_data import CapturedDataStorage
 from .utils import (emitFatalError, emitErrorIfInvalidPauli, globalAstRegistry,
                     globalRegisteredTypes, mlirTypeFromPyType, mlirTypeToPyType,
-                    nvqppPrefix)
+                    nvqppPrefix, getInteropKernelNameIfFound)
 
 # This file implements the decorator mechanism needed to
 # JIT compile CUDA-Q kernels. It exposes the cudaq.kernel()
@@ -451,6 +451,20 @@ def __convertStringsToPauli__(self, arg):
 
         return arg
 
+    def getCallableNames(self, *args):
+        callableNames = []
+        for arg in args:
+            if isinstance(arg, PyKernelDecorator):
+                callableNames.append(arg.name)
+            else:
+                if hasattr(arg, '__call__'):
+                    maybeKernelName = getInteropKernelNameIfFound(arg, self.module)
+                    if maybeKernelName != None:
+                        # Remove "__nvqpp__mlirgen__" prefix when packing the list of callables
+                        maybeKernelName = maybeKernelName.replace("__nvqpp__mlirgen__", "")
+                        callableNames.append(maybeKernelName)
+        return callableNames
+    
     def __call__(self, *args):
         """
         Invoke the CUDA-Q kernel. JIT compilation of the kernel AST to MLIR 
@@ -481,7 +495,8 @@ def __call__(self, *args):
             mlirType = mlirTypeFromPyType(type(arg),
                                           self.module.context,
                                           argInstance=arg,
-                                          argTypeToCompareTo=self.argTypes[i])
+                                          argTypeToCompareTo=self.argTypes[i],
+                                          module=self.module)
 
             if self.isCastablePyType(mlirType, self.argTypes[i]):
                 processedArgs.append(
@@ -496,19 +511,30 @@ def __call__(self, *args):
                 )
 
             if cc.CallableType.isinstance(mlirType):
-                # Assume this is a PyKernelDecorator
-                callableNames.append(arg.name)
-                # It may be that the provided input callable kernel
-                # is not currently in the ModuleOp. Need to add it
-                # if that is the case, we have to use the AST
-                # so that it shares self.module's MLIR Context
-                symbols = SymbolTable(self.module.operation)
-                if nvqppPrefix + arg.name not in symbols:
-                    tmpBridge = PyASTBridge(self.capturedDataStorage,
-                                            existingModule=self.module,
-                                            disableEntryPointTag=True)
-                    tmpBridge.visit(globalAstRegistry[arg.name][0])
-
+                if isinstance(arg, PyKernelDecorator):
+                    # The argument is a PyKernelDecorator
+                    callableNames.append(arg.name)
+                    # It may be that the provided input callable kernel
+                    # is not currently in the ModuleOp. If that is the
+                    # case, we need to add it; we have to use the AST
+                    # so that it shares self.module's MLIR Context.
+                    symbols = SymbolTable(self.module.operation)
+                    if nvqppPrefix + arg.name not in symbols:
+                        tmpBridge = PyASTBridge(self.capturedDataStorage,
+                                                existingModule=self.module,
+                                                disableEntryPointTag=True)
+                        tmpBridge.visit(globalAstRegistry[arg.name][0])
+                else:
+                    if hasattr(arg, '__call__'):
+                        maybeKernelName = getInteropKernelNameIfFound(arg, self.module)
+                        if maybeKernelName != None:   
+                            # Remove "__nvqpp__mlirgen__" prefix
+                            maybeKernelName = maybeKernelName.replace("__nvqpp__mlirgen__", "")
+                            callableNames.append(maybeKernelName)
+                    else:
+                        emitFatalError(
+                            "Invalid callable argument provided to kernel."
+                        )
             # Convert `numpy` arrays to lists
             if cc.StdvecType.isinstance(mlirType) and hasattr(arg, "tolist"):
                 if arg.ndim != 1:
diff --git a/python/cudaq/kernel/utils.py b/python/cudaq/kernel/utils.py
index efaf213b581..e7f447be516 100644
--- a/python/cudaq/kernel/utils.py
+++ b/python/cudaq/kernel/utils.py
@@ -15,8 +15,8 @@
 import types
 
 from cudaq.mlir._mlir_libs._quakeDialects import cudaq_runtime
-from cudaq.mlir.dialects import quake, cc
-from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType
+from cudaq.mlir.dialects import quake, cc, func
+from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType, SymbolTable
 
 State = cudaq_runtime.State
 qvector = cudaq_runtime.qvector
@@ -119,8 +119,9 @@ def isQuantumType(ty):
     if numQuantumMembers != len(mlirEleTypes) or \
         any((quake.StruqType.isinstance(t) for t in mlirEleTypes)):
         return None
-    return quake.StruqType.getNamed(name, mlirEleTypes, context=context)
-
+    if len(name) > 0:
+        return quake.StruqType.getNamed(name, mlirEleTypes, context=context)
+    return quake.StruqType.get(mlirEleTypes, context=context)
 
 def mlirTypeFromAnnotation(annotation, ctx, raiseError=False):
     """
@@ -284,6 +285,7 @@ def emitFatalErrorOverride(msg):
                     f"Adding new fields in data classes is not yet supported. The dataclass must be declared with @dataclass(slots=True) or @dataclasses.dataclass(slots=True)."
                 )
 
+            unnamed_struct = "__repr__" not in pyType.__dict__
             if len({
                     k: v
                     for k, v in pyType.__dict__.items()
@@ -293,7 +295,8 @@ def emitFatalErrorOverride(msg):
                 localEmitFatalError(
                     'struct types with user specified methods are not allowed.')
 
-            tupleTy = mlirTryCreateStructType(structTys, name=id)
+            struct_name = id if not unnamed_struct else ""
+            tupleTy = mlirTryCreateStructType(structTys, name=struct_name) 
             if tupleTy is None:
                 localEmitFatalError(
                     "Hybrid quantum-classical data types and nested quantum structs are not allowed."
@@ -442,7 +445,19 @@ def mlirTypeFromPyType(argType, ctx, **kwargs):
     if 'argInstance' in kwargs:
         argInstance = kwargs['argInstance']
         if isinstance(argInstance, Callable):
-            return cc.CallableType.get(argInstance.argTypes, ctx)
+            if hasattr(argInstance, 'argTypes'):
+                return cc.CallableType.get(argInstance.argTypes, ctx)
+            elif "module" in kwargs and hasattr(argInstance, '__call__'):
+                # This is a callable object, check if it's a C++ `qkernel`
+                maybeKernelName = getInteropKernelNameIfFound(argInstance, kwargs['module'])
+                if maybeKernelName != None:
+                    otherKernel = SymbolTable(
+                        kwargs['module'].operation)[maybeKernelName]
+                    if isinstance(otherKernel, func.FuncOp):
+                        argTypes = []
+                        for arg in otherKernel.arguments:
+                            argTypes.append(arg.type)
+                        return cc.CallableType.get(argTypes, ctx)
 
     for name in globalRegisteredTypes.classes:
         customTy, memberTys = globalRegisteredTypes.getClassAttributes(name)
@@ -557,6 +572,27 @@ def mlirTypeToPyType(argType):
     emitFatalError(
         f"Cannot infer python type from provided CUDA-Q type ({argType})")
 
+def getInteropKernelNameIfFound(pyFunc, module):
+    """
+    Given a Python function and an MLIR module, check if the function
+    is registered as an interop kernel. If so, return the kernel name.
+    Otherwise, return None.
+    """
+    if not callable(pyFunc):
+        emitFatalError(
+            f"Provided argument is not a callable function ({pyFunc})"
+        )
+
+    modulePath = str(pyFunc.__module__) if hasattr(pyFunc, '__module__') else ''
+    funcName = str(pyFunc.__name__) if hasattr(pyFunc, '__name__') else ''
+    # Look up key
+    devKey = f"{modulePath}.{funcName}"
+    if cudaq_runtime.isRegisteredDeviceModule(devKey):
+        maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(module, devKey)
+        if maybeKernelName != None:
+            return maybeKernelName
+
+    return None
 
 def emitErrorIfInvalidPauli(pauliArg):
     """
diff --git a/python/runtime/cudaq/algorithms/py_run.cpp b/python/runtime/cudaq/algorithms/py_run.cpp
index 8cd38a7295b..ef59b14f461 100644
--- a/python/runtime/cudaq/algorithms/py_run.cpp
+++ b/python/runtime/cudaq/algorithms/py_run.cpp
@@ -39,7 +39,8 @@ static std::vector<py::object> readRunResults(mlir::ModuleOp module,
 }
 
 static std::tuple<std::string, MlirModule, OpaqueArguments *,
-                  mlir::func::FuncOp, std::string, mlir::func::FuncOp>
+                  mlir::func::FuncOp, std::string, mlir::func::FuncOp,
+                  std::vector<std::string>>
 getKernelLaunchParameters(py::object &kernel, py::args args) {
   if (!py::hasattr(kernel, "arguments"))
     throw std::runtime_error(
@@ -52,6 +53,11 @@ getKernelLaunchParameters(py::object &kernel, py::args args) {
   if (py::hasattr(kernel, "compile"))
     kernel.attr("compile")();
 
+  std::vector<std::string> callableNames;
+  if (py::hasattr(kernel, "getCallableNames"))
+    callableNames =
+        kernel.attr("getCallableNames")(*args).cast<std::vector<std::string>>();
+
   auto origKernName = kernel.attr("name").cast<std::string>();
   auto kernelName = origKernName + ".run";
   if (!py::hasattr(kernel, "module") || kernel.attr("module").is_none())
@@ -78,7 +84,8 @@ getKernelLaunchParameters(py::object &kernel, py::args args) {
   }
   auto *argData = toOpaqueArgs(args, kernelMod, kernelName);
   auto funcOp = getKernelFuncOp(kernelMod, kernelName);
-  return {kernelName, kernelMod, argData, funcOp, origKernName, origKern};
+  return {kernelName,   kernelMod, argData,      funcOp,
+          origKernName, origKern,  callableNames};
 }
 
 static details::RunResultSpan
@@ -86,6 +93,7 @@ pyRunTheKernel(const std::string &name, const std::string &origName,
                MlirModule module, mlir::func::FuncOp funcOp,
                mlir::func::FuncOp origKernel, OpaqueArguments &runtimeArgs,
                quantum_platform &platform, std::size_t shots_count,
+               const std::vector<std::string> &callableNames,
                std::size_t qpu_id = 0) {
   auto returnTypes = origKernel.getResultTypes();
   if (returnTypes.empty() || returnTypes.size() > 1)
@@ -93,21 +101,24 @@ pyRunTheKernel(const std::string &name, const std::string &origName,
         "`cudaq.run` only supports kernels that return a value.");
 
   auto returnTy = returnTypes[0];
-  // Disallow returning list / vectors from entry-point kernels.
-  if (returnTy.isa<cc::StdvecType>()) {
-    throw std::runtime_error("`cudaq.run` does not yet support returning "
-                             "`list` from entry-point kernels.");
+  // Disallow returning nested vectors from entry-point kernels.
+  if (auto vecTy = dyn_cast<cudaq::cc::StdvecType>(returnTy)) {
+    auto elemTy = vecTy.getElementType();
+    if (elemTy.isa<cudaq::cc::StdvecType>())
+      throw std::runtime_error(
+          "`cudaq.run` does not yet support returning nested `list` from "
+          "entry-point kernels.");
   }
 
   auto mod = unwrap(module);
 
-  auto [rawArgs, size, returnOffset, thunk] =
-      pyAltLaunchKernelBase(name, module, returnTy, runtimeArgs, {}, 0, false);
+  auto [rawArgs, size, returnOffset, thunk] = pyAltLaunchKernelBase(
+      name, module, returnTy, runtimeArgs, callableNames, 0, false);
 
   auto results = details::runTheKernel(
       [&]() mutable {
         pyLaunchKernel(name, thunk, mod, runtimeArgs, rawArgs, size,
-                       returnOffset, {});
+                       returnOffset, callableNames);
       },
       platform, name, origName, shots_count, qpu_id);
 
@@ -133,7 +144,7 @@ std::vector<py::object> pyRun(py::object &kernel, py::args args,
   if (shots_count == 0)
     return {};
 
-  auto [name, module, argData, func, origName, origKern] =
+  auto [name, module, argData, func, origName, origKern, callableNames] =
       getKernelLaunchParameters(kernel, args);
 
   auto mod = unwrap(module);
@@ -149,7 +160,7 @@ std::vector<py::object> pyRun(py::object &kernel, py::args args,
   }
 
   auto span = pyRunTheKernel(name, origName, module, func, origKern, *argData,
-                             platform, shots_count);
+                             platform, shots_count, callableNames);
   delete argData;
   auto results = pyReadResults(span, module, func, origKern, shots_count);
 
@@ -184,7 +195,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args,
                              ") exceeds the number of available QPUs (" +
                              std::to_string(numQPUs) + ")");
 
-  auto [name, module, argData, func, origName, origKern] =
+  auto [name, module, argData, func, origName, origKern, callableNames] =
       getKernelLaunchParameters(kernel, args);
 
   auto mod = unwrap(module);
@@ -219,7 +230,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args,
     QuantumTask wrapped = detail::make_copyable_function(
         [sp = std::move(spanPromise), ep = std::move(errorPromise), shots_count,
          qpu_id, argData, name, module, func, origKern, origName,
-         noise_model = std::move(noise_model)]() mutable {
+         noise_model = std::move(noise_model), callableNames]() mutable {
           auto &platform = get_platform();
 
           // Launch the kernel in the appropriate context.
@@ -227,8 +238,9 @@ async_run_result pyRunAsync(py::object &kernel, py::args args,
             platform.set_noise(&noise_model.value());
 
           try {
-            auto span = pyRunTheKernel(name, origName, module, func, origKern,
-                                       *argData, platform, shots_count, qpu_id);
+            auto span =
+                pyRunTheKernel(name, origName, module, func, origKern, *argData,
+                               platform, shots_count, callableNames, qpu_id);
             delete argData;
             sp.set_value(span);
             ep.set_value("");
diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
index 9db3e9e431f..5543a54b37d 100644
--- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
+++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp
@@ -117,8 +117,21 @@ OpaqueArguments *toOpaqueArgs(py::args &args, MlirModule mod,
   auto *argData = new cudaq::OpaqueArguments();
   args = simplifiedValidateInputArguments(args);
   setDataLayout(mod);
-  cudaq::packArgs(*argData, args, kernelFunc,
-                  [](OpaqueArguments &, py::object &) { return false; });
+  auto callableArgHandler = [](cudaq::OpaqueArguments &argData,
+                               py::object &arg) {
+    if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) {
+      // Just give it some dummy data that will not be used.
+      // We synthesize away all callables, the block argument
+      // remains but it is not used, so just give argsCreator
+      // something, and we'll make sure it's cleaned up.
+      long *ourAllocatedArg = new long();
+      argData.emplace_back(ourAllocatedArg,
+                           [](void *ptr) { delete static_cast<long *>(ptr); });
+      return true;
+    }
+    return false;
+  };
+  cudaq::packArgs(*argData, args, kernelFunc, callableArgHandler);
   return argData;
 }
 
@@ -157,7 +170,6 @@ ExecutionEngine *jitKernel(const std::string &name, MlirModule module,
     pm.addPass(cudaq::opt::createGenerateKernelExecution(
         {.startingArgIdx = startingArgIdx}));
     pm.addPass(cudaq::opt::createGenerateDeviceCodeLoader({.jitTime = true}));
-    pm.addPass(cudaq::opt::createReturnToOutputLog());
     pm.addPass(cudaq::opt::createLambdaLiftingPass());
     pm.addPass(cudaq::opt::createDistributedDeviceCall());
     std::string tl = getTransportLayer();
@@ -947,7 +959,7 @@ void bindAltLaunchKernel(py::module &mod,
 
   auto callableArgHandler = [](cudaq::OpaqueArguments &argData,
                                py::object &arg) {
-    if (py::hasattr(arg, "module")) {
+    if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) {
       // Just give it some dummy data that will not be used.
       // We synthesize away all callables, the block argument
       // remains but it is not used, so just give argsCreator
diff --git a/python/runtime/interop/PythonCppInterop.h b/python/runtime/interop/PythonCppInterop.h
index 9b39aada636..c74a8ec2872 100644
--- a/python/runtime/interop/PythonCppInterop.h
+++ b/python/runtime/interop/PythonCppInterop.h
@@ -7,6 +7,8 @@
  ******************************************************************************/
 #pragma once
 
+#include "cudaq/qis/qkernel.h"
+#include "cudaq/utils/registry.h"
 #include <pybind11/pybind11.h>
 
 namespace py = pybind11;
@@ -166,4 +168,28 @@ void addDeviceKernelInterop(py::module_ &m, const std::string &modName,
                                       kernelName, mangledArgs);
   return;
 }
+
+// Expose a C++ `qkernel` to Python as a registered device kernel.
+template <typename R, typename... Args>
+py::object convertQkernel(py::module_ &m, cudaq::qkernel<R(Args...)> &qkernel,
+                          const std::string &docstring = "") {
+  const char *qkernelName = cudaq::registry::getLinkableKernelNameOrNull(
+      cudaq::registry::__cudaq_getLinkableKernelKey(&qkernel));
+  if (!qkernelName)
+    throw std::runtime_error(
+        "Invalid `qkernel` passed, could not find registered kernel.");
+  std::string kernelName = qkernelName;
+  // Remove the "function_" prefix if it exists
+  if (kernelName.starts_with("function_"))
+    kernelName = kernelName.substr(std::string("function_").length());
+  const std::string docStr =
+      docstring.empty()
+          ? "Auto-generated kernel from C++ " + kernelName + " qkernel."
+          : docstring;
+  m.def(
+      kernelName.c_str(), [](Args...) {}, docStr.c_str());
+  cudaq::python::registerDeviceKernel(m.attr("__name__").cast<std::string>(),
+                                      kernelName, "");
+  return m.attr(kernelName.c_str());
+}
 } // namespace cudaq::python
diff --git a/python/tests/interop/quantum_lib/quantum_lib.cpp b/python/tests/interop/quantum_lib/quantum_lib.cpp
index 7c5cbb23054..ed313b32f88 100644
--- a/python/tests/interop/quantum_lib/quantum_lib.cpp
+++ b/python/tests/interop/quantum_lib/quantum_lib.cpp
@@ -29,4 +29,12 @@ __qpu__ void qft(cudaq::qview<> qubits, const std::vector<double> &x,
 __qpu__ void another(cudaq::qview<> qubits, std::size_t i) { x(qubits[i]); }
 
 __qpu__ void uccsd(cudaq::qview<> qubits, std::size_t) { h(qubits[0]); }
+
+__qpu__ void reset_group(patch p) {
+  for (std::size_t i = 0; i < p.data.size(); i++)
+    reset(p.data[i]);
+}
+
+__qpu__ void x_group(patch p) { x(p.data); }
+
 } // namespace cudaq
diff --git a/python/tests/interop/quantum_lib/quantum_lib.h b/python/tests/interop/quantum_lib/quantum_lib.h
index a0655099237..4b9fa371351 100644
--- a/python/tests/interop/quantum_lib/quantum_lib.h
+++ b/python/tests/interop/quantum_lib/quantum_lib.h
@@ -9,6 +9,12 @@
 
 #include "cudaq/qis/qubit_qis.h"
 
+// Custom data structure
+struct patch {
+  cudaq::qview<> data;
+  cudaq::qview<> aux;
+};
+
 namespace cudaq {
 void entryPoint(const std::function<void(cudaq::qvector<> &)> &statePrep);
 
@@ -19,4 +25,8 @@ void another(cudaq::qview<> qubits, std::size_t);
 
 void uccsd(cudaq::qview<> qubits, std::size_t);
 
+void reset_group(patch p);
+
+void x_group(patch p);
+
 } // namespace cudaq
diff --git a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp
index 4ea2d2176cc..92f1382f2a8 100644
--- a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp
+++ b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp
@@ -8,6 +8,7 @@
 
 #include "cudaq.h"
 #include "cudaq/algorithms/sample.h"
+#include "cudaq/qis/qkernel.h"
 #include "quantum_lib/quantum_lib.h"
 #include "runtime/interop/PythonCppInterop.h"
 #include <pybind11/pybind11.h>
@@ -15,6 +16,22 @@
 
 namespace py = pybind11;
 
+namespace {
+static std::unordered_map<std::string,
+                          cudaq::qkernel<void(cudaq::qview<>, std::size_t)>>
+    g_cppKernels_1;
+
+static std::unordered_map<std::string, cudaq::qkernel<void(patch)>>
+    g_cppKernels_2;
+
+static const bool initKernels = []() {
+  g_cppKernels_1.insert(std::make_pair("uccsd", cudaq::uccsd));
+  g_cppKernels_2.insert(std::make_pair("reset", cudaq::reset_group));
+  g_cppKernels_2.insert(std::make_pair("x", cudaq::x_group));
+  return true;
+}();
+} // namespace
+
 PYBIND11_MODULE(cudaq_test_cpp_algo, m) {
 
   m.def("test_cpp_qalgo", [](py::object statePrepIn) {
@@ -49,4 +66,26 @@ PYBIND11_MODULE(cudaq_test_cpp_algo, m) {
 
   cudaq::python::addDeviceKernelInterop<cudaq::qview<>, std::size_t>(
       m, "qstd", "uccsd", "");
+
+  // Convert the C++ kernel registry to Python-accessible kernels
+  auto interopSubMod = m.def_submodule("_cpp_interop_kernels");
+  static std::unordered_map<std::string, py::object> g_py_kernels;
+
+  for (auto &[name, kernel] : g_cppKernels_1) {
+    g_py_kernels.insert(std::make_pair(
+        name, cudaq::python::convertQkernel(interopSubMod, kernel)));
+  }
+
+  for (auto &[name, kernel] : g_cppKernels_2) {
+    g_py_kernels.insert(std::make_pair(
+        name, cudaq::python::convertQkernel(interopSubMod, kernel)));
+  }
+
+  m.def("get_cpp_kernel", [](const std::string &name) {
+    auto it = g_py_kernels.find(name);
+    if (it == g_py_kernels.end())
+      throw std::runtime_error("No C++ kernel registered for requested name.");
+
+    return it->second;
+  });
 }
diff --git a/python/tests/interop/test_interop.py b/python/tests/interop/test_interop.py
index e63588408ac..4324e79c02c 100644
--- a/python/tests/interop/test_interop.py
+++ b/python/tests/interop/test_interop.py
@@ -7,6 +7,8 @@
 # ============================================================================ #
 
 import cudaq, pytest
+from typing import Callable
+from dataclasses import dataclass
 
 cudaq_test_cpp_algo = pytest.importorskip('cudaq_test_cpp_algo')
 
@@ -242,3 +244,79 @@ def entry():
         takesCapture(spin)
 
     entry.compile()
+
+
+def test_cpp_qkernel():
+    # Test the `qkernel` provided in C++ via a map-like registry.
+    # This is provided as a function-like callable.
+    kernel_from_cpp_registry = cudaq_test_cpp_algo.get_cpp_kernel("uccsd")
+
+    # Use as a capture
+    @cudaq.kernel
+    def cpp_qkernel():
+        q = cudaq.qvector(4)
+        kernel_from_cpp_registry(q, 0)
+
+    cpp_qkernel()
+
+    # Use as a callable argument
+    @cudaq.kernel
+    def caller(k: Callable[[cudaq.qview, int], None]):
+        q = cudaq.qvector(4)
+        k(q, 0)
+
+    caller(kernel_from_cpp_registry)
+
+
+def test_cpp_custom_struct():
+    # Define a struct in Python that matches the C++ struct
+    # Note: use `repr=False` to annotate that this is an unnamed struct.
+    # This will maintain compatibility with C++ structs that do not have
+    # a name.
+    @dataclass(slots=True, repr=False)
+    class patch:
+        data: cudaq.qvector
+        aux: cudaq.qvector
+
+    reset_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("reset")
+    x_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("x")
+
+    # Use as a capture
+    @cudaq.kernel
+    def cpp_qkernel_struct():
+        q = cudaq.qvector(4)
+        r = cudaq.qvector(2)
+        x(q)
+        reset_qkernel(patch(q, r))
+
+    counts = cudaq.sample(cpp_qkernel_struct)
+    counts.dump()
+    assert len(counts) == 1 and '000000' in counts
+
+    @cudaq.kernel
+    def cpp_qkernel_struct_x():
+        q = cudaq.qvector(4)
+        r = cudaq.qvector(2)
+        x_qkernel(patch(q, r))
+
+    counts = cudaq.sample(cpp_qkernel_struct_x)
+    counts.dump()
+    assert len(counts) == 1 and '111100' in counts
+
+    # Callable
+    @cudaq.kernel
+    def cpp_qkernel_struct_callable(k: Callable[[patch], None]):
+        q = cudaq.qvector(4)
+        r = cudaq.qvector(2)
+        for i in range(4):
+            if i % 2 == 0:
+                x(q[i])
+        k(patch(q, r))
+
+    counts = cudaq.sample(cpp_qkernel_struct_callable, reset_qkernel)
+    counts.dump()
+    assert len(counts) == 1 and '000000' in counts
+
+    counts = cudaq.sample(cpp_qkernel_struct_callable, x_qkernel)
+    counts.dump()
+    assert len(counts) == 1 and '010100' in counts
diff --git a/python/tests/kernel/test_run_async_kernel.py b/python/tests/kernel/test_run_async_kernel.py
index fc1c0ac3aae..31796c18e15 100644
--- a/python/tests/kernel/test_run_async_kernel.py
+++ b/python/tests/kernel/test_run_async_kernel.py
@@ -14,8 +14,6 @@
 import numpy as np
 import pytest
 
-list_err_msg = 'does not yet support returning `list` from entry-point kernels'
-
 
 def is_close(actual, expected):
     return np.isclose(actual, expected, atol=1e-6)
@@ -338,38 +336,41 @@ def test_return_list_bool():
     def simple_list_bool_no_args() -> list[bool]:
         return [True, False, True]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_bool_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_bool_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
     @cudaq.kernel
     def simple_list_bool(n: int) -> list[bool]:
         qubits = cudaq.qvector(n)
         return [True, False, True]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_bool, 2, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_bool, 2, shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
     @cudaq.kernel
     def simple_list_bool_args(n: int, t: list[bool]) -> list[bool]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_bool_args, 2, [True, False, True]).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_bool_args, 2, [True, False, True], shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
     @cudaq.kernel
     def simple_list_bool_args_no_broadcast(t: list[bool]) -> list[bool]:
         qubits = cudaq.qvector(2)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_bool_args_no_broadcast,
-                        [True, False, True]).get()
-    assert list_err_msg in str(e.value)
-
+    results = cudaq.run_async(simple_list_bool_args_no_broadcast,
+                        [True, False, True], shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
 def test_return_list_int():
 
@@ -377,18 +378,21 @@ def test_return_list_int():
     def simple_list_int_no_args() -> list[int]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_int_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int(n: int, t: list[int]) -> list[int]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int, 2, [-13, 5, 42], shots_count=2).get()
-    assert list_err_msg in str(e.value)
+
+    results = cudaq.run_async(simple_list_int, 2, [-13, 5, 42], shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int8():
@@ -397,18 +401,22 @@ def test_return_list_int8():
     def simple_list_int8_no_args() -> list[np.int8]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int8_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    
+    results = cudaq.run_async(simple_list_int8_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int8(n: int, t: list[np.int8]) -> list[np.int8]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int8, 2, [-13, 5, 42], shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    
+    results = cudaq.run_async(simple_list_int8, 2, [-13, 5, 42], shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int16():
@@ -417,18 +425,20 @@ def test_return_list_int16():
     def simple_list_int16_no_args() -> list[np.int16]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int16_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_int16_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int16(n: int, t: list[np.int16]) -> list[np.int16]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int16, 2, [-13, 5, 42], shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_int16, 2, [-13, 5, 42], shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int32():
@@ -437,18 +447,20 @@ def test_return_list_int32():
     def simple_list_int32_no_args() -> list[np.int32]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int32_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_int32_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int32(n: int, t: list[np.int32]) -> list[np.int32]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int32, 2, [-13, 5, 42], shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_int32, 2, [-13, 5, 42], shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int64():
@@ -457,18 +469,20 @@ def test_return_list_int64():
     def simple_list_int64_no_args() -> list[np.int64]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int64_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_int64_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int64(n: int, t: list[np.int64]) -> list[np.int64]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_int64, 2, [-13, 5, 42], shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_int64, 2, [-13, 5, 42], shots_count=2).get()
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_float():
@@ -477,20 +491,22 @@ def test_return_list_float():
     def simple_list_float_no_args() -> list[float]:
         return [-13.2, 5., 42.99]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_float_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_float_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert np.allclose(results[0], [-13.2, 5., 42.99])
+    assert np.allclose(results[1], [-13.2, 5., 42.99])
 
     @cudaq.kernel
     def simple_list_float(n: int, t: list[float]) -> list[float]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_float,
+    results = cudaq.run_async(simple_list_float,
                         2, [-13.2, 5.0, 42.99],
                         shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    assert len(results) == 2
+    assert np.allclose(results[0], [-13.2, 5., 42.99])
+    assert np.allclose(results[1], [-13.2, 5., 42.99])
 
 
 def test_return_list_float32():
@@ -499,20 +515,22 @@ def test_return_list_float32():
     def simple_list_float32_no_args() -> list[np.float32]:
         return [-13.2, 5., 42.99]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_float32_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_float32_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert np.allclose(results[0], [-13.2, 5., 42.99])
+    assert np.allclose(results[1], [-13.2, 5., 42.99])
 
     @cudaq.kernel
     def simple_list_float32(n: int, t: list[np.float32]) -> list[np.float32]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_float32,
+    results = cudaq.run_async(simple_list_float32,
                         2, [-13.2, 5.0, 42.99],
                         shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    assert len(results) == 2
+    assert np.allclose(results[0], [-13.2, 5., 42.99])
+    assert np.allclose(results[1], [-13.2, 5., 42.99])
 
 
 def test_return_list_float64():
@@ -521,21 +539,22 @@ def test_return_list_float64():
     def simple_list_float64_no_args() -> list[np.float64]:
         return [-13.2, 5., 42.99]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_float64_no_args, shots_count=2).get()
-    assert list_err_msg in str(e.value)
+    results = cudaq.run_async(simple_list_float64_no_args, shots_count=2).get()
+    assert len(results) == 2
+    assert np.allclose(results[0], [-13.2, 5., 42.99])
+    assert np.allclose(results[1], [-13.2, 5., 42.99])
 
     @cudaq.kernel
     def simple_list_float64(n: int, t: list[np.float64]) -> list[np.float64]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run_async(simple_list_float64,
+    results = cudaq.run_async(simple_list_float64,
                         2, [-13.2, 5.0, 42.99],
                         shots_count=2).get()
-    assert list_err_msg in str(e.value)
-
+    assert len(results) == 2
+    assert np.allclose(results[0], [-13.2, 5., 42.99])
+    assert np.allclose(results[1], [-13.2, 5., 42.99])
 
 # Test tuples
 # TODO: Define spec for using tuples in kernels
diff --git a/python/tests/kernel/test_run_kernel.py b/python/tests/kernel/test_run_kernel.py
index 3e656ee16a9..d73b35fa352 100644
--- a/python/tests/kernel/test_run_kernel.py
+++ b/python/tests/kernel/test_run_kernel.py
@@ -14,8 +14,6 @@
 import warnings
 import pytest
 
-list_err_msg = 'does not yet support returning `list` from entry-point kernels'
-
 skipIfBraketNotInstalled = pytest.mark.skipif(
     not (cudaq.has_target("braket")),
     reason='Could not find `braket` in installation')
@@ -333,36 +331,41 @@ def test_return_list_bool():
     def simple_list_bool_no_args() -> list[bool]:
         return [True, False, True]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_bool_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_bool_no_args, shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
     @cudaq.kernel
     def simple_list_bool(n: int) -> list[bool]:
         qubits = cudaq.qvector(n)
         return [True, False, True]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_bool, 2, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_bool, 2, shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
     @cudaq.kernel
     def simple_list_bool_args(n: int, t: list[bool]) -> list[bool]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_bool_args, 2, [True, False, True])
-    assert list_err_msg in str(e.value)
+
+    results = cudaq.run(simple_list_bool_args, 2, [True, False, True], shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
     @cudaq.kernel
     def simple_list_bool_args_no_broadcast(t: list[bool]) -> list[bool]:
         qubits = cudaq.qvector(2)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_bool_args_no_broadcast, [True, False, True])
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_bool_args_no_broadcast, [True, False, True], shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [True, False, True]
+    assert results[1] == [True, False, True]
 
 
 def test_return_list_int():
@@ -371,18 +374,20 @@ def test_return_list_int():
     def simple_list_int_no_args() -> list[int]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int_no_args, shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int(n: int, t: list[int]) -> list[int]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int, 2, [-13, 5, 42], shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int, 2, [-13, 5, 42], shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int8():
@@ -391,18 +396,21 @@ def test_return_list_int8():
     def simple_list_int8_no_args() -> list[np.int8]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int8_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int8_no_args, shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
+
 
     @cudaq.kernel
     def simple_list_int8(n: int, t: list[np.int8]) -> list[np.int8]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int8, 2, [-13, 5, 42], shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int8, 2, [-13, 5, 42], shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int16():
@@ -411,18 +419,21 @@ def test_return_list_int16():
     def simple_list_int16_no_args() -> list[np.int16]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int16_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+
+    results = cudaq.run(simple_list_int16_no_args, shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int16(n: int, t: list[np.int16]) -> list[np.int16]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int16, 2, [-13, 5, 42], shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int16, 2, [-13, 5, 42], shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int32():
@@ -431,18 +442,20 @@ def test_return_list_int32():
     def simple_list_int32_no_args() -> list[np.int32]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int32_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int32_no_args, shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int32(n: int, t: list[np.int32]) -> list[np.int32]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int32, 2, [-13, 5, 42], shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int32, 2, [-13, 5, 42], shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_int64():
@@ -451,18 +464,20 @@ def test_return_list_int64():
     def simple_list_int64_no_args() -> list[np.int64]:
         return [-13, 5, 42]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int64_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int64_no_args, shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
     @cudaq.kernel
     def simple_list_int64(n: int, t: list[np.int64]) -> list[np.int64]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_int64, 2, [-13, 5, 42], shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_int64, 2, [-13, 5, 42], shots_count=2)
+    assert len(results) == 2
+    assert results[0] == [-13, 5, 42]
+    assert results[1] == [-13, 5, 42]
 
 
 def test_return_list_float():
@@ -471,18 +486,20 @@ def test_return_list_float():
     def simple_list_float_no_args() -> list[float]:
         return [-13.2, 5., 42.99]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_float_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_float_no_args, shots_count=2)
+    assert len(results) == 2
+    assert is_close_array(results[0], [-13.2, 5., 42.99])
+    assert is_close_array(results[1], [-13.2, 5., 42.99])
 
     @cudaq.kernel
     def simple_list_float(n: int, t: list[float]) -> list[float]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_float, 2, [-13.2, 5.0, 42.99], shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_float, 2, [-13.2, 5.0, 42.99], shots_count=2)
+    assert len(results) == 2
+    assert is_close_array(results[0], [-13.2, 5., 42.99])
+    assert is_close_array(results[1], [-13.2, 5., 42.99])
 
 
 def test_return_list_float32():
@@ -491,18 +508,20 @@ def test_return_list_float32():
     def simple_list_float32_no_args() -> list[np.float32]:
         return [-13.2, 5., 42.99]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_float32_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_float32_no_args, shots_count=2)
+    assert len(results) == 2
+    assert is_close_array(results[0], [-13.2, 5., 42.99])
+    assert is_close_array(results[1], [-13.2, 5., 42.99])
 
     @cudaq.kernel
     def simple_list_float32(n: int, t: list[np.float32]) -> list[np.float32]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_float32, 2, [-13.2, 5.0, 42.99], shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_float32, 2, [-13.2, 5.0, 42.99], shots_count=2)
+    assert len(results) == 2
+    assert is_close_array(results[0], [-13.2, 5., 42.99])
+    assert is_close_array(results[1], [-13.2, 5., 42.99])
 
 
 def test_return_list_float64():
@@ -511,19 +530,84 @@ def test_return_list_float64():
     def simple_list_float64_no_args() -> list[np.float64]:
         return [-13.2, 5., 42.99]
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_float64_no_args, shots_count=2)
-    assert list_err_msg in str(e.value)
+    results = cudaq.run(simple_list_float64_no_args, shots_count=2)
+    assert len(results) == 2
+    assert is_close_array(results[0], [-13.2, 5., 42.99])
+    assert is_close_array(results[1], [-13.2, 5., 42.99])
 
     @cudaq.kernel
     def simple_list_float64(n: int, t: list[np.float64]) -> list[np.float64]:
         qubits = cudaq.qvector(n)
         return t
 
-    with pytest.raises(RuntimeError) as e:
-        cudaq.run(simple_list_float64, 2, [-13.2, 5.0, 42.99], shots_count=2)
-    assert list_err_msg in str(e.value)
+    
+    results = cudaq.run(simple_list_float64, 2, [-13.2, 5.0, 42.99], shots_count=2)
+    assert len(results) == 2
+    assert is_close_array(results[0], [-13.2, 5., 42.99])
+    assert is_close_array(results[1], [-13.2, 5., 42.99])
 
+def test_return_list_large_size():
+    # Returns a large list (dynamic size) to stress test the code generation
+    # Seeded generator so that a failing input can be reproduced exactly.
+    rng = np.random.default_rng(12345)
+
+    @cudaq.kernel
+    def kernel_with_dynamic_int_array_input(n: int, t: list[int]) -> list[int]:
+        qubits = cudaq.qvector(n)
+        return t
+
+    @cudaq.kernel
+    def kernel_with_dynamic_float_array_input(n: int, t: list[float]) -> list[float]:
+        qubits = cudaq.qvector(n)
+        return t
+
+    @cudaq.kernel
+    def kernel_with_dynamic_bool_array_input(n: int, t: list[bool]) -> list[bool]:
+        qubits = cudaq.qvector(n)
+        return t
+
+    # Test with various sizes; `.tolist()` hands native Python values to the kernels
+    for array_size in [10, 15, 100, 167, 1000]:
+        input_array = rng.integers(-1000, 1000, size=array_size).tolist()
+        results = cudaq.run(kernel_with_dynamic_int_array_input, 2, input_array, shots_count=2)
+        assert len(results) == 2
+        assert results[0] == input_array
+        assert results[1] == input_array
+
+        input_array_float = rng.uniform(-1000.0, 1000.0, size=array_size).tolist()
+        results = cudaq.run(kernel_with_dynamic_float_array_input, 2, input_array_float, shots_count=2)
+        assert len(results) == 2
+        assert is_close_array(results[0], input_array_float)
+        assert is_close_array(results[1], input_array_float)
+
+        input_array_bool = (rng.random(array_size) > 0.5).tolist()
+        results = cudaq.run(kernel_with_dynamic_bool_array_input, 2, input_array_bool, shots_count=2)
+        assert len(results) == 2
+        assert results[0] == input_array_bool
+        assert results[1] == input_array_bool
+
+def test_return_dynamics_measure_results():
+    """Run a kernel with a runtime-sized register and return every measurement."""
+
+    @cudaq.kernel
+    def measure_all_qubits(numQubits: int) -> list[bool]:
+        # Number of qubits is dynamic
+        qubits = cudaq.qvector(numQubits)
+        for i in range(numQubits):
+            if i % 2 == 0:
+                x(qubits[i])
+
+        return mz(qubits)
+
+    shot_count = 2
+    for qubit_count in (1, 3, 5, 11, 20):
+        # X is applied to the even-indexed qubits only.
+        expected_bits = [idx % 2 == 0 for idx in range(qubit_count)]
+        outcomes = cudaq.run(measure_all_qubits, qubit_count, shots_count=shot_count)
+        assert len(outcomes) == shot_count
+        for bits in outcomes:
+            assert len(bits) == qubit_count
+            assert all(bits[idx] == expected_bits[idx] for idx in range(qubit_count))
 
 # Test tuples
 # TODO: Define spec for using tuples in kernels
diff --git a/python/tests/kernel/test_to_integer.py b/python/tests/kernel/test_to_integer.py
new file mode 100644
index 00000000000..959341a3698
--- /dev/null
+++ b/python/tests/kernel/test_to_integer.py
@@ -0,0 +1,41 @@
+# ============================================================================ #
+# Copyright (c) 2022 - 2025 NVIDIA Corporation & Affiliates.                   #
+# All rights reserved.                                                         #
+#                                                                              #
+# This source code and the accompanying materials are made available under     #
+# the terms of the Apache License 2.0 which accompanies this distribution.     #
+# ============================================================================ #
+import pytest
+import os
+import cudaq
+
+def testToInteger():
+    # The expected values below treat element 0 of the mask as the least significant bit.
+    @cudaq.kernel
+    def toIntegerKernel(applyX: list[int]) -> int:
+        q = cudaq.qvector(len(applyX))
+        for i in range(len(applyX)):
+            if applyX[i]:
+                x(q[i])
+        return cudaq.to_integer(mz(q))
+
+    # (X-gate mask, expected integer); see reference: targettests/execution/to_integer.cpp
+    test_cases = [
+        ([1, 1, 1], 7),
+        ([1, 1, 1, 1], 15),
+        ([1, 0, 1], 5),
+        ([1, 0, 0, 0], 1),
+        ([0, 0, 0, 1], 8),
+    ]
+
+    for applyX, expected in test_cases:
+        results = cudaq.run(toIntegerKernel, applyX)
+        assert len(results) > 0  # guard against a vacuous pass; every shot must agree
+        for result in results:
+            assert result == expected
+
+
+# Lets this file be executed directly (e.g. under gdb); it simply hands itself to pytest.
+if __name__ == "__main__":
+    loc = os.path.abspath(__file__)
+    pytest.main([loc, "-rP"])
\ No newline at end of file
diff --git a/python/tests/mlir/ast_list_comprehension.py b/python/tests/mlir/ast_list_comprehension.py
index ba3e936db4c..e0051bf28e7 100644
--- a/python/tests/mlir/ast_list_comprehension.py
+++ b/python/tests/mlir/ast_list_comprehension.py
@@ -55,10 +55,12 @@ def kernel3() -> float:
 
 
 # CHECK-LABEL:   func.func @__nvqpp__mlirgen__kernel1() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"}
-# CHECK:            %[[VAL_0:.*]] = arith.constant true
-# CHECK:            %[[VAL_1:.*]] = cc.alloca !cc.array<i1 x 5>
-# CHECK:            %[[VAL_2:.*]] = cc.compute_ptr %[[VAL_1]][{{.*}}] : (!cc.ptr<!cc.array<i1 x 5>>, i64) -> !cc.ptr<i1>
-# CHECK:            cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr<i1>
+# CHECK:            %[[VAL_0:.*]] = arith.constant 1 : i8
+# CHECK:            %[[VAL_1:.*]] = cc.alloca !cc.array<i8 x 5>
+# CHECK:            %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr<!cc.array<i8 x 5>>) -> !cc.ptr<!cc.array<i8 x ?>>
+# CHECK:            %[[VAL_3:.*]] = cc.compute_ptr %[[VAL_1]][{{.*}}] : (!cc.ptr<!cc.array<i8 x 5>>, i64) -> !cc.ptr<i8>
+# CHECK:            cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr<i8>
+# CHECK:            %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_2]], %c5_i64 : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.stdvec<i1>
 
 # CHECK-LABEL:   func.func @__nvqpp__mlirgen__kernel2() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"}
 # CHECK:            %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64
@@ -122,10 +124,12 @@ def kernel3() -> float:
 # CHECK:            %[[VAL_0:.*]] = arith.constant true
 # CHECK:            %[[VAL_1:.*]] = cc.alloca i1
 # CHECK:            cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<i1>
-# CHECK:            %[[VAL_2:.*]] = cc.alloca !cc.array<i1 x 5>
-# CHECK:            %[[VAL_3:.*]] = cc.load %[[VAL_1]] : !cc.ptr<i1>
-# CHECK:            %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr<!cc.array<i1 x 5>>, i64) -> !cc.ptr<i1>
-# CHECK:            cc.store %[[VAL_3]], %[[VAL_4]] : !cc.ptr<i1>
+# CHECK:            %[[VAL_2:.*]] = cc.alloca !cc.array<i8 x 5>
+# CHECK:            %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr<!cc.array<i8 x 5>>) -> !cc.ptr<!cc.array<i8 x ?>>
+# CHECK:            %[[VAL_4:.*]] = cc.load %[[VAL_1]] : !cc.ptr<i1>
+# CHECK:            %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr<!cc.array<i8 x 5>>, i64) -> !cc.ptr<i8>
+# CHECK:            %[[VAL_6:.*]] = cc.cast unsigned %[[VAL_4]] : (i1) -> i8
+# CHECK:            cc.store %[[VAL_6]], %[[VAL_5]] : !cc.ptr<i8>
 
 # CHECK-LABEL:   func.func @__nvqpp__mlirgen__kernel2() -> i64 attributes {"cudaq-entrypoint", "cudaq-kernel"}
 # CHECK:            %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64
@@ -198,10 +202,12 @@ def kernel3() -> float:
 # CHECK:            %[[VAL_0:.*]] = arith.constant true
 # CHECK:            %[[VAL_1:.*]] = cc.alloca i1
 # CHECK:            cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr<i1>
-# CHECK:            %[[VAL_2:.*]] = cc.alloca !cc.array<i1 x 5>
-# CHECK:            %[[VAL_3:.*]] = cc.load %[[VAL_1]] : !cc.ptr<i1>
-# CHECK:            %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr<!cc.array<i1 x 5>>, i64) -> !cc.ptr<i1>
-# CHECK:            cc.store %[[VAL_3]], %[[VAL_4]] : !cc.ptr<i1>
+# CHECK:            %[[VAL_2:.*]] = cc.alloca !cc.array<i8 x 5>
+# CHECK:            %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr<!cc.array<i8 x 5>>) -> !cc.ptr<!cc.array<i8 x ?>>
+# CHECK:            %[[VAL_4:.*]] = cc.load %[[VAL_1]] : !cc.ptr<i1>
+# CHECK:            %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr<!cc.array<i8 x 5>>, i64) -> !cc.ptr<i8>
+# CHECK:            %[[VAL_6:.*]] = cc.cast unsigned %[[VAL_4]] : (i1) -> i8
+# CHECK:            cc.store %[[VAL_6]], %[[VAL_5]] : !cc.ptr<i8>
 
 # CHECK-LABEL:   func.func @__nvqpp__mlirgen__kernel2() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"}
 # CHECK:            %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64
@@ -271,14 +277,14 @@ def kernel3() -> float:
 
 
 # CHECK-LABEL:   func.func @__nvqpp__mlirgen__kernel1() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"}
-# CHECK:            %[[VAL_0:.*]] = arith.constant 1 : i64
-# CHECK:            %[[VAL_1:.*]] = arith.constant true
+# CHECK:            %[[VAL_0:.*]] = arith.constant 1 : i8
+# CHECK:            %[[VAL_1:.*]] = arith.constant 1 : i64
 # CHECK:            %[[VAL_2:.*]] = cc.alloca !cc.array<!cc.stdvec<i1> x 5>
-# CHECK:            %[[VAL_3:.*]] = cc.alloca !cc.array<i1 x 1>
-# CHECK:            %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<i1 x 1>>) -> !cc.ptr<!cc.array<i1 x ?>>
-# CHECK:            %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<i1 x 1>>) -> !cc.ptr<i1>
-# CHECK:            cc.store %[[VAL_1]], %[[VAL_5]] : !cc.ptr<i1>
-# CHECK:            %[[VAL_6:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_0]] : (!cc.ptr<!cc.array<i1 x ?>>, i64) -> !cc.stdvec<i1>
+# CHECK:            %[[VAL_3:.*]] = cc.alloca !cc.array<i8 x 1>
+# CHECK:            %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<i8 x 1>>) -> !cc.ptr<!cc.array<i8 x ?>>
+# CHECK:            %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr<!cc.array<i8 x 1>>) -> !cc.ptr<i8>
+# CHECK:            cc.store %[[VAL_0]], %[[VAL_5]] : !cc.ptr<i8>
+# CHECK:            %[[VAL_6:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_1]] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.stdvec<i1>
 # CHECK:            %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr<!cc.array<!cc.stdvec<i1> x 5>>, i64) -> !cc.ptr<!cc.stdvec<i1>>
 # CHECK:            cc.store %[[VAL_6]], %[[VAL_7]] : !cc.ptr<!cc.stdvec<i1>>