diff --git a/lib/Optimizer/Builder/Intrinsics.cpp b/lib/Optimizer/Builder/Intrinsics.cpp index fd7622981fb..2931adefae7 100644 --- a/lib/Optimizer/Builder/Intrinsics.cpp +++ b/lib/Optimizer/Builder/Intrinsics.cpp @@ -371,6 +371,136 @@ static constexpr IntrinsicCode intrinsicTable[] = { {cudaq::stdvecBoolCtorFromInitList, {}, R"#( func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr, !cc.ptr, i64) -> ())#"}, + {"__nvqpp_internal_number_of_digits", {}, R"#( + func.func private @__nvqpp_internal_number_of_digits(%arg0: i64) -> i64 { + %c10_i64 = arith.constant 10 : i64 + %c0_i64 = arith.constant 0 : i64 + %c1_i64 = arith.constant 1 : i64 + %0 = cc.alloca i64 + cc.store %arg0, %0 : !cc.ptr + %1 = cc.load %0 : !cc.ptr + %2 = cc.alloca i64 + cc.store %c0_i64, %2 : !cc.ptr + %3 = arith.cmpi eq, %1, %c0_i64 : i64 + cc.if(%3) { + cc.store %c1_i64, %2 : !cc.ptr + } + cc.loop while { + %5 = cc.load %0 : !cc.ptr + %6 = arith.cmpi sgt, %5, %c0_i64 : i64 + cc.condition %6 + } do { + %5 = cc.load %0 : !cc.ptr + %6 = arith.divsi %5, %c10_i64 : i64 + cc.store %6, %0 : !cc.ptr + %7 = cc.load %2 : !cc.ptr + %8 = arith.addi %7, %c1_i64 : i64 + cc.store %8, %2 : !cc.ptr + cc.continue + } + %4 = cc.load %2 : !cc.ptr + return %4 : i64 + } + )#"}, + + // __nvqpp_internal_tostring + {"__nvqpp_internal_tostring", {}, R"#( + func.func private @__nvqpp_internal_tostring(%buf: !cc.stdvec, %val: i64) { + %c48_i64 = arith.constant 48 : i64 + %c48_i32 = arith.constant 48 : i32 + %c0_i64 = arith.constant 0 : i64 + %c10_i64 = arith.constant 10 : i64 + %c1_i64 = arith.constant 1 : i64 + %c48_i8 = arith.constant 48 : i8 + %false = arith.constant false + %c0_i8 = arith.constant 0 : i8 + %0 = cc.alloca i64 + cc.store %val, %0 : !cc.ptr + %1 = cc.alloca i64 + cc.store %c10_i64, %1 : !cc.ptr + %2 = cc.stdvec_size %buf : (!cc.stdvec) -> i64 + %3 = cc.alloca i64 + cc.store %2, %3 : !cc.ptr + %4 = cc.load %3 : !cc.ptr + %5 = arith.subi %4, %c1_i64 : i64 + %6 = cc.alloca i64 + cc.store %5, %6 : 
!cc.ptr + %7 = cc.load %6 : !cc.ptr + %8 = cc.stdvec_data %buf : (!cc.stdvec) -> !cc.ptr> + %9 = cc.compute_ptr %8[%7] : (!cc.ptr>, i64) -> !cc.ptr + cc.store %c0_i8, %9 : !cc.ptr + %10 = cc.load %6 : !cc.ptr + %11 = arith.subi %10, %c1_i64 : i64 + cc.store %11, %6 : !cc.ptr + cc.loop while { + %18 = cc.load %0 : !cc.ptr + %19 = cc.load %1 : !cc.ptr + %20 = arith.cmpi sge, %18, %19 : i64 + %21 = arith.cmpi eq, %20, %false : i1 + %22 = cc.if(%21) -> i1 { + cc.continue %false : i1 + } else { + %23 = cc.load %6 : !cc.ptr + %24 = arith.cmpi sge, %23, %c0_i64 : i64 + cc.continue %24 : i1 + } + cc.condition %22 + } do { + cc.scope { + %18 = cc.load %0 : !cc.ptr + %19 = cc.load %1 : !cc.ptr + %20 = arith.remsi %18, %19 : i64 + %21 = cc.cast %20 : (i64) -> i32 + %22 = cc.alloca i32 + cc.store %21, %22 : !cc.ptr + %23 = cc.load %1 : !cc.ptr + %24 = cc.load %0 : !cc.ptr + %25 = arith.divsi %24, %23 : i64 + cc.store %25, %0 : !cc.ptr + %26 = cc.load %6 : !cc.ptr + %27 = cc.stdvec_data %buf : (!cc.stdvec) -> !cc.ptr> + %28 = cc.compute_ptr %27[%26] : (!cc.ptr>, i64) -> !cc.ptr + %29 = cc.load %22 : !cc.ptr + %30 = arith.addi %29, %c48_i32 : i32 + %31 = cc.cast %30 : (i32) -> i8 + cc.store %31, %28 : !cc.ptr + %32 = cc.load %6 : !cc.ptr + %33 = arith.subi %32, %c1_i64 : i64 + cc.store %33, %6 : !cc.ptr + } + cc.continue + } + %12 = cc.load %6 : !cc.ptr + %13 = cc.stdvec_data %buf : (!cc.stdvec) -> !cc.ptr> + %14 = cc.compute_ptr %13[%12] : (!cc.ptr>, i64) -> !cc.ptr + %15 = cc.load %0 : !cc.ptr + %16 = arith.addi %15, %c48_i64 : i64 + %17 = cc.cast %16 : (i64) -> i8 + cc.store %17, %14 : !cc.ptr + cc.scope { + %18 = cc.alloca i64 + cc.store %c0_i64, %18 : !cc.ptr + cc.loop while { + %19 = cc.load %18 : !cc.ptr + %20 = cc.load %6 : !cc.ptr + %21 = arith.cmpi slt, %19, %20 : i64 + cc.condition %21 + } do { + %19 = cc.load %18 : !cc.ptr + %20 = cc.stdvec_data %buf : (!cc.stdvec) -> !cc.ptr> + %21 = cc.compute_ptr %20[%19] : (!cc.ptr>, i64) -> !cc.ptr + cc.store %c48_i8, %21 : 
!cc.ptr + cc.continue + } step { + %19 = cc.load %18 : !cc.ptr + %20 = arith.addi %19, %c1_i64 : i64 + cc.store %20, %18 : !cc.ptr + } + } + return + } + )#"}, + // This helper function copies a buffer off the stack to the heap. This is // required when the data on the stack is about to go out of scope but is // still live. diff --git a/lib/Optimizer/CodeGen/Pipelines.cpp b/lib/Optimizer/CodeGen/Pipelines.cpp index c4d0141afd4..a8a3f918968 100644 --- a/lib/Optimizer/CodeGen/Pipelines.cpp +++ b/lib/Optimizer/CodeGen/Pipelines.cpp @@ -98,6 +98,7 @@ void createTargetCodegenPipeline(PassManager &pm, pm.addNestedPass(createCSEPass()); ::addQIRConversionPipeline(pm, options.target); pm.addPass(cudaq::opt::createReturnToOutputLog()); + cudaq::opt::addLowerToCFG(pm); pm.addPass(createConvertMathToFuncs()); pm.addPass(createSymbolDCEPass()); pm.addPass(cudaq::opt::createCCToLLVM()); diff --git a/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp b/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp index b4b175a31dd..01665222154 100644 --- a/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp +++ b/lib/Optimizer/CodeGen/ReturnToOutputLog.cpp @@ -46,7 +46,8 @@ class ReturnRewrite : public OpRewritePattern { } static void genOutputLog(Location loc, PatternRewriter &rewriter, Value val, - std::optional prefix) { + std::optional prefix, + std::optional customLabel = std::nullopt) { Type valTy = val.getType(); TypeSwitch(valTy) .Case([&](IntegerType intTy) { @@ -54,7 +55,8 @@ class ReturnRewrite : public OpRewritePattern { std::string labelStr = std::string("i") + std::to_string(width); if (prefix) labelStr = prefix->str(); - Value label = makeLabel(loc, rewriter, labelStr); + Value label = + customLabel.value_or(makeLabel(loc, rewriter, labelStr)); if (intTy.getWidth() == 1) { rewriter.create(loc, TypeRange{}, cudaq::opt::QIRBoolRecordOutput, @@ -80,7 +82,8 @@ class ReturnRewrite : public OpRewritePattern { std::string labelStr = std::string("f") + std::to_string(width); if (prefix) labelStr = 
prefix->str(); - Value label = makeLabel(loc, rewriter, labelStr); + Value label = + customLabel.value_or(makeLabel(loc, rewriter, labelStr)); // Floating point: convert it to double, whatever it actually is. Value castVal = val; if (floatTy != rewriter.getF64Type()) @@ -94,7 +97,8 @@ class ReturnRewrite : public OpRewritePattern { auto labelStr = translateType(structTy); if (prefix) labelStr = prefix->str(); - Value label = makeLabel(loc, rewriter, labelStr); + Value label = + customLabel.value_or(makeLabel(loc, rewriter, labelStr)); std::int32_t sz = structTy.getNumMembers(); Value size = rewriter.create(loc, sz, 64); rewriter.create(loc, TypeRange{}, @@ -111,7 +115,8 @@ class ReturnRewrite : public OpRewritePattern { }) .Case([&](cudaq::cc::ArrayType arrTy) { auto labelStr = translateType(arrTy); - Value label = makeLabel(loc, rewriter, labelStr); + Value label = + customLabel.value_or(makeLabel(loc, rewriter, labelStr)); std::int32_t sz = arrTy.getSize(); Value size = rewriter.create(loc, sz, 64); rewriter.create(loc, TypeRange{}, @@ -128,13 +133,12 @@ class ReturnRewrite : public OpRewritePattern { } }) .Case([&](cudaq::cc::StdvecType vecTy) { - // For this type, we expect a cc.stdvec_init operation as the input. - // The data will be in a variable. - // If we reach here and we cannot determine the constant size of the - // buffer, then we will not generate any output logging. if (auto vecInit = val.getDefiningOp()) if (auto maybeLen = cudaq::opt::factory::maybeValueOfIntConstant( vecInit.getLength())) { + // For this type, we expect a cc.stdvec_init operation as the + // input. + // The data will be in a variable. 
std::int32_t sz = *maybeLen; auto labelStr = translateType(vecTy, sz); Value label = makeLabel(loc, rewriter, labelStr); @@ -158,7 +162,55 @@ class ReturnRewrite : public OpRewritePattern { Value w = rewriter.create(loc, v); genOutputLog(loc, rewriter, w, offset); } + return; } + + // If we reach here and we cannot determine the constant size of the + // buffer, then we will now generate dynamic output logging with a for + // loop. + Value vecSz = rewriter.template create( + loc, rewriter.getI64Type(), val); + const std::string arrayLabelPrefix = + "array<" + translateType(vecTy.getElementType()) + " x "; + Value labelBuffer = + makeLabel(loc, rewriter, arrayLabelPrefix, vecSz, ">"); + rewriter.create(loc, TypeRange{}, + cudaq::opt::QIRArrayRecordOutput, + ArrayRef{vecSz, labelBuffer}); + auto eleTy = vecTy.getElementType(); + const bool isBool = (eleTy == rewriter.getI1Type()); + if (isBool) + eleTy = rewriter.getI8Type(); + auto elePtrTy = cudaq::cc::PointerType::get(eleTy); + auto eleArrTy = + cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy)); + auto vecPtr = + rewriter.create(loc, eleArrTy, val); + const std::string preStr = prefix ? 
prefix->str() : std::string{}; + cudaq::opt::factory::createInvariantLoop( + rewriter, loc, vecSz, + [&](OpBuilder &builder, Location loc, Region &, Block &block) { + Value indexVar = block.getArgument(0); + auto eleAddr = rewriter.create( + loc, elePtrTy, vecPtr, ValueRange{indexVar}); + + Value w = [&]() { + if (isBool) { + auto i1PtrTy = + cudaq::cc::PointerType::get(rewriter.getI1Type()); + auto i1Cast = rewriter.create( + loc, i1PtrTy, eleAddr); + return rewriter.create(loc, i1Cast); + } + + return rewriter.create(loc, eleAddr); + }(); + const std::string prefix = preStr + "["; + const std::string postfix = "]"; + Value dynamicLabel = + makeLabel(loc, rewriter, prefix, indexVar, postfix); + genOutputLog(loc, rewriter, w, std::nullopt, dynamicLabel); + }); }) .Default([&](Type) { // If we reach here, we don't know how to handle this type. @@ -207,6 +259,79 @@ class ReturnRewrite : public OpRewritePattern { auto i8PtrTy = cudaq::cc::PointerType::get(rewriter.getI8Type()); return rewriter.create(loc, i8PtrTy, lit); } + + static Value makeLabel(Location loc, PatternRewriter &rewriter, + const std::string &prefix, Value val, + const std::string &postFix) { + auto i64Ty = rewriter.getI64Type(); + auto i8Ty = rewriter.getI8Type(); + auto i8PtrTy = cudaq::cc::PointerType::get(i8Ty); + // Value must be i64 + if (val.getType() != i64Ty) + val = rewriter.create(loc, i64Ty, val); + // Compute the number of digits required + Value numDigits = rewriter + .create( + loc, i64Ty, "__nvqpp_internal_number_of_digits", + ArrayRef{val}) + .getResult(0); + Value valStrBuf = [&]() { + // Convert integer value to string + auto strSize = rewriter.create( + loc, numDigits, + rewriter.create(loc, 1, + 64)); // Add null terminator + auto buffer = rewriter.create(loc, i8Ty, strSize); + auto stdvecTy = cudaq::cc::StdvecType::get(i8Ty); + auto stringCharVec = rewriter.create( + loc, stdvecTy, buffer, strSize); + rewriter.create(loc, TypeRange{}, + "__nvqpp_internal_tostring", + 
ArrayRef{stringCharVec, val}); + return rewriter.create(loc, i8PtrTy, buffer); + }(); + + Value arrayPrefix = makeLabel(loc, rewriter, prefix); + Value arrayPostfix = makeLabel(loc, rewriter, postFix); + const int preFixLen = prefix.size(); + const int postFixLen = postFix.size(); + Value totalStrSize = rewriter.create( + loc, numDigits, + rewriter.create(loc, preFixLen + postFixLen + 1, + 64)); + auto labelBufferAlloc = + rewriter.create(loc, i8Ty, totalStrSize); + Value labelBuffer = + rewriter.create(loc, i8PtrTy, labelBufferAlloc); + + // Copy the prefix + rewriter.create( + loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{labelBuffer, arrayPrefix, + rewriter.create(loc, preFixLen, 64), + rewriter.create(loc, 0, 1)}); + // Copy the integer string + auto toPtr = rewriter.create( + loc, i8PtrTy, labelBufferAlloc, + ValueRange{rewriter.create(loc, preFixLen, 64)}); + rewriter.create( + loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{toPtr, valStrBuf, numDigits, + rewriter.create(loc, 0, 1)}); + // Copy the postfix + null terminator + Value shift = rewriter.create( + loc, numDigits, + rewriter.create(loc, preFixLen, 64)); + toPtr = rewriter.create( + loc, i8PtrTy, labelBufferAlloc, ValueRange{shift}); + rewriter.create( + loc, std::nullopt, cudaq::llvmMemCopyIntrinsic, + ValueRange{ + toPtr, arrayPostfix, + rewriter.create(loc, postFixLen + 1, 64), + rewriter.create(loc, 0, 1)}); + return labelBuffer; + } }; struct ReturnToOutputLogPass @@ -230,6 +355,19 @@ struct ReturnToOutputLogPass return; } + if (failed(irBuilder.loadIntrinsic(module, "__nvqpp_internal_tostring"))) { + module.emitError("could not load string conversion function."); + signalPassFailure(); + return; + } + + if (failed(irBuilder.loadIntrinsic(module, + "__nvqpp_internal_number_of_digits"))) { + module.emitError("could not load number of digits function."); + signalPassFailure(); + return; + } + RewritePatternSet patterns(ctx); patterns.insert(ctx); LLVM_DEBUG(llvm::dbgs() 
<< "Before return to output logging:\n" << module); diff --git a/python/cudaq/kernel/ast_bridge.py b/python/cudaq/kernel/ast_bridge.py index aceab7ec75a..9069b4bd168 100644 --- a/python/cudaq/kernel/ast_bridge.py +++ b/python/cudaq/kernel/ast_bridge.py @@ -31,7 +31,7 @@ from .utils import (Color, globalAstRegistry, globalKernelRegistry, globalRegisteredOperations, globalRegisteredTypes, nvqppPrefix, mlirTypeFromAnnotation, mlirTypeFromPyType, - mlirTypeToPyType, mlirTryCreateStructType) + mlirTypeToPyType, mlirTryCreateStructType, getInteropKernelNameIfFound) State = cudaq_runtime.State @@ -434,6 +434,7 @@ def changeOperandToType(self, ty, operand, allowDemotion=False): operand, sint=operand_width != 1, zint=operand_width == 1).result + self.emitFatalError( f'cannot convert value of type {operand.type} to the requested type {ty}', self.currentNode) @@ -578,6 +579,7 @@ def ifNotPointerThenStore(self, value): if not cc.PointerType.isinstance(value.type): slot = cc.AllocaOp(cc.PointerType.get(value.type), TypeAttr.get(value.type)).result + assert cc.PointerType.get(value.type) == slot.type cc.StoreOp(value, slot) return slot return value @@ -585,20 +587,32 @@ def ifNotPointerThenStore(self, value): def __createStdvecWithKnownValues(self, size, listElementValues): # Turn this List into a StdVec arrSize = self.getConstantInt(size) - arrTy = cc.ArrayType.get(listElementValues[0].type) + elemTy = listElementValues[0].type + # If this is an `i1`, turns it into an `i8` array. 
+ isBool = elemTy == self.getIntegerType(1) + if isBool: + elemTy = self.getIntegerType(8) + + arrTy = cc.ArrayType.get(elemTy) alloca = cc.AllocaOp(cc.PointerType.get(arrTy), - TypeAttr.get(listElementValues[0].type), + TypeAttr.get(elemTy), seqSize=arrSize).result for i, v in enumerate(listElementValues): eleAddr = cc.ComputePtrOp( - cc.PointerType.get(listElementValues[0].type), alloca, + cc.PointerType.get(elemTy), alloca, [self.getConstantInt(i)], DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx)).result + if isBool: + # Cast the list value before assigning + v = self.changeOperandToType(self.getIntegerType(8), v) cc.StoreOp(v, eleAddr) - vecTy = listElementValues[0].type + # Create the `StdVec` from the alloca + # We still use `i1` as the vector element type if the + # original list was of booleans. + vecTy = elemTy if not isBool else self.getIntegerType(1) if cc.PointerType.isinstance(vecTy): vecTy = cc.PointerType.getElementType(vecTy) @@ -655,6 +669,10 @@ def __copyVectorAndCastElements(self, if (sourceEleType == targetEleType): return sourcePtr + isSourceBool = sourceEleType == self.getIntegerType(1) + if isSourceBool: + sourceEleType = self.getIntegerType(8) + sourceArrType = cc.ArrayType.get(sourceEleType) sourceElePtrTy = cc.PointerType.get(sourceEleType) sourceArrElePtrTy = cc.PointerType.get(sourceArrType) @@ -662,10 +680,16 @@ def __copyVectorAndCastElements(self, sourceDataPtr = cc.StdvecDataOp(sourceArrElePtrTy, sourceValue).result sourceSize = cc.StdvecSizeOp(self.getIntegerType(), sourceValue).result + isTargetBool = targetEleType == self.getIntegerType(1) + # Vector type reflects the true type, including `i1` + targetVecTy = cc.StdvecType.get(targetEleType) + + if isTargetBool: + targetEleType = self.getIntegerType(8) + targetElePtrType = cc.PointerType.get(targetEleType) targetTy = cc.ArrayType.get(targetEleType) targetArrElePtrTy = cc.PointerType.get(targetTy) - targetVecTy = cc.StdvecType.get(targetEleType) targetPtr = 
cc.AllocaOp(targetArrElePtrTy, TypeAttr.get(targetEleType), seqSize=sourceSize).result @@ -681,6 +705,7 @@ def bodyBuilder(iterVar): allowDemotion=allowDemotion) targetEleAddr = cc.ComputePtrOp(targetElePtrType, targetPtr, [iterVar], rawIndex).result + assert cc.PointerType.get(targetEleType) == targetEleAddr.type cc.StoreOp(castedEle, targetEleAddr) self.createInvariantForLoop(sourceSize, bodyBuilder) @@ -777,15 +802,26 @@ def __load_vector_element(self, vector, index): MLIR Value containing the loaded element """ if cc.StdvecType.isinstance(vector.type): + elem_ty = cc.StdvecType.getElementType(vector.type) + is_bool = elem_ty == self.getIntegerType(1) + # std::vector<bool> is a special case in C++ where each element + # is stored as a single bit, but the underlying array is actually + # an array of `i8` values. + if is_bool: + # `i1` elements are stored as `i8` in the underlying array. + elem_ty = self.getIntegerType(8) data_ptr = cc.StdvecDataOp( cc.PointerType.get( - cc.ArrayType.get(cc.StdvecType.getElementType( - vector.type))), vector).result - return cc.LoadOp( + cc.ArrayType.get(elem_ty)), vector).result + load_val = cc.LoadOp( cc.ComputePtrOp( - cc.PointerType.get(cc.StdvecType.getElementType( - vector.type)), data_ptr, [index], + cc.PointerType.get(elem_ty), data_ptr, [index], DenseI32ArrayAttr.get([kDynamicPtrIndex]))).result + if is_bool: + # Cast back to `i1` if the original vector element type was `i1`. 
+ load_val = self.changeOperandToType(self.getIntegerType(1), + load_val) + return load_val return cc.LoadOp( cc.ComputePtrOp( cc.PointerType.get( @@ -1405,6 +1441,7 @@ def process_assignment(target, value): # We should allocate and store alloca = cc.AllocaOp(cc.PointerType.get(value.type), TypeAttr.get(value.type)).result + assert cc.PointerType.get(value.type) == alloca.type cc.StoreOp(value, alloca) return target, alloca @@ -1438,6 +1475,8 @@ def process_assignment(target, value): # Visit the value being assigned self.visit(node.value) valueToStore = self.popValue() + # Cast if necessary + valueToStore = self.changeOperandToType(ptrEleType, valueToStore) # Store the value cc.StoreOp(valueToStore, ptrVal) return target.value, None @@ -1460,6 +1499,8 @@ def process_assignment(target, value): # Visit the value being assigned self.visit(node.value) valueToStore = self.popValue() + # Cast if necessary + valueToStore = self.changeOperandToType(cc.PointerType.getElementType(ptrVal.type), valueToStore) # Store the value cc.StoreOp(valueToStore, ptrVal) return target.value, None @@ -1771,6 +1812,26 @@ def processFunctionCall(fType, nrValsToPop): func.CallOp(otherKernel, values) else: result = func.CallOp(otherKernel, values).result + # Copy to stack if necessary + if cc.StdvecType.isinstance(result.type): + elemTy = cc.StdvecType.getElementType(result.type) + if elemTy == self.getIntegerType(1): + elemTy = self.getIntegerType(8) + data = cc.StdvecDataOp(cc.PointerType.get(elemTy), result).result + i64Ty = self.getIntegerType(64) + length = cc.StdvecSizeOp(i64Ty, result).result + elemSize = cc.SizeOfOp(i64Ty, TypeAttr.get(elemTy)).result + buffer = cc.AllocaOp(cc.PointerType.get(cc.ArrayType.get(elemTy)), TypeAttr.get(elemTy), seqSize=length).result + i8PtrTy = cc.PointerType.get(self.getIntegerType(8)) + cbuffer = cc.CastOp(i8PtrTy, buffer).result + cdata = cc.CastOp(i8PtrTy, data).result + symName = '__nvqpp_vectorCopyToStack' + load_intrinsic(self.module, symName) + 
sizeInBytes = arith.MulIOp(length, elemSize).result + func.CallOp([], symName, [cbuffer, cdata, sizeInBytes]) + # Replace result with the stack buffer-backed vector + result = cc.StdvecInitOp(result.type, buffer, length=length).result + self.pushValue(result) def checkControlAndTargetTypes(controls, targets): @@ -2350,6 +2411,15 @@ def bodyBuilder(iterVal): # kernel registry correctly for the next conditional check if var.name in globalKernelRegistry: node.func.id = var.name + # Check generic callable objects that may be C++ `qkernel` (with its MLIR code registered) + elif hasattr(var, '__call__'): + # Check if this is a registered C++ kernel + maybeKernelName = getInteropKernelNameIfFound(var, self.module) + if maybeKernelName != None: + otherKernel = SymbolTable( + self.module.operation)[maybeKernelName] + processFunctionCall(otherKernel.type, len(node.args)) + return if node.func.id in globalKernelRegistry: # If in `globalKernelRegistry`, it has to be in this Module @@ -2431,8 +2501,10 @@ def bodyBuilder(iterVal): for _, v in annotations.items() ] + unnamed_struct = "__repr__" not in cls.__dict__ + struct_name = node.func.id if not unnamed_struct else "" structTy = mlirTryCreateStructType(structTys, - name=node.func.id, + name=struct_name, context=self.ctx) if structTy is None: self.emitFatalError( @@ -2474,7 +2546,6 @@ def bodyBuilder(iterVal): cc.StoreOp(ctorArgs[i], eleAddr) self.pushValue(stackSlot) return - else: self.emitFatalError( "unhandled function call - {}, known kernels are {}".format( @@ -2915,6 +2986,30 @@ def bodyBuilder(iterVal): quake.ComputeActionOp(compute, action) return + if node.func.attr == 'to_integer': + boolVec = self.popValue() + boolVec = self.ifPointerThenLoad(boolVec) + if not cc.StdvecType.isinstance(boolVec.type): + self.emitFatalError( + "to_integer expects a vector of booleans. 
Got type {}".format( + boolVec.type), + node) + elemTy = cc.StdvecType.getElementType(boolVec.type) + if elemTy != self.getIntegerType(1): + self.emitFatalError( + "to_integer expects a vector of booleans. Got type {}".format( + boolVec.type), + node) + cudaqConvertToInteger = "__nvqpp_cudaqConvertToInteger" + # Load the intrinsic + load_intrinsic(self.module, cudaqConvertToInteger) + # Signature: + # `func.func private @__nvqpp_cudaqConvertToInteger(%arg : !cc.stdvec) -> i64` + resultTy = self.getIntegerType(64) + result = func.CallOp([resultTy], cudaqConvertToInteger, [boolVec]).result + self.pushValue(result) + return + self.emitFatalError( f'Invalid function or class type requested from the cudaq module ({node.func.attr})', node) @@ -3453,6 +3548,11 @@ def get_item_type(pyval): listElemTy = get_item_type(node.elt) if listElemTy is None: return + + resultVecTy = cc.StdvecType.get(listElemTy) + isBool = listElemTy == self.getIntegerType(1) + if isBool: + listElemTy = self.getIntegerType(8) listTy = cc.ArrayType.get(listElemTy) listValue = cc.AllocaOp(cc.PointerType.get(listTy), TypeAttr.get(listElemTy), @@ -3482,12 +3582,15 @@ def bodyBuilder(iterVar): listValueAddr = cc.ComputePtrOp( cc.PointerType.get(listElemTy), listValue, [iterVar], DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx)) + + if isBool: + result = self.changeOperandToType(self.getIntegerType(8), result) cc.StoreOp(result, listValueAddr) self.symbolTable.popScope() self.createInvariantForLoop(iterableSize, bodyBuilder) self.pushValue( - cc.StdvecInitOp(cc.StdvecType.get(listElemTy), + cc.StdvecInitOp(resultVecTy, listValue, length=iterableSize).result) return @@ -3679,6 +3782,9 @@ def fix_negative_idx(idx, get_size): upper=upperVal).result) elif cc.StdvecType.isinstance(var.type): eleTy = cc.StdvecType.getElementType(var.type) + isBool = eleTy == self.getIntegerType(1) + if isBool: + eleTy = self.getIntegerType(8) ptrTy = cc.PointerType.get(eleTy) arrTy = cc.ArrayType.get(eleTy) ptrArrTy 
= cc.PointerType.get(arrTy) @@ -3722,6 +3828,9 @@ def fix_negative_idx(idx, get_size): if cc.StdvecType.isinstance(var.type): idx = fix_negative_idx(idx, lambda: get_size(var)) eleTy = cc.StdvecType.getElementType(var.type) + isBool = eleTy == self.getIntegerType(1) + if isBool: + eleTy = self.getIntegerType(8) elePtrTy = cc.PointerType.get(eleTy) arrTy = cc.ArrayType.get(eleTy) ptrArrTy = cc.PointerType.get(arrTy) @@ -3733,7 +3842,10 @@ def fix_negative_idx(idx, get_size): if self.subscriptPushPointerValue: self.pushValue(eleAddr) return - self.pushValue(cc.LoadOp(eleAddr).result) + val = cc.LoadOp(eleAddr).result + if isBool: + val = self.changeOperandToType(self.getIntegerType(1), val) + self.pushValue(val) return if cc.PointerType.isinstance(var.type): @@ -3960,7 +4072,9 @@ def functor(iter, idx): iterEleTy = cc.StdvecType.getElementType(iterable.type) totalSize = cc.StdvecSizeOp(self.getIntegerType(), iterable).result - + isBool = iterEleTy == self.getIntegerType(1) + if isBool: + iterEleTy = self.getIntegerType(8) def functor(iter, idxVal): elePtrTy = cc.PointerType.get(iterEleTy) arrTy = cc.ArrayType.get(iterEleTy) @@ -3970,7 +4084,10 @@ def functor(iter, idxVal): elePtrTy, vecPtr, [idxVal], DenseI32ArrayAttr.get([kDynamicPtrIndex], context=self.ctx)).result - return cc.LoadOp(eleAddr).result + result = cc.LoadOp(eleAddr).result + if isBool: + result = self.changeOperandToType(self.getIntegerType(1), result) + return result extractFunctor = functor diff --git a/python/cudaq/kernel/kernel_decorator.py b/python/cudaq/kernel/kernel_decorator.py index 799117a07bc..ad571d89393 100644 --- a/python/cudaq/kernel/kernel_decorator.py +++ b/python/cudaq/kernel/kernel_decorator.py @@ -21,7 +21,7 @@ from .captured_data import CapturedDataStorage from .utils import (emitFatalError, emitErrorIfInvalidPauli, globalAstRegistry, globalRegisteredTypes, mlirTypeFromPyType, mlirTypeToPyType, - nvqppPrefix) + nvqppPrefix, getInteropKernelNameIfFound) # This file implements the 
decorator mechanism needed to # JIT compile CUDA-Q kernels. It exposes the cudaq.kernel() @@ -451,6 +451,20 @@ def __convertStringsToPauli__(self, arg): return arg + def getCallableNames(self, *args): + callableNames = [] + for arg in args: + if isinstance(arg, PyKernelDecorator): + callableNames.append(arg.name) + else: + if hasattr(arg, '__call__'): + maybeKernelName = getInteropKernelNameIfFound(arg, self.module) + if maybeKernelName != None: + # Remove "__nvqpp__mlirgen__" prefix when packing the list of callables + maybeKernelName = maybeKernelName.replace("__nvqpp__mlirgen__", "") + callableNames.append(maybeKernelName) + return callableNames + def __call__(self, *args): """ Invoke the CUDA-Q kernel. JIT compilation of the kernel AST to MLIR @@ -481,7 +495,8 @@ def __call__(self, *args): mlirType = mlirTypeFromPyType(type(arg), self.module.context, argInstance=arg, - argTypeToCompareTo=self.argTypes[i]) + argTypeToCompareTo=self.argTypes[i], + module=self.module) if self.isCastablePyType(mlirType, self.argTypes[i]): processedArgs.append( @@ -496,19 +511,30 @@ def __call__(self, *args): ) if cc.CallableType.isinstance(mlirType): - # Assume this is a PyKernelDecorator - callableNames.append(arg.name) - # It may be that the provided input callable kernel - # is not currently in the ModuleOp. Need to add it - # if that is the case, we have to use the AST - # so that it shares self.module's MLIR Context - symbols = SymbolTable(self.module.operation) - if nvqppPrefix + arg.name not in symbols: - tmpBridge = PyASTBridge(self.capturedDataStorage, - existingModule=self.module, - disableEntryPointTag=True) - tmpBridge.visit(globalAstRegistry[arg.name][0]) - + if isinstance(arg, PyKernelDecorator): + # Assume this is a PyKernelDecorator + callableNames.append(arg.name) + # It may be that the provided input callable kernel + # is not currently in the ModuleOp. 
Need to add it + # if that is the case, we have to use the AST + # so that it shares self.module's MLIR Context + symbols = SymbolTable(self.module.operation) + if nvqppPrefix + arg.name not in symbols: + tmpBridge = PyASTBridge(self.capturedDataStorage, + existingModule=self.module, + disableEntryPointTag=True) + tmpBridge.visit(globalAstRegistry[arg.name][0]) + else: + if hasattr(arg, '__call__'): + maybeKernelName = getInteropKernelNameIfFound(arg, self.module) + if maybeKernelName != None: + # Remove "__nvqpp__mlirgen__" prefix + maybeKernelName = maybeKernelName.replace("__nvqpp__mlirgen__", "") + callableNames.append(maybeKernelName) + else: + emitFatalError( + "Invalid callable argument provided to kernel." + ) # Convert `numpy` arrays to lists if cc.StdvecType.isinstance(mlirType) and hasattr(arg, "tolist"): if arg.ndim != 1: diff --git a/python/cudaq/kernel/utils.py b/python/cudaq/kernel/utils.py index efaf213b581..e7f447be516 100644 --- a/python/cudaq/kernel/utils.py +++ b/python/cudaq/kernel/utils.py @@ -15,8 +15,8 @@ import types from cudaq.mlir._mlir_libs._quakeDialects import cudaq_runtime -from cudaq.mlir.dialects import quake, cc -from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType +from cudaq.mlir.dialects import quake, cc, func +from cudaq.mlir.ir import ComplexType, F32Type, F64Type, IntegerType, SymbolTable State = cudaq_runtime.State qvector = cudaq_runtime.qvector @@ -119,8 +119,9 @@ def isQuantumType(ty): if numQuantumMembers != len(mlirEleTypes) or \ any((quake.StruqType.isinstance(t) for t in mlirEleTypes)): return None - return quake.StruqType.getNamed(name, mlirEleTypes, context=context) - + if len(name) > 0: + return quake.StruqType.getNamed(name, mlirEleTypes, context=context) + return quake.StruqType.get(mlirEleTypes, context=context) def mlirTypeFromAnnotation(annotation, ctx, raiseError=False): """ @@ -284,6 +285,7 @@ def emitFatalErrorOverride(msg): f"Adding new fields in data classes is not yet supported. 
The dataclass must be declared with @dataclass(slots=True) or @dataclasses.dataclass(slots=True)." ) + unnamed_struct = "__repr__" not in pyType.__dict__ if len({ k: v for k, v in pyType.__dict__.items() @@ -293,7 +295,8 @@ def emitFatalErrorOverride(msg): localEmitFatalError( 'struct types with user specified methods are not allowed.') - tupleTy = mlirTryCreateStructType(structTys, name=id) + struct_name = id if not unnamed_struct else "" + tupleTy = mlirTryCreateStructType(structTys, name=struct_name) if tupleTy is None: localEmitFatalError( "Hybrid quantum-classical data types and nested quantum structs are not allowed." @@ -442,7 +445,19 @@ def mlirTypeFromPyType(argType, ctx, **kwargs): if 'argInstance' in kwargs: argInstance = kwargs['argInstance'] if isinstance(argInstance, Callable): - return cc.CallableType.get(argInstance.argTypes, ctx) + if hasattr(argInstance, 'argTypes'): + return cc.CallableType.get(argInstance.argTypes, ctx) + elif "module" in kwargs and hasattr(argInstance, '__call__'): + # This is a callable object, check if it's a C++ `qkernel` + maybeKernelName = getInteropKernelNameIfFound(argInstance, kwargs['module']) + if maybeKernelName != None: + otherKernel = SymbolTable( + kwargs['module'].operation)[maybeKernelName] + if isinstance(otherKernel, func.FuncOp): + argTypes = [] + for arg in otherKernel.arguments: + argTypes.append(arg.type) + return cc.CallableType.get(argTypes, ctx) for name in globalRegisteredTypes.classes: customTy, memberTys = globalRegisteredTypes.getClassAttributes(name) @@ -557,6 +572,27 @@ def mlirTypeToPyType(argType): emitFatalError( f"Cannot infer python type from provided CUDA-Q type ({argType})") +def getInteropKernelNameIfFound(pyFunc, module): + """ + Given a Python function and an MLIR module, check if the function + is registered as an interop kernel. If so, return the kernel name. + Otherwise, return None. 
+ """ + if not callable(pyFunc): + emitFatalError( + f"Provided argument is not a callable function ({pyFunc})" + ) + + modulePath = str(pyFunc.__module__) if hasattr(pyFunc, '__module__') else '' + funcName = str(pyFunc.__name__) if hasattr(pyFunc, '__name__') else '' + # Look up key + devKey = f"{modulePath}.{funcName}" + if cudaq_runtime.isRegisteredDeviceModule(devKey): + maybeKernelName = cudaq_runtime.checkRegisteredCppDeviceKernel(module, devKey) + if maybeKernelName != None: + return maybeKernelName + + return None def emitErrorIfInvalidPauli(pauliArg): """ diff --git a/python/runtime/cudaq/algorithms/py_run.cpp b/python/runtime/cudaq/algorithms/py_run.cpp index 8cd38a7295b..ef59b14f461 100644 --- a/python/runtime/cudaq/algorithms/py_run.cpp +++ b/python/runtime/cudaq/algorithms/py_run.cpp @@ -39,7 +39,8 @@ static std::vector readRunResults(mlir::ModuleOp module, } static std::tuple + mlir::func::FuncOp, std::string, mlir::func::FuncOp, + std::vector> getKernelLaunchParameters(py::object &kernel, py::args args) { if (!py::hasattr(kernel, "arguments")) throw std::runtime_error( @@ -52,6 +53,11 @@ getKernelLaunchParameters(py::object &kernel, py::args args) { if (py::hasattr(kernel, "compile")) kernel.attr("compile")(); + std::vector callableNames; + if (py::hasattr(kernel, "getCallableNames")) + callableNames = + kernel.attr("getCallableNames")(*args).cast>(); + auto origKernName = kernel.attr("name").cast(); auto kernelName = origKernName + ".run"; if (!py::hasattr(kernel, "module") || kernel.attr("module").is_none()) @@ -78,7 +84,8 @@ getKernelLaunchParameters(py::object &kernel, py::args args) { } auto *argData = toOpaqueArgs(args, kernelMod, kernelName); auto funcOp = getKernelFuncOp(kernelMod, kernelName); - return {kernelName, kernelMod, argData, funcOp, origKernName, origKern}; + return {kernelName, kernelMod, argData, funcOp, + origKernName, origKern, callableNames}; } static details::RunResultSpan @@ -86,6 +93,7 @@ pyRunTheKernel(const std::string 
&name, const std::string &origName, MlirModule module, mlir::func::FuncOp funcOp, mlir::func::FuncOp origKernel, OpaqueArguments &runtimeArgs, quantum_platform &platform, std::size_t shots_count, + const std::vector &callableNames, std::size_t qpu_id = 0) { auto returnTypes = origKernel.getResultTypes(); if (returnTypes.empty() || returnTypes.size() > 1) @@ -93,21 +101,24 @@ pyRunTheKernel(const std::string &name, const std::string &origName, "`cudaq.run` only supports kernels that return a value."); auto returnTy = returnTypes[0]; - // Disallow returning list / vectors from entry-point kernels. - if (returnTy.isa()) { - throw std::runtime_error("`cudaq.run` does not yet support returning " - "`list` from entry-point kernels."); + // Disallow returning nested vectors from entry-point kernels. + if (auto vecTy = dyn_cast(returnTy)) { + auto elemTy = vecTy.getElementType(); + if (elemTy.isa()) + throw std::runtime_error( + "`cudaq.run` does not yet support returning nested `list` from " + "entry-point kernels."); } auto mod = unwrap(module); - auto [rawArgs, size, returnOffset, thunk] = - pyAltLaunchKernelBase(name, module, returnTy, runtimeArgs, {}, 0, false); + auto [rawArgs, size, returnOffset, thunk] = pyAltLaunchKernelBase( + name, module, returnTy, runtimeArgs, callableNames, 0, false); auto results = details::runTheKernel( [&]() mutable { pyLaunchKernel(name, thunk, mod, runtimeArgs, rawArgs, size, - returnOffset, {}); + returnOffset, callableNames); }, platform, name, origName, shots_count, qpu_id); @@ -133,7 +144,7 @@ std::vector pyRun(py::object &kernel, py::args args, if (shots_count == 0) return {}; - auto [name, module, argData, func, origName, origKern] = + auto [name, module, argData, func, origName, origKern, callableNames] = getKernelLaunchParameters(kernel, args); auto mod = unwrap(module); @@ -149,7 +160,7 @@ std::vector pyRun(py::object &kernel, py::args args, } auto span = pyRunTheKernel(name, origName, module, func, origKern, *argData, - 
platform, shots_count); + platform, shots_count, callableNames); delete argData; auto results = pyReadResults(span, module, func, origKern, shots_count); @@ -184,7 +195,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args, ") exceeds the number of available QPUs (" + std::to_string(numQPUs) + ")"); - auto [name, module, argData, func, origName, origKern] = + auto [name, module, argData, func, origName, origKern, callableNames] = getKernelLaunchParameters(kernel, args); auto mod = unwrap(module); @@ -219,7 +230,7 @@ async_run_result pyRunAsync(py::object &kernel, py::args args, QuantumTask wrapped = detail::make_copyable_function( [sp = std::move(spanPromise), ep = std::move(errorPromise), shots_count, qpu_id, argData, name, module, func, origKern, origName, - noise_model = std::move(noise_model)]() mutable { + noise_model = std::move(noise_model), callableNames]() mutable { auto &platform = get_platform(); // Launch the kernel in the appropriate context. @@ -227,8 +238,9 @@ async_run_result pyRunAsync(py::object &kernel, py::args args, platform.set_noise(&noise_model.value()); try { - auto span = pyRunTheKernel(name, origName, module, func, origKern, - *argData, platform, shots_count, qpu_id); + auto span = + pyRunTheKernel(name, origName, module, func, origKern, *argData, + platform, shots_count, callableNames, qpu_id); delete argData; sp.set_value(span); ep.set_value(""); diff --git a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp index 9db3e9e431f..5543a54b37d 100644 --- a/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp +++ b/python/runtime/cudaq/platform/py_alt_launch_kernel.cpp @@ -117,8 +117,21 @@ OpaqueArguments *toOpaqueArgs(py::args &args, MlirModule mod, auto *argData = new cudaq::OpaqueArguments(); args = simplifiedValidateInputArguments(args); setDataLayout(mod); - cudaq::packArgs(*argData, args, kernelFunc, - [](OpaqueArguments &, py::object &) { return false; }); + 
auto callableArgHandler = [](cudaq::OpaqueArguments &argData, + py::object &arg) { + if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) { + // Just give it some dummy data that will not be used. + // We synthesize away all callables, the block argument + // remains but it is not used, so just give argsCreator + // something, and we'll make sure its cleaned up. + long *ourAllocatedArg = new long(); + argData.emplace_back(ourAllocatedArg, + [](void *ptr) { delete static_cast(ptr); }); + return true; + } + return false; + }; + cudaq::packArgs(*argData, args, kernelFunc, callableArgHandler); return argData; } @@ -157,7 +170,6 @@ ExecutionEngine *jitKernel(const std::string &name, MlirModule module, pm.addPass(cudaq::opt::createGenerateKernelExecution( {.startingArgIdx = startingArgIdx})); pm.addPass(cudaq::opt::createGenerateDeviceCodeLoader({.jitTime = true})); - pm.addPass(cudaq::opt::createReturnToOutputLog()); pm.addPass(cudaq::opt::createLambdaLiftingPass()); pm.addPass(cudaq::opt::createDistributedDeviceCall()); std::string tl = getTransportLayer(); @@ -947,7 +959,7 @@ void bindAltLaunchKernel(py::module &mod, auto callableArgHandler = [](cudaq::OpaqueArguments &argData, py::object &arg) { - if (py::hasattr(arg, "module")) { + if (py::hasattr(arg, "module") || py::hasattr(arg, "__call__")) { // Just give it some dummy data that will not be used. 
// We synthesize away all callables, the block argument // remains but it is not used, so just give argsCreator diff --git a/python/runtime/interop/PythonCppInterop.h b/python/runtime/interop/PythonCppInterop.h index 9b39aada636..c74a8ec2872 100644 --- a/python/runtime/interop/PythonCppInterop.h +++ b/python/runtime/interop/PythonCppInterop.h @@ -7,6 +7,8 @@ ******************************************************************************/ #pragma once +#include "cudaq/qis/qkernel.h" +#include "cudaq/utils/registry.h" #include namespace py = pybind11; @@ -166,4 +168,28 @@ void addDeviceKernelInterop(py::module_ &m, const std::string &modName, kernelName, mangledArgs); return; } + +// Specialization for qkernel +template +py::object convertQkernel(py::module_ &m, cudaq::qkernel &qkernel, + const std::string &docstring = "") { + const char *qkernelName = cudaq::registry::getLinkableKernelNameOrNull( + cudaq::registry::__cudaq_getLinkableKernelKey(&qkernel)); + if (!qkernelName) + throw std::runtime_error( + "Invalid `qkernel` passed, could not find registered kernel."); + std::string kernelName = qkernelName; + // Rremove "function_" prefix if exists + if (kernelName.starts_with("function_")) + kernelName = kernelName.substr(std::string("function_").length()); + const std::string docStr = + docstring.empty() + ? "Auto-generated kernel from C++ " + kernelName + " qkernel." + : docstring; + m.def( + kernelName.c_str(), [](Args...) 
{}, docStr.c_str()); + cudaq::python::registerDeviceKernel(m.attr("__name__").cast(), + kernelName, ""); + return m.attr(kernelName.c_str()); +} } // namespace cudaq::python diff --git a/python/tests/interop/quantum_lib/quantum_lib.cpp b/python/tests/interop/quantum_lib/quantum_lib.cpp index 7c5cbb23054..ed313b32f88 100644 --- a/python/tests/interop/quantum_lib/quantum_lib.cpp +++ b/python/tests/interop/quantum_lib/quantum_lib.cpp @@ -29,4 +29,12 @@ __qpu__ void qft(cudaq::qview<> qubits, const std::vector &x, __qpu__ void another(cudaq::qview<> qubits, std::size_t i) { x(qubits[i]); } __qpu__ void uccsd(cudaq::qview<> qubits, std::size_t) { h(qubits[0]); } + +__qpu__ void reset_group(patch p) { + for (std::size_t i = 0; i < p.data.size(); i++) + reset(p.data[i]); +} + +__qpu__ void x_group(patch p) { x(p.data); } + } // namespace cudaq diff --git a/python/tests/interop/quantum_lib/quantum_lib.h b/python/tests/interop/quantum_lib/quantum_lib.h index a0655099237..4b9fa371351 100644 --- a/python/tests/interop/quantum_lib/quantum_lib.h +++ b/python/tests/interop/quantum_lib/quantum_lib.h @@ -9,6 +9,12 @@ #include "cudaq/qis/qubit_qis.h" +// Custom data structure +struct patch { + cudaq::qview<> data; + cudaq::qview<> aux; +}; + namespace cudaq { void entryPoint(const std::function &)> &statePrep); @@ -19,4 +25,8 @@ void another(cudaq::qview<> qubits, std::size_t); void uccsd(cudaq::qview<> qubits, std::size_t); +void reset_group(patch p); + +void x_group(patch p); + } // namespace cudaq diff --git a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp index 4ea2d2176cc..92f1382f2a8 100644 --- a/python/tests/interop/test_cpp_quantum_algorithm_module.cpp +++ b/python/tests/interop/test_cpp_quantum_algorithm_module.cpp @@ -8,6 +8,7 @@ #include "cudaq.h" #include "cudaq/algorithms/sample.h" +#include "cudaq/qis/qkernel.h" #include "quantum_lib/quantum_lib.h" #include "runtime/interop/PythonCppInterop.h" 
#include @@ -15,6 +16,22 @@ namespace py = pybind11; +namespace { +static std::unordered_map, std::size_t)>> + g_cppKernels_1; + +static std::unordered_map> + g_cppKernels_2; + +static const bool initKernels = []() { + g_cppKernels_1.insert(std::make_pair("uccsd", cudaq::uccsd)); + g_cppKernels_2.insert(std::make_pair("reset", cudaq::reset_group)); + g_cppKernels_2.insert(std::make_pair("x", cudaq::x_group)); + return true; +}(); +} // namespace + PYBIND11_MODULE(cudaq_test_cpp_algo, m) { m.def("test_cpp_qalgo", [](py::object statePrepIn) { @@ -49,4 +66,26 @@ PYBIND11_MODULE(cudaq_test_cpp_algo, m) { cudaq::python::addDeviceKernelInterop, std::size_t>( m, "qstd", "uccsd", ""); + + // Convert the C++ kernel registry to Python-accessible kernels + auto interopSubMod = m.def_submodule("_cpp_interop_kernels"); + static std::unordered_map g_py_kernels; + + for (auto &[name, kernel] : g_cppKernels_1) { + g_py_kernels.insert(std::make_pair( + name, cudaq::python::convertQkernel(interopSubMod, kernel))); + } + + for (auto &[name, kernel] : g_cppKernels_2) { + g_py_kernels.insert(std::make_pair( + name, cudaq::python::convertQkernel(interopSubMod, kernel))); + } + + m.def("get_cpp_kernel", [](const std::string &name) { + auto it = g_py_kernels.find(name); + if (it == g_py_kernels.end()) + throw std::runtime_error("No C++ kernel registered for requested name."); + + return it->second; + }); } diff --git a/python/tests/interop/test_interop.py b/python/tests/interop/test_interop.py index e63588408ac..4324e79c02c 100644 --- a/python/tests/interop/test_interop.py +++ b/python/tests/interop/test_interop.py @@ -7,6 +7,8 @@ # ============================================================================ # import cudaq, pytest +from typing import Callable +from dataclasses import dataclass cudaq_test_cpp_algo = pytest.importorskip('cudaq_test_cpp_algo') @@ -242,3 +244,79 @@ def entry(): takesCapture(spin) entry.compile() + + +def test_cpp_qkernel(): + # Test the `qkernel` provided in 
C++ via a map-like registry. + # This is provided as a function-like callable. + kernel_from_cpp_registry = cudaq_test_cpp_algo.get_cpp_kernel("uccsd") + + # Use as a capture + @cudaq.kernel + def cpp_qkernel(): + q = cudaq.qvector(4) + kernel_from_cpp_registry(q, 0) + + cpp_qkernel() + + # Use as a callable argument + @cudaq.kernel + def caller(k: Callable[[cudaq.qview, int], None]): + q = cudaq.qvector(4) + k(q, 0) + + caller(kernel_from_cpp_registry) + + +def test_cpp_custom_struct(): + # Define a struct in Python that matches the C++ struct + # Note: use `repr=False` to annotate that this is an unnamed struct. + # This will maintain compatibility with C++ structs that do not have + # a name. + @dataclass(slots=True, repr=False) + class patch: + data: cudaq.qvector + aux: cudaq.qvector + + reset_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("reset") + x_qkernel = cudaq_test_cpp_algo.get_cpp_kernel("x") + + # Use as a capture + @cudaq.kernel + def cpp_qkernel_struct(): + q = cudaq.qvector(4) + r = cudaq.qvector(2) + x(q) + reset_qkernel(patch(q, r)) + + counts = cudaq.sample(cpp_qkernel_struct) + counts.dump() + assert len(counts) == 1 and '000000' in counts + + @cudaq.kernel + def cpp_qkernel_struct_x(): + q = cudaq.qvector(4) + r = cudaq.qvector(2) + x_qkernel(patch(q, r)) + + counts = cudaq.sample(cpp_qkernel_struct_x) + counts.dump() + assert len(counts) == 1 and '111100' in counts + + # Callable + @cudaq.kernel + def cpp_qkernel_struct_callable(k: Callable[[patch], None]): + q = cudaq.qvector(4) + r = cudaq.qvector(2) + for i in range(4): + if i % 2 == 0: + x(q[i]) + k(patch(q, r)) + + counts = cudaq.sample(cpp_qkernel_struct_callable, reset_qkernel) + counts.dump() + assert len(counts) == 1 and '000000' in counts + + counts = cudaq.sample(cpp_qkernel_struct_callable, x_qkernel) + counts.dump() + assert len(counts) == 1 and '010100' in counts diff --git a/python/tests/kernel/test_run_async_kernel.py b/python/tests/kernel/test_run_async_kernel.py index 
fc1c0ac3aae..31796c18e15 100644 --- a/python/tests/kernel/test_run_async_kernel.py +++ b/python/tests/kernel/test_run_async_kernel.py @@ -14,8 +14,6 @@ import numpy as np import pytest -list_err_msg = 'does not yet support returning `list` from entry-point kernels' - def is_close(actual, expected): return np.isclose(actual, expected, atol=1e-6) @@ -338,38 +336,41 @@ def test_return_list_bool(): def simple_list_bool_no_args() -> list[bool]: return [True, False, True] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_bool_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_bool_no_args, shots_count=2).get() + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] @cudaq.kernel def simple_list_bool(n: int) -> list[bool]: qubits = cudaq.qvector(n) return [True, False, True] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_bool, 2, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_bool, 2, shots_count=2).get() + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] @cudaq.kernel def simple_list_bool_args(n: int, t: list[bool]) -> list[bool]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_bool_args, 2, [True, False, True]).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_bool_args, 2, [True, False, True], shots_count=2).get() + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] @cudaq.kernel def simple_list_bool_args_no_broadcast(t: list[bool]) -> list[bool]: qubits = cudaq.qvector(2) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_bool_args_no_broadcast, - [True, False, True]).get() - assert list_err_msg in str(e.value) - + results 
= cudaq.run_async(simple_list_bool_args_no_broadcast, + [True, False, True], shots_count=2).get() + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] def test_return_list_int(): @@ -377,18 +378,21 @@ def test_return_list_int(): def simple_list_int_no_args() -> list[int]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_int_no_args, shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int(n: int, t: list[int]) -> list[int]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int, 2, [-13, 5, 42], shots_count=2).get() - assert list_err_msg in str(e.value) + + results = cudaq.run_async(simple_list_int, 2, [-13, 5, 42], shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_int8(): @@ -397,18 +401,22 @@ def test_return_list_int8(): def simple_list_int8_no_args() -> list[np.int8]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int8_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + + results = cudaq.run_async(simple_list_int8_no_args, shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int8(n: int, t: list[np.int8]) -> list[np.int8]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int8, 2, [-13, 5, 42], shots_count=2).get() - assert list_err_msg in str(e.value) + + results = cudaq.run_async(simple_list_int8, 2, [-13, 5, 42], shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + 
assert results[1] == [-13, 5, 42] def test_return_list_int16(): @@ -417,18 +425,20 @@ def test_return_list_int16(): def simple_list_int16_no_args() -> list[np.int16]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int16_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_int16_no_args, shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int16(n: int, t: list[np.int16]) -> list[np.int16]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int16, 2, [-13, 5, 42], shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_int16, 2, [-13, 5, 42], shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_int32(): @@ -437,18 +447,20 @@ def test_return_list_int32(): def simple_list_int32_no_args() -> list[np.int32]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int32_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_int32_no_args, shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int32(n: int, t: list[np.int32]) -> list[np.int32]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int32, 2, [-13, 5, 42], shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_int32, 2, [-13, 5, 42], shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_int64(): @@ -457,18 +469,20 @@ def test_return_list_int64(): def 
simple_list_int64_no_args() -> list[np.int64]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int64_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_int64_no_args, shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int64(n: int, t: list[np.int64]) -> list[np.int64]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_int64, 2, [-13, 5, 42], shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_int64, 2, [-13, 5, 42], shots_count=2).get() + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_float(): @@ -477,20 +491,22 @@ def test_return_list_float(): def simple_list_float_no_args() -> list[float]: return [-13.2, 5., 42.99] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_float_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_float_no_args, shots_count=2).get() + assert len(results) == 2 + assert np.allclose(results[0], [-13.2, 5., 42.99]) + assert np.allclose(results[1], [-13.2, 5., 42.99]) @cudaq.kernel def simple_list_float(n: int, t: list[float]) -> list[float]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_float, + results = cudaq.run_async(simple_list_float, 2, [-13.2, 5.0, 42.99], shots_count=2).get() - assert list_err_msg in str(e.value) + assert len(results) == 2 + assert np.allclose(results[0], [-13.2, 5., 42.99]) + assert np.allclose(results[1], [-13.2, 5., 42.99]) def test_return_list_float32(): @@ -499,20 +515,22 @@ def test_return_list_float32(): def simple_list_float32_no_args() -> list[np.float32]: return [-13.2, 5., 42.99] - with 
pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_float32_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_float32_no_args, shots_count=2).get() + assert len(results) == 2 + assert np.allclose(results[0], [-13.2, 5., 42.99]) + assert np.allclose(results[1], [-13.2, 5., 42.99]) @cudaq.kernel def simple_list_float32(n: int, t: list[np.float32]) -> list[np.float32]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_float32, + results = cudaq.run_async(simple_list_float32, 2, [-13.2, 5.0, 42.99], shots_count=2).get() - assert list_err_msg in str(e.value) + assert len(results) == 2 + assert np.allclose(results[0], [-13.2, 5., 42.99]) + assert np.allclose(results[1], [-13.2, 5., 42.99]) def test_return_list_float64(): @@ -521,21 +539,22 @@ def test_return_list_float64(): def simple_list_float64_no_args() -> list[np.float64]: return [-13.2, 5., 42.99] - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_float64_no_args, shots_count=2).get() - assert list_err_msg in str(e.value) + results = cudaq.run_async(simple_list_float64_no_args, shots_count=2).get() + assert len(results) == 2 + assert np.allclose(results[0], [-13.2, 5., 42.99]) + assert np.allclose(results[1], [-13.2, 5., 42.99]) @cudaq.kernel def simple_list_float64(n: int, t: list[np.float64]) -> list[np.float64]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run_async(simple_list_float64, + results = cudaq.run_async(simple_list_float64, 2, [-13.2, 5.0, 42.99], shots_count=2).get() - assert list_err_msg in str(e.value) - + assert len(results) == 2 + assert np.allclose(results[0], [-13.2, 5., 42.99]) + assert np.allclose(results[1], [-13.2, 5., 42.99]) # Test tuples # TODO: Define spec for using tuples in kernels diff --git a/python/tests/kernel/test_run_kernel.py b/python/tests/kernel/test_run_kernel.py index 3e656ee16a9..d73b35fa352 
100644 --- a/python/tests/kernel/test_run_kernel.py +++ b/python/tests/kernel/test_run_kernel.py @@ -14,8 +14,6 @@ import warnings import pytest -list_err_msg = 'does not yet support returning `list` from entry-point kernels' - skipIfBraketNotInstalled = pytest.mark.skipif( not (cudaq.has_target("braket")), reason='Could not find `braket` in installation') @@ -333,36 +331,41 @@ def test_return_list_bool(): def simple_list_bool_no_args() -> list[bool]: return [True, False, True] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_bool_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_bool_no_args, shots_count=2) + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] @cudaq.kernel def simple_list_bool(n: int) -> list[bool]: qubits = cudaq.qvector(n) return [True, False, True] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_bool, 2, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_bool, 2, shots_count=2) + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] @cudaq.kernel def simple_list_bool_args(n: int, t: list[bool]) -> list[bool]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_bool_args, 2, [True, False, True]) - assert list_err_msg in str(e.value) + + results = cudaq.run(simple_list_bool_args, 2, [True, False, True], shots_count=2) + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] @cudaq.kernel def simple_list_bool_args_no_broadcast(t: list[bool]) -> list[bool]: qubits = cudaq.qvector(2) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_bool_args_no_broadcast, [True, False, True]) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_bool_args_no_broadcast, [True, False, True], 
shots_count=2) + assert len(results) == 2 + assert results[0] == [True, False, True] + assert results[1] == [True, False, True] def test_return_list_int(): @@ -371,18 +374,20 @@ def test_return_list_int(): def simple_list_int_no_args() -> list[int]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int_no_args, shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int(n: int, t: list[int]) -> list[int]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int, 2, [-13, 5, 42], shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int, 2, [-13, 5, 42], shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_int8(): @@ -391,18 +396,21 @@ def test_return_list_int8(): def simple_list_int8_no_args() -> list[np.int8]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int8_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int8_no_args, shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] + @cudaq.kernel def simple_list_int8(n: int, t: list[np.int8]) -> list[np.int8]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int8, 2, [-13, 5, 42], shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int8, 2, [-13, 5, 42], shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_int16(): @@ -411,18 +419,21 @@ def test_return_list_int16(): def simple_list_int16_no_args() -> list[np.int16]: return [-13, 5, 
42] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int16_no_args, shots_count=2) - assert list_err_msg in str(e.value) + + results = cudaq.run(simple_list_int16_no_args, shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int16(n: int, t: list[np.int16]) -> list[np.int16]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int16, 2, [-13, 5, 42], shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int16, 2, [-13, 5, 42], shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_int32(): @@ -431,18 +442,20 @@ def test_return_list_int32(): def simple_list_int32_no_args() -> list[np.int32]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int32_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int32_no_args, shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int32(n: int, t: list[np.int32]) -> list[np.int32]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int32, 2, [-13, 5, 42], shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int32, 2, [-13, 5, 42], shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_int64(): @@ -451,18 +464,20 @@ def test_return_list_int64(): def simple_list_int64_no_args() -> list[np.int64]: return [-13, 5, 42] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int64_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int64_no_args, shots_count=2) + assert len(results) == 2 + assert 
results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] @cudaq.kernel def simple_list_int64(n: int, t: list[np.int64]) -> list[np.int64]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_int64, 2, [-13, 5, 42], shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_int64, 2, [-13, 5, 42], shots_count=2) + assert len(results) == 2 + assert results[0] == [-13, 5, 42] + assert results[1] == [-13, 5, 42] def test_return_list_float(): @@ -471,18 +486,20 @@ def test_return_list_float(): def simple_list_float_no_args() -> list[float]: return [-13.2, 5., 42.99] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_float_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_float_no_args, shots_count=2) + assert len(results) == 2 + assert is_close_array(results[0], [-13.2, 5., 42.99]) + assert is_close_array(results[1], [-13.2, 5., 42.99]) @cudaq.kernel def simple_list_float(n: int, t: list[float]) -> list[float]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_float, 2, [-13.2, 5.0, 42.99], shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_float, 2, [-13.2, 5.0, 42.99], shots_count=2) + assert len(results) == 2 + assert is_close_array(results[0], [-13.2, 5., 42.99]) + assert is_close_array(results[1], [-13.2, 5., 42.99]) def test_return_list_float32(): @@ -491,18 +508,20 @@ def test_return_list_float32(): def simple_list_float32_no_args() -> list[np.float32]: return [-13.2, 5., 42.99] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_float32_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_float32_no_args, shots_count=2) + assert len(results) == 2 + assert is_close_array(results[0], [-13.2, 5., 42.99]) + assert is_close_array(results[1], [-13.2, 5., 42.99]) @cudaq.kernel def 
simple_list_float32(n: int, t: list[np.float32]) -> list[np.float32]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_float32, 2, [-13.2, 5.0, 42.99], shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_float32, 2, [-13.2, 5.0, 42.99], shots_count=2) + assert len(results) == 2 + assert is_close_array(results[0], [-13.2, 5., 42.99]) + assert is_close_array(results[1], [-13.2, 5., 42.99]) def test_return_list_float64(): @@ -511,19 +530,84 @@ def test_return_list_float64(): def simple_list_float64_no_args() -> list[np.float64]: return [-13.2, 5., 42.99] - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_float64_no_args, shots_count=2) - assert list_err_msg in str(e.value) + results = cudaq.run(simple_list_float64_no_args, shots_count=2) + assert len(results) == 2 + assert is_close_array(results[0], [-13.2, 5., 42.99]) + assert is_close_array(results[1], [-13.2, 5., 42.99]) @cudaq.kernel def simple_list_float64(n: int, t: list[np.float64]) -> list[np.float64]: qubits = cudaq.qvector(n) return t - with pytest.raises(RuntimeError) as e: - cudaq.run(simple_list_float64, 2, [-13.2, 5.0, 42.99], shots_count=2) - assert list_err_msg in str(e.value) + + results = cudaq.run(simple_list_float64, 2, [-13.2, 5.0, 42.99], shots_count=2) + assert len(results) == 2 + assert is_close_array(results[0], [-13.2, 5., 42.99]) + assert is_close_array(results[1], [-13.2, 5., 42.99]) +def test_return_list_large_size(): + # Returns a large list (dynamic size) to stress test the code generation + + @cudaq.kernel + def kernel_with_dynamic_int_array_input(n: int, t: list[int]) -> list[int]: + qubits = cudaq.qvector(n) + return t + + @cudaq.kernel + def kernel_with_dynamic_float_array_input(n: int, t: list[float]) -> list[float]: + qubits = cudaq.qvector(n) + return t + + @cudaq.kernel + def kernel_with_dynamic_bool_array_input(n: int, t: list[bool]) -> list[bool]: + qubits = cudaq.qvector(n) + 
return t + + # Test with various sizes (validate dynamic output logging) + for array_size in [10, 15, 100, 167, 1000]: + input_array = list(np.random.randint(-1000, 1000, size=array_size)) + results = cudaq.run(kernel_with_dynamic_int_array_input, 2, input_array, shots_count=2) + assert len(results) == 2 + assert results[0] == input_array + assert results[1] == input_array + + input_array_float = list(np.random.uniform(-1000.0, 1000.0, size=array_size)) + results = cudaq.run(kernel_with_dynamic_float_array_input, 2, input_array_float, shots_count=2) + assert len(results) == 2 + assert is_close_array(results[0], input_array_float) + assert is_close_array(results[1], input_array_float) + + input_array_bool = [] + for _ in range(array_size): + input_array_bool.append(True if np.random.rand() > 0.5 else False) + results = cudaq.run(kernel_with_dynamic_bool_array_input, 2, input_array_bool, shots_count=2) + assert len(results) == 2 + assert results[0] == input_array_bool + assert results[1] == input_array_bool + +def test_return_dynamics_measure_results(): + @cudaq.kernel + def measure_all_qubits(numQubits: int) -> list[bool]: + # Number of qubits is dynamic + qubits = cudaq.qvector(numQubits) + for i in range(numQubits): + if i % 2 == 0: + x(qubits[i]) + + return mz(qubits) + + for numQubits in [1, 3, 5, 11, 20]: + shots = 2 + results = cudaq.run(measure_all_qubits, numQubits, shots_count=shots) + assert len(results) == shots + for res in results: + assert len(res) == numQubits + for i in range(numQubits): + if i % 2 == 0: + assert res[i] == True + else: + assert res[i] == False # Test tuples # TODO: Define spec for using tuples in kernels diff --git a/python/tests/kernel/test_to_integer.py b/python/tests/kernel/test_to_integer.py new file mode 100644 index 00000000000..959341a3698 --- /dev/null +++ b/python/tests/kernel/test_to_integer.py @@ -0,0 +1,41 @@ +# ============================================================================ # +# Copyright (c) 2022 - 2025 
NVIDIA Corporation & Affiliates. # +# All rights reserved. # +# # +# This source code and the accompanying materials are made available under # +# the terms of the Apache License 2.0 which accompanies this distribution. # +# ============================================================================ # +import pytest +import os +import cudaq + +def testToInteger(): + @cudaq.kernel + def toIntegerKernel(applyX: list[int]) -> int: + q = cudaq.qvector(len(applyX)) + for i in range(len(applyX)): + if applyX[i]: + x(q[i]) + return cudaq.to_integer(mz(q)) + + test_cases = [ + [1, 1, 1], + [1, 1, 1, 1], + [1, 0, 1], + [1, 0, 0, 0], + [0, 0, 0, 1], + ] + + # See reference: targettests/execution/to_integer.cpp + expected_results = [7, 15, 5, 1, 8] + for applyX in test_cases: + counts = cudaq.run(toIntegerKernel, applyX) + # All shots should yield the same integer result + for result in counts: + assert result == expected_results[test_cases.index(applyX)] + + +# leave for gdb debugging +if __name__ == "__main__": + loc = os.path.abspath(__file__) + pytest.main([loc, "-rP"]) \ No newline at end of file diff --git a/python/tests/mlir/ast_list_comprehension.py b/python/tests/mlir/ast_list_comprehension.py index ba3e936db4c..e0051bf28e7 100644 --- a/python/tests/mlir/ast_list_comprehension.py +++ b/python/tests/mlir/ast_list_comprehension.py @@ -55,10 +55,12 @@ def kernel3() -> float: # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel1() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"} -# CHECK: %[[VAL_0:.*]] = arith.constant true -# CHECK: %[[VAL_1:.*]] = cc.alloca !cc.array -# CHECK: %[[VAL_2:.*]] = cc.compute_ptr %[[VAL_1]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr -# CHECK: cc.store %[[VAL_0]], %[[VAL_2]] : !cc.ptr +# CHECK: %[[VAL_0:.*]] = arith.constant 1 : i8 +# CHECK: %[[VAL_1:.*]] = cc.alloca !cc.array +# CHECK: %[[VAL_2:.*]] = cc.cast %[[VAL_1]] : (!cc.ptr>) -> !cc.ptr> +# CHECK: %[[VAL_3:.*]] = cc.compute_ptr %[[VAL_1]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr +# CHECK: 
cc.store %[[VAL_0]], %[[VAL_3]] : !cc.ptr +# CHECK: %[[VAL_5:.*]] = cc.stdvec_init %[[VAL_2]], %c5_i64 : (!cc.ptr>, i64) -> !cc.stdvec # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel2() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"} # CHECK: %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64 @@ -122,10 +124,12 @@ def kernel3() -> float: # CHECK: %[[VAL_0:.*]] = arith.constant true # CHECK: %[[VAL_1:.*]] = cc.alloca i1 # CHECK: cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr -# CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array -# CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_1]] : !cc.ptr -# CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr -# CHECK: cc.store %[[VAL_3]], %[[VAL_4]] : !cc.ptr +# CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array +# CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>) -> !cc.ptr> +# CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_1]] : !cc.ptr +# CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr +# CHECK: %[[VAL_6:.*]] = cc.cast unsigned %[[VAL_4]] : (i1) -> i8 +# CHECK: cc.store %[[VAL_6]], %[[VAL_5]] : !cc.ptr # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel2() -> i64 attributes {"cudaq-entrypoint", "cudaq-kernel"} # CHECK: %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64 @@ -198,10 +202,12 @@ def kernel3() -> float: # CHECK: %[[VAL_0:.*]] = arith.constant true # CHECK: %[[VAL_1:.*]] = cc.alloca i1 # CHECK: cc.store %[[VAL_0]], %[[VAL_1]] : !cc.ptr -# CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array -# CHECK: %[[VAL_3:.*]] = cc.load %[[VAL_1]] : !cc.ptr -# CHECK: %[[VAL_4:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr -# CHECK: cc.store %[[VAL_3]], %[[VAL_4]] : !cc.ptr +# CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array +# CHECK: %[[VAL_3:.*]] = cc.cast %[[VAL_2]] : (!cc.ptr>) -> !cc.ptr> +# CHECK: %[[VAL_4:.*]] = cc.load %[[VAL_1]] : !cc.ptr +# CHECK: %[[VAL_5:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr>, i64) -> !cc.ptr +# CHECK: %[[VAL_6:.*]] = cc.cast unsigned 
%[[VAL_4]] : (i1) -> i8 +# CHECK: cc.store %[[VAL_6]], %[[VAL_5]] : !cc.ptr # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel2() -> f64 attributes {"cudaq-entrypoint", "cudaq-kernel"} # CHECK: %[[VAL_0:.*]] = arith.constant 1.000000e+00 : f64 @@ -271,14 +277,14 @@ def kernel3() -> float: # CHECK-LABEL: func.func @__nvqpp__mlirgen__kernel1() -> i1 attributes {"cudaq-entrypoint", "cudaq-kernel"} -# CHECK: %[[VAL_0:.*]] = arith.constant 1 : i64 -# CHECK: %[[VAL_1:.*]] = arith.constant true +# CHECK: %[[VAL_0:.*]] = arith.constant 1 : i8 +# CHECK: %[[VAL_1:.*]] = arith.constant 1 : i64 # CHECK: %[[VAL_2:.*]] = cc.alloca !cc.array x 5> -# CHECK: %[[VAL_3:.*]] = cc.alloca !cc.array -# CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr>) -> !cc.ptr> -# CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr>) -> !cc.ptr -# CHECK: cc.store %[[VAL_1]], %[[VAL_5]] : !cc.ptr -# CHECK: %[[VAL_6:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_0]] : (!cc.ptr>, i64) -> !cc.stdvec +# CHECK: %[[VAL_3:.*]] = cc.alloca !cc.array +# CHECK: %[[VAL_4:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr>) -> !cc.ptr> +# CHECK: %[[VAL_5:.*]] = cc.cast %[[VAL_3]] : (!cc.ptr>) -> !cc.ptr +# CHECK: cc.store %[[VAL_0]], %[[VAL_5]] : !cc.ptr +# CHECK: %[[VAL_6:.*]] = cc.stdvec_init %[[VAL_4]], %[[VAL_1]] : (!cc.ptr>, i64) -> !cc.stdvec # CHECK: %[[VAL_7:.*]] = cc.compute_ptr %[[VAL_2]][{{.*}}] : (!cc.ptr x 5>>, i64) -> !cc.ptr> # CHECK: cc.store %[[VAL_6]], %[[VAL_7]] : !cc.ptr>