Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 130 additions & 0 deletions lib/Optimizer/Builder/Intrinsics.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,136 @@ static constexpr IntrinsicCode intrinsicTable[] = {
{cudaq::stdvecBoolCtorFromInitList, {}, R"#(
func.func private @__nvqpp_initializer_list_to_vector_bool(!cc.ptr<none>, !cc.ptr<none>, i64) -> ())#"},

// __nvqpp_internal_number_of_digits(i64) -> i64
// Counts the decimal digits of the argument by repeatedly dividing a local
// copy by 10 while it is strictly greater than 0, incrementing a counter each
// time. An argument of exactly 0 is special-cased to start the counter at 1.
// Note: the loop condition is `sgt 0`, so a negative argument never enters
// the loop and the function returns 0 for it.
{"__nvqpp_internal_number_of_digits", {}, R"#(
func.func private @__nvqpp_internal_number_of_digits(%arg0: i64) -> i64 {
%c10_i64 = arith.constant 10 : i64
%c0_i64 = arith.constant 0 : i64
%c1_i64 = arith.constant 1 : i64
%0 = cc.alloca i64
cc.store %arg0, %0 : !cc.ptr<i64>
%1 = cc.load %0 : !cc.ptr<i64>
%2 = cc.alloca i64
cc.store %c0_i64, %2 : !cc.ptr<i64>
%3 = arith.cmpi eq, %1, %c0_i64 : i64
cc.if(%3) {
cc.store %c1_i64, %2 : !cc.ptr<i64>
}
cc.loop while {
%5 = cc.load %0 : !cc.ptr<i64>
%6 = arith.cmpi sgt, %5, %c0_i64 : i64
cc.condition %6
} do {
%5 = cc.load %0 : !cc.ptr<i64>
%6 = arith.divsi %5, %c10_i64 : i64
cc.store %6, %0 : !cc.ptr<i64>
%7 = cc.load %2 : !cc.ptr<i64>
%8 = arith.addi %7, %c1_i64 : i64
cc.store %8, %2 : !cc.ptr<i64>
cc.continue
}
%4 = cc.load %2 : !cc.ptr<i64>
return %4 : i64
}
)#"},

// __nvqpp_internal_tostring(%buf: !cc.stdvec<i8>, %val: i64)
// Writes the base-10 text of %val into %buf, right-aligned:
//   1. Stores a 0 byte in the last element of %buf (index size - 1).
//   2. Starting at index size - 2 and moving towards index 0, while the
//      remaining value is >= 10 and the index is >= 0, stores
//      (value % 10) + 48 and divides the value by 10.
//   3. Stores (remaining value + 48) at the current index. This store is not
//      guarded by a bounds check, so the buffer must be large enough; the
//      caller visible in this change allocates number-of-digits + 1 bytes.
//   4. Fills every element before that index with 48 (ASCII '0').
// NOTE(review): negative values are not given a sign; this appears to assume
// %val is non-negative -- confirm against callers.
{"__nvqpp_internal_tostring", {}, R"#(
func.func private @__nvqpp_internal_tostring(%buf: !cc.stdvec<i8>, %val: i64) {
%c48_i64 = arith.constant 48 : i64
%c48_i32 = arith.constant 48 : i32
%c0_i64 = arith.constant 0 : i64
%c10_i64 = arith.constant 10 : i64
%c1_i64 = arith.constant 1 : i64
%c48_i8 = arith.constant 48 : i8
%false = arith.constant false
%c0_i8 = arith.constant 0 : i8
%0 = cc.alloca i64
cc.store %val, %0 : !cc.ptr<i64>
%1 = cc.alloca i64
cc.store %c10_i64, %1 : !cc.ptr<i64>
%2 = cc.stdvec_size %buf : (!cc.stdvec<i8>) -> i64
%3 = cc.alloca i64
cc.store %2, %3 : !cc.ptr<i64>
%4 = cc.load %3 : !cc.ptr<i64>
%5 = arith.subi %4, %c1_i64 : i64
%6 = cc.alloca i64
cc.store %5, %6 : !cc.ptr<i64>
%7 = cc.load %6 : !cc.ptr<i64>
%8 = cc.stdvec_data %buf : (!cc.stdvec<i8>) -> !cc.ptr<!cc.array<i8 x ?>>
%9 = cc.compute_ptr %8[%7] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8>
cc.store %c0_i8, %9 : !cc.ptr<i8>
%10 = cc.load %6 : !cc.ptr<i64>
%11 = arith.subi %10, %c1_i64 : i64
cc.store %11, %6 : !cc.ptr<i64>
cc.loop while {
%18 = cc.load %0 : !cc.ptr<i64>
%19 = cc.load %1 : !cc.ptr<i64>
%20 = arith.cmpi sge, %18, %19 : i64
%21 = arith.cmpi eq, %20, %false : i1
%22 = cc.if(%21) -> i1 {
cc.continue %false : i1
} else {
%23 = cc.load %6 : !cc.ptr<i64>
%24 = arith.cmpi sge, %23, %c0_i64 : i64
cc.continue %24 : i1
}
cc.condition %22
} do {
cc.scope {
%18 = cc.load %0 : !cc.ptr<i64>
%19 = cc.load %1 : !cc.ptr<i64>
%20 = arith.remsi %18, %19 : i64
%21 = cc.cast %20 : (i64) -> i32
%22 = cc.alloca i32
cc.store %21, %22 : !cc.ptr<i32>
%23 = cc.load %1 : !cc.ptr<i64>
%24 = cc.load %0 : !cc.ptr<i64>
%25 = arith.divsi %24, %23 : i64
cc.store %25, %0 : !cc.ptr<i64>
%26 = cc.load %6 : !cc.ptr<i64>
%27 = cc.stdvec_data %buf : (!cc.stdvec<i8>) -> !cc.ptr<!cc.array<i8 x ?>>
%28 = cc.compute_ptr %27[%26] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8>
%29 = cc.load %22 : !cc.ptr<i32>
%30 = arith.addi %29, %c48_i32 : i32
%31 = cc.cast %30 : (i32) -> i8
cc.store %31, %28 : !cc.ptr<i8>
%32 = cc.load %6 : !cc.ptr<i64>
%33 = arith.subi %32, %c1_i64 : i64
cc.store %33, %6 : !cc.ptr<i64>
}
cc.continue
}
%12 = cc.load %6 : !cc.ptr<i64>
%13 = cc.stdvec_data %buf : (!cc.stdvec<i8>) -> !cc.ptr<!cc.array<i8 x ?>>
%14 = cc.compute_ptr %13[%12] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8>
%15 = cc.load %0 : !cc.ptr<i64>
%16 = arith.addi %15, %c48_i64 : i64
%17 = cc.cast %16 : (i64) -> i8
cc.store %17, %14 : !cc.ptr<i8>
cc.scope {
%18 = cc.alloca i64
cc.store %c0_i64, %18 : !cc.ptr<i64>
cc.loop while {
%19 = cc.load %18 : !cc.ptr<i64>
%20 = cc.load %6 : !cc.ptr<i64>
%21 = arith.cmpi slt, %19, %20 : i64
cc.condition %21
} do {
%19 = cc.load %18 : !cc.ptr<i64>
%20 = cc.stdvec_data %buf : (!cc.stdvec<i8>) -> !cc.ptr<!cc.array<i8 x ?>>
%21 = cc.compute_ptr %20[%19] : (!cc.ptr<!cc.array<i8 x ?>>, i64) -> !cc.ptr<i8>
cc.store %c48_i8, %21 : !cc.ptr<i8>
cc.continue
} step {
%19 = cc.load %18 : !cc.ptr<i64>
%20 = arith.addi %19, %c1_i64 : i64
cc.store %20, %18 : !cc.ptr<i64>
}
}
return
}
)#"},

// This helper function copies a buffer off the stack to the heap. This is
// required when the data on the stack is about to go out of scope but is
// still live.
Expand Down
1 change: 1 addition & 0 deletions lib/Optimizer/CodeGen/Pipelines.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,7 @@ void createTargetCodegenPipeline(PassManager &pm,
pm.addNestedPass<func::FuncOp>(createCSEPass());
::addQIRConversionPipeline(pm, options.target);
pm.addPass(cudaq::opt::createReturnToOutputLog());
cudaq::opt::addLowerToCFG(pm);
pm.addPass(createConvertMathToFuncs());
pm.addPass(createSymbolDCEPass());
pm.addPass(cudaq::opt::createCCToLLVM());
Expand Down
156 changes: 147 additions & 9 deletions lib/Optimizer/CodeGen/ReturnToOutputLog.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,17 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
}

static void genOutputLog(Location loc, PatternRewriter &rewriter, Value val,
std::optional<StringRef> prefix) {
std::optional<StringRef> prefix,
std::optional<Value> customLabel = std::nullopt) {
Type valTy = val.getType();
TypeSwitch<Type>(valTy)
.Case([&](IntegerType intTy) {
int width = intTy.getWidth();
std::string labelStr = std::string("i") + std::to_string(width);
if (prefix)
labelStr = prefix->str();
Value label = makeLabel(loc, rewriter, labelStr);
Value label =
customLabel.value_or(makeLabel(loc, rewriter, labelStr));
if (intTy.getWidth() == 1) {
rewriter.create<func::CallOp>(loc, TypeRange{},
cudaq::opt::QIRBoolRecordOutput,
Expand All @@ -80,7 +82,8 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
std::string labelStr = std::string("f") + std::to_string(width);
if (prefix)
labelStr = prefix->str();
Value label = makeLabel(loc, rewriter, labelStr);
Value label =
customLabel.value_or(makeLabel(loc, rewriter, labelStr));
// Floating point: convert it to double, whatever it actually is.
Value castVal = val;
if (floatTy != rewriter.getF64Type())
Expand All @@ -94,7 +97,8 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
auto labelStr = translateType(structTy);
if (prefix)
labelStr = prefix->str();
Value label = makeLabel(loc, rewriter, labelStr);
Value label =
customLabel.value_or(makeLabel(loc, rewriter, labelStr));
std::int32_t sz = structTy.getNumMembers();
Value size = rewriter.create<arith::ConstantIntOp>(loc, sz, 64);
rewriter.create<func::CallOp>(loc, TypeRange{},
Expand All @@ -111,7 +115,8 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
})
.Case([&](cudaq::cc::ArrayType arrTy) {
auto labelStr = translateType(arrTy);
Value label = makeLabel(loc, rewriter, labelStr);
Value label =
customLabel.value_or(makeLabel(loc, rewriter, labelStr));
std::int32_t sz = arrTy.getSize();
Value size = rewriter.create<arith::ConstantIntOp>(loc, sz, 64);
rewriter.create<func::CallOp>(loc, TypeRange{},
Expand All @@ -128,13 +133,12 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
}
})
.Case([&](cudaq::cc::StdvecType vecTy) {
// For this type, we expect a cc.stdvec_init operation as the input.
// The data will be in a variable.
// If we reach here and we cannot determine the constant size of the
// buffer, then we will not generate any output logging.
if (auto vecInit = val.getDefiningOp<cudaq::cc::StdvecInitOp>())
if (auto maybeLen = cudaq::opt::factory::maybeValueOfIntConstant(
vecInit.getLength())) {
// For this type, we expect a cc.stdvec_init operation as the
// input.
// The data will be in a variable.
std::int32_t sz = *maybeLen;
auto labelStr = translateType(vecTy, sz);
Value label = makeLabel(loc, rewriter, labelStr);
Expand All @@ -158,7 +162,55 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
Value w = rewriter.create<cudaq::cc::LoadOp>(loc, v);
genOutputLog(loc, rewriter, w, offset);
}
return;
}

// If we reach here, the size of the buffer could not be determined as a
// compile-time constant, so we generate dynamic output logging with a
// loop over the runtime length instead.
Value vecSz = rewriter.template create<cudaq::cc::StdvecSizeOp>(
loc, rewriter.getI64Type(), val);
const std::string arrayLabelPrefix =
"array<" + translateType(vecTy.getElementType()) + " x ";
Value labelBuffer =
makeLabel(loc, rewriter, arrayLabelPrefix, vecSz, ">");
rewriter.create<func::CallOp>(loc, TypeRange{},
cudaq::opt::QIRArrayRecordOutput,
ArrayRef<Value>{vecSz, labelBuffer});
auto eleTy = vecTy.getElementType();
const bool isBool = (eleTy == rewriter.getI1Type());
if (isBool)
eleTy = rewriter.getI8Type();
auto elePtrTy = cudaq::cc::PointerType::get(eleTy);
auto eleArrTy =
cudaq::cc::PointerType::get(cudaq::cc::ArrayType::get(eleTy));
auto vecPtr =
rewriter.create<cudaq::cc::StdvecDataOp>(loc, eleArrTy, val);
const std::string preStr = prefix ? prefix->str() : std::string{};
cudaq::opt::factory::createInvariantLoop(
rewriter, loc, vecSz,
[&](OpBuilder &builder, Location loc, Region &, Block &block) {
Value indexVar = block.getArgument(0);
auto eleAddr = rewriter.create<cudaq::cc::ComputePtrOp>(
loc, elePtrTy, vecPtr, ValueRange{indexVar});

Value w = [&]() {
if (isBool) {
auto i1PtrTy =
cudaq::cc::PointerType::get(rewriter.getI1Type());
auto i1Cast = rewriter.create<cudaq::cc::CastOp>(
loc, i1PtrTy, eleAddr);
return rewriter.create<cudaq::cc::LoadOp>(loc, i1Cast);
}

return rewriter.create<cudaq::cc::LoadOp>(loc, eleAddr);
}();
const std::string prefix = preStr + "[";
const std::string postfix = "]";
Value dynamicLabel =
makeLabel(loc, rewriter, prefix, indexVar, postfix);
genOutputLog(loc, rewriter, w, std::nullopt, dynamicLabel);
});
})
.Default([&](Type) {
// If we reach here, we don't know how to handle this type.
Expand Down Expand Up @@ -207,6 +259,79 @@ class ReturnRewrite : public OpRewritePattern<cudaq::cc::LogOutputOp> {
auto i8PtrTy = cudaq::cc::PointerType::get(rewriter.getI8Type());
return rewriter.create<cudaq::cc::CastOp>(loc, i8PtrTy, lit);
}

/// Emit IR that assembles the label `<prefix><decimal value of val><postFix>`
/// at runtime and return a pointer (`!cc.ptr<i8>`) to the assembled bytes.
///
/// \param loc      Location attached to every created operation.
/// \param rewriter Rewriter used to create the operations.
/// \param prefix   Compile-time text placed before the number.
/// \param val      Runtime integer to render; cast to i64 when it is not
///                 already of that type.
/// \param postFix  Compile-time text placed after the number.
/// \return The i8 pointer to the stack buffer holding the label.
///
/// The digits are produced by the `__nvqpp_internal_number_of_digits` and
/// `__nvqpp_internal_tostring` intrinsics, and the pieces are stitched
/// together with three calls to `cudaq::llvmMemCopyIntrinsic`.
static Value makeLabel(Location loc, PatternRewriter &rewriter,
                       const std::string &prefix, Value val,
                       const std::string &postFix) {
  auto int64Ty = rewriter.getI64Type();
  auto byteTy = rewriter.getI8Type();
  auto bytePtrTy = cudaq::cc::PointerType::get(byteTy);

  // The helper intrinsics take an i64, so normalize the operand first.
  if (val.getType() != int64Ty)
    val = rewriter.create<cudaq::cc::CastOp>(loc, int64Ty, val);

  // Ask the runtime helper how many decimal digits the value needs.
  auto digitCountCall = rewriter.create<func::CallOp>(
      loc, int64Ty, "__nvqpp_internal_number_of_digits",
      ArrayRef<Value>{val});
  Value numDigits = digitCountCall.getResult(0);

  // Render the integer into a scratch buffer of (digits + 1) bytes; the extra
  // byte is for the null terminator.
  Value one = rewriter.create<arith::ConstantIntOp>(loc, 1, 64);
  auto digitBufLen = rewriter.create<arith::AddIOp>(loc, numDigits, one);
  auto digitBufAlloc =
      rewriter.create<cudaq::cc::AllocaOp>(loc, byteTy, digitBufLen);
  auto byteVecTy = cudaq::cc::StdvecType::get(byteTy);
  auto digitVec = rewriter.create<cudaq::cc::StdvecInitOp>(
      loc, byteVecTy, digitBufAlloc, digitBufLen);
  rewriter.create<func::CallOp>(loc, TypeRange{}, "__nvqpp_internal_tostring",
                                ArrayRef<Value>{digitVec, val});
  Value valStrBuf =
      rewriter.create<cudaq::cc::CastOp>(loc, bytePtrTy, digitBufAlloc);

  // String literals for the fixed parts of the label.
  Value prefixLit = makeLabel(loc, rewriter, prefix);
  Value postfixLit = makeLabel(loc, rewriter, postFix);
  const int preFixLen = prefix.size();
  const int postFixLen = postFix.size();

  // Destination buffer: prefix + digits + postfix + 1 trailing byte.
  Value fixedLen = rewriter.create<arith::ConstantIntOp>(
      loc, preFixLen + postFixLen + 1, 64);
  Value totalStrSize =
      rewriter.create<arith::AddIOp>(loc, numDigits, fixedLen);
  auto labelBufferAlloc =
      rewriter.create<cudaq::cc::AllocaOp>(loc, byteTy, totalStrSize);
  Value labelBuffer =
      rewriter.create<cudaq::cc::CastOp>(loc, bytePtrTy, labelBufferAlloc);

  // Emits one memcpy call. The trailing i1 zero is presumably the intrinsic's
  // `isVolatile` flag -- TODO confirm against the intrinsic declaration.
  auto emitCopy = [&](Value dst, Value src, Value numBytes) {
    Value flag = rewriter.create<arith::ConstantIntOp>(loc, 0, 1);
    rewriter.create<func::CallOp>(loc, std::nullopt,
                                  cudaq::llvmMemCopyIntrinsic,
                                  ValueRange{dst, src, numBytes, flag});
  };

  // 1) The prefix goes to the start of the buffer.
  Value prefixBytes = rewriter.create<arith::ConstantIntOp>(loc, preFixLen, 64);
  emitCopy(labelBuffer, prefixLit, prefixBytes);

  // 2) The digits go right after the prefix (without their terminator).
  Value digitsOffset =
      rewriter.create<arith::ConstantIntOp>(loc, preFixLen, 64);
  Value digitsDst = rewriter.create<cudaq::cc::ComputePtrOp>(
      loc, bytePtrTy, labelBufferAlloc, ValueRange{digitsOffset});
  emitCopy(digitsDst, valStrBuf, numDigits);

  // 3) The postfix goes after the digits. One extra byte is copied from the
  //    literal; this assumes the single-argument makeLabel yields a
  //    null-terminated string -- TODO confirm.
  Value prefixLenAgain =
      rewriter.create<arith::ConstantIntOp>(loc, preFixLen, 64);
  Value postfixOffset =
      rewriter.create<arith::AddIOp>(loc, numDigits, prefixLenAgain);
  Value postfixDst = rewriter.create<cudaq::cc::ComputePtrOp>(
      loc, bytePtrTy, labelBufferAlloc, ValueRange{postfixOffset});
  Value postfixBytes =
      rewriter.create<arith::ConstantIntOp>(loc, postFixLen + 1, 64);
  emitCopy(postfixDst, postfixLit, postfixBytes);

  return labelBuffer;
}
};

struct ReturnToOutputLogPass
Expand All @@ -230,6 +355,19 @@ struct ReturnToOutputLogPass
return;
}

if (failed(irBuilder.loadIntrinsic(module, "__nvqpp_internal_tostring"))) {
module.emitError("could not load string conversion function.");
signalPassFailure();
return;
}

if (failed(irBuilder.loadIntrinsic(module,
"__nvqpp_internal_number_of_digits"))) {
module.emitError("could not load number of digits function.");
signalPassFailure();
return;
}

RewritePatternSet patterns(ctx);
patterns.insert<ReturnRewrite>(ctx);
LLVM_DEBUG(llvm::dbgs() << "Before return to output logging:\n" << module);
Expand Down
Loading
Loading