Skip to content

Commit 3588f65

Browse files
authored
Fix bug with separate compilation of quasi-entry point kernels. (#3352)
Signed-off-by: Eric Schweitz <[email protected]>
1 parent bf0d50a commit 3588f65

File tree

4 files changed

+50
-17
lines changed

4 files changed

+50
-17
lines changed

lib/Optimizer/Builder/Marshal.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -785,7 +785,8 @@ cudaq::opt::marshal::dropAnyHiddenArguments(MutableArrayRef<BlockArgument> args,
785785
std::pair<bool, func::FuncOp> cudaq::opt::marshal::lookupHostEntryPointFunc(
786786
StringRef mangledEntryPointName, ModuleOp module, func::FuncOp funcOp) {
787787
if (mangledEntryPointName == "BuilderKernel.EntryPoint" ||
788-
mangledEntryPointName.contains("_PyKernelEntryPointRewrite")) {
788+
mangledEntryPointName.contains("_PyKernelEntryPointRewrite") ||
789+
funcOp.empty()) {
789790
// No host entry point needed.
790791
return {false, func::FuncOp{}};
791792
}

lib/Optimizer/Transforms/GenKernelExecution.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -869,14 +869,13 @@ class GenerateKernelExecution
869869
for (auto &op : *module.getBody())
870870
if (auto funcOp = dyn_cast<func::FuncOp>(op))
871871
if (funcOp.getName().startswith(cudaq::runtime::cudaqGenPrefixName) &&
872-
cudaq::opt::marshal::hasLegalType(funcOp.getFunctionType()))
872+
cudaq::opt::marshal::hasLegalType(funcOp.getFunctionType()) &&
873+
!funcOp.empty() && !funcOp->hasAttr(cudaq::generatorAnnotation))
873874
workList.push_back(funcOp);
874875

875876
LLVM_DEBUG(llvm::dbgs()
876877
<< workList.size() << " kernel entry functions to process\n");
877878
for (auto funcOp : workList) {
878-
if (funcOp->hasAttr(cudaq::generatorAnnotation))
879-
continue;
880879
auto loc = funcOp.getLoc();
881880
[[maybe_unused]] auto className =
882881
funcOp.getName().drop_front(cudaq::runtime::cudaqGenPrefixLength);

lib/Optimizer/Transforms/LowerToCFG.cpp

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -57,14 +57,10 @@ class RewriteScope : public OpRewritePattern<cudaq::cc::ScopeOp> {
5757
auto loc = scopeOp.getLoc();
5858
auto *initBlock = rewriter.getInsertionBlock();
5959
Value stacksave;
60-
auto module = scopeOp.getOperation()->getParentOfType<ModuleOp>();
6160
auto ptrTy = cudaq::cc::PointerType::get(rewriter.getI8Type());
6261
if (scopeOp.hasAllocation(/*quantumAllocs=*/false)) {
63-
auto fun = cudaq::opt::factory::createFunction(
64-
"llvm.stacksave", ArrayRef<Type>{ptrTy}, {}, module);
65-
fun.setPrivate();
6662
auto call = rewriter.create<func::CallOp>(
67-
loc, ptrTy, fun.getSymNameAttr(), ArrayRef<Value>{});
63+
loc, ptrTy, cudaq::llvmStackSave, ArrayRef<Value>{});
6864
stacksave = call.getResult(0);
6965
}
7066
auto initPos = rewriter.getInsertionPoint();
@@ -93,10 +89,8 @@ class RewriteScope : public OpRewritePattern<cudaq::cc::ScopeOp> {
9389
rewriter.inlineRegionBefore(scopeOp.getInitRegion(), endBlock);
9490
if (stacksave) {
9591
rewriter.setInsertionPointToStart(endBlock);
96-
auto fun = cudaq::opt::factory::createFunction(
97-
"llvm.stackrestore", {}, ArrayRef<Type>{ptrTy}, module);
98-
fun.setPrivate();
99-
rewriter.create<func::CallOp>(loc, ArrayRef<Type>{}, fun.getSymNameAttr(),
92+
rewriter.create<func::CallOp>(loc, ArrayRef<Type>{},
93+
cudaq::llvmStackRestore,
10094
ArrayRef<Value>{stacksave});
10195
}
10296
rewriter.replaceOp(scopeOp, scopeResults);
@@ -331,10 +325,6 @@ class ConvertToCFGPrep
331325
mod.emitError("could not load llvm.stacksave intrinsic.");
332326
signalPassFailure();
333327
}
334-
if (failed(irBuilder.loadIntrinsic(mod, cudaq::llvmStackRestore))) {
335-
mod.emitError("could not load llvm.stackrestore intrinsic.");
336-
signalPassFailure();
337-
}
338328
}
339329
};
340330
} // namespace
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
/*******************************************************************************
 * Copyright (c) 2025 NVIDIA Corporation & Affiliates.                         *
 * All rights reserved.                                                        *
 *                                                                             *
 * This source code and the accompanying materials are made available under   *
 * the terms of the Apache License 2.0 which accompanies this distribution.   *
 ******************************************************************************/

// Regression test for separate compilation of quasi-entry point kernels:
// a kernel declared in one translation unit and defined in another must
// link and run correctly (see the lookupHostEntryPointFunc / funcOp.empty()
// handling in this commit).
//
// clang-format off
// NOTE: `command -v` must be run as a command, not placed inside `[ ... ]`.
// The original `if [ command -v split-file ]` never executed `command -v`
// (test(1) saw three literal words), so the guard did not actually detect
// whether split-file is available.
// RUN: if command -v split-file > /dev/null 2>&1 ; then \
// RUN:   split-file %s %t && \
// RUN:   nvq++ %cpp_std -fenable-cudaq-run --target stim -c %t/gke-1.cpp \
// RUN:     %t/gke-2.cpp -o %t/gke.out && %t/gke.out ; else \
// RUN:   echo "skipping" ; fi
// clang-format on

//--- gke-1.cpp

#include <cudaq.h>

// Declared here; defined in the second translation unit (gke-2.cpp).
__qpu__ int mytest(int x, std::vector<cudaq::measure_result> y);

// Entry-point kernel: measures a 2-qubit register and forwards the results
// to the separately compiled quasi-entry point kernel via device_call.
__qpu__ int mykernel() {
  cudaq::qvector q(2);
  h(q);
  auto mzq = mz(q);
  int res = cudaq::device_call(mytest, 1, mzq);
  return res;
}

int main() {
  // Success is simply compiling, linking, and running without error.
  auto res = cudaq::run(1, mykernel);
  return 0;
}

//--- gke-2.cpp

#include <cudaq.h>

// Definition of the kernel declared in gke-1.cpp. The measurement results
// are accepted but unused; the return value only needs to be deterministic.
__qpu__ int mytest(int x, std::vector<cudaq::measure_result> y) {
  return x * 2;
}

0 commit comments

Comments
 (0)