NVIDIA
diff --git a/‎.github/workflows/config/gitlab_commits.txt‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/config/gitlab_commits.txt‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎.github/workflows/publishing.yml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/publishing.yml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎docker/build/devdeps.ext.Dockerfile‎
Lines changed: 4 additions & 1 deletion b/‎docker/build/devdeps.ext.Dockerfile‎
Lines changed: 4 additions & 1 deletion
diff --git a/‎include/cudaq/Optimizer/Builder/Factory.h‎
Lines changed: 2 additions & 0 deletions b/‎include/cudaq/Optimizer/Builder/Factory.h‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎lib/Optimizer/Builder/Factory.cpp‎
Lines changed: 23 additions & 2 deletions b/‎lib/Optimizer/Builder/Factory.cpp‎
Lines changed: 23 additions & 2 deletions
diff --git a/‎lib/Optimizer/Transforms/ApplyOpSpecialization.cpp‎
Lines changed: 46 additions & 3 deletions b/‎lib/Optimizer/Transforms/ApplyOpSpecialization.cpp‎
Lines changed: 46 additions & 3 deletions
diff --git a/‎lib/Optimizer/Transforms/GlobalizeArrayValues.cpp‎
Lines changed: 2 additions & 0 deletions b/‎lib/Optimizer/Transforms/GlobalizeArrayValues.cpp‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎pyproject.toml.cu12‎
Lines changed: 4 additions & 2 deletions b/‎pyproject.toml.cu12‎
Lines changed: 4 additions & 2 deletions
diff --git a/‎pyproject.toml.cu13‎
Lines changed: 5 additions & 3 deletions b/‎pyproject.toml.cu13‎
Lines changed: 5 additions & 3 deletions
diff --git a/‎python/cudaq/kernel/ast_bridge.py‎
Lines changed: 3 additions & 4 deletions b/‎python/cudaq/kernel/ast_bridge.py‎
Lines changed: 3 additions & 4 deletions
@@ -1,2 +1,2 @@
 nvidia-mgpu-repo: cuda-quantum/cuquantum-mgpu.git
-nvidia-mgpu-commit: ef144598fe5beab4b80edbd30e455906373e2e0e
+nvidia-mgpu-commit: 8d7646431c824f8a7bf88bf3d9ba02f42746a024
@@ -1139,7 +1139,7 @@ jobs:
         # Note: this is the version of the conda 'nvidia/label/cuda' channel.
         # Specifically, 'nvidia/label/cuda-13.0.0' does not contain proper CUDA 13 packages,
         # hence we need to use later channels.
-        cuda_version_conda: ['12.4.0', '13.0.2']
+        cuda_version_conda: ['12.6.0', '13.0.2']
       fail-fast: false
 
     # Must have environment to access environment secrets
 
@@ -172,6 +172,9 @@ ENV PIP_BREAK_SYSTEM_PACKAGES=1
 
 # Install cuQuantum dependencies, including cuTensor.
 # Install cupy version 13.4.1
+# Note: for Docker images, we pin the cuQuantum version exactly (with `==`) to avoid unintentional upgrades;
+# e.g., an API marked as deprecated in a minor version upgrade may break the build.
+# For Python pip installations, we allow minor version upgrades with `~=`, assuming the API is stable.
 RUN apt-get update && apt-get install -y --no-install-recommends \
         python3 python3-pip && \
     apt-get autoremove -y && apt-get clean && rm -rf /var/lib/apt/lists/* && \
@@ -180,7 +183,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
     else \
         cupy_version=13.4.1; \
     fi && \
-    python3 -m pip install --break-system-packages cupy-cuda$(echo $CUDA_VERSION | cut -d . -f1)x==${cupy_version} cuquantum-cu$(echo $CUDA_VERSION | cut -d . -f1)==25.09 && \
+    python3 -m pip install --break-system-packages cupy-cuda$(echo $CUDA_VERSION | cut -d . -f1)x==${cupy_version} cuquantum-cu$(echo $CUDA_VERSION | cut -d . -f1)==25.09.1 && \
     if [ "$(python3 --version | grep -o [0-9\.]* | cut -d . -f -2)" != "3.12" ]; then \
         echo "expecting Python version 3.12"; \
     fi
 
@@ -82,6 +82,8 @@ cudaq::cc::PointerType getIndexedObjectType(mlir::Type eleTy);
 
 mlir::Type genArgumentBufferType(mlir::Type ty);
 
+bool isStlVectorBoolHostType(mlir::Type ty);
+
 /// Build an LLVM struct type with all the arguments and then all the results.
 /// If the type is a std::vector, then add an i64 to the struct for the
 /// length. The actual data values will be appended to the end of the
 
@@ -357,6 +357,27 @@ static cc::StructType stlHostVectorType(Type eleTy) {
   return cc::StructType::get(ctx, ArrayRef<Type>{ptrTy, padout});
 }
 
+/// Structural test on \p ty. Returns `true` if and only if \p ty is a
+/// `cc::StructType` with exactly two members, where member 0 is a
+/// `cc::PointerType` to `i1` and member 1 is a `cc::ArrayType` of `i8` with a
+/// known size of 32. Returns `false` for every other type.
+bool factory::isStlVectorBoolHostType(Type ty) {
+  auto structTy = dyn_cast<cc::StructType>(ty);
+  if (!structTy || structTy.getMembers().size() != 2)
+    return false;
+  auto *ctx = ty.getContext();
+
+  // Member 0 must be a pointer whose element type is `i1`.
+  auto dataPtrTy = dyn_cast<cc::PointerType>(structTy.getMember(0));
+  if (!dataPtrTy || dataPtrTy.getElementType() != IntegerType::get(ctx, 1))
+    return false;
+
+  // Member 1 must be an array of `i8` with a statically known size of 32.
+  auto padTy = dyn_cast<cc::ArrayType>(structTy.getMember(1));
+  if (!padTy || padTy.getElementType() != IntegerType::get(ctx, 8))
+    return false;
+  return !padTy.isUnknownSize() && padTy.getSize() == 32;
+}
+
 // FIXME: Give these front-end names so we can disambiguate more types.
 cc::StructType factory::getDynamicBufferType(MLIRContext *ctx) {
   auto ptrTy = cc::PointerType::get(IntegerType::get(ctx, 8));
@@ -374,7 +395,7 @@ Type factory::getSRetElementType(FunctionType funcTy) {
   if (funcTy.getNumResults() > 1)
     return cc::StructType::get(ctx, funcTy.getResults());
   if (auto spanTy = dyn_cast<cc::SpanLikeType>(funcTy.getResult(0)))
-    return stlVectorType(spanTy.getElementType());
+    return stlHostVectorType(spanTy.getElementType());
   return funcTy.getResult(0);
 }
 
@@ -775,7 +796,7 @@ factory::getOrAddFunc(mlir::Location loc, mlir::StringRef funcName,
 }
 
 void factory::mergeModules(ModuleOp into, ModuleOp from) {
-  for (Operation &op : *from.getBody()) {
+  for (Operation &op : from) {
     auto sym = dyn_cast<SymbolOpInterface>(op);
     if (!sym)
       continue; // Only merge named symbols, avoids duplicating anonymous ops.
 
@@ -259,8 +259,21 @@ struct ApplyOpPattern : public OpRewritePattern<quake::ApplyOp> {
 
   LogicalResult matchAndRewrite(quake::ApplyOp apply,
                                 PatternRewriter &rewriter) const override {
-    auto calleeName = getVariantFunctionName(
-        apply, apply.getCallee()->getRootReference().str());
+    std::string calleeOrigName;
+    if (apply.getCallee()) {
+      calleeOrigName = apply.getCallee()->getRootReference().str();
+    } else {
+      // Check if the first argument is a func.ConstantOp.
+      auto calleeVals = apply.getIndirectCallee();
+      if (calleeVals.empty())
+        return failure();
+      Value calleeVal = calleeVals.front();
+      auto fc = calleeVal.getDefiningOp<func::ConstantOp>();
+      if (!fc)
+        return failure();
+      calleeOrigName = fc.getValue().str();
+    }
+    auto calleeName = getVariantFunctionName(apply, calleeOrigName);
     auto *ctx = apply.getContext();
     auto consTy = quake::VeqType::getUnsized(ctx);
     SmallVector<Value> newArgs;
@@ -286,14 +299,44 @@ struct ApplyOpPattern : public OpRewritePattern<quake::ApplyOp> {
   const bool constProp;
 };
 
+/// Rewrite an indirect `quake::ApplyOp` into a direct one when the callee
+/// value is produced by a `cudaq::cc::InstantiateCallableOp`.
+///
+/// The replacement `quake::ApplyOp` uses the symbol returned by the
+/// instantiate op's `getCallee()` as its callee, keeps the original result
+/// types, adjoint flag and controls, and passes the callable value itself as
+/// the first argument followed by the original arguments.
+///
+/// The pattern fails to match (leaving the op untouched) when the apply
+/// already has a direct callee, when it has no indirect callee operand, or
+/// when the indirect callee is not defined by an `InstantiateCallableOp`.
+struct FoldCallable : public OpRewritePattern<quake::ApplyOp> {
+  using OpRewritePattern::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(quake::ApplyOp apply,
+                                PatternRewriter &rewriter) const override {
+    // If we already know the callee function, there's nothing to do.
+    if (apply.getCallee())
+      return failure();
+
+    // Do not index into an empty operand range; ApplyOpPattern guards the
+    // same accessor in the same way.
+    auto indirectCallee = apply.getIndirectCallee();
+    if (indirectCallee.empty())
+      return failure();
+
+    Value ind = indirectCallee.front();
+    auto callee = ind.getDefiningOp<cudaq::cc::InstantiateCallableOp>();
+    if (!callee)
+      return failure();
+
+    auto sym = callee.getCallee();
+    // The callable value is forwarded as the leading argument of the direct
+    // call, ahead of the original arguments.
+    SmallVector<Value> newArguments = {ind};
+    newArguments.append(apply.getArgs().begin(), apply.getArgs().end());
+    rewriter.replaceOpWithNewOp<quake::ApplyOp>(
+        apply, apply.getResultTypes(), sym, apply.getIsAdj(),
+        apply.getControls(), newArguments);
+    return success();
+  }
+};
+
 class ApplySpecializationPass
     : public cudaq::opt::impl::ApplySpecializationBase<
           ApplySpecializationPass> {
 public:
   using ApplySpecializationBase::ApplySpecializationBase;
 
   void runOnOperation() override {
-    ApplyOpAnalysis analysis(getOperation(), constantPropagation);
+    ModuleOp module = getOperation();
+    auto *ctx = module.getContext();
+    RewritePatternSet patterns(ctx);
+    patterns.insert<FoldCallable>(ctx);
+    if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns))))
+      signalPassFailure();
+
+    ApplyOpAnalysis analysis(module, constantPropagation);
     const auto &applyVariants = analysis.getAnalysisInfo();
     if (succeeded(step1(applyVariants)))
       step2();
 
@@ -276,6 +276,8 @@ struct ReifySpanPattern : public OpRewritePattern<cudaq::cc::ReifySpanOp> {
       }
     }
 
+    // FIXME: get rid of this;
+    // see https://github.com/NVIDIA/cuda-quantum/issues/3593
     auto hasBoolElems = false;
     if (auto iTy = dyn_cast<IntegerType>(eleTy)) {
       if (iTy.getWidth() == 1) {
 
@@ -14,11 +14,13 @@ description="Python bindings for the CUDA-Q toolkit for heterogeneous quantum-cl
 authors = [{name = "NVIDIA Corporation & Affiliates"}]
 maintainers = [{name = "NVIDIA Corporation & Affiliates"}]
 readme = { file="python/README.md.in", content-type = "text/markdown"}
-requires-python = ">=3.10"
+requires-python = ">=3.11"
 license = { file="LICENSE" }
 dependencies = [
   'astpretty ~= 3.0',
-  'cuquantum-cu12 == 25.09',
+  'custatevec-cu12 ~= 1.10',
+  'cutensornet-cu12 ~= 2.9',
+  'cudensitymat-cu12 ~= 0.3',
   'numpy >= 1.24',
   'scipy >= 1.10.1',
   'requests >= 2.31',
 
@@ -14,15 +14,17 @@ description="Python bindings for the CUDA-Q toolkit for heterogeneous quantum-cl
 authors = [{name = "NVIDIA Corporation & Affiliates"}]
 maintainers = [{name = "NVIDIA Corporation & Affiliates"}]
 readme = { file="python/README.md.in", content-type = "text/markdown"}
-requires-python = ">=3.10"
+requires-python = ">=3.11"
 license = { file="LICENSE" }
 dependencies = [
   'astpretty ~= 3.0',
-  'cuquantum-cu13 == 25.09',
+  'custatevec-cu13 ~= 1.10',
+  'cutensornet-cu13 ~= 2.9',
+  'cudensitymat-cu13 ~= 0.3',  
   'numpy >= 1.24',
   'scipy >= 1.10.1',
   'requests >= 2.31',
-  'nvidia-cublas ~= 13.1',
+  'nvidia-cublas ~= 13.0',
   'nvidia-curand ~= 10.4',
   'nvidia-cusparse ~= 12.6',
   'nvidia-cuda-runtime ~= 13.0',
 
@@ -4663,8 +4663,7 @@ def __process_binary_op(self, left, right, nodeType):
             if IntegerType.isinstance(left.type):
                 self.pushValue(arith.RemUIOp(left, right).result)
                 return
-            if F64Type.isinstance(left.type) or \
-                F32Type.isinstance(left.type):
+            if (F64Type.isinstance(left.type) or F32Type.isinstance(left.type)):
                 self.pushValue(arith.RemFOp(left, right).result)
                 return
             else:
@@ -4744,8 +4743,8 @@ def visit_AugAssign(self, node):
 
         if not cc.PointerType.isinstance(target.type):
             self.emitFatalError(
-                "augment-assign target variable cannot be assigned to",
-                node)
+                "augment-assign target variable is not defined or "
+                "cannot be assigned to.", node)
 
         self.visit(node.value)
         value = self.popValue()
Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`nvidia-mgpu-repo: cuda-quantum/cuquantum-mgpu.git`
`2`		`-nvidia-mgpu-commit: ef144598fe5beab4b80edbd30e455906373e2e0e`
	`2`	`+nvidia-mgpu-commit: 8d7646431c824f8a7bf88bf3d9ba02f42746a024`
Original file line number	Diff line number	Diff line change
`@@ -276,6 +276,8 @@ struct ReifySpanPattern : public OpRewritePattern<cudaq::cc::ReifySpanOp> {`
`276`	`276`	`}`
`277`	`277`	`}`
`278`	`278`
	`279`	`+ // FIXME: get rid of this;`
	`280`	`+ // see https://github.com/NVIDIA/cuda-quantum/issues/3593`
`279`	`281`	`auto hasBoolElems = false;`
`280`	`282`	`if (auto iTy = dyn_cast<IntegerType>(eleTy)) {`
`281`	`283`	`if (iTy.getWidth() == 1) {`